From 4a5542dee574e2b523e72c5b33ba8bf1605f2aa4 Mon Sep 17 00:00:00 2001 From: Daniel McIlvaney Date: Thu, 11 Jun 2026 16:34:50 -0700 Subject: [PATCH] ci: queue a package build using a CT scratch build --- .github/workflows/ado/pr-package-build.yml | 104 ++++++++ .../ado/templates/pr-package-build-stages.yml | 245 ++++++++++++++++++ .../containers/azldev-runner.Dockerfile | 11 + scripts/ci/components/README.md | 9 +- scripts/ci/control-tower/client.py | 73 +++++- scripts/ci/control-tower/run_package_build.py | 172 +++++++----- scripts/ci/control-tower/run_prcheck.py | 17 +- 7 files changed, 534 insertions(+), 97 deletions(-) create mode 100644 .github/workflows/ado/pr-package-build.yml create mode 100644 .github/workflows/ado/templates/pr-package-build-stages.yml diff --git a/.github/workflows/ado/pr-package-build.yml b/.github/workflows/ado/pr-package-build.yml new file mode 100644 index 00000000000..ac772425d60 --- /dev/null +++ b/.github/workflows/ado/pr-package-build.yml @@ -0,0 +1,104 @@ +# Microsoft Corporation +# +# Wrapper pipeline — passed to ADO as the entry point for the PR package-build +# check. It submits a *scratch* Control Tower build of the components a pull +# request changes, WAITS for that build to finish, and fails the check if the +# build fails (or is rejected). The build runs in Control Tower's own sandbox — +# NO PR-controlled code is built on the CI agent (only read-only change +# detection runs there). 
+# +# This file owns all OneBranch-specific wiring (governed templates repo, +# NonOfficial variant, featureFlags) and delegates the actual stages/jobs/steps +# to the raw stages template at: +# .github/workflows/ado/templates/pr-package-build-stages.yml +# +# WHY SCRATCH + REVIEWER-GATED: building unmerged PR code is only safe because +# (a) it is a *scratch* build (throwaway, never persisted to a production +# repo), so a malicious PR cannot poison published artifacts, and +# (b) the check is REVIEWER-GATED: it is wired in ADO so it does NOT auto-run +# on every PR push — a maintainer triggers it after eyeballing the diff, +# which bounds "unmerged code consumes build capacity / runs in CT" to an +# explicit human decision. See the reviewer-gate prerequisite below. +# run_package_build.py enforces the matching invariant: it refuses to submit an +# OFFICIAL (persisted) build for a PR trigger -- scratch is the default, so the +# PR check only ever produces throwaway builds. +# +# NonOfficial: this is PR validation. It calls the Control Tower DEV endpoint +# (via the DEV service connection) and produces only scratch builds; it is not +# a production-classified pipeline. +# +# Helper scripts live under: +# - scripts/ci/control-tower/ - (Control Tower client + submit script). +# - scripts/ci/components/ - cross-pipeline azldev change-set helpers (shared +# with the GitHub Actions PR gates and the Control Tower pipelines). +# +# Prerequisites (ADO / Azure Portal): +# 1. Entra ID App Registration with audience URI +# "api://<app-client-id>" (see variable group below). +# 2. Federated identity credential on the app registration for the ADO +# service connection (issuer: https://vstoken.dev.azure.com/<organization-id>, +# subject: sc://<organization>/<project>/<service-connection-name>). +# 3. ARM service connection in ADO project settings using Workload Identity +# Federation (manual). +# 4. 
REVIEWER-GATED trigger (configured in ADO, not here): wire this pipeline +# as a build-validation check that does NOT automatically queue on every +# PR push (e.g. an optional/manual build-validation policy, or a manual +# approval check), so a maintainer must trigger it after reviewing the PR. +# +# Variable Group (ADO Pipelines > Library): +# Name: "ControlTower-PRCheck" +# Required variables: +# - ApiAudience : Entra ID audience URI for the Control Tower app +# - ApiBaseDirectUrl : Direct base URL of the Control Tower APIM endpoint (bypasses Azure Front Door) + +# Trigger controlled by ADO branch policy — not YAML triggers. +trigger: none + +pr: none + +resources: + repositories: + - repository: templates + type: git + name: OneBranch.Pipelines/GovernedTemplates + ref: refs/heads/main + +extends: + template: v2/OneBranch.NonOfficial.CrossPlat.yml@templates + parameters: + featureFlags: + golang: + internalModuleProxy: + enabled: true + LinuxHostVersion: + Network: R1 + runOnHost: true + EnableCDPxPAT: false + + # https://aka.ms/obpipelines/sdl + globalSdl: + disableLegacyManifest: true + sbom: + enabled: false + tsa: + enabled: false + + stages: + - template: /.github/workflows/ado/templates/pr-package-build-stages.yml@self + parameters: + outputDirectory: $(Build.ArtifactStagingDirectory)/output + artifactBaseName: prpackagebuild + containerImage: mcr.microsoft.com/onebranch/azurelinux/build:3.0 + poolType: linux + serviceConnection: CT-Endpoints-Access-ServiceConnection-DEV + variableGroup: ControlTower-PRCheck + # Control Tower package target for the 4.0 branch. + packageTarget: azl4 + # This check WAITS for the Control Tower build to finish (pass/fail), + # so the job must cover the full build. pollTimeoutSeconds caps how + # long run_package_build.py waits (21600 = 6h, our worst-case build); + # timeoutInMinutes sits above that plus setup headroom, so the + # script's own clear failure fires before ADO blunt-kills the job. 
+ # Raise both together if a legitimate build is being killed. + pollTimeoutSeconds: 21600 + timeoutInMinutes: 420 diff --git a/.github/workflows/ado/templates/pr-package-build-stages.yml b/.github/workflows/ado/templates/pr-package-build-stages.yml new file mode 100644 index 00000000000..61e2f43fb27 --- /dev/null +++ b/.github/workflows/ado/templates/pr-package-build-stages.yml @@ -0,0 +1,245 @@ +# Microsoft Corporation +# +# Raw stages template for the PR package-build check. Wrapper-agnostic: declares +# the stages/jobs/steps and exposes the wrapper-coupled knobs as parameters. The +# wrapper at .github/workflows/ado/pr-package-build.yml supplies concrete +# values. See that wrapper for why this pipeline exists. +# +# What it does, per PR: +# 1. Ensure full git history (rpmautospec + change detection need it). +# 2. Authenticate to the internal pip feed and install host deps: azldev (for +# change detection only -- no mock, no build) and the Control Tower Python +# client. +# 3. Resolve the PR commit range from the merge commit's parents +# (^1 = target-branch tip, ^2 = PR head). +# 4. Compute the changed-component set (shared compute_change_set.sh). +# 5. Submit a *scratch* Control Tower build of the PR head for exactly those +# components (run_package_build.py --wait-for-completion). The build runs +# in Control Tower's own sandbox; this pipeline WAITS for it to reach a +# terminal state and fails the check if the build fails (or does not +# finish within the poll timeout). NO PR-controlled code is built on the +# CI agent -- only read-only change detection runs here. +# +# It deliberately does NOT reuse templates/steps/common-steps.yml: that shared +# step set resolves the commit range via the *previous CI build* (the +# post-merge delta logic), which is wrong for a PR. A PR range comes from the +# merge commit's parents, computed inline below. 
Reusing common-steps would +# also pull in the lock/render verify steps (already covered by the GitHub +# Actions PR gates) and force a refactor of a file shared by two production +# pipelines. +# +# Because it calls Control Tower, this pipeline needs the WIF service connection +# and the Control Tower variable group (audience + base URL); the wrapper +# supplies both as parameters. + +parameters: + - name: outputDirectory + type: string + - name: artifactBaseName + type: string + - name: containerImage + type: string + - name: poolType + type: string + default: linux + - name: serviceConnection + type: string + - name: variableGroup + type: string + # Control Tower package target for builds submitted from this pipeline + # (e.g. azl4 for the 4.0 branch, azl5 for 5.0). Bound per-branch by the + # wrapper so a branch's builds land in the correct target. + - name: packageTarget + type: string + - name: timeoutInMinutes + type: number + # Max seconds run_package_build.py waits for the Control Tower build to reach + # a terminal state. Keep below the job's timeoutInMinutes (above) so the + # script's own clear failure fires before ADO blunt-kills the job. Default + # 21600 = 6h (our worst-case build); the wrapper passes it alongside + # timeoutInMinutes so the two are raised together. + - name: pollTimeoutSeconds + type: number + default: 21600 + +stages: + - stage: PRPackageBuild + jobs: + - job: PRPackageBuild + # Fail-loud: a failed submission, an immediate Control Tower rejection, + # or a build that fails (or never reaches a terminal state) turns the PR + # check red. The build runs in Control Tower's own sandbox -- NOT on + # this agent -- but this pipeline WAITS for it to finish + # (run_package_build.py --wait-for-completion). Size the timeout to + # cover the FULL build: it must exceed the script's pollTimeoutSeconds + # (6h default) so the script's own clear failure fires before ADO + # blunt-kills the job. 
+ timeoutInMinutes: ${{ parameters.timeoutInMinutes }} + pool: + type: ${{ parameters.poolType }} + variables: + - group: ${{ parameters.variableGroup }} + - name: ob_outputDirectory + value: ${{ parameters.outputDirectory }} + - name: ob_artifactBaseName + value: ${{ parameters.artifactBaseName }} + - name: LinuxContainerImage + value: ${{ parameters.containerImage }} + steps: + # Full history: `azldev component changed` tree-diffs two commits and + # rpmautospec derives Release/changelog from `git log`. The CI + # checkout may be shallow (depth 1); unshallow once, up front. Never + # `git fetch --depth=N` afterwards — that re-shallows a full clone and + # silently corrupts the rpmautospec Release calculation. + - script: | + set -euo pipefail + if [ "$(git rev-parse --is-shallow-repository)" = "true" ]; then + echo "##[group]Fetching full git history" + git fetch --unshallow + echo "##[endgroup]" + fi + displayName: "Ensure full git history" + + - task: PipAuthenticate@1 + displayName: "Authenticate pip" + inputs: + artifactFeeds: "azl/ControlTowerFeed" + + # azldev opens the repo with go-git, which rejects a config that + # declares the `worktreeconfig` extension while + # core.repositoryformatversion is still 0: + # "core.repositoryformatversion does not support extension: worktreeconfig" + # Native git tolerates this, and the ADO agent checkout leaves the + # extension set, so strip it before any azldev invocation. Each CI run + # is a fresh checkout so this is safe and self-contained. 
+ # TODO: remove this step once azldev no longer needs the workaround + # (go-git v6 fixes the underlying bug): + # https://github.com/microsoft/azure-linux-dev-tools/issues/241 + - script: | + set -euo pipefail + if git config --get extensions.worktreeConfig >/dev/null 2>&1; then + echo "Removing extensions.worktreeConfig so go-git (azldev) can open the repo" + git config --unset-all extensions.worktreeConfig || true + fi + displayName: "Normalize git config for azldev (go-git)" + + # Host deps for change detection + the Control Tower submission only: + # azldev (`azldev component changed` + git diff -- no mock, no build) + # and the Control Tower Python client. The build itself never runs on + # the agent; it runs asynchronously in Control Tower's own sandbox. + - script: | + set -euo pipefail + echo "##[group]Azldev (host, for change-set)" + # Only the version string comes from the PR checkout; reject a + # malformed/garbage value before it reaches `go install`. + AZLDEV_VERSION="$(tr -d '\n' < .azldev-version)" + if ! printf '%s' "$AZLDEV_VERSION" | grep -Eq '^[0-9A-Za-z._+-]+$'; then + echo "##[error].azldev-version is empty or has unexpected characters" + exit 1 + fi + echo "Installing azldev@${AZLDEV_VERSION}..." + go install "github.com/microsoft/azure-linux-dev-tools/cmd/azldev@${AZLDEV_VERSION}" + + go_bin_path="$(go env GOPATH)/bin" + echo "##vso[task.prependpath]$go_bin_path" + + "$go_bin_path/azldev" --version + echo "##[endgroup]" + + echo "##[group]Python dependencies (Control Tower client)" + pip install -r scripts/ci/control-tower/requirements.txt + echo "##[endgroup]" + displayName: "Install host dependencies" + + # Resolve the PR commit range. A PR-policy build checks out the MERGE + # commit (Build.SourceVersion): parent ^1 is the target-branch tip, + # parent ^2 is the PR head. The diff ^1..^2 is exactly the PR's + # changes relative to the target branch. 
We read the range here and + # set pipeline variables so the wiring stays visible in the YAML. + - script: | + set -euo pipefail + if ! git rev-parse --verify -q "HEAD^2" >/dev/null; then + echo "##[error]HEAD is not a merge commit -- this pipeline must run as a PR build (Build.Reason=PullRequest)." + exit 1 + fi + target_commit="$(git rev-parse HEAD^1)" + source_commit="$(git rev-parse HEAD^2)" + # PR-supplied data is untrusted: validate both SHAs before use. + for sha in "$target_commit" "$source_commit"; do + if [[ ! "$sha" =~ ^[0-9a-f]{40}$ ]]; then + echo "##[error]invalid commit SHA: $sha" + exit 1 + fi + done + echo "Resolved range: target=$target_commit source=$source_commit" + echo "##vso[task.setvariable variable=sourceCommit;isreadonly=true]$source_commit" + echo "##vso[task.setvariable variable=targetCommit;isreadonly=true]$target_commit" + displayName: "Determine PR commit range" + + # Compute the changed-component set with the shared, cross-pipeline + # single-source-of-truth helper (also used by the GitHub Actions PR + # gates). changed-components.json holds the per-component change + # records consumed by the Control Tower submit step below. + # compute_change_set.sh hard-fails on the supply-chain drift tripwire + # (sourcesChange without an identity change) -- a guard we want to keep + # on PRs. The script self-prefixes AZLDEV_ALLOW_ROOT=1 internally. 
+ - script: | + set -euo pipefail + change_set_dir="$(Build.ArtifactStagingDirectory)/change-set" + echo "##[group]Preparing change set" + scripts/ci/components/compute_change_set.sh \ + --output-dir "$change_set_dir" \ + --source-commit "$SOURCE_COMMIT" \ + --target-commit "$TARGET_COMMIT" + echo "##[endgroup]" + echo "##vso[task.setvariable variable=changedComponentsFile;isreadonly=true]$change_set_dir/changed-components.json" + env: + SOURCE_COMMIT: $(sourceCommit) + TARGET_COMMIT: $(targetCommit) + displayName: "Prepare change set" + + # Submit a SCRATCH Control Tower build of the PR head for the changed + # components. Scratch = throwaway: it never persists to a production + # repo, so building unmerged PR code is safe. Scratch is the default + # (no --official-build); run_package_build.py additionally refuses an + # OFFICIAL build for a PR trigger. --wait-for-completion makes the + # script block until the build reaches a terminal state and fail the + # check on a build failure (or if it does not finish within + # --poll-timeout-seconds, 6h below -- our worst-case build). No PR + # code is built on this agent. + # + # This step assumes the pipeline is wired as a REVIEWER-GATED check in + # ADO (see the wrapper header): it should not auto-run on every PR + # push, so that a maintainer eyeballs the diff before unmerged code is + # submitted for a build. + - task: AzureCLI@2 + displayName: "Submit scratch build to Control Tower" + inputs: + azureSubscription: ${{ parameters.serviceConnection }} + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -euo pipefail + + # --poll-timeout-seconds comes from the pollTimeoutSeconds + # parameter (6h default = our worst-case build). Keep it below + # the job's timeoutInMinutes (wrapper) so the script's own clear + # failure fires before ADO blunt-kills the job. 
+ python3 scripts/ci/control-tower/run_package_build.py \ + --api-audience "$API_AUDIENCE" \ + --api-base-url "$API_BASE_URL" \ + --build-reason "$CT_BUILD_REASON" \ + --changed-components-file "$CHANGED_COMPONENTS_FILE" \ + --package-target "${{ parameters.packageTarget }}" \ + --commit-sha "$SOURCE_COMMIT" \ + --repo-uri "$UPSTREAM_REPO_URL" \ + --wait-for-completion \ + --poll-timeout-seconds ${{ parameters.pollTimeoutSeconds }} + env: + API_AUDIENCE: $(ApiAudience) + API_BASE_URL: $(ApiBaseDirectUrl) + # Non-reserved name: an `env:` override of the reserved BUILD_REASON var is silently ignored by the agent. + CT_BUILD_REASON: $(Build.Reason) + CHANGED_COMPONENTS_FILE: $(changedComponentsFile) + SOURCE_COMMIT: $(sourceCommit) + UPSTREAM_REPO_URL: $(Build.Repository.Uri) diff --git a/.github/workflows/containers/azldev-runner.Dockerfile b/.github/workflows/containers/azldev-runner.Dockerfile index 7f733883e4e..fe2fe36be8b 100644 --- a/.github/workflows/containers/azldev-runner.Dockerfile +++ b/.github/workflows/containers/azldev-runner.Dockerfile @@ -39,6 +39,17 @@ RUN tdnf -y install \ # root. Callers (check-rendered-specs.yml, etc.) read the file and pass it # via --build-arg so the Dockerfile never needs repo-root build context. # No default — omitting --build-arg will fail the build loudly. +# Optional Go module proxy for the `go install` below. Callers that build +# behind an internal-only proxy forward it via --build-arg GOPROXY=...; Docker +# exposes a declared ARG to the RUN environment, which `go install` reads. +# Callers with public egress (e.g. the GitHub Actions render gate) simply omit +# the build-arg: an *omitted* ARG (no default declared) stays UNSET in the RUN +# environment, and an unset GOPROXY is what makes Go fall back to its built-in +# default proxy — a no-op for them. Do NOT instead pass --build-arg GOPROXY="": +# an explicitly *empty* GOPROXY disables all module downloads (no proxy, no +# direct) and would break the install below. 
The ADO/OneBranch PR build +# forwards the host's internal proxy. +ARG GOPROXY ARG AZLDEV_VERSION RUN test -n "${AZLDEV_VERSION}" || { echo "ERROR: AZLDEV_VERSION build-arg is required (read from .azldev-version)" >&2; exit 1; } \ && GOBIN=/usr/local/bin go install \ diff --git a/scripts/ci/components/README.md b/scripts/ci/components/README.md index c7db2dd173a..8f16d9a8ce7 100644 --- a/scripts/ci/components/README.md +++ b/scripts/ci/components/README.md @@ -1,9 +1,11 @@ # Shared azldev component helpers -Pipeline-agnostic shell + Python helpers consumed by both the GitHub Actions -PR gates (`.github/workflows/check-rendered-specs.yml`) and the ADO +Pipeline-agnostic shell + Python helpers consumed by the GitHub Actions +PR gates (`.github/workflows/check-rendered-specs.yml`), the ADO Control Tower integration pipeline -(`.github/workflows/ado/templates/sources-upload-stages.yml`). +(`.github/workflows/ado/templates/sources-upload-stages.yml`), and the ADO +PR package-build check +(`.github/workflows/ado/templates/pr-package-build-stages.yml`). | Script | Purpose | | ------ | ------- | @@ -28,3 +30,4 @@ Control Tower integration pipeline - `check-rendered-specs.yml` `render` job → `compute_change_set.sh` - `sources-upload-stages.yml` "Prepare change set" step → `compute_change_set.sh` +- `pr-package-build-stages.yml` "Prepare change set" step → `compute_change_set.sh` diff --git a/scripts/ci/control-tower/client.py b/scripts/ci/control-tower/client.py index 2b824f409da..59477daadb5 100644 --- a/scripts/ci/control-tower/client.py +++ b/scripts/ci/control-tower/client.py @@ -14,10 +14,12 @@ ``DefaultAzureCredential`` discovers the session automatically. 
""" +from __future__ import annotations + import json import time from dataclasses import dataclass -from typing import Any, Optional +from typing import Any import requests from azure.identity import DefaultAzureCredential @@ -29,6 +31,14 @@ NON_TERMINAL_STATUSES = frozenset({"Queued", "Pending", "Running"}) SUCCESS_STATUS = "Completed" TERMINAL_FAILURE_STATUSES = frozenset({"Failed", "Cancelled", "CancelledByAdmin", "Unknown", "TimedOut"}) +# Statuses that END the poll. The poll exits ONLY on a status in this set; +# anything else is treated as still in progress (keep polling until a known +# terminal status or the local timeout). This way a newly-introduced Control +# Tower intermediate status (e.g. a future "Validating") is not misread as +# terminal and used to fail a build that is actually still starting. "Unknown" +# stays terminal on purpose: a missing/blank status is a real problem, not an +# unrecognized-but-valid new state. +TERMINAL_STATUSES = TERMINAL_FAILURE_STATUSES | {SUCCESS_STATUS} @dataclass @@ -41,17 +51,16 @@ class TokenHolder: def make_session() -> requests.Session: """Create a ``requests.Session`` with retries for idempotent GETs only. - Retry budget is tuned to complete quickly relative to the 10s default poll - interval: worst case ~7s of backoff (0.5 + 1 + 2 + 4s capped) across 3 - attempts on 429/5xx. + 6 retries with exponential backoff (0+4+8+16+32+64 = ~124 s, ~2 min worst + case; Retry-After honored). """ session = requests.Session() retry = Retry( - total=3, - connect=3, - read=3, - status=3, - backoff_factor=0.5, + total=6, + connect=6, + read=6, + status=6, + backoff_factor=2.0, status_forcelist=(429, 500, 502, 503, 504), allowed_methods=frozenset({"GET"}), raise_on_status=False, @@ -142,7 +151,7 @@ def _request_with_refresh( audience: str, token_holder: TokenHolder, *, - json_payload: Optional[dict] = None, + json_payload: dict | None = None, ) -> requests.Response: """Issue a request. 
On a 401, refresh the bearer token once and retry.""" response = session.request( @@ -248,6 +257,26 @@ def _summarize_tasks(tasks: Any) -> str: return f"{total} tasks ({parts})" +# Adaptive poll cadence: (elapsed-seconds threshold, interval-seconds). Tight +# early so short jobs stay responsive; backs off for long builds so a multi-hour +# build does not flood the logs with heartbeats (a fixed 10s interval would be +# ~2160 polls over 6h). Beyond the last threshold, _POLL_MAX_INTERVAL_SECONDS. +_POLL_SCHEDULE: tuple[tuple[int, int], ...] = ( + (600, 10), # first 10 min: every 10s + (1200, 30), # 10-20 min: every 30s + (3600, 60), # 20-60 min: every 60s +) +_POLL_MAX_INTERVAL_SECONDS = 120 # beyond 1 h: every 2 min + + +def _poll_interval_seconds(elapsed_seconds: float) -> int: + """Return the poll interval for the given elapsed time (adaptive backoff).""" + for threshold_seconds, interval_seconds in _POLL_SCHEDULE: + if elapsed_seconds < threshold_seconds: + return interval_seconds + return _POLL_MAX_INTERVAL_SECONDS + + def poll_until_terminal( session: requests.Session, base_url: str, @@ -255,7 +284,6 @@ def poll_until_terminal( audience: str, token_holder: TokenHolder, job_id: str, - poll_interval_seconds: int, poll_timeout_seconds: int, ) -> tuple[dict, bool]: """Poll the job status until it reaches a terminal state or the timeout expires. @@ -269,7 +297,7 @@ def poll_until_terminal( """ start = time.monotonic() deadline = start + poll_timeout_seconds - previous_status: Optional[str] = None + previous_status: str | None = None job_status_object: dict = {} while True: @@ -285,6 +313,18 @@ def poll_until_terminal( f"Job {job_id} status: {transition} (elapsed {elapsed}s){suffix}", flush=True, ) + # Surface schema drift: a status that is neither known-terminal nor + # known-non-terminal means Control Tower introduced a state this + # script doesn't know about. 
We keep polling (treat it as + # non-terminal) so an in-flight build isn't failed, but warn so the + # gap gets closed. + if current_status not in TERMINAL_STATUSES and current_status not in NON_TERMINAL_STATUSES: + print( + f"##[warning]Unrecognized job status '{current_status}' for job {job_id}; " + "treating it as non-terminal and continuing to poll. If Control Tower added a " + "new status, update NON_TERMINAL_STATUSES / TERMINAL_* in client.py.", + flush=True, + ) previous_status = current_status else: # Heartbeat so the user can see the script is alive and still polling. @@ -293,7 +333,11 @@ def poll_until_terminal( flush=True, ) - if current_status not in NON_TERMINAL_STATUSES: + # Exit ONLY on a known terminal status. An unrecognized status falls + # through and keeps polling (bounded by the timeout) rather than being + # misread as terminal -- which previously turned a still-starting build + # red the moment Control Tower reported a status we didn't enumerate. + if current_status in TERMINAL_STATUSES: return job_status_object, False remaining = deadline - time.monotonic() @@ -304,7 +348,8 @@ def poll_until_terminal( ) return job_status_object, True - time.sleep(min(poll_interval_seconds, max(1, int(remaining)))) + interval_seconds = _poll_interval_seconds(elapsed) + time.sleep(min(interval_seconds, max(1, int(remaining)))) def print_final_status(final: dict) -> None: diff --git a/scripts/ci/control-tower/run_package_build.py b/scripts/ci/control-tower/run_package_build.py index 68db9d3185d..ec5d8ea6867 100644 --- a/scripts/ci/control-tower/run_package_build.py +++ b/scripts/ci/control-tower/run_package_build.py @@ -1,66 +1,81 @@ """Submit a package-build job to the Control Tower service and wait briefly. Flow: - 1. Read the changed-components JSON. + 1. Read the changed-components JSON; an unrecognized ``changeType`` fails + the check closed. 2. 
Filter to the build set: ``changeType in {added, changed}`` -- any component whose inputs changed needs a rebuild, regardless of whether its ``sourcesChange`` flag is set. 3. POST ``/api/Scenario/package`` with the build request. - 4. Poll briefly (default 5 min) until the job reaches a terminal state - (success or failure) or the local timeout expires. The goal is to - catch jobs that fail immediately on submission, not to wait for the - full build -- a non-terminal status at timeout is treated as - acceptance and the build continues async. - 5. Exit 0 if the job started (or completed). Exit 1 only on submission - failure or immediate terminal failure. + 4. Poll until the job reaches a terminal state (success or failure) or the + poll timeout expires. Two modes: + * default (acceptance): poll briefly just to catch jobs that fail on + submission; a non-terminal status at timeout is treated as + acceptance and the build continues asynchronously. + * --wait-for-completion: poll for the full build; a non-terminal + status at timeout is a failure (for gating checks that must see the + build verdict before passing). + 5. Exit 0 on success (or acceptance in the default mode); exit 1 on + submission failure, terminal build failure, or -- with + --wait-for-completion -- if the build does not finish within the timeout. """ +from __future__ import annotations + import argparse import json import sys from pathlib import Path -from azure.identity import DefaultAzureCredential - import client as ct +from azure.identity import DefaultAzureCredential def _load_build_components(path: Path) -> list[str]: """Filter the ``azldev component changed`` JSON to the build set. The build set is every component with ``changeType`` in ``{added, changed}`` - — these are the components whose inputs differ between source and target + -- these are the components whose inputs differ between source and target and therefore need a rebuild. 
Unlike the upload set, we do NOT filter on ``sourcesChange`` here: a component can need a rebuild even if its source tarballs didn't change (e.g. an overlay or build-config change). Deleted components are excluded — there is nothing to build. """ + known_change_types = {"added", "changed", "unchanged", "deleted"} + build_change_types = {"added", "changed"} + try: raw = path.read_text(encoding="utf-8") except OSError as exc: - raise SystemExit(f"##[error]Failed to read --changed-components-file {path!s}: {exc}") from exc + print(f"##[error]Failed to read --changed-components-file {path!s}: {exc}") + raise SystemExit(1) from exc try: entries = json.loads(raw) except json.JSONDecodeError as exc: - raise SystemExit(f"##[error]--changed-components-file {path!s} is not valid JSON: {exc}") from exc + print(f"##[error]--changed-components-file {path!s} is not valid JSON: {exc}") + raise SystemExit(1) from exc if not isinstance(entries, list): - raise SystemExit( + print( f"##[error]--changed-components-file {path!s} top-level value " f"must be a JSON array (got {type(entries).__name__})." ) + raise SystemExit(1) - build_change_types = {"added", "changed"} components: list[str] = [] for entry in entries: - if not isinstance(entry, dict): - continue - if entry.get("changeType") in build_change_types: - name = entry.get("component") - if isinstance(name, str) and name: - components.append(name) + change_type = entry.get("changeType") + if change_type not in known_change_types: + print( + f"##[error]--changed-components-file {path!s} has an unrecognized " + f"changeType {change_type!r} (known: {sorted(known_change_types)}); " + "refusing to guess the build set." + ) + raise SystemExit(1) + if change_type in build_change_types: + components.append(entry["component"]) return sorted(set(components)) @@ -82,8 +97,9 @@ def _parse_args() -> argparse.Namespace: parser.add_argument( "--build-reason", required=True, - help="ADO build reason (PullRequest, IndividualCI, ...). 
Used for the " - "local skip guard -- package builds are not submitted for PR triggers.", + help="ADO build reason (PullRequest, IndividualCI, ...). A PullRequest " + "may submit a SCRATCH build, but an official (persisted) build is " + "refused for a PullRequest.", ) parser.add_argument( "--changed-components-file", @@ -117,13 +133,8 @@ def _parse_args() -> argparse.Namespace: default=False, help="Submit as a non-scratch (official, persisted) build. The default " "is to submit a scratch build -- official is opt-in so the caller has " - "to explicitly say they want a persisted artifact.", - ) - parser.add_argument( - "--poll-interval-seconds", - type=int, - default=10, - help="How often to poll the job status endpoint (default: 10).", + "to explicitly say they want a persisted artifact. Official builds are " + "rejected for PullRequest triggers (unmerged code must never persist).", ) parser.add_argument( "--poll-timeout-seconds", @@ -131,20 +142,66 @@ def _parse_args() -> argparse.Namespace: default=600, help=( "Maximum time to wait for the job to reach a terminal state " - "(default: 600 = 10 min). This is NOT the build timeout -- we " - "just want to catch jobs that fail immediately on submission. " - "A non-terminal status at timeout is treated as acceptance." + "(default: 600 = 10 min). In the default acceptance mode this just " + "catches jobs that fail immediately on submission. With " + "--wait-for-completion, set this to the full build budget -- a " + "non-terminal status at timeout then fails the run." ), ) + parser.add_argument( + "--wait-for-completion", + action="store_true", + default=False, + help="Block until the build reaches a terminal state (success or " + "failure) and exit accordingly; a non-terminal status at " + "--poll-timeout-seconds becomes a failure. Used by gating checks (the " + "PR package-build pipeline). 
The default fire-and-forget mode instead " + "treats a timeout as acceptance.", + ) return parser.parse_args() +def _build_payload(args: argparse.Namespace, components: list[str]) -> dict[str, object]: + """Assemble the Control Tower ``package`` scenario request body.""" + payload: dict[str, object] = { + "repoUri": args.repo_uri, + "packageTarget": args.package_target, + "packages": components, + "isScratchBuild": not args.official_build, + "buildReason": args.build_reason, + } + if args.commit_sha is not None: + payload["commitSha"] = args.commit_sha + if args.branch is not None: + payload["branch"] = args.branch + return payload + + +def _handle_non_terminal(args: argparse.Namespace, job_id: str, final: dict[str, object]) -> None: + """Handle a poll that ended before the job reached a terminal state. + + With --wait-for-completion this is a failure (a gating run must see the + build verdict); otherwise the non-terminal status is treated as acceptance + and the build continues asynchronously. + """ + last_status = final.get("status", "Unknown") + if args.wait_for_completion: + print( + f"##[error]Job {job_id} did not reach a terminal state within " + f"{args.poll_timeout_seconds}s (last status '{last_status}') -- failing the check." + ) + sys.exit(1) + print( + f"Job {job_id} still in non-terminal status '{last_status}' " + f"after {args.poll_timeout_seconds}s -- build accepted. " + f"Monitor progress in the Control Tower UI." 
+ ) + + def main() -> None: + """Submit a package build to Control Tower and (optionally) wait for the verdict.""" args = _parse_args() - if args.poll_interval_seconds <= 0: - print("##[error]--poll-interval-seconds must be a positive integer.") - sys.exit(2) if args.poll_timeout_seconds <= 0: print("##[error]--poll-timeout-seconds must be a positive integer.") sys.exit(2) @@ -153,29 +210,23 @@ def main() -> None: base_url = args.api_base_url.rstrip("/") - if args.build_reason == "PullRequest": + # Unmerged PR code may only produce a throwaway scratch build; an official + # (persisted) build of a pull request must never happen. Scratch PR builds + # ARE allowed -- the PR package-build check relies on them, and capacity is + # bounded by the reviewer-gated pipeline trigger, not here. + if args.build_reason == "PullRequest" and args.official_build: print( - "Skipping Control Tower call -- pull request triggers do not submit " - "package builds (unmerged code should not consume build capacity)." + "##[error]Refusing to submit an official (persisted) build for a " + "pull request -- unmerged code must never produce official artifacts." ) - return + sys.exit(1) if not components: print("No components need a rebuild -- skipping package-build submission.") return # ── Build payload ──────────────────────────────────────────────── - payload: dict = { - "repoUri": args.repo_uri, - "packageTarget": args.package_target, - "packages": components, - "isScratchBuild": not args.official_build, - "buildReason": args.build_reason, - } - if args.commit_sha is not None: - payload["commitSha"] = args.commit_sha - if args.branch is not None: - payload["branch"] = args.branch + payload = _build_payload(args, components) print("Calling Control Tower 'package' endpoint...") print("Payload:") @@ -211,11 +262,8 @@ def main() -> None: print("##[error]Control Tower 'package' response did not include a 'jobId'. 
Cannot confirm job acceptance.") sys.exit(1) - # ── Brief poll — just confirm the job was accepted ─────────────── - print( - f"Polling job {job_id} for up to {args.poll_timeout_seconds}s to confirm " - f"acceptance (not waiting for full build completion)..." - ) + # ── Poll for a terminal status ───────────────────────────────── + print(f"Polling job {job_id} for up to {args.poll_timeout_seconds}s for a terminal status...") try: final, timed_out = ct.poll_until_terminal( session, @@ -224,7 +272,6 @@ def main() -> None: args.api_audience, token_holder, job_id, - args.poll_interval_seconds, args.poll_timeout_seconds, ) except RuntimeError as exc: @@ -232,16 +279,7 @@ def main() -> None: sys.exit(1) if timed_out: - # We don't wait for full build completion -- the goal of this poll - # is just to surface a fast-failing job. A non-terminal status at - # the timeout is acceptance enough; the build continues async and - # is monitored in the Control Tower UI. - last_status = final.get("status", "Unknown") - print( - f"Job {job_id} still in non-terminal status '{last_status}' " - f"after {args.poll_timeout_seconds}s -- build accepted. " - f"Monitor progress in the Control Tower UI." - ) + _handle_non_terminal(args, job_id, final) return ct.print_final_status(final) diff --git a/scripts/ci/control-tower/run_prcheck.py b/scripts/ci/control-tower/run_prcheck.py index aac9b3830e7..20aaef5090d 100644 --- a/scripts/ci/control-tower/run_prcheck.py +++ b/scripts/ci/control-tower/run_prcheck.py @@ -18,14 +18,15 @@ lookaside tarballs need to be (re-)uploaded. 
""" +from __future__ import annotations + import argparse import json import sys from pathlib import Path -from azure.identity import DefaultAzureCredential - import client as ct +from azure.identity import DefaultAzureCredential def _parse_components(value: str) -> list[str]: @@ -121,12 +122,6 @@ def _parse_args() -> argparse.Namespace: help="Target branch name (alternative to --target-commit)", ) parser.add_argument("--repo-uri", required=True, help="Upstream repository URI") - parser.add_argument( - "--poll-interval-seconds", - type=int, - default=10, - help="How often to poll the job status endpoint (default: 10).", - ) parser.add_argument( "--poll-timeout-seconds", type=int, @@ -139,9 +134,6 @@ def _parse_args() -> argparse.Namespace: def main() -> None: args = _parse_args() - if args.poll_interval_seconds <= 0: - print("##[error]--poll-interval-seconds must be a positive integer.") - sys.exit(2) if args.poll_timeout_seconds <= 0: print("##[error]--poll-timeout-seconds must be a positive integer.") sys.exit(2) @@ -214,7 +206,7 @@ def main() -> None: sys.exit(1) # ── Poll for job completion ────────────────────────────────────── - print(f"Polling job {job_id} every {args.poll_interval_seconds}s (timeout {args.poll_timeout_seconds}s)...") + print(f"Polling job {job_id} for up to {args.poll_timeout_seconds}s for a terminal status...") try: final, timed_out = ct.poll_until_terminal( session, @@ -223,7 +215,6 @@ def main() -> None: args.api_audience, token_holder, job_id, - args.poll_interval_seconds, args.poll_timeout_seconds, ) except RuntimeError as exc: