From 4a5542dee574e2b523e72c5b33ba8bf1605f2aa4 Mon Sep 17 00:00:00 2001
From: Daniel McIlvaney <damcilva@microsoft.com>
Date: Thu, 11 Jun 2026 16:34:50 -0700
Subject: [PATCH] ci: queue a package build using a CT scratch build

---
 .github/workflows/ado/pr-package-build.yml    | 104 ++++++++
 .../ado/templates/pr-package-build-stages.yml | 245 ++++++++++++++++++
 .../containers/azldev-runner.Dockerfile       |  11 +
 scripts/ci/components/README.md               |   9 +-
 scripts/ci/control-tower/client.py            |  73 +++++-
 scripts/ci/control-tower/run_package_build.py | 172 +++++++-----
 scripts/ci/control-tower/run_prcheck.py       |  17 +-
 7 files changed, 534 insertions(+), 97 deletions(-)
 create mode 100644 .github/workflows/ado/pr-package-build.yml
 create mode 100644 .github/workflows/ado/templates/pr-package-build-stages.yml
diff --git a/.github/workflows/ado/pr-package-build.yml b/.github/workflows/ado/pr-package-build.yml
new file mode 100644
index 00000000000..ac772425d60
--- /dev/null
+++ b/.github/workflows/ado/pr-package-build.yml
@@ -0,0 +1,104 @@
+# Microsoft Corporation
+#
+# Wrapper pipeline — passed to ADO as the entry point for the PR package-build
+# check. It submits a *scratch* Control Tower build of the components a pull
+# request changes, WAITS for that build to finish, and fails the check if the
+# build fails (or is rejected). The build runs in Control Tower's own sandbox —
+# NO PR-controlled code is built on the CI agent (only read-only change
+# detection runs there).
+#
+# This file owns all OneBranch-specific wiring (governed templates repo,
+# NonOfficial variant, featureFlags) and delegates the actual stages/jobs/steps
+# to the raw stages template at:
+#   .github/workflows/ado/templates/pr-package-build-stages.yml
+#
+# WHY SCRATCH + REVIEWER-GATED: building unmerged PR code is only safe because
+#   (a) it is a *scratch* build (throwaway, never persisted to a production
+#       repo), so a malicious PR cannot poison published artifacts, and
+#   (b) the check is REVIEWER-GATED: it is wired in ADO so it does NOT auto-run
+#       on every PR push — a maintainer triggers it after eyeballing the diff,
+#       which bounds "unmerged code consumes build capacity / runs in CT" to an
+#       explicit human decision. See the reviewer-gate prerequisite below.
+# run_package_build.py enforces the matching invariant: it refuses to submit an
+# OFFICIAL (persisted) build for a PR trigger -- scratch is the default, so the
+# PR check only ever produces throwaway builds.
+#
+# NonOfficial: this is PR validation. It calls the Control Tower DEV endpoint
+# (via the DEV service connection) and produces only scratch builds; it is not
+# a production-classified pipeline.
+#
+# Helper scripts live under:
+# - scripts/ci/control-tower/ - Control Tower client + submit script.
+# - scripts/ci/components/ - cross-pipeline azldev change-set helpers (shared
+#   with the GitHub Actions PR gates and the Control Tower pipelines).
+#
+# Prerequisites (ADO / Azure Portal):
+#   1. Entra ID App Registration with audience URI
+#      "api://<ControlTower-ClientId>" (see variable group below).
+#   2. Federated identity credential on the app registration for the ADO
+#      service connection (issuer: https://vstoken.dev.azure.com/<org-id>,
+#      subject: sc://<org>/<project>/<service-connection-name>).
+#   3. ARM service connection in ADO project settings using Workload Identity
+#      Federation (manual).
+#   4. REVIEWER-GATED trigger (configured in ADO, not here): wire this pipeline
+#      as a build-validation check that does NOT automatically queue on every
+#      PR push (e.g. an optional/manual build-validation policy, or a manual
+#      approval check), so a maintainer must trigger it after reviewing the PR.
+#
+# Variable Group (ADO Pipelines > Library):
+#   Name: "ControlTower-PRCheck"
+#   Required variables:
+#     - ApiAudience          : Entra ID audience URI for the Control Tower app
+#     - ApiBaseDirectUrl     : Direct base URL of the Control Tower APIM endpoint (bypasses Azure Front Door)
+
+# Trigger controlled by ADO branch policy — not YAML triggers.
+trigger: none
+
+pr: none
+
+resources:
+  repositories:
+    - repository: templates
+      type: git
+      name: OneBranch.Pipelines/GovernedTemplates
+      ref: refs/heads/main
+
+extends:
+  template: v2/OneBranch.NonOfficial.CrossPlat.yml@templates
+  parameters:
+    featureFlags:
+      golang:
+        internalModuleProxy:
+          enabled: true
+      LinuxHostVersion:
+        Network: R1
+      runOnHost: true
+      EnableCDPxPAT: false
+
+    # https://aka.ms/obpipelines/sdl
+    globalSdl:
+      disableLegacyManifest: true
+      sbom:
+        enabled: false
+      tsa:
+        enabled: false
+
+    stages:
+      - template: /.github/workflows/ado/templates/pr-package-build-stages.yml@self
+        parameters:
+          outputDirectory: $(Build.ArtifactStagingDirectory)/output
+          artifactBaseName: prpackagebuild
+          containerImage: mcr.microsoft.com/onebranch/azurelinux/build:3.0
+          poolType: linux
+          serviceConnection: CT-Endpoints-Access-ServiceConnection-DEV
+          variableGroup: ControlTower-PRCheck
+          # Control Tower package target for the 4.0 branch.
+          packageTarget: azl4
+          # This check WAITS for the Control Tower build to finish (pass/fail),
+          # so the job must cover the full build. pollTimeoutSeconds caps how
+          # long run_package_build.py waits (21600 = 6h, our worst-case build);
+          # timeoutInMinutes sits above that plus setup headroom, so the
+          # script's own clear failure fires before ADO blunt-kills the job.
+          # Raise both together if a legitimate build is being killed.
+          pollTimeoutSeconds: 21600
+          timeoutInMinutes: 420
diff --git a/.github/workflows/ado/templates/pr-package-build-stages.yml b/.github/workflows/ado/templates/pr-package-build-stages.yml
new file mode 100644
index 00000000000..61e2f43fb27
--- /dev/null
+++ b/.github/workflows/ado/templates/pr-package-build-stages.yml
@@ -0,0 +1,245 @@
+# Microsoft Corporation
+#
+# Raw stages template for the PR package-build check. Wrapper-agnostic: declares
+# the stages/jobs/steps and exposes the wrapper-coupled knobs as parameters. The
+# wrapper at .github/workflows/ado/pr-package-build.yml supplies concrete
+# values. See that wrapper for why this pipeline exists.
+#
+# What it does, per PR:
+#   1. Ensure full git history (rpmautospec + change detection need it).
+#   2. Authenticate to the internal pip feed and install host deps: azldev (for
+#      change detection only -- no mock, no build) and the Control Tower Python
+#      client.
+#   3. Resolve the PR commit range from the merge commit's parents
+#      (^1 = target-branch tip, ^2 = PR head).
+#   4. Compute the changed-component set (shared compute_change_set.sh).
+#   5. Submit a *scratch* Control Tower build of the PR head for exactly those
+#      components (run_package_build.py --wait-for-completion). The build runs
+#      in Control Tower's own sandbox; this pipeline WAITS for it to reach a
+#      terminal state and fails the check if the build fails (or does not
+#      finish within the poll timeout). NO PR-controlled code is built on the
+#      CI agent -- only read-only change detection runs here.
+#
+# It deliberately does NOT reuse templates/steps/common-steps.yml: that shared
+# step set resolves the commit range via the *previous CI build* (the
+# post-merge delta logic), which is wrong for a PR. A PR range comes from the
+# merge commit's parents, computed inline below. Reusing common-steps would
+# also pull in the lock/render verify steps (already covered by the GitHub
+# Actions PR gates) and force a refactor of a file shared by two production
+# pipelines.
+#
+# Because it calls Control Tower, this pipeline needs the WIF service connection
+# and the Control Tower variable group (audience + base URL); the wrapper
+# supplies both as parameters.
+
+parameters:
+  - name: outputDirectory
+    type: string
+  - name: artifactBaseName
+    type: string
+  - name: containerImage
+    type: string
+  - name: poolType
+    type: string
+    default: linux
+  - name: serviceConnection
+    type: string
+  - name: variableGroup
+    type: string
+  # Control Tower package target for builds submitted from this pipeline
+  # (e.g. azl4 for the 4.0 branch, azl5 for 5.0). Bound per-branch by the
+  # wrapper so a branch's builds land in the correct target.
+  - name: packageTarget
+    type: string
+  - name: timeoutInMinutes
+    type: number
+  # Max seconds run_package_build.py waits for the Control Tower build to reach
+  # a terminal state. Keep below the job's timeoutInMinutes (above) so the
+  # script's own clear failure fires before ADO blunt-kills the job. Default
+  # 21600 = 6h (our worst-case build); the wrapper passes it alongside
+  # timeoutInMinutes so the two are raised together.
+  - name: pollTimeoutSeconds
+    type: number
+    default: 21600
+
+stages:
+  - stage: PRPackageBuild
+    jobs:
+      - job: PRPackageBuild
+        # Fail-loud: a failed submission, an immediate Control Tower rejection,
+        # or a build that fails (or never reaches a terminal state) turns the PR
+        # check red. The build runs in Control Tower's own sandbox -- NOT on
+        # this agent -- but this pipeline WAITS for it to finish
+        # (run_package_build.py --wait-for-completion). Size the timeout to
+        # cover the FULL build: it must exceed the script's pollTimeoutSeconds
+        # (6h default) so the script's own clear failure fires before ADO
+        # blunt-kills the job.
+        timeoutInMinutes: ${{ parameters.timeoutInMinutes }}
+        pool:
+          type: ${{ parameters.poolType }}
+        variables:
+          - group: ${{ parameters.variableGroup }}
+          - name: ob_outputDirectory
+            value: ${{ parameters.outputDirectory }}
+          - name: ob_artifactBaseName
+            value: ${{ parameters.artifactBaseName }}
+          - name: LinuxContainerImage
+            value: ${{ parameters.containerImage }}
+        steps:
+          # Full history: `azldev component changed` tree-diffs two commits and
+          # rpmautospec derives Release/changelog from `git log`. The CI
+          # checkout may be shallow (depth 1); unshallow once, up front. Never
+          # `git fetch --depth=N` afterwards — that re-shallows a full clone and
+          # silently corrupts the rpmautospec Release calculation.
+          - script: |
+              set -euo pipefail
+              if [ "$(git rev-parse --is-shallow-repository)" = "true" ]; then
+                echo "##[group]Fetching full git history"
+                git fetch --unshallow
+                echo "##[endgroup]"
+              fi
+            displayName: "Ensure full git history"
+
+          - task: PipAuthenticate@1
+            displayName: "Authenticate pip"
+            inputs:
+              artifactFeeds: "azl/ControlTowerFeed"
+
+          # azldev opens the repo with go-git, which rejects a config that
+          # declares the `worktreeconfig` extension while
+          # core.repositoryformatversion is still 0:
+          #   "core.repositoryformatversion does not support extension: worktreeconfig"
+          # Native git tolerates this, and the ADO agent checkout leaves the
+          # extension set, so strip it before any azldev invocation. Each CI run
+          # is a fresh checkout so this is safe and self-contained.
+          # TODO: remove this step once azldev no longer needs the workaround
+          # (go-git v6 fixes the underlying bug):
+          # https://github.com/microsoft/azure-linux-dev-tools/issues/241
+          - script: |
+              set -euo pipefail
+              if git config --get extensions.worktreeConfig >/dev/null 2>&1; then
+                echo "Removing extensions.worktreeConfig so go-git (azldev) can open the repo"
+                git config --unset-all extensions.worktreeConfig || true
+              fi
+            displayName: "Normalize git config for azldev (go-git)"
+
+          # Host deps for change detection + the Control Tower submission only:
+          # azldev (`azldev component changed` + git diff -- no mock, no build)
+          # and the Control Tower Python client. The build itself never runs on
+          # the agent; it runs asynchronously in Control Tower's own sandbox.
+          - script: |
+              set -euo pipefail
+              echo "##[group]Azldev (host, for change-set)"
+              # .azldev-version is read from the PR checkout (untrusted); reject a
+              # malformed/garbage value before it reaches `go install`.
+              AZLDEV_VERSION="$(tr -d '\n' < .azldev-version)"
+              if ! printf '%s' "$AZLDEV_VERSION" | grep -Eq '^[0-9A-Za-z._+-]+$'; then
+                echo "##[error].azldev-version is empty or has unexpected characters"
+                exit 1
+              fi
+              echo "Installing azldev@${AZLDEV_VERSION}..."
+              go install "github.com/microsoft/azure-linux-dev-tools/cmd/azldev@${AZLDEV_VERSION}"
+
+              go_bin_path="$(go env GOPATH)/bin"
+              echo "##vso[task.prependpath]$go_bin_path"
+
+              "$go_bin_path/azldev" --version
+              echo "##[endgroup]"
+
+              echo "##[group]Python dependencies (Control Tower client)"
+              pip install -r scripts/ci/control-tower/requirements.txt
+              echo "##[endgroup]"
+            displayName: "Install host dependencies"
+
+          # Resolve the PR commit range. A PR-policy build checks out the MERGE
+          # commit (Build.SourceVersion): parent ^1 is the target-branch tip,
+          # parent ^2 is the PR head. The diff ^1..^2 is exactly the PR's
+          # changes relative to the target branch. We read the range here and
+          # set pipeline variables so the wiring stays visible in the YAML.
+          - script: |
+              set -euo pipefail
+              if ! git rev-parse --verify -q "HEAD^2" >/dev/null; then
+                echo "##[error]HEAD is not a merge commit -- this pipeline must run as a PR build (Build.Reason=PullRequest)."
+                exit 1
+              fi
+              target_commit="$(git rev-parse HEAD^1)"
+              source_commit="$(git rev-parse HEAD^2)"
+              # PR-supplied data is untrusted: validate both SHAs before use.
+              for sha in "$target_commit" "$source_commit"; do
+                if [[ ! "$sha" =~ ^[0-9a-f]{40}$ ]]; then
+                  echo "##[error]invalid commit SHA: $sha"
+                  exit 1
+                fi
+              done
+              echo "Resolved range: target=$target_commit source=$source_commit"
+              echo "##vso[task.setvariable variable=sourceCommit;isreadonly=true]$source_commit"
+              echo "##vso[task.setvariable variable=targetCommit;isreadonly=true]$target_commit"
+            displayName: "Determine PR commit range"
+
+          # Compute the changed-component set with the shared, cross-pipeline
+          # single-source-of-truth helper (also used by the GitHub Actions PR
+          # gates). changed-components.json holds the per-component change
+          # records consumed by the Control Tower submit step below.
+          # compute_change_set.sh hard-fails on the supply-chain drift tripwire
+          # (sourcesChange without an identity change) -- a guard we want to keep
+          # on PRs. The script self-prefixes AZLDEV_ALLOW_ROOT=1 internally.
+          - script: |
+              set -euo pipefail
+              change_set_dir="$(Build.ArtifactStagingDirectory)/change-set"
+              echo "##[group]Preparing change set"
+              scripts/ci/components/compute_change_set.sh \
+                --output-dir "$change_set_dir" \
+                --source-commit "$SOURCE_COMMIT" \
+                --target-commit "$TARGET_COMMIT"
+              echo "##[endgroup]"
+              echo "##vso[task.setvariable variable=changedComponentsFile;isreadonly=true]$change_set_dir/changed-components.json"
+            env:
+              SOURCE_COMMIT: $(sourceCommit)
+              TARGET_COMMIT: $(targetCommit)
+            displayName: "Prepare change set"
+
+          # Submit a SCRATCH Control Tower build of the PR head for the changed
+          # components. Scratch = throwaway: it never persists to a production
+          # repo, so building unmerged PR code is safe. Scratch is the default
+          # (no --official-build); run_package_build.py additionally refuses an
+          # OFFICIAL build for a PR trigger. --wait-for-completion makes the
+          # script block until the build reaches a terminal state and fail the
+          # check on a build failure (or if it does not finish within
+          # --poll-timeout-seconds, 6h below -- our worst-case build). No PR
+          # code is built on this agent.
+          #
+          # This step assumes the pipeline is wired as a REVIEWER-GATED check in
+          # ADO (see the wrapper header): it should not auto-run on every PR
+          # push, so that a maintainer eyeballs the diff before unmerged code is
+          # submitted for a build.
+          - task: AzureCLI@2
+            displayName: "Submit scratch build to Control Tower"
+            inputs:
+              azureSubscription: ${{ parameters.serviceConnection }}
+              scriptType: bash
+              scriptLocation: inlineScript
+              inlineScript: |
+                set -euo pipefail
+
+                # --poll-timeout-seconds comes from the pollTimeoutSeconds
+                # parameter (6h default = our worst-case build). Keep it below
+                # the job's timeoutInMinutes (wrapper) so the script's own clear
+                # failure fires before ADO blunt-kills the job.
+                python3 scripts/ci/control-tower/run_package_build.py \
+                  --api-audience "$API_AUDIENCE" \
+                  --api-base-url "$API_BASE_URL" \
+                  --build-reason "$CT_BUILD_REASON" \
+                  --changed-components-file "$CHANGED_COMPONENTS_FILE" \
+                  --package-target "${{ parameters.packageTarget }}" \
+                  --commit-sha "$SOURCE_COMMIT" \
+                  --repo-uri "$UPSTREAM_REPO_URL" \
+                  --wait-for-completion \
+                  --poll-timeout-seconds ${{ parameters.pollTimeoutSeconds }}
+            env:
+              API_AUDIENCE: $(ApiAudience)
+              API_BASE_URL: $(ApiBaseDirectUrl)
+              # Non-reserved name: an `env:` override of the reserved BUILD_REASON var is silently ignored by the agent.
+              CT_BUILD_REASON: $(Build.Reason)
+              CHANGED_COMPONENTS_FILE: $(changedComponentsFile)
+              SOURCE_COMMIT: $(sourceCommit)
+              UPSTREAM_REPO_URL: $(Build.Repository.Uri)
diff --git a/.github/workflows/containers/azldev-runner.Dockerfile b/.github/workflows/containers/azldev-runner.Dockerfile
index 7f733883e4e..fe2fe36be8b 100644
--- a/.github/workflows/containers/azldev-runner.Dockerfile
+++ b/.github/workflows/containers/azldev-runner.Dockerfile
@@ -39,6 +39,17 @@ RUN tdnf -y install \
 # root.  Callers (check-rendered-specs.yml, etc.) read the file and pass it
 # via --build-arg so the Dockerfile never needs repo-root build context.
 # No default — omitting --build-arg will fail the build loudly.
+# Optional Go module proxy for the `go install` below. Callers that build
+# behind an internal-only proxy forward it via --build-arg GOPROXY=...; Docker
+# exposes a declared ARG to the RUN environment, which `go install` reads.
+# Callers with public egress (e.g. the GitHub Actions render gate) simply omit
+# the build-arg: an *omitted* ARG (no default declared) stays UNSET in the RUN
+# environment, and an unset GOPROXY is what makes Go fall back to its built-in
+# default proxy — a no-op for them. NOTE(review): `go help environment` says an
+# unset OR empty variable selects the default, so --build-arg GOPROXY="" should
+# behave like omitting it (GOPROXY=off is what disables downloads) -- confirm.
+# The ADO/OneBranch PR build forwards the host's internal proxy.
+ARG GOPROXY
 ARG AZLDEV_VERSION
 RUN test -n "${AZLDEV_VERSION}" || { echo "ERROR: AZLDEV_VERSION build-arg is required (read from .azldev-version)" >&2; exit 1; } \
     && GOBIN=/usr/local/bin go install \
diff --git a/scripts/ci/components/README.md b/scripts/ci/components/README.md
index c7db2dd173a..8f16d9a8ce7 100644
--- a/scripts/ci/components/README.md
+++ b/scripts/ci/components/README.md
@@ -1,9 +1,11 @@
 # Shared azldev component helpers
 
-Pipeline-agnostic shell + Python helpers consumed by both the GitHub Actions
-PR gates (`.github/workflows/check-rendered-specs.yml`) and the ADO
+Pipeline-agnostic shell + Python helpers consumed by the GitHub Actions
+PR gates (`.github/workflows/check-rendered-specs.yml`), the ADO
 Control Tower integration pipeline
-(`.github/workflows/ado/templates/sources-upload-stages.yml`).
+(`.github/workflows/ado/templates/sources-upload-stages.yml`), and the ADO
+PR package-build check
+(`.github/workflows/ado/templates/pr-package-build-stages.yml`).
 
 | Script | Purpose |
 | ------ | ------- |
@@ -28,3 +30,4 @@ Control Tower integration pipeline
 
 - `check-rendered-specs.yml` `render` job → `compute_change_set.sh`
 - `sources-upload-stages.yml` "Prepare change set" step → `compute_change_set.sh`
+- `pr-package-build-stages.yml` "Prepare change set" step → `compute_change_set.sh`
diff --git a/scripts/ci/control-tower/client.py b/scripts/ci/control-tower/client.py
index 2b824f409da..59477daadb5 100644
--- a/scripts/ci/control-tower/client.py
+++ b/scripts/ci/control-tower/client.py
@@ -14,10 +14,12 @@
     ``DefaultAzureCredential`` discovers the session automatically.
 """
 
+from __future__ import annotations
+
 import json
 import time
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any
 
 import requests
 from azure.identity import DefaultAzureCredential
@@ -29,6 +31,14 @@
 NON_TERMINAL_STATUSES = frozenset({"Queued", "Pending", "Running"})
 SUCCESS_STATUS = "Completed"
 TERMINAL_FAILURE_STATUSES = frozenset({"Failed", "Cancelled", "CancelledByAdmin", "Unknown", "TimedOut"})
+# Statuses that END the poll. The poll exits ONLY on a status in this set;
+# anything else is treated as still in progress (keep polling until a known
+# terminal status or the local timeout). This way a newly-introduced Control
+# Tower intermediate status (e.g. a future "Validating") is not misread as
+# terminal and used to fail a build that is actually still starting. "Unknown"
+# stays terminal on purpose: a missing/blank status is a real problem, not an
+# unrecognized-but-valid new state.
+TERMINAL_STATUSES = TERMINAL_FAILURE_STATUSES | {SUCCESS_STATUS}
 
 
 @dataclass
@@ -41,17 +51,16 @@ class TokenHolder:
 def make_session() -> requests.Session:
     """Create a ``requests.Session`` with retries for idempotent GETs only.
 
-    Retry budget is tuned to complete quickly relative to the 10s default poll
-    interval: worst case ~7s of backoff (0.5 + 1 + 2 + 4s capped) across 3
-    attempts on 429/5xx.
+    6 retries with exponential backoff (0+4+8+16+32+64 = ~124 s, ~2 min worst
+    case; Retry-After honored).
     """
     session = requests.Session()
     retry = Retry(
-        total=3,
-        connect=3,
-        read=3,
-        status=3,
-        backoff_factor=0.5,
+        total=6,
+        connect=6,
+        read=6,
+        status=6,
+        backoff_factor=2.0,
         status_forcelist=(429, 500, 502, 503, 504),
         allowed_methods=frozenset({"GET"}),
         raise_on_status=False,
@@ -142,7 +151,7 @@ def _request_with_refresh(
     audience: str,
     token_holder: TokenHolder,
     *,
-    json_payload: Optional[dict] = None,
+    json_payload: dict | None = None,
 ) -> requests.Response:
     """Issue a request. On a 401, refresh the bearer token once and retry."""
     response = session.request(
@@ -248,6 +257,26 @@ def _summarize_tasks(tasks: Any) -> str:
     return f"{total} tasks ({parts})"
 
 
+# Adaptive poll cadence: (elapsed-seconds threshold, interval-seconds). Tight
+# early so short jobs stay responsive; backs off for long builds so a multi-hour
+# build does not flood the logs with heartbeats (a fixed 10s interval would be
+# ~2160 polls over 6h). Beyond the last threshold, _POLL_MAX_INTERVAL_SECONDS.
+_POLL_SCHEDULE: tuple[tuple[int, int], ...] = (
+    (600, 10),  # first 10 min: every 10s
+    (1200, 30),  # 10-20 min: every 30s
+    (3600, 60),  # 20-60 min: every 60s
+)
+_POLL_MAX_INTERVAL_SECONDS = 120  # beyond 1 h: every 2 min
+
+
+def _poll_interval_seconds(elapsed_seconds: float) -> int:
+    """Return the interval of the first ``_POLL_SCHEDULE`` tier not yet reached; past the last, the cap."""
+    for threshold_seconds, interval_seconds in _POLL_SCHEDULE:
+        if elapsed_seconds < threshold_seconds:
+            return interval_seconds
+    return _POLL_MAX_INTERVAL_SECONDS
+
+
 def poll_until_terminal(
     session: requests.Session,
     base_url: str,
@@ -255,7 +284,6 @@ def poll_until_terminal(
     audience: str,
     token_holder: TokenHolder,
     job_id: str,
-    poll_interval_seconds: int,
     poll_timeout_seconds: int,
 ) -> tuple[dict, bool]:
     """Poll the job status until it reaches a terminal state or the timeout expires.
@@ -269,7 +297,7 @@ def poll_until_terminal(
     """
     start = time.monotonic()
     deadline = start + poll_timeout_seconds
-    previous_status: Optional[str] = None
+    previous_status: str | None = None
     job_status_object: dict = {}
 
     while True:
@@ -285,6 +313,18 @@ def poll_until_terminal(
                 f"Job {job_id} status: {transition} (elapsed {elapsed}s){suffix}",
                 flush=True,
             )
+            # Surface schema drift: a status that is neither known-terminal nor
+            # known-non-terminal means Control Tower introduced a state this
+            # script doesn't know about. We keep polling (treat it as
+            # non-terminal) so an in-flight build isn't failed, but warn so the
+            # gap gets closed.
+            if current_status not in TERMINAL_STATUSES and current_status not in NON_TERMINAL_STATUSES:
+                print(
+                    f"##[warning]Unrecognized job status '{current_status}' for job {job_id}; "
+                    "treating it as non-terminal and continuing to poll. If Control Tower added a "
+                    "new status, update NON_TERMINAL_STATUSES / TERMINAL_* in client.py.",
+                    flush=True,
+                )
             previous_status = current_status
         else:
             # Heartbeat so the user can see the script is alive and still polling.
@@ -293,7 +333,11 @@ def poll_until_terminal(
                 flush=True,
             )
 
-        if current_status not in NON_TERMINAL_STATUSES:
+        # Exit ONLY on a known terminal status. An unrecognized status falls
+        # through and keeps polling (bounded by the timeout) rather than being
+        # misread as terminal -- which previously turned a still-starting build
+        # red the moment Control Tower reported a status we didn't enumerate.
+        if current_status in TERMINAL_STATUSES:
             return job_status_object, False
 
         remaining = deadline - time.monotonic()
@@ -304,7 +348,8 @@ def poll_until_terminal(
             )
             return job_status_object, True
 
-        time.sleep(min(poll_interval_seconds, max(1, int(remaining))))
+        interval_seconds = _poll_interval_seconds(elapsed)
+        time.sleep(min(interval_seconds, max(1, int(remaining))))
 
 
 def print_final_status(final: dict) -> None:
diff --git a/scripts/ci/control-tower/run_package_build.py b/scripts/ci/control-tower/run_package_build.py
index 68db9d3185d..ec5d8ea6867 100644
--- a/scripts/ci/control-tower/run_package_build.py
+++ b/scripts/ci/control-tower/run_package_build.py
@@ -1,66 +1,81 @@
-"""Submit a package-build job to the Control Tower service and wait briefly.
+"""Submit a package-build job to the Control Tower service and poll its status.
 
 Flow:
-    1. Read the changed-components JSON.
+    1. Read the changed-components JSON; an unrecognized ``changeType`` fails
+       the check closed.
     2. Filter to the build set: ``changeType in {added, changed}`` -- any
        component whose inputs changed needs a rebuild, regardless of whether
        its ``sourcesChange`` flag is set.
     3. POST ``/api/Scenario/package`` with the build request.
-    4. Poll briefly (default 5 min) until the job reaches a terminal state
-       (success or failure) or the local timeout expires. The goal is to
-       catch jobs that fail immediately on submission, not to wait for the
-       full build -- a non-terminal status at timeout is treated as
-       acceptance and the build continues async.
-    5. Exit 0 if the job started (or completed). Exit 1 only on submission
-       failure or immediate terminal failure.
+    4. Poll until the job reaches a terminal state (success or failure) or the
+       poll timeout expires. Two modes:
+         * default (acceptance): poll briefly just to catch jobs that fail on
+           submission; a non-terminal status at timeout is treated as
+           acceptance and the build continues asynchronously.
+         * --wait-for-completion: poll for the full build; a non-terminal
+           status at timeout is a failure (for gating checks that must see the
+           build verdict before passing).
+    5. Exit 0 on success (or acceptance in the default mode); exit 1 on
+       submission failure, terminal build failure, or -- with
+       --wait-for-completion -- if the build does not finish within the timeout.
 """
 
+from __future__ import annotations
+
 import argparse
 import json
 import sys
 from pathlib import Path
 
-from azure.identity import DefaultAzureCredential
-
 import client as ct
+from azure.identity import DefaultAzureCredential
 
 
 def _load_build_components(path: Path) -> list[str]:
     """Filter the ``azldev component changed`` JSON to the build set.
 
     The build set is every component with ``changeType`` in ``{added, changed}``
-    — these are the components whose inputs differ between source and target
+    -- these are the components whose inputs differ between source and target
     and therefore need a rebuild. Unlike the upload set, we do NOT filter on
     ``sourcesChange`` here: a component can need a rebuild even if its source
     tarballs didn't change (e.g. an overlay or build-config change).
 
     Deleted components are excluded — there is nothing to build.
     """
+    known_change_types = {"added", "changed", "unchanged", "deleted"}
+    build_change_types = {"added", "changed"}
     try:
         raw = path.read_text(encoding="utf-8")
     except OSError as exc:
-        raise SystemExit(f"##[error]Failed to read --changed-components-file {path!s}: {exc}") from exc
+        print(f"##[error]Failed to read --changed-components-file {path!s}: {exc}")
+        raise SystemExit(1) from exc
 
     try:
         entries = json.loads(raw)
     except json.JSONDecodeError as exc:
-        raise SystemExit(f"##[error]--changed-components-file {path!s} is not valid JSON: {exc}") from exc
+        print(f"##[error]--changed-components-file {path!s} is not valid JSON: {exc}")
+        raise SystemExit(1) from exc
 
     if not isinstance(entries, list):
-        raise SystemExit(
+        print(
             f"##[error]--changed-components-file {path!s} top-level value "
             f"must be a JSON array (got {type(entries).__name__})."
         )
+        raise SystemExit(1)
 
-    build_change_types = {"added", "changed"}
     components: list[str] = []
     for entry in entries:
-        if not isinstance(entry, dict):
-            continue
-        if entry.get("changeType") in build_change_types:
-            name = entry.get("component")
-            if isinstance(name, str) and name:
-                components.append(name)
+        record = entry if isinstance(entry, dict) else {}
+        change_type, name = record.get("changeType"), record.get("component")
+        known = isinstance(change_type, str) and change_type in known_change_types
+        if not known or (change_type in build_change_types and not (isinstance(name, str) and name)):
+            print(
+                f"##[error]--changed-components-file {path!s} has a bad entry: changeType={change_type!r} "
+                f"(known: {sorted(known_change_types)}), component={name!r}; refusing to guess the build set."
+            )
+            raise SystemExit(1)
+        if change_type in build_change_types:
+            components.append(name)
 
     return sorted(set(components))
 
@@ -82,8 +97,9 @@ def _parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--build-reason",
         required=True,
-        help="ADO build reason (PullRequest, IndividualCI, ...). Used for the "
-        "local skip guard -- package builds are not submitted for PR triggers.",
+        help="ADO build reason (PullRequest, IndividualCI, ...). A PullRequest "
+        "may submit a SCRATCH build, but an official (persisted) build is "
+        "refused for a PullRequest.",
     )
     parser.add_argument(
         "--changed-components-file",
@@ -117,13 +133,8 @@ def _parse_args() -> argparse.Namespace:
         default=False,
         help="Submit as a non-scratch (official, persisted) build. The default "
         "is to submit a scratch build -- official is opt-in so the caller has "
-        "to explicitly say they want a persisted artifact.",
-    )
-    parser.add_argument(
-        "--poll-interval-seconds",
-        type=int,
-        default=10,
-        help="How often to poll the job status endpoint (default: 10).",
+        "to explicitly say they want a persisted artifact. Official builds are "
+        "rejected for PullRequest triggers (unmerged code must never persist).",
     )
     parser.add_argument(
         "--poll-timeout-seconds",
@@ -131,20 +142,66 @@ def _parse_args() -> argparse.Namespace:
         default=600,
         help=(
             "Maximum time to wait for the job to reach a terminal state "
-            "(default: 600 = 10 min). This is NOT the build timeout -- we "
-            "just want to catch jobs that fail immediately on submission. "
-            "A non-terminal status at timeout is treated as acceptance."
+            "(default: 600 = 10 min). In the default acceptance mode this just "
+            "catches jobs that fail immediately on submission. With "
+            "--wait-for-completion, set this to the full build budget -- a "
+            "non-terminal status at timeout then fails the run."
         ),
     )
+    parser.add_argument(
+        "--wait-for-completion",
+        action="store_true",
+        default=False,
+        help="Block until the build reaches a terminal state (success or "
+        "failure) and exit accordingly; a non-terminal status at "
+        "--poll-timeout-seconds becomes a failure. Used by gating checks (the "
+        "PR package-build pipeline). The default fire-and-forget mode instead "
+        "treats a timeout as acceptance.",
+    )
     return parser.parse_args()
 
 
+def _build_payload(args: argparse.Namespace, components: list[str]) -> dict[str, object]:
+    """Return the JSON body for a Control Tower ``package`` request (unset optional refs omitted)."""
+    body: dict[str, object] = {
+        "repoUri": args.repo_uri,
+        "packageTarget": args.package_target,
+        "packages": components,
+        "isScratchBuild": not args.official_build,  # scratch unless official was requested
+        "buildReason": args.build_reason,
+    }
+    # commitSha / branch are appended last, and only when the caller supplied them.
+    optional_refs = {"commitSha": args.commit_sha, "branch": args.branch}
+    supplied = {key: ref for key, ref in optional_refs.items() if ref is not None}
+    body.update(supplied)
+    return body
+
+
+def _handle_non_terminal(args: argparse.Namespace, job_id: str, final: dict[str, object]) -> None:
+    """Report a poll that timed out while the job was still in a non-terminal status.
+
+    Gating runs (--wait-for-completion) print an ##[error] line and exit 1;
+    fire-and-forget runs print an acceptance note and return normally.
+    """
+    status = final.get("status", "Unknown")
+    budget = args.poll_timeout_seconds
+    if not args.wait_for_completion:
+        # Acceptance mode: the build keeps running asynchronously.
+        print(
+            f"Job {job_id} still in non-terminal status '{status}' "
+            f"after {budget}s -- build accepted. "
+            "Monitor progress in the Control Tower UI."
+        )
+        return
+    timeout_note = f"##[error]Job {job_id} did not reach a terminal state within {budget}s"
+    print(f"{timeout_note} (last status '{status}') -- failing the check.")
+    sys.exit(1)
+
+
 def main() -> None:
+    """Submit a package build to Control Tower and (optionally) wait for the verdict."""
     args = _parse_args()
 
-    if args.poll_interval_seconds <= 0:
-        print("##[error]--poll-interval-seconds must be a positive integer.")
-        sys.exit(2)
     if args.poll_timeout_seconds <= 0:
         print("##[error]--poll-timeout-seconds must be a positive integer.")
         sys.exit(2)
@@ -153,29 +210,23 @@ def main() -> None:
 
     base_url = args.api_base_url.rstrip("/")
 
-    if args.build_reason == "PullRequest":
+    # Unmerged PR code may only produce a throwaway scratch build; an official
+    # (persisted) build of a pull request must never happen. Scratch PR builds
+    # ARE allowed -- the PR package-build check relies on them, and capacity is
+    # bounded by the reviewer-gated pipeline trigger, not here.
+    if args.build_reason == "PullRequest" and args.official_build:
         print(
-            "Skipping Control Tower call -- pull request triggers do not submit "
-            "package builds (unmerged code should not consume build capacity)."
+            "##[error]Refusing to submit an official (persisted) build for a "
+            "pull request -- unmerged code must never produce official artifacts."
         )
-        return
+        sys.exit(1)
 
     if not components:
         print("No components need a rebuild -- skipping package-build submission.")
         return
 
     # ── Build payload ────────────────────────────────────────────────
-    payload: dict = {
-        "repoUri": args.repo_uri,
-        "packageTarget": args.package_target,
-        "packages": components,
-        "isScratchBuild": not args.official_build,
-        "buildReason": args.build_reason,
-    }
-    if args.commit_sha is not None:
-        payload["commitSha"] = args.commit_sha
-    if args.branch is not None:
-        payload["branch"] = args.branch
+    payload = _build_payload(args, components)
 
     print("Calling Control Tower 'package' endpoint...")
     print("Payload:")
@@ -211,11 +262,8 @@ def main() -> None:
         print("##[error]Control Tower 'package' response did not include a 'jobId'. Cannot confirm job acceptance.")
         sys.exit(1)
 
-    # ── Brief poll — just confirm the job was accepted ───────────────
-    print(
-        f"Polling job {job_id} for up to {args.poll_timeout_seconds}s to confirm "
-        f"acceptance (not waiting for full build completion)..."
-    )
+    # ── Poll for a terminal status ─────────────────────────────────
+    print(f"Polling job {job_id} for up to {args.poll_timeout_seconds}s for a terminal status...")
     try:
         final, timed_out = ct.poll_until_terminal(
             session,
@@ -224,7 +272,6 @@ def main() -> None:
             args.api_audience,
             token_holder,
             job_id,
-            args.poll_interval_seconds,
             args.poll_timeout_seconds,
         )
     except RuntimeError as exc:
@@ -232,16 +279,7 @@ def main() -> None:
         sys.exit(1)
 
     if timed_out:
-        # We don't wait for full build completion -- the goal of this poll
-        # is just to surface a fast-failing job. A non-terminal status at
-        # the timeout is acceptance enough; the build continues async and
-        # is monitored in the Control Tower UI.
-        last_status = final.get("status", "Unknown")
-        print(
-            f"Job {job_id} still in non-terminal status '{last_status}' "
-            f"after {args.poll_timeout_seconds}s -- build accepted. "
-            f"Monitor progress in the Control Tower UI."
-        )
+        _handle_non_terminal(args, job_id, final)
         return
 
     ct.print_final_status(final)
diff --git a/scripts/ci/control-tower/run_prcheck.py b/scripts/ci/control-tower/run_prcheck.py
index aac9b3830e7..20aaef5090d 100644
--- a/scripts/ci/control-tower/run_prcheck.py
+++ b/scripts/ci/control-tower/run_prcheck.py
@@ -18,14 +18,15 @@
     lookaside tarballs need to be (re-)uploaded.
 """
 
+from __future__ import annotations
+
 import argparse
 import json
 import sys
 from pathlib import Path
 
-from azure.identity import DefaultAzureCredential
-
 import client as ct
+from azure.identity import DefaultAzureCredential
 
 
 def _parse_components(value: str) -> list[str]:
@@ -121,12 +122,6 @@ def _parse_args() -> argparse.Namespace:
         help="Target branch name (alternative to --target-commit)",
     )
     parser.add_argument("--repo-uri", required=True, help="Upstream repository URI")
-    parser.add_argument(
-        "--poll-interval-seconds",
-        type=int,
-        default=10,
-        help="How often to poll the job status endpoint (default: 10).",
-    )
     parser.add_argument(
         "--poll-timeout-seconds",
         type=int,
@@ -139,9 +134,6 @@ def _parse_args() -> argparse.Namespace:
 def main() -> None:
     args = _parse_args()
 
-    if args.poll_interval_seconds <= 0:
-        print("##[error]--poll-interval-seconds must be a positive integer.")
-        sys.exit(2)
     if args.poll_timeout_seconds <= 0:
         print("##[error]--poll-timeout-seconds must be a positive integer.")
         sys.exit(2)
@@ -214,7 +206,7 @@ def main() -> None:
         sys.exit(1)
 
     # ── Poll for job completion ──────────────────────────────────────
-    print(f"Polling job {job_id} every {args.poll_interval_seconds}s (timeout {args.poll_timeout_seconds}s)...")
+    print(f"Polling job {job_id} for up to {args.poll_timeout_seconds}s for a terminal status...")
     try:
         final, timed_out = ct.poll_until_terminal(
             session,
@@ -223,7 +215,6 @@ def main() -> None:
             args.api_audience,
             token_holder,
             job_id,
-            args.poll_interval_seconds,
             args.poll_timeout_seconds,
         )
     except RuntimeError as exc: