microsoft · dmcilvaney · Jun 20, 2026
@@ -17,9 +17,17 @@
 #
 # Security: the PR checkout is data-only. We never run PR scripts, and the
 # runner Dockerfile + build context come from the trusted base checkout. The
-# azldev binary is built from the version pinned in the PR's .azldev-version
-# (so a version-bump PR is validated with the binary that will actually run
-# post-merge). The Dockerfile hardcodes the module path to the Microsoft-owned
+# azldev binary is built from the POST-merge .azldev-version -- the value the
+# file will have on 4.0 once this PR lands -- read from GitHub's test-merge
+# commit (resolved by polling the PR's mergeability) via the Contents API, so no merge tree is ever checked
+# out. This validates every PR against the binary that will actually run
+# post-merge: a version-bump PR against its new binary, and a behind-base PR
+# against the base branch's current binary (not its own stale pin). Note that
+# only the azldev *binary version* is taken post-merge; the spec/overlay/lock
+# *data* is rendered from the PR-head worktree as-is, not from a materialized
+# merge tree -- so this catches binary-driven drift, not drift that would
+# surface only once the PR and base content are textually merged. The
+# Dockerfile hardcodes the module path to the Microsoft-owned
 # github.com/microsoft/azure-linux-dev-tools repo, so the PR can only choose a
 # *ref* (tag/branch/commit) -- it cannot redirect `go install` to a different
 # module. This is NOT the same as "only Microsoft-authored code runs": a ref
@@ -71,8 +79,17 @@ jobs:
     timeout-minutes: 60
     permissions:
       contents: read
+      pull-requests: read # poll PR mergeability + merge_commit_sha to resolve the post-merge azldev version
     outputs:
       patch-url: ${{ steps.upload-locks-patch.outputs.artifact-url }}
+      # NOTE: azldev-version / render-all are shared CI *context* (the resolved
+      # post-merge binary version), not lock/drift state -- update_locks just
+      # happens to be the single place that resolves them (it already polls PR
+      # mergeability). Resolved once here and consumed by the render job via
+      # needs.update_locks.outputs.*, so both check drift against the same
+      # post-merge azldev version -- no second, possibly-divergent poll.
+      azldev-version: ${{ steps.resolve.outputs.azldev-version }}
+      render-all: ${{ steps.resolve.outputs.render-all }}
     steps:
       # --- Trusted base branch (tools, scripts, container config) ---
       - name: Checkout base (trusted)
@@ -92,22 +109,138 @@ jobs:
           fetch-depth: 0
           persist-credentials: false
 
-      - name: Build azldev runner container
+      # Resolve the azldev version that will be in effect on 4.0 AFTER this PR
+      # merges -- the binary that will actually render specs post-merge -- and
+      # whether it differs from the base pin (RENDER_ALL). Done ONCE here and
+      # exposed as job outputs; the render job (which `needs:` this one) consumes
+      # them rather than polling a second time, which could resolve a different
+      # merge commit if the base advances between the two jobs.
+      - name: Resolve post-merge azldev version
+        id: resolve
+        env:
+          REPO: ${{ inputs.repo }}
+          PR_NUMBER: ${{ inputs.pr-number }}
+          PR_HEAD_SHA: ${{ inputs.pr-head-sha }}
+          GH_TOKEN: ${{ github.token }}
         run: |
           set -euo pipefail
-          # Build with the azldev version the PR proposes (pr-head/.azldev-version),
-          # NOT the base branch's. For a version-bump PR these differ, and the whole
-          # point of the drift check is to run with the binary that will actually run
-          # on 4.0 post-merge -- the base binary would pass the check and then drift
-          # the moment the bump lands. Only the version string comes from the PR; the
-          # Dockerfile + build context are the trusted base checkout, and the
-          # Dockerfile hardcodes the module path to the Microsoft-owned repo.
-          AZLDEV_VERSION="$(tr -d '\n' < pr-head/.azldev-version)"
-          # Hygiene: reject a malformed/garbage version before it reaches docker build.
-          if ! printf '%s' "$AZLDEV_VERSION" | grep -Eq '^[0-9A-Za-z._+-]+$'; then
-            echo "::error::pr-head/.azldev-version is empty or has unexpected characters"
+          # The post-merge azldev binary is the .azldev-version at GitHub's
+          # test-merge commit. We read just that one file via the Contents API --
+          # no merge tree is checked out, and only this regex-validated version
+          # string crosses to the host; the Dockerfile + build context are the
+          # trusted base checkout, and the Dockerfile hardcodes the module path
+          # to the Microsoft-owned repo.
+          #
+          # GitHub computes mergeability (and the test-merge commit)
+          # ASYNCHRONOUSLY, and the event payload's merge_commit_sha can be empty
+          # OR stale, so we never trust it -- always poll the PR endpoint for the
+          # authoritative current value.
+          if ! printf '%s' "$PR_NUMBER" | grep -Eq '^[0-9]+$'; then
+            echo "::error::pr-number is not a positive integer: '$PR_NUMBER'"
             exit 1
           fi
+          if ! printf '%s' "$REPO" | grep -Eq '^[A-Za-z0-9._-]+/[A-Za-z0-9._-]+$'; then
+            echo "::error::repo is not a valid owner/repo: '$REPO'"
+            exit 1
+          fi
+          merge_sha=""
+          # ~120s budget: usually instant once warm, but a large PR in a busy
+          # monorepo can take a while for GitHub to compute mergeability.
+          for attempt in {1..30}; do
+            # A transient API error (429/5xx/connectivity) should retry, not kill
+            # the job; only a definitive mergeable=false fails closed.
+            if ! pr_json="$(gh api "repos/${REPO}/pulls/${PR_NUMBER}" 2>/dev/null)"; then
+              echo "PR API call failed (attempt ${attempt}/30); retrying in 4s..."
+              sleep 4
+              continue
+            fi
+            # The checkout steps pin the PR head to the triggering event's
+            # SHA, but this poll reads the PR's CURRENT state. If the author
+            # pushed a newer head after the event, GitHub would compute the
+            # test-merge against that newer head while we render the event's
+            # (older) head -- a binary-vs-data mismatch. Bail; cancel-in-progress
+            # lets the newer run supersede this one.
+            polled_head="$(jq -r '.head.sha' <<<"$pr_json")"
+            if [ "$polled_head" != "$PR_HEAD_SHA" ]; then
+              echo "::error::PR head advanced ('$PR_HEAD_SHA' -> '$polled_head') since this run started; a newer run supersedes this one."
+              exit 1
+            fi
+            mergeable="$(jq -r '.mergeable' <<<"$pr_json")"
+            if [ "$mergeable" = "false" ]; then
+              echo "::error::PR is not mergeable (conflicts with the base branch) -- rebase / resolve conflicts, then re-run."
+              exit 1
+            fi
+            if [ "$mergeable" = "true" ]; then
+              # GitHub computes `mergeable` and `merge_commit_sha` independently
+              # and asynchronously, so `mergeable=true` can briefly coexist with
+              # a null merge_commit_sha. Only accept a fully-formed 40-hex SHA;
+              # otherwise keep polling rather than breaking out with an empty
+              # value (which would fail the post-loop check and force a needless
+              # manual re-run).
+              candidate_sha="$(jq -r '.merge_commit_sha // ""' <<<"$pr_json")"
+              if printf '%s' "$candidate_sha" | grep -Eq '^[0-9a-f]{40}$'; then
+                merge_sha="$candidate_sha"
+                break
+              fi
+              echo "PR is mergeable but GitHub has not published the test-merge commit yet (attempt ${attempt}/30); waiting 4s..."
+            else
+              echo "GitHub has not finished computing mergeability yet (attempt ${attempt}/30); waiting 4s..."
+            fi
+            sleep 4
+          done
+          if ! printf '%s' "$merge_sha" | grep -Eq '^[0-9a-f]{40}$'; then
+            echo "::error::Could not resolve the test-merge commit SHA after ~120s (GitHub never finished computing mergeability / publishing the test-merge commit, or the PR API calls kept failing -- see the per-attempt messages above). Re-run the job."
+            exit 1
+          fi
+          # Read the post-merge .azldev-version from the test-merge commit.
+          # `tr -d '\n'` strips ALL newlines (command substitution only removes
+          # the trailing one) BEFORE validating: the value flows into
+          # $GITHUB_OUTPUT as `azldev-version=<value>`, and $GITHUB_OUTPUT is
+          # parsed line-by-line with last-write-wins. Without collapsing to a
+          # single line, a multi-line file could smuggle a second,
+          # attacker-controlled `azldev-version=` line past the line-oriented
+          # `grep -q` below (which passes if ANY single line matches) and then
+          # override the output -- the value later flows into a `docker build
+          # --build-arg` and the Dockerfile's `RUN ... go install ...@<value>`,
+          # so it MUST be fully validated as a single token.
+          # The Contents API can briefly 404/5xx on a freshly-computed merge
+          # commit, so retry transient read failures instead of failing the job.
+          azldev_version=""
+          for attempt in {1..5}; do
+            if azldev_version="$(gh api -H 'Accept: application/vnd.github.raw' \
+                "repos/${REPO}/contents/.azldev-version?ref=${merge_sha}" | tr -d '\n')"; then
+              break
+            fi
+            echo "Could not read .azldev-version at the test-merge commit (attempt ${attempt}/5); retrying in 4s..."
+            azldev_version=""
+            sleep 4
+          done
+          if ! printf '%s' "$azldev_version" | grep -Eq '^[0-9A-Za-z._+-]+$'; then
+            echo "::error::post-merge .azldev-version is empty or has unexpected characters (could not read it from the test-merge commit, or its content is invalid)"
+            exit 1
+          fi
+          # render-all when the post-merge version differs from the base branch's
+          # CURRENT pin (.azldev-version in the trusted base checkout) -- i.e.
+          # this PR changes the azldev binary on 4.0, which can alter rendered
+          # output for ANY component, so a PR-scoped render would miss that
+          # drift. A behind-base PR (post-merge == base) takes the scoped path.
+          base_version="$(cat .azldev-version)"
+          if [ "$azldev_version" != "$base_version" ]; then
+            render_all=true
+          else
+            render_all=false
+          fi
+          echo "Resolved post-merge azldev version: ${azldev_version} (test-merge ${merge_sha}); RENDER_ALL=${render_all} (base pin ${base_version})"
+          {
+            echo "azldev-version=${azldev_version}"
+            echo "render-all=${render_all}"
+          } >> "$GITHUB_OUTPUT"
+
+      - name: Build azldev runner container
+        env:
+          AZLDEV_VERSION: ${{ steps.resolve.outputs.azldev-version }}
+        run: |
+          set -euo pipefail
           docker build \
             --build-arg UID="$(id -u)" \
             --build-arg AZLDEV_VERSION="$AZLDEV_VERSION" \
@@ -321,21 +454,13 @@ jobs:
           persist-credentials: false
 
       - name: Build azldev runner container
+        # The azldev version is resolved once by the update_locks job (which this
+        # job `needs:`) and consumed here, so render checks drift against the
+        # SAME post-merge binary update_locks validated the locks against.
+        env:
+          AZLDEV_VERSION: ${{ needs.update_locks.outputs.azldev-version }}
         run: |
           set -euo pipefail
-          # Build with the azldev version the PR proposes (pr-head/.azldev-version),
-          # NOT the base branch's. For a version-bump PR these differ, and the whole
-          # point of the drift check is to run with the binary that will actually run
-          # on 4.0 post-merge -- the base binary would pass the check and then drift
-          # the moment the bump lands. Only the version string comes from the PR; the
-          # Dockerfile + build context are the trusted base checkout, and the
-          # Dockerfile hardcodes the module path to the Microsoft-owned repo.
-          AZLDEV_VERSION="$(tr -d '\n' < pr-head/.azldev-version)"
-          # Hygiene: reject a malformed/garbage version before it reaches docker build.
-          if ! printf '%s' "$AZLDEV_VERSION" | grep -Eq '^[0-9A-Za-z._+-]+$'; then
-            echo "::error::pr-head/.azldev-version is empty or has unexpected characters"
-            exit 1
-          fi
           docker build \
             --build-arg UID="$(id -u)" \
             --build-arg AZLDEV_VERSION="$AZLDEV_VERSION" \
@@ -366,6 +491,10 @@ jobs:
           PR_BASE_SHA: ${{ inputs.pr-base-sha }}
           PR_HEAD_SHA: ${{ inputs.pr-head-sha }}
           BASE_REPO: ${{ inputs.repo }}
+          # Resolved once by update_locks (post-merge azldev version != base
+          # pin). "true" => render ALL components (the azldev binary changes on
+          # 4.0); "false" => PR-scoped render.
+          RENDER_ALL: ${{ needs.update_locks.outputs.render-all }}
         run: |
           set -euo pipefail
           mkdir -p "$WORKSPACE/render-output"
@@ -385,30 +514,30 @@ jobs:
             -e PR_BASE_SHA \
             -e PR_HEAD_SHA \
             -e BASE_REPO \
+            -e RENDER_ALL \
             localhost/azldev-runner \
             bash -eu -o pipefail -c '
               SPECS_DIR=$(azldev config dump -q -f json | jq -r .project.renderedSpecsDir)
 
               # Stale forks may not have the upstream base SHA in their object
-              # store. Fetch it explicitly so the version-bump diff below and
-              # `compute_change_set.sh` can both resolve the base commit. The
-              # base repo is public so no auth is needed.
+              # store. Fetch it explicitly so `compute_change_set.sh` can
+              # resolve the base commit for the PR-scoped render set. The base
+              # repo is public so no auth is needed.
               git -C /workdir remote add base "https://github.com/$BASE_REPO.git" 2>/dev/null || true
               git -C /workdir fetch --no-tags base "$PR_BASE_SHA"
 
-              # A change to .azldev-version swaps the azldev binary, which can
-              # alter rendered output for ANY component -- not just the ones
-              # this PR edits -- so a PR-scoped render would miss that drift.
-              # Detect it with a plain git diff (both commits are local after
-              # the fetch above) and render everything when it moved. Using git
-              # here -- rather than the REST "list PR files" endpoint -- avoids
-              # the 3000-file cap and 30-results-per-page pagination of that
-              # endpoint, so detection stays correct for arbitrarily large PRs.
-              if ! git -C /workdir diff --quiet "$PR_BASE_SHA" "$PR_HEAD_SHA" -- .azldev-version; then
+              # RENDER_ALL is resolved on the host by the update_locks job's
+              # "Resolve post-merge azldev version" step: true when the azldev version that will be in
+              # effect on 4.0 AFTER this PR merges differs from the base branch's
+              # current pin -- i.e. this PR changes the azldev binary. An azldev
+              # swap can change rendered output for ANY component, not just the
+              # ones this PR edits, so a PR-scoped render would miss that drift;
+              # render everything in that case.
+              if [ "$RENDER_ALL" = "true" ]; then
                 # `--clean-stale` prunes orphaned spec dirs globally, which
                 # subsumes the targeted deleted-component cleanup the scoped
                 # branch does below.
-                echo ".azldev-version changed -- rendering ALL components."
+                echo "azldev version changes on merge -- rendering ALL components."
                 azldev component render -q -a --clean-stale -O json \
                   > /output/render-output.json
               else