diff --git a/.github/workflows/check-rendered-specs.yml b/.github/workflows/check-rendered-specs.yml index efe1c7e04ac..64152df52f4 100644 --- a/.github/workflows/check-rendered-specs.yml +++ b/.github/workflows/check-rendered-specs.yml @@ -17,9 +17,17 @@ # # Security: the PR checkout is data-only. We never run PR scripts, and the # runner Dockerfile + build context come from the trusted base checkout. The -# azldev binary is built from the version pinned in the PR's .azldev-version -# (so a version-bump PR is validated with the binary that will actually run -# post-merge). The Dockerfile hardcodes the module path to the Microsoft-owned +# azldev binary is built from the POST-merge .azldev-version -- the value the +# file will have on 4.0 once this PR lands -- read from GitHub's test-merge +# commit (resolved by polling the PR's mergeability) via the Contents API, so no merge tree is ever checked +# out. This validates every PR against the binary that will actually run +# post-merge: a version-bump PR against its new binary, and a behind-base PR +# against the base branch's current binary (not its own stale pin). Note that +# only the azldev *binary version* is taken post-merge; the spec/overlay/lock +# *data* is rendered from the PR-head worktree as-is, not from a materialized +# merge tree -- so this catches binary-driven drift, not drift that would +# surface only once the PR and base content are textually merged. The +# Dockerfile hardcodes the module path to the Microsoft-owned # github.com/microsoft/azure-linux-dev-tools repo, so the PR can only choose a # *ref* (tag/branch/commit) -- it cannot redirect `go install` to a different # module. This is NOT the same as "only Microsoft-authored code runs": a ref @@ -71,8 +79,17 @@ jobs: timeout-minutes: 60 permissions: contents: read + pull-requests: read # poll PR mergeability + merge_commit_sha to resolve the post-merge azldev version outputs: patch-url: ${{ steps.upload-locks-patch.outputs.artifact-url }} + # NOTE: azldev-version / render-all are shared CI *context* (the resolved + # post-merge binary version), not lock/drift state -- update_locks just + # happens to be the single place that resolves them (it already polls PR + # mergeability). Resolved once here and consumed by the render job via + # needs.update_locks.outputs.*, so both check drift against the same + # post-merge azldev version -- no second, possibly-divergent poll. + azldev-version: ${{ steps.resolve.outputs.azldev-version }} + render-all: ${{ steps.resolve.outputs.render-all }} steps: # --- Trusted base branch (tools, scripts, container config) --- - name: Checkout base (trusted) @@ -92,22 +109,138 @@ jobs: fetch-depth: 0 persist-credentials: false - - name: Build azldev runner container + # Resolve the azldev version that will be in effect on 4.0 AFTER this PR + # merges -- the binary that will actually render specs post-merge -- and + # whether it differs from the base pin (RENDER_ALL). Done ONCE here and + # exposed as job outputs; the render job (which `needs:` this one) consumes + # them rather than polling a second time, which could resolve a different + # merge commit if the base advances between the two jobs. + - name: Resolve post-merge azldev version + id: resolve + env: + REPO: ${{ inputs.repo }} + PR_NUMBER: ${{ inputs.pr-number }} + PR_HEAD_SHA: ${{ inputs.pr-head-sha }} + GH_TOKEN: ${{ github.token }} run: | set -euo pipefail - # Build with the azldev version the PR proposes (pr-head/.azldev-version), - # NOT the base branch's. For a version-bump PR these differ, and the whole - # point of the drift check is to run with the binary that will actually run - # on 4.0 post-merge -- the base binary would pass the check and then drift - # the moment the bump lands. Only the version string comes from the PR; the - # Dockerfile + build context are the trusted base checkout, and the - # Dockerfile hardcodes the module path to the Microsoft-owned repo. - AZLDEV_VERSION="$(tr -d '\n' < pr-head/.azldev-version)" - # Hygiene: reject a malformed/garbage version before it reaches docker build. - if ! printf '%s' "$AZLDEV_VERSION" | grep -Eq '^[0-9A-Za-z._+-]+$'; then - echo "::error::pr-head/.azldev-version is empty or has unexpected characters" + # The post-merge azldev binary is the .azldev-version at GitHub's + # test-merge commit. We read just that one file via the Contents API -- + # no merge tree is checked out, and only this regex-validated version + # string crosses to the host; the Dockerfile + build context are the + # trusted base checkout, and the Dockerfile hardcodes the module path + # to the Microsoft-owned repo. + # + # GitHub computes mergeability (and the test-merge commit) + # ASYNCHRONOUSLY, and the event payload's merge_commit_sha can be empty + # OR stale, so we never trust it -- always poll the PR endpoint for the + # authoritative current value. + if ! printf '%s' "$PR_NUMBER" | grep -Eq '^[0-9]+$'; then + echo "::error::pr-number is not a positive integer: '$PR_NUMBER'" exit 1 fi + if ! printf '%s' "$REPO" | grep -Eq '^[A-Za-z0-9._-]+/[A-Za-z0-9._-]+$'; then + echo "::error::repo is not a valid owner/repo: '$REPO'" + exit 1 + fi + merge_sha="" + # ~120s budget: usually instant once warm, but a large PR in a busy + # monorepo can take a while for GitHub to compute mergeability. + for attempt in {1..30}; do + # A transient API error (429/5xx/connectivity) should retry, not kill + # the job; only a definitive mergeable=false fails closed. + if ! pr_json="$(gh api "repos/${REPO}/pulls/${PR_NUMBER}" 2>/dev/null)"; then + echo "PR API call failed (attempt ${attempt}/30); retrying in 4s..." + sleep 4 + continue + fi + # The checkout steps pin the PR head to the triggering event's + # SHA, but this poll reads the PR's CURRENT state. If the author + # pushed a newer head after the event, GitHub would compute the + # test-merge against that newer head while we render the event's + # (older) head -- a binary-vs-data mismatch. Bail; cancel-in-progress + # lets the newer run supersede this one. + polled_head="$(jq -r '.head.sha' <<<"$pr_json")" + if [ "$polled_head" != "$PR_HEAD_SHA" ]; then + echo "::error::PR head advanced ('$PR_HEAD_SHA' -> '$polled_head') since this run started; a newer run supersedes this one." + exit 1 + fi + mergeable="$(jq -r '.mergeable' <<<"$pr_json")" + if [ "$mergeable" = "false" ]; then + echo "::error::PR is not mergeable (conflicts with the base branch) -- rebase / resolve conflicts, then re-run." + exit 1 + fi + if [ "$mergeable" = "true" ]; then + # GitHub computes `mergeable` and `merge_commit_sha` independently + # and asynchronously, so `mergeable=true` can briefly coexist with + # a null merge_commit_sha. Only accept a fully-formed 40-hex SHA; + # otherwise keep polling rather than breaking out with an empty + # value (which would fail the post-loop check and force a needless + # manual re-run). + candidate_sha="$(jq -r '.merge_commit_sha // ""' <<<"$pr_json")" + if printf '%s' "$candidate_sha" | grep -Eq '^[0-9a-f]{40}$'; then + merge_sha="$candidate_sha" + break + fi + echo "PR is mergeable but GitHub has not published the test-merge commit yet (attempt ${attempt}/30); waiting 4s..." + else + echo "GitHub has not finished computing mergeability yet (attempt ${attempt}/30); waiting 4s..." + fi + sleep 4 + done + if ! printf '%s' "$merge_sha" | grep -Eq '^[0-9a-f]{40}$'; then + echo "::error::Could not resolve the test-merge commit SHA after ~120s (GitHub never finished computing mergeability / publishing the test-merge commit, or the PR API calls kept failing -- see the per-attempt messages above). Re-run the job." + exit 1 + fi + # Read the post-merge .azldev-version from the test-merge commit. + # `tr -d '\n'` strips ALL newlines (command substitution only removes + # the trailing one) BEFORE validating: the value flows into + # $GITHUB_OUTPUT as `azldev-version=`, and $GITHUB_OUTPUT is + # parsed line-by-line with last-write-wins. Without collapsing to a + # single line, a multi-line file could smuggle a second, + # attacker-controlled `azldev-version=` line past the line-oriented + # `grep -q` below (which passes if ANY single line matches) and then + # override the output -- the value later flows into a `docker build + # --build-arg` and the Dockerfile's `RUN ... go install ...@`, + # so it MUST be fully validated as a single token. + # The Contents API can briefly 404/5xx on a freshly-computed merge + # commit, so retry transient read failures instead of failing the job. + azldev_version="" + for attempt in {1..5}; do + if azldev_version="$(gh api -H 'Accept: application/vnd.github.raw' \ + "repos/${REPO}/contents/.azldev-version?ref=${merge_sha}" | tr -d '\n')"; then + break + fi + echo "Could not read .azldev-version at the test-merge commit (attempt ${attempt}/5); retrying in 4s..." + azldev_version="" + sleep 4 + done + if ! printf '%s' "$azldev_version" | grep -Eq '^[0-9A-Za-z._+-]+$'; then + echo "::error::post-merge .azldev-version is empty or has unexpected characters (could not read it from the test-merge commit, or its content is invalid)" + exit 1 + fi + # render-all when the post-merge version differs from the base branch's + # CURRENT pin (.azldev-version in the trusted base checkout) -- i.e. + # this PR changes the azldev binary on 4.0, which can alter rendered + # output for ANY component, so a PR-scoped render would miss that + # drift. A behind-base PR (post-merge == base) takes the scoped path. + base_version="$(cat .azldev-version)" + if [ "$azldev_version" != "$base_version" ]; then + render_all=true + else + render_all=false + fi + echo "Resolved post-merge azldev version: ${azldev_version} (test-merge ${merge_sha}); RENDER_ALL=${render_all} (base pin ${base_version})" + { + echo "azldev-version=${azldev_version}" + echo "render-all=${render_all}" + } >> "$GITHUB_OUTPUT" + + - name: Build azldev runner container + env: + AZLDEV_VERSION: ${{ steps.resolve.outputs.azldev-version }} + run: | + set -euo pipefail docker build \ --build-arg UID="$(id -u)" \ --build-arg AZLDEV_VERSION="$AZLDEV_VERSION" \ @@ -321,21 +454,13 @@ jobs: persist-credentials: false - name: Build azldev runner container + # The azldev version is resolved once by the update_locks job (which this + # job `needs:`) and consumed here, so render checks drift against the + # SAME post-merge binary update_locks validated the locks against. + env: + AZLDEV_VERSION: ${{ needs.update_locks.outputs.azldev-version }} run: | set -euo pipefail - # Build with the azldev version the PR proposes (pr-head/.azldev-version), - # NOT the base branch's. For a version-bump PR these differ, and the whole - # point of the drift check is to run with the binary that will actually run - # on 4.0 post-merge -- the base binary would pass the check and then drift - # the moment the bump lands. Only the version string comes from the PR; the - # Dockerfile + build context are the trusted base checkout, and the - # Dockerfile hardcodes the module path to the Microsoft-owned repo. - AZLDEV_VERSION="$(tr -d '\n' < pr-head/.azldev-version)" - # Hygiene: reject a malformed/garbage version before it reaches docker build. - if ! printf '%s' "$AZLDEV_VERSION" | grep -Eq '^[0-9A-Za-z._+-]+$'; then - echo "::error::pr-head/.azldev-version is empty or has unexpected characters" - exit 1 - fi docker build \ --build-arg UID="$(id -u)" \ --build-arg AZLDEV_VERSION="$AZLDEV_VERSION" \ @@ -366,6 +491,10 @@ jobs: PR_BASE_SHA: ${{ inputs.pr-base-sha }} PR_HEAD_SHA: ${{ inputs.pr-head-sha }} BASE_REPO: ${{ inputs.repo }} + # Resolved once by update_locks (post-merge azldev version != base + # pin). "true" => render ALL components (the azldev binary changes on + # 4.0); "false" => PR-scoped render. + RENDER_ALL: ${{ needs.update_locks.outputs.render-all }} run: | set -euo pipefail mkdir -p "$WORKSPACE/render-output" @@ -385,30 +514,30 @@ jobs: -e PR_BASE_SHA \ -e PR_HEAD_SHA \ -e BASE_REPO \ + -e RENDER_ALL \ localhost/azldev-runner \ bash -eu -o pipefail -c ' SPECS_DIR=$(azldev config dump -q -f json | jq -r .project.renderedSpecsDir) # Stale forks may not have the upstream base SHA in their object - # store. Fetch it explicitly so the version-bump diff below and - # `compute_change_set.sh` can both resolve the base commit. The - # base repo is public so no auth is needed. + # store. Fetch it explicitly so `compute_change_set.sh` can + # resolve the base commit for the PR-scoped render set. The base + # repo is public so no auth is needed. git -C /workdir remote add base "https://github.com/$BASE_REPO.git" 2>/dev/null || true git -C /workdir fetch --no-tags base "$PR_BASE_SHA" - # A change to .azldev-version swaps the azldev binary, which can - # alter rendered output for ANY component -- not just the ones - # this PR edits -- so a PR-scoped render would miss that drift. - # Detect it with a plain git diff (both commits are local after - # the fetch above) and render everything when it moved. Using git - # here -- rather than the REST "list PR files" endpoint -- avoids - # the 3000-file cap and 30-results-per-page pagination of that - # endpoint, so detection stays correct for arbitrarily large PRs. - if ! git -C /workdir diff --quiet "$PR_BASE_SHA" "$PR_HEAD_SHA" -- .azldev-version; then + # RENDER_ALL is resolved on the host by the update_locks job's + # "Resolve post-merge azldev version" step: true when the azldev version that will be in + # effect on 4.0 AFTER this PR merges differs from the base branch's + # current pin -- i.e. this PR changes the azldev binary. An azldev + # swap can change rendered output for ANY component, not just the + # ones this PR edits, so a PR-scoped render would miss that drift; + # render everything in that case. + if [ "$RENDER_ALL" = "true" ]; then # `--clean-stale` prunes orphaned spec dirs globally, which # subsumes the targeted deleted-component cleanup the scoped # branch does below. - echo ".azldev-version changed -- rendering ALL components." + echo "azldev version changes on merge -- rendering ALL components." azldev component render -q -a --clean-stale -O json \ > /output/render-output.json else