diff --git a/actions/setup/js/create_pull_request.cjs b/actions/setup/js/create_pull_request.cjs index 37a41428e9a..cbd0146cb1f 100644 --- a/actions/setup/js/create_pull_request.cjs +++ b/actions/setup/js/create_pull_request.cjs @@ -35,6 +35,7 @@ const { tryEnforceArrayLimit } = require("./limit_enforcement_helpers.cjs"); const { findAgent, getIssueDetails, assignAgentToIssue } = require("./assign_agent_helpers.cjs"); const { globPatternToRegex } = require("./glob_pattern_helpers.cjs"); const { ensureFullHistoryForBundle, extractBundlePrerequisiteCommits } = require("./git_helpers.cjs"); +const { parseDiffGitHeader: parseDiffGitHeaderPaths, extractDiffGitHeaderEntries } = require("./patch_path_helpers.cjs"); /** * @typedef {import('./types/handler-factory').HandlerFactoryFunction} HandlerFactoryFunction @@ -417,93 +418,14 @@ async function createFallbackIssue(githubClient, repoParts, title, body, labels, const MAX_FILES = 100; /** - * Parses a single `diff --git` header line and returns the post-image (`b/`) - * path, the pre-image (`a/`) path, or `null` if the header could not be - * parsed. Handles both unquoted paths and C-style quoted paths emitted by - * git when filenames contain unusual characters (e.g. backslash-escaped - * quotes, control characters, or non-ASCII bytes when `core.quotepath=true`). + * Parses one `diff --git` header line and returns the preferred file path key. * - * Examples of supported forms: - * diff --git a/foo.txt b/foo.txt - * diff --git a/dir/with space/x b/dir/with space/x - * diff --git "a/foo\"bar" "b/foo\"bar" - * diff --git "a/foo\\bar" "b/foo\\bar" - * - * @param {string} headerLine - The full header line (must start with `diff --git `) - * @returns {string|null} The extracted file path, or null if parsing failed. + * @param {string} headerLine + * @returns {string|null} */ function parseDiffGitHeader(headerLine) { - // Strip the `diff --git ` prefix. - const rest = headerLine.replace(/^diff --git /, ""); - if (rest === headerLine) { - return null; - } - - // Walk the string and pull out the two pathspecs. Each is either: - // - A quoted C-style string ("..."), where backslash escapes any character - // including embedded quotes and backslashes. - // - An unquoted run of non-space characters. - // We don't actually need to unescape the contents; the raw token is fine - // for use as a Set key (uniqueness is preserved). All we need is to - // correctly delimit the two path tokens. - /** @type {string[]} */ - const tokens = []; - let i = 0; - while (i < rest.length && tokens.length < 2) { - // Skip leading whitespace between tokens. - while (i < rest.length && rest[i] === " ") { - i++; - } - if (i >= rest.length) { - break; - } - let token = ""; - if (rest[i] === '"') { - // Quoted form: consume until the matching unescaped quote. - token += rest[i++]; - while (i < rest.length) { - const ch = rest[i++]; - token += ch; - if (ch === "\\" && i < rest.length) { - // Escaped char: consume the next character verbatim. - token += rest[i++]; - } else if (ch === '"') { - break; - } - } - } else { - // Unquoted form: consume up to the next space. - while (i < rest.length && rest[i] !== " ") { - token += rest[i++]; - } - } - tokens.push(token); - } - - if (tokens.length < 2) { - return null; - } - - // Prefer the "b/" (post-image) token, falling back to "a/" if needed. - // The leading "a/" or "b/" prefix is preserved in the returned key so - // that quoted vs. unquoted forms of the same path don't collide - // accidentally with unrelated files; uniqueness is the only invariant - // that matters here. - const stripPrefix = tok => { - if (tok.startsWith('"a/') || tok.startsWith('"b/')) { - return tok.slice(3, tok.endsWith('"') ? -1 : undefined); - } - if (tok.startsWith("a/") || tok.startsWith("b/")) { - return tok.slice(2); - } - return tok; - }; - const bPath = stripPrefix(tokens[1]); - if (bPath) { - return bPath; - } - const aPath = stripPrefix(tokens[0]); - return aPath || null; + const parsed = parseDiffGitHeaderPaths(headerLine); + return parsed.newPath || parsed.oldPath || null; } /** @@ -527,22 +449,14 @@ function countUniquePatchFiles(patchContent) { return 0; } const files = new Set(); - // Find all `diff --git` headers (start of line). Each header corresponds - // to one file diff; we try to extract its path and fall back to a unique - // synthetic key per unparseable header so the file is still counted in - // the limit. This is a conservative choice: it never undercounts, so a - // single malformed header cannot bypass the safety limit. - const headerRe = /^diff --git .*$/gm; - let match; + const entries = extractDiffGitHeaderEntries(patchContent); let unparseableIdx = 0; - while ((match = headerRe.exec(patchContent)) !== null) { - const path = parseDiffGitHeader(match[0]); + for (const entry of entries) { + const path = entry.newPath || entry.oldPath; if (path) { files.add(path); } else { - // Use the byte offset of the header to ensure uniqueness across - // multiple unparseable headers, so each is counted exactly once. - files.add(`__unparseable_header_${match.index}_${unparseableIdx++}`); + files.add(`__unparseable_header_${entry.headerIndex}_${unparseableIdx++}`); } } return files.size; diff --git a/actions/setup/js/create_pull_request.test.cjs b/actions/setup/js/create_pull_request.test.cjs index 7fccfac3e74..0b7f75d0dd9 100644 --- a/actions/setup/js/create_pull_request.test.cjs +++ b/actions/setup/js/create_pull_request.test.cjs @@ -1052,6 +1052,8 @@ describe("create_pull_request - max limit enforcement", () => { expect(parseDiffGitHeader("diff --git a/foo.txt b/foo.txt")).toBe("foo.txt"); // Path with spaces (git always emits quoted form when path contains spaces) expect(parseDiffGitHeader('diff --git "a/dir/with space/x" "b/dir/with space/x"')).toBe("dir/with space/x"); + // CRLF line ending should not leak trailing carriage-return into path + expect(parseDiffGitHeader("diff --git a/crlf.txt b/crlf.txt\r")).toBe("crlf.txt"); // A patch with three different quoted/escaped files should count as 3. const patch = [ @@ -1089,7 +1091,7 @@ describe("create_pull_request - max limit enforcement", () => { expect(countUniquePatchFiles(patchContent)).toBe(3); // Mixed: 2 parseable + 2 unparseable = 4 unique entries. - const mixed = ["diff --git a/a.txt b/a.txt", "diff --git ", "diff --git b/b.txt c/b.txt", "diff --git "].join("\n"); + const mixed = ["diff --git a/a.txt b/a.txt", 'diff --git "a/missing b/missing', "diff --git b/b.txt c/b.txt", "diff --git "].join("\n"); expect(countUniquePatchFiles(mixed)).toBe(4); // 200 unparseable headers must still trigger the default 100-file limit. diff --git a/actions/setup/js/manifest_file_helpers.cjs b/actions/setup/js/manifest_file_helpers.cjs index 045fd6a117b..81d2beb4f73 100644 --- a/actions/setup/js/manifest_file_helpers.cjs +++ b/actions/setup/js/manifest_file_helpers.cjs @@ -1,6 +1,7 @@ // @ts-check /** @typedef {import('./types/handler-factory').HandlerConfig} HandlerConfig */ +const { extractDiffGitHeaderEntries } = require("./patch_path_helpers.cjs"); /** * Extracts the unique set of file basenames (filename without directory path) changed in a git patch. @@ -17,9 +18,12 @@ function extractFilenamesFromPatch(patchContent) { return []; } const fileSet = new Set(); - const matches = patchContent.matchAll(/^diff --git a\/(.+) b\/(.+)$/gm); - for (const match of matches) { - for (const filePath of [match[1], match[2]]) { + const entries = extractDiffGitHeaderEntries(patchContent); + for (const entry of entries) { + if (!entry.parseable) { + continue; + } + for (const filePath of [entry.oldPath, entry.newPath]) { // "dev/null" is the sentinel used when a file is created or deleted; skip it if (filePath && filePath !== "dev/null") { const parts = filePath.split("/"); @@ -51,9 +55,12 @@ function extractPathsFromPatch(patchContent) { return []; } const pathSet = new Set(); - const matches = patchContent.matchAll(/^diff --git a\/(.+) b\/(.+)$/gm); - for (const match of matches) { - for (const filePath of [match[1], match[2]]) { + const entries = extractDiffGitHeaderEntries(patchContent); + for (const entry of entries) { + if (!entry.parseable) { + continue; + } + for (const filePath of [entry.oldPath, entry.newPath]) { if (filePath && filePath !== "dev/null") { pathSet.add(filePath); } diff --git a/actions/setup/js/manifest_file_helpers.test.cjs b/actions/setup/js/manifest_file_helpers.test.cjs index b3cc503c7bd..58e3ef98948 100644 --- a/actions/setup/js/manifest_file_helpers.test.cjs +++ b/actions/setup/js/manifest_file_helpers.test.cjs @@ -79,6 +79,18 @@ rename to package.json.bak expect(result).toContain("package.json.bak"); }); + it("should parse quoted headers with spaces and escapes", () => { + const patch = `diff --git "a/dir/with space/package.json" "b/dir/with space/package-lock.json" +index abc..def 100644 +diff --git "a/foo\\\\bar/config.json" "b/foo\\\\bar/config.json" +index abc..def 100644 +`; + const result = extractFilenamesFromPatch(patch); + expect(result).toContain("package.json"); + expect(result).toContain("package-lock.json"); + expect(result).toContain("config.json"); + }); + it("should ignore dev/null sentinel in new-file diffs", () => { const patch = `diff --git a/dev/null b/src/new-file.js new file mode 100644 @@ -236,6 +248,27 @@ index 0000000..abc expect(result).toContain(".github/workflows/new.yml"); expect(result).not.toContain("dev/null"); }); + + it("should parse CRLF patch headers without trailing carriage returns", () => { + const patch = "diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml\r\nindex abc..def 100644\r\n"; + const result = extractPathsFromPatch(patch); + expect(result).toEqual([".github/workflows/ci.yml"]); + }); + + it("should parse quoted headers and ignore malformed headers", () => { + const patch = [`diff --git "a/.github/workflows/ci file.yml" "b/.github/workflows/ci file.yml"`, "index abc..def 100644", "diff --git ", "index abc..def 100644"].join("\n"); + const result = extractPathsFromPatch(patch); + expect(result).toContain(".github/workflows/ci file.yml"); + expect(result).toHaveLength(1); + }); + + it("should preserve full escaped paths from quoted headers", () => { + const patch = `diff --git "a/foo\\\\bar/config.json" "b/foo\\\\bar/config.json" +index abc..def 100644 +`; + const result = extractPathsFromPatch(patch); + expect(result).toContain("foo\\\\bar/config.json"); + }); }); describe("checkForProtectedPaths", () => { diff --git a/actions/setup/js/patch_path_helpers.cjs b/actions/setup/js/patch_path_helpers.cjs new file mode 100644 index 00000000000..9912c9506cf --- /dev/null +++ b/actions/setup/js/patch_path_helpers.cjs @@ -0,0 +1,132 @@ +// @ts-check +const A_PREFIX_LENGTH = 2; +const B_PREFIX_LENGTH = 2; +const QUOTED_PREFIX_LENGTH = 3; + +/** + * Parses a single `diff --git` header line and extracts both old/new paths. + * Handles unquoted and C-style quoted pathspecs. + * + * @param {string} headerLine + * @returns {{ oldPath: string|null, newPath: string|null, parseable: boolean }} + */ +function parseDiffGitHeader(headerLine) { + const sanitizedHeaderLine = headerLine.endsWith("\r") ? headerLine.slice(0, -1) : headerLine; + const rest = sanitizedHeaderLine.replace(/^diff --git /, ""); + if (rest === sanitizedHeaderLine) { + return { oldPath: null, newPath: null, parseable: false }; + } + + // Git may emit unquoted paths that still contain spaces in `diff --git` + // headers. In that case, split using the required ` b/` token boundary + // instead of generic whitespace tokenization. + if (rest.startsWith("a/")) { + const quotedSep = rest.indexOf(' "b/'); + const unquotedSep = rest.indexOf(" b/"); + const foundSeparatorIndices = [quotedSep, unquotedSep].filter(idx => idx >= 0); + if (foundSeparatorIndices.length > 0) { + const sep = Math.min(...foundSeparatorIndices); + const oldPath = rest.slice(A_PREFIX_LENGTH, sep) || null; + const newToken = rest.slice(sep + 1).trimEnd(); + let newPath = null; + if (newToken.startsWith('"b/')) { + if (newToken.endsWith('"')) { + newPath = newToken.slice(QUOTED_PREFIX_LENGTH, -1) || null; + } else { + newPath = newToken.slice(QUOTED_PREFIX_LENGTH) || null; + } + } else if (newToken.startsWith("b/")) { + newPath = newToken.slice(B_PREFIX_LENGTH) || null; + } + if (oldPath || newPath) { + return { oldPath, newPath, parseable: true }; + } + } + } + + /** @type {string[]} */ + const tokens = []; + const isWhitespace = ch => ch === " " || ch === "\t" || ch === "\r" || ch === "\n"; + let i = 0; + while (i < rest.length && tokens.length < 2) { + while (i < rest.length && isWhitespace(rest[i])) { + i++; + } + if (i >= rest.length) { + break; + } + + let token = ""; + if (rest[i] === '"') { + token += rest[i++]; + let closedQuote = false; + while (i < rest.length) { + const ch = rest[i++]; + token += ch; + if (ch === "\\" && i < rest.length) { + token += rest[i++]; + } else if (ch === '"') { + closedQuote = true; + break; + } + } + if (!closedQuote) { + return { oldPath: null, newPath: null, parseable: false }; + } + } else { + while (i < rest.length && !isWhitespace(rest[i])) { + token += rest[i++]; + } + } + tokens.push(token); + } + + if (tokens.length < 2) { + return { oldPath: null, newPath: null, parseable: false }; + } + + const stripPrefix = tok => { + if (tok.startsWith('"a/') || tok.startsWith('"b/')) { + return tok.slice(QUOTED_PREFIX_LENGTH, tok.endsWith('"') ? -1 : undefined); + } + if (tok.startsWith("a/") || tok.startsWith("b/")) { + return tok.slice(B_PREFIX_LENGTH); + } + return tok; + }; + + const oldPath = stripPrefix(tokens[0]) || null; + const newPath = stripPrefix(tokens[1]) || null; + if (!oldPath && !newPath) { + return { oldPath: null, newPath: null, parseable: false }; + } + + return { oldPath, newPath, parseable: true }; +} + +/** + * Extracts parsed entries for all `diff --git` headers in a patch. + * + * @param {string} patchContent + * @returns {{ oldPath: string|null, newPath: string|null, parseable: boolean, headerIndex: number, headerLine: string }[]} + */ +function extractDiffGitHeaderEntries(patchContent) { + if (!patchContent || !patchContent.trim()) { + return []; + } + + /** @type {{ oldPath: string|null, newPath: string|null, parseable: boolean, headerIndex: number, headerLine: string }[]} */ + const entries = []; + const headerRe = /^diff --git .*$/gm; + let match; + while ((match = headerRe.exec(patchContent)) !== null) { + entries.push({ + ...parseDiffGitHeader(match[0]), + headerIndex: match.index, + headerLine: match[0], + }); + } + return entries; +} + +module.exports = { parseDiffGitHeader, extractDiffGitHeaderEntries }; diff --git a/actions/setup/js/patch_path_helpers.integration.test.cjs b/actions/setup/js/patch_path_helpers.integration.test.cjs new file mode 100644 index 00000000000..be40762d685 --- /dev/null +++ b/actions/setup/js/patch_path_helpers.integration.test.cjs @@ -0,0 +1,111 @@ +import { describe, it, expect, beforeEach, afterEach } from "vitest"; +import fs from "fs"; +import os from "os"; +import path from "path"; +import { spawnSync } from "child_process"; + +import { extractDiffGitHeaderEntries } from "./patch_path_helpers.cjs"; +import { countUniquePatchFiles } from "./create_pull_request.cjs"; +import { extractPathsFromPatch } from "./manifest_file_helpers.cjs"; + +function execGit(args, options = {}) { + const result = spawnSync("git", args, { encoding: "utf8", ...options }); + if (result.error) throw result.error; + if (result.status !== 0 && !options.allowFailure) { + throw new Error(`git ${args.join(" ")} failed:\nstdout: ${result.stdout}\nstderr: ${result.stderr}`); + } + return result; +} + +function createRepo() { + const repoDir = fs.mkdtempSync(path.join(os.tmpdir(), "patch-path-helpers-it-")); + execGit(["init", "-q"], { cwd: repoDir }); + execGit(["config", "user.name", "Test"], { cwd: repoDir }); + execGit(["config", "user.email", "test@example.com"], { cwd: repoDir }); + execGit(["config", "commit.gpgsign", "false"], { cwd: repoDir }); + fs.writeFileSync(path.join(repoDir, "README.md"), "init\n"); + execGit(["add", "."], { cwd: repoDir }); + execGit(["commit", "-q", "-m", "init"], { cwd: repoDir }); + return repoDir; +} + +function cleanupRepo(repoDir) { + if (repoDir && fs.existsSync(repoDir)) { + fs.rmSync(repoDir, { recursive: true, force: true }); + } +} + +function lastCommitPatch(repoDir) { + return execGit(["show", "--pretty=format:", "--patch", "HEAD"], { cwd: repoDir }).stdout; +} + +describe("patch_path_helpers integration - real git outputs", () => { + let repoDir; + + beforeEach(() => { + repoDir = createRepo(); + }); + + afterEach(() => { + cleanupRepo(repoDir); + }); + + it("parses real git headers for unquoted paths containing spaces", () => { + fs.mkdirSync(path.join(repoDir, "dir with space"), { recursive: true }); + fs.writeFileSync(path.join(repoDir, "dir with space", "file name.txt"), "space\n"); + execGit(["add", "."], { cwd: repoDir }); + execGit(["commit", "-q", "-m", "add spaced path"], { cwd: repoDir }); + const patch = lastCommitPatch(repoDir); + + const entries = extractDiffGitHeaderEntries(patch); + expect(entries).toHaveLength(1); + expect(entries[0]).toEqual( + expect.objectContaining({ + parseable: true, + oldPath: "dir with space/file name.txt", + newPath: "dir with space/file name.txt", + }) + ); + expect(countUniquePatchFiles(patch)).toBe(1); + expect(extractPathsFromPatch(patch)).toContain("dir with space/file name.txt"); + }); + + it("parses real git headers for quoted escaped filenames", () => { + fs.writeFileSync(path.join(repoDir, 'foo"bar.txt'), "quoted\n"); + fs.writeFileSync(path.join(repoDir, "foo\\bar.txt"), "slash\n"); + execGit(["add", "."], { cwd: repoDir }); + execGit(["commit", "-q", "-m", "add escaped names"], { cwd: repoDir }); + const patch = lastCommitPatch(repoDir); + + const entries = extractDiffGitHeaderEntries(patch); + expect(entries).toHaveLength(2); + expect(entries[0].parseable).toBe(true); + expect(entries[1].parseable).toBe(true); + expect(countUniquePatchFiles(patch)).toBe(2); + expect(extractPathsFromPatch(patch)).toContain('foo\\"bar.txt'); + expect(extractPathsFromPatch(patch)).toContain("foo\\\\bar.txt"); + }); + + it("parses real git rename headers and exposes both old/new paths", () => { + fs.writeFileSync(path.join(repoDir, "old-name.txt"), "hello\n"); + execGit(["add", "."], { cwd: repoDir }); + execGit(["commit", "-q", "-m", "add old file"], { cwd: repoDir }); + + execGit(["mv", "old-name.txt", "new-name.txt"], { cwd: repoDir }); + execGit(["commit", "-q", "-m", "rename file"], { cwd: repoDir }); + const patch = lastCommitPatch(repoDir); + + const entries = extractDiffGitHeaderEntries(patch); + expect(entries).toHaveLength(1); + expect(entries[0]).toEqual( + expect.objectContaining({ + parseable: true, + oldPath: "old-name.txt", + newPath: "new-name.txt", + }) + ); + expect(countUniquePatchFiles(patch)).toBe(1); + expect(extractPathsFromPatch(patch)).toContain("old-name.txt"); + expect(extractPathsFromPatch(patch)).toContain("new-name.txt"); + }); +});