Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 10 additions & 96 deletions actions/setup/js/create_pull_request.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ const { tryEnforceArrayLimit } = require("./limit_enforcement_helpers.cjs");
const { findAgent, getIssueDetails, assignAgentToIssue } = require("./assign_agent_helpers.cjs");
const { globPatternToRegex } = require("./glob_pattern_helpers.cjs");
const { ensureFullHistoryForBundle, extractBundlePrerequisiteCommits } = require("./git_helpers.cjs");
const { parseDiffGitHeader: parseDiffGitHeaderPaths, extractDiffGitHeaderEntries } = require("./patch_path_helpers.cjs");

/**
* @typedef {import('./types/handler-factory').HandlerFactoryFunction} HandlerFactoryFunction
Expand Down Expand Up @@ -417,93 +418,14 @@ async function createFallbackIssue(githubClient, repoParts, title, body, labels,
const MAX_FILES = 100;

/**
* Parses a single `diff --git` header line and returns the post-image (`b/`)
* path, the pre-image (`a/`) path, or `null` if the header could not be
* parsed. Handles both unquoted paths and C-style quoted paths emitted by
* git when filenames contain unusual characters (e.g. backslash-escaped
* quotes, control characters, or non-ASCII bytes when `core.quotepath=true`).
* Parses one `diff --git` header line and returns the preferred file path key.
*
* Examples of supported forms:
* diff --git a/foo.txt b/foo.txt
* diff --git a/dir/with space/x b/dir/with space/x
* diff --git "a/foo\"bar" "b/foo\"bar"
* diff --git "a/foo\\bar" "b/foo\\bar"
*
* @param {string} headerLine - The full header line (must start with `diff --git `)
* @returns {string|null} The extracted file path, or null if parsing failed.
* @param {string} headerLine
* @returns {string|null}
*/
function parseDiffGitHeader(headerLine) {
// Strip the `diff --git ` prefix.
const rest = headerLine.replace(/^diff --git /, "");
if (rest === headerLine) {
return null;
}

// Walk the string and pull out the two pathspecs. Each is either:
// - A quoted C-style string ("..."), where backslash escapes any character
// including embedded quotes and backslashes.
// - An unquoted run of non-space characters.
// We don't actually need to unescape the contents; the raw token is fine
// for use as a Set key (uniqueness is preserved). All we need is to
// correctly delimit the two path tokens.
/** @type {string[]} */
const tokens = [];
let i = 0;
while (i < rest.length && tokens.length < 2) {
// Skip leading whitespace between tokens.
while (i < rest.length && rest[i] === " ") {
i++;
}
if (i >= rest.length) {
break;
}
let token = "";
if (rest[i] === '"') {
// Quoted form: consume until the matching unescaped quote.
token += rest[i++];
while (i < rest.length) {
const ch = rest[i++];
token += ch;
if (ch === "\\" && i < rest.length) {
// Escaped char: consume the next character verbatim.
token += rest[i++];
} else if (ch === '"') {
break;
}
}
} else {
// Unquoted form: consume up to the next space.
while (i < rest.length && rest[i] !== " ") {
token += rest[i++];
}
}
tokens.push(token);
}

if (tokens.length < 2) {
return null;
}

// Prefer the "b/" (post-image) token, falling back to "a/" if needed.
// The leading "a/" or "b/" prefix is preserved in the returned key so
// that quoted vs. unquoted forms of the same path don't collide
// accidentally with unrelated files; uniqueness is the only invariant
// that matters here.
const stripPrefix = tok => {
if (tok.startsWith('"a/') || tok.startsWith('"b/')) {
return tok.slice(3, tok.endsWith('"') ? -1 : undefined);
}
if (tok.startsWith("a/") || tok.startsWith("b/")) {
return tok.slice(2);
}
return tok;
};
const bPath = stripPrefix(tokens[1]);
if (bPath) {
return bPath;
}
const aPath = stripPrefix(tokens[0]);
return aPath || null;
const parsed = parseDiffGitHeaderPaths(headerLine);
return parsed.newPath || parsed.oldPath || null;
}

/**
Expand All @@ -527,22 +449,14 @@ function countUniquePatchFiles(patchContent) {
return 0;
}
const files = new Set();
// Find all `diff --git` headers (start of line). Each header corresponds
// to one file diff; we try to extract its path and fall back to a unique
// synthetic key per unparseable header so the file is still counted in
// the limit. This is a conservative choice: it never undercounts, so a
// single malformed header cannot bypass the safety limit.
const headerRe = /^diff --git .*$/gm;
let match;
const entries = extractDiffGitHeaderEntries(patchContent);
let unparseableIdx = 0;
while ((match = headerRe.exec(patchContent)) !== null) {
const path = parseDiffGitHeader(match[0]);
for (const entry of entries) {
const path = entry.newPath || entry.oldPath;
if (path) {
files.add(path);
} else {
// Use the byte offset of the header to ensure uniqueness across
// multiple unparseable headers, so each is counted exactly once.
files.add(`__unparseable_header_${match.index}_${unparseableIdx++}`);
files.add(`__unparseable_header_${entry.headerIndex}_${unparseableIdx++}`);
}
}
return files.size;
Expand Down
4 changes: 3 additions & 1 deletion actions/setup/js/create_pull_request.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -1052,6 +1052,8 @@ describe("create_pull_request - max limit enforcement", () => {
expect(parseDiffGitHeader("diff --git a/foo.txt b/foo.txt")).toBe("foo.txt");
// Path with spaces (git always emits quoted form when path contains spaces)
expect(parseDiffGitHeader('diff --git "a/dir/with space/x" "b/dir/with space/x"')).toBe("dir/with space/x");
// CRLF line ending should not leak trailing carriage-return into path
expect(parseDiffGitHeader("diff --git a/crlf.txt b/crlf.txt\r")).toBe("crlf.txt");

// A patch with three different quoted/escaped files should count as 3.
const patch = [
Expand Down Expand Up @@ -1089,7 +1091,7 @@ describe("create_pull_request - max limit enforcement", () => {
expect(countUniquePatchFiles(patchContent)).toBe(3);

// Mixed: 2 parseable + 2 unparseable = 4 unique entries.
const mixed = ["diff --git a/a.txt b/a.txt", "diff --git ", "diff --git b/b.txt c/b.txt", "diff --git "].join("\n");
const mixed = ["diff --git a/a.txt b/a.txt", 'diff --git "a/missing b/missing', "diff --git b/b.txt c/b.txt", "diff --git "].join("\n");
expect(countUniquePatchFiles(mixed)).toBe(4);

// 200 unparseable headers must still trigger the default 100-file limit.
Expand Down
19 changes: 13 additions & 6 deletions actions/setup/js/manifest_file_helpers.cjs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// @ts-check

/** @typedef {import('./types/handler-factory').HandlerConfig} HandlerConfig */
const { extractDiffGitHeaderEntries } = require("./patch_path_helpers.cjs");

/**
* Extracts the unique set of file basenames (filename without directory path) changed in a git patch.
Expand All @@ -17,9 +18,12 @@ function extractFilenamesFromPatch(patchContent) {
return [];
}
const fileSet = new Set();
const matches = patchContent.matchAll(/^diff --git a\/(.+) b\/(.+)$/gm);
for (const match of matches) {
for (const filePath of [match[1], match[2]]) {
const entries = extractDiffGitHeaderEntries(patchContent);
for (const entry of entries) {
if (!entry.parseable) {
continue;
}
for (const filePath of [entry.oldPath, entry.newPath]) {
// "dev/null" is the sentinel used when a file is created or deleted; skip it
if (filePath && filePath !== "dev/null") {
const parts = filePath.split("/");
Expand Down Expand Up @@ -51,9 +55,12 @@ function extractPathsFromPatch(patchContent) {
return [];
}
const pathSet = new Set();
const matches = patchContent.matchAll(/^diff --git a\/(.+) b\/(.+)$/gm);
for (const match of matches) {
for (const filePath of [match[1], match[2]]) {
const entries = extractDiffGitHeaderEntries(patchContent);
for (const entry of entries) {
if (!entry.parseable) {
continue;
}
for (const filePath of [entry.oldPath, entry.newPath]) {
if (filePath && filePath !== "dev/null") {
pathSet.add(filePath);
}
Expand Down
33 changes: 33 additions & 0 deletions actions/setup/js/manifest_file_helpers.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,18 @@ rename to package.json.bak
expect(result).toContain("package.json.bak");
});

it("should parse quoted headers with spaces and escapes", () => {
const patch = `diff --git "a/dir/with space/package.json" "b/dir/with space/package-lock.json"
index abc..def 100644
diff --git "a/foo\\\\bar/config.json" "b/foo\\\\bar/config.json"
index abc..def 100644
`;
const result = extractFilenamesFromPatch(patch);
expect(result).toContain("package.json");
expect(result).toContain("package-lock.json");
expect(result).toContain("config.json");
});

it("should ignore dev/null sentinel in new-file diffs", () => {
const patch = `diff --git a/dev/null b/src/new-file.js
new file mode 100644
Expand Down Expand Up @@ -236,6 +248,27 @@ index 0000000..abc
expect(result).toContain(".github/workflows/new.yml");
expect(result).not.toContain("dev/null");
});

it("should parse CRLF patch headers without trailing carriage returns", () => {
const patch = "diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml\r\nindex abc..def 100644\r\n";
const result = extractPathsFromPatch(patch);
expect(result).toEqual([".github/workflows/ci.yml"]);
});

it("should parse quoted headers and ignore malformed headers", () => {
const patch = [`diff --git "a/.github/workflows/ci file.yml" "b/.github/workflows/ci file.yml"`, "index abc..def 100644", "diff --git ", "index abc..def 100644"].join("\n");
const result = extractPathsFromPatch(patch);
expect(result).toContain(".github/workflows/ci file.yml");
expect(result).toHaveLength(1);
});

it("should preserve full escaped paths from quoted headers", () => {
const patch = `diff --git "a/foo\\\\bar/config.json" "b/foo\\\\bar/config.json"
index abc..def 100644
`;
const result = extractPathsFromPatch(patch);
expect(result).toContain("foo\\\\bar/config.json");
});
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/zoom-out] The test name "should preserve full escaped paths from quoted headers" and its assertion expect(result).toContain("foo\\\\bar/config.json") (which resolves to the 4-char sequence foo\\bar) documents that paths with backslashes are returned in their raw (un-unescaped) form. This is good for regression protection, but it also serves as implicit confirmation that the paths returned are not filesystem-ready for consumers that do string comparison (see the related comment on stripPrefix). Consider extracting this fact into a // NOTE: paths are returned raw comment in extractDiffGitHeaderEntries so consumers don't have to infer it from a test.

});

describe("checkForProtectedPaths", () => {
Expand Down
132 changes: 132 additions & 0 deletions actions/setup/js/patch_path_helpers.cjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
// @ts-check
const A_PREFIX_LENGTH = 2;
const B_PREFIX_LENGTH = 2;
const QUOTED_PREFIX_LENGTH = 3;

/**
* Parses a single `diff --git` header line and extracts both old/new paths.
* Handles unquoted and C-style quoted pathspecs.
*
* @param {string} headerLine
* @returns {{ oldPath: string|null, newPath: string|null, parseable: boolean }}
*/
function parseDiffGitHeader(headerLine) {
const sanitizedHeaderLine = headerLine.endsWith("\r") ? headerLine.slice(0, -1) : headerLine;
const rest = sanitizedHeaderLine.replace(/^diff --git /, "");
if (rest === sanitizedHeaderLine) {
return { oldPath: null, newPath: null, parseable: false };
}

// Git may emit unquoted paths that still contain spaces in `diff --git`
// headers. In that case, split using the required ` b/` token boundary
// instead of generic whitespace tokenization.
if (rest.startsWith("a/")) {
const quotedSep = rest.indexOf(' "b/');
const unquotedSep = rest.indexOf(" b/");
const foundSeparatorIndices = [quotedSep, unquotedSep].filter(idx => idx >= 0);
if (foundSeparatorIndices.length > 0) {
const sep = Math.min(...foundSeparatorIndices);
const oldPath = rest.slice(A_PREFIX_LENGTH, sep) || null;
const newToken = rest.slice(sep + 1).trimEnd();
let newPath = null;
if (newToken.startsWith('"b/')) {
if (newToken.endsWith('"')) {
newPath = newToken.slice(QUOTED_PREFIX_LENGTH, -1) || null;
} else {
newPath = newToken.slice(QUOTED_PREFIX_LENGTH) || null;
}
} else if (newToken.startsWith("b/")) {
newPath = newToken.slice(B_PREFIX_LENGTH) || null;
}
if (oldPath || newPath) {
return { oldPath, newPath, parseable: true };
}
}
}

/** @type {string[]} */
const tokens = [];
const isWhitespace = ch => ch === " " || ch === "\t" || ch === "\r" || ch === "\n";
let i = 0;
while (i < rest.length && tokens.length < 2) {
while (i < rest.length && isWhitespace(rest[i])) {
i++;
}
Comment on lines +51 to +54
if (i >= rest.length) {
break;
}

let token = "";
if (rest[i] === '"') {
token += rest[i++];
let closedQuote = false;
while (i < rest.length) {
const ch = rest[i++];
token += ch;
if (ch === "\\" && i < rest.length) {
token += rest[i++];
} else if (ch === '"') {
closedQuote = true;
break;
}
}
if (!closedQuote) {
return { oldPath: null, newPath: null, parseable: false };
}
} else {
while (i < rest.length && !isWhitespace(rest[i])) {
token += rest[i++];
}
}
tokens.push(token);
}

if (tokens.length < 2) {
return { oldPath: null, newPath: null, parseable: false };
}

const stripPrefix = tok => {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/zoom-out] stripPrefix has an implicit fallback: if a token doesn't start with a/, b/, "a/, or "b/, the raw token (including any surrounding quotes) is returned as-is. For parseable: true entries this could produce a path containing a leading " character, which would then be stored in the protection check's path set or file-limit set.

The original implementation had the same gap. Now that this is the shared module, it would be worth adding a guard — e.g. returning null when neither prefix is found — along with a test case.

if (tok.startsWith('"a/') || tok.startsWith('"b/')) {
return tok.slice(QUOTED_PREFIX_LENGTH, tok.endsWith('"') ? -1 : undefined);
}
if (tok.startsWith("a/") || tok.startsWith("b/")) {
return tok.slice(B_PREFIX_LENGTH);
}
return tok;
};
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/improve-codebase-architecture] stripPrefix preserves the raw token without unescaping C-style sequences (e.g. \\n, \\t, \\xNN, octal \\NNN). The original code documented this as intentional for set-uniqueness purposes. However, this module is now also used by checkForProtectedPaths, where the path is compared against protection rules. A file at a/foo\tbar/config.json would be stored as the raw string foo\tbar/config.json, which would fail to match a rule written as foo bar/config.json (with the actual tab).

Consider adding a unescape pass for the quoted form, or at minimum documenting on extractDiffGitHeaderEntries that returned paths may contain raw C-style escape sequences so consumers know they need to unescape before matching against filesystem paths.


const oldPath = stripPrefix(tokens[0]) || null;
const newPath = stripPrefix(tokens[1]) || null;
if (!oldPath && !newPath) {
return { oldPath: null, newPath: null, parseable: false };
}

return { oldPath, newPath, parseable: true };
}

/**
* Extracts parsed entries for all `diff --git` headers in a patch.
*
* @param {string} patchContent
* @returns {{ oldPath: string|null, newPath: string|null, parseable: boolean, headerIndex: number, headerLine: string }[]}
*/
function extractDiffGitHeaderEntries(patchContent) {
if (!patchContent || !patchContent.trim()) {
return [];
}

/** @type {{ oldPath: string|null, newPath: string|null, parseable: boolean, headerIndex: number, headerLine: string }[]} */
const entries = [];
const headerRe = /^diff --git .*$/gm;
let match;
while ((match = headerRe.exec(patchContent)) !== null) {
entries.push({
...parseDiffGitHeader(match[0]),
headerIndex: match.index,
headerLine: match[0],
});
}
return entries;
}

module.exports = { parseDiffGitHeader, extractDiffGitHeaderEntries };
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[/zoom-out] patch_path_helpers.cjs is now the single shared parser for all diff --git header handling across the codebase, but it has no dedicated test file. All coverage goes through the consumer integration tests in create_pull_request.test.cjs and manifest_file_helpers.test.cjs, which means a bug in the core parser surfaces as a failure in a consumer test rather than in the parser test.

A dedicated patch_path_helpers.test.cjs would give faster, isolated feedback on edge cases like unclosed-quote short-circuit, octal/hex escape sequences, rename headers (a/old b/new), or paths that don't start with a//b/.

Loading
Loading