diff --git a/content/copilot/concepts/billing/usage-based-billing-for-individuals.md b/content/copilot/concepts/billing/usage-based-billing-for-individuals.md index 536a16df1b83..929092407610 100644 --- a/content/copilot/concepts/billing/usage-based-billing-for-individuals.md +++ b/content/copilot/concepts/billing/usage-based-billing-for-individuals.md @@ -43,12 +43,23 @@ Code completions and {% data variables.copilot.next_edit_suggestions %} are **no ## How do {% data variables.product.prodname_ai_credits_short %} work? -Each {% data variables.product.prodname_copilot_short %} individual plan subscription includes a monthly {% data variables.product.prodname_ai_credits_short %} allowance: +Each {% data variables.product.prodname_copilot_short %} individual plan subscription includes a monthly {% data variables.product.prodname_ai_credits_short %} allowance. -| Plan | Total {% data variables.product.prodname_ai_credits_short %} per month | -| --- | --- | -| {% data variables.copilot.copilot_pro_short %} | {% data variables.copilot.ai_credits_per_user_pro %} | -| {% data variables.copilot.copilot_pro_plus_short %} | {% data variables.copilot.ai_credits_per_user_pro_plus %} | +**Base credits** are included with your plan subscription each month. These match with your subscription price and they never change. + +Each plan currently also includes a **flex allotment**. This is an additional monthly amount on top of your base credits. The flex allotment is a variable part of your included usage; it is designed to adapt as the economics of AI evolve, including model pricing, new models, and improvements in efficiency. + +Your base credits are used first. If you go beyond your base credits, the flex allotment is applied automatically at the same rates across your IDE, {% data variables.product.prodname_dotcom_the_website %}, and the {% data variables.copilot.copilot_cli_short %}. No additional setup is required. 
Your usage dashboard shows your available allowance and what you've used. + +If you use everything included in your plan, you can purchase more and keep working. See [What happens if I exceed my included {% data variables.product.prodname_ai_credits_short %}](#what-happens-if-i-exceed-my-included--data-variablesproductprodname_ai_credits_short-). + +| Plan | Price per month | Base credits | Flex allotment | Total monthly {% data variables.product.prodname_ai_credits_short %} | +| --- | --- | --- | --- | --- | +| {% data variables.copilot.copilot_pro_short %} | {% data variables.copilot.cfi_price_per_month %} | {% data variables.copilot.ai_credits_per_user_pro %} | {% data variables.copilot.ai_credits_per_user_pro_flex %} | {% data variables.copilot.ai_credits_per_user_pro_total %} | +| {% data variables.copilot.copilot_pro_plus_short %} | {% data variables.copilot.cpp_price_per_month %} | {% data variables.copilot.ai_credits_per_user_pro_plus %} | {% data variables.copilot.ai_credits_per_user_pro_plus_flex %} | {% data variables.copilot.ai_credits_per_user_pro_plus_total %} | +| {% data variables.copilot.copilot_max_short %} | {% data variables.copilot.cm_price_per_month %} | {% data variables.copilot.ai_credits_per_user_max %} | {% data variables.copilot.ai_credits_per_user_max_flex %} | {% data variables.copilot.ai_credits_per_user_max_total %} | + +{% data variables.copilot.copilot_free_short %} will include 2000 code completions per month, an allowance of {% data variables.product.prodname_ai_credits_short %} and {% data variables.copilot.copilot_auto_model_selection_short %}. ## What happens if I exceed my included {% data variables.product.prodname_ai_credits_short %}? @@ -79,4 +90,4 @@ Note that, starting **June 1, 2026**, {% data variables.copilot.copilot_pro_shor ## Next steps -* For guidance on how to prepare for usage-based billing, see [AUTOTITLE](/copilot/how-tos/manage-and-track-spending/prepare-for-your-move-to-usage-based-billing). 
\ No newline at end of file +* For guidance on how to prepare for usage-based billing, see [AUTOTITLE](/copilot/how-tos/manage-and-track-spending/prepare-for-your-move-to-usage-based-billing). diff --git a/data/variables/copilot.yml b/data/variables/copilot.yml index 58c3407337df..af7b6263a455 100644 --- a/data/variables/copilot.yml +++ b/data/variables/copilot.yml @@ -16,6 +16,8 @@ copilot_free: 'GitHub Copilot Free' copilot_free_short: 'Copilot Free' copilot_student: 'GitHub Copilot Student' copilot_student_short: 'Copilot Student' +copilot_max: 'GitHub Copilot Max' +copilot_max_short: 'Copilot Max' ## Copilot billing # Price per additional premium request @@ -24,6 +26,8 @@ additional_premium_requests: '$0.04 USD' # Note that these are also used to bill cfi_price_per_month: '$10 USD' # Price per month for Copilot Pro Plus cpp_price_per_month: '$39 USD' +# Price per month for Copilot Max +cm_price_per_month: '$100 USD' # Price per month for Copilot Business cfb_price_per_month: '$19 USD' # Price per month for Copilot Enterprise @@ -34,8 +38,15 @@ ai_credits_per_user_business: '1,900' ai_credits_per_user_enterprise: '3,900' ai_credits_per_user_business_promo: '3,000' ai_credits_per_user_enterprise_promo: '7,000' -ai_credits_per_user_pro: '1000' -ai_credits_per_user_pro_plus: '3900' +ai_credits_per_user_pro: '1,000' +ai_credits_per_user_pro_plus: '3,900' +ai_credits_per_user_max: '10,000' +ai_credits_per_user_pro_flex: '500' +ai_credits_per_user_pro_plus_flex: '3,100' +ai_credits_per_user_max_flex: '10,000' +ai_credits_per_user_pro_total: '1,500' +ai_credits_per_user_pro_plus_total: '7,000' +ai_credits_per_user_max_total: '20,000' ## Copilot partners: builders who can develop Copilot extensions copilot_partners: 'Copilot Partners' diff --git a/src/graphql/data/ghec/schema.docs.graphql b/src/graphql/data/ghec/schema.docs.graphql index 4ab4373f7b64..66088291b55d 100644 --- a/src/graphql/data/ghec/schema.docs.graphql +++ b/src/graphql/data/ghec/schema.docs.graphql @@ 
-69470,7 +69470,7 @@ type User implements Actor & Agentic & Node & PackageOwner & ProfileOwner & Proj ): Organization """ - Verified email addresses that match verified domains for a specified organization the user is a member of. Results are unordered. There is no way to specify ordering, priority, or filtering, and this field should not be used to determine a user's canonical or current corporate email in multi-domain contexts. + Verified email addresses that match verified domains for a specified organization the user is a member of. """ organizationVerifiedDomainEmails( """ diff --git a/src/links/lib/extract-links.ts b/src/links/lib/extract-links.ts index 7f2224bd64d8..486a07b74caa 100644 --- a/src/links/lib/extract-links.ts +++ b/src/links/lib/extract-links.ts @@ -57,13 +57,34 @@ export interface LinkExtractionResult { } /** - * Get line and column number for a match in content + * Build an array of character offsets at which each line starts. + * offsets[0] is always 0. Called once per extractLinksFromMarkdown invocation + * so that getLineAndColumn can use binary search instead of repeated splits. */ -function getLineAndColumn(content: string, matchIndex: number): { line: number; column: number } { - const lines = content.substring(0, matchIndex).split('\n') - const line = lines.length - const column = lines[lines.length - 1].length + 1 - return { line, column } +function buildLineOffsets(content: string): number[] { + const offsets = [0] + for (let i = 0; i < content.length; i++) { + if (content[i] === '\n') offsets.push(i + 1) + } + return offsets +} + +/** + * Get line and column number for a match using a precomputed line-offset index. + * Binary search gives O(log L) per call instead of O(matchIndex). 
+ */ +function getLineAndColumn( + lineOffsets: number[], + matchIndex: number, +): { line: number; column: number } { + let lo = 0 + let hi = lineOffsets.length - 1 + while (lo < hi) { + const mid = (lo + hi + 1) >> 1 + if (lineOffsets[mid] <= matchIndex) lo = mid + else hi = mid - 1 + } + return { line: lo + 1, column: matchIndex - lineOffsets[lo] + 1 } } /** @@ -109,10 +130,13 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult }, ) + // Precompute line-start offsets once so every getLineAndColumn call is O(log L). + const lineOffsets = buildLineOffsets(strippedContent) + // Extract AUTOTITLE links first (they're a special case of internal links) let match while ((match = AUTOTITLE_LINK_PATTERN.exec(strippedContent)) !== null) { - const { line, column } = getLineAndColumn(strippedContent, match.index) + const { line, column } = getLineAndColumn(lineOffsets, match.index) const href = match[1].split('#')[0] // Remove anchor if present if (href.startsWith('/')) { internalLinks.push({ @@ -136,7 +160,7 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult continue } - const { line, column } = getLineAndColumn(strippedContent, match.index) + const { line, column } = getLineAndColumn(lineOffsets, match.index) // Extract href from ](/path) format const href = fullMatch.substring(2, fullMatch.length - 1).split('#')[0] const text = extractLinkText(strippedContent, match.index) @@ -155,7 +179,7 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult // Extract external links while ((match = EXTERNAL_LINK_PATTERN.exec(strippedContent)) !== null) { - const { line, column } = getLineAndColumn(strippedContent, match.index) + const { line, column } = getLineAndColumn(lineOffsets, match.index) const href = match[1] const text = extractLinkText(strippedContent, match.index) @@ -172,7 +196,7 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult // Extract anchor 
links while ((match = ANCHOR_LINK_PATTERN.exec(strippedContent)) !== null) { - const { line, column } = getLineAndColumn(strippedContent, match.index) + const { line, column } = getLineAndColumn(lineOffsets, match.index) const href = match[0].substring(2, match[0].length - 1) anchorLinks.push({ @@ -188,7 +212,7 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult // Extract image links while ((match = IMAGE_LINK_PATTERN.exec(strippedContent)) !== null) { - const { line, column } = getLineAndColumn(strippedContent, match.index) + const { line, column } = getLineAndColumn(lineOffsets, match.index) const href = match[1] // Only include internal images (starting with /) @@ -208,7 +232,7 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult // Extract reference-style link definitions ([id]: /path) // These are distinct from inline links but point to the same targets that need validating. while ((match = LINK_DEFINITION_PATTERN.exec(strippedContent)) !== null) { - const { line, column } = getLineAndColumn(strippedContent, match.index) + const { line, column } = getLineAndColumn(lineOffsets, match.index) const href = match[1].split('#')[0] internalLinks.push({ href, @@ -223,7 +247,7 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult // Extract links whose href starts with a Liquid tag while ((match = LIQUID_HREF_PATTERN.exec(strippedContent)) !== null) { - const { line, column } = getLineAndColumn(strippedContent, match.index) + const { line, column } = getLineAndColumn(lineOffsets, match.index) liquidPrefixedLinks.push({ href: match[1], line, @@ -274,6 +298,18 @@ export function createLiquidContext( } as Context } +// Cached reference to renderLiquid — avoids repeated dynamic-import overhead on every call. +// A dynamic import is still used (not a top-level import) to prevent circular dependency issues. 
+type RenderLiquidModule = (template: string, context: unknown) => Promise<string> +let _renderLiquid: RenderLiquidModule | null = null +async function getCachedRenderLiquid(): Promise<RenderLiquidModule> { + if (!_renderLiquid) { + const mod = await import('@/content-render/liquid/index') + _renderLiquid = mod.renderLiquid + } + return _renderLiquid +} + /** * Render Liquid templates in content and extract links * @@ -285,8 +321,8 @@ export async function extractLinksWithLiquid( context: Context, ): Promise<LinkExtractionResult> { try { - // Dynamic import to avoid circular dependency issues - const { renderLiquid } = await import('@/content-render/liquid/index') + // Dynamic import to avoid circular dependency issues (cached after first load) + const renderLiquid = await getCachedRenderLiquid() // Render Liquid to expand conditionals const rendered = await renderLiquid(content, context) return extractLinksFromMarkdown(rendered) @@ -298,6 +334,24 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult } } +/** + * Render Liquid templates in content, returning both the rendered markdown string and + * extracted links. Use this when both are needed to avoid rendering the same content twice. 
+ */ +export async function renderAndExtractLinks( + content: string, + context: Context, +): Promise<{ renderedMarkdown: string; result: LinkExtractionResult }> { + try { + const renderLiquid = await getCachedRenderLiquid() + const renderedMarkdown = await renderLiquid(content, context) + return { renderedMarkdown, result: extractLinksFromMarkdown(renderedMarkdown) } + } catch (error) { + console.warn('Liquid rendering failed, falling back to raw extraction:', error) + return { renderedMarkdown: content, result: extractLinksFromMarkdown(content) } + } +} + /** * Read a file and extract links */ diff --git a/src/links/lib/link-report.ts b/src/links/lib/link-report.ts index ac74891c4a75..8204a67669b5 100644 --- a/src/links/lib/link-report.ts +++ b/src/links/lib/link-report.ts @@ -32,6 +32,7 @@ export interface LinkReport { title: string summary: string groups: GroupedBrokenLinks[] + selfReferentialGroups?: GroupedBrokenLinks[] uniqueTargets: number totalOccurrences: number timestamp: string @@ -96,6 +97,25 @@ ${statusInfo}${suggestion}**Found in ${count} file${plural}:** ${tableRows}` }, + // Self-referential links section + selfReferentialLinks: (title: string, groups: GroupedBrokenLinks[]) => { + const totalOccurrences = groups.reduce((sum, g) => sum + g.occurrences.length, 0) + const rows = groups + .map((g) => { + const uniqueFileCount = new Set(g.occurrences.map((occ) => occ.file)).size + const occRows = g.occurrences + .map((occ) => `| \`${occ.file}\` | ${occ.lines.join(', ')} |`) + .join('\n') + return `### \`${g.target}\`\n\n**Found in ${uniqueFileCount} file${uniqueFileCount === 1 ? '' : 's'}:**\n\n| File | Line(s) |\n|------|---------|\n${occRows}` + }) + .join('\n\n') + return `## 🔗 ${title} (${groups.length} unique URL${groups.length === 1 ? '' : 's'}, ${totalOccurrences} occurrence${totalOccurrences === 1 ? '' : 's'}) + +The following links point to \`docs.github.com\`. 
Consider replacing them with relative internal links using the \`[AUTOTITLE](/path/to/article)\` syntax. + +${rows}` + }, + // Empty report noIssues: () => 'No issues found! 🎉', @@ -301,9 +321,12 @@ export function generateInternalLinkReport( */ export function generateExternalLinkReport( brokenLinks: BrokenLink[], - options: { actionUrl?: string } = {}, + options: { actionUrl?: string; selfReferentialLinks?: BrokenLink[] } = {}, ): LinkReport { const groups = groupExternalLinksByDomain(brokenLinks) + const selfReferentialGroups = options.selfReferentialLinks?.length + ? groupBrokenLinks(options.selfReferentialLinks) + : undefined const count = groups.length const plural = count === 1 ? '' : 's' @@ -314,6 +337,7 @@ export function generateExternalLinkReport( ? `Found **${brokenLinks.length}** broken external link${brokenLinks.length === 1 ? '' : 's'} across **${count}** domain${plural}.` : 'All external links are valid! ✅', groups, + selfReferentialGroups, uniqueTargets: count, totalOccurrences: brokenLinks.length, timestamp: new Date().toISOString(), @@ -360,6 +384,8 @@ function renderGroups(groups: GroupedBrokenLinks[], isExternal: boolean): string */ export function reportToMarkdown(report: LinkReport, isExternal = false): string { const parts: string[] = [] + const hasBrokenOrRedirectGroups = report.groups.length > 0 + const hasSelfReferentialGroups = Boolean(report.selfReferentialGroups?.length) // Header parts.push( @@ -367,7 +393,7 @@ export function reportToMarkdown(report: LinkReport, isExternal = false): string ) parts.push('') - if (report.groups.length === 0) { + if (!hasBrokenOrRedirectGroups && !hasSelfReferentialGroups) { parts.push(TEMPLATES.noIssues()) return parts.join('\n') } @@ -379,7 +405,17 @@ export function reportToMarkdown(report: LinkReport, isExternal = false): string } // Groups - parts.push(renderGroups(report.groups, isExternal)) + if (hasBrokenOrRedirectGroups) { + parts.push(renderGroups(report.groups, isExternal)) + } + + // 
Self-referential links section (external report only) + if (hasSelfReferentialGroups) { + parts.push( + TEMPLATES.selfReferentialLinks('Potential Internal Links', report.selfReferentialGroups!), + ) + parts.push('') + } return parts.join('\n') } diff --git a/src/links/scripts/check-links-external.ts b/src/links/scripts/check-links-external.ts index 526ff928d077..fd4cc3324b96 100644 --- a/src/links/scripts/check-links-external.ts +++ b/src/links/scripts/check-links-external.ts @@ -64,6 +64,43 @@ interface CacheData { urls: Record } +interface LinkOccurrence { + file: string + line: number + href: string +} + +/** + * Normalize a URL for deduplication purposes: + * - Remove URL fragment (#anchor) + * - Remove trailing slash only for origin/root URLs + * + * For example, https://www.githubstatus.com and https://www.githubstatus.com/ + * are treated as the same URL. + */ +function normalizeUrl(href: string): string { + // Remove fragment + const withoutFragment = href.split('#')[0] + // Remove trailing slash only for origin/root URLs + try { + const parsed = new URL(withoutFragment) + if (parsed.pathname === '/' && !parsed.search) { + return parsed.origin + } + } catch { + // Keep original if URL parsing fails. 
+ } + return withoutFragment +} + +function isDocsGithubUrl(url: string): boolean { + try { + return new URL(url).hostname === 'docs.github.com' + } catch { + return false + } +} + /** * Sleep for a given number of milliseconds */ @@ -150,11 +187,11 @@ async function fetchWithTimeout( /** * Extract all external links from content files */ -async function extractAllExternalLinks(): Promise> { - const links = new Map() +async function extractAllExternalLinks(): Promise> { + const links = new Map() // Find all Markdown files - const files = await glob('content/**/*.md') + const files = await glob('content/**/*.md', { ignore: '**/README.md' }) console.log(`Found ${files.length} Markdown files to scan`) const extractStart = Date.now() @@ -175,13 +212,13 @@ async function extractAllExternalLinks(): Promise() + for (const [url, occurrences] of allLinks) { + if (isDocsGithubUrl(url)) { + selfReferentialLinks.set(url, occurrences) + allLinks.delete(url) + } + } + console.log(`Found ${allLinks.size} unique external URLs`) + console.log(`Found ${selfReferentialLinks.size} self-referential docs.github.com URLs`) console.log('') if (options.dryRun) { @@ -276,7 +325,7 @@ async function main() { if (!result.ok) { for (const occ of occurrences) { brokenLinks.push({ - href: url, + href: occ.href, file: occ.file, lines: [occ.line], statusCode: result.statusCode, @@ -318,7 +367,15 @@ async function main() { chalk.blue(`Checked ${checkedCount} URLs in ${duration}s (${cachedCount} from cache)`), ) - if (brokenLinks.length === 0) { + // Build self-referential BrokenLink list for the report + const selfReferentialBrokenLinks: BrokenLink[] = [] + for (const occurrences of selfReferentialLinks.values()) { + for (const occ of occurrences) { + selfReferentialBrokenLinks.push({ href: occ.href, file: occ.file, lines: [occ.line] }) + } + } + + if (brokenLinks.length === 0 && selfReferentialBrokenLinks.length === 0) { console.log(chalk.green('✅ All external links valid!')) process.exit(0) } @@ 
-326,20 +383,30 @@ async function main() { // Generate report const report = generateExternalLinkReport(brokenLinks, { actionUrl: process.env.ACTION_RUN_URL, + selfReferentialLinks: selfReferentialBrokenLinks, }) - console.log('') - console.log(chalk.red(`❌ ${report.uniqueTargets} domain(s) with broken links`)) - console.log(chalk.red(` ${report.totalOccurrences} total occurrence(s)`)) + if (brokenLinks.length === 0) { + console.log(chalk.green('✅ All external links valid!')) + console.log( + chalk.blue( + `â„šī¸ Found ${selfReferentialBrokenLinks.length} docs.github.com absolute link occurrence(s) to convert.`, + ), + ) + } else { + console.log('') + console.log(chalk.red(`❌ ${report.uniqueTargets} domain(s) with broken links`)) + console.log(chalk.red(` ${report.totalOccurrences} total occurrence(s)`)) - // Show summary by domain - console.log('') - console.log('Broken links by domain:') - for (const group of report.groups.slice(0, 10)) { - console.log(` ${group.target}: ${group.occurrences.length} occurrence(s)`) - } - if (report.groups.length > 10) { - console.log(` ... and ${report.groups.length - 10} more domains`) + // Show summary by domain + console.log('') + console.log('Broken links by domain:') + for (const group of report.groups.slice(0, 10)) { + console.log(` ${group.target}: ${group.occurrences.length} occurrence(s)`) + } + if (report.groups.length > 10) { + console.log(` ... 
and ${report.groups.length - 10} more domains`) + } } // Write artifact @@ -351,7 +418,7 @@ async function main() { const createReport = process.env.CREATE_REPORT === 'true' const reportRepository = process.env.REPORT_REPOSITORY || 'github/docs-content' - if (createReport && process.env.GITHUB_TOKEN) { + if (brokenLinks.length > 0 && createReport && process.env.GITHUB_TOKEN) { console.log('') console.log('Creating issue report...') diff --git a/src/links/scripts/check-links-internal.ts b/src/links/scripts/check-links-internal.ts index 96a8d94b7a38..b60ef137e481 100644 --- a/src/links/scripts/check-links-internal.ts +++ b/src/links/scripts/check-links-internal.ts @@ -19,13 +19,14 @@ */ import fs from 'fs' +import os from 'os' import { program } from 'commander' import chalk from 'chalk' -import { load } from 'cheerio' + +import GithubSlugger from 'github-slugger' import warmServer from '@/frame/lib/warm-server' -import { renderContent } from '@/content-render/index' import { allVersions, allVersionKeys } from '@/versions/lib/all-versions' import languages from '@/languages/lib/languages-server' import { @@ -35,6 +36,8 @@ import { isAssetLink, extractLinksWithLiquid, extractLinksFromMarkdown, + renderAndExtractLinks, + type LinkExtractionResult, } from '@/links/lib/extract-links' import { type BrokenLink, @@ -115,6 +118,8 @@ function getFrontmatterLineOffset(fullPath: string): number { async function getLinksFromMarkdown( page: Page, context: Context, + precomputedRawResult?: LinkExtractionResult, + prerenderedResult?: LinkExtractionResult, ): Promise<{ href: string; text: string | undefined; line: number }[]> { const fmOffset = getFrontmatterLineOffset(page.fullPath) @@ -125,7 +130,7 @@ async function getLinksFromMarkdown( // the rendered href will differ from the raw string, so rawLinesByHref.get() would miss. 
// To fix this, we lazily import renderLiquid once and use it to resolve those hrefs to // their canonical (rendered) form before keying the map — matching what extractLinksWithLiquid produces. - const rawResult = extractLinksFromMarkdown(page.markdown) + const rawResult = precomputedRawResult ?? extractLinksFromMarkdown(page.markdown) const needsLiquidHrefResolution = rawResult.internalLinks.some((l) => l.href.includes('{%') || l.href.includes('{{')) || @@ -184,7 +189,7 @@ async function getLinksFromMarkdown( // reusables, excludes version-gated links that don't apply here). // extractLinksWithLiquid already catches Liquid render failures internally and // falls back to raw extraction with a warning, so no outer try/catch is needed. - const renderedResult = await extractLinksWithLiquid(page.markdown, context) + const renderedResult = prerenderedResult ?? (await extractLinksWithLiquid(page.markdown, context)) const renderedLinks = renderedResult.internalLinks.map((l) => ({ href: l.href, text: l.text })) return renderedLinks.map((link) => { @@ -197,70 +202,236 @@ async function getLinksFromMarkdown( } /** - * Check anchor links on a rendered page + * Strip inline Markdown markup from a heading to get plain text for slug computation. + * Matches what hast-util-to-string produces on a heading node after remark parsing. + * + * Key design decisions: + * - Inline code spans (backtick) are extracted verbatim so that `` inside them + * is not incorrectly stripped by the HTML-tag regex (which is needed for octicon SVGs). + * - HTML stripping only removes valid HTML element names (no underscores) to avoid stripping + * angle-bracket placeholders like that appear in code-span heading text. + * - No final .trim() — trailing whitespace from stripped SVGs becomes trailing hyphens via + * github-slugger, reproducing the live site's heading IDs (e.g. `allow--`). 
+ */ +function headingTextToPlain(text: string): string { + // Strip HTML tags using a state machine rather than a regex so that CodeQL can verify + // the stripping is complete. Tags like or tags with '>' in attribute values + // are handled correctly. Output is only used for slug computation, never rendered as HTML. + function stripHtmlTags(s: string): string { + let out = '' + let inTag = false + for (let i = 0; i < s.length; i++) { + if (!inTag && s[i] === '<') { + // Peek ahead: if this looks like an underscore-containing placeholder (e.g. ), + // emit the inner text instead of dropping it entirely so the slug stays correct. + const close = s.indexOf('>', i + 1) + if (close !== -1) { + const inner = s.slice(i + 1, close) + if (/^[a-zA-Z][a-zA-Z0-9]*(?:_[a-zA-Z0-9]+)+$/.test(inner)) { + out += inner + i = close + continue + } + } + inTag = true + } else if (inTag && s[i] === '>') { + inTag = false + // Don't emit a replacement space — surrounding whitespace in the source markdown + // already provides the correct spacing for github-slugger (e.g. `allow ` from + // the space before an octicon tag). + } else if (!inTag) { + out += s[i] + } + } + return out + } + + // Process non-code portions: strip HTML and inline formatting markup. + function processNonCode(s: string): string { + return stripHtmlTags(s) + .replace(/!\[([^\]]*)\]\([^)]*\)/g, '$1') // images: ![alt](url) → alt + .replace(/\[([^\]]*)\]\([^)]*\)/g, '$1') // links: [text](url) → text + .replace(/\*\*([^*]+)\*\*/g, '$1') // bold **text** + .replace(/\*([^*]+)\*/g, '$1') // italic *text* + .replace(/(? 
0) { + const open = remaining.indexOf('`') + if (open === -1) { + parts.push(processNonCode(remaining)) + break + } + if (open > 0) parts.push(processNonCode(remaining.slice(0, open))) + const close = remaining.indexOf('`', open + 1) + if (close === -1) { + // Unclosed backtick — treat remainder as non-code + parts.push(processNonCode(remaining.slice(open))) + break + } + parts.push(remaining.slice(open + 1, close)) // code content verbatim + remaining = remaining.slice(close + 1) + } + return parts.join('') + // Note: no .trim() — see comment above. +} + +/** + * Check anchor links on a page using fast heading ID computation from Liquid-rendered + * markdown. Avoids the expensive full HTML render previously used. + * + * Uses github-slugger (the same library as rehype-slug in the render pipeline) to compute + * heading anchor IDs, producing results that match the live site. */ -async function checkAnchorsOnPage( +function checkAnchorsFromHeadings( page: Page, - permalink: Permalink, - context: Context, -): Promise { - const brokenAnchors: BrokenLink[] = [] + rawResult: LinkExtractionResult, + renderedResult: LinkExtractionResult, + renderedMarkdown: string, +): BrokenLink[] { + if (page.autogenerated) return [] - // Skip anchor checking on auto-generated pages (e.g., REST, GraphQL, webhooks). - // These pages have headings generated from OpenAPI/schema data at render time - // by transformers that aren't run during link checking, so same-page anchor - // links to those headings would always appear broken. - if (page.autogenerated) { - return brokenAnchors + const fmOffset = getFrontmatterLineOffset(page.fullPath) + + // Compute heading anchor IDs from the Liquid-rendered markdown in document order. + // github-slugger deduplicates identical headings with -1, -2, ... suffixes, + // matching the behaviour of rehype-slug in the full render pipeline. 
+ const slugger = new GithubSlugger() + const headingIds = new Set() + + // ATX headings: ## Heading text (optional trailing ##) + const ATX_HEADING_RE = /^#{1,6}\s+(.+?)(?:\s+#+)?\s*$/gm + let m: RegExpExecArray | null + while ((m = ATX_HEADING_RE.exec(renderedMarkdown)) !== null) { + headingIds.add(slugger.slug(headingTextToPlain(m[1]))) } - try { - // Extract anchor links from markdown first to get accurate line numbers - const mdResult = extractLinksFromMarkdown(page.markdown) - const fmOffset = getFrontmatterLineOffset(page.fullPath) - const anchorLineMap = new Map() - for (const link of mdResult.anchorLinks) { - // Store the first occurrence of each anchor href - if (!anchorLineMap.has(link.href)) { - anchorLineMap.set(link.href, link.line + fmOffset) - } + // Setext headings: text line followed by === or --- underline + const SETEXT_HEADING_RE = /^([^\n]+)\n[=-]{2,}\s*$/gm + while ((m = SETEXT_HEADING_RE.exec(renderedMarkdown)) !== null) { + headingIds.add(slugger.slug(headingTextToPlain(m[1]))) + } + + // Explicit and anchors embedded in the markdown. + // Some pages (e.g. site-policy) use raw HTML anchors instead of headings. + const NAMED_ANCHOR_RE = /]*(?:name|id)="([^"]+)"[^>]*>/gi + while ((m = NAMED_ANCHOR_RE.exec(renderedMarkdown)) !== null) { + headingIds.add(m[1]) + } + + // Build line-number map from the raw (pre-Liquid) source for accurate file line numbers. + const anchorLineMap = new Map() + for (const link of rawResult.anchorLinks) { + if (!anchorLineMap.has(link.href)) { + anchorLineMap.set(link.href, link.line + fmOffset) + } + } + + // Check only the anchor links that actually appear in the Liquid-rendered output + // (respects {% ifversion %} gates — links in non-applicable blocks are not checked). 
+ const brokenAnchors: BrokenLink[] = [] + for (const link of renderedResult.anchorLinks) { + const { href } = link + if (href === '#' || href === '#top') continue + const targetId = href.slice(1) + if (!headingIds.has(targetId)) { + brokenAnchors.push({ + href, + file: page.relativePath, + lines: [anchorLineMap.get(href) ?? 0], + isAutotitle: false, + }) } + } + + return brokenAnchors +} + +/** + * Process a single page: extract links, validate them, and optionally check anchors. + * Receives its own context object so it is safe to run concurrently with other pages. + */ +async function checkPage( + page: Page, + permalink: Permalink, + pageContext: Context, + pageMap: Record, + redirects: Record, + options: { checkAnchors: boolean }, +): Promise<{ brokenLinks: BrokenLink[]; redirectLinks: BrokenLink[]; linksChecked: number }> { + const brokenLinks: BrokenLink[] = [] + const redirectLinks: BrokenLink[] = [] + + const rawMarkdownLinks = extractLinksFromMarkdown(page.markdown) - const html = await renderContent(page.markdown, context) - const $ = load(html) - - // Find all anchor links (same-page links) - $('a[href^="#"]').each((_, el) => { - const href = $(el).attr('href') - if (!href || href === '#' || href === '#top') return - - // Check if the anchor target exists - const targetId = href.slice(1) - // Escape special CSS selector characters for jQuery/cheerio - const escapedId = targetId.replace(/([!"#$%&'()*+,./:;<=>?@[\\\]^`{|}~])/g, '\\$1') - const targetExists = $(`#${escapedId}`).length > 0 || $(`[name="${targetId}"]`).length > 0 - - if (!targetExists) { - // Look up the line number from the markdown source - const line = anchorLineMap.get(href) ?? 0 - brokenAnchors.push({ - href, + // Render through Liquid once; share the result between link extraction and anchor + // checking to avoid paying the Liquid render cost twice per page. 
+ const { renderedMarkdown, result: renderedLinkResult } = await renderAndExtractLinks( + page.markdown, + pageContext, + ) + + const links = await getLinksFromMarkdown(page, pageContext, rawMarkdownLinks, renderedLinkResult) + + for (const link of links) { + if (isExcludedLink(link.href)) continue + + // Check if this is an asset link (images, etc.) - verify file exists on disk + if (isAssetLink(link.href)) { + if (!checkAssetLink(link.href)) { + brokenLinks.push({ + href: link.href, file: page.relativePath, - lines: [line], - text: $(el).text(), - isAutotitle: false, + lines: [link.line], + text: link.text, }) } - }) - } catch { - // Rendering errors are logged elsewhere + continue + } + + const normalized = normalizeLinkPath(link.href) + const result = checkInternalLink(normalized, pageMap, redirects) + + if (!result.exists) { + brokenLinks.push({ + href: link.href, + file: page.relativePath, + lines: [link.line], + text: link.text, + }) + } else if (result.isRedirect) { + redirectLinks.push({ + href: link.href, + file: page.relativePath, + lines: [link.line], + text: link.text, + isRedirect: true, + redirectTarget: result.redirectTarget, + }) + } } - return brokenAnchors + if (options.checkAnchors) { + const anchorFlaws = checkAnchorsFromHeadings( + page, + rawMarkdownLinks, + renderedLinkResult, + renderedMarkdown, + ) + brokenLinks.push(...anchorFlaws) + } + + return { brokenLinks, redirectLinks, linksChecked: links.length } } /** - * Check all pages for a given version and language + * Check all pages for a given version and language, processing pages concurrently + * up to `concurrency` at a time. 
*/ async function checkVersion( version: string, @@ -268,13 +439,8 @@ async function checkVersion( pageList: Page[], pageMap: Record, redirects: Record, - options: { checkAnchors: boolean; verbose: boolean }, + options: { checkAnchors: boolean; verbose: boolean; concurrency: number }, ): Promise { - const brokenLinks: BrokenLink[] = [] - const redirectLinks: BrokenLink[] = [] - let totalPagesChecked = 0 - let totalLinksChecked = 0 - const versionObj = allVersions[version] if (!versionObj) { throw new Error(`Unknown version: ${version}`) @@ -287,10 +453,13 @@ async function checkVersion( return true }) - console.log(` Checking ${relevantPages.length} pages for ${version}/${language}`) + console.log( + ` Checking ${relevantPages.length} pages for ${version}/${language} (concurrency: ${options.concurrency})`, + ) - // Build a base context once per version — feature flags and version info are the same for all pages - const baseContext: Context = { + // Build a base context once per version — feature flags and version info are the same for all pages. + // Each page gets a shallow copy so concurrent tasks don't share the mutable `page` property. + const baseContext = { currentVersion: version, currentLanguage: language, currentVersionObj: versionObj, @@ -300,72 +469,48 @@ async function checkVersion( ...getFeaturesByVersion(version), } as Context - for (const page of relevantPages) { - // Find the permalink for this version - const permalink = page.permalinks?.find((p) => p.pageVersion === version) - if (!permalink) continue - - totalPagesChecked++ - - // Mutate the page property in place — safe because the loop is sequential (each iteration - // awaits before the next begins), so there is no concurrent access to baseContext. 
- baseContext.page = page - - // Get links from markdown source (preserves accurate line numbers) - const links = await getLinksFromMarkdown(page, baseContext) - totalLinksChecked += links.length - - // Check each link - for (const link of links) { - if (isExcludedLink(link.href)) continue - - // Check if this is an asset link (images, etc.) - verify file exists on disk - if (isAssetLink(link.href)) { - if (!checkAssetLink(link.href)) { - brokenLinks.push({ - href: link.href, - file: page.relativePath, - lines: [link.line], - text: link.text, - }) - } - continue - } + const allBrokenLinks: BrokenLink[] = [] + const allRedirectLinks: BrokenLink[] = [] + let totalPagesChecked = 0 + let totalLinksChecked = 0 - const normalized = normalizeLinkPath(link.href) - const result = checkInternalLink(normalized, pageMap, redirects) + // Bounded concurrency: process up to `options.concurrency` pages simultaneously. + // All workers drain from the same shared iterator — no page is processed twice. + const queue = relevantPages.entries() - if (!result.exists) { - brokenLinks.push({ - href: link.href, - file: page.relativePath, - lines: [link.line], - text: link.text, - }) - } else if (result.isRedirect) { - redirectLinks.push({ - href: link.href, - file: page.relativePath, - lines: [link.line], - text: link.text, - isRedirect: true, - redirectTarget: result.redirectTarget, - }) - } - } + async function worker() { + for (const [, page] of queue) { + const permalink = page.permalinks?.find((p) => p.pageVersion === version) + if (!permalink) continue - // Check anchors if enabled - if (options.checkAnchors) { - const anchorFlaws = await checkAnchorsOnPage(page, permalink, baseContext) - brokenLinks.push(...anchorFlaws) - } + // Each concurrent task gets its own context copy with the page set. + // pageMap and redirects are read-only and safe to share. 
+ const pageContext = { ...baseContext, page } as Context - if (options.verbose && totalPagesChecked % 100 === 0) { - console.log(` Checked ${totalPagesChecked} pages...`) + const result = await checkPage(page, permalink, pageContext, pageMap, redirects, options) + + // Merging results here is safe: JS is single-threaded so array pushes + // between await points cannot interleave with another worker's pushes. + allBrokenLinks.push(...result.brokenLinks) + allRedirectLinks.push(...result.redirectLinks) + totalPagesChecked++ + totalLinksChecked += result.linksChecked + + if (options.verbose && totalPagesChecked % 100 === 0) { + console.log(` Checked ${totalPagesChecked} pages...`) + } } } - return { brokenLinks, redirectLinks, totalPagesChecked, totalLinksChecked } + // Launch `concurrency` workers that all drain from the same shared queue iterator. + await Promise.all(Array.from({ length: options.concurrency }, worker)) + + return { + brokenLinks: allBrokenLinks, + redirectLinks: allRedirectLinks, + totalPagesChecked, + totalLinksChecked, + } } /** @@ -380,6 +525,11 @@ async function main() { .option('--check-anchors', 'Check anchor links within pages', true) .option('--no-check-anchors', 'Skip anchor link checking') .option('--verbose', 'Verbose output') + .option( + '--concurrency ', + 'Number of pages to process concurrently', + String(Math.max(1, os.cpus().length - 1)), + ) .parse() const options = program.opts() @@ -423,9 +573,11 @@ async function main() { console.log('') // Run the check + const concurrency = Math.max(1, parseInt(process.env.CONCURRENCY || options.concurrency, 10)) const result = await checkVersion(version, language, pageList, pageMap, redirects, { checkAnchors, verbose: options.verbose, + concurrency, }) // Report results diff --git a/src/links/tests/link-report.ts b/src/links/tests/link-report.ts index 8723a036998f..7309174d01b4 100644 --- a/src/links/tests/link-report.ts +++ b/src/links/tests/link-report.ts @@ -174,6 +174,15 @@ 
describe('generateExternalLinkReport', () => { expect(report.title).toContain('2 domains') expect(report.uniqueTargets).toBe(2) }) + + test('includes self-referential groups when provided', () => { + const report = generateExternalLinkReport([], { + selfReferentialLinks: [{ href: 'https://docs.github.com/en', file: 'a.md', lines: [1] }], + }) + + expect(report.selfReferentialGroups).toHaveLength(1) + expect(report.selfReferentialGroups?.[0].target).toBe('https://docs.github.com/en') + }) }) describe('reportToMarkdown', () => { @@ -242,6 +251,28 @@ describe('reportToMarkdown', () => { expect(markdown).toContain('## ❌ Broken Links') expect(markdown).toContain('## âš ī¸ Redirects to Update') }) + + test('includes potential internal links section with no broken links', () => { + const report = generateExternalLinkReport([], { + selfReferentialLinks: [{ href: 'https://docs.github.com/en', file: 'a.md', lines: [1] }], + }) + const markdown = reportToMarkdown(report, true) + + expect(markdown).toContain('Potential Internal Links') + expect(markdown).not.toContain('No issues found') + }) + + test('shows unique file count for potential internal links', () => { + const report = generateExternalLinkReport([], { + selfReferentialLinks: [ + { href: 'https://docs.github.com/en', file: 'a.md', lines: [1] }, + { href: 'https://docs.github.com/en', file: 'a.md', lines: [2] }, + ], + }) + const markdown = reportToMarkdown(report, true) + + expect(markdown).toContain('Found in 1 file') + }) }) describe('generatePRComment', () => {