Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,90 @@ class PlgOnboarding extends BaseModel {

static IMS_ORG_ID_PATTERN = /^[a-z0-9]{24}@AdobeOrg$/i;

static DOMAIN_PATTERN = /^[a-z0-9]([a-z0-9-]*[a-z0-9])?(\.[a-z0-9]([a-z0-9-]*[a-z0-9])?)*$/;
static MAX_HOSTNAME_LENGTH = 253; // RFC 1035 DNS name limit

// Practical cap, chosen for storage and sort-key index depth rather than for any
// specific browser/URL-bar limit (the domain field is a stored identifier, not a URL).
static MAX_DOMAIN_LENGTH = 2048;

// **WARNING for external consumers: do NOT use DOMAIN_PATTERN directly.**
// This regex is incomplete on its own — it has no length cap, no control-character
// rejection, no all-numeric-hostname check, no trailing-dot/consecutive-dot path
// rejection, and no typeof guard. Always call `PlgOnboarding.isValidDomain(value)`
// which composes this regex with the rest of the validator. The regex is exported
// only for legacy callers and may become module-private in a future major release.
//
// Matches lowercase hostnames (at least one dot required) and an optional subpath
// (e.g. nba.com, nba.com/kings, nba.com/us/kings).
// The final label (TLD) must be alphabetic (>= 2 chars) or punycode (xn--*). This
// structurally rejects every IP-literal form: dotted-quad (127.0.0.1), short-form
// (127.1), decimal (2130706433), hex (0x7f.0.0.1, 0xa9.254.169.254 → AWS IMDS),
// and octal (0177.0.0.1) — and also blocks foo.1-style typos. WHATWG URL would
// otherwise canonicalize hex/decimal IPs to their dotted-quad form, bypassing
// denylist-based SSRF gates downstream.
// Rejects: uppercase letters (use normalizeDomain() first), schemes (https://),
// ports (:8080), single-label hostnames (localhost, metadata), query strings,
// fragments, empty/trailing path segments, and any path segment starting with
// a dot (blocks ./, ../, .hidden, ..foo, etc.).
// Path-qualified domains (nba.com/kings) are distinct sort-key values from the bare
// hostname; callers must call normalizeDomain() before findByImsOrgIdAndDomain.
// Labels must not start or end with a hyphen (RFC 1035).
// Raw Unicode / IDN must be punycode-encoded before validation (xn-- form is accepted).
// Percent-encoded path characters (%20 etc.) are not accepted; decode before validation.
// Underscore is allowed in path segments but not in hostname labels.
static DOMAIN_PATTERN = /^[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(?:\.[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)*\.(?:[a-z]{2,}|xn--[a-z0-9-]+)(\/(?!\.)[a-z0-9._~-]+)*$/;

// Returns the canonical form of a domain value: lowercased.
// Note: non-string inputs (null/undefined/number/object) are returned unchanged.
// Callers MUST also run `isValidDomain(value)` before using the result — calling
// `normalizeDomain` alone does not guarantee the value is a string or safe to
// pass to `findByImsOrgIdAndDomain`, whose domain sort key is expected to be a
// validated, lowercase string (a non-string value would never match a stored row).
static normalizeDomain(value) {
return typeof value === 'string' ? value.toLowerCase() : value;
}

// Complete domain validator used by the schema and intended for external consumers.
// Layers a typeof guard, case-canonical check, control-character rejection,
// all-numeric-hostname rejection (defense-in-depth; DOMAIN_PATTERN's alphabetic-TLD
// requirement already rejects dotted-quad, short-form, decimal, hex, and octal IPs),
// trailing-dot path-segment rejection, DOMAIN_PATTERN test, and length caps.
// Note: DOMAIN_PATTERN alone is not sufficient — always prefer this method.
// Lowercase-only (host AND path) is intentional canonicalization, not a bug. The
// domain field is part of the dedup sort key on findByImsOrgIdAndDomain; allowing
// mixed-case paths would let `nba.com/Kings` and `nba.com/kings` create distinct
// onboarding rows for the same site. Callers should call normalizeDomain() first.
// This is a syntactic / data-integrity validator, not an SSRF gate. Callers that
// make outbound fetches must layer their own private-IP and DNS-resolution checks.
static isValidDomain(value) {
if (typeof value !== 'string' || value !== value.toLowerCase()) {
return false;
}
// Length caps run BEFORE the regex test so a multi-MB pathological input is
// rejected in O(1) rather than driving a multi-MB regex scan. The regex itself
// is linear (no overlapping quantifiers) but external consumers may not bound
// input size upstream.
if (value.length > PlgOnboarding.MAX_DOMAIN_LENGTH) {
return false;
}
if (/[^\x21-\x7e]/.test(value)) {
return false;
}
const [hostname, ...pathParts] = value.split('/');
if (hostname.length > PlgOnboarding.MAX_HOSTNAME_LENGTH) {
return false;
}
if (/^[\d.]+$/.test(hostname)) {
return false;
}
// Reject path segments that are purely dots, end with a dot, or contain
// consecutive dots (foo., foo.., foo../bar, v1..0). DOMAIN_PATTERN's
// negative lookahead only blocks segments STARTING with a dot.
if (pathParts.some((seg) => /\.$/.test(seg) || seg.includes('..'))) {
return false;
}
return PlgOnboarding.DOMAIN_PATTERN.test(value);
}

static STATUSES = {
PRE_ONBOARDING: 'PRE_ONBOARDING',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ const schema = new SchemaBuilder(PlgOnboarding, PlgOnboardingCollection)
type: 'string',
required: true,
readOnly: true,
validate: (value) => PlgOnboarding.DOMAIN_PATTERN.test(value) && value.length <= 253,
validate: (value) => PlgOnboarding.isValidDomain(value),
})
.addAttribute('baseURL', {
type: 'string',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,226 @@ describe('PlgOnboardingModel', () => {
});
});

// Table-driven checks of the bare DOMAIN_PATTERN regex (NOT the full
// isValidDomain validator). Every entry in the "valid" table is run through
// DOMAIN_PATTERN.test and must match; every [label, value] pair in the
// "invalid" table must not match. One `it` case is registered per entry.
describe('DOMAIN_PATTERN', () => {
const { DOMAIN_PATTERN } = PlgOnboarding;

describe('valid values', () => {
[
'nba.com',
'www.nba.com',
'sub.domain.example.com',
'nba.com/kings',
'nba.com/us/kings',
'example.com/path/with-hyphens',
'example.com/path.with.dots',
'example.io/a/b/c',
'example.com/en-us',
'example.com/case_studies',
'xn--nba-6na.com',
// Hostnames that begin with a dotted-quad-shaped prefix but end in an
// alphabetic TLD remain valid — IP literals are blocked by the TLD
// requirement (final label must be [a-z]{2,} or xn--*), not by a
// lookahead. Covers nip.io-style wildcard DNS.
'1.2.3.4.example.com',
'192.168.1.1.nip.io',
].forEach((value) => {
it(`accepts "${value}"`, () => {
expect(DOMAIN_PATTERN.test(value)).to.be.true;
});
});
});

describe('invalid values', () => {
// Each row is [human-readable label, input]; both appear in the test title.
[
['empty string', ''],
['scheme prefix', 'https://nba.com'],
['scheme prefix http', 'http://nba.com'],
['IPv4 address', '127.0.0.1'],
['IPv4 address 8.8.8.8', '8.8.8.8'],
['IPv4 with path', '127.0.0.1/path'],
['query string', 'nba.com?foo=bar'],
['fragment', 'nba.com#section'],
['path with query string', 'nba.com/kings?q=1'],
['path with fragment', 'nba.com/kings#top'],
['trailing hyphen in label', 'nba-.com'],
['trailing hyphen in subdomain', 'foo-.nba.com'],
['trailing slash', 'nba.com/'],
['trailing slash after path', 'nba.com/kings/'],
['double slash', 'nba.com//kings'],
['port number', 'nba.com:8080'],
['path traversal dot-dot', 'nba.com/../etc'],
['path traversal dot', 'nba.com/./x'],
['path traversal dot-dot at end', 'nba.com/..'],
['path traversal dot at end', 'nba.com/.'],
['leading dot in path segment', 'nba.com/.hidden'],
['leading double-dot prefix in segment', 'nba.com/..foo'],
['trailing dot fqdn', 'nba.com.'],
['single-label hostname', 'localhost'],
['single-label intranet hostname', 'intranet'],
['uppercase hostname', 'NBA.COM'],
['uppercase path segment', 'nba.com/Kings'],
['uppercase locale path', 'example.com/en-US'],
['IPv6 bracketed', '[::1]'],
['IPv6 unbracketed', '2001:db8::1'],
['percent-encoded path', 'nba.com/path%20with%20space'],
// IP-literal forms rejected via the alphabetic/punycode TLD requirement.
['hex IPv4', '0x7f.0.0.1'],
['hex IPv4 (IMDS)', '0xa9.254.169.254'],
['octal IPv4', '0177.0.0.1'],
['short-form IPv4', '127.1'],
['numeric TLD', 'foo.1'],
].forEach(([label, value]) => {
it(`rejects ${label}: "${value}"`, () => {
expect(DOMAIN_PATTERN.test(value)).to.be.false;
});
});
});
});

// Tests for the full validator PlgOnboarding.isValidDomain. Sections: tables of
// accepted and rejected strings, non-string inputs, hostname (253) and overall
// (2048) length boundaries, pinning tests for inputs the bare DOMAIN_PATTERN
// accepts but the validator rejects, and IP-literal hostnames.
describe('isValidDomain', () => {
describe('valid values', () => {
[
'nba.com',
'www.nba.com',
'nba.com/kings',
'nba.com/us/kings',
'example.com/en-us',
'example.com/case_studies',
'xn--nba-6na.com',
// Hostnames that start with a dotted-quad but continue with non-IP labels:
// IP literals are rejected via the alphabetic/punycode TLD requirement,
// not via a hostname-prefix lookahead, so these legitimate hosts pass.
'1.2.3.4.example.com',
'192.168.1.1.nip.io',
].forEach((value) => {
it(`accepts "${value}"`, () => {
expect(PlgOnboarding.isValidDomain(value)).to.be.true;
});
});
});

describe('invalid values', () => {
// Each row is [label, input]; only the label is used in the test title.
[
['empty string', ''],
['scheme prefix', 'https://nba.com'],
['IPv4 address', '127.0.0.1'],
['short-form IPv4', '127.1'],
['decimal IPv4', '2130706433'],
['uppercase hostname', 'NBA.COM'],
['uppercase path segment', 'nba.com/Kings'],
['null byte in domain', 'nba.com\x00/evil'],
['control character in path', 'nba.com/ki\x01ngs'],
['trailing dot path segment', 'nba.com/foo.'],
['trailing dot fqdn', 'nba.com.'],
['consecutive dots mid path segment', 'nba.com/v1..0'],
['consecutive dots mid path segment 2', 'nba.com/foo..bar'],
// Hex-encoded IPv4 literals (WHATWG URL canonicalizes these to dotted-quad,
// bypassing denylist-based SSRF gates that match raw strings).
['hex IPv4', '0x7f.0.0.1'],
['hex IPv4 (IMDS)', '0xa9.254.169.254'],
['hex IPv4 all hex', '0xa9.0xfe.0xa9.0xfe'],
['octal IPv4', '0177.0.0.1'],
['numeric TLD', 'foo.1'],
].forEach(([label, value]) => {
it(`rejects ${label}`, () => {
expect(PlgOnboarding.isValidDomain(value)).to.be.false;
});
});
});

describe('non-string inputs', () => {
// Every non-string value must yield false, including containers that hold
// an otherwise valid domain string.
[
['null', null],
['undefined', undefined],
['number', 123],
['boolean', true],
['object', { domain: 'nba.com' }],
['array', ['nba.com']],
].forEach(([label, value]) => {
it(`rejects ${label}`, () => {
expect(PlgOnboarding.isValidDomain(value)).to.be.false;
});
});
});

describe('length boundaries', () => {
// Each case first asserts the fixture's own length so the boundary being
// tested is explicit, then asserts the validator's verdict.
it('accepts a hostname of exactly 253 chars', () => {
const hostname = `${'a'.repeat(249)}.com`;
expect(hostname.length).to.equal(253);
expect(PlgOnboarding.isValidDomain(hostname)).to.be.true;
});

it('rejects a hostname exceeding 253 chars', () => {
const hostname = `${'a'.repeat(250)}.com`;
expect(hostname.length).to.equal(254);
expect(PlgOnboarding.isValidDomain(hostname)).to.be.false;
});

it('accepts a domain of exactly 2048 chars', () => {
const value = `nba.com/${'a'.repeat(2040)}`;
expect(value.length).to.equal(2048);
expect(PlgOnboarding.isValidDomain(value)).to.be.true;
});

it('rejects a domain exceeding 2048 chars', () => {
const value = `nba.com/${'a'.repeat(2041)}`;
expect(value.length).to.equal(2049);
expect(PlgOnboarding.isValidDomain(value)).to.be.false;
});
});

describe('regression: DOMAIN_PATTERN alone is insufficient', () => {
// Pinning tests: these inputs pass the bare regex but are correctly rejected
// by the full validator. They exist to prevent regressions if a future caller
// is tempted to import DOMAIN_PATTERN directly instead of isValidDomain.
it('DOMAIN_PATTERN accepts trailing-dot path segment but isValidDomain rejects it', () => {
expect(PlgOnboarding.DOMAIN_PATTERN.test('nba.com/foo.')).to.be.true;
expect(PlgOnboarding.isValidDomain('nba.com/foo.')).to.be.false;
});

it('DOMAIN_PATTERN accepts consecutive-dot path segment but isValidDomain rejects it', () => {
expect(PlgOnboarding.DOMAIN_PATTERN.test('nba.com/v1..0')).to.be.true;
expect(PlgOnboarding.isValidDomain('nba.com/v1..0')).to.be.false;
});

it('DOMAIN_PATTERN has no length cap but isValidDomain enforces 2048', () => {
const tooLong = `nba.com/${'a'.repeat(2041)}`;
expect(tooLong.length).to.equal(2049);
expect(PlgOnboarding.DOMAIN_PATTERN.test(tooLong)).to.be.true;
expect(PlgOnboarding.isValidDomain(tooLong)).to.be.false;
});
});

describe('SSRF defense: IP-literal hostnames rejected via TLD requirement', () => {
// These would canonicalize to private/loopback IPs via WHATWG URL parsing
// (new URL('https://0xa9.254.169.254').hostname → '169.254.169.254').
// The alphabetic/punycode TLD requirement in DOMAIN_PATTERN rejects them at
// the structural level so downstream raw-string denylists cannot be bypassed.
// Rows are [label, input, dotted-quad form]. The third element is
// informational only: the callback below destructures just the first two.
[
['hex IPv4 loopback', '0x7f.0.0.1', '127.0.0.1'],
['hex IPv4 IMDS', '0xa9.254.169.254', '169.254.169.254'],
['hex IPv4 all hex', '0xa9.0xfe.0xa9.0xfe', '169.254.169.254'],
['hex IPv4 RFC1918', '0xa.0.0.1', '10.0.0.1'],
['octal IPv4', '0177.0.0.1', '127.0.0.1'],
].forEach(([label, input]) => {
it(`isValidDomain rejects ${label} (${input})`, () => {
expect(PlgOnboarding.isValidDomain(input)).to.be.false;
});
});
});
});

// normalizeDomain lowercases strings and passes non-strings through untouched.
describe('normalizeDomain', () => {
  it('lowercases a string value', () => {
    // Host and path are both lowercased.
    const normalized = PlgOnboarding.normalizeDomain('NBA.COM/Kings');
    expect(normalized).to.equal('nba.com/kings');
  });

  it('returns non-string values unchanged', () => {
    const fromNull = PlgOnboarding.normalizeDomain(null);
    expect(fromNull).to.be.null;

    const fromUndefined = PlgOnboarding.normalizeDomain(undefined);
    expect(fromUndefined).to.be.undefined;
  });
});

describe('REVIEW_DECISIONS', () => {
it('defines all expected review decisions', () => {
expect(PlgOnboarding.REVIEW_DECISIONS).to.deep.equal({
Expand Down
Loading
Loading