diff --git a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js
index 25c834619..4523fc1ee 100644
--- a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js
+++ b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js
@@ -24,7 +24,97 @@ class PlgOnboarding extends BaseModel {
 
   static IMS_ORG_ID_PATTERN = /^[a-z0-9]{24}@AdobeOrg$/i;
 
-  static DOMAIN_PATTERN = /^[a-z0-9]([a-z0-9-]*[a-z0-9])?(\.[a-z0-9]([a-z0-9-]*[a-z0-9])?)*$/;
+  static MAX_HOSTNAME_LENGTH = 253; // RFC 1035 DNS name limit
+
+  // Practical cap, chosen for storage and sort-key index depth rather than for any
+  // specific browser/URL-bar limit (the domain field is a stored identifier, not a URL).
+  static MAX_DOMAIN_LENGTH = 2048;
+
+  // **WARNING for external consumers: do NOT use DOMAIN_PATTERN directly.**
+  // This regex is incomplete on its own — it has no length cap, no control-character
+  // rejection, no all-numeric-hostname check, no trailing-dot/consecutive-dot path
+  // rejection, and no typeof guard. Always call `PlgOnboarding.isValidDomain(value)`
+  // which composes this regex with the rest of the validator. The regex is exported
+  // only for legacy callers and may become module-private in a future major release.
+  //
+  // Matches lowercase hostnames (at least one dot required) and an optional subpath
+  // (e.g. nba.com, nba.com/kings, nba.com/us/kings).
+  // The final label (TLD) must be alphabetic (>= 2 chars) or punycode (xn--*). This
+  // structurally rejects every IP-literal form: dotted-quad (127.0.0.1), short-form
+  // (127.1), decimal (2130706433), hex (0x7f.0.0.1, 0xa9.254.169.254 → AWS IMDS),
+  // and octal (0177.0.0.1) — and also blocks foo.1-style typos. WHATWG URL would
+  // otherwise canonicalize hex/decimal IPs to their dotted-quad form, bypassing
+  // denylist-based SSRF gates downstream.
+  // Rejects: uppercase letters (use normalizeDomain() first), schemes (https://),
+  // ports (:8080), single-label hostnames (localhost, metadata), query strings,
+  // fragments, empty/trailing path segments, and any path segment starting with
+  // a dot (blocks ./, ../, .hidden, ..foo, etc.).
+  // Path-qualified domains (nba.com/kings) are distinct sort-key values from the bare
+  // hostname; callers must call normalizeDomain() before findByImsOrgIdAndDomain.
+  // Labels must not start or end with a hyphen (RFC 1035).
+  // Raw Unicode / IDN must be punycode-encoded before validation (xn-- form is accepted).
+  // Percent-encoded path characters (%20 etc.) are not accepted; decode before validation.
+  // Underscore is allowed in path segments but not in hostname labels.
+  static DOMAIN_PATTERN = /^[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(?:\.[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)*\.(?:[a-z]{2,}|xn--[a-z0-9-]+)(\/(?!\.)[a-z0-9._~-]+)*$/;
+
+  // Returns the canonical form of a domain value: lowercased.
+  // Note: non-string inputs (null/undefined/number/object) are returned unchanged.
+  // Callers MUST also run `isValidDomain(value)` before using the result — calling
+  // `normalizeDomain` alone does not guarantee the value is a string or safe to
+  // pass to `findByImsOrgIdAndDomain` (which would otherwise treat a non-string
+  // sort key as something it isn't).
+  static normalizeDomain(value) {
+    return typeof value === 'string' ? value.toLowerCase() : value;
+  }
+
+  // Complete domain validator used by the schema and intended for external consumers.
+  // Layers a typeof guard, case-canonical check, control-character rejection,
+  // all-numeric-hostname rejection (defense-in-depth; DOMAIN_PATTERN's alphabetic-TLD
+  // requirement already rejects dotted-quad, short-form, decimal, hex, and octal IPs),
+  // trailing-dot path-segment rejection, DOMAIN_PATTERN test, and length caps.
+  // Note: DOMAIN_PATTERN alone is not sufficient — always prefer this method.
+  // Lowercase-only (host AND path) is intentional canonicalization, not a bug. The
+  // domain field is part of the dedup sort key on findByImsOrgIdAndDomain; allowing
+  // mixed-case paths would let `nba.com/Kings` and `nba.com/kings` create distinct
+  // onboarding rows for the same site. Callers should call normalizeDomain() first.
+  // This is a syntactic / data-integrity validator, not an SSRF gate. Callers that
+  // make outbound fetches must layer their own private-IP and DNS-resolution checks.
+  static isValidDomain(value) {
+    if (typeof value !== 'string') {
+      return false;
+    }
+    // The total-length cap runs FIRST — before the lowercase comparison (which has
+    // to build a lowercased copy of the whole input) and before any regex — so a
+    // multi-MB pathological input is rejected in O(1) instead of being copied and
+    // scanned. External consumers may not bound input size upstream.
+    if (value.length > PlgOnboarding.MAX_DOMAIN_LENGTH) {
+      return false;
+    }
+    // Case-canonical check: only already-lowercased values are accepted.
+    if (value !== value.toLowerCase()) {
+      return false;
+    }
+    // Printable ASCII only (0x21-0x7e): rejects spaces, control characters, and
+    // raw (non-punycode) Unicode.
+    if (/[^\x21-\x7e]/.test(value)) {
+      return false;
+    }
+    const [hostname, ...pathParts] = value.split('/');
+    if (hostname.length > PlgOnboarding.MAX_HOSTNAME_LENGTH) {
+      return false;
+    }
+    // Defense-in-depth: hostnames made only of digits and dots (IP-literal shapes).
+    if (/^[\d.]+$/.test(hostname)) {
+      return false;
+    }
+    // Reject path segments that are purely dots, end with a dot, or contain
+    // consecutive dots (foo., foo.., foo../bar, v1..0). DOMAIN_PATTERN's
+    // negative lookahead only blocks segments STARTING with a dot.
+    if (pathParts.some((seg) => seg.endsWith('.') || seg.includes('..'))) {
+      return false;
+    }
+    return PlgOnboarding.DOMAIN_PATTERN.test(value);
+  }
 
   static STATUSES = {
     PRE_ONBOARDING: 'PRE_ONBOARDING',
diff --git a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js
index 5233a4f6b..ec327015d 100644
--- a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js
+++ b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js
@@ -28,7 +28,7 @@ const schema = new SchemaBuilder(PlgOnboarding, PlgOnboardingCollection)
     type: 'string',
     required: true,
     readOnly: true,
-    validate: (value) => PlgOnboarding.DOMAIN_PATTERN.test(value) && value.length <= 253,
+    validate: (value) => PlgOnboarding.isValidDomain(value),
   })
   .addAttribute('baseURL', {
     type: 'string',
diff --git a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js
index 162e60248..537ec278f 100644
--- a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js
+++ b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js
@@ -79,6 +79,226 @@ describe('PlgOnboardingModel', () => {
     });
   });
 
+  describe('DOMAIN_PATTERN', () => {
+    const { DOMAIN_PATTERN } = PlgOnboarding;
+
+    describe('valid values', () => {
+      [
+        'nba.com',
+        'www.nba.com',
+        'sub.domain.example.com',
+        'nba.com/kings',
+        'nba.com/us/kings',
+        'example.com/path/with-hyphens',
+        'example.com/path.with.dots',
+        'example.io/a/b/c',
+        'example.com/en-us',
+        'example.com/case_studies',
+        'xn--nba-6na.com',
+        // Hostnames that begin with a dotted-quad-shaped prefix but end in an
+        // alphabetic TLD remain valid — IP literals are blocked by the TLD
+        // requirement (final label must be [a-z]{2,} or xn--*), not by a
+        // lookahead. Covers nip.io-style wildcard DNS.
+        '1.2.3.4.example.com',
+        '192.168.1.1.nip.io',
+      ].forEach((value) => {
+        it(`accepts "${value}"`, () => {
+          expect(DOMAIN_PATTERN.test(value)).to.be.true;
+        });
+      });
+    });
+
+    describe('invalid values', () => {
+      [
+        ['empty string', ''],
+        ['scheme prefix', 'https://nba.com'],
+        ['scheme prefix http', 'http://nba.com'],
+        ['IPv4 address', '127.0.0.1'],
+        ['IPv4 address 8.8.8.8', '8.8.8.8'],
+        ['IPv4 with path', '127.0.0.1/path'],
+        ['query string', 'nba.com?foo=bar'],
+        ['fragment', 'nba.com#section'],
+        ['path with query string', 'nba.com/kings?q=1'],
+        ['path with fragment', 'nba.com/kings#top'],
+        ['trailing hyphen in label', 'nba-.com'],
+        ['trailing hyphen in subdomain', 'foo-.nba.com'],
+        ['trailing slash', 'nba.com/'],
+        ['trailing slash after path', 'nba.com/kings/'],
+        ['double slash', 'nba.com//kings'],
+        ['port number', 'nba.com:8080'],
+        ['path traversal dot-dot', 'nba.com/../etc'],
+        ['path traversal dot', 'nba.com/./x'],
+        ['path traversal dot-dot at end', 'nba.com/..'],
+        ['path traversal dot at end', 'nba.com/.'],
+        ['leading dot in path segment', 'nba.com/.hidden'],
+        ['leading double-dot prefix in segment', 'nba.com/..foo'],
+        ['trailing dot fqdn', 'nba.com.'],
+        ['single-label hostname', 'localhost'],
+        ['single-label intranet hostname', 'intranet'],
+        ['uppercase hostname', 'NBA.COM'],
+        ['uppercase path segment', 'nba.com/Kings'],
+        ['uppercase locale path', 'example.com/en-US'],
+        ['IPv6 bracketed', '[::1]'],
+        ['IPv6 unbracketed', '2001:db8::1'],
+        ['percent-encoded path', 'nba.com/path%20with%20space'],
+        // IP-literal forms rejected via the alphabetic/punycode TLD requirement.
+        ['hex IPv4', '0x7f.0.0.1'],
+        ['hex IPv4 (IMDS)', '0xa9.254.169.254'],
+        ['octal IPv4', '0177.0.0.1'],
+        ['short-form IPv4', '127.1'],
+        ['numeric TLD', 'foo.1'],
+      ].forEach(([label, value]) => {
+        it(`rejects ${label}: "${value}"`, () => {
+          expect(DOMAIN_PATTERN.test(value)).to.be.false;
+        });
+      });
+    });
+  });
+
+  describe('isValidDomain', () => {
+    describe('valid values', () => {
+      [
+        'nba.com',
+        'www.nba.com',
+        'nba.com/kings',
+        'nba.com/us/kings',
+        'example.com/en-us',
+        'example.com/case_studies',
+        'xn--nba-6na.com',
+        // Hostnames that start with a dotted-quad but continue with non-IP labels:
+        // IP literals are rejected via the alphabetic/punycode TLD requirement,
+        // not via a hostname-prefix lookahead, so these legitimate hosts pass.
+        '1.2.3.4.example.com',
+        '192.168.1.1.nip.io',
+      ].forEach((value) => {
+        it(`accepts "${value}"`, () => {
+          expect(PlgOnboarding.isValidDomain(value)).to.be.true;
+        });
+      });
+    });
+
+    describe('invalid values', () => {
+      [
+        ['empty string', ''],
+        ['scheme prefix', 'https://nba.com'],
+        ['IPv4 address', '127.0.0.1'],
+        ['short-form IPv4', '127.1'],
+        ['decimal IPv4', '2130706433'],
+        ['uppercase hostname', 'NBA.COM'],
+        ['uppercase path segment', 'nba.com/Kings'],
+        ['null byte in domain', 'nba.com\x00/evil'],
+        ['control character in path', 'nba.com/ki\x01ngs'],
+        ['trailing dot path segment', 'nba.com/foo.'],
+        ['trailing dot fqdn', 'nba.com.'],
+        ['consecutive dots mid path segment', 'nba.com/v1..0'],
+        ['consecutive dots mid path segment 2', 'nba.com/foo..bar'],
+        // Hex-encoded IPv4 literals (WHATWG URL canonicalizes these to dotted-quad,
+        // bypassing denylist-based SSRF gates that match raw strings).
+        ['hex IPv4', '0x7f.0.0.1'],
+        ['hex IPv4 (IMDS)', '0xa9.254.169.254'],
+        ['hex IPv4 all hex', '0xa9.0xfe.0xa9.0xfe'],
+        ['octal IPv4', '0177.0.0.1'],
+        ['numeric TLD', 'foo.1'],
+      ].forEach(([label, value]) => {
+        it(`rejects ${label}`, () => {
+          expect(PlgOnboarding.isValidDomain(value)).to.be.false;
+        });
+      });
+    });
+
+    describe('non-string inputs', () => {
+      [
+        ['null', null],
+        ['undefined', undefined],
+        ['number', 123],
+        ['boolean', true],
+        ['object', { domain: 'nba.com' }],
+        ['array', ['nba.com']],
+      ].forEach(([label, value]) => {
+        it(`rejects ${label}`, () => {
+          expect(PlgOnboarding.isValidDomain(value)).to.be.false;
+        });
+      });
+    });
+
+    describe('length boundaries', () => {
+      it('accepts a hostname of exactly 253 chars', () => {
+        const hostname = `${'a'.repeat(249)}.com`;
+        expect(hostname.length).to.equal(253);
+        expect(PlgOnboarding.isValidDomain(hostname)).to.be.true;
+      });
+
+      it('rejects a hostname exceeding 253 chars', () => {
+        const hostname = `${'a'.repeat(250)}.com`;
+        expect(hostname.length).to.equal(254);
+        expect(PlgOnboarding.isValidDomain(hostname)).to.be.false;
+      });
+
+      it('accepts a domain of exactly 2048 chars', () => {
+        const value = `nba.com/${'a'.repeat(2040)}`;
+        expect(value.length).to.equal(2048);
+        expect(PlgOnboarding.isValidDomain(value)).to.be.true;
+      });
+
+      it('rejects a domain exceeding 2048 chars', () => {
+        const value = `nba.com/${'a'.repeat(2041)}`;
+        expect(value.length).to.equal(2049);
+        expect(PlgOnboarding.isValidDomain(value)).to.be.false;
+      });
+    });
+
+    describe('regression: DOMAIN_PATTERN alone is insufficient', () => {
+      // Pinning tests: these inputs pass the bare regex but are correctly rejected
+      // by the full validator. They exist to prevent regressions if a future caller
+      // is tempted to import DOMAIN_PATTERN directly instead of isValidDomain.
+      it('DOMAIN_PATTERN accepts trailing-dot path segment but isValidDomain rejects it', () => {
+        expect(PlgOnboarding.DOMAIN_PATTERN.test('nba.com/foo.')).to.be.true;
+        expect(PlgOnboarding.isValidDomain('nba.com/foo.')).to.be.false;
+      });
+
+      it('DOMAIN_PATTERN accepts consecutive-dot path segment but isValidDomain rejects it', () => {
+        expect(PlgOnboarding.DOMAIN_PATTERN.test('nba.com/v1..0')).to.be.true;
+        expect(PlgOnboarding.isValidDomain('nba.com/v1..0')).to.be.false;
+      });
+
+      it('DOMAIN_PATTERN has no length cap but isValidDomain enforces 2048', () => {
+        const tooLong = `nba.com/${'a'.repeat(2041)}`;
+        expect(tooLong.length).to.equal(2049);
+        expect(PlgOnboarding.DOMAIN_PATTERN.test(tooLong)).to.be.true;
+        expect(PlgOnboarding.isValidDomain(tooLong)).to.be.false;
+      });
+    });
+
+    describe('SSRF defense: IP-literal hostnames rejected via TLD requirement', () => {
+      // These would canonicalize to private/loopback IPs via WHATWG URL parsing
+      // (new URL('https://0xa9.254.169.254').hostname → '169.254.169.254').
+      // The alphabetic/punycode TLD requirement in DOMAIN_PATTERN rejects them at
+      // the structural level so downstream raw-string denylists cannot be bypassed.
+      [
+        ['hex IPv4 loopback', '0x7f.0.0.1', '127.0.0.1'],
+        ['hex IPv4 IMDS', '0xa9.254.169.254', '169.254.169.254'],
+        ['hex IPv4 all hex', '0xa9.0xfe.0xa9.0xfe', '169.254.169.254'],
+        ['hex IPv4 RFC1918', '0xa.0.0.1', '10.0.0.1'],
+        ['octal IPv4', '0177.0.0.1', '127.0.0.1'],
+      ].forEach(([label, input]) => {
+        it(`isValidDomain rejects ${label} (${input})`, () => {
+          expect(PlgOnboarding.isValidDomain(input)).to.be.false;
+        });
+      });
+    });
+  });
+
+  describe('normalizeDomain', () => {
+    it('lowercases a string value', () => {
+      expect(PlgOnboarding.normalizeDomain('NBA.COM/Kings')).to.equal('nba.com/kings');
+    });
+
+    it('returns non-string values unchanged', () => {
+      expect(PlgOnboarding.normalizeDomain(null)).to.be.null;
+      expect(PlgOnboarding.normalizeDomain(undefined)).to.be.undefined;
+    });
+  });
+
   describe('REVIEW_DECISIONS', () => {
     it('defines all expected review decisions', () => {
       expect(PlgOnboarding.REVIEW_DECISIONS).to.deep.equal({
diff --git a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js
index ab5cc7f40..0918ea738 100644
--- a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js
+++ b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js
@@ -14,6 +14,128 @@ import { expect } from 'chai';
 import plgOnboardingSchema from '../../../../src/models/plg-onboarding/plg-onboarding.schema.js';
 
 describe('PlgOnboarding Schema', () => {
+  describe('domain attribute', () => {
+    let domainAttr;
+
+    before(() => {
+      const attributes = plgOnboardingSchema.getAttributes();
+      domainAttr = attributes.domain;
+    });
+
+    it('is a required read-only string', () => {
+      expect(domainAttr.type).to.equal('string');
+      expect(domainAttr.required).to.be.true;
+      expect(domainAttr.readOnly).to.be.true;
+    });
+
+    it('has a validate function', () => {
+      expect(domainAttr.validate).to.be.a('function');
+    });
+
+    describe('valid values', () => {
+      [
+        'nba.com',
+        'www.nba.com',
+        'nba.com/kings',
+        'nba.com/us/kings',
+        'example.com/path-with-hyphens',
+        'example.com/en-us',
+        'example.com/case_studies',
+        'xn--nba-6na.com',
+        'example.com/foo_bar',
+        'example.com/us/foo_bar',
+        '1.2.3.4.example.com',
+        '192.168.1.1.nip.io',
+      ].forEach((value) => {
+        it(`accepts "${value}"`, () => {
+          expect(domainAttr.validate(value)).to.be.true;
+        });
+      });
+    });
+
+    describe('invalid values', () => {
+      [
+        ['empty string', ''],
+        ['scheme prefix', 'https://nba.com'],
+        ['IPv4 address', '127.0.0.1'],
+        ['short-form IPv4', '127.1'],
+        ['decimal IPv4', '2130706433'],
+        ['query string', 'nba.com?q=1'],
+        ['fragment', 'nba.com#top'],
+        ['hostname over 253 chars', `${'a'.repeat(250)}.com`],
+        ['trailing hyphen in label', 'nba-.com'],
+        ['trailing slash', 'nba.com/kings/'],
+        ['path traversal', 'nba.com/../etc'],
+        ['leading dot in path segment', 'nba.com/.hidden'],
+        ['leading double-dot prefix in segment', 'nba.com/..foo'],
+        ['trailing dot fqdn', 'nba.com.'],
+        ['single-label hostname', 'localhost'],
+        ['uppercase hostname', 'NBA.COM'],
+        ['uppercase path segment', 'nba.com/Kings'],
+        ['uppercase locale path', 'example.com/en-US'],
+        ['underscore in hostname label', 'foo_bar.com'],
+        ['underscore in subdomain label', 'foo_bar.example.com'],
+        ['trailing dot in path segment', 'nba.com/foo.'],
+        ['trailing double-dot in path segment', 'nba.com/foo..'],
+        ['dot-dot mid-path segment', 'nba.com/foo../bar'],
+        ['consecutive dots mid path segment', 'nba.com/v1..0'],
+        ['consecutive dots in middle of segment', 'nba.com/foo..bar'],
+        ['null byte in domain', 'nba.com\x00/evil'],
+        ['control character in path', 'nba.com/ki\x01ngs'],
+        ['hex IPv4 loopback', '0x7f.0.0.1'],
+        ['hex IPv4 IMDS', '0xa9.254.169.254'],
+        ['octal IPv4', '0177.0.0.1'],
+        ['numeric TLD', 'foo.1'],
+      ].forEach(([label, value]) => {
+        it(`rejects ${label}`, () => {
+          expect(domainAttr.validate(value)).to.be.false;
+        });
+      });
+    });
+
+    it('allows a plain hostname of exactly 253 chars', () => {
+      const hostname = `${'a'.repeat(249)}.com`;
+      expect(hostname.length).to.equal(253);
+      expect(domainAttr.validate(hostname)).to.be.true;
+    });
+
+    it('rejects a plain hostname exceeding 253 chars', () => {
+      const hostname = `${'a'.repeat(250)}.com`;
+      expect(hostname.length).to.equal(254);
+      expect(domainAttr.validate(hostname)).to.be.false;
+    });
+
+    it('allows a subpath domain whose hostname is exactly 253 chars', () => {
+      const hostname = `${'a'.repeat(249)}.com`;
+      expect(hostname.length).to.equal(253);
+      expect(domainAttr.validate(`${hostname}/path`)).to.be.true;
+    });
+
+    it('rejects when only the hostname exceeds 253 chars (path does not inflate count)', () => {
+      const hostname = `${'a'.repeat(250)}.com`;
+      expect(hostname.length).to.equal(254);
+      expect(domainAttr.validate(`${hostname}/path`)).to.be.false;
+    });
+
+    it('accepts a domain of exactly 2048 chars', () => {
+      const longPath = `nba.com/${'a'.repeat(2040)}`;
+      expect(longPath.length).to.equal(2048);
+      expect(domainAttr.validate(longPath)).to.be.true;
+    });
+
+    it('rejects a domain of exactly 2049 chars', () => {
+      const longPath = `nba.com/${'a'.repeat(2041)}`;
+      expect(longPath.length).to.equal(2049);
+      expect(domainAttr.validate(longPath)).to.be.false;
+    });
+
+    it('rejects when total domain length exceeds 2048 chars', () => {
+      const longPath = `nba.com/${'a'.repeat(2042)}`;
+      expect(longPath.length).to.be.above(2048);
+      expect(domainAttr.validate(longPath)).to.be.false;
+    });
+  });
+
   describe('reviews attribute', () => {
     let reviewsAttr;
 