From d786b633c1fd474c6bf46391cdeca18fba282849 Mon Sep 17 00:00:00 2001 From: Kanishka Date: Thu, 7 May 2026 22:50:40 +0530 Subject: [PATCH 1/9] feat(data-access): support subpath domains in PLG onboarding validation (LLMO-4187) Update DOMAIN_PATTERN to accept nba.com/kings-style subpath domains alongside plain hostnames, and add IPv4 rejection via negative lookahead. Schema length check now applies to the hostname only so subpaths don't count against the 253-char DNS limit. Co-Authored-By: Claude Sonnet 4.6 --- .../plg-onboarding/plg-onboarding.model.js | 4 +- .../plg-onboarding/plg-onboarding.schema.js | 2 +- .../plg-onboarding.model.test.js | 39 ++++++++++++ .../plg-onboarding.schema.test.js | 60 +++++++++++++++++++ 4 files changed, 103 insertions(+), 2 deletions(-) diff --git a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js index 4e7676dd0..797900854 100644 --- a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js +++ b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js @@ -24,7 +24,9 @@ class PlgOnboarding extends BaseModel { static IMS_ORG_ID_PATTERN = /^[a-z0-9]{24}@AdobeOrg$/i; - static DOMAIN_PATTERN = /^[a-z0-9]([a-z0-9-]*[a-z0-9])?(\.[a-z0-9]([a-z0-9-]*[a-z0-9])?)*$/; + // Matches plain hostnames and subpath domains (e.g. nba.com, nba.com/kings). + // Rejects schemes (https://), IPv4 addresses, and query strings/fragments. 
+ static DOMAIN_PATTERN = /^(?!\d+(\.\d+){3})[a-z0-9][a-z0-9-]*(\.[a-z0-9][a-z0-9-]*)*(\/[a-z0-9._~-]*)*$/; static STATUSES = { PRE_ONBOARDING: 'PRE_ONBOARDING', diff --git a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js index 38c86f468..482a2f477 100644 --- a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js +++ b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js @@ -28,7 +28,7 @@ const schema = new SchemaBuilder(PlgOnboarding, PlgOnboardingCollection) type: 'string', required: true, readOnly: true, - validate: (value) => PlgOnboarding.DOMAIN_PATTERN.test(value) && value.length <= 253, + validate: (value) => PlgOnboarding.DOMAIN_PATTERN.test(value) && value.split('/')[0].length <= 253, }) .addAttribute('baseURL', { type: 'string', diff --git a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js index 39d4e817d..31445aa02 100644 --- a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js +++ b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js @@ -79,6 +79,45 @@ describe('PlgOnboardingModel', () => { }); }); + describe('DOMAIN_PATTERN', () => { + const { DOMAIN_PATTERN } = PlgOnboarding; + + describe('valid values', () => { + [ + 'nba.com', + 'www.nba.com', + 'sub.domain.example.com', + 'nba.com/kings', + 'nba.com/us/kings', + 'example.com/path/with-hyphens', + 'example.com/path.with.dots', + 'example.io/a/b/c', + ].forEach((value) => { + it(`accepts "${value}"`, () => { + expect(DOMAIN_PATTERN.test(value)).to.be.true; + }); + }); + }); + + describe('invalid values', () => { + [ + ['empty string', ''], + ['scheme prefix', 
'https://nba.com'], + ['scheme prefix http', 'http://nba.com'], + ['IPv4 address', '127.0.0.1'], + ['IPv4 address 8.8.8.8', '8.8.8.8'], + ['query string', 'nba.com?foo=bar'], + ['fragment', 'nba.com#section'], + ['path with query string', 'nba.com/kings?q=1'], + ['path with fragment', 'nba.com/kings#top'], + ].forEach(([label, value]) => { + it(`rejects ${label}: "${value}"`, () => { + expect(DOMAIN_PATTERN.test(value)).to.be.false; + }); + }); + }); + }); + describe('REVIEW_DECISIONS', () => { it('defines all expected review decisions', () => { expect(PlgOnboarding.REVIEW_DECISIONS).to.deep.equal({ diff --git a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js index c9f5627d2..ad7612342 100644 --- a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js +++ b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js @@ -14,6 +14,66 @@ import { expect } from 'chai'; import plgOnboardingSchema from '../../../../src/models/plg-onboarding/plg-onboarding.schema.js'; describe('PlgOnboarding Schema', () => { + describe('domain attribute', () => { + let domainAttr; + + before(() => { + const attributes = plgOnboardingSchema.getAttributes(); + domainAttr = attributes.domain; + }); + + it('is a required read-only string', () => { + expect(domainAttr.type).to.equal('string'); + expect(domainAttr.required).to.be.true; + expect(domainAttr.readOnly).to.be.true; + }); + + it('has a validate function', () => { + expect(domainAttr.validate).to.be.a('function'); + }); + + describe('valid values', () => { + [ + 'nba.com', + 'www.nba.com', + 'nba.com/kings', + 'nba.com/us/kings', + 'example.com/path-with-hyphens', + ].forEach((value) => { + it(`accepts "${value}"`, () => { + expect(domainAttr.validate(value)).to.be.true; + }); + }); + }); + + 
describe('invalid values', () => { + [ + ['empty string', ''], + ['scheme prefix', 'https://nba.com'], + ['IPv4 address', '127.0.0.1'], + ['query string', 'nba.com?q=1'], + ['fragment', 'nba.com#top'], + ['hostname over 253 chars', `${'a'.repeat(250)}.com`], + ].forEach(([label, value]) => { + it(`rejects ${label}`, () => { + expect(domainAttr.validate(value)).to.be.false; + }); + }); + }); + + it('allows a subpath domain whose hostname is exactly 253 chars', () => { + const hostname = `${'a'.repeat(249)}.com`; + expect(hostname.length).to.equal(253); + expect(domainAttr.validate(`${hostname}/path`)).to.be.true; + }); + + it('rejects when only the hostname exceeds 253 chars (path does not inflate count)', () => { + const hostname = `${'a'.repeat(250)}.com`; + expect(hostname.length).to.equal(254); + expect(domainAttr.validate(`${hostname}/path`)).to.be.false; + }); + }); + describe('reviews attribute', () => { let reviewsAttr; From b8a8157908551a9fe93a97eaac3ac8d756a7ffb1 Mon Sep 17 00:00:00 2001 From: Kanishka Date: Thu, 14 May 2026 12:30:18 +0530 Subject: [PATCH 2/9] fix(data-access): harden DOMAIN_PATTERN - restore RFC 1035 label rules, add case-insensitivity, path traversal guard, and total length cap - Restore trailing-hyphen prohibition on DNS labels ([a-z0-9](?:[a-z0-9-]*[a-z0-9])?) - Add /i flag so uppercase input (NBA.COM, nba.com/Kings) is accepted; callers must normalize to lowercase before storage - Require non-empty path segments to reject trailing slashes and double slashes - Add negative lookahead to block dot-traversal segments (/. and /..) 
- Add 2048-char total length cap in schema validator alongside the existing 253-char hostname cap - Expand test tables: uppercase, trailing-hyphen labels, trailing slash, double slash, IPv4+path, port numbers, path traversal cases --- .../models/plg-onboarding/plg-onboarding.model.js | 12 +++++++++--- .../models/plg-onboarding/plg-onboarding.schema.js | 4 +++- .../plg-onboarding/plg-onboarding.model.test.js | 14 ++++++++++++++ .../plg-onboarding/plg-onboarding.schema.test.js | 12 ++++++++++++ 4 files changed, 38 insertions(+), 4 deletions(-) diff --git a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js index 797900854..32bdced9b 100644 --- a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js +++ b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js @@ -24,9 +24,15 @@ class PlgOnboarding extends BaseModel { static IMS_ORG_ID_PATTERN = /^[a-z0-9]{24}@AdobeOrg$/i; - // Matches plain hostnames and subpath domains (e.g. nba.com, nba.com/kings). - // Rejects schemes (https://), IPv4 addresses, and query strings/fragments. - static DOMAIN_PATTERN = /^(?!\d+(\.\d+){3})[a-z0-9][a-z0-9-]*(\.[a-z0-9][a-z0-9-]*)*(\/[a-z0-9._~-]*)*$/; + // Matches plain hostnames and optional subpath (e.g. nba.com, nba.com/kings). + // Rejects: schemes (https://), bare IPv4 (127.0.0.1), ports (:8080), + // query strings, fragments, empty/trailing path segments, and dot-traversal (/./ and /../). + // Case-insensitive; callers must normalize to lowercase before storage to avoid + // duplicate records for logically identical sites (nba.com/Kings vs nba.com/kings). + // Path-qualified domains (nba.com/kings) are distinct sort-key values from the + // bare hostname; canonicalize before calling findByImsOrgIdAndDomain. + // Labels must not start or end with a hyphen (RFC 1035). 
+ static DOMAIN_PATTERN = /^(?!\d+(\.\d+){3})[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)*(\/(?!\.{1,2}(\/|$))[a-z0-9._~-]+)*$/i; static STATUSES = { PRE_ONBOARDING: 'PRE_ONBOARDING', diff --git a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js index c76d47f44..541b10fec 100644 --- a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js +++ b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js @@ -28,7 +28,9 @@ const schema = new SchemaBuilder(PlgOnboarding, PlgOnboardingCollection) type: 'string', required: true, readOnly: true, - validate: (value) => PlgOnboarding.DOMAIN_PATTERN.test(value) && value.split('/')[0].length <= 253, + validate: (value) => PlgOnboarding.DOMAIN_PATTERN.test(value) + && value.split('/')[0].length <= 253 + && value.length <= 2048, }) .addAttribute('baseURL', { type: 'string', diff --git a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js index 31445aa02..779c19444 100644 --- a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js +++ b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js @@ -92,6 +92,9 @@ describe('PlgOnboardingModel', () => { 'example.com/path/with-hyphens', 'example.com/path.with.dots', 'example.io/a/b/c', + 'NBA.COM', + 'nba.com/Kings', + 'example.com/en-US', ].forEach((value) => { it(`accepts "${value}"`, () => { expect(DOMAIN_PATTERN.test(value)).to.be.true; @@ -106,10 +109,21 @@ describe('PlgOnboardingModel', () => { ['scheme prefix http', 'http://nba.com'], ['IPv4 address', '127.0.0.1'], ['IPv4 address 8.8.8.8', '8.8.8.8'], + ['IPv4 with path', '127.0.0.1/path'], 
['query string', 'nba.com?foo=bar'], ['fragment', 'nba.com#section'], ['path with query string', 'nba.com/kings?q=1'], ['path with fragment', 'nba.com/kings#top'], + ['trailing hyphen in label', 'nba-.com'], + ['trailing hyphen in subdomain', 'foo-.nba.com'], + ['trailing slash', 'nba.com/'], + ['trailing slash after path', 'nba.com/kings/'], + ['double slash', 'nba.com//kings'], + ['port number', 'nba.com:8080'], + ['path traversal dot-dot', 'nba.com/../etc'], + ['path traversal dot', 'nba.com/./x'], + ['path traversal dot-dot at end', 'nba.com/..'], + ['path traversal dot at end', 'nba.com/.'], ].forEach(([label, value]) => { it(`rejects ${label}: "${value}"`, () => { expect(DOMAIN_PATTERN.test(value)).to.be.false; diff --git a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js index 34cbc48b0..7098a8d59 100644 --- a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js +++ b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js @@ -39,6 +39,9 @@ describe('PlgOnboarding Schema', () => { 'nba.com/kings', 'nba.com/us/kings', 'example.com/path-with-hyphens', + 'NBA.COM', + 'nba.com/Kings', + 'example.com/en-US', ].forEach((value) => { it(`accepts "${value}"`, () => { expect(domainAttr.validate(value)).to.be.true; @@ -54,6 +57,9 @@ describe('PlgOnboarding Schema', () => { ['query string', 'nba.com?q=1'], ['fragment', 'nba.com#top'], ['hostname over 253 chars', `${'a'.repeat(250)}.com`], + ['trailing hyphen in label', 'nba-.com'], + ['trailing slash', 'nba.com/kings/'], + ['path traversal', 'nba.com/../etc'], ].forEach(([label, value]) => { it(`rejects ${label}`, () => { expect(domainAttr.validate(value)).to.be.false; @@ -72,6 +78,12 @@ describe('PlgOnboarding Schema', () => { expect(hostname.length).to.equal(254); 
expect(domainAttr.validate(`${hostname}/path`)).to.be.false; }); + + it('rejects when total domain length exceeds 2048 chars', () => { + const longPath = `nba.com/${'a'.repeat(2042)}`; + expect(longPath.length).to.be.above(2048); + expect(domainAttr.validate(longPath)).to.be.false; + }); }); describe('reviews attribute', () => { From 91d43035f9749a66d5242937aa5b3aacfe12312b Mon Sep 17 00:00:00 2001 From: Kanishka Date: Thu, 14 May 2026 16:54:39 +0530 Subject: [PATCH 3/9] fix(data-access): harden PLG domain validator - enforce lowercase, tighten traversal guard, reject single-label hosts Addresses review findings on PR #1593: - Drop /i flag and add lowercase enforcement in schema validate to prevent duplicate sort-key records from mixed-case submissions (nba.com/Kings vs nba.com/kings) - Add normalizeDomain() static helper so callers have a canonical entry point - Require at least one dot in hostname, rejecting single-label names (localhost, metadata) - Tighten path traversal guard from (?!\.{1,2}(\/|$)) to (?!\.) to block any segment starting with a dot (.hidden, ..foo, ... 
in addition to ./ and ../) - Add schema-level all-numeric hostname check to reject short-form IPs (127.1, 2130706433) - Hoist 253 and 2048 limits into named static constants (MAX_HOSTNAME_LENGTH, MAX_DOMAIN_LENGTH) - Allow underscore in path segments for common patterns like /case_studies - Expand test coverage: plain-hostname 253-char boundaries, exact 2048/2049 boundary, uppercase rejection, short-form IPv4, single-label, .hidden/.foo traversal variants, IPv6, percent-encoding, punycode, normalizeDomain helper Co-Authored-By: Claude Sonnet 4.6 --- .../plg-onboarding/plg-onboarding.model.js | 33 ++++++++++++---- .../plg-onboarding/plg-onboarding.schema.js | 12 ++++-- .../plg-onboarding.model.test.js | 28 +++++++++++-- .../plg-onboarding.schema.test.js | 39 +++++++++++++++++-- 4 files changed, 95 insertions(+), 17 deletions(-) diff --git a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js index 32bdced9b..58d005b77 100644 --- a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js +++ b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js @@ -24,15 +24,32 @@ class PlgOnboarding extends BaseModel { static IMS_ORG_ID_PATTERN = /^[a-z0-9]{24}@AdobeOrg$/i; - // Matches plain hostnames and optional subpath (e.g. nba.com, nba.com/kings). - // Rejects: schemes (https://), bare IPv4 (127.0.0.1), ports (:8080), - // query strings, fragments, empty/trailing path segments, and dot-traversal (/./ and /../). - // Case-insensitive; callers must normalize to lowercase before storage to avoid - // duplicate records for logically identical sites (nba.com/Kings vs nba.com/kings). - // Path-qualified domains (nba.com/kings) are distinct sort-key values from the - // bare hostname; canonicalize before calling findByImsOrgIdAndDomain. 
+ static MAX_HOSTNAME_LENGTH = 253; // RFC 1035 DNS name limit + + // Practical cap covering common browser (2000-2083) and CDN limits. + static MAX_DOMAIN_LENGTH = 2048; + + // Matches lowercase hostnames (at least one dot required) and an optional subpath + // (e.g. nba.com, nba.com/kings, nba.com/us/kings). + // Rejects: uppercase letters (use normalizeDomain() first), schemes (https://), + // bare/short-form IPv4 (127.0.0.1; see schema validator for all-numeric hostname + // check covering short forms like 127.1), ports (:8080), single-label hostnames + // (localhost, metadata), query strings, fragments, empty/trailing path segments, + // and any path segment starting with a dot (blocks ./, ../, .hidden, ..foo, etc.). + // Path-qualified domains (nba.com/kings) are distinct sort-key values from the bare + // hostname; callers must call normalizeDomain() before findByImsOrgIdAndDomain. // Labels must not start or end with a hyphen (RFC 1035). - static DOMAIN_PATTERN = /^(?!\d+(\.\d+){3})[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)*(\/(?!\.{1,2}(\/|$))[a-z0-9._~-]+)*$/i; + // Raw Unicode / IDN must be punycode-encoded before validation (xn-- form is accepted). + // Percent-encoded path characters (%20 etc.) are not accepted; decode before validation. + // Underscore is allowed in path segments but not in hostname labels. + static DOMAIN_PATTERN = /^(?!\d+(\.\d+){3})[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)+(\/(?!\.)[a-z0-9._~_-]+)*$/; + + // Returns the canonical form of a domain value: lowercased. + // Must be called on any user-supplied value before passing it to the domain + // attribute validator or to findByImsOrgIdAndDomain to prevent duplicate records. + static normalizeDomain(value) { + return typeof value === 'string' ? 
value.toLowerCase() : value; + } static STATUSES = { PRE_ONBOARDING: 'PRE_ONBOARDING', diff --git a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js index 541b10fec..d598efd4e 100644 --- a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js +++ b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js @@ -28,9 +28,15 @@ const schema = new SchemaBuilder(PlgOnboarding, PlgOnboardingCollection) type: 'string', required: true, readOnly: true, - validate: (value) => PlgOnboarding.DOMAIN_PATTERN.test(value) - && value.split('/')[0].length <= 253 - && value.length <= 2048, + validate: (value) => { + if (typeof value !== 'string' || value !== value.toLowerCase()) return false; + const hostname = value.split('/')[0]; + // Reject all-numeric hostname forms (short-form IPs: 127.1, 2130706433). + if (/^[\d.]+$/.test(hostname)) return false; + return PlgOnboarding.DOMAIN_PATTERN.test(value) + && hostname.length <= PlgOnboarding.MAX_HOSTNAME_LENGTH + && value.length <= PlgOnboarding.MAX_DOMAIN_LENGTH; + }, }) .addAttribute('baseURL', { type: 'string', diff --git a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js index 779c19444..5a84af543 100644 --- a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js +++ b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js @@ -92,9 +92,9 @@ describe('PlgOnboardingModel', () => { 'example.com/path/with-hyphens', 'example.com/path.with.dots', 'example.io/a/b/c', - 'NBA.COM', - 'nba.com/Kings', - 'example.com/en-US', + 'example.com/en-us', + 'example.com/case_studies', + 'xn--nba-6na.com', ].forEach((value) => { 
it(`accepts "${value}"`, () => { expect(DOMAIN_PATTERN.test(value)).to.be.true; @@ -124,6 +124,17 @@ describe('PlgOnboardingModel', () => { ['path traversal dot', 'nba.com/./x'], ['path traversal dot-dot at end', 'nba.com/..'], ['path traversal dot at end', 'nba.com/.'], + ['leading dot in path segment', 'nba.com/.hidden'], + ['leading double-dot prefix in segment', 'nba.com/..foo'], + ['trailing dot fqdn', 'nba.com.'], + ['single-label hostname', 'localhost'], + ['single-label intranet hostname', 'intranet'], + ['uppercase hostname', 'NBA.COM'], + ['uppercase path segment', 'nba.com/Kings'], + ['uppercase locale path', 'example.com/en-US'], + ['IPv6 bracketed', '[::1]'], + ['IPv6 unbracketed', '2001:db8::1'], + ['percent-encoded path', 'nba.com/path%20with%20space'], ].forEach(([label, value]) => { it(`rejects ${label}: "${value}"`, () => { expect(DOMAIN_PATTERN.test(value)).to.be.false; @@ -132,6 +143,17 @@ describe('PlgOnboardingModel', () => { }); }); + describe('normalizeDomain', () => { + it('lowercases a string value', () => { + expect(PlgOnboarding.normalizeDomain('NBA.COM/Kings')).to.equal('nba.com/kings'); + }); + + it('returns non-string values unchanged', () => { + expect(PlgOnboarding.normalizeDomain(null)).to.be.null; + expect(PlgOnboarding.normalizeDomain(undefined)).to.be.undefined; + }); + }); + describe('REVIEW_DECISIONS', () => { it('defines all expected review decisions', () => { expect(PlgOnboarding.REVIEW_DECISIONS).to.deep.equal({ diff --git a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js index 7098a8d59..7afb4105c 100644 --- a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js +++ b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js @@ -39,9 +39,9 @@ describe('PlgOnboarding Schema', () => { 
'nba.com/kings', 'nba.com/us/kings', 'example.com/path-with-hyphens', - 'NBA.COM', - 'nba.com/Kings', - 'example.com/en-US', + 'example.com/en-us', + 'example.com/case_studies', + 'xn--nba-6na.com', ].forEach((value) => { it(`accepts "${value}"`, () => { expect(domainAttr.validate(value)).to.be.true; @@ -54,12 +54,21 @@ describe('PlgOnboarding Schema', () => { ['empty string', ''], ['scheme prefix', 'https://nba.com'], ['IPv4 address', '127.0.0.1'], + ['short-form IPv4', '127.1'], + ['decimal IPv4', '2130706433'], ['query string', 'nba.com?q=1'], ['fragment', 'nba.com#top'], ['hostname over 253 chars', `${'a'.repeat(250)}.com`], ['trailing hyphen in label', 'nba-.com'], ['trailing slash', 'nba.com/kings/'], ['path traversal', 'nba.com/../etc'], + ['leading dot in path segment', 'nba.com/.hidden'], + ['leading double-dot prefix in segment', 'nba.com/..foo'], + ['trailing dot fqdn', 'nba.com.'], + ['single-label hostname', 'localhost'], + ['uppercase hostname', 'NBA.COM'], + ['uppercase path segment', 'nba.com/Kings'], + ['uppercase locale path', 'example.com/en-US'], ].forEach(([label, value]) => { it(`rejects ${label}`, () => { expect(domainAttr.validate(value)).to.be.false; @@ -67,6 +76,18 @@ describe('PlgOnboarding Schema', () => { }); }); + it('allows a plain hostname of exactly 253 chars', () => { + const hostname = `${'a'.repeat(249)}.com`; + expect(hostname.length).to.equal(253); + expect(domainAttr.validate(hostname)).to.be.true; + }); + + it('rejects a plain hostname exceeding 253 chars', () => { + const hostname = `${'a'.repeat(250)}.com`; + expect(hostname.length).to.equal(254); + expect(domainAttr.validate(hostname)).to.be.false; + }); + it('allows a subpath domain whose hostname is exactly 253 chars', () => { const hostname = `${'a'.repeat(249)}.com`; expect(hostname.length).to.equal(253); @@ -79,6 +100,18 @@ describe('PlgOnboarding Schema', () => { expect(domainAttr.validate(`${hostname}/path`)).to.be.false; }); + it('accepts a domain of exactly 2048 
chars', () => { + const longPath = `nba.com/${'a'.repeat(2040)}`; + expect(longPath.length).to.equal(2048); + expect(domainAttr.validate(longPath)).to.be.true; + }); + + it('rejects a domain of exactly 2049 chars', () => { + const longPath = `nba.com/${'a'.repeat(2041)}`; + expect(longPath.length).to.equal(2049); + expect(domainAttr.validate(longPath)).to.be.false; + }); + it('rejects when total domain length exceeds 2048 chars', () => { const longPath = `nba.com/${'a'.repeat(2042)}`; expect(longPath.length).to.be.above(2048); From f38a451a08069d0493b8a8d1517f31db8f7dc3bf Mon Sep 17 00:00:00 2001 From: Kanishka Date: Fri, 15 May 2026 18:12:27 +0530 Subject: [PATCH 4/9] fix(data-access): fix lint errors in plg-onboarding schema validate function Co-Authored-By: Claude Sonnet 4.6 --- .../src/models/plg-onboarding/plg-onboarding.schema.js | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js index d598efd4e..39e9dfad5 100644 --- a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js +++ b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js @@ -29,10 +29,14 @@ const schema = new SchemaBuilder(PlgOnboarding, PlgOnboardingCollection) required: true, readOnly: true, validate: (value) => { - if (typeof value !== 'string' || value !== value.toLowerCase()) return false; + if (typeof value !== 'string' || value !== value.toLowerCase()) { + return false; + } const hostname = value.split('/')[0]; // Reject all-numeric hostname forms (short-form IPs: 127.1, 2130706433). 
- if (/^[\d.]+$/.test(hostname)) return false; + if (/^[\d.]+$/.test(hostname)) { + return false; + } return PlgOnboarding.DOMAIN_PATTERN.test(value) && hostname.length <= PlgOnboarding.MAX_HOSTNAME_LENGTH && value.length <= PlgOnboarding.MAX_DOMAIN_LENGTH; From 9ef6e93de6cac958be01161b28a6d492fec3253c Mon Sep 17 00:00:00 2001 From: Kanishka Date: Mon, 18 May 2026 18:43:17 +0530 Subject: [PATCH 5/9] fix(data-access): harden PLG domain validator - reject dot-run path segments, control chars, and fix duplicate regex char Trailing/embedded dot segments (foo., foo.., foo../bar) were accepted by the leading-dot lookahead but can resolve as traversal forms on some canonicalizers. Control characters (null bytes, etc.) were only caught late by the regex, leaving them readable between earlier checks. Removed duplicate underscore from DOMAIN_PATTERN character class. Added tests for underscore allowed in path but not hostname, dot-run segments, and non-printable input. Co-Authored-By: Claude Sonnet 4.6 --- .../src/models/plg-onboarding/plg-onboarding.model.js | 2 +- .../src/models/plg-onboarding/plg-onboarding.schema.js | 9 ++++++++- .../models/plg-onboarding/plg-onboarding.schema.test.js | 9 +++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js index 58d005b77..3c240e109 100644 --- a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js +++ b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js @@ -42,7 +42,7 @@ class PlgOnboarding extends BaseModel { // Raw Unicode / IDN must be punycode-encoded before validation (xn-- form is accepted). // Percent-encoded path characters (%20 etc.) are not accepted; decode before validation. // Underscore is allowed in path segments but not in hostname labels. 
- static DOMAIN_PATTERN = /^(?!\d+(\.\d+){3})[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)+(\/(?!\.)[a-z0-9._~_-]+)*$/; + static DOMAIN_PATTERN = /^(?!\d+(\.\d+){3})[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)+(\/(?!\.)[a-z0-9._~-]+)*$/; // Returns the canonical form of a domain value: lowercased. // Must be called on any user-supplied value before passing it to the domain diff --git a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js index 39e9dfad5..a824ab9d7 100644 --- a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js +++ b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js @@ -32,11 +32,18 @@ const schema = new SchemaBuilder(PlgOnboarding, PlgOnboardingCollection) if (typeof value !== 'string' || value !== value.toLowerCase()) { return false; } - const hostname = value.split('/')[0]; + if (/[^\x21-\x7e]/.test(value)) { + return false; + } + const [hostname, ...pathParts] = value.split('/'); // Reject all-numeric hostname forms (short-form IPs: 127.1, 2130706433). if (/^[\d.]+$/.test(hostname)) { return false; } + // Reject path segments that are purely dots or end with a dot (e.g. foo., foo.., foo../bar). 
+ if (pathParts.some((seg) => /\.$/.test(seg))) { + return false; + } return PlgOnboarding.DOMAIN_PATTERN.test(value) && hostname.length <= PlgOnboarding.MAX_HOSTNAME_LENGTH && value.length <= PlgOnboarding.MAX_DOMAIN_LENGTH; diff --git a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js index 7afb4105c..41c69ae87 100644 --- a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js +++ b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js @@ -42,6 +42,8 @@ describe('PlgOnboarding Schema', () => { 'example.com/en-us', 'example.com/case_studies', 'xn--nba-6na.com', + 'example.com/foo_bar', + 'example.com/us/foo_bar', ].forEach((value) => { it(`accepts "${value}"`, () => { expect(domainAttr.validate(value)).to.be.true; @@ -69,6 +71,13 @@ describe('PlgOnboarding Schema', () => { ['uppercase hostname', 'NBA.COM'], ['uppercase path segment', 'nba.com/Kings'], ['uppercase locale path', 'example.com/en-US'], + ['underscore in hostname label', 'foo_bar.com'], + ['underscore in subdomain label', 'foo_bar.example.com'], + ['trailing dot in path segment', 'nba.com/foo.'], + ['trailing double-dot in path segment', 'nba.com/foo..'], + ['dot-dot mid-path segment', 'nba.com/foo../bar'], + ['null byte in domain', 'nba.com\x00/evil'], + ['control character in path', 'nba.com/ki\x01ngs'], ].forEach(([label, value]) => { it(`rejects ${label}`, () => { expect(domainAttr.validate(value)).to.be.false; From 16b31b6b5cf26838465f0d97a3cb2adfa61e8a02 Mon Sep 17 00:00:00 2001 From: Kanishka Date: Fri, 22 May 2026 00:32:01 +0530 Subject: [PATCH 6/9] feat(data-access): expose PlgOnboarding.isValidDomain for shared use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lift the schema's inline domain validator into a 
static isValidDomain(value) on the PlgOnboarding model so the data-access schema, the api-service controller (PR #2363), and future consumers share one complete validator instead of importing the incomplete DOMAIN_PATTERN regex. Also tighten the validator to reject consecutive dots within a path segment (e.g. nba.com/v1..0, nba.com/foo..bar) — DOMAIN_PATTERN's negative lookahead only catches segments starting with a dot. Co-Authored-By: Claude Sonnet 4.6 --- .../plg-onboarding/plg-onboarding.model.js | 29 +++++ .../plg-onboarding/plg-onboarding.schema.js | 21 +--- .../plg-onboarding.model.test.js | 103 ++++++++++++++++++ .../plg-onboarding.schema.test.js | 2 + 4 files changed, 135 insertions(+), 20 deletions(-) diff --git a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js index 3c240e109..2a71f8dae 100644 --- a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js +++ b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js @@ -51,6 +51,35 @@ class PlgOnboarding extends BaseModel { return typeof value === 'string' ? value.toLowerCase() : value; } + // Complete domain validator used by the schema and intended for external consumers. + // Layers a typeof guard, case-canonical check, control-character rejection, + // all-numeric-hostname rejection (covers short-form IPs like 127.1 and 2130706433), + // trailing-dot path-segment rejection, DOMAIN_PATTERN test, and length caps. + // Note: DOMAIN_PATTERN alone is not sufficient — always prefer this method. + // This is a syntactic / data-integrity validator, not an SSRF gate. Callers that + // make outbound fetches must layer their own private-IP and DNS-resolution checks. 
+ static isValidDomain(value) { + if (typeof value !== 'string' || value !== value.toLowerCase()) { + return false; + } + if (/[^\x21-\x7e]/.test(value)) { + return false; + } + const [hostname, ...pathParts] = value.split('/'); + if (/^[\d.]+$/.test(hostname)) { + return false; + } + // Reject path segments that are purely dots, end with a dot, or contain + // consecutive dots (foo., foo.., foo../bar, v1..0). DOMAIN_PATTERN's + // negative lookahead only blocks segments STARTING with a dot. + if (pathParts.some((seg) => /\.$/.test(seg) || seg.includes('..'))) { + return false; + } + return PlgOnboarding.DOMAIN_PATTERN.test(value) + && hostname.length <= PlgOnboarding.MAX_HOSTNAME_LENGTH + && value.length <= PlgOnboarding.MAX_DOMAIN_LENGTH; + } + static STATUSES = { PRE_ONBOARDING: 'PRE_ONBOARDING', IN_PROGRESS: 'IN_PROGRESS', diff --git a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js index a824ab9d7..ec327015d 100644 --- a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js +++ b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.schema.js @@ -28,26 +28,7 @@ const schema = new SchemaBuilder(PlgOnboarding, PlgOnboardingCollection) type: 'string', required: true, readOnly: true, - validate: (value) => { - if (typeof value !== 'string' || value !== value.toLowerCase()) { - return false; - } - if (/[^\x21-\x7e]/.test(value)) { - return false; - } - const [hostname, ...pathParts] = value.split('/'); - // Reject all-numeric hostname forms (short-form IPs: 127.1, 2130706433). - if (/^[\d.]+$/.test(hostname)) { - return false; - } - // Reject path segments that are purely dots or end with a dot (e.g. foo., foo.., foo../bar). 
- if (pathParts.some((seg) => /\.$/.test(seg))) { - return false; - } - return PlgOnboarding.DOMAIN_PATTERN.test(value) - && hostname.length <= PlgOnboarding.MAX_HOSTNAME_LENGTH - && value.length <= PlgOnboarding.MAX_DOMAIN_LENGTH; - }, + validate: (value) => PlgOnboarding.isValidDomain(value), }) .addAttribute('baseURL', { type: 'string', diff --git a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js index 5a84af543..ca8a4256a 100644 --- a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js +++ b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js @@ -143,6 +143,109 @@ describe('PlgOnboardingModel', () => { }); }); + describe('isValidDomain', () => { + describe('valid values', () => { + [ + 'nba.com', + 'www.nba.com', + 'nba.com/kings', + 'nba.com/us/kings', + 'example.com/en-us', + 'example.com/case_studies', + 'xn--nba-6na.com', + ].forEach((value) => { + it(`accepts "${value}"`, () => { + expect(PlgOnboarding.isValidDomain(value)).to.be.true; + }); + }); + }); + + describe('invalid values', () => { + [ + ['empty string', ''], + ['scheme prefix', 'https://nba.com'], + ['IPv4 address', '127.0.0.1'], + ['short-form IPv4', '127.1'], + ['decimal IPv4', '2130706433'], + ['uppercase hostname', 'NBA.COM'], + ['uppercase path segment', 'nba.com/Kings'], + ['null byte in domain', 'nba.com\x00/evil'], + ['control character in path', 'nba.com/ki\x01ngs'], + ['trailing dot path segment', 'nba.com/foo.'], + ['trailing dot fqdn', 'nba.com.'], + ['consecutive dots mid path segment', 'nba.com/v1..0'], + ['consecutive dots mid path segment 2', 'nba.com/foo..bar'], + ].forEach(([label, value]) => { + it(`rejects ${label}`, () => { + expect(PlgOnboarding.isValidDomain(value)).to.be.false; + }); + }); + }); + + describe('non-string inputs', () => 
{ + [ + ['null', null], + ['undefined', undefined], + ['number', 123], + ['boolean', true], + ['object', { domain: 'nba.com' }], + ['array', ['nba.com']], + ].forEach(([label, value]) => { + it(`rejects ${label}`, () => { + expect(PlgOnboarding.isValidDomain(value)).to.be.false; + }); + }); + }); + + describe('length boundaries', () => { + it('accepts a hostname of exactly 253 chars', () => { + const hostname = `${'a'.repeat(249)}.com`; + expect(hostname.length).to.equal(253); + expect(PlgOnboarding.isValidDomain(hostname)).to.be.true; + }); + + it('rejects a hostname exceeding 253 chars', () => { + const hostname = `${'a'.repeat(250)}.com`; + expect(hostname.length).to.equal(254); + expect(PlgOnboarding.isValidDomain(hostname)).to.be.false; + }); + + it('accepts a domain of exactly 2048 chars', () => { + const value = `nba.com/${'a'.repeat(2040)}`; + expect(value.length).to.equal(2048); + expect(PlgOnboarding.isValidDomain(value)).to.be.true; + }); + + it('rejects a domain exceeding 2048 chars', () => { + const value = `nba.com/${'a'.repeat(2041)}`; + expect(value.length).to.equal(2049); + expect(PlgOnboarding.isValidDomain(value)).to.be.false; + }); + }); + + describe('regression: DOMAIN_PATTERN alone is insufficient', () => { + // Pinning tests: these inputs pass the bare regex but are correctly rejected + // by the full validator. They exist to prevent regressions if a future caller + // is tempted to import DOMAIN_PATTERN directly instead of isValidDomain. 
+ it('DOMAIN_PATTERN accepts short-form IPv4 "127.1" but isValidDomain rejects it', () => { + expect(PlgOnboarding.DOMAIN_PATTERN.test('127.1')).to.be.true; + expect(PlgOnboarding.isValidDomain('127.1')).to.be.false; + }); + + it('DOMAIN_PATTERN accepts trailing-dot path segment but isValidDomain rejects it', () => { + expect(PlgOnboarding.DOMAIN_PATTERN.test('nba.com/foo.')).to.be.true; + expect(PlgOnboarding.isValidDomain('nba.com/foo.')).to.be.false; + }); + + it('DOMAIN_PATTERN has no length cap but isValidDomain enforces 2048', () => { + const tooLong = `nba.com/${'a'.repeat(2041)}`; + expect(tooLong.length).to.equal(2049); + expect(PlgOnboarding.DOMAIN_PATTERN.test(tooLong)).to.be.true; + expect(PlgOnboarding.isValidDomain(tooLong)).to.be.false; + }); + }); + }); + describe('normalizeDomain', () => { it('lowercases a string value', () => { expect(PlgOnboarding.normalizeDomain('NBA.COM/Kings')).to.equal('nba.com/kings'); diff --git a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js index 41c69ae87..4e0354aaf 100644 --- a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js +++ b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js @@ -76,6 +76,8 @@ describe('PlgOnboarding Schema', () => { ['trailing dot in path segment', 'nba.com/foo.'], ['trailing double-dot in path segment', 'nba.com/foo..'], ['dot-dot mid-path segment', 'nba.com/foo../bar'], + ['consecutive dots mid path segment', 'nba.com/v1..0'], + ['consecutive dots in middle of segment', 'nba.com/foo..bar'], ['null byte in domain', 'nba.com\x00/evil'], ['control character in path', 'nba.com/ki\x01ngs'], ].forEach(([label, value]) => { From b0e0bb6c95414e10b4469f5b31560b23a862d448 Mon Sep 17 00:00:00 2001 From: Kanishka Date: Fri, 22 May 2026 00:43:21 +0530 Subject: 
[PATCH 7/9] fix(data-access): anchor PLG IPv4 lookahead to whole-hostname MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous DOMAIN_PATTERN rejected any hostname starting with a dotted-quad, which incorrectly blocked legitimate hosts like 1.2.3.4.example.com and nip.io-style wildcard DNS (192.168.1.1.nip.io). Anchor the IPv4 negative lookahead to slash-or-end-of-string so bare IPv4 (127.0.0.1, 127.0.0.1/path) is still rejected while hosts whose first labels happen to be numeric pass through. Short-form IPs like 127.1 continue to be caught by isValidDomain's all-numeric-hostname check. Also document that lowercase canonicalization (host AND path) is intentional — the domain field is part of the dedup sort key on findByImsOrgIdAndDomain, so mixed-case paths would create distinct onboarding rows for the same site. Co-Authored-By: Claude Sonnet 4.6 --- .../plg-onboarding/plg-onboarding.model.js | 17 ++++++++++++----- .../plg-onboarding/plg-onboarding.model.test.js | 8 ++++++++ .../plg-onboarding.schema.test.js | 2 ++ 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js index 2a71f8dae..114f88e21 100644 --- a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js +++ b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js @@ -32,17 +32,20 @@ class PlgOnboarding extends BaseModel { // Matches lowercase hostnames (at least one dot required) and an optional subpath // (e.g. nba.com, nba.com/kings, nba.com/us/kings). 
// Rejects: uppercase letters (use normalizeDomain() first), schemes (https://), - // bare/short-form IPv4 (127.0.0.1; see schema validator for all-numeric hostname - // check covering short forms like 127.1), ports (:8080), single-label hostnames - // (localhost, metadata), query strings, fragments, empty/trailing path segments, - // and any path segment starting with a dot (blocks ./, ../, .hidden, ..foo, etc.). + // bare IPv4 hostnames (127.0.0.1, 127.0.0.1/path — the negative lookahead is + // anchored to slash/end-of-string so legitimate hosts like 1.2.3.4.example.com + // and nip.io-style wildcard DNS (192.168.1.1.nip.io) are still accepted; see + // schema validator for all-numeric hostname check covering short forms like + // 127.1), ports (:8080), single-label hostnames (localhost, metadata), query + // strings, fragments, empty/trailing path segments, and any path segment + // starting with a dot (blocks ./, ../, .hidden, ..foo, etc.). // Path-qualified domains (nba.com/kings) are distinct sort-key values from the bare // hostname; callers must call normalizeDomain() before findByImsOrgIdAndDomain. // Labels must not start or end with a hyphen (RFC 1035). // Raw Unicode / IDN must be punycode-encoded before validation (xn-- form is accepted). // Percent-encoded path characters (%20 etc.) are not accepted; decode before validation. // Underscore is allowed in path segments but not in hostname labels. - static DOMAIN_PATTERN = /^(?!\d+(\.\d+){3})[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)+(\/(?!\.)[a-z0-9._~-]+)*$/; + static DOMAIN_PATTERN = /^(?!\d+(\.\d+){3}(?:\/|$))[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)+(\/(?!\.)[a-z0-9._~-]+)*$/; // Returns the canonical form of a domain value: lowercased. 
// Must be called on any user-supplied value before passing it to the domain @@ -56,6 +59,10 @@ class PlgOnboarding extends BaseModel { // all-numeric-hostname rejection (covers short-form IPs like 127.1 and 2130706433), // trailing-dot path-segment rejection, DOMAIN_PATTERN test, and length caps. // Note: DOMAIN_PATTERN alone is not sufficient — always prefer this method. + // Lowercase-only (host AND path) is intentional canonicalization, not a bug. The + // domain field is part of the dedup sort key on findByImsOrgIdAndDomain; allowing + // mixed-case paths would let `nba.com/Kings` and `nba.com/kings` create distinct + // onboarding rows for the same site. Callers should call normalizeDomain() first. // This is a syntactic / data-integrity validator, not an SSRF gate. Callers that // make outbound fetches must layer their own private-IP and DNS-resolution checks. static isValidDomain(value) { diff --git a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js index ca8a4256a..f18212a66 100644 --- a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js +++ b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js @@ -95,6 +95,10 @@ describe('PlgOnboardingModel', () => { 'example.com/en-us', 'example.com/case_studies', 'xn--nba-6na.com', + // IPv4 lookahead is anchored to slash/end-of-string; hostnames that merely + // begin with a dotted-quad remain valid (covers nip.io-style wildcard DNS). 
+ '1.2.3.4.example.com', + '192.168.1.1.nip.io', ].forEach((value) => { it(`accepts "${value}"`, () => { expect(DOMAIN_PATTERN.test(value)).to.be.true; @@ -153,6 +157,10 @@ describe('PlgOnboardingModel', () => { 'example.com/en-us', 'example.com/case_studies', 'xn--nba-6na.com', + // Hostnames that start with a dotted-quad but continue with non-IP labels: + // the IPv4 lookahead is anchored to slash/end-of-string so these are valid. + '1.2.3.4.example.com', + '192.168.1.1.nip.io', ].forEach((value) => { it(`accepts "${value}"`, () => { expect(PlgOnboarding.isValidDomain(value)).to.be.true; diff --git a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js index 4e0354aaf..e21943b30 100644 --- a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js +++ b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js @@ -44,6 +44,8 @@ describe('PlgOnboarding Schema', () => { 'xn--nba-6na.com', 'example.com/foo_bar', 'example.com/us/foo_bar', + '1.2.3.4.example.com', + '192.168.1.1.nip.io', ].forEach((value) => { it(`accepts "${value}"`, () => { expect(domainAttr.validate(value)).to.be.true; From 9b31b47936defd6128b66b4843a54a624a083355 Mon Sep 17 00:00:00 2001 From: Kanishka Date: Fri, 22 May 2026 12:03:58 +0530 Subject: [PATCH 8/9] fix(data-access): reject IP-literal hostnames via alphabetic TLD requirement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous DOMAIN_PATTERN relied on an all-numeric-hostname check and a dotted-quad lookahead to block IP literals. Both miss hex-encoded IPv4 forms (0x7f.0.0.1, 0xa9.254.169.254 → AWS IMDS), which the WHATWG URL parser canonicalizes to private/loopback IPs at fetch time — bypassing any raw-string SSRF denylist downstream. 
Replace the lookahead with a structural alphabetic/punycode TLD requirement: the final label must match [a-z]{2,} or xn--*. This rejects every IP-literal form (dotted-quad, short-form, decimal, hex, octal) and foo.1-style typos in one rule, and keeps legitimate hostnames including nip.io-style wildcard DNS (192.168.1.1.nip.io) and punycode (xn--nba-6na.com). The schema-level all-numeric-hostname check stays as defense-in-depth. Co-Authored-By: Claude Sonnet 4.6 --- .../plg-onboarding/plg-onboarding.model.js | 21 ++++++---- .../plg-onboarding.model.test.js | 41 ++++++++++++++++--- .../plg-onboarding.schema.test.js | 4 ++ 3 files changed, 52 insertions(+), 14 deletions(-) diff --git a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js index 114f88e21..06c81f6ca 100644 --- a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js +++ b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js @@ -31,21 +31,23 @@ class PlgOnboarding extends BaseModel { // Matches lowercase hostnames (at least one dot required) and an optional subpath // (e.g. nba.com, nba.com/kings, nba.com/us/kings). + // The final label (TLD) must be alphabetic (>= 2 chars) or punycode (xn--*). This + // structurally rejects every IP-literal form: dotted-quad (127.0.0.1), short-form + // (127.1), decimal (2130706433), hex (0x7f.0.0.1, 0xa9.254.169.254 → AWS IMDS), + // and octal (0177.0.0.1) — and also blocks foo.1-style typos. WHATWG URL would + // otherwise canonicalize hex/decimal IPs to their dotted-quad form, bypassing + // denylist-based SSRF gates downstream. 
// Rejects: uppercase letters (use normalizeDomain() first), schemes (https://), - // bare IPv4 hostnames (127.0.0.1, 127.0.0.1/path — the negative lookahead is - // anchored to slash/end-of-string so legitimate hosts like 1.2.3.4.example.com - // and nip.io-style wildcard DNS (192.168.1.1.nip.io) are still accepted; see - // schema validator for all-numeric hostname check covering short forms like - // 127.1), ports (:8080), single-label hostnames (localhost, metadata), query - // strings, fragments, empty/trailing path segments, and any path segment - // starting with a dot (blocks ./, ../, .hidden, ..foo, etc.). + // ports (:8080), single-label hostnames (localhost, metadata), query strings, + // fragments, empty/trailing path segments, and any path segment starting with + // a dot (blocks ./, ../, .hidden, ..foo, etc.). // Path-qualified domains (nba.com/kings) are distinct sort-key values from the bare // hostname; callers must call normalizeDomain() before findByImsOrgIdAndDomain. // Labels must not start or end with a hyphen (RFC 1035). // Raw Unicode / IDN must be punycode-encoded before validation (xn-- form is accepted). // Percent-encoded path characters (%20 etc.) are not accepted; decode before validation. // Underscore is allowed in path segments but not in hostname labels. - static DOMAIN_PATTERN = /^(?!\d+(\.\d+){3}(?:\/|$))[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)+(\/(?!\.)[a-z0-9._~-]+)*$/; + static DOMAIN_PATTERN = /^[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(?:\.[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)*\.(?:[a-z]{2,}|xn--[a-z0-9-]+)(\/(?!\.)[a-z0-9._~-]+)*$/; // Returns the canonical form of a domain value: lowercased. // Must be called on any user-supplied value before passing it to the domain @@ -56,7 +58,8 @@ class PlgOnboarding extends BaseModel { // Complete domain validator used by the schema and intended for external consumers. 
// Layers a typeof guard, case-canonical check, control-character rejection, - // all-numeric-hostname rejection (covers short-form IPs like 127.1 and 2130706433), + // all-numeric-hostname rejection (defense-in-depth; DOMAIN_PATTERN's alphabetic-TLD + // requirement already rejects dotted-quad, short-form, decimal, hex, and octal IPs), // trailing-dot path-segment rejection, DOMAIN_PATTERN test, and length caps. // Note: DOMAIN_PATTERN alone is not sufficient — always prefer this method. // Lowercase-only (host AND path) is intentional canonicalization, not a bug. The diff --git a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js index f18212a66..e44101d65 100644 --- a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js +++ b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js @@ -139,6 +139,12 @@ describe('PlgOnboardingModel', () => { ['IPv6 bracketed', '[::1]'], ['IPv6 unbracketed', '2001:db8::1'], ['percent-encoded path', 'nba.com/path%20with%20space'], + // IP-literal forms rejected via the alphabetic/punycode TLD requirement. + ['hex IPv4', '0x7f.0.0.1'], + ['hex IPv4 (IMDS)', '0xa9.254.169.254'], + ['octal IPv4', '0177.0.0.1'], + ['short-form IPv4', '127.1'], + ['numeric TLD', 'foo.1'], ].forEach(([label, value]) => { it(`rejects ${label}: "${value}"`, () => { expect(DOMAIN_PATTERN.test(value)).to.be.false; @@ -183,6 +189,13 @@ describe('PlgOnboardingModel', () => { ['trailing dot fqdn', 'nba.com.'], ['consecutive dots mid path segment', 'nba.com/v1..0'], ['consecutive dots mid path segment 2', 'nba.com/foo..bar'], + // Hex-encoded IPv4 literals (WHATWG URL canonicalizes these to dotted-quad, + // bypassing denylist-based SSRF gates that match raw strings). 
+ ['hex IPv4', '0x7f.0.0.1'], + ['hex IPv4 (IMDS)', '0xa9.254.169.254'], + ['hex IPv4 all hex', '0xa9.0xfe.0xa9.0xfe'], + ['octal IPv4', '0177.0.0.1'], + ['numeric TLD', 'foo.1'], ].forEach(([label, value]) => { it(`rejects ${label}`, () => { expect(PlgOnboarding.isValidDomain(value)).to.be.false; @@ -235,16 +248,16 @@ describe('PlgOnboardingModel', () => { // Pinning tests: these inputs pass the bare regex but are correctly rejected // by the full validator. They exist to prevent regressions if a future caller // is tempted to import DOMAIN_PATTERN directly instead of isValidDomain. - it('DOMAIN_PATTERN accepts short-form IPv4 "127.1" but isValidDomain rejects it', () => { - expect(PlgOnboarding.DOMAIN_PATTERN.test('127.1')).to.be.true; - expect(PlgOnboarding.isValidDomain('127.1')).to.be.false; - }); - it('DOMAIN_PATTERN accepts trailing-dot path segment but isValidDomain rejects it', () => { expect(PlgOnboarding.DOMAIN_PATTERN.test('nba.com/foo.')).to.be.true; expect(PlgOnboarding.isValidDomain('nba.com/foo.')).to.be.false; }); + it('DOMAIN_PATTERN accepts consecutive-dot path segment but isValidDomain rejects it', () => { + expect(PlgOnboarding.DOMAIN_PATTERN.test('nba.com/v1..0')).to.be.true; + expect(PlgOnboarding.isValidDomain('nba.com/v1..0')).to.be.false; + }); + it('DOMAIN_PATTERN has no length cap but isValidDomain enforces 2048', () => { const tooLong = `nba.com/${'a'.repeat(2041)}`; expect(tooLong.length).to.equal(2049); @@ -252,6 +265,24 @@ describe('PlgOnboardingModel', () => { expect(PlgOnboarding.isValidDomain(tooLong)).to.be.false; }); }); + + describe('SSRF defense: IP-literal hostnames rejected via TLD requirement', () => { + // These would canonicalize to private/loopback IPs via WHATWG URL parsing + // (new URL('https://0xa9.254.169.254').hostname → '169.254.169.254'). + // The alphabetic/punycode TLD requirement in DOMAIN_PATTERN rejects them at + // the structural level so downstream raw-string denylists cannot be bypassed. 
+ [ + ['hex IPv4 loopback', '0x7f.0.0.1', '127.0.0.1'], + ['hex IPv4 IMDS', '0xa9.254.169.254', '169.254.169.254'], + ['hex IPv4 all hex', '0xa9.0xfe.0xa9.0xfe', '169.254.169.254'], + ['hex IPv4 RFC1918', '0xa.0.0.1', '10.0.0.1'], + ['octal IPv4', '0177.0.0.1', '127.0.0.1'], + ].forEach(([label, input]) => { + it(`isValidDomain rejects ${label} (${input})`, () => { + expect(PlgOnboarding.isValidDomain(input)).to.be.false; + }); + }); + }); }); describe('normalizeDomain', () => { diff --git a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js index e21943b30..eb7e2bdee 100644 --- a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js +++ b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.schema.test.js @@ -82,6 +82,10 @@ describe('PlgOnboarding Schema', () => { ['consecutive dots in middle of segment', 'nba.com/foo..bar'], ['null byte in domain', 'nba.com\x00/evil'], ['control character in path', 'nba.com/ki\x01ngs'], + ['hex IPv4 loopback', '0x7f.0.0.1'], + ['hex IPv4 IMDS', '0xa9.254.169.254'], + ['octal IPv4', '0177.0.0.1'], + ['numeric TLD', 'foo.1'], ].forEach(([label, value]) => { it(`rejects ${label}`, () => { expect(domainAttr.validate(value)).to.be.false; From 69408dac8730164d78637866221e1b717b0d2603 Mon Sep 17 00:00:00 2001 From: Kanishka Date: Fri, 22 May 2026 13:27:08 +0530 Subject: [PATCH 9/9] chore(data-access): tighten PLG validator JSDoc + length-cap ordering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Polish from multi-agent review. No behavior change for legitimate inputs; pathological-length inputs now rejected in O(1) before the regex scan. - Reorder isValidDomain to run length caps BEFORE DOMAIN_PATTERN.test(). 
The regex itself is linear (no overlapping quantifiers) and not a ReDoS vector, but isValidDomain is exported for external consumers that may not bound input size upstream. Cheap O(1) bail-out beats multi-MB regex scan on adversarial input. - Add prominent "do NOT use DOMAIN_PATTERN directly" warning above the pattern declaration so external consumers see it via autocomplete, not buried inside isValidDomain's doc. - Clarify normalizeDomain JSDoc: non-string inputs are returned unchanged (null/undefined/number/object), so callers MUST also run isValidDomain before using the result as a sort key. - Correct MAX_DOMAIN_LENGTH rationale: the cap is for storage and sort-key index depth, not for any specific browser/URL-bar limit. - Fix stale "IPv4 lookahead anchored to slash/end-of-string" test comments — the current regex has no IPv4 lookahead; IP literals are blocked by the alphabetic/punycode TLD requirement. Co-Authored-By: Claude Sonnet 4.6 --- .../plg-onboarding/plg-onboarding.model.js | 31 +++++++++++++++---- .../plg-onboarding.model.test.js | 9 ++++-- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js index 06c81f6ca..610609811 100644 --- a/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js +++ b/packages/spacecat-shared-data-access/src/models/plg-onboarding/plg-onboarding.model.js @@ -26,9 +26,17 @@ class PlgOnboarding extends BaseModel { static MAX_HOSTNAME_LENGTH = 253; // RFC 1035 DNS name limit - // Practical cap covering common browser (2000-2083) and CDN limits. + // Practical cap, chosen for storage and sort-key index depth rather than for any + // specific browser/URL-bar limit (the domain field is a stored identifier, not a URL). 
static MAX_DOMAIN_LENGTH = 2048; + // **WARNING for external consumers: do NOT use DOMAIN_PATTERN directly.** + // This regex is incomplete on its own — it has no length cap, no control-character + // rejection, no all-numeric-hostname check, no trailing-dot/consecutive-dot path + // rejection, and no typeof guard. Always call `PlgOnboarding.isValidDomain(value)` + // which composes this regex with the rest of the validator. The regex is exported + // only for legacy callers and may become module-private in a future major release. + // // Matches lowercase hostnames (at least one dot required) and an optional subpath // (e.g. nba.com, nba.com/kings, nba.com/us/kings). // The final label (TLD) must be alphabetic (>= 2 chars) or punycode (xn--*). This @@ -50,8 +58,11 @@ class PlgOnboarding extends BaseModel { static DOMAIN_PATTERN = /^[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(?:\.[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)*\.(?:[a-z]{2,}|xn--[a-z0-9-]+)(\/(?!\.)[a-z0-9._~-]+)*$/; // Returns the canonical form of a domain value: lowercased. - // Must be called on any user-supplied value before passing it to the domain - // attribute validator or to findByImsOrgIdAndDomain to prevent duplicate records. + // Note: non-string inputs (null/undefined/number/object) are returned unchanged. + // Callers MUST also run `isValidDomain(value)` before using the result — calling + // `normalizeDomain` alone does not guarantee the value is a string or safe to + // pass to `findByImsOrgIdAndDomain` (which would otherwise treat a non-string + // sort key as something it isn't). static normalizeDomain(value) { return typeof value === 'string' ? value.toLowerCase() : value; } @@ -72,10 +83,20 @@ class PlgOnboarding extends BaseModel { if (typeof value !== 'string' || value !== value.toLowerCase()) { return false; } + // Length caps run BEFORE the regex test so a multi-MB pathological input is + // rejected before any regex scan (not O(1): toLowerCase() above is still O(n)).
The regex itself + // is linear (no overlapping quantifiers) but external consumers may not bound + // input size upstream. + if (value.length > PlgOnboarding.MAX_DOMAIN_LENGTH) { + return false; + } if (/[^\x21-\x7e]/.test(value)) { return false; } const [hostname, ...pathParts] = value.split('/'); + if (hostname.length > PlgOnboarding.MAX_HOSTNAME_LENGTH) { + return false; + } if (/^[\d.]+$/.test(hostname)) { return false; } @@ -85,9 +106,7 @@ class PlgOnboarding extends BaseModel { if (pathParts.some((seg) => /\.$/.test(seg) || seg.includes('..'))) { return false; } - return PlgOnboarding.DOMAIN_PATTERN.test(value) - && hostname.length <= PlgOnboarding.MAX_HOSTNAME_LENGTH - && value.length <= PlgOnboarding.MAX_DOMAIN_LENGTH; + return PlgOnboarding.DOMAIN_PATTERN.test(value); } static STATUSES = { diff --git a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js index e44101d65..eefd13d57 100644 --- a/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js +++ b/packages/spacecat-shared-data-access/test/unit/models/plg-onboarding/plg-onboarding.model.test.js @@ -95,8 +95,10 @@ describe('PlgOnboardingModel', () => { 'example.com/en-us', 'example.com/case_studies', 'xn--nba-6na.com', - // IPv4 lookahead is anchored to slash/end-of-string; hostnames that merely - // begin with a dotted-quad remain valid (covers nip.io-style wildcard DNS). + // Hostnames that begin with a dotted-quad-shaped prefix but end in an + // alphabetic TLD remain valid — IP literals are blocked by the TLD + // requirement (final label must be [a-z]{2,} or xn--*), not by a + // lookahead. Covers nip.io-style wildcard DNS. 
'1.2.3.4.example.com', '192.168.1.1.nip.io', ].forEach((value) => { @@ -164,7 +166,8 @@ describe('PlgOnboardingModel', () => { 'example.com/case_studies', 'xn--nba-6na.com', // Hostnames that start with a dotted-quad but continue with non-IP labels: - // the IPv4 lookahead is anchored to slash/end-of-string so these are valid. + // IP literals are rejected via the alphabetic/punycode TLD requirement, + // not via a hostname-prefix lookahead, so these legitimate hosts pass. '1.2.3.4.example.com', '192.168.1.1.nip.io', ].forEach((value) => {