From d1e5d594f636be22a7ff79aad3d3b304fce6ae6d Mon Sep 17 00:00:00 2001 From: ChrisJr404 <11917633+ChrisJr404@users.noreply.github.com> Date: Sun, 3 May 2026 19:07:07 -0400 Subject: [PATCH] Excavate: extract IPv6 URLs (#1815) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The url_full YARA rule and the full_url_regex / full_url_regex_strict post-filters all required hosts to be word-character labels, so URLs with bracketed IPv6 hosts (http://[2001:db8::1]/, http://[::1]:8080/...) were dropped at extraction time. Add a [0-9a-fA-F:]+ alternative to the host part of all three patterns so IPv6 URLs are emitted as URL_UNVERIFIED events alongside DNS-name URLs. Adds bbot/test/test_step_1/test_excavate_url_regexes.py — 6 cases that pin both the new IPv6 acceptance and a regression guard for the existing DNS-name / IPv4 URLs. Closes #1815 --- bbot/modules/internal/excavate.py | 10 ++- .../test_step_1/test_excavate_url_regexes.py | 77 +++++++++++++++++++ 2 files changed, 84 insertions(+), 3 deletions(-) create mode 100644 bbot/test/test_step_1/test_excavate_url_regexes.py diff --git a/bbot/modules/internal/excavate.py b/bbot/modules/internal/excavate.py index 5a0fc04d43..6c3bac1a56 100644 --- a/bbot/modules/internal/excavate.py +++ b/bbot/modules/internal/excavate.py @@ -864,7 +864,7 @@ class URLExtractor(ExcavateRule): tags = "spider-danger" description = "contains full URL" strings: - $url_full = /https?:\/\/([\w\.-]+)(:\d{1,5})?([\/\w\.-]*)/ + $url_full = /https?:\/\/(\[[0-9a-fA-F:]+\]|[\w\.-]+)(:\d{1,5})?([\/\w\.-]*)/ condition: $url_full } @@ -884,8 +884,12 @@ class URLExtractor(ExcavateRule): """ ), } - full_url_regex = re.compile(r"(https?)://(\w(?:[\w-]+\.?)+(?::\d{1,5})?(?:/[-\w\.\(\)]*[-\w\.]+)*/?)") - full_url_regex_strict = re.compile(r"^(https?):\/\/([\w.-]+)(?::\d{1,5})?(\/[\w\/\.-]*)?(\?[^\s]+)?$") + full_url_regex = re.compile( + 
r"(https?)://((?:\[[0-9a-fA-F:]+\]|\w(?:[\w-]+\.?)+)(?::\d{1,5})?(?:/[-\w\.\(\)]*[-\w\.]+)*/?)" + ) + full_url_regex_strict = re.compile( + r"^(https?):\/\/(\[[0-9a-fA-F:]+\]|[\w.-]+)(?::\d{1,5})?(\/[\w\/\.-]*)?(\?[^\s]+)?$" + ) tag_attribute_regex = bbot_regexes.tag_attribute_regex async def process(self, yara_results, event, yara_rule_settings, discovery_context): diff --git a/bbot/test/test_step_1/test_excavate_url_regexes.py b/bbot/test/test_step_1/test_excavate_url_regexes.py new file mode 100644 index 0000000000..ef9868cac5 --- /dev/null +++ b/bbot/test/test_step_1/test_excavate_url_regexes.py @@ -0,0 +1,77 @@ +"""Unit tests for the URL regexes used by the excavate URLExtractor. + +These tests pin behaviour of the IPv6 host parsing in the YARA rule and the +two Python regex post-filters (``full_url_regex`` / ``full_url_regex_strict``). + +They cover the regression in issue #1815: the original patterns rejected +``http://[2001:db8::1]/`` and other IPv6 URLs because the host alternative +only matched word characters, dots and hyphens. 
+""" + +import yara + +from bbot.modules.internal.excavate import excavate + + +full_url_regex = excavate.URLExtractor.full_url_regex +full_url_regex_strict = excavate.URLExtractor.full_url_regex_strict +yara_url_rule = yara.compile(source=excavate.URLExtractor.yara_rules["url_full"]) + + +IPV6_URLS = [ + "http://[2001:db8::1]/api", + "https://[2001:db8::1]:8443/api", + "http://[::1]/", + "http://[::1]:8080/path", + "https://[fe80::dead:beef]/foo/bar.html", + "http://[fe80::1234:5678:9abc:def0]:80/", +] + +NON_IPV6_URLS = [ + "http://example.com/", + "https://www.example.com:8080/path", + "http://127.0.0.1:8888/", + "https://asdffoo.test.notreal/some/path", +] + + +def test_full_url_regex_matches_ipv6(): + for url in IPV6_URLS: + m = full_url_regex.search(url) + assert m is not None, f"full_url_regex should match IPv6 URL {url!r}" + # The captured host should retain the surrounding brackets so the + # downstream URL parser recognises it as IPv6. + assert m.group(2).startswith("["), f"host capture missing leading [ for {url!r}: {m.group(2)!r}" + + +def test_full_url_regex_still_matches_existing_patterns(): + for url in NON_IPV6_URLS: + assert full_url_regex.search(url) is not None, f"regression: full_url_regex broke for {url!r}" + + +def test_full_url_regex_strict_matches_ipv6(): + for url in IPV6_URLS: + assert full_url_regex_strict.match(url) is not None, ( + f"full_url_regex_strict should match IPv6 URL {url!r}" + ) + + +def test_full_url_regex_strict_still_matches_existing_patterns(): + for url in NON_IPV6_URLS: + assert full_url_regex_strict.match(url) is not None, ( + f"regression: full_url_regex_strict broke for {url!r}" + ) + + +def test_yara_url_rule_matches_ipv6(): + for url in IPV6_URLS: + assert yara_url_rule.match(data=url.encode()), ( + f"YARA url_full rule should match IPv6 URL {url!r}" + ) + + +def test_yara_url_rule_still_matches_existing_patterns(): + for url in NON_IPV6_URLS: + assert yara_url_rule.match(data=url.encode()), ( + f"regression: YARA 
url_full rule broke for {url!r}" + )