From d1e5d594f636be22a7ff79aad3d3b304fce6ae6d Mon Sep 17 00:00:00 2001
From: ChrisJr404 <11917633+ChrisJr404@users.noreply.github.com>
Date: Sun, 3 May 2026 19:07:07 -0400
Subject: [PATCH] Excavate: extract IPv6 URLs (#1815)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The url_full YARA rule and the full_url_regex / full_url_regex_strict
post-filters all required the host to consist of word characters, dots
and hyphens, so URLs with bracketed IPv6 hosts (http://[2001:db8::1]/,
http://[::1]:8080/...) were dropped at extraction time. Add a bracketed
\[[0-9a-fA-F:]+\] alternative to the host part of all three patterns so
IPv6 URLs are emitted as URL_UNVERIFIED events alongside DNS-name URLs.

Add bbot/test/test_step_1/test_excavate_url_regexes.py — 6 tests that
pin the new IPv6 acceptance and guard against regressions for the
existing DNS-name / IPv4 URLs.

Closes #1815
---
 bbot/modules/internal/excavate.py             | 10 ++-
 .../test_step_1/test_excavate_url_regexes.py  | 77 +++++++++++++++++++
 2 files changed, 84 insertions(+), 3 deletions(-)
 create mode 100644 bbot/test/test_step_1/test_excavate_url_regexes.py

diff --git a/bbot/modules/internal/excavate.py b/bbot/modules/internal/excavate.py
index 5a0fc04d43..6c3bac1a56 100644
--- a/bbot/modules/internal/excavate.py
+++ b/bbot/modules/internal/excavate.py
@@ -864,7 +864,7 @@ class URLExtractor(ExcavateRule):
                         tags = "spider-danger"
                         description = "contains full URL"
                     strings:
-                        $url_full = /https?:\/\/([\w\.-]+)(:\d{1,5})?([\/\w\.-]*)/
+                        $url_full = /https?:\/\/(\[[0-9a-fA-F:]+\]|[\w\.-]+)(:\d{1,5})?([\/\w\.-]*)/
                     condition:
                         $url_full
                 }
@@ -884,8 +884,12 @@ class URLExtractor(ExcavateRule):
                 """
             ),
         }
-        full_url_regex = re.compile(r"(https?)://(\w(?:[\w-]+\.?)+(?::\d{1,5})?(?:/[-\w\.\(\)]*[-\w\.]+)*/?)")
-        full_url_regex_strict = re.compile(r"^(https?):\/\/([\w.-]+)(?::\d{1,5})?(\/[\w\/\.-]*)?(\?[^\s]+)?$")
+        full_url_regex = re.compile(
+            r"(https?)://((?:\[[0-9a-fA-F:]+\]|\w(?:[\w-]+\.?)+)(?::\d{1,5})?(?:/[-\w\.\(\)]*[-\w\.]+)*/?)"
+        )
+        full_url_regex_strict = re.compile(
+            r"^(https?):\/\/(\[[0-9a-fA-F:]+\]|[\w.-]+)(?::\d{1,5})?(\/[\w\/\.-]*)?(\?[^\s]+)?$"
+        )
         tag_attribute_regex = bbot_regexes.tag_attribute_regex
 
         async def process(self, yara_results, event, yara_rule_settings, discovery_context):
diff --git a/bbot/test/test_step_1/test_excavate_url_regexes.py b/bbot/test/test_step_1/test_excavate_url_regexes.py
new file mode 100644
index 0000000000..ef9868cac5
--- /dev/null
+++ b/bbot/test/test_step_1/test_excavate_url_regexes.py
@@ -0,0 +1,77 @@
+"""Unit tests for the URL regexes used by the excavate URLExtractor.
+
+These tests pin behaviour of the IPv6 host parsing in the YARA rule and the
+two Python regex post-filters (``full_url_regex`` / ``full_url_regex_strict``).
+
+They cover the regression in issue #1815: the original patterns rejected
+``http://[2001:db8::1]/`` and other IPv6 URLs because the host part of each
+pattern only matched word characters, dots and hyphens.
+"""
+
+import yara
+
+from bbot.modules.internal.excavate import excavate
+
+
+full_url_regex = excavate.URLExtractor.full_url_regex
+full_url_regex_strict = excavate.URLExtractor.full_url_regex_strict
+yara_url_rule = yara.compile(source=excavate.URLExtractor.yara_rules["url_full"])
+
+
+IPV6_URLS = [
+    "http://[2001:db8::1]/api",
+    "https://[2001:db8::1]:8443/api",
+    "http://[::1]/",
+    "http://[::1]:8080/path",
+    "https://[fe80::dead:beef]/foo/bar.html",
+    "http://[fe80::1234:5678:9abc:def0]:80/",
+]
+
+NON_IPV6_URLS = [
+    "http://example.com/",
+    "https://www.example.com:8080/path",
+    "http://127.0.0.1:8888/",
+    "https://asdffoo.test.notreal/some/path",
+]
+
+
+def test_full_url_regex_matches_ipv6():
+    for url in IPV6_URLS:
+        m = full_url_regex.search(url)
+        assert m is not None, f"full_url_regex should match IPv6 URL {url!r}"
+        # Group 2 spans host[:port][/path]; it should keep the leading bracket so
+        # the downstream URL parser recognises the host as IPv6.
+        assert m.group(2).startswith("["), f"host capture missing leading [ for {url!r}: {m.group(2)!r}"
+
+
+def test_full_url_regex_still_matches_existing_patterns():
+    for url in NON_IPV6_URLS:
+        assert full_url_regex.search(url) is not None, f"regression: full_url_regex broke for {url!r}"
+
+
+def test_full_url_regex_strict_matches_ipv6():
+    for url in IPV6_URLS:
+        assert full_url_regex_strict.match(url) is not None, (
+            f"full_url_regex_strict should match IPv6 URL {url!r}"
+        )
+
+
+def test_full_url_regex_strict_still_matches_existing_patterns():
+    for url in NON_IPV6_URLS:
+        assert full_url_regex_strict.match(url) is not None, (
+            f"regression: full_url_regex_strict broke for {url!r}"
+        )
+
+
+def test_yara_url_rule_matches_ipv6():
+    for url in IPV6_URLS:
+        assert yara_url_rule.match(data=url.encode()), (
+            f"YARA url_full rule should match IPv6 URL {url!r}"
+        )
+
+
+def test_yara_url_rule_still_matches_existing_patterns():
+    for url in NON_IPV6_URLS:
+        assert yara_url_rule.match(data=url.encode()), (
+            f"regression: YARA url_full rule broke for {url!r}"
+        )