Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions bbot/modules/internal/excavate.py
Original file line number Diff line number Diff line change
Expand Up @@ -864,7 +864,7 @@ class URLExtractor(ExcavateRule):
tags = "spider-danger"
description = "contains full URL"
strings:
$url_full = /https?:\/\/([\w\.-]+)(:\d{1,5})?([\/\w\.-]*)/
$url_full = /https?:\/\/(\[[0-9a-fA-F:]+\]|[\w\.-]+)(:\d{1,5})?([\/\w\.-]*)/
condition:
$url_full
}
Expand All @@ -884,8 +884,12 @@ class URLExtractor(ExcavateRule):
"""
),
}
full_url_regex = re.compile(r"(https?)://(\w(?:[\w-]+\.?)+(?::\d{1,5})?(?:/[-\w\.\(\)]*[-\w\.]+)*/?)")
full_url_regex_strict = re.compile(r"^(https?):\/\/([\w.-]+)(?::\d{1,5})?(\/[\w\/\.-]*)?(\?[^\s]+)?$")
full_url_regex = re.compile(
r"(https?)://((?:\[[0-9a-fA-F:]+\]|\w(?:[\w-]+\.?)+)(?::\d{1,5})?(?:/[-\w\.\(\)]*[-\w\.]+)*/?)"
)
full_url_regex_strict = re.compile(
r"^(https?):\/\/(\[[0-9a-fA-F:]+\]|[\w.-]+)(?::\d{1,5})?(\/[\w\/\.-]*)?(\?[^\s]+)?$"
)
tag_attribute_regex = bbot_regexes.tag_attribute_regex

async def process(self, yara_results, event, yara_rule_settings, discovery_context):
Expand Down
77 changes: 77 additions & 0 deletions bbot/test/test_step_1/test_excavate_url_regexes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""Unit tests for the URL regexes used by the excavate URLExtractor.

These tests pin behaviour of the IPv6 host parsing in the YARA rule and the
two Python regex post-filters (``full_url_regex`` / ``full_url_regex_strict``).

They cover the regression in issue #1815: the original patterns rejected
``http://[2001:db8::1]/`` and other IPv6 URLs because the host alternative
only matched word characters, dots and hyphens (no brackets or colons).
"""

import yara

from bbot.modules.internal.excavate import excavate


# Module-level handles on the two Python regexes exposed by URLExtractor, so
# every test below exercises the exact pattern objects the module ships.
full_url_regex = excavate.URLExtractor.full_url_regex
full_url_regex_strict = excavate.URLExtractor.full_url_regex_strict
# Compile the "url_full" YARA rule source once at import time; the compiled
# rule is shared by the YARA tests below.
yara_url_rule = yara.compile(source=excavate.URLExtractor.yara_rules["url_full"])


# URLs whose host is a bracketed IPv6 literal, with and without an explicit
# port, over both http and https. Every pattern under test must accept these.
IPV6_URLS = [
"http://[2001:db8::1]/api",
"https://[2001:db8::1]:8443/api",
"http://[::1]/",
"http://[::1]:8080/path",
"https://[fe80::dead:beef]/foo/bar.html",
"http://[fe80::1234:5678:9abc:def0]:80/",
]

# Hostname and dotted-quad IPv4 URLs used by the "still matches" tests as a
# regression guard for the non-IPv6 forms.
NON_IPV6_URLS = [
"http://example.com/",
"https://www.example.com:8080/path",
"http://127.0.0.1:8888/",
"https://asdffoo.test.notreal/some/path",
]


def test_full_url_regex_matches_ipv6():
    """Check that ``full_url_regex`` finds each IPv6 URL and keeps the ``[`` on the host."""
    search = full_url_regex.search
    for url in IPV6_URLS:
        m = search(url)
        assert m is not None, f"full_url_regex should match IPv6 URL {url!r}"
        # Group 2 is everything after "://", so it must begin with the
        # bracketed host; losing the "[" would stop a downstream URL parser
        # from treating the host as an IPv6 literal.
        assert m.group(2).startswith("["), f"host capture missing leading [ for {url!r}: {m.group(2)!r}"


def test_full_url_regex_still_matches_existing_patterns():
    """Regression guard: ``full_url_regex`` must keep finding hostname and IPv4 URLs."""
    for url in NON_IPV6_URLS:
        found = full_url_regex.search(url)
        assert found is not None, f"regression: full_url_regex broke for {url!r}"


def test_full_url_regex_strict_matches_ipv6():
    """Check that the anchored ``full_url_regex_strict`` accepts each IPv6 URL."""
    for url in IPV6_URLS:
        found = full_url_regex_strict.match(url)
        assert found is not None, f"full_url_regex_strict should match IPv6 URL {url!r}"


def test_full_url_regex_strict_still_matches_existing_patterns():
    """Regression guard: ``full_url_regex_strict`` must keep accepting hostname and IPv4 URLs."""
    for url in NON_IPV6_URLS:
        found = full_url_regex_strict.match(url)
        assert found is not None, f"regression: full_url_regex_strict broke for {url!r}"


def test_yara_url_rule_matches_ipv6():
    """Check that the compiled ``url_full`` YARA rule fires on each IPv6 URL."""
    for url in IPV6_URLS:
        # The rule is matched against the encoded bytes of the URL.
        payload = url.encode()
        hits = yara_url_rule.match(data=payload)
        assert hits, f"YARA url_full rule should match IPv6 URL {url!r}"


def test_yara_url_rule_still_matches_existing_patterns():
    """Regression guard: the ``url_full`` YARA rule must keep firing on hostname and IPv4 URLs."""
    for url in NON_IPV6_URLS:
        # The rule is matched against the encoded bytes of the URL.
        payload = url.encode()
        hits = yara_url_rule.match(data=payload)
        assert hits, f"regression: YARA url_full rule broke for {url!r}"
Loading