Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion llmstack/common/utils/text_extract.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import base64
import ipaddress
import logging
import re
from io import BytesIO
Expand Down Expand Up @@ -59,11 +60,34 @@ def connection(self):
return self._connection


def _is_disallowed_ip(ip):
    """Return True if *ip* is not a publicly routable unicast address."""
    return (
        ip.is_private
        or ip.is_loopback
        or ip.is_link_local
        or ip.is_reserved
        or ip.is_multicast
        or ip.is_unspecified
        # Catches ranges such as carrier-grade NAT (100.64.0.0/10) that are
        # neither flagged "private" nor globally reachable.
        or not ip.is_global
    )


def _is_private_url(url):
    """Check if a URL points to a private/internal address.

    The host part of *url* is classified in this order:

    1. Missing host, ``localhost`` (and ``*.localhost``) or a known cloud
       metadata name -> private.
    2. An IP literal, including the legacy IPv4 spellings accepted by the
       system resolver (``127.1``, ``2130706433``, ``0x7f.0.0.1``) -> private
       unless the address is globally routable unicast.
    3. Any other name is resolved; it is private if *any* resolved address
       is non-public. A name that cannot be resolved is reported as not
       private, since there is no address to classify.

    Args:
        url: The URL to inspect (``str``; UTF-8 ``bytes`` are decoded).

    Returns:
        True if the URL must be treated as internal, False otherwise. Any
        unexpected error while parsing or classifying yields True
        (fail closed).

    Note:
        Only the URL passed in is inspected. Redirect targets and DNS
        answers that change after this check are not covered here.
    """
    # Function-scope imports keep this block self-contained.
    import socket
    from urllib.parse import urlparse as _urlparse

    blocked_names = {"localhost", "metadata.google.internal", "metadata.internal"}

    try:
        if isinstance(url, (bytes, bytearray)):
            # urlparse would otherwise hand back a bytes hostname that never
            # matches the str comparisons below.
            url = bytes(url).decode("utf-8")

        hostname = _urlparse(url).hostname
        if not hostname:
            return True

        # "localhost." and "127.0.0.1." are the same hosts as their dot-less
        # forms; normalise before comparing.
        hostname = hostname.rstrip(".")
        if not hostname:
            return True
        if hostname in blocked_names or hostname.endswith(".localhost"):
            return True

        # Canonical IPv4 / IPv6 literal.
        try:
            return _is_disallowed_ip(ipaddress.ip_address(hostname))
        except ValueError:
            pass

        # Legacy IPv4 notations (short, decimal, hex, octal) are rejected by
        # ``ipaddress`` but accepted by the C resolver, so decode them the
        # same way the resolver would.
        try:
            packed = socket.inet_aton(hostname)
        except OSError:
            pass
        else:
            return _is_disallowed_ip(ipaddress.IPv4Address(packed))

        # A DNS name: classify every address it resolves to.
        try:
            resolved = socket.getaddrinfo(hostname, None, proto=socket.IPPROTO_TCP)
        except OSError:
            return False
        return any(
            # Drop a possible "%scope" suffix on link-local IPv6 results.
            _is_disallowed_ip(ipaddress.ip_address(info[4][0].split("%", 1)[0]))
            for info in resolved
        )
    except Exception:
        # Deliberately broad: anything unexpected is treated as unsafe.
        return True


def get_url_content_type(url, connection=None):
if _is_private_url(url):
raise ValueError("URLs pointing to private/internal addresses are not allowed")
response = requests.head(
url,
allow_redirects=True,
verify=False,
verify=True,
_connection=connection,
)

Expand Down