commoncrawl · malteos · Nov 19, 2025 · Nov 19, 2025 · Nov 19, 2025 · Nov 19, 2025
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -8,6 +8,12 @@ on:
     branches:
     - main
 
+# These permissions are needed to interact with AWS S3 via GitHub's OIDC Token endpoint
+permissions:
+  id-token: write
+  contents: read
+  pull-requests: read
+
 jobs:
   unit-tests:
     runs-on: ${{ matrix.os }}
@@ -56,7 +62,33 @@ jobs:
           pip install setuptools
 
       - name: Install cdx_toolkit
-        run: pip install .[test]
+        run: pip install .[all]
+
+      - name: Configure AWS credentials from OIDC (disabled for forks)
+        if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push'
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::837454214164:role/GitHubActions-Role
+          aws-region: us-east-1
+
+      - name: Disable S3 unit tests for Python 3.8 (botocore requires Python 3.9+)
+        if: ${{ startsWith(matrix.python-version, '3.8') }}
+        uses: actions/github-script@v7
+        with:
+          script: |
+            core.exportVariable('CDXT_DISABLE_S3_TESTS', '1')
+      - name: Set environment variables for faster unit tests (requests are mocked)
+        uses: actions/github-script@v7
+        with:
+          script: |
+            core.exportVariable('CDXT_MAX_ERRORS', '2')
+            core.exportVariable('CDXT_WARNING_AFTER_N_ERRORS', '2')
+            core.exportVariable('CDXT_DEFAULT_MIN_RETRY_INTERVAL', '0.01')
+            core.exportVariable('CDXT_CC_INDEX_MIN_RETRY_INTERVAL', '0.01')
+            core.exportVariable('CDXT_CC_DATA_MIN_RETRY_INTERVAL', '0.01')
+            core.exportVariable('CDXT_IA_MIN_RETRY_INTERVAL', '0.01')
+            core.exportVariable('DISABLE_ATHENA_TESTS', '1')
+            core.exportVariable('LOGLEVEL', 'DEBUG')
 
       - name: Lint code
         run: |
@@ -70,3 +102,48 @@ jobs:
         uses: codecov/codecov-action@v4
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
+
+  unit-tests-minimal:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: true
+      matrix:
+        include:
+          - python-version: '3.9'
+            os: ubuntu-22.04
+          - python-version: '3.14'
+            os: ubuntu-latest
+
+    steps:
+      - name: checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install setuptools on python 3.12+
+        if: ${{ matrix.python-version >= '3.12' }}
+        run: |
+          pip install setuptools
+
+      - name: Install cdx_toolkit (minimal)
+        run: pip install .[test]
+
+      - name: Set environment variables for faster unit tests (requests are mocked)
+        uses: actions/github-script@v7
+        with:
+          script: |
+            core.exportVariable('CDXT_MAX_ERRORS', '2')
+            core.exportVariable('CDXT_WARNING_AFTER_N_ERRORS', '2')
+            core.exportVariable('CDXT_DEFAULT_MIN_RETRY_INTERVAL', '0.01')
+            core.exportVariable('CDXT_CC_INDEX_MIN_RETRY_INTERVAL', '0.01')
+            core.exportVariable('CDXT_CC_DATA_MIN_RETRY_INTERVAL', '0.01')
+            core.exportVariable('CDXT_IA_MIN_RETRY_INTERVAL', '0.01')
+            core.exportVariable('DISABLE_ATHENA_TESTS', '1')
+            core.exportVariable('LOGLEVEL', 'DEBUG')
+
+      - name: test minimal
+        run: |
+          make test
diff --git a/.gitignore b/.gitignore
@@ -4,4 +4,6 @@ __pycache__
 cdx_toolkit.egg-info
 .coverage
 .eggs/
-tmp/
+tmp/
+.env
+.vscode
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -10,6 +10,18 @@ Clone the repository, setup a virtual environment, and run the following command
 make install
 ```
 
+For S3-related features or tests, install optional dependencies:
+
+```bash
+pip install -e ".[s3]"
+```
+
+To install everything (dev/test/S3), use:
+
+```bash
+pip install -e ".[all]"
+```
+
 ## Tests
 
 To test code changes, please run our test suite before submitting pull requests:
@@ -33,14 +45,14 @@ If the remote APIs change, new mock data can be semi-automatically collected by
 ```bash
 # set environment variable (DISABLE_MOCK_RESPONSES should not be set)
 export SAVE_MOCK_RESPONSES=./tmp/mock_responses
-    
+
 # run the test for what mock data should be saved to $SAVE_MOCK_RESPONSES/<test_file>/<test_func>.jsonl
 pytest tests/test_cli.py::test_basics
 ```
 
 ## Code format & linting
 
-Please following the definitions from `.editorconfig` and `.flake8`. 
+Please follow the definitions from `.editorconfig` and `.flake8`.
 
 To test the linting, run this command:
 
@@ -54,4 +66,4 @@ You can also run the hooks manually on all files:
 
 ```bash
 pre-commit run --all-files
-```
+```
diff --git a/README.md b/README.md
@@ -24,6 +24,13 @@ $ pip install cdx_toolkit
 
 or clone this repo and use `pip install .`
 
+Optional extras:
+
+```
+$ pip install cdx_toolkit[s3]   # enable S3 and other remote filesystem support
+$ pip install cdx_toolkit[all]  # install all optional dependencies
+```
+
 ## Command-line tools
 
 ```
@@ -275,7 +282,7 @@ cdx_toolkit has reached the beta-testing stage of development.
 
 ## Contributing
 
-See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on contributing 
+See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on contributing
 and running tests.
 
 ## License

diff --git a/cdx_toolkit/cli.py b/cdx_toolkit/cli.py
@@ -1,12 +1,14 @@
-from argparse import ArgumentParser
+from argparse import ArgumentParser, Namespace
 import logging
 import csv
 import sys
 import json
 import os
 
 import cdx_toolkit
-from cdx_toolkit.commoncrawl import normalize_crawl
+
+from cdx_toolkit.utils import get_version, setup_cdx_fetcher_and_kwargs
+
 
 LOGGER = logging.getLogger(__name__)
 
@@ -135,7 +137,7 @@ def main(args=None):
     cmd.func(cmd, cmdline)
 
 
-def set_loglevel(cmd):
+def set_loglevel(cmd: Namespace):
     loglevel = os.getenv('LOGLEVEL') or 'WARNING'
     if cmd.verbose:
         if cmd.verbose > 0:
@@ -151,58 +153,15 @@ def set_loglevel(cmd):
     LOGGER.info('set loglevel to %s', str(loglevel))
 
 
-def get_version():
-    return cdx_toolkit.__version__
-
-
-def setup(cmd):
-    kwargs = {}
-    kwargs['source'] = 'cc' if cmd.crawl else cmd.cc or cmd.ia or cmd.source or None
-    if kwargs['source'] is None:
-        raise ValueError('must specify --cc, --ia, or a --source')
-    if cmd.wb:
-        kwargs['wb'] = cmd.wb
-    if cmd.cc_mirror:
-        kwargs['cc_mirror'] = cmd.cc_mirror
-    if cmd.crawl:
-        kwargs['crawl'] = normalize_crawl([cmd.crawl])  # currently a string, not a list
-    if getattr(cmd, 'warc_download_prefix', None) is not None:
-        kwargs['warc_download_prefix'] = cmd.warc_download_prefix
-
-    cdx = cdx_toolkit.CDXFetcher(**kwargs)
-
-    kwargs = {}
-    if cmd.limit:
-        kwargs['limit'] = cmd.limit
-    if 'from' in vars(cmd) and vars(cmd)['from']:  # python, uh, from is a reserved word
-        kwargs['from_ts'] = vars(cmd)['from']
-    if cmd.to:
-        kwargs['to'] = cmd.to
-    if cmd.closest:
-        if not cmd.get:  # pragma: no cover
-            LOGGER.info('note: --closest works best with --get')
-        kwargs['closest'] = cmd.closest
-    if cmd.filter:
-        kwargs['filter'] = cmd.filter
-
-    if cmd.cmd == 'warc' and cmd.size:
-        kwargs['size'] = cmd.size
-
-    if cmd.cmd == 'size' and cmd.details:
-        kwargs['details'] = cmd.details
-
-    return cdx, kwargs
-
-
-def winnow_fields(cmd, fields, obj):
+def winnow_fields(cmd: Namespace, fields, obj):
     if cmd.all_fields:
         printme = obj
     else:
         printme = dict([(k, obj[k]) for k in fields if k in obj])
     return printme
 
 
-def print_line(cmd, writer, printme):
+def print_line(cmd: Namespace, writer, printme):
     if cmd.jsonl:
         print(json.dumps(printme, sort_keys=True))
     elif writer:
@@ -211,8 +170,8 @@ def print_line(cmd, writer, printme):
         print(', '.join([' '.join((k, printme[k])) for k in sorted(printme.keys())]))
 
 
-def iterator(cmd, cmdline):
-    cdx, kwargs = setup(cmd)
+def iterator(cmd: Namespace, cmdline):
+    cdx, kwargs = setup_cdx_fetcher_and_kwargs(cmd)
     fields = set(cmd.fields.split(','))
     if cmd.csv:
         writer = csv.DictWriter(sys.stdout, fieldnames=sorted(list(fields)))
@@ -232,8 +191,8 @@ def iterator(cmd, cmdline):
         print_line(cmd, writer, printme)
 
 
-def warcer(cmd, cmdline):
-    cdx, kwargs = setup(cmd)
+def warcer(cmd: Namespace, cmdline: str):
+    cdx, kwargs = setup_cdx_fetcher_and_kwargs(cmd)
 
     ispartof = cmd.prefix
     if cmd.subprefix:
@@ -275,9 +234,15 @@ def warcer(cmd, cmdline):
             LOGGER.warning('revisit record being resolved for url %s %s', url, timestamp)
         writer.write_record(record)
 
+    writer.close()
 
-def sizer(cmd, cmdline):
-    cdx, kwargs = setup(cmd)
+
+def sizer(cmd: Namespace, cmdline):
+    cdx, kwargs = setup_cdx_fetcher_and_kwargs(cmd)
 
     size = cdx.get_size_estimate(cmd.url, **kwargs)
     print(size)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cdx_toolkit/commoncrawl.py b/cdx_toolkit/commoncrawl.py
@@ -9,6 +9,8 @@
 import json
 import logging
 
+from cdx_toolkit.settings import CACHE_DIR, get_mock_time
+
 from .myrequests import myrequests_get
 from .timeutils import (
     time_to_timestamp,
@@ -34,7 +36,7 @@ def normalize_crawl(crawl):
 
 
 def get_cache_names(cc_mirror):
-    cache = os.path.expanduser('~/.cache/cdx_toolkit/')
+    cache = os.path.expanduser(CACHE_DIR)
     filename = re.sub(r'[^\w]', '_', cc_mirror.replace('https://', ''))
     return cache, filename
 
@@ -128,9 +130,13 @@ def apply_cc_defaults(params, crawl_present=False, now=None):
                 LOGGER.info('to but no from_ts, setting from_ts=%s', params['from_ts'])
         else:
             if not now:
-                # now is passed in by tests. if not set, use actual now.
-                # XXX could be changed to mock
-                now = time.time()
+                # Check for test/override time first
+                mock_time = get_mock_time()
+                if mock_time:
+                    now = mock_time
+                else:
+                    # No caller-supplied or mocked time: fall back to the actual current time.
+                    now = time.time()
             params['from_ts'] = time_to_timestamp(now - year)
             LOGGER.info('no from or to, setting default 1 year ago from_ts=%s', params['from_ts'])
     else:

diff --git a/cdx_toolkit/myrequests.py b/cdx_toolkit/myrequests.py
@@ -1,9 +1,18 @@
+from typing import Optional
 import requests
 import logging
 import time
 from urllib.parse import urlparse
 
 from . import __version__
+from .settings import (
+    DEFAULT_MIN_RETRY_INTERVAL,
+    CC_DATA_MIN_RETRY_INTERVAL,
+    CC_INDEX_MIN_RETRY_INTERVAL,
+    IA_MIN_RETRY_INTERVAL,
+    MAX_ERRORS,
+    WARNING_AFTER_N_ERRORS,
+)
 
 LOGGER = logging.getLogger(__name__)
 
@@ -23,19 +32,19 @@ def dns_fatal(hostname):
 retry_info = {
     'default': {
         'next_fetch': 0,
-        'minimum_interval': 3.0,
+        'minimum_interval': DEFAULT_MIN_RETRY_INTERVAL,
     },
     'index.commoncrawl.org': {
         'next_fetch': 0,
-        'minimum_interval': 1.0,
+        'minimum_interval': CC_INDEX_MIN_RETRY_INTERVAL,
     },
     'data.commoncrawl.org': {
         'next_fetch': 0,
-        'minimum_interval': 0.55,
+        'minimum_interval': CC_DATA_MIN_RETRY_INTERVAL,
     },
     'web.archive.org': {
         'next_fetch': 0,
-        'minimum_interval': 6.0,
+        'minimum_interval': IA_MIN_RETRY_INTERVAL,
     },
 }
 
@@ -60,12 +69,18 @@ def myrequests_get(
     headers=None,
     cdx=False,
     allow404=False,
-    raise_error_after_n_errors: int = 100,
-    raise_warning_after_n_errors: int = 10,
+    raise_error_after_n_errors: Optional[int] = None,
+    raise_warning_after_n_errors: Optional[int] = None,
     retry_max_sec: int = 60,
 ):
     t = time.time()
 
+    if raise_error_after_n_errors is None:
+        raise_error_after_n_errors = MAX_ERRORS
+
+    if raise_warning_after_n_errors is None:
+        raise_warning_after_n_errors = WARNING_AFTER_N_ERRORS
+
     hostname = urlparse(url).hostname
     next_fetch, minimum_interval = get_retries(hostname)
diff --git a/.gitignore b/.gitignore
@@ -4,4 +4,6 @@ __pycache__
 cdx_toolkit.egg-info
 .coverage
 .eggs/
-tmp/
+tmp/
+.env
+.vscode