Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 78 additions & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ on:
branches:
- main

# These permissions are needed to interact with AWS S3 via GitHub's OIDC Token endpoint
permissions:
id-token: write
contents: read
pull-requests: read

jobs:
unit-tests:
runs-on: ${{ matrix.os }}
Expand Down Expand Up @@ -56,7 +62,33 @@ jobs:
pip install setuptools

- name: Install cdx_toolkit
run: pip install .[test]
run: pip install .[all]

- name: Configure AWS credentials from OIDC (disabled for forks)
if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push'
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::837454214164:role/GitHubActions-Role
aws-region: us-east-1

- name: Disable S3 unit tests for Python 3.8 (botocore requires Python 3.9+)
if: ${{ startsWith(matrix.python-version, '3.8') }}
uses: actions/github-script@v7
with:
script: |
core.exportVariable('CDXT_DISABLE_S3_TESTS', '1')
- name: Set environment variables for faster unit tests (requests are mocked)
uses: actions/github-script@v7
with:
script: |
core.exportVariable('CDXT_MAX_ERRORS', '2')
core.exportVariable('CDXT_WARNING_AFTER_N_ERRORS', '2')
core.exportVariable('CDXT_DEFAULT_MIN_RETRY_INTERVAL', '0.01')
core.exportVariable('CDXT_CC_INDEX_MIN_RETRY_INTERVAL', '0.01')
core.exportVariable('CDXT_CC_DATA_MIN_RETRY_INTERVAL', '0.01')
core.exportVariable('CDXT_IA_MIN_RETRY_INTERVAL', '0.01')
core.exportVariable('DISABLE_ATHENA_TESTS', '1')
core.exportVariable('LOGLEVEL', 'DEBUG')

- name: Lint code
run: |
Expand All @@ -70,3 +102,48 @@ jobs:
uses: codecov/codecov-action@v4
with:
token: ${{ secrets.CODECOV_TOKEN }}

unit-tests-minimal:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: true
matrix:
include:
- python-version: '3.9'
os: ubuntu-22.04
- python-version: '3.14'
os: ubuntu-latest

steps:
- name: checkout
uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- name: Install setuptools on python 3.12+
if: ${{ matrix.python-version >= '3.12' }}
run: |
pip install setuptools

- name: Install cdx_toolkit (minimal)
run: pip install .[test]

- name: Set environment variables for faster unit tests (requests are mocked)
uses: actions/github-script@v7
with:
script: |
core.exportVariable('CDXT_MAX_ERRORS', '2')
core.exportVariable('CDXT_WARNING_AFTER_N_ERRORS', '2')
core.exportVariable('CDXT_DEFAULT_MIN_RETRY_INTERVAL', '0.01')
core.exportVariable('CDXT_CC_INDEX_MIN_RETRY_INTERVAL', '0.01')
core.exportVariable('CDXT_CC_DATA_MIN_RETRY_INTERVAL', '0.01')
core.exportVariable('CDXT_IA_MIN_RETRY_INTERVAL', '0.01')
core.exportVariable('DISABLE_ATHENA_TESTS', '1')
core.exportVariable('LOGLEVEL', 'DEBUG')

- name: test minimal
run: |
make test
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@ __pycache__
cdx_toolkit.egg-info
.coverage
.eggs/
tmp/
tmp/
.env
.vscode
18 changes: 15 additions & 3 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,18 @@ Clone the repository, setup a virtual environment, and run the following command
make install
```

For S3-related features or tests, install optional dependencies:

```bash
pip install -e ".[s3]"
```

To install everything (dev/test/S3), use:

```bash
pip install -e ".[all]"
```

## Tests

To test code changes, please run our test suite before submitting pull requests:
Expand All @@ -33,14 +45,14 @@ If the remote APIs change, new mock data can be semi-automatically collected by
```bash
# set environment variable (DISABLE_MOCK_RESPONSES should not be set)
export SAVE_MOCK_RESPONSES=./tmp/mock_responses

# run the test whose mock data should be saved to $SAVE_MOCK_RESPONSES/<test_file>/<test_func>.jsonl
pytest tests/test_cli.py::test_basics
```

## Code format & linting

Please follow the definitions from `.editorconfig` and `.flake8`.
Please follow the definitions from `.editorconfig` and `.flake8`.

To test the linting, run this command:

Expand All @@ -54,4 +66,4 @@ You can also run the hooks manually on all files:

```bash
pre-commit run --all-files
```
```
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ $ pip install cdx_toolkit

or clone this repo and use `pip install .`

Optional extras:

```
$ pip install cdx_toolkit[s3] # enable S3 and other remote filesystem support
$ pip install cdx_toolkit[all] # install all optional dependencies
```

## Command-line tools

```
Expand Down Expand Up @@ -275,7 +282,7 @@ cdx_toolkit has reached the beta-testing stage of development.

## Contributing

See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on contributing
See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on contributing
and running tests.

## License
Expand Down
73 changes: 19 additions & 54 deletions cdx_toolkit/cli.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from argparse import ArgumentParser
from argparse import ArgumentParser, Namespace
import logging
import csv
import sys
import json
import os

import cdx_toolkit
from cdx_toolkit.commoncrawl import normalize_crawl

from cdx_toolkit.utils import get_version, setup_cdx_fetcher_and_kwargs


LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -135,7 +137,7 @@ def main(args=None):
cmd.func(cmd, cmdline)


def set_loglevel(cmd):
def set_loglevel(cmd: Namespace):
loglevel = os.getenv('LOGLEVEL') or 'WARNING'
if cmd.verbose:
if cmd.verbose > 0:
Expand All @@ -151,58 +153,15 @@ def set_loglevel(cmd):
LOGGER.info('set loglevel to %s', str(loglevel))


def get_version():
return cdx_toolkit.__version__


def setup(cmd):
kwargs = {}
kwargs['source'] = 'cc' if cmd.crawl else cmd.cc or cmd.ia or cmd.source or None
if kwargs['source'] is None:
raise ValueError('must specify --cc, --ia, or a --source')
if cmd.wb:
kwargs['wb'] = cmd.wb
if cmd.cc_mirror:
kwargs['cc_mirror'] = cmd.cc_mirror
if cmd.crawl:
kwargs['crawl'] = normalize_crawl([cmd.crawl]) # currently a string, not a list
if getattr(cmd, 'warc_download_prefix', None) is not None:
kwargs['warc_download_prefix'] = cmd.warc_download_prefix

cdx = cdx_toolkit.CDXFetcher(**kwargs)

kwargs = {}
if cmd.limit:
kwargs['limit'] = cmd.limit
if 'from' in vars(cmd) and vars(cmd)['from']: # python, uh, from is a reserved word
kwargs['from_ts'] = vars(cmd)['from']
if cmd.to:
kwargs['to'] = cmd.to
if cmd.closest:
if not cmd.get: # pragma: no cover
LOGGER.info('note: --closest works best with --get')
kwargs['closest'] = cmd.closest
if cmd.filter:
kwargs['filter'] = cmd.filter

if cmd.cmd == 'warc' and cmd.size:
kwargs['size'] = cmd.size

if cmd.cmd == 'size' and cmd.details:
kwargs['details'] = cmd.details

return cdx, kwargs


def winnow_fields(cmd, fields, obj):
def winnow_fields(cmd: Namespace, fields, obj):
if cmd.all_fields:
printme = obj
else:
printme = dict([(k, obj[k]) for k in fields if k in obj])
return printme


def print_line(cmd, writer, printme):
def print_line(cmd: Namespace, writer, printme):
if cmd.jsonl:
print(json.dumps(printme, sort_keys=True))
elif writer:
Expand All @@ -211,8 +170,8 @@ def print_line(cmd, writer, printme):
print(', '.join([' '.join((k, printme[k])) for k in sorted(printme.keys())]))


def iterator(cmd, cmdline):
cdx, kwargs = setup(cmd)
def iterator(cmd: Namespace, cmdline):
cdx, kwargs = setup_cdx_fetcher_and_kwargs(cmd)
fields = set(cmd.fields.split(','))
if cmd.csv:
writer = csv.DictWriter(sys.stdout, fieldnames=sorted(list(fields)))
Expand All @@ -232,8 +191,8 @@ def iterator(cmd, cmdline):
print_line(cmd, writer, printme)


def warcer(cmd, cmdline):
cdx, kwargs = setup(cmd)
def warcer(cmd: Namespace, cmdline: str):
cdx, kwargs = setup_cdx_fetcher_and_kwargs(cmd)

ispartof = cmd.prefix
if cmd.subprefix:
Expand Down Expand Up @@ -275,9 +234,15 @@ def warcer(cmd, cmdline):
LOGGER.warning('revisit record being resolved for url %s %s', url, timestamp)
writer.write_record(record)

writer.close()

def sizer(cmd, cmdline):
cdx, kwargs = setup(cmd)

def sizer(cmd: Namespace, cmdline):
cdx, kwargs = setup_cdx_fetcher_and_kwargs(cmd)

size = cdx.get_size_estimate(cmd.url, **kwargs)
print(size)


if __name__ == "__main__":
main()
14 changes: 10 additions & 4 deletions cdx_toolkit/commoncrawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import json
import logging

from cdx_toolkit.settings import CACHE_DIR, get_mock_time

from .myrequests import myrequests_get
from .timeutils import (
time_to_timestamp,
Expand All @@ -34,7 +36,7 @@ def normalize_crawl(crawl):


def get_cache_names(cc_mirror):
cache = os.path.expanduser('~/.cache/cdx_toolkit/')
cache = os.path.expanduser(CACHE_DIR)
filename = re.sub(r'[^\w]', '_', cc_mirror.replace('https://', ''))
return cache, filename

Expand Down Expand Up @@ -128,9 +130,13 @@ def apply_cc_defaults(params, crawl_present=False, now=None):
LOGGER.info('to but no from_ts, setting from_ts=%s', params['from_ts'])
else:
if not now:
# now is passed in by tests. if not set, use actual now.
# XXX could be changed to mock
now = time.time()
# Check for test/override time first
mock_time = get_mock_time()
if mock_time:
now = mock_time
else:
# now is passed in by tests. if not set, use actual now.
now = time.time()
params['from_ts'] = time_to_timestamp(now - year)
LOGGER.info('no from or to, setting default 1 year ago from_ts=%s', params['from_ts'])
else:
Expand Down
27 changes: 21 additions & 6 deletions cdx_toolkit/myrequests.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
from typing import Optional
import requests
import logging
import time
from urllib.parse import urlparse

from . import __version__
from .settings import (
DEFAULT_MIN_RETRY_INTERVAL,
CC_DATA_MIN_RETRY_INTERVAL,
CC_INDEX_MIN_RETRY_INTERVAL,
IA_MIN_RETRY_INTERVAL,
MAX_ERRORS,
WARNING_AFTER_N_ERRORS,
)

LOGGER = logging.getLogger(__name__)

Expand All @@ -23,19 +32,19 @@ def dns_fatal(hostname):
retry_info = {
'default': {
'next_fetch': 0,
'minimum_interval': 3.0,
'minimum_interval': DEFAULT_MIN_RETRY_INTERVAL,
},
'index.commoncrawl.org': {
'next_fetch': 0,
'minimum_interval': 1.0,
'minimum_interval': CC_INDEX_MIN_RETRY_INTERVAL,
},
'data.commoncrawl.org': {
'next_fetch': 0,
'minimum_interval': 0.55,
'minimum_interval': CC_DATA_MIN_RETRY_INTERVAL,
},
'web.archive.org': {
'next_fetch': 0,
'minimum_interval': 6.0,
'minimum_interval': IA_MIN_RETRY_INTERVAL,
},
}

Expand All @@ -60,12 +69,18 @@ def myrequests_get(
headers=None,
cdx=False,
allow404=False,
raise_error_after_n_errors: int = 100,
raise_warning_after_n_errors: int = 10,
raise_error_after_n_errors: Optional[int] = None,
raise_warning_after_n_errors: Optional[int] = None,
retry_max_sec: int = 60,
):
t = time.time()

if raise_error_after_n_errors is None:
raise_error_after_n_errors = MAX_ERRORS

if raise_warning_after_n_errors is None:
raise_warning_after_n_errors = WARNING_AFTER_N_ERRORS

hostname = urlparse(url).hostname
next_fetch, minimum_interval = get_retries(hostname)

Expand Down
Loading