Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Copyright (c) 2016, PDFTables.com
Copyright (c) 2026, PDFTables.com
All rights reserved.

Redistribution and use in source and binary forms, with or without
Expand Down
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,26 @@ To convert to CSV, XML or HTML simply change `c.xlsx` to be `c.csv`, `c.xml` or

To specify Excel (single sheet) or Excel (multiple sheets), use `c.xlsx_single` or `c.xlsx_multiple` respectively.

## Extractor

You can specify which extraction engine to use when creating a `Client`. The available extractors are `standard` (default), `ai-1`, and `ai-2`.

For AI extractors (`ai-1` and `ai-2`), you can also specify an `extract` option to control what content is extracted: `tables` (default) or `tables-paragraphs`.

```py
from pdftables_api import (Client, EXTRACTOR_AI_1, EXTRACTOR_AI_2,
EXTRACT_TABLES, EXTRACT_TABLES_PARAGRAPHS)

# Standard extractor (default)
c_standard = Client('my-api-key')

# AI extractors for complex documents
c_ai_1 = Client('my-api-key', extractor=EXTRACTOR_AI_1, extract=EXTRACT_TABLES)
c_ai_2 = Client('my-api-key', extractor=EXTRACTOR_AI_2, extract=EXTRACT_TABLES_PARAGRAPHS)
```

See the [PDFTables API documentation](https://pdftables.com/pdf-to-excel-api#extractors) for details.

## Test

Tests run with pytest: `make test`
Expand Down
12 changes: 11 additions & 1 deletion pdftables_api/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2016 The Sensible Code Company
# Copyright 2026 Cantabular Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -13,6 +13,11 @@
# limitations under the License.

from .pdftables_api import (
EXTRACT_TABLES,
EXTRACT_TABLES_PARAGRAPHS,
EXTRACTOR_AI_1,
EXTRACTOR_AI_2,
EXTRACTOR_STANDARD,
FORMAT_CSV,
FORMAT_XLSX,
FORMAT_XLSX_MULTIPLE,
Expand All @@ -30,4 +35,9 @@
"FORMAT_XML",
"APIException",
"Client",
"EXTRACTOR_STANDARD",
"EXTRACTOR_AI_1",
"EXTRACTOR_AI_2",
"EXTRACT_TABLES",
"EXTRACT_TABLES_PARAGRAPHS",
]
71 changes: 68 additions & 3 deletions pdftables_api/pdftables_api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2016 The Sensible Code Company
# Copyright 2026 Cantabular Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -42,13 +42,71 @@
}
_STRING_FORMATS = {FORMAT_CSV, FORMAT_HTML, FORMAT_XML}

# Names of the extraction engines a Client can be configured with.
EXTRACTOR_STANDARD = "standard"
EXTRACTOR_AI_1 = "ai-1"
EXTRACTOR_AI_2 = "ai-2"

# Values a Client can be given for its `extract` option.
EXTRACT_TABLES = "tables"
EXTRACT_TABLES_PARAGRAPHS = "tables-paragraphs"

# Both AI extractors take the same extract values. A tuple (rather than a
# set) keeps the order stable when the values are listed in error messages.
_AI_EXTRACT_VALUES = (EXTRACT_TABLES, EXTRACT_TABLES_PARAGRAPHS)

# Extract values permitted for each extractor; an empty tuple means the
# extractor takes no extract option at all.
_VALID_EXTRACTOR_VALUES = {
    EXTRACTOR_STANDARD: (),
    EXTRACTOR_AI_1: _AI_EXTRACT_VALUES,
    EXTRACTOR_AI_2: _AI_EXTRACT_VALUES,
}

# Every known extractor name, in the order declared above.
_VALID_EXTRACTORS = tuple(_VALID_EXTRACTOR_VALUES)


class Client:
def __init__(self, api_key, api_url=_API_URL, timeout=_DEFAULT_TIMEOUT):
def __init__(
    self,
    api_key,
    api_url=_API_URL,
    timeout=_DEFAULT_TIMEOUT,
    extractor=EXTRACTOR_STANDARD,
    extract=None,
):
    """
    Store the connection settings and the extraction configuration.

    `extractor` and `extract` are checked together by
    `_validate_extractor`, which raises ValueError for an unknown
    extractor or an extract value that the extractor does not accept.
    """
    self.api_key, self.api_url, self.timeout = api_key, api_url, timeout

    # Only keep the extraction settings once the pair has been accepted.
    self._validate_extractor(extractor, extract)
    self.extractor, self.extract = extractor, extract

@staticmethod
def _validate_extractor(extractor, extract):
    """
    Check that `extractor` is known and that `extract` is allowed for it.

    Raises ValueError when the extractor is not one of the known names,
    when an extract value is given to an extractor that accepts none, or
    when the extract value is not among those the extractor permits.
    An `extract` of None is accepted for every known extractor.
    """
    if extractor not in _VALID_EXTRACTORS:
        known = ", ".join(_VALID_EXTRACTORS)
        raise ValueError(
            f'Invalid extractor "{extractor}". Valid options are: {known}'
        )

    allowed = _VALID_EXTRACTOR_VALUES[extractor]
    if extract is None or extract in allowed:
        return

    # From here on the extract value has been rejected; the message depends
    # on whether this extractor takes any extract values at all.
    if not allowed:
        raise ValueError(
            f'Extractor "{extractor}" does not support extract parameter'
        )

    allowed_str = ", ".join(map(str, allowed))
    raise ValueError(
        f'Invalid extract value "{extract}" for extractor "{extractor}". Valid values are: {allowed_str}'
    )

def xlsx(self, pdf_path, xlsx_path=None):
"""
Convenience method to convert PDF to XLSX multiple sheets.
Expand Down Expand Up @@ -147,7 +205,14 @@ def request(self, pdf_fo, out_format=None, query_params=None, **requests_params)
url = self.api_url
files = {"f": ("file.pdf", pdf_fo)}
params = query_params if query_params else {}
params.update({"key": self.api_key, "format": out_format})
params.update(
{
"key": self.api_key,
"format": out_format,
"extractor": self.extractor,
"extract": self.extract,
}
)

response = requests.post(
url, files=files, stream=True, params=params, **requests_params
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ description = "PDFTables.com Python API library."
readme = "README.md"
license = { text = "Apache License 2.0" }
keywords = ["pdf", "tables", "excel", "csv", "xml", "api"]
authors = [ { name = "The Sensible Code Company", email = "[email protected]" } ]
urls = { "Homepage" = "https://github.com/sensiblecode/python-pdftables-api" }
authors = [ { name = "Cantabular Ltd", email = "[email protected]" } ]
urls = { "Homepage" = "https://github.com/pdftables/python-pdftables-api" }
dependencies = ["requests"]
classifiers = [
"Development Status :: 5 - Production/Stable",
Expand Down
163 changes: 161 additions & 2 deletions test/test_pdftables_api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2016 The Sensible Code Company
# Copyright 2026 Cantabular Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -20,7 +20,15 @@
import pytest
import requests_mock

from pdftables_api import APIException, Client
from pdftables_api import (
EXTRACT_TABLES,
EXTRACT_TABLES_PARAGRAPHS,
EXTRACTOR_AI_1,
EXTRACTOR_AI_2,
EXTRACTOR_STANDARD,
APIException,
Client,
)


class TestEnsureExtFormat(TestCase):
Expand Down Expand Up @@ -181,6 +189,157 @@ def test_response_unknown_file_format(self):
c.dump(png_fo)


class TestExtractorParameters(TestCase):
    """
    Check how Client's extractor/extract settings reach the request URL.

    Every mock is registered with ``complete_qs=True``. By default
    requests_mock accepts a request whose query string merely contains the
    registered parameters, so without it the tests that expect no
    ``extract`` parameter would still pass if one were sent.
    """

    def _assert_convert_posts_to(self, client, expected_url, text, **convert_kwargs):
        """
        Convert a throwaway PDF with `client`, mocking only `expected_url`.

        The mock answers with `text`. A POST to any other URL, or with a
        different set of query parameters, is not matched by the mock.
        Extra keyword arguments are passed straight to `client.convert`.
        """
        with requests_mock.mock() as m:
            m.post(expected_url, text=text, complete_qs=True)

            with NamedTemporaryFile(suffix="test.pdf") as tf:
                tf.write(b"Hello world")
                tf.file.close()
                client.convert(tf.name, **convert_kwargs)

            # Guard against convert() returning without issuing a request.
            self.assertTrue(m.called)

    def test_default_extractor(self):
        """Test that default extractor is 'standard' with no extract parameter."""
        self._assert_convert_posts_to(
            Client("fake_key"),
            "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=standard",
            "xlsx output",
        )

    def test_ai1_extractor_with_no_extract(self):
        """Test ai-1 extractor with no extract parameter."""
        self._assert_convert_posts_to(
            Client("fake_key", extractor=EXTRACTOR_AI_1),
            "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-1",
            "xlsx output",
        )

    def test_ai1_extractor_with_tables(self):
        """Test ai-1 extractor with 'tables' extract parameter."""
        self._assert_convert_posts_to(
            Client("fake_key", extractor=EXTRACTOR_AI_1, extract=EXTRACT_TABLES),
            "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-1&extract=tables",
            "xlsx output",
        )

    def test_ai1_extractor_with_tables_paragraphs(self):
        """Test ai-1 extractor with 'tables-paragraphs' extract parameter."""
        self._assert_convert_posts_to(
            Client(
                "fake_key", extractor=EXTRACTOR_AI_1, extract=EXTRACT_TABLES_PARAGRAPHS
            ),
            "https://pdftables.com/api?key=fake_key&format=csv&extractor=ai-1&extract=tables-paragraphs",
            "csv output",
            out_format="csv",
        )

    def test_ai2_extractor_with_no_extract(self):
        """Test ai-2 extractor with no extract parameter."""
        self._assert_convert_posts_to(
            Client("fake_key", extractor=EXTRACTOR_AI_2),
            "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-2",
            "xlsx output",
        )

    def test_ai2_extractor_with_tables(self):
        """Test ai-2 extractor with 'tables' extract parameter."""
        self._assert_convert_posts_to(
            Client("fake_key", extractor=EXTRACTOR_AI_2, extract=EXTRACT_TABLES),
            "https://pdftables.com/api?key=fake_key&format=xlsx-multiple&extractor=ai-2&extract=tables",
            "xlsx output",
        )

    def test_ai2_extractor_with_tables_paragraphs(self):
        """Test ai-2 extractor with 'tables-paragraphs' extract parameter."""
        self._assert_convert_posts_to(
            Client(
                "fake_key", extractor=EXTRACTOR_AI_2, extract=EXTRACT_TABLES_PARAGRAPHS
            ),
            "https://pdftables.com/api?key=fake_key&format=csv&extractor=ai-2&extract=tables-paragraphs",
            "csv output",
            out_format="csv",
        )

    def test_standard_extractor_no_extract_param_in_url(self):
        """Test that standard extractor doesn't include extract parameter in URL."""
        # Note: no 'extract' parameter in the URL for standard extractor;
        # the exact query-string match is what enforces its absence.
        self._assert_convert_posts_to(
            Client("fake_key", extractor=EXTRACTOR_STANDARD, extract=None),
            "https://pdftables.com/api?key=fake_key&format=csv&extractor=standard",
            "csv output",
            out_format="csv",
        )

    def test_invalid_extractor_raises_error(self):
        """Test that invalid extractor raises ValueError."""
        # The full stop is escaped so the pattern matches the literal message.
        with pytest.raises(
            ValueError,
            match=r'^Invalid extractor "invalid"\. Valid options are: standard, ai-1, ai-2$',
        ):
            Client("fake_key", extractor="invalid")

    def test_invalid_extract_for_standard_raises_error(self):
        """Test that providing extract parameter for standard extractor raises ValueError."""
        with pytest.raises(
            ValueError,
            match=r'^Extractor "standard" does not support extract parameter$',
        ):
            Client("fake_key", extractor=EXTRACTOR_STANDARD, extract=EXTRACT_TABLES)

    def test_invalid_extract_for_ai_raises_error(self):
        """Test that invalid extract value for AI extractor raises ValueError."""
        with pytest.raises(
            ValueError,
            match=r'^Invalid extract value "invalid" for extractor "ai-1"\. Valid values are: tables, tables-paragraphs$',
        ):
            Client("fake_key", extractor=EXTRACTOR_AI_1, extract="invalid")

    def test_invalid_extract_for_ai2_raises_error(self):
        """Test that invalid extract value for AI-2 extractor raises ValueError."""
        with pytest.raises(
            ValueError,
            match=r'^Invalid extract value "invalid" for extractor "ai-2"\. Valid values are: tables, tables-paragraphs$',
        ):
            Client("fake_key", extractor=EXTRACTOR_AI_2, extract="invalid")


def consume(s):
r = b""
for chunk in s:
Expand Down