Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
86b234d
Replace setup.py with pyproject.toml
StevenMaude Jan 12, 2026
70ba3e7
Run tests with pytest
StevenMaude Jan 12, 2026
e6cf661
Specify supported Python version for install
StevenMaude Jan 12, 2026
c0aea88
Pin a version of Python for development
StevenMaude Jan 12, 2026
7447981
Use pytest for testing in CI
StevenMaude Jan 12, 2026
662bf9b
Modernise tests to use pytest style
StevenMaude Jan 12, 2026
5802eca
Specify dev dependencies so `uv` installs them
StevenMaude Jan 12, 2026
619a4c6
Update supported Python versions in CI
StevenMaude Jan 12, 2026
e60a943
Add uv.lock
StevenMaude Jan 12, 2026
688660e
Update `.gitignore`
StevenMaude Jan 12, 2026
7bcd378
Use uv in CI
StevenMaude Jan 12, 2026
9b033a4
Switch to `uv` in Dependabot configuration
StevenMaude Jan 12, 2026
a62b362
Build package in CI
StevenMaude Jan 12, 2026
a6b332e
Add ruff to dev requirements
StevenMaude Jan 12, 2026
a5f9cda
Update uv.lock
StevenMaude Jan 12, 2026
7e97111
Specify a minimum `uv` version
StevenMaude Jan 12, 2026
407d3a9
Configure `ruff`
StevenMaude Jan 12, 2026
6cfab68
Add Makefile for common tasks
StevenMaude Jan 12, 2026
e0f4e27
Use `make test` in CI
StevenMaude Jan 12, 2026
8f5474e
Reformat with Ruff
StevenMaude Jan 12, 2026
59cf4a1
Adjust import ordering with `ruff`
StevenMaude Jan 12, 2026
b2ab01d
Add values to `__all__`
StevenMaude Jan 12, 2026
5ce519f
Check whether a value is `None` or not
StevenMaude Jan 12, 2026
9951593
Remove unused names
StevenMaude Jan 12, 2026
245f3d2
Check formatting and linting in CI
StevenMaude Jan 12, 2026
77c11de
Update developer notes
StevenMaude Jan 12, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ updates:
schedule:
interval: "daily"

- package-ecosystem: "pip"
- package-ecosystem: "uv"
directory: "/"
schedule:
interval: "daily"
25 changes: 19 additions & 6 deletions .github/workflows/ci-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,29 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python: ['3.8', '3.9', '3.10', '3.11']
python: ['3.10', '3.11', '3.12', '3.13', '3.14']

steps:
- name: Checkout code
uses: actions/checkout@v5

- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python }}
- name: Install the latest version of uv
uses: astral-sh/setup-uv@681c641aba71e4a1c380be3ab5e12ad51f415867 # v7.1.6

- name: Install python
# The matrix axis is named `python` (see `strategy.matrix`); an undefined
# key such as `matrix.python-version` expands to an empty string.
run: uv python install ${{ matrix.python }}

- name: Install dependencies
run: uv sync --frozen

- name: Check formatting
run: make check-format

- name: Check linting
run: make check-fix

- name: Run tests
run: python setup.py test
run: make test

- name: Build package
run: uv build
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
*.pyc
/dist/
/*.egg-info
.venv
build/
1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.14
18 changes: 18 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
RUN := uv run

test:
@$(RUN) pytest

fix:
@$(RUN) ruff check --fix

check-fix:
@$(RUN) ruff check

format:
@$(RUN) ruff format

check-format:
@$(RUN) ruff format --check

.PHONY: test fix check-fix format check-format
13 changes: 9 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ pip: (requires git installed)
pip: (without git)

pip install https://github.com/pdftables/python-pdftables-api/archive/master.tar.gz

Locally:

python setup.py install
For local development:

uv sync

### Upgrading

Expand Down Expand Up @@ -48,7 +48,12 @@ To specify Excel (single sheet) or Excel (multiple sheets) use `c.xlsx_single` o

## Test

python -m unittest test.test_pdftables_api
Tests run with pytest: `make test`

## Linting and formatting

* Format with `make format`
* Apply Ruff fixes with `make fix`

## Configuring a timeout

Expand Down
26 changes: 19 additions & 7 deletions pdftables_api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from .pdftables_api import (Client,
APIException,
FORMAT_CSV,
FORMAT_XLSX,
FORMAT_XLSX_MULTIPLE,
FORMAT_XLSX_SINGLE,
FORMAT_XML)
from .pdftables_api import (
FORMAT_CSV,
FORMAT_XLSX,
FORMAT_XLSX_MULTIPLE,
FORMAT_XLSX_SINGLE,
FORMAT_XML,
APIException,
Client,
)

__all__ = [
"FORMAT_CSV",
"FORMAT_XLSX",
"FORMAT_XLSX_MULTIPLE",
"FORMAT_XLSX_SINGLE",
"FORMAT_XML",
"APIException",
"Client",
]
97 changes: 48 additions & 49 deletions pdftables_api/pdftables_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,38 +13,37 @@
# limitations under the License.

import os

import requests

from shutil import copyfileobj

import requests

FORMAT_CSV = 'csv'
FORMAT_HTML = 'html'
FORMAT_XLSX_MULTIPLE = 'xlsx-multiple'
FORMAT_XLSX_SINGLE = 'xlsx-single'
FORMAT_CSV = "csv"
FORMAT_HTML = "html"
FORMAT_XLSX_MULTIPLE = "xlsx-multiple"
FORMAT_XLSX_SINGLE = "xlsx-single"
FORMAT_XLSX = FORMAT_XLSX_MULTIPLE
FORMAT_XML = 'xml'
FORMAT_XML = "xml"

_API_URL = 'https://pdftables.com/api'
_API_URL = "https://pdftables.com/api"
_DEFAULT_TIMEOUT = (10, 300) # seconds (connect and read)
_FORMATS_EXT = {
FORMAT_CSV: '.csv',
FORMAT_HTML: '.html',
FORMAT_XLSX: '.xlsx',
FORMAT_XLSX_MULTIPLE: '.xlsx',
FORMAT_XLSX_SINGLE: '.xlsx',
FORMAT_XML: '.xml',
FORMAT_CSV: ".csv",
FORMAT_HTML: ".html",
FORMAT_XLSX: ".xlsx",
FORMAT_XLSX_MULTIPLE: ".xlsx",
FORMAT_XLSX_SINGLE: ".xlsx",
FORMAT_XML: ".xml",
}
_EXT_FORMATS = {
'.csv': FORMAT_CSV,
'.html': FORMAT_HTML,
'.xlsx': FORMAT_XLSX,
'.xml': FORMAT_XML,
".csv": FORMAT_CSV,
".html": FORMAT_HTML,
".xlsx": FORMAT_XLSX,
".xml": FORMAT_XML,
}
_STRING_FORMATS = {FORMAT_CSV, FORMAT_HTML, FORMAT_XML}

class Client(object):

class Client:
def __init__(self, api_key, api_url=_API_URL, timeout=_DEFAULT_TIMEOUT):
self.api_key = api_key
self.api_url = api_url
Expand Down Expand Up @@ -98,60 +97,61 @@ def html(self, pdf_path, html_path=None):
"""
return self.convert(pdf_path, html_path, out_format=FORMAT_HTML)

def convert(self, pdf_path, out_path=None, out_format=None, query_params=None, **requests_params):
def convert(
self,
pdf_path,
out_path=None,
out_format=None,
query_params=None,
**requests_params,
):
"""
Convert PDF given by `pdf_path` into `out_format` at `out_path`.

If `out_path` is None, returns a string containing the contents, or
bytes for binary output types (e.g., XLSX).
"""
(out_path, out_format) = Client.ensure_format_ext(out_path, out_format)
with open(pdf_path, 'rb') as pdf_fo:
response = self.request(pdf_fo, out_format, query_params,
**requests_params)
with open(pdf_path, "rb") as pdf_fo:
response = self.request(pdf_fo, out_format, query_params, **requests_params)

if out_path is None:
use_text = out_format in _STRING_FORMATS
return response.text if use_text else response.content

with open(out_path, 'wb') as out_fo:
with open(out_path, "wb") as out_fo:
converted_fo = response.raw
# Ensure that gzip content is decoded.
converted_fo.decode_content = True
copyfileobj(converted_fo, out_fo)

def dump(self, pdf_fo, out_format=None, query_params=None,
**requests_params):
def dump(self, pdf_fo, out_format=None, query_params=None, **requests_params):
"""
Convert PDF file object given by `pdf_fo` into an output stream iterator.
"""
response = self.request(pdf_fo, out_format, query_params,
**requests_params)
response = self.request(pdf_fo, out_format, query_params, **requests_params)

return response.iter_content(chunk_size=4096)

def request(self, pdf_fo, out_format=None, query_params=None,
**requests_params):
def request(self, pdf_fo, out_format=None, query_params=None, **requests_params):
"""
Convert the PDF file object `pdf_fo`, returning a requests.Response object.
"""
if self.api_key == "":
raise APIException("Invalid API key")

if 'timeout' not in requests_params:
requests_params.update({'timeout': self.timeout})
if "timeout" not in requests_params:
requests_params.update({"timeout": self.timeout})

(_, out_format) = Client.ensure_format_ext(None, out_format)
url = self.api_url
files = {'f': ('file.pdf', pdf_fo)}
files = {"f": ("file.pdf", pdf_fo)}
params = query_params if query_params else {}
params.update({'key': self.api_key, 'format': out_format})
params.update({"key": self.api_key, "format": out_format})

response = requests.post(url,
files=files,
stream=True,
params=params,
**requests_params)
response = requests.post(
url, files=files, stream=True, params=params, **requests_params
)

if response.status_code == 400:
raise APIException("Unknown file format")
Expand All @@ -172,9 +172,9 @@ def remaining(self, query_params=None, **requests_params):
if self.api_key == "":
raise APIException("Invalid API key")

url = self.api_url+'/remaining'
url = self.api_url + "/remaining"
params = query_params if query_params else {}
params.update({'key': self.api_key})
params.update({"key": self.api_key})

response = requests.get(url, params=params, **requests_params)

Expand All @@ -190,29 +190,28 @@ def ensure_format_ext(out_path, out_format):
Ensure the appropriate file extension and format is given. If not
provided, try to guess either.
"""
if out_format != None and out_format not in _FORMATS_EXT.keys():
raise ValueError('Invalid output format')
if out_format is not None and out_format not in _FORMATS_EXT.keys():
raise ValueError("Invalid output format")

default_format = FORMAT_XLSX_MULTIPLE

# Check if stdout is desired
if out_path == None:
if out_format == None:
if out_path is None:
if out_format is None:
out_format = default_format
return (None, out_format)

_, ext = os.path.splitext(out_path)

# Guess output format by file extension
if out_format == None:
if out_format is None:
if ext in _FORMATS_EXT.values():
out_format = _EXT_FORMATS[ext]
else:
out_format = default_format

# Ensure correct file extension by output format
if (ext not in _FORMATS_EXT.values() or
ext != _FORMATS_EXT[out_format]):
if ext not in _FORMATS_EXT.values() or ext != _FORMATS_EXT[out_format]:
out_path = out_path + _FORMATS_EXT[out_format]

return (out_path, out_format)
Expand Down
52 changes: 52 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
[build-system]
# The [project] table below (PEP 621 metadata) is only read by setuptools >= 61.
requires = ["setuptools>=61", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "pdftables-api"
version = "2.0.0"
requires-python = ">=3.10"
description = "PDFTables.com Python API library."
readme = "README.md"
license = { text = "Apache License 2.0" }
keywords = ["pdf", "tables", "excel", "csv", "xml", "api"]
authors = [ { name = "The Sensible Code Company", email = "[email protected]" } ]
urls = { "Homepage" = "https://github.com/sensiblecode/python-pdftables-api" }
dependencies = ["requests"]
classifiers = [
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Topic :: Software Development :: Libraries",
"Topic :: System :: Networking",
]

[dependency-groups]
dev = [
"pytest",
"requests-mock",
"ruff",
]

[tool.pytest.ini_options]
testpaths = ["test"]
python_files = ["test_*.py"]

[tool.ruff]
line-length = 88

[tool.ruff.lint]
# "A": flake8-builtins
# "I": isort
# "UP": pyupgrade
# "W": pycodestyle
extend-select = ["A", "I", "UP", "W"]

[tool.setuptools.packages.find]
where = ["."]
include = ["pdftables_api"]

[tool.uv]
required-version = ">=0.9"
Loading