databricks-industry-solutions · ghanse · Apr 30, 2026 · May 5, 2026 · May 12, 2026 · May 12, 2026
diff --git a/.build-constraints.txt b/.build-constraints.txt
@@ -13,7 +13,7 @@ pluggy==1.6.0 \
     --hash=sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3 \
     --hash=sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746
     # via hatchling
-trove-classifiers==2026.4.28.13 \
-    --hash=sha256:8f4b1eb4e16296b57d612965444f87a83861cc989a0451ac97fe4265ddef03b8 \
-    --hash=sha256:c85bb8a53c3de7330d1699b844ed9fb809a602a09ac15dc79ad6d1a509be0676
+trove-classifiers==2026.5.7.17 \
+    --hash=sha256:5ec0800de5e2ddbd7c663cb4c0c15328f132dc168813897c18866c5c7b93db33 \
+    --hash=sha256:a04a48f8f0a787cb996514d3969ac7608aa3c60cb15d073c1e02801e60533e80
     # via hatchling
diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -0,0 +1,68 @@
+name: publish
+
+on:
+  push:
+    branches: [ main, preview ]
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+concurrency:
+  group: "pages"
+  cancel-in-progress: false
+
+jobs:
+  build:
+    name: Build Docusaurus
+    runs-on: html_publisher
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          fetch-depth: 0
+
+      - name: Setup uv
+        uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 # v7.3.1
+        with:
+          version: "0.11.2"
+          enable-cache: true
+          cache-dependency-glob: "**/uv.lock"
+
+      - name: Setup Node.js
+        uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0
+        with:
+          node-version: 22
+          cache: yarn
+          cache-dependency-path: docs/python-data-sources/yarn.lock
+
+      - name: Install docs dependencies
+        run: make docs-install
+
+      - name: Build website
+        run: make docs-build
+
+      - name: Upload Build Artifact
+        uses: actions/upload-pages-artifact@7b1f4a764d45c48632c6b24a0339c27f5614fb0b # v4.0.0
+        with:
+          path: docs/python-data-sources/build
+
+  deploy:
+    name: Deploy to GitHub Pages
+    needs: build
+
+    permissions:
+      pages: write
+      id-token: write
+
+    environment:
+      name: ${{ github.ref_name == 'main' && 'github-pages' || 'preview' }}
+      url: ${{ steps.deployment.outputs.page_url }}
+
+    runs-on: html_publisher
+
+    steps:
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4.0.5
diff --git a/Makefile b/Makefile
@@ -1,12 +1,12 @@
-.PHONY: all clean dev fmt lint test coverage lock-dependencies verify-lock build
+.PHONY: all clean dev fmt lint test coverage lock-dependencies verify-lock build docs-install docs-build docs-serve docs-serve-dev docs-clean
 
 all: clean lint fmt test
 
 export UV_FROZEN := 1
 
 UV_RUN := uv run --exact --all-extras
 
-clean:
+clean: docs-clean
 	rm -fr .venv clean htmlcov .pytest_cache .ruff_cache .coverage coverage.xml dist
 	find . -name '__pycache__' -print0 | xargs -0 rm -fr
 
@@ -29,7 +29,7 @@ verify-lock:
 	if [ -n "$$bad" ]; then \
 		echo "uv.lock contains non-public registry URLs:"; \
 		echo "$$bad"; \
-		echo "Run 'make lock' to regenerate and sanitize."; \
+		echo "Run 'make lock-dependencies' to regenerate and sanitize."; \
 		exit 1; \
 	fi
 
@@ -55,3 +55,21 @@ coverage:
 
 e2e:
 	$(UV_RUN) pytest -rs --timeout 30 tests/e2e
+
+docs-install:
+	yarn --cwd docs/python-data-sources install --frozen-lockfile
+
+docs-build:
+	uv run --group docs pydoc-markdown
+	yarn --cwd docs/python-data-sources build
+
+docs-serve-dev:
+	uv run --group docs pydoc-markdown
+	yarn --cwd docs/python-data-sources start
+
+docs-serve: docs-build
+	yarn --cwd docs/python-data-sources serve
+
+docs-clean:
+	rm -rf docs/python-data-sources/build docs/python-data-sources/.docusaurus docs/python-data-sources/.cache
+	find docs/python-data-sources/docs/reference/api -mindepth 1 -not -name 'index.mdx' -exec rm -rf {} +
diff --git a/docs/python-data-sources/.gitignore b/docs/python-data-sources/.gitignore
@@ -0,0 +1,23 @@
+# Dependencies
+/node_modules
+
+# Production
+/build
+
+# Generated files
+.docusaurus
+.cache-loader
+
+# Generated API reference (pydoc-markdown output)
+/docs/reference/api/python_data_sources/
+
+# Misc
+.DS_Store
+.env.local
+.env.development.local
+.env.test.local
+.env.production.local
+
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
diff --git a/docs/python-data-sources/docs/demos.mdx b/docs/python-data-sources/docs/demos.mdx
@@ -0,0 +1,14 @@
+---
+sidebar_position: 3
+---
+
+# Demos
+
+This project contains example notebooks in [`examples/`](https://github.com/databricks-industry-solutions/python-data-sources/tree/main/examples)
+for you to work with custom data sources.
+
+* [`examples/zipdcm/zip-dicom-demo.ipynb`](https://github.com/databricks-industry-solutions/python-data-sources/blob/main/examples/zipdcm/zip-dicom-demo.ipynb) shows how to register a `zipdcm` data source, read a zipped archive of DICOM files, and access the pixel arrays.
+
+## Contributing a new demo
+
+New example notebooks are welcome. Place the notebook under `examples/<connector>/` and reference it from this page.
diff --git a/docs/python-data-sources/docs/installation.mdx b/docs/python-data-sources/docs/installation.mdx
@@ -0,0 +1,55 @@
+---
+sidebar_position: 2
+---
+
+import Admonition from '@theme/Admonition';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Installation
+
+Data sources can be installed from PyPI using `pip` commands. Each data source is packaged as an optional extra module. You can 
+limit installation of data sources and their dependencies to only the sources you need.
+
+<Admonition type="info" title="Required Dependencies">
+Installing Python Data Sources from this library requires Spark 4.0+. Each data source includes its required Python dependencies.
+</Admonition>
+
+## Installing data sources
+
+List required data sources when installing using `pip` commands.
+
+```bash
+pip install "python-data-sources[mcap]"       # MCAP robotics logs
+pip install "python-data-sources[mqtt]"       # MQTT streaming
+pip install "python-data-sources[zipdcm]"     # DICOM in ZIP
+pip install "python-data-sources[mcap,mqtt]"  # Both MCAP and MQTT
+pip install "python-data-sources[all]"        # Installs all data sources
+```
+
+<Admonition type="tip" title="Install only required sources">
+Each data source requires its own dependencies. Because new data sources may be added to the project over time, it is best to
+limit installation to sources required for your workloads.
+</Admonition>
+
+## Registering data sources
+
+Data sources must be registered to be used in an active `SparkSession`. Once data sources are registered, you can reference them
+using their data source name (e.g. "mcap" for the MCAP data source). See the [API reference](/docs/reference) for the full list of 
+registered data sources and their options.
+
+```python
+from pyspark.sql import SparkSession
+from python_data_sources.mcap import McapDataSource
+
+# Register the data source
+spark = SparkSession.builder.getOrCreate()
+spark.dataSource.register(McapDataSource)
+
+# Get the data source name
+mcap_source_name = McapDataSource.name()  # 'mcap'
+
+# Read MCAP data using the registered data source
+df = spark.read.format("mcap").load("/path/to/logs/")
+df.show()
+```
diff --git a/docs/python-data-sources/docs/motivation.mdx b/docs/python-data-sources/docs/motivation.mdx
@@ -0,0 +1,25 @@
+---
+sidebar_position: 1
+---
+
+# Motivation
+
+Apache Spark ships native data sources commonly used in analytics workloads. The long tail of data sources is not covered by 
+standard Spark capabilities and is served by a patchwork of Java libraries or user-defined functions that are difficult to maintain.
+
+The [PySpark DataSource API](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/datasource.html) allows users to implement and register Spark data source formats using `spark.read.format(...)`. 
+Sources that implement `DataSourceReader` and `DataSourceWriter` contracts can be used to read or write data with Spark.
+
+**python-data-sources** is a collection of Python-native connectors for formats and protocols that don't have a built-in Spark reader.
+
+## Connectors
+
+| Connector | Read | Write  | Notes                                                          |
+|-----------|------|--------|----------------------------------------------------------------|
+| `mcap`    | ✓    | —      | MCAP container format used for robotics, AV, and ROS bag logs. |
+| `mqtt`    | ✓    | —      | MQTT broker as a Structured Streaming source.                  |
+| `zipdcm`  | ✓    | —      | DICOM medical images packaged inside ZIP archives.             |
+
+## Contributing
+
+If you need a connector that isn't provided as a Python Data Source, [open a GitHub issue](https://github.com/databricks-industry-solutions/python-data-sources/issues).
diff --git a/docs/python-data-sources/docs/reference/api/common/range_partition.md b/docs/python-data-sources/docs/reference/api/common/range_partition.md
@@ -0,0 +1,13 @@
+---
+sidebar_label: range_partition
+title: python_data_sources.common.range_partition
+---
+
+## RangePartition Objects
+
+```python
+class RangePartition(InputPartition)
+```
+
+This DataSource InputPartition class provides tracking of ranges within a list
+
diff --git a/docs/python-data-sources/docs/reference/api/index.mdx b/docs/python-data-sources/docs/reference/api/index.mdx
@@ -0,0 +1,9 @@
+---
+sidebar_position: 1
+---
+
+# API Reference
+
+The module-by-module API reference is generated by `pydoc-markdown` from the docstrings under `src/python_data_sources/`. Run `make docs-build` to regenerate the surrounding pages.
+
+If you're reading this in a freshly cloned tree, the per-module pages aren't here yet — they're written into this directory the first time `make docs-build` runs.
diff --git a/docs/python-data-sources/docs/reference/api/mcap/mcap_datasource.md b/docs/python-data-sources/docs/reference/api/mcap/mcap_datasource.md
@@ -0,0 +1,117 @@
+---
+sidebar_label: mcap_datasource
+title: python_data_sources.mcap.mcap_datasource
+---
+
+### path\_handler
+
+```python
+def path_handler(path: str,
+                 glob_pattern: str,
+                 recursive: bool = False) -> list
+```
+
+Discover files matching the glob pattern in the given path.
+
+**Arguments**:
+
+- `path` - Path to search for files
+- `glob_pattern` - Glob pattern to match files (e.g., &quot;*.mcap&quot;)
+- `recursive` - If True, recursively search subdirectories using rglob
+
+
+**Returns**:
+
+  List of file paths matching the pattern
+
+### decode\_protobuf\_message
+
+```python
+def decode_protobuf_message(message, schema, _reader)
+```
+
+Decode protobuf messages.
+
+### decode\_json\_message
+
+```python
+def decode_json_message(message, _schema, _reader)
+```
+
+Decode JSON messages.
+
+### decode\_fallback
+
+```python
+def decode_fallback(message, _schema, _reader)
+```
+
+Fallback decoder for unknown formats.
+
+## MCAPDataSourceReader Objects
+
+```python
+class MCAPDataSourceReader(DataSourceReader)
+```
+
+Facilitate reading MCAP (ROS 2 bag) files.
+
+### partitions
+
+```python
+def partitions() -> Sequence[RangePartition]
+```
+
+Compute &#x27;splits&#x27; of the data to read.
+
+**Returns**:
+
+  List of RangePartition objects
+
+### read
+
+```python
+def read(partition: InputPartition) -> Iterator[tuple]
+```
+
+Executor level method, performs read by Range Partition.
+
+**Arguments**:
+
+- `partition` - The partition to read
+
+
+**Returns**:
+
+  Iterator of tuples (sequence, topic, schema, encoding, log_time, data_json)
+
+## MCAPDataSource Objects
+
+```python
+class MCAPDataSource(DataSource)
+```
+
+A data source for batch query over MCAP (ROS 2 bag) files.
+
+Usage:
+    # Read all topics
+    df = spark.read.format(&quot;mcap&quot;).option(&quot;path&quot;, &quot;/path/to/mcap/files&quot;).load()
+
+    # Filter by specific topic at read time (more efficient than DataFrame filter)
+    df = spark.read.format(&quot;mcap&quot;)             .option(&quot;path&quot;, &quot;/path/to/mcap/files&quot;)             .option(&quot;topicFilter&quot;, &quot;pose&quot;)             .load()
+
+Options:
+    - path: Path to MCAP file(s) or directory (required)
+    - pathGlobFilter: Glob pattern for file matching (default: &quot;*.mcap&quot;)
+    - numPartitions: Number of partitions to split files across (default: 4)
+    - recursiveFileLookup: Recursively search subdirectories (default: false)
+    - topicFilter: Filter messages by topic name (optional). Use &quot;*&quot; or omit to read all topics.
+
+Schema:
+    - sequence: BIGINT - The message sequence number from MCAP
+    - topic: STRING - The message topic
+    - schema: STRING - The schema name
+    - encoding: STRING - The encoding type (protobuf, json, etc.)
+    - log_time: BIGINT - The message timestamp in nanoseconds
+    - data: STRING - JSON string containing all message fields
+