From e8c0259bacd6ba7029447898d370a58430ec552f Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 12 Apr 2026 09:13:36 -0400 Subject: [PATCH 1/3] Add data build metadata helpers --- policyengine_uk/build_metadata.py | 85 ++++++++++++++++++++ policyengine_uk/tests/test_build_metadata.py | 43 ++++++++++ 2 files changed, 128 insertions(+) create mode 100644 policyengine_uk/build_metadata.py create mode 100644 policyengine_uk/tests/test_build_metadata.py diff --git a/policyengine_uk/build_metadata.py b/policyengine_uk/build_metadata.py new file mode 100644 index 000000000..14d7c2aef --- /dev/null +++ b/policyengine_uk/build_metadata.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +from functools import lru_cache +import hashlib +from importlib import metadata +from pathlib import Path +import subprocess + +PACKAGE_NAME = "policyengine-uk" +PACKAGE_ROOT = Path(__file__).resolve().parent +DATA_BUILD_SURFACE = ( + "data", + "parameters", + "variables", + "entities.py", + "microsimulation.py", + "simulation.py", + "system.py", + "tax_benefit_system.py", + "programs.yaml", +) + + +def _iter_surface_files() -> list[Path]: + files: list[Path] = [] + for relative_path in DATA_BUILD_SURFACE: + path = PACKAGE_ROOT / relative_path + if path.is_file(): + files.append(path) + continue + if path.is_dir(): + files.extend( + child + for child in sorted(path.rglob("*")) + if child.is_file() + and "__pycache__" not in child.parts + and child.suffix not in {".pyc", ".pyo"} + ) + return files + + +def _get_package_version() -> str | None: + try: + return metadata.version(PACKAGE_NAME) + except metadata.PackageNotFoundError: + return None + + +def _get_git_sha() -> str | None: + for candidate in (PACKAGE_ROOT, *PACKAGE_ROOT.parents): + if not (candidate / ".git").exists(): + continue + try: + return ( + subprocess.check_output( + ["git", "-C", str(candidate), "rev-parse", "HEAD"], + stderr=subprocess.DEVNULL, + text=True, + ) + .strip() + ) + except Exception: + return None + return None + + +@lru_cache(maxsize=1) +def get_data_build_fingerprint() -> str: + digest = hashlib.sha256() + for file_path in _iter_surface_files(): + relative_path = file_path.relative_to(PACKAGE_ROOT).as_posix() + digest.update(relative_path.encode("utf-8")) + digest.update(b"\0") + digest.update(file_path.read_bytes()) + digest.update(b"\0") + return f"sha256:{digest.hexdigest()}" + + +def get_data_build_metadata() -> dict[str, str | None]: + return { + "name": PACKAGE_NAME, + "version": _get_package_version(), + "git_sha": _get_git_sha(), + "data_build_fingerprint": get_data_build_fingerprint(), + } diff --git a/policyengine_uk/tests/test_build_metadata.py b/policyengine_uk/tests/test_build_metadata.py new file mode 100644 index 000000000..26bde0dfd --- /dev/null +++ b/policyengine_uk/tests/test_build_metadata.py @@ -0,0 +1,43 @@ +from unittest.mock import patch + +from policyengine_uk.build_metadata import ( + get_data_build_fingerprint, + get_data_build_metadata, +) + + +def test_data_build_fingerprint_is_stable_within_process(): + get_data_build_fingerprint.cache_clear() + + first = get_data_build_fingerprint() + second = get_data_build_fingerprint() + + assert first.startswith("sha256:") + assert first == second + + +def test_get_data_build_metadata_includes_version_git_sha_and_fingerprint(): + get_data_build_fingerprint.cache_clear() + + with ( + patch( + "policyengine_uk.build_metadata._get_package_version", + return_value="2.74.0", + ), + patch( + "policyengine_uk.build_metadata._get_git_sha", + return_value="deadbeef", + ), + patch( + "policyengine_uk.build_metadata.get_data_build_fingerprint", + return_value="sha256:fingerprint", + ), + ): + metadata = get_data_build_metadata() + + assert metadata == { + "name": "policyengine-uk", + "version": "2.74.0", + "git_sha": "deadbeef", + "data_build_fingerprint": "sha256:fingerprint", + } From 7457839d0f393b48519c66d861c2e61ed9e24d65 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 12 Apr 2026 09:15:32 -0400 Subject: [PATCH 2/3] Add changelog fragment for build metadata --- changelog.d/data-build-fingerprint.changed.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/data-build-fingerprint.changed.md diff --git a/changelog.d/data-build-fingerprint.changed.md b/changelog.d/data-build-fingerprint.changed.md new file mode 100644 index 000000000..1700a3ed2 --- /dev/null +++ b/changelog.d/data-build-fingerprint.changed.md @@ -0,0 +1 @@ +Expose build metadata helpers for UK data artifacts, including a stable data-build fingerprint and build provenance metadata. From fa12880fb0b89d728dc7ac95e5a21acef44ad1dc Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 12 Apr 2026 09:18:12 -0400 Subject: [PATCH 3/3] Format build metadata helper --- policyengine_uk/build_metadata.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/policyengine_uk/build_metadata.py b/policyengine_uk/build_metadata.py index 14d7c2aef..3d856af4a 100644 --- a/policyengine_uk/build_metadata.py +++ b/policyengine_uk/build_metadata.py @@ -51,14 +51,11 @@ def _get_git_sha() -> str | None: if not (candidate / ".git").exists(): continue try: - return ( - subprocess.check_output( - ["git", "-C", str(candidate), "rev-parse", "HEAD"], - stderr=subprocess.DEVNULL, - text=True, - ) - .strip() - ) + return subprocess.check_output( + ["git", "-C", str(candidate), "rev-parse", "HEAD"], + stderr=subprocess.DEVNULL, + text=True, + ).strip() except Exception: return None return None