From 07a6637d7402829fb27744437b00c98b23d94241 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Tue, 30 Jun 2026 13:08:36 +0200
Subject: [PATCH 1/4] Add Populace Ledger firm comparison

---
 README.md                               |  47 +-
 paper/Appendix/a_data.tex               |  25 +
 paper/Sections/data.tex                 |   6 +
 pyproject.toml                          |   1 +
 results/populace_ledger_comparison.txt  |  34 ++
 results/populace_ledger_provenance.json | 101 ++++
 src/firm_microsim/__init__.py           |  44 +-
 src/firm_microsim/populace_ledger.py    | 656 ++++++++++++++++++++++++
 tests/test_package_smoke.py             |  23 +
 tests/test_populace_ledger.py           |  81 +++
 10 files changed, 1014 insertions(+), 4 deletions(-)
 create mode 100644 results/populace_ledger_comparison.txt
 create mode 100644 results/populace_ledger_provenance.json
 create mode 100644 src/firm_microsim/populace_ledger.py
 create mode 100644 tests/test_populace_ledger.py

diff --git a/README.md b/README.md
index d1d0d86..89e5f04 100644
--- a/README.md
+++ b/README.md
@@ -50,6 +50,12 @@ The result is ~2.94M firm rows weighted to ~2.5M UK firms. Because the populatio
 is calibrated **to** the HMRC aggregates, agreement with them is an internal
 consistency check, not external validation.
 
+The official target surface is also being mirrored into PolicyEngine Ledger and
+Populace. This repository keeps the paper's archived CSV inputs and generator for
+reproducibility, and includes a Populace/Ledger comparison command so the pinned
+migration snapshot can be audited without silently changing the published paper
+population.
+
 ## Data vintages — single version, one-line switch
 
 The pipeline is **single-version**: there is one `VAT_THRESHOLD`, not separate
@@ -83,6 +89,7 @@ firm-microsim --vintage 2024-25             # one vintage only (£90k)
 firm-microsim --threshold 88 --seed 7 --output my_run.csv
 firm-microsim-report                        # calibration report only
 firm-microsim-figures                       # descriptive figures only
+firm-microsim-populace-ledger               # Populace/Ledger comparison
 ```
 
 ```python
@@ -148,13 +155,49 @@ firm-microsim-report
 an informational diagnostic only (47.1% / 21.7%). The model fixes firm inputs
 and sets liability = turnover − input but does not yet calibrate the
 **input/output tax structure**, so per-sector net liability is structurally
-unhittable and, while targeted, competed with the dimensions above (it scored
-43.9% / −121.1% and dragged the naive mean down). It is gated off via
+unhittable and is gated off via
 `Config.calibrate_vat_liability_sector = False`. Restoring it after input/output
 calibration is tracked in issues
 [#1](https://github.com/PolicyEngine/firm-microsim-paper/issues/1) and
 [#2](https://github.com/PolicyEngine/firm-microsim-paper/issues/2).
 
+## Populace/Ledger migration check
+
+`firm-microsim-populace-ledger` reports the current migration comparison. The
+checked reference run used the 2024-25 Ledger target surface from
+[PolicyEngine/arch-data#67](https://github.com/PolicyEngine/arch-data/pull/67)
+at `cd98b5cb7b1604fbf7750689a429bbc356e5603a` and Populace's experimental UK
+firm generator from
+[PolicyEngine/populace#223](https://github.com/PolicyEngine/populace/pull/223)
+at `fa20daf75ff023e5e88731a140f456f58e0b864e`. Both PRs were open, mergeable,
+and clean when checked on June 30, 2026. The reference population uses 1,000
+calibration iterations. Without `--facts-jsonl`, the command reports that run:
+
+```bash
+firm-microsim-populace-ledger \
+  --output results/populace_ledger_comparison.txt \
+  --json-output results/populace_ledger_provenance.json
+```
+
+When `populace-build` is installed from the Populace source tree, the same command
+can recompute the table and paper-CSV parity from Ledger consumer facts:
+
+```bash
+firm-microsim-populace-ledger \
+  --facts-jsonl /path/to/uk_firm_consumer_facts.jsonl \
+  --iterations 1000 \
+  --output results/populace_ledger_comparison.txt \
+  --json-output results/populace_ledger_provenance.json
+```
+
+The current reference comparison shows exact parity between the Ledger-backed
+targets and the paper's processed 2024-25 numeric inputs: six normalized source
+tables checked, zero mismatches, max numeric difference 0. It does **not** exactly
+replicate the paper's generated synthetic population: Populace's shared optimizer
+lands at 93.8% overall accuracy versus the paper's 90.5%, with a different
+tradeoff across weighted population (2,945,777 vs 2,577,076), sector distribution
+(85.0% vs 94.5%), and VAT liability by band (99.5% vs 81.4%).
+
 ## Figures
 
 Figures follow the project house style: single clean panels (no embedded titles,
diff --git a/paper/Appendix/a_data.tex b/paper/Appendix/a_data.tex
index 0b8bbf5..1ef4355 100644
--- a/paper/Appendix/a_data.tex
+++ b/paper/Appendix/a_data.tex
@@ -29,6 +29,31 @@ \subsection{Data construction detail}
 diagnostic, because the current generator does not calibrate sector-specific
 input/output VAT structure.
 
+\paragraph{Ledger and Populace migration check.} The official target tables are
+being moved into PolicyEngine Ledger, with Populace providing the shared
+synthetic-population generator. I therefore keep the paper's archived processed
+CSVs as the reproduction source for the reported results, and treat the pinned
+Populace/Ledger path as an auditable migration check rather than a silent
+replacement. The snapshot used here is PolicyEngine/arch-data pull request 67
+at commit \texttt{cd98b5c} and PolicyEngine/populace pull request 223 at commit
+\texttt{fa20daf}; both were open, mergeable, and clean when checked on June 30,
+2026. For the 2024--25 vintage, the Ledger-backed targets match the paper's
+processed numeric inputs exactly after dropping presentation-only labels,
+totals, and the HMRC ``Unknown'' column that the generator does not calibrate:
+six normalized source tables checked, zero mismatches, and maximum numeric
+difference zero. A 1,000-iteration Populace run from those targets generated
+2,946,015 firm rows and a 93.8 percent headline calibration score, compared with
+the paper's 2,945,974 rows and 90.5 percent score for the same vintage. The
+higher headline score reflects a different optimizer tradeoff, not exact
+replication: Populace hits HMRC turnover bands and VAT liability by turnover
+band more closely, while its weighted population (2,945,777 versus 2,577,076)
+and sector distribution (85.0 percent versus 94.5 percent) are further from
+their targets. VAT liability by sector remains an informational diagnostic
+in both runs. The comparison table and structured provenance are reproduced by
+\texttt{firm-microsim-populace-ledger} and checked into
+\texttt{results/populace\_ledger\_comparison.txt} and
+\texttt{results/populace\_ledger\_provenance.json}.
+
 \paragraph{Counterfactual exclusion window and polynomial degree.} The no-VAT
 counterfactual density of Section~\ref{sec:bunching} is fitted by polynomial
 regression on the observed density outside a manipulation window around the
diff --git a/paper/Sections/data.tex b/paper/Sections/data.tex
index 6af24e6..b8ae700 100644
--- a/paper/Sections/data.tex
+++ b/paper/Sections/data.tex
@@ -11,6 +11,12 @@ \section{Data}
 level. The generator takes the registration threshold as a parameter and is
 documented in Appendix~\ref{app:data}.
 
+The archived CSV inputs in this repository remain the reproduction source for
+the paper's reported numbers; a pinned migration snapshot also represents the
+same 2024--25 numeric target surface through PolicyEngine Ledger and the
+experimental Populace firm generator, and Appendix~\ref{app:data} reports the
+parity check between that shared source-of-truth path and the paper inputs.
+
 \paragraph{Construction.} Index firms by $i$. For each sector $s$ and ONS
 turnover band $b=[\underline{b},\overline{b}]$, the \emph{UK Business: Activity,
 Size and Location} table gives a firm count $N_{s,b}$, and I draw $N_{s,b}$ firms
diff --git a/pyproject.toml b/pyproject.toml
index 16eb46f..e52ff65 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,6 +35,7 @@ firm-microsim-dynamic = "firm_microsim.dynamic.__main__:main"
 firm-microsim-placebo = "firm_microsim.analysis.placebo_bunching:cli"
 firm-microsim-dominated-region = "firm_microsim.analysis.dominated_region_mass:cli"
 firm-microsim-reform-menu = "firm_microsim.analysis.reform_menu_common_base:cli"
+firm-microsim-populace-ledger = "firm_microsim.populace_ledger:cli"
 
 [tool.hatch.build.targets.wheel]
 packages = ["src/firm_microsim"]
diff --git a/results/populace_ledger_comparison.txt b/results/populace_ledger_comparison.txt
new file mode 100644
index 0000000..336be85
--- /dev/null
+++ b/results/populace_ledger_comparison.txt
@@ -0,0 +1,34 @@
+# Populace/Ledger firm-generation comparison
+
+Reference run:
+
+- Vintage: 2024-25
+- Seed: 42
+- Populace iterations: 1,000
+- Ledger input surface: 1,439 consumer facts
+- Ledger facts SHA256: `58b6c2752adec5baa6a6260fe8cd9e9b85d0a78b5ba76ea28201f4c7986dce50`
+- Arch data snapshot: https://github.com/PolicyEngine/arch-data/pull/67 at `cd98b5cb7b1604fbf7750689a429bbc356e5603a` (OPEN, MERGEABLE, CLEAN on 2026-06-30)
+- Populace snapshot: https://github.com/PolicyEngine/populace/pull/223 at `fa20daf75ff023e5e88731a140f456f58e0b864e` (OPEN, MERGEABLE, CLEAN on 2026-06-30)
+- Normalized source-table parity: 6 tables checked, 0 mismatched, max absolute numeric difference 0
+
+The Ledger-backed targets match the paper's processed 2024-25 numeric input tables exactly after dropping presentation-only labels, totals, and the HMRC Unknown column that the generator does not calibrate.
+
+| Metric | Truth / target where it exists | Paper 2024-25 | Populace/Ledger 2024-25 |
+| --- | ---: | ---: | ---: |
+| Rows | N/A, synthetic support size | 2,945,974 | 2,946,015 |
+| Weighted population | 2,734,615 ONS firms | 2,577,076 | 2,945,777 |
+| HMRC turnover bands | 2,171,200 VAT-registered firms excluding Unknown | 92.7% | 99.9% |
+| ONS population | 2,734,615 ONS firms | 94.2% | 92.3% |
+| Employment bands | ONS employment-band distribution, sum 2,734,615 | 89.7% | 92.3% |
+| Sector distribution | 2,330,230 VAT-registered firms by SIC sector | 94.5% | 85.0% |
+| VAT liability by band | GBP 177.17bn net VAT liability by turnover band | 81.4% | 99.5% |
+| Overall | N/A, mean of calibrated accuracy scores | 90.5% | 93.8% |
+| VAT liability by sector diagnostic | GBP 177.29bn net VAT liability by SIC sector | 21.7% | 42.2% |
+
+Interpretation: this is not a silent replacement for the paper's published synthetic population. The target surface is identical, but Populace's shared calibration optimizer produces a different tradeoff across dimensions. VAT liability by sector remains an informational diagnostic, not a calibrated target.
+
+Recompute command:
+
+```bash
+firm-microsim-populace-ledger --facts-jsonl /tmp/uk_firm_consumer_facts.jsonl --iterations 1000 --output results/populace_ledger_comparison.txt --json-output results/populace_ledger_provenance.json
+```
diff --git a/results/populace_ledger_provenance.json b/results/populace_ledger_provenance.json
new file mode 100644
index 0000000..75a4de7
--- /dev/null
+++ b/results/populace_ledger_provenance.json
@@ -0,0 +1,101 @@
+{
+  "ledger_paper_parity": {
+    "facts_count": 1439,
+    "facts_sha256": "58b6c2752adec5baa6a6260fe8cd9e9b85d0a78b5ba76ea28201f4c7986dce50",
+    "max_abs_numeric_diff": 0.0,
+    "mismatched_tables": [],
+    "tables": [
+      {
+        "max_abs_numeric_diff": 0.0,
+        "name": "ons_turnover_by_sic_band",
+        "paper_rows_after_filter": 88,
+        "rows": 88,
+        "same_keys": true,
+        "values_equal": true
+      },
+      {
+        "max_abs_numeric_diff": 0.0,
+        "name": "ons_employment_by_sic_band",
+        "paper_rows_after_filter": 88,
+        "rows": 88,
+        "same_keys": true,
+        "values_equal": true
+      },
+      {
+        "max_abs_numeric_diff": 0.0,
+        "name": "hmrc_population_by_turnover_band",
+        "paper_rows_after_filter": 1,
+        "rows": 1,
+        "same_keys": true,
+        "values_equal": true
+      },
+      {
+        "max_abs_numeric_diff": 0.0,
+        "name": "hmrc_population_by_sic",
+        "paper_rows_after_filter": 88,
+        "rows": 88,
+        "same_keys": true,
+        "values_equal": true
+      },
+      {
+        "max_abs_numeric_diff": 0.0,
+        "name": "hmrc_liability_by_turnover_band",
+        "paper_rows_after_filter": 1,
+        "rows": 1,
+        "same_keys": true,
+        "values_equal": true
+      },
+      {
+        "max_abs_numeric_diff": 0.0,
+        "name": "hmrc_liability_by_sic",
+        "paper_rows_after_filter": 88,
+        "rows": 88,
+        "same_keys": true,
+        "values_equal": true
+      }
+    ],
+    "tables_checked": 6
+  },
+  "migration_snapshot": {
+    "arch_data_commit": "cd98b5cb7b1604fbf7750689a429bbc356e5603a",
+    "arch_data_pr": "https://github.com/PolicyEngine/arch-data/pull/67",
+    "arch_data_state_at_check": "OPEN, MERGEABLE, CLEAN on 2026-06-30",
+    "comparison_command": "firm-microsim-populace-ledger --facts-jsonl /tmp/uk_firm_consumer_facts.jsonl --iterations 1000 --output results/populace_ledger_comparison.txt --json-output results/populace_ledger_provenance.json",
+    "populace_commit": "fa20daf75ff023e5e88731a140f456f58e0b864e",
+    "populace_pr": "https://github.com/PolicyEngine/populace/pull/223",
+    "populace_state_at_check": "OPEN, MERGEABLE, CLEAN on 2026-06-30",
+    "source_packages": [
+      "ons-uk-business-firm-targets-2025",
+      "ons-uk-business-firm-sector-targets-2025",
+      "hmrc-vat-firm-targets-2024-25",
+      "hmrc-vat-firm-sector-targets-2024-25"
+    ]
+  },
+  "paper_2024_25": {
+    "employment": 89.7,
+    "hmrc_bands": 92.7,
+    "ons_population": 94.2,
+    "overall": 90.5,
+    "rows": 2945974,
+    "sector": 94.5,
+    "vat_liability_band": 81.4,
+    "vat_liability_sector": 21.7,
+    "weighted_population": 2577076.0
+  },
+  "populace_ledger_2024_25": {
+    "employment": 92.3,
+    "hmrc_bands": 99.9,
+    "ons_population": 92.3,
+    "overall": 93.8,
+    "rows": 2946015,
+    "sector": 85.0,
+    "vat_liability_band": 99.5,
+    "vat_liability_sector": 42.2,
+    "weighted_population": 2945776.8
+  },
+  "run_parameters": {
+    "populace_iterations": 1000,
+    "seed": 42
+  },
+  "vintage": "2024-25"
+}
diff --git a/src/firm_microsim/__init__.py b/src/firm_microsim/__init__.py
index 863b0fa..3991599 100644
--- a/src/firm_microsim/__init__.py
+++ b/src/firm_microsim/__init__.py
@@ -9,9 +9,12 @@
     >>> df = firm_microsim.generate(threshold=85)  # doctest: +SKIP
 """
 
+from __future__ import annotations
+
+import sys
+import types
+
 from .config import DEFAULT_CONFIG, VAT_THRESHOLD, Config
-from .generate import generate
-from .validate import ValidationReport
 
 __version__ = "1.0.0"
 
@@ -23,3 +26,40 @@
     "ValidationReport",
     "__version__",
 ]
+
+
+def generate(*args, **kwargs):
+    """Generate a synthetic firm population without importing torch at package import."""
+
+    from .generate import generate as _generate
+
+    return _generate(*args, **kwargs)
+
+
+def __getattr__(name: str):
+    """Lazily expose heavyweight public helpers."""
+
+    if name == "ValidationReport":
+        from .validate import ValidationReport
+
+        return ValidationReport
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+class _FirmMicrosimModule(types.ModuleType):
+    """Keep the historic package-level generate function stable.
+
+    Importing the ``firm_microsim.generate`` submodule makes Python assign that
+    module to ``firm_microsim.generate``. Before lazy imports this package
+    eagerly re-exported the callable and kept that public attribute stable; this
+    hook preserves that behavior without importing torch during package import.
+    """
+
+    def __setattr__(self, name: str, value):
+        if name == "generate" and isinstance(value, types.ModuleType):
+            super().__setattr__("_generate_submodule", value)
+            return
+        super().__setattr__(name, value)
+
+
+sys.modules[__name__].__class__ = _FirmMicrosimModule
diff --git a/src/firm_microsim/populace_ledger.py b/src/firm_microsim/populace_ledger.py
new file mode 100644
index 0000000..fe93101
--- /dev/null
+++ b/src/firm_microsim/populace_ledger.py
@@ -0,0 +1,656 @@
+"""Compare the paper population with the experimental Populace/Ledger run.
+
+The firm generator is being migrated toward PolicyEngine's shared Populace stack,
+with official targets flowing from Ledger. This module keeps that migration
+auditable from the paper repository:
+
+* without extra dependencies it prints the reference comparison from the
+  verified 2024-25 Populace/Ledger run;
+* with ``populace-build`` installed from the Populace source tree, it can
+  recompute the comparison and Ledger-vs-paper target parity from Ledger
+  consumer facts.
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+from dataclasses import asdict, dataclass
+from pathlib import Path
+
+from .config import PROCESSED_DATA_DIR
+
+
+@dataclass(frozen=True)
+class CalibrationSnapshot:
+    """One synthetic-population calibration summary."""
+
+    rows: int
+    weighted_population: float
+    hmrc_bands: float
+    ons_population: float
+    employment: float
+    sector: float
+    vat_liability_band: float
+    overall: float
+    vat_liability_sector: float
+
+
+@dataclass(frozen=True)
+class TargetSummary:
+    """One comparison-table row: metric label, target, and both formatted results."""
+
+    label: str
+    truth: str
+    paper_value: str
+    populace_value: str
+
+
+@dataclass(frozen=True)
+class ParityTable:
+    """One normalized Ledger-vs-paper source-table parity check."""
+
+    name: str
+    rows: int
+    paper_rows_after_filter: int
+    same_keys: bool
+    values_equal: bool
+    max_abs_numeric_diff: float | None
+
+
+@dataclass(frozen=True)
+class ParitySummary:
+    """Source-table parity and facts-file provenance."""
+
+    facts_count: int
+    facts_sha256: str
+    tables: tuple[ParityTable, ...]
+
+    @property
+    def mismatched_tables(self) -> tuple[str, ...]:
+        return tuple(table.name for table in self.tables if not table.values_equal)
+
+    @property
+    def max_abs_numeric_diff(self) -> float:
+        diffs = [
+            table.max_abs_numeric_diff
+            for table in self.tables
+            if table.max_abs_numeric_diff is not None
+        ]
+        return max(diffs, default=0.0)
+
+
+PAPER_2024_25 = CalibrationSnapshot(
+    rows=2_945_974,
+    weighted_population=2_577_076.0,
+    hmrc_bands=92.7,
+    ons_population=94.2,
+    employment=89.7,
+    sector=94.5,
+    vat_liability_band=81.4,
+    overall=90.5,
+    vat_liability_sector=21.7,
+)
+
+REFERENCE_POPULACE_LEDGER_2024_25 = CalibrationSnapshot(
+    rows=2_946_015,
+    weighted_population=2_945_776.8,
+    hmrc_bands=99.9,
+    ons_population=92.3,
+    employment=92.3,
+    sector=85.0,
+    vat_liability_band=99.5,
+    overall=93.8,
+    vat_liability_sector=42.2,
+)
+
+REFERENCE_PARITY = ParitySummary(
+    facts_count=1_439,
+    facts_sha256="58b6c2752adec5baa6a6260fe8cd9e9b85d0a78b5ba76ea28201f4c7986dce50",
+    tables=(
+        ParityTable("ons_turnover_by_sic_band", 88, 88, True, True, 0.0),
+        ParityTable("ons_employment_by_sic_band", 88, 88, True, True, 0.0),
+        ParityTable("hmrc_population_by_turnover_band", 1, 1, True, True, 0.0),
+        ParityTable("hmrc_population_by_sic", 88, 88, True, True, 0.0),
+        ParityTable("hmrc_liability_by_turnover_band", 1, 1, True, True, 0.0),
+        ParityTable("hmrc_liability_by_sic", 88, 88, True, True, 0.0),
+    ),
+)
+
+REFERENCE_SOURCE_TOTALS = {
+    "source_facts": "1,439 Ledger consumer facts",
+    "ons_population": "2,734,615 ONS firms",
+    "hmrc_turnover_bands": "2,171,200 VAT-registered firms excluding Unknown",
+    "hmrc_sector": "2,330,230 VAT-registered firms by SIC sector",
+    "hmrc_liability_band": "GBP 177.17bn net VAT liability by turnover band",
+    "hmrc_liability_sector": "GBP 177.29bn net VAT liability by SIC sector",
+}
+
+REFERENCE_MIGRATION = {
+    "arch_data_pr": "https://github.com/PolicyEngine/arch-data/pull/67",
+    "arch_data_commit": "cd98b5cb7b1604fbf7750689a429bbc356e5603a",
+    "arch_data_state_at_check": "OPEN, MERGEABLE, CLEAN on 2026-06-30",
+    "populace_pr": "https://github.com/PolicyEngine/populace/pull/223",
+    "populace_commit": "fa20daf75ff023e5e88731a140f456f58e0b864e",
+    "populace_state_at_check": "OPEN, MERGEABLE, CLEAN on 2026-06-30",
+    "source_packages": [
+        "ons-uk-business-firm-targets-2025",
+        "ons-uk-business-firm-sector-targets-2025",
+        "hmrc-vat-firm-targets-2024-25",
+        "hmrc-vat-firm-sector-targets-2024-25",
+    ],
+    "comparison_command": (
+        "firm-microsim-populace-ledger --facts-jsonl "
+        "/tmp/uk_firm_consumer_facts.jsonl --iterations 1000 "
+        "--output results/populace_ledger_comparison.txt "
+        "--json-output results/populace_ledger_provenance.json"
+    ),
+}
+
+
+def comparison_rows(
+    populace: CalibrationSnapshot = REFERENCE_POPULACE_LEDGER_2024_25,
+) -> list[TargetSummary]:
+    """Return the 2024-25 paper-vs-Populace comparison table rows."""
+
+    return [
+        TargetSummary(
+            "Rows",
+            "N/A, synthetic support size",
+            _count(PAPER_2024_25.rows),
+            _count(populace.rows),
+        ),
+        TargetSummary(
+            "Weighted population",
+            REFERENCE_SOURCE_TOTALS["ons_population"],
+            _count(PAPER_2024_25.weighted_population),
+            _count(populace.weighted_population),
+        ),
+        TargetSummary(
+            "HMRC turnover bands",
+            REFERENCE_SOURCE_TOTALS["hmrc_turnover_bands"],
+            _pct(PAPER_2024_25.hmrc_bands),
+            _pct(populace.hmrc_bands),
+        ),
+        TargetSummary(
+            "ONS population",
+            REFERENCE_SOURCE_TOTALS["ons_population"],
+            _pct(PAPER_2024_25.ons_population),
+            _pct(populace.ons_population),
+        ),
+        TargetSummary(
+            "Employment bands",
+            "ONS employment-band distribution, sum 2,734,615",
+            _pct(PAPER_2024_25.employment),
+            _pct(populace.employment),
+        ),
+        TargetSummary(
+            "Sector distribution",
+            REFERENCE_SOURCE_TOTALS["hmrc_sector"],
+            _pct(PAPER_2024_25.sector),
+            _pct(populace.sector),
+        ),
+        TargetSummary(
+            "VAT liability by band",
+            REFERENCE_SOURCE_TOTALS["hmrc_liability_band"],
+            _pct(PAPER_2024_25.vat_liability_band),
+            _pct(populace.vat_liability_band),
+        ),
+        TargetSummary(
+            "Overall",
+            "N/A, mean of calibrated accuracy scores",
+            _pct(PAPER_2024_25.overall),
+            _pct(populace.overall),
+        ),
+        TargetSummary(
+            "VAT liability by sector diagnostic",
+            REFERENCE_SOURCE_TOTALS["hmrc_liability_sector"],
+            _pct(PAPER_2024_25.vat_liability_sector),
+            _pct(populace.vat_liability_sector),
+        ),
+    ]
+
+
+def format_comparison_report(
+    populace: CalibrationSnapshot = REFERENCE_POPULACE_LEDGER_2024_25,
+    parity: ParitySummary = REFERENCE_PARITY,
+    *,
+    iterations: int = 1_000,
+    seed: int = 42,
+) -> str:
+    """Render a Markdown comparison report."""
+
+    mismatches = len(parity.mismatched_tables)
+    parity_exact = mismatches == 0 and parity.max_abs_numeric_diff == 0.0
+    if parity_exact:
+        parity_sentence = (
+            "The Ledger-backed targets match the paper's processed 2024-25 "
+            "numeric input tables exactly after dropping presentation-only "
+            "labels, totals, and the HMRC Unknown column that the generator "
+            "does not calibrate."
+        )
+        interpretation_target_sentence = "The target surface is identical, "
+    else:
+        mismatch_list = ", ".join(parity.mismatched_tables) or "unknown tables"
+        parity_sentence = (
+            "The Ledger-backed targets do not exactly match the paper's "
+            "processed 2024-25 numeric input tables under this parity check; "
+            f"mismatched tables: {mismatch_list}."
+        )
+        interpretation_target_sentence = (
+            "The target surface is not identical in this recompute, "
+        )
+    lines = [
+        "# Populace/Ledger firm-generation comparison",
+        "",
+        "Reference run:",
+        "",
+        "- Vintage: 2024-25",
+        f"- Seed: {seed}",
+        f"- Populace iterations: {_count(iterations)}",
+        f"- Ledger input surface: {_count(parity.facts_count)} consumer facts",
+        f"- Ledger facts SHA256: `{parity.facts_sha256}`",
+        f"- Arch data snapshot: {REFERENCE_MIGRATION['arch_data_pr']} at "
+        f"`{REFERENCE_MIGRATION['arch_data_commit']}` "
+        f"({REFERENCE_MIGRATION['arch_data_state_at_check']})",
+        f"- Populace snapshot: {REFERENCE_MIGRATION['populace_pr']} at "
+        f"`{REFERENCE_MIGRATION['populace_commit']}` "
+        f"({REFERENCE_MIGRATION['populace_state_at_check']})",
+        f"- Normalized source-table parity: {len(parity.tables)} tables checked, "
+        f"{mismatches} mismatched, max absolute numeric difference "
+        f"{parity.max_abs_numeric_diff:g}",
+        "",
+        parity_sentence,
+        "",
+        "| Metric | Truth / target where it exists | Paper 2024-25 | Populace/Ledger 2024-25 |",
+        "| --- | ---: | ---: | ---: |",
+    ]
+    lines.extend(
+        f"| {row.label} | {row.truth} | {row.paper_value} | {row.populace_value} |"
+        for row in comparison_rows(populace)
+    )
+    lines.extend(
+        [
+            "",
+            "Interpretation: this is not a silent replacement for the paper's "
+            f"published synthetic population. {interpretation_target_sentence}"
+            "but Populace's shared calibration optimizer produces a different "
+            "tradeoff across dimensions. VAT liability by sector remains an "
+            "informational diagnostic, not a calibrated target.",
+            "",
+            "Recompute command:",
+            "",
+            "```bash",
+            REFERENCE_MIGRATION["comparison_command"],
+            "```",
+            "",
+        ]
+    )
+    return "\n".join(lines)
+
+
+def provenance_payload(
+    populace: CalibrationSnapshot = REFERENCE_POPULACE_LEDGER_2024_25,
+    parity: ParitySummary = REFERENCE_PARITY,
+    *,
+    iterations: int = 1_000,
+    seed: int = 42,
+) -> dict:
+    """Return structured provenance for the checked comparison."""
+
+    return {
+        "vintage": "2024-25",
+        "run_parameters": {
+            "seed": seed,
+            "populace_iterations": iterations,
+        },
+        "migration_snapshot": REFERENCE_MIGRATION,
+        "paper_2024_25": asdict(PAPER_2024_25),
+        "populace_ledger_2024_25": asdict(populace),
+        "ledger_paper_parity": {
+            "facts_count": parity.facts_count,
+            "facts_sha256": parity.facts_sha256,
+            "tables_checked": len(parity.tables),
+            "mismatched_tables": list(parity.mismatched_tables),
+            "max_abs_numeric_diff": parity.max_abs_numeric_diff,
+            "tables": [asdict(table) for table in parity.tables],
+        },
+    }
+
+
+def load_populace_source_data(facts_jsonl: Path):
+    """Build 2024-25 Populace source data from a Ledger facts JSONL file."""
+    # Raises RuntimeError when the optional Populace firm generator is unusable.
+    try:
+        from populace.build.uk_runtime.firm_generation import (
+            uk_firm_source_data_from_ledger_facts,
+        )
+    except ImportError as exc:  # also covers installs lacking this helper
+        raise RuntimeError(
+            "Reading Ledger firm facts requires the experimental Populace firm "
+            "generator. Install the Populace source tree containing "
+            "populace.build.uk_runtime.firm_generation in this environment."
+        ) from exc
+
+    facts = _read_facts(facts_jsonl)
+    return uk_firm_source_data_from_ledger_facts(facts, data_vintage="2024-25")
+
+
+def source_table_parity(
+    facts_jsonl: Path,
+    *,
+    processed_dir: Path = PROCESSED_DATA_DIR / "2024-25",
+) -> ParitySummary:
+    """Compare Ledger-derived 2024-25 source tables with paper CSV inputs."""
+
+    data = load_populace_source_data(facts_jsonl)
+    tables = (
+        _compare_by_key(
+            "ons_turnover_by_sic_band",
+            data.ons_turnover,
+            processed_dir / "ons_firm_turnover.csv",
+            ["SIC Code"],
+            [
+                "0-49",
+                "50-99",
+                "100-249",
+                "250-499",
+                "500-999",
+                "1000-4999",
+                "5000+",
+                "Total",
+            ],
+            paper_filter_key="SIC Code",
+        ),
+        _compare_by_key(
+            "ons_employment_by_sic_band",
+            data.ons_employment,
+            processed_dir / "ons_firm_employment.csv",
+            ["SIC Code"],
+            ["0-4", "5-9", "10-19", "20-49", "50-99", "100-249", "250+", "Total"],
+            paper_filter_key="SIC Code",
+        ),
+        _compare_by_key(
+            "hmrc_population_by_turnover_band",
+            data.hmrc_population_band,
+            processed_dir / "hmrc_vat_population_by_turnover_band.csv",
+            ["Financial_Year"],
+            [
+                "Negative_or_Zero",
+                "£1_to_Threshold",
+                "£Threshold_to_£150k",
+                "£150k_to_£300k",
+                "£300k_to_£500k",
+                "£500k_to_£1m",
+                "£1m_to_£10m",
+                "Greater_than_£10m",
+            ],
+        ),
+        _compare_by_key(
+            "hmrc_population_by_sic",
+            data.hmrc_population_sector,
+            processed_dir / "hmrc_vat_population_by_sector.csv",
+            ["Trade_Sector"],
+            ["2024-25"],
+        ),
+        _compare_by_key(
+            "hmrc_liability_by_turnover_band",
+            data.hmrc_liability_band,
+            processed_dir / "hmrc_vat_liability_by_turnover_band.csv",
+            ["Financial_Year"],
+            [
+                "Negative_or_Zero",
+                "£1_to_Threshold",
+                "£Threshold_to_£150k",
+                "£150k_to_£300k",
+                "£300k_to_£500k",
+                "£500k_to_£1m",
+                "£1m_to_£10m",
+                "Greater_than_£10m",
+            ],
+        ),
+        _compare_by_key(
+            "hmrc_liability_by_sic",
+            data.hmrc_liability_sector,
+            processed_dir / "hmrc_vat_liability_by_sector.csv",
+            ["Trade_Sector"],
+            ["2024-25"],
+        ),
+    )
+    return ParitySummary(
+        facts_count=len(_read_facts(facts_jsonl)),
+        facts_sha256=_sha256(facts_jsonl),
+        tables=tables,
+    )
+
+
+def run_populace_from_ledger_facts(
+    facts_jsonl: Path,
+    *,
+    iterations: int = 1_000,
+    seed: int = 42,
+) -> CalibrationSnapshot:
+    """Generate the 2024-25 Populace firm population from Ledger facts and score it."""
+    # Raises RuntimeError when the optional Populace firm generator is unusable.
+    try:
+        from populace.build.uk_runtime.firm_generation import (
+            UKFirmGenerationConfig,
+            generate_uk_firm_population,
+        )
+    except ImportError as exc:  # also covers installs lacking these names
+        raise RuntimeError(
+            "Recomputing the Populace/Ledger comparison requires the "
+            "experimental Populace firm generator. Install the Populace source "
+            "tree containing populace.build.uk_runtime.firm_generation in this "
+            "environment, then rerun with --facts-jsonl."
+        ) from exc
+
+    data = load_populace_source_data(facts_jsonl)
+    result = generate_uk_firm_population(
+        data,
+        UKFirmGenerationConfig(
+            data_vintage="2024-25",
+            n_iterations=iterations,
+            seed=seed,
+        ),
+    )
+    validation = result.validation
+    return CalibrationSnapshot(
+        rows=len(result.firms),
+        weighted_population=float(result.firms["firm_weight"].sum()),
+        hmrc_bands=validation.hmrc_bands * 100.0,
+        ons_population=validation.ons_population * 100.0,
+        employment=validation.employment * 100.0,
+        sector=validation.sector * 100.0,
+        vat_liability_band=validation.vat_liability_band * 100.0,
+        overall=validation.overall * 100.0,
+        vat_liability_sector=validation.vat_liability_sector * 100.0,
+    )
+
+
+def _compare_by_key(
+    name: str,
+    ledger_df,
+    paper_csv: Path,
+    key_cols: list[str],
+    value_cols: list[str],
+    *,
+    paper_filter_key: str | None = None,
+) -> ParityTable:
+    """Compare a Ledger-derived table with its paper CSV on the given key columns."""
+    import pandas as pd
+
+    columns = key_cols + value_cols
+    left = ledger_df[columns].copy()
+    right = pd.read_csv(paper_csv)
+    if paper_filter_key is not None:
+        right = right[right[paper_filter_key].notna()].copy()
+    right = right[columns].copy()
+
+    left = _normalize_keys(left, key_cols)
+    right = _normalize_keys(right, key_cols)
+    for column in value_cols:
+        left[column] = pd.to_numeric(left[column])
+        right[column] = pd.to_numeric(right[column])
+
+    # Sort both sides by key so rows line up positionally before comparing.
+    left = left.sort_values(key_cols).reset_index(drop=True)
+    right = right.sort_values(key_cols).reset_index(drop=True)
+    comparable = left.shape == right.shape and left[key_cols].equals(right[key_cols])
+    max_abs = None
+    if comparable:
+        gaps = (left[value_cols].astype(float) - right[value_cols].astype(float)).abs()
+        max_abs = float(gaps.to_numpy().max()) if gaps.size else 0.0
+
+    return ParityTable(
+        name=name,
+        rows=len(left),
+        paper_rows_after_filter=len(right),
+        same_keys=bool(comparable),
+        values_equal=max_abs is not None and max_abs < 1e-9,
+        max_abs_numeric_diff=max_abs,
+    )
+
+
+def _normalize_keys(df, key_cols: list[str]):
+    """Return a copy of ``df`` whose key columns are int if fully numeric, else str."""
+    import pandas as pd
+
+    result = df.copy()
+    for name in key_cols:
+        numeric = pd.to_numeric(result[name], errors="coerce")
+        # One unparseable or missing entry makes the whole column textual.
+        fully_numeric = numeric.notna().all()
+        result[name] = numeric.astype(int) if fully_numeric else result[name].astype(str)
+    return result
+
+
+def _read_facts(path: Path) -> list[dict]:  # one JSON object per non-blank "\n"-separated line
+    return [json.loads(row) for row in path.read_text(encoding="utf-8").split("\n") if row.strip()]
+
+
+def _sha256(path: Path) -> str:  # hex SHA-256 of the file, streamed in 1 MiB chunks
+    hasher = hashlib.sha256()
+    with path.open("rb") as stream:
+        while block := stream.read(1 << 20):
+            hasher.update(block)
+    return hasher.hexdigest()
+
+
+def _count(value: float) -> str:  # thousands-separated, rounded to no decimals
+    return format(value, ",.0f")
+
+
+def _pct(value: float) -> str:  # one decimal place followed by a percent sign
+    return "{:.1f}%".format(value)
+
+
+def _build_parser() -> argparse.ArgumentParser:  # option parser consumed by main()
+    parser = argparse.ArgumentParser(description=__doc__)  # module docstring is the description
+    parser.add_argument(
+        "--facts-jsonl",
+        type=Path,
+        default=None,
+        help=(
+            "Ledger consumer facts JSONL to recompute source parity and, by "
+            "default, the Populace run. Omit to print the checked reference "
+            "comparison."
+        ),
+    )
+    parser.add_argument(
+        "--paper-processed-dir",
+        type=Path,
+        default=PROCESSED_DATA_DIR / "2024-25",  # 2024-25 subdirectory of the processed data dir
+        help="Paper processed CSV directory for Ledger parity checks.",
+    )
+    parser.add_argument(
+        "--reference-population",
+        action="store_true",  # boolean flag; False unless passed
+        help=(
+            "When --facts-jsonl is supplied, compute Ledger/paper parity from "
+            "facts but use the checked reference Populace population snapshot "
+            "instead of rerunning the slow optimizer."
+        ),
+    )
+    parser.add_argument(
+        "--iterations",
+        type=int,
+        default=1_000,
+        help="Populace calibration iterations when --facts-jsonl is supplied.",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="Populace generator seed when --facts-jsonl is supplied.",
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=None,  # None means the report is not written to a file
+        help="Optional path to write the Markdown report.",
+    )
+    parser.add_argument(
+        "--json-output",
+        type=Path,
+        default=None,  # None means no JSON provenance file is written
+        help="Optional path to write structured JSON provenance.",
+    )
+    return parser
+
+
+def main(argv: list[str] | None = None) -> str:
+    """Run the comparison CLI and return the rendered report.
+
+    Without ``--facts-jsonl`` the checked reference parity and population are
+    reported. With it, parity is recomputed from the facts and the population is
+    the reference snapshot (``--reference-population``) or a fresh Populace run.
+    The report is always printed; ``--output``/``--json-output`` also write files.
+    """
+
+    args = _build_parser().parse_args(argv)
+    if args.facts_jsonl:
+        parity = source_table_parity(
+            args.facts_jsonl,
+            processed_dir=args.paper_processed_dir,
+        )
+        populace = (
+            REFERENCE_POPULACE_LEDGER_2024_25
+            if args.reference_population
+            else run_populace_from_ledger_facts(
+                args.facts_jsonl,
+                iterations=args.iterations,
+                seed=args.seed,
+            )
+        )
+    else:
+        parity = REFERENCE_PARITY
+        populace = REFERENCE_POPULACE_LEDGER_2024_25
+
+    # --iterations/--seed describe only a fresh optimizer run; with the reference
+    # population the report carries the fixed 1,000 iterations and seed 42 instead.
+    fresh_run = bool(args.facts_jsonl) and not args.reference_population
+    iterations = args.iterations if fresh_run else 1_000
+    seed = args.seed if fresh_run else 42
+    report = format_comparison_report(populace, parity, iterations=iterations, seed=seed)
+    payload = provenance_payload(populace, parity, iterations=iterations, seed=seed)
+
+    # Write UTF-8 explicitly so the artifacts do not depend on the locale encoding.
+    if args.output:
+        args.output.parent.mkdir(parents=True, exist_ok=True)
+        args.output.write_text(report, encoding="utf-8")
+    if args.json_output:
+        args.json_output.parent.mkdir(parents=True, exist_ok=True)
+        provenance = json.dumps(payload, indent=2, sort_keys=True) + "\n"
+        args.json_output.write_text(provenance, encoding="utf-8")
+    print(report)
+    return report
+
+
+def cli() -> None:  # runs main() with default argv and drops its returned report string
+    main()
+
+
+if __name__ == "__main__":  # run the CLI when this module is executed directly
+    cli()
diff --git a/tests/test_package_smoke.py b/tests/test_package_smoke.py
index ed60d70..4068dcb 100644
--- a/tests/test_package_smoke.py
+++ b/tests/test_package_smoke.py
@@ -3,6 +3,7 @@
 import subprocess
 import sys
 from pathlib import Path
+from importlib import import_module
 
 from firm_microsim.config import Config
 
@@ -17,6 +18,27 @@ def test_import_public_packages() -> None:
     assert firm_microsim.__version__ == "1.0.0"
 
 
+def test_lazy_public_exports_remain_accessible() -> None:
+    import firm_microsim
+
+    assert callable(firm_microsim.generate)
+    assert firm_microsim.ValidationReport.__name__ == "ValidationReport"
+
+
+def test_lazy_generate_export_survives_submodule_import() -> None:
+    import firm_microsim
+
+    import_module("firm_microsim.generate")
+    from firm_microsim import generate
+
+    namespace: dict[str, object] = {}
+    exec("from firm_microsim import *", namespace)
+
+    assert callable(firm_microsim.generate)
+    assert callable(generate)
+    assert callable(namespace["generate"])
+
+
 def test_cli_help_entry_points() -> None:
     modules = [
         "firm_microsim",
@@ -47,6 +69,7 @@ def test_console_script_help_entry_points() -> None:
         "firm-microsim-placebo",
         "firm-microsim-dominated-region",
         "firm-microsim-reform-menu",
+        "firm-microsim-populace-ledger",
     ]
     bin_dir = Path(sys.executable).parent
     for script in scripts:
diff --git a/tests/test_populace_ledger.py b/tests/test_populace_ledger.py
new file mode 100644
index 0000000..58f19bb
--- /dev/null
+++ b/tests/test_populace_ledger.py
@@ -0,0 +1,81 @@
+from __future__ import annotations
+
+from firm_microsim.populace_ledger import (
+    ParitySummary,
+    ParityTable,
+    REFERENCE_POPULACE_LEDGER_2024_25,
+    REFERENCE_PARITY,
+    comparison_rows,
+    format_comparison_report,
+    provenance_payload,
+)
+
+
+def test_comparison_rows_include_truth_for_official_targets() -> None:
+    rows = {row.label: row for row in comparison_rows()}
+
+    assert rows["Weighted population"].truth == "2,734,615 ONS firms"
+    assert rows["VAT liability by band"].truth.endswith(
+        "net VAT liability by turnover band"
+    )
+    assert rows["Rows"].truth == "N/A, synthetic support size"
+    assert rows["Overall"].truth == "N/A, mean of calibrated accuracy scores"
+
+
+def test_reference_report_records_populace_calibration_tradeoff() -> None:
+    report = format_comparison_report()
+
+    assert "Paper 2024-25" in report
+    assert "Populace/Ledger 2024-25" in report
+    assert "| Overall | N/A, mean of calibrated accuracy scores | 90.5% | 93.8% |" in report
+    assert "0 mismatched" in report
+    assert REFERENCE_PARITY.facts_sha256 in report
+    assert (
+        "| VAT liability by sector diagnostic | GBP 177.29bn net VAT liability "
+        "by SIC sector | 21.7% | 42.2% |"
+    ) in report
+
+
+def test_mismatched_parity_report_does_not_claim_exact_match() -> None:
+    parity = ParitySummary(
+        facts_count=1,
+        facts_sha256="abc",
+        tables=(
+            ParityTable(
+                name="hmrc_population_by_sic",
+                rows=1,
+                paper_rows_after_filter=1,
+                same_keys=True,
+                values_equal=False,
+                max_abs_numeric_diff=1.0,
+            ),
+        ),
+    )
+
+    report = format_comparison_report(parity=parity)
+
+    assert "1 mismatched" in report
+    assert "do not exactly match" in report
+    assert "The target surface is identical" not in report
+
+
+def test_reference_provenance_records_pinned_pr_snapshot() -> None:
+    payload = provenance_payload()
+
+    assert payload["ledger_paper_parity"]["facts_count"] == 1_439
+    assert payload["ledger_paper_parity"]["mismatched_tables"] == []
+    assert payload["ledger_paper_parity"]["max_abs_numeric_diff"] == 0.0
+    assert (
+        payload["migration_snapshot"]["arch_data_commit"]
+        == "cd98b5cb7b1604fbf7750689a429bbc356e5603a"
+    )
+    assert (
+        payload["migration_snapshot"]["populace_commit"]
+        == "fa20daf75ff023e5e88731a140f456f58e0b864e"
+    )
+
+
+def test_reference_snapshot_uses_full_populace_run() -> None:
+    assert REFERENCE_POPULACE_LEDGER_2024_25.rows == 2_946_015
+    assert round(REFERENCE_POPULACE_LEDGER_2024_25.weighted_population) == 2_945_777
+    assert REFERENCE_POPULACE_LEDGER_2024_25.hmrc_bands == 99.9

From 6a4c8b4e8e31a5d28986302e209f86239ded54bc Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Tue, 30 Jun 2026 14:25:19 +0200
Subject: [PATCH 2/4] Address Populace comparison review

---
 README.md                               | 13 ++++---
 paper/Appendix/a_data.tex               | 20 ++++++-----
 results/calibration_accuracy.txt        |  4 +--
 results/populace_ledger_comparison.txt  | 29 ++++++++--------
 results/populace_ledger_provenance.json |  6 ++--
 src/firm_microsim/populace_ledger.py    | 46 +++++++++++++++++++------
 tests/test_populace_ledger.py           | 32 +++++++++++++++--
 7 files changed, 104 insertions(+), 46 deletions(-)

diff --git a/README.md b/README.md
index 89e5f04..75f8351 100644
--- a/README.md
+++ b/README.md
@@ -152,7 +152,7 @@ firm-microsim-report
 | **Overall (5 calibrated dimensions)** | **89.9%** | **90.5%** |
 
 **VAT liability by *sector*** is **not** a calibration target — it is reported as
-an informational diagnostic only (47.1% / 21.7%). The model fixes firm inputs
+an informational diagnostic only (47.1% / 44.5%). The model fixes firm inputs
 and sets liability = turnover − input but does not yet calibrate the
 **input/output tax structure**, so per-sector net liability is structurally
 unhittable and is gated off via
@@ -164,7 +164,7 @@ calibration is tracked in issues
 ## Populace/Ledger migration check
 
 `firm-microsim-populace-ledger` reports the current migration comparison. The
-checked reference run used the 2024-25 Ledger target surface from
+checked preliminary reference run used the 2024-25 Ledger target surface from
 [PolicyEngine/arch-data#67](https://github.com/PolicyEngine/arch-data/pull/67)
 at `cd98b5cb7b1604fbf7750689a429bbc356e5603a` and Populace's experimental UK
 firm generator from
@@ -194,9 +194,12 @@ The current reference comparison shows exact parity between the Ledger-backed
 targets and the paper's processed 2024-25 numeric inputs: six normalized source
 tables checked, zero mismatches, max numeric difference 0. It does **not** exactly
 replicate the paper's generated synthetic population: Populace's shared optimizer
-lands at 93.8% overall accuracy versus the paper's 90.5%, with a different
-tradeoff across weighted population (2,945,777 vs 2,577,076), sector distribution
-(85.0% vs 94.5%), and VAT liability by band (99.5% vs 81.4%).
+lands at 93.8% overall accuracy under its own validator versus the paper's 90.5%,
+but that overall pair is **not like-for-like**: HMRC turnover-band accuracy uses
+different band sets, and sector distribution reflects different calibration-target
+definitions. The directly comparable rows are ONS population, employment bands,
+and VAT liability by turnover band. Treat the Populace population figures as
+preliminary until the upstream Arch and Populace PRs merge.
 
 ## Figures
 
diff --git a/paper/Appendix/a_data.tex b/paper/Appendix/a_data.tex
index 1ef4355..c9d88f4 100644
--- a/paper/Appendix/a_data.tex
+++ b/paper/Appendix/a_data.tex
@@ -33,7 +33,7 @@ \subsection{Data construction detail}
 being moved into PolicyEngine Ledger, with Populace providing the shared
 synthetic-population generator. I therefore keep the paper's archived processed
 CSVs as the reproduction source for the reported results, and treat the pinned
-Populace/Ledger path as an auditable migration check rather than a silent
+Populace/Ledger path as an auditable preliminary migration check rather than a silent
 replacement. The snapshot used here is PolicyEngine/arch-data pull request 67
 at commit \texttt{cd98b5c} and PolicyEngine/populace pull request 223 at commit
 \texttt{fa20daf}; both were open, mergeable, and clean when checked on June 30,
@@ -41,14 +41,16 @@ \subsection{Data construction detail}
 processed numeric inputs exactly after dropping presentation-only labels,
 totals, and the HMRC ``Unknown'' column that the generator does not calibrate:
 six normalized source tables checked, zero mismatches, and maximum numeric
-difference zero. A 1,000-iteration Populace run from those targets generated
-2,946,015 firm rows and a 93.8 percent headline calibration score, compared with
-the paper's 2,945,974 rows and 90.5 percent score for the same vintage. The
-higher headline score reflects a different optimizer tradeoff, not exact
-replication: Populace hits HMRC turnover bands and VAT liability by turnover
-band more closely, while its weighted population (2,945,777 versus 2,577,076)
-and sector distribution (85.0 percent versus 94.5 percent) differ more from the
-paper population. VAT liability by sector remains an informational diagnostic
+difference zero. A 1,000-iteration preliminary Populace run from those targets
+generated 2,946,015 firm rows. Its own validator reports a 93.8 percent overall
+calibration score, compared with the paper validator's 90.5 percent score for
+the same vintage, but this overall comparison is not like-for-like: HMRC
+turnover-band accuracy uses different band sets, and sector distribution
+reflects different calibration-target definitions. The directly comparable rows
+are ONS population, employment bands, and VAT liability by turnover band.
+Populace hits VAT liability by turnover band more closely, while its weighted
+population (2,945,777 versus 2,577,078) differs more from the paper population.
+VAT liability by sector remains an informational diagnostic
 in both runs. The comparison table and structured provenance are reproduced by
 \texttt{firm-microsim-populace-ledger} and checked into
 \texttt{results/populace\_ledger\_comparison.txt} and
diff --git a/results/calibration_accuracy.txt b/results/calibration_accuracy.txt
index c405ca6..3e9d0bf 100644
--- a/results/calibration_accuracy.txt
+++ b/results/calibration_accuracy.txt
@@ -28,7 +28,7 @@ dimensions and excludes the VAT-liability-by-sector diagnostic.
   Vintage 2024-25  |  threshold £90k
 ================================================================
   rows (firm types):   2,945,974
-  weighted population: 2,577,076 firms
+  weighted population: 2,577,078 firms
 ----------------------------------------------------------------
   Dimension                     accuracy       error
 ----------------------------------------------------------------
@@ -41,7 +41,7 @@ dimensions and excludes the VAT-liability-by-sector diagnostic.
   Overall (5 calibrated dims)       90.5%        9.5%
 ----------------------------------------------------------------
   Informational diagnostic (not a calibration target):
-  VAT Liability by Sector          21.7%       78.3%
+  VAT Liability by Sector          44.5%       55.5%
 ================================================================
 
 Done: 2/2 vintage(s) reported.
diff --git a/results/populace_ledger_comparison.txt b/results/populace_ledger_comparison.txt
index 336be85..37e6d45 100644
--- a/results/populace_ledger_comparison.txt
+++ b/results/populace_ledger_comparison.txt
@@ -1,7 +1,8 @@
 # Populace/Ledger firm-generation comparison
 
-Reference run:
+Preliminary reference run:
 
+- Status: pinned to open, unmerged Arch and Populace PR snapshots
 - Vintage: 2024-25
 - Seed: 42
 - Populace iterations: 1,000
@@ -13,22 +14,22 @@ Reference run:
 
 The Ledger-backed targets match the paper's processed 2024-25 numeric input tables exactly after dropping presentation-only labels, totals, and the HMRC Unknown column that the generator does not calibrate.
 
-| Metric | Truth / target where it exists | Paper 2024-25 | Populace/Ledger 2024-25 |
-| --- | ---: | ---: | ---: |
-| Rows | N/A, synthetic support size | 2,945,974 | 2,946,015 |
-| Weighted population | 2,734,615 ONS firms | 2,577,076 | 2,945,777 |
-| HMRC turnover bands | 2,171,200 VAT-registered firms excluding Unknown | 92.7% | 99.9% |
-| ONS population | 2,734,615 ONS firms | 94.2% | 92.3% |
-| Employment bands | ONS employment-band distribution, sum 2,734,615 | 89.7% | 92.3% |
-| Sector distribution | 2,330,230 VAT-registered firms by SIC sector | 94.5% | 85.0% |
-| VAT liability by band | GBP 177.17bn net VAT liability by turnover band | 81.4% | 99.5% |
-| Overall | N/A, mean of calibrated accuracy scores | 90.5% | 93.8% |
-| VAT liability by sector diagnostic | GBP 177.29bn net VAT liability by SIC sector | 21.7% | 42.2% |
+| Metric | Comparability | Truth / target where it exists | Paper 2024-25 | Populace/Ledger 2024-25 |
+| --- | --- | ---: | ---: | ---: |
+| Rows | Descriptive | N/A, synthetic support size | 2,945,974 | 2,946,015 |
+| Weighted population | Direct | 2,734,615 ONS firms | 2,577,078 | 2,945,777 |
+| HMRC turnover bands | Not like-for-like | 2,171,200 VAT-registered firms excluding Unknown | 92.7% | 99.9% |
+| ONS population | Direct | 2,734,615 ONS firms | 94.2% | 92.3% |
+| Employment bands | Direct | ONS employment-band distribution, sum 2,734,615 | 89.7% | 92.3% |
+| Sector distribution | Project-specific | 2,330,230 VAT-registered firms by SIC sector | 94.5% | 85.0% |
+| VAT liability by band | Direct | GBP 177.17bn net VAT liability by turnover band | 81.4% | 99.5% |
+| Overall | Not like-for-like | N/A, mean of calibrated accuracy scores | 90.5% | 93.8% |
+| VAT liability by sector diagnostic | Diagnostic | GBP 177.29bn net VAT liability by SIC sector | 44.5% | 42.2% |
 
-Interpretation: this is not a silent replacement for the paper's published synthetic population. The target surface is identical, but Populace's shared calibration optimizer produces a different tradeoff across dimensions. VAT liability by sector remains an informational diagnostic, not a calibrated target.
+Interpretation: this is not a silent replacement for the paper's published synthetic population. The target surface is identical, and the directly comparable rows are ONS population, employment bands, and VAT liability by turnover band. HMRC turnover-band accuracy, sector distribution, and overall accuracy are computed under project-specific definitions, so the overall scores are not a like-for-like quality ranking. VAT liability by sector remains an informational diagnostic, not a calibrated target.
 
 Recompute command:
 
 ```bash
-firm-microsim-populace-ledger --facts-jsonl /tmp/uk_firm_consumer_facts.jsonl --iterations 1000 --output results/populace_ledger_comparison.txt --json-output results/populace_ledger_provenance.json
+firm-microsim-populace-ledger --facts-jsonl /tmp/uk_firm_consumer_facts.jsonl --reference-population --output results/populace_ledger_comparison.txt --json-output results/populace_ledger_provenance.json
 ```
diff --git a/results/populace_ledger_provenance.json b/results/populace_ledger_provenance.json
index 75a4de7..98fe7b7 100644
--- a/results/populace_ledger_provenance.json
+++ b/results/populace_ledger_provenance.json
@@ -60,7 +60,7 @@
     "arch_data_commit": "cd98b5cb7b1604fbf7750689a429bbc356e5603a",
     "arch_data_pr": "https://github.com/PolicyEngine/arch-data/pull/67",
     "arch_data_state_at_check": "OPEN, MERGEABLE, CLEAN on 2026-06-30",
-    "comparison_command": "firm-microsim-populace-ledger --facts-jsonl /tmp/uk_firm_consumer_facts.jsonl --iterations 1000 --output results/populace_ledger_comparison.txt --json-output results/populace_ledger_provenance.json",
+    "comparison_command": "firm-microsim-populace-ledger --facts-jsonl /tmp/uk_firm_consumer_facts.jsonl --reference-population --output results/populace_ledger_comparison.txt --json-output results/populace_ledger_provenance.json",
     "populace_commit": "fa20daf75ff023e5e88731a140f456f58e0b864e",
     "populace_pr": "https://github.com/PolicyEngine/populace/pull/223",
     "populace_state_at_check": "OPEN, MERGEABLE, CLEAN on 2026-06-30",
@@ -79,8 +79,8 @@
     "rows": 2945974,
     "sector": 94.5,
     "vat_liability_band": 81.4,
-    "vat_liability_sector": 21.7,
-    "weighted_population": 2577076.0
+    "vat_liability_sector": 44.5,
+    "weighted_population": 2577078.0
   },
   "populace_ledger_2024_25": {
     "employment": 92.3,
diff --git a/src/firm_microsim/populace_ledger.py b/src/firm_microsim/populace_ledger.py
index fe93101..db87baf 100644
--- a/src/firm_microsim/populace_ledger.py
+++ b/src/firm_microsim/populace_ledger.py
@@ -42,6 +42,7 @@ class TargetSummary:
     """Official target description for one reported metric."""
 
     label: str
+    comparability: str
     truth: str
     paper_value: str
     populace_value: str
@@ -81,16 +82,27 @@ def max_abs_numeric_diff(self) -> float:
         return max(diffs, default=0.0)
 
 
+# Manually captured reference snapshots.
+#
+# PAPER_2024_25 is from the paper generator at seed 42. The 2024-25 synthetic
+# population was regenerated on 2026-06-30 with:
+#
+#   firm-microsim --vintage 2024-25 --output synthetic_firms_2024-25.csv
+#
+# REFERENCE_POPULACE_LEDGER_2024_25 is from a separate full Populace optimizer
+# run using the pinned PR snapshots recorded in REFERENCE_MIGRATION. These
+# generated-population numbers are not recomputed in CI; CI verifies rendering,
+# provenance, and Ledger-vs-paper source-table parity.
 PAPER_2024_25 = CalibrationSnapshot(
     rows=2_945_974,
-    weighted_population=2_577_076.0,
+    weighted_population=2_577_078.0,
     hmrc_bands=92.7,
     ons_population=94.2,
     employment=89.7,
     sector=94.5,
     vat_liability_band=81.4,
     overall=90.5,
-    vat_liability_sector=21.7,
+    vat_liability_sector=44.5,
 )
 
 REFERENCE_POPULACE_LEDGER_2024_25 = CalibrationSnapshot(
@@ -142,7 +154,7 @@ def max_abs_numeric_diff(self) -> float:
     ],
     "comparison_command": (
         "firm-microsim-populace-ledger --facts-jsonl "
-        "/tmp/uk_firm_consumer_facts.jsonl --iterations 1000 "
+        "/tmp/uk_firm_consumer_facts.jsonl --reference-population "
         "--output results/populace_ledger_comparison.txt "
         "--json-output results/populace_ledger_provenance.json"
     ),
@@ -157,54 +169,63 @@ def comparison_rows(
     return [
         TargetSummary(
             "Rows",
+            "Descriptive",
             "N/A, synthetic support size",
             _count(PAPER_2024_25.rows),
             _count(populace.rows),
         ),
         TargetSummary(
             "Weighted population",
+            "Direct",
             REFERENCE_SOURCE_TOTALS["ons_population"],
             _count(PAPER_2024_25.weighted_population),
             _count(populace.weighted_population),
         ),
         TargetSummary(
             "HMRC turnover bands",
+            "Not like-for-like",
             REFERENCE_SOURCE_TOTALS["hmrc_turnover_bands"],
             _pct(PAPER_2024_25.hmrc_bands),
             _pct(populace.hmrc_bands),
         ),
         TargetSummary(
             "ONS population",
+            "Direct",
             REFERENCE_SOURCE_TOTALS["ons_population"],
             _pct(PAPER_2024_25.ons_population),
             _pct(populace.ons_population),
         ),
         TargetSummary(
             "Employment bands",
+            "Direct",
             "ONS employment-band distribution, sum 2,734,615",
             _pct(PAPER_2024_25.employment),
             _pct(populace.employment),
         ),
         TargetSummary(
             "Sector distribution",
+            "Project-specific",
             REFERENCE_SOURCE_TOTALS["hmrc_sector"],
             _pct(PAPER_2024_25.sector),
             _pct(populace.sector),
         ),
         TargetSummary(
             "VAT liability by band",
+            "Direct",
             REFERENCE_SOURCE_TOTALS["hmrc_liability_band"],
             _pct(PAPER_2024_25.vat_liability_band),
             _pct(populace.vat_liability_band),
         ),
         TargetSummary(
             "Overall",
+            "Not like-for-like",
             "N/A, mean of calibrated accuracy scores",
             _pct(PAPER_2024_25.overall),
             _pct(populace.overall),
         ),
         TargetSummary(
             "VAT liability by sector diagnostic",
+            "Diagnostic",
             REFERENCE_SOURCE_TOTALS["hmrc_liability_sector"],
             _pct(PAPER_2024_25.vat_liability_sector),
             _pct(populace.vat_liability_sector),
@@ -244,8 +265,9 @@ def format_comparison_report(
     lines = [
         "# Populace/Ledger firm-generation comparison",
         "",
-        "Reference run:",
+        "Preliminary reference run:",
         "",
+        "- Status: pinned to open, unmerged Arch and Populace PR snapshots",
         "- Vintage: 2024-25",
         f"- Seed: {seed}",
         f"- Populace iterations: {_count(iterations)}",
@@ -263,11 +285,12 @@ def format_comparison_report(
         "",
         parity_sentence,
         "",
-        "| Metric | Truth / target where it exists | Paper 2024-25 | Populace/Ledger 2024-25 |",
-        "| --- | ---: | ---: | ---: |",
+        "| Metric | Comparability | Truth / target where it exists | Paper 2024-25 | Populace/Ledger 2024-25 |",
+        "| --- | --- | ---: | ---: | ---: |",
     ]
     lines.extend(
-        f"| {row.label} | {row.truth} | {row.paper_value} | {row.populace_value} |"
+        f"| {row.label} | {row.comparability} | {row.truth} | "
+        f"{row.paper_value} | {row.populace_value} |"
         for row in comparison_rows(populace)
     )
     lines.extend(
@@ -275,9 +298,12 @@ def format_comparison_report(
             "",
             "Interpretation: this is not a silent replacement for the paper's "
             f"published synthetic population. {interpretation_target_sentence}"
-            "but Populace's shared calibration optimizer produces a different "
-            "tradeoff across dimensions. VAT liability by sector remains an "
-            "informational diagnostic, not a calibrated target.",
+            "and the directly comparable rows are ONS population, employment "
+            "bands, and VAT liability by turnover band. HMRC turnover-band "
+            "accuracy, sector distribution, and overall accuracy are computed "
+            "under project-specific definitions, so the overall scores are not "
+            "a like-for-like quality ranking. VAT liability by sector remains "
+            "an informational diagnostic, not a calibrated target.",
             "",
             "Recompute command:",
             "",
diff --git a/tests/test_populace_ledger.py b/tests/test_populace_ledger.py
index 58f19bb..4bd977b 100644
--- a/tests/test_populace_ledger.py
+++ b/tests/test_populace_ledger.py
@@ -1,5 +1,8 @@
 from __future__ import annotations
 
+import json
+from pathlib import Path
+
 from firm_microsim.populace_ledger import (
     ParitySummary,
     ParityTable,
@@ -15,11 +18,15 @@ def test_comparison_rows_include_truth_for_official_targets() -> None:
     rows = {row.label: row for row in comparison_rows()}
 
     assert rows["Weighted population"].truth == "2,734,615 ONS firms"
+    assert rows["Weighted population"].comparability == "Direct"
     assert rows["VAT liability by band"].truth.endswith(
         "net VAT liability by turnover band"
     )
+    assert rows["VAT liability by band"].comparability == "Direct"
     assert rows["Rows"].truth == "N/A, synthetic support size"
+    assert rows["HMRC turnover bands"].comparability == "Not like-for-like"
     assert rows["Overall"].truth == "N/A, mean of calibrated accuracy scores"
+    assert rows["Overall"].comparability == "Not like-for-like"
 
 
 def test_reference_report_records_populace_calibration_tradeoff() -> None:
@@ -27,15 +34,31 @@ def test_reference_report_records_populace_calibration_tradeoff() -> None:
 
     assert "Paper 2024-25" in report
     assert "Populace/Ledger 2024-25" in report
-    assert "| Overall | N/A, mean of calibrated accuracy scores | 90.5% | 93.8% |" in report
+    assert (
+        "| Overall | Not like-for-like | N/A, mean of calibrated accuracy scores | "
+        "90.5% | 93.8% |"
+    ) in report
+    assert "not a like-for-like quality ranking" in report
     assert "0 mismatched" in report
     assert REFERENCE_PARITY.facts_sha256 in report
+    assert "--reference-population" in report
     assert (
-        "| VAT liability by sector diagnostic | GBP 177.29bn net VAT liability "
-        "by SIC sector | 21.7% | 42.2% |"
+        "| VAT liability by sector diagnostic | Diagnostic | GBP 177.29bn net "
+        "VAT liability by SIC sector | 44.5% | 42.2% |"
     ) in report
 
 
+def test_checked_artifacts_match_reference_rendering() -> None:
+    root = Path(__file__).resolve().parents[1]
+
+    assert (
+        root / "results" / "populace_ledger_comparison.txt"
+    ).read_text() == format_comparison_report()
+    assert json.loads(
+        (root / "results" / "populace_ledger_provenance.json").read_text()
+    ) == provenance_payload()
+
+
 def test_mismatched_parity_report_does_not_claim_exact_match() -> None:
     parity = ParitySummary(
         facts_count=1,
@@ -73,6 +96,9 @@ def test_reference_provenance_records_pinned_pr_snapshot() -> None:
         payload["migration_snapshot"]["populace_commit"]
         == "fa20daf75ff023e5e88731a140f456f58e0b864e"
     )
+    assert "--reference-population" in payload["migration_snapshot"][
+        "comparison_command"
+    ]
 
 
 def test_reference_snapshot_uses_full_populace_run() -> None:

From 6b9718a9fe3aa30e137572042758b9ff8672060d Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Tue, 30 Jun 2026 15:07:11 +0200
Subject: [PATCH 3/4] Update merged Populace Ledger language

---
 README.md                               | 17 ++++++++++-------
 paper/Appendix/a_data.tex               | 12 ++++++------
 results/populace_ledger_comparison.txt  |  8 ++++----
 results/populace_ledger_provenance.json |  8 +++++---
 src/firm_microsim/populace_ledger.py    | 22 +++++++++++++---------
 tests/test_populace_ledger.py           | 16 ++++++++++++++++
 6 files changed, 54 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 75f8351..46d2d6f 100644
--- a/README.md
+++ b/README.md
@@ -164,14 +164,16 @@ calibration is tracked in issues
 ## Populace/Ledger migration check
 
 `firm-microsim-populace-ledger` reports the current migration comparison. The
-checked preliminary reference run used the 2024-25 Ledger target surface from
-[PolicyEngine/arch-data#67](https://github.com/PolicyEngine/arch-data/pull/67)
+checked reference run used the 2024-25 Ledger target surface from
+[PolicyEngine/ledger#67](https://github.com/PolicyEngine/ledger/pull/67)
 at `cd98b5cb7b1604fbf7750689a429bbc356e5603a` and Populace's experimental UK
 firm generator from
 [PolicyEngine/populace#223](https://github.com/PolicyEngine/populace/pull/223)
-at `fa20daf75ff023e5e88731a140f456f58e0b864e`. Both PRs were open, mergeable,
-and clean when checked on June 30, 2026. The reference population uses 1,000
-calibration iterations:
+at `fa20daf75ff023e5e88731a140f456f58e0b864e`. Both upstream PRs merged on
+June 30, 2026: Ledger at merge commit
+`ac643afa0c1d45fc4abd0268dc5aa7c843440b38`, and Populace at merge commit
+`8271d767244161631253ad1d9ad792a82e2b96b4`. The reference population uses
+1,000 calibration iterations:
 
 ```bash
 firm-microsim-populace-ledger \
@@ -198,8 +200,9 @@ lands at 93.8% overall accuracy under its own validator versus the paper's 90.5%
 but that overall pair is **not like-for-like**: HMRC turnover-band accuracy uses
 different band sets, and sector distribution reflects different calibration-target
 definitions. The directly comparable rows are ONS population, employment bands,
-and VAT liability by turnover band. Treat the Populace population figures as
-preliminary until the upstream Arch and Populace PRs merge.
+and VAT liability by turnover band. The Populace/Ledger path is now based on
+merged upstream inputs, while remaining a migration check rather than a silent
+replacement for the paper's archived generator/results.
 
 ## Figures
 
diff --git a/paper/Appendix/a_data.tex b/paper/Appendix/a_data.tex
index c9d88f4..f82a22b 100644
--- a/paper/Appendix/a_data.tex
+++ b/paper/Appendix/a_data.tex
@@ -33,16 +33,16 @@ \subsection{Data construction detail}
 being moved into PolicyEngine Ledger, with Populace providing the shared
 synthetic-population generator. I therefore keep the paper's archived processed
 CSVs as the reproduction source for the reported results, and treat the pinned
-Populace/Ledger path as an auditable preliminary migration check rather than a silent
-replacement. The snapshot used here is PolicyEngine/arch-data pull request 67
+Populace/Ledger path as an auditable migration check rather than a silent
+replacement. The snapshot used here is PolicyEngine/ledger pull request 67
 at commit \texttt{cd98b5c} and PolicyEngine/populace pull request 223 at commit
-\texttt{fa20daf}; both were open, mergeable, and clean when checked on June 30,
-2026. For the 2024--25 vintage, the Ledger-backed targets match the paper's
+\texttt{fa20daf}; both upstream pull requests merged on June 30, 2026. For the
+2024--25 vintage, the Ledger-backed targets match the paper's
 processed numeric inputs exactly after dropping presentation-only labels,
 totals, and the HMRC ``Unknown'' column that the generator does not calibrate:
 six normalized source tables checked, zero mismatches, and maximum numeric
-difference zero. A 1,000-iteration preliminary Populace run from those targets
-generated 2,946,015 firm rows. Its own validator reports a 93.8 percent overall
+difference zero. A 1,000-iteration Populace run from those targets generated
+2,946,015 firm rows. Its own validator reports a 93.8 percent overall
 calibration score, compared with the paper validator's 90.5 percent score for
 the same vintage, but this overall comparison is not like-for-like: HMRC
 turnover-band accuracy uses different band sets, and sector distribution
diff --git a/results/populace_ledger_comparison.txt b/results/populace_ledger_comparison.txt
index 37e6d45..2e4fa61 100644
--- a/results/populace_ledger_comparison.txt
+++ b/results/populace_ledger_comparison.txt
@@ -1,15 +1,15 @@
 # Populace/Ledger firm-generation comparison
 
-Preliminary reference run:
+Reference run:
 
-- Status: pinned to open, unmerged Arch and Populace PR snapshots
+- Status: pinned to merged Ledger and Populace snapshots
 - Vintage: 2024-25
 - Seed: 42
 - Populace iterations: 1,000
 - Ledger input surface: 1,439 consumer facts
 - Ledger facts SHA256: `58b6c2752adec5baa6a6260fe8cd9e9b85d0a78b5ba76ea28201f4c7986dce50`
-- Arch data snapshot: https://github.com/PolicyEngine/arch-data/pull/67 at `cd98b5cb7b1604fbf7750689a429bbc356e5603a` (OPEN, MERGEABLE, CLEAN on 2026-06-30)
-- Populace snapshot: https://github.com/PolicyEngine/populace/pull/223 at `fa20daf75ff023e5e88731a140f456f58e0b864e` (OPEN, MERGEABLE, CLEAN on 2026-06-30)
+- Ledger target snapshot: https://github.com/PolicyEngine/ledger/pull/67 at `cd98b5cb7b1604fbf7750689a429bbc356e5603a` (MERGED on 2026-06-30, merge commit `ac643afa0c1d45fc4abd0268dc5aa7c843440b38`)
+- Populace snapshot: https://github.com/PolicyEngine/populace/pull/223 at `fa20daf75ff023e5e88731a140f456f58e0b864e` (MERGED on 2026-06-30, merge commit `8271d767244161631253ad1d9ad792a82e2b96b4`)
 - Normalized source-table parity: 6 tables checked, 0 mismatched, max absolute numeric difference 0
 
 The Ledger-backed targets match the paper's processed 2024-25 numeric input tables exactly after dropping presentation-only labels, totals, and the HMRC Unknown column that the generator does not calibrate.
diff --git a/results/populace_ledger_provenance.json b/results/populace_ledger_provenance.json
index 98fe7b7..08cbe1d 100644
--- a/results/populace_ledger_provenance.json
+++ b/results/populace_ledger_provenance.json
@@ -58,12 +58,14 @@
   },
   "migration_snapshot": {
     "arch_data_commit": "cd98b5cb7b1604fbf7750689a429bbc356e5603a",
-    "arch_data_pr": "https://github.com/PolicyEngine/arch-data/pull/67",
-    "arch_data_state_at_check": "OPEN, MERGEABLE, CLEAN on 2026-06-30",
+    "arch_data_merge_commit": "ac643afa0c1d45fc4abd0268dc5aa7c843440b38",
+    "arch_data_pr": "https://github.com/PolicyEngine/ledger/pull/67",
+    "arch_data_state_at_check": "MERGED on 2026-06-30",
     "comparison_command": "firm-microsim-populace-ledger --facts-jsonl /tmp/uk_firm_consumer_facts.jsonl --reference-population --output results/populace_ledger_comparison.txt --json-output results/populace_ledger_provenance.json",
     "populace_commit": "fa20daf75ff023e5e88731a140f456f58e0b864e",
+    "populace_merge_commit": "8271d767244161631253ad1d9ad792a82e2b96b4",
     "populace_pr": "https://github.com/PolicyEngine/populace/pull/223",
-    "populace_state_at_check": "OPEN, MERGEABLE, CLEAN on 2026-06-30",
+    "populace_state_at_check": "MERGED on 2026-06-30",
     "source_packages": [
       "ons-uk-business-firm-targets-2025",
       "ons-uk-business-firm-sector-targets-2025",
diff --git a/src/firm_microsim/populace_ledger.py b/src/firm_microsim/populace_ledger.py
index db87baf..907fdea 100644
--- a/src/firm_microsim/populace_ledger.py
+++ b/src/firm_microsim/populace_ledger.py
@@ -90,7 +90,7 @@ def max_abs_numeric_diff(self) -> float:
 #   firm-microsim --vintage 2024-25 --output synthetic_firms_2024-25.csv
 #
 # REFERENCE_POPULACE_LEDGER_2024_25 is from a separate full Populace optimizer
-# run using the pinned PR snapshots recorded in REFERENCE_MIGRATION. These
+# run using the merged upstream snapshots recorded in REFERENCE_MIGRATION. These
 # generated-population numbers are not recomputed in CI; CI verifies rendering,
 # provenance, and Ledger-vs-paper source-table parity.
 PAPER_2024_25 = CalibrationSnapshot(
@@ -140,12 +140,14 @@ def max_abs_numeric_diff(self) -> float:
 }
 
 REFERENCE_MIGRATION = {
-    "arch_data_pr": "https://github.com/PolicyEngine/arch-data/pull/67",
+    "arch_data_pr": "https://github.com/PolicyEngine/ledger/pull/67",
     "arch_data_commit": "cd98b5cb7b1604fbf7750689a429bbc356e5603a",
-    "arch_data_state_at_check": "OPEN, MERGEABLE, CLEAN on 2026-06-30",
+    "arch_data_merge_commit": "ac643afa0c1d45fc4abd0268dc5aa7c843440b38",
+    "arch_data_state_at_check": "MERGED on 2026-06-30",
     "populace_pr": "https://github.com/PolicyEngine/populace/pull/223",
     "populace_commit": "fa20daf75ff023e5e88731a140f456f58e0b864e",
-    "populace_state_at_check": "OPEN, MERGEABLE, CLEAN on 2026-06-30",
+    "populace_merge_commit": "8271d767244161631253ad1d9ad792a82e2b96b4",
+    "populace_state_at_check": "MERGED on 2026-06-30",
     "source_packages": [
         "ons-uk-business-firm-targets-2025",
         "ons-uk-business-firm-sector-targets-2025",
@@ -265,20 +267,22 @@ def format_comparison_report(
     lines = [
         "# Populace/Ledger firm-generation comparison",
         "",
-        "Preliminary reference run:",
+        "Reference run:",
         "",
-        "- Status: pinned to open, unmerged Arch and Populace PR snapshots",
+        "- Status: pinned to merged Ledger and Populace snapshots",
         "- Vintage: 2024-25",
         f"- Seed: {seed}",
         f"- Populace iterations: {_count(iterations)}",
         f"- Ledger input surface: {_count(parity.facts_count)} consumer facts",
         f"- Ledger facts SHA256: `{parity.facts_sha256}`",
-        f"- Arch data snapshot: {REFERENCE_MIGRATION['arch_data_pr']} at "
+        f"- Ledger target snapshot: {REFERENCE_MIGRATION['arch_data_pr']} at "
         f"`{REFERENCE_MIGRATION['arch_data_commit']}` "
-        f"({REFERENCE_MIGRATION['arch_data_state_at_check']})",
+        f"({REFERENCE_MIGRATION['arch_data_state_at_check']}, merge commit "
+        f"`{REFERENCE_MIGRATION['arch_data_merge_commit']}`)",
         f"- Populace snapshot: {REFERENCE_MIGRATION['populace_pr']} at "
         f"`{REFERENCE_MIGRATION['populace_commit']}` "
-        f"({REFERENCE_MIGRATION['populace_state_at_check']})",
+        f"({REFERENCE_MIGRATION['populace_state_at_check']}, merge commit "
+        f"`{REFERENCE_MIGRATION['populace_merge_commit']}`)",
         f"- Normalized source-table parity: {len(parity.tables)} tables checked, "
         f"{mismatches} mismatched, max absolute numeric difference "
         f"{parity.max_abs_numeric_diff:g}",
diff --git a/tests/test_populace_ledger.py b/tests/test_populace_ledger.py
index 4bd977b..abc296d 100644
--- a/tests/test_populace_ledger.py
+++ b/tests/test_populace_ledger.py
@@ -34,6 +34,8 @@ def test_reference_report_records_populace_calibration_tradeoff() -> None:
 
     assert "Paper 2024-25" in report
     assert "Populace/Ledger 2024-25" in report
+    assert "pinned to merged Ledger and Populace snapshots" in report
+    assert "Preliminary reference run" not in report
     assert (
         "| Overall | Not like-for-like | N/A, mean of calibrated accuracy scores | "
         "90.5% | 93.8% |"
@@ -92,10 +94,24 @@ def test_reference_provenance_records_pinned_pr_snapshot() -> None:
         payload["migration_snapshot"]["arch_data_commit"]
         == "cd98b5cb7b1604fbf7750689a429bbc356e5603a"
     )
+    assert (
+        payload["migration_snapshot"]["arch_data_merge_commit"]
+        == "ac643afa0c1d45fc4abd0268dc5aa7c843440b38"
+    )
+    assert payload["migration_snapshot"]["arch_data_state_at_check"].startswith(
+        "MERGED"
+    )
     assert (
         payload["migration_snapshot"]["populace_commit"]
         == "fa20daf75ff023e5e88731a140f456f58e0b864e"
     )
+    assert (
+        payload["migration_snapshot"]["populace_merge_commit"]
+        == "8271d767244161631253ad1d9ad792a82e2b96b4"
+    )
+    assert payload["migration_snapshot"]["populace_state_at_check"].startswith(
+        "MERGED"
+    )
     assert "--reference-population" in payload["migration_snapshot"][
         "comparison_command"
     ]

From 49586e22386389fe70d41164eafbd4400ff8db57 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Tue, 30 Jun 2026 15:30:45 +0200
Subject: [PATCH 4/4] Record exact upstream Populace rebuild metrics

---
 results/populace_ledger_provenance.json | 16 ++++++++--------
 src/firm_microsim/populace_ledger.py    | 16 ++++++++--------
 tests/test_populace_ledger.py           |  2 +-
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/results/populace_ledger_provenance.json b/results/populace_ledger_provenance.json
index 08cbe1d..7e5f7d2 100644
--- a/results/populace_ledger_provenance.json
+++ b/results/populace_ledger_provenance.json
@@ -85,15 +85,15 @@
     "weighted_population": 2577078.0
   },
   "populace_ledger_2024_25": {
-    "employment": 92.3,
-    "hmrc_bands": 99.9,
-    "ons_population": 92.3,
-    "overall": 93.8,
+    "employment": 92.27463056446766,
+    "hmrc_bands": 99.92894355826047,
+    "ons_population": 92.27819089707326,
+    "overall": 93.79009306789898,
     "rows": 2946015,
-    "sector": 85.0,
-    "vat_liability_band": 99.5,
-    "vat_liability_sector": 42.2,
-    "weighted_population": 2945776.8
+    "sector": 85.0102525769766,
+    "vat_liability_band": 99.45844774271694,
+    "vat_liability_sector": 42.17514113123149,
+    "weighted_population": 2945776.75
   },
   "run_parameters": {
     "populace_iterations": 1000,
diff --git a/src/firm_microsim/populace_ledger.py b/src/firm_microsim/populace_ledger.py
index 907fdea..5eb1f87 100644
--- a/src/firm_microsim/populace_ledger.py
+++ b/src/firm_microsim/populace_ledger.py
@@ -107,14 +107,14 @@ def max_abs_numeric_diff(self) -> float:
 
 REFERENCE_POPULACE_LEDGER_2024_25 = CalibrationSnapshot(
     rows=2_946_015,
-    weighted_population=2_945_776.8,
-    hmrc_bands=99.9,
-    ons_population=92.3,
-    employment=92.3,
-    sector=85.0,
-    vat_liability_band=99.5,
-    overall=93.8,
-    vat_liability_sector=42.2,
+    weighted_population=2_945_776.75,
+    hmrc_bands=99.92894355826047,
+    ons_population=92.27819089707326,
+    employment=92.27463056446766,
+    sector=85.0102525769766,
+    vat_liability_band=99.45844774271694,
+    overall=93.79009306789898,
+    vat_liability_sector=42.17514113123149,
 )
 
 REFERENCE_PARITY = ParitySummary(
diff --git a/tests/test_populace_ledger.py b/tests/test_populace_ledger.py
index abc296d..28190bb 100644
--- a/tests/test_populace_ledger.py
+++ b/tests/test_populace_ledger.py
@@ -120,4 +120,4 @@ def test_reference_provenance_records_pinned_pr_snapshot() -> None:
 def test_reference_snapshot_uses_full_populace_run() -> None:
     assert REFERENCE_POPULACE_LEDGER_2024_25.rows == 2_946_015
     assert round(REFERENCE_POPULACE_LEDGER_2024_25.weighted_population) == 2_945_777
-    assert REFERENCE_POPULACE_LEDGER_2024_25.hmrc_bands == 99.9
+    assert round(REFERENCE_POPULACE_LEDGER_2024_25.hmrc_bands, 1) == 99.9