From 5196aa203c8b8b5aa265fdd98a111127a203e048 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 20 Jun 2026 18:11:51 -0400 Subject: [PATCH 1/3] Add UK raw-source and local-geography parity --- packages/populace-build/README.md | 45 +- .../src/populace/build/gates.py | 10 +- .../src/populace/build/source_manifest.py | 1 + .../src/populace/build/uk/__init__.py | 197 +++- .../src/populace/build/uk/local_geography.py | 293 +++++- .../src/populace/build/uk/local_runner.py | 237 ++++- .../src/populace/build/uk/local_solver.py | 103 +- .../src/populace/build/uk/source_stages.json | 990 ++++++++++++++++++ .../src/populace/build/uk/spi_support.py | 14 +- .../tests/test_uk_local_geography.py | 108 ++ .../tests/test_uk_local_runner.py | 159 ++- .../tests/test_uk_local_solver.py | 63 ++ .../tests/test_uk_source_manifest.py | 318 ++++++ .../tests/test_uk_spi_support.py | 2 +- 14 files changed, 2447 insertions(+), 93 deletions(-) create mode 100644 packages/populace-build/src/populace/build/uk/source_stages.json create mode 100644 packages/populace-build/tests/test_uk_source_manifest.py diff --git a/packages/populace-build/README.md b/packages/populace-build/README.md index 267056b..75a7269 100644 --- a/packages/populace-build/README.md +++ b/packages/populace-build/README.md @@ -15,11 +15,9 @@ names its donor survey and fails loudly — no silent fallbacks), and the short-term capital gains to −$3.9T); - **export surface** — every replacement artifact can prove that its exported variables match a reference surface, with only documented - structural extras or reviewed exclusions (for UK, this is the eFRS - compatibility check); + structural extras or reviewed exclusions; - **target surface** — the calibration target set covers the reference - target surface and may only be wider, not narrower (for UK, Populace must - calibrate to at least the eFRS target surface); + target surface and may only be wider, not narrower; - **per-family fit** — the calibration's within-10% share is reported per source family, while only broad family-level misses block publication so one family cannot hide inside the global average; @@ -37,11 +35,12 @@ JSON manifests and executed by shared Populace runtimes. ## UK local-geography path `populace.build.uk.local_geography` holds the Populace-owned replacement shape -for UK constituency and local-authority geography. It uses the same stacked -local-area layout as the US local ECPS flow: +for UK constituency and local-authority geography. The production local path is +row-wise assigned, matching the longwise direction of the US local ECPS flow: ```text -column = area_index * n_households + household_index +column = household_index +target rows only see households assigned to that area code ``` The solved weights export to a long sidecar with `(area_type, area_code, @@ -50,12 +49,13 @@ the format PolicyEngine can group by directly for constituency and local authority outputs, and it avoids preserving the legacy dense `areas x households` matrix artifact. -The module does not import the incumbent UK data package. Engine runners and +The module does not import an incumbent UK data package. Engine runners and target providers pass household metric tables and aligned target tables into -`build_stacked_local_matrix`; this keeps Populace clean while the target source -files move over. The helper `sort_households_by_id` also codifies the 2024-25 -FRS fix: household attributes and weights must be sorted by the same stable -household ID before any positional assignment. +`build_assigned_local_matrix` / `build_local_candidate`; this keeps Populace as +the owner of the build surface while historical incumbent comparisons remain +external migration benchmarks. The helper `sort_households_by_id` also +codifies the 2024-25 FRS fix: household attributes and weights must be sorted +by the same stable household ID before any positional assignment. `populace.build.uk.local_targets` declares the constituency and local-authority metric surface used by the local build: HMRC employment/self-employment amount @@ -63,9 +63,9 @@ and count rows, ONS age bands, Universal Credit household rows, constituency UC-by-children rows, and the LA income/tenure/rent rows. It accepts a PolicyEngine-UK-like simulation object and returns household-indexed metric tables; it still takes target values as explicit input tables. `local_solver` -wraps the Populace calibrator's log-weight optimizer for stacked local weights +wraps the Populace calibrator's log-weight optimizer for assigned local weights and records per-area/per-metric diagnostics before the solved weights are -exported with `stacked_weights_to_long`. +exported with `assigned_weights_to_long`. `populace.build.uk.local_runner` is the Populace-owned candidate build path. It loads explicit area and target tables, aligns a sorted household frame with @@ -95,6 +95,23 @@ postcode sources. It writes the cloned row-wise H5, a geography coverage CSV, and `rowwise_build_manifest.json` with input/output hashes, row counts, target coverage, weight preservation, and weakest local-support diagnostics. +Like the US plan, UK migration comparisons against earlier production datasets +belong in release/benchmark harnesses outside this package. The build code here +must not import or depend on the incumbent UK data package; `source_manifest.py` +rejects incumbent country data-package references in declarative source specs. + +`populace.build.uk` now also exposes `UK_SOURCE_MANIFEST`, +`UK_SOURCE_STAGE_SPECS`, `UK_SOURCE_OUTPUTS`, `UK_NONNEGATIVE_SOURCE_OUTPUTS`, +`UK_DONORS`, `UK_STAGE_NAMES`, and `uk_plan(implementations)`. The packaged +`uk/source_stages.json` is the Populace-owned raw-input parity contract for the +UK build: FRS base tables, WAS wealth/debt/vehicles, LCFS consumption and fuel, +ETB VAT and public services, NHS usage, SPI high-income income/reliefs, +FRS-only pension/savings/reported-benefit fill, Advani-Summers capital gains, +salary sacrifice, SLC student-loan plan assignment, and row-wise OA/LA/ +constituency geography. Stage implementations are injected and the plan refuses +to assemble with any missing or unknown stage, matching the US complete-or-fail +source-plan behavior. + ## US plan status `populace.build.us` declares the US build: stage order, donor graph with diff --git a/packages/populace-build/src/populace/build/gates.py b/packages/populace-build/src/populace/build/gates.py index b8d950c..f58e322 100644 --- a/packages/populace-build/src/populace/build/gates.py +++ b/packages/populace-build/src/populace/build/gates.py @@ -24,7 +24,8 @@ member names, not raw source-system codes. - :func:`export_surface_gate` and :func:`target_surface_gate` — replacement builds can prove they cover a reference artifact's export variables and - calibration targets, e.g. UK Populace against eFRS. + calibration targets. Reference artifacts are comparison surfaces, not build + inputs. Scoring uses :func:`relative_error_loss` — the calibrator's own objective — so there is no calibrator-vs-scorer objective mismatch: what the solver @@ -750,10 +751,9 @@ def export_surface_gate( This is stricter than :func:`parity_gate`: parity checks whether populated reference layers are also populated, while this gate checks the exported variable *surface* itself. It is intended for live release blocking where a - country has a known incumbent-compatible artifact, such as UK Populace - matching eFRS exported variables. Extra columns are refused unless the - build declares them as structural/compatibility additions; missing - reference columns require a named reviewed exclusion. + country has a known reference export surface. Extra columns are refused + unless the build declares them as structural/compatibility additions; + missing reference columns require a named reviewed exclusion. """ candidate = {str(name) for name in candidate_columns} reference = {str(name) for name in reference_columns} diff --git a/packages/populace-build/src/populace/build/source_manifest.py b/packages/populace-build/src/populace/build/source_manifest.py index 808809d..8580c96 100644 --- a/packages/populace-build/src/populace/build/source_manifest.py +++ b/packages/populace-build/src/populace/build/source_manifest.py @@ -40,6 +40,7 @@ "assign_by_plan_type", "assign_binary_from_rate", "calibrate_binary_assignment", + "calibrate_weights", "convert_interest_to_structural_mortgage_inputs", "compute_ratio", "derive", diff --git a/packages/populace-build/src/populace/build/uk/__init__.py b/packages/populace-build/src/populace/build/uk/__init__.py index cd5bea3..e125e23 100644 --- a/packages/populace-build/src/populace/build/uk/__init__.py +++ b/packages/populace-build/src/populace/build/uk/__init__.py @@ -1,4 +1,16 @@ -"""UK build helpers for Populace-owned local-geography artifacts.""" +"""UK build helpers for Populace-owned raw-source and local artifacts.""" + +from __future__ import annotations + +from collections.abc import Callable, Mapping +from importlib.resources import files + +from populace.build.plan import DonorSpec, Stage, StagePlan +from populace.build.source_manifest import ( + SourceManifest, + SourceStageSpec, + load_source_manifest, +) from populace.build.uk.geography_sources import ( ENGLAND_LAD_REGION_URL, @@ -42,11 +54,15 @@ write_geography_crosswalk, ) from populace.build.uk.local_geography import ( + AREA_TYPE_TO_ROWWISE_HOUSEHOLD_COLUMN, LONG_GEOGRAPHY_COLUMNS, StackedLocalMatrix, align_area_targets, area_support_summary, + assigned_weights_to_long, + build_assigned_local_matrix, build_stacked_local_matrix, + rowwise_assignment_column, sort_households_by_id, stacked_design_weights, stacked_weights_to_long, @@ -68,6 +84,7 @@ ) from populace.build.uk.local_solver import ( StackedLocalSolveResult, + solve_assigned_local_weights, solve_stacked_local_weights, ) from populace.build.uk.local_targets import ( @@ -123,11 +140,175 @@ support_clone_index_column, support_source_id_column, ) +from populace.frame import Frame + +UK_DONORS: Mapping[str, DonorSpec] = { + "was_wealth": DonorSpec( + survey="Wealth and Assets Survey", + source="https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/debt/methodologies/wealthandassetssurveyqmi", + notes="Household wealth, debts, vehicles, and student-loan balances.", + ), + "regional_property_uprating": DonorSpec( + survey="UK House Price Index and regional land-value tables", + source="https://www.gov.uk/government/collections/uk-house-price-index-reports", + notes="Regional property-value uprating after WAS wealth imputation.", + ), + "lcfs_consumption": DonorSpec( + survey="Living Costs and Food Survey", + source="https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/methodologies/livingcostsandfoodsurveyqmi", + notes="COICOP consumption, fuel spending, and domestic energy use.", + ), + "road_fuel_energy_calibration": DonorSpec( + survey="Road fuel and household energy administrative totals", + source="https://www.gov.uk/government/collections/road-transport-consumption-at-regional-and-local-level", + notes="Fuel and energy calibration targets for LCFS-imputed amounts.", + ), + "etb_vat": DonorSpec( + survey="Effects of Taxes and Benefits", + source="https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/datasets/theeffectsoftaxesandbenefitsonhouseholdincomehistoricaldatasets", + notes="Full-rate VAT expenditure-rate imputation.", + ), + "nhs_usage": DonorSpec( + survey="NHS activity and unit-cost tables", + source="https://www.england.nhs.uk/statistics/statistical-work-areas/hospital-activity/monthly-hospital-activity/", + notes="A&E, inpatient, outpatient visit and spending inputs.", + ), + "etb_public_services": DonorSpec( + survey="Effects of Taxes and Benefits public-service tables", + source="https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/datasets/theeffectsoftaxesandbenefitsonhouseholdincomehistoricaldatasets", + notes="Education, rail, and bus public-service benefit inputs.", + ), + "rail_public_service_calibration": DonorSpec( + survey="Rail public-service administrative totals", + source="https://www.gov.uk/government/collections/rail-statistics", + notes="Post-weight rail subsidy and usage scaling.", + ), + "spi_income": DonorSpec( + survey="Survey of Personal Incomes", + source="https://www.gov.uk/government/collections/personal-incomes-statistics", + notes="High-income components, Gift Aid, and investment-gift reliefs.", + ), + "frs_only_spi_fill": DonorSpec( + survey="Family Resources Survey 2023-24", + source="https://www.gov.uk/government/collections/family-resources-survey--2", + notes=( + "Second-stage pension, savings, and reported-benefit behavior for " + "SPI support rows." + ), + ), + "advani_summers_capital_gains": DonorSpec( + survey="Advani-Summers capital gains distribution", + source="https://ideas.repec.org/p/hal/wpaper/halshs-03022609.html", + notes="Capital gains assignment and clone flag.", + ), + "frs_salary_sacrifice": DonorSpec( + survey="Family Resources Survey salary-sacrifice subsample", + source="https://www.gov.uk/government/collections/family-resources-survey--2", + notes="Salary-sacrifice pension contributions and employee adjustment.", + ), + "slc_student_loan_plan": DonorSpec( + survey="Student Loans Company repayment-plan statistics", + source="https://www.gov.uk/government/collections/student-loans-for-higher-and-further-education", + notes="Student-loan repayment plan assignment by cohort and balance.", + ), +} + +UK_STAGE_NAMES: tuple[str, ...] = ( + "frs_base", + "was_wealth", + "regional_property_uprating", + "lcfs_consumption", + "etb_vat", + "nhs_usage", + "etb_public_services", + UK_SPI_SUPPORT_STAGE_NAME, + "spi_income", + "frs_only_spi_fill", + "advani_summers_capital_gains", + "frs_salary_sacrifice", + "slc_student_loan_plan", + "rowwise_oa_geography", + "national_calibration", + "local_geography_weights", + "rail_public_service_calibration", + "road_fuel_energy_calibration", + "export", +) + +UK_STRUCTURAL_SOURCE_STAGES: tuple[str, ...] = ( + "frs_base", + UK_SPI_SUPPORT_STAGE_NAME, + "rowwise_oa_geography", + "national_calibration", + "local_geography_weights", +) + + +def _load_uk_source_manifest() -> SourceManifest: + return load_source_manifest(files(__package__).joinpath("source_stages.json")) + + +UK_SOURCE_MANIFEST = _load_uk_source_manifest() +_UK_SOURCE_STAGE_MAP = UK_SOURCE_MANIFEST.stage_map() +_UNKNOWN_UK_SOURCE_STAGES = sorted(set(_UK_SOURCE_STAGE_MAP) - set(UK_STAGE_NAMES)) +if _UNKNOWN_UK_SOURCE_STAGES: + raise ValueError( + "UK source manifest stage(s) are not declared in UK_STAGE_NAMES: " + f"{_UNKNOWN_UK_SOURCE_STAGES}." + ) +UK_SOURCE_STAGE_SPECS: tuple[SourceStageSpec, ...] = tuple( + _UK_SOURCE_STAGE_MAP[name] for name in UK_STAGE_NAMES if name in _UK_SOURCE_STAGE_MAP +) +UK_SOURCE_OUTPUTS: frozenset[str] = frozenset( + output for stage in UK_SOURCE_STAGE_SPECS for output in stage.outputs +) +UK_SOURCE_OUTPUT_STAGES: Mapping[str, tuple[str, ...]] = { + output: tuple( + stage.stage for stage in UK_SOURCE_STAGE_SPECS if output in stage.outputs + ) + for output in sorted(UK_SOURCE_OUTPUTS) +} +UK_REWRITTEN_SOURCE_OUTPUT_STAGES: Mapping[str, tuple[str, ...]] = { + output: stages + for output, stages in UK_SOURCE_OUTPUT_STAGES.items() + if len(stages) > 1 +} +UK_NONNEGATIVE_SOURCE_OUTPUTS: frozenset[str] = frozenset( + output for stage in UK_SOURCE_STAGE_SPECS for output in stage.nonnegative_outputs +) + + +def uk_plan( + implementations: Mapping[str, Callable[[Frame], Frame]], +) -> StagePlan: + """Assemble the UK build plan from injected stage implementations.""" + + missing = [name for name in UK_STAGE_NAMES if name not in implementations] + if missing: + raise ValueError( + f"uk_plan needs an implementation for every declared stage; " + f"missing {missing}. There are no stubs or fallbacks by design." + ) + unknown = sorted(set(implementations) - set(UK_STAGE_NAMES)) + if unknown: + raise ValueError( + f"Unknown stage implementation(s) {unknown}; declared stages " + f"are {list(UK_STAGE_NAMES)}." + ) + return StagePlan( + Stage( + name=name, + transform=implementations[name], + donor=UK_DONORS.get(name), + ) + for name in UK_STAGE_NAMES + ) __all__ = [ "AGE_BANDS", "AREA_TYPES", "AREA_TYPE_TO_CROSSWALK_COLUMN", + "AREA_TYPE_TO_ROWWISE_HOUSEHOLD_COLUMN", "BASE_FRS_SUPPORT_CHANNEL", "BENUNIT_ID_COLUMNS", "COUNTRY_TO_REGION", @@ -167,17 +348,28 @@ "StackedLocalSolveResult", "UK_POSTCODE_OA_MAY25_ZIP_URL", "UK_POSTCODE_PCON_MAY24_ZIP_URL", + "UK_DONORS", "UKLocalCandidateResult", + "UK_NONNEGATIVE_SOURCE_OUTPUTS", "UKRowwiseDatasetResult", "UKSPISupportResult", + "UK_SOURCE_MANIFEST", + "UK_SOURCE_OUTPUTS", + "UK_SOURCE_OUTPUT_STAGES", + "UK_SOURCE_STAGE_SPECS", + "UK_REWRITTEN_SOURCE_OUTPUT_STAGES", "UK_SINGLE_YEAR_TABLES", "UK_SPI_SUPPORT_STAGE_NAME", + "UK_STAGE_NAMES", + "UK_STRUCTURAL_SOURCE_STAGES", "align_area_targets", + "assigned_weights_to_long", "area_support_summary", "area_groups_from_codes", "assign_household_geography", "build_local_candidate", "build_local_candidate_from_dataset", + "build_assigned_local_matrix", "build_complete_uk_geography_crosswalk", "build_england_wales_crosswalk", "build_great_britain_crosswalk", @@ -216,7 +408,9 @@ "prepare_geography_crosswalk", "prepare_household_frame", "read_local_table", + "rowwise_assignment_column", "set_simulation_area_group", + "solve_assigned_local_weights", "solve_stacked_local_weights", "sort_households_by_id", "stacked_design_weights", @@ -225,6 +419,7 @@ "support_channel_column", "support_clone_index_column", "support_source_id_column", + "uk_plan", "update_england_wales_lad_codes", "validate_uk_rowwise_dataset_tables", "validate_geography_coverage", diff --git a/packages/populace-build/src/populace/build/uk/local_geography.py b/packages/populace-build/src/populace/build/uk/local_geography.py index f87c164..b6cc367 100644 --- a/packages/populace-build/src/populace/build/uk/local_geography.py +++ b/packages/populace-build/src/populace/build/uk/local_geography.py @@ -1,10 +1,12 @@ """Longwise UK local-geography build primitives. -This module owns the representation that lets Populace replace the legacy +This module owns the representations that let Populace replace the legacy UK incumbent ``areas x households`` matrix artifacts: * a stacked sparse matrix whose columns are ``area_index * n_households + household_index``; and +* an assigned sparse matrix whose columns are household weights and whose + target rows only see households assigned to that local area; and * a longweight sidecar with one row per non-zero ``(area, household, weight)`` assignment. @@ -39,6 +41,12 @@ "weight_source", ) +AREA_TYPE_TO_ROWWISE_HOUSEHOLD_COLUMN = { + "constituency": "constituency_code_oa", + "la": "la_code_oa", + "local_authority": "la_code_oa", +} + _AREA_METADATA_COLUMNS = frozenset( { "area_code", @@ -54,7 +62,7 @@ @dataclass(frozen=True) class StackedLocalMatrix: - """Sparse stacked local-area calibration matrix and aligned targets.""" + """Sparse local-area calibration matrix and aligned targets.""" matrix: sp.csr_matrix targets: np.ndarray @@ -226,8 +234,10 @@ def build_stacked_local_matrix( ) cache_key = (group, metric_index) if cache_key not in nonzero_cache: - column = metric_tables[group].iloc[:, metric_index].to_numpy( - dtype=np.float64 + column = ( + metric_tables[group] + .iloc[:, metric_index] + .to_numpy(dtype=np.float64) ) if not np.isfinite(column).all(): raise ValueError( @@ -267,6 +277,162 @@ def build_stacked_local_matrix( ) +def rowwise_assignment_column( + area_type: str, + *, + assignment_column: str | None = None, +) -> str: + """Return the household column carrying rowwise local geography codes.""" + + if assignment_column is not None: + column = str(assignment_column).strip() + if column == "": + raise ValueError("assignment_column must not be blank.") + return column + if area_type not in AREA_TYPE_TO_ROWWISE_HOUSEHOLD_COLUMN: + raise ValueError( + f"No default rowwise assignment column is defined for {area_type!r}." + ) + return AREA_TYPE_TO_ROWWISE_HOUSEHOLD_COLUMN[area_type] + + +def build_assigned_local_matrix( + metrics: pd.DataFrame | Mapping[str, pd.DataFrame], + targets: pd.DataFrame, + *, + household_frame: pd.DataFrame, + area_codes: Sequence[str] | None = None, + area_groups: Mapping[str, str] | None = None, + household_ids: Sequence[Any] | None = None, + area_type: str = "constituency", + code_column: str = "code", + assignment_column: str | None = None, +) -> StackedLocalMatrix: + """Build a rowwise-assigned sparse matrix for local-area calibration. + + Unlike :func:`build_stacked_local_matrix`, each household has a single + column. A household contributes only to the target rows for the local area + stored in its rowwise geography assignment column, such as + ``constituency_code_oa`` or ``la_code_oa``. + """ + + if area_codes is None: + if code_column not in targets.columns: + raise ValueError( + "area_codes must be supplied when targets has no " + f"{code_column!r} column." + ) + area_codes = targets[code_column].astype(str).tolist() + codes = _area_code_tuple(area_codes) + if household_ids is None: + if "household_id" not in household_frame.columns: + raise ValueError("household_frame must include 'household_id'.") + household_ids = household_frame["household_id"].to_numpy() + hh_ids = np.asarray(household_ids) + aligned_households = _align_household_frame(household_frame, hh_ids) + assert aligned_households is not None + assignment_name = rowwise_assignment_column( + area_type, + assignment_column=assignment_column, + ) + if assignment_name not in aligned_households.columns: + raise ValueError( + f"household_frame is missing rowwise assignment column {assignment_name!r}." + ) + assignments = _normalise_area_assignments(aligned_households[assignment_name]) + + metric_tables, groups = _normalise_metric_tables( + metrics, + area_codes=codes, + area_groups=area_groups, + household_ids=hh_ids, + ) + first = next(iter(metric_tables.values())) + metric_names = tuple(str(col) for col in first.columns) + target_values = align_area_targets( + targets, + codes, + metric_names=metric_names, + code_column=code_column, + ) + + n_households = len(first) + n_areas = len(codes) + n_metrics = len(metric_names) + n_targets = n_areas * n_metrics + rows: list[np.ndarray] = [] + cols: list[np.ndarray] = [] + data: list[np.ndarray] = [] + target_rows: list[dict[str, Any]] = [] + assignment_indices = { + area_code: np.flatnonzero(assignments == area_code) for area_code in codes + } + metric_cache: dict[tuple[str, int], np.ndarray] = {} + + for area_index, area_code in enumerate(codes): + group = groups[area_code] + household_positions = assignment_indices[area_code] + for metric_index, metric_name in enumerate(metric_names): + target_index = area_index * n_metrics + metric_index + target_rows.append( + { + "target_index": target_index, + "area_type": area_type, + "area_code": area_code, + "area_index": area_index, + "area_group": group, + "metric": metric_name, + "metric_index": metric_index, + "value": float(target_values.loc[area_code, metric_name]), + } + ) + if len(household_positions) == 0: + continue + cache_key = (group, metric_index) + if cache_key not in metric_cache: + column = ( + metric_tables[group] + .iloc[:, metric_index] + .to_numpy(dtype=np.float64) + ) + if not np.isfinite(column).all(): + raise ValueError( + f"metric {metric_name!r} for group {group!r} " + "contains non-finite values." + ) + metric_cache[cache_key] = column + values = metric_cache[cache_key][household_positions] + nz = np.flatnonzero(values) + if len(nz) == 0: + continue + rows.append(np.full(len(nz), target_index, dtype=np.int64)) + cols.append(household_positions[nz].astype(np.int64)) + data.append(values[nz].astype(np.float64, copy=False)) + + if rows: + row_array = np.concatenate(rows) + col_array = np.concatenate(cols) + data_array = np.concatenate(data) + else: + row_array = np.array([], dtype=np.int64) + col_array = np.array([], dtype=np.int64) + data_array = np.array([], dtype=np.float64) + matrix = sp.csr_matrix( + (data_array, (row_array, col_array)), + shape=(n_targets, n_households), + dtype=np.float64, + ) + target_frame = pd.DataFrame(target_rows) + return StackedLocalMatrix( + matrix=matrix, + targets=target_frame["value"].to_numpy(dtype=np.float64), + target_frame=target_frame, + area_codes=codes, + metric_names=metric_names, + n_households=n_households, + ) + + def stacked_design_weights( base_weights: Sequence[float], n_areas: int, @@ -362,6 +528,114 @@ def stacked_weights_to_long( return out.loc[:, LONG_GEOGRAPHY_COLUMNS] +def assigned_weights_to_long( + weights: Sequence[float], + area_codes: Sequence[str], + household_ids: Sequence[Any], + *, + area_type: str, + household_frame: pd.DataFrame, + assignment_column: str | None = None, + base_weights: Sequence[float] | None = None, + drop_weight_atol: float = 0.0, + source_year: int | None = None, + weight_source: str = "populace_local_assigned", + drop_zero: bool = True, +) -> pd.DataFrame: + """Convert assigned household weights to the local-geography sidecar.""" + + codes = _area_code_tuple(area_codes) + hh_ids = np.asarray(household_ids) + n_households = len(hh_ids) + w = np.asarray(weights, dtype=np.float64).reshape(-1) + if len(w) != n_households: + raise ValueError( + f"weights length must equal household count ({n_households}), got {len(w)}." + ) + if not np.isfinite(w).all() or (w < 0).any(): + raise ValueError("weights must be finite and non-negative.") + base = None if base_weights is None else np.asarray(base_weights, dtype=np.float64) + if base is not None: + if base.shape != w.shape: + raise ValueError( + f"base_weights must align with weights, got {base.shape} vs {w.shape}." + ) + if not np.isfinite(base).all() or (base < 0).any(): + raise ValueError("base_weights must be finite and non-negative.") + if not np.isfinite(drop_weight_atol) or drop_weight_atol < 0: + raise ValueError("drop_weight_atol must be finite and non-negative.") + + household_frame = _align_household_frame(household_frame, hh_ids) + assert household_frame is not None + assignment_name = rowwise_assignment_column( + area_type, + assignment_column=assignment_column, + ) + if assignment_name not in household_frame.columns: + raise ValueError( + f"household_frame is missing rowwise assignment column {assignment_name!r}." + ) + assignments = _normalise_area_assignments(household_frame[assignment_name]) + area_index_by_code = {area_code: idx for idx, area_code in enumerate(codes)} + in_requested_area = np.fromiter( + (area_code in area_index_by_code for area_code in assignments), + dtype=bool, + count=n_households, + ) + if drop_zero: + if base is None: + in_requested_area &= w > drop_weight_atol + else: + zero_base_floor = (base == 0) & (w <= drop_weight_atol) + in_requested_area &= (w != 0) & ~zero_base_floor + selected = np.flatnonzero(in_requested_area) + + source_year_values = _metadata_values( + household_frame, + "source_year", + default=source_year, + length=n_households, + ) + source_household_ids = _metadata_values( + household_frame, + "source_household_id", + default=hh_ids, + length=n_households, + ) + source_keys = _metadata_values( + household_frame, + "source_household_key", + default=_source_keys(source_year_values, source_household_ids), + length=n_households, + ) + clone_index = _metadata_values( + household_frame, + "clone_index", + default=0, + length=n_households, + ) + + selected_area_codes = assignments[selected] + out = pd.DataFrame( + { + "area_type": area_type, + "area_code": selected_area_codes, + "area_index": [ + area_index_by_code[area_code] for area_code in selected_area_codes + ], + "household_index": selected.astype(np.int64), + "household_id": hh_ids[selected], + "source_year": source_year_values[selected], + "source_household_id": source_household_ids[selected], + "source_household_key": source_keys[selected], + "clone_index": clone_index[selected], + "weight": w[selected], + "weight_source": weight_source, + } + ) + return out.loc[:, LONG_GEOGRAPHY_COLUMNS] + + def area_support_summary( long_weights: pd.DataFrame, *, @@ -477,8 +751,7 @@ def _normalise_metric_tables( for group, frame in tables.items(): if len(frame) != len(first): raise ValueError( - f"metric table {group!r} has {len(frame)} rows; expected " - f"{len(first)}." + f"metric table {group!r} has {len(frame)} rows; expected {len(first)}." ) if not frame.index.equals(first.index): raise ValueError( @@ -580,6 +853,14 @@ def _align_household_frame( return aligned.reset_index(drop=True) +def _normalise_area_assignments(values: Sequence[Any]) -> np.ndarray: + series = pd.Series(values) + missing = series.isna() + strings = series.astype(str).str.strip() + strings = strings.mask(missing | (strings == ""), None) + return strings.to_numpy(dtype=object) + + def _source_keys( source_year: Sequence[Any], source_household_id: Sequence[Any], diff --git a/packages/populace-build/src/populace/build/uk/local_runner.py b/packages/populace-build/src/populace/build/uk/local_runner.py index 04b2e1a..2e6ea6e 100644 --- a/packages/populace-build/src/populace/build/uk/local_runner.py +++ b/packages/populace-build/src/populace/build/uk/local_runner.py @@ -20,13 +20,17 @@ from populace.build.uk.local_geography import ( StackedLocalMatrix, area_support_summary, + assigned_weights_to_long, + build_assigned_local_matrix, build_stacked_local_matrix, + rowwise_assignment_column, sort_households_by_id, stacked_weights_to_long, write_long_geography_weights, ) from populace.build.uk.local_solver import ( StackedLocalSolveResult, + solve_assigned_local_weights, solve_stacked_local_weights, ) from populace.build.uk.local_targets import ( @@ -44,6 +48,7 @@ class UKLocalCandidateResult: solve_result: StackedLocalSolveResult long_weights: pd.DataFrame support_summary: pd.DataFrame + support_mode: str def read_local_table(path: str | Path) -> pd.DataFrame: @@ -260,6 +265,8 @@ def build_local_candidate( max_areas: int | None = None, source_year: int | None = None, weight_source: str = "populace_uk_local", + support_mode: str = "auto", + assignment_column: str | None = None, solver_options: Mapping[str, Any] | None = None, ) -> UKLocalCandidateResult: """Build, solve, and export a UK local candidate in longwise form.""" @@ -281,32 +288,81 @@ def build_local_candidate( code_column=code_column, group_column=group_column, ) - household_ids = households["household_id"].to_numpy() - base_weights = households["household_weight"].to_numpy(dtype=np.float64) - target_frame = _as_frame(targets) - problem = build_stacked_local_matrix( - metrics, - target_frame, - area_codes=area_codes, - area_groups=area_groups, - household_ids=household_ids, - area_type=area_type, - code_column=code_column, - ) - solve_result = solve_stacked_local_weights( - problem, - base_weights, - **dict(solver_options or {}), - ) - long_weights = stacked_weights_to_long( - solve_result.weights, - area_codes, - household_ids, + resolved_support_mode = _resolve_support_mode( + support_mode, area_type=area_type, household_frame=households, - source_year=source_year, - weight_source=weight_source, + assignment_column=assignment_column, ) + if resolved_support_mode == "assigned": + households = _filter_assigned_households_to_areas( + households, + area_codes=area_codes, + area_type=area_type, + assignment_column=assignment_column, + ) + metrics = _subset_metric_tables_to_households( + metrics, + households["household_id"].to_numpy(), + ) + household_ids = households["household_id"].to_numpy() + base_weights = households["household_weight"].to_numpy(dtype=np.float64) + target_frame = _as_frame(targets) + solver_config = dict(solver_options or {}) + if resolved_support_mode == "assigned": + problem = build_assigned_local_matrix( + metrics, + target_frame, + household_frame=households, + area_codes=area_codes, + area_groups=area_groups, + household_ids=household_ids, + area_type=area_type, + code_column=code_column, + assignment_column=assignment_column, + ) + solve_result = solve_assigned_local_weights( + problem, + base_weights, + **solver_config, + ) + min_initial_weight = float(solver_config.get("min_initial_weight", 1e-4)) + long_weights = assigned_weights_to_long( + solve_result.weights, + area_codes, + household_ids, + area_type=area_type, + household_frame=households, + assignment_column=assignment_column, + base_weights=base_weights, + drop_weight_atol=min_initial_weight, + source_year=source_year, + weight_source=weight_source, + ) + else: + problem = build_stacked_local_matrix( + metrics, + target_frame, + area_codes=area_codes, + area_groups=area_groups, + household_ids=household_ids, + area_type=area_type, + code_column=code_column, + ) + solve_result = solve_stacked_local_weights( + problem, + base_weights, + **solver_config, + ) + long_weights = stacked_weights_to_long( + solve_result.weights, + area_codes, + household_ids, + area_type=area_type, + household_frame=households, + source_year=source_year, + weight_source=weight_source, + ) return UKLocalCandidateResult( problem=problem, solve_result=solve_result, @@ -316,6 +372,7 @@ def build_local_candidate( area_codes=area_codes, area_type=area_type, ), + support_mode=resolved_support_mode, ) @@ -333,12 +390,16 @@ def build_local_candidate_from_dataset( max_areas: int | None = None, source_year: int | None = None, weight_source: str = "populace_uk_local", + support_mode: str = "auto", + assignment_column: str | None = None, simulation_factory: Callable[[Any], Any] | None = None, solver_options: Mapping[str, Any] | None = None, ) -> UKLocalCandidateResult: """Build a UK local candidate from a Populace UK H5 or dataset object.""" - dataset_obj = load_uk_dataset(dataset) if isinstance(dataset, str | Path) else dataset + dataset_obj = ( + load_uk_dataset(dataset) if isinstance(dataset, str | Path) else dataset + ) areas = prepare_area_frame( area_frame, code_column=code_column, @@ -375,6 +436,8 @@ def build_local_candidate_from_dataset( sort_areas_by_code=False, source_year=source_year, weight_source=weight_source, + support_mode=support_mode, + assignment_column=assignment_column, solver_options=solver_options, ) @@ -394,6 +457,7 @@ def summarize_local_candidate(result: UKLocalCandidateResult) -> dict[str, Any]: "n_targets": int(len(result.problem.targets)), "n_long_rows": int(len(result.long_weights)), "n_nonzero": int(result.solve_result.n_nonzero), + "support_mode": result.support_mode, "initial_loss": float(result.solve_result.initial_loss), "final_loss": float(result.solve_result.final_loss), "weight_sum": float(result.long_weights["weight"].sum()), @@ -418,14 +482,10 @@ def summarize_local_candidate(result: UKLocalCandidateResult) -> dict[str, Any]: 0 if support.empty else int(support["nonzero_source_households"].max()) ), "min_area_effective_sample_size": ( - 0.0 - if support.empty - else float(support["effective_sample_size"].min()) + 0.0 if support.empty else float(support["effective_sample_size"].min()) ), "median_area_effective_sample_size": ( - 0.0 - if support.empty - else float(support["effective_sample_size"].median()) + 0.0 if support.empty else float(support["effective_sample_size"].median()) ), } @@ -465,6 +525,109 @@ def _normalise_nonblank_strings(values: pd.Series, *, column: str) -> pd.Series: return strings +def _resolve_support_mode( + support_mode: str, + *, + area_type: str, + household_frame: pd.DataFrame, + assignment_column: str | None, +) -> str: + mode = str(support_mode).strip().lower() + valid_modes = {"auto", "assigned", "stacked"} + if mode not in valid_modes: + raise ValueError(f"support_mode must be one of {sorted(valid_modes)}.") + if mode == "stacked": + return mode + try: + column = rowwise_assignment_column( + area_type, + assignment_column=assignment_column, + ) + except ValueError: + if mode == "auto": + return "stacked" + raise + if mode == "assigned": + return mode + return "assigned" if column in household_frame.columns else "stacked" + + +def _filter_assigned_households_to_areas( + households: pd.DataFrame, + *, + area_codes: Sequence[str], + area_type: str, + assignment_column: str | None, +) -> pd.DataFrame: + column = rowwise_assignment_column(area_type, assignment_column=assignment_column) + if column not in households.columns: + raise ValueError( + f"household_frame is missing rowwise assignment column {column!r}." + ) + assignments = _normalise_optional_strings(households[column]) + mask = assignments.isin(set(map(str, area_codes))) + filtered = households.loc[mask].reset_index(drop=True) + if filtered.empty: + raise ValueError( + "no households are assigned to the requested local area codes." + ) + return filtered + + +def _subset_metric_tables_to_households( + metrics: pd.DataFrame | Mapping[str, pd.DataFrame], + household_ids: Sequence[Any], +) -> pd.DataFrame | dict[str, pd.DataFrame]: + if isinstance(metrics, pd.DataFrame): + return _subset_metric_table_to_households( + metrics, + household_ids, + group="__all__", + ) + return { + str(group): _subset_metric_table_to_households( + frame, + household_ids, + group=str(group), + ) + for group, frame in metrics.items() + } + + +def _subset_metric_table_to_households( + table: pd.DataFrame, + household_ids: Sequence[Any], + *, + group: str, +) -> pd.DataFrame: + expected = pd.Index(household_ids) + if expected.has_duplicates: + duplicates = expected[expected.duplicated()].unique() + raise ValueError( + "assigned household IDs must be unique before metric subsetting; " + f"duplicate value(s): {list(map(str, duplicates[:5]))}." + ) + if table.index.has_duplicates: + duplicates = table.index[table.index.duplicated()].unique() + raise ValueError( + f"metric table {group!r} household index must be unique; " + f"duplicate value(s): {list(map(str, duplicates[:5]))}." + ) + missing = expected.difference(table.index) + if len(missing): + raise ValueError( + f"metric table {group!r} is missing household_id value(s): " + f"{list(map(str, missing[:5]))}." + ) + return table.reindex(expected) + + +def _normalise_optional_strings(values: pd.Series) -> pd.Series: + missing = values.isna() + strings = values.astype(str).str.strip() + return strings.mask(missing | (strings == ""), None) + + def _source_household_keys( household_frame: pd.DataFrame, *, @@ -494,9 +657,7 @@ def _metric_table_from_frame( group: str, ) -> pd.DataFrame: if household_id_column not in frame.columns: - raise ValueError( - f"metric table {group!r} is missing {household_id_column!r}." - ) + raise ValueError(f"metric table {group!r} is missing {household_id_column!r}.") table = frame.copy() if table[household_id_column].isna().any(): raise ValueError( @@ -552,12 +713,6 @@ def _align_metric_table_to_households( f"metric table {group!r} is missing household_id value(s): " f"{list(map(str, missing[:5]))}." ) - extra = table.index.difference(expected) - if len(extra): - raise ValueError( - f"metric table {group!r} has unexpected household_id value(s): " - f"{list(map(str, extra[:5]))}." - ) return table.reindex(expected) @@ -568,9 +723,7 @@ def _infer_period(dataset: Any, period: int | str | None) -> int | str: value = getattr(dataset, attr, None) if value is not None: return value - raise ValueError( - "period is required when it cannot be inferred from the dataset." - ) + raise ValueError("period is required when it cannot be inferred from the dataset.") def _default_uk_simulation_factory(dataset: Any) -> Any: diff --git a/packages/populace-build/src/populace/build/uk/local_solver.py b/packages/populace-build/src/populace/build/uk/local_solver.py index 5560f2b..d2b9705 100644 --- a/packages/populace-build/src/populace/build/uk/local_solver.py +++ b/packages/populace-build/src/populace/build/uk/local_solver.py @@ -1,4 +1,4 @@ -"""Solver wrapper for UK stacked local-geography weights.""" +"""Solver wrappers for UK local-geography weights.""" from __future__ import annotations @@ -72,6 +72,100 @@ def solve_stacked_local_weights( problem.n_areas, min_weight=min_initial_weight, ) + if len(initial_weights) != problem.matrix.shape[1]: + raise ValueError( + "base_weights expanded to the wrong stacked length: " + f"{len(initial_weights)} vs {problem.matrix.shape[1]}." + ) + return _solve_local_weights( + problem, + initial_weights, + epochs=epochs, + learning_rate=learning_rate, + max_weight_ratio=max_weight_ratio, + conserve_mass=conserve_mass, + target_records=target_records, + l0_lambda=l0_lambda, + target_loss_weights=target_loss_weights, + target_loss_scales=target_loss_scales, + target_loss_cap=target_loss_cap, + budget_iters=budget_iters, + seed=seed, + ) + + +def solve_assigned_local_weights( + problem: StackedLocalMatrix, + base_weights: Sequence[float], + *, + epochs: int = 512, + learning_rate: float = 0.15, + max_weight_ratio: float | None = None, + conserve_mass: bool = False, + target_records: int | None = None, + l0_lambda: float = 0.0, + min_initial_weight: float = 1e-4, + target_loss_weights: Sequence[float] | None = None, + target_loss_scales: Sequence[float] | None = None, + target_loss_cap: float = 10.0, + budget_iters: int = 10, + seed: int = 0, +) -> StackedLocalSolveResult: + """Solve rowwise-assigned local weights for a Populace UK local build. + + ``base_weights`` align one-to-one with the household columns in ``problem``. + The optional ``min_initial_weight`` floor mirrors the stacked solver and is + required by the torch log-weight optimizer. The assigned path defaults to + no ``max_weight_ratio`` cap so zero-weight support rows, such as synthetic + SPI rows, can be upweighted from the optimizer floor. + """ + + weights = np.asarray(base_weights, dtype=np.float64) + if weights.ndim != 1: + raise ValueError("base_weights must be one-dimensional.") + if not np.isfinite(weights).all() or (weights < 0).any(): + raise ValueError("base_weights must be finite and non-negative.") + if not np.isfinite(min_initial_weight) or min_initial_weight < 0: + raise ValueError("min_initial_weight must be finite and non-negative.") + initial_weights = np.maximum(weights, min_initial_weight) + if len(initial_weights) != problem.matrix.shape[1]: + raise ValueError( + "base_weights must align with the assigned local matrix columns: " + f"{len(initial_weights)} vs {problem.matrix.shape[1]}." + ) + return _solve_local_weights( + problem, + initial_weights, + epochs=epochs, + learning_rate=learning_rate, + max_weight_ratio=max_weight_ratio, + conserve_mass=conserve_mass, + target_records=target_records, + l0_lambda=l0_lambda, + target_loss_weights=target_loss_weights, + target_loss_scales=target_loss_scales, + target_loss_cap=target_loss_cap, + budget_iters=budget_iters, + seed=seed, + ) + + +def _solve_local_weights( + problem: StackedLocalMatrix, + initial_weights: np.ndarray, + *, + epochs: int, + learning_rate: float, + max_weight_ratio: float | None, + conserve_mass: bool, + target_records: int | None, + l0_lambda: float, + target_loss_weights: Sequence[float] | None, + target_loss_scales: Sequence[float] | None, + target_loss_cap: float, + budget_iters: int, + seed: int, +) -> StackedLocalSolveResult: targets = np.asarray(problem.targets, dtype=np.float64) scales = ( default_target_loss_scales(targets) @@ -93,14 +187,9 @@ def solve_stacked_local_weights( "target_loss_weights must align with targets, got " f"{loss_weights.shape} vs {targets.shape}." ) - if len(initial_weights) != problem.matrix.shape[1]: - raise ValueError( - "base_weights expanded to the wrong stacked length: " - f"{len(initial_weights)} vs {problem.matrix.shape[1]}." - ) if (initial_weights <= 0).any(): raise ValueError( - "all expanded initial weights must be strictly positive for the " + "all initial weights must be strictly positive for the " "log-weight optimizer; use a positive min_initial_weight or remove " "zero-weight records before solving." ) diff --git a/packages/populace-build/src/populace/build/uk/source_stages.json b/packages/populace-build/src/populace/build/uk/source_stages.json new file mode 100644 index 0000000..91ab42e --- /dev/null +++ b/packages/populace-build/src/populace/build/uk/source_stages.json @@ -0,0 +1,990 @@ +{ + "version": 1, + "country": "uk", + "policy": "UK source stages are manifest-defined. Country/source content may declare primary artifacts, columns, sentinel handling, derivations, imputation recipes, outputs, and validation requirements here; executable Python belongs only in shared Populace runtimes.", + "stages": [ + { + "stage": "frs_base", + "survey": "Family Resources Survey 2023-24", + "source": "https://www.gov.uk/government/collections/family-resources-survey--2", + "grain": "household_person_benunit", + "artifacts": [ + { + "kind": "survey_microdata", + "format": "tabular_release", + "vintage": "2023-24", + "locator": "DWP Family Resources Survey household, benefit unit, and person tables" + } + ], + "operations": [ + { + "kind": "read_tables", + "tables": [ + "frs_household", + "frs_benunit", + "frs_person" + ], + "weight": "household_weight" + }, + { + "kind": "replace_sentinels", + "scope": "frs_missing_or_not_applicable_fields" + }, + { + "kind": "derive", + "outputs": [ + "entity_ids", + "source_household_lineage", + "household_income_predictors", + "education_counts", + "housing_predictors" + ] + } + ], + "outputs": [ + "household_id", + "benunit_id", + "person_id", + "person_household_id", + "person_benunit_id", + "household_weight", + "region", + "age", + "gender", + "employment_income", + "self_employment_income", + "private_pension_income", + "capital_income", + "household_net_income", + "hbai_household_net_income", + "tenure_type", + "accommodation_type", + "num_adults", + "num_children", + "num_bedrooms", + "council_tax" + ], + "nonnegative_outputs": [ + "household_weight", + "age", + "employment_income", + "self_employment_income", + "private_pension_income", + "household_net_income", + "hbai_household_net_income", + "num_adults", + "num_children", + "num_bedrooms", + "council_tax" + ], + "notes": "This stage owns the raw FRS base entities and lineage. The compact UK artifact remains the fast national input; local variants can pool years or clone from this base before row-wise geography assignment." + }, + { + "stage": "was_wealth", + "survey": "Wealth and Assets Survey", + "source": "https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/debt/methodologies/wealthandassetssurveyqmi", + "grain": "household", + "artifacts": [ + { + "kind": "survey_microdata", + "format": "secure_or_licensed_extract", + "vintage": "latest_available", + "locator": "ONS Wealth and Assets Survey household/person extract" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "was_household", + "weight": "weight" + }, + { + "kind": "replace_sentinels", + "scope": "was_missing_or_not_applicable_fields" + }, + { + "kind": "fit_weighted_qrf", + "predictors": [ + "household_net_income", + "num_adults", + "num_children", + "private_pension_income", + "employment_income", + "self_employment_income", + "capital_income", + "num_bedrooms", + "council_tax", + "is_renting", + "region" + ] + }, + { + "kind": "support_clip", + "range": "donor_realized" + } + ], + "outputs": [ + "owned_land", + "property_wealth", + "corporate_wealth", + "gross_financial_wealth", + "net_financial_wealth", + "main_residence_value", + "other_residential_property_value", + "non_residential_property_value", + "savings", + "num_vehicles", + "student_loan_balance", + "mortgage_debt", + "consumer_debt" + ], + "nonnegative_outputs": [ + "owned_land", + "property_wealth", + "corporate_wealth", + "gross_financial_wealth", + "main_residence_value", + "other_residential_property_value", + "non_residential_property_value", + "savings", + "num_vehicles", + "student_loan_balance", + "mortgage_debt", + "consumer_debt" + ], + "notes": "Northern Ireland can borrow the Wales region support when the donor survey does not identify Northern Ireland with enough detail." + }, + { + "stage": "regional_property_uprating", + "survey": "UK House Price Index and regional land-value tables", + "source": "https://www.gov.uk/government/collections/uk-house-price-index-reports", + "grain": "household", + "artifacts": [ + { + "kind": "administrative_table", + "format": "published_table", + "vintage": "build_year", + "locator": "UK House Price Index regional series and land-value adjustment tables" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "regional_property_uprating" + }, + { + "kind": "uprate", + "from_year": "wealth_survey_vintage", + "to_year_from_build_config": true, + "by": [ + "region", + "property_type" + ] + } + ], + "outputs": [ + "property_wealth", + "main_residence_value" + ], + "nonnegative_outputs": [ + "property_wealth", + "main_residence_value" + ] + }, + { + "stage": "lcfs_consumption", + "survey": "Living Costs and Food Survey", + "source": "https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/methodologies/livingcostsandfoodsurveyqmi", + "grain": "household", + "artifacts": [ + { + "kind": "survey_microdata", + "format": "licensed_extract", + "vintage": "latest_available", + "locator": "ONS Living Costs and Food Survey household/person extract" + } + ], + "operations": [ + { + "kind": "read_tables", + "tables": [ + "lcfs_household", + "lcfs_person" + ], + "weight": "weight" + }, + { + "kind": "derive", + "outputs": [ + "lcfs_coicop_consumption", + "lcfs_fuel_spending", + "has_fuel_consumption" + ] + }, + { + "kind": "fit_weighted_qrf", + "predictors": [ + "is_adult", + "is_child", + "region", + "employment_income", + "self_employment_income", + "private_pension_income", + "hbai_household_net_income", + "tenure_type", + "accommodation_type", + "has_fuel_consumption" + ] + }, + { + "kind": "support_clip", + "range": "donor_realized" + } + ], + "outputs": [ + "has_fuel_consumption", + "food_and_non_alcoholic_beverages_consumption", + "alcohol_and_tobacco_consumption", + "clothing_and_footwear_consumption", + "housing_water_and_electricity_consumption", + "household_furnishings_consumption", + "health_consumption", + "transport_consumption", + "communication_consumption", + "recreation_consumption", + "education_consumption", + "restaurants_and_hotels_consumption", + "miscellaneous_consumption", + "petrol_spending", + "diesel_spending", + "domestic_energy_consumption", + "electricity_consumption", + "gas_consumption" + ], + "nonnegative_outputs": [ + "food_and_non_alcoholic_beverages_consumption", + "alcohol_and_tobacco_consumption", + "clothing_and_footwear_consumption", + "housing_water_and_electricity_consumption", + "household_furnishings_consumption", + "health_consumption", + "transport_consumption", + "communication_consumption", + "recreation_consumption", + "education_consumption", + "restaurants_and_hotels_consumption", + "miscellaneous_consumption", + "petrol_spending", + "diesel_spending", + "domestic_energy_consumption", + "electricity_consumption", + "gas_consumption" + ], + "notes": "The fuel-consumption bridge uses the WAS vehicle signal on recipient households and LCFS fuel purchases on donors." + }, + { + "stage": "road_fuel_energy_calibration", + "survey": "Road fuel and household energy administrative totals", + "source": "https://www.gov.uk/government/collections/road-transport-consumption-at-regional-and-local-level", + "grain": "household", + "artifacts": [ + { + "kind": "administrative_table", + "format": "published_table", + "vintage": "build_year", + "locator": "DfT road-fuel consumption totals and DESNZ household energy tables" + } + ], + "operations": [ + { + "kind": "read_tables", + "tables": [ + "road_fuel_consumption", + "domestic_energy_targets" + ] + }, + { + "kind": "uprate", + "variables": [ + "petrol_spending", + "diesel_spending", + "electricity_consumption", + "gas_consumption" + ], + "targets": [ + "road_fuel_consumption", + "domestic_energy_targets" + ], + "weight": "household_weight" + }, + { + "kind": "derive", + "outputs": [ + "domestic_energy_consumption" + ], + "formula": "electricity_consumption + gas_consumption" + } + ], + "outputs": [ + "petrol_spending", + "diesel_spending", + "domestic_energy_consumption", + "electricity_consumption", + "gas_consumption" + ], + "nonnegative_outputs": [ + "petrol_spending", + "diesel_spending", + "domestic_energy_consumption", + "electricity_consumption", + "gas_consumption" + ] + }, + { + "stage": "etb_vat", + "survey": "Effects of Taxes and Benefits", + "source": "https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/datasets/theeffectsoftaxesandbenefitsonhouseholdincomehistoricaldatasets", + "grain": "household", + "artifacts": [ + { + "kind": "published_microdata_table", + "format": "spreadsheet", + "vintage": "build_year", + "locator": "ONS Effects of Taxes and Benefits indirect-tax tables" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "etb_household" + }, + { + "kind": "derive", + "outputs": [ + "full_rate_vat_expenditure_rate" + ] + }, + { + "kind": "fit_weighted_qrf", + "predictors": [ + "is_adult", + "is_child", + "is_SP_age", + "household_net_income" + ] + }, + { + "kind": "support_clip", + "range": "donor_realized" + } + ], + "outputs": [ + "full_rate_vat_expenditure_rate" + ], + "nonnegative_outputs": [ + "full_rate_vat_expenditure_rate" + ] + }, + { + "stage": "nhs_usage", + "survey": "NHS activity and unit-cost tables", + "source": "https://www.england.nhs.uk/statistics/statistical-work-areas/hospital-activity/monthly-hospital-activity/", + "grain": "person", + "artifacts": [ + { + "kind": "administrative_table", + "format": "published_table", + "vintage": "build_year", + "locator": "NHS activity counts and service-cost totals by age/sex where available" + } + ], + "operations": [ + { + "kind": "read_tables", + "tables": [ + "nhs_activity", + "nhs_unit_costs" + ] + }, + { + "kind": "fit_weighted_imputer", + "predictors": [ + "age", + "gender", + "region", + "disability_benefit_indicators", + "hbai_household_net_income" + ] + }, + { + "kind": "derive", + "outputs": [ + "nhs_visits", + "nhs_spending" + ] + }, + { + "kind": "support_clip", + "range": "administrative_nonnegative" + } + ], + "outputs": [ + "a_and_e_visits", + "admitted_patient_visits", + "outpatient_visits", + "nhs_a_and_e_spending", + "nhs_admitted_patient_spending", + "nhs_outpatient_spending", + "nhs_visits", + "nhs_spending" + ], + "nonnegative_outputs": [ + "a_and_e_visits", + "admitted_patient_visits", + "outpatient_visits", + "nhs_a_and_e_spending", + "nhs_admitted_patient_spending", + "nhs_outpatient_spending", + "nhs_visits", + "nhs_spending" + ] + }, + { + "stage": "etb_public_services", + "survey": "Effects of Taxes and Benefits public-service tables", + "source": "https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/datasets/theeffectsoftaxesandbenefitsonhouseholdincomehistoricaldatasets", + "grain": "household", + "artifacts": [ + { + "kind": "published_microdata_table", + "format": "spreadsheet", + "vintage": "build_year", + "locator": "ONS Effects of Taxes and Benefits benefits-in-kind tables" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "etb_household_services" + }, + { + "kind": "derive", + "outputs": [ + "rail_usage" + ] + }, + { + "kind": "fit_weighted_qrf", + "predictors": [ + "is_adult", + "is_child", + "is_SP_age", + "count_primary_education", + "count_secondary_education", + "count_further_education", + "dla", + "pip", + "hbai_household_net_income" + ] + }, + { + "kind": "support_clip", + "range": "donor_realized" + } + ], + "outputs": [ + "dfe_education_spending", + "rail_subsidy_spending", + "bus_subsidy_spending", + "rail_usage" + ], + "nonnegative_outputs": [ + "dfe_education_spending", + "rail_subsidy_spending", + "bus_subsidy_spending", + "rail_usage" + ] + }, + { + "stage": "rail_public_service_calibration", + "survey": "Rail public-service administrative totals", + "source": "https://www.gov.uk/government/collections/rail-statistics", + "grain": "household", + "artifacts": [ + { + "kind": "administrative_table", + "format": "published_table", + "vintage": "build_year", + "locator": "DfT rail passenger and subsidy totals" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "rail_public_service_targets" + }, + { + "kind": "uprate", + "variables": [ + "rail_subsidy_spending", + "rail_usage" + ], + "targets": [ + "rail_public_service_targets" + ], + "weight": "household_weight" + } + ], + "outputs": [ + "rail_subsidy_spending", + "rail_usage" + ], + "nonnegative_outputs": [ + "rail_subsidy_spending", + "rail_usage" + ], + "notes": "This post-weight stage scales rail usage and subsidy after the local/final household weights are available." + }, + { + "stage": "spi_support_channel", + "survey": "Family Resources Survey 2023-24 support copy", + "source": "https://www.gov.uk/government/collections/family-resources-survey--2", + "grain": "household_person_benunit", + "artifacts": [ + { + "kind": "derived_support_frame", + "format": "in_memory_tables", + "vintage": "2023-24", + "locator": "zero-weight FRS support channel for high-income SPI imputation" + } + ], + "operations": [ + { + "kind": "derive", + "outputs": [ + "zero_weight_spi_support_copy", + "source_household_lineage" + ] + } + ], + "outputs": [ + "household_is_spi_synthetic", + "household_support_channel", + "person_support_channel", + "benunit_support_channel", + "household_support_clone_index", + "person_support_clone_index", + "benunit_support_clone_index", + "household_source_id", + "person_source_id", + "benunit_source_id", + "source_household_id", + "source_household_key" + ], + "notes": "The support copy has zero initial household weight and source-household lineage so local-geography support accounting does not count it as independent FRS sample." + }, + { + "stage": "spi_income", + "survey": "Survey of Personal Incomes", + "source": "https://www.gov.uk/government/collections/personal-incomes-statistics", + "grain": "person", + "artifacts": [ + { + "kind": "administrative_microdata_or_tabulation", + "format": "hmrc_spi_extract", + "vintage": "latest_available", + "locator": "HMRC Survey of Personal Incomes person-level income and relief records" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "spi_person", + "weight": "spi_weight" + }, + { + "kind": "fit_weighted_qrf", + "predictors": [ + "age", + "gender", + "region" + ] + }, + { + "kind": "support_clip", + "range": "donor_realized" + } + ], + "outputs": [ + "employment_income", + "self_employment_income", + "savings_interest_income", + "dividend_income", + "private_pension_income", + "property_income", + "gift_aid", + "charitable_investment_gifts" + ], + "nonnegative_outputs": [ + "employment_income", + "self_employment_income", + "savings_interest_income", + "dividend_income", + "private_pension_income", + "property_income", + "gift_aid", + "charitable_investment_gifts" + ], + "notes": "The SPI-trained first stage fills the SPI support channel jointly for income components, Gift Aid, and qualifying investment gifts." + }, + { + "stage": "frs_only_spi_fill", + "survey": "Family Resources Survey 2023-24", + "source": "https://www.gov.uk/government/collections/family-resources-survey--2", + "grain": "person", + "artifacts": [ + { + "kind": "survey_microdata", + "format": "tabular_release", + "vintage": "2023-24", + "locator": "DWP Family Resources Survey person-level pension, savings, and reported-benefit fields" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "frs_person" + }, + { + "kind": "fit_weighted_qrf", + "predictors": [ + "age", + "gender", + "region", + "employment_income", + "self_employment_income", + "savings_interest_income", + "dividend_income", + "private_pension_income", + "property_income" + ] + }, + { + "kind": "support_clip", + "range": "donor_realized" + } + ], + "outputs": [ + "employee_pension_contributions", + "employer_pension_contributions", + "personal_pension_contributions", + "pension_contributions_via_salary_sacrifice", + "tax_free_savings_income", + "universal_credit_reported", + "pension_credit_reported", + "child_benefit_reported", + "housing_benefit_reported", + "income_support_reported", + "working_tax_credit_reported", + "child_tax_credit_reported", + "attendance_allowance_reported", + "state_pension_reported", + "dla_sc_reported", + "dla_m_reported", + "pip_m_reported", + "pip_dl_reported", + "sda_reported", + "carers_allowance_reported", + "iidb_reported", + "afcs_reported", + "bsp_reported", + "incapacity_benefit_reported", + "maternity_allowance_reported", + "winter_fuel_allowance_reported", + "council_tax_benefit_reported", + "jsa_contrib_reported", + "jsa_income_reported", + "esa_contrib_reported", + "esa_income_reported" + ], + "nonnegative_outputs": [ + "employee_pension_contributions", + "employer_pension_contributions", + "personal_pension_contributions", + "pension_contributions_via_salary_sacrifice", + "tax_free_savings_income", + "universal_credit_reported", + "pension_credit_reported", + "child_benefit_reported", + "housing_benefit_reported", + "income_support_reported", + "working_tax_credit_reported", + "child_tax_credit_reported", + "attendance_allowance_reported", + "state_pension_reported", + "dla_sc_reported", + "dla_m_reported", + "pip_m_reported", + "pip_dl_reported", + "sda_reported", + "carers_allowance_reported", + "iidb_reported", + "afcs_reported", + "bsp_reported", + "incapacity_benefit_reported", + "maternity_allowance_reported", + "winter_fuel_allowance_reported", + "council_tax_benefit_reported", + "jsa_contrib_reported", + "jsa_income_reported", + "esa_contrib_reported", + "esa_income_reported" + ], + "notes": "This stage replaces benefit receipt and pension/savings behavior on SPI support rows with draws conditional on the SPI-imputed income surface." + }, + { + "stage": "advani_summers_capital_gains", + "survey": "Advani-Summers capital gains distribution", + "source": "https://ideas.repec.org/p/hal/wpaper/halshs-03022609.html", + "grain": "household", + "artifacts": [ + { + "kind": "research_table", + "format": "csv", + "vintage": "latest_available", + "locator": "capital gains distribution by income/rank cell" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "capital_gains_distribution" + }, + { + "kind": "calibrate_binary_assignment", + "variable": "household_is_capital_gains_clone", + "weight": "household_weight" + }, + { + "kind": "fit_weighted_imputer", + "predictors": [ + "household_net_income", + "employment_income", + "self_employment_income", + "dividend_income", + "property_income" + ] + }, + { + "kind": "support_clip", + "range": "donor_realized" + } + ], + "outputs": [ + "capital_gains", + "household_is_capital_gains_clone" + ], + "nonnegative_outputs": [ + "capital_gains" + ] + }, + { + "stage": "frs_salary_sacrifice", + "survey": "Family Resources Survey salary-sacrifice subsample", + "source": "https://www.gov.uk/government/collections/family-resources-survey--2", + "grain": "person", + "artifacts": [ + { + "kind": "survey_microdata", + "format": "tabular_release", + "vintage": "2023-24", + "locator": "FRS person-level salary-sacrifice fields with OBR/ASHE aggregate target" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "frs_person_salary_sacrifice" + }, + { + "kind": "fit_weighted_qrf", + "predictors": [ + "age", + "employment_income" + ] + }, + { + "kind": "fold_into", + "target": "employee_pension_contributions", + "amount": "pension_contributions_via_salary_sacrifice" + }, + { + "kind": "support_clip", + "range": "donor_realized" + } + ], + "outputs": [ + "pension_contributions_via_salary_sacrifice", + "employee_pension_contributions" + ], + "nonnegative_outputs": [ + "pension_contributions_via_salary_sacrifice", + "employee_pension_contributions" + ] + }, + { + "stage": "slc_student_loan_plan", + "survey": "Student Loans Company repayment-plan statistics", + "source": "https://www.gov.uk/government/collections/student-loans-for-higher-and-further-education", + "grain": "person", + "artifacts": [ + { + "kind": "administrative_table", + "format": "published_table", + "vintage": "build_year", + "locator": "SLC borrower plan snapshot by cohort and geography" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "slc_student_loan_snapshot" + }, + { + "kind": "assign_by_plan_type", + "predictors": [ + "age", + "student_loan_balance", + "education_status", + "region" + ], + "output": "student_loan_plan" + } + ], + "outputs": [ + "student_loan_plan" + ] + }, + { + "stage": "rowwise_oa_geography", + "survey": "UK official small-area geography crosswalks", + "source": "https://geoportal.statistics.gov.uk/", + "grain": "household", + "artifacts": [ + { + "kind": "public_geography", + "format": "csv_or_geojson", + "vintage": "build_year", + "locator": "ONS, NRS, NISRA, and postcode-directory OA/DZ to LA/constituency crosswalks" + } + ], + "operations": [ + { + "kind": "read_tables", + "tables": [ + "uk_official_geography_crosswalk", + "household_region_frame" + ] + }, + { + "kind": "derive", + "outputs": [ + "rowwise_household_clones", + "finest_available_geography_assignment" + ] + }, + { + "kind": "join", + "on": [ + "household_id", + "source_household_id" + ] + } + ], + "outputs": [ + "oa_code", + "lsoa_code", + "msoa_code", + "la_code_oa", + "constituency_code_oa", + "region_code_oa" + ], + "notes": "This is the long-format local geography path: each household row receives one finest-area assignment, and later local weights export as area-household rows rather than a dense area-by-household matrix." + }, + { + "stage": "national_calibration", + "survey": "UK national calibration target registry", + "source": "https://github.com/PolicyEngine/populace/tree/main/packages/populace-calibrate", + "grain": "household", + "artifacts": [ + { + "kind": "target_registry", + "format": "json_or_yaml", + "vintage": "build_year", + "locator": "Populace national calibration target registry and supplied administrative target tables" + } + ], + "operations": [ + { + "kind": "read_tables", + "tables": [ + "national_calibration_targets", + "household_metric_tables" + ] + }, + { + "kind": "calibrate_weights", + "weight": "household_weight", + "outputs": [ + "household_weight" + ] + } + ], + "outputs": [ + "household_weight" + ], + "nonnegative_outputs": [ + "household_weight" + ], + "notes": "This stage rewrites design weights to national target weights before local and post-weight amount scaling stages." + }, + { + "stage": "local_geography_weights", + "survey": "UK local calibration target tables", + "source": "https://github.com/PolicyEngine/populace/tree/main/packages/populace-build", + "grain": "household_area", + "artifacts": [ + { + "kind": "target_tables", + "format": "csv_or_parquet", + "vintage": "build_year", + "locator": "Explicit constituency and local-authority target tables supplied to the UK local runner" + } + ], + "operations": [ + { + "kind": "read_tables", + "tables": [ + "local_area_targets", + "household_metric_tables" + ] + }, + { + "kind": "calibrate_weights", + "weight": "household_weight", + "outputs": [ + "local_geography_weight" + ] + } + ], + "outputs": [ + "local_geography_weight", + "local_solve_diagnostic" + ], + "nonnegative_outputs": [ + "local_geography_weight" + ], + "notes": "Assigned long local weights are solved after row-wise geography assignment and before post-weight rail/fuel amount scaling." + } + ] +} diff --git a/packages/populace-build/src/populace/build/uk/spi_support.py b/packages/populace-build/src/populace/build/uk/spi_support.py index 630fda0..9aff3b5 100644 --- a/packages/populace-build/src/populace/build/uk/spi_support.py +++ b/packages/populace-build/src/populace/build/uk/spi_support.py @@ -33,9 +33,9 @@ "property_income", ) -# Mirrors the eFRS SPI-trained first-stage QRF output surface. Gift Aid and -# qualifying investment gifts are relief variables, not income components, but -# they need to be drawn jointly with high-income SPI rows. +# UK SPI-trained first-stage QRF output surface. Gift Aid and qualifying +# investment gifts are relief variables, not income components, but they need +# to be drawn jointly with high-income SPI rows. SPI_INCOME_IMPUTATION_COLUMNS = SPI_INCOME_COMPONENT_COLUMNS + ( "gift_aid", "charitable_investment_gifts", @@ -48,9 +48,9 @@ *SPI_INCOME_COMPONENT_COLUMNS, ) -# Mirrors the eFRS second-stage FRS-only QRF output surface. These fields are -# replaced on SPI support rows so high-income synthetic rows do not retain a -# random middle-income FRS donor's benefit receipt or pension behavior. +# UK second-stage FRS-only QRF output surface. These fields are replaced on SPI +# support rows so high-income synthetic rows do not retain a random +# middle-income FRS donor's benefit receipt or pension behavior. FRS_ONLY_SPI_FILL_PERSON_COLUMNS = ( "employee_pension_contributions", "employer_pension_contributions", @@ -245,7 +245,7 @@ def fill_support_channel_from_source( QRF prediction frame keyed by original ``person_id``. Rows outside ``channel`` are left unchanged. Missing target columns are initialized to ``fill_missing_columns_with`` before the channel-specific update, matching - the eFRS treatment of SPI-only variables such as charitable-giving fields. + the UK SPI treatment of variables such as charitable-giving fields. """ entity = _require_entity(entity) diff --git a/packages/populace-build/tests/test_uk_local_geography.py b/packages/populace-build/tests/test_uk_local_geography.py index 9ba714f..1363f61 100644 --- a/packages/populace-build/tests/test_uk_local_geography.py +++ b/packages/populace-build/tests/test_uk_local_geography.py @@ -7,6 +7,8 @@ from populace.build.uk import ( LONG_GEOGRAPHY_COLUMNS, area_support_summary, + assigned_weights_to_long, + build_assigned_local_matrix, build_stacked_local_matrix, sort_households_by_id, stacked_design_weights, @@ -82,6 +84,61 @@ def test_build_stacked_local_matrix_uses_area_blocks_and_group_metrics() -> None np.testing.assert_allclose(dense[3], [0.0, 0.0, 0.0, 0.0, 2.0, 2.0]) +def test_build_assigned_local_matrix_uses_rowwise_area_assignments() -> None: + metrics = { + "England": pd.DataFrame( + { + "population": [1.0, 2.0, 3.0], + "earnings": [10.0, 20.0, 30.0], + }, + index=[101, 102, 103], + ), + "Scotland": pd.DataFrame( + { + "population": [100.0, 200.0, 300.0], + "earnings": [1000.0, 2000.0, 3000.0], + }, + index=[101, 102, 103], + ), + } + targets = pd.DataFrame( + { + "code": ["S001", "E001"], + "population": [300.0, 1.0], + "earnings": [3000.0, 10.0], + } + ) + households = pd.DataFrame( + { + "household_id": [103, 101, 102], + "constituency_code_oa": ["S001", "E001", "E999"], + } + ) + + assigned = build_assigned_local_matrix( + metrics, + targets, + household_frame=households, + area_codes=["E001", "S001"], + area_groups={"E001": "England", "S001": "Scotland"}, + household_ids=[101, 102, 103], + ) + + assert assigned.matrix.shape == (4, 3) + assert assigned.targets.tolist() == [1.0, 10.0, 300.0, 3000.0] + assert assigned.target_frame["area_code"].tolist() == [ + "E001", + "E001", + "S001", + "S001", + ] + dense = assigned.matrix.toarray() + np.testing.assert_allclose(dense[0], [1.0, 0.0, 0.0]) + np.testing.assert_allclose(dense[1], [10.0, 0.0, 0.0]) + np.testing.assert_allclose(dense[2], [0.0, 0.0, 300.0]) + np.testing.assert_allclose(dense[3], [0.0, 0.0, 3000.0]) + + def test_build_stacked_local_matrix_rejects_drifted_household_index() -> None: metrics = { "England": pd.DataFrame({"population": [1.0, 2.0]}, index=[101, 102]), @@ -156,6 +213,57 @@ def test_stacked_weights_to_long_preserves_source_metadata() -> None: assert long["clone_index"].tolist() == [0, 0, 3] +def test_assigned_weights_to_long_preserves_metadata_and_filters_area_codes() -> None: + household_frame = pd.DataFrame( + { + "household_id": [102, 103, 101], + "constituency_code_oa": ["S001", "E999", "E001"], + "source_year": [2022, 2021, 2023], + "source_household_id": ["b", "c", "a"], + "source_household_key": ["2022:b", "2021:c", "2023:a"], + "clone_index": [3, 2, 0], + } + ) + + long = assigned_weights_to_long( + [1.5, 2.5, 3.5], + ["E001", "S001"], + [101, 102, 103], + area_type="constituency", + household_frame=household_frame, + ) + + assert tuple(long.columns) == LONG_GEOGRAPHY_COLUMNS + assert long["weight"].tolist() == [1.5, 2.5] + assert long["area_code"].tolist() == ["E001", "S001"] + assert long["area_index"].tolist() == [0, 1] + assert long["household_id"].tolist() == [101, 102] + assert long["source_household_key"].tolist() == ["2023:a", "2022:b"] + assert long["clone_index"].tolist() == [0, 3] + + +def test_assigned_weights_to_long_drops_unused_zero_base_floor_weights() -> None: + household_frame = pd.DataFrame( + { + "household_id": [101, 102, 103], + "constituency_code_oa": ["E001", "E001", "E001"], + } + ) + + long = assigned_weights_to_long( + [1.0, 1e-4, 0.5], + ["E001"], + [101, 102, 103], + area_type="constituency", + household_frame=household_frame, + base_weights=[1.0, 0.0, 0.0], + drop_weight_atol=1e-4, + ) + + assert long["household_id"].tolist() == [101, 103] + assert long["weight"].tolist() == [1.0, 0.5] + + def test_stacked_weights_to_long_rejects_missing_household_metadata() -> None: household_frame = pd.DataFrame({"household_id": [101], "source_year": [2023]}) diff --git a/packages/populace-build/tests/test_uk_local_runner.py b/packages/populace-build/tests/test_uk_local_runner.py index 0ae45b7..046f971 100644 --- a/packages/populace-build/tests/test_uk_local_runner.py +++ b/packages/populace-build/tests/test_uk_local_runner.py @@ -45,9 +45,7 @@ def calculate(self, variable, **_kwargs): def test_prepare_area_frame_sorts_and_validates_codes() -> None: - areas = pd.DataFrame( - {"code": ["S001", "E001"], "country": ["Scotland", "England"]} - ) + areas = pd.DataFrame({"code": ["S001", "E001"], "country": ["Scotland", "England"]}) prepared = prepare_area_frame(areas) @@ -129,10 +127,7 @@ def fake_compute(sim, area_type, *, period=None, household_ids=None): assert set(tables) == {"England", "Scotland"} assert tables["England"].index.tolist() == [101, 102] - regions = [ - sim.inputs[("region", 2023)][0] - for sim in calls - ] + regions = [sim.inputs[("region", 2023)][0] for sim in calls] assert regions == ["SOUTH_EAST", "SCOTLAND"] @@ -165,10 +160,37 @@ def fake_compute(sim, area_type, *, period=None, household_ids=None): assert tables["England"]["population"].tolist() == [10.0, 20.0] -def test_build_local_candidate_solves_and_exports_long_weights() -> None: - areas = pd.DataFrame( - {"code": ["S001", "E001"], "country": ["Scotland", "England"]} +def test_build_metric_tables_from_dataset_allows_selected_households( + monkeypatch, +) -> None: + class ExtraHouseholdSimulation(FakeSimulation): + def calculate(self, variable, **_kwargs): + assert variable == "household_id" + return Result([103, 102, 101]) + + def fake_compute(sim, area_type, *, period=None, household_ids=None): + assert household_ids is None + return pd.DataFrame( + {"population": [30.0, 20.0, 10.0]}, + index=pd.Index([103, 102, 101]), + ) + + monkeypatch.setattr(local_runner, "compute_household_metrics", fake_compute) + + tables = build_metric_tables_from_dataset( + dataset=type("Dataset", (), {"time_period": 2023})(), + area_groups={"E001": "England"}, + area_type="constituency", + household_ids=[101, 102], + simulation_factory=ExtraHouseholdSimulation, ) + + assert tables["England"].index.tolist() == [101, 102] + assert tables["England"]["population"].tolist() == [10.0, 20.0] + + +def test_build_local_candidate_solves_and_exports_long_weights() -> None: + areas = pd.DataFrame({"code": ["S001", "E001"], "country": ["Scotland", "England"]}) targets = pd.DataFrame( { "code": ["E001", "S001"], @@ -210,6 +232,87 @@ def test_build_local_candidate_solves_and_exports_long_weights() -> None: assert "effective_sample_size" in result.support_summary.columns +def test_build_local_candidate_uses_assigned_support_when_available() -> None: + areas = pd.DataFrame({"code": ["S001", "E001"], "country": ["Scotland", "England"]}) + targets = pd.DataFrame( + { + "code": ["E001", "S001"], + "population": [1.5, 0.5], + } + ) + metrics = { + "England": pd.DataFrame( + {"population": [1.0, 1.0, 10.0]}, + index=[101, 102, 103], + ), + "Scotland": pd.DataFrame( + {"population": [1.0, 1.0, 10.0]}, + index=[101, 102, 103], + ), + } + households = pd.DataFrame( + { + "household_id": [102, 103, 101], + "household_weight": [1.0, 100.0, 1.0], + "constituency_code_oa": ["S001", "E999", "E001"], + } + ) + + result = build_local_candidate( + area_type="constituency", + area_frame=areas, + targets=targets, + metrics=metrics, + household_frame=households, + solver_options={"epochs": 80, "learning_rate": 0.2, "seed": 1}, + ) + + assert result.support_mode == "assigned" + assert result.problem.matrix.shape == (2, 2) + assert result.problem.n_households == 2 + assert result.solve_result.weights.shape == (2,) + assert result.solve_result.final_loss < result.solve_result.initial_loss + assert result.long_weights["area_code"].tolist() == ["E001", "S001"] + assert result.support_summary["nonzero_households"].tolist() == [1, 1] + + +def test_build_local_candidate_uses_la_assigned_support_and_zero_area() -> None: + areas = pd.DataFrame({"code": ["E06000002", "E06000001"]}) + targets = pd.DataFrame( + { + "code": ["E06000001", "E06000002"], + "population": [1.0, 0.0], + } + ) + metrics = pd.DataFrame({"population": [1.0, 10.0]}, index=[101, 102]) + households = pd.DataFrame( + { + "household_id": [102, 101], + "household_weight": [100.0, 1.0], + "la_code_oa": ["E99999999", "E06000001"], + } + ) + + result = build_local_candidate( + area_type="la", + area_frame=areas, + targets=targets, + metrics=metrics, + household_frame=households, + solver_options={"epochs": 5, "learning_rate": 0.2, "seed": 1}, + ) + + assert result.support_mode == "assigned" + assert result.problem.area_codes == ("E06000001", "E06000002") + assert result.problem.matrix.shape == (2, 1) + assert result.long_weights["area_code"].tolist() == ["E06000001"] + assert result.support_summary["area_code"].tolist() == [ + "E06000001", + "E06000002", + ] + assert result.support_summary["nonzero_households"].tolist() == [1, 0] + + def test_build_local_candidate_can_limit_pilot_areas() -> None: areas = pd.DataFrame( { @@ -271,6 +374,42 @@ def fake_compute(sim, area_type, *, period=None, household_ids=None): assert result.support_summary["nonzero_households"].tolist() == [1] +def test_build_local_candidate_from_dataset_auto_uses_assigned_support( + monkeypatch, +) -> None: + areas = pd.DataFrame({"code": ["E001"], "country": ["England"]}) + targets = pd.DataFrame({"code": ["E001"], "population": [1.0]}) + households = pd.DataFrame( + { + "household_id": [101], + "household_weight": [1.0], + "constituency_code_oa": ["E001"], + } + ) + + def fake_compute(sim, area_type, *, period=None, household_ids=None): + assert area_type == "constituency" + assert period == 2023 + assert household_ids is None + return pd.DataFrame({"population": [1.0]}, index=pd.Index([101])) + + monkeypatch.setattr(local_runner, "compute_household_metrics", fake_compute) + + result = build_local_candidate_from_dataset( + dataset=type("Dataset", (), {"time_period": 2023})(), + area_type="constituency", + area_frame=areas, + targets=targets, + household_frame=households, + simulation_factory=SingleHouseholdSimulation, + solver_options={"epochs": 2}, + ) + + assert result.support_mode == "assigned" + assert result.problem.matrix.shape == (1, 1) + assert result.long_weights["area_code"].tolist() == ["E001"] + + def test_write_local_candidate_outputs(tmp_path: Path) -> None: areas = pd.DataFrame({"code": ["E001"], "country": ["England"]}) targets = pd.DataFrame({"code": ["E001"], "population": [1.0]}) diff --git a/packages/populace-build/tests/test_uk_local_solver.py b/packages/populace-build/tests/test_uk_local_solver.py index 52ca571..b73b8b5 100644 --- a/packages/populace-build/tests/test_uk_local_solver.py +++ b/packages/populace-build/tests/test_uk_local_solver.py @@ -7,7 +7,9 @@ import populace.build.uk.local_solver as local_solver from populace.build.uk import ( + build_assigned_local_matrix, build_stacked_local_matrix, + solve_assigned_local_weights, solve_stacked_local_weights, ) @@ -39,6 +41,67 @@ def test_solve_stacked_local_weights_reduces_loss_and_reports_diagnostics() -> N np.testing.assert_allclose(result.diagnostics["target"], [1.5, 0.5]) +def test_solve_assigned_local_weights_uses_household_weight_columns() -> None: + metrics = pd.DataFrame({"population": [1.0, 1.0]}, index=[101, 102]) + targets = pd.DataFrame({"code": ["E001", "S001"], "population": [1.5, 0.5]}) + households = pd.DataFrame( + { + "household_id": [101, 102], + "constituency_code_oa": ["E001", "S001"], + } + ) + problem = build_assigned_local_matrix( + metrics, + targets, + household_frame=households, + area_codes=["E001", "S001"], + household_ids=[101, 102], + ) + + result = solve_assigned_local_weights( + problem, + [1.0, 1.0], + epochs=80, + learning_rate=0.2, + max_weight_ratio=10.0, + seed=1, + ) + + assert result.weights.shape == (2,) + assert result.initial_weights.tolist() == [1.0, 1.0] + assert result.final_loss < result.initial_loss + assert result.diagnostics["area_code"].tolist() == ["E001", "S001"] + + +def test_solve_assigned_local_weights_can_upweight_zero_base_support() -> None: + metrics = pd.DataFrame({"income": [1_000_000.0]}, index=[101]) + targets = pd.DataFrame({"code": ["E001"], "income": [1_000_000.0]}) + households = pd.DataFrame( + { + "household_id": [101], + "constituency_code_oa": ["E001"], + } + ) + problem = build_assigned_local_matrix( + metrics, + targets, + household_frame=households, + area_codes=["E001"], + household_ids=[101], + ) + + result = solve_assigned_local_weights( + problem, + [0.0], + epochs=80, + learning_rate=0.3, + seed=1, + ) + + assert result.weights[0] > 0.01 + assert result.final_loss < 0.05 + + def test_solve_stacked_local_weights_uses_explicit_positive_floor() -> None: metrics = pd.DataFrame({"population": [1.0, 1.0]}, index=[101, 102]) targets = pd.DataFrame({"code": ["E001"], "population": [1.0]}) diff --git a/packages/populace-build/tests/test_uk_source_manifest.py b/packages/populace-build/tests/test_uk_source_manifest.py new file mode 100644 index 0000000..2120386 --- /dev/null +++ b/packages/populace-build/tests/test_uk_source_manifest.py @@ -0,0 +1,318 @@ +"""UK raw-source plan declaration: full surface or nothing.""" + +from __future__ import annotations + +import pytest + +from populace.build.source_manifest import SourceManifest, SourceOperationSpec +from populace.build.uk import ( + FRS_ONLY_SPI_FILL_PERSON_COLUMNS, + ROWWISE_GEOGRAPHY_COLUMNS, + SPI_INCOME_IMPUTATION_COLUMNS, + UK_DONORS, + UK_NONNEGATIVE_SOURCE_OUTPUTS, + UK_REWRITTEN_SOURCE_OUTPUT_STAGES, + UK_SOURCE_MANIFEST, + UK_SOURCE_OUTPUTS, + UK_SOURCE_OUTPUT_STAGES, + UK_SOURCE_STAGE_SPECS, + UK_SPI_SUPPORT_STAGE_NAME, + UK_STAGE_NAMES, + UK_STRUCTURAL_SOURCE_STAGES, + uk_plan, +) + + +def _noop_implementations() -> dict: + return {name: (lambda frame: frame) for name in UK_STAGE_NAMES} + + +class TestUkPlan: + def test_assembles_with_all_stages_and_donor_citations(self) -> None: + plan = uk_plan(_noop_implementations()) + + assert tuple(stage.name for stage in plan.stages) == UK_STAGE_NAMES + donor_stages = dict(plan.donors()) + assert set(donor_stages) == set(UK_DONORS) + for spec in donor_stages.values(): + assert spec.source.startswith("https://") + + def test_missing_stage_refuses_to_assemble(self) -> None: + implementations = _noop_implementations() + del implementations["was_wealth"] + + with pytest.raises(ValueError, match="missing \\['was_wealth'\\]"): + uk_plan(implementations) + + def test_unknown_stage_is_refused(self) -> None: + implementations = _noop_implementations() + implementations["legacy_fill"] = lambda frame: frame + + with pytest.raises(ValueError, match="Unknown stage implementation"): + uk_plan(implementations) + + +class TestUkSources: + def test_source_manifest_loads_as_spec_contract(self) -> None: + assert UK_SOURCE_MANIFEST.country == "uk" + assert UK_SOURCE_MANIFEST.version == 1 + assert len(UK_SOURCE_STAGE_SPECS) >= len(UK_DONORS) + + def test_every_donor_stage_has_matching_source_spec(self) -> None: + specs = UK_SOURCE_MANIFEST.stage_map() + for stage, donor in UK_DONORS.items(): + assert stage in specs + assert specs[stage].survey == donor.survey + assert specs[stage].source == donor.source + + def test_source_specs_align_with_declared_plan(self) -> None: + source_stage_names = {spec.stage for spec in UK_SOURCE_STAGE_SPECS} + + assert set(UK_SOURCE_MANIFEST.stage_map()) == source_stage_names + assert source_stage_names == set(UK_DONORS) | set(UK_STRUCTURAL_SOURCE_STAGES) + assert source_stage_names.issubset(UK_STAGE_NAMES) + assert tuple(spec.stage for spec in UK_SOURCE_STAGE_SPECS) == tuple( + name for name in UK_STAGE_NAMES if name in source_stage_names + ) + assert UK_STAGE_NAMES.index("rowwise_oa_geography") < UK_STAGE_NAMES.index( + "local_geography_weights" + ) + + def test_stage_order_keeps_required_upstream_surfaces_available(self) -> None: + assert UK_STAGE_NAMES.index("was_wealth") < UK_STAGE_NAMES.index( + "regional_property_uprating" + ) + assert UK_STAGE_NAMES.index("was_wealth") < UK_STAGE_NAMES.index( + "lcfs_consumption" + ) + assert UK_STAGE_NAMES.index(UK_SPI_SUPPORT_STAGE_NAME) < UK_STAGE_NAMES.index( + "spi_income" + ) + assert UK_STAGE_NAMES.index("spi_income") < UK_STAGE_NAMES.index( + "frs_only_spi_fill" + ) + assert UK_STAGE_NAMES.index("local_geography_weights") < UK_STAGE_NAMES.index( + "rail_public_service_calibration" + ) + assert UK_STAGE_NAMES.index("local_geography_weights") < UK_STAGE_NAMES.index( + "road_fuel_energy_calibration" + ) + + def test_source_specs_are_manifest_only_not_python_loaders(self) -> None: + for spec in UK_SOURCE_STAGE_SPECS: + assert spec.operations + for operation in spec.operations: + assert "module" not in operation.parameters + assert "function" not in operation.parameters + assert operation.kind not in { + "python_module", + "python_function", + "import_module", + } + + def test_weight_calibration_stages_are_manifest_declared(self) -> None: + specs = UK_SOURCE_MANIFEST.stage_map() + for stage in ("national_calibration", "local_geography_weights"): + kinds = [operation.kind for operation in specs[stage].operations] + assert "calibrate_weights" in kinds + + def test_raw_source_surface_declares_salient_outputs_from_each_input(self) -> None: + required_outputs = { + "property_wealth", + "mortgage_debt", + "consumer_debt", + "student_loan_balance", + "num_vehicles", + "full_rate_vat_expenditure_rate", + "food_and_non_alcoholic_beverages_consumption", + "electricity_consumption", + "gas_consumption", + "petrol_spending", + "diesel_spending", + "dfe_education_spending", + "rail_subsidy_spending", + "bus_subsidy_spending", + "rail_usage", + "a_and_e_visits", + "admitted_patient_visits", + "outpatient_visits", + "nhs_spending", + "gift_aid", + "charitable_investment_gifts", + "capital_gains", + "household_is_capital_gains_clone", + "pension_contributions_via_salary_sacrifice", + "student_loan_plan", + "household_is_spi_synthetic", + "source_household_key", + "local_geography_weight", + } + + required_outputs.update(SPI_INCOME_IMPUTATION_COLUMNS) + required_outputs.update(FRS_ONLY_SPI_FILL_PERSON_COLUMNS) + required_outputs.update(ROWWISE_GEOGRAPHY_COLUMNS) + + assert sorted(required_outputs - UK_SOURCE_OUTPUTS) == [] + + def test_nonnegative_surface_covers_key_money_and_count_outputs(self) -> None: + required_nonnegative = { + "owned_land", + "property_wealth", + "mortgage_debt", + "consumer_debt", + "student_loan_balance", + "food_and_non_alcoholic_beverages_consumption", + "electricity_consumption", + "gas_consumption", + "petrol_spending", + "diesel_spending", + "full_rate_vat_expenditure_rate", + "a_and_e_visits", + "nhs_spending", + "dfe_education_spending", + "rail_usage", + "gift_aid", + "charitable_investment_gifts", + "capital_gains", + "pension_contributions_via_salary_sacrifice", + "local_geography_weight", + } + + assert sorted(required_nonnegative - UK_NONNEGATIVE_SOURCE_OUTPUTS) == [] + assert "student_loan_plan" not in UK_NONNEGATIVE_SOURCE_OUTPUTS + + def test_rewritten_outputs_are_explicit_and_have_reviewed_final_writers( + self, + ) -> None: + expected_rewrites = { + "diesel_spending": ( + "lcfs_consumption", + "road_fuel_energy_calibration", + ), + "domestic_energy_consumption": ( + "lcfs_consumption", + "road_fuel_energy_calibration", + ), + "electricity_consumption": ( + "lcfs_consumption", + "road_fuel_energy_calibration", + ), + "gas_consumption": ( + "lcfs_consumption", + "road_fuel_energy_calibration", + ), + "petrol_spending": ( + "lcfs_consumption", + "road_fuel_energy_calibration", + ), + "main_residence_value": ( + "was_wealth", + "regional_property_uprating", + ), + "property_wealth": ( + "was_wealth", + "regional_property_uprating", + ), + "household_weight": ( + "frs_base", + "national_calibration", + ), + "employment_income": ( + "frs_base", + "spi_income", + ), + "private_pension_income": ( + "frs_base", + "spi_income", + ), + "self_employment_income": ( + "frs_base", + "spi_income", + ), + "employee_pension_contributions": ( + "frs_only_spi_fill", + "frs_salary_sacrifice", + ), + "pension_contributions_via_salary_sacrifice": ( + "frs_only_spi_fill", + "frs_salary_sacrifice", + ), + "rail_subsidy_spending": ( + "etb_public_services", + "rail_public_service_calibration", + ), + "rail_usage": ( + "etb_public_services", + "rail_public_service_calibration", + ), + } + + assert dict(UK_REWRITTEN_SOURCE_OUTPUT_STAGES) == expected_rewrites + for output, stages in expected_rewrites.items(): + assert UK_SOURCE_OUTPUT_STAGES[output] == stages + indices = [UK_STAGE_NAMES.index(stage) for stage in stages] + assert indices == sorted(indices) + + def test_fuel_energy_amount_scaling_is_not_binary_assignment(self) -> None: + operations = UK_SOURCE_MANIFEST.stage_map()[ + "road_fuel_energy_calibration" + ].operations + kinds = [operation.kind for operation in operations] + + assert "calibrate_binary_assignment" not in kinds + assert "uprate" in kinds + uprate = operations[kinds.index("uprate")] + assert tuple(uprate.parameters["variables"]) == ( + "petrol_spending", + "diesel_spending", + "electricity_consumption", + "gas_consumption", + ) + derive = operations[kinds.index("derive")] + assert tuple(derive.parameters["outputs"]) == ("domestic_energy_consumption",) + + def test_spi_stage_declares_support_channel_before_income_fit(self) -> None: + specs = UK_SOURCE_MANIFEST.stage_map() + spi_kinds = [operation.kind for operation in specs["spi_income"].operations] + + assert spi_kinds.index("read_table") < spi_kinds.index("fit_weighted_qrf") + assert spi_kinds.index("fit_weighted_qrf") < spi_kinds.index("support_clip") + assert "household_is_spi_synthetic" in specs[UK_SPI_SUPPORT_STAGE_NAME].outputs + + def test_source_operation_parser_rejects_python_loader_shapes(self) -> None: + with pytest.raises(ValueError, match="executable-loader"): + SourceOperationSpec.from_mapping( + { + "kind": "python_module", + "module": "populace.build.uk.sources", + "function": "add_was_wealth", + } + ) + + def test_source_manifest_parser_rejects_incumbent_package_artifacts(self) -> None: + with pytest.raises(ValueError, match="forbidden incumbent dependency"): + SourceManifest.from_mapping( + { + "version": 1, + "country": "uk", + "policy": "spec only", + "stages": [ + { + "stage": "was_wealth", + "survey": "Wealth and Assets Survey", + "source": "https://example.test/was", + "grain": "household", + "artifacts": [ + { + "kind": "derived_dataset", + "locator": "policyengine_" + "uk_data", + } + ], + "operations": [ + {"kind": "read_table", "table": "was_household"} + ], + "outputs": ["property_wealth"], + } + ], + } + ) diff --git a/packages/populace-build/tests/test_uk_spi_support.py b/packages/populace-build/tests/test_uk_spi_support.py index e1641c4..3b3888a 100644 --- a/packages/populace-build/tests/test_uk_spi_support.py +++ b/packages/populace-build/tests/test_uk_spi_support.py @@ -209,7 +209,7 @@ def test_spi_fill_only_updates_spi_channel_and_can_initialize_new_columns() -> N assert spi["gift_aid"].tolist() == [9.0, 10.0, 11.0, 12.0] -def test_spi_variable_surfaces_include_efrs_stage1_and_stage2_fixes() -> None: +def test_spi_variable_surfaces_include_recent_stage1_and_stage2_fixes() -> None: assert SPI_INCOME_COMPONENT_COLUMNS == ( "employment_income", "self_employment_income", From f5e7283526c0e698f50d1e93fbcaabc7f180ee4a Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 20 Jun 2026 18:25:19 -0400 Subject: [PATCH 2/3] Move UK source plan metadata into manifest --- .../src/populace/build/source_manifest.py | 18 ++- .../src/populace/build/uk/__init__.py | 116 +++--------------- .../src/populace/build/uk/source_stages.json | 96 ++++++++++++--- .../tests/test_uk_source_manifest.py | 48 ++++++++ 4 files changed, 155 insertions(+), 123 deletions(-) diff --git a/packages/populace-build/src/populace/build/source_manifest.py b/packages/populace-build/src/populace/build/source_manifest.py index 8580c96..5915fee 100644 --- a/packages/populace-build/src/populace/build/source_manifest.py +++ b/packages/populace-build/src/populace/build/source_manifest.py @@ -41,6 +41,7 @@ "assign_binary_from_rate", "calibrate_binary_assignment", "calibrate_weights", + "compile_ledger_targets", "convert_interest_to_structural_mortgage_inputs", "compute_ratio", "derive", @@ -128,6 +129,7 @@ class SourceStageSpec: outputs: tuple[str, ...] nonnegative_outputs: tuple[str, ...] = () notes: str = "" + role: str = "source" @classmethod def from_mapping(cls, raw: Mapping[str, Any]) -> SourceStageSpec: @@ -161,6 +163,9 @@ def from_mapping(cls, raw: Mapping[str, Any]) -> SourceStageSpec: notes = raw.get("notes", "") if not isinstance(notes, str): raise ValueError("source stage 'notes' must be a string when provided.") + role = raw.get("role", "source") + if not isinstance(role, str) or not role: + raise ValueError("source stage 'role' must be a non-empty string.") _reject_executable_parameter_keys(raw, context=f"stage {raw['stage']!r}") _reject_incumbent_dependencies(raw, context=f"stage {raw['stage']!r}") return cls( @@ -168,6 +173,7 @@ def from_mapping(cls, raw: Mapping[str, Any]) -> SourceStageSpec: survey=raw["survey"], source=raw["source"], grain=raw["grain"], + role=role, artifacts=artifacts, operations=operations, outputs=outputs, @@ -183,6 +189,7 @@ class SourceManifest: country: str version: int policy: str + plan_stages: tuple[str, ...] stages: tuple[SourceStageSpec, ...] @classmethod @@ -204,9 +211,18 @@ def from_mapping(cls, raw: Mapping[str, Any]) -> SourceManifest: duplicates = sorted({name for name in names if names.count(name) > 1}) if duplicates: raise ValueError(f"duplicate source stage spec(s): {duplicates}.") + plan_stages = tuple( + _require_string_sequence(raw.get("plan_stages", names), key="plan_stages") + ) _reject_executable_parameter_keys(raw, context=f"{country} source manifest") _reject_incumbent_dependencies(raw, context=f"{country} source manifest") - return cls(country=country, version=version, policy=policy, stages=stages) + return cls( + country=country, + version=version, + policy=policy, + plan_stages=plan_stages, + stages=stages, + ) def stage_map(self) -> Mapping[str, SourceStageSpec]: return {stage.stage: stage for stage in self.stages} diff --git a/packages/populace-build/src/populace/build/uk/__init__.py b/packages/populace-build/src/populace/build/uk/__init__.py index e125e23..0887721 100644 --- a/packages/populace-build/src/populace/build/uk/__init__.py +++ b/packages/populace-build/src/populace/build/uk/__init__.py @@ -142,123 +142,35 @@ ) from populace.frame import Frame -UK_DONORS: Mapping[str, DonorSpec] = { - "was_wealth": DonorSpec( - survey="Wealth and Assets Survey", - source="https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/debt/methodologies/wealthandassetssurveyqmi", - notes="Household wealth, debts, vehicles, and student-loan balances.", - ), - "regional_property_uprating": DonorSpec( - survey="UK House Price Index and regional land-value tables", - source="https://www.gov.uk/government/collections/uk-house-price-index-reports", - notes="Regional property-value uprating after WAS wealth imputation.", - ), - "lcfs_consumption": DonorSpec( - survey="Living Costs and Food Survey", - source="https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/methodologies/livingcostsandfoodsurveyqmi", - notes="COICOP consumption, fuel spending, and domestic energy use.", - ), - "road_fuel_energy_calibration": DonorSpec( - survey="Road fuel and household energy administrative totals", - source="https://www.gov.uk/government/collections/road-transport-consumption-at-regional-and-local-level", - notes="Fuel and energy calibration targets for LCFS-imputed amounts.", - ), - "etb_vat": DonorSpec( - survey="Effects of Taxes and Benefits", - source="https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/datasets/theeffectsoftaxesandbenefitsonhouseholdincomehistoricaldatasets", - notes="Full-rate VAT expenditure-rate imputation.", - ), - "nhs_usage": DonorSpec( - survey="NHS activity and unit-cost tables", - source="https://www.england.nhs.uk/statistics/statistical-work-areas/hospital-activity/monthly-hospital-activity/", - notes="A&E, inpatient, outpatient visit and spending inputs.", - ), - "etb_public_services": DonorSpec( - survey="Effects of Taxes and Benefits public-service tables", - source="https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/datasets/theeffectsoftaxesandbenefitsonhouseholdincomehistoricaldatasets", - notes="Education, rail, and bus public-service benefit inputs.", - ), - "rail_public_service_calibration": DonorSpec( - survey="Rail public-service administrative totals", - source="https://www.gov.uk/government/collections/rail-statistics", - notes="Post-weight rail subsidy and usage scaling.", - ), - "spi_income": DonorSpec( - survey="Survey of Personal Incomes", - source="https://www.gov.uk/government/collections/personal-incomes-statistics", - notes="High-income components, Gift Aid, and investment-gift reliefs.", - ), - "frs_only_spi_fill": DonorSpec( - survey="Family Resources Survey 2023-24", - source="https://www.gov.uk/government/collections/family-resources-survey--2", - notes=( - "Second-stage pension, savings, and reported-benefit behavior for " - "SPI support rows." - ), - ), - "advani_summers_capital_gains": DonorSpec( - survey="Advani-Summers capital gains distribution", - source="https://ideas.repec.org/p/hal/wpaper/halshs-03022609.html", - notes="Capital gains assignment and clone flag.", - ), - "frs_salary_sacrifice": DonorSpec( - survey="Family Resources Survey salary-sacrifice subsample", - source="https://www.gov.uk/government/collections/family-resources-survey--2", - notes="Salary-sacrifice pension contributions and employee adjustment.", - ), - "slc_student_loan_plan": DonorSpec( - survey="Student Loans Company repayment-plan statistics", - source="https://www.gov.uk/government/collections/student-loans-for-higher-and-further-education", - notes="Student-loan repayment plan assignment by cohort and balance.", - ), -} - -UK_STAGE_NAMES: tuple[str, ...] = ( - "frs_base", - "was_wealth", - "regional_property_uprating", - "lcfs_consumption", - "etb_vat", - "nhs_usage", - "etb_public_services", - UK_SPI_SUPPORT_STAGE_NAME, - "spi_income", - "frs_only_spi_fill", - "advani_summers_capital_gains", - "frs_salary_sacrifice", - "slc_student_loan_plan", - "rowwise_oa_geography", - "national_calibration", - "local_geography_weights", - "rail_public_service_calibration", - "road_fuel_energy_calibration", - "export", -) - -UK_STRUCTURAL_SOURCE_STAGES: tuple[str, ...] = ( - "frs_base", - UK_SPI_SUPPORT_STAGE_NAME, - "rowwise_oa_geography", - "national_calibration", - "local_geography_weights", -) - def _load_uk_source_manifest() -> SourceManifest: return load_source_manifest(files(__package__).joinpath("source_stages.json")) UK_SOURCE_MANIFEST = _load_uk_source_manifest() +UK_STAGE_NAMES: tuple[str, ...] = UK_SOURCE_MANIFEST.plan_stages _UK_SOURCE_STAGE_MAP = UK_SOURCE_MANIFEST.stage_map() _UNKNOWN_UK_SOURCE_STAGES = sorted(set(_UK_SOURCE_STAGE_MAP) - set(UK_STAGE_NAMES)) if _UNKNOWN_UK_SOURCE_STAGES: raise ValueError( - "UK source manifest stage(s) are not declared in UK_STAGE_NAMES: " + "UK source manifest stage(s) are not declared in plan_stages: " f"{_UNKNOWN_UK_SOURCE_STAGES}." ) UK_SOURCE_STAGE_SPECS: tuple[SourceStageSpec, ...] = tuple( _UK_SOURCE_STAGE_MAP[name] for name in UK_STAGE_NAMES if name in _UK_SOURCE_STAGE_MAP ) +UK_DONORS: Mapping[str, DonorSpec] = { + stage.stage: DonorSpec( + survey=stage.survey, + source=stage.source, + notes=stage.notes, + ) + for stage in UK_SOURCE_STAGE_SPECS + if stage.role == "donor" +} +UK_STRUCTURAL_SOURCE_STAGES: tuple[str, ...] = tuple( + stage.stage for stage in UK_SOURCE_STAGE_SPECS if stage.role != "donor" +) UK_SOURCE_OUTPUTS: frozenset[str] = frozenset( output for stage in UK_SOURCE_STAGE_SPECS for output in stage.outputs ) diff --git a/packages/populace-build/src/populace/build/uk/source_stages.json b/packages/populace-build/src/populace/build/uk/source_stages.json index 91ab42e..334e729 100644 --- a/packages/populace-build/src/populace/build/uk/source_stages.json +++ b/packages/populace-build/src/populace/build/uk/source_stages.json @@ -2,9 +2,31 @@ "version": 1, "country": "uk", "policy": "UK source stages are manifest-defined. Country/source content may declare primary artifacts, columns, sentinel handling, derivations, imputation recipes, outputs, and validation requirements here; executable Python belongs only in shared Populace runtimes.", + "plan_stages": [ + "frs_base", + "was_wealth", + "regional_property_uprating", + "lcfs_consumption", + "etb_vat", + "nhs_usage", + "etb_public_services", + "spi_support_channel", + "spi_income", + "frs_only_spi_fill", + "advani_summers_capital_gains", + "frs_salary_sacrifice", + "slc_student_loan_plan", + "rowwise_oa_geography", + "national_calibration", + "local_geography_weights", + "rail_public_service_calibration", + "road_fuel_energy_calibration", + "export" + ], "stages": [ { "stage": "frs_base", + "role": "base", "survey": "Family Resources Survey 2023-24", "source": "https://www.gov.uk/government/collections/family-resources-survey--2", "grain": "household_person_benunit", @@ -81,6 +103,7 @@ }, { "stage": "was_wealth", + "role": "donor", "survey": "Wealth and Assets Survey", "source": "https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/debt/methodologies/wealthandassetssurveyqmi", "grain": "household", @@ -156,6 +179,7 @@ }, { "stage": "regional_property_uprating", + "role": "donor", "survey": "UK House Price Index and regional land-value tables", "source": "https://www.gov.uk/government/collections/uk-house-price-index-reports", "grain": "household", @@ -193,6 +217,7 @@ }, { "stage": "lcfs_consumption", + "role": "donor", "survey": "Living Costs and Food Survey", "source": "https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/methodologies/livingcostsandfoodsurveyqmi", "grain": "household", @@ -284,6 +309,7 @@ }, { "stage": "road_fuel_energy_calibration", + "role": "donor", "survey": "Road fuel and household energy administrative totals", "source": "https://www.gov.uk/government/collections/road-transport-consumption-at-regional-and-local-level", "grain": "household", @@ -342,6 +368,7 @@ }, { "stage": "etb_vat", + "role": "donor", "survey": "Effects of Taxes and Benefits", "source": "https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/datasets/theeffectsoftaxesandbenefitsonhouseholdincomehistoricaldatasets", "grain": "household", @@ -387,6 +414,7 @@ }, { "stage": "nhs_usage", + "role": "donor", "survey": "NHS activity and unit-cost tables", "source": "https://www.england.nhs.uk/statistics/statistical-work-areas/hospital-activity/monthly-hospital-activity/", "grain": "person", @@ -451,6 +479,7 @@ }, { "stage": "etb_public_services", + "role": "donor", "survey": "Effects of Taxes and Benefits public-service tables", "source": "https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/datasets/theeffectsoftaxesandbenefitsonhouseholdincomehistoricaldatasets", "grain": "household", @@ -507,6 +536,7 @@ }, { "stage": "rail_public_service_calibration", + "role": "donor", "survey": "Rail public-service administrative totals", "source": "https://www.gov.uk/government/collections/rail-statistics", "grain": "household", @@ -547,6 +577,7 @@ }, { "stage": "spi_support_channel", + "role": "support", "survey": "Family Resources Survey 2023-24 support copy", "source": "https://www.gov.uk/government/collections/family-resources-survey--2", "grain": "household_person_benunit", @@ -585,6 +616,7 @@ }, { "stage": "spi_income", + "role": "donor", "survey": "Survey of Personal Incomes", "source": "https://www.gov.uk/government/collections/personal-incomes-statistics", "grain": "person", @@ -639,6 +671,7 @@ }, { "stage": "frs_only_spi_fill", + "role": "donor", "survey": "Family Resources Survey 2023-24", "source": "https://www.gov.uk/government/collections/family-resources-survey--2", "grain": "person", @@ -744,6 +777,7 @@ }, { "stage": "advani_summers_capital_gains", + "role": "donor", "survey": "Advani-Summers capital gains distribution", "source": "https://ideas.repec.org/p/hal/wpaper/halshs-03022609.html", "grain": "household", @@ -790,6 +824,7 @@ }, { "stage": "frs_salary_sacrifice", + "role": "donor", "survey": "Family Resources Survey salary-sacrifice subsample", "source": "https://www.gov.uk/government/collections/family-resources-survey--2", "grain": "person", @@ -834,6 +869,7 @@ }, { "stage": "slc_student_loan_plan", + "role": "donor", "survey": "Student Loans Company repayment-plan statistics", "source": "https://www.gov.uk/government/collections/student-loans-for-higher-and-further-education", "grain": "person", @@ -867,6 +903,7 @@ }, { "stage": "rowwise_oa_geography", + "role": "geography", "survey": "UK official small-area geography crosswalks", "source": "https://geoportal.statistics.gov.uk/", "grain": "household", @@ -913,28 +950,37 @@ }, { "stage": "national_calibration", - "survey": "UK national calibration target registry", - "source": "https://github.com/PolicyEngine/populace/tree/main/packages/populace-calibrate", + "role": "calibration", + "survey": "PolicyEngine Ledger UK national calibration facts", + "source": "https://github.com/PolicyEngine/arch-data", "grain": "household", "artifacts": [ { - "kind": "target_registry", - "format": "json_or_yaml", + "kind": "ledger_consumer_facts", + "format": "jsonl", "vintage": "build_year", - "locator": "Populace national calibration target registry and supplied administrative target tables" + "locator": "Ledger consumer facts filtered to the UK national calibration target profile" } ], "operations": [ { - "kind": "read_tables", - "tables": [ - "national_calibration_targets", - "household_metric_tables" - ] + "kind": "read_table", + "table": "ledger_consumer_facts" + }, + { + "kind": "compile_ledger_targets", + "country": "uk", + "target_profile": "uk_national_calibration", + "geography_levels": [ + "country" + ], + "target_table": "national_calibration_targets" }, { "kind": "calibrate_weights", "weight": "household_weight", + "targets": "national_calibration_targets", + "metrics": "household_metric_tables", "outputs": [ "household_weight" ] @@ -950,28 +996,38 @@ }, { "stage": "local_geography_weights", - "survey": "UK local calibration target tables", - "source": "https://github.com/PolicyEngine/populace/tree/main/packages/populace-build", + "role": "calibration", + "survey": "PolicyEngine Ledger UK local geography facts", + "source": "https://github.com/PolicyEngine/arch-data", "grain": "household_area", "artifacts": [ { - "kind": "target_tables", - "format": "csv_or_parquet", + "kind": "ledger_consumer_facts", + "format": "jsonl", "vintage": "build_year", - "locator": "Explicit constituency and local-authority target tables supplied to the UK local runner" + "locator": "Ledger consumer facts filtered to the UK constituency and local-authority target profile" } ], "operations": [ { - "kind": "read_tables", - "tables": [ - "local_area_targets", - "household_metric_tables" - ] + "kind": "read_table", + "table": "ledger_consumer_facts" + }, + { + "kind": "compile_ledger_targets", + "country": "uk", + "target_profile": "uk_local_geography", + "area_types": [ + "constituency", + "la" + ], + "target_table": "local_area_targets" }, { "kind": "calibrate_weights", "weight": "household_weight", + "targets": "local_area_targets", + "metrics": "household_metric_tables", "outputs": [ "local_geography_weight" ] diff --git a/packages/populace-build/tests/test_uk_source_manifest.py b/packages/populace-build/tests/test_uk_source_manifest.py index 2120386..70775bb 100644 --- a/packages/populace-build/tests/test_uk_source_manifest.py +++ b/packages/populace-build/tests/test_uk_source_manifest.py @@ -6,6 +6,7 @@ from populace.build.source_manifest import SourceManifest, SourceOperationSpec from populace.build.uk import ( + AREA_TYPES, FRS_ONLY_SPI_FILL_PERSON_COLUMNS, ROWWISE_GEOGRAPHY_COLUMNS, SPI_INCOME_IMPUTATION_COLUMNS, @@ -68,6 +69,7 @@ def test_every_donor_stage_has_matching_source_spec(self) -> None: def test_source_specs_align_with_declared_plan(self) -> None: source_stage_names = {spec.stage for spec in UK_SOURCE_STAGE_SPECS} + assert UK_STAGE_NAMES == UK_SOURCE_MANIFEST.plan_stages assert set(UK_SOURCE_MANIFEST.stage_map()) == source_stage_names assert source_stage_names == set(UK_DONORS) | set(UK_STRUCTURAL_SOURCE_STAGES) assert source_stage_names.issubset(UK_STAGE_NAMES) @@ -78,6 +80,19 @@ def test_source_specs_align_with_declared_plan(self) -> None: "local_geography_weights" ) + def test_donor_and_structural_stage_groups_are_manifest_derived(self) -> None: + donor_stage_names = tuple( + spec.stage for spec in UK_SOURCE_STAGE_SPECS if spec.role == "donor" + ) + structural_stage_names = tuple( + spec.stage for spec in UK_SOURCE_STAGE_SPECS if spec.role != "donor" + ) + + assert tuple(UK_DONORS) == donor_stage_names + assert UK_STRUCTURAL_SOURCE_STAGES == structural_stage_names + assert "national_calibration" in UK_STRUCTURAL_SOURCE_STAGES + assert "local_geography_weights" in UK_STRUCTURAL_SOURCE_STAGES + def test_stage_order_keeps_required_upstream_surfaces_available(self) -> None: assert UK_STAGE_NAMES.index("was_wealth") < UK_STAGE_NAMES.index( "regional_property_uprating" @@ -113,8 +128,41 @@ def test_source_specs_are_manifest_only_not_python_loaders(self) -> None: def test_weight_calibration_stages_are_manifest_declared(self) -> None: specs = UK_SOURCE_MANIFEST.stage_map() for stage in ("national_calibration", "local_geography_weights"): + artifact_kinds = {artifact["kind"] for artifact in specs[stage].artifacts} kinds = [operation.kind for operation in specs[stage].operations] + compile_operation = next( + operation + for operation in specs[stage].operations + if operation.kind == "compile_ledger_targets" + ) + + assert specs[stage].source == "https://github.com/PolicyEngine/arch-data" + assert artifact_kinds == {"ledger_consumer_facts"} + assert "target_registry" not in artifact_kinds + assert "target_tables" not in artifact_kinds + assert kinds.index("read_table") < kinds.index("compile_ledger_targets") + assert kinds.index("compile_ledger_targets") < kinds.index( + "calibrate_weights" + ) assert "calibrate_weights" in kinds + assert compile_operation.parameters["country"] == "uk" + + assert ( + next( + operation + for operation in specs["national_calibration"].operations + if operation.kind == "compile_ledger_targets" + ).parameters["target_profile"] + == "uk_national_calibration" + ) + assert ( + local_compile_operation := next( + operation + for operation in specs["local_geography_weights"].operations + if operation.kind == "compile_ledger_targets" + ) + ).parameters["target_profile"] == "uk_local_geography" + assert tuple(local_compile_operation.parameters["area_types"]) == AREA_TYPES def test_raw_source_surface_declares_salient_outputs_from_each_input(self) -> None: required_outputs = { From 0e6ca255bdcefb72357a550c12ecf951f2529a1f Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 20 Jun 2026 18:26:42 -0400 Subject: [PATCH 3/3] Sort UK manifest imports --- packages/populace-build/src/populace/build/uk/__init__.py | 1 - packages/populace-build/tests/test_uk_source_manifest.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/packages/populace-build/src/populace/build/uk/__init__.py b/packages/populace-build/src/populace/build/uk/__init__.py index 0887721..5861066 100644 --- a/packages/populace-build/src/populace/build/uk/__init__.py +++ b/packages/populace-build/src/populace/build/uk/__init__.py @@ -11,7 +11,6 @@ SourceStageSpec, load_source_manifest, ) - from populace.build.uk.geography_sources import ( ENGLAND_LAD_REGION_URL, ENGLAND_WALES_OA2021_COUNT, diff --git a/packages/populace-build/tests/test_uk_source_manifest.py b/packages/populace-build/tests/test_uk_source_manifest.py index 70775bb..e1a2cdf 100644 --- a/packages/populace-build/tests/test_uk_source_manifest.py +++ b/packages/populace-build/tests/test_uk_source_manifest.py @@ -14,8 +14,8 @@ UK_NONNEGATIVE_SOURCE_OUTPUTS, UK_REWRITTEN_SOURCE_OUTPUT_STAGES, UK_SOURCE_MANIFEST, - UK_SOURCE_OUTPUTS, UK_SOURCE_OUTPUT_STAGES, + UK_SOURCE_OUTPUTS, UK_SOURCE_STAGE_SPECS, UK_SPI_SUPPORT_STAGE_NAME, UK_STAGE_NAMES,