diff --git a/packages/populace-build/README.md b/packages/populace-build/README.md index e869550..7e9abcb 100644 --- a/packages/populace-build/README.md +++ b/packages/populace-build/README.md @@ -15,11 +15,9 @@ names its donor survey and fails loudly — no silent fallbacks), and the short-term capital gains to −$3.9T); - **export surface** — every replacement artifact can prove that its exported variables match a reference surface, with only documented - structural extras or reviewed exclusions (for UK, this is the eFRS - compatibility check); + structural extras or reviewed exclusions; - **target surface** — the calibration target set covers the reference - target surface and may only be wider, not narrower (for UK, Populace must - calibrate to at least the eFRS target surface); + target surface and may only be wider, not narrower; - **per-family fit** — the calibration's within-10% share is reported per source family, while only broad family-level misses block publication so one family cannot hide inside the global average; @@ -41,11 +39,13 @@ modules; guard tests enforce this so country content stays declarative. ## UK local-geography path `populace.build.uk_runtime.local_geography` holds the Populace-owned replacement shape -for UK constituency and local-authority geography. It uses the same stacked -local-area layout as the US local ECPS flow: +for UK constituency and local-authority geography. It supports both the stacked +local-area layout used by the US local ECPS flow and an assigned row-wise path +for UK builds that have a finest-available area assignment: ```text -column = area_index * n_households + household_index +stacked: column = area_index * n_households + household_index +assigned: column = household_index, and target rows only see households assigned to that area code ``` The solved weights export to a long sidecar with `(area_type, area_code, @@ -54,22 +54,25 @@ the format PolicyEngine can group by directly for constituency and local authority outputs, and it avoids preserving the legacy dense `areas x households` matrix artifact. -The module does not import the incumbent UK data package. Engine runners and +The module does not import an incumbent UK data package. Engine runners and target providers pass household metric tables and aligned target tables into -`build_stacked_local_matrix`; this keeps Populace clean while the target source -files move over. The helper `sort_households_by_id` also codifies the 2024-25 -FRS fix: household attributes and weights must be sorted by the same stable -household ID before any positional assignment. +`build_stacked_local_matrix`, `build_assigned_local_matrix`, or +`build_local_candidate`; this keeps Populace as the owner of the build surface +while historical incumbent comparisons remain external migration benchmarks. +The helper `sort_households_by_id` also +codifies the 2024-25 FRS fix: household attributes and weights must be sorted +by the same stable household ID before any positional assignment. `populace.build.uk_runtime.local_targets` declares the constituency and local-authority metric surface used by the local build: HMRC employment/self-employment amount and count rows, ONS age bands, Universal Credit household rows, constituency UC-by-children rows, and the LA income/tenure/rent rows. It accepts a PolicyEngine-UK-like simulation object and returns household-indexed metric -tables; it still takes target values as explicit input tables. `local_solver` -wraps the Populace calibrator's log-weight optimizer for stacked local weights -and records per-area/per-metric diagnostics before the solved weights are -exported with `stacked_weights_to_long`. +tables. It can derive the metric subset from Ledger target profiles while +target values remain explicit build inputs. `local_solver` wraps the Populace +calibrator's log-weight optimizer for stacked and assigned local weights and +records per-area/per-metric diagnostics before the solved weights are exported +with `stacked_weights_to_long` or `assigned_weights_to_long`. `populace.build.uk_runtime.local_runner` is the Populace-owned candidate build path. It loads explicit area and target tables, aligns a sorted household frame with @@ -99,6 +102,21 @@ postcode sources. It writes the cloned row-wise H5, a geography coverage CSV, and `rowwise_build_manifest.json` with input/output hashes, row counts, target coverage, weight preservation, and weakest local-support diagnostics. +Like the US plan, UK migration comparisons against earlier production datasets +belong in release/benchmark harnesses outside this package. The build code here +must not import or depend on the incumbent UK data package; `source_manifest.py` +rejects incumbent country data-package references in declarative source specs. + +The packaged `uk/source_stages.json` is the Populace-owned raw-input parity +contract for the UK build: FRS base tables, WAS wealth/debt/vehicles with the +cash ISA and stocks-and-shares ISA split, LCFS consumption and bus fare spend, +ETB VAT and public services, DfT bus/rail amount anchors, NHS usage, SPI +high-income income/reliefs, FRS-only pension/savings/reported-benefit fill, +Advani-Summers capital gains, salary sacrifice, SLC student-loan plan +assignment, and row-wise OA/LA/constituency geography. It is a declarative +resource, not a country Python runtime; shared Populace runtimes load and +execute specs. + ## US plan status `populace.build.us_runtime` declares the US build: stage order, donor graph with diff --git a/packages/populace-build/src/populace/build/gates.py b/packages/populace-build/src/populace/build/gates.py index 9811b43..2bfd178 100644 --- a/packages/populace-build/src/populace/build/gates.py +++ b/packages/populace-build/src/populace/build/gates.py @@ -24,7 +24,8 @@ member names, not raw source-system codes. - :func:`export_surface_gate` and :func:`target_surface_gate` — replacement builds can prove they cover a reference artifact's export variables and - calibration targets, e.g. UK Populace against eFRS. + calibration targets. Reference artifacts are comparison surfaces, not build + inputs. Scoring uses :func:`relative_error_loss` — the calibrator's own objective — so there is no calibrator-vs-scorer objective mismatch: what the solver @@ -750,10 +751,9 @@ def export_surface_gate( This is stricter than :func:`parity_gate`: parity checks whether populated reference layers are also populated, while this gate checks the exported variable *surface* itself. It is intended for live release blocking where a - country has a known incumbent-compatible artifact, such as UK Populace - matching eFRS exported variables. Extra columns are refused unless the - build declares them as structural/compatibility additions; missing - reference columns require a named reviewed exclusion. + country has a known reference export surface. Extra columns are refused + unless the build declares them as structural/compatibility additions; + missing reference columns require a named reviewed exclusion. """ candidate = {str(name) for name in candidate_columns} reference = {str(name) for name in reference_columns} diff --git a/packages/populace-build/src/populace/build/uk/country_package.json b/packages/populace-build/src/populace/build/uk/country_package.json index 0a0951c..105bddb 100644 --- a/packages/populace-build/src/populace/build/uk/country_package.json +++ b/packages/populace-build/src/populace/build/uk/country_package.json @@ -2,5 +2,5 @@ "schema_version": 1, "country": "uk", "policy": "spec-only country package; Python execution lives in shared runtime modules", - "resources": [] + "resources": ["source_stages.json"] } diff --git a/packages/populace-build/src/populace/build/uk/source_stages.json b/packages/populace-build/src/populace/build/uk/source_stages.json new file mode 100644 index 0000000..1f4949a --- /dev/null +++ b/packages/populace-build/src/populace/build/uk/source_stages.json @@ -0,0 +1,1007 @@ +{ + "version": 1, + "country": "uk", + "policy": "UK source stages are manifest-defined. Country/source content may declare primary artifacts, columns, sentinel handling, derivations, imputation recipes, outputs, and validation requirements here; executable Python belongs only in shared Populace runtimes. Weight calibration and Ledger target-profile consumption live in shared calibration/local-target runtimes rather than country package manifests.", + "stages": [ + { + "stage": "frs_base", + "role": "base", + "survey": "Family Resources Survey 2023-24", + "source": "https://www.gov.uk/government/collections/family-resources-survey--2", + "grain": "household_person_benunit", + "artifacts": [ + { + "kind": "survey_microdata", + "format": "tabular_release", + "vintage": "2023-24", + "locator": "DWP Family Resources Survey household, benefit unit, and person tables" + } + ], + "operations": [ + { + "kind": "read_tables", + "tables": [ + "frs_household", + "frs_benunit", + "frs_person" + ], + "weight": "household_weight" + }, + { + "kind": "replace_sentinels", + "scope": "frs_missing_or_not_applicable_fields" + }, + { + "kind": "derive", + "outputs": [ + "entity_ids", + "source_household_lineage", + "household_income_predictors", + "education_counts", + "housing_predictors", + "employment_sector", + "sic_industry_division" + ] + } + ], + "outputs": [ + "household_id", + "benunit_id", + "person_id", + "person_household_id", + "person_benunit_id", + "household_weight", + "region", + "age", + "gender", + "employment_sector", + "sic_industry_division", + "employment_income", + "self_employment_income", + "private_pension_income", + "capital_income", + "household_net_income", + "hbai_household_net_income", + "tenure_type", + "accommodation_type", + "num_adults", + "num_children", + "num_bedrooms", + "council_tax" + ], + "nonnegative_outputs": [ + "household_weight", + "age", + "sic_industry_division", + "employment_income", + "self_employment_income", + "private_pension_income", + "household_net_income", + "hbai_household_net_income", + "num_adults", + "num_children", + "num_bedrooms", + "council_tax" + ], + "notes": "This stage owns the raw FRS base entities and lineage. The compact UK artifact remains the fast national input; local variants can pool years or clone from this base before row-wise geography assignment." + }, + { + "stage": "was_wealth", + "role": "donor", + "survey": "Wealth and Assets Survey", + "source": "https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/debt/methodologies/wealthandassetssurveyqmi", + "grain": "household", + "artifacts": [ + { + "kind": "survey_microdata", + "format": "secure_or_licensed_extract", + "vintage": "latest_available", + "locator": "ONS Wealth and Assets Survey household/person extract" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "was_household", + "weight": "weight" + }, + { + "kind": "replace_sentinels", + "scope": "was_missing_or_not_applicable_fields" + }, + { + "kind": "derive", + "outputs": [ + "cash_isa", + "stocks_and_shares_isa" + ], + "source_fields": { + "cash_isa": "DVCISAVR8", + "stocks_and_shares_isa": "DVIISAVR8" + } + }, + { + "kind": "fit_weighted_qrf", + "predictors": [ + "household_net_income", + "num_adults", + "num_children", + "private_pension_income", + "employment_income", + "self_employment_income", + "capital_income", + "num_bedrooms", + "council_tax", + "is_renting", + "region" + ] + }, + { + "kind": "fold_into", + "target": "corporate_wealth", + "amount": "stocks_and_shares_isa" + }, + { + "kind": "support_clip", + "range": "donor_realized" + } + ], + "outputs": [ + "owned_land", + "property_wealth", + "corporate_wealth", + "gross_financial_wealth", + "net_financial_wealth", + "main_residence_value", + "other_residential_property_value", + "non_residential_property_value", + "savings", + "cash_isa", + "stocks_and_shares_isa", + "num_vehicles", + "student_loan_balance", + "mortgage_debt", + "consumer_debt" + ], + "nonnegative_outputs": [ + "owned_land", + "property_wealth", + "corporate_wealth", + "gross_financial_wealth", + "main_residence_value", + "other_residential_property_value", + "non_residential_property_value", + "savings", + "cash_isa", + "stocks_and_shares_isa", + "num_vehicles", + "student_loan_balance", + "mortgage_debt", + "consumer_debt" + ], + "notes": "Northern Ireland can borrow the Wales region support when the donor survey does not identify Northern Ireland with enough detail. Cash ISA and stocks-and-shares ISA holdings are surfaced separately; the investment ISA component is also folded into corporate_wealth for backward-compatible wealth totals." + }, + { + "stage": "regional_property_uprating", + "role": "donor", + "survey": "UK House Price Index and regional land-value tables", + "source": "https://www.gov.uk/government/collections/uk-house-price-index-reports", + "grain": "household", + "artifacts": [ + { + "kind": "administrative_table", + "format": "published_table", + "vintage": "build_year", + "locator": "UK House Price Index regional series and land-value adjustment tables" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "regional_property_uprating" + }, + { + "kind": "uprate", + "from_year": "wealth_survey_vintage", + "to_year_from_build_config": true, + "by": [ + "region", + "property_type" + ] + } + ], + "outputs": [ + "property_wealth", + "main_residence_value" + ], + "nonnegative_outputs": [ + "property_wealth", + "main_residence_value" + ] + }, + { + "stage": "lcfs_consumption", + "role": "donor", + "survey": "Living Costs and Food Survey", + "source": "https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/methodologies/livingcostsandfoodsurveyqmi", + "grain": "household", + "artifacts": [ + { + "kind": "survey_microdata", + "format": "licensed_extract", + "vintage": "latest_available", + "locator": "ONS Living Costs and Food Survey household/person extract" + } + ], + "operations": [ + { + "kind": "read_tables", + "tables": [ + "lcfs_household", + "lcfs_person" + ], + "weight": "weight" + }, + { + "kind": "derive", + "outputs": [ + "lcfs_coicop_consumption", + "bus_fare_spending", + "lcfs_fuel_spending", + "has_fuel_consumption" + ], + "source_codes": { + "bus_fare_spending": [ + "c73212", + "c73213", + "c73214" + ] + } + }, + { + "kind": "fit_weighted_qrf", + "predictors": [ + "is_adult", + "is_child", + "region", + "employment_income", + "self_employment_income", + "private_pension_income", + "hbai_household_net_income", + "tenure_type", + "accommodation_type", + "has_fuel_consumption" + ] + }, + { + "kind": "support_clip", + "range": "donor_realized" + } + ], + "outputs": [ + "has_fuel_consumption", + "food_and_non_alcoholic_beverages_consumption", + "alcohol_and_tobacco_consumption", + "clothing_and_footwear_consumption", + "housing_water_and_electricity_consumption", + "household_furnishings_consumption", + "health_consumption", + "transport_consumption", + "communication_consumption", + "recreation_consumption", + "education_consumption", + "restaurants_and_hotels_consumption", + "miscellaneous_consumption", + "bus_fare_spending", + "petrol_spending", + "diesel_spending", + "domestic_energy_consumption", + "electricity_consumption", + "gas_consumption" + ], + "nonnegative_outputs": [ + "food_and_non_alcoholic_beverages_consumption", + "alcohol_and_tobacco_consumption", + "clothing_and_footwear_consumption", + "housing_water_and_electricity_consumption", + "household_furnishings_consumption", + "health_consumption", + "transport_consumption", + "communication_consumption", + "recreation_consumption", + "education_consumption", + "restaurants_and_hotels_consumption", + "miscellaneous_consumption", + "bus_fare_spending", + "petrol_spending", + "diesel_spending", + "domestic_energy_consumption", + "electricity_consumption", + "gas_consumption" + ], + "notes": "The fuel-consumption bridge uses the WAS vehicle signal on recipient households and LCFS fuel purchases on donors." + }, + { + "stage": "road_fuel_energy_calibration", + "role": "donor", + "survey": "Road fuel and household energy administrative totals", + "source": "https://www.gov.uk/government/collections/road-transport-consumption-at-regional-and-local-level", + "grain": "household", + "artifacts": [ + { + "kind": "administrative_table", + "format": "published_table", + "vintage": "build_year", + "locator": "DfT road-fuel consumption totals and DESNZ household energy tables" + } + ], + "operations": [ + { + "kind": "read_tables", + "tables": [ + "road_fuel_consumption", + "domestic_energy_targets" + ] + }, + { + "kind": "uprate", + "variables": [ + "petrol_spending", + "diesel_spending", + "electricity_consumption", + "gas_consumption" + ], + "targets": [ + "road_fuel_consumption", + "domestic_energy_targets" + ], + "weight": "household_weight" + }, + { + "kind": "derive", + "outputs": [ + "domestic_energy_consumption" + ], + "formula": "electricity_consumption + gas_consumption" + } + ], + "outputs": [ + "petrol_spending", + "diesel_spending", + "domestic_energy_consumption", + "electricity_consumption", + "gas_consumption" + ], + "nonnegative_outputs": [ + "petrol_spending", + "diesel_spending", + "domestic_energy_consumption", + "electricity_consumption", + "gas_consumption" + ] + }, + { + "stage": "etb_vat", + "role": "donor", + "survey": "Effects of Taxes and Benefits", + "source": "https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/datasets/theeffectsoftaxesandbenefitsonhouseholdincomehistoricaldatasets", + "grain": "household", + "artifacts": [ + { + "kind": "published_microdata_table", + "format": "spreadsheet", + "vintage": "build_year", + "locator": "ONS Effects of Taxes and Benefits indirect-tax tables" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "etb_household" + }, + { + "kind": "derive", + "outputs": [ + "full_rate_vat_expenditure_rate" + ] + }, + { + "kind": "fit_weighted_qrf", + "predictors": [ + "is_adult", + "is_child", + "is_SP_age", + "household_net_income" + ] + }, + { + "kind": "support_clip", + "range": "donor_realized" + } + ], + "outputs": [ + "full_rate_vat_expenditure_rate" + ], + "nonnegative_outputs": [ + "full_rate_vat_expenditure_rate" + ] + }, + { + "stage": "nhs_usage", + "role": "donor", + "survey": "NHS activity and unit-cost tables", + "source": "https://www.england.nhs.uk/statistics/statistical-work-areas/hospital-activity/monthly-hospital-activity/", + "grain": "person", + "artifacts": [ + { + "kind": "administrative_table", + "format": "published_table", + "vintage": "build_year", + "locator": "NHS activity counts and service-cost totals by age/sex where available" + } + ], + "operations": [ + { + "kind": "read_tables", + "tables": [ + "nhs_activity", + "nhs_unit_costs" + ] + }, + { + "kind": "fit_weighted_imputer", + "predictors": [ + "age", + "gender", + "region", + "disability_benefit_indicators", + "hbai_household_net_income" + ] + }, + { + "kind": "derive", + "outputs": [ + "nhs_visits", + "nhs_spending" + ] + }, + { + "kind": "support_clip", + "range": "administrative_nonnegative" + } + ], + "outputs": [ + "a_and_e_visits", + "admitted_patient_visits", + "outpatient_visits", + "nhs_a_and_e_spending", + "nhs_admitted_patient_spending", + "nhs_outpatient_spending", + "nhs_visits", + "nhs_spending" + ], + "nonnegative_outputs": [ + "a_and_e_visits", + "admitted_patient_visits", + "outpatient_visits", + "nhs_a_and_e_spending", + "nhs_admitted_patient_spending", + "nhs_outpatient_spending", + "nhs_visits", + "nhs_spending" + ] + }, + { + "stage": "etb_public_services", + "role": "donor", + "survey": "Effects of Taxes and Benefits public-service tables", + "source": "https://www.ons.gov.uk/peoplepopulationandcommunity/personalandhouseholdfinances/incomeandwealth/datasets/theeffectsoftaxesandbenefitsonhouseholdincomehistoricaldatasets", + "grain": "household", + "artifacts": [ + { + "kind": "published_microdata_table", + "format": "spreadsheet", + "vintage": "build_year", + "locator": "ONS Effects of Taxes and Benefits benefits-in-kind tables" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "etb_household_services" + }, + { + "kind": "derive", + "outputs": [ + "rail_usage" + ] + }, + { + "kind": "fit_weighted_qrf", + "predictors": [ + "is_adult", + "is_child", + "is_SP_age", + "count_primary_education", + "count_secondary_education", + "count_further_education", + "dla", + "pip", + "hbai_household_net_income" + ] + }, + { + "kind": "support_clip", + "range": "donor_realized" + } + ], + "outputs": [ + "dfe_education_spending", + "rail_subsidy_spending", + "bus_subsidy_spending", + "rail_usage" + ], + "nonnegative_outputs": [ + "dfe_education_spending", + "rail_subsidy_spending", + "bus_subsidy_spending", + "rail_usage" + ] + }, + { + "stage": "bus_public_service_calibration", + "role": "donor", + "survey": "DfT annual bus statistics", + "source": "https://www.gov.uk/government/collections/bus-statistics", + "grain": "household", + "artifacts": [ + { + "kind": "administrative_table", + "format": "published_table", + "vintage": "build_year", + "locator": "DfT bus fare receipts and net government support totals" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "bus_public_service_targets" + }, + { + "kind": "uprate", + "variables": [ + "bus_fare_spending", + "bus_subsidy_spending" + ], + "targets": [ + "bus_public_service_targets" + ], + "weight": "household_weight" + } + ], + "outputs": [ + "bus_fare_spending", + "bus_subsidy_spending" + ], + "nonnegative_outputs": [ + "bus_fare_spending", + "bus_subsidy_spending" + ], + "notes": "Anchors LCFS fare spending and ETB subsidy allocation to DfT Annual Bus Statistics totals. Fare spending is households' passenger fare outlay; subsidy spending is net government support allocated to households." + }, + { + "stage": "rail_public_service_calibration", + "role": "donor", + "survey": "Rail public-service administrative totals", + "source": "https://www.gov.uk/government/collections/rail-statistics", + "grain": "household", + "artifacts": [ + { + "kind": "administrative_table", + "format": "published_table", + "vintage": "build_year", + "locator": "DfT rail passenger and subsidy totals" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "rail_public_service_targets" + }, + { + "kind": "uprate", + "variables": [ + "rail_subsidy_spending", + "rail_usage" + ], + "targets": [ + "rail_public_service_targets" + ], + "weight": "household_weight" + } + ], + "outputs": [ + "rail_subsidy_spending", + "rail_usage" + ], + "nonnegative_outputs": [ + "rail_subsidy_spending", + "rail_usage" + ], + "notes": "This post-weight stage scales rail usage and subsidy after the local/final household weights are available." + }, + { + "stage": "spi_support_channel", + "role": "support", + "survey": "Family Resources Survey 2023-24 support copy", + "source": "https://www.gov.uk/government/collections/family-resources-survey--2", + "grain": "household_person_benunit", + "artifacts": [ + { + "kind": "derived_support_frame", + "format": "in_memory_tables", + "vintage": "2023-24", + "locator": "zero-weight FRS support channel for high-income SPI imputation" + } + ], + "operations": [ + { + "kind": "derive", + "outputs": [ + "zero_weight_spi_support_copy", + "source_household_lineage" + ] + } + ], + "outputs": [ + "household_is_spi_synthetic", + "household_support_channel", + "person_support_channel", + "benunit_support_channel", + "household_support_clone_index", + "person_support_clone_index", + "benunit_support_clone_index", + "household_source_id", + "person_source_id", + "benunit_source_id", + "source_household_id", + "source_household_key" + ], + "notes": "The support copy has zero initial household weight and source-household lineage so local-geography support accounting does not count it as independent FRS sample." + }, + { + "stage": "spi_income", + "role": "donor", + "survey": "Survey of Personal Incomes", + "source": "https://www.gov.uk/government/collections/personal-incomes-statistics", + "grain": "person", + "artifacts": [ + { + "kind": "administrative_microdata_or_tabulation", + "format": "hmrc_spi_extract", + "vintage": "latest_available", + "locator": "HMRC Survey of Personal Incomes person-level income and relief records" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "spi_person", + "weight": "spi_weight" + }, + { + "kind": "fit_weighted_qrf", + "predictors": [ + "age", + "gender", + "region" + ] + }, + { + "kind": "support_clip", + "range": "donor_realized" + } + ], + "outputs": [ + "employment_income", + "self_employment_income", + "savings_interest_income", + "dividend_income", + "private_pension_income", + "property_income", + "gift_aid", + "charitable_investment_gifts" + ], + "nonnegative_outputs": [ + "employment_income", + "self_employment_income", + "savings_interest_income", + "dividend_income", + "private_pension_income", + "property_income", + "gift_aid", + "charitable_investment_gifts" + ], + "notes": "The SPI-trained first stage fills the SPI support channel jointly for income components, Gift Aid, and qualifying investment gifts." + }, + { + "stage": "frs_only_spi_fill", + "role": "donor", + "survey": "Family Resources Survey 2023-24", + "source": "https://www.gov.uk/government/collections/family-resources-survey--2", + "grain": "person", + "artifacts": [ + { + "kind": "survey_microdata", + "format": "tabular_release", + "vintage": "2023-24", + "locator": "DWP Family Resources Survey person-level pension, savings, and reported-benefit fields" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "frs_person" + }, + { + "kind": "fit_weighted_qrf", + "predictors": [ + "age", + "gender", + "region", + "employment_income", + "self_employment_income", + "savings_interest_income", + "dividend_income", + "private_pension_income", + "property_income" + ] + }, + { + "kind": "support_clip", + "range": "donor_realized" + } + ], + "outputs": [ + "employee_pension_contributions", + "employer_pension_contributions", + "personal_pension_contributions", + "pension_contributions_via_salary_sacrifice", + "tax_free_savings_income", + "universal_credit_reported", + "pension_credit_reported", + "child_benefit_reported", + "housing_benefit_reported", + "income_support_reported", + "working_tax_credit_reported", + "child_tax_credit_reported", + "attendance_allowance_reported", + "state_pension_reported", + "dla_sc_reported", + "dla_m_reported", + "pip_m_reported", + "pip_dl_reported", + "sda_reported", + "carers_allowance_reported", + "iidb_reported", + "afcs_reported", + "bsp_reported", + "incapacity_benefit_reported", + "maternity_allowance_reported", + "winter_fuel_allowance_reported", + "council_tax_benefit_reported", + "jsa_contrib_reported", + "jsa_income_reported", + "esa_contrib_reported", + "esa_income_reported" + ], + "nonnegative_outputs": [ + "employee_pension_contributions", + "employer_pension_contributions", + "personal_pension_contributions", + "pension_contributions_via_salary_sacrifice", + "tax_free_savings_income", + "universal_credit_reported", + "pension_credit_reported", + "child_benefit_reported", + "housing_benefit_reported", + "income_support_reported", + "working_tax_credit_reported", + "child_tax_credit_reported", + "attendance_allowance_reported", + "state_pension_reported", + "dla_sc_reported", + "dla_m_reported", + "pip_m_reported", + "pip_dl_reported", + "sda_reported", + "carers_allowance_reported", + "iidb_reported", + "afcs_reported", + "bsp_reported", + "incapacity_benefit_reported", + "maternity_allowance_reported", + "winter_fuel_allowance_reported", + "council_tax_benefit_reported", + "jsa_contrib_reported", + "jsa_income_reported", + "esa_contrib_reported", + "esa_income_reported" + ], + "notes": "This stage replaces benefit receipt and pension/savings behavior on SPI support rows with draws conditional on the SPI-imputed income surface." + }, + { + "stage": "advani_summers_capital_gains", + "role": "donor", + "survey": "Advani-Summers capital gains distribution", + "source": "https://ideas.repec.org/p/hal/wpaper/halshs-03022609.html", + "grain": "household", + "artifacts": [ + { + "kind": "research_table", + "format": "csv", + "vintage": "latest_available", + "locator": "capital gains distribution by income/rank cell" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "capital_gains_distribution" + }, + { + "kind": "calibrate_binary_assignment", + "variable": "household_is_capital_gains_clone", + "weight": "household_weight" + }, + { + "kind": "fit_weighted_imputer", + "predictors": [ + "household_net_income", + "employment_income", + "self_employment_income", + "dividend_income", + "property_income" + ] + }, + { + "kind": "support_clip", + "range": "donor_realized" + } + ], + "outputs": [ + "capital_gains", + "household_is_capital_gains_clone" + ], + "nonnegative_outputs": [ + "capital_gains" + ] + }, + { + "stage": "frs_salary_sacrifice", + "role": "donor", + "survey": "Family Resources Survey salary-sacrifice subsample", + "source": "https://www.gov.uk/government/collections/family-resources-survey--2", + "grain": "person", + "artifacts": [ + { + "kind": "survey_microdata", + "format": "tabular_release", + "vintage": "2023-24", + "locator": "FRS person-level salary-sacrifice fields with OBR/ASHE aggregate target" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "frs_person_salary_sacrifice" + }, + { + "kind": "fit_weighted_qrf", + "predictors": [ + "age", + "employment_income" + ] + }, + { + "kind": "fold_into", + "target": "employee_pension_contributions", + "amount": "pension_contributions_via_salary_sacrifice" + }, + { + "kind": "support_clip", + "range": "donor_realized" + } + ], + "outputs": [ + "pension_contributions_via_salary_sacrifice", + "employee_pension_contributions" + ], + "nonnegative_outputs": [ + "pension_contributions_via_salary_sacrifice", + "employee_pension_contributions" + ] + }, + { + "stage": "slc_student_loan_plan", + "role": "donor", + "survey": "Student Loans Company repayment-plan statistics", + "source": "https://www.gov.uk/government/collections/student-loans-for-higher-and-further-education", + "grain": "person", + "artifacts": [ + { + "kind": "administrative_table", + "format": "published_table", + "vintage": "build_year", + "locator": "SLC borrower plan snapshot by cohort and geography" + } + ], + "operations": [ + { + "kind": "read_table", + "table": "slc_student_loan_snapshot" + }, + { + "kind": "assign_by_plan_type", + "predictors": [ + "age", + "student_loan_balance", + "education_status", + "region" + ], + "output": "student_loan_plan" + } + ], + "outputs": [ + "student_loan_plan" + ] + }, + { + "stage": "rowwise_oa_geography", + "role": "geography", + "survey": "UK official small-area geography crosswalks", + "source": "https://geoportal.statistics.gov.uk/", + "grain": "household", + "artifacts": [ + { + "kind": "public_geography", + "format": "csv_or_geojson", + "vintage": "build_year", + "locator": "ONS, NRS, NISRA, and postcode-directory OA/DZ to LA/constituency crosswalks" + } + ], + "operations": [ + { + "kind": "read_tables", + "tables": [ + "uk_official_geography_crosswalk", + "household_region_frame" + ] + }, + { + "kind": "derive", + "outputs": [ + "rowwise_household_clones", + "finest_available_geography_assignment" + ] + }, + { + "kind": "join", + "on": [ + "household_id", + "source_household_id" + ] + } + ], + "outputs": [ + "oa_code", + "lsoa_code", + "msoa_code", + "la_code_oa", + "constituency_code_oa", + "region_code_oa" + ], + "notes": "This is the long-format local geography path: each household row receives one finest-area assignment, and later local weights export as area-household rows rather than a dense area-by-household matrix." + } + ] +} diff --git a/packages/populace-build/src/populace/build/uk_runtime/__init__.py b/packages/populace-build/src/populace/build/uk_runtime/__init__.py index f0addf9..37085d6 100644 --- a/packages/populace-build/src/populace/build/uk_runtime/__init__.py +++ b/packages/populace-build/src/populace/build/uk_runtime/__init__.py @@ -42,11 +42,15 @@ write_geography_crosswalk, ) from populace.build.uk_runtime.local_geography import ( + AREA_TYPE_TO_ROWWISE_HOUSEHOLD_COLUMN, LONG_GEOGRAPHY_COLUMNS, StackedLocalMatrix, align_area_targets, area_support_summary, + assigned_weights_to_long, + build_assigned_local_matrix, build_stacked_local_matrix, + rowwise_assignment_column, sort_households_by_id, stacked_design_weights, stacked_weights_to_long, @@ -68,6 +72,7 @@ ) from populace.build.uk_runtime.local_solver import ( StackedLocalSolveResult, + solve_assigned_local_weights, solve_stacked_local_weights, ) from populace.build.uk_runtime.local_targets import ( @@ -131,6 +136,7 @@ "AREA_TYPE_TO_LEDGER_GEOGRAPHY_LEVEL", "AREA_TYPES", "AREA_TYPE_TO_CROSSWALK_COLUMN", + "AREA_TYPE_TO_ROWWISE_HOUSEHOLD_COLUMN", "BASE_FRS_SUPPORT_CHANNEL", "BENUNIT_ID_COLUMNS", "COUNTRY_TO_REGION", @@ -176,11 +182,13 @@ "UK_SINGLE_YEAR_TABLES", "UK_SPI_SUPPORT_STAGE_NAME", "align_area_targets", + "assigned_weights_to_long", "area_support_summary", "area_groups_from_codes", "assign_household_geography", "build_local_candidate", "build_local_candidate_from_dataset", + "build_assigned_local_matrix", "build_complete_uk_geography_crosswalk", "build_england_wales_crosswalk", "build_great_britain_crosswalk", @@ -220,7 +228,9 @@ "prepare_geography_crosswalk", "prepare_household_frame", "read_local_table", + "rowwise_assignment_column", "set_simulation_area_group", + "solve_assigned_local_weights", "solve_stacked_local_weights", "sort_households_by_id", "stacked_design_weights", diff --git a/packages/populace-build/src/populace/build/uk_runtime/local_geography.py b/packages/populace-build/src/populace/build/uk_runtime/local_geography.py index f87c164..b6cc367 100644 --- a/packages/populace-build/src/populace/build/uk_runtime/local_geography.py +++ b/packages/populace-build/src/populace/build/uk_runtime/local_geography.py @@ -1,10 +1,12 @@ """Longwise UK local-geography build primitives. -This module owns the representation that lets Populace replace the legacy +This module owns the representations that let Populace replace the legacy UK incumbent ``areas x households`` matrix artifacts: * a stacked sparse matrix whose columns are ``area_index * n_households + household_index``; and +* an assigned sparse matrix whose columns are household weights and whose + target rows only see households assigned to that local area; and * a longweight sidecar with one row per non-zero ``(area, household, weight)`` assignment. @@ -39,6 +41,12 @@ "weight_source", ) +AREA_TYPE_TO_ROWWISE_HOUSEHOLD_COLUMN = { + "constituency": "constituency_code_oa", + "la": "la_code_oa", + "local_authority": "la_code_oa", +} + _AREA_METADATA_COLUMNS = frozenset( { "area_code", @@ -54,7 +62,7 @@ @dataclass(frozen=True) class StackedLocalMatrix: - """Sparse stacked local-area calibration matrix and aligned targets.""" + """Sparse local-area calibration matrix and aligned targets.""" matrix: sp.csr_matrix targets: np.ndarray @@ -226,8 +234,10 @@ def build_stacked_local_matrix( ) cache_key = (group, metric_index) if cache_key not in nonzero_cache: - column = metric_tables[group].iloc[:, metric_index].to_numpy( - dtype=np.float64 + column = ( + metric_tables[group] + .iloc[:, metric_index] + .to_numpy(dtype=np.float64) ) if not np.isfinite(column).all(): raise ValueError( @@ -267,6 +277,162 @@ def build_stacked_local_matrix( ) +def rowwise_assignment_column( + area_type: str, + *, + assignment_column: str | None = None, +) -> str: + """Return the household column carrying rowwise local geography codes.""" + + if assignment_column is not None: + column = str(assignment_column).strip() + if column == "": + raise ValueError("assignment_column must not be blank.") + return column + if area_type not in AREA_TYPE_TO_ROWWISE_HOUSEHOLD_COLUMN: + raise ValueError( + f"No default rowwise assignment column is defined for {area_type!r}." + ) + return AREA_TYPE_TO_ROWWISE_HOUSEHOLD_COLUMN[area_type] + + +def build_assigned_local_matrix( + metrics: pd.DataFrame | Mapping[str, pd.DataFrame], + targets: pd.DataFrame, + *, + household_frame: pd.DataFrame, + area_codes: Sequence[str] | None = None, + area_groups: Mapping[str, str] | None = None, + household_ids: Sequence[Any] | None = None, + area_type: str = "constituency", + code_column: str = "code", + assignment_column: str | None = None, +) -> StackedLocalMatrix: + """Build a rowwise-assigned sparse matrix for local-area calibration. + + Unlike :func:`build_stacked_local_matrix`, each household has a single + column. A household contributes only to the target rows for the local area + stored in its rowwise geography assignment column, such as + ``constituency_code_oa`` or ``la_code_oa``. + """ + + if area_codes is None: + if code_column not in targets.columns: + raise ValueError( + "area_codes must be supplied when targets has no " + f"{code_column!r} column." + ) + area_codes = targets[code_column].astype(str).tolist() + codes = _area_code_tuple(area_codes) + if household_ids is None: + if "household_id" not in household_frame.columns: + raise ValueError("household_frame must include 'household_id'.") + household_ids = household_frame["household_id"].to_numpy() + hh_ids = np.asarray(household_ids) + aligned_households = _align_household_frame(household_frame, hh_ids) + assert aligned_households is not None + assignment_name = rowwise_assignment_column( + area_type, + assignment_column=assignment_column, + ) + if assignment_name not in aligned_households.columns: + raise ValueError( + f"household_frame is missing rowwise assignment column {assignment_name!r}." + ) + assignments = _normalise_area_assignments(aligned_households[assignment_name]) + + metric_tables, groups = _normalise_metric_tables( + metrics, + area_codes=codes, + area_groups=area_groups, + household_ids=hh_ids, + ) + first = next(iter(metric_tables.values())) + metric_names = tuple(str(col) for col in first.columns) + target_values = align_area_targets( + targets, + codes, + metric_names=metric_names, + code_column=code_column, + ) + + n_households = len(first) + n_areas = len(codes) + n_metrics = len(metric_names) + n_targets = n_areas * n_metrics + rows: list[np.ndarray] = [] + cols: list[np.ndarray] = [] + data: list[np.ndarray] = [] + target_rows: list[dict[str, Any]] = [] + assignment_indices = { + area_code: np.flatnonzero(assignments == area_code) for area_code in codes + } + metric_cache: dict[tuple[str, int], np.ndarray] = {} + + for area_index, area_code in enumerate(codes): + group = groups[area_code] + household_positions = assignment_indices[area_code] + for metric_index, metric_name in enumerate(metric_names): + target_index = area_index * n_metrics + metric_index + target_rows.append( + { + "target_index": target_index, + "area_type": area_type, + "area_code": area_code, + "area_index": area_index, + "area_group": group, + "metric": metric_name, + "metric_index": metric_index, + "value": float(target_values.loc[area_code, metric_name]), + } + ) + if len(household_positions) == 0: + continue + cache_key = (group, metric_index) + if cache_key not in metric_cache: + column = ( + metric_tables[group] + .iloc[:, metric_index] + .to_numpy(dtype=np.float64) + ) + if not np.isfinite(column).all(): + raise ValueError( + f"metric {metric_name!r} for group {group!r} " + "contains non-finite values." + ) + metric_cache[cache_key] = column + values = metric_cache[cache_key][household_positions] + nz = np.flatnonzero(values) + if len(nz) == 0: + continue + rows.append(np.full(len(nz), target_index, dtype=np.int64)) + cols.append(household_positions[nz].astype(np.int64)) + data.append(values[nz].astype(np.float64, copy=False)) + + if rows: + row_array = np.concatenate(rows) + col_array = np.concatenate(cols) + data_array = np.concatenate(data) + else: + row_array = np.array([], dtype=np.int64) + col_array = np.array([], dtype=np.int64) + data_array = np.array([], dtype=np.float64) + matrix = sp.csr_matrix( + (data_array, (row_array, col_array)), + shape=(n_targets, n_households), + dtype=np.float64, + ) + target_frame = pd.DataFrame(target_rows) + return StackedLocalMatrix( + matrix=matrix, + targets=target_frame["value"].to_numpy(dtype=np.float64), + target_frame=target_frame, + area_codes=codes, + metric_names=metric_names, + n_households=n_households, + ) + + def stacked_design_weights( base_weights: Sequence[float], n_areas: int, @@ -362,6 +528,114 @@ def stacked_weights_to_long( return out.loc[:, LONG_GEOGRAPHY_COLUMNS] +def assigned_weights_to_long( + weights: Sequence[float], + area_codes: Sequence[str], + household_ids: Sequence[Any], + *, + area_type: str, + household_frame: pd.DataFrame, + assignment_column: str | None = None, + base_weights: Sequence[float] | None = None, + drop_weight_atol: float = 0.0, + source_year: int | None = None, + weight_source: str = "populace_local_assigned", + drop_zero: bool = True, +) -> pd.DataFrame: + """Convert assigned household weights to the local-geography sidecar.""" + + codes = _area_code_tuple(area_codes) + hh_ids = np.asarray(household_ids) + n_households = len(hh_ids) + w = np.asarray(weights, dtype=np.float64).reshape(-1) + if len(w) != n_households: + raise ValueError( + f"weights length must equal household count ({n_households}), got {len(w)}." + ) + if not np.isfinite(w).all() or (w < 0).any(): + raise ValueError("weights must be finite and non-negative.") + base = None if base_weights is None else np.asarray(base_weights, dtype=np.float64) + if base is not None: + if base.shape != w.shape: + raise ValueError( + f"base_weights must align with weights, got {base.shape} vs {w.shape}." + ) + if not np.isfinite(base).all() or (base < 0).any(): + raise ValueError("base_weights must be finite and non-negative.") + if not np.isfinite(drop_weight_atol) or drop_weight_atol < 0: + raise ValueError("drop_weight_atol must be finite and non-negative.") + + household_frame = _align_household_frame(household_frame, hh_ids) + assert household_frame is not None + assignment_name = rowwise_assignment_column( + area_type, + assignment_column=assignment_column, + ) + if assignment_name not in household_frame.columns: + raise ValueError( + f"household_frame is missing rowwise assignment column {assignment_name!r}." + ) + assignments = _normalise_area_assignments(household_frame[assignment_name]) + area_index_by_code = {area_code: idx for idx, area_code in enumerate(codes)} + in_requested_area = np.fromiter( + (area_code in area_index_by_code for area_code in assignments), + dtype=bool, + count=n_households, + ) + if drop_zero: + if base is None: + in_requested_area &= w > drop_weight_atol + else: + zero_base_floor = (base == 0) & (w <= drop_weight_atol) + in_requested_area &= (w != 0) & ~zero_base_floor + selected = np.flatnonzero(in_requested_area) + + source_year_values = _metadata_values( + household_frame, + "source_year", + default=source_year, + length=n_households, + ) + source_household_ids = _metadata_values( + household_frame, + "source_household_id", + default=hh_ids, + length=n_households, + ) + source_keys = _metadata_values( + household_frame, + "source_household_key", + default=_source_keys(source_year_values, source_household_ids), + length=n_households, + ) + clone_index = _metadata_values( + household_frame, + "clone_index", + default=0, + length=n_households, + ) + + selected_area_codes = assignments[selected] + out = pd.DataFrame( + { + "area_type": area_type, + "area_code": selected_area_codes, + "area_index": [ + area_index_by_code[area_code] for area_code in selected_area_codes + ], + "household_index": selected.astype(np.int64), + "household_id": hh_ids[selected], + "source_year": source_year_values[selected], + "source_household_id": source_household_ids[selected], + "source_household_key": source_keys[selected], + "clone_index": clone_index[selected], + "weight": w[selected], + "weight_source": weight_source, + } + ) + return out.loc[:, LONG_GEOGRAPHY_COLUMNS] + + def area_support_summary( long_weights: pd.DataFrame, *, @@ -477,8 +751,7 @@ def _normalise_metric_tables( for group, frame in tables.items(): if len(frame) != len(first): raise ValueError( - f"metric table {group!r} has {len(frame)} rows; expected " - f"{len(first)}." + f"metric table {group!r} has {len(frame)} rows; expected {len(first)}." ) if not frame.index.equals(first.index): raise ValueError( @@ -580,6 +853,14 @@ def _align_household_frame( return aligned.reset_index(drop=True) +def _normalise_area_assignments(values: Sequence[Any]) -> np.ndarray: + series = pd.Series(values) + missing = series.isna() + strings = series.astype(str).str.strip() + strings = strings.mask(missing | (strings == ""), None) + return strings.to_numpy(dtype=object) + + def _source_keys( source_year: Sequence[Any], source_household_id: Sequence[Any], diff --git a/packages/populace-build/src/populace/build/uk_runtime/local_runner.py b/packages/populace-build/src/populace/build/uk_runtime/local_runner.py index a86a2de..bb0f9c2 100644 --- a/packages/populace-build/src/populace/build/uk_runtime/local_runner.py +++ b/packages/populace-build/src/populace/build/uk_runtime/local_runner.py @@ -20,13 +20,17 @@ from populace.build.uk_runtime.local_geography import ( StackedLocalMatrix, area_support_summary, + assigned_weights_to_long, + build_assigned_local_matrix, build_stacked_local_matrix, + rowwise_assignment_column, sort_households_by_id, stacked_weights_to_long, write_long_geography_weights, ) from populace.build.uk_runtime.local_solver import ( StackedLocalSolveResult, + solve_assigned_local_weights, solve_stacked_local_weights, ) from populace.build.uk_runtime.local_targets import ( @@ -44,6 +48,7 @@ class UKLocalCandidateResult: solve_result: StackedLocalSolveResult long_weights: pd.DataFrame support_summary: pd.DataFrame + support_mode: str def read_local_table(path: str | Path) -> pd.DataFrame: @@ -260,6 +265,8 @@ def build_local_candidate( max_areas: int | None = None, source_year: int | None = None, weight_source: str = "populace_uk_local", + support_mode: str = "auto", + assignment_column: str | None = None, solver_options: Mapping[str, Any] | None = None, ) -> UKLocalCandidateResult: """Build, solve, and export a UK local candidate in longwise form.""" @@ -281,32 +288,81 @@ def build_local_candidate( code_column=code_column, group_column=group_column, ) - household_ids = households["household_id"].to_numpy() - base_weights = households["household_weight"].to_numpy(dtype=np.float64) - target_frame = _as_frame(targets) - problem = build_stacked_local_matrix( - metrics, - target_frame, - area_codes=area_codes, - area_groups=area_groups, - household_ids=household_ids, - area_type=area_type, - code_column=code_column, - ) - solve_result = solve_stacked_local_weights( - problem, - base_weights, - **dict(solver_options or {}), - ) - long_weights = stacked_weights_to_long( - solve_result.weights, - area_codes, - household_ids, + resolved_support_mode = _resolve_support_mode( + support_mode, area_type=area_type, household_frame=households, - source_year=source_year, - weight_source=weight_source, + assignment_column=assignment_column, ) + if resolved_support_mode == "assigned": + households = _filter_assigned_households_to_areas( + households, + area_codes=area_codes, + area_type=area_type, + assignment_column=assignment_column, + ) + metrics = _subset_metric_tables_to_households( + metrics, + households["household_id"].to_numpy(), + ) + household_ids = households["household_id"].to_numpy() + base_weights = households["household_weight"].to_numpy(dtype=np.float64) + target_frame = _as_frame(targets) + solver_config = dict(solver_options or {}) + if resolved_support_mode == "assigned": + problem = build_assigned_local_matrix( + metrics, + target_frame, + household_frame=households, + area_codes=area_codes, + area_groups=area_groups, + household_ids=household_ids, + area_type=area_type, + code_column=code_column, + assignment_column=assignment_column, + ) + solve_result = solve_assigned_local_weights( + problem, + base_weights, + **solver_config, + ) + min_initial_weight = float(solver_config.get("min_initial_weight", 1e-4)) + long_weights = assigned_weights_to_long( + solve_result.weights, + area_codes, + household_ids, + area_type=area_type, + household_frame=households, + assignment_column=assignment_column, + base_weights=base_weights, + drop_weight_atol=min_initial_weight, + source_year=source_year, + weight_source=weight_source, + ) + else: + problem = build_stacked_local_matrix( + metrics, + target_frame, + area_codes=area_codes, + area_groups=area_groups, + household_ids=household_ids, + area_type=area_type, + code_column=code_column, + ) + solve_result = solve_stacked_local_weights( + problem, + base_weights, + **solver_config, + ) + long_weights = stacked_weights_to_long( + solve_result.weights, + area_codes, + household_ids, + area_type=area_type, + household_frame=households, + source_year=source_year, + weight_source=weight_source, + ) return UKLocalCandidateResult( problem=problem, solve_result=solve_result, @@ -316,6 +372,7 @@ def build_local_candidate( area_codes=area_codes, area_type=area_type, ), + support_mode=resolved_support_mode, ) @@ -333,6 +390,8 @@ def build_local_candidate_from_dataset( max_areas: int | None = None, source_year: int | None = None, weight_source: str = "populace_uk_local", + support_mode: str = "auto", + assignment_column: str | None = None, simulation_factory: Callable[[Any], Any] | None = None, target_profile: Mapping[str, Any] | Any | None = None, solver_options: Mapping[str, Any] | None = None, @@ -379,6 +438,8 @@ def build_local_candidate_from_dataset( sort_areas_by_code=False, source_year=source_year, weight_source=weight_source, + support_mode=support_mode, + assignment_column=assignment_column, solver_options=solver_options, ) @@ -398,6 +459,7 @@ def summarize_local_candidate(result: UKLocalCandidateResult) -> dict[str, Any]: "n_targets": int(len(result.problem.targets)), "n_long_rows": int(len(result.long_weights)), "n_nonzero": int(result.solve_result.n_nonzero), + "support_mode": result.support_mode, "initial_loss": float(result.solve_result.initial_loss), "final_loss": float(result.solve_result.final_loss), "weight_sum": float(result.long_weights["weight"].sum()), @@ -465,6 +527,109 @@ def _normalise_nonblank_strings(values: pd.Series, *, column: str) -> pd.Series: return strings +def _resolve_support_mode( + support_mode: str, + *, + area_type: str, + household_frame: pd.DataFrame, + assignment_column: str | None, +) -> str: + mode = str(support_mode).strip().lower() + valid_modes = {"auto", "assigned", "stacked"} + if mode not in valid_modes: + raise ValueError(f"support_mode must be one of {sorted(valid_modes)}.") + if mode == "stacked": + return mode + try: + column = rowwise_assignment_column( + area_type, + assignment_column=assignment_column, + ) + except ValueError: + if mode == "auto": + return "stacked" + raise + if mode == "assigned": + return mode + return "assigned" if column in household_frame.columns else "stacked" + + +def _filter_assigned_households_to_areas( + households: pd.DataFrame, + *, + area_codes: Sequence[str], + area_type: str, + assignment_column: str | None, +) -> pd.DataFrame: + column = rowwise_assignment_column(area_type, assignment_column=assignment_column) + if column not in households.columns: + raise ValueError( + f"household_frame is missing rowwise assignment column {column!r}." + ) + assignments = _normalise_optional_strings(households[column]) + mask = assignments.isin(set(map(str, area_codes))) + filtered = households.loc[mask].reset_index(drop=True) + if filtered.empty: + raise ValueError( + "no households are assigned to the requested local area codes." + ) + return filtered + + +def _subset_metric_tables_to_households( + metrics: pd.DataFrame | Mapping[str, pd.DataFrame], + household_ids: Sequence[Any], +) -> pd.DataFrame | dict[str, pd.DataFrame]: + if isinstance(metrics, pd.DataFrame): + return _subset_metric_table_to_households( + metrics, + household_ids, + group="__all__", + ) + return { + str(group): _subset_metric_table_to_households( + frame, + household_ids, + group=str(group), + ) + for group, frame in metrics.items() + } + + +def _subset_metric_table_to_households( + table: pd.DataFrame, + household_ids: Sequence[Any], + *, + group: str, +) -> pd.DataFrame: + expected = pd.Index(household_ids) + if expected.has_duplicates: + duplicates = expected[expected.duplicated()].unique() + raise ValueError( + "assigned household IDs must be unique before metric subsetting; " + f"duplicate value(s): {list(map(str, duplicates[:5]))}." + ) + if table.index.has_duplicates: + duplicates = table.index[table.index.duplicated()].unique() + raise ValueError( + f"metric table {group!r} household index must be unique; " + f"duplicate value(s): {list(map(str, duplicates[:5]))}." + ) + missing = expected.difference(table.index) + if len(missing): + raise ValueError( + f"metric table {group!r} is missing household_id value(s): " + f"{list(map(str, missing[:5]))}." + ) + return table.reindex(expected) + + +def _normalise_optional_strings(values: pd.Series) -> pd.Series: + missing = values.isna() + strings = values.astype(str).str.strip() + return strings.mask(missing | (strings == ""), None) + + def _source_household_keys( household_frame: pd.DataFrame, *, @@ -550,12 +715,6 @@ def _align_metric_table_to_households( f"metric table {group!r} is missing household_id value(s): " f"{list(map(str, missing[:5]))}." ) - extra = table.index.difference(expected) - if len(extra): - raise ValueError( - f"metric table {group!r} has unexpected household_id value(s): " - f"{list(map(str, extra[:5]))}." - ) return table.reindex(expected) diff --git a/packages/populace-build/src/populace/build/uk_runtime/local_solver.py b/packages/populace-build/src/populace/build/uk_runtime/local_solver.py index 17bd9d2..f4b8b50 100644 --- a/packages/populace-build/src/populace/build/uk_runtime/local_solver.py +++ b/packages/populace-build/src/populace/build/uk_runtime/local_solver.py @@ -1,4 +1,4 @@ -"""Solver wrapper for UK stacked local-geography weights.""" +"""Solver wrappers for UK local-geography weights.""" from __future__ import annotations @@ -72,6 +72,100 @@ def solve_stacked_local_weights( problem.n_areas, min_weight=min_initial_weight, ) + if len(initial_weights) != problem.matrix.shape[1]: + raise ValueError( + "base_weights expanded to the wrong stacked length: " + f"{len(initial_weights)} vs {problem.matrix.shape[1]}." + ) + return _solve_local_weights( + problem, + initial_weights, + epochs=epochs, + learning_rate=learning_rate, + max_weight_ratio=max_weight_ratio, + conserve_mass=conserve_mass, + target_records=target_records, + l0_lambda=l0_lambda, + target_loss_weights=target_loss_weights, + target_loss_scales=target_loss_scales, + target_loss_cap=target_loss_cap, + budget_iters=budget_iters, + seed=seed, + ) + + +def solve_assigned_local_weights( + problem: StackedLocalMatrix, + base_weights: Sequence[float], + *, + epochs: int = 512, + learning_rate: float = 0.15, + max_weight_ratio: float | None = None, + conserve_mass: bool = False, + target_records: int | None = None, + l0_lambda: float = 0.0, + min_initial_weight: float = 1e-4, + target_loss_weights: Sequence[float] | None = None, + target_loss_scales: Sequence[float] | None = None, + target_loss_cap: float = 10.0, + budget_iters: int = 10, + seed: int = 0, +) -> StackedLocalSolveResult: + """Solve rowwise-assigned local weights for a Populace UK local build. + + ``base_weights`` align one-to-one with the household columns in ``problem``. + The optional ``min_initial_weight`` floor mirrors the stacked solver and is + required by the torch log-weight optimizer. The assigned path defaults to + no ``max_weight_ratio`` cap so zero-weight support rows, such as synthetic + SPI rows, can be upweighted from the optimizer floor. + """ + + weights = np.asarray(base_weights, dtype=np.float64) + if weights.ndim != 1: + raise ValueError("base_weights must be one-dimensional.") + if not np.isfinite(weights).all() or (weights < 0).any(): + raise ValueError("base_weights must be finite and non-negative.") + if not np.isfinite(min_initial_weight) or min_initial_weight < 0: + raise ValueError("min_initial_weight must be finite and non-negative.") + initial_weights = np.maximum(weights, min_initial_weight) + if len(initial_weights) != problem.matrix.shape[1]: + raise ValueError( + "base_weights must align with the assigned local matrix columns: " + f"{len(initial_weights)} vs {problem.matrix.shape[1]}." + ) + return _solve_local_weights( + problem, + initial_weights, + epochs=epochs, + learning_rate=learning_rate, + max_weight_ratio=max_weight_ratio, + conserve_mass=conserve_mass, + target_records=target_records, + l0_lambda=l0_lambda, + target_loss_weights=target_loss_weights, + target_loss_scales=target_loss_scales, + target_loss_cap=target_loss_cap, + budget_iters=budget_iters, + seed=seed, + ) + + +def _solve_local_weights( + problem: StackedLocalMatrix, + initial_weights: np.ndarray, + *, + epochs: int, + learning_rate: float, + max_weight_ratio: float | None, + conserve_mass: bool, + target_records: int | None, + l0_lambda: float, + target_loss_weights: Sequence[float] | None, + target_loss_scales: Sequence[float] | None, + target_loss_cap: float, + budget_iters: int, + seed: int, +) -> StackedLocalSolveResult: targets = np.asarray(problem.targets, dtype=np.float64) scales = ( default_target_loss_scales(targets) @@ -93,14 +187,9 @@ def solve_stacked_local_weights( "target_loss_weights must align with targets, got " f"{loss_weights.shape} vs {targets.shape}." ) - if len(initial_weights) != problem.matrix.shape[1]: - raise ValueError( - "base_weights expanded to the wrong stacked length: " - f"{len(initial_weights)} vs {problem.matrix.shape[1]}." - ) if (initial_weights <= 0).any(): raise ValueError( - "all expanded initial weights must be strictly positive for the " + "all initial weights must be strictly positive for the " "log-weight optimizer; use a positive min_initial_weight or remove " "zero-weight records before solving." ) diff --git a/packages/populace-build/src/populace/build/uk_runtime/spi_support.py b/packages/populace-build/src/populace/build/uk_runtime/spi_support.py index 88f2ebc..9a671c2 100644 --- a/packages/populace-build/src/populace/build/uk_runtime/spi_support.py +++ b/packages/populace-build/src/populace/build/uk_runtime/spi_support.py @@ -33,9 +33,9 @@ "property_income", ) -# Mirrors the eFRS SPI-trained first-stage QRF output surface. Gift Aid and -# qualifying investment gifts are relief variables, not income components, but -# they need to be drawn jointly with high-income SPI rows. +# UK SPI-trained first-stage QRF output surface. Gift Aid and qualifying +# investment gifts are relief variables, not income components, but they need +# to be drawn jointly with high-income SPI rows. SPI_INCOME_IMPUTATION_COLUMNS = SPI_INCOME_COMPONENT_COLUMNS + ( "gift_aid", "charitable_investment_gifts", @@ -48,9 +48,9 @@ *SPI_INCOME_COMPONENT_COLUMNS, ) -# Mirrors the eFRS second-stage FRS-only QRF output surface. These fields are -# replaced on SPI support rows so high-income synthetic rows do not retain a -# random middle-income FRS donor's benefit receipt or pension behavior. +# UK second-stage FRS-only QRF output surface. These fields are replaced on SPI +# support rows so high-income synthetic rows do not retain a random +# middle-income FRS donor's benefit receipt or pension behavior. FRS_ONLY_SPI_FILL_PERSON_COLUMNS = ( "employee_pension_contributions", "employer_pension_contributions", @@ -245,7 +245,7 @@ def fill_support_channel_from_source( QRF prediction frame keyed by original ``person_id``. Rows outside ``channel`` are left unchanged. Missing target columns are initialized to ``fill_missing_columns_with`` before the channel-specific update, matching - the eFRS treatment of SPI-only variables such as charitable-giving fields. + the UK SPI treatment of variables such as charitable-giving fields. """ entity = _require_entity(entity) diff --git a/packages/populace-build/tests/test_uk_local_geography.py b/packages/populace-build/tests/test_uk_local_geography.py index bbe55c7..a46fdea 100644 --- a/packages/populace-build/tests/test_uk_local_geography.py +++ b/packages/populace-build/tests/test_uk_local_geography.py @@ -7,6 +7,8 @@ from populace.build.uk_runtime import ( LONG_GEOGRAPHY_COLUMNS, area_support_summary, + assigned_weights_to_long, + build_assigned_local_matrix, build_stacked_local_matrix, sort_households_by_id, stacked_design_weights, @@ -82,6 +84,61 @@ def test_build_stacked_local_matrix_uses_area_blocks_and_group_metrics() -> None np.testing.assert_allclose(dense[3], [0.0, 0.0, 0.0, 0.0, 2.0, 2.0]) +def test_build_assigned_local_matrix_uses_rowwise_area_assignments() -> None: + metrics = { + "England": pd.DataFrame( + { + "population": [1.0, 2.0, 3.0], + "earnings": [10.0, 20.0, 30.0], + }, + index=[101, 102, 103], + ), + "Scotland": pd.DataFrame( + { + "population": [100.0, 200.0, 300.0], + "earnings": [1000.0, 2000.0, 3000.0], + }, + index=[101, 102, 103], + ), + } + targets = pd.DataFrame( + { + "code": ["S001", "E001"], + "population": [300.0, 1.0], + "earnings": [3000.0, 10.0], + } + ) + households = pd.DataFrame( + { + "household_id": [103, 101, 102], + "constituency_code_oa": ["S001", "E001", "E999"], + } + ) + + assigned = build_assigned_local_matrix( + metrics, + targets, + household_frame=households, + area_codes=["E001", "S001"], + area_groups={"E001": "England", "S001": "Scotland"}, + household_ids=[101, 102, 103], + ) + + assert assigned.matrix.shape == (4, 3) + assert assigned.targets.tolist() == [1.0, 10.0, 300.0, 3000.0] + assert assigned.target_frame["area_code"].tolist() == [ + "E001", + "E001", + "S001", + "S001", + ] + dense = assigned.matrix.toarray() + np.testing.assert_allclose(dense[0], [1.0, 0.0, 0.0]) + np.testing.assert_allclose(dense[1], [10.0, 0.0, 0.0]) + np.testing.assert_allclose(dense[2], [0.0, 0.0, 300.0]) + np.testing.assert_allclose(dense[3], [0.0, 0.0, 3000.0]) + + def test_build_stacked_local_matrix_rejects_drifted_household_index() -> None: metrics = { "England": pd.DataFrame({"population": [1.0, 2.0]}, index=[101, 102]), @@ -156,6 +213,57 @@ def test_stacked_weights_to_long_preserves_source_metadata() -> None: assert long["clone_index"].tolist() == [0, 0, 3] +def test_assigned_weights_to_long_preserves_metadata_and_filters_area_codes() -> None: + household_frame = pd.DataFrame( + { + "household_id": [102, 103, 101], + "constituency_code_oa": ["S001", "E999", "E001"], + "source_year": [2022, 2021, 2023], + "source_household_id": ["b", "c", "a"], + "source_household_key": ["2022:b", "2021:c", "2023:a"], + "clone_index": [3, 2, 0], + } + ) + + long = assigned_weights_to_long( + [1.5, 2.5, 3.5], + ["E001", "S001"], + [101, 102, 103], + area_type="constituency", + household_frame=household_frame, + ) + + assert tuple(long.columns) == LONG_GEOGRAPHY_COLUMNS + assert long["weight"].tolist() == [1.5, 2.5] + assert long["area_code"].tolist() == ["E001", "S001"] + assert long["area_index"].tolist() == [0, 1] + assert long["household_id"].tolist() == [101, 102] + assert long["source_household_key"].tolist() == ["2023:a", "2022:b"] + assert long["clone_index"].tolist() == [0, 3] + + +def test_assigned_weights_to_long_drops_unused_zero_base_floor_weights() -> None: + household_frame = pd.DataFrame( + { + "household_id": [101, 102, 103], + "constituency_code_oa": ["E001", "E001", "E001"], + } + ) + + long = assigned_weights_to_long( + [1.0, 1e-4, 0.5], + ["E001"], + [101, 102, 103], + area_type="constituency", + household_frame=household_frame, + base_weights=[1.0, 0.0, 0.0], + drop_weight_atol=1e-4, + ) + + assert long["household_id"].tolist() == [101, 103] + assert long["weight"].tolist() == [1.0, 0.5] + + def test_stacked_weights_to_long_rejects_missing_household_metadata() -> None: household_frame = pd.DataFrame({"household_id": [101], "source_year": [2023]}) diff --git a/packages/populace-build/tests/test_uk_local_runner.py b/packages/populace-build/tests/test_uk_local_runner.py index 05ec3ed..02fa39d 100644 --- a/packages/populace-build/tests/test_uk_local_runner.py +++ b/packages/populace-build/tests/test_uk_local_runner.py @@ -160,6 +160,35 @@ def fake_compute(sim, area_type, *, period=None, household_ids=None): assert tables["England"]["population"].tolist() == [10.0, 20.0] +def test_build_metric_tables_from_dataset_allows_selected_households( + monkeypatch, +) -> None: + class ExtraHouseholdSimulation(FakeSimulation): + def calculate(self, variable, **_kwargs): + assert variable == "household_id" + return Result([103, 102, 101]) + + def fake_compute(sim, area_type, *, period=None, household_ids=None): + assert household_ids is None + return pd.DataFrame( + {"population": [30.0, 20.0, 10.0]}, + index=pd.Index([103, 102, 101]), + ) + + monkeypatch.setattr(local_runner, "compute_household_metrics", fake_compute) + + tables = build_metric_tables_from_dataset( + dataset=type("Dataset", (), {"time_period": 2023})(), + area_groups={"E001": "England"}, + area_type="constituency", + household_ids=[101, 102], + simulation_factory=ExtraHouseholdSimulation, + ) + + assert tables["England"].index.tolist() == [101, 102] + assert tables["England"]["population"].tolist() == [10.0, 20.0] + + def test_build_local_candidate_solves_and_exports_long_weights() -> None: areas = pd.DataFrame({"code": ["S001", "E001"], "country": ["Scotland", "England"]}) targets = pd.DataFrame( @@ -203,6 +232,87 @@ def test_build_local_candidate_solves_and_exports_long_weights() -> None: assert "effective_sample_size" in result.support_summary.columns +def test_build_local_candidate_uses_assigned_support_when_available() -> None: + areas = pd.DataFrame({"code": ["S001", "E001"], "country": ["Scotland", "England"]}) + targets = pd.DataFrame( + { + "code": ["E001", "S001"], + "population": [1.5, 0.5], + } + ) + metrics = { + "England": pd.DataFrame( + {"population": [1.0, 1.0, 10.0]}, + index=[101, 102, 103], + ), + "Scotland": pd.DataFrame( + {"population": [1.0, 1.0, 10.0]}, + index=[101, 102, 103], + ), + } + households = pd.DataFrame( + { + "household_id": [102, 103, 101], + "household_weight": [1.0, 100.0, 1.0], + "constituency_code_oa": ["S001", "E999", "E001"], + } + ) + + result = build_local_candidate( + area_type="constituency", + area_frame=areas, + targets=targets, + metrics=metrics, + household_frame=households, + solver_options={"epochs": 80, "learning_rate": 0.2, "seed": 1}, + ) + + assert result.support_mode == "assigned" + assert result.problem.matrix.shape == (2, 2) + assert result.problem.n_households == 2 + assert result.solve_result.weights.shape == (2,) + assert result.solve_result.final_loss < result.solve_result.initial_loss + assert result.long_weights["area_code"].tolist() == ["E001", "S001"] + assert result.support_summary["nonzero_households"].tolist() == [1, 1] + + +def test_build_local_candidate_uses_la_assigned_support_and_zero_area() -> None: + areas = pd.DataFrame({"code": ["E06000002", "E06000001"]}) + targets = pd.DataFrame( + { + "code": ["E06000001", "E06000002"], + "population": [1.0, 0.0], + } + ) + metrics = pd.DataFrame({"population": [1.0, 10.0]}, index=[101, 102]) + households = pd.DataFrame( + { + "household_id": [102, 101], + "household_weight": [100.0, 1.0], + "la_code_oa": ["E99999999", "E06000001"], + } + ) + + result = build_local_candidate( + area_type="la", + area_frame=areas, + targets=targets, + metrics=metrics, + household_frame=households, + solver_options={"epochs": 5, "learning_rate": 0.2, "seed": 1}, + ) + + assert result.support_mode == "assigned" + assert result.problem.area_codes == ("E06000001", "E06000002") + assert result.problem.matrix.shape == (2, 1) + assert result.long_weights["area_code"].tolist() == ["E06000001"] + assert result.support_summary["area_code"].tolist() == [ + "E06000001", + "E06000002", + ] + assert result.support_summary["nonzero_households"].tolist() == [1, 0] + + def test_build_local_candidate_can_limit_pilot_areas() -> None: areas = pd.DataFrame( { @@ -281,6 +391,42 @@ def fake_compute( assert result.support_summary["nonzero_households"].tolist() == [1] +def test_build_local_candidate_from_dataset_auto_uses_assigned_support( + monkeypatch, +) -> None: + areas = pd.DataFrame({"code": ["E001"], "country": ["England"]}) + targets = pd.DataFrame({"code": ["E001"], "population": [1.0]}) + households = pd.DataFrame( + { + "household_id": [101], + "household_weight": [1.0], + "constituency_code_oa": ["E001"], + } + ) + + def fake_compute(sim, area_type, *, period=None, household_ids=None): + assert area_type == "constituency" + assert period == 2023 + assert household_ids is None + return pd.DataFrame({"population": [1.0]}, index=pd.Index([101])) + + monkeypatch.setattr(local_runner, "compute_household_metrics", fake_compute) + + result = build_local_candidate_from_dataset( + dataset=type("Dataset", (), {"time_period": 2023})(), + area_type="constituency", + area_frame=areas, + targets=targets, + household_frame=households, + simulation_factory=SingleHouseholdSimulation, + solver_options={"epochs": 2}, + ) + + assert result.support_mode == "assigned" + assert result.problem.matrix.shape == (1, 1) + assert result.long_weights["area_code"].tolist() == ["E001"] + + def test_write_local_candidate_outputs(tmp_path: Path) -> None: areas = pd.DataFrame({"code": ["E001"], "country": ["England"]}) targets = pd.DataFrame({"code": ["E001"], "population": [1.0]}) diff --git a/packages/populace-build/tests/test_uk_local_solver.py b/packages/populace-build/tests/test_uk_local_solver.py index 18bed70..45e5c8a 100644 --- a/packages/populace-build/tests/test_uk_local_solver.py +++ b/packages/populace-build/tests/test_uk_local_solver.py @@ -7,7 +7,9 @@ import populace.build.uk_runtime.local_solver as local_solver from populace.build.uk_runtime import ( + build_assigned_local_matrix, build_stacked_local_matrix, + solve_assigned_local_weights, solve_stacked_local_weights, ) @@ -39,6 +41,67 @@ def test_solve_stacked_local_weights_reduces_loss_and_reports_diagnostics() -> N np.testing.assert_allclose(result.diagnostics["target"], [1.5, 0.5]) +def test_solve_assigned_local_weights_uses_household_weight_columns() -> None: + metrics = pd.DataFrame({"population": [1.0, 1.0]}, index=[101, 102]) + targets = pd.DataFrame({"code": ["E001", "S001"], "population": [1.5, 0.5]}) + households = pd.DataFrame( + { + "household_id": [101, 102], + "constituency_code_oa": ["E001", "S001"], + } + ) + problem = build_assigned_local_matrix( + metrics, + targets, + household_frame=households, + area_codes=["E001", "S001"], + household_ids=[101, 102], + ) + + result = solve_assigned_local_weights( + problem, + [1.0, 1.0], + epochs=80, + learning_rate=0.2, + max_weight_ratio=10.0, + seed=1, + ) + + assert result.weights.shape == (2,) + assert result.initial_weights.tolist() == [1.0, 1.0] + assert result.final_loss < result.initial_loss + assert result.diagnostics["area_code"].tolist() == ["E001", "S001"] + + +def test_solve_assigned_local_weights_can_upweight_zero_base_support() -> None: + metrics = pd.DataFrame({"income": [1_000_000.0]}, index=[101]) + targets = pd.DataFrame({"code": ["E001"], "income": [1_000_000.0]}) + households = pd.DataFrame( + { + "household_id": [101], + "constituency_code_oa": ["E001"], + } + ) + problem = build_assigned_local_matrix( + metrics, + targets, + household_frame=households, + area_codes=["E001"], + household_ids=[101], + ) + + result = solve_assigned_local_weights( + problem, + [0.0], + epochs=80, + learning_rate=0.3, + seed=1, + ) + + assert result.weights[0] > 0.01 + assert result.final_loss < 0.05 + + def test_solve_stacked_local_weights_uses_explicit_positive_floor() -> None: metrics = pd.DataFrame({"population": [1.0, 1.0]}, index=[101, 102]) targets = pd.DataFrame({"code": ["E001"], "population": [1.0]}) diff --git a/packages/populace-build/tests/test_uk_source_manifest.py b/packages/populace-build/tests/test_uk_source_manifest.py new file mode 100644 index 0000000..58c70fc --- /dev/null +++ b/packages/populace-build/tests/test_uk_source_manifest.py @@ -0,0 +1,393 @@ +"""UK source manifest contract: spec-only resource, full surface.""" + +from __future__ import annotations + +import json +from collections import defaultdict +from importlib.resources import files + +import pytest + +from populace.build.source_manifest import ( + ALLOWED_SOURCE_OPERATION_KINDS, + SourceManifest, + SourceOperationSpec, + load_source_manifest, +) +from populace.build.uk_runtime import ( + FRS_ONLY_SPI_FILL_PERSON_COLUMNS, + ROWWISE_GEOGRAPHY_COLUMNS, + SPI_INCOME_IMPUTATION_COLUMNS, + UK_SPI_SUPPORT_STAGE_NAME, +) + +UK_RESOURCE_ROOT = files("populace.build.uk") +UK_SOURCE_RESOURCE = UK_RESOURCE_ROOT.joinpath("source_stages.json") +UK_SOURCE_MANIFEST = load_source_manifest(UK_SOURCE_RESOURCE) +UK_SOURCE_STAGE_SPECS = UK_SOURCE_MANIFEST.stages +UK_STAGE_NAMES = tuple(stage.stage for stage in UK_SOURCE_STAGE_SPECS) +UK_SOURCE_OUTPUT_STAGES: dict[str, list[str]] = defaultdict(list) +for _stage in UK_SOURCE_STAGE_SPECS: + for _output in _stage.outputs: + UK_SOURCE_OUTPUT_STAGES[_output].append(_stage.stage) +UK_SOURCE_OUTPUT_STAGES = dict(UK_SOURCE_OUTPUT_STAGES) +UK_SOURCE_OUTPUTS = set(UK_SOURCE_OUTPUT_STAGES) +UK_NONNEGATIVE_SOURCE_OUTPUTS = { + output for stage in UK_SOURCE_STAGE_SPECS for output in stage.nonnegative_outputs +} +UK_REWRITTEN_SOURCE_OUTPUT_STAGES = { + output: tuple(stages) + for output, stages in UK_SOURCE_OUTPUT_STAGES.items() + if len(stages) > 1 +} + + +class TestUkSources: + def test_source_manifest_loads_as_spec_contract(self) -> None: + assert UK_SOURCE_MANIFEST.country == "uk" + assert UK_SOURCE_MANIFEST.version == 1 + assert len(UK_SOURCE_STAGE_SPECS) >= 12 + assert "source_stages.json" in _country_package_resources() + + def test_source_specs_align_with_declared_resource_order(self) -> None: + raw = json.loads(UK_SOURCE_RESOURCE.read_text(encoding="utf-8")) + + assert UK_STAGE_NAMES == tuple(stage["stage"] for stage in raw["stages"]) + assert set(UK_SOURCE_MANIFEST.stage_map()) == set(UK_STAGE_NAMES) + assert "rowwise_oa_geography" in UK_STAGE_NAMES + assert "local_geography_weights" not in UK_STAGE_NAMES + assert "national_calibration" not in UK_STAGE_NAMES + + def test_stage_order_keeps_required_upstream_surfaces_available(self) -> None: + assert UK_STAGE_NAMES.index("was_wealth") < UK_STAGE_NAMES.index( + "regional_property_uprating" + ) + assert UK_STAGE_NAMES.index("was_wealth") < UK_STAGE_NAMES.index( + "lcfs_consumption" + ) + assert UK_STAGE_NAMES.index("lcfs_consumption") < UK_STAGE_NAMES.index( + "bus_public_service_calibration" + ) + assert UK_STAGE_NAMES.index("etb_public_services") < UK_STAGE_NAMES.index( + "bus_public_service_calibration" + ) + assert UK_STAGE_NAMES.index(UK_SPI_SUPPORT_STAGE_NAME) < UK_STAGE_NAMES.index( + "spi_income" + ) + assert UK_STAGE_NAMES.index("spi_income") < UK_STAGE_NAMES.index( + "frs_only_spi_fill" + ) + + def test_source_specs_are_manifest_only_not_python_loaders(self) -> None: + for spec in UK_SOURCE_STAGE_SPECS: + assert spec.operations + for operation in spec.operations: + assert operation.kind in ALLOWED_SOURCE_OPERATION_KINDS + assert "module" not in operation.parameters + assert "function" not in operation.parameters + assert operation.kind not in { + "python_module", + "python_function", + "import_module", + } + + def test_ledger_weight_calibration_is_not_declared_as_source_operations( + self, + ) -> None: + operation_kinds = { + operation.kind + for stage in UK_SOURCE_STAGE_SPECS + for operation in stage.operations + } + + assert "compile_ledger_targets" not in operation_kinds + assert "calibrate_weights" not in operation_kinds + + def test_raw_source_surface_declares_salient_outputs_from_each_input( + self, + ) -> None: + required_outputs = { + "employment_sector", + "sic_industry_division", + "property_wealth", + "mortgage_debt", + "consumer_debt", + "student_loan_balance", + "num_vehicles", + "cash_isa", + "stocks_and_shares_isa", + "full_rate_vat_expenditure_rate", + "food_and_non_alcoholic_beverages_consumption", + "electricity_consumption", + "gas_consumption", + "petrol_spending", + "diesel_spending", + "bus_fare_spending", + "dfe_education_spending", + "rail_subsidy_spending", + "bus_subsidy_spending", + "rail_usage", + "a_and_e_visits", + "admitted_patient_visits", + "outpatient_visits", + "nhs_spending", + "gift_aid", + "charitable_investment_gifts", + "capital_gains", + "household_is_capital_gains_clone", + "pension_contributions_via_salary_sacrifice", + "student_loan_plan", + "household_is_spi_synthetic", + "source_household_key", + } + + required_outputs.update(SPI_INCOME_IMPUTATION_COLUMNS) + required_outputs.update(FRS_ONLY_SPI_FILL_PERSON_COLUMNS) + required_outputs.update(ROWWISE_GEOGRAPHY_COLUMNS) + + assert sorted(required_outputs - UK_SOURCE_OUTPUTS) == [] + + def test_nonnegative_surface_covers_key_money_and_count_outputs(self) -> None: + required_nonnegative = { + "sic_industry_division", + "owned_land", + "property_wealth", + "mortgage_debt", + "consumer_debt", + "student_loan_balance", + "cash_isa", + "stocks_and_shares_isa", + "food_and_non_alcoholic_beverages_consumption", + "electricity_consumption", + "gas_consumption", + "petrol_spending", + "diesel_spending", + "bus_fare_spending", + "bus_subsidy_spending", + "full_rate_vat_expenditure_rate", + "a_and_e_visits", + "nhs_spending", + "dfe_education_spending", + "rail_usage", + "gift_aid", + "charitable_investment_gifts", + "capital_gains", + "pension_contributions_via_salary_sacrifice", + } + + assert sorted(required_nonnegative - UK_NONNEGATIVE_SOURCE_OUTPUTS) == [] + assert "employment_sector" not in UK_NONNEGATIVE_SOURCE_OUTPUTS + assert "student_loan_plan" not in UK_NONNEGATIVE_SOURCE_OUTPUTS + + def test_rewritten_outputs_are_explicit_and_have_reviewed_final_writers( + self, + ) -> None: + expected_rewrites = { + "bus_fare_spending": ( + "lcfs_consumption", + "bus_public_service_calibration", + ), + "bus_subsidy_spending": ( + "etb_public_services", + "bus_public_service_calibration", + ), + "diesel_spending": ( + "lcfs_consumption", + "road_fuel_energy_calibration", + ), + "domestic_energy_consumption": ( + "lcfs_consumption", + "road_fuel_energy_calibration", + ), + "electricity_consumption": ( + "lcfs_consumption", + "road_fuel_energy_calibration", + ), + "gas_consumption": ( + "lcfs_consumption", + "road_fuel_energy_calibration", + ), + "petrol_spending": ( + "lcfs_consumption", + "road_fuel_energy_calibration", + ), + "main_residence_value": ( + "was_wealth", + "regional_property_uprating", + ), + "property_wealth": ( + "was_wealth", + "regional_property_uprating", + ), + "employment_income": ( + "frs_base", + "spi_income", + ), + "private_pension_income": ( + "frs_base", + "spi_income", + ), + "self_employment_income": ( + "frs_base", + "spi_income", + ), + "employee_pension_contributions": ( + "frs_only_spi_fill", + "frs_salary_sacrifice", + ), + "pension_contributions_via_salary_sacrifice": ( + "frs_only_spi_fill", + "frs_salary_sacrifice", + ), + "rail_subsidy_spending": ( + "etb_public_services", + "rail_public_service_calibration", + ), + "rail_usage": ( + "etb_public_services", + "rail_public_service_calibration", + ), + } + + assert UK_REWRITTEN_SOURCE_OUTPUT_STAGES == expected_rewrites + for output, stages in expected_rewrites.items(): + assert tuple(UK_SOURCE_OUTPUT_STAGES[output]) == stages + indices = [UK_STAGE_NAMES.index(stage) for stage in stages] + assert indices == sorted(indices) + + def test_fuel_energy_amount_scaling_is_not_binary_assignment(self) -> None: + operations = UK_SOURCE_MANIFEST.stage_map()[ + "road_fuel_energy_calibration" + ].operations + kinds = [operation.kind for operation in operations] + + assert "calibrate_binary_assignment" not in kinds + assert "uprate" in kinds + uprate = operations[kinds.index("uprate")] + assert tuple(uprate.parameters["variables"]) == ( + "petrol_spending", + "diesel_spending", + "electricity_consumption", + "gas_consumption", + ) + derive = operations[kinds.index("derive")] + assert tuple(derive.parameters["outputs"]) == ("domestic_energy_consumption",) + + def test_bus_surface_matches_recent_uk_data_contract(self) -> None: + specs = UK_SOURCE_MANIFEST.stage_map() + lcfs_operations = specs["lcfs_consumption"].operations + lcfs_derive = next( + operation + for operation in lcfs_operations + if operation.kind == "derive" + and "bus_fare_spending" in operation.parameters["outputs"] + ) + bus_operations = specs["bus_public_service_calibration"].operations + kinds = [operation.kind for operation in bus_operations] + + assert tuple(lcfs_derive.parameters["source_codes"]["bus_fare_spending"]) == ( + "c73212", + "c73213", + "c73214", + ) + assert kinds == ["read_table", "uprate"] + assert tuple(bus_operations[1].parameters["variables"]) == ( + "bus_fare_spending", + "bus_subsidy_spending", + ) + + def test_wealth_surface_splits_isa_outputs_and_preserves_back_compat( + self, + ) -> None: + stage = UK_SOURCE_MANIFEST.stage_map()["was_wealth"] + operations = stage.operations + derive = next( + operation + for operation in operations + if operation.kind == "derive" + and "cash_isa" in operation.parameters["outputs"] + ) + folds = [operation for operation in operations if operation.kind == "fold_into"] + + assert { + "cash_isa", + "stocks_and_shares_isa", + } <= set(stage.outputs) + assert derive.parameters["source_fields"] == { + "cash_isa": "DVCISAVR8", + "stocks_and_shares_isa": "DVIISAVR8", + } + assert any( + operation.parameters + == { + "target": "corporate_wealth", + "amount": "stocks_and_shares_isa", + } + for operation in folds + ) + + def test_frs_base_carries_employment_sector_and_sic_from_raw_frs(self) -> None: + stage = UK_SOURCE_MANIFEST.stage_map()["frs_base"] + + assert {"employment_sector", "sic_industry_division"} <= set(stage.outputs) + assert "sic_industry_division" in stage.nonnegative_outputs + + def test_spi_stage_declares_support_channel_before_income_fit(self) -> None: + specs = UK_SOURCE_MANIFEST.stage_map() + spi_kinds = [operation.kind for operation in specs["spi_income"].operations] + + assert spi_kinds.index("read_table") < spi_kinds.index("fit_weighted_qrf") + assert spi_kinds.index("fit_weighted_qrf") < spi_kinds.index("support_clip") + assert "household_is_spi_synthetic" in specs[UK_SPI_SUPPORT_STAGE_NAME].outputs + + def test_source_operation_parser_rejects_python_loader_shapes(self) -> None: + with pytest.raises(ValueError, match="executable-loader"): + SourceOperationSpec.from_mapping( + { + "kind": "python_module", + "module": "populace.build.uk.sources", + "function": "add_was_wealth", + } + ) + + def test_source_operation_parser_rejects_old_weight_calibration_ops(self) -> None: + with pytest.raises(ValueError, match="allowed manifest operation vocabulary"): + SourceOperationSpec.from_mapping({"kind": "compile_ledger_targets"}) + + with pytest.raises(ValueError, match="allowed manifest operation vocabulary"): + SourceOperationSpec.from_mapping({"kind": "calibrate_weights"}) + + def test_source_manifest_parser_rejects_incumbent_package_artifacts(self) -> None: + with pytest.raises(ValueError, match="forbidden incumbent dependency"): + SourceManifest.from_mapping( + { + "version": 1, + "country": "uk", + "policy": "spec only", + "stages": [ + { + "stage": "was_wealth", + "survey": "Wealth and Assets Survey", + "source": "https://example.test/was", + "grain": "household", + "artifacts": [ + { + "kind": "derived_dataset", + "locator": "policyengine_" + "uk_data", + } + ], + "operations": [ + {"kind": "read_table", "table": "was_household"} + ], + "outputs": ["property_wealth"], + } + ], + } + ) + + +def _country_package_resources() -> set[str]: + package = json.loads( + UK_RESOURCE_ROOT.joinpath("country_package.json").read_text(encoding="utf-8") + ) + return set(package["resources"]) diff --git a/packages/populace-build/tests/test_uk_spi_support.py b/packages/populace-build/tests/test_uk_spi_support.py index 6cdf68d..1d11d74 100644 --- a/packages/populace-build/tests/test_uk_spi_support.py +++ b/packages/populace-build/tests/test_uk_spi_support.py @@ -209,7 +209,7 @@ def test_spi_fill_only_updates_spi_channel_and_can_initialize_new_columns() -> N assert spi["gift_aid"].tolist() == [9.0, 10.0, 11.0, 12.0] -def test_spi_variable_surfaces_include_efrs_stage1_and_stage2_fixes() -> None: +def test_spi_variable_surfaces_include_recent_stage1_and_stage2_fixes() -> None: assert SPI_INCOME_COMPONENT_COLUMNS == ( "employment_income", "self_employment_income",