From e9b6963e9eb0b9930d0203e0dba612be3e52e41f Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Thu, 15 Jan 2026 09:53:33 -0500 Subject: [PATCH 01/19] Add health insurance premiums to local area calibration - Add build_calibration_matrix.py runner for building X_sparse matrix with SNAP + health_insurance_premiums_without_medicare_part_b targets - Add test_sparse_matrix_builder.py with tests for person-level aggregation and combined target support - Update changelog_entry.yaml Closes #472 Co-Authored-By: Claude Haiku 4.5 --- changelog_entry.yaml | 5 + .../build_calibration_matrix.py | 195 ++++++++++++++++++ .../tests/test_sparse_matrix_builder.py | 64 ++++++ 3 files changed, 264 insertions(+) create mode 100644 policyengine_us_data/datasets/cps/local_area_calibration/build_calibration_matrix.py create mode 100644 policyengine_us_data/tests/test_sparse_matrix_builder.py diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29bb..c38106204 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,5 @@ +- bump: minor + changes: + added: + - Support for health_insurance_premiums_without_medicare_part_b in local area calibration + - Test coverage for sparse matrix builder with person-level targets diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/build_calibration_matrix.py b/policyengine_us_data/datasets/cps/local_area_calibration/build_calibration_matrix.py new file mode 100644 index 000000000..185914a5c --- /dev/null +++ b/policyengine_us_data/datasets/cps/local_area_calibration/build_calibration_matrix.py @@ -0,0 +1,195 @@ +""" +Build calibration matrix for geo-stacking reweighting. +Generates X_sparse and target vector, prints diagnostics using MatrixTracer. +""" + +import logging + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) + +import numpy as np +import pandas as pd +from policyengine_us import Microsimulation +from policyengine_us_data.storage import STORAGE_FOLDER +from sparse_matrix_builder import SparseMatrixBuilder +from matrix_tracer import MatrixTracer +from calibration_utils import create_target_groups + +# ============================================================================ +# CONFIGURATION +# ============================================================================ +db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" +db_uri = f"sqlite:///{db_path}" +time_period = 2023 + +# Base dataset for geo-stacking: stratified extended CPS +dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2023.h5" + +cds_to_calibrate = [ + "101", # Alabama CD-1 + "601", # California CD-1 + "602", # California CD-2 + "3601", # New York CD-1 + "4801", # Texas CD-1 +] + +print(f"Testing with {len(cds_to_calibrate)} congressional districts") + +# ============================================================================ +# STEP 1: LOAD SIMULATION FROM EXTENDED CPS +# ============================================================================ +print(f"\nLoading simulation from {dataset_path}...") +sim = Microsimulation(dataset=str(dataset_path)) +n_households = len(sim.calculate("household_id", map_to="household").values) +print(f"Loaded {n_households:,} households") + +# ============================================================================ +# STEP 2: BUILD SPARSE MATRIX WITH COMBINED TARGETS +# ============================================================================ +print("\nBuilding sparse matrix...") +builder = SparseMatrixBuilder( + db_uri=db_uri, + 
time_period=time_period, + cds_to_calibrate=cds_to_calibrate, + dataset_path=None, +) + +# SNAP targets (stratum_group_id=4) + specific health insurance variable +# Uses OR logic: gets all SNAP targets OR the health insurance target +targets_df, X_sparse, household_id_mapping = builder.build_matrix( + sim, + target_filter={ + "stratum_group_ids": [4], + "variables": ["health_insurance_premiums_without_medicare_part_b"], + }, +) + +print(f"\nMatrix built successfully:") +print(f" Shape: {X_sparse.shape}") +print(f" Targets: {len(targets_df)}") +nnz = X_sparse.nnz +total = X_sparse.shape[0] * X_sparse.shape[1] +print(f" Sparsity: {1 - nnz / total:.4%}") + +# ============================================================================ +# STEP 3: EXTRACT TARGET VECTOR +# ============================================================================ +target_vector = targets_df["value"].values +print(f"\nTarget vector shape: {target_vector.shape}") +print(f"Target total: ${target_vector.sum():,.0f}") + +# ============================================================================ +# STEP 4: HEALTH INSURANCE PREMIUM VERIFICATION +# ============================================================================ +print("\n" + "=" * 80) +print("HEALTH INSURANCE PREMIUM TARGET ANALYSIS") +print("=" * 80) + +health_ins_targets = targets_df[ + targets_df["variable"] + == "health_insurance_premiums_without_medicare_part_b" +] + +if len(health_ins_targets) > 0: + print(f"\nFound {len(health_ins_targets)} health insurance target(s):") + print( + health_ins_targets[ + [ + "target_id", + "variable", + "value", + "geographic_id", + "stratum_group_id", + ] + ] + ) + + health_ins_idx = health_ins_targets.index[0] + health_ins_row = X_sparse[health_ins_idx, :] + + print(f"\nMatrix row {health_ins_idx} (health insurance):") + print(f" Non-zero entries: {health_ins_row.nnz:,}") + print(f" Row sum: ${health_ins_row.sum():,.0f}") + print(f" Target value: ${health_ins_targets.iloc[0]['value']:,.0f}") + + person_total = sim.calculate( + "health_insurance_premiums_without_medicare_part_b", + time_period, + map_to="person", + ).values.sum() + + household_total = sim.calculate( + "health_insurance_premiums_without_medicare_part_b", + time_period, + map_to="household", + ).values.sum() + + print(f"\nEntity aggregation verification:") + print(f" Person-level total: ${person_total:,.0f}") + print(f" Household-level total: ${household_total:,.0f}") + print(f" Match: {np.isclose(person_total, household_total, rtol=1e-6)}") + +else: + print("\nWARNING: No health insurance targets found!") + +# ============================================================================ +# STEP 5: SNAP TARGET SUMMARY +# ============================================================================ +print("\n" + "=" * 80) +print("SNAP TARGET SUMMARY") +print("=" * 80) + +snap_targets = targets_df[targets_df["variable"] == "snap"] +household_count_targets = targets_df[ + targets_df["variable"] == "household_count" +] + +print(f"\nSNAP benefit targets: {len(snap_targets)}") +print(f"Household count targets: {len(household_count_targets)}") + +if len(snap_targets) > 0: + print(f"\nSNAP total (all states): ${snap_targets['value'].sum():,.0f}") + print(f"\nSample SNAP targets:") + print( + snap_targets[["target_id", "variable", "value", "geographic_id"]].head( + 10 + ) + ) + +# ============================================================================ +# STEP 6: USE MATRIX TRACER FOR DETAILED DIAGNOSTICS +# 
============================================================================ +print("\n" + "=" * 80) +print("MATRIX TRACER DIAGNOSTICS") +print("=" * 80) + +tracer = MatrixTracer( + targets_df=targets_df, + matrix=X_sparse, + household_id_mapping=household_id_mapping, + geographic_ids=cds_to_calibrate, + sim=sim, +) + +tracer.print_matrix_structure(show_groups=True) + +# ============================================================================ +# STEP 7: TARGET GROUP ANALYSIS +# ============================================================================ +print("\n" + "=" * 80) +print("TARGET GROUP ANALYSIS") +print("=" * 80) + +target_groups, group_info = create_target_groups(targets_df) + +print(f"\nTotal target groups: {len(group_info)}") +for group_id, info in enumerate(group_info): + group_mask = target_groups == group_id + n_targets_in_group = group_mask.sum() + print(f" Group {group_id}: {info} ({n_targets_in_group} targets)") + +print("\n" + "=" * 80) +print("RUNNER COMPLETED SUCCESSFULLY") +print("=" * 80) diff --git a/policyengine_us_data/tests/test_sparse_matrix_builder.py b/policyengine_us_data/tests/test_sparse_matrix_builder.py new file mode 100644 index 000000000..2d08ea3f1 --- /dev/null +++ b/policyengine_us_data/tests/test_sparse_matrix_builder.py @@ -0,0 +1,64 @@ +import pytest +import numpy as np +from policyengine_us import Microsimulation +from policyengine_us_data.storage import STORAGE_FOLDER +from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import ( + SparseMatrixBuilder, +) + + +@pytest.fixture +def sim(): + dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2023.h5" + return Microsimulation(dataset=str(dataset_path)) + + +@pytest.fixture +def builder(): + db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" + db_uri = f"sqlite:///{db_path}" + cds_to_calibrate = ["101", "601"] # AL-1, CA-1 + return SparseMatrixBuilder( + db_uri=db_uri, + time_period=2023, + cds_to_calibrate=cds_to_calibrate, + dataset_path=None, + ) + + +def test_person_level_aggregation_preserves_totals(sim): + """Health insurance premiums (person-level) should sum correctly to household.""" + var = "health_insurance_premiums_without_medicare_part_b" + person_total = sim.calculate(var, 2023, map_to="person").values.sum() + household_total = sim.calculate(var, 2023, map_to="household").values.sum() + assert np.isclose(person_total, household_total, rtol=1e-6) + + +def test_matrix_shape(sim, builder): + """Matrix should have (n_targets, n_households * n_cds) shape.""" + targets_df, X_sparse, _ = builder.build_matrix( + sim, + target_filter={ + "variables": ["health_insurance_premiums_without_medicare_part_b"] + }, + ) + n_households = len( + sim.calculate("household_id", map_to="household").values + ) + n_cds = 2 + assert X_sparse.shape == (1, n_households * n_cds) + + +def test_combined_snap_and_health_insurance(sim, builder): + """Matrix should include both SNAP and health insurance targets.""" + targets_df, X_sparse, _ = builder.build_matrix( + sim, + target_filter={ + "stratum_group_ids": [4], + "variables": ["health_insurance_premiums_without_medicare_part_b"], + }, + ) + variables = targets_df["variable"].unique() + assert "snap" in variables + assert "household_count" in variables + assert "health_insurance_premiums_without_medicare_part_b" in variables From 76137fbb9f786e31f42618d535af7ecb8ed07afa Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Thu, 15 Jan 2026 11:00:02 -0500 Subject: [PATCH 02/19] adding documentation --- 
.../ADDING_CALIBRATION_TARGETS.md | 241 ++++++++++++++++++ 1 file changed, 241 insertions(+) create mode 100644 policyengine_us_data/datasets/cps/local_area_calibration/ADDING_CALIBRATION_TARGETS.md diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/ADDING_CALIBRATION_TARGETS.md b/policyengine_us_data/datasets/cps/local_area_calibration/ADDING_CALIBRATION_TARGETS.md new file mode 100644 index 000000000..1b30df41f --- /dev/null +++ b/policyengine_us_data/datasets/cps/local_area_calibration/ADDING_CALIBRATION_TARGETS.md @@ -0,0 +1,241 @@ +# Adding Calibration Targets to Local Area Geo-Stacking + +This document summarizes key learnings from adding `health_insurance_premiums_without_medicare_part_b` as a calibration target. Use this as a reference when adding future targets. + +## Key Discovery: No Code Changes Needed for Most Targets + +The `sparse_matrix_builder.py` is **already entity-agnostic**. PolicyEngine's `map_to="household"` parameter automatically handles aggregation from any entity level (person, tax_unit, spm_unit) to household level. + +```python +# This line in sparse_matrix_builder.py (line 220-222) handles ALL entity types: +target_values = state_sim.calculate( + target["variable"], map_to="household" +).values +``` + +**Verified behavior:** +- Person-level variables (like health_insurance_premiums): automatically summed to household +- SPM-unit variables (like snap): automatically aggregated to household +- Household variables: returned as-is + +## Architecture Overview + +### File Locations + +``` +policyengine_us_data/ +├── storage/ +│ ├── calibration/ +│ │ └── policy_data.db # Target database (SQLite) +│ └── stratified_extended_cps_2023.h5 # Base dataset for calibration +└── datasets/cps/local_area_calibration/ + ├── sparse_matrix_builder.py # Builds X_sparse matrix (GENERIC) + ├── matrix_tracer.py # Diagnostics for debugging matrices + ├── calibration_utils.py # Helper functions + └── build_calibration_matrix.py # Runner script +``` + +### Database Schema (policy_data.db) + +```sql +-- Core tables +targets(target_id, variable, period, stratum_id, value, active, ...) +strata(stratum_id, definition_hash, stratum_group_id, ...) +stratum_constraints(stratum_id, constraint_variable, operation, value, ...) +``` + +**Key stratum_group_ids:** +- Group 1: National hardcoded targets (20 variables including health insurance, medicaid, snap national, etc.) 
+- Group 4: SNAP state/CD targets (538 targets: 51 state snap + 487 household_count)
+
+### Target Filter Logic
+
+The `build_matrix()` method uses **OR logic** for filters:
+
+```python
+# Gets all SNAP targets OR the health insurance target
+target_filter={
+    "stratum_group_ids": [4],  # All SNAP targets
+    "variables": ["health_insurance_premiums_without_medicare_part_b"],  # Specific variable
+}
+```
+
+## How to Add a New Target
+
+### Step 1: Check if Target Exists in Database
+
+```python
+import sqlite3
+from policyengine_us_data.storage import STORAGE_FOLDER
+
+conn = sqlite3.connect(STORAGE_FOLDER / "calibration" / "policy_data.db")
+cursor = conn.cursor()
+
+# Find your target
+cursor.execute("""
+    SELECT t.target_id, t.variable, t.value, t.period, t.stratum_id,
+           s.stratum_group_id
+    FROM targets t
+    JOIN strata s ON t.stratum_id = s.stratum_id
+    WHERE t.variable = 'your_variable_name'
+""")
+print(cursor.fetchall())
+
+# Check constraints for that stratum (fill in the stratum_id found above)
+cursor.execute("""
+    SELECT * FROM stratum_constraints WHERE stratum_id = <stratum_id>
+""")
+print(cursor.fetchall())
+```
+
+### Step 2: Determine Entity Type
+
+```python
+from policyengine_us import Microsimulation
+
+sim = Microsimulation()
+var = sim.tax_benefit_system.variables['your_variable_name']
+print(f"Entity: {var.entity.key}")  # person, household, tax_unit, spm_unit, etc.
+```
+
+### Step 3: Verify Aggregation Works
+
+```python
+import numpy as np
+
+# For non-household variables, verify totals are preserved
+person_total = sim.calculate('your_variable', 2023, map_to='person').values.sum()
+household_total = sim.calculate('your_variable', 2023, map_to='household').values.sum()
+print(f"Match: {np.isclose(person_total, household_total, rtol=1e-6)}")
+```
+
+### Step 4: Update the Runner Script
+
+Edit `build_calibration_matrix.py` to include your new target:
+
+```python
+targets_df, X_sparse, household_id_mapping = builder.build_matrix(
+    sim,
+    target_filter={
+        "stratum_group_ids": [4],  # SNAP
+        "variables": [
+            "health_insurance_premiums_without_medicare_part_b",
+            "your_new_variable",  # Add here
+        ],
+    },
+)
+```
+
+### Step 5: Run and Verify
+
+```bash
+cd policyengine_us_data/datasets/cps/local_area_calibration
+python build_calibration_matrix.py
+```
+
+## When Code Changes ARE Needed
+
+The current implementation may need modification for:
+
+1. **Count variables with special semantics**: Variables ending in `_count` might need `.nunique()` instead of `.sum()` for aggregation. The junkyard implementation handles this, but our current builder doesn't (see the sketch after this list).
+
+2. **Variables with state-specific calculations**: SNAP and Medicaid are already handled (`state_fips` is set before calculation). Other state-dependent variables should work the same way.
+
+3. **Constraint evaluation at non-household level**: Currently all constraints are evaluated at household level after aggregation. If you need person-level constraint evaluation (e.g., "only count persons with income > X"), the junkyard has this pattern but our builder doesn't.
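+
+As a concrete illustration of item 1 above, here is a minimal sketch of the `.sum()` versus `.nunique()` distinction. The DataFrame and column names are illustrative only, not the builder's actual internals:
+
+```python
+import pandas as pd
+
+# Three persons across two households; the persons in household 1 share a tax unit
+persons = pd.DataFrame({
+    "household_id": [1, 1, 2],
+    "tax_unit_id": [10, 10, 20],
+    "premium": [100.0, 50.0, 75.0],
+})
+
+# Amount variable: sum person-level values within each household
+amounts = persons.groupby("household_id")["premium"].sum()  # 1 -> 150.0, 2 -> 75.0
+
+# Count variable: count distinct entity IDs, not rows
+counts = persons.groupby("household_id")["tax_unit_id"].nunique()  # 1 -> 1, 2 -> 1
+```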
+ +## The Junkyard Reference + +Location: `~/devl/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration/metrics_matrix_geo_stacking_sparse.py` + +This 2,400+ line file has extensive logic we intentionally avoided: +- Hard-coded variable names and stratum_group_ids +- Complex entity relationship tracking +- Person-level constraint evaluation with `.any()` aggregation + +**Key pattern from junkyard (if ever needed):** +```python +# Dynamic entity detection +target_entity = sim.tax_benefit_system.variables[target_variable].entity.key + +# Entity relationship DataFrame +entity_rel = pd.DataFrame({ + "person_id": sim.calculate("person_id", map_to="person").values, + "household_id": sim.calculate("household_id", map_to="person").values, + "tax_unit_id": sim.calculate("tax_unit_id", map_to="person").values, + # ... other entities +}) + +# For counts: use .nunique() on entity IDs +# For amounts: use .sum() on values +``` + +## Matrix Structure + +The sparse matrix X has shape `(n_targets, n_households × n_cds)`: + +``` +Columns: [CD1_hh0, CD1_hh1, ..., CD1_hhN, CD2_hh0, ..., CDM_hhN] +Rows: One per target (geographic_id + variable combination) + +Column index formula: col_idx = cd_idx * n_households + hh_idx +``` + +Use `MatrixTracer` for debugging: +```python +from matrix_tracer import MatrixTracer + +tracer = MatrixTracer(targets_df, X_sparse, household_id_mapping, cds_to_calibrate, sim) +tracer.print_matrix_structure() +tracer.get_column_info(100) # Info about column 100 +tracer.get_row_info(0) # Info about row 0 (first target) +``` + +## Environment Setup + +```bash +# Use the sep environment for this repo +source ~/envs/sep/bin/activate + +# Run from the local_area_calibration directory +cd ~/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration + +# Run tests +pytest ../../tests/test_sparse_matrix_builder.py -v +``` + +## Common Queries + +### List all target variables +```sql +SELECT DISTINCT variable FROM targets; +``` + +### List all constraint variables +```sql +SELECT DISTINCT constraint_variable FROM stratum_constraints; +``` + +### Find targets by geographic level +```sql +-- National targets (no geographic constraints) +SELECT t.* FROM targets t +JOIN strata s ON t.stratum_id = s.stratum_id +WHERE t.stratum_id NOT IN ( + SELECT stratum_id FROM stratum_constraints + WHERE constraint_variable IN ('state_fips', 'congressional_district_geoid') +); + +-- State-level targets +SELECT t.* FROM targets t +WHERE t.stratum_id IN ( + SELECT stratum_id FROM stratum_constraints + WHERE constraint_variable = 'state_fips' +); +``` + +## Summary + +For most new targets: +1. Verify target exists in `policy_data.db` +2. Add variable name to the target filter in `build_calibration_matrix.py` +3. Run and verify with `MatrixTracer` + +No code changes to `sparse_matrix_builder.py` needed unless you have special aggregation or constraint requirements. 
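+
+### Appendix: Decoding Column Indices
+
+The column index formula in the Matrix Structure section can be inverted with `divmod` when reading `MatrixTracer` output. The helper below is a hypothetical convenience, not part of the codebase; the household count used in the example is the one reported for the stratified dataset:
+
+```python
+def decode_column(col_idx: int, n_households: int) -> tuple[int, int]:
+    """Invert col_idx = cd_idx * n_households + hh_idx."""
+    cd_idx, hh_idx = divmod(col_idx, n_households)
+    return cd_idx, hh_idx
+
+
+# Example: with 21,045 households, column 25,000 falls in the second
+# stacked CD block (cd_idx=1) at household offset 3,955
+assert decode_column(25_000, 21_045) == (1, 3_955)
+```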
From 11d86e60d7c55b1f6064fb6d75a7dab952986b90 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Thu, 15 Jan 2026 16:53:45 -0500 Subject: [PATCH 03/19] Add weight calibration prototype using l0-python - Add fit_calibration_weights.py for L0-regularized weight optimization - Add l0-python as optional dependency under [project.optional-dependencies] - Uses SparseCalibrationWeights from l0.calibration package - Builds calibration matrix for all 436 congressional districts - Filters to achievable targets before fitting Co-Authored-By: Claude Opus 4.5 --- .../fit_calibration_weights.py | 207 ++++++++++++++++++ pyproject.toml | 3 + 2 files changed, 210 insertions(+) create mode 100644 policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py new file mode 100644 index 000000000..3fcd65be7 --- /dev/null +++ b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py @@ -0,0 +1,207 @@ +""" +Fit calibration weights using L0-regularized optimization. +Prototype script for weight calibration using the l0-python package. +""" + +import logging +from datetime import datetime +from pathlib import Path + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) + +import numpy as np +from policyengine_us import Microsimulation +from policyengine_us_data.storage import STORAGE_FOLDER +from sparse_matrix_builder import SparseMatrixBuilder +from calibration_utils import get_all_cds_from_database + +try: + from l0.calibration import SparseCalibrationWeights +except ImportError: + raise ImportError( + "l0-python is required for weight fitting. 
" + "Install with: pip install policyengine-us-data[l0]" + ) + +# ============================================================================ +# CONFIGURATION +# ============================================================================ +DEVICE = "cpu" # Use "cuda" for GPU runs on Modal +TOTAL_EPOCHS = 100 # Reduced for testing; use 5000+ for production +EPOCHS_PER_CHUNK = 50 + +# Hyperparameters +BETA = 0.35 +GAMMA = -0.1 +ZETA = 1.1 +INIT_KEEP_PROB = 0.999 +LOG_WEIGHT_JITTER_SD = 0.05 +LOG_ALPHA_JITTER_SD = 0.01 +LAMBDA_L0 = 1e-8 +LAMBDA_L2 = 1e-8 +LEARNING_RATE = 0.15 + +# Data paths +db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" +db_uri = f"sqlite:///{db_path}" +dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2023.h5" +output_dir = STORAGE_FOLDER / "calibration" +time_period = 2023 + +# Get all CDs from database +cds_to_calibrate = get_all_cds_from_database(db_uri) +print(f"Found {len(cds_to_calibrate)} congressional districts") + +# ============================================================================ +# STEP 1: BUILD CALIBRATION MATRIX +# ============================================================================ +print(f"Loading simulation from {dataset_path}...") +sim = Microsimulation(dataset=str(dataset_path)) +n_households = len(sim.calculate("household_id", map_to="household").values) +print(f"Loaded {n_households:,} households") + +print("\nBuilding sparse matrix...") +builder = SparseMatrixBuilder( + db_uri=db_uri, + time_period=time_period, + cds_to_calibrate=cds_to_calibrate, + dataset_path=str(dataset_path), +) + +targets_df, X_sparse, household_id_mapping = builder.build_matrix( + sim, + target_filter={ + "stratum_group_ids": [4], + "variables": ["health_insurance_premiums_without_medicare_part_b"], + }, +) + +print(f"Matrix shape: {X_sparse.shape}") +print(f"Targets: {len(targets_df)}") + +# Filter to achievable targets (rows with non-zero data) +row_sums = np.array(X_sparse.sum(axis=1)).flatten() +achievable_mask = row_sums > 0 +n_achievable = achievable_mask.sum() +n_impossible = (~achievable_mask).sum() + +print(f"\nAchievable targets: {n_achievable}") +print(f"Impossible targets (filtered out): {n_impossible}") + +targets_df = targets_df[achievable_mask].reset_index(drop=True) +X_sparse = X_sparse[achievable_mask, :] + +print(f"Filtered matrix shape: {X_sparse.shape}") + +# Extract target vector and names +targets = targets_df["value"].values +target_names = [ + f"{row['geographic_id']}/{row['variable']}" + for _, row in targets_df.iterrows() +] + +# ============================================================================ +# STEP 2: INITIALIZE WEIGHTS +# ============================================================================ +initial_weights = np.ones(X_sparse.shape[1]) * 100 +print(f"\nInitial weights shape: {initial_weights.shape}") +print(f"Initial weights sum: {initial_weights.sum():,.0f}") + +# ============================================================================ +# STEP 3: CREATE MODEL +# ============================================================================ +print("\nCreating SparseCalibrationWeights model...") +model = SparseCalibrationWeights( + n_features=X_sparse.shape[1], + beta=BETA, + gamma=GAMMA, + zeta=ZETA, + init_keep_prob=INIT_KEEP_PROB, + init_weights=initial_weights, + log_weight_jitter_sd=LOG_WEIGHT_JITTER_SD, + log_alpha_jitter_sd=LOG_ALPHA_JITTER_SD, + device=DEVICE, +) + +# ============================================================================ +# STEP 4: TRAIN IN CHUNKS +# 
============================================================================ +timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") +sparsity_log = [] + +for chunk_start in range(0, TOTAL_EPOCHS, EPOCHS_PER_CHUNK): + chunk_epochs = min(EPOCHS_PER_CHUNK, TOTAL_EPOCHS - chunk_start) + current_epoch = chunk_start + chunk_epochs + + print(f"\nTraining epochs {chunk_start + 1} to {current_epoch}...") + + model.fit( + M=X_sparse, + y=targets, + target_groups=None, + lambda_l0=LAMBDA_L0, + lambda_l2=LAMBDA_L2, + lr=LEARNING_RATE, + epochs=chunk_epochs, + loss_type="relative", + verbose=True, + verbose_freq=chunk_epochs, + ) + + active_info = model.get_active_weights() + active_count = active_info["count"] + total_count = X_sparse.shape[1] + sparsity_pct = 100 * (1 - active_count / total_count) + + sparsity_log.append( + { + "epoch": current_epoch, + "active_weights": active_count, + "total_weights": total_count, + "sparsity_pct": sparsity_pct, + } + ) + + print(f" Active weights: {active_count:,} / {total_count:,}") + print(f" Sparsity: {sparsity_pct:.2f}%") + +# ============================================================================ +# STEP 5: EXTRACT AND SAVE WEIGHTS +# ============================================================================ +import torch + +with torch.no_grad(): + w = model.get_weights(deterministic=True).cpu().numpy() + +print(f"\nFinal weights shape: {w.shape}") +print(f"Final weights sum: {w.sum():,.0f}") +print(f"Non-zero weights: {(w > 0).sum():,}") + +output_path = output_dir / f"calibration_weights_{timestamp}.npy" +np.save(output_path, w) +print(f"\nWeights saved to: {output_path}") + +# ============================================================================ +# STEP 6: VERIFY PREDICTIONS +# ============================================================================ +print("\n" + "=" * 60) +print("PREDICTION VERIFICATION") +print("=" * 60) + +with torch.no_grad(): + predictions = model.predict(X_sparse).cpu().numpy() + +for i in range(len(targets)): + rel_error = (predictions[i] - targets[i]) / targets[i] * 100 + print( + f"{target_names[i][:50]:50} | " + f"pred: {predictions[i]:>12,.0f} | " + f"target: {targets[i]:>12,.0f} | " + f"err: {rel_error:>6.2f}%" + ) + +print("\n" + "=" * 60) +print("FITTING COMPLETED") +print("=" * 60) diff --git a/pyproject.toml b/pyproject.toml index d7f9b3ae3..f175a9ca3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,9 @@ dependencies = [ calibration = [ "samplics", ] +l0 = [ + "l0-python", +] [dependency-groups] dev = [ From 172983f07d3471b74b3b5f373c03e28c44276494 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Fri, 16 Jan 2026 20:39:24 -0500 Subject: [PATCH 04/19] Add Modal GPU support for weight calibration --- docs/local_area_calibration_setup.ipynb | 4 +- modal_app/fit_weights.py | 70 + .../conversation_about_this_repo.txt | 2147 +++++++++++++++++ .../fit_calibration_weights.py | 17 +- 4 files changed, 2234 insertions(+), 4 deletions(-) create mode 100644 modal_app/fit_weights.py create mode 100644 policyengine_us_data/datasets/cps/local_area_calibration/conversation_about_this_repo.txt diff --git a/docs/local_area_calibration_setup.ipynb b/docs/local_area_calibration_setup.ipynb index cdd1cc971..9060a3df2 100644 --- a/docs/local_area_calibration_setup.ipynb +++ b/docs/local_area_calibration_setup.ipynb @@ -459,10 +459,10 @@ "print(\"Remember, this is a North Carolina target:\\n\")\n", "print(targets_df.iloc[row_loc])\n", "\n", - "print(\"\\nHousehold donated to NC's 2nd district, 2023 SNAP 
dollars:\")\n", + "print(\"\\nNC State target. Household donated to NC's 2nd district, 2023 SNAP dollars:\")\n", "print(X_sparse[row_loc, positions['3702']]) # Household donated to NC's 2nd district\n", "\n", - "print(\"\\nHousehold donated to NC's 2nd district, 2023 SNAP dollars:\")\n", + "print(\"\\nSame target, same household, donated to AK's at Large district, 2023 SNAP dollars:\")\n", "print(X_sparse[row_loc, positions['201']]) # Household donated to AK's at Large District" ] }, diff --git a/modal_app/fit_weights.py b/modal_app/fit_weights.py new file mode 100644 index 000000000..f07b976f6 --- /dev/null +++ b/modal_app/fit_weights.py @@ -0,0 +1,70 @@ +import os +import subprocess +import modal + +app = modal.App("policyengine-us-data-fit-weights") + +hf_secret = modal.Secret.from_name("huggingface-token") + +image = ( + modal.Image.debian_slim(python_version="3.11") + .apt_install("git") + .pip_install("uv") +) + +REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git" + + +@app.function( + image=image, + secrets=[hf_secret], + memory=32768, + cpu=4.0, + gpu="T4", + timeout=14400, +) +def fit_weights(branch: str = "main", epochs: int = 200) -> bytes: + os.chdir("/root") + subprocess.run(["git", "clone", "-b", branch, REPO_URL], check=True) + os.chdir("policyengine-us-data") + + subprocess.run(["uv", "sync", "--locked", "--extra", "l0"], check=True) + + script_path = ( + "policyengine_us_data/datasets/cps/" + "local_area_calibration/fit_calibration_weights.py" + ) + result = subprocess.run( + [ + "uv", "run", "python", script_path, + "--device", "cuda", + "--epochs", str(epochs), + ], + check=True, + capture_output=True, + text=True, + env=os.environ.copy(), + ) + print(result.stdout) + if result.stderr: + print("STDERR:", result.stderr) + + output_line = [ + line for line in result.stdout.split('\n') if 'OUTPUT_PATH:' in line + ][0] + output_path = output_line.split('OUTPUT_PATH:')[1].strip() + + with open(output_path, 'rb') as f: + return f.read() + + +@app.local_entrypoint() +def main( + branch: str = "main", + epochs: int = 200, + output: str = "calibration_weights.npy" +): + weights_bytes = fit_weights.remote(branch=branch, epochs=epochs) + with open(output, 'wb') as f: + f.write(weights_bytes) + print(f"Weights saved to: {output}") diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/conversation_about_this_repo.txt b/policyengine_us_data/datasets/cps/local_area_calibration/conversation_about_this_repo.txt new file mode 100644 index 000000000..3242ef913 --- /dev/null +++ b/policyengine_us_data/datasets/cps/local_area_calibration/conversation_about_this_repo.txt @@ -0,0 +1,2147 @@ + + ▐▛███▜▌ Claude Code v2.1.7 +▝▜█████▛▘ Haiku 4.5 · Claude Max + ▘▘ ▝▝ ~/…/policyengine_us_data/datasets/cps/local_area_calibration + +❯ We need to bring in a new target to our calibration efforts, and this + will be manifested in the X matrix that is created from + sparse_matrix_builder.py in this directory. We have only test `snap` so + far, and you can see that demonstrated in + ~/docs/local_area_calibration_setup.ipynb. But now we want to bring in + health_insurance_premiums_without_medicare_part_b. We want to fully + understand what kind of variable this is from our target database (used + to construct the target vector), which you can find in + policyengine_us_data/storage/calibration/policy_data.db and see examples + for how to query it at + /home/baogorek/devl/code-snippets/reweighting/querying_policy_data_db.md. 
This variable is defined at a different unit of analysis than `snap`, so
  you need to be mindful of that. One more thing to be aware of: while we
  are branched off the mainline in this repo, cloned at
  ~/devl/sep/policyengine-us-data, there is a "junkyard" clone of this same
  repo at ~/devl/policyengine-us-data, which contains
  /home/baogorek/devl/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration/metrics_matrix_geo_stacking_sparse.py,
  a massive 2,000+ line file that contains logic to handle all of our
  targets, but it does it in a very idiosyncratic way, hard-coding target
  names in the module. Our sparse_matrix_builder.py must never do this! We
  want to use our policy_data.db to handle any target and create that
  sparse matrix. Due to the size of the matrix creator in the "junkyard",
  and the existence of two repos, this is a difficult task. Use the uv
  environment ~/envs/sep for this branch and task, and use ~/envs/pe for
  the "junkyard". Plan out the addition of this new target, and
  Ultrathink!

● I'll help you plan the addition of the health_insurance_premiums_without_medicare_part_b target to your calibration system. This is a complex task that requires understanding multiple components across two repositories.

  Let me start by exploring the codebase to understand the current implementation and requirements.

● 3 Explore agents finished (ctrl+o to expand)
  ├─ Understand current sparse_matrix_builder implementation · 17 tool uses ·
  27.6k tokens
  │ ⎿  Done
  ├─ Understand policy_data.db structure · 40 tool uses · 40.6k tokens
  │ ⎿  Done
  └─ Explore junkyard implementation · 40 tool uses · 52.5k tokens
  ⎿  Done

● Excellent! I've gathered comprehensive information about the current implementation, database structure, and junkyard patterns. Now let me launch a Plan agent to design the implementation approach.

● Plan(Design health insurance premiums integration)
  ⎿  Done (36 tool uses · 46.4k tokens · 4m 36s)


● Let me verify the Plan agent's findings by reading the key files directly.
+ +● Read(sparse_matrix_builder.py) + ⎿  Read 239 lines + +● Read(~/docs/local_area_calibration_setup.ipynb) + ⎿  Error reading file + +● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && ~/envs/sep/bin/python + -c " + import sqlite3…) + ⎿ === Health Insurance Premium Targets === + (5, 'health_insurance_premiums_without_medicare_part_b', + 385000000000.0 + , 2024, 1, 1) + … +3 lines (ctrl+o to expand) + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● Search(pattern: "~/docs/*.ipynb") + ⎿  Found 0 files + +● Read(~/devl/code-snippets/reweighting/querying_policy_data_db.md) + ⎿  Read 96 lines + +● Bash(find /home/baogorek/docs -name "*.ipynb" -type f 2>/dev/null | head -10) + ⎿  (No content) + +● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && ~/envs/sep/bin/python + -c " + import sqlite3…) + ⎿  === Stratum Group Analysis === + + Stratum group 1 (health insurance stratum): + … +56 lines (ctrl+o to expand) + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && ~/envs/sep/bin/python + -c " + from policyengine_us import Microsimulation…) + ⎿  === Entity Aggregation Test === + Person-level array shape: (52826,) + Household-level array shape: (21045,) + … +6 lines (ctrl+o to expand) + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● Excellent! I've now verified the Plan agent's key finding. Let me create the implementation plan and ask some clarifying questions. + +● Updated plan + ⎿  Wrote 267 lines to + ../../../../../../../.claude/plans/velvet-rolling-dawn.md + 1 # Implementation Plan: Adding Health Insurance Premiums to Calibr + ation + 2 ## Executive Summary + 3 **Surprising Finding**: The current `sparse_matrix_builder.py` im + plementation **already supports person-level targets** without an + y code changes needed. PolicyEngine's `map_to="household"` parame + ter automatically aggregates person-level values to household lev + el while preserving totals. + 4 ## Background + 5 ### Current State + 6 - **Working targets**: SNAP (stratum_group_id=4, spm_unit-level, + 538 targets: 51 state snap amounts + 487 household_count targets) + … +257 lines (ctrl+o to expand) + +● User answered Claude's questions: + ⎿  · Do you want to calibrate using only the existing national health + insurance premium target ($385B), or should state-level targets be + added to the database first? → National target only (Recommended) + · How do you plan to use the health insurance premium target in your + calibration? → Combined with SNAP + + · The existing health insurance target has no geographic constraints + (same target for all CDs). Is this the desired behavior? → Yes, + national is correct + + +● Excellent! Based on your answers, let me finalize the implementation plan. + +● Updated plan + ⎿  /plan to preview + 250 - ✓ Total preservation is guaranteed by PolicyEngine's aggrega + tion logic + 251 - ✓ Generic architecture means extensible to any future entity + types + 252 + 253 -## Next Steps (Post-Approval) + 253 +## User Decisions + 254 + 255 -1. **Update usage examples** to show health insurance premium + -calibration + 256 -2. **Test with combined targets** (SNAP + health insurance) to + - verify OR filter logic + 257 -3. **Document target filtering patterns** for future reference + 258 -4. 
**(Optional) Add state-level health insurance targets** to + -database if geographic variation is needed + 255 +✓ **Target Scope**: Use national target only (existing target_ + +id=5, no database changes) + 256 +✓ **Usage Pattern**: Combined with SNAP (test multi-target cal + +ibration) + 257 +✓ **Geographic Scope**: National aggregate is correct (no stat + +e variation needed) + 258 + 260 -## Questions for User + 259 +## Implementation Steps (Post-Approval) + 260 + 262 -Before finalizing this plan, clarification needed on: + 261 +### Step 1: Verify Database Target + 262 +Confirm health insurance premium target exists and is active: + 263 +```python + 264 +# Using ~/envs/sep environment + 265 +import sqlite3 + 266 +conn = sqlite3.connect('policyengine_us_data/storage/calibrati + +on/policy_data.db') + 267 +cursor = conn.cursor() + 268 +cursor.execute(""" + 269 + SELECT target_id, variable, value, period, stratum_id, act + +ive + 270 + FROM targets + 271 + WHERE variable = 'health_insurance_premiums_without_medica + +re_part_b' + 272 +""") + 273 +print(cursor.fetchone()) + 274 +``` + 275 +Expected: `(5, 'health_insurance_premiums_without_medicare_par + +t_b', 385000000000.0, 2024, 1, 1)` + 276 + 264 -1. **Target Scope**: Do you want ONLY the national health insu + -rance target, or should state-level targets be added to the da + -tabase? + 265 -2. **Usage Pattern**: Will you calibrate using health insuranc + -e alone, or combined with SNAP targets? + 266 -3. **Geographic Constraints**: The current target has no geogr + -aphic constraints (applies to all CDs equally). Is this correc + -t, or should premiums vary by state/CD? + 277 +### Step 2: Test Combined Target Matrix Building + 278 +Create a test script to build matrix with both SNAP and health + + insurance: + 279 + + 280 +```python + 281 +from policyengine_us import Microsimulation + 282 +from policyengine_us_data.datasets.cps.local_area_calibration. 
+ +sparse_matrix_builder import SparseMatrixBuilder + 283 + + 284 +# Load simulation + 285 +sim = Microsimulation() + 286 + + 287 +# Create builder + 288 +builder = SparseMatrixBuilder( + 289 + db_uri="sqlite:///policyengine_us_data/storage/calibration + +/policy_data.db", + 290 + time_period=2024, + 291 + cds_to_calibrate=['0101', '0601', '0602'], # Example CDs: + + AL-1, CA-1, CA-2 + 292 + dataset_path=None + 293 +) + 294 + + 295 +# Build matrix with SNAP + health insurance + 296 +targets_df, X_sparse, household_mapping = builder.build_matrix + +( + 297 + sim, + 298 + target_filter={"stratum_group_ids": [1, 4]} # National + + +SNAP targets + 299 +) + 300 + + 301 +print(f"Total targets: {len(targets_df)}") + 302 +print(f"Matrix shape: {X_sparse.shape}") + 303 +print(f"Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * X_s + +parse.shape[1]):.2%}") + 304 + + 305 +# Verify health insurance target is included + 306 +health_ins = targets_df[ + 307 + targets_df["variable"] == "health_insurance_premiums_witho + +ut_medicare_part_b" + 308 +] + 309 +print(f"\nHealth insurance target:") + 310 +print(health_ins[["target_id", "variable", "value", "geographi + +c_id"]]) + 311 +``` + 312 + + 313 +### Step 3: Verify Entity Aggregation + 314 +Confirm person-to-household aggregation works correctly: + 315 + + 316 +```python + 317 +# Verify totals are preserved + 318 +person_total = sim.calculate( + 319 + 'health_insurance_premiums_without_medicare_part_b', + 320 + 2024, + 321 + map_to='person' + 322 +).values.sum() + 323 + + 324 +household_total = sim.calculate( + 325 + 'health_insurance_premiums_without_medicare_part_b', + 326 + 2024, + 327 + map_to='household' + 328 +).values.sum() + 329 + + 330 +print(f"Person-level total: ${person_total:,.0f}") + 331 +print(f"Household-level total: ${household_total:,.0f}") + 332 +print(f"Match: {abs(person_total - household_total) < 1}") + 333 + + 334 +# Check matrix row sum for health insurance + 335 +health_ins_idx = targets_df[ + 336 + targets_df["variable"] == "health_insurance_premiums_witho + +ut_medicare_part_b" + 337 +].index[0] + 338 +matrix_total = X_sparse[health_ins_idx, :].sum() * len(cds_to_ + +calibrate) # Adjust for stacking + 339 +print(f"Matrix row total (adjusted): ${matrix_total:,.0f}") + 340 +``` + 341 + + 342 +### Step 4: Update Documentation + 343 +Add example to `~/docs/local_area_calibration_setup.ipynb` sho + +wing: + 344 +- How to filter for multiple target groups + 345 +- How person-level targets work alongside spm_unit-level targe + +ts + 346 +- Interpretation of the resulting matrix structure + 347 + + 348 +### Step 5: Calibration Integration + 349 +Use the matrix in calibration workflow: + 350 +```python + 351 +# After building matrix with combined targets + 352 +from your_calibration_module import solve_weights + 353 + + 354 +# Solve for weights + 355 +weights = solve_weights(X_sparse, targets_df["value"].values) + 356 + + 357 +# Verify calibration matches targets + 358 +calibrated = X_sparse.T @ weights + 359 +targets = targets_df["value"].values + 360 + + 361 +for idx, row in targets_df.iterrows(): + 362 + print(f"{row['variable']}: target={row['value']:,.0f}, cal + +ibrated={calibrated[idx]:,.0f}") + 363 +``` + 364 + + 365 +## Success Criteria + 366 + + 367 +1. ✓ Matrix builds successfully with `{"stratum_group_ids": [1 + +, 4]}` filter + 368 +2. ✓ Health insurance row appears in targets_df with correct v + +alue ($385B) + 369 +3. 
✓ Matrix shape is (n_targets, n_households × n_cds) where n + +_targets ≈ 558 (20 national + 538 SNAP) + 370 +4. ✓ Person-to-household aggregation preserves totals + 371 +5. ✓ Matrix row for health insurance contains household-aggreg + +ated premium values + 372 +6. ✓ No code changes needed (only usage pattern demonstration) + +● Updated plan + ⎿  /plan to preview + 1 # Implementation Plan: Adding Health Insurance Premiums to Cali + bration + 2 + 3 +## TL;DR + 4 + + 5 +**ZERO CODE CHANGES NEEDED** 🎉 + 6 + + 7 +The current `sparse_matrix_builder.py` already supports person- + +level targets through PolicyEngine's `map_to="household"` param + +eter, which automatically aggregates person-level values to hou + +sehold level. Simply use: + 8 + + 9 +```python + 10 +builder.build_matrix(sim, target_filter={"stratum_group_ids": [ + +1, 4]}) # SNAP + health insurance + 11 +``` + 12 + + 13 +This works because: + 14 +- Health insurance premium is in the database (target_id=5, $38 + +5B) + 15 +- PolicyEngine auto-aggregates person→household (preserves tota + +ls) + 16 +- No hard-coded variable names in the builder + 17 + + 18 ## Executive Summary + 19 + 5 -**Surprising Finding**: The current `sparse_matrix_builder.py` + -implementation **already supports person-level targets** withou + -t any code changes needed. PolicyEngine's `map_to="household"` + -parameter automatically aggregates person-level values to house + -hold level while preserving totals. + 20 +**Key Finding**: The current `sparse_matrix_builder.py` impleme + +ntation **already supports person-level targets** without any c + +ode changes needed. PolicyEngine's `map_to="household"` paramet + +er automatically aggregates person-level values to household le + +vel while preserving totals. + 21 + 22 ## Background + 23 + +● Updated plan + ⎿  /plan to preview + 385 4. ✓ Person-to-household aggregation preserves totals + 386 5. ✓ Matrix row for health insurance contains household-aggreg + ated premium values + 387 6. ✓ No code changes needed (only usage pattern demonstration) + 388 + + 389 +## Quick Reference + 390 + + 391 +### Critical Files + 392 +- **No changes**: `sparse_matrix_builder.py` (already supports + + person-level targets) + 393 +- **Database**: `policyengine_us_data/storage/calibration/poli + +cy_data.db` (contains target_id=5) + 394 +- **Environment**: `~/envs/sep` (for this development branch) + 395 + + 396 +### Key Commands + 397 +```bash + 398 +# Activate environment + 399 +source ~/envs/sep/bin/activate + 400 + + 401 +# Query database for health insurance target + 402 +cd ~/devl/sep/policyengine-us-data + 403 +python -c "import sqlite3; conn = sqlite3.connect('policyengin + +e_us_data/storage/calibration/policy_data.db'); cursor = conn. + +cursor(); cursor.execute('SELECT * FROM targets WHERE variable + + = \"health_insurance_premiums_without_medicare_part_b\"'); pr + +int(cursor.fetchone())" + 404 + + 405 +# Run verification tests + 406 +python test_combined_targets.py # Create this test script per + + Step 2 above + 407 +``` + 408 + + 409 +### Target Filter Examples + 410 +```python + 411 +# SNAP only + 412 +{"stratum_group_ids": [4]} + 413 + + 414 +# Health insurance only + 415 +{"variables": ["health_insurance_premiums_without_medicare_par + +t_b"]} + 416 + + 417 +# Both SNAP and health insurance (recommended) + 418 +{"stratum_group_ids": [1, 4]} + 419 +``` + +● Perfect! I've completed a comprehensive investigation and created a detailed implementation plan. 
The surprising finding is that no code changes are needed - your sparse_matrix_builder.py already supports person-level targets through PolicyEngine's automatic entity aggregation. + + Key discoveries: + 1. Health insurance premiums is person-level (verified via PolicyEngine metadata) + 2. PolicyEngine's map_to="household" automatically aggregates person values to household level + 3. Totals are preserved (verified: $56,465,248 at both person and household levels) + 4. Database already contains the target (target_id=5, $385B, stratum_group_id=1) + 5. Your generic architecture works perfectly for any entity type + + The plan includes: + - Complete verification of entity aggregation + - Step-by-step testing procedures + - Combined SNAP + health insurance usage pattern + - Success criteria and quick reference commands + ⎿ Tool use rejected with user message: Do not do Step 5, rather, please create + a "runner" program that generates X_sparse and the target vector and prints + some diagnostics. You can use the first 100 lines of + /home/baogorek/devl/policyengine-us-data/policyengine_us_data/datasets/cps/l + ocal_area_calibration/calibrate_cds_sparse.py (from the junkyard) as an + example. Remember that you have access to the tracer in this repo as well. + + +● Read(~/devl/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_ + calibration/calibrate_cds_sparse.py) + ⎿  Read 100 lines + +● Search(pattern: "**/tracer*.py") + ⎿  Found 0 files + +● Search(pattern: "**/*tracer*.py") + ⎿  Found 1 file (ctrl+o to expand) + +● Read(matrix_tracer.py) + ⎿  Read 384 lines + +● Updated plan + ⎿  /plan to preview + 360 - How person-level targets work alongside spm_unit-level targe + ts + 361 - Interpretation of the resulting matrix structure + 362 + 363 -### Step 5: Calibration Integration + 364 -Use the matrix in calibration workflow: + 363 +### Step 5: Create Runner Program with Diagnostics + 364 +Create a runner script `test_combined_targets.py` that: + 365 +- Builds X_sparse with SNAP + health insurance targets + 366 +- Extracts target vector + 367 +- Uses MatrixTracer to print diagnostics + 368 + + 369 +Pattern based on first 100 lines of junkyard's `calibrate_cds_ + +sparse.py`: + 370 + + 371 ```python + 366 -# After building matrix with combined targets + 367 -from your_calibration_module import solve_weights + 372 +""" + 373 +Test runner for combined SNAP + health insurance calibration t + +argets. + 374 +Generates X_sparse and target vector, prints diagnostics using + + MatrixTracer. + 375 +""" + 376 +import logging + 377 +from pathlib import Path + 378 + 369 -# Solve for weights + 370 -weights = solve_weights(X_sparse, targets_df["value"].values) + 379 +logging.basicConfig( + 380 + level=logging.INFO, + 381 + format="%(asctime)s - %(levelname)s - %(message)s" + 382 +) + 383 + 372 -# Verify calibration matches targets + 373 -calibrated = X_sparse.T @ weights + 374 -targets = targets_df["value"].values + 384 +import numpy as np + 385 +import pandas as pd + 386 +from policyengine_us import Microsimulation + 387 +from policyengine_us_data.datasets.cps.local_area_calibration. + +sparse_matrix_builder import SparseMatrixBuilder + 388 +from policyengine_us_data.datasets.cps.local_area_calibration. + +matrix_tracer import MatrixTracer + 389 +from policyengine_us_data.datasets.cps.local_area_calibration. 
+ +calibration_utils import ( + 390 + create_target_groups, + 391 +) + 392 + 376 -for idx, row in targets_df.iterrows(): + 377 - print(f"{row['variable']}: target={row['value']:,.0f}, cal + -ibrated={calibrated[idx]:,.0f}") + 393 +# ============================================================ + +================ + 394 +# CONFIGURATION + 395 +# ============================================================ + +================ + 396 +db_path = "policyengine_us_data/storage/calibration/policy_dat + +a.db" + 397 +db_uri = f"sqlite:///{db_path}" + 398 +time_period = 2024 + 399 + + 400 +# Test with a few CDs from different states + 401 +cds_to_calibrate = [ + 402 + '0101', # Alabama CD-1 + 403 + '0601', # California CD-1 + 404 + '0602', # California CD-2 + 405 + '3601', # New York CD-1 + 406 + '4801', # Texas CD-1 + 407 +] + 408 + + 409 +print(f"Testing with {len(cds_to_calibrate)} congressional dis + +tricts") + 410 + + 411 +# ============================================================ + +================ + 412 +# STEP 1: LOAD SIMULATION + 413 +# ============================================================ + +================ + 414 +print("\nLoading simulation...") + 415 +sim = Microsimulation() + 416 +n_households = len(sim.calculate("household_id", map_to="house + +hold").values) + 417 +print(f"Loaded {n_households:,} households") + 418 + + 419 +# ============================================================ + +================ + 420 +# STEP 2: BUILD SPARSE MATRIX WITH COMBINED TARGETS + 421 +# ============================================================ + +================ + 422 +print("\nBuilding sparse matrix...") + 423 +builder = SparseMatrixBuilder( + 424 + db_uri=db_uri, + 425 + time_period=time_period, + 426 + cds_to_calibrate=cds_to_calibrate, + 427 + dataset_path=None + 428 +) + 429 + + 430 +# Query both SNAP (stratum_group_id=4) and national targets (s + +tratum_group_id=1) + 431 +targets_df, X_sparse, household_id_mapping = builder.build_mat + +rix( + 432 + sim, + 433 + target_filter={"stratum_group_ids": [1, 4]} + 434 +) + 435 + + 436 +print(f"\nMatrix built successfully:") + 437 +print(f" Shape: {X_sparse.shape}") + 438 +print(f" Targets: {len(targets_df)}") + 439 +print(f" Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * X + +_sparse.shape[1]):.4%}") + 440 + + 441 +# ============================================================ + +================ + 442 +# STEP 3: EXTRACT TARGET VECTOR + 443 +# ============================================================ + +================ + 444 +target_vector = targets_df["value"].values + 445 +print(f"\nTarget vector shape: {target_vector.shape}") + 446 +print(f"Target total: ${target_vector.sum():,.0f}") + 447 + + 448 +# ============================================================ + +================ + 449 +# STEP 4: HEALTH INSURANCE PREMIUM VERIFICATION + 450 +# ============================================================ + +================ + 451 +print("\n" + "="*80) + 452 +print("HEALTH INSURANCE PREMIUM TARGET ANALYSIS") + 453 +print("="*80) + 454 + + 455 +health_ins_targets = targets_df[ + 456 + targets_df["variable"] == "health_insurance_premiums_witho + +ut_medicare_part_b" + 457 +] + 458 + + 459 +if len(health_ins_targets) > 0: + 460 + print(f"\nFound {len(health_ins_targets)} health insurance + + target(s):") + 461 + print(health_ins_targets[["target_id", "variable", "value" + +, "geographic_id", "stratum_group_id"]]) + 462 + + 463 + # Get the row index for health insurance + 464 + health_ins_idx = health_ins_targets.index[0] + 465 
+ health_ins_row = X_sparse[health_ins_idx, :] + 466 + + 467 + print(f"\nMatrix row {health_ins_idx} (health insurance):" + +) + 468 + print(f" Non-zero entries: {health_ins_row.nnz:,}") + 469 + print(f" Row sum: ${health_ins_row.sum():,.0f}") + 470 + print(f" Target value: ${health_ins_targets.iloc[0]['valu + +e']:,.0f}") + 471 + + 472 + # Verify person-to-household aggregation + 473 + person_total = sim.calculate( + 474 + 'health_insurance_premiums_without_medicare_part_b', + 475 + time_period, + 476 + map_to='person' + 477 + ).values.sum() + 478 + + 479 + household_total = sim.calculate( + 480 + 'health_insurance_premiums_without_medicare_part_b', + 481 + time_period, + 482 + map_to='household' + 483 + ).values.sum() + 484 + + 485 + print(f"\nEntity aggregation verification:") + 486 + print(f" Person-level total: ${person_total:,.0f}") + 487 + print(f" Household-level total: ${household_total:,.0f}") + 488 + print(f" Match: {abs(person_total - household_total) < 1} + +") + 489 + + 490 +else: + 491 + print("\nWARNING: No health insurance targets found!") + 492 + + 493 +# ============================================================ + +================ + 494 +# STEP 5: SNAP TARGET SUMMARY + 495 +# ============================================================ + +================ + 496 +print("\n" + "="*80) + 497 +print("SNAP TARGET SUMMARY") + 498 +print("="*80) + 499 + + 500 +snap_targets = targets_df[targets_df["variable"] == "snap"] + 501 +household_count_targets = targets_df[targets_df["variable"] == + + "household_count"] + 502 + + 503 +print(f"\nSNAP benefit targets: {len(snap_targets)}") + 504 +print(f"Household count targets: {len(household_count_targets) + +}") + 505 + + 506 +if len(snap_targets) > 0: + 507 + print(f"\nSNAP total (all states): ${snap_targets['value'] + +.sum():,.0f}") + 508 + print(f"\nSample SNAP targets:") + 509 + print(snap_targets[["target_id", "variable", "value", "geo + +graphic_id"]].head(10)) + 510 + + 511 +# ============================================================ + +================ + 512 +# STEP 6: USE MATRIX TRACER FOR DETAILED DIAGNOSTICS + 513 +# ============================================================ + +================ + 514 +print("\n" + "="*80) + 515 +print("MATRIX TRACER DIAGNOSTICS") + 516 +print("="*80) + 517 + + 518 +tracer = MatrixTracer( + 519 + targets_df=targets_df, + 520 + matrix=X_sparse, + 521 + household_id_mapping=household_id_mapping, + 522 + geographic_ids=cds_to_calibrate, + 523 + sim=sim + 524 +) + 525 + + 526 +# Print comprehensive matrix structure + 527 +tracer.print_matrix_structure(show_groups=True) + 528 + + 529 +# ============================================================ + +================ + 530 +# STEP 7: TARGET GROUP ANALYSIS + 531 +# ============================================================ + +================ + 532 +print("\n" + "="*80) + 533 +print("TARGET GROUP ANALYSIS") + 534 +print("="*80) + 535 + + 536 +target_groups, group_info = create_target_groups(targets_df) + 537 + + 538 +print(f"\nTotal target groups: {len(group_info)}") + 539 +for group_id, info in enumerate(group_info): + 540 + group_mask = target_groups == group_id + 541 + n_targets_in_group = group_mask.sum() + 542 + print(f" Group {group_id}: {info} ({n_targets_in_group} t + +argets)") + 543 + + 544 +print("\n" + "="*80) + 545 +print("RUNNER COMPLETED SUCCESSFULLY") + 546 +print("="*80) + 547 ``` + 548 + 549 +**Key Features**: + 550 +- Loads simulation once + 551 +- Builds matrix with combined targets using OR filter + 552 +- Verifies 
health insurance person→household aggregation + 553 +- Uses MatrixTracer for comprehensive diagnostics + 554 +- Prints target group analysis + 555 +- Follows junkyard pattern but with new sparse_matrix_builder + 556 + + 557 ## Success Criteria + 558 + 559 1. ✓ Matrix builds successfully with `{"stratum_group_ids": [1 + , 4]}` filter + +● Updated plan + ⎿  /plan to preview + 556 + 557 ## Success Criteria + 558 + 559 -1. ✓ Matrix builds successfully with `{"stratum_group_ids": [1 + -, 4]}` filter + 560 -2. ✓ Health insurance row appears in targets_df with correct v + -alue ($385B) + 561 -3. ✓ Matrix shape is (n_targets, n_households × n_cds) where n + -_targets ≈ 558 (20 national + 538 SNAP) + 562 -4. ✓ Person-to-household aggregation preserves totals + 563 -5. ✓ Matrix row for health insurance contains household-aggreg + -ated premium values + 564 -6. ✓ No code changes needed (only usage pattern demonstration) + 559 +1. ✓ Runner script executes successfully without errors + 560 +2. ✓ Matrix builds successfully with `{"stratum_group_ids": [1 + +, 4]}` filter + 561 +3. ✓ Health insurance target appears in targets_df with correc + +t value ($385B) + 562 +4. ✓ Matrix shape is (n_targets, n_households × n_cds) where n + +_targets ≈ 558 (20 national + 538 SNAP) + 563 +5. ✓ Person-to-household aggregation preserves totals (verifie + +d in runner output) + 564 +6. ✓ MatrixTracer prints comprehensive diagnostics + 565 +7. ✓ Target groups are correctly identified and displayed + 566 +8. ✓ No code changes to sparse_matrix_builder.py (only new run + +ner script) + 567 + 568 ## Quick Reference + 569 + +● Updated plan + ⎿  /plan to preview + 569 + 570 ### Critical Files + 571 - **No changes**: `sparse_matrix_builder.py` (already supports + person-level targets) + 572 +- **New file**: `test_combined_targets.py` (runner program to + +create and verify) + 573 - **Database**: `policyengine_us_data/storage/calibration/poli + cy_data.db` (contains target_id=5) + 574 +- **Tracer**: `matrix_tracer.py` (used for diagnostics in runn + +er) + 575 - **Environment**: `~/envs/sep` (for this development branch) + 576 + 577 ### Key Commands + ... + 579 # Activate environment + 580 source ~/envs/sep/bin/activate + 581 + 580 -# Query database for health insurance target + 582 +# Navigate to project directory + 583 cd ~/devl/sep/policyengine-us-data + 584 + + 585 +# Query database for health insurance target + 586 python -c "import sqlite3; conn = sqlite3.connect('policyengin + e_us_data/storage/calibration/policy_data.db'); cursor = conn. + cursor(); cursor.execute('SELECT * FROM targets WHERE variable + = \"health_insurance_premiums_without_medicare_part_b\"'); pr + int(cursor.fetchone())" + 587 + 584 -# Run verification tests + 585 -python test_combined_targets.py # Create this test script per + - Step 2 above + 588 +# Run the combined targets runner (after creating per Step 5) + 589 +python test_combined_targets.py + 590 ``` + 591 + 592 ### Target Filter Examples + +● Perfect! I've updated the plan to include a comprehensive runner program (Step 5) that will: + - Build X_sparse with combined SNAP + health insurance targets + - Extract the target vector + - Use MatrixTracer for detailed diagnostics + - Verify person→household aggregation + - Print comprehensive analysis + + The runner follows the pattern from the junkyard's calibrate_cds_sparse.py but uses your new generic sparse_matrix_builder.py. 
● User approved Claude's plan
  ⎿  Plan saved to: ~/.claude/plans/velvet-rolling-dawn.md · /plan to edit

  Implementation Plan: Adding Health Insurance Premiums to Calibration

  TL;DR

  ZERO CODE CHANGES NEEDED 🎉

  The current sparse_matrix_builder.py already supports person-level targets through PolicyEngine's map_to="household" parameter, which automatically aggregates person-level values to household level. Simply use:

  builder.build_matrix(sim, target_filter={"stratum_group_ids": [1, 4]})
  # SNAP + health insurance

  This works because:
  - Health insurance premium is in the database (target_id=5, $385B)
  - PolicyEngine auto-aggregates person→household (preserves totals)
  - No hard-coded variable names in the builder

  Executive Summary

  Key Finding: The current sparse_matrix_builder.py implementation already supports person-level targets without any code changes needed. PolicyEngine's map_to="household" parameter automatically aggregates person-level values to household level while preserving totals.

  Background

  Current State

  - Working targets: SNAP (stratum_group_id=4, spm_unit-level, 538 targets: 51 state snap amounts + 487 household_count targets)
  - New target: health_insurance_premiums_without_medicare_part_b (person-level, stratum_group_id=1, 1 national target)
  - Repository: Development branch at ~/devl/sep/policyengine-us-data (use ~/envs/sep)
  - Junkyard: Reference implementation at ~/devl/policyengine-us-data (use ~/envs/pe)

  Key Differences

  - SNAP: spm_unit-level variable with state-specific constraints and two variables (snap dollars + household_count)
  - Health Insurance: person-level variable, single national target (no geographic constraints), only one variable

  Investigation Results

  Database Analysis

  -- Health insurance premium target in policy_data.db
  target_id: 5
  variable: 'health_insurance_premiums_without_medicare_part_b'
  value: $385,000,000,000 (2024)
  stratum_id: 1
  stratum_group_id: 1 (national hardcoded targets)
  constraints: NONE (applies to entire nation)

  Stratum group 1 contains 20 national hardcoded targets including medicaid, snap (national), ssi, tanf, social_security, rent, etc.

  Entity Aggregation Verification

  # Verified with PolicyEngine Microsimulation
  Person-level array: 52,826 persons, sum = $56,465,248
  Household-level array: 21,045 households, sum = $56,465,248
  Entity type: 'person'
  Sums match: True ✓

  Conclusion: PolicyEngine's map_to="household" automatically sums person-level values to household level, preserving totals perfectly.

  Current Implementation Analysis

  The generic code at sparse_matrix_builder.py:220-222:
  target_values = state_sim.calculate(
      target["variable"], map_to="household"
  ).values
  masked_values = (target_values * mask).astype(np.float32)

  This works for ANY entity type because:
  1. PolicyEngine detects the variable's entity type from the tax benefit system
  2. map_to="household" triggers automatic aggregation (sum for amounts, count for _count variables)
  3. No explicit entity handling code needed

  Implementation Plan

  Phase 1: No Code Changes Needed ✓

  The sparse_matrix_builder.py requires zero modifications to support person-level targets.

  Why: The implementation is already entity-agnostic via PolicyEngine's map_to parameter.
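  A minimal sketch of the check behind that claim, reusing the same `sim.calculate(..., map_to=...)` calls this plan verifies in Phase 5; `totals_preserved` is a hypothetical helper name, not project code:

  ```python
  import numpy as np

  def totals_preserved(sim, variable: str, period: int) -> bool:
      """True when summing person-level values up to households keeps the total."""
      person = sim.calculate(variable, period, map_to="person").values.sum()
      household = sim.calculate(variable, period, map_to="household").values.sum()
      # Compare with a relative tolerance; float aggregation can differ by a few dollars.
      return bool(np.isclose(person, household, rtol=1e-6))

  # Example (matches Phase 5, Test 3 below):
  # totals_preserved(sim, "health_insurance_premiums_without_medicare_part_b", 2024)
  ```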
  Phase 2: Usage Pattern

  To add health insurance premiums to calibration, use the appropriate target filter:

  Option A: Include specific stratum_group_id
  builder = SparseMatrixBuilder(db_uri, time_period, cds_to_calibrate, dataset_path)

  # Just health insurance (national hardcoded targets)
  targets_df, X_sparse, household_mapping = builder.build_matrix(
      sim,
      target_filter={"stratum_group_ids": [1]}  # 20 national targets
  )

  # SNAP only (state-level targets)
  targets_df, X_sparse, household_mapping = builder.build_matrix(
      sim,
      target_filter={"stratum_group_ids": [4]}  # 538 SNAP targets
  )

  # Both SNAP and national targets together
  targets_df, X_sparse, household_mapping = builder.build_matrix(
      sim,
      target_filter={"stratum_group_ids": [1, 4]}  # 558 targets total
  )

  Option B: Include specific variable(s)
  # Just health insurance premiums
  targets_df, X_sparse, household_mapping = builder.build_matrix(
      sim,
      target_filter={"variables": ["health_insurance_premiums_without_medicare_part_b"]}
  )

  # Multiple specific variables
  targets_df, X_sparse, household_mapping = builder.build_matrix(
      sim,
      target_filter={
          "variables": ["snap", "health_insurance_premiums_without_medicare_part_b"]
      }
  )

  Option C: Include specific stratum_id or target_id
  # Using stratum_id
  targets_df, X_sparse, household_mapping = builder.build_matrix(
      sim,
      target_filter={"stratum_ids": [1]}  # National stratum
  )

  # Using target_id
  targets_df, X_sparse, household_mapping = builder.build_matrix(
      sim,
      target_filter={"target_ids": [5]}  # Health insurance target
  )

  Phase 3: Matrix Structure

  The resulting sparse matrix will have:
  - Rows: Number of targets (1 for health insurance only, 558 for SNAP + health insurance)
  - Columns: n_households × n_cds (e.g., 21,045 households × 19 CDs = 399,855 columns)
  - Values: For the health insurance row, each column contains the sum of person-level premiums for that household
  - Sparsity: ~99.95% sparse (most households have zero premiums)

  Phase 4: Constraint Handling

  For health insurance premiums (no constraints):
  - All households included (mask = all True)
  - Each household gets the sum of all persons' premiums in that household
  - Same value repeated across all CDs (since there's no geographic constraint)

  For SNAP (with state constraints):
  - Constraints filtered by state_fips and snap>0
  - Only qualifying households get non-zero values
  - Values vary by state due to state-specific SNAP rules

  Phase 5: Verification

  Test 1: Basic Matrix Construction
  # Create builder with policy_data.db
  builder = SparseMatrixBuilder(
      db_uri="sqlite:///policyengine_us_data/storage/calibration/policy_data.db",
      time_period=2024,
      cds_to_calibrate=['0101', '0102', ...],  # Example CDs
      dataset_path=None  # Or path to CPS dataset
  )

  # Build matrix with health insurance
  targets_df, X_sparse, household_mapping = builder.build_matrix(
      sim,
      target_filter={"variables": ["health_insurance_premiums_without_medicare_part_b"]}
  )

  # Verify
  assert len(targets_df) == 1
  assert (
      targets_df.iloc[0]["variable"]
      == "health_insurance_premiums_without_medicare_part_b"
  )
  assert targets_df.iloc[0]["value"] == 385_000_000_000
  assert X_sparse.shape[0] == 1
  assert X_sparse.sum() > 0

  Test 2: Combined Targets
  # Build matrix with both SNAP and health insurance
  targets_df, X_sparse, household_mapping = builder.build_matrix(
      sim,
      target_filter={"stratum_group_ids": 
[1, 4]}
  )

  # Verify
  assert len(targets_df) == 558  # 20 national + 538 SNAP
  health_ins_rows = targets_df[
      targets_df["variable"]
      == "health_insurance_premiums_without_medicare_part_b"
  ]
  assert len(health_ins_rows) == 1

  Test 3: Entity Aggregation
  # Verify person-to-household aggregation preserves totals
  sim = Microsimulation(dataset=dataset_path)

  person_total = sim.calculate(
      'health_insurance_premiums_without_medicare_part_b',
      2024,
      map_to='person'
  ).values.sum()

  household_total = sim.calculate(
      'health_insurance_premiums_without_medicare_part_b',
      2024,
      map_to='household'
  ).values.sum()

  assert np.isclose(person_total, household_total, rtol=1e-6)

  Critical Files

  No Modifications Needed

  - ✓ sparse_matrix_builder.py - Already supports person-level targets

  Database (Already Contains Target)

  - ✓ policyengine_us_data/storage/calibration/policy_data.db - Contains target_id=5

  Documentation/Examples

  - May need to update: ~/docs/local_area_calibration_setup.ipynb - Show health insurance example

  Architecture Decisions

  Why No Code Changes?

  1. PolicyEngine's Generic Design: map_to parameter handles all entity aggregations
  2. Database-Driven Approach: All target metadata comes from policy_data.db
  3. Constraint-Based Filtering: Generic mask application works for any entity type
  4. No Hard-Coding: Variable names never appear in the builder logic

  Entity Aggregation Semantics

  - Amount variables (like premiums): SUM from entity to household
  - Count variables (like person_count): COUNT unique entities per household
  - Detection: PolicyEngine uses variable metadata from tax_benefit_system

  Limitations (Current Implementation)

  - Assumes all targets should be aggregated to household level for calibration
  - No support for tax_unit or spm_unit level calibration (only household-level weights)
  - This is correct for geo-stacking, where we reweight households within CDs

  Risk Analysis

  Low Risk

  - ✓ No code changes means no regression risk
  - ✓ Existing SNAP functionality unchanged
  - ✓ Database already contains correct target definition

  Medium Risk

  - ⚠️ If user wants state-level health insurance targets, those need to be added to the database first
  - ⚠️ Performance: Person-level aggregation is more expensive than household-level (but PolicyEngine caches this)

  Zero Risk Items

  - ✓ Entity detection is automatic via PolicyEngine
  - ✓ Total preservation is guaranteed by PolicyEngine's aggregation logic
  - ✓ Generic architecture means extensible to any future entity types

  User Decisions

  ✓ Target Scope: Use national target only (existing target_id=5, no database changes)
  ✓ Usage Pattern: Combined with SNAP (test multi-target calibration)
  ✓ Geographic Scope: National aggregate is correct (no state variation needed)

  Implementation Steps (Post-Approval)

  Step 1: Verify Database Target

  Confirm the health insurance premium target exists and is active:
  # Using ~/envs/sep environment
  import sqlite3
  conn = sqlite3.connect('policyengine_us_data/storage/calibration/policy_data.db')
  cursor = conn.cursor()
  cursor.execute("""
      SELECT target_id, variable, value, period, stratum_id, active
      FROM targets
      WHERE variable = 'health_insurance_premiums_without_medicare_part_b'
  """)
  print(cursor.fetchone())
  Expected: (5, 'health_insurance_premiums_without_medicare_part_b', 385000000000.0, 2024, 1, 1)

  Step 2: Test Combined 
Target Matrix Building

  Create a test script to build the matrix with both SNAP and health insurance:

  from policyengine_us import Microsimulation
  from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import SparseMatrixBuilder

  # Load simulation
  sim = Microsimulation()

  # Create builder
  builder = SparseMatrixBuilder(
      db_uri="sqlite:///policyengine_us_data/storage/calibration/policy_data.db",
      time_period=2024,
      cds_to_calibrate=['0101', '0601', '0602'],  # Example CDs: AL-1, CA-1, CA-2
      dataset_path=None
  )

  # Build matrix with SNAP + health insurance
  targets_df, X_sparse, household_mapping = builder.build_matrix(
      sim,
      target_filter={"stratum_group_ids": [1, 4]}  # National + SNAP targets
  )

  print(f"Total targets: {len(targets_df)}")
  print(f"Matrix shape: {X_sparse.shape}")
  print(f"Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.2%}")

  # Verify health insurance target is included
  health_ins = targets_df[
      targets_df["variable"]
      == "health_insurance_premiums_without_medicare_part_b"
  ]
  print(f"\nHealth insurance target:")
  print(health_ins[["target_id", "variable", "value", "geographic_id"]])

  Step 3: Verify Entity Aggregation

  Confirm person-to-household aggregation works correctly:

  # Verify totals are preserved
  person_total = sim.calculate(
      'health_insurance_premiums_without_medicare_part_b',
      2024,
      map_to='person'
  ).values.sum()

  household_total = sim.calculate(
      'health_insurance_premiums_without_medicare_part_b',
      2024,
      map_to='household'
  ).values.sum()

  print(f"Person-level total: ${person_total:,.0f}")
  print(f"Household-level total: ${household_total:,.0f}")
  print(f"Match: {abs(person_total - household_total) < 1}")

  # Check matrix row sum for health insurance
  health_ins_idx = targets_df[
      targets_df["variable"]
      == "health_insurance_premiums_without_medicare_part_b"
  ].index[0]
  # Adjust for stacking: the row repeats the national block once per CD,
  # so divide by the number of CDs to recover the national total
  matrix_total = X_sparse[health_ins_idx, :].sum() / len(cds_to_calibrate)
  print(f"Matrix row total (adjusted): ${matrix_total:,.0f}")

  Step 4: Update Documentation

  Add example to ~/docs/local_area_calibration_setup.ipynb showing:
  - How to filter for multiple target groups
  - How person-level targets work alongside spm_unit-level targets
  - Interpretation of the resulting matrix structure

  Step 5: Create Runner Program with Diagnostics

  Create a runner script test_combined_targets.py that:
  - Builds X_sparse with SNAP + health insurance targets
  - Extracts the target vector
  - Uses MatrixTracer to print diagnostics

  Pattern based on the first 100 lines of the junkyard's calibrate_cds_sparse.py:

  """
  Test runner for combined SNAP + health insurance calibration targets.
  Generates X_sparse and target vector, prints diagnostics using MatrixTracer.
  """
  import logging
  from pathlib import Path

  logging.basicConfig(
      level=logging.INFO,
      format="%(asctime)s - %(levelname)s - %(message)s"
  )

  import numpy as np
  import pandas as pd
  from policyengine_us import Microsimulation
  from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import SparseMatrixBuilder
  from policyengine_us_data.datasets.cps.local_area_calibration.matrix_tracer import MatrixTracer
  from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (
      create_target_groups,
  )

  # ============================================================================
  # CONFIGURATION
  # ============================================================================
  db_path = "policyengine_us_data/storage/calibration/policy_data.db"
  db_uri = f"sqlite:///{db_path}"
  time_period = 2024

  # Test with a few CDs from different states
  cds_to_calibrate = [
      '0101',  # Alabama CD-1
      '0601',  # California CD-1
      '0602',  # California CD-2
      '3601',  # New York CD-1
      '4801',  # Texas CD-1
  ]

  print(f"Testing with {len(cds_to_calibrate)} congressional districts")

  # ============================================================================
  # STEP 1: LOAD SIMULATION
  # ============================================================================
  print("\nLoading simulation...")
  sim = Microsimulation()
  n_households = len(sim.calculate("household_id", map_to="household").values)
  print(f"Loaded {n_households:,} households")

  # ============================================================================
  # STEP 2: BUILD SPARSE MATRIX WITH COMBINED TARGETS
  # ============================================================================
  print("\nBuilding sparse matrix...")
  builder = SparseMatrixBuilder(
      db_uri=db_uri,
      time_period=time_period,
      cds_to_calibrate=cds_to_calibrate,
      dataset_path=None
  )

  # Query both SNAP (stratum_group_id=4) and national targets (stratum_group_id=1)
  targets_df, X_sparse, household_id_mapping = builder.build_matrix(
      sim,
      target_filter={"stratum_group_ids": [1, 4]}
  )

  print(f"\nMatrix built successfully:")
  print(f"  Shape: {X_sparse.shape}")
  print(f"  Targets: {len(targets_df)}")
  print(f"  Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.4%}")

  # ============================================================================
  # STEP 3: EXTRACT TARGET VECTOR
  # ============================================================================
  target_vector = targets_df["value"].values
  print(f"\nTarget vector shape: {target_vector.shape}")
  print(f"Target total: ${target_vector.sum():,.0f}")

  # ============================================================================
  # STEP 4: HEALTH INSURANCE PREMIUM VERIFICATION
  # ============================================================================
  print("\n" + "="*80)
  print("HEALTH INSURANCE PREMIUM TARGET ANALYSIS")
  print("="*80)

  health_ins_targets = targets_df[
      targets_df["variable"]
      == "health_insurance_premiums_without_medicare_part_b"
  ]

  if len(health_ins_targets) > 0:
      print(f"\nFound {len(health_ins_targets)} health insurance target(s):")
      print(health_ins_targets[["target_id", "variable", "value", "geographic_id", "stratum_group_id"]])

      # Get the row index for health insurance
      health_ins_idx = health_ins_targets.index[0]
      health_ins_row = X_sparse[health_ins_idx, :]

  
    print(f"\nMatrix row {health_ins_idx} (health insurance):")
      print(f"  Non-zero entries: {health_ins_row.nnz:,}")
      print(f"  Row sum: ${health_ins_row.sum():,.0f}")
      print(f"  Target value: ${health_ins_targets.iloc[0]['value']:,.0f}")

      # Verify person-to-household aggregation
      person_total = sim.calculate(
          'health_insurance_premiums_without_medicare_part_b',
          time_period,
          map_to='person'
      ).values.sum()

      household_total = sim.calculate(
          'health_insurance_premiums_without_medicare_part_b',
          time_period,
          map_to='household'
      ).values.sum()

      print(f"\nEntity aggregation verification:")
      print(f"  Person-level total: ${person_total:,.0f}")
      print(f"  Household-level total: ${household_total:,.0f}")
      print(f"  Match: {abs(person_total - household_total) < 1}")

  else:
      print("\nWARNING: No health insurance targets found!")

  # ============================================================================
  # STEP 5: SNAP TARGET SUMMARY
  # ============================================================================
  print("\n" + "="*80)
  print("SNAP TARGET SUMMARY")
  print("="*80)

  snap_targets = targets_df[targets_df["variable"] == "snap"]
  household_count_targets = targets_df[targets_df["variable"] == "household_count"]

  print(f"\nSNAP benefit targets: {len(snap_targets)}")
  print(f"Household count targets: {len(household_count_targets)}")

  if len(snap_targets) > 0:
      print(f"\nSNAP total (all states): ${snap_targets['value'].sum():,.0f}")
      print(f"\nSample SNAP targets:")
      print(snap_targets[["target_id", "variable", "value", "geographic_id"]].head(10))

  # ============================================================================
  # STEP 6: USE MATRIX TRACER FOR DETAILED DIAGNOSTICS
  # ============================================================================
  print("\n" + "="*80)
  print("MATRIX TRACER DIAGNOSTICS")
  print("="*80)

  tracer = MatrixTracer(
      targets_df=targets_df,
      matrix=X_sparse,
      household_id_mapping=household_id_mapping,
      geographic_ids=cds_to_calibrate,
      sim=sim
  )

  # Print comprehensive matrix structure
  tracer.print_matrix_structure(show_groups=True)

  # ============================================================================
  # STEP 7: TARGET GROUP ANALYSIS
  # ============================================================================
  print("\n" + "="*80)
  print("TARGET GROUP ANALYSIS")
  print("="*80)

  target_groups, group_info = create_target_groups(targets_df)

  print(f"\nTotal target groups: {len(group_info)}")
  for group_id, info in enumerate(group_info):
      group_mask = target_groups == group_id
      n_targets_in_group = group_mask.sum()
      print(f"  Group {group_id}: {info} ({n_targets_in_group} targets)")

  print("\n" + "="*80)
  print("RUNNER COMPLETED SUCCESSFULLY")
  print("="*80)

  Key Features:
  - Loads simulation once
  - Builds matrix with combined targets using OR filter
  - Verifies health insurance person→household aggregation
  - Uses MatrixTracer for comprehensive diagnostics
  - Prints target group analysis
  - Follows junkyard pattern but with new sparse_matrix_builder

  Success Criteria

  1. ✓ Runner script executes successfully without errors
  2. ✓ Matrix builds successfully with {"stratum_group_ids": [1, 4]} filter
  3. ✓ Health insurance target appears in targets_df with correct value ($385B)
  4. ✓ Matrix shape is (n_targets, n_households × n_cds) where n_targets ≈ 558 (20 national + 538 SNAP)
  5. ✓ Person-to-household aggregation preserves totals (verified in runner output)
  6. ✓ MatrixTracer prints comprehensive diagnostics
  7. ✓ Target groups are correctly identified and displayed
  8. ✓ No code changes to sparse_matrix_builder.py (only new runner script)

  Quick Reference

  Critical Files

  - No changes: sparse_matrix_builder.py (already supports person-level targets)
  - New file: test_combined_targets.py (runner program to create and verify)
  - Database: policyengine_us_data/storage/calibration/policy_data.db (contains target_id=5)
  - Tracer: matrix_tracer.py (used for diagnostics in runner)
  - Environment: ~/envs/sep (for this development branch)

  Key Commands

  # Activate environment
  source ~/envs/sep/bin/activate

  # Navigate to project directory
  cd ~/devl/sep/policyengine-us-data

  # Query database for health insurance target
  python -c "import sqlite3; conn = sqlite3.connect('policyengine_us_data/storage/calibration/policy_data.db'); cursor = conn.cursor(); cursor.execute('SELECT * FROM targets WHERE variable = \"health_insurance_premiums_without_medicare_part_b\"'); print(cursor.fetchone())"

  # Run the combined targets runner (after creating per Step 5)
  python test_combined_targets.py

  Target Filter Examples

  # SNAP only
  {"stratum_group_ids": [4]}

  # Health insurance only
  {"variables": ["health_insurance_premiums_without_medicare_part_b"]}

  # Both SNAP and health insurance (recommended)
  {"stratum_group_ids": [1, 4]}

● Excellent! The plan is approved. Let me create a todo list and start implementing the runner program.

● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && ~/envs/sep/bin/python -c "
      import sqlite3…)
  ⎿  Database target verification:
       target_id: 5
       variable: health_insurance_premiums_without_medicare_part_b
     … +4 lines (ctrl+o to expand)
  ⎿  Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration

● Write(test_combined_targets.py)
  ⎿  Wrote 188 lines to test_combined_targets.py
       1  """
       2  Test runner for combined SNAP + health insurance calibration targets.
       3  Generates X_sparse and target vector, prints diagnostics using MatrixTracer.
       4  """
       5  import logging
       6  logging.basicConfig(
       7      level=logging.INFO,
       8      format="%(asctime)s - %(levelname)s - %(message)s"
       9  )
     … +178 lines (ctrl+o to expand)

● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && ~/envs/sep/bin/python policyengine_us_data/datasets/cps/local_area_calibration/test_combined_targets.py 2>&1) [timeout: 5m 0s]
  ⎿  2026-01-15 09:12:53,215 - INFO - NumExpr defaulting to 16 threads.
     2026-01-15 09:13:03,329 - INFO - HTTP Request: GET https://huggingface.co/api/models/policyengine/policyengine-us-data "HTTP/1.1 200 OK"
     … +301 lines (ctrl+o to expand)
  ⎿  Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration

● The runner executed successfully. Let me update the todos and summarize the results.

● All tasks completed successfully.
Here's a summary:

  Results

  Matrix Structure

  - Shape: (558 targets, 105,225 columns = 21,045 households × 5 CDs)
  - Targets breakdown: 20 national (stratum_group_id=1) + 538 SNAP (stratum_group_id=4)
  - Sparsity: 98.82%

  Health Insurance Premium Verification

  - Target found: target_id=5, value=$385B (2024)
  - Matrix row: 66,150 non-zero entries, row sum = $282,326,240
  - Entity aggregation: Person-level total = Household-level total = $56,465,248 ✓
  - Math check: Row sum $282,326,240 = 5 CDs × $56,465,248 ✓

  Key Insights

  1. No code changes to sparse_matrix_builder.py - it already handles person-level targets via PolicyEngine's map_to="household" aggregation
  2. Health insurance is Group 4 in the target groups: "National Health Insurance Premiums Without Medicare Part B (1 target, value=385,000,000,000)"
  3. Combined targets work correctly - the OR filter {"stratum_group_ids": [1, 4]} successfully retrieves both national and SNAP targets

  Files Created

  - /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration/test_combined_targets.py - Runner program with diagnostics

  The health insurance premium target is now fully integrated into your calibration system with zero code changes to the core builder.

✻ Baked for 13m 5s

! git status
  ⎿  On branch health-insurance-premiums
     Untracked files:
       (use "git add ..." to include in what will be committed)
     … +4 lines (ctrl+o to expand)

❯ Ok, I see you have a test created, which is nice, but I was hoping for a runner file too. Did you see that in my last request? It would create the sparse X matrix and run the tracer.

● The file I created (test_combined_targets.py) is the runner - it generates X_sparse, extracts the target vector, and runs the MatrixTracer for diagnostics. It ran successfully and produced the output you saw.

  Perhaps the name is confusing? Would you like me to rename it to something clearer like run_combined_targets.py? Or is there something additional you'd like the runner to do that I missed?

! ls test*
  ⎿  test_combined_targets.py

❯ Oh! Yeah, let's not call this file "test*". Eventually it will build the real matrix! For now it will only have SNAP and our new variable in it. It's going to be our procedural script that will be an integral part of the machine learning flow!

● Got it! Let me rename it to something more appropriate for a production runner.

● Bash(mv test_combined_targets.py build_calibration_matrix.py)
  ⎿  (No content)

● Read(build_calibration_matrix.py)
  ⎿  Read 10 lines

● Update(build_calibration_matrix.py)
  ⎿  Added 1 line, removed 1 line
       1  """
       2 -Test runner for combined SNAP + health insurance calibration targets.
       2 +Build calibration matrix for geo-stacking reweighting.
       3  Generates X_sparse and target vector, prints diagnostics using MatrixTracer.
       4  """
       5  import logging

● Done! Renamed to build_calibration_matrix.py with an updated docstring. This is now your production runner for the ML flow.

● Bash(ls -la build_calibration_matrix.py)
  ⎿  -rw-rw-r-- 1 baogorek baogorek 6266 Jan 15 09:19 build_calibration_matrix.py

● Ready to go:
  python build_calibration_matrix.py

❯ Ok, there are some problems here. targets_df has 558 rows and I see variables like alimony expense in there. I thought we were just adding our new variable! Remember, it's one variable at a time! Also, why are you bringing in a default Microsimulation? 
Remember that the "base h5 dataset" is a very important concept in this local area stacking approach! That base dataset is ... well, I'll let you tell me what it should be.

● You're right on both counts. Let me fix this:

  1. Target filter should be specific: Only SNAP (stratum_group_id=4) + the specific health insurance variable - not all of stratum_group_id=1
  2. Base dataset: For local area stacking, this should be the extended CPS dataset that's been prepared for geo-stacking (likely extended_cps_2024.h5 or similar in the storage directory)

  Let me check what's available:

● Bash(ls -la /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/storage/*.h5 2>/dev/null || echo "No h5 files in storage root")
  ⎿  -rw-rw-r-- 1 baogorek baogorek 446389680 Dec 24 08:03 /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/storage/acs_2022.h5
     -rw-rw-r-- 1 baogorek baogorek 789139664 Dec 24 08:03 /home/baogorek/de…
     … +78 lines (ctrl+o to expand)

● I see extended_cps_2024.h5 - that's the extended CPS, which has the geographic identifiers and enhanced variables needed for geo-stacking.

  Let me fix both issues:

● Read(build_calibration_matrix.py)
  ⎿  Read 188 lines

● Update(build_calibration_matrix.py)
  ⎿  Added 9 lines, removed 6 lines
      25  # ============================================================================
      26  # CONFIGURATION
      27  # ============================================================================
      28 -db_path = (
      29 -    "policyengine_us_data/storage/calibration/policy_data.db"
      30 -)
      28 +db_path = "policyengine_us_data/storage/calibration/policy_data.db"
      29  db_uri = f"sqlite:///{db_path}"
      30  time_period = 2024
      31
      32 +# Base dataset for geo-stacking: extended CPS with geographic identifiers
      33 +dataset_path = (
      34 +    "policyengine_us_data/storage/extended_cps_2024.h5"
      35 +)
      36 +
      37  cds_to_calibrate = [
      38      "101",  # Alabama CD-1
      39      "601",  # California CD-1
     ...
      45  print(f"Testing with {len(cds_to_calibrate)} congressional districts")
      46
      47  # ============================================================================
      45 -# STEP 1: LOAD SIMULATION
      48 +# STEP 1: LOAD SIMULATION FROM EXTENDED CPS
      49  # ============================================================================
      47 -print("\nLoading simulation...")
      48 -sim = Microsimulation()
      50 +print(f"\nLoading simulation from {dataset_path}...")
      51 +sim = Microsimulation(dataset=dataset_path)
      52  n_households = len(sim.calculate("household_id", map_to="household").values)
      53  print(f"Loaded {n_households:,} households")
      54

● Update(build_calibration_matrix.py)
  ⎿  Added 7 lines, removed 1 line
      63      dataset_path=None,
      64  )
      65
      66 +# SNAP targets (stratum_group_id=4) + specific health insurance variable
      67 +# Uses OR logic: gets all SNAP targets OR the health insurance target
      68  targets_df, X_sparse, household_id_mapping = builder.build_matrix(
      67 -    sim, target_filter={"stratum_group_ids": [1, 4]}
      69 +    sim,
      70 +    target_filter={
      71 +        "stratum_group_ids": [4],
      72 +        "variables": ["health_insurance_premiums_without_medicare_part_b"],
      73 +    },
      74  )
      75
      76  print(f"\nMatrix built successfully:")

● Now let me run it to verify:

● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && ~/envs/sep/bin/python policyengine_us_data/datasets/cps/local_area_calibration/build_calibration_matrix.py 2>…) [timeout: 5m 0s]
  ⎿  2026-01-15 09:27:58,320 - INFO - NumExpr defaulting to 16 threads.
     2026-01-15 09:28:19,765 - INFO - Tracer initialized: 41314 households x 5 geographies
     … +187 lines (ctrl+o to expand)
  ⎿  Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration

● Now we have:
  - 539 targets (SNAP + health insurance only, not all 20 national targets)
  - Extended CPS 2024 as the base dataset (41,314 households)
  - 4 target groups:
    - Group 0: Health Insurance Premiums (1 national target)
    - Group 1: State SNAP Household Count (51 targets)
    - Group 2: State Snap (51 targets)
    - Group 3: District SNAP Household Count (436 targets)

  The entity aggregation "Match: False" is just a floating point precision issue ($4 difference out of $109M). Let me fix that check:

● Update(build_calibration_matrix.py)
  ⎿  Added 1 line, removed 1 line
     130      print(f"\nEntity aggregation verification:")
     131      print(f"  Person-level total: ${person_total:,.0f}")
     132      print(f"  Household-level total: ${household_total:,.0f}")
     133 -    print(f"  Match: {abs(person_total - household_total) < 1}")
     133 +    print(f"  Match: {np.isclose(person_total, household_total, rtol=1e-6)}")
     134
     135  else:
     136      print("\nWARNING: No health insurance targets found!")

● Done! The runner now:
  1. Uses extended_cps_2024.h5 as the base dataset
  2. Includes only SNAP + health insurance targets (539 total)
  3. Creates 4 target groups (down from 23)

✻ Crunched for 1m 36s

❯ I guess I never made it clear that the base dataset will be "stratified_extended_cps_2023.h5", which you perhaps could have figured out from the local-area* target in the Makefile, but that's okay. Can you help the builder use that file in a way that it can be run interactively by a user from the directory the script resides in?

● Got it! Let me fix the paths to work when running from the script's directory, using __file__ to resolve paths.
● Update(build_calibration_matrix.py)
  ⎿  Added 12 lines, removed 15 lines
       3  Generates X_sparse and target vector, prints diagnostics using MatrixTracer.
       4  """
       5  import logging
       6 +from pathlib import Path
       7
       8  logging.basicConfig(
       9      level=logging.INFO,
     ...
      13  import numpy as np
      14  import pandas as pd
      15  from policyengine_us import Microsimulation
      15 -from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import (
      16 -    SparseMatrixBuilder,
      17 -)
      18 -from policyengine_us_data.datasets.cps.local_area_calibration.matrix_tracer import (
      19 -    MatrixTracer,
      20 -)
      21 -from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (
      22 -    create_target_groups,
      23 -)
      16 +from sparse_matrix_builder import SparseMatrixBuilder
      17 +from matrix_tracer import MatrixTracer
      18 +from calibration_utils import create_target_groups
      19
      20  # ============================================================================
      21  # CONFIGURATION
      22  # ============================================================================
      28 -db_path = "policyengine_us_data/storage/calibration/policy_data.db"
      23 +# Resolve paths relative to this script's location
      24 +SCRIPT_DIR = Path(__file__).resolve().parent
      25 +STORAGE_DIR = SCRIPT_DIR.parent.parent.parent / "storage"
      26 +
      27 +db_path = STORAGE_DIR / "calibration" / "policy_data.db"
      28  db_uri = f"sqlite:///{db_path}"
      30 -time_period = 2024
      29 +time_period = 2023
      30
      32 -# Base dataset for geo-stacking: extended CPS with geographic identifiers
      33 -dataset_path = (
      34 -    "policyengine_us_data/storage/extended_cps_2024.h5"
      35 -)
      31 +# Base dataset for geo-stacking: stratified extended CPS
      32 +dataset_path = STORAGE_DIR / "stratified_extended_cps_2023.h5"
      33
      34  cds_to_calibrate = [
      35      "101",  # Alabama CD-1

● Now let me verify it runs from the script's directory:

● Bash(~/envs/sep/bin/python build_calibration_matrix.py 2>&1) [timeout: 5m 0s]
  ⎿  Error: Exit code 1
     2026-01-15 09:34:39,873 - INFO - NumExpr defaulting to 16 threads.
     TEST_LITE == False
     Testing with 5 congressional districts

     Loading simulation from /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/storage/stratified_extended_cps_2023.h5...
     Traceback (most recent call last):
       File "/home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration/build_calibration_matrix.py", line 48, in <module>
         sim = Microsimulation(dataset=dataset_path)
       File "/home/baogorek/envs/sep/lib/python3.13/site-packages/policyengine_us/system.py", line 220, in __init__
     … +9 lines (ctrl+o to see all)

● Need to convert the Path to a string for Microsimulation:

● Update(build_calibration_matrix.py)
  ⎿  Added 1 line, removed 1 line
      45  # STEP 1: LOAD SIMULATION FROM EXTENDED CPS
      46  # ============================================================================
      47  print(f"\nLoading simulation from {dataset_path}...")
      48 -sim = Microsimulation(dataset=dataset_path)
      48 +sim = Microsimulation(dataset=str(dataset_path))
      49  n_households = len(sim.calculate("household_id", map_to="household").values)
      50  print(f"Loaded {n_households:,} households")
      51

● Bash(~/envs/sep/bin/python build_calibration_matrix.py 2>&1) [timeout: 5m 0s]
  ⎿  2026-01-15 09:35:11,752 - INFO - NumExpr defaulting to 16 threads.
     2026-01-15 09:35:27,099 - INFO - Tracer initialized: 13507 households x 5 geographies
     … +187 lines (ctrl+o to expand)

● Now it works from the script directory:
  - Base dataset: stratified_extended_cps_2023.h5 (13,507 households)
  - Time period: 2023
  - 539 targets: 1 health insurance + 538 SNAP
  - 4 target groups
  - Entity aggregation: Match: True

  The script can now be run interactively:
  cd policyengine_us_data/datasets/cps/local_area_calibration
  python build_calibration_matrix.py

✻ Brewed for 1m 12s

❯ Do you really need SCRIPT_DIR = Path(__file__).resolve().parent in there? I can't run __file__ in IPython. You can import STORAGE_DIR directly, right?

● You're right, let me check if STORAGE_DIR is already available to import:

● Search(pattern: "STORAGE_DIR", path: "~/devl/sep/policyengine-us-data/policyengine_us_data", output_mode: "content")
  ⎿  Found 3 lines (ctrl+o to expand)

● Search(pattern: "storage.*=.*Path", path: "~/devl/sep/policyengine-us-data/policyengine_us_data", output_mode: "content")
  ⎿  Found 0 lines

● Search(pattern: "**/policyengine_us_data/**/constants*.py")
  ⎿  Found 0 files

● Search(pattern: "from policyengine_us_data import", path: "~/devl/sep/policyengine-us-data/policyengine_us_data", output_mode: "content", head_limit: 10)
  ⎿  Found 0 lines

● Search(pattern: "policyengine_us_data\.STORAGE", path: "~/devl/sep/policyengine-us-data", output_mode: "content")
  ⎿  Found 0 lines

● Read(~/devl/sep/policyengine-us-data/policyengine_us_data/__init__.py)
  ⎿  Read 3 lines

● Bash(ls -la /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/storage/*.py 2>/dev/null | head -5)
  ⎿  -rw-rw-r-- 1 baogorek baogorek 780 Jan  6 11:19 /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/storage/download_private_prerequisites.py
     … +7 lines (ctrl+o to expand)

● Read(~/devl/sep/policyengine-us-data/policyengine_us_data/storage/__init__.py)
  ⎿  Read 6 lines

● STORAGE_FOLDER is already defined in policyengine_us_data.storage. Let me update the script:

● Update(build_calibration_matrix.py)
  ⎿  Added 3 lines, removed 7 lines
       3  Generates X_sparse and target vector, prints diagnostics using MatrixTracer.
       4  """
       5  import logging
       6 -from pathlib import Path
       6
       7  logging.basicConfig(
       8      level=logging.INFO,
     ...
      12  import numpy as np
      13  import pandas as pd
      14  from policyengine_us import Microsimulation
      15 +from policyengine_us_data.storage import STORAGE_FOLDER
      16  from sparse_matrix_builder import SparseMatrixBuilder
      17  from matrix_tracer import MatrixTracer
      18  from calibration_utils import create_target_groups
     ...
      20  # CONFIGURATION
      21  # ============================================================================
      23 -# Resolve paths relative to this script's location
      24 -SCRIPT_DIR = Path(__file__).resolve().parent
      25 -STORAGE_DIR = SCRIPT_DIR.parent.parent.parent / "storage"
      26 -
      27 -db_path = STORAGE_DIR / "calibration" / "policy_data.db"
      23 +db_path = STORAGE_FOLDER / "calibration" / "policy_data.db"
      24  db_uri = f"sqlite:///{db_path}"
      25  time_period = 2023
      26
      27  # Base dataset for geo-stacking: stratified extended CPS
      32 -dataset_path = STORAGE_DIR / "stratified_extended_cps_2023.h5"
      28 +dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2023.h5"
      29
      30  cds_to_calibrate = [
      31      "101",  # Alabama CD-1

● Bash(~/envs/sep/bin/python build_calibration_matrix.py 2>&1 | head -30)
  ⎿  2026-01-15 09:45:04,625 - INFO - NumExpr defaulting to 16 threads.
     2026-01-15 09:45:19,327 - INFO - Tracer initialized: 13507 households x 5 geographies
     … +31 lines (ctrl+o to expand)

● Done. Now you can use STORAGE_FOLDER in IPython too:

  from policyengine_us_data.storage import STORAGE_FOLDER
  dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2023.h5"

✻ Sautéed for 1m 11s

❯ Is there an opportunity to create a real test in tests?

● Search(pattern: "**/tests/**/*.py", path: "~/devl/sep/policyengine-us-data")
  ⎿  Found 100 files (ctrl+o to expand)

● Read(~/devl/sep/policyengine-us-data/policyengine_us_data/tests/test_database.py)
  ⎿  Read 87 lines

● Read(~/devl/sep/policyengine-us-data/policyengine_us_data/tests/test_datasets/test_small_enhanced_cps.py)
  ⎿  Read 18 lines

● Yes! I can create a test for the sparse matrix builder. Here's what we could test:

  1. Person-level aggregation preserves totals
  2. Matrix builds with correct shape
  3. Health insurance target is included correctly

● Write(~/devl/sep/policyengine-us-data/policyengine_us_data/tests/test_sparse_matrix_builder.py)
  ⎿  Wrote 61 lines to ../../../tests/test_sparse_matrix_builder.py
        1  import pytest
        2  import numpy as np
        3  from policyengine_us import Microsimulation
        4  from policyengine_us_data.storage import STORAGE_FOLDER
        5  from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import (
        6      SparseMatrixBuilder,
        7  )
        8  @pytest.fixture
     … +51 lines (ctrl+o to expand)

● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && ~/envs/sep/bin/pytest policyengine_us_data/tests/test_sparse_matrix_builder.py -v 2>&1) [timeout: 5m 0s]
  ⎿  ============================= test session starts ==============================
     platform linux -- Python 3.13.6, pytest-8.4.2, pluggy-1.6.0 -- /home/ba…
     … +16 lines (ctrl+o to expand)
  ⎿  Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration

● All 3 tests pass:

  1. test_person_level_aggregation_preserves_totals - Verifies person→household aggregation preserves totals
  2. test_matrix_shape - Verifies matrix has correct dimensions
  3. test_combined_snap_and_health_insurance - Verifies both SNAP and health insurance targets are included

  policyengine_us_data/tests/test_sparse_matrix_builder.py - 3 passed in 12.33s

✻ Cooked for 1m 1s

❯ Cool, I'm surprised we didn't have one before. 
Let's go ahead and update the changelog_entry.yaml, create an issue, format with black, commit, push, and create the PR that links to the issue.

● Search(pattern: "**/changelog_entry.yaml", path: "~/devl/sep/policyengine-us-data")
  ⎿  Found 1 file (ctrl+o to expand)

● Read(~/devl/sep/policyengine-us-data/changelog_entry.yaml)
  ⎿  Read 5 lines

● Update(~/devl/sep/policyengine-us-data/changelog_entry.yaml)
  ⎿  Added 4 lines, removed 3 lines
       1 -- bump: patch
       1 +- bump: minor
       2    changes:
       3 -    fixed:
       4 -      - Versioning workflow checkout for push events
       3 +    added:
       4 +      - Support for health_insurance_premiums_without_medicare_part_b in local area calibration
       5 +      - Test coverage for sparse matrix builder with person-level targets

● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && git remote -v)
  ⎿  origin git@github.com:PolicyEngine/policyengine-us-data.git (fetch)
     origin git@github.com:PolicyEngine/policyengine-us-data.git (push)
  ⎿  Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration

● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && gh issue create \
      --repo PolicyEngine/policyengine-us-data \…)
  ⎿  https://github.com/PolicyEngine/policyengine-us-data/issues/472
  ⎿  Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration

● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && black . -l 79 2>&1)
  ⎿  Skipping .ipynb files as Jupyter dependencies are not installed.
     You can fix this by running ``pip install "black[jupyter]"``
     reformatted /home/baogorek/devl/sep/policyengine-us-data/policyengine_u…
     … +6 lines (ctrl+o to expand)
  ⎿  Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration

● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && git status)
  ⎿  On branch health-insurance-premiums
     Changes not staged for commit:
       (use "git add ..." to update what will be committed)
     … +12 lines (ctrl+o to expand)
  ⎿  Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration

● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && git add changelog_entry.yaml policyengine_us_data/datasets/cps/local_area_calibration/build_calibration_matri…)
  ⎿  On branch health-insurance-premiums
     Changes to be committed:
       (use "git restore --staged ..." to unstage)
     … +9 lines (ctrl+o to expand)
  ⎿  Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration

● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && git commit -m "Add health insurance premiums to local area calibration…)
  ⎿  [health-insurance-premiums 4268676] Add health insurance premiums to local area calibration
      3 files changed, 263 insertions(+), 3 deletions(-)
     … +4 lines (ctrl+o to expand)
  ⎿  Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration

● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && git push -u origin health-insurance-premiums 2>&1)
  ⎿  remote:
     remote: Create a pull request for 'health-insurance-premiums' on GitHub by visiting:
     … +13 lines (ctrl+o to expand)
  ⎿  Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration

● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && gh pr create --title "Add health insurance premiums to local area calibration" --body "## Summary…)
  ⎿  https://github.com/PolicyEngine/policyengine-us-data/pull/473
     Warning: 1 uncommitted change
  ⎿  Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration

● Done! Here's the summary:

  - Issue: https://github.com/PolicyEngine/policyengine-us-data/issues/472
  - PR: https://github.com/PolicyEngine/policyengine-us-data/pull/473

  The PR includes:
  - build_calibration_matrix.py - Runner script for building X_sparse with SNAP + health insurance
  - test_sparse_matrix_builder.py - 3 tests for person-level aggregation
  - changelog_entry.yaml - Minor version bump

✻ Crunched for 1m 51s

❯ You've done a lot of work and discovery in this session. Can you please write up a report about what you've learned? This will be useful for your future self to come back in here and be ready for the next variable, which might not be quite as easy as this one!

● Write(ADDING_CALIBRATION_TARGETS.md)
  ⎿  Wrote 242 lines to ADDING_CALIBRATION_TARGETS.md
        1  # Adding Calibration Targets to Local Area Geo-Stacking
        2  This document summarizes key learnings from adding `health_insurance_premiums_without_medicare_part_b` as a calibration target. Use this as a reference when adding future targets.
        3  ## Key Discovery: No Code Changes Needed for Most Targets
        4  The `sparse_matrix_builder.py` is **already entity-agnostic**. PolicyEngine's `map_to="household"` parameter automatically handles aggregation from any entity level (person, tax_unit, spm_unit) to household level.
        5  ```python
        6  # This line in sparse_matrix_builder.py (line 220-222) handles ALL entity types:
     … +232 lines (ctrl+o to expand)

● Created ADDING_CALIBRATION_TARGETS.md with:

  1. Key Discovery: sparse_matrix_builder.py is already entity-agnostic via map_to="household"
  2. Architecture Overview: File locations, database schema, stratum groups
  3. Step-by-Step Guide: How to add new targets (5 steps)
  4. When Code Changes ARE Needed: Count variables, state-specific calcs, person-level constraints
  5. Junkyard Reference: Key patterns from the 2,400+ line reference implementation
  6. Matrix Structure: Shape, column indexing, MatrixTracer usage
  7. Common Queries: SQL snippets for exploring the database

  Would you like me to add this report to the PR as well?
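  For item 7 (Common Queries), a parameterized sketch of the query verified at the start of this session; it assumes only the `targets` columns that query returned, and `show_target` is a hypothetical name:

  ```python
  import sqlite3

  def show_target(db_path: str, variable: str) -> None:
      """Print one calibration target row from policy_data.db."""
      conn = sqlite3.connect(db_path)
      try:
          cursor = conn.cursor()
          cursor.execute(
              "SELECT target_id, variable, value, period, stratum_id, active "
              "FROM targets WHERE variable = ?",
              (variable,),
          )
          print(cursor.fetchone())
      finally:
          conn.close()

  # show_target(
  #     "policyengine_us_data/storage/calibration/policy_data.db",
  #     "health_insurance_premiums_without_medicare_part_b",
  # )
  ```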
+ +✻ Sautéed for 53s + + + diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py index 3fcd65be7..fcd23de19 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py @@ -3,6 +3,7 @@ Prototype script for weight calibration using the l0-python package. """ +import argparse import logging from datetime import datetime from pathlib import Path @@ -11,6 +12,17 @@ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ) +parser = argparse.ArgumentParser(description="Fit calibration weights") +parser.add_argument( + "--device", default="cpu", choices=["cpu", "cuda"], + help="Device for training (cpu or cuda)" +) +parser.add_argument( + "--epochs", type=int, default=100, + help="Total epochs for training" +) +args = parser.parse_args() + import numpy as np from policyengine_us import Microsimulation from policyengine_us_data.storage import STORAGE_FOLDER @@ -28,8 +40,8 @@ # ============================================================================ # CONFIGURATION # ============================================================================ -DEVICE = "cpu" # Use "cuda" for GPU runs on Modal -TOTAL_EPOCHS = 100 # Reduced for testing; use 5000+ for production +DEVICE = args.device +TOTAL_EPOCHS = args.epochs EPOCHS_PER_CHUNK = 50 # Hyperparameters @@ -182,6 +194,7 @@ output_path = output_dir / f"calibration_weights_{timestamp}.npy" np.save(output_path, w) print(f"\nWeights saved to: {output_path}") +print(f"OUTPUT_PATH:{output_path}") # ============================================================================ # STEP 6: VERIFY PREDICTIONS From 2fb32a9fc5912f37a75bdcdc0914f7123b1a5793 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Fri, 16 Jan 2026 20:46:47 -0500 Subject: [PATCH 05/19] Remove --locked flag from uv sync for flexibility --- modal_app/fit_weights.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modal_app/fit_weights.py b/modal_app/fit_weights.py index f07b976f6..b8b8147e3 100644 --- a/modal_app/fit_weights.py +++ b/modal_app/fit_weights.py @@ -28,7 +28,7 @@ def fit_weights(branch: str = "main", epochs: int = 200) -> bytes: subprocess.run(["git", "clone", "-b", branch, REPO_URL], check=True) os.chdir("policyengine-us-data") - subprocess.run(["uv", "sync", "--locked", "--extra", "l0"], check=True) + subprocess.run(["uv", "sync", "--extra", "l0"], check=True) script_path = ( "policyengine_us_data/datasets/cps/" From 44468ff34f7ba4a1494c05a90f86560cc1bb310e Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Fri, 16 Jan 2026 20:56:46 -0500 Subject: [PATCH 06/19] Show script errors in Modal output --- modal_app/fit_weights.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modal_app/fit_weights.py b/modal_app/fit_weights.py index b8b8147e3..4456d4f62 100644 --- a/modal_app/fit_weights.py +++ b/modal_app/fit_weights.py @@ -40,7 +40,6 @@ def fit_weights(branch: str = "main", epochs: int = 200) -> bytes: "--device", "cuda", "--epochs", str(epochs), ], - check=True, capture_output=True, text=True, env=os.environ.copy(), @@ -48,6 +47,8 @@ def fit_weights(branch: str = "main", epochs: int = 200) -> bytes: print(result.stdout) if result.stderr: print("STDERR:", result.stderr) + if result.returncode != 0: + raise RuntimeError(f"Script failed with code 
{result.returncode}") output_line = [ line for line in result.stdout.split('\n') if 'OUTPUT_PATH:' in line From d6b9b7b19d509ea4fd6eb01068c787950fdb48b5 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Fri, 16 Jan 2026 21:11:05 -0500 Subject: [PATCH 07/19] Add data download and path CLI args for Modal GPU runs --- modal_app/fit_weights.py | 31 +++++++++++++++++++ .../fit_calibration_weights.py | 21 +++++++++++-- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/modal_app/fit_weights.py b/modal_app/fit_weights.py index 4456d4f62..6a386c8cd 100644 --- a/modal_app/fit_weights.py +++ b/modal_app/fit_weights.py @@ -30,6 +30,35 @@ def fit_weights(branch: str = "main", epochs: int = 200) -> bytes: subprocess.run(["uv", "sync", "--extra", "l0"], check=True) + # Download calibration inputs from HuggingFace + print("Downloading calibration inputs from HuggingFace...") + download_result = subprocess.run( + [ + "uv", "run", "python", "-c", + "from policyengine_us_data.utils.huggingface import " + "download_calibration_inputs; " + "paths = download_calibration_inputs('/root/calibration_data'); " + "print(f\"DB: {paths['database']}\"); " + "print(f\"DATASET: {paths['dataset']}\")" + ], + capture_output=True, + text=True, + env=os.environ.copy(), + ) + print(download_result.stdout) + if download_result.stderr: + print("Download STDERR:", download_result.stderr) + if download_result.returncode != 0: + raise RuntimeError(f"Download failed: {download_result.returncode}") + + # Parse paths from output + db_path = dataset_path = None + for line in download_result.stdout.split('\n'): + if line.startswith('DB:'): + db_path = line.split('DB:')[1].strip() + elif line.startswith('DATASET:'): + dataset_path = line.split('DATASET:')[1].strip() + script_path = ( "policyengine_us_data/datasets/cps/" "local_area_calibration/fit_calibration_weights.py" @@ -39,6 +68,8 @@ def fit_weights(branch: str = "main", epochs: int = 200) -> bytes: "uv", "run", "python", script_path, "--device", "cuda", "--epochs", str(epochs), + "--db-path", db_path, + "--dataset-path", dataset_path, ], capture_output=True, text=True, diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py index fcd23de19..f6521e23a 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py @@ -21,6 +21,14 @@ "--epochs", type=int, default=100, help="Total epochs for training" ) +parser.add_argument( + "--db-path", default=None, + help="Path to policy_data.db (default: STORAGE_FOLDER/calibration/policy_data.db)" +) +parser.add_argument( + "--dataset-path", default=None, + help="Path to stratified CPS h5 file" +) args = parser.parse_args() import numpy as np @@ -56,10 +64,19 @@ LEARNING_RATE = 0.15 # Data paths -db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" +if args.db_path: + db_path = Path(args.db_path) +else: + db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" db_uri = f"sqlite:///{db_path}" -dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2023.h5" + +if args.dataset_path: + dataset_path = Path(args.dataset_path) +else: + dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2023.h5" + output_dir = STORAGE_FOLDER / "calibration" +output_dir.mkdir(parents=True, exist_ok=True) time_period = 2023 # Get all CDs from database From 
d409a6dfeff27d55654f2795569a2d87c2769f4d Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Sat, 17 Jan 2026 11:25:39 -0500 Subject: [PATCH 08/19] Switch to A100-40GB for better memory bandwidth --- modal_app/fit_weights.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modal_app/fit_weights.py b/modal_app/fit_weights.py index 6a386c8cd..eae020667 100644 --- a/modal_app/fit_weights.py +++ b/modal_app/fit_weights.py @@ -20,7 +20,7 @@ secrets=[hf_secret], memory=32768, cpu=4.0, - gpu="T4", + gpu="A100-40GB", timeout=14400, ) def fit_weights(branch: str = "main", epochs: int = 200) -> bytes: From 1cbd87c90b4e4cc2468279ae59cacf4fd9175825 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Sat, 17 Jan 2026 11:38:15 -0500 Subject: [PATCH 09/19] Try A100-80GB for more memory bandwidth --- modal_app/fit_weights.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modal_app/fit_weights.py b/modal_app/fit_weights.py index eae020667..c42d30186 100644 --- a/modal_app/fit_weights.py +++ b/modal_app/fit_weights.py @@ -20,7 +20,7 @@ secrets=[hf_secret], memory=32768, cpu=4.0, - gpu="A100-40GB", + gpu="A100-80GB", timeout=14400, ) def fit_weights(branch: str = "main", epochs: int = 200) -> bytes: From cc4ae5a943418913849a0936caa537622d475b20 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Sat, 17 Jan 2026 12:09:55 -0500 Subject: [PATCH 10/19] Add GPU selection flag and document benchmarks --- modal_app/fit_weights.py | 73 ++++++++++++++++--- .../ADDING_CALIBRATION_TARGETS.md | 49 +++++++++++++ 2 files changed, 110 insertions(+), 12 deletions(-) diff --git a/modal_app/fit_weights.py b/modal_app/fit_weights.py index c42d30186..842223a1c 100644 --- a/modal_app/fit_weights.py +++ b/modal_app/fit_weights.py @@ -15,22 +15,14 @@ REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git" -@app.function( - image=image, - secrets=[hf_secret], - memory=32768, - cpu=4.0, - gpu="A100-80GB", - timeout=14400, -) -def fit_weights(branch: str = "main", epochs: int = 200) -> bytes: +def _fit_weights_impl(branch: str, epochs: int) -> bytes: + """Shared implementation for weight fitting.""" os.chdir("/root") subprocess.run(["git", "clone", "-b", branch, REPO_URL], check=True) os.chdir("policyengine-us-data") subprocess.run(["uv", "sync", "--extra", "l0"], check=True) - # Download calibration inputs from HuggingFace print("Downloading calibration inputs from HuggingFace...") download_result = subprocess.run( [ @@ -51,7 +43,6 @@ def fit_weights(branch: str = "main", epochs: int = 200) -> bytes: if download_result.returncode != 0: raise RuntimeError(f"Download failed: {download_result.returncode}") - # Parse paths from output db_path = dataset_path = None for line in download_result.stdout.split('\n'): if line.startswith('DB:'): @@ -90,13 +81,71 @@ def fit_weights(branch: str = "main", epochs: int = 200) -> bytes: return f.read() +@app.function( + image=image, secrets=[hf_secret], memory=32768, cpu=4.0, + gpu="T4", timeout=14400, +) +def fit_weights_t4(branch: str = "main", epochs: int = 200) -> bytes: + return _fit_weights_impl(branch, epochs) + + +@app.function( + image=image, secrets=[hf_secret], memory=32768, cpu=4.0, + gpu="A10", timeout=14400, +) +def fit_weights_a10(branch: str = "main", epochs: int = 200) -> bytes: + return _fit_weights_impl(branch, epochs) + + +@app.function( + image=image, secrets=[hf_secret], memory=32768, cpu=4.0, + gpu="A100-40GB", timeout=14400, +) +def fit_weights_a100_40(branch: str = "main", epochs: int = 200) -> bytes: + 
return _fit_weights_impl(branch, epochs) + + +@app.function( + image=image, secrets=[hf_secret], memory=32768, cpu=4.0, + gpu="A100-80GB", timeout=14400, +) +def fit_weights_a100_80(branch: str = "main", epochs: int = 200) -> bytes: + return _fit_weights_impl(branch, epochs) + + +@app.function( + image=image, secrets=[hf_secret], memory=32768, cpu=4.0, + gpu="H100", timeout=14400, +) +def fit_weights_h100(branch: str = "main", epochs: int = 200) -> bytes: + return _fit_weights_impl(branch, epochs) + + +GPU_FUNCTIONS = { + "T4": fit_weights_t4, + "A10": fit_weights_a10, + "A100-40GB": fit_weights_a100_40, + "A100-80GB": fit_weights_a100_80, + "H100": fit_weights_h100, +} + + @app.local_entrypoint() def main( branch: str = "main", epochs: int = 200, + gpu: str = "T4", output: str = "calibration_weights.npy" ): - weights_bytes = fit_weights.remote(branch=branch, epochs=epochs) + if gpu not in GPU_FUNCTIONS: + raise ValueError( + f"Unknown GPU: {gpu}. Choose from: {list(GPU_FUNCTIONS.keys())}" + ) + + print(f"Running with GPU: {gpu}, epochs: {epochs}, branch: {branch}") + func = GPU_FUNCTIONS[gpu] + weights_bytes = func.remote(branch=branch, epochs=epochs) + with open(output, 'wb') as f: f.write(weights_bytes) print(f"Weights saved to: {output}") diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/ADDING_CALIBRATION_TARGETS.md b/policyengine_us_data/datasets/cps/local_area_calibration/ADDING_CALIBRATION_TARGETS.md index 1b30df41f..59d486f98 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/ADDING_CALIBRATION_TARGETS.md +++ b/policyengine_us_data/datasets/cps/local_area_calibration/ADDING_CALIBRATION_TARGETS.md @@ -239,3 +239,52 @@ For most new targets: 3. Run and verify with `MatrixTracer` No code changes to `sparse_matrix_builder.py` needed unless you have special aggregation or constraint requirements. + +## Running Weight Calibration on Modal (GPU) + +The `fit_calibration_weights.py` script can be run on Modal with GPU acceleration using `modal_app/fit_weights.py`. + +### Basic Usage + +```bash +# Default: T4 GPU, 200 epochs +modal run modal_app/fit_weights.py --branch main --epochs 200 + +# Specify GPU type +modal run modal_app/fit_weights.py --branch main --epochs 2000 --gpu A100-40GB +``` + +### GPU Benchmarks (200 epochs, 2 target groups, Jan 2026) + +| GPU | Time | Cost | Notes | +|-----|------|------|-------| +| T4 | 16m 4s | $0.16 | Best for small test runs | +| A100-40GB | 9m 5s | $0.32 | ~44% faster | +| A100-80GB | 10m 28s | $0.44 | Slower than 40GB (variance?) | + +### Key Findings + +1. **Memory bandwidth matters for sparse operations**: The P100 (not available on Modal) outperforms T4 by ~2x on Kaggle due to HBM2 memory (~732 GB/s) vs GDDR6 (~320 GB/s). + +2. **Significant overhead at low epochs**: With only 200 epochs, much of the runtime is fixed overhead: + - Git clone and `uv sync` (~2-3 min) + - HuggingFace data download (~1 min) + - Loading Microsimulation and building sparse matrix (~3-4 min, CPU-bound) + +3. **GPU choice depends on epoch count**: + - **< 500 epochs**: Use T4 (cheapest, overhead dominates) + - **500-2000 epochs**: A100-40GB may break even + - **> 2000 epochs**: A100 likely more cost-effective as training dominates + +4. 
**Available Modal GPUs** (by memory bandwidth): + - T4: 320 GB/s, $0.000164/sec + - L4: 300 GB/s, $0.000222/sec + - A10: 600 GB/s, $0.000306/sec + - L40S: 864 GB/s, $0.000542/sec + - A100-40GB: 1,555 GB/s, $0.000583/sec + - A100-80GB: 2,039 GB/s, $0.000694/sec + - H100: 3,350 GB/s, $0.001097/sec + +### Output + +Weights are saved locally to `calibration_weights.npy` (configurable via `--output` flag). From c050fe84598dc0e82df46174550601f5a573b39e Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Tue, 20 Jan 2026 18:26:17 -0500 Subject: [PATCH 11/19] Add calibration log output to weight fitting Replace sparsity_log with calibration_log that tracks per-target performance (estimate, target, error metrics) across epochs. Update Modal runner to return both weights and log files. Co-Authored-By: Claude Haiku 4.5 --- modal_app/fit_weights.py | 44 +++++++++++++------ .../fit_calibration_weights.py | 42 ++++++++++-------- 2 files changed, 53 insertions(+), 33 deletions(-) diff --git a/modal_app/fit_weights.py b/modal_app/fit_weights.py index 842223a1c..76646e478 100644 --- a/modal_app/fit_weights.py +++ b/modal_app/fit_weights.py @@ -15,7 +15,7 @@ REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git" -def _fit_weights_impl(branch: str, epochs: int) -> bytes: +def _fit_weights_impl(branch: str, epochs: int) -> dict: """Shared implementation for weight fitting.""" os.chdir("/root") subprocess.run(["git", "clone", "-b", branch, REPO_URL], check=True) @@ -72,20 +72,30 @@ def _fit_weights_impl(branch: str, epochs: int) -> bytes: if result.returncode != 0: raise RuntimeError(f"Script failed with code {result.returncode}") - output_line = [ - line for line in result.stdout.split('\n') if 'OUTPUT_PATH:' in line - ][0] - output_path = output_line.split('OUTPUT_PATH:')[1].strip() + output_path = None + log_path = None + for line in result.stdout.split('\n'): + if 'OUTPUT_PATH:' in line: + output_path = line.split('OUTPUT_PATH:')[1].strip() + elif 'LOG_PATH:' in line: + log_path = line.split('LOG_PATH:')[1].strip() with open(output_path, 'rb') as f: - return f.read() + weights_bytes = f.read() + + log_bytes = None + if log_path: + with open(log_path, 'rb') as f: + log_bytes = f.read() + + return {"weights": weights_bytes, "log": log_bytes} @app.function( image=image, secrets=[hf_secret], memory=32768, cpu=4.0, gpu="T4", timeout=14400, ) -def fit_weights_t4(branch: str = "main", epochs: int = 200) -> bytes: +def fit_weights_t4(branch: str = "main", epochs: int = 200) -> dict: return _fit_weights_impl(branch, epochs) @@ -93,7 +103,7 @@ def fit_weights_t4(branch: str = "main", epochs: int = 200) -> bytes: image=image, secrets=[hf_secret], memory=32768, cpu=4.0, gpu="A10", timeout=14400, ) -def fit_weights_a10(branch: str = "main", epochs: int = 200) -> bytes: +def fit_weights_a10(branch: str = "main", epochs: int = 200) -> dict: return _fit_weights_impl(branch, epochs) @@ -101,7 +111,7 @@ def fit_weights_a10(branch: str = "main", epochs: int = 200) -> bytes: image=image, secrets=[hf_secret], memory=32768, cpu=4.0, gpu="A100-40GB", timeout=14400, ) -def fit_weights_a100_40(branch: str = "main", epochs: int = 200) -> bytes: +def fit_weights_a100_40(branch: str = "main", epochs: int = 200) -> dict: return _fit_weights_impl(branch, epochs) @@ -109,7 +119,7 @@ def fit_weights_a100_40(branch: str = "main", epochs: int = 200) -> bytes: image=image, secrets=[hf_secret], memory=32768, cpu=4.0, gpu="A100-80GB", timeout=14400, ) -def fit_weights_a100_80(branch: str = "main", epochs: int = 200) -> 
bytes: +def fit_weights_a100_80(branch: str = "main", epochs: int = 200) -> dict: return _fit_weights_impl(branch, epochs) @@ -117,7 +127,7 @@ def fit_weights_a100_80(branch: str = "main", epochs: int = 200) -> bytes: image=image, secrets=[hf_secret], memory=32768, cpu=4.0, gpu="H100", timeout=14400, ) -def fit_weights_h100(branch: str = "main", epochs: int = 200) -> bytes: +def fit_weights_h100(branch: str = "main", epochs: int = 200) -> dict: return _fit_weights_impl(branch, epochs) @@ -135,7 +145,8 @@ def main( branch: str = "main", epochs: int = 200, gpu: str = "T4", - output: str = "calibration_weights.npy" + output: str = "calibration_weights.npy", + log_output: str = "calibration_log.csv" ): if gpu not in GPU_FUNCTIONS: raise ValueError( @@ -144,8 +155,13 @@ def main( print(f"Running with GPU: {gpu}, epochs: {epochs}, branch: {branch}") func = GPU_FUNCTIONS[gpu] - weights_bytes = func.remote(branch=branch, epochs=epochs) + result = func.remote(branch=branch, epochs=epochs) with open(output, 'wb') as f: - f.write(weights_bytes) + f.write(result["weights"]) print(f"Weights saved to: {output}") + + if result["log"]: + with open(log_output, 'wb') as f: + f.write(result["log"]) + print(f"Calibration log saved to: {log_output}") diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py index f6521e23a..3e4688734 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py @@ -32,12 +32,14 @@ args = parser.parse_args() import numpy as np +import pandas as pd from policyengine_us import Microsimulation from policyengine_us_data.storage import STORAGE_FOLDER from sparse_matrix_builder import SparseMatrixBuilder from calibration_utils import get_all_cds_from_database try: + import torch from l0.calibration import SparseCalibrationWeights except ImportError: raise ImportError( @@ -158,7 +160,7 @@ # STEP 4: TRAIN IN CHUNKS # ============================================================================ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") -sparsity_log = [] +calibration_log = pd.DataFrame() for chunk_start in range(0, TOTAL_EPOCHS, EPOCHS_PER_CHUNK): chunk_epochs = min(EPOCHS_PER_CHUNK, TOTAL_EPOCHS - chunk_start) @@ -179,28 +181,25 @@ verbose_freq=chunk_epochs, ) - active_info = model.get_active_weights() - active_count = active_info["count"] - total_count = X_sparse.shape[1] - sparsity_pct = 100 * (1 - active_count / total_count) - - sparsity_log.append( - { - "epoch": current_epoch, - "active_weights": active_count, - "total_weights": total_count, - "sparsity_pct": sparsity_pct, - } - ) - - print(f" Active weights: {active_count:,} / {total_count:,}") - print(f" Sparsity: {sparsity_pct:.2f}%") + with torch.no_grad(): + predictions = model.predict(X_sparse).cpu().numpy() + + chunk_df = pd.DataFrame({ + "target_name": target_names, + "estimate": predictions, + "target": targets, + }) + chunk_df["epoch"] = current_epoch + chunk_df["error"] = chunk_df.estimate - chunk_df.target + chunk_df["rel_error"] = chunk_df.error / chunk_df.target + chunk_df["abs_error"] = chunk_df.error.abs() + chunk_df["rel_abs_error"] = chunk_df.rel_error.abs() + chunk_df["loss"] = chunk_df.rel_abs_error ** 2 + calibration_log = pd.concat([calibration_log, chunk_df], ignore_index=True) # 
============================================================
# STEP 5: EXTRACT AND SAVE WEIGHTS
# ============================================================
-import torch
-
 with torch.no_grad():
     w = model.get_weights(deterministic=True).cpu().numpy()
@@ -213,6 +212,11 @@
 print(f"\nWeights saved to: {output_path}")
 print(f"OUTPUT_PATH:{output_path}")
 
+log_path = output_dir / f"calibration_log_{timestamp}.csv"
+calibration_log.to_csv(log_path, index=False)
+print(f"Calibration log saved to: {log_path}")
+print(f"LOG_PATH:{log_path}")
+
 # ============================================================
 # STEP 6: VERIFY PREDICTIONS
 # ============================================================

From 7e17e63d6d927c32560576b50b460df45c27982d Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com"
Date: Wed, 21 Jan 2026 08:19:48 -0500
Subject: [PATCH 12/19] Documentation and parameter changes

---
 modal_app/README.md                                |  62 ++++
 .../create_stratified_cps.py                       | 268 +++++++++---------
 .../fit_calibration_weights.py                     |   4 +-
 3 files changed, 202 insertions(+), 132 deletions(-)
 create mode 100644 modal_app/README.md

diff --git a/modal_app/README.md b/modal_app/README.md
new file mode 100644
index 000000000..b4d9e833a
--- /dev/null
+++ b/modal_app/README.md
@@ -0,0 +1,62 @@
+# Modal App for GPU Weight Fitting
+
+Run calibration weight fitting on Modal's cloud GPUs.
+
+## Prerequisites
+
+- [Modal](https://modal.com/) account and CLI installed (`pip install modal`)
+- `modal token new` to authenticate
+- HuggingFace token stored as a Modal secret named `huggingface-token`
+
+## Usage
+
+```bash
+modal run modal_app/fit_weights.py --branch <branch> --epochs <epochs> --gpu <gpu>
+```
+
+### Arguments
+
+| Argument | Default | Description |
+|----------|---------|-------------|
+| `--branch` | `main` | Git branch to clone and run |
+| `--epochs` | `200` | Number of training epochs |
+| `--gpu` | `T4` | GPU type: `T4`, `A10`, `A100-40GB`, `A100-80GB`, `H100` |
+| `--output` | `calibration_weights.npy` | Local path for weights file |
+| `--log-output` | `calibration_log.csv` | Local path for calibration log |
+
+### Example
+
+```bash
+modal run modal_app/fit_weights.py --branch health-insurance-premiums --epochs 100 --gpu T4
+```
+
+## Output Files
+
+- **calibration_weights.npy** - Fitted household weights
+- **calibration_log.csv** - Per-target performance metrics across epochs (target_name, estimate, target, epoch, error, rel_error, abs_error, rel_abs_error, loss)
+
+## Changing Hyperparameters
+
+Hyperparameters are set in `policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py`:
+
+```python
+BETA = 0.35
+GAMMA = -0.1
+ZETA = 1.1
+INIT_KEEP_PROB = 0.999
+LOG_WEIGHT_JITTER_SD = 0.05
+LOG_ALPHA_JITTER_SD = 0.01
+LAMBDA_L0 = 1e-8
+LAMBDA_L2 = 1e-8
+LEARNING_RATE = 0.15
+```
+
+To change them:
+1. Edit `fit_calibration_weights.py`
+2. Commit and push to your branch
+3. Re-run the Modal command with that branch
+
+## Important Notes
+
+- **Keep your connection open** - Modal must stay connected to download results. Don't close your laptop or let it sleep until you see the local "Weights saved to:" and "Calibration log saved to:" messages.
+- Modal clones from GitHub, so local changes must be pushed before they take effect.
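+
+## Inspecting the Calibration Log
+
+The snippet below is a minimal sketch for checking convergence locally; it assumes only `pandas` and the column names listed under Output Files above, and that the log was saved to the default `--log-output` path:
+
+```python
+import pandas as pd
+
+# Each row is one target's estimate at the end of a training chunk
+log = pd.read_csv("calibration_log.csv")
+
+# Keep only the final epoch's snapshot, then rank targets by
+# relative absolute error to surface the worst-calibrated ones
+final = log[log["epoch"] == log["epoch"].max()]
+worst = final.sort_values("rel_abs_error", ascending=False)
+print(worst[["target_name", "estimate", "target", "rel_abs_error"]].head(10))
+```
+
+Filtering on `epoch` first matters: the log keeps one snapshot per training chunk, so earlier rows reflect partially trained weights.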
diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py b/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py index d9507d174..79fb18b36 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py @@ -1,15 +1,14 @@ """ -Create a stratified sample of extended_cps_2023.h5 that preserves high-income households. -This is needed for congressional district geo-stacking where the full dataset is too large. +Create a stratified sample of extended_cps_2023.h5 that preserves high-income households +while maintaining diversity in lower income strata for poverty analysis. Strategy: -- Keep ALL households above a high income threshold (e.g., top 1%) -- Sample progressively less from lower income strata -- Ensure representation across all income levels +- Keep ALL households in top 1% (for high-income tax analysis) +- Uniform sample from the remaining 99% (preserves low-income diversity) +- Optional: slight oversample of bottom quartile for poverty-focused analysis """ import numpy as np -import pandas as pd import h5py from policyengine_us import Microsimulation from policyengine_core.data.dataset import Dataset @@ -21,16 +20,22 @@ def create_stratified_cps_dataset( target_households=30_000, - high_income_percentile=99, # Keep ALL households above this percentile + high_income_percentile=99, + oversample_poor=False, + seed=None, base_dataset=None, output_path=None, ): """ - Create a stratified sample of CPS data preserving high-income households. + Create a stratified sample of CPS data preserving high-income households + while maintaining low-income diversity for poverty analysis. 
Args: target_households: Target number of households in output (approximate) - high_income_percentile: Keep ALL households above this AGI percentile + high_income_percentile: Keep ALL households above this AGI percentile (e.g., 99 or 99.5) + oversample_poor: If True, boost sampling rate for bottom 25% by 1.5x + seed: Random seed for reproducibility (default: None for random) + base_dataset: Path to source h5 file (default: extended_cps_2023.h5) output_path: Where to save the stratified h5 file """ print("\n" + "=" * 70) @@ -57,115 +62,108 @@ def create_stratified_cps_dataset( print(f"Target dataset: {target_households:,} households") print(f"Reduction ratio: {target_households/n_households_orig:.1%}") - # Calculate AGI percentiles - print("\nAnalyzing income distribution...") - percentiles = [0, 25, 50, 75, 90, 95, 99, 99.5, 99.9, 100] - agi_percentiles = np.percentile(agi, percentiles) - - print("AGI Percentiles:") - for p, val in zip(percentiles, agi_percentiles): - print(f" {p:5.1f}%: ${val:,.0f}") + # Show income distribution + print("\nAGI Percentiles (original):") + for p in [0, 25, 50, 75, 90, 95, 99, 99.5, 99.9, 100]: + val = np.percentile(agi, p) + print(f" {p:5.1f}%: ${val:>12,.0f}") - # Define sampling strategy - # Keep ALL high earners, sample progressively less from lower strata + # Define strata thresholds high_income_threshold = np.percentile(agi, high_income_percentile) - print( - f"\nHigh-income threshold (top {100-high_income_percentile}%): ${high_income_threshold:,.0f}" - ) - - # Create strata with sampling rates - strata = [ - (99.9, 100, 1.00), # Top 0.1% - keep ALL - (99.5, 99.9, 1.00), # 99.5-99.9% - keep ALL - (99, 99.5, 1.00), # 99-99.5% - keep ALL - (95, 99, 0.80), # 95-99% - keep 80% - (90, 95, 0.60), # 90-95% - keep 60% - (75, 90, 0.40), # 75-90% - keep 40% - (50, 75, 0.25), # 50-75% - keep 25% - (25, 50, 0.15), # 25-50% - keep 15% - (0, 25, 0.10), # Bottom 25% - keep 10% - ] - - # Adjust sampling rates to hit target - print("\nInitial sampling strategy:") - expected_count = 0 - for low_p, high_p, rate in strata: - low_val = np.percentile(agi, low_p) if low_p > 0 else -np.inf - high_val = np.percentile(agi, high_p) if high_p < 100 else np.inf - in_stratum = np.sum((agi > low_val) & (agi <= high_val)) - expected = int(in_stratum * rate) - expected_count += expected - print( - f" {low_p:5.1f}-{high_p:5.1f}%: {in_stratum:6,} households x {rate:.0%} = {expected:6,}" + bottom_25_pct_threshold = np.percentile(agi, 25) + + # Count households in each stratum + n_top = np.sum(agi >= high_income_threshold) + n_bottom_25 = np.sum(agi < bottom_25_pct_threshold) + n_middle = n_households_orig - n_top - n_bottom_25 + + print(f"\nStratum sizes:") + print(f" Top {100 - high_income_percentile}% (AGI >= ${high_income_threshold:,.0f}): {n_top:,}") + print(f" Middle 25-{high_income_percentile}%: {n_middle:,}") + print(f" Bottom 25% (AGI < ${bottom_25_pct_threshold:,.0f}): {n_bottom_25:,}") + + # Calculate sampling rates + # Keep ALL top earners, distribute remaining quota between middle and bottom + remaining_quota = target_households - n_top + if remaining_quota <= 0: + raise ValueError( + f"Target ({target_households:,}) is less than top {100-high_income_percentile}% " + f"count ({n_top:,}). Increase target_households." 
) - print(f"Expected total: {expected_count:,} households") - - # Adjust rates if needed - if expected_count > target_households * 1.1: # Allow 10% overage - adjustment = target_households / expected_count - print( - f"\nAdjusting rates by factor of {adjustment:.2f} to meet target..." - ) - - # Never reduce the top percentiles - strata_adjusted = [] - for low_p, high_p, rate in strata: - if high_p >= 99: # Never reduce top 1% - strata_adjusted.append((low_p, high_p, rate)) - else: - strata_adjusted.append( - (low_p, high_p, min(1.0, rate * adjustment)) - ) - strata = strata_adjusted - - # Select households based on strata + if oversample_poor: + # Give bottom 25% a 1.5x boost relative to middle + r_middle = remaining_quota / (1.5 * n_bottom_25 + n_middle) + r_bottom = 1.5 * r_middle + r_middle = min(1.0, r_middle) + r_bottom = min(1.0, r_bottom) + else: + # Uniform sampling for the rest + r_middle = remaining_quota / (n_bottom_25 + n_middle) + r_bottom = r_middle + r_middle = min(1.0, r_middle) + r_bottom = min(1.0, r_bottom) + + print(f"\nSampling rates:") + print(f" Top {100 - high_income_percentile}%: 100%") + print(f" Middle 25-{high_income_percentile}%: {r_middle:.1%}") + print(f" Bottom 25%: {r_bottom:.1%}") + + # Expected counts + expected_top = n_top + expected_middle = int(n_middle * r_middle) + expected_bottom = int(n_bottom_25 * r_bottom) + expected_total = expected_top + expected_middle + expected_bottom + + print(f"\nExpected selection:") + print(f" Top {100 - high_income_percentile}%: {expected_top:,}") + print(f" Middle 25-{high_income_percentile}%: {expected_middle:,}") + print(f" Bottom 25%: {expected_bottom:,}") + print(f" Total: {expected_total:,}") + + # Select households print("\nSelecting households...") + if seed is not None: + np.random.seed(seed) + print(f" Using random seed: {seed}") selected_mask = np.zeros(n_households_orig, dtype=bool) - for low_p, high_p, rate in strata: - low_val = np.percentile(agi, low_p) if low_p > 0 else -np.inf - high_val = np.percentile(agi, high_p) if high_p < 100 else np.inf - - in_stratum = (agi > low_val) & (agi <= high_val) - stratum_indices = np.where(in_stratum)[0] - n_in_stratum = len(stratum_indices) - - if rate >= 1.0: - # Keep all - selected_mask[stratum_indices] = True - n_selected = n_in_stratum - else: - # Random sample within stratum - n_to_select = int(n_in_stratum * rate) - if n_to_select > 0: - np.random.seed(42) # For reproducibility - selected_indices = np.random.choice( - stratum_indices, n_to_select, replace=False - ) - selected_mask[selected_indices] = True - n_selected = n_to_select - else: - n_selected = 0 - - print( - f" {low_p:5.1f}-{high_p:5.1f}%: Selected {n_selected:6,} / {n_in_stratum:6,} ({n_selected/max(1,n_in_stratum):.0%})" - ) + # Top earners - keep all + top_mask = agi >= high_income_threshold + selected_mask[top_mask] = True + print(f" Top {100 - high_income_percentile}%: selected {np.sum(top_mask):,}") + + # Bottom 25% + bottom_mask = agi < bottom_25_pct_threshold + bottom_indices = np.where(bottom_mask)[0] + n_select_bottom = int(len(bottom_indices) * r_bottom) + if r_bottom >= 1.0: + selected_mask[bottom_indices] = True + elif n_select_bottom > 0: + selected_bottom = np.random.choice(bottom_indices, n_select_bottom, replace=False) + selected_mask[selected_bottom] = True + else: + print(f" WARNING: Bottom 25% selection rounded to 0 (rate={r_bottom:.4f}, n={len(bottom_indices)})") + print(f" Bottom 25%: selected {np.sum(selected_mask & bottom_mask):,} / {len(bottom_indices):,}") + + # Middle + 
middle_mask = ~top_mask & ~bottom_mask + middle_indices = np.where(middle_mask)[0] + n_select_middle = int(len(middle_indices) * r_middle) + if r_middle >= 1.0: + selected_mask[middle_indices] = True + elif n_select_middle > 0: + selected_middle = np.random.choice(middle_indices, n_select_middle, replace=False) + selected_mask[selected_middle] = True + else: + print(f" WARNING: Middle selection rounded to 0 (rate={r_middle:.4f}, n={len(middle_indices)})") + print(f" Middle 25-{high_income_percentile}%: selected {np.sum(selected_mask & middle_mask):,} / {len(middle_indices):,}") n_selected = np.sum(selected_mask) - print( - f"\nTotal selected: {n_selected:,} households ({n_selected/n_households_orig:.1%} of original)" - ) + print(f"\nTotal selected: {n_selected:,} households ({n_selected/n_households_orig:.1%} of original)") # Verify high earners are preserved - high_earners_mask = agi >= high_income_threshold - n_high_earners = np.sum(high_earners_mask) - n_high_earners_selected = np.sum(selected_mask & high_earners_mask) - print(f"\nHigh earners (>=${high_income_threshold:,.0f}):") - print(f" Original: {n_high_earners:,}") - print( - f" Selected: {n_high_earners_selected:,} ({n_high_earners_selected/n_high_earners:.0%})" - ) + print(f"\nHigh earners (>=${high_income_threshold:,.0f}): {np.sum(selected_mask & top_mask):,} / {n_top:,} (100%)") # Get the selected household IDs selected_household_ids = set(household_ids[selected_mask]) @@ -300,28 +298,38 @@ def create_stratified_cps_dataset( if __name__ == "__main__": import sys - # Parse command line arguments - if len(sys.argv) > 1: - try: - target = int(sys.argv[1]) - print( - f"Creating stratified dataset with target of {target:,} households..." - ) - output_file = create_stratified_cps_dataset( - target_households=target - ) - except ValueError: - print(f"Invalid target households: {sys.argv[1]}") - print("Usage: python create_stratified_cps.py [target_households]") - sys.exit(1) - else: - # Default target - print( - "Creating stratified dataset with default target of 30,000 households..." - ) - output_file = create_stratified_cps_dataset(target_households=30_000) + target = 30_000 + high_pct = 99 + oversample = False + seed = None + + for arg in sys.argv[1:]: + if arg == "--oversample-poor": + oversample = True + elif arg.startswith("--top="): + high_pct = float(arg.split("=")[1]) + elif arg.startswith("--seed="): + seed = int(arg.split("=")[1]) + elif arg.isdigit(): + target = int(arg) + + print(f"Creating stratified dataset:") + print(f" Target households: {target:,}") + print(f" Keep all above: {high_pct}th percentile") + print(f" Oversample poor: {oversample}") + print(f" Seed: {seed if seed is not None else 'random'}") + + output_file = create_stratified_cps_dataset( + target_households=target, + high_income_percentile=high_pct, + oversample_poor=oversample, + seed=seed, + ) print(f"\nDone! 
Created: {output_file}") - print("\nTo test loading:") - print(" from policyengine_us import Microsimulation") - print(f" sim = Microsimulation(dataset='{output_file}')") + print("\nUsage:") + print(" python create_stratified_cps.py [target] [--top=99] [--oversample-poor] [--seed=N]") + print("\nExamples:") + print(" python create_stratified_cps.py 30000") + print(" python create_stratified_cps.py 50000 --top=99.5 --oversample-poor") + print(" python create_stratified_cps.py 30000 --seed=123 # reproducible") diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py index 3e4688734..bcf647ff7 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py @@ -52,7 +52,7 @@ # ============================================================================ DEVICE = args.device TOTAL_EPOCHS = args.epochs -EPOCHS_PER_CHUNK = 50 +EPOCHS_PER_CHUNK = 500 # TODO: need a better way to set this. Remember it can blow up the Vercel app # Hyperparameters BETA = 0.35 @@ -62,7 +62,7 @@ LOG_WEIGHT_JITTER_SD = 0.05 LOG_ALPHA_JITTER_SD = 0.01 LAMBDA_L0 = 1e-8 -LAMBDA_L2 = 1e-8 +LAMBDA_L2 = 1e-12 LEARNING_RATE = 0.15 # Data paths From 268bfbb12d920f6a1b7655fd56450e4043539165 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Wed, 21 Jan 2026 21:54:33 -0500 Subject: [PATCH 13/19] Remove documentation files and update uv.lock Co-Authored-By: Claude Opus 4.5 --- .../ADDING_CALIBRATION_TARGETS.md | 290 --- .../conversation_about_this_repo.txt | 2147 ----------------- uv.lock | 20 +- 3 files changed, 19 insertions(+), 2438 deletions(-) delete mode 100644 policyengine_us_data/datasets/cps/local_area_calibration/ADDING_CALIBRATION_TARGETS.md delete mode 100644 policyengine_us_data/datasets/cps/local_area_calibration/conversation_about_this_repo.txt diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/ADDING_CALIBRATION_TARGETS.md b/policyengine_us_data/datasets/cps/local_area_calibration/ADDING_CALIBRATION_TARGETS.md deleted file mode 100644 index 59d486f98..000000000 --- a/policyengine_us_data/datasets/cps/local_area_calibration/ADDING_CALIBRATION_TARGETS.md +++ /dev/null @@ -1,290 +0,0 @@ -# Adding Calibration Targets to Local Area Geo-Stacking - -This document summarizes key learnings from adding `health_insurance_premiums_without_medicare_part_b` as a calibration target. Use this as a reference when adding future targets. - -## Key Discovery: No Code Changes Needed for Most Targets - -The `sparse_matrix_builder.py` is **already entity-agnostic**. PolicyEngine's `map_to="household"` parameter automatically handles aggregation from any entity level (person, tax_unit, spm_unit) to household level. 
- -```python -# This line in sparse_matrix_builder.py (line 220-222) handles ALL entity types: -target_values = state_sim.calculate( - target["variable"], map_to="household" -).values -``` - -**Verified behavior:** -- Person-level variables (like health_insurance_premiums): automatically summed to household -- SPM-unit variables (like snap): automatically aggregated to household -- Household variables: returned as-is - -## Architecture Overview - -### File Locations - -``` -policyengine_us_data/ -├── storage/ -│ ├── calibration/ -│ │ └── policy_data.db # Target database (SQLite) -│ └── stratified_extended_cps_2023.h5 # Base dataset for calibration -└── datasets/cps/local_area_calibration/ - ├── sparse_matrix_builder.py # Builds X_sparse matrix (GENERIC) - ├── matrix_tracer.py # Diagnostics for debugging matrices - ├── calibration_utils.py # Helper functions - └── build_calibration_matrix.py # Runner script -``` - -### Database Schema (policy_data.db) - -```sql --- Core tables -targets(target_id, variable, period, stratum_id, value, active, ...) -strata(stratum_id, definition_hash, stratum_group_id, ...) -stratum_constraints(stratum_id, constraint_variable, operation, value, ...) -``` - -**Key stratum_group_ids:** -- Group 1: National hardcoded targets (20 variables including health insurance, medicaid, snap national, etc.) -- Group 4: SNAP state/CD targets (538 targets: 51 state snap + 487 household_count) - -### Target Filter Logic - -The `build_matrix()` method uses **OR logic** for filters: - -```python -# Gets SNAP targets OR health insurance target -target_filter={ - "stratum_group_ids": [4], # All SNAP targets - "variables": ["health_insurance_premiums_without_medicare_part_b"], # Specific variable -} -``` - -## How to Add a New Target - -### Step 1: Check if Target Exists in Database - -```python -import sqlite3 -from policyengine_us_data.storage import STORAGE_FOLDER - -conn = sqlite3.connect(STORAGE_FOLDER / "calibration" / "policy_data.db") -cursor = conn.cursor() - -# Find your target -cursor.execute(""" - SELECT t.target_id, t.variable, t.value, t.period, t.stratum_id, - s.stratum_group_id - FROM targets t - JOIN strata s ON t.stratum_id = s.stratum_id - WHERE t.variable = 'your_variable_name' -""") -print(cursor.fetchall()) - -# Check constraints for that stratum -cursor.execute(""" - SELECT * FROM stratum_constraints WHERE stratum_id = -""") -print(cursor.fetchall()) -``` - -### Step 2: Determine Entity Type - -```python -from policyengine_us import Microsimulation - -sim = Microsimulation() -var = sim.tax_benefit_system.variables['your_variable_name'] -print(f"Entity: {var.entity.key}") # person, household, tax_unit, spm_unit, etc. 
-``` - -### Step 3: Verify Aggregation Works - -```python -# For non-household variables, verify totals are preserved -person_total = sim.calculate('your_variable', 2023, map_to='person').values.sum() -household_total = sim.calculate('your_variable', 2023, map_to='household').values.sum() -print(f"Match: {np.isclose(person_total, household_total, rtol=1e-6)}") -``` - -### Step 4: Update the Runner Script - -Edit `build_calibration_matrix.py` to include your new target: - -```python -targets_df, X_sparse, household_id_mapping = builder.build_matrix( - sim, - target_filter={ - "stratum_group_ids": [4], # SNAP - "variables": [ - "health_insurance_premiums_without_medicare_part_b", - "your_new_variable", # Add here - ], - }, -) -``` - -### Step 5: Run and Verify - -```bash -cd policyengine_us_data/datasets/cps/local_area_calibration -python build_calibration_matrix.py -``` - -## When Code Changes ARE Needed - -The current implementation may need modification for: - -1. **Count variables with special semantics**: Variables ending in `_count` might need `.nunique()` instead of `.sum()` for aggregation. The junkyard implementation handles this but our current builder doesn't. - -2. **Variables with state-specific calculations**: SNAP and Medicaid are already handled (state_fips is set before calculation). Other state-dependent variables should work the same way. - -3. **Constraint evaluation at non-household level**: Currently all constraints are evaluated at household level after aggregation. If you need person-level constraint evaluation (e.g., "only count persons with income > X"), the junkyard has this pattern but our builder doesn't. - -## The Junkyard Reference - -Location: `~/devl/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration/metrics_matrix_geo_stacking_sparse.py` - -This 2,400+ line file has extensive logic we intentionally avoided: -- Hard-coded variable names and stratum_group_ids -- Complex entity relationship tracking -- Person-level constraint evaluation with `.any()` aggregation - -**Key pattern from junkyard (if ever needed):** -```python -# Dynamic entity detection -target_entity = sim.tax_benefit_system.variables[target_variable].entity.key - -# Entity relationship DataFrame -entity_rel = pd.DataFrame({ - "person_id": sim.calculate("person_id", map_to="person").values, - "household_id": sim.calculate("household_id", map_to="person").values, - "tax_unit_id": sim.calculate("tax_unit_id", map_to="person").values, - # ... 
other entities -}) - -# For counts: use .nunique() on entity IDs -# For amounts: use .sum() on values -``` - -## Matrix Structure - -The sparse matrix X has shape `(n_targets, n_households × n_cds)`: - -``` -Columns: [CD1_hh0, CD1_hh1, ..., CD1_hhN, CD2_hh0, ..., CDM_hhN] -Rows: One per target (geographic_id + variable combination) - -Column index formula: col_idx = cd_idx * n_households + hh_idx -``` - -Use `MatrixTracer` for debugging: -```python -from matrix_tracer import MatrixTracer - -tracer = MatrixTracer(targets_df, X_sparse, household_id_mapping, cds_to_calibrate, sim) -tracer.print_matrix_structure() -tracer.get_column_info(100) # Info about column 100 -tracer.get_row_info(0) # Info about row 0 (first target) -``` - -## Environment Setup - -```bash -# Use the sep environment for this repo -source ~/envs/sep/bin/activate - -# Run from the local_area_calibration directory -cd ~/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration - -# Run tests -pytest ../../tests/test_sparse_matrix_builder.py -v -``` - -## Common Queries - -### List all target variables -```sql -SELECT DISTINCT variable FROM targets; -``` - -### List all constraint variables -```sql -SELECT DISTINCT constraint_variable FROM stratum_constraints; -``` - -### Find targets by geographic level -```sql --- National targets (no geographic constraints) -SELECT t.* FROM targets t -JOIN strata s ON t.stratum_id = s.stratum_id -WHERE t.stratum_id NOT IN ( - SELECT stratum_id FROM stratum_constraints - WHERE constraint_variable IN ('state_fips', 'congressional_district_geoid') -); - --- State-level targets -SELECT t.* FROM targets t -WHERE t.stratum_id IN ( - SELECT stratum_id FROM stratum_constraints - WHERE constraint_variable = 'state_fips' -); -``` - -## Summary - -For most new targets: -1. Verify target exists in `policy_data.db` -2. Add variable name to the target filter in `build_calibration_matrix.py` -3. Run and verify with `MatrixTracer` - -No code changes to `sparse_matrix_builder.py` needed unless you have special aggregation or constraint requirements. - -## Running Weight Calibration on Modal (GPU) - -The `fit_calibration_weights.py` script can be run on Modal with GPU acceleration using `modal_app/fit_weights.py`. - -### Basic Usage - -```bash -# Default: T4 GPU, 200 epochs -modal run modal_app/fit_weights.py --branch main --epochs 200 - -# Specify GPU type -modal run modal_app/fit_weights.py --branch main --epochs 2000 --gpu A100-40GB -``` - -### GPU Benchmarks (200 epochs, 2 target groups, Jan 2026) - -| GPU | Time | Cost | Notes | -|-----|------|------|-------| -| T4 | 16m 4s | $0.16 | Best for small test runs | -| A100-40GB | 9m 5s | $0.32 | ~44% faster | -| A100-80GB | 10m 28s | $0.44 | Slower than 40GB (variance?) | - -### Key Findings - -1. **Memory bandwidth matters for sparse operations**: The P100 (not available on Modal) outperforms T4 by ~2x on Kaggle due to HBM2 memory (~732 GB/s) vs GDDR6 (~320 GB/s). - -2. **Significant overhead at low epochs**: With only 200 epochs, much of the runtime is fixed overhead: - - Git clone and `uv sync` (~2-3 min) - - HuggingFace data download (~1 min) - - Loading Microsimulation and building sparse matrix (~3-4 min, CPU-bound) - -3. **GPU choice depends on epoch count**: - - **< 500 epochs**: Use T4 (cheapest, overhead dominates) - - **500-2000 epochs**: A100-40GB may break even - - **> 2000 epochs**: A100 likely more cost-effective as training dominates - -4. 
**Available Modal GPUs** (by memory bandwidth):
-   - T4: 320 GB/s, $0.000164/sec
-   - L4: 300 GB/s, $0.000222/sec
-   - A10: 600 GB/s, $0.000306/sec
-   - L40S: 864 GB/s, $0.000542/sec
-   - A100-40GB: 1,555 GB/s, $0.000583/sec
-   - A100-80GB: 2,039 GB/s, $0.000694/sec
-   - H100: 3,350 GB/s, $0.001097/sec
-
-### Output
-
-Weights are saved locally to `calibration_weights.npy` (configurable via `--output` flag).
diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/conversation_about_this_repo.txt b/policyengine_us_data/datasets/cps/local_area_calibration/conversation_about_this_repo.txt
deleted file mode 100644
index 3242ef913..000000000
--- a/policyengine_us_data/datasets/cps/local_area_calibration/conversation_about_this_repo.txt
+++ /dev/null
@@ -1,2147 +0,0 @@
-
- ▐▛███▜▌ Claude Code v2.1.7
-▝▜█████▛▘ Haiku 4.5 · Claude Max
-  ▘▘ ▝▝   ~/…/policyengine_us_data/datasets/cps/local_area_calibration
-
-❯ We need to bring in a new target to our calibration efforts, and this
-  will be manifested in the X matrix that is created from
-  sparse_matrix_builder.py in this directory. We have only tested `snap` so
-  far, and you can see that demonstrated in
-  ~/docs/local_area_calibration_setup.ipynb. But now we want to bring in
-  health_insurance_premiums_without_medicare_part_b. We want to fully
-  understand what kind of variable this is from our target database (used
-  to construct the target vector), which you can find in
-  policyengine_us_data/storage/calibration/policy_data.db and see examples
-  for how to query it at
-  /home/baogorek/devl/code-snippets/reweighting/querying_policy_data_db.md.
-  This variable is defined at a different unit of analysis than `snap`, so
-  you need to be mindful of that. One more thing to be aware of: while we
-  are branched off the mainline in this repo, cloned at
-  ~/devl/sep/policyengine-us-data, there is a "junkyard" clone of this same
-  repo at ~/devl/policyengine-us-data, which contains
-  /home/baogorek/devl/policyengine-us-data/policyengine_us_data/datasets/cp
-  s/local_area_calibration/metrics_matrix_geo_stacking_sparse.py, a massive
-  2,000+ line file that contains logic to handle all of our targets, but
-  it does it in a very idiosyncratic way, hard-coding target names in the
-  module. Our sparse_matrix_builder.py must never do this! We want to use
-  our policy_data.db to handle any target and create that sparse matrix.
-  Due to the size of the matrix creator in the "junkyard", and the
-  existence of two repos, this is a difficult task. Use the uv environment
-  ~/envs/sep for this branch and task, and use ~/envs/pe for the
-  "junkyard". Plan out the addition of this new target, and Ultrathink!
-
-● I'll help you plan the addition of the health_insurance_premiums_without_medicare_part_b target to your calibration system. This is a complex task that requires understanding multiple components across two repositories.
-
-  Let me start by exploring the codebase to understand the current implementation and requirements.
-
-● 3 Explore agents finished (ctrl+o to expand)
-  ├─ Understand current sparse_matrix_builder implementation · 17 tool uses ·
-  27.6k tokens
-  │ ⎿  Done
-  ├─ Understand policy_data.db structure · 40 tool uses · 40.6k tokens
-  │ ⎿  Done
-  └─ Explore junkyard implementation · 40 tool uses · 52.5k tokens
-    ⎿  Done
-
-● Excellent! I've gathered comprehensive information about the current implementation, database structure, and junkyard patterns. Now let me launch a Plan agent to design the implementation approach.
- -● Plan(Design health insurance premiums integration) - ⎿  Done (36 tool uses · 46.4k tokens · 4m 36s) - - -● Let me verify the Plan agent's findings by reading the key files directly. - -● Read(sparse_matrix_builder.py) - ⎿  Read 239 lines - -● Read(~/docs/local_area_calibration_setup.ipynb) - ⎿  Error reading file - -● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && ~/envs/sep/bin/python - -c " - import sqlite3…) - ⎿ === Health Insurance Premium Targets === - (5, 'health_insurance_premiums_without_medicare_part_b', - 385000000000.0 - , 2024, 1, 1) - … +3 lines (ctrl+o to expand) - ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po - licyengine_us_data/datasets/cps/local_area_calibration - -● Search(pattern: "~/docs/*.ipynb") - ⎿  Found 0 files - -● Read(~/devl/code-snippets/reweighting/querying_policy_data_db.md) - ⎿  Read 96 lines - -● Bash(find /home/baogorek/docs -name "*.ipynb" -type f 2>/dev/null | head -10) - ⎿  (No content) - -● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && ~/envs/sep/bin/python - -c " - import sqlite3…) - ⎿  === Stratum Group Analysis === - - Stratum group 1 (health insurance stratum): - … +56 lines (ctrl+o to expand) - ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po - licyengine_us_data/datasets/cps/local_area_calibration - -● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && ~/envs/sep/bin/python - -c " - from policyengine_us import Microsimulation…) - ⎿  === Entity Aggregation Test === - Person-level array shape: (52826,) - Household-level array shape: (21045,) - … +6 lines (ctrl+o to expand) - ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po - licyengine_us_data/datasets/cps/local_area_calibration - -● Excellent! I've now verified the Plan agent's key finding. Let me create the implementation plan and ask some clarifying questions. - -● Updated plan - ⎿  Wrote 267 lines to - ../../../../../../../.claude/plans/velvet-rolling-dawn.md - 1 # Implementation Plan: Adding Health Insurance Premiums to Calibr - ation - 2 ## Executive Summary - 3 **Surprising Finding**: The current `sparse_matrix_builder.py` im - plementation **already supports person-level targets** without an - y code changes needed. PolicyEngine's `map_to="household"` parame - ter automatically aggregates person-level values to household lev - el while preserving totals. - 4 ## Background - 5 ### Current State - 6 - **Working targets**: SNAP (stratum_group_id=4, spm_unit-level, - 538 targets: 51 state snap amounts + 487 household_count targets) - … +257 lines (ctrl+o to expand) - -● User answered Claude's questions: - ⎿  · Do you want to calibrate using only the existing national health - insurance premium target ($385B), or should state-level targets be - added to the database first? → National target only (Recommended) - · How do you plan to use the health insurance premium target in your - calibration? → Combined with SNAP - - · The existing health insurance target has no geographic constraints - (same target for all CDs). Is this the desired behavior? → Yes, - national is correct - - -● Excellent! Based on your answers, let me finalize the implementation plan. - -● Updated plan - ⎿  /plan to preview - 250 - ✓ Total preservation is guaranteed by PolicyEngine's aggrega - tion logic - 251 - ✓ Generic architecture means extensible to any future entity - types - 252 - 253 -## Next Steps (Post-Approval) - 253 +## User Decisions - 254 - 255 -1. 
**Update usage examples** to show health insurance premium - -calibration - 256 -2. **Test with combined targets** (SNAP + health insurance) to - - verify OR filter logic - 257 -3. **Document target filtering patterns** for future reference - 258 -4. **(Optional) Add state-level health insurance targets** to - -database if geographic variation is needed - 255 +✓ **Target Scope**: Use national target only (existing target_ - +id=5, no database changes) - 256 +✓ **Usage Pattern**: Combined with SNAP (test multi-target cal - +ibration) - 257 +✓ **Geographic Scope**: National aggregate is correct (no stat - +e variation needed) - 258 - 260 -## Questions for User - 259 +## Implementation Steps (Post-Approval) - 260 - 262 -Before finalizing this plan, clarification needed on: - 261 +### Step 1: Verify Database Target - 262 +Confirm health insurance premium target exists and is active: - 263 +```python - 264 +# Using ~/envs/sep environment - 265 +import sqlite3 - 266 +conn = sqlite3.connect('policyengine_us_data/storage/calibrati - +on/policy_data.db') - 267 +cursor = conn.cursor() - 268 +cursor.execute(""" - 269 + SELECT target_id, variable, value, period, stratum_id, act - +ive - 270 + FROM targets - 271 + WHERE variable = 'health_insurance_premiums_without_medica - +re_part_b' - 272 +""") - 273 +print(cursor.fetchone()) - 274 +``` - 275 +Expected: `(5, 'health_insurance_premiums_without_medicare_par - +t_b', 385000000000.0, 2024, 1, 1)` - 276 - 264 -1. **Target Scope**: Do you want ONLY the national health insu - -rance target, or should state-level targets be added to the da - -tabase? - 265 -2. **Usage Pattern**: Will you calibrate using health insuranc - -e alone, or combined with SNAP targets? - 266 -3. **Geographic Constraints**: The current target has no geogr - -aphic constraints (applies to all CDs equally). Is this correc - -t, or should premiums vary by state/CD? - 277 +### Step 2: Test Combined Target Matrix Building - 278 +Create a test script to build matrix with both SNAP and health - + insurance: - 279 + - 280 +```python - 281 +from policyengine_us import Microsimulation - 282 +from policyengine_us_data.datasets.cps.local_area_calibration. 
- +sparse_matrix_builder import SparseMatrixBuilder - 283 + - 284 +# Load simulation - 285 +sim = Microsimulation() - 286 + - 287 +# Create builder - 288 +builder = SparseMatrixBuilder( - 289 + db_uri="sqlite:///policyengine_us_data/storage/calibration - +/policy_data.db", - 290 + time_period=2024, - 291 + cds_to_calibrate=['0101', '0601', '0602'], # Example CDs: - + AL-1, CA-1, CA-2 - 292 + dataset_path=None - 293 +) - 294 + - 295 +# Build matrix with SNAP + health insurance - 296 +targets_df, X_sparse, household_mapping = builder.build_matrix - +( - 297 + sim, - 298 + target_filter={"stratum_group_ids": [1, 4]} # National + - +SNAP targets - 299 +) - 300 + - 301 +print(f"Total targets: {len(targets_df)}") - 302 +print(f"Matrix shape: {X_sparse.shape}") - 303 +print(f"Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * X_s - +parse.shape[1]):.2%}") - 304 + - 305 +# Verify health insurance target is included - 306 +health_ins = targets_df[ - 307 + targets_df["variable"] == "health_insurance_premiums_witho - +ut_medicare_part_b" - 308 +] - 309 +print(f"\nHealth insurance target:") - 310 +print(health_ins[["target_id", "variable", "value", "geographi - +c_id"]]) - 311 +``` - 312 + - 313 +### Step 3: Verify Entity Aggregation - 314 +Confirm person-to-household aggregation works correctly: - 315 + - 316 +```python - 317 +# Verify totals are preserved - 318 +person_total = sim.calculate( - 319 + 'health_insurance_premiums_without_medicare_part_b', - 320 + 2024, - 321 + map_to='person' - 322 +).values.sum() - 323 + - 324 +household_total = sim.calculate( - 325 + 'health_insurance_premiums_without_medicare_part_b', - 326 + 2024, - 327 + map_to='household' - 328 +).values.sum() - 329 + - 330 +print(f"Person-level total: ${person_total:,.0f}") - 331 +print(f"Household-level total: ${household_total:,.0f}") - 332 +print(f"Match: {abs(person_total - household_total) < 1}") - 333 + - 334 +# Check matrix row sum for health insurance - 335 +health_ins_idx = targets_df[ - 336 + targets_df["variable"] == "health_insurance_premiums_witho - +ut_medicare_part_b" - 337 +].index[0] - 338 +matrix_total = X_sparse[health_ins_idx, :].sum() * len(cds_to_ - +calibrate) # Adjust for stacking - 339 +print(f"Matrix row total (adjusted): ${matrix_total:,.0f}") - 340 +``` - 341 + - 342 +### Step 4: Update Documentation - 343 +Add example to `~/docs/local_area_calibration_setup.ipynb` sho - +wing: - 344 +- How to filter for multiple target groups - 345 +- How person-level targets work alongside spm_unit-level targe - +ts - 346 +- Interpretation of the resulting matrix structure - 347 + - 348 +### Step 5: Calibration Integration - 349 +Use the matrix in calibration workflow: - 350 +```python - 351 +# After building matrix with combined targets - 352 +from your_calibration_module import solve_weights - 353 + - 354 +# Solve for weights - 355 +weights = solve_weights(X_sparse, targets_df["value"].values) - 356 + - 357 +# Verify calibration matches targets - 358 +calibrated = X_sparse.T @ weights - 359 +targets = targets_df["value"].values - 360 + - 361 +for idx, row in targets_df.iterrows(): - 362 + print(f"{row['variable']}: target={row['value']:,.0f}, cal - +ibrated={calibrated[idx]:,.0f}") - 363 +``` - 364 + - 365 +## Success Criteria - 366 + - 367 +1. ✓ Matrix builds successfully with `{"stratum_group_ids": [1 - +, 4]}` filter - 368 +2. ✓ Health insurance row appears in targets_df with correct v - +alue ($385B) - 369 +3. 
✓ Matrix shape is (n_targets, n_households × n_cds) where n - +_targets ≈ 558 (20 national + 538 SNAP) - 370 +4. ✓ Person-to-household aggregation preserves totals - 371 +5. ✓ Matrix row for health insurance contains household-aggreg - +ated premium values - 372 +6. ✓ No code changes needed (only usage pattern demonstration) - -● Updated plan - ⎿  /plan to preview - 1 # Implementation Plan: Adding Health Insurance Premiums to Cali - bration - 2 - 3 +## TL;DR - 4 + - 5 +**ZERO CODE CHANGES NEEDED** 🎉 - 6 + - 7 +The current `sparse_matrix_builder.py` already supports person- - +level targets through PolicyEngine's `map_to="household"` param - +eter, which automatically aggregates person-level values to hou - +sehold level. Simply use: - 8 + - 9 +```python - 10 +builder.build_matrix(sim, target_filter={"stratum_group_ids": [ - +1, 4]}) # SNAP + health insurance - 11 +``` - 12 + - 13 +This works because: - 14 +- Health insurance premium is in the database (target_id=5, $38 - +5B) - 15 +- PolicyEngine auto-aggregates person→household (preserves tota - +ls) - 16 +- No hard-coded variable names in the builder - 17 + - 18 ## Executive Summary - 19 - 5 -**Surprising Finding**: The current `sparse_matrix_builder.py` - -implementation **already supports person-level targets** withou - -t any code changes needed. PolicyEngine's `map_to="household"` - -parameter automatically aggregates person-level values to house - -hold level while preserving totals. - 20 +**Key Finding**: The current `sparse_matrix_builder.py` impleme - +ntation **already supports person-level targets** without any c - +ode changes needed. PolicyEngine's `map_to="household"` paramet - +er automatically aggregates person-level values to household le - +vel while preserving totals. - 21 - 22 ## Background - 23 - -● Updated plan - ⎿  /plan to preview - 385 4. ✓ Person-to-household aggregation preserves totals - 386 5. ✓ Matrix row for health insurance contains household-aggreg - ated premium values - 387 6. ✓ No code changes needed (only usage pattern demonstration) - 388 + - 389 +## Quick Reference - 390 + - 391 +### Critical Files - 392 +- **No changes**: `sparse_matrix_builder.py` (already supports - + person-level targets) - 393 +- **Database**: `policyengine_us_data/storage/calibration/poli - +cy_data.db` (contains target_id=5) - 394 +- **Environment**: `~/envs/sep` (for this development branch) - 395 + - 396 +### Key Commands - 397 +```bash - 398 +# Activate environment - 399 +source ~/envs/sep/bin/activate - 400 + - 401 +# Query database for health insurance target - 402 +cd ~/devl/sep/policyengine-us-data - 403 +python -c "import sqlite3; conn = sqlite3.connect('policyengin - +e_us_data/storage/calibration/policy_data.db'); cursor = conn. - +cursor(); cursor.execute('SELECT * FROM targets WHERE variable - + = \"health_insurance_premiums_without_medicare_part_b\"'); pr - +int(cursor.fetchone())" - 404 + - 405 +# Run verification tests - 406 +python test_combined_targets.py # Create this test script per - + Step 2 above - 407 +``` - 408 + - 409 +### Target Filter Examples - 410 +```python - 411 +# SNAP only - 412 +{"stratum_group_ids": [4]} - 413 + - 414 +# Health insurance only - 415 +{"variables": ["health_insurance_premiums_without_medicare_par - +t_b"]} - 416 + - 417 +# Both SNAP and health insurance (recommended) - 418 +{"stratum_group_ids": [1, 4]} - 419 +``` - -● Perfect! I've completed a comprehensive investigation and created a detailed implementation plan. 
The surprising finding is that no code changes are needed - your sparse_matrix_builder.py already supports person-level targets through PolicyEngine's automatic entity aggregation. - - Key discoveries: - 1. Health insurance premiums is person-level (verified via PolicyEngine metadata) - 2. PolicyEngine's map_to="household" automatically aggregates person values to household level - 3. Totals are preserved (verified: $56,465,248 at both person and household levels) - 4. Database already contains the target (target_id=5, $385B, stratum_group_id=1) - 5. Your generic architecture works perfectly for any entity type - - The plan includes: - - Complete verification of entity aggregation - - Step-by-step testing procedures - - Combined SNAP + health insurance usage pattern - - Success criteria and quick reference commands - ⎿ Tool use rejected with user message: Do not do Step 5, rather, please create - a "runner" program that generates X_sparse and the target vector and prints - some diagnostics. You can use the first 100 lines of - /home/baogorek/devl/policyengine-us-data/policyengine_us_data/datasets/cps/l - ocal_area_calibration/calibrate_cds_sparse.py (from the junkyard) as an - example. Remember that you have access to the tracer in this repo as well. - - -● Read(~/devl/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_ - calibration/calibrate_cds_sparse.py) - ⎿  Read 100 lines - -● Search(pattern: "**/tracer*.py") - ⎿  Found 0 files - -● Search(pattern: "**/*tracer*.py") - ⎿  Found 1 file (ctrl+o to expand) - -● Read(matrix_tracer.py) - ⎿  Read 384 lines - -● Updated plan - ⎿  /plan to preview - 360 - How person-level targets work alongside spm_unit-level targe - ts - 361 - Interpretation of the resulting matrix structure - 362 - 363 -### Step 5: Calibration Integration - 364 -Use the matrix in calibration workflow: - 363 +### Step 5: Create Runner Program with Diagnostics - 364 +Create a runner script `test_combined_targets.py` that: - 365 +- Builds X_sparse with SNAP + health insurance targets - 366 +- Extracts target vector - 367 +- Uses MatrixTracer to print diagnostics - 368 + - 369 +Pattern based on first 100 lines of junkyard's `calibrate_cds_ - +sparse.py`: - 370 + - 371 ```python - 366 -# After building matrix with combined targets - 367 -from your_calibration_module import solve_weights - 372 +""" - 373 +Test runner for combined SNAP + health insurance calibration t - +argets. - 374 +Generates X_sparse and target vector, prints diagnostics using - + MatrixTracer. - 375 +""" - 376 +import logging - 377 +from pathlib import Path - 378 - 369 -# Solve for weights - 370 -weights = solve_weights(X_sparse, targets_df["value"].values) - 379 +logging.basicConfig( - 380 + level=logging.INFO, - 381 + format="%(asctime)s - %(levelname)s - %(message)s" - 382 +) - 383 - 372 -# Verify calibration matches targets - 373 -calibrated = X_sparse.T @ weights - 374 -targets = targets_df["value"].values - 384 +import numpy as np - 385 +import pandas as pd - 386 +from policyengine_us import Microsimulation - 387 +from policyengine_us_data.datasets.cps.local_area_calibration. - +sparse_matrix_builder import SparseMatrixBuilder - 388 +from policyengine_us_data.datasets.cps.local_area_calibration. - +matrix_tracer import MatrixTracer - 389 +from policyengine_us_data.datasets.cps.local_area_calibration. 
- +calibration_utils import ( - 390 + create_target_groups, - 391 +) - 392 - 376 -for idx, row in targets_df.iterrows(): - 377 - print(f"{row['variable']}: target={row['value']:,.0f}, cal - -ibrated={calibrated[idx]:,.0f}") - 393 +# ============================================================ - +================ - 394 +# CONFIGURATION - 395 +# ============================================================ - +================ - 396 +db_path = "policyengine_us_data/storage/calibration/policy_dat - +a.db" - 397 +db_uri = f"sqlite:///{db_path}" - 398 +time_period = 2024 - 399 + - 400 +# Test with a few CDs from different states - 401 +cds_to_calibrate = [ - 402 + '0101', # Alabama CD-1 - 403 + '0601', # California CD-1 - 404 + '0602', # California CD-2 - 405 + '3601', # New York CD-1 - 406 + '4801', # Texas CD-1 - 407 +] - 408 + - 409 +print(f"Testing with {len(cds_to_calibrate)} congressional dis - +tricts") - 410 + - 411 +# ============================================================ - +================ - 412 +# STEP 1: LOAD SIMULATION - 413 +# ============================================================ - +================ - 414 +print("\nLoading simulation...") - 415 +sim = Microsimulation() - 416 +n_households = len(sim.calculate("household_id", map_to="house - +hold").values) - 417 +print(f"Loaded {n_households:,} households") - 418 + - 419 +# ============================================================ - +================ - 420 +# STEP 2: BUILD SPARSE MATRIX WITH COMBINED TARGETS - 421 +# ============================================================ - +================ - 422 +print("\nBuilding sparse matrix...") - 423 +builder = SparseMatrixBuilder( - 424 + db_uri=db_uri, - 425 + time_period=time_period, - 426 + cds_to_calibrate=cds_to_calibrate, - 427 + dataset_path=None - 428 +) - 429 + - 430 +# Query both SNAP (stratum_group_id=4) and national targets (s - +tratum_group_id=1) - 431 +targets_df, X_sparse, household_id_mapping = builder.build_mat - +rix( - 432 + sim, - 433 + target_filter={"stratum_group_ids": [1, 4]} - 434 +) - 435 + - 436 +print(f"\nMatrix built successfully:") - 437 +print(f" Shape: {X_sparse.shape}") - 438 +print(f" Targets: {len(targets_df)}") - 439 +print(f" Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * X - +_sparse.shape[1]):.4%}") - 440 + - 441 +# ============================================================ - +================ - 442 +# STEP 3: EXTRACT TARGET VECTOR - 443 +# ============================================================ - +================ - 444 +target_vector = targets_df["value"].values - 445 +print(f"\nTarget vector shape: {target_vector.shape}") - 446 +print(f"Target total: ${target_vector.sum():,.0f}") - 447 + - 448 +# ============================================================ - +================ - 449 +# STEP 4: HEALTH INSURANCE PREMIUM VERIFICATION - 450 +# ============================================================ - +================ - 451 +print("\n" + "="*80) - 452 +print("HEALTH INSURANCE PREMIUM TARGET ANALYSIS") - 453 +print("="*80) - 454 + - 455 +health_ins_targets = targets_df[ - 456 + targets_df["variable"] == "health_insurance_premiums_witho - +ut_medicare_part_b" - 457 +] - 458 + - 459 +if len(health_ins_targets) > 0: - 460 + print(f"\nFound {len(health_ins_targets)} health insurance - + target(s):") - 461 + print(health_ins_targets[["target_id", "variable", "value" - +, "geographic_id", "stratum_group_id"]]) - 462 + - 463 + # Get the row index for health insurance - 464 + health_ins_idx = health_ins_targets.index[0] - 465 
+ health_ins_row = X_sparse[health_ins_idx, :] - 466 + - 467 + print(f"\nMatrix row {health_ins_idx} (health insurance):" - +) - 468 + print(f" Non-zero entries: {health_ins_row.nnz:,}") - 469 + print(f" Row sum: ${health_ins_row.sum():,.0f}") - 470 + print(f" Target value: ${health_ins_targets.iloc[0]['valu - +e']:,.0f}") - 471 + - 472 + # Verify person-to-household aggregation - 473 + person_total = sim.calculate( - 474 + 'health_insurance_premiums_without_medicare_part_b', - 475 + time_period, - 476 + map_to='person' - 477 + ).values.sum() - 478 + - 479 + household_total = sim.calculate( - 480 + 'health_insurance_premiums_without_medicare_part_b', - 481 + time_period, - 482 + map_to='household' - 483 + ).values.sum() - 484 + - 485 + print(f"\nEntity aggregation verification:") - 486 + print(f" Person-level total: ${person_total:,.0f}") - 487 + print(f" Household-level total: ${household_total:,.0f}") - 488 + print(f" Match: {abs(person_total - household_total) < 1} - +") - 489 + - 490 +else: - 491 + print("\nWARNING: No health insurance targets found!") - 492 + - 493 +# ============================================================ - +================ - 494 +# STEP 5: SNAP TARGET SUMMARY - 495 +# ============================================================ - +================ - 496 +print("\n" + "="*80) - 497 +print("SNAP TARGET SUMMARY") - 498 +print("="*80) - 499 + - 500 +snap_targets = targets_df[targets_df["variable"] == "snap"] - 501 +household_count_targets = targets_df[targets_df["variable"] == - + "household_count"] - 502 + - 503 +print(f"\nSNAP benefit targets: {len(snap_targets)}") - 504 +print(f"Household count targets: {len(household_count_targets) - +}") - 505 + - 506 +if len(snap_targets) > 0: - 507 + print(f"\nSNAP total (all states): ${snap_targets['value'] - +.sum():,.0f}") - 508 + print(f"\nSample SNAP targets:") - 509 + print(snap_targets[["target_id", "variable", "value", "geo - +graphic_id"]].head(10)) - 510 + - 511 +# ============================================================ - +================ - 512 +# STEP 6: USE MATRIX TRACER FOR DETAILED DIAGNOSTICS - 513 +# ============================================================ - +================ - 514 +print("\n" + "="*80) - 515 +print("MATRIX TRACER DIAGNOSTICS") - 516 +print("="*80) - 517 + - 518 +tracer = MatrixTracer( - 519 + targets_df=targets_df, - 520 + matrix=X_sparse, - 521 + household_id_mapping=household_id_mapping, - 522 + geographic_ids=cds_to_calibrate, - 523 + sim=sim - 524 +) - 525 + - 526 +# Print comprehensive matrix structure - 527 +tracer.print_matrix_structure(show_groups=True) - 528 + - 529 +# ============================================================ - +================ - 530 +# STEP 7: TARGET GROUP ANALYSIS - 531 +# ============================================================ - +================ - 532 +print("\n" + "="*80) - 533 +print("TARGET GROUP ANALYSIS") - 534 +print("="*80) - 535 + - 536 +target_groups, group_info = create_target_groups(targets_df) - 537 + - 538 +print(f"\nTotal target groups: {len(group_info)}") - 539 +for group_id, info in enumerate(group_info): - 540 + group_mask = target_groups == group_id - 541 + n_targets_in_group = group_mask.sum() - 542 + print(f" Group {group_id}: {info} ({n_targets_in_group} t - +argets)") - 543 + - 544 +print("\n" + "="*80) - 545 +print("RUNNER COMPLETED SUCCESSFULLY") - 546 +print("="*80) - 547 ``` - 548 - 549 +**Key Features**: - 550 +- Loads simulation once - 551 +- Builds matrix with combined targets using OR filter - 552 +- Verifies 
health insurance person→household aggregation - 553 +- Uses MatrixTracer for comprehensive diagnostics - 554 +- Prints target group analysis - 555 +- Follows junkyard pattern but with new sparse_matrix_builder - 556 + - 557 ## Success Criteria - 558 - 559 1. ✓ Matrix builds successfully with `{"stratum_group_ids": [1 - , 4]}` filter - -● Updated plan - ⎿  /plan to preview - 556 - 557 ## Success Criteria - 558 - 559 -1. ✓ Matrix builds successfully with `{"stratum_group_ids": [1 - -, 4]}` filter - 560 -2. ✓ Health insurance row appears in targets_df with correct v - -alue ($385B) - 561 -3. ✓ Matrix shape is (n_targets, n_households × n_cds) where n - -_targets ≈ 558 (20 national + 538 SNAP) - 562 -4. ✓ Person-to-household aggregation preserves totals - 563 -5. ✓ Matrix row for health insurance contains household-aggreg - -ated premium values - 564 -6. ✓ No code changes needed (only usage pattern demonstration) - 559 +1. ✓ Runner script executes successfully without errors - 560 +2. ✓ Matrix builds successfully with `{"stratum_group_ids": [1 - +, 4]}` filter - 561 +3. ✓ Health insurance target appears in targets_df with correc - +t value ($385B) - 562 +4. ✓ Matrix shape is (n_targets, n_households × n_cds) where n - +_targets ≈ 558 (20 national + 538 SNAP) - 563 +5. ✓ Person-to-household aggregation preserves totals (verifie - +d in runner output) - 564 +6. ✓ MatrixTracer prints comprehensive diagnostics - 565 +7. ✓ Target groups are correctly identified and displayed - 566 +8. ✓ No code changes to sparse_matrix_builder.py (only new run - +ner script) - 567 - 568 ## Quick Reference - 569 - -● Updated plan - ⎿  /plan to preview - 569 - 570 ### Critical Files - 571 - **No changes**: `sparse_matrix_builder.py` (already supports - person-level targets) - 572 +- **New file**: `test_combined_targets.py` (runner program to - +create and verify) - 573 - **Database**: `policyengine_us_data/storage/calibration/poli - cy_data.db` (contains target_id=5) - 574 +- **Tracer**: `matrix_tracer.py` (used for diagnostics in runn - +er) - 575 - **Environment**: `~/envs/sep` (for this development branch) - 576 - 577 ### Key Commands - ... - 579 # Activate environment - 580 source ~/envs/sep/bin/activate - 581 - 580 -# Query database for health insurance target - 582 +# Navigate to project directory - 583 cd ~/devl/sep/policyengine-us-data - 584 + - 585 +# Query database for health insurance target - 586 python -c "import sqlite3; conn = sqlite3.connect('policyengin - e_us_data/storage/calibration/policy_data.db'); cursor = conn. - cursor(); cursor.execute('SELECT * FROM targets WHERE variable - = \"health_insurance_premiums_without_medicare_part_b\"'); pr - int(cursor.fetchone())" - 587 - 584 -# Run verification tests - 585 -python test_combined_targets.py # Create this test script per - - Step 2 above - 588 +# Run the combined targets runner (after creating per Step 5) - 589 +python test_combined_targets.py - 590 ``` - 591 - 592 ### Target Filter Examples - -● Perfect! I've updated the plan to include a comprehensive runner program (Step 5) that will: - - Build X_sparse with combined SNAP + health insurance targets - - Extract the target vector - - Use MatrixTracer for detailed diagnostics - - Verify person→household aggregation - - Print comprehensive analysis - - The runner follows the pattern from the junkyard's calibrate_cds_sparse.py but uses your new generic sparse_matrix_builder.py. 
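(Aside: the row-sum check this runner performs rests on one invariant worth stating explicitly. Because the national health-insurance target carries no geographic constraint, each household's aggregated premium is repeated once per CD, so the matrix row should sum to n_cds times the dataset-wide household total. Below is a minimal sketch of that check, reusing only names the plan already defines — `targets_df`, `X_sparse`, `sim`, `cds_to_calibrate`; it is not part of the committed script.)

```python
import numpy as np

# National target, no geographic constraint: the household-aggregated
# premiums appear once in every stacked CD block of the matrix row.
hi = targets_df[
    targets_df["variable"]
    == "health_insurance_premiums_without_medicare_part_b"
]
household_total = sim.calculate(
    "health_insurance_premiums_without_medicare_part_b",
    map_to="household",
).values.sum()

row_sum = X_sparse[hi.index[0], :].sum()
assert np.isclose(
    row_sum, len(cds_to_calibrate) * household_total, rtol=1e-6
)
```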
- -● User approved Claude's plan - ⎿  Plan saved to: ~/.claude/plans/velvet-rolling-dawn.md · /plan to edit - - Implementation Plan: Adding Health Insurance Premiums to Calibration - - TL;DR - - ZERO CODE CHANGES NEEDED 🎉 - - The current sparse_matrix_builder.py already supports person-level - targets through PolicyEngine's map_to="household" parameter, which - automatically aggregates person-level values to household level. - Simply use: - - builder.build_matrix(sim, target_filter={"stratum_group_ids": [1, 4]}) - # SNAP + health insurance - - This works because: - - Health insurance premium is in the database (target_id=5, $385B) - - PolicyEngine auto-aggregates person→household (preserves totals) - - No hard-coded variable names in the builder - - Executive Summary - - Key Finding: The current sparse_matrix_builder.py implementation - already supports person-level targets without any code changes needed. - PolicyEngine's map_to="household" parameter automatically aggregates - person-level values to household level while preserving totals. - - Background - - Current State - - - Working targets: SNAP (stratum_group_id=4, spm_unit-level, 538 - targets: 51 state snap amounts + 487 household_count targets) - - New target: health_insurance_premiums_without_medicare_part_b - (person-level, stratum_group_id=1, 1 national target) - - Repository: Development branch at ~/devl/sep/policyengine-us-data - (use ~/envs/sep) - - Junkyard: Reference implementation at ~/devl/policyengine-us-data - (use ~/envs/pe) - - Key Differences - - - SNAP: spm_unit-level variable with state-specific constraints and - two variables (snap dollars + household_count) - - Health Insurance: person-level variable, single national target (no - geographic constraints), only one variable - - Investigation Results - - Database Analysis - - -- Health insurance premium target in policy_data.db - target_id: 5 - variable: 'health_insurance_premiums_without_medicare_part_b' - value: $385,000,000,000 (2024) - stratum_id: 1 - stratum_group_id: 1 (national hardcoded targets) - constraints: NONE (applies to entire nation) - - Stratum group 1 contains 20 national hardcoded targets including - medicaid, snap (national), ssi, tanf, social_security, rent, etc. - - Entity Aggregation Verification - - # Verified with PolicyEngine Microsimulation - Person-level array: 52,826 persons, sum = $56,465,248 - Household-level array: 21,045 households, sum = $56,465,248 - Entity type: 'person' - Sums match: True ✓ - - Conclusion: PolicyEngine's map_to="household" automatically sums - person-level values to household level, preserving totals perfectly. - - Current Implementation Analysis - - The generic code at sparse_matrix_builder.py:220-222: - target_values = state_sim.calculate( - target["variable"], map_to="household" - ).values - masked_values = (target_values * mask).astype(np.float32) - - This works for ANY entity type because: - 1. PolicyEngine detects the variable's entity type from the tax - benefit system - 2. map_to="household" triggers automatic aggregation (sum for amounts, - count for _count variables) - 3. No explicit entity handling code needed - - Implementation Plan - - Phase 1: No Code Changes Needed ✓ - - The sparse_matrix_builder.py requires zero modifications to support - person-level targets. - - Why: The implementation is already entity-agnostic via PolicyEngine's - map_to parameter. 
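(Aside: what that aggregation amounts to mechanically, as a self-contained sketch. The membership and premium arrays below are hypothetical, and this illustrates the sum-to-household semantics rather than PolicyEngine internals.)

```python
import numpy as np

# Hypothetical person-to-household membership and person-level premiums.
person_household_index = np.array([0, 0, 1, 2, 2, 2])
person_premiums = np.array([100.0, 0.0, 250.0, 0.0, 80.0, 20.0])

# For an amount variable, map_to="household" behaves like a
# within-household sum over persons.
household_premiums = np.bincount(
    person_household_index, weights=person_premiums
)

print(household_premiums)  # [100. 250. 100.]
assert household_premiums.sum() == person_premiums.sum()  # total preserved
```

The preserved total is exactly the property the verification steps below test against the real microdata.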
- - Phase 2: Usage Pattern - - To add health insurance premiums to calibration, use the appropriate - target filter: - - Option A: Include specific stratum_group_id - builder = SparseMatrixBuilder(db_uri, time_period, cds_to_calibrate, - dataset_path) - - # Just health insurance (national hardcoded targets) - targets_df, X_sparse, household_mapping = builder.build_matrix( - sim, - target_filter={"stratum_group_ids": [1]} # 20 national targets - ) - - # SNAP only (state-level targets) - targets_df, X_sparse, household_mapping = builder.build_matrix( - sim, - target_filter={"stratum_group_ids": [4]} # 538 SNAP targets - ) - - # Both SNAP and national targets together - targets_df, X_sparse, household_mapping = builder.build_matrix( - sim, - target_filter={"stratum_group_ids": [1, 4]} # 558 targets total - ) - - Option B: Include specific variable(s) - # Just health insurance premiums - targets_df, X_sparse, household_mapping = builder.build_matrix( - sim, - target_filter={"variables": - ["health_insurance_premiums_without_medicare_part_b"]} - ) - - # Multiple specific variables - targets_df, X_sparse, household_mapping = builder.build_matrix( - sim, - target_filter={ - "variables": ["snap", - "health_insurance_premiums_without_medicare_part_b"] - } - ) - - Option C: Include specific stratum_id or target_id - # Using stratum_id - targets_df, X_sparse, household_mapping = builder.build_matrix( - sim, - target_filter={"stratum_ids": [1]} # National stratum - ) - - # Using target_id - targets_df, X_sparse, household_mapping = builder.build_matrix( - sim, - target_filter={"target_ids": [5]} # Health insurance target - ) - - Phase 3: Matrix Structure - - The resulting sparse matrix will have: - - Rows: Number of targets (1 for health insurance only, 558 for SNAP + - health insurance) - - Columns: n_households × n_cds (e.g., 21,045 households × 19 CDs = - 399,855 columns) - - Values: For health insurance row, each column contains the sum of - person-level premiums for that household - - Sparsity: ~99.95% sparse (most households have zero premiums) - - Phase 4: Constraint Handling - - For health insurance premiums (no constraints): - - All households included (mask = all True) - - Each household gets sum of all persons' premiums in that household - - Same value repeated across all CDs (since there's no geographic - constraint) - - For SNAP (with state constraints): - - Constraints filtered by state_fips and snap>0 - - Only qualifying households get non-zero values - - Values vary by state due to state-specific SNAP rules - - Phase 5: Verification - - Test 1: Basic Matrix Construction - # Create builder with policy_data.db - builder = SparseMatrixBuilder( - db_uri="sqlite:///policyengine_us_data/storage/calibration/policy_ - data.db", - time_period=2024, - cds_to_calibrate=['0101', '0102', ...], # Example CDs - dataset_path=None # Or path to CPS dataset - ) - - # Build matrix with health insurance - targets_df, X_sparse, household_mapping = builder.build_matrix( - sim, - target_filter={"variables": - ["health_insurance_premiums_without_medicare_part_b"]} - ) - - # Verify - assert len(targets_df) == 1 - assert targets_df.iloc[0]["variable"] == - "health_insurance_premiums_without_medicare_part_b" - assert targets_df.iloc[0]["value"] == 385_000_000_000 - assert X_sparse.shape[0] == 1 - assert X_sparse.sum() > 0 - - Test 2: Combined Targets - # Build matrix with both SNAP and health insurance - targets_df, X_sparse, household_mapping = builder.build_matrix( - sim, - target_filter={"stratum_group_ids": 
[1, 4]} - ) - - # Verify - assert len(targets_df) == 558 # 20 national + 538 SNAP - health_ins_rows = targets_df[ - targets_df["variable"] == - "health_insurance_premiums_without_medicare_part_b" - ] - assert len(health_ins_rows) == 1 - - Test 3: Entity Aggregation - # Verify person-to-household aggregation preserves totals - sim = Microsimulation(dataset=dataset_path) - - person_total = sim.calculate( - 'health_insurance_premiums_without_medicare_part_b', - 2024, - map_to='person' - ).values.sum() - - household_total = sim.calculate( - 'health_insurance_premiums_without_medicare_part_b', - 2024, - map_to='household' - ).values.sum() - - assert np.isclose(person_total, household_total, rtol=1e-6) - - Critical Files - - No Modifications Needed - - - ✓ sparse_matrix_builder.py - Already supports person-level targets - - Database (Already Contains Target) - - - ✓ policyengine_us_data/storage/calibration/policy_data.db - Contains - target_id=5 - - Documentation/Examples - - - May need to update: ~/docs/local_area_calibration_setup.ipynb - Show - health insurance example - - Architecture Decisions - - Why No Code Changes? - - 1. PolicyEngine's Generic Design: map_to parameter handles all entity - aggregations - 2. Database-Driven Approach: All target metadata comes from - policy_data.db - 3. Constraint-Based Filtering: Generic mask application works for any - entity type - 4. No Hard-Coding: Variable names never appear in the builder logic - - Entity Aggregation Semantics - - - Amount variables (like premiums): SUM from entity to household - - Count variables (like person_count): COUNT unique entities per - household - - Detection: PolicyEngine uses variable metadata from - tax_benefit_system - - Limitations (Current Implementation) - - - Assumes all targets should be aggregated to household level for - calibration - - No support for tax_unit or spm_unit level calibration (only - household-level weights) - - This is correct for geo-stacking where we reweight households within - CDs - - Risk Analysis - - Low Risk - - - ✓ No code changes means no regression risk - - ✓ Existing SNAP functionality unchanged - - ✓ Database already contains correct target definition - - Medium Risk - - - ⚠️ If user wants state-level health insurance targets, those need to - be added to database first - - ⚠️ Performance: Person-level aggregation is more expensive than - household-level (but PolicyEngine caches this) - - Zero Risk Items - - - ✓ Entity detection is automatic via PolicyEngine - - ✓ Total preservation is guaranteed by PolicyEngine's aggregation - logic - - ✓ Generic architecture means extensible to any future entity types - - User Decisions - - ✓ Target Scope: Use national target only (existing target_id=5, no - database changes) - ✓ Usage Pattern: Combined with SNAP (test multi-target calibration) - ✓ Geographic Scope: National aggregate is correct (no state variation - needed) - - Implementation Steps (Post-Approval) - - Step 1: Verify Database Target - - Confirm health insurance premium target exists and is active: - # Using ~/envs/sep environment - import sqlite3 - conn = sqlite3.connect('policyengine_us_data/storage/calibration/polic - y_data.db') - cursor = conn.cursor() - cursor.execute(""" - SELECT target_id, variable, value, period, stratum_id, active - FROM targets - WHERE variable = - 'health_insurance_premiums_without_medicare_part_b' - """) - print(cursor.fetchone()) - Expected: (5, 'health_insurance_premiums_without_medicare_part_b', - 385000000000.0, 2024, 1, 1) - - Step 2: Test Combined 
Target Matrix Building - - Create a test script to build matrix with both SNAP and health - insurance: - - from policyengine_us import Microsimulation - from policyengine_us_data.datasets.cps.local_area_calibration.sparse_m - atrix_builder import SparseMatrixBuilder - - # Load simulation - sim = Microsimulation() - - # Create builder - builder = SparseMatrixBuilder( - db_uri="sqlite:///policyengine_us_data/storage/calibration/policy_ - data.db", - time_period=2024, - cds_to_calibrate=['0101', '0601', '0602'], # Example CDs: AL-1, - CA-1, CA-2 - dataset_path=None - ) - - # Build matrix with SNAP + health insurance - targets_df, X_sparse, household_mapping = builder.build_matrix( - sim, - target_filter={"stratum_group_ids": [1, 4]} # National + SNAP - targets - ) - - print(f"Total targets: {len(targets_df)}") - print(f"Matrix shape: {X_sparse.shape}") - print(f"Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * - X_sparse.shape[1]):.2%}") - - # Verify health insurance target is included - health_ins = targets_df[ - targets_df["variable"] == - "health_insurance_premiums_without_medicare_part_b" - ] - print(f"\nHealth insurance target:") - print(health_ins[["target_id", "variable", "value", "geographic_id"]]) - - Step 3: Verify Entity Aggregation - - Confirm person-to-household aggregation works correctly: - - # Verify totals are preserved - person_total = sim.calculate( - 'health_insurance_premiums_without_medicare_part_b', - 2024, - map_to='person' - ).values.sum() - - household_total = sim.calculate( - 'health_insurance_premiums_without_medicare_part_b', - 2024, - map_to='household' - ).values.sum() - - print(f"Person-level total: ${person_total:,.0f}") - print(f"Household-level total: ${household_total:,.0f}") - print(f"Match: {abs(person_total - household_total) < 1}") - - # Check matrix row sum for health insurance - health_ins_idx = targets_df[ - targets_df["variable"] == - "health_insurance_premiums_without_medicare_part_b" - ].index[0] - matrix_total = X_sparse[health_ins_idx, :].sum() * - len(cds_to_calibrate) # Adjust for stacking - print(f"Matrix row total (adjusted): ${matrix_total:,.0f}") - - Step 4: Update Documentation - - Add example to ~/docs/local_area_calibration_setup.ipynb showing: - - How to filter for multiple target groups - - How person-level targets work alongside spm_unit-level targets - - Interpretation of the resulting matrix structure - - Step 5: Create Runner Program with Diagnostics - - Create a runner script test_combined_targets.py that: - - Builds X_sparse with SNAP + health insurance targets - - Extracts target vector - - Uses MatrixTracer to print diagnostics - - Pattern based on first 100 lines of junkyard's - calibrate_cds_sparse.py: - - """ - Test runner for combined SNAP + health insurance calibration targets. - Generates X_sparse and target vector, prints diagnostics using - MatrixTracer. 
- """ - import logging - from pathlib import Path - - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s" - ) - - import numpy as np - import pandas as pd - from policyengine_us import Microsimulation - from policyengine_us_data.datasets.cps.local_area_calibration.sparse_m - atrix_builder import SparseMatrixBuilder - from - policyengine_us_data.datasets.cps.local_area_calibration.matrix_tracer - import MatrixTracer - from policyengine_us_data.datasets.cps.local_area_calibration.calibrat - ion_utils import ( - create_target_groups, - ) - - # ==================================================================== - ======== - # CONFIGURATION - # ==================================================================== - ======== - db_path = "policyengine_us_data/storage/calibration/policy_data.db" - db_uri = f"sqlite:///{db_path}" - time_period = 2024 - - # Test with a few CDs from different states - cds_to_calibrate = [ - '0101', # Alabama CD-1 - '0601', # California CD-1 - '0602', # California CD-2 - '3601', # New York CD-1 - '4801', # Texas CD-1 - ] - - print(f"Testing with {len(cds_to_calibrate)} congressional districts") - - # ==================================================================== - ======== - # STEP 1: LOAD SIMULATION - # ==================================================================== - ======== - print("\nLoading simulation...") - sim = Microsimulation() - n_households = len(sim.calculate("household_id", - map_to="household").values) - print(f"Loaded {n_households:,} households") - - # ==================================================================== - ======== - # STEP 2: BUILD SPARSE MATRIX WITH COMBINED TARGETS - # ==================================================================== - ======== - print("\nBuilding sparse matrix...") - builder = SparseMatrixBuilder( - db_uri=db_uri, - time_period=time_period, - cds_to_calibrate=cds_to_calibrate, - dataset_path=None - ) - - # Query both SNAP (stratum_group_id=4) and national targets - (stratum_group_id=1) - targets_df, X_sparse, household_id_mapping = builder.build_matrix( - sim, - target_filter={"stratum_group_ids": [1, 4]} - ) - - print(f"\nMatrix built successfully:") - print(f" Shape: {X_sparse.shape}") - print(f" Targets: {len(targets_df)}") - print(f" Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * - X_sparse.shape[1]):.4%}") - - # ==================================================================== - ======== - # STEP 3: EXTRACT TARGET VECTOR - # ==================================================================== - ======== - target_vector = targets_df["value"].values - print(f"\nTarget vector shape: {target_vector.shape}") - print(f"Target total: ${target_vector.sum():,.0f}") - - # ==================================================================== - ======== - # STEP 4: HEALTH INSURANCE PREMIUM VERIFICATION - # ==================================================================== - ======== - print("\n" + "="*80) - print("HEALTH INSURANCE PREMIUM TARGET ANALYSIS") - print("="*80) - - health_ins_targets = targets_df[ - targets_df["variable"] == - "health_insurance_premiums_without_medicare_part_b" - ] - - if len(health_ins_targets) > 0: - print(f"\nFound {len(health_ins_targets)} health insurance - target(s):") - print(health_ins_targets[["target_id", "variable", "value", - "geographic_id", "stratum_group_id"]]) - - # Get the row index for health insurance - health_ins_idx = health_ins_targets.index[0] - health_ins_row = X_sparse[health_ins_idx, :] - - 
print(f"\nMatrix row {health_ins_idx} (health insurance):") - print(f" Non-zero entries: {health_ins_row.nnz:,}") - print(f" Row sum: ${health_ins_row.sum():,.0f}") - print(f" Target value: - ${health_ins_targets.iloc[0]['value']:,.0f}") - - # Verify person-to-household aggregation - person_total = sim.calculate( - 'health_insurance_premiums_without_medicare_part_b', - time_period, - map_to='person' - ).values.sum() - - household_total = sim.calculate( - 'health_insurance_premiums_without_medicare_part_b', - time_period, - map_to='household' - ).values.sum() - - print(f"\nEntity aggregation verification:") - print(f" Person-level total: ${person_total:,.0f}") - print(f" Household-level total: ${household_total:,.0f}") - print(f" Match: {abs(person_total - household_total) < 1}") - - else: - print("\nWARNING: No health insurance targets found!") - - # ==================================================================== - ======== - # STEP 5: SNAP TARGET SUMMARY - # ==================================================================== - ======== - print("\n" + "="*80) - print("SNAP TARGET SUMMARY") - print("="*80) - - snap_targets = targets_df[targets_df["variable"] == "snap"] - household_count_targets = targets_df[targets_df["variable"] == - "household_count"] - - print(f"\nSNAP benefit targets: {len(snap_targets)}") - print(f"Household count targets: {len(household_count_targets)}") - - if len(snap_targets) > 0: - print(f"\nSNAP total (all states): - ${snap_targets['value'].sum():,.0f}") - print(f"\nSample SNAP targets:") - print(snap_targets[["target_id", "variable", "value", - "geographic_id"]].head(10)) - - # ==================================================================== - ======== - # STEP 6: USE MATRIX TRACER FOR DETAILED DIAGNOSTICS - # ==================================================================== - ======== - print("\n" + "="*80) - print("MATRIX TRACER DIAGNOSTICS") - print("="*80) - - tracer = MatrixTracer( - targets_df=targets_df, - matrix=X_sparse, - household_id_mapping=household_id_mapping, - geographic_ids=cds_to_calibrate, - sim=sim - ) - - # Print comprehensive matrix structure - tracer.print_matrix_structure(show_groups=True) - - # ==================================================================== - ======== - # STEP 7: TARGET GROUP ANALYSIS - # ==================================================================== - ======== - print("\n" + "="*80) - print("TARGET GROUP ANALYSIS") - print("="*80) - - target_groups, group_info = create_target_groups(targets_df) - - print(f"\nTotal target groups: {len(group_info)}") - for group_id, info in enumerate(group_info): - group_mask = target_groups == group_id - n_targets_in_group = group_mask.sum() - print(f" Group {group_id}: {info} ({n_targets_in_group} - targets)") - - print("\n" + "="*80) - print("RUNNER COMPLETED SUCCESSFULLY") - print("="*80) - - Key Features: - - Loads simulation once - - Builds matrix with combined targets using OR filter - - Verifies health insurance person→household aggregation - - Uses MatrixTracer for comprehensive diagnostics - - Prints target group analysis - - Follows junkyard pattern but with new sparse_matrix_builder - - Success Criteria - - 1. ✓ Runner script executes successfully without errors - 2. ✓ Matrix builds successfully with {"stratum_group_ids": [1, 4]} - filter - 3. ✓ Health insurance target appears in targets_df with correct value - ($385B) - 4. ✓ Matrix shape is (n_targets, n_households × n_cds) where n_targets - ≈ 558 (20 national + 538 SNAP) - 5. 
✓ Person-to-household aggregation preserves totals (verified in - runner output) - 6. ✓ MatrixTracer prints comprehensive diagnostics - 7. ✓ Target groups are correctly identified and displayed - 8. ✓ No code changes to sparse_matrix_builder.py (only new runner - script) - - Quick Reference - - Critical Files - - - No changes: sparse_matrix_builder.py (already supports person-level - targets) - - New file: test_combined_targets.py (runner program to create and - verify) - - Database: policyengine_us_data/storage/calibration/policy_data.db - (contains target_id=5) - - Tracer: matrix_tracer.py (used for diagnostics in runner) - - Environment: ~/envs/sep (for this development branch) - - Key Commands - - # Activate environment - source ~/envs/sep/bin/activate - - # Navigate to project directory - cd ~/devl/sep/policyengine-us-data - - # Query database for health insurance target - python -c "import sqlite3; conn = sqlite3.connect('policyengine_us_dat - a/storage/calibration/policy_data.db'); cursor = conn.cursor(); - cursor.execute('SELECT * FROM targets WHERE variable = - \"health_insurance_premiums_without_medicare_part_b\"'); - print(cursor.fetchone())" - - # Run the combined targets runner (after creating per Step 5) - python test_combined_targets.py - - Target Filter Examples - - # SNAP only - {"stratum_group_ids": [4]} - - # Health insurance only - {"variables": ["health_insurance_premiums_without_medicare_part_b"]} - - # Both SNAP and health insurance (recommended) - {"stratum_group_ids": [1, 4]} - - - - - - - - -● Excellent! The plan is approved. Let me create a todo list and start implementing the runner program. - -● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && ~/envs/sep/bin/python - -c " - import sqlite3…) - ⎿  Database target verification: - target_id: 5 - variable: health_insurance_premiums_without_medicare_part_b - … +4 lines (ctrl+o to expand) - ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po - licyengine_us_data/datasets/cps/local_area_calibration - -● Write(test_combined_targets.py) - ⎿  Wrote 188 lines to test_combined_targets.py - 1 """ - 2 Test runner for combined SNAP + health insurance calibration targ - ets. - 3 Generates X_sparse and target vector, prints diagnostics using Ma - trixTracer. - 4 """ - 5 import logging - 6 logging.basicConfig( - 7 level=logging.INFO, - 8 format="%(asctime)s - %(levelname)s - %(message)s" - 9 ) - … +178 lines (ctrl+o to expand) - -● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && timeout: - ~/envs/sep/bin/python policyengine_us_data/datasets/cps/local_ 5m 0s - area_calibration/test_combined_targets.py 2>&1) - ⎿ 2026-01-15 09:12:53,215 - INFO - NumExpr defaulting to 16 threads. - 2026-01-15 09:13:03,329 - INFO - HTTP Request: GET - https://huggingface. - co/api/models/policyengine/policyengine-us-data "HTTP/1.1 200 OK" - … +301 lines (ctrl+o to expand) - ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po - licyengine_us_data/datasets/cps/local_area_calibration - -● The runner executed successfully. Let me update the todos and summarize the results. - -● All tasks completed successfully. 
Here's a summary: - - Results - - Matrix Structure - - - Shape: (558 targets, 105,225 columns = 21,045 households × 5 CDs) - - Targets breakdown: 20 national (stratum_group_id=1) + 538 SNAP (stratum_group_id=4) - - Sparsity: 98.82% - - Health Insurance Premium Verification - - - Target found: target_id=5, value=$385B (2024) - - Matrix row: 66,150 non-zero entries, row sum = $282,326,240 - - Entity aggregation: Person-level total = Household-level total = $56,465,248 ✓ - - Math check: Row sum $282,326,240 = 5 CDs × $56,465,248 ✓ - - Key Insights - - 1. No code changes to sparse_matrix_builder.py - it already handles person-level targets via PolicyEngine's map_to="household" aggregation - 2. Health insurance is Group 4 in the target groups: "National Health Insurance Premiums Without Medicare Part B (1 target, value=385,000,000,000)" - 3. Combined targets work correctly - the OR filter {"stratum_group_ids": [1, 4]} successfully retrieves both national and SNAP targets - - Files Created - - - /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration/test_combined_targets.py - Runner program with diagnostics - - The health insurance premium target is now fully integrated into your calibration system with zero code changes to the core builder. - -✻ Baked for 13m 5s - -! git status - ⎿  On branch health-insurance-premiums - Untracked files: - (use "git add ..." to include in what will be committed) - … +4 lines (ctrl+o to expand) - -❯ Ok I see you have a test created, which is nice, but I was hoping for a - runner file too. Did you see that in my last request? It would create - the sparse X matrix and run the tracer. - -● The file I created (test_combined_targets.py) is the runner - it generates X_sparse, extracts the target vector, and runs the MatrixTracer for diagnostics. It ran successfully and produced the output you saw. - - Perhaps the name is confusing? Would you like me to rename it to something clearer like run_combined_targets.py? Or is there something additional you'd like the runner to do that I missed? - -! ls test* - ⎿  test_combined_targets.py - -❯ Oh! Yeah let's not call this file "test*". Eventually it will build the - real matrix! Now it will only have snap and our new variable in it. It's - going to be our procedural script that will be an integral part of the - machine learning flow! - -● Got it! Let me rename it to something more appropriate for a production runner. - -● Bash(mv test_combined_targets.py build_calibration_matrix.py) - ⎿  (No content) - -● Read(build_calibration_matrix.py) - ⎿  Read 10 lines - -● Update(build_calibration_matrix.py) - ⎿  Added 1 line, removed 1 line - 1 """ - 2 -Test runner for combined SNAP + health insurance calibration tar - -gets. - 2 +Build calibration matrix for geo-stacking reweighting. - 3 Generates X_sparse and target vector, prints diagnostics using M - atrixTracer. - 4 """ - 5 import logging - -● Done! Renamed to build_calibration_matrix.py with an updated docstring. This is now your production runner for the ML flow. - -● Bash(ls -la build_calibration_matrix.py) - ⎿ -rw-rw-r-- 1 baogorek baogorek 6266 Jan 15 09:19 - build_calibration_matr - ix.py - -● Ready to go: - python build_calibration_matrix.py - -❯ Ok there are some problems here. targets_df has 558 rows and I see - variables like alimony expense in there. I thought we were just adding - our new variable! Remember, it's one variable at a time! Also, why are you - bringing in a default microsimulation?
Remember that the "base h5 - dataset" is a very important concept in this local area stacking - approach! That base dataset is ... well, I'll let you tell me what it - should be. - -● You're right on both counts. Let me fix this: - - 1. Target filter should be specific: Only SNAP (stratum_group_id=4) + the specific health insurance variable - not all of stratum_group_id=1 - 2. Base dataset: For local area stacking, this should be the extended CPS dataset that's been prepared for geo-stacking (likely extended_cps_2024.h5 or similar in the storage directory) - - Let me check what's available: - -● Bash(ls -la /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/ - storage/*.h5 2>/dev/null || echo "No h5 files in storage root") - ⎿ -rw-rw-r-- 1 baogorek baogorek 446389680 Dec 24 08:03 - /home/baogorek/de - vl/sep/policyengine-us-data/policyengine_us_data/storage/acs_2022.h5 - -rw-rw-r-- 1 baogorek baogorek 789139664 Dec 24 08:03 - /home/baogorek/de - … +78 lines (ctrl+o to expand) - -● I see extended_cps_2024.h5 - that's the extended CPS which has the geographic identifiers and enhanced variables needed for geo-stacking. - - Let me fix both issues: - -● Read(build_calibration_matrix.py) - ⎿  Read 188 lines - -● Update(build_calibration_matrix.py) - ⎿  Added 9 lines, removed 6 lines - 25 # ============================================================= - =============== - 26 # CONFIGURATION - 27 # ============================================================= - =============== - 28 -db_path = ( - 29 - "policyengine_us_data/storage/calibration/policy_dat - a.db" - 30 -) - 28 +db_path = "policyengine_us_data/storage/calibration/policy_data - +.db" - 29 db_uri = f"sqlite:///{db_path}" - 30 time_period = 2024 - 31 - 32 +# Base dataset for geo-stacking: extended CPS with geographic i - +dentifiers - 33 +dataset_path = ( - 34 + "policyengine_us_data/storage/extended_cps_2024.h5" - 35 +) - 36 + - 37 cds_to_calibrate = [ - 38 "101", # Alabama CD-1 - 39 "601", # California CD-1 -
- 45 print(f"Testing with {len(cds_to_calibrate)} congressional dist - ricts") - 46 - 47 # ============================================================= - =============== - 45 -# STEP 1: LOAD SIMULATION - 48 +# STEP 1: LOAD SIMULATION FROM EXTENDED CPS - 49 # ============================================================= - =============== - 47 -print("\nLoading simulation...") - 48 -sim = Microsimulation() - 50 +print(f"\nLoading simulation from {dataset_path}...") - 51 +sim = Microsimulation(dataset=dataset_path) - 52 n_households = len(sim.calculate("household_id", map_to="househ - old").values) - 53 print(f"Loaded {n_households:,} households") - 54 - -● Update(build_calibration_matrix.py) - ⎿  Added 7 lines, removed 1 line - 63 dataset_path=None, - 64 ) - 65 - 66 +# SNAP targets (stratum_group_id=4) + specific health insurance - + variable - 67 +# Uses OR logic: gets all SNAP targets OR the health insurance - +target - 68 targets_df, X_sparse, household_id_mapping = builder.build_matr - ix( - 67 - sim, target_filter={"stratum_group_ids": [1, 4]} - 69 + sim, - 70 + target_filter={ - 71 + "stratum_group_ids": [4], - 72 + "variables": ["health_insurance_premiums_without_medica - +re_part_b"], - 73 + }, - 74 ) - 75 - 76 print(f"\nMatrix built successfully:") - -● Now let me run it to verify: - -● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && timeout: - ~/envs/sep/bin/python policyengine_us_data/datasets/cps/local_ 5m 0s - area_calibration/build_calibration_matrix.py 2>…) - ⎿ 2026-01-15 09:27:58,320 - INFO - NumExpr defaulting to 16 threads. - 2026-01-15 09:28:19,765 - INFO - Tracer initialized: 41314 households - x - 5 geographies - … +187 lines (ctrl+o to expand) - ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po - licyengine_us_data/datasets/cps/local_area_calibration - -● Now we have: - - 539 targets (SNAP + health insurance only, not all 20 national targets) - - Extended CPS 2024 as the base dataset (41,314 households) - - 4 target groups: - - Group 0: Health Insurance Premiums (1 national target) - - Group 1: State SNAP Household Count (51 targets) - - Group 2: State Snap (51 targets) - - Group 3: District SNAP Household Count (436 targets) - - The entity aggregation "Match: False" is just a floating point precision issue ($4 difference out of $109M). Let me fix that check: - -● Update(build_calibration_matrix.py) - ⎿  Added 1 line, removed 1 line - 130 print(f"\nEntity aggregation verification:") - 131 print(f" Person-level total: ${person_total:,.0f}") - 132 print(f" Household-level total: ${household_total:,.0f}") - 133 - print(f" Match: {abs(person_total - household_total) < 1} - -") - 133 + print(f" Match: {np.isclose(person_total, household_total - +, rtol=1e-6)}") - 134 - 135 else: - 136 print("\nWARNING: No health insurance targets found!") - -● Done! The runner now: - 1. Uses extended_cps_2024.h5 as the base dataset - 2. Includes only SNAP + health insurance targets (539 total) - 3. Creates 4 target groups (down from 23) - -✻ Crunched for 1m 36s - -❯ I guess I never made it clear that the base dataset will be - "stratified_extended_cps_2023.h5" which you perhaps could have figured - out from the local-area* target in the Makefile, but that's okay. Can you - help the builder use that file in a way that it could be run - interactively by a user in the directory that script resides in? - -● Got it! Let me fix the paths to work when running from the script's directory using __file__ to resolve paths. 
- -● Update(build_calibration_matrix.py) - ⎿  Added 12 lines, removed 15 lines - 3 Generates X_sparse and target vector, prints diagnostics using M - atrixTracer. - 4 """ - 5 import logging - 6 +from pathlib import Path - 7 - 8 logging.basicConfig( - 9 level=logging.INFO, - ... - 13 import numpy as np - 14 import pandas as pd - 15 from policyengine_us import Microsimulation - 15 -from policyengine_us_data.datasets.cps.local_area_calibration.s - -parse_matrix_builder import ( - 16 - SparseMatrixBuilder, - 17 -) - 18 -from policyengine_us_data.datasets.cps.local_area_calibration.m - -atrix_tracer import ( - 19 - MatrixTracer, - 20 -) - 21 -from policyengine_us_data.datasets.cps.local_area_calibration.c - -alibration_utils import ( - 22 - create_target_groups, - 23 -) - 16 +from sparse_matrix_builder import SparseMatrixBuilder - 17 +from matrix_tracer import MatrixTracer - 18 +from calibration_utils import create_target_groups - 19 - 20 # ============================================================= - =============== - 21 # CONFIGURATION - 22 # ============================================================= - =============== - 28 -db_path = "policyengine_us_data/storage/calibration/policy_data - -.db" - 23 +# Resolve paths relative to this script's location - 24 +SCRIPT_DIR = Path(__file__).resolve().parent - 25 +STORAGE_DIR = SCRIPT_DIR.parent.parent.parent / "storage" - 26 + - 27 +db_path = STORAGE_DIR / "calibration" / "policy_data.db" - 28 db_uri = f"sqlite:///{db_path}" - 30 -time_period = 2024 - 29 +time_period = 2023 - 30 - 32 -# Base dataset for geo-stacking: extended CPS with geographic i - -dentifiers - 33 -dataset_path = ( - 34 - "policyengine_us_data/storage/extended_cps_2024.h5" - 35 -) - 31 +# Base dataset for geo-stacking: stratified extended CPS - 32 +dataset_path = STORAGE_DIR / "stratified_extended_cps_2023.h5" - 33 - 34 cds_to_calibrate = [ - 35 "101", # Alabama CD-1 - -● Now let me verify it runs from the script's directory: - -● Bash(~/envs/sep/bin/python build_calibration_matrix.py 2>&1) timeout: 5m 0s - ⎿  Error: Exit code 1 - 2026-01-15 09:34:39,873 - INFO - NumExpr defaulting to 16 threads. - TEST_LITE == False - Testing with 5 congressional districts - - Loading simulation from /home/baogorek/devl/sep/policyengine-us-data/policye - ngine_us_data/storage/stratified_extended_cps_2023.h5... - Traceback (most recent call last): - File "/home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/da - tasets/cps/local_area_calibration/build_calibration_matrix.py", line 48, in - - sim = Microsimulation(dataset=dataset_path) - File "/home/baogorek/envs/sep/lib/python3.13/site-packages/policyengine_us - /system.py", line 220, in __init__ - … +9 lines (ctrl+o to see all) - -● Need to convert the Path to a string for Microsimulation: - -● Update(build_calibration_matrix.py) - ⎿  Added 1 line, removed 1 line - 45 # STEP 1: LOAD SIMULATION FROM EXTENDED CPS - 46 # ============================================================= - =============== - 47 print(f"\nLoading simulation from {dataset_path}...") - 48 -sim = Microsimulation(dataset=dataset_path) - 48 +sim = Microsimulation(dataset=str(dataset_path)) - 49 n_households = len(sim.calculate("household_id", map_to="househ - old").values) - 50 print(f"Loaded {n_households:,} households") - 51 - -● Bash(~/envs/sep/bin/python build_calibration_matrix.py 2>&1) timeout: 5m 0s - ⎿ 2026-01-15 09:35:11,752 - INFO - NumExpr defaulting to 16 threads. 
- 2026-01-15 09:35:27,099 - INFO - Tracer initialized: 13507 households - x - 5 geographies - … +187 lines (ctrl+o to expand) - -● Now it works from the script directory: - - Base dataset: stratified_extended_cps_2023.h5 (13,507 households) - - Time period: 2023 - - 539 targets: 1 health insurance + 538 SNAP - - 4 target groups - - Entity aggregation: Match: True - - The script can now be run interactively: - cd policyengine_us_data/datasets/cps/local_area_calibration - python build_calibration_matrix.py - -✻ Brewed for 1m 12s - -❯ Do you really need SCRIPT_DIR = Path(__file__).resolve().parent? in - there? I can't run __FILE__ in ipython? You can import STORAGE_DIR - directly, right? - -● You're right, let me check if STORAGE_DIR is already available to import: - -● Search(pattern: "STORAGE_DIR", path: - "~/devl/sep/policyengine-us-data/policyengine_us_data", output_mode: - "content") - ⎿  Found 3 lines (ctrl+o to expand) - -● Search(pattern: "storage.*=.*Path", path: - "~/devl/sep/policyengine-us-data/policyengine_us_data", output_mode: - "content") - ⎿  Found 0 lines - -● Search(pattern: "**/policyengine_us_data/**/constants*.py") - ⎿  Found 0 files - -● Search(pattern: "from policyengine_us_data import", path: - "~/devl/sep/policyengine-us-data/policyengine_us_data", output_mode: - "content", head_limit: 10) - ⎿  Found 0 lines - -● Search(pattern: "policyengine_us_data\.STORAGE", path: - "~/devl/sep/policyengine-us-data", output_mode: "content") - ⎿  Found 0 lines - -● Read(~/devl/sep/policyengine-us-data/policyengine_us_data/__init__.py) - ⎿  Read 3 lines - -● Bash(ls -la /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/ - storage/*.py 2>/dev/null | head -5) - ⎿ -rw-rw-r-- 1 baogorek baogorek 780 Jan 6 11:19 - /home/baogorek/devl/se - p/policyengine-us-data/policyengine_us_data/storage/download_private_p - r - erequisites.py - … +7 lines (ctrl+o to expand) - -● Read(~/devl/sep/policyengine-us-data/policyengine_us_data/storage/__init__.py) - ⎿  Read 6 lines - -● STORAGE_FOLDER is already defined in policyengine_us_data.storage. Let me update the script: - -● Update(build_calibration_matrix.py) - ⎿  Added 3 lines, removed 7 lines - 3 Generates X_sparse and target vector, prints diagnostics using M - atrixTracer. - 4 """ - 5 import logging - 6 -from pathlib import Path - 6 - 7 logging.basicConfig( - 8 level=logging.INFO, - ... - 12 import numpy as np - 13 import pandas as pd - 14 from policyengine_us import Microsimulation - 15 +from policyengine_us_data.storage import STORAGE_FOLDER - 16 from sparse_matrix_builder import SparseMatrixBuilder - 17 from matrix_tracer import MatrixTracer - 18 from calibration_utils import create_target_groups - ... 
- 20 # ============================================================= - =============== - 21 # CONFIGURATION - 22 # ============================================================= - =============== - 23 -# Resolve paths relative to this script's location - 24 -SCRIPT_DIR = Path(__file__).resolve().parent - 25 -STORAGE_DIR = SCRIPT_DIR.parent.parent.parent / "storage" - 26 - - 27 -db_path = STORAGE_DIR / "calibration" / "policy_data.db" - 23 +db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" - 24 db_uri = f"sqlite:///{db_path}" - 25 time_period = 2023 - 26 - 27 # Base dataset for geo-stacking: stratified extended CPS - 32 -dataset_path = STORAGE_DIR / "stratified_extended_cps_2023.h5" - 28 +dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2023.h - +5" - 29 - 30 cds_to_calibrate = [ - 31 "101", # Alabama CD-1 - -● Bash(~/envs/sep/bin/python build_calibration_matrix.py 2>&1 | head -30) - ⎿ 2026-01-15 09:45:04,625 - INFO - NumExpr defaulting to 16 threads. - 2026-01-15 09:45:19,327 - INFO - Tracer initialized: 13507 households - x - 5 geographies - … +31 lines (ctrl+o to expand) - -● Done. Now you can use STORAGE_FOLDER in IPython too: - - from policyengine_us_data.storage import STORAGE_FOLDER - dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2023.h5" - -✻ Sautéed for 1m 11s - -❯ Is there an opportunity to create a real test in tests? - -● Search(pattern: "**/tests/**/*.py", path: "~/devl/sep/policyengine-us-data") - ⎿  Found 100 files (ctrl+o to expand) - -● Read(~/devl/sep/policyengine-us-data/policyengine_us_data/tests/test_database. - py) - ⎿  Read 87 lines - -● Read(~/devl/sep/policyengine-us-data/policyengine_us_data/tests/test_datasets/ - test_small_enhanced_cps.py) - ⎿  Read 18 lines - -● Yes! I can create a test for the sparse matrix builder. Here's what we could test: - - 1. Person-level aggregation preserves totals - 2. Matrix builds with correct shape - 3. Health insurance target is included correctly - -● Write(~/devl/sep/policyengine-us-data/policyengine_us_data/tests/test_sparse_m - atrix_builder.py) - ⎿  Wrote 61 lines to ../../../tests/test_sparse_matrix_builder.py - 1 import pytest - 2 import numpy as np - 3 from policyengine_us import Microsimulation - 4 from policyengine_us_data.storage import STORAGE_FOLDER - 5 from policyengine_us_data.datasets.cps.local_area_calibration.spa - rse_matrix_builder import ( - 6 SparseMatrixBuilder, - 7 ) - 8 @pytest.fixture - … +51 lines (ctrl+o to expand) - -● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && timeout: 5m - ~/envs/sep/bin/pytest 0s - policyengine_us_data/tests/test_sparse_matrix_builder.py -v - 2>&1) - ⎿ ============================= test session starts - ===================== - ========= - platform linux -- Python 3.13.6, pytest-8.4.2, pluggy-1.6.0 -- - /home/ba - … +16 lines (ctrl+o to expand) - ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po - licyengine_us_data/datasets/cps/local_area_calibration - -● All 3 tests pass: - - 1. test_person_level_aggregation_preserves_totals - Verifies person→household aggregation preserves totals - 2. test_matrix_shape - Verifies matrix has correct dimensions - 3. test_combined_snap_and_health_insurance - Verifies both SNAP and health insurance targets are included - - policyengine_us_data/tests/test_sparse_matrix_builder.py - 3 passed in 12.33s - -✻ Cooked for 1m 1s - -❯ Cool, I'm surprised we didn't have one before. 
Let's go ahead and update - the changelog_entry.yaml, create an issue, format with black, commit, - push, and create the PR that links to the issue. - -● Search(pattern: "**/changelog_entry.yaml", path: - "~/devl/sep/policyengine-us-data") - ⎿  Found 1 file (ctrl+o to expand) - -● Read(~/devl/sep/policyengine-us-data/changelog_entry.yaml) - ⎿  Read 5 lines - -● Update(~/devl/sep/policyengine-us-data/changelog_entry.yaml) - ⎿  Added 4 lines, removed 3 lines - 1 -- bump: patch - 1 +- bump: minor - 2 changes: - 3 - fixed: - 4 - - Versioning workflow checkout for push events - 3 + added: - 4 + - Support for health_insurance_premiums_without_medicare_par - +t_b in local area calibration - 5 + - Test coverage for sparse matrix builder with person-level - +targets - -● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && git remote -v) - ⎿  origin git@github.com:PolicyEngine/policyengine-us-data.git (fetch) - origin git@github.com:PolicyEngine/policyengine-us-data.git (push) - ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po - licyengine_us_data/datasets/cps/local_area_calibration - -● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && gh issue create \ - --repo PolicyEngine/policyengine-us-data \…) - ⎿  https://github.com/PolicyEngine/policyengine-us-data/issues/472 - ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po - licyengine_us_data/datasets/cps/local_area_calibration - -● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && black . -l 79 2>&1) - ⎿ Skipping .ipynb files as Jupyter dependencies are not installed. - You can fix this by running ``pip install "black[jupyter]"`` - reformatted - /home/baogorek/devl/sep/policyengine-us-data/policyengine_u - … +6 lines (ctrl+o to expand) - ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po - licyengine_us_data/datasets/cps/local_area_calibration - -● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && git status) - ⎿  On branch health-insurance-premiums - Changes not staged for commit: - (use "git add ..." to update what will be committed) - … +12 lines (ctrl+o to expand) - ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po - licyengine_us_data/datasets/cps/local_area_calibration - -● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && git add - changelog_entry.yaml policyengine_us_data/datasets/cps/local_area_calibrat - ion/build_calibration_matri…) - ⎿  On branch health-insurance-premiums - Changes to be committed: - (use "git restore --staged ..." 
to unstage) - … +9 lines (ctrl+o to expand) - ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po - licyengine_us_data/datasets/cps/local_area_calibration - -● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && git commit -m "Add - health insurance premiums to local area calibration…) - ⎿ [health-insurance-premiums 4268676] Add health insurance premiums to - lo - cal area calibration - 3 files changed, 263 insertions(+), 3 deletions(-) - … +4 lines (ctrl+o to expand) - - ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po - licyengine_us_data/datasets/cps/local_area_calibration - -● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && git push -u origin - health-insurance-premiums 2>&1) - ⎿ remote: - remote: Create a pull request for 'health-insurance-premiums' on - GitHub - by visiting: - … +13 lines (ctrl+o to expand) - ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po - licyengine_us_data/datasets/cps/local_area_calibration - -● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && gh pr create --title - "Add health insurance premiums to local area calibration" --body "## - Summary…) - ⎿  https://github.com/PolicyEngine/policyengine-us-data/pull/473 - Warning: 1 uncommitted change - ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po - licyengine_us_data/datasets/cps/local_area_calibration - -● Done! Here's the summary: - - - Issue: https://github.com/PolicyEngine/policyengine-us-data/issues/472 - - PR: https://github.com/PolicyEngine/policyengine-us-data/pull/473 - - The PR includes: - - build_calibration_matrix.py - Runner script for building X_sparse with SNAP + health insurance - - test_sparse_matrix_builder.py - 3 tests for person-level aggregation - - changelog_entry.yaml - Minor version bump - -✻ Crunched for 1m 51s - -❯ You've done a lot of work and discovery in this session, can you please - write up a report about what you've learned? This will be useful for your - future self to come back in here and be ready for the next variable, - which might not be quite as easy as this one! - -● Write(ADDING_CALIBRATION_TARGETS.md) - ⎿  Wrote 242 lines to ADDING_CALIBRATION_TARGETS.md - 1 # Adding Calibration Targets to Local Area Geo-Stacking - 2 This document summarizes key learnings from adding `health_insura - nce_premiums_without_medicare_part_b` as a calibration target. Us - e this as a reference when adding future targets. - 3 ## Key Discovery: No Code Changes Needed for Most Targets - 4 The `sparse_matrix_builder.py` is **already entity-agnostic**. Po - licyEngine's `map_to="household"` parameter automatically handles - aggregation from any entity level (person, tax_unit, spm_unit) t - o household level. - 5 ```python - 6 # This line in sparse_matrix_builder.py (line 220-222) handles AL - L entity types: - … +232 lines (ctrl+o to expand) - -● Created ADDING_CALIBRATION_TARGETS.md with: - - 1. Key Discovery: sparse_matrix_builder.py is already entity-agnostic via map_to="household" - 2. Architecture Overview: File locations, database schema, stratum groups - 3. Step-by-Step Guide: How to add new targets (5 steps) - 4. When Code Changes ARE Needed: Count variables, state-specific calcs, person-level constraints - 5. Junkyard Reference: Key patterns from the 2,400+ line reference implementation - 6. Matrix Structure: Shape, column indexing, MatrixTracer usage - 7. Common Queries: SQL snippets for exploring the database - - Would you like me to add this report to the PR as well? 
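(Aside: the report's "Matrix Structure" section refers to column indexing, which is compact enough to sketch here. This assumes CD-major ordering of the stacked blocks; in the actual builder the authoritative mapping is the `household_id_mapping` returned by `build_matrix`, not this arithmetic.)

```python
# Geo-stacked columns: one copy of every household per congressional
# district, laid out CD-major (all of CD 0's households, then CD 1's, ...).
n_households = 13_507  # stratified_extended_cps_2023.h5, per the run above
n_cds = 5

def column_of(cd_index: int, household_index: int) -> int:
    return cd_index * n_households + household_index

def split_column(col: int) -> tuple[int, int]:
    """Inverse: recover (cd_index, household_index) from a flat column."""
    return divmod(col, n_households)

assert split_column(column_of(3, 42)) == (3, 42)
assert n_cds * n_households == 67_535  # matches the 13,507 × 5 stacking
```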
- -✻ Sautéed for 53s - - - diff --git a/uv.lock b/uv.lock index 24fc41827..6ddfdd548 100644 --- a/uv.lock +++ b/uv.lock @@ -1085,6 +1085,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b1/dd/ead9d8ea85bf202d90cc513b533f9c363121c7792674f78e0d8a854b63b4/jupyterlab_pygments-0.3.0-py3-none-any.whl", hash = "sha256:841a89020971da1d8693f1a99997aefc5dc424bb1b251fd6322462a1b8842780", size = 15884, upload-time = "2023-11-23T09:26:34.325Z" }, ] +[[package]] +name = "l0-python" +version = "0.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "scipy" }, + { name = "torch" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cf/6b/4a9ca6d1eb9828c526947fffb2ee2a1d02eec330f04cd53af301a05fde0a/l0_python-0.5.0.tar.gz", hash = "sha256:9b6b1751e142702e21ed866e40d8ab47304a26a5455998620a0eb798f4c7f599", size = 36320, upload-time = "2026-01-21T13:55:53.365Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/80/33ccae8af3fe55a81d33569d9241a29cecde17ab34fdff214804e81fa353/l0_python-0.5.0-py3-none-any.whl", hash = "sha256:9c8f4532426b927a97f4722b1c5114147adb09365100623effb49c0021345881", size = 23590, upload-time = "2026-01-21T13:55:52.406Z" }, +] + [[package]] name = "lark" version = "1.3.1" @@ -1874,6 +1888,9 @@ dependencies = [ calibration = [ { name = "samplics" }, ] +l0 = [ + { name = "l0-python" }, +] [package.dev-dependencies] dev = [ @@ -1894,6 +1911,7 @@ dev = [ requires-dist = [ { name = "google-auth", specifier = ">=2.0.0" }, { name = "google-cloud-storage", specifier = ">=2.0.0" }, + { name = "l0-python", marker = "extra == 'l0'" }, { name = "microdf-python", specifier = ">=1.0.0" }, { name = "microimpute", specifier = ">=1.1.4" }, { name = "openpyxl", specifier = ">=3.1.5" }, @@ -1915,7 +1933,7 @@ requires-dist = [ { name = "us", specifier = ">=2.0.0" }, { name = "xlrd", specifier = ">=2.0.2" }, ] -provides-extras = ["calibration"] +provides-extras = ["calibration", "l0"] [package.metadata.requires-dev] dev = [ From ba25e3ee8835c1233bb380f450722d0f5db39745 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Wed, 21 Jan 2026 21:59:34 -0500 Subject: [PATCH 14/19] Rename fit_weights.py to remote_calibration_runner.py Avoid confusion with fit_calibration_weights.py which contains the actual fitting logic. Co-Authored-By: Claude Opus 4.5 --- modal_app/README.md | 4 ++-- modal_app/{fit_weights.py => remote_calibration_runner.py} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename modal_app/{fit_weights.py => remote_calibration_runner.py} (100%) diff --git a/modal_app/README.md b/modal_app/README.md index b4d9e833a..0b10cf726 100644 --- a/modal_app/README.md +++ b/modal_app/README.md @@ -11,7 +11,7 @@ Run calibration weight fitting on Modal's cloud GPUs. 
 ## Usage
 
 ```bash
-modal run modal_app/fit_weights.py --branch <branch> --epochs <epochs> --gpu <gpu>
+modal run modal_app/remote_calibration_runner.py --branch <branch> --epochs <epochs> --gpu <gpu>
 ```
 
 ### Arguments
@@ -27,7 +27,7 @@ modal run modal_app/fit_weights.py --branch <branch> --epochs <epochs> --gpu <gpu>
 ### Example
 
 ```bash
-modal run modal_app/fit_weights.py --branch health-insurance-premiums --epochs 100 --gpu T4
+modal run modal_app/remote_calibration_runner.py --branch health-insurance-premiums --epochs 100 --gpu T4
 ```
 
 ## Output Files

diff --git a/modal_app/fit_weights.py b/modal_app/remote_calibration_runner.py
similarity index 100%
rename from modal_app/fit_weights.py
rename to modal_app/remote_calibration_runner.py

From c6c9f6f540bf5a81e73b78eae73e570825b714e6 Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com"
Date: Fri, 23 Jan 2026 16:32:16 -0500
Subject: [PATCH 15/19] Fix CI: add uv lock to versioning workflow

The versioning workflow bumped the version in pyproject.toml but never
updated uv.lock, causing Modal builds that run "uv sync --locked" to
fail against the stale lockfile.

Co-Authored-By: Claude Opus 4.5
---
 .github/workflows/versioning.yaml | 4 ++++
 changelog_entry.yaml              | 2 ++
 uv.lock                           | 2 +-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/versioning.yaml b/.github/workflows/versioning.yaml
index 84667308e..48658dbc7 100644
--- a/.github/workflows/versioning.yaml
+++ b/.github/workflows/versioning.yaml
@@ -23,8 +23,12 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: 3.12
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
      - name: Build changelog
        run: pip install yaml-changelog && make changelog
+      - name: Update lockfile
+        run: uv lock
      - name: Preview changelog update
        run: ".github/get-changelog-diff.sh"
      - name: Update changelog

diff --git a/changelog_entry.yaml b/changelog_entry.yaml
index c38106204..5dbe417a7 100644
--- a/changelog_entry.yaml
+++ b/changelog_entry.yaml
@@ -3,3 +3,5 @@
     added:
       - Support for health_insurance_premiums_without_medicare_part_b in local area calibration
       - Test coverage for sparse matrix builder with person-level targets
+    fixed:
+      - Versioning workflow now runs uv lock after version bump to keep uv.lock in sync

diff --git a/uv.lock b/uv.lock
index 6ddfdd548..bf45a4519 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1858,7 +1858,7 @@ wheels = [
 
 [[package]]
 name = "policyengine-us-data"
-version = "1.51.1"
+version = "1.53.0"
 source = { editable = "." }
 dependencies = [
     { name = "google-auth" },

From 20572be0eede28f33ce1daed64adf4b3329b4c06 Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com"
Date: Fri, 23 Jan 2026 18:49:58 -0500
Subject: [PATCH 16/19] Streamline data build: remove
 TEST_LITE/LOCAL_AREA_CALIBRATION, eliminate dense reweighting

Remove the dense optimization path from enhanced CPS (only sparse L0
weights are needed), eliminating the mv/cp workaround in the Makefile.
Remove both the TEST_LITE and LOCAL_AREA_CALIBRATION environment
variable flags, generating all datasets unconditionally in a single
pass. Merge the data-local-area target into the data target.
Co-Authored-By: Claude Opus 4.5
---
 .github/workflows/reusable_test.yaml          |  3 +-
 Makefile                                      |  9 +-
 changelog_entry.yaml                          |  4 +
 modal_app/data_build.py                       | 49 +---------
 policyengine_us_data/datasets/cps/cps.py      | 29 ++----
 .../datasets/cps/enhanced_cps.py              | 90 ++-----------------
 .../datasets/cps/extended_cps.py              | 11 +--
 .../datasets/cps/small_enhanced_cps.py        |  4 +-
 policyengine_us_data/datasets/puf/puf.py      | 14 +--
 .../test_datasets/test_sparse_enhanced_cps.py |  4 +-
 10 files changed, 37 insertions(+), 180 deletions(-)

diff --git a/.github/workflows/reusable_test.yaml b/.github/workflows/reusable_test.yaml
index 95d552810..4575a508c 100644
--- a/.github/workflows/reusable_test.yaml
+++ b/.github/workflows/reusable_test.yaml
@@ -65,8 +65,7 @@ jobs:
         run: |
           modal run modal_app/data_build.py \
             ${{ inputs.upload_data && '--upload' || '--no-upload' }} \
-            --branch=${{ github.head_ref || github.ref_name }} \
-            ${{ inputs.upload_data && '--no-test-lite' || '--test-lite' }}
+            --branch=${{ github.head_ref || github.ref_name }}
 
       - name: Install package
         run: uv sync --dev

diff --git a/Makefile b/Makefile
index 4d5f5befa..fd212a08e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: all format test install download upload docker documentation data data-local-area publish-local-area clean build paper clean-paper presentations
+.PHONY: all format test install download upload docker documentation data publish-local-area clean build paper clean-paper presentations
 
 all: data test
 
@@ -71,13 +71,6 @@ data: download
 	python policyengine_us_data/datasets/cps/extended_cps.py
 	python policyengine_us_data/datasets/cps/enhanced_cps.py
 	python policyengine_us_data/datasets/cps/small_enhanced_cps.py
-	mv policyengine_us_data/storage/enhanced_cps_2024.h5 policyengine_us_data/storage/dense_enhanced_cps_2024.h5
-	cp policyengine_us_data/storage/sparse_enhanced_cps_2024.h5 policyengine_us_data/storage/enhanced_cps_2024.h5
-
-data-local-area: data
-	LOCAL_AREA_CALIBRATION=true python policyengine_us_data/datasets/cps/cps.py
-	LOCAL_AREA_CALIBRATION=true python policyengine_us_data/datasets/puf/puf.py
-	LOCAL_AREA_CALIBRATION=true python policyengine_us_data/datasets/cps/extended_cps.py
 	python policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py 10500
 
 publish-local-area:

diff --git a/changelog_entry.yaml b/changelog_entry.yaml
index 5dbe417a7..70e68eb73 100644
--- a/changelog_entry.yaml
+++ b/changelog_entry.yaml
@@ -3,5 +3,9 @@
     added:
       - Support for health_insurance_premiums_without_medicare_part_b in local area calibration
       - Test coverage for sparse matrix builder with person-level targets
+    changed:
+      - Removed dense reweighting path from enhanced CPS; only sparse (L0) weights are produced
+      - Eliminated TEST_LITE and LOCAL_AREA_CALIBRATION flags; all datasets generated unconditionally
+      - Merged data-local-area Makefile target into data target
     fixed:
       - Versioning workflow now runs uv lock after version bump to keep uv.lock in sync

diff --git a/modal_app/data_build.py b/modal_app/data_build.py
index 528035684..f56b96a72 100644
--- a/modal_app/data_build.py
+++ b/modal_app/data_build.py
@@ -38,7 +38,6 @@ def setup_gcp_credentials():
 def build_datasets(
     upload: bool = False,
     branch: str = "main",
-    test_lite: bool = False,
 ):
     setup_gcp_credentials()
@@ -49,8 +48,6 @@
     subprocess.run(["uv", "sync", "--locked"], check=True)
 
     env = os.environ.copy()
-    if test_lite:
-        env["TEST_LITE"] = "true"
 
     # Download prerequisites
     subprocess.run(
@@ -79,44 +76,8 @@
print(f"Running {script}...") subprocess.run(["uv", "run", "python", script], check=True, env=env) - os.rename( - "policyengine_us_data/storage/enhanced_cps_2024.h5", - "policyengine_us_data/storage/dense_enhanced_cps_2024.h5", - ) - subprocess.run( - [ - "cp", - "policyengine_us_data/storage/sparse_enhanced_cps_2024.h5", - "policyengine_us_data/storage/enhanced_cps_2024.h5", - ], - check=True, - ) - - # Build local area calibration datasets (without TEST_LITE - must match full dataset) - print("Building local area calibration datasets...") - local_area_env = os.environ.copy() - local_area_env["LOCAL_AREA_CALIBRATION"] = "true" - - subprocess.run( - ["uv", "run", "python", "policyengine_us_data/datasets/cps/cps.py"], - check=True, - env=local_area_env, - ) - subprocess.run( - ["uv", "run", "python", "policyengine_us_data/datasets/puf/puf.py"], - check=True, - env=local_area_env, - ) - subprocess.run( - [ - "uv", - "run", - "python", - "policyengine_us_data/datasets/cps/extended_cps.py", - ], - check=True, - env=local_area_env, - ) + # Build stratified CPS for local area calibration + print("Running create_stratified_cps.py...") subprocess.run( [ "uv", @@ -126,7 +87,7 @@ def build_datasets( "10500", ], check=True, - env=local_area_env, + env=env, ) # Run local area calibration tests @@ -140,7 +101,7 @@ def build_datasets( "-v", ], check=True, - env=local_area_env, + env=env, ) # Run main test suite @@ -167,11 +128,9 @@ def build_datasets( def main( upload: bool = False, branch: str = "main", - test_lite: bool = False, ): result = build_datasets.remote( upload=upload, branch=branch, - test_lite=test_lite, ) print(result) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 27a41bec7..249e40e5d 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -15,9 +15,6 @@ from microimpute.models.qrf import QRF import logging -test_lite = os.environ.get("TEST_LITE") == "true" -print(f"TEST_LITE == {test_lite}") - class CPS(Dataset): name = "cps" @@ -2141,21 +2138,13 @@ class Pooled_3_Year_CPS_2023(PooledCPS): url = "hf://policyengine/policyengine-us-data/pooled_3_year_cps_2023.h5" -local_area_calibration = os.environ.get("LOCAL_AREA_CALIBRATION") == "true" - if __name__ == "__main__": - if test_lite: - CPS_2024().generate() - CPS_2025().generate() - elif local_area_calibration: - CPS_2023_Full().generate() - else: - CPS_2021().generate() - CPS_2022().generate() - CPS_2023().generate() - CPS_2024().generate() - CPS_2025().generate() - CPS_2021_Full().generate() - CPS_2022_Full().generate() - CPS_2023_Full().generate() - Pooled_3_Year_CPS_2023().generate() + CPS_2021().generate() + CPS_2022().generate() + CPS_2023().generate() + CPS_2024().generate() + CPS_2025().generate() + CPS_2021_Full().generate() + CPS_2022_Full().generate() + CPS_2023_Full().generate() + Pooled_3_Year_CPS_2023().generate() diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 4eb0a660b..dc8f50402 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -18,8 +18,6 @@ CPS_2019, CPS_2024, ) -import os -from pathlib import Path import logging try: @@ -32,7 +30,6 @@ def reweight( original_weights, loss_matrix, targets_array, - dropout_rate=0.05, log_path="calibration_log.csv", epochs=500, l0_lambda=2.6445e-07, @@ -40,7 +37,6 @@ def reweight( temperature=0.25, seed=1456, ): - set_seeds(seed) target_names = 
     target_names = np.array(loss_matrix.columns)
     is_national = loss_matrix.columns.str.startswith("nation/")
     loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
@@ -53,14 +49,10 @@ def reweight(
         normalisation_factor, dtype=torch.float32
     )
     targets_array = torch.tensor(targets_array, dtype=torch.float32)
-    weights = torch.tensor(
-        np.log(original_weights), requires_grad=True, dtype=torch.float32
-    )
 
     inv_mean_normalisation = 1 / np.mean(normalisation_factor.numpy())
 
     def loss(weights):
-        # Check for Nans in either the weights or the loss matrix
         if torch.isnan(weights).any():
             raise ValueError("Weights contain NaNs")
         if torch.isnan(loss_matrix).any():
@@ -78,75 +70,11 @@ def loss(weights):
             raise ValueError("Relative error contains NaNs")
         return rel_error_normalized.mean()
 
-    def dropout_weights(weights, p):
-        if p == 0:
-            return weights
-        # Replace p% of the weights with the mean value of the rest of them
-        mask = torch.rand_like(weights) < p
-        mean = weights[~mask].mean()
-        masked_weights = weights.clone()
-        masked_weights[mask] = mean
-        return masked_weights
-
-    # Original (Dense) path ---
-    optimizer = torch.optim.Adam([weights], lr=3e-1)
-    start_loss = None
-
-    iterator = trange(epochs)
-    performance = pd.DataFrame()
-    for i in iterator:
-        optimizer.zero_grad()
-        weights_ = dropout_weights(weights, dropout_rate)
-        l = loss(torch.exp(weights_))
-        if (log_path is not None) and (i % 10 == 0):
-            estimates = torch.exp(weights) @ loss_matrix
-            estimates = estimates.detach().numpy()
-            df = pd.DataFrame(
-                {
-                    "target_name": target_names,
-                    "estimate": estimates,
-                    "target": targets_array.detach().numpy(),
-                }
-            )
-            df["epoch"] = i
-            df["error"] = df.estimate - df.target
-            df["rel_error"] = df.error / df.target
-            df["abs_error"] = df.error.abs()
-            df["rel_abs_error"] = df.rel_error.abs()
-            df["loss"] = df.rel_abs_error**2
-            performance = pd.concat([performance, df], ignore_index=True)
-
-        if (log_path is not None) and (i % 1000 == 0):
-            performance.to_csv(log_path, index=False)
-        if start_loss is None:
-            start_loss = l.item()
-        loss_rel_change = (l.item() - start_loss) / start_loss
-        l.backward()
-        iterator.set_postfix(
-            {"loss": l.item(), "loss_rel_change": loss_rel_change}
-        )
-        optimizer.step()
-    if log_path is not None:
-        performance.to_csv(log_path, index=False)
-
-    final_weights_dense = torch.exp(weights).detach().numpy()
-
-    optimised_weights = final_weights_dense
-    print_reweighting_diagnostics(
-        final_weights_dense,
-        loss_matrix,
-        targets_array,
-        "Dense Solutions",
-    )
-
-    # New (Sparse) path depending on temperature, init_mean, l0_lambda -----
     logging.info(
         f"Sparse optimization using seed {seed}, temp {temperature} "
         + f"init_mean {init_mean}, l0_lambda {l0_lambda}"
     )
     set_seeds(seed)
-    p = Path(log_path)
-    log_path_sparse = p.with_name(f"{p.stem}_sparse{p.suffix}")
 
     weights = torch.tensor(
         np.log(original_weights), requires_grad=True, dtype=torch.float32
@@ -166,7 +94,7 @@ def dropout_weights(weights, p):
         masked = torch.exp(weights) * gates()
         l_main = loss(masked)
         l = l_main + l0_lambda * gates.get_penalty()
-        if (log_path_sparse is not None) and (i % 10 == 0):
+        if (log_path is not None) and (i % 10 == 0):
             gates.eval()
             estimates = (torch.exp(weights) * gates()) @ loss_matrix
             gates.train()
@@ -186,8 +114,8 @@ def dropout_weights(weights, p):
             df["loss"] = df.rel_abs_error**2
             performance = pd.concat([performance, df], ignore_index=True)
 
-        if (log_path_sparse is not None) and (i % 1000 == 0):
-            performance.to_csv(log_path_sparse, index=False)
+        if (log_path is not None) and (i % 1000 == 0):
+            performance.to_csv(log_path, index=False)
         if start_loss is None:
             start_loss = l.item()
         loss_rel_change = (l.item() - start_loss) / start_loss
@@ -196,8 +124,8 @@ def dropout_weights(weights, p):
             {"loss": l.item(), "loss_rel_change": loss_rel_change}
         )
         optimizer.step()
-    if log_path_sparse is not None:
-        performance.to_csv(log_path_sparse, index=False)
+    if log_path is not None:
+        performance.to_csv(log_path, index=False)
 
     gates.eval()
     final_weights_sparse = (torch.exp(weights) * gates()).detach().numpy()
@@ -209,7 +137,7 @@ def dropout_weights(weights, p):
         "L0 Sparse Solution",
     )
 
-    return final_weights_dense, final_weights_sparse
+    return final_weights_sparse
 
 
 def train_previous_year_income_model():
@@ -268,7 +196,6 @@ def generate(self):
         sim = Microsimulation(dataset=self.input_dataset)
         data = sim.dataset.load_dataset()
         data["household_weight"] = {}
-        data["household_sparse_weight"] = {}
         original_weights = sim.calculate("household_weight")
         original_weights = original_weights.values + np.random.normal(
             1, 0.1, len(original_weights)
@@ -309,7 +236,7 @@ def generate(self):
             targets_array_clean = targets_array[keep_idx]
             assert loss_matrix_clean.shape[1] == targets_array_clean.size
 
-            optimised_weights_dense, optimised_weights_sparse = reweight(
+            optimised_weights = reweight(
                 original_weights,
                 loss_matrix_clean,
                 targets_array_clean,
@@ -317,8 +244,7 @@ def generate(self):
                 epochs=200,
                 seed=1456,
             )
-            data["household_weight"][year] = optimised_weights_dense
-            data["household_sparse_weight"][year] = optimised_weights_sparse
+            data["household_weight"][year] = optimised_weights
 
         self.save_dataset(data)

diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
index 350aa642d..e8ce3d506 100644
--- a/policyengine_us_data/datasets/cps/extended_cps.py
+++ b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -4,7 +4,6 @@
 from policyengine_us_data.datasets.cps.cps import *
 from policyengine_us_data.datasets.puf import *
 import pandas as pd
-import os
 from microimpute.models.qrf import QRF
 import time
 import logging
@@ -339,11 +338,5 @@ class ExtendedCPS_2024(ExtendedCPS):
 
 
 if __name__ == "__main__":
-    local_area_calibration = (
-        os.environ.get("LOCAL_AREA_CALIBRATION", "").lower() == "true"
-    )
-
-    if local_area_calibration:
-        ExtendedCPS_2023().generate()
-    else:
-        ExtendedCPS_2024().generate()
+    ExtendedCPS_2023().generate()
+    ExtendedCPS_2024().generate()

diff --git a/policyengine_us_data/datasets/cps/small_enhanced_cps.py b/policyengine_us_data/datasets/cps/small_enhanced_cps.py
index a9679a2a6..e4d2c05ea 100644
--- a/policyengine_us_data/datasets/cps/small_enhanced_cps.py
+++ b/policyengine_us_data/datasets/cps/small_enhanced_cps.py
@@ -50,7 +50,7 @@ def create_sparse_ecps():
     ecps = EnhancedCPS_2024()
     h5 = ecps.load()
-    sparse_weights = h5["household_sparse_weight"][str(time_period)][:]
+    sparse_weights = h5["household_weight"][str(time_period)][:]
     hh_ids = h5["household_id"][str(time_period)][:]
 
     template_sim = Microsimulation(
@@ -102,7 +102,7 @@ def create_sparse_ecps():
         if len(data[variable]) == 0:
             del data[variable]
 
-    with h5py.File(STORAGE_FOLDER / "sparse_enhanced_cps_2024.h5", "w") as f:
+    with h5py.File(STORAGE_FOLDER / "enhanced_cps_2024.h5", "w") as f:
         for variable, periods in data.items():
             grp = f.create_group(variable)
             for period, values in periods.items():
diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py
index c90255e3f..bc10094a4 100644
--- a/policyengine_us_data/datasets/puf/puf.py
+++ b/policyengine_us_data/datasets/puf/puf.py
@@ -754,13 +754,7 @@ class PUF_2024(PUF):
 }
 
 if __name__ == "__main__":
-    import os
-
-    local_area_calibration = os.environ.get("LOCAL_AREA_CALIBRATION") == "true"
-
-    if local_area_calibration:
-        PUF_2023().generate()
-    else:
-        PUF_2015().generate()
-        PUF_2021().generate()
-        PUF_2024().generate()
+    PUF_2015().generate()
+    PUF_2021().generate()
+    PUF_2023().generate()
+    PUF_2024().generate()

diff --git a/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py
index 90c9f8c4d..96e4a996c 100644
--- a/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py
+++ b/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py
@@ -17,7 +17,7 @@
 
 @pytest.fixture(scope="session")
 def data():
-    return Dataset.from_file(STORAGE_FOLDER / "sparse_enhanced_cps_2024.h5")
+    return Dataset.from_file(STORAGE_FOLDER / "enhanced_cps_2024.h5")
 
 
 @pytest.fixture(scope="session")
@@ -93,7 +93,7 @@ def test_sparse_ecps_has_tips(sim):
 
 def test_sparse_ecps_replicates_jct_tax_expenditures():
     calibration_log = pd.read_csv(
-        "calibration_log_sparse.csv",
+        "calibration_log.csv",
     )
 
     jct_rows = calibration_log[
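A reviewer note on the patch above: reweight() now has a single optimization path, in which each household weight is multiplied by a stochastic gate in [0, 1] and the objective adds l0_lambda times the expected number of open gates. The sketch below is a self-contained illustration of that mechanism using the textbook hard-concrete gate (Louizos et al. 2017); the class name and internals here are assumptions for illustration only — the project's actual gate module comes from the l0-python dependency, whose API is not shown in this series.

```python
import math
import torch


class HardConcreteGates(torch.nn.Module):
    """Illustrative stand-in for the L0 gate module used in reweight().

    Training-mode forward() samples stochastic gates in [0, 1] from the
    hard-concrete distribution; eval-mode forward() is deterministic.
    get_penalty() returns the expected L0 norm (open-gate count).
    """

    def __init__(self, n, temperature=0.25, init_mean=0.999):
        super().__init__()
        p = torch.full((n,), init_mean)
        # logit of the initial open-probability, so gates start open
        self.log_alpha = torch.nn.Parameter(torch.logit(p))
        self.temperature = temperature
        self.gamma, self.zeta = -0.1, 1.1  # stretch interval

    def forward(self):
        if self.training:
            u = torch.rand_like(self.log_alpha).clamp(1e-6, 1 - 1e-6)
            s = torch.sigmoid(
                (torch.logit(u) + self.log_alpha) / self.temperature
            )
        else:
            s = torch.sigmoid(self.log_alpha)
        # stretch to (gamma, zeta), then clamp: this yields exact zeros
        return (s * (self.zeta - self.gamma) + self.gamma).clamp(0.0, 1.0)

    def get_penalty(self):
        # P(gate != 0), summed over all gates
        return torch.sigmoid(
            self.log_alpha
            - self.temperature * math.log(-self.gamma / self.zeta)
        ).sum()


# The objective shape the diff keeps: relative-error loss on the gated
# weights plus the L0 penalty (toy loss_matrix/targets for demonstration).
torch.manual_seed(0)
n_households, n_targets = 1000, 50
log_weights = torch.randn(n_households, requires_grad=True)
gates = HardConcreteGates(n_households)
loss_matrix = torch.rand(n_households, n_targets)
targets = loss_matrix.T @ torch.full((n_households,), 20.0)

l0_lambda = 2.6445e-07
optimizer = torch.optim.Adam([log_weights, *gates.parameters()], lr=0.1)
for _ in range(200):
    optimizer.zero_grad()
    masked = torch.exp(log_weights) * gates()
    rel_error = ((masked @ loss_matrix - targets) / targets) ** 2
    (rel_error.mean() + l0_lambda * gates.get_penalty()).backward()
    optimizer.step()

gates.eval()
final = (torch.exp(log_weights) * gates()).detach()
print(f"non-zero weights: {(final > 0).sum().item()} / {n_households}")
```

The clamp after the stretch is what produces exact zeros, so the fitted weight vector is genuinely sparse rather than merely small — which is why the dense path could be dropped outright.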
"--dataset-path", dataset_path, + "uv", + "run", + "python", + script_path, + "--device", + "cuda", + "--epochs", + str(epochs), + "--db-path", + db_path, + "--dataset-path", + dataset_path, ], capture_output=True, text=True, @@ -74,58 +84,78 @@ def _fit_weights_impl(branch: str, epochs: int) -> dict: output_path = None log_path = None - for line in result.stdout.split('\n'): - if 'OUTPUT_PATH:' in line: - output_path = line.split('OUTPUT_PATH:')[1].strip() - elif 'LOG_PATH:' in line: - log_path = line.split('LOG_PATH:')[1].strip() + for line in result.stdout.split("\n"): + if "OUTPUT_PATH:" in line: + output_path = line.split("OUTPUT_PATH:")[1].strip() + elif "LOG_PATH:" in line: + log_path = line.split("LOG_PATH:")[1].strip() - with open(output_path, 'rb') as f: + with open(output_path, "rb") as f: weights_bytes = f.read() log_bytes = None if log_path: - with open(log_path, 'rb') as f: + with open(log_path, "rb") as f: log_bytes = f.read() return {"weights": weights_bytes, "log": log_bytes} @app.function( - image=image, secrets=[hf_secret], memory=32768, cpu=4.0, - gpu="T4", timeout=14400, + image=image, + secrets=[hf_secret], + memory=32768, + cpu=4.0, + gpu="T4", + timeout=14400, ) def fit_weights_t4(branch: str = "main", epochs: int = 200) -> dict: return _fit_weights_impl(branch, epochs) @app.function( - image=image, secrets=[hf_secret], memory=32768, cpu=4.0, - gpu="A10", timeout=14400, + image=image, + secrets=[hf_secret], + memory=32768, + cpu=4.0, + gpu="A10", + timeout=14400, ) def fit_weights_a10(branch: str = "main", epochs: int = 200) -> dict: return _fit_weights_impl(branch, epochs) @app.function( - image=image, secrets=[hf_secret], memory=32768, cpu=4.0, - gpu="A100-40GB", timeout=14400, + image=image, + secrets=[hf_secret], + memory=32768, + cpu=4.0, + gpu="A100-40GB", + timeout=14400, ) def fit_weights_a100_40(branch: str = "main", epochs: int = 200) -> dict: return _fit_weights_impl(branch, epochs) @app.function( - image=image, secrets=[hf_secret], memory=32768, cpu=4.0, - gpu="A100-80GB", timeout=14400, + image=image, + secrets=[hf_secret], + memory=32768, + cpu=4.0, + gpu="A100-80GB", + timeout=14400, ) def fit_weights_a100_80(branch: str = "main", epochs: int = 200) -> dict: return _fit_weights_impl(branch, epochs) @app.function( - image=image, secrets=[hf_secret], memory=32768, cpu=4.0, - gpu="H100", timeout=14400, + image=image, + secrets=[hf_secret], + memory=32768, + cpu=4.0, + gpu="H100", + timeout=14400, ) def fit_weights_h100(branch: str = "main", epochs: int = 200) -> dict: return _fit_weights_impl(branch, epochs) @@ -146,7 +176,7 @@ def main( epochs: int = 200, gpu: str = "T4", output: str = "calibration_weights.npy", - log_output: str = "calibration_log.csv" + log_output: str = "calibration_log.csv", ): if gpu not in GPU_FUNCTIONS: raise ValueError( @@ -157,11 +187,11 @@ def main( func = GPU_FUNCTIONS[gpu] result = func.remote(branch=branch, epochs=epochs) - with open(output, 'wb') as f: + with open(output, "wb") as f: f.write(result["weights"]) print(f"Weights saved to: {output}") if result["log"]: - with open(log_output, 'wb') as f: + with open(log_output, "wb") as f: f.write(result["log"]) print(f"Calibration log saved to: {log_output}") diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py b/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py index 79fb18b36..da3dffc05 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py +++ 
+++ b/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py
@@ -78,9 +78,13 @@ def create_stratified_cps_dataset(
     n_middle = n_households_orig - n_top - n_bottom_25
 
     print(f"\nStratum sizes:")
-    print(f"  Top {100 - high_income_percentile}% (AGI >= ${high_income_threshold:,.0f}): {n_top:,}")
+    print(
+        f"  Top {100 - high_income_percentile}% (AGI >= ${high_income_threshold:,.0f}): {n_top:,}"
+    )
     print(f"  Middle 25-{high_income_percentile}%: {n_middle:,}")
-    print(f"  Bottom 25% (AGI < ${bottom_25_pct_threshold:,.0f}): {n_bottom_25:,}")
+    print(
+        f"  Bottom 25% (AGI < ${bottom_25_pct_threshold:,.0f}): {n_bottom_25:,}"
+    )
 
     # Calculate sampling rates
     # Keep ALL top earners, distribute remaining quota between middle and bottom
@@ -131,7 +135,9 @@ def create_stratified_cps_dataset(
     # Top earners - keep all
     top_mask = agi >= high_income_threshold
     selected_mask[top_mask] = True
-    print(f"  Top {100 - high_income_percentile}%: selected {np.sum(top_mask):,}")
+    print(
+        f"  Top {100 - high_income_percentile}%: selected {np.sum(top_mask):,}"
+    )
 
     # Bottom 25%
     bottom_mask = agi < bottom_25_pct_threshold
@@ -140,11 +146,17 @@ def create_stratified_cps_dataset(
     if r_bottom >= 1.0:
         selected_mask[bottom_indices] = True
     elif n_select_bottom > 0:
-        selected_bottom = np.random.choice(bottom_indices, n_select_bottom, replace=False)
+        selected_bottom = np.random.choice(
+            bottom_indices, n_select_bottom, replace=False
+        )
         selected_mask[selected_bottom] = True
     else:
-        print(f"  WARNING: Bottom 25% selection rounded to 0 (rate={r_bottom:.4f}, n={len(bottom_indices)})")
-    print(f"  Bottom 25%: selected {np.sum(selected_mask & bottom_mask):,} / {len(bottom_indices):,}")
+        print(
+            f"  WARNING: Bottom 25% selection rounded to 0 (rate={r_bottom:.4f}, n={len(bottom_indices)})"
+        )
+    print(
+        f"  Bottom 25%: selected {np.sum(selected_mask & bottom_mask):,} / {len(bottom_indices):,}"
+    )
 
     # Middle
     middle_mask = ~top_mask & ~bottom_mask
@@ -153,17 +165,27 @@ def create_stratified_cps_dataset(
     if r_middle >= 1.0:
         selected_mask[middle_indices] = True
     elif n_select_middle > 0:
-        selected_middle = np.random.choice(middle_indices, n_select_middle, replace=False)
+        selected_middle = np.random.choice(
+            middle_indices, n_select_middle, replace=False
+        )
         selected_mask[selected_middle] = True
     else:
-        print(f"  WARNING: Middle selection rounded to 0 (rate={r_middle:.4f}, n={len(middle_indices)})")
-    print(f"  Middle 25-{high_income_percentile}%: selected {np.sum(selected_mask & middle_mask):,} / {len(middle_indices):,}")
+        print(
+            f"  WARNING: Middle selection rounded to 0 (rate={r_middle:.4f}, n={len(middle_indices)})"
+        )
+    print(
+        f"  Middle 25-{high_income_percentile}%: selected {np.sum(selected_mask & middle_mask):,} / {len(middle_indices):,}"
+    )
 
     n_selected = np.sum(selected_mask)
-    print(f"\nTotal selected: {n_selected:,} households ({n_selected/n_households_orig:.1%} of original)")
+    print(
+        f"\nTotal selected: {n_selected:,} households ({n_selected/n_households_orig:.1%} of original)"
+    )
 
     # Verify high earners are preserved
-    print(f"\nHigh earners (>=${high_income_threshold:,.0f}): {np.sum(selected_mask & top_mask):,} / {n_top:,} (100%)")
+    print(
+        f"\nHigh earners (>=${high_income_threshold:,.0f}): {np.sum(selected_mask & top_mask):,} / {n_top:,} (100%)"
+    )
 
     # Get the selected household IDs
     selected_household_ids = set(household_ids[selected_mask])
@@ -328,8 +350,12 @@
Created: {output_file}") print("\nUsage:") - print(" python create_stratified_cps.py [target] [--top=99] [--oversample-poor] [--seed=N]") + print( + " python create_stratified_cps.py [target] [--top=99] [--oversample-poor] [--seed=N]" + ) print("\nExamples:") print(" python create_stratified_cps.py 30000") - print(" python create_stratified_cps.py 50000 --top=99.5 --oversample-poor") + print( + " python create_stratified_cps.py 50000 --top=99.5 --oversample-poor" + ) print(" python create_stratified_cps.py 30000 --seed=123 # reproducible") diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py index bcf647ff7..c88fa645a 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py @@ -14,20 +14,21 @@ parser = argparse.ArgumentParser(description="Fit calibration weights") parser.add_argument( - "--device", default="cpu", choices=["cpu", "cuda"], - help="Device for training (cpu or cuda)" + "--device", + default="cpu", + choices=["cpu", "cuda"], + help="Device for training (cpu or cuda)", ) parser.add_argument( - "--epochs", type=int, default=100, - help="Total epochs for training" + "--epochs", type=int, default=100, help="Total epochs for training" ) parser.add_argument( - "--db-path", default=None, - help="Path to policy_data.db (default: STORAGE_FOLDER/calibration/policy_data.db)" + "--db-path", + default=None, + help="Path to policy_data.db (default: STORAGE_FOLDER/calibration/policy_data.db)", ) parser.add_argument( - "--dataset-path", default=None, - help="Path to stratified CPS h5 file" + "--dataset-path", default=None, help="Path to stratified CPS h5 file" ) args = parser.parse_args() @@ -184,17 +185,19 @@ with torch.no_grad(): predictions = model.predict(X_sparse).cpu().numpy() - chunk_df = pd.DataFrame({ - "target_name": target_names, - "estimate": predictions, - "target": targets, - }) + chunk_df = pd.DataFrame( + { + "target_name": target_names, + "estimate": predictions, + "target": targets, + } + ) chunk_df["epoch"] = current_epoch chunk_df["error"] = chunk_df.estimate - chunk_df.target chunk_df["rel_error"] = chunk_df.error / chunk_df.target chunk_df["abs_error"] = chunk_df.error.abs() chunk_df["rel_abs_error"] = chunk_df.rel_error.abs() - chunk_df["loss"] = chunk_df.rel_abs_error ** 2 + chunk_df["loss"] = chunk_df.rel_abs_error**2 calibration_log = pd.concat([calibration_log, chunk_df], ignore_index=True) # ============================================================================ diff --git a/policyengine_us_data/datasets/cps/small_enhanced_cps.py b/policyengine_us_data/datasets/cps/small_enhanced_cps.py index e4d2c05ea..5e099bec5 100644 --- a/policyengine_us_data/datasets/cps/small_enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/small_enhanced_cps.py @@ -52,13 +52,15 @@ def create_sparse_ecps(): h5 = ecps.load() sparse_weights = h5["household_weight"][str(time_period)][:] hh_ids = h5["household_id"][str(time_period)][:] + h5.close() template_sim = Microsimulation( dataset=EnhancedCPS_2024, ) template_sim.set_input("household_weight", time_period, sparse_weights) - df = template_sim.to_input_dataframe() # Not at household level + df = template_sim.to_input_dataframe() + del template_sim household_weight_column = f"household_weight__{time_period}" df_household_id_column = f"household_id__{time_period}" From 
From f7c3bb38c655229889727ef9c978c7f55008cd86 Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com"
Date: Mon, 26 Jan 2026 10:22:05 -0500
Subject: [PATCH 18/19] Add snap to calibration targets, remove redundant
 files

- Add "snap" to the variables filter in fit_calibration_weights.py to
  include the national SNAP targets, which sit in a different
  stratum_group_id than 4
- Remove redundant test_sparse_matrix_builder.py (tests consolidated in
  test_matrix_national_variation.py)
- Remove redundant build_calibration_matrix.py (functionality covered
  by fit_calibration_weights.py)

Co-Authored-By: Claude Opus 4.5
---
 changelog_entry.yaml                    |   3 +-
 .../build_calibration_matrix.py         | 195 ------------------
 .../fit_calibration_weights.py          |   5 +-
 .../tests/test_sparse_matrix_builder.py |  64 ------
 4 files changed, 6 insertions(+), 261 deletions(-)
 delete mode 100644 policyengine_us_data/datasets/cps/local_area_calibration/build_calibration_matrix.py
 delete mode 100644 policyengine_us_data/tests/test_sparse_matrix_builder.py

diff --git a/changelog_entry.yaml b/changelog_entry.yaml
index 70e68eb73..06397409d 100644
--- a/changelog_entry.yaml
+++ b/changelog_entry.yaml
@@ -2,10 +2,11 @@
   changes:
     added:
       - Support for health_insurance_premiums_without_medicare_part_b in local area calibration
-      - Test coverage for sparse matrix builder with person-level targets
     changed:
       - Removed dense reweighting path from enhanced CPS; only sparse (L0) weights are produced
       - Eliminated TEST_LITE and LOCAL_AREA_CALIBRATION flags; all datasets generated unconditionally
       - Merged data-local-area Makefile target into data target
+    removed:
+      - Redundant test_sparse_matrix_builder.py (tests consolidated in test_matrix_national_variation.py)
     fixed:
       - Versioning workflow now runs uv lock after version bump to keep uv.lock in sync

diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/build_calibration_matrix.py b/policyengine_us_data/datasets/cps/local_area_calibration/build_calibration_matrix.py
deleted file mode 100644
index 185914a5c..000000000
--- a/policyengine_us_data/datasets/cps/local_area_calibration/build_calibration_matrix.py
+++ /dev/null
@@ -1,195 +0,0 @@
-"""
-Build calibration matrix for geo-stacking reweighting.
-Generates X_sparse and target vector, prints diagnostics using MatrixTracer.
-""" - -import logging - -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) - -import numpy as np -import pandas as pd -from policyengine_us import Microsimulation -from policyengine_us_data.storage import STORAGE_FOLDER -from sparse_matrix_builder import SparseMatrixBuilder -from matrix_tracer import MatrixTracer -from calibration_utils import create_target_groups - -# ============================================================================ -# CONFIGURATION -# ============================================================================ -db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" -db_uri = f"sqlite:///{db_path}" -time_period = 2023 - -# Base dataset for geo-stacking: stratified extended CPS -dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2023.h5" - -cds_to_calibrate = [ - "101", # Alabama CD-1 - "601", # California CD-1 - "602", # California CD-2 - "3601", # New York CD-1 - "4801", # Texas CD-1 -] - -print(f"Testing with {len(cds_to_calibrate)} congressional districts") - -# ============================================================================ -# STEP 1: LOAD SIMULATION FROM EXTENDED CPS -# ============================================================================ -print(f"\nLoading simulation from {dataset_path}...") -sim = Microsimulation(dataset=str(dataset_path)) -n_households = len(sim.calculate("household_id", map_to="household").values) -print(f"Loaded {n_households:,} households") - -# ============================================================================ -# STEP 2: BUILD SPARSE MATRIX WITH COMBINED TARGETS -# ============================================================================ -print("\nBuilding sparse matrix...") -builder = SparseMatrixBuilder( - db_uri=db_uri, - time_period=time_period, - cds_to_calibrate=cds_to_calibrate, - dataset_path=None, -) - -# SNAP targets (stratum_group_id=4) + specific health insurance variable -# Uses OR logic: gets all SNAP targets OR the health insurance target -targets_df, X_sparse, household_id_mapping = builder.build_matrix( - sim, - target_filter={ - "stratum_group_ids": [4], - "variables": ["health_insurance_premiums_without_medicare_part_b"], - }, -) - -print(f"\nMatrix built successfully:") -print(f" Shape: {X_sparse.shape}") -print(f" Targets: {len(targets_df)}") -nnz = X_sparse.nnz -total = X_sparse.shape[0] * X_sparse.shape[1] -print(f" Sparsity: {1 - nnz / total:.4%}") - -# ============================================================================ -# STEP 3: EXTRACT TARGET VECTOR -# ============================================================================ -target_vector = targets_df["value"].values -print(f"\nTarget vector shape: {target_vector.shape}") -print(f"Target total: ${target_vector.sum():,.0f}") - -# ============================================================================ -# STEP 4: HEALTH INSURANCE PREMIUM VERIFICATION -# ============================================================================ -print("\n" + "=" * 80) -print("HEALTH INSURANCE PREMIUM TARGET ANALYSIS") -print("=" * 80) - -health_ins_targets = targets_df[ - targets_df["variable"] - == "health_insurance_premiums_without_medicare_part_b" -] - -if len(health_ins_targets) > 0: - print(f"\nFound {len(health_ins_targets)} health insurance target(s):") - print( - health_ins_targets[ - [ - "target_id", - "variable", - "value", - "geographic_id", - "stratum_group_id", - ] - ] - ) - - health_ins_idx = health_ins_targets.index[0] - health_ins_row = 
-    health_ins_row = X_sparse[health_ins_idx, :]
-
-    print(f"\nMatrix row {health_ins_idx} (health insurance):")
-    print(f"  Non-zero entries: {health_ins_row.nnz:,}")
-    print(f"  Row sum: ${health_ins_row.sum():,.0f}")
-    print(f"  Target value: ${health_ins_targets.iloc[0]['value']:,.0f}")
-
-    person_total = sim.calculate(
-        "health_insurance_premiums_without_medicare_part_b",
-        time_period,
-        map_to="person",
-    ).values.sum()
-
-    household_total = sim.calculate(
-        "health_insurance_premiums_without_medicare_part_b",
-        time_period,
-        map_to="household",
-    ).values.sum()
-
-    print(f"\nEntity aggregation verification:")
-    print(f"  Person-level total: ${person_total:,.0f}")
-    print(f"  Household-level total: ${household_total:,.0f}")
-    print(f"  Match: {np.isclose(person_total, household_total, rtol=1e-6)}")
-
-else:
-    print("\nWARNING: No health insurance targets found!")
-
-# ============================================================================
-# STEP 5: SNAP TARGET SUMMARY
-# ============================================================================
-print("\n" + "=" * 80)
-print("SNAP TARGET SUMMARY")
-print("=" * 80)
-
-snap_targets = targets_df[targets_df["variable"] == "snap"]
-household_count_targets = targets_df[
-    targets_df["variable"] == "household_count"
-]
-
-print(f"\nSNAP benefit targets: {len(snap_targets)}")
-print(f"Household count targets: {len(household_count_targets)}")
-
-if len(snap_targets) > 0:
-    print(f"\nSNAP total (all states): ${snap_targets['value'].sum():,.0f}")
-    print(f"\nSample SNAP targets:")
-    print(
-        snap_targets[["target_id", "variable", "value", "geographic_id"]].head(
-            10
-        )
-    )
-
-# ============================================================================
-# STEP 6: USE MATRIX TRACER FOR DETAILED DIAGNOSTICS
-# ============================================================================
-print("\n" + "=" * 80)
-print("MATRIX TRACER DIAGNOSTICS")
-print("=" * 80)
-
-tracer = MatrixTracer(
-    targets_df=targets_df,
-    matrix=X_sparse,
-    household_id_mapping=household_id_mapping,
-    geographic_ids=cds_to_calibrate,
-    sim=sim,
-)
-
-tracer.print_matrix_structure(show_groups=True)
-
-# ============================================================================
-# STEP 7: TARGET GROUP ANALYSIS
-# ============================================================================
-print("\n" + "=" * 80)
-print("TARGET GROUP ANALYSIS")
-print("=" * 80)
-
-target_groups, group_info = create_target_groups(targets_df)
-
-print(f"\nTotal target groups: {len(group_info)}")
-for group_id, info in enumerate(group_info):
-    group_mask = target_groups == group_id
-    n_targets_in_group = group_mask.sum()
-    print(f"  Group {group_id}: {info} ({n_targets_in_group} targets)")
-
-print("\n" + "=" * 80)
-print("RUNNER COMPLETED SUCCESSFULLY")
-print("=" * 80)

diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py
index c88fa645a..ee3d38475 100644
--- a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py
+++ b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py
@@ -106,7 +106,10 @@
     sim,
     target_filter={
         "stratum_group_ids": [4],
-        "variables": ["health_insurance_premiums_without_medicare_part_b"],
+        "variables": [
+            "health_insurance_premiums_without_medicare_part_b",
+            "snap",
+        ],
     },
 )
diff --git a/policyengine_us_data/tests/test_sparse_matrix_builder.py b/policyengine_us_data/tests/test_sparse_matrix_builder.py
deleted file mode 100644
index 2d08ea3f1..000000000
--- a/policyengine_us_data/tests/test_sparse_matrix_builder.py
+++ /dev/null
@@ -1,64 +0,0 @@
-import pytest
-import numpy as np
-from policyengine_us import Microsimulation
-from policyengine_us_data.storage import STORAGE_FOLDER
-from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import (
-    SparseMatrixBuilder,
-)
-
-
-@pytest.fixture
-def sim():
-    dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2023.h5"
-    return Microsimulation(dataset=str(dataset_path))
-
-
-@pytest.fixture
-def builder():
-    db_path = STORAGE_FOLDER / "calibration" / "policy_data.db"
-    db_uri = f"sqlite:///{db_path}"
-    cds_to_calibrate = ["101", "601"]  # AL-1, CA-1
-    return SparseMatrixBuilder(
-        db_uri=db_uri,
-        time_period=2023,
-        cds_to_calibrate=cds_to_calibrate,
-        dataset_path=None,
-    )
-
-
-def test_person_level_aggregation_preserves_totals(sim):
-    """Health insurance premiums (person-level) should sum correctly to household."""
-    var = "health_insurance_premiums_without_medicare_part_b"
-    person_total = sim.calculate(var, 2023, map_to="person").values.sum()
-    household_total = sim.calculate(var, 2023, map_to="household").values.sum()
-    assert np.isclose(person_total, household_total, rtol=1e-6)
-
-
-def test_matrix_shape(sim, builder):
-    """Matrix should have (n_targets, n_households * n_cds) shape."""
-    targets_df, X_sparse, _ = builder.build_matrix(
-        sim,
-        target_filter={
-            "variables": ["health_insurance_premiums_without_medicare_part_b"]
-        },
-    )
-    n_households = len(
-        sim.calculate("household_id", map_to="household").values
-    )
-    n_cds = 2
-    assert X_sparse.shape == (1, n_households * n_cds)
-
-
-def test_combined_snap_and_health_insurance(sim, builder):
-    """Matrix should include both SNAP and health insurance targets."""
-    targets_df, X_sparse, _ = builder.build_matrix(
-        sim,
-        target_filter={
-            "stratum_group_ids": [4],
-            "variables": ["health_insurance_premiums_without_medicare_part_b"],
-        },
-    )
-    variables = targets_df["variable"].unique()
-    assert "snap" in variables
-    assert "household_count" in variables
-    assert "health_insurance_premiums_without_medicare_part_b" in variables

From e4e449d9e511f9036f6901d1e93dca8902534486 Mon Sep 17 00:00:00 2001
From: "baogorek@gmail.com"
Date: Mon, 26 Jan 2026 10:25:29 -0500
Subject: [PATCH 19/19] Add build_calibration_matrix.py removal to changelog

---
 changelog_entry.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/changelog_entry.yaml b/changelog_entry.yaml
index 06397409d..193004c8b 100644
--- a/changelog_entry.yaml
+++ b/changelog_entry.yaml
@@ -8,5 +8,6 @@
       - Merged data-local-area Makefile target into data target
     removed:
       - Redundant test_sparse_matrix_builder.py (tests consolidated in test_matrix_national_variation.py)
+      - Redundant build_calibration_matrix.py (functionality in fit_calibration_weights.py)
     fixed:
       - Versioning workflow now runs uv lock after version bump to keep uv.lock in sync
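A closing note on the one-line filter change in patch 18: build_matrix's target_filter combines its stratum_group_ids and variables criteria with OR, so adding "snap" to variables pulls in the national SNAP targets that live outside stratum_group_id 4. SparseMatrixBuilder's internals are not part of this series, so the pandas sketch below — with a made-up targets table and invented group ids — only illustrates the selection rule, not the real query:

```python
import pandas as pd

# Toy stand-in for the target rows SparseMatrixBuilder reads from
# policy_data.db; group ids 7 and 9 are invented for illustration.
targets = pd.DataFrame(
    {
        "variable": [
            "snap",
            "household_count",
            "snap",
            "health_insurance_premiums_without_medicare_part_b",
        ],
        "stratum_group_id": [4, 4, 7, 9],
        "geographic_id": ["601", "601", "US", "US"],
    }
)

target_filter = {
    "stratum_group_ids": [4],
    "variables": [
        "health_insurance_premiums_without_medicare_part_b",
        "snap",
    ],
}

# OR logic: a row qualifies if it matches EITHER criterion, which is why
# listing "snap" under variables captures the national rows outside group 4.
mask = targets["stratum_group_id"].isin(
    target_filter["stratum_group_ids"]
) | targets["variable"].isin(target_filter["variables"])
print(targets[mask])  # all four rows qualify
```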