27 changes: 25 additions & 2 deletions .github/workflows/local_area_publish.yaml
@@ -10,11 +10,22 @@ on:
repository_dispatch:
types: [calibration-updated]
workflow_dispatch:
inputs:
num_workers:
description: 'Number of parallel workers'
required: false
default: '8'
type: string
skip_upload:
description: 'Skip upload (build only)'
required: false
default: false
type: boolean

# Trigger strategy:
# 1. Automatic: Code changes to local_area_calibration/ pushed to main
# 2. repository_dispatch: Calibration workflow triggers after uploading new weights
# 3. workflow_dispatch: Manual trigger when you update weights/data on HF yourself
# 3. workflow_dispatch: Manual trigger with optional parameters

jobs:
publish-local-area:
@@ -39,4 +50,16 @@ jobs:
run: pip install modal

- name: Run local area publishing on Modal
run: modal run modal_app/local_area.py --branch=${{ github.head_ref || github.ref_name }}
run: |
NUM_WORKERS="${{ github.event.inputs.num_workers || '8' }}"
SKIP_UPLOAD="${{ github.event.inputs.skip_upload || 'false' }}"
BRANCH="${{ github.head_ref || github.ref_name }}"

CMD="modal run modal_app/local_area.py --branch=${BRANCH} --num-workers=${NUM_WORKERS}"

if [ "$SKIP_UPLOAD" = "true" ]; then
CMD="${CMD} --skip-upload"
fi

echo "Running: $CMD"
$CMD
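
The workflow step above only composes a `modal run` command line; `modal_app/local_area.py` itself is not part of this diff. As a rough sketch of how those flags would be consumed, assuming the script exposes a Modal local entrypoint (the app, function, and parameter names here are illustrative, not taken from the repo):

```python
import modal

app = modal.App("local-area-publish")  # hypothetical app name

@app.function(timeout=6 * 60 * 60)
def publish(branch: str, num_workers: int, skip_upload: bool) -> None:
    # Placeholder for the real publishing logic (build the H5 files in
    # parallel, then optionally upload the results).
    print(f"branch={branch} num_workers={num_workers} skip_upload={skip_upload}")

@app.local_entrypoint()
def main(branch: str = "main", num_workers: int = 8, skip_upload: bool = False):
    # `modal run modal_app/local_area.py --branch=... --num-workers=... [--skip-upload]`
    # maps these parameters onto the CLI flags built by the workflow step above.
    publish.remote(branch=branch, num_workers=num_workers, skip_upload=skip_upload)
```

An entrypoint shaped like this would be consistent with the bare `--skip-upload` flag the workflow appends only when `SKIP_UPLOAD=true`.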
3 changes: 3 additions & 0 deletions .gitignore
@@ -27,6 +27,9 @@ node_modules
!policyengine_us_data/storage/national_and_district_rents_2023.csv
docs/.ipynb_checkpoints/

## Raw input cache for database pipeline
policyengine_us_data/storage/calibration/raw_inputs/

## Batch processing checkpoints
completed_*.txt

17 changes: 16 additions & 1 deletion Makefile
@@ -1,4 +1,4 @@
.PHONY: all format test install download upload docker documentation data publish-local-area clean build paper clean-paper presentations
.PHONY: all format test install download upload docker documentation data publish-local-area clean build paper clean-paper presentations database database-refresh promote-database

all: data test

@@ -54,14 +54,29 @@ documentation-dev:
myst start

database:
rm -f policyengine_us_data/storage/calibration/policy_data.db
python policyengine_us_data/db/create_database_tables.py
python policyengine_us_data/db/create_initial_strata.py
python policyengine_us_data/db/etl_national_targets.py
python policyengine_us_data/db/etl_age.py
python policyengine_us_data/db/etl_medicaid.py
python policyengine_us_data/db/etl_snap.py
python policyengine_us_data/db/etl_irs_soi.py
python policyengine_us_data/db/validate_database.py

database-refresh:
rm -f policyengine_us_data/storage/calibration/policy_data.db
rm -rf policyengine_us_data/storage/calibration/raw_inputs/
$(MAKE) database

promote-database:
cp policyengine_us_data/storage/calibration/policy_data.db \
$(HOME)/devl/huggingface/policyengine-us-data/calibration/policy_data.db
rm -rf $(HOME)/devl/huggingface/policyengine-us-data/calibration/raw_inputs
cp -r policyengine_us_data/storage/calibration/raw_inputs \
$(HOME)/devl/huggingface/policyengine-us-data/calibration/raw_inputs
@echo "Copied DB and raw_inputs to HF clone. Now cd to HF repo, commit, and push."

data: download
python policyengine_us_data/utils/uprating.py
python policyengine_us_data/datasets/acs/acs.py
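
The new `.gitignore` entry and the `database-refresh` target both point at a raw-input cache under `policyengine_us_data/storage/calibration/raw_inputs/` (the changelog below calls it a raw_cache utility for Census data caching). A minimal sketch of what such a cache helper could look like; the function and file names here are hypothetical, not the actual utility added in this PR:

```python
import json
from pathlib import Path

RAW_INPUTS = Path("policyengine_us_data/storage/calibration/raw_inputs")

def cached_fetch(name: str, fetch):
    """Return the cached payload for `name`, calling `fetch()` only on a miss.

    Hypothetical sketch: `make database-refresh` deletes this directory,
    forcing each ETL script to re-pull its raw Census inputs on the next build.
    """
    RAW_INPUTS.mkdir(parents=True, exist_ok=True)
    path = RAW_INPUTS / f"{name}.json"
    if path.exists():
        return json.loads(path.read_text())
    payload = fetch()  # e.g. a Census API call made by an ETL script
    path.write_text(json.dumps(payload))
    return payload
```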
19 changes: 19 additions & 0 deletions changelog_entry.yaml
@@ -0,0 +1,19 @@
- bump: minor
changes:
changed:
- Migrated data pipeline from CPS 2023 to CPS 2024 (March 2025 ASEC)
- Updated ExtendedCPS_2024 to use new CPS_2024_Full (full sample)
- Updated local area calibration to use 2024 extended CPS data
- Updated database ETL strata, IRS SOI, Medicaid, and SNAP scripts
removed:
- Removed CPS_2021_Full, CPS_2022_Full, CPS_2023_Full classes
- Removed PooledCPS and Pooled_3_Year_CPS_2023
- Removed ExtendedCPS_2023
- Removed dead train_previous_year_income_model function
- Removed unused dataset exports from __init__.py
added:
- Added CPS_2024_Full class for full-sample 2024 CPS generation
- Added raw_cache utility for Census data caching
- Added atomic parallel local area H5 publishing with Modal Volume staging
- Added manifest validation with SHA256 checksums for versioned uploads
- Added HuggingFace retry logic with exponential backoff to fix timeout errors
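
Two of the additions above, manifest validation with SHA256 checksums and HuggingFace retry logic with exponential backoff, live elsewhere in this PR and are not visible in this excerpt. A hedged sketch of the general pattern, where the repo id, file paths, and helper names are assumptions for illustration rather than the actual implementation:

```python
import hashlib
import time
from huggingface_hub import HfApi

def sha256_of(path: str) -> str:
    """Checksum a file in chunks, as a manifest entry might record it."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()

def upload_with_backoff(fn, max_attempts: int = 5, base_delay: float = 2.0):
    """Retry a flaky upload call, doubling the wait after each failure."""
    for attempt in range(1, max_attempts + 1):
        try:
            return fn()
        except Exception as err:  # narrow to timeout/HTTP errors in real code
            if attempt == max_attempts:
                raise
            wait = base_delay * 2 ** (attempt - 1)
            print(f"Upload attempt {attempt} failed ({err}); retrying in {wait:.0f}s")
            time.sleep(wait)

api = HfApi()
upload_with_backoff(
    lambda: api.upload_file(
        path_or_fileobj="local_area.h5",               # assumed local artifact
        path_in_repo="calibration/local_area.h5",      # assumed repo layout
        repo_id="policyengine/policyengine-us-data",   # assumed repo id
        repo_type="dataset",                           # assumed repo type
    )
)
print("sha256:", sha256_of("local_area.h5"))
```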
145 changes: 15 additions & 130 deletions docs/local_area_calibration_setup.ipynb
@@ -61,17 +61,11 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "cell-3",
"metadata": {},
"outputs": [],
"source": [
"db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\n",
"db_uri = f\"sqlite:///{db_path}\"\n",
"dataset_path = str(STORAGE_FOLDER / \"stratified_extended_cps_2023.h5\")\n",
"\n",
"engine = create_engine(db_uri)"
]
"source": "db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\ndb_uri = f\"sqlite:///{db_path}\"\ndataset_path = str(STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\")\n\nengine = create_engine(db_uri)"
},
{
"cell_type": "markdown",
@@ -148,42 +142,11 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "cell-7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"X_sparse shape: (539, 256633)\n",
" Rows (targets): 539\n",
" Columns (household × CD pairs): 256633\n",
" Non-zero entries: 67,756\n",
" Sparsity: 99.95%\n"
]
}
],
"source": [
"sim = Microsimulation(dataset=dataset_path)\n",
"\n",
"builder = SparseMatrixBuilder(\n",
" db_uri,\n",
" time_period=2023,\n",
" cds_to_calibrate=test_cds,\n",
" dataset_path=dataset_path,\n",
")\n",
"\n",
"targets_df, X_sparse, household_id_mapping = builder.build_matrix(\n",
" sim, target_filter={\"stratum_group_ids\": [4], \"variables\": [\"snap\"]}\n",
")\n",
"\n",
"print(f\"X_sparse shape: {X_sparse.shape}\")\n",
"print(f\" Rows (targets): {X_sparse.shape[0]}\")\n",
"print(f\" Columns (household × CD pairs): {X_sparse.shape[1]}\")\n",
"print(f\" Non-zero entries: {X_sparse.nnz:,}\")\n",
"print(f\" Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.2%}\")"
]
"outputs": [],
"source": "sim = Microsimulation(dataset=dataset_path)\n\nbuilder = SparseMatrixBuilder(\n db_uri,\n time_period=2024,\n cds_to_calibrate=test_cds,\n dataset_path=dataset_path,\n)\n\ntargets_df, X_sparse, household_id_mapping = builder.build_matrix(\n sim, target_filter={\"stratum_group_ids\": [4], \"variables\": [\"snap\"]}\n)\n\nprint(f\"X_sparse shape: {X_sparse.shape}\")\nprint(f\" Rows (targets): {X_sparse.shape[0]}\")\nprint(f\" Columns (household × CD pairs): {X_sparse.shape[1]}\")\nprint(f\" Non-zero entries: {X_sparse.nnz:,}\")\nprint(f\" Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.2%}\")"
},
{
"cell_type": "markdown",
@@ -428,43 +391,11 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"id": "e05aaeab-3786-4ff0-a50b-34577065d2e0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Remember, this is a North Carolina target:\n",
"\n",
"target_id 9372\n",
"stratum_id 9799\n",
"variable snap\n",
"value 4041086120.0\n",
"period 2023\n",
"stratum_group_id 4\n",
"geographic_id 37\n",
"Name: 80, dtype: object\n",
"\n",
"Household donated to NC's 2nd district, 2023 SNAP dollars:\n",
"789.19995\n",
"\n",
"Household donated to NC's 2nd district, 2023 SNAP dollars:\n",
"0.0\n"
]
}
],
"source": [
"print(\"Remember, this is a North Carolina target:\\n\")\n",
"print(targets_df.iloc[row_loc])\n",
"\n",
"print(\"\\nNC State target. Household donated to NC's 2nd district, 2023 SNAP dollars:\")\n",
"print(X_sparse[row_loc, positions['3702']]) # Household donated to NC's 2nd district\n",
"\n",
"print(\"\\nSame target, same household, donated to AK's at Large district, 2023 SNAP dollars:\")\n",
"print(X_sparse[row_loc, positions['201']]) # Household donated to AK's at Large District"
]
"outputs": [],
"source": "print(\"Remember, this is a North Carolina target:\\n\")\nprint(targets_df.iloc[row_loc])\n\nprint(\"\\nNC State target. Household donated to NC's 2nd district, 2024 SNAP dollars:\")\nprint(X_sparse[row_loc, positions['3702']]) # Household donated to NC's 2nd district\n\nprint(\"\\nSame target, same household, donated to AK's at Large district, 2024 SNAP dollars:\")\nprint(X_sparse[row_loc, positions['201']]) # Household donated to AK's at Large District"
},
{
"cell_type": "markdown",
@@ -507,24 +438,11 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"id": "ac59b6f1-859f-4246-8a05-8cb26384c882",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Household donated to AK's 1st district, 2023 SNAP dollars:\n",
"342.48004\n"
]
}
],
"source": [
"print(\"\\nHousehold donated to AK's 1st district, 2023 SNAP dollars:\")\n",
"print(X_sparse[new_row_loc, positions['201']]) # Household donated to AK's at Large District"
]
"outputs": [],
"source": "print(\"\\nHousehold donated to AK's 1st district, 2024 SNAP dollars:\")\nprint(X_sparse[new_row_loc, positions['201']]) # Household donated to AK's at Large District"
},
{
"cell_type": "markdown",
@@ -538,44 +456,11 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"id": "cell-19",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SNAP values for first 5 households under different state rules:\n",
" NC rules: [789.19995117 0. 0. 0. 0. ]\n",
" AK rules: [342.4800415 0. 0. 0. 0. ]\n",
" Difference: [-446.71990967 0. 0. 0. 0. ]\n"
]
}
],
"source": [
"def create_state_simulation(state_fips):\n",
" \"\"\"Create a simulation with all households assigned to a specific state.\"\"\"\n",
" s = Microsimulation(dataset=dataset_path)\n",
" s.set_input(\n",
" \"state_fips\", 2023, np.full(hh_snap_df.shape[0], state_fips, dtype=np.int32)\n",
" )\n",
" for var in get_calculated_variables(s):\n",
" s.delete_arrays(var)\n",
" return s\n",
"\n",
"# Compare SNAP for first 5 households under NC vs AK rules\n",
"nc_sim = create_state_simulation(37) # NC\n",
"ak_sim = create_state_simulation(2) # AK\n",
"\n",
"nc_snap = nc_sim.calculate(\"snap\", map_to=\"household\").values[:5]\n",
"ak_snap = ak_sim.calculate(\"snap\", map_to=\"household\").values[:5]\n",
"\n",
"print(\"SNAP values for first 5 households under different state rules:\")\n",
"print(f\" NC rules: {nc_snap}\")\n",
"print(f\" AK rules: {ak_snap}\")\n",
"print(f\" Difference: {ak_snap - nc_snap}\")"
]
"outputs": [],
"source": "def create_state_simulation(state_fips):\n \"\"\"Create a simulation with all households assigned to a specific state.\"\"\"\n s = Microsimulation(dataset=dataset_path)\n s.set_input(\n \"state_fips\", 2024, np.full(hh_snap_df.shape[0], state_fips, dtype=np.int32)\n )\n for var in get_calculated_variables(s):\n s.delete_arrays(var)\n return s\n\n# Compare SNAP for first 5 households under NC vs AK rules\nnc_sim = create_state_simulation(37) # NC\nak_sim = create_state_simulation(2) # AK\n\nnc_snap = nc_sim.calculate(\"snap\", map_to=\"household\").values[:5]\nak_snap = ak_sim.calculate(\"snap\", map_to=\"household\").values[:5]\n\nprint(\"SNAP values for first 5 households under different state rules:\")\nprint(f\" NC rules: {nc_snap}\")\nprint(f\" AK rules: {ak_snap}\")\nprint(f\" Difference: {ak_snap - nc_snap}\")"
},
{
"cell_type": "markdown",
@@ -1015,4 +900,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}