27 changes: 25 additions & 2 deletions .github/workflows/local_area_publish.yaml
@@ -10,11 +10,22 @@ on:
repository_dispatch:
types: [calibration-updated]
workflow_dispatch:
inputs:
num_workers:
description: 'Number of parallel workers'
required: false
default: '8'
type: string
skip_upload:
description: 'Skip upload (build only)'
required: false
default: false
type: boolean

# Trigger strategy:
# 1. Automatic: Code changes to local_area_calibration/ pushed to main
# 2. repository_dispatch: Calibration workflow triggers after uploading new weights
# 3. workflow_dispatch: Manual trigger when you update weights/data on HF yourself
# 3. workflow_dispatch: Manual trigger with optional parameters

jobs:
publish-local-area:
@@ -39,4 +50,16 @@ jobs:
run: pip install modal

- name: Run local area publishing on Modal
run: modal run modal_app/local_area.py --branch=${{ github.head_ref || github.ref_name }}
run: |
NUM_WORKERS="${{ github.event.inputs.num_workers || '8' }}"
SKIP_UPLOAD="${{ github.event.inputs.skip_upload || 'false' }}"
BRANCH="${{ github.head_ref || github.ref_name }}"

CMD="modal run modal_app/local_area.py --branch=${BRANCH} --num-workers=${NUM_WORKERS}"

if [ "$SKIP_UPLOAD" = "true" ]; then
CMD="${CMD} --skip-upload"
fi

echo "Running: $CMD"
$CMD
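
The workflow step above only composes a `modal run` command line; `modal_app/local_area.py` itself is not part of this diff. As a rough sketch of how those flags would be consumed, assuming the script exposes a Modal local entrypoint (the app, function, and parameter names here are illustrative, not taken from the repo):

```python
import modal

app = modal.App("local-area-publish")  # hypothetical app name

@app.function(timeout=6 * 60 * 60)
def publish(branch: str, num_workers: int, skip_upload: bool) -> None:
    # Placeholder for the real publishing logic (build the H5 files in
    # parallel, then optionally upload the results).
    print(f"branch={branch} num_workers={num_workers} skip_upload={skip_upload}")

@app.local_entrypoint()
def main(branch: str = "main", num_workers: int = 8, skip_upload: bool = False):
    # `modal run modal_app/local_area.py --branch=... --num-workers=... [--skip-upload]`
    # maps these parameters onto the CLI flags built by the workflow step above.
    publish.remote(branch=branch, num_workers=num_workers, skip_upload=skip_upload)
```

An entrypoint shaped like this would be consistent with the bare `--skip-upload` flag the workflow appends only when `SKIP_UPLOAD=true`.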
3 changes: 3 additions & 0 deletions .gitignore
@@ -27,6 +27,9 @@ node_modules
!policyengine_us_data/storage/national_and_district_rents_2023.csv
docs/.ipynb_checkpoints/

## Raw input cache for database pipeline
policyengine_us_data/storage/calibration/raw_inputs/

## Batch processing checkpoints
completed_*.txt

17 changes: 16 additions & 1 deletion Makefile
@@ -1,4 +1,4 @@
.PHONY: all format test install download upload docker documentation data publish-local-area clean build paper clean-paper presentations
.PHONY: all format test install download upload docker documentation data publish-local-area clean build paper clean-paper presentations database database-refresh promote-database

all: data test

@@ -54,14 +54,29 @@ documentation-dev:
myst start

database:
rm -f policyengine_us_data/storage/calibration/policy_data.db
python policyengine_us_data/db/create_database_tables.py
python policyengine_us_data/db/create_initial_strata.py
python policyengine_us_data/db/etl_national_targets.py
python policyengine_us_data/db/etl_age.py
python policyengine_us_data/db/etl_medicaid.py
python policyengine_us_data/db/etl_snap.py
python policyengine_us_data/db/etl_irs_soi.py
python policyengine_us_data/db/validate_database.py

database-refresh:
rm -f policyengine_us_data/storage/calibration/policy_data.db
rm -rf policyengine_us_data/storage/calibration/raw_inputs/
$(MAKE) database

promote-database:
cp policyengine_us_data/storage/calibration/policy_data.db \
$(HOME)/devl/huggingface/policyengine-us-data/calibration/policy_data.db
rm -rf $(HOME)/devl/huggingface/policyengine-us-data/calibration/raw_inputs
cp -r policyengine_us_data/storage/calibration/raw_inputs \
$(HOME)/devl/huggingface/policyengine-us-data/calibration/raw_inputs
@echo "Copied DB and raw_inputs to HF clone. Now cd to HF repo, commit, and push."

data: download
python policyengine_us_data/utils/uprating.py
python policyengine_us_data/datasets/acs/acs.py
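
The new `.gitignore` entry and the `database-refresh` target both point at a raw-input cache under `policyengine_us_data/storage/calibration/raw_inputs/` (the changelog below calls it a raw_cache utility for Census data caching). A minimal sketch of what such a cache helper could look like; the function and file names here are hypothetical, not the actual utility added in this PR:

```python
import json
from pathlib import Path

RAW_INPUTS = Path("policyengine_us_data/storage/calibration/raw_inputs")

def cached_fetch(name: str, fetch):
    """Return the cached payload for `name`, calling `fetch()` only on a miss.

    Hypothetical sketch: `make database-refresh` deletes this directory,
    forcing each ETL script to re-pull its raw Census inputs on the next build.
    """
    RAW_INPUTS.mkdir(parents=True, exist_ok=True)
    path = RAW_INPUTS / f"{name}.json"
    if path.exists():
        return json.loads(path.read_text())
    payload = fetch()  # e.g. a Census API call made by an ETL script
    path.write_text(json.dumps(payload))
    return payload
```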
19 changes: 19 additions & 0 deletions changelog_entry.yaml
@@ -0,0 +1,19 @@
- bump: minor
changes:
changed:
- Migrated data pipeline from CPS 2023 to CPS 2024 (March 2025 ASEC)
- Updated ExtendedCPS_2024 to use new CPS_2024_Full (full sample)
- Updated local area calibration to use 2024 extended CPS data
- Updated database ETL strata, IRS SOI, Medicaid, and SNAP scripts
removed:
- Removed CPS_2021_Full, CPS_2022_Full, CPS_2023_Full classes
- Removed PooledCPS and Pooled_3_Year_CPS_2023
- Removed ExtendedCPS_2023
- Removed dead train_previous_year_income_model function
- Removed unused dataset exports from __init__.py
added:
- Added CPS_2024_Full class for full-sample 2024 CPS generation
- Added raw_cache utility for Census data caching
- Added atomic parallel local area H5 publishing with Modal Volume staging
- Added manifest validation with SHA256 checksums for versioned uploads
- Added HuggingFace retry logic with exponential backoff to fix timeout errors
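
Two of the additions above, manifest validation with SHA256 checksums and HuggingFace retry logic with exponential backoff, live elsewhere in this PR and are not visible in this excerpt. A hedged sketch of the general pattern, where the repo id, file paths, and helper names are assumptions for illustration rather than the actual implementation:

```python
import hashlib
import time
from huggingface_hub import HfApi

def sha256_of(path: str) -> str:
    """Checksum a file in chunks, as a manifest entry might record it."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()

def upload_with_backoff(fn, max_attempts: int = 5, base_delay: float = 2.0):
    """Retry a flaky upload call, doubling the wait after each failure."""
    for attempt in range(1, max_attempts + 1):
        try:
            return fn()
        except Exception as err:  # narrow to timeout/HTTP errors in real code
            if attempt == max_attempts:
                raise
            wait = base_delay * 2 ** (attempt - 1)
            print(f"Upload attempt {attempt} failed ({err}); retrying in {wait:.0f}s")
            time.sleep(wait)

api = HfApi()
upload_with_backoff(
    lambda: api.upload_file(
        path_or_fileobj="local_area.h5",               # assumed local artifact
        path_in_repo="calibration/local_area.h5",      # assumed repo layout
        repo_id="policyengine/policyengine-us-data",   # assumed repo id
        repo_type="dataset",                           # assumed repo type
    )
)
print("sha256:", sha256_of("local_area.h5"))
```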
145 changes: 15 additions & 130 deletions docs/local_area_calibration_setup.ipynb
@@ -61,17 +61,11 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "cell-3",
"metadata": {},
"outputs": [],
"source": [
"db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\n",
"db_uri = f\"sqlite:///{db_path}\"\n",
"dataset_path = str(STORAGE_FOLDER / \"stratified_extended_cps_2023.h5\")\n",
"\n",
"engine = create_engine(db_uri)"
]
"source": "db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\ndb_uri = f\"sqlite:///{db_path}\"\ndataset_path = str(STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\")\n\nengine = create_engine(db_uri)"
},
{
"cell_type": "markdown",
@@ -148,42 +142,11 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "cell-7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"X_sparse shape: (539, 256633)\n",
" Rows (targets): 539\n",
" Columns (household × CD pairs): 256633\n",
" Non-zero entries: 67,756\n",
" Sparsity: 99.95%\n"
]
}
],
"source": [
"sim = Microsimulation(dataset=dataset_path)\n",
"\n",
"builder = SparseMatrixBuilder(\n",
" db_uri,\n",
" time_period=2023,\n",
" cds_to_calibrate=test_cds,\n",
" dataset_path=dataset_path,\n",
")\n",
"\n",
"targets_df, X_sparse, household_id_mapping = builder.build_matrix(\n",
" sim, target_filter={\"stratum_group_ids\": [4], \"variables\": [\"snap\"]}\n",
")\n",
"\n",
"print(f\"X_sparse shape: {X_sparse.shape}\")\n",
"print(f\" Rows (targets): {X_sparse.shape[0]}\")\n",
"print(f\" Columns (household × CD pairs): {X_sparse.shape[1]}\")\n",
"print(f\" Non-zero entries: {X_sparse.nnz:,}\")\n",
"print(f\" Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.2%}\")"
]
"outputs": [],
"source": "sim = Microsimulation(dataset=dataset_path)\n\nbuilder = SparseMatrixBuilder(\n db_uri,\n time_period=2024,\n cds_to_calibrate=test_cds,\n dataset_path=dataset_path,\n)\n\ntargets_df, X_sparse, household_id_mapping = builder.build_matrix(\n sim, target_filter={\"stratum_group_ids\": [4], \"variables\": [\"snap\"]}\n)\n\nprint(f\"X_sparse shape: {X_sparse.shape}\")\nprint(f\" Rows (targets): {X_sparse.shape[0]}\")\nprint(f\" Columns (household × CD pairs): {X_sparse.shape[1]}\")\nprint(f\" Non-zero entries: {X_sparse.nnz:,}\")\nprint(f\" Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.2%}\")"
},
{
"cell_type": "markdown",
@@ -428,43 +391,11 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"id": "e05aaeab-3786-4ff0-a50b-34577065d2e0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Remember, this is a North Carolina target:\n",
"\n",
"target_id 9372\n",
"stratum_id 9799\n",
"variable snap\n",
"value 4041086120.0\n",
"period 2023\n",
"stratum_group_id 4\n",
"geographic_id 37\n",
"Name: 80, dtype: object\n",
"\n",
"Household donated to NC's 2nd district, 2023 SNAP dollars:\n",
"789.19995\n",
"\n",
"Household donated to NC's 2nd district, 2023 SNAP dollars:\n",
"0.0\n"
]
}
],
"source": [
"print(\"Remember, this is a North Carolina target:\\n\")\n",
"print(targets_df.iloc[row_loc])\n",
"\n",
"print(\"\\nNC State target. Household donated to NC's 2nd district, 2023 SNAP dollars:\")\n",
"print(X_sparse[row_loc, positions['3702']]) # Household donated to NC's 2nd district\n",
"\n",
"print(\"\\nSame target, same household, donated to AK's at Large district, 2023 SNAP dollars:\")\n",
"print(X_sparse[row_loc, positions['201']]) # Household donated to AK's at Large District"
]
"outputs": [],
"source": "print(\"Remember, this is a North Carolina target:\\n\")\nprint(targets_df.iloc[row_loc])\n\nprint(\"\\nNC State target. Household donated to NC's 2nd district, 2024 SNAP dollars:\")\nprint(X_sparse[row_loc, positions['3702']]) # Household donated to NC's 2nd district\n\nprint(\"\\nSame target, same household, donated to AK's at Large district, 2024 SNAP dollars:\")\nprint(X_sparse[row_loc, positions['201']]) # Household donated to AK's at Large District"
},
{
"cell_type": "markdown",
@@ -507,24 +438,11 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"id": "ac59b6f1-859f-4246-8a05-8cb26384c882",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Household donated to AK's 1st district, 2023 SNAP dollars:\n",
"342.48004\n"
]
}
],
"source": [
"print(\"\\nHousehold donated to AK's 1st district, 2023 SNAP dollars:\")\n",
"print(X_sparse[new_row_loc, positions['201']]) # Household donated to AK's at Large District"
]
"outputs": [],
"source": "print(\"\\nHousehold donated to AK's 1st district, 2024 SNAP dollars:\")\nprint(X_sparse[new_row_loc, positions['201']]) # Household donated to AK's at Large District"
},
{
"cell_type": "markdown",
@@ -538,44 +456,11 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"id": "cell-19",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SNAP values for first 5 households under different state rules:\n",
" NC rules: [789.19995117 0. 0. 0. 0. ]\n",
" AK rules: [342.4800415 0. 0. 0. 0. ]\n",
" Difference: [-446.71990967 0. 0. 0. 0. ]\n"
]
}
],
"source": [
"def create_state_simulation(state_fips):\n",
" \"\"\"Create a simulation with all households assigned to a specific state.\"\"\"\n",
" s = Microsimulation(dataset=dataset_path)\n",
" s.set_input(\n",
" \"state_fips\", 2023, np.full(hh_snap_df.shape[0], state_fips, dtype=np.int32)\n",
" )\n",
" for var in get_calculated_variables(s):\n",
" s.delete_arrays(var)\n",
" return s\n",
"\n",
"# Compare SNAP for first 5 households under NC vs AK rules\n",
"nc_sim = create_state_simulation(37) # NC\n",
"ak_sim = create_state_simulation(2) # AK\n",
"\n",
"nc_snap = nc_sim.calculate(\"snap\", map_to=\"household\").values[:5]\n",
"ak_snap = ak_sim.calculate(\"snap\", map_to=\"household\").values[:5]\n",
"\n",
"print(\"SNAP values for first 5 households under different state rules:\")\n",
"print(f\" NC rules: {nc_snap}\")\n",
"print(f\" AK rules: {ak_snap}\")\n",
"print(f\" Difference: {ak_snap - nc_snap}\")"
]
"outputs": [],
"source": "def create_state_simulation(state_fips):\n \"\"\"Create a simulation with all households assigned to a specific state.\"\"\"\n s = Microsimulation(dataset=dataset_path)\n s.set_input(\n \"state_fips\", 2024, np.full(hh_snap_df.shape[0], state_fips, dtype=np.int32)\n )\n for var in get_calculated_variables(s):\n s.delete_arrays(var)\n return s\n\n# Compare SNAP for first 5 households under NC vs AK rules\nnc_sim = create_state_simulation(37) # NC\nak_sim = create_state_simulation(2) # AK\n\nnc_snap = nc_sim.calculate(\"snap\", map_to=\"household\").values[:5]\nak_snap = ak_sim.calculate(\"snap\", map_to=\"household\").values[:5]\n\nprint(\"SNAP values for first 5 households under different state rules:\")\nprint(f\" NC rules: {nc_snap}\")\nprint(f\" AK rules: {ak_snap}\")\nprint(f\" Difference: {ak_snap - nc_snap}\")"
},
{
"cell_type": "markdown",
@@ -1015,4 +900,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}