diff --git a/.github/workflows/reusable_test.yaml b/.github/workflows/reusable_test.yaml index 95d55281..4575a508 100644 --- a/.github/workflows/reusable_test.yaml +++ b/.github/workflows/reusable_test.yaml @@ -65,8 +65,7 @@ jobs: run: | modal run modal_app/data_build.py \ ${{ inputs.upload_data && '--upload' || '--no-upload' }} \ - --branch=${{ github.head_ref || github.ref_name }} \ - ${{ inputs.upload_data && '--no-test-lite' || '--test-lite' }} + --branch=${{ github.head_ref || github.ref_name }} - name: Install package run: uv sync --dev diff --git a/.github/workflows/versioning.yaml b/.github/workflows/versioning.yaml index 84667308..48658dbc 100644 --- a/.github/workflows/versioning.yaml +++ b/.github/workflows/versioning.yaml @@ -23,8 +23,12 @@ jobs: uses: actions/setup-python@v5 with: python-version: 3.12 + - name: Install uv + uses: astral-sh/setup-uv@v5 - name: Build changelog run: pip install yaml-changelog && make changelog + - name: Update lockfile + run: uv lock - name: Preview changelog update run: ".github/get-changelog-diff.sh" - name: Update changelog diff --git a/Makefile b/Makefile index 4d5f5bef..fd212a08 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: all format test install download upload docker documentation data data-local-area publish-local-area clean build paper clean-paper presentations +.PHONY: all format test install download upload docker documentation data publish-local-area clean build paper clean-paper presentations all: data test @@ -71,13 +71,6 @@ data: download python policyengine_us_data/datasets/cps/extended_cps.py python policyengine_us_data/datasets/cps/enhanced_cps.py python policyengine_us_data/datasets/cps/small_enhanced_cps.py - mv policyengine_us_data/storage/enhanced_cps_2024.h5 policyengine_us_data/storage/dense_enhanced_cps_2024.h5 - cp policyengine_us_data/storage/sparse_enhanced_cps_2024.h5 policyengine_us_data/storage/enhanced_cps_2024.h5 - -data-local-area: data - LOCAL_AREA_CALIBRATION=true python policyengine_us_data/datasets/cps/cps.py - LOCAL_AREA_CALIBRATION=true python policyengine_us_data/datasets/puf/puf.py - LOCAL_AREA_CALIBRATION=true python policyengine_us_data/datasets/cps/extended_cps.py python policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py 10500 publish-local-area: diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..193004c8 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,13 @@ +- bump: minor + changes: + added: + - Support for health_insurance_premiums_without_medicare_part_b in local area calibration + changed: + - Removed dense reweighting path from enhanced CPS; only sparse (L0) weights are produced + - Eliminated TEST_LITE and LOCAL_AREA_CALIBRATION flags; all datasets generated unconditionally + - Merged data-local-area Makefile target into data target + removed: + - Redundant test_sparse_matrix_builder.py (tests consolidated in test_matrix_national_variation.py) + - Redundant build_calibration_matrix.py (functionality in fit_calibration_weights.py) + fixed: + - Versioning workflow now runs uv lock after version bump to keep uv.lock in sync diff --git a/docs/local_area_calibration_setup.ipynb b/docs/local_area_calibration_setup.ipynb index cdd1cc97..9060a3df 100644 --- a/docs/local_area_calibration_setup.ipynb +++ b/docs/local_area_calibration_setup.ipynb @@ -459,10 +459,10 @@ "print(\"Remember, this is a North Carolina target:\\n\")\n", "print(targets_df.iloc[row_loc])\n", "\n", - "print(\"\\nHousehold donated to NC's 2nd 
district, 2023 SNAP dollars:\")\n",
+    "print(\"\\nNC state target. Household donated to NC's 2nd district, 2023 SNAP dollars:\")\n",
     "print(X_sparse[row_loc, positions['3702']]) # Household donated to NC's 2nd district\n",
     "\n",
-    "print(\"\\nHousehold donated to NC's 2nd district, 2023 SNAP dollars:\")\n",
+    "print(\"\\nSame target, same household, donated to AK's at Large district, 2023 SNAP dollars:\")\n",
     "print(X_sparse[row_loc, positions['201']]) # Household donated to AK's at Large District"
    ]
   },
diff --git a/modal_app/README.md b/modal_app/README.md
new file mode 100644
index 00000000..0b10cf72
--- /dev/null
+++ b/modal_app/README.md
@@ -0,0 +1,62 @@
+# Modal App for GPU Weight Fitting
+
+Run calibration weight fitting on Modal's cloud GPUs.
+
+## Prerequisites
+
+- [Modal](https://modal.com/) account and CLI installed (`pip install modal`)
+- `modal token new` to authenticate
+- HuggingFace token stored as a Modal secret named `huggingface-token`
+
+## Usage
+
+```bash
+modal run modal_app/remote_calibration_runner.py --branch <branch> --epochs <epochs> --gpu <gpu>
+```
+
+### Arguments
+
+| Argument | Default | Description |
+|----------|---------|-------------|
+| `--branch` | `main` | Git branch to clone and run |
+| `--epochs` | `200` | Number of training epochs |
+| `--gpu` | `T4` | GPU type: `T4`, `A10`, `A100-40GB`, `A100-80GB`, `H100` |
+| `--output` | `calibration_weights.npy` | Local path for weights file |
+| `--log-output` | `calibration_log.csv` | Local path for calibration log |
+
+### Example
+
+```bash
+modal run modal_app/remote_calibration_runner.py --branch health-insurance-premiums --epochs 100 --gpu T4
+```
+
+## Output Files
+
+- **calibration_weights.npy** - Fitted household weights
+- **calibration_log.csv** - Per-target performance metrics across epochs (target_name, estimate, target, epoch, error, rel_error, abs_error, rel_abs_error, loss)
+
+## Changing Hyperparameters
+
+Hyperparameters are in `policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py`:
+
+```python
+BETA = 0.35
+GAMMA = -0.1
+ZETA = 1.1
+INIT_KEEP_PROB = 0.999
+LOG_WEIGHT_JITTER_SD = 0.05
+LOG_ALPHA_JITTER_SD = 0.01
+LAMBDA_L0 = 1e-8
+LAMBDA_L2 = 1e-12
+LEARNING_RATE = 0.15
+```
+
+To change them:
+1. Edit `fit_calibration_weights.py`
+2. Commit and push to your branch
+3. Re-run the Modal command with that branch
+
+## Important Notes
+
+- **Keep your connection open** - Modal needs to stay connected to download results. Don't close your laptop or let it sleep until you see the local "Weights saved to:" and "Calibration log saved to:" messages.
+- Modal clones from GitHub, so local changes must be pushed before they take effect.
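Once the run finishes, the weights and the per-target log land at the paths given by `--output` and `--log-output`. As a minimal sketch (not part of the patch), here is one way to inspect them locally, assuming numpy and pandas are installed and the default filenames from the arguments table were used:

```python
# Sketch: sanity-check the artifacts pulled down by the Modal run
# (assumes the default --output / --log-output paths were used).
import numpy as np
import pandas as pd

weights = np.load("calibration_weights.npy")
print(f"{(weights > 0).sum():,} non-zero weights out of {weights.size:,}")

log = pd.read_csv("calibration_log.csv")
# Mean relative absolute error per logged epoch, to eyeball convergence.
print(log.groupby("epoch")["rel_abs_error"].mean())
```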
diff --git a/modal_app/data_build.py b/modal_app/data_build.py index 52803568..f56b96a7 100644 --- a/modal_app/data_build.py +++ b/modal_app/data_build.py @@ -38,7 +38,6 @@ def setup_gcp_credentials(): def build_datasets( upload: bool = False, branch: str = "main", - test_lite: bool = False, ): setup_gcp_credentials() @@ -49,8 +48,6 @@ def build_datasets( subprocess.run(["uv", "sync", "--locked"], check=True) env = os.environ.copy() - if test_lite: - env["TEST_LITE"] = "true" # Download prerequisites subprocess.run( @@ -79,44 +76,8 @@ def build_datasets( print(f"Running {script}...") subprocess.run(["uv", "run", "python", script], check=True, env=env) - os.rename( - "policyengine_us_data/storage/enhanced_cps_2024.h5", - "policyengine_us_data/storage/dense_enhanced_cps_2024.h5", - ) - subprocess.run( - [ - "cp", - "policyengine_us_data/storage/sparse_enhanced_cps_2024.h5", - "policyengine_us_data/storage/enhanced_cps_2024.h5", - ], - check=True, - ) - - # Build local area calibration datasets (without TEST_LITE - must match full dataset) - print("Building local area calibration datasets...") - local_area_env = os.environ.copy() - local_area_env["LOCAL_AREA_CALIBRATION"] = "true" - - subprocess.run( - ["uv", "run", "python", "policyengine_us_data/datasets/cps/cps.py"], - check=True, - env=local_area_env, - ) - subprocess.run( - ["uv", "run", "python", "policyengine_us_data/datasets/puf/puf.py"], - check=True, - env=local_area_env, - ) - subprocess.run( - [ - "uv", - "run", - "python", - "policyengine_us_data/datasets/cps/extended_cps.py", - ], - check=True, - env=local_area_env, - ) + # Build stratified CPS for local area calibration + print("Running create_stratified_cps.py...") subprocess.run( [ "uv", @@ -126,7 +87,7 @@ def build_datasets( "10500", ], check=True, - env=local_area_env, + env=env, ) # Run local area calibration tests @@ -140,7 +101,7 @@ def build_datasets( "-v", ], check=True, - env=local_area_env, + env=env, ) # Run main test suite @@ -167,11 +128,9 @@ def build_datasets( def main( upload: bool = False, branch: str = "main", - test_lite: bool = False, ): result = build_datasets.remote( upload=upload, branch=branch, - test_lite=test_lite, ) print(result) diff --git a/modal_app/remote_calibration_runner.py b/modal_app/remote_calibration_runner.py new file mode 100644 index 00000000..43e35445 --- /dev/null +++ b/modal_app/remote_calibration_runner.py @@ -0,0 +1,197 @@ +import os +import subprocess +import modal + +app = modal.App("policyengine-us-data-fit-weights") + +hf_secret = modal.Secret.from_name("huggingface-token") + +image = ( + modal.Image.debian_slim(python_version="3.11") + .apt_install("git") + .pip_install("uv") +) + +REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git" + + +def _fit_weights_impl(branch: str, epochs: int) -> dict: + """Shared implementation for weight fitting.""" + os.chdir("/root") + subprocess.run(["git", "clone", "-b", branch, REPO_URL], check=True) + os.chdir("policyengine-us-data") + + subprocess.run(["uv", "sync", "--extra", "l0"], check=True) + + print("Downloading calibration inputs from HuggingFace...") + download_result = subprocess.run( + [ + "uv", + "run", + "python", + "-c", + "from policyengine_us_data.utils.huggingface import " + "download_calibration_inputs; " + "paths = download_calibration_inputs('/root/calibration_data'); " + "print(f\"DB: {paths['database']}\"); " + "print(f\"DATASET: {paths['dataset']}\")", + ], + capture_output=True, + text=True, + env=os.environ.copy(), + ) + print(download_result.stdout) + 
if download_result.stderr: + print("Download STDERR:", download_result.stderr) + if download_result.returncode != 0: + raise RuntimeError(f"Download failed: {download_result.returncode}") + + db_path = dataset_path = None + for line in download_result.stdout.split("\n"): + if line.startswith("DB:"): + db_path = line.split("DB:")[1].strip() + elif line.startswith("DATASET:"): + dataset_path = line.split("DATASET:")[1].strip() + + script_path = ( + "policyengine_us_data/datasets/cps/" + "local_area_calibration/fit_calibration_weights.py" + ) + result = subprocess.run( + [ + "uv", + "run", + "python", + script_path, + "--device", + "cuda", + "--epochs", + str(epochs), + "--db-path", + db_path, + "--dataset-path", + dataset_path, + ], + capture_output=True, + text=True, + env=os.environ.copy(), + ) + print(result.stdout) + if result.stderr: + print("STDERR:", result.stderr) + if result.returncode != 0: + raise RuntimeError(f"Script failed with code {result.returncode}") + + output_path = None + log_path = None + for line in result.stdout.split("\n"): + if "OUTPUT_PATH:" in line: + output_path = line.split("OUTPUT_PATH:")[1].strip() + elif "LOG_PATH:" in line: + log_path = line.split("LOG_PATH:")[1].strip() + + with open(output_path, "rb") as f: + weights_bytes = f.read() + + log_bytes = None + if log_path: + with open(log_path, "rb") as f: + log_bytes = f.read() + + return {"weights": weights_bytes, "log": log_bytes} + + +@app.function( + image=image, + secrets=[hf_secret], + memory=32768, + cpu=4.0, + gpu="T4", + timeout=14400, +) +def fit_weights_t4(branch: str = "main", epochs: int = 200) -> dict: + return _fit_weights_impl(branch, epochs) + + +@app.function( + image=image, + secrets=[hf_secret], + memory=32768, + cpu=4.0, + gpu="A10", + timeout=14400, +) +def fit_weights_a10(branch: str = "main", epochs: int = 200) -> dict: + return _fit_weights_impl(branch, epochs) + + +@app.function( + image=image, + secrets=[hf_secret], + memory=32768, + cpu=4.0, + gpu="A100-40GB", + timeout=14400, +) +def fit_weights_a100_40(branch: str = "main", epochs: int = 200) -> dict: + return _fit_weights_impl(branch, epochs) + + +@app.function( + image=image, + secrets=[hf_secret], + memory=32768, + cpu=4.0, + gpu="A100-80GB", + timeout=14400, +) +def fit_weights_a100_80(branch: str = "main", epochs: int = 200) -> dict: + return _fit_weights_impl(branch, epochs) + + +@app.function( + image=image, + secrets=[hf_secret], + memory=32768, + cpu=4.0, + gpu="H100", + timeout=14400, +) +def fit_weights_h100(branch: str = "main", epochs: int = 200) -> dict: + return _fit_weights_impl(branch, epochs) + + +GPU_FUNCTIONS = { + "T4": fit_weights_t4, + "A10": fit_weights_a10, + "A100-40GB": fit_weights_a100_40, + "A100-80GB": fit_weights_a100_80, + "H100": fit_weights_h100, +} + + +@app.local_entrypoint() +def main( + branch: str = "main", + epochs: int = 200, + gpu: str = "T4", + output: str = "calibration_weights.npy", + log_output: str = "calibration_log.csv", +): + if gpu not in GPU_FUNCTIONS: + raise ValueError( + f"Unknown GPU: {gpu}. 
Choose from: {list(GPU_FUNCTIONS.keys())}" + ) + + print(f"Running with GPU: {gpu}, epochs: {epochs}, branch: {branch}") + func = GPU_FUNCTIONS[gpu] + result = func.remote(branch=branch, epochs=epochs) + + with open(output, "wb") as f: + f.write(result["weights"]) + print(f"Weights saved to: {output}") + + if result["log"]: + with open(log_output, "wb") as f: + f.write(result["log"]) + print(f"Calibration log saved to: {log_output}") diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 27a41bec..249e40e5 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -15,9 +15,6 @@ from microimpute.models.qrf import QRF import logging -test_lite = os.environ.get("TEST_LITE") == "true" -print(f"TEST_LITE == {test_lite}") - class CPS(Dataset): name = "cps" @@ -2141,21 +2138,13 @@ class Pooled_3_Year_CPS_2023(PooledCPS): url = "hf://policyengine/policyengine-us-data/pooled_3_year_cps_2023.h5" -local_area_calibration = os.environ.get("LOCAL_AREA_CALIBRATION") == "true" - if __name__ == "__main__": - if test_lite: - CPS_2024().generate() - CPS_2025().generate() - elif local_area_calibration: - CPS_2023_Full().generate() - else: - CPS_2021().generate() - CPS_2022().generate() - CPS_2023().generate() - CPS_2024().generate() - CPS_2025().generate() - CPS_2021_Full().generate() - CPS_2022_Full().generate() - CPS_2023_Full().generate() - Pooled_3_Year_CPS_2023().generate() + CPS_2021().generate() + CPS_2022().generate() + CPS_2023().generate() + CPS_2024().generate() + CPS_2025().generate() + CPS_2021_Full().generate() + CPS_2022_Full().generate() + CPS_2023_Full().generate() + Pooled_3_Year_CPS_2023().generate() diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 4eb0a660..dc8f5040 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -18,8 +18,6 @@ CPS_2019, CPS_2024, ) -import os -from pathlib import Path import logging try: @@ -32,7 +30,6 @@ def reweight( original_weights, loss_matrix, targets_array, - dropout_rate=0.05, log_path="calibration_log.csv", epochs=500, l0_lambda=2.6445e-07, @@ -40,7 +37,6 @@ def reweight( temperature=0.25, seed=1456, ): - set_seeds(seed) target_names = np.array(loss_matrix.columns) is_national = loss_matrix.columns.str.startswith("nation/") loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32) @@ -53,14 +49,10 @@ def reweight( normalisation_factor, dtype=torch.float32 ) targets_array = torch.tensor(targets_array, dtype=torch.float32) - weights = torch.tensor( - np.log(original_weights), requires_grad=True, dtype=torch.float32 - ) inv_mean_normalisation = 1 / np.mean(normalisation_factor.numpy()) def loss(weights): - # Check for Nans in either the weights or the loss matrix if torch.isnan(weights).any(): raise ValueError("Weights contain NaNs") if torch.isnan(loss_matrix).any(): @@ -78,75 +70,11 @@ def loss(weights): raise ValueError("Relative error contains NaNs") return rel_error_normalized.mean() - def dropout_weights(weights, p): - if p == 0: - return weights - # Replace p% of the weights with the mean value of the rest of them - mask = torch.rand_like(weights) < p - mean = weights[~mask].mean() - masked_weights = weights.clone() - masked_weights[mask] = mean - return masked_weights - - # Original (Dense) path --- - optimizer = torch.optim.Adam([weights], lr=3e-1) - start_loss = None - - iterator = trange(epochs) - performance 
= pd.DataFrame() - for i in iterator: - optimizer.zero_grad() - weights_ = dropout_weights(weights, dropout_rate) - l = loss(torch.exp(weights_)) - if (log_path is not None) and (i % 10 == 0): - estimates = torch.exp(weights) @ loss_matrix - estimates = estimates.detach().numpy() - df = pd.DataFrame( - { - "target_name": target_names, - "estimate": estimates, - "target": targets_array.detach().numpy(), - } - ) - df["epoch"] = i - df["error"] = df.estimate - df.target - df["rel_error"] = df.error / df.target - df["abs_error"] = df.error.abs() - df["rel_abs_error"] = df.rel_error.abs() - df["loss"] = df.rel_abs_error**2 - performance = pd.concat([performance, df], ignore_index=True) - - if (log_path is not None) and (i % 1000 == 0): - performance.to_csv(log_path, index=False) - if start_loss is None: - start_loss = l.item() - loss_rel_change = (l.item() - start_loss) / start_loss - l.backward() - iterator.set_postfix( - {"loss": l.item(), "loss_rel_change": loss_rel_change} - ) - optimizer.step() - if log_path is not None: - performance.to_csv(log_path, index=False) - - final_weights_dense = torch.exp(weights).detach().numpy() - - optimised_weights = final_weights_dense - print_reweighting_diagnostics( - final_weights_dense, - loss_matrix, - targets_array, - "Dense Solutions", - ) - - # New (Sparse) path depending on temperature, init_mean, l0_lambda ----- logging.info( f"Sparse optimization using seed {seed}, temp {temperature} " + f"init_mean {init_mean}, l0_lambda {l0_lambda}" ) set_seeds(seed) - p = Path(log_path) - log_path_sparse = p.with_name(f"{p.stem}_sparse{p.suffix}") weights = torch.tensor( np.log(original_weights), requires_grad=True, dtype=torch.float32 @@ -166,7 +94,7 @@ def dropout_weights(weights, p): masked = torch.exp(weights) * gates() l_main = loss(masked) l = l_main + l0_lambda * gates.get_penalty() - if (log_path_sparse is not None) and (i % 10 == 0): + if (log_path is not None) and (i % 10 == 0): gates.eval() estimates = (torch.exp(weights) * gates()) @ loss_matrix gates.train() @@ -186,8 +114,8 @@ def dropout_weights(weights, p): df["loss"] = df.rel_abs_error**2 performance = pd.concat([performance, df], ignore_index=True) - if (log_path_sparse is not None) and (i % 1000 == 0): - performance.to_csv(log_path_sparse, index=False) + if (log_path is not None) and (i % 1000 == 0): + performance.to_csv(log_path, index=False) if start_loss is None: start_loss = l.item() loss_rel_change = (l.item() - start_loss) / start_loss @@ -196,8 +124,8 @@ def dropout_weights(weights, p): {"loss": l.item(), "loss_rel_change": loss_rel_change} ) optimizer.step() - if log_path_sparse is not None: - performance.to_csv(log_path_sparse, index=False) + if log_path is not None: + performance.to_csv(log_path, index=False) gates.eval() final_weights_sparse = (torch.exp(weights) * gates()).detach().numpy() @@ -209,7 +137,7 @@ def dropout_weights(weights, p): "L0 Sparse Solution", ) - return final_weights_dense, final_weights_sparse + return final_weights_sparse def train_previous_year_income_model(): @@ -268,7 +196,6 @@ def generate(self): sim = Microsimulation(dataset=self.input_dataset) data = sim.dataset.load_dataset() data["household_weight"] = {} - data["household_sparse_weight"] = {} original_weights = sim.calculate("household_weight") original_weights = original_weights.values + np.random.normal( 1, 0.1, len(original_weights) @@ -309,7 +236,7 @@ def generate(self): targets_array_clean = targets_array[keep_idx] assert loss_matrix_clean.shape[1] == targets_array_clean.size - 
optimised_weights_dense, optimised_weights_sparse = reweight( + optimised_weights = reweight( original_weights, loss_matrix_clean, targets_array_clean, @@ -317,8 +244,7 @@ def generate(self): epochs=200, seed=1456, ) - data["household_weight"][year] = optimised_weights_dense - data["household_sparse_weight"][year] = optimised_weights_sparse + data["household_weight"][year] = optimised_weights self.save_dataset(data) diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index a9bf07a4..b5b4fa24 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -4,7 +4,6 @@ from policyengine_us_data.datasets.cps.cps import * from policyengine_us_data.datasets.puf import * import pandas as pd -import os from microimpute.models.qrf import QRF import time import logging @@ -340,11 +339,5 @@ class ExtendedCPS_2024(ExtendedCPS): if __name__ == "__main__": - local_area_calibration = ( - os.environ.get("LOCAL_AREA_CALIBRATION", "").lower() == "true" - ) - - if local_area_calibration: - ExtendedCPS_2023().generate() - else: - ExtendedCPS_2024().generate() + ExtendedCPS_2023().generate() + ExtendedCPS_2024().generate() diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py b/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py index d9507d17..da3dffc0 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py @@ -1,15 +1,14 @@ """ -Create a stratified sample of extended_cps_2023.h5 that preserves high-income households. -This is needed for congressional district geo-stacking where the full dataset is too large. +Create a stratified sample of extended_cps_2023.h5 that preserves high-income households +while maintaining diversity in lower income strata for poverty analysis. Strategy: -- Keep ALL households above a high income threshold (e.g., top 1%) -- Sample progressively less from lower income strata -- Ensure representation across all income levels +- Keep ALL households in top 1% (for high-income tax analysis) +- Uniform sample from the remaining 99% (preserves low-income diversity) +- Optional: slight oversample of bottom quartile for poverty-focused analysis """ import numpy as np -import pandas as pd import h5py from policyengine_us import Microsimulation from policyengine_core.data.dataset import Dataset @@ -21,16 +20,22 @@ def create_stratified_cps_dataset( target_households=30_000, - high_income_percentile=99, # Keep ALL households above this percentile + high_income_percentile=99, + oversample_poor=False, + seed=None, base_dataset=None, output_path=None, ): """ - Create a stratified sample of CPS data preserving high-income households. + Create a stratified sample of CPS data preserving high-income households + while maintaining low-income diversity for poverty analysis. 
Args: target_households: Target number of households in output (approximate) - high_income_percentile: Keep ALL households above this AGI percentile + high_income_percentile: Keep ALL households above this AGI percentile (e.g., 99 or 99.5) + oversample_poor: If True, boost sampling rate for bottom 25% by 1.5x + seed: Random seed for reproducibility (default: None for random) + base_dataset: Path to source h5 file (default: extended_cps_2023.h5) output_path: Where to save the stratified h5 file """ print("\n" + "=" * 70) @@ -57,100 +62,120 @@ def create_stratified_cps_dataset( print(f"Target dataset: {target_households:,} households") print(f"Reduction ratio: {target_households/n_households_orig:.1%}") - # Calculate AGI percentiles - print("\nAnalyzing income distribution...") - percentiles = [0, 25, 50, 75, 90, 95, 99, 99.5, 99.9, 100] - agi_percentiles = np.percentile(agi, percentiles) - - print("AGI Percentiles:") - for p, val in zip(percentiles, agi_percentiles): - print(f" {p:5.1f}%: ${val:,.0f}") + # Show income distribution + print("\nAGI Percentiles (original):") + for p in [0, 25, 50, 75, 90, 95, 99, 99.5, 99.9, 100]: + val = np.percentile(agi, p) + print(f" {p:5.1f}%: ${val:>12,.0f}") - # Define sampling strategy - # Keep ALL high earners, sample progressively less from lower strata + # Define strata thresholds high_income_threshold = np.percentile(agi, high_income_percentile) - print( - f"\nHigh-income threshold (top {100-high_income_percentile}%): ${high_income_threshold:,.0f}" - ) + bottom_25_pct_threshold = np.percentile(agi, 25) - # Create strata with sampling rates - strata = [ - (99.9, 100, 1.00), # Top 0.1% - keep ALL - (99.5, 99.9, 1.00), # 99.5-99.9% - keep ALL - (99, 99.5, 1.00), # 99-99.5% - keep ALL - (95, 99, 0.80), # 95-99% - keep 80% - (90, 95, 0.60), # 90-95% - keep 60% - (75, 90, 0.40), # 75-90% - keep 40% - (50, 75, 0.25), # 50-75% - keep 25% - (25, 50, 0.15), # 25-50% - keep 15% - (0, 25, 0.10), # Bottom 25% - keep 10% - ] - - # Adjust sampling rates to hit target - print("\nInitial sampling strategy:") - expected_count = 0 - for low_p, high_p, rate in strata: - low_val = np.percentile(agi, low_p) if low_p > 0 else -np.inf - high_val = np.percentile(agi, high_p) if high_p < 100 else np.inf - in_stratum = np.sum((agi > low_val) & (agi <= high_val)) - expected = int(in_stratum * rate) - expected_count += expected - print( - f" {low_p:5.1f}-{high_p:5.1f}%: {in_stratum:6,} households x {rate:.0%} = {expected:6,}" - ) + # Count households in each stratum + n_top = np.sum(agi >= high_income_threshold) + n_bottom_25 = np.sum(agi < bottom_25_pct_threshold) + n_middle = n_households_orig - n_top - n_bottom_25 - print(f"Expected total: {expected_count:,} households") + print(f"\nStratum sizes:") + print( + f" Top {100 - high_income_percentile}% (AGI >= ${high_income_threshold:,.0f}): {n_top:,}" + ) + print(f" Middle 25-{high_income_percentile}%: {n_middle:,}") + print( + f" Bottom 25% (AGI < ${bottom_25_pct_threshold:,.0f}): {n_bottom_25:,}" + ) - # Adjust rates if needed - if expected_count > target_households * 1.1: # Allow 10% overage - adjustment = target_households / expected_count - print( - f"\nAdjusting rates by factor of {adjustment:.2f} to meet target..." + # Calculate sampling rates + # Keep ALL top earners, distribute remaining quota between middle and bottom + remaining_quota = target_households - n_top + if remaining_quota <= 0: + raise ValueError( + f"Target ({target_households:,}) is less than top {100-high_income_percentile}% " + f"count ({n_top:,}). 
Increase target_households." ) - # Never reduce the top percentiles - strata_adjusted = [] - for low_p, high_p, rate in strata: - if high_p >= 99: # Never reduce top 1% - strata_adjusted.append((low_p, high_p, rate)) - else: - strata_adjusted.append( - (low_p, high_p, min(1.0, rate * adjustment)) - ) - strata = strata_adjusted - - # Select households based on strata + if oversample_poor: + # Give bottom 25% a 1.5x boost relative to middle + r_middle = remaining_quota / (1.5 * n_bottom_25 + n_middle) + r_bottom = 1.5 * r_middle + r_middle = min(1.0, r_middle) + r_bottom = min(1.0, r_bottom) + else: + # Uniform sampling for the rest + r_middle = remaining_quota / (n_bottom_25 + n_middle) + r_bottom = r_middle + r_middle = min(1.0, r_middle) + r_bottom = min(1.0, r_bottom) + + print(f"\nSampling rates:") + print(f" Top {100 - high_income_percentile}%: 100%") + print(f" Middle 25-{high_income_percentile}%: {r_middle:.1%}") + print(f" Bottom 25%: {r_bottom:.1%}") + + # Expected counts + expected_top = n_top + expected_middle = int(n_middle * r_middle) + expected_bottom = int(n_bottom_25 * r_bottom) + expected_total = expected_top + expected_middle + expected_bottom + + print(f"\nExpected selection:") + print(f" Top {100 - high_income_percentile}%: {expected_top:,}") + print(f" Middle 25-{high_income_percentile}%: {expected_middle:,}") + print(f" Bottom 25%: {expected_bottom:,}") + print(f" Total: {expected_total:,}") + + # Select households print("\nSelecting households...") + if seed is not None: + np.random.seed(seed) + print(f" Using random seed: {seed}") selected_mask = np.zeros(n_households_orig, dtype=bool) - for low_p, high_p, rate in strata: - low_val = np.percentile(agi, low_p) if low_p > 0 else -np.inf - high_val = np.percentile(agi, high_p) if high_p < 100 else np.inf - - in_stratum = (agi > low_val) & (agi <= high_val) - stratum_indices = np.where(in_stratum)[0] - n_in_stratum = len(stratum_indices) - - if rate >= 1.0: - # Keep all - selected_mask[stratum_indices] = True - n_selected = n_in_stratum - else: - # Random sample within stratum - n_to_select = int(n_in_stratum * rate) - if n_to_select > 0: - np.random.seed(42) # For reproducibility - selected_indices = np.random.choice( - stratum_indices, n_to_select, replace=False - ) - selected_mask[selected_indices] = True - n_selected = n_to_select - else: - n_selected = 0 + # Top earners - keep all + top_mask = agi >= high_income_threshold + selected_mask[top_mask] = True + print( + f" Top {100 - high_income_percentile}%: selected {np.sum(top_mask):,}" + ) + # Bottom 25% + bottom_mask = agi < bottom_25_pct_threshold + bottom_indices = np.where(bottom_mask)[0] + n_select_bottom = int(len(bottom_indices) * r_bottom) + if r_bottom >= 1.0: + selected_mask[bottom_indices] = True + elif n_select_bottom > 0: + selected_bottom = np.random.choice( + bottom_indices, n_select_bottom, replace=False + ) + selected_mask[selected_bottom] = True + else: print( - f" {low_p:5.1f}-{high_p:5.1f}%: Selected {n_selected:6,} / {n_in_stratum:6,} ({n_selected/max(1,n_in_stratum):.0%})" + f" WARNING: Bottom 25% selection rounded to 0 (rate={r_bottom:.4f}, n={len(bottom_indices)})" + ) + print( + f" Bottom 25%: selected {np.sum(selected_mask & bottom_mask):,} / {len(bottom_indices):,}" + ) + + # Middle + middle_mask = ~top_mask & ~bottom_mask + middle_indices = np.where(middle_mask)[0] + n_select_middle = int(len(middle_indices) * r_middle) + if r_middle >= 1.0: + selected_mask[middle_indices] = True + elif n_select_middle > 0: + selected_middle = 
np.random.choice( + middle_indices, n_select_middle, replace=False ) + selected_mask[selected_middle] = True + else: + print( + f" WARNING: Middle selection rounded to 0 (rate={r_middle:.4f}, n={len(middle_indices)})" + ) + print( + f" Middle 25-{high_income_percentile}%: selected {np.sum(selected_mask & middle_mask):,} / {len(middle_indices):,}" + ) n_selected = np.sum(selected_mask) print( @@ -158,13 +183,8 @@ def create_stratified_cps_dataset( ) # Verify high earners are preserved - high_earners_mask = agi >= high_income_threshold - n_high_earners = np.sum(high_earners_mask) - n_high_earners_selected = np.sum(selected_mask & high_earners_mask) - print(f"\nHigh earners (>=${high_income_threshold:,.0f}):") - print(f" Original: {n_high_earners:,}") print( - f" Selected: {n_high_earners_selected:,} ({n_high_earners_selected/n_high_earners:.0%})" + f"\nHigh earners (>=${high_income_threshold:,.0f}): {np.sum(selected_mask & top_mask):,} / {n_top:,} (100%)" ) # Get the selected household IDs @@ -300,28 +320,42 @@ def create_stratified_cps_dataset( if __name__ == "__main__": import sys - # Parse command line arguments - if len(sys.argv) > 1: - try: - target = int(sys.argv[1]) - print( - f"Creating stratified dataset with target of {target:,} households..." - ) - output_file = create_stratified_cps_dataset( - target_households=target - ) - except ValueError: - print(f"Invalid target households: {sys.argv[1]}") - print("Usage: python create_stratified_cps.py [target_households]") - sys.exit(1) - else: - # Default target - print( - "Creating stratified dataset with default target of 30,000 households..." - ) - output_file = create_stratified_cps_dataset(target_households=30_000) + target = 30_000 + high_pct = 99 + oversample = False + seed = None + + for arg in sys.argv[1:]: + if arg == "--oversample-poor": + oversample = True + elif arg.startswith("--top="): + high_pct = float(arg.split("=")[1]) + elif arg.startswith("--seed="): + seed = int(arg.split("=")[1]) + elif arg.isdigit(): + target = int(arg) + + print(f"Creating stratified dataset:") + print(f" Target households: {target:,}") + print(f" Keep all above: {high_pct}th percentile") + print(f" Oversample poor: {oversample}") + print(f" Seed: {seed if seed is not None else 'random'}") + + output_file = create_stratified_cps_dataset( + target_households=target, + high_income_percentile=high_pct, + oversample_poor=oversample, + seed=seed, + ) print(f"\nDone! Created: {output_file}") - print("\nTo test loading:") - print(" from policyengine_us import Microsimulation") - print(f" sim = Microsimulation(dataset='{output_file}')") + print("\nUsage:") + print( + " python create_stratified_cps.py [target] [--top=99] [--oversample-poor] [--seed=N]" + ) + print("\nExamples:") + print(" python create_stratified_cps.py 30000") + print( + " python create_stratified_cps.py 50000 --top=99.5 --oversample-poor" + ) + print(" python create_stratified_cps.py 30000 --seed=123 # reproducible") diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py new file mode 100644 index 00000000..ee3d3847 --- /dev/null +++ b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py @@ -0,0 +1,247 @@ +""" +Fit calibration weights using L0-regularized optimization. +Prototype script for weight calibration using the l0-python package. 
+""" + +import argparse +import logging +from datetime import datetime +from pathlib import Path + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) + +parser = argparse.ArgumentParser(description="Fit calibration weights") +parser.add_argument( + "--device", + default="cpu", + choices=["cpu", "cuda"], + help="Device for training (cpu or cuda)", +) +parser.add_argument( + "--epochs", type=int, default=100, help="Total epochs for training" +) +parser.add_argument( + "--db-path", + default=None, + help="Path to policy_data.db (default: STORAGE_FOLDER/calibration/policy_data.db)", +) +parser.add_argument( + "--dataset-path", default=None, help="Path to stratified CPS h5 file" +) +args = parser.parse_args() + +import numpy as np +import pandas as pd +from policyengine_us import Microsimulation +from policyengine_us_data.storage import STORAGE_FOLDER +from sparse_matrix_builder import SparseMatrixBuilder +from calibration_utils import get_all_cds_from_database + +try: + import torch + from l0.calibration import SparseCalibrationWeights +except ImportError: + raise ImportError( + "l0-python is required for weight fitting. " + "Install with: pip install policyengine-us-data[l0]" + ) + +# ============================================================================ +# CONFIGURATION +# ============================================================================ +DEVICE = args.device +TOTAL_EPOCHS = args.epochs +EPOCHS_PER_CHUNK = 500 # TODO: need a better way to set this. Remember it can blow up the Vercel app + +# Hyperparameters +BETA = 0.35 +GAMMA = -0.1 +ZETA = 1.1 +INIT_KEEP_PROB = 0.999 +LOG_WEIGHT_JITTER_SD = 0.05 +LOG_ALPHA_JITTER_SD = 0.01 +LAMBDA_L0 = 1e-8 +LAMBDA_L2 = 1e-12 +LEARNING_RATE = 0.15 + +# Data paths +if args.db_path: + db_path = Path(args.db_path) +else: + db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" +db_uri = f"sqlite:///{db_path}" + +if args.dataset_path: + dataset_path = Path(args.dataset_path) +else: + dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2023.h5" + +output_dir = STORAGE_FOLDER / "calibration" +output_dir.mkdir(parents=True, exist_ok=True) +time_period = 2023 + +# Get all CDs from database +cds_to_calibrate = get_all_cds_from_database(db_uri) +print(f"Found {len(cds_to_calibrate)} congressional districts") + +# ============================================================================ +# STEP 1: BUILD CALIBRATION MATRIX +# ============================================================================ +print(f"Loading simulation from {dataset_path}...") +sim = Microsimulation(dataset=str(dataset_path)) +n_households = len(sim.calculate("household_id", map_to="household").values) +print(f"Loaded {n_households:,} households") + +print("\nBuilding sparse matrix...") +builder = SparseMatrixBuilder( + db_uri=db_uri, + time_period=time_period, + cds_to_calibrate=cds_to_calibrate, + dataset_path=str(dataset_path), +) + +targets_df, X_sparse, household_id_mapping = builder.build_matrix( + sim, + target_filter={ + "stratum_group_ids": [4], + "variables": [ + "health_insurance_premiums_without_medicare_part_b", + "snap", + ], + }, +) + +print(f"Matrix shape: {X_sparse.shape}") +print(f"Targets: {len(targets_df)}") + +# Filter to achievable targets (rows with non-zero data) +row_sums = np.array(X_sparse.sum(axis=1)).flatten() +achievable_mask = row_sums > 0 +n_achievable = achievable_mask.sum() +n_impossible = (~achievable_mask).sum() + +print(f"\nAchievable targets: {n_achievable}") +print(f"Impossible 
targets (filtered out): {n_impossible}") + +targets_df = targets_df[achievable_mask].reset_index(drop=True) +X_sparse = X_sparse[achievable_mask, :] + +print(f"Filtered matrix shape: {X_sparse.shape}") + +# Extract target vector and names +targets = targets_df["value"].values +target_names = [ + f"{row['geographic_id']}/{row['variable']}" + for _, row in targets_df.iterrows() +] + +# ============================================================================ +# STEP 2: INITIALIZE WEIGHTS +# ============================================================================ +initial_weights = np.ones(X_sparse.shape[1]) * 100 +print(f"\nInitial weights shape: {initial_weights.shape}") +print(f"Initial weights sum: {initial_weights.sum():,.0f}") + +# ============================================================================ +# STEP 3: CREATE MODEL +# ============================================================================ +print("\nCreating SparseCalibrationWeights model...") +model = SparseCalibrationWeights( + n_features=X_sparse.shape[1], + beta=BETA, + gamma=GAMMA, + zeta=ZETA, + init_keep_prob=INIT_KEEP_PROB, + init_weights=initial_weights, + log_weight_jitter_sd=LOG_WEIGHT_JITTER_SD, + log_alpha_jitter_sd=LOG_ALPHA_JITTER_SD, + device=DEVICE, +) + +# ============================================================================ +# STEP 4: TRAIN IN CHUNKS +# ============================================================================ +timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") +calibration_log = pd.DataFrame() + +for chunk_start in range(0, TOTAL_EPOCHS, EPOCHS_PER_CHUNK): + chunk_epochs = min(EPOCHS_PER_CHUNK, TOTAL_EPOCHS - chunk_start) + current_epoch = chunk_start + chunk_epochs + + print(f"\nTraining epochs {chunk_start + 1} to {current_epoch}...") + + model.fit( + M=X_sparse, + y=targets, + target_groups=None, + lambda_l0=LAMBDA_L0, + lambda_l2=LAMBDA_L2, + lr=LEARNING_RATE, + epochs=chunk_epochs, + loss_type="relative", + verbose=True, + verbose_freq=chunk_epochs, + ) + + with torch.no_grad(): + predictions = model.predict(X_sparse).cpu().numpy() + + chunk_df = pd.DataFrame( + { + "target_name": target_names, + "estimate": predictions, + "target": targets, + } + ) + chunk_df["epoch"] = current_epoch + chunk_df["error"] = chunk_df.estimate - chunk_df.target + chunk_df["rel_error"] = chunk_df.error / chunk_df.target + chunk_df["abs_error"] = chunk_df.error.abs() + chunk_df["rel_abs_error"] = chunk_df.rel_error.abs() + chunk_df["loss"] = chunk_df.rel_abs_error**2 + calibration_log = pd.concat([calibration_log, chunk_df], ignore_index=True) + +# ============================================================================ +# STEP 5: EXTRACT AND SAVE WEIGHTS +# ============================================================================ +with torch.no_grad(): + w = model.get_weights(deterministic=True).cpu().numpy() + +print(f"\nFinal weights shape: {w.shape}") +print(f"Final weights sum: {w.sum():,.0f}") +print(f"Non-zero weights: {(w > 0).sum():,}") + +output_path = output_dir / f"calibration_weights_{timestamp}.npy" +np.save(output_path, w) +print(f"\nWeights saved to: {output_path}") +print(f"OUTPUT_PATH:{output_path}") + +log_path = output_dir / f"calibration_log_{timestamp}.csv" +calibration_log.to_csv(log_path, index=False) +print(f"Calibration log saved to: {log_path}") +print(f"LOG_PATH:{log_path}") + +# ============================================================================ +# STEP 6: VERIFY PREDICTIONS +# 
============================================================================ +print("\n" + "=" * 60) +print("PREDICTION VERIFICATION") +print("=" * 60) + +with torch.no_grad(): + predictions = model.predict(X_sparse).cpu().numpy() + +for i in range(len(targets)): + rel_error = (predictions[i] - targets[i]) / targets[i] * 100 + print( + f"{target_names[i][:50]:50} | " + f"pred: {predictions[i]:>12,.0f} | " + f"target: {targets[i]:>12,.0f} | " + f"err: {rel_error:>6.2f}%" + ) + +print("\n" + "=" * 60) +print("FITTING COMPLETED") +print("=" * 60) diff --git a/policyengine_us_data/datasets/cps/small_enhanced_cps.py b/policyengine_us_data/datasets/cps/small_enhanced_cps.py index a9679a2a..5e099bec 100644 --- a/policyengine_us_data/datasets/cps/small_enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/small_enhanced_cps.py @@ -50,15 +50,17 @@ def create_sparse_ecps(): ecps = EnhancedCPS_2024() h5 = ecps.load() - sparse_weights = h5["household_sparse_weight"][str(time_period)][:] + sparse_weights = h5["household_weight"][str(time_period)][:] hh_ids = h5["household_id"][str(time_period)][:] + h5.close() template_sim = Microsimulation( dataset=EnhancedCPS_2024, ) template_sim.set_input("household_weight", time_period, sparse_weights) - df = template_sim.to_input_dataframe() # Not at household level + df = template_sim.to_input_dataframe() + del template_sim household_weight_column = f"household_weight__{time_period}" df_household_id_column = f"household_id__{time_period}" @@ -102,7 +104,7 @@ def create_sparse_ecps(): if len(data[variable]) == 0: del data[variable] - with h5py.File(STORAGE_FOLDER / "sparse_enhanced_cps_2024.h5", "w") as f: + with h5py.File(STORAGE_FOLDER / "enhanced_cps_2024.h5", "w") as f: for variable, periods in data.items(): grp = f.create_group(variable) for period, values in periods.items(): diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py index b3290fe9..38afcbea 100644 --- a/policyengine_us_data/datasets/puf/puf.py +++ b/policyengine_us_data/datasets/puf/puf.py @@ -788,13 +788,7 @@ class PUF_2024(PUF): } if __name__ == "__main__": - import os - - local_area_calibration = os.environ.get("LOCAL_AREA_CALIBRATION") == "true" - - if local_area_calibration: - PUF_2023().generate() - else: - PUF_2015().generate() - PUF_2021().generate() - PUF_2024().generate() + PUF_2015().generate() + PUF_2021().generate() + PUF_2023().generate() + PUF_2024().generate() diff --git a/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py index 90c9f8c4..96e4a996 100644 --- a/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py +++ b/policyengine_us_data/tests/test_datasets/test_sparse_enhanced_cps.py @@ -17,7 +17,7 @@ @pytest.fixture(scope="session") def data(): - return Dataset.from_file(STORAGE_FOLDER / "sparse_enhanced_cps_2024.h5") + return Dataset.from_file(STORAGE_FOLDER / "enhanced_cps_2024.h5") @pytest.fixture(scope="session") @@ -93,7 +93,7 @@ def test_sparse_ecps_has_tips(sim): def test_sparse_ecps_replicates_jct_tax_expenditures(): calibration_log = pd.read_csv( - "calibration_log_sparse.csv", + "calibration_log.csv", ) jct_rows = calibration_log[ diff --git a/pyproject.toml b/pyproject.toml index 50f857ee..8fbb2490 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,9 @@ dependencies = [ calibration = [ "samplics", ] +l0 = [ + "l0-python", +] [dependency-groups] dev = [ diff --git a/uv.lock b/uv.lock index cf6075de..65e63bc1 
100644 --- a/uv.lock +++ b/uv.lock @@ -637,6 +637,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f8/0a/a3871375c7b9727edaeeea994bfff7c63ff7804c9829c19309ba2e058807/greenlet-3.3.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:b01548f6e0b9e9784a2c99c5651e5dc89ffcbe870bc5fb2e5ef864e9cc6b5dcb", size = 276379, upload-time = "2025-12-04T14:23:30.498Z" }, { url = "https://files.pythonhosted.org/packages/43/ab/7ebfe34dce8b87be0d11dae91acbf76f7b8246bf9d6b319c741f99fa59c6/greenlet-3.3.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:349345b770dc88f81506c6861d22a6ccd422207829d2c854ae2af8025af303e3", size = 597294, upload-time = "2025-12-04T14:50:06.847Z" }, { url = "https://files.pythonhosted.org/packages/a4/39/f1c8da50024feecd0793dbd5e08f526809b8ab5609224a2da40aad3a7641/greenlet-3.3.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e8e18ed6995e9e2c0b4ed264d2cf89260ab3ac7e13555b8032b25a74c6d18655", size = 607742, upload-time = "2025-12-04T14:57:42.349Z" }, + { url = "https://files.pythonhosted.org/packages/77/cb/43692bcd5f7a0da6ec0ec6d58ee7cddb606d055ce94a62ac9b1aa481e969/greenlet-3.3.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c024b1e5696626890038e34f76140ed1daf858e37496d33f2af57f06189e70d7", size = 622297, upload-time = "2025-12-04T15:07:13.552Z" }, { url = "https://files.pythonhosted.org/packages/75/b0/6bde0b1011a60782108c01de5913c588cf51a839174538d266de15e4bf4d/greenlet-3.3.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:047ab3df20ede6a57c35c14bf5200fcf04039d50f908270d3f9a7a82064f543b", size = 609885, upload-time = "2025-12-04T14:26:02.368Z" }, { url = "https://files.pythonhosted.org/packages/49/0e/49b46ac39f931f59f987b7cd9f34bfec8ef81d2a1e6e00682f55be5de9f4/greenlet-3.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2d9ad37fc657b1102ec880e637cccf20191581f75c64087a549e66c57e1ceb53", size = 1567424, upload-time = "2025-12-04T15:04:23.757Z" }, { url = "https://files.pythonhosted.org/packages/05/f5/49a9ac2dff7f10091935def9165c90236d8f175afb27cbed38fb1d61ab6b/greenlet-3.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83cd0e36932e0e7f36a64b732a6f60c2fc2df28c351bae79fbaf4f8092fe7614", size = 1636017, upload-time = "2025-12-04T14:27:29.688Z" }, @@ -644,6 +645,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/02/2f/28592176381b9ab2cafa12829ba7b472d177f3acc35d8fbcf3673d966fff/greenlet-3.3.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:a1e41a81c7e2825822f4e068c48cb2196002362619e2d70b148f20a831c00739", size = 275140, upload-time = "2025-12-04T14:23:01.282Z" }, { url = "https://files.pythonhosted.org/packages/2c/80/fbe937bf81e9fca98c981fe499e59a3f45df2a04da0baa5c2be0dca0d329/greenlet-3.3.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f515a47d02da4d30caaa85b69474cec77b7929b2e936ff7fb853d42f4bf8808", size = 599219, upload-time = "2025-12-04T14:50:08.309Z" }, { url = "https://files.pythonhosted.org/packages/c2/ff/7c985128f0514271b8268476af89aee6866df5eec04ac17dcfbc676213df/greenlet-3.3.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7d2d9fd66bfadf230b385fdc90426fcd6eb64db54b40c495b72ac0feb5766c54", size = 610211, upload-time = "2025-12-04T14:57:43.968Z" }, + { url = "https://files.pythonhosted.org/packages/79/07/c47a82d881319ec18a4510bb30463ed6891f2ad2c1901ed5ec23d3de351f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", 
hash = "sha256:30a6e28487a790417d036088b3bcb3f3ac7d8babaa7d0139edbaddebf3af9492", size = 624311, upload-time = "2025-12-04T15:07:14.697Z" }, { url = "https://files.pythonhosted.org/packages/fd/8e/424b8c6e78bd9837d14ff7df01a9829fc883ba2ab4ea787d4f848435f23f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:087ea5e004437321508a8d6f20efc4cfec5e3c30118e1417ea96ed1d93950527", size = 612833, upload-time = "2025-12-04T14:26:03.669Z" }, { url = "https://files.pythonhosted.org/packages/b5/ba/56699ff9b7c76ca12f1cdc27a886d0f81f2189c3455ff9f65246780f713d/greenlet-3.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ab97cf74045343f6c60a39913fa59710e4bd26a536ce7ab2397adf8b27e67c39", size = 1567256, upload-time = "2025-12-04T15:04:25.276Z" }, { url = "https://files.pythonhosted.org/packages/1e/37/f31136132967982d698c71a281a8901daf1a8fbab935dce7c0cf15f942cc/greenlet-3.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5375d2e23184629112ca1ea89a53389dddbffcf417dad40125713d88eb5f96e8", size = 1636483, upload-time = "2025-12-04T14:27:30.804Z" }, @@ -1083,6 +1085,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b1/dd/ead9d8ea85bf202d90cc513b533f9c363121c7792674f78e0d8a854b63b4/jupyterlab_pygments-0.3.0-py3-none-any.whl", hash = "sha256:841a89020971da1d8693f1a99997aefc5dc424bb1b251fd6322462a1b8842780", size = 15884, upload-time = "2023-11-23T09:26:34.325Z" }, ] +[[package]] +name = "l0-python" +version = "0.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "scipy" }, + { name = "torch" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cf/6b/4a9ca6d1eb9828c526947fffb2ee2a1d02eec330f04cd53af301a05fde0a/l0_python-0.5.0.tar.gz", hash = "sha256:9b6b1751e142702e21ed866e40d8ab47304a26a5455998620a0eb798f4c7f599", size = 36320, upload-time = "2026-01-21T13:55:53.365Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/80/33ccae8af3fe55a81d33569d9241a29cecde17ab34fdff214804e81fa353/l0_python-0.5.0-py3-none-any.whl", hash = "sha256:9c8f4532426b927a97f4722b1c5114147adb09365100623effb49c0021345881", size = 23590, upload-time = "2026-01-21T13:55:52.406Z" }, +] + [[package]] name = "lark" version = "1.3.1" @@ -1843,7 +1859,7 @@ wheels = [ [[package]] name = "policyengine-us-data" -version = "1.54.0" +version = "1.54.1" source = { editable = "." } dependencies = [ { name = "google-auth" }, @@ -1873,6 +1889,9 @@ dependencies = [ calibration = [ { name = "samplics" }, ] +l0 = [ + { name = "l0-python" }, +] [package.dev-dependencies] dev = [ @@ -1893,6 +1912,7 @@ dev = [ requires-dist = [ { name = "google-auth", specifier = ">=2.0.0" }, { name = "google-cloud-storage", specifier = ">=2.0.0" }, + { name = "l0-python", marker = "extra == 'l0'" }, { name = "microdf-python", specifier = ">=1.2.1" }, { name = "microimpute", specifier = ">=1.1.4" }, { name = "openpyxl", specifier = ">=3.1.5" }, @@ -1914,7 +1934,7 @@ requires-dist = [ { name = "us", specifier = ">=2.0.0" }, { name = "xlrd", specifier = ">=2.0.2" }, ] -provides-extras = ["calibration"] +provides-extras = ["calibration", "l0"] [package.metadata.requires-dev] dev = [