5 changes: 5 additions & 0 deletions changelog_entry.yaml
@@ -0,0 +1,5 @@
- bump: minor
  changes:
    added:
      - Deduplication logic in SparseMatrixBuilder (option to remove duplicate targets or select the most specific geographic level).
      - Entity-aware target calculations for correct entity counts.
@@ -610,3 +610,78 @@ def calculate_spm_thresholds_for_cd(
thresholds[i] = base * equiv_scale * geoadj

return thresholds


def build_concept_id(variable: str, constraints: List[str]) -> str:
    """
    Build normalized concept ID from variable + constraints.

    The concept ID uniquely identifies a calibration target "concept"
    based on the variable being measured and its non-geographic constraints.

    Args:
        variable: Target variable name (e.g., "person_count", "snap")
        constraints: List of constraint strings (e.g., ["age>=5", "age<18"])

    Returns:
        Normalized concept ID string

    Examples:
        >>> build_concept_id("person_count", ["age>=5", "age<18"])
        'person_count_age_lt_18_age_gte_5'
        >>> build_concept_id("snap", ["snap>0"])
        'snap_snap_gt_0'
        >>> build_concept_id("snap", [])
        'snap'
    """
    if not constraints:
        return variable

    # Normalize and sort constraints for consistent IDs
    normalized = []
    for c in sorted(constraints):
        c_norm = (
            c.replace(">=", "_gte_")
            .replace("<=", "_lte_")
            .replace(">", "_gt_")
            .replace("<", "_lt_")
            .replace("==", "_eq_")
            .replace("=", "_eq_")
            .replace(" ", "")
        )
        normalized.append(c_norm)

    return f"{variable}_{'_'.join(normalized)}"


def extract_constraints_from_row(
    row: pd.Series, exclude_geo: bool = True
) -> List[str]:
    """
    Extract constraint list from a target row's constraint_info column.

    Args:
        row: DataFrame row with 'constraint_info' column containing
            pipe-separated constraints (e.g., "age>=5|age<18|state_fips=6")
        exclude_geo: If True, filter out geographic constraints
            (state_fips, congressional_district_geoid, tax_unit_is_filer)

    Returns:
        List of constraint strings like ["age>=5", "age<18"]
    """
    if "constraint_info" not in row or pd.isna(row["constraint_info"]):
        return []

    constraints = row["constraint_info"].split("|")

    if exclude_geo:
        geo_vars = [
            "state_fips",
            "congressional_district_geoid",
            "tax_unit_is_filer",
        ]
        constraints = [
            c for c in constraints if not any(geo in c for geo in geo_vars)
        ]

    return constraints
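
A minimal sketch (not part of this diff) of how the two helpers above can key target rows by concept and resolve duplicates by geographic specificity. The DataFrame columns other than constraint_info, the geo_level values, and the specificity ranking are illustrative assumptions based on the changelog description, not SparseMatrixBuilder's actual deduplication logic:

import pandas as pd

# Hypothetical targets: the same concept (people aged 5-17) reported at two
# geographic levels; only the constraint_info column matches the real schema.
targets = pd.DataFrame(
    {
        "variable": ["person_count", "person_count"],
        "constraint_info": ["age>=5|age<18", "age>=5|age<18|state_fips=6"],
        "geo_level": ["national", "state"],  # assumed column for this sketch
        "value": [53_000_000.0, 6_500_000.0],
    }
)

# Key each row by its non-geographic concept ID.
targets["concept_id"] = targets.apply(
    lambda row: build_concept_id(row["variable"], extract_constraints_from_row(row)),
    axis=1,
)

# One way to "select the most specific geographic level": sort by an assumed
# specificity ranking and keep the last (most specific) row per concept.
specificity = {"national": 0, "state": 1, "district": 2}
most_specific = (
    targets.sort_values("geo_level", key=lambda s: s.map(specificity))
    .groupby("concept_id")
    .tail(1)
)
print(most_specific[["concept_id", "geo_level", "value"]])
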
@@ -111,11 +111,20 @@
"snap",
],
},
deduplicate=True,
dedup_mode="within_geography",
)

print(f"Matrix shape: {X_sparse.shape}")
print(f"Targets: {len(targets_df)}")
# Print concept and deduplication summaries
builder.print_concept_summary()
builder.print_dedup_summary()

print(f"\nMatrix shape: {X_sparse.shape}")
print(f"Targets after deduplication: {len(targets_df)}")

# ============================================================================
# STEP 2: FILTER TO ACHIEVABLE TARGETS
# ============================================================================
# Filter to achievable targets (rows with non-zero data)
row_sums = np.array(X_sparse.sum(axis=1)).flatten()
achievable_mask = row_sums > 0
@@ -128,7 +137,7 @@
targets_df = targets_df[achievable_mask].reset_index(drop=True)
X_sparse = X_sparse[achievable_mask, :]

print(f"Filtered matrix shape: {X_sparse.shape}")
print(f"Final matrix shape: {X_sparse.shape}")

# Extract target vector and names
targets = targets_df["value"].values
@@ -138,14 +147,14 @@
]

# ============================================================================
# STEP 2: INITIALIZE WEIGHTS
# STEP 3: INITIALIZE WEIGHTS
# ============================================================================
initial_weights = np.ones(X_sparse.shape[1]) * 100
print(f"\nInitial weights shape: {initial_weights.shape}")
print(f"Initial weights sum: {initial_weights.sum():,.0f}")

# ============================================================================
# STEP 3: CREATE MODEL
# STEP 4: CREATE MODEL
# ============================================================================
print("\nCreating SparseCalibrationWeights model...")
model = SparseCalibrationWeights(
@@ -161,7 +170,7 @@
)

# ============================================================================
# STEP 4: TRAIN IN CHUNKS
# STEP 5: TRAIN IN CHUNKS
# ============================================================================
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
calibration_log = pd.DataFrame()
@@ -204,7 +213,7 @@
calibration_log = pd.concat([calibration_log, chunk_df], ignore_index=True)

# ============================================================================
# STEP 5: EXTRACT AND SAVE WEIGHTS
# STEP 6: EXTRACT AND SAVE WEIGHTS
# ============================================================================
with torch.no_grad():
    w = model.get_weights(deterministic=True).cpu().numpy()
@@ -224,7 +233,7 @@
print(f"LOG_PATH:{log_path}")

# ============================================================================
# STEP 6: VERIFY PREDICTIONS
# STEP 7: VERIFY PREDICTIONS
# ============================================================================
print("\n" + "=" * 60)
print("PREDICTION VERIFICATION")