5 changes: 5 additions & 0 deletions changelog_entry.yaml
@@ -0,0 +1,5 @@
- bump: minor
  changes:
    added:
      - Deduplication logic in SparseMatrixBuilder (option to remove duplicate targets or select the most specific geographic level).
      - Entity-aware target calculations for correct entity counts.
@@ -610,3 +610,78 @@ def calculate_spm_thresholds_for_cd(
thresholds[i] = base * equiv_scale * geoadj

return thresholds


def build_concept_id(variable: str, constraints: List[str]) -> str:
    """
    Build normalized concept ID from variable + constraints.

    The concept ID uniquely identifies a calibration target "concept"
    based on the variable being measured and its non-geographic constraints.

    Args:
        variable: Target variable name (e.g., "person_count", "snap")
        constraints: List of constraint strings (e.g., ["age>=5", "age<18"])

    Returns:
        Normalized concept ID string

    Examples:
        >>> build_concept_id("person_count", ["age>=5", "age<18"])
        'person_count_age_lt_18_age_gte_5'
        >>> build_concept_id("snap", ["snap>0"])
        'snap_snap_gt_0'
        >>> build_concept_id("snap", [])
        'snap'
    """
    if not constraints:
        return variable

    # Normalize and sort constraints for consistent IDs
    normalized = []
    for c in sorted(constraints):
        c_norm = (
            c.replace(">=", "_gte_")
            .replace("<=", "_lte_")
            .replace(">", "_gt_")
            .replace("<", "_lt_")
            .replace("==", "_eq_")
            .replace("=", "_eq_")
            .replace(" ", "")
        )
        normalized.append(c_norm)

    return f"{variable}_{'_'.join(normalized)}"


def extract_constraints_from_row(
    row: pd.Series, exclude_geo: bool = True
) -> List[str]:
    """
    Extract constraint list from a target row's constraint_info column.

    Args:
        row: DataFrame row with 'constraint_info' column containing
            pipe-separated constraints (e.g., "age>=5|age<18|state_fips=6")
        exclude_geo: If True, filter out geographic constraints
            (state_fips, congressional_district_geoid, tax_unit_is_filer)

    Returns:
        List of constraint strings like ["age>=5", "age<18"]
    """
    if "constraint_info" not in row or pd.isna(row["constraint_info"]):
        return []

    constraints = row["constraint_info"].split("|")

    if exclude_geo:
        geo_vars = [
            "state_fips",
            "congressional_district_geoid",
            "tax_unit_is_filer",
        ]
        constraints = [
            c for c in constraints if not any(geo in c for geo in geo_vars)
        ]

    return constraints
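
A minimal sketch (not part of this diff) of how the two helpers above can key target rows by concept and resolve duplicates by geographic specificity. The DataFrame columns other than constraint_info, the geo_level values, and the specificity ranking are illustrative assumptions based on the changelog description, not SparseMatrixBuilder's actual deduplication logic:

import pandas as pd

# Hypothetical targets: the same concept (people aged 5-17) reported at two
# geographic levels; only the constraint_info column matches the real schema.
targets = pd.DataFrame(
    {
        "variable": ["person_count", "person_count"],
        "constraint_info": ["age>=5|age<18", "age>=5|age<18|state_fips=6"],
        "geo_level": ["national", "state"],  # assumed column for this sketch
        "value": [53_000_000.0, 6_500_000.0],
    }
)

# Key each row by its non-geographic concept ID.
targets["concept_id"] = targets.apply(
    lambda row: build_concept_id(row["variable"], extract_constraints_from_row(row)),
    axis=1,
)

# One way to "select the most specific geographic level": sort by an assumed
# specificity ranking and keep the last (most specific) row per concept.
specificity = {"national": 0, "state": 1, "district": 2}
most_specific = (
    targets.sort_values("geo_level", key=lambda s: s.map(specificity))
    .groupby("concept_id")
    .tail(1)
)
print(most_specific[["concept_id", "geo_level", "value"]])
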
@@ -111,11 +111,20 @@
"snap",
],
},
deduplicate=True,
dedup_mode="within_geography",
)

print(f"Matrix shape: {X_sparse.shape}")
print(f"Targets: {len(targets_df)}")
# Print concept and deduplication summaries
builder.print_concept_summary()
builder.print_dedup_summary()

print(f"\nMatrix shape: {X_sparse.shape}")
print(f"Targets after deduplication: {len(targets_df)}")

# ============================================================================
# STEP 2: FILTER TO ACHIEVABLE TARGETS
# ============================================================================
# Filter to achievable targets (rows with non-zero data)
row_sums = np.array(X_sparse.sum(axis=1)).flatten()
achievable_mask = row_sums > 0
@@ -128,7 +137,7 @@
targets_df = targets_df[achievable_mask].reset_index(drop=True)
X_sparse = X_sparse[achievable_mask, :]

print(f"Filtered matrix shape: {X_sparse.shape}")
print(f"Final matrix shape: {X_sparse.shape}")

# Extract target vector and names
targets = targets_df["value"].values
@@ -138,14 +147,14 @@
]

# ============================================================================
# STEP 2: INITIALIZE WEIGHTS
# STEP 3: INITIALIZE WEIGHTS
# ============================================================================
initial_weights = np.ones(X_sparse.shape[1]) * 100
print(f"\nInitial weights shape: {initial_weights.shape}")
print(f"Initial weights sum: {initial_weights.sum():,.0f}")

# ============================================================================
# STEP 3: CREATE MODEL
# STEP 4: CREATE MODEL
# ============================================================================
print("\nCreating SparseCalibrationWeights model...")
model = SparseCalibrationWeights(
@@ -161,7 +170,7 @@
)

# ============================================================================
# STEP 4: TRAIN IN CHUNKS
# STEP 5: TRAIN IN CHUNKS
# ============================================================================
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
calibration_log = pd.DataFrame()
@@ -204,7 +213,7 @@
calibration_log = pd.concat([calibration_log, chunk_df], ignore_index=True)

# ============================================================================
# STEP 5: EXTRACT AND SAVE WEIGHTS
# STEP 6: EXTRACT AND SAVE WEIGHTS
# ============================================================================
with torch.no_grad():
    w = model.get_weights(deterministic=True).cpu().numpy()
@@ -224,7 +233,7 @@
print(f"LOG_PATH:{log_path}")

# ============================================================================
# STEP 6: VERIFY PREDICTIONS
# STEP 7: VERIFY PREDICTIONS
# ============================================================================
print("\n" + "=" * 60)
print("PREDICTION VERIFICATION")