104 changes: 104 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,104 @@
# Changelog - v0.7.0

## Migration Guide

### From v0.5.x / v0.6.x

#### Database Migration

The database schema has changed significantly. Run the following after upgrading:

```bash
singularity admin init
```

This will auto-migrate the schema. Key changes:
- Foreign keys changed from CASCADE to SET NULL for performance
- New indexes added for job and preparation cleanup
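
After migrating, the new `ON DELETE` behavior can be spot-checked on PostgreSQL. A minimal sketch, assuming `psql` access and a connection string in `$DATABASE_CONNECTION_STRING` (both assumptions, not part of this release):

```bash
# List ON DELETE actions for all foreign keys.
# confdeltype 'c' = CASCADE (old), 'n' = SET NULL (expected after v0.7.0).
psql "$DATABASE_CONNECTION_STRING" <<'SQL'
SELECT conname, conrelid::regclass AS child_table, confdeltype
FROM pg_constraint
WHERE contype = 'f'
ORDER BY child_table::text;
SQL
```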

`admin init` is generally cheap to run when the schema is already healthy, so it is worth automating as a pre-start script in your orchestration. However, the initial migration to v0.7.0 may take some time due to index rebuilds, so for this one upgrade it is advisable to halt the other services and run `admin init` standalone.
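
As a sketch of the steady-state automation (the wrapper itself and the `singularity run api` entrypoint are illustrative; adapt to your deployment):

```bash
#!/usr/bin/env bash
# Hypothetical pre-start wrapper: verify/auto-migrate the schema, then
# start the long-running service. For the v0.7.0 upgrade itself, run
# `singularity admin init` on its own first, as described above.
set -euo pipefail
singularity admin init
exec singularity run api "$@"
```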

#### Breaking: MongoDB Removed

MongoDB backend import is no longer supported. If you were using MongoDB (singularity v1):

1. Export your data from v1
2. Upgrade to v0.6.0-RC2 and import into PostgreSQL/YugabyteDB (MySQL is strongly discouraged; SQLite is suitable only for very small deployments) — see the sketch after this list
3. Then upgrade to v0.7.0
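
A hedged sketch of step 2 on a v0.6.0-RC2 install; the `migrate-dataset`/`migrate-schedule` subcommand names and the connection setup are assumptions and should be verified against `singularity admin --help` on that version:

```bash
# Point the RC2 install at the new SQL database, then import from v1.
export DATABASE_CONNECTION_STRING="postgres://user:pass@host:5432/singularity"
singularity admin init              # create the SQL schema
singularity admin migrate-dataset   # import datasets from the v1 MongoDB
singularity admin migrate-schedule  # import schedules from the v1 MongoDB
```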

#### Sequence Reset (PostgreSQL/YugabyteDB)

**Automatic**: `singularity admin init` now detects and fixes stale sequences automatically. This handles the common case of importing data with explicit IDs (e.g., from MySQL backup).

**Manual**: If needed, a standalone script is available at `scripts/fix-sequences-after-import.sql`.
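
For example, on PostgreSQL (connection string assumed to be in `$DATABASE_CONNECTION_STRING`); the script only sets sequences to the current `MAX(id)`, so re-running it is harmless:

```bash
psql "$DATABASE_CONNECTION_STRING" -f scripts/fix-sequences-after-import.sql
```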

#### Piece Type Classification

**Automatic**: `singularity admin init` now infers `piece_type` for CAR files that predate this column, classifying them as `data` (contains file content) or `dag` (directory metadata only). This is a no-op if all pieces are already labeled.

**Manual**: A standalone script is available at `scripts/infer_piece_type.sql` for inspection or manual runs.
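
The script's `UPDATE` ships commented out, so running it as-is only previews the classification (same `psql` assumptions as above):

```bash
psql "$DATABASE_CONNECTION_STRING" -f scripts/infer_piece_type.sql
```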

#### Curio Compatibility (DAG Pieces)

If DAG pieces were rejected by Curio, you will need to regenerate them (a combined sketch follows the list):

1. Run `singularity admin init` to classify existing pieces
2. Delete the rejected DAG pieces using `singularity prep delete-piece`
3. Re-run `singularity prep start-daggen` on affected preparations to create Curio-compatible pieces
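
Put together, the recovery looks roughly like this; the preparation name and piece CID are placeholders, and the exact `delete-piece` arguments should be checked against `singularity prep delete-piece --help`:

```bash
singularity admin init                                # classify existing pieces
singularity prep delete-piece my-prep baga6ea4sea...  # once per rejected DAG piece
singularity prep start-daggen my-prep                 # regenerate Curio-compatible pieces
```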

---

## Features

- **prep delete-piece command** - Delete specific pieces from a preparation (#602)
- **Small piece padding** - Pad very small pieces with literal zeroes for Curio compatibility (#595)

## Bug Fixes

- **S3 car file renames** - Fix car file renames on S3-type remotes (#598)
- **Remote storage padding** - Use local tmpfile for padding small pieces on remote/non-appendable storage (#597)
- **Deadlocks resolved** - Fix remaining deadlocks in prep deletion and job claiming (#596)
- **Download flags** - Harmonize download flags across commands (#599, closes #460)
- **FK re-scanning** - Avoid re-scanning known valid foreign keys during cleanup (#601)
- **docker-compose env var** - Fix singularity_init container environment variable (#526)

## Database & Performance

- **Nullable FKs for massive deletes** - Use nullable foreign keys and pointers for fast bulk deletion (#600)
- **Indexes for FK cleanup** - Add indexes on jobs/preps foreign keys for faster operations
- **SKIP LOCKED for job claiming** - Prevent deadlocks during concurrent job acquisition (see the sketch after this list)
- **Deferred association loading** - Load associations after claiming to reduce lock contention
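
The claiming pattern behind the SKIP LOCKED change is roughly the following. This is a schematic sketch only; the `state` column and `'ready'` value are illustrative, not the actual schema:

```bash
psql "$DATABASE_CONNECTION_STRING" <<'SQL'
BEGIN;
-- Each worker takes one unclaimed job; rows already locked by other
-- workers are skipped rather than waited on, so claimers cannot
-- deadlock against each other.
SELECT id FROM jobs
WHERE state = 'ready'
LIMIT 1
FOR UPDATE SKIP LOCKED;
-- ...mark the selected job as claimed, then:
COMMIT;
SQL
```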

## Infrastructure

- **MongoDB removed** - Remove legacy MongoDB support (#588)
- **Devcontainer CI** - Add devcontainer-based CI workflow with PostgreSQL and MySQL (#586)
- **Distroless runner** - Use distroless container image for smaller footprint (#592)
- **Dependency upgrades** - Significantly update gorm, boxo, rclone, libp2p, and security packages (#591, #589)

## Breaking Changes

- MongoDB backend no longer supported

---

## PRs Included

| PR | Title |
|----|-------|
| #602 | feat: add prep delete-piece command |
| #601 | hotfix: avoid re-scanning known valid FKs |
| #600 | give in and use nullable FKs/pointers for massive deletes |
| #599 | harmonize download flags, closes #460 |
| #598 | fix: correctly rename cars on S3 type remotes |
| #597 | hotfix: use local tmpfile for padding small pieces |
| #596 | fix remaining deadlocks + prep deletion |
| #595 | Pad very small pieces with literal zeroes |
| #592 | update runner image |
| #591 | improve lotus API test survivability |
| #589 | chore/update |
| #588 | remove legacy mongodb |
| #586 | Devcontainer-based CI flow |
| #526 | fix docker-compose.yml singularity_init container env var |
| #519 | chore: bump version to v0.6.0-RC3 |
| #516 | Fix: restore version.json to v0.6.0-RC2 |
117 changes: 117 additions & 0 deletions model/migrate.go
@@ -107,6 +107,16 @@ func AutoMigrate(db *gorm.DB) error {
        return errors.Wrap(err, "failed to create salt")
    }

    // Fix postgres sequences if they're out of sync (e.g., after data import)
    if err := fixPostgresSequences(db); err != nil {
        return errors.Wrap(err, "failed to fix sequences")
    }

    // Infer piece_type for cars that predate the column
    if err := inferPieceTypes(db); err != nil {
        return errors.Wrap(err, "failed to infer piece types")
    }

    return nil
}

@@ -189,6 +199,113 @@ func migrateFKConstraints(db *gorm.DB) error {
    return nil
}

// sequenceTables lists the tables whose primary key sequences may need fixing.
// Only tables with numeric auto-increment PKs are included.
var sequenceTables = []string{
    "preparations",
    "storages",
    "output_attachments",
    "source_attachments",
    "jobs",
    "files",
    "file_ranges",
    "directories",
    "cars",
    "car_blocks",
    "deals",
    "schedules",
}

// fixPostgresSequences detects and fixes out-of-sync sequences.
// This can happen when data is imported with explicit IDs (e.g., from MySQL).
// PostgreSQL sequences don't auto-update on INSERT with explicit ID values.
func fixPostgresSequences(db *gorm.DB) error {
    if db.Dialector.Name() != "postgres" {
        return nil
    }

    for _, table := range sequenceTables {
        var maxID, lastValue int64

        // get max id from table
        err := db.Raw(`SELECT COALESCE(MAX(id), 0) FROM ` + table).Scan(&maxID).Error
        if err != nil {
            // table might not exist yet
            logger.Debugw("skipping sequence check", "table", table, "error", err)
            continue
        }

        // get sequence name and current value
        seqName := table + "_id_seq"
        err = db.Raw(`SELECT last_value FROM ` + seqName).Scan(&lastValue).Error
        if err != nil {
            logger.Debugw("skipping sequence check", "sequence", seqName, "error", err)
            continue
        }

        // if max(id) >= sequence value, the sequence is stale
        if maxID >= lastValue {
            logger.Infow("fixing stale sequence", "table", table, "maxID", maxID, "lastValue", lastValue)
            err = db.Exec(`SELECT setval(?, ?, true)`, seqName, maxID).Error
            if err != nil {
                return errors.Wrapf(err, "failed to fix sequence %s", seqName)
            }
        }
    }

    return nil
}

// inferPieceTypes sets piece_type for cars that predate the column.
// A piece is "data" if any of its blocks reference files (contain file content).
// A piece is "dag" if none of its blocks reference files (directory metadata only).
// This is idempotent - it only updates rows where piece_type is NULL or empty.
func inferPieceTypes(db *gorm.DB) error {
    dialect := db.Dialector.Name()

    // check if any cars need updating
    var count int64
    err := db.Raw(`SELECT COUNT(*) FROM cars WHERE piece_type IS NULL OR piece_type = ''`).Scan(&count).Error
    if err != nil {
        // table might not exist or column missing
        logger.Debugw("skipping piece type inference", "error", err)
        return nil
    }

    if count == 0 {
        return nil
    }

    logger.Infow("inferring piece types for legacy cars", "count", count)

    // dialect-specific UPDATE with subquery
    var query string
    if dialect == "sqlite" {
        query = `
            UPDATE cars SET piece_type = (
                CASE WHEN EXISTS (
                    SELECT 1 FROM car_blocks WHERE car_blocks.car_id = cars.id AND car_blocks.file_id IS NOT NULL
                ) THEN 'data' ELSE 'dag' END
            ) WHERE piece_type IS NULL OR piece_type = ''`
    } else {
        // postgres/mysql accept a bare table alias in UPDATE; sqlite does not
        query = `
            UPDATE cars c SET piece_type = CASE
                WHEN EXISTS (
                    SELECT 1 FROM car_blocks cb WHERE cb.car_id = c.id AND cb.file_id IS NOT NULL
                ) THEN 'data' ELSE 'dag'
            END WHERE c.piece_type IS NULL OR c.piece_type = ''`
    }

    result := db.Exec(query)
    if result.Error != nil {
        return errors.Wrap(result.Error, "failed to infer piece types")
    }

    logger.Infow("inferred piece types", "updated", result.RowsAffected)
    return nil
}

// DropAll removes all tables specified in the Tables slice from the database.
//
// This function is typically used during development or testing where a clean database
17 changes: 17 additions & 0 deletions scripts/fix-sequences-after-import.sql
@@ -0,0 +1,17 @@
-- reset sequences after importing data with explicit IDs (e.g., from mysql)
-- postgresql sequences don't auto-update when inserting with explicit IDs
-- setval(seq, max, true) means next nextval() returns max+1

SELECT setval(pg_get_serial_sequence('preparations', 'id'), COALESCE((SELECT MAX(id) FROM preparations), 0), true);
SELECT setval(pg_get_serial_sequence('storages', 'id'), COALESCE((SELECT MAX(id) FROM storages), 0), true);
SELECT setval(pg_get_serial_sequence('source_attachments', 'id'), COALESCE((SELECT MAX(id) FROM source_attachments), 0), true);
SELECT setval(pg_get_serial_sequence('output_attachments', 'id'), COALESCE((SELECT MAX(id) FROM output_attachments), 0), true);
SELECT setval(pg_get_serial_sequence('jobs', 'id'), COALESCE((SELECT MAX(id) FROM jobs), 0), true);
SELECT setval(pg_get_serial_sequence('files', 'id'), COALESCE((SELECT MAX(id) FROM files), 0), true);
SELECT setval(pg_get_serial_sequence('file_ranges', 'id'), COALESCE((SELECT MAX(id) FROM file_ranges), 0), true);
SELECT setval(pg_get_serial_sequence('directories', 'id'), COALESCE((SELECT MAX(id) FROM directories), 0), true);
SELECT setval(pg_get_serial_sequence('cars', 'id'), COALESCE((SELECT MAX(id) FROM cars), 0), true);
SELECT setval(pg_get_serial_sequence('car_blocks', 'id'), COALESCE((SELECT MAX(id) FROM car_blocks), 0), true);
SELECT setval(pg_get_serial_sequence('deals', 'id'), COALESCE((SELECT MAX(id) FROM deals), 0), true);
SELECT setval(pg_get_serial_sequence('schedules', 'id'), COALESCE((SELECT MAX(id) FROM schedules), 0), true);
-- workers, globals, wallets have string PKs (no sequence)
30 changes: 30 additions & 0 deletions scripts/infer_piece_type.sql
@@ -0,0 +1,30 @@
-- Infer piece_type for cars that predate the piece_type column
-- A piece is Data if any of its blocks reference files (contain file content)
-- A piece is DAG if none of its blocks reference files (contain only directory metadata)

-- Preview what would be updated
SELECT
c.id,
c.piece_cid,
c.piece_size,
c.piece_type as current_type,
CASE
WHEN EXISTS (
SELECT 1 FROM car_blocks cb
WHERE cb.car_id = c.id AND cb.file_id IS NOT NULL
) THEN 'data'
ELSE 'dag'
END as inferred_type
FROM cars c
WHERE c.piece_type IS NULL OR c.piece_type = '';

-- Uncomment to actually update:
-- UPDATE cars c
-- SET piece_type = CASE
-- WHEN EXISTS (
-- SELECT 1 FROM car_blocks cb
-- WHERE cb.car_id = c.id AND cb.file_id IS NOT NULL
-- ) THEN 'data'
-- ELSE 'dag'
-- END
-- WHERE c.piece_type IS NULL OR c.piece_type = '';