diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 00000000..43231443
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,104 @@
+# Changelog - v0.7.0
+
+## Migration Guide
+
+### From v0.5.x / v0.6.x
+
+#### Database Migration
+
+The database schema has changed significantly. Run the following after upgrading:
+
+```bash
+singularity admin init
+```
+
+This will auto-migrate the schema. Key changes:
+- Foreign keys changed from CASCADE to SET NULL for performance
+- New indexes added for job and preparation cleanup
+
+`admin init` is cheap to run when the schema is already up to date, so it is worth automating as a pre-start script in your orchestration. The initial migration to v0.7.0 is the one exception: index rebuilds may take some time, so for this upgrade halt other services and run `admin init` standalone.
+
+#### Breaking: MongoDB Removed
+
+Importing from MongoDB is no longer supported. If you were using MongoDB (singularity v1):
+
+1. Export your data from v1
+2. Upgrade to v0.6.0-RC2 and import into PostgreSQL/YugabyteDB (MySQL is strongly discouraged; SQLite is suitable only for very small deployments)
+3. Then upgrade to v0.7.0
+
+#### Sequence Reset (PostgreSQL/YugabyteDB)
+
+**Automatic**: `singularity admin init` now detects and fixes stale sequences automatically. This handles the common case of importing data with explicit IDs (e.g., from a MySQL backup).
+
+**Manual**: If needed, a standalone script is available at `scripts/fix-sequences-after-import.sql`.
+
+#### Piece Type Classification
+
+**Automatic**: `singularity admin init` now infers `piece_type` for CAR files that predate this column, classifying them as `data` (contains file content) or `dag` (directory metadata only). This is a no-op if all pieces are already labeled.
+
+**Manual**: A standalone script is available at `scripts/infer_piece_type.sql` for inspection or manual runs.
+
+#### Curio Compatibility (DAG Pieces)
+
+If DAG pieces were rejected by Curio, you will need to regenerate them (a query to preview the affected pieces follows these steps):
+
+1. Run `singularity admin init` to classify existing pieces
+2. Delete the rejected DAG pieces using `singularity prep delete-piece`
+3. Re-run `singularity prep start-daggen` on affected preparations to create Curio-compatible pieces
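+
+To preview which pieces step 2 will delete, you can adapt the classification query from `scripts/infer_piece_type.sql`. A minimal sketch (it assumes step 1 has already labeled `piece_type`; only the pieces Curio actually rejected need regeneration):
+
+```sql
+-- DAG-only pieces: the candidates for deletion and regeneration
+SELECT id, piece_cid, piece_size
+FROM cars
+WHERE piece_type = 'dag';
+```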
+
+---
+
+## Features
+
+- **prep delete-piece command** - Delete specific pieces from a preparation (#602)
+- **Small piece padding** - Pad very small pieces with literal zeroes for Curio compatibility (#595)
+
+## Bug Fixes
+
+- **S3 car file renames** - Fix car file renames on S3-type remotes (#598)
+- **Remote storage padding** - Use a local tmpfile for padding small pieces on remote/non-appendable storage (#597)
+- **Deadlocks resolved** - Fix remaining deadlocks in prep deletion and job claiming (#596)
+- **Download flags** - Harmonize download flags across commands (#599, closes #460)
+- **FK re-scanning** - Avoid re-scanning known-valid foreign keys during cleanup (#601)
+- **docker-compose env var** - Fix the singularity_init container environment variable (#526)
+
+## Database & Performance
+
+- **Nullable FKs for massive deletes** - Use nullable foreign keys and pointers for fast bulk deletion (#600)
+- **Indexes for FK cleanup** - Add indexes on jobs/preps foreign keys for faster operations
+- **SKIP LOCKED for job claiming** - Prevent deadlocks during concurrent job acquisition (sketched below)
+- **Deferred association loading** - Load associations after claiming to reduce lock contention
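+
+For illustration, the claiming pattern relies on `SELECT ... FOR UPDATE SKIP LOCKED`, which lets concurrent workers pick different rows instead of blocking on the same one. A hedged sketch, not the exact query Singularity issues (the `state` filter below is hypothetical):
+
+```sql
+BEGIN;
+SELECT id FROM jobs
+WHERE state = 'ready'     -- hypothetical column/value, for illustration only
+ORDER BY id
+LIMIT 1
+FOR UPDATE SKIP LOCKED;   -- skip rows other workers have already locked
+-- mark the returned job as claimed, then:
+COMMIT;
+```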
+
+## Infrastructure
+
+- **MongoDB removed** - Remove legacy MongoDB support (#588)
+- **Devcontainer CI** - Add a devcontainer-based CI workflow with PostgreSQL and MySQL (#586)
+- **Distroless runner** - Use a distroless container image for a smaller footprint (#592)
+- **Dependency upgrades** - Significantly update gorm, boxo, rclone, libp2p, and security packages (#591, #589)
+
+## Breaking Changes
+
+- MongoDB backend no longer supported
+
+---
+
+## PRs Included
+
+| PR | Title |
+|----|-------|
+| #602 | feat: add prep delete-piece command |
+| #601 | hotfix: avoid re-scanning known valid FKs |
+| #600 | give in and use nullable FKs/pointers for massive deletes |
+| #599 | harmonize download flags, closes #460 |
+| #598 | fix: correctly rename cars on S3 type remotes |
+| #597 | hotfix: use local tmpfile for padding small pieces |
+| #596 | fix remaining deadlocks + prep deletion |
+| #595 | Pad very small pieces with literal zeroes |
+| #592 | update runner image |
+| #591 | improve lotus API test survivability |
+| #589 | chore/update |
+| #588 | remove legacy mongodb |
+| #586 | Devcontainer-based CI flow |
+| #526 | fix docker-compose.yml singularity_init container env var |
+| #519 | chore: bump version to v0.6.0-RC3 |
+| #516 | Fix: restore version.json to v0.6.0-RC2 |
diff --git a/model/migrate.go b/model/migrate.go
index be5621c0..db56440a 100644
--- a/model/migrate.go
+++ b/model/migrate.go
@@ -107,6 +107,16 @@ func AutoMigrate(db *gorm.DB) error {
 		return errors.Wrap(err, "failed to create salt")
 	}
 
+	// Fix postgres sequences if they're out of sync (e.g., after data import)
+	if err := fixPostgresSequences(db); err != nil {
+		return errors.Wrap(err, "failed to fix sequences")
+	}
+
+	// Infer piece_type for cars that predate the column
+	if err := inferPieceTypes(db); err != nil {
+		return errors.Wrap(err, "failed to infer piece types")
+	}
+
 	return nil
 }
@@ -189,6 +199,113 @@ func migrateFKConstraints(db *gorm.DB) error {
 	return nil
 }
 
+// sequenceTables lists the tables whose numeric auto-increment primary key
+// `id` is backed by a sequence. Only such tables are included; tables with
+// string primary keys have no sequence to fix.
+var sequenceTables = []string{
+	"preparations",
+	"storages",
+	"output_attachments",
+	"source_attachments",
+	"jobs",
+	"files",
+	"file_ranges",
+	"directories",
+	"cars",
+	"car_blocks",
+	"deals",
+	"schedules",
+}
+
+// fixPostgresSequences detects and fixes out-of-sync sequences.
+// This can happen when data is imported with explicit IDs (e.g., from MySQL).
+// PostgreSQL sequences don't auto-update on INSERT with explicit ID values.
+func fixPostgresSequences(db *gorm.DB) error {
+	if db.Dialector.Name() != "postgres" {
+		return nil
+	}
+
+	for _, table := range sequenceTables {
+		var maxID, lastValue int64
+
+		// get max id from table
+		err := db.Raw(`SELECT COALESCE(MAX(id), 0) FROM ` + table).Scan(&maxID).Error
+		if err != nil {
+			// table might not exist yet
+			logger.Debugw("skipping sequence check", "table", table, "error", err)
+			continue
+		}
+
+		// get sequence name and current value
+		seqName := table + "_id_seq"
+		err = db.Raw(`SELECT last_value FROM ` + seqName).Scan(&lastValue).Error
+		if err != nil {
+			logger.Debugw("skipping sequence check", "sequence", seqName, "error", err)
+			continue
+		}
+
+		// if max(id) >= last_value, the sequence is stale and nextval would collide
+		if maxID >= lastValue {
+			logger.Infow("fixing stale sequence", "table", table, "maxID", maxID, "lastValue", lastValue)
+			err = db.Exec(`SELECT setval(?, ?, true)`, seqName, maxID).Error
+			if err != nil {
+				return errors.Wrapf(err, "failed to fix sequence %s", seqName)
+			}
+		}
+	}
+
+	return nil
+}
+
+// inferPieceTypes sets piece_type for cars that predate the column.
+// A piece is "data" if any of its blocks reference files (contain file content).
+// A piece is "dag" if none of its blocks reference files (directory metadata only).
+// This is idempotent - only updates rows where piece_type is NULL or empty.
+func inferPieceTypes(db *gorm.DB) error {
+	dialect := db.Dialector.Name()
+
+	// check if any cars need updating
+	var count int64
+	err := db.Raw(`SELECT COUNT(*) FROM cars WHERE piece_type IS NULL OR piece_type = ''`).Scan(&count).Error
+	if err != nil {
+		// table might not exist or column missing
+		logger.Debugw("skipping piece type inference", "error", err)
+		return nil
+	}
+
+	if count == 0 {
+		return nil
+	}
+
+	logger.Infow("inferring piece types for legacy cars", "count", count)
+
+	// dialect-specific UPDATE with subquery
+	var query string
+	if dialect == "sqlite" {
+		query = `
+			UPDATE cars SET piece_type = (
+				CASE WHEN EXISTS (
+					SELECT 1 FROM car_blocks WHERE car_blocks.car_id = cars.id AND car_blocks.file_id IS NOT NULL
+				) THEN 'data' ELSE 'dag' END
+			) WHERE piece_type IS NULL OR piece_type = ''`
+	} else {
+		// postgres/mysql support a correlated subquery in CASE
+		query = `
+			UPDATE cars c SET piece_type = CASE
+				WHEN EXISTS (
+					SELECT 1 FROM car_blocks cb WHERE cb.car_id = c.id AND cb.file_id IS NOT NULL
+				) THEN 'data' ELSE 'dag'
+			END WHERE c.piece_type IS NULL OR c.piece_type = ''`
+	}
+
+	result := db.Exec(query)
+	if result.Error != nil {
+		return errors.Wrap(result.Error, "failed to infer piece types")
+	}
+
+	logger.Infow("inferred piece types", "updated", result.RowsAffected)
+	return nil
+}
+
 // DropAll removes all tables specified in the Tables slice from the database.
 //
 // This function is typically used during development or testing where a clean database
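For reference, the staleness check that `fixPostgresSequences` automates looks like this in psql. A minimal sketch for a single table, assuming GORM's default `<table>_id_seq` sequence naming (which the code above also assumes):

```sql
-- compare the table's max id with the sequence position (here: cars)
SELECT COALESCE(MAX(id), 0) FROM cars;   -- e.g. 1500 after an import with explicit IDs
SELECT last_value FROM cars_id_seq;      -- e.g. 1 on a freshly migrated schema
-- max(id) >= last_value means the sequence is stale; repair it so the
-- next nextval() returns 1501 instead of colliding with existing rows
SELECT setval('cars_id_seq', 1500, true);
```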
diff --git a/scripts/fix-sequences-after-import.sql b/scripts/fix-sequences-after-import.sql
new file mode 100644
index 00000000..794e96d5
--- /dev/null
+++ b/scripts/fix-sequences-after-import.sql
@@ -0,0 +1,17 @@
+-- Reset sequences after importing data with explicit IDs (e.g., from MySQL).
+-- PostgreSQL sequences don't auto-update when inserting with explicit IDs.
+-- setval(seq, max, true) means the next nextval() returns max+1.
+-- The fallback is 1 (not 0) because setval() rejects values below the sequence
+-- minimum; on an empty table IDs then start at 2, which is harmless.
+
+SELECT setval(pg_get_serial_sequence('preparations', 'id'), COALESCE((SELECT MAX(id) FROM preparations), 1), true);
+SELECT setval(pg_get_serial_sequence('storages', 'id'), COALESCE((SELECT MAX(id) FROM storages), 1), true);
+SELECT setval(pg_get_serial_sequence('source_attachments', 'id'), COALESCE((SELECT MAX(id) FROM source_attachments), 1), true);
+SELECT setval(pg_get_serial_sequence('output_attachments', 'id'), COALESCE((SELECT MAX(id) FROM output_attachments), 1), true);
+SELECT setval(pg_get_serial_sequence('jobs', 'id'), COALESCE((SELECT MAX(id) FROM jobs), 1), true);
+SELECT setval(pg_get_serial_sequence('files', 'id'), COALESCE((SELECT MAX(id) FROM files), 1), true);
+SELECT setval(pg_get_serial_sequence('file_ranges', 'id'), COALESCE((SELECT MAX(id) FROM file_ranges), 1), true);
+SELECT setval(pg_get_serial_sequence('directories', 'id'), COALESCE((SELECT MAX(id) FROM directories), 1), true);
+SELECT setval(pg_get_serial_sequence('cars', 'id'), COALESCE((SELECT MAX(id) FROM cars), 1), true);
+SELECT setval(pg_get_serial_sequence('car_blocks', 'id'), COALESCE((SELECT MAX(id) FROM car_blocks), 1), true);
+SELECT setval(pg_get_serial_sequence('deals', 'id'), COALESCE((SELECT MAX(id) FROM deals), 1), true);
+SELECT setval(pg_get_serial_sequence('schedules', 'id'), COALESCE((SELECT MAX(id) FROM schedules), 1), true);
+-- workers, globals, wallets have string PKs (no sequence)
diff --git a/scripts/infer_piece_type.sql b/scripts/infer_piece_type.sql
new file mode 100644
index 00000000..da8840d3
--- /dev/null
+++ b/scripts/infer_piece_type.sql
@@ -0,0 +1,30 @@
+-- Infer piece_type for cars that predate the piece_type column.
+-- A piece is 'data' if any of its blocks reference files (contain file content).
+-- A piece is 'dag' if none of its blocks reference files (contain only directory metadata).
+
+-- Preview what would be updated
+SELECT
+    c.id,
+    c.piece_cid,
+    c.piece_size,
+    c.piece_type AS current_type,
+    CASE
+        WHEN EXISTS (
+            SELECT 1 FROM car_blocks cb
+            WHERE cb.car_id = c.id AND cb.file_id IS NOT NULL
+        ) THEN 'data'
+        ELSE 'dag'
+    END AS inferred_type
+FROM cars c
+WHERE c.piece_type IS NULL OR c.piece_type = '';
+
+-- Uncomment to actually update:
+-- UPDATE cars c
+-- SET piece_type = CASE
+--     WHEN EXISTS (
+--         SELECT 1 FROM car_blocks cb
+--         WHERE cb.car_id = c.id AND cb.file_id IS NOT NULL
+--     ) THEN 'data'
+--     ELSE 'dag'
+-- END
+-- WHERE c.piece_type IS NULL OR c.piece_type = '';
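+
+-- Optional sanity checks after running the update (suggested, read-only):
+-- no unlabeled pieces should remain, and the data/dag split should look
+-- plausible for your dataset.
+SELECT piece_type, COUNT(*) AS pieces FROM cars GROUP BY piece_type;
+SELECT COUNT(*) AS unlabeled FROM cars WHERE piece_type IS NULL OR piece_type = '';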