From 332f9478f3ba560d69ccaa6205d1091cff962c27 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Fri, 16 Jan 2026 23:25:22 +0800 Subject: [PATCH 1/2] feat: add SnapshotSummaryBuilder Implement SnapshotSummaryBuilder class based on Java's SnapshotSummary.Builder to provide functionality for building snapshot summaries for Iceberg tables. --- src/iceberg/manifest/manifest_entry.h | 6 +- src/iceberg/snapshot.cc | 300 +++++++++++++ src/iceberg/snapshot.h | 112 +++++ src/iceberg/test/CMakeLists.txt | 1 + .../test/snapshot_summary_builder_test.cc | 415 ++++++++++++++++++ 5 files changed, 833 insertions(+), 1 deletion(-) create mode 100644 src/iceberg/test/snapshot_summary_builder_test.cc diff --git a/src/iceberg/manifest/manifest_entry.h b/src/iceberg/manifest/manifest_entry.h index 843bf0aef..c1f81d0ae 100644 --- a/src/iceberg/manifest/manifest_entry.h +++ b/src/iceberg/manifest/manifest_entry.h @@ -30,7 +30,6 @@ #include "iceberg/file_format.h" #include "iceberg/iceberg_export.h" -#include "iceberg/partition_spec.h" #include "iceberg/result.h" #include "iceberg/row/partition_values.h" #include "iceberg/schema_field.h" @@ -294,6 +293,11 @@ struct ICEBERG_EXPORT DataFile { /// \brief Get the schema of the data file with the given partition type. static std::shared_ptr Type(std::shared_ptr partition_type); + + /// \brief Check if this data file is a deletion vector. + bool IsDeletionVector() const { + return content == Content::kPositionDeletes && file_format == FileFormatType::kPuffin; + } }; /// \brief A manifest is an immutable Avro file that lists data files or delete files, diff --git a/src/iceberg/snapshot.cc b/src/iceberg/snapshot.cc index e4edd7b64..b2f8e80a7 100644 --- a/src/iceberg/snapshot.cc +++ b/src/iceberg/snapshot.cc @@ -20,8 +20,11 @@ #include "iceberg/snapshot.h" #include +#include +#include #include "iceberg/file_io.h" +#include "iceberg/manifest/manifest_entry.h" #include "iceberg/manifest/manifest_list.h" #include "iceberg/manifest/manifest_reader.h" #include "iceberg/util/macros.h" @@ -29,6 +32,24 @@ namespace iceberg { +namespace { + +/// \brief Helper function to conditionally add a property to the summary +template +void SetIf(bool condition, std::unordered_map& builder, + const std::string& property, T value) { + if (condition) { + if constexpr (std::is_same_v || std::is_same_v || + std::is_convertible_v) { + builder[property] = value; + } else { + builder[property] = std::to_string(value); + } + } +} + +} // namespace + bool SnapshotRef::Branch::Equals(const SnapshotRef::Branch& other) const { return min_snapshots_to_keep == other.min_snapshots_to_keep && max_snapshot_age_ms == other.max_snapshot_age_ms && @@ -255,4 +276,283 @@ Result> SnapshotCache::DeleteManifests( return std::span(cache.first.data() + delete_start, delete_count); } +// SnapshotSummaryBuilder::UpdateMetrics implementation + +void SnapshotSummaryBuilder::UpdateMetrics::Clear() { + added_size_ = 0; + removed_size_ = 0; + added_files_ = 0; + removed_files_ = 0; + added_eq_delete_files_ = 0; + removed_eq_delete_files_ = 0; + added_pos_delete_files_ = 0; + removed_pos_delete_files_ = 0; + added_delete_files_ = 0; + removed_delete_files_ = 0; + added_dvs_ = 0; + removed_dvs_ = 0; + added_records_ = 0; + deleted_records_ = 0; + added_pos_deletes_ = 0; + removed_pos_deletes_ = 0; + added_eq_deletes_ = 0; + removed_eq_deletes_ = 0; + trust_size_and_delete_counts_ = true; +} + +void SnapshotSummaryBuilder::UpdateMetrics::AddTo( + std::unordered_map& builder) const { + SetIf(added_files_ > 0, builder, SnapshotSummaryFields::kAddedDataFiles, added_files_); + SetIf(removed_files_ > 0, builder, SnapshotSummaryFields::kDeletedDataFiles, + removed_files_); + SetIf(added_eq_delete_files_ > 0, builder, SnapshotSummaryFields::kAddedEqDeleteFiles, + added_eq_delete_files_); + SetIf(removed_eq_delete_files_ > 0, builder, + SnapshotSummaryFields::kRemovedEqDeleteFiles, removed_eq_delete_files_); + SetIf(added_pos_delete_files_ > 0, builder, SnapshotSummaryFields::kAddedPosDeleteFiles, + added_pos_delete_files_); + SetIf(removed_pos_delete_files_ > 0, builder, + SnapshotSummaryFields::kRemovedPosDeleteFiles, removed_pos_delete_files_); + SetIf(added_delete_files_ > 0, builder, SnapshotSummaryFields::kAddedDeleteFiles, + added_delete_files_); + SetIf(removed_delete_files_ > 0, builder, SnapshotSummaryFields::kRemovedDeleteFiles, + removed_delete_files_); + SetIf(added_dvs_ > 0, builder, SnapshotSummaryFields::kAddedDVs, added_dvs_); + SetIf(removed_dvs_ > 0, builder, SnapshotSummaryFields::kRemovedDVs, removed_dvs_); + SetIf(added_records_ > 0, builder, SnapshotSummaryFields::kAddedRecords, + added_records_); + SetIf(deleted_records_ > 0, builder, SnapshotSummaryFields::kDeletedRecords, + deleted_records_); + + if (trust_size_and_delete_counts_) { + SetIf(added_size_ > 0, builder, SnapshotSummaryFields::kAddedFileSize, added_size_); + SetIf(removed_size_ > 0, builder, SnapshotSummaryFields::kRemovedFileSize, + removed_size_); + SetIf(added_pos_deletes_ > 0, builder, SnapshotSummaryFields::kAddedPosDeletes, + added_pos_deletes_); + SetIf(removed_pos_deletes_ > 0, builder, SnapshotSummaryFields::kRemovedPosDeletes, + removed_pos_deletes_); + SetIf(added_eq_deletes_ > 0, builder, SnapshotSummaryFields::kAddedEqDeletes, + added_eq_deletes_); + SetIf(removed_eq_deletes_ > 0, builder, SnapshotSummaryFields::kRemovedEqDeletes, + removed_eq_deletes_); + } +} + +void SnapshotSummaryBuilder::UpdateMetrics::AddedFile(const DataFile& file) { + added_size_ += file.file_size_in_bytes; + + switch (file.content) { + case DataFile::Content::kData: + added_files_ += 1; + added_records_ += file.record_count; + break; + case DataFile::Content::kPositionDeletes: + if (file.IsDeletionVector()) { + added_dvs_ += 1; + } else { + added_pos_delete_files_ += 1; + } + added_delete_files_ += 1; + added_pos_deletes_ += file.record_count; + break; + case DataFile::Content::kEqualityDeletes: + added_delete_files_ += 1; + added_eq_delete_files_ += 1; + added_eq_deletes_ += file.record_count; + break; + default: + std::unreachable(); + } +} + +void SnapshotSummaryBuilder::UpdateMetrics::RemovedFile(const DataFile& file) { + removed_size_ += file.file_size_in_bytes; + + switch (file.content) { + case DataFile::Content::kData: + removed_files_ += 1; + deleted_records_ += file.record_count; + break; + case DataFile::Content::kPositionDeletes: + if (file.IsDeletionVector()) { + removed_dvs_ += 1; + } else { + removed_pos_delete_files_ += 1; + } + removed_delete_files_ += 1; + removed_pos_deletes_ += file.record_count; + break; + case DataFile::Content::kEqualityDeletes: + removed_delete_files_ += 1; + removed_eq_delete_files_ += 1; + removed_eq_deletes_ += file.record_count; + break; + default: + std::unreachable(); + } +} + +void SnapshotSummaryBuilder::UpdateMetrics::AddedManifest(const ManifestFile& manifest) { + switch (manifest.content) { + case ManifestContent::kData: + added_files_ += manifest.added_files_count.value_or(0); + added_records_ += manifest.added_rows_count.value_or(0); + removed_files_ += manifest.deleted_files_count.value_or(0); + deleted_records_ += manifest.deleted_rows_count.value_or(0); + break; + case ManifestContent::kDeletes: + added_delete_files_ += manifest.added_files_count.value_or(0); + removed_delete_files_ += manifest.deleted_files_count.value_or(0); + trust_size_and_delete_counts_ = false; + break; + default: + std::unreachable(); + } +} + +void SnapshotSummaryBuilder::UpdateMetrics::Merge(const UpdateMetrics& other) { + added_files_ += other.added_files_; + removed_files_ += other.removed_files_; + added_eq_delete_files_ += other.added_eq_delete_files_; + removed_eq_delete_files_ += other.removed_eq_delete_files_; + added_pos_delete_files_ += other.added_pos_delete_files_; + removed_pos_delete_files_ += other.removed_pos_delete_files_; + added_dvs_ += other.added_dvs_; + removed_dvs_ += other.removed_dvs_; + added_delete_files_ += other.added_delete_files_; + removed_delete_files_ += other.removed_delete_files_; + added_size_ += other.added_size_; + removed_size_ += other.removed_size_; + added_records_ += other.added_records_; + deleted_records_ += other.deleted_records_; + added_pos_deletes_ += other.added_pos_deletes_; + removed_pos_deletes_ += other.removed_pos_deletes_; + added_eq_deletes_ += other.added_eq_deletes_; + removed_eq_deletes_ += other.removed_eq_deletes_; + trust_size_and_delete_counts_ = + trust_size_and_delete_counts_ && other.trust_size_and_delete_counts_; +} + +// SnapshotSummaryBuilder implementation + +void SnapshotSummaryBuilder::Clear() { + partition_metrics_.clear(); + metrics_.Clear(); + deleted_duplicate_files_ = 0; + trust_partition_metrics_ = true; +} + +void SnapshotSummaryBuilder::SetPartitionSummaryLimit(int32_t max) { + max_changed_partitions_for_summaries_ = max; +} + +void SnapshotSummaryBuilder::IncrementDuplicateDeletes(int32_t increment) { + deleted_duplicate_files_ += increment; +} + +Status SnapshotSummaryBuilder::AddedFile(const PartitionSpec& spec, + const DataFile& file) { + metrics_.AddedFile(file); + ICEBERG_RETURN_UNEXPECTED(UpdatePartitions(spec, file, true)); + return {}; +} + +Status SnapshotSummaryBuilder::DeletedFile(const PartitionSpec& spec, + const DataFile& file) { + metrics_.RemovedFile(file); + ICEBERG_RETURN_UNEXPECTED(UpdatePartitions(spec, file, false)); + return {}; +} + +void SnapshotSummaryBuilder::AddedManifest(const ManifestFile& manifest) { + trust_partition_metrics_ = false; + partition_metrics_.clear(); + metrics_.AddedManifest(manifest); +} + +void SnapshotSummaryBuilder::Set(const std::string& property, const std::string& value) { + properties_[property] = value; +} + +void SnapshotSummaryBuilder::Merge(const SnapshotSummaryBuilder& other) { + for (const auto& [key, value] : other.properties_) { + properties_[key] = value; + } + metrics_.Merge(other.metrics_); + + trust_partition_metrics_ = trust_partition_metrics_ && other.trust_partition_metrics_; + if (trust_partition_metrics_) { + for (const auto& [key, value] : other.partition_metrics_) { + partition_metrics_[key].Merge(value); + } + } else { + partition_metrics_.clear(); + } + + deleted_duplicate_files_ += other.deleted_duplicate_files_; +} + +std::unordered_map SnapshotSummaryBuilder::Build() const { + std::unordered_map builder; + + // Copy custom summary properties + builder.insert(properties_.begin(), properties_.end()); + + metrics_.AddTo(builder); + + SetIf(deleted_duplicate_files_ > 0, builder, + SnapshotSummaryFields::kDeletedDuplicatedFiles, deleted_duplicate_files_); + + SetIf(trust_partition_metrics_, builder, + SnapshotSummaryFields::kChangedPartitionCountProp, partition_metrics_.size()); + + // Add partition summaries if enabled + if (trust_partition_metrics_ && static_cast(partition_metrics_.size()) <= + max_changed_partitions_for_summaries_) { + SetIf(!partition_metrics_.empty(), builder, + SnapshotSummaryFields::kPartitionSummaryProp, "true"); + for (const auto& [key, metrics] : partition_metrics_) { + if (!key.empty()) { + builder[SnapshotSummaryFields::kChangedPartitionPrefix + key] = + PartitionSummary(metrics); + } + } + } + + return builder; +} + +Status SnapshotSummaryBuilder::UpdatePartitions(const PartitionSpec& spec, + const DataFile& file, bool is_addition) { + if (trust_partition_metrics_) { + ICEBERG_ASSIGN_OR_RAISE(std::string partition_path, + spec.PartitionPath(file.partition)); + auto& part_metrics = partition_metrics_[partition_path]; + if (is_addition) { + part_metrics.AddedFile(file); + } else { + part_metrics.RemovedFile(file); + } + } + return {}; +} + +std::string SnapshotSummaryBuilder::PartitionSummary(const UpdateMetrics& metrics) const { + std::unordered_map part_builder; + metrics.AddTo(part_builder); + + // Format as comma-separated key=value pairs + std::ostringstream oss; + bool first = true; + for (const auto& [key, value] : part_builder) { + if (!first) { + oss << ","; + } + oss << key << "=" << value; + first = false; + } + return oss.str(); +} + } // namespace iceberg diff --git a/src/iceberg/snapshot.h b/src/iceberg/snapshot.h index 65bf2f83a..709cdc40c 100644 --- a/src/iceberg/snapshot.h +++ b/src/iceberg/snapshot.h @@ -226,6 +226,10 @@ struct ICEBERG_EXPORT SnapshotSummaryFields { inline static const std::string kDeletedDuplicatedFiles = "deleted-duplicate-files"; /// \brief Number of partitions with files added or removed in the snapshot inline static const std::string kChangedPartitionCountProp = "changed-partition-count"; + /// \brief Partition summaries prefix + inline static const std::string kChangedPartitionPrefix = "partitions."; + /// \brief Whether partition summaries are included + inline static const std::string kPartitionSummaryProp = "partition-summaries-included"; /// Other Fields, see https://iceberg.apache.org/spec/#other-fields @@ -241,6 +245,114 @@ struct ICEBERG_EXPORT SnapshotSummaryFields { inline static const std::string kEngineVersion = "engine-version"; }; +/// \brief Helper class for building snapshot summaries. +/// +/// This class provides methods to track changes to data and delete files, +/// and produces a map of summary properties for snapshot metadata. +class ICEBERG_EXPORT SnapshotSummaryBuilder { + private: + /// \brief Metrics tracking for added and removed files + class UpdateMetrics { + public: + void Clear(); + void AddTo(std::unordered_map& builder) const; + void AddedFile(const DataFile& file); + void RemovedFile(const DataFile& file); + void AddedManifest(const ManifestFile& manifest); + void Merge(const UpdateMetrics& other); + + private: + int64_t added_size_{0}; + int64_t removed_size_{0}; + int32_t added_files_{0}; + int32_t removed_files_{0}; + int32_t added_eq_delete_files_{0}; + int32_t removed_eq_delete_files_{0}; + int32_t added_pos_delete_files_{0}; + int32_t removed_pos_delete_files_{0}; + int32_t added_dvs_{0}; + int32_t removed_dvs_{0}; + int32_t added_delete_files_{0}; + int32_t removed_delete_files_{0}; + int64_t added_records_{0}; + int64_t deleted_records_{0}; + int64_t added_pos_deletes_{0}; + int64_t removed_pos_deletes_{0}; + int64_t added_eq_deletes_{0}; + int64_t removed_eq_deletes_{0}; + bool trust_size_and_delete_counts_{true}; + }; + + public: + SnapshotSummaryBuilder() = default; + + /// \brief Clear all tracked metrics and properties + void Clear(); + + /// \brief Set the maximum number of changed partitions before partition summaries will + /// be excluded. + /// + /// If the number of changed partitions is over this max, summaries will not be + /// included. If the number of changed partitions is <= this limit, then partition-level + /// summaries will be included in the summary if they are available, and + /// "partition-summaries-included" will be set to "true". + /// + /// \param max Maximum number of changed partitions + void SetPartitionSummaryLimit(int32_t max); + + /// \brief Increment the count of duplicate files deleted by a specific amount + /// + /// \param increment Amount to increment by. Defaults to 1. + void IncrementDuplicateDeletes(int32_t increment = 1); + + /// \brief Track a data file being added to the snapshot + /// + /// \param spec The partition spec + /// \param file The data file being added + /// \return Status indicating success or error + Status AddedFile(const PartitionSpec& spec, const DataFile& file); + + /// \brief Track a data file being deleted from the snapshot + /// + /// \param spec The partition spec + /// \param file The data file being deleted + /// \return Status indicating success or error + Status DeletedFile(const PartitionSpec& spec, const DataFile& file); + + /// \brief Track a manifest being added + /// + /// \param manifest The manifest file being added + void AddedManifest(const ManifestFile& manifest); + + /// \brief Set a custom summary property + /// + /// \param property Property name + /// \param value Property value + void Set(const std::string& property, const std::string& value); + + /// \brief Merge another builder's metrics into this one + /// + /// \param other The builder to merge from + void Merge(const SnapshotSummaryBuilder& other); + + /// \brief Build the final summary map + /// + /// \return Map of summary properties + std::unordered_map Build() const; + + private: + Status UpdatePartitions(const PartitionSpec& spec, const DataFile& file, + bool is_addition); + std::string PartitionSummary(const UpdateMetrics& metrics) const; + + std::unordered_map properties_; + std::unordered_map partition_metrics_; + UpdateMetrics metrics_; + int32_t max_changed_partitions_for_summaries_{0}; + int64_t deleted_duplicate_files_{0}; + bool trust_partition_metrics_{true}; +}; + /// \brief Data operation that produce snapshots. /// /// A snapshot can return the operation that created the snapshot to help other components diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 364c690e6..d243a48bf 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -76,6 +76,7 @@ add_iceberg_test(table_test SOURCES location_provider_test.cc metrics_config_test.cc + snapshot_summary_builder_test.cc snapshot_test.cc snapshot_util_test.cc table_metadata_builder_test.cc diff --git a/src/iceberg/test/snapshot_summary_builder_test.cc b/src/iceberg/test/snapshot_summary_builder_test.cc new file mode 100644 index 000000000..f444a546c --- /dev/null +++ b/src/iceberg/test/snapshot_summary_builder_test.cc @@ -0,0 +1,415 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include "iceberg/manifest/manifest_entry.h" +#include "iceberg/manifest/manifest_list.h" +#include "iceberg/partition_spec.h" +#include "iceberg/row/partition_values.h" +#include "iceberg/schema.h" +#include "iceberg/schema_field.h" +#include "iceberg/snapshot.h" +#include "iceberg/test/matchers.h" +#include "iceberg/type.h" + +namespace iceberg { + +class SnapshotSummaryBuilderTest : public ::testing::Test { + protected: + void SetUp() override { + // Create a simple schema + schema_ = std::make_shared(std::vector{ + SchemaField::MakeRequired(1, "id", int32()), + SchemaField::MakeRequired(2, "data", string()), + }); + + // Create unpartitioned spec + spec_ = PartitionSpec::Unpartitioned(); + } + + DataFile CreateDataFile(const std::string& path, int64_t size, int64_t record_count) { + DataFile file; + file.content = DataFile::Content::kData; + file.file_path = path; + file.file_format = FileFormatType::kParquet; + file.file_size_in_bytes = size; + file.record_count = record_count; + file.partition = PartitionValues{}; + return file; + } + + DataFile CreatePosDeleteFile(const std::string& path, int64_t size, + int64_t record_count) { + DataFile file; + file.content = DataFile::Content::kPositionDeletes; + file.file_path = path; + file.file_format = FileFormatType::kParquet; + file.file_size_in_bytes = size; + file.record_count = record_count; + file.partition = PartitionValues{}; + return file; + } + + DataFile CreateEqDeleteFile(const std::string& path, int64_t size, + int64_t record_count) { + DataFile file; + file.content = DataFile::Content::kEqualityDeletes; + file.file_path = path; + file.file_format = FileFormatType::kParquet; + file.file_size_in_bytes = size; + file.record_count = record_count; + file.partition = PartitionValues{}; + file.equality_ids = {1}; // Delete on column 1 + return file; + } + + DataFile CreateDeletionVector(const std::string& path, int64_t size, + int64_t record_count) { + DataFile file; + file.content = DataFile::Content::kPositionDeletes; + file.file_path = path; + file.file_format = FileFormatType::kPuffin; // DVs use Puffin format + file.file_size_in_bytes = size; + file.record_count = record_count; + file.partition = PartitionValues{}; + return file; + } + + std::shared_ptr schema_; + std::shared_ptr spec_; +}; + +// Test basic file addition +TEST_F(SnapshotSummaryBuilderTest, AddDataFile) { + SnapshotSummaryBuilder builder; + + auto file = CreateDataFile("/path/to/data.parquet", 1000, 100); + ASSERT_THAT(builder.AddedFile(*spec_, file), IsOk()); + + auto summary = builder.Build(); + + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedDataFiles], "1"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedRecords], "100"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedFileSize], "1000"); + EXPECT_EQ(summary.count(SnapshotSummaryFields::kDeletedDataFiles), 0); + EXPECT_EQ(summary.count(SnapshotSummaryFields::kRemovedFileSize), 0); +} + +// Test basic file deletion +TEST_F(SnapshotSummaryBuilderTest, DeleteDataFile) { + SnapshotSummaryBuilder builder; + + auto file = CreateDataFile("/path/to/data.parquet", 1000, 100); + ASSERT_THAT(builder.DeletedFile(*spec_, file), IsOk()); + + auto summary = builder.Build(); + + EXPECT_EQ(summary[SnapshotSummaryFields::kDeletedDataFiles], "1"); + EXPECT_EQ(summary[SnapshotSummaryFields::kDeletedRecords], "100"); + EXPECT_EQ(summary[SnapshotSummaryFields::kRemovedFileSize], "1000"); + EXPECT_EQ(summary.count(SnapshotSummaryFields::kAddedDataFiles), 0); + EXPECT_EQ(summary.count(SnapshotSummaryFields::kAddedFileSize), 0); +} + +// Test adding and removing files together +TEST_F(SnapshotSummaryBuilderTest, AddAndDeleteFiles) { + SnapshotSummaryBuilder builder; + + auto file_a = CreateDataFile("/path/to/data-a.parquet", 500, 50); + auto file_b = CreateDataFile("/path/to/data-b.parquet", 1000, 100); + + ASSERT_THAT(builder.AddedFile(*spec_, file_a), IsOk()); + ASSERT_THAT(builder.DeletedFile(*spec_, file_b), IsOk()); + + auto summary = builder.Build(); + + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedDataFiles], "1"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedRecords], "50"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedFileSize], "500"); + EXPECT_EQ(summary[SnapshotSummaryFields::kDeletedDataFiles], "1"); + EXPECT_EQ(summary[SnapshotSummaryFields::kDeletedRecords], "100"); + EXPECT_EQ(summary[SnapshotSummaryFields::kRemovedFileSize], "1000"); +} + +// Test position delete files +TEST_F(SnapshotSummaryBuilderTest, AddPositionDeleteFile) { + SnapshotSummaryBuilder builder; + + auto delete_file = CreatePosDeleteFile("/path/to/deletes.parquet", 100, 10); + ASSERT_THAT(builder.AddedFile(*spec_, delete_file), IsOk()); + + auto summary = builder.Build(); + + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedPosDeleteFiles], "1"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedDeleteFiles], "1"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedPosDeletes], "10"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedFileSize], "100"); + EXPECT_EQ(summary.count(SnapshotSummaryFields::kAddedDataFiles), 0); +} + +// Test equality delete files +TEST_F(SnapshotSummaryBuilderTest, AddEqualityDeleteFile) { + SnapshotSummaryBuilder builder; + + auto delete_file = CreateEqDeleteFile("/path/to/eq-deletes.parquet", 100, 10); + ASSERT_THAT(builder.AddedFile(*spec_, delete_file), IsOk()); + + auto summary = builder.Build(); + + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedEqDeleteFiles], "1"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedDeleteFiles], "1"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedEqDeletes], "10"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedFileSize], "100"); + EXPECT_EQ(summary.count(SnapshotSummaryFields::kAddedPosDeleteFiles), 0); +} + +// Test deletion vectors +TEST_F(SnapshotSummaryBuilderTest, AddDeletionVector) { + SnapshotSummaryBuilder builder; + + auto dv = CreateDeletionVector("/path/to/dv.puffin", 50, 5); + ASSERT_THAT(builder.AddedFile(*spec_, dv), IsOk()); + + auto summary = builder.Build(); + + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedDVs], "1"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedDeleteFiles], "1"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedPosDeletes], "5"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedFileSize], "50"); + EXPECT_EQ(summary.count(SnapshotSummaryFields::kAddedPosDeleteFiles), 0); +} + +// Test removing deletion vectors +TEST_F(SnapshotSummaryBuilderTest, RemoveDeletionVector) { + SnapshotSummaryBuilder builder; + + auto dv = CreateDeletionVector("/path/to/dv.puffin", 50, 5); + ASSERT_THAT(builder.DeletedFile(*spec_, dv), IsOk()); + + auto summary = builder.Build(); + + EXPECT_EQ(summary[SnapshotSummaryFields::kRemovedDVs], "1"); + EXPECT_EQ(summary[SnapshotSummaryFields::kRemovedDeleteFiles], "1"); + EXPECT_EQ(summary[SnapshotSummaryFields::kRemovedPosDeletes], "5"); + EXPECT_EQ(summary[SnapshotSummaryFields::kRemovedFileSize], "50"); +} + +// Test duplicate delete tracking +TEST_F(SnapshotSummaryBuilderTest, DuplicateDeletes) { + SnapshotSummaryBuilder builder; + + builder.IncrementDuplicateDeletes(3); + + auto summary = builder.Build(); + + EXPECT_EQ(summary[SnapshotSummaryFields::kDeletedDuplicatedFiles], "3"); +} + +// Test custom properties +TEST_F(SnapshotSummaryBuilderTest, CustomProperties) { + SnapshotSummaryBuilder builder; + + builder.Set("custom-key", "custom-value"); + builder.Set("another-key", "another-value"); + + auto summary = builder.Build(); + + EXPECT_EQ(summary["custom-key"], "custom-value"); + EXPECT_EQ(summary["another-key"], "another-value"); +} + +// Test merging builders +TEST_F(SnapshotSummaryBuilderTest, MergeBuilders) { + SnapshotSummaryBuilder builder1; + SnapshotSummaryBuilder builder2; + + auto file1 = CreateDataFile("/path/to/data1.parquet", 500, 50); + auto file2 = CreateDataFile("/path/to/data2.parquet", 1000, 100); + + ASSERT_THAT(builder1.AddedFile(*spec_, file1), IsOk()); + ASSERT_THAT(builder2.AddedFile(*spec_, file2), IsOk()); + + builder1.Set("custom-prop", "value1"); + builder2.Set("custom-prop", "value2"); // Should override + + builder1.Merge(builder2); + + auto summary = builder1.Build(); + + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedDataFiles], "2"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedRecords], "150"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedFileSize], "1500"); + EXPECT_EQ(summary["custom-prop"], "value2"); // Should be overridden +} + +// Test manifest addition +TEST_F(SnapshotSummaryBuilderTest, AddManifest) { + SnapshotSummaryBuilder builder; + + // First add a file directly + auto file = CreateDataFile("/path/to/data.parquet", 1000, 100); + ASSERT_THAT(builder.AddedFile(*spec_, file), IsOk()); + + // Create a manifest + ManifestFile manifest; + manifest.content = ManifestContent::kData; + manifest.added_files_count = 5; + manifest.added_rows_count = 500; + manifest.deleted_files_count = 2; + manifest.deleted_rows_count = 200; + + // Add manifest (this should clear partition metrics) + builder.AddedManifest(manifest); + + auto summary = builder.Build(); + + // Should have totals from both file and manifest + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedDataFiles], "6"); // 1 + 5 + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedRecords], "600"); // 100 + 500 + EXPECT_EQ(summary[SnapshotSummaryFields::kDeletedDataFiles], "2"); + EXPECT_EQ(summary[SnapshotSummaryFields::kDeletedRecords], "200"); + + // Should not have partition count after adding manifest + EXPECT_EQ(summary.count(SnapshotSummaryFields::kChangedPartitionCountProp), 0); +} + +// Test data and delete files together +TEST_F(SnapshotSummaryBuilderTest, MixedDataAndDeleteFiles) { + SnapshotSummaryBuilder builder; + + auto data_file = CreateDataFile("/path/to/data.parquet", 1000, 100); + auto pos_delete = CreatePosDeleteFile("/path/to/pos-delete.parquet", 100, 10); + auto eq_delete = CreateEqDeleteFile("/path/to/eq-delete.parquet", 200, 20); + + ASSERT_THAT(builder.AddedFile(*spec_, data_file), IsOk()); + ASSERT_THAT(builder.AddedFile(*spec_, pos_delete), IsOk()); + ASSERT_THAT(builder.AddedFile(*spec_, eq_delete), IsOk()); + + auto summary = builder.Build(); + + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedDataFiles], "1"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedRecords], "100"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedDeleteFiles], "2"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedPosDeleteFiles], "1"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedEqDeleteFiles], "1"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedPosDeletes], "10"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedEqDeletes], "20"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedFileSize], "1300"); +} + +// Test multiple files of the same type +TEST_F(SnapshotSummaryBuilderTest, MultipleFiles) { + SnapshotSummaryBuilder builder; + + for (int i = 0; i < 10; ++i) { + auto file = + CreateDataFile("/path/to/data-" + std::to_string(i) + ".parquet", 100, 10); + ASSERT_THAT(builder.AddedFile(*spec_, file), IsOk()); + } + + auto summary = builder.Build(); + + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedDataFiles], "10"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedRecords], "100"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedFileSize], "1000"); +} + +// Test removing multiple delete files +TEST_F(SnapshotSummaryBuilderTest, RemoveMultipleDeleteFiles) { + SnapshotSummaryBuilder builder; + + auto pos_delete1 = CreatePosDeleteFile("/path/to/pos1.parquet", 100, 10); + auto pos_delete2 = CreatePosDeleteFile("/path/to/pos2.parquet", 150, 15); + auto eq_delete = CreateEqDeleteFile("/path/to/eq.parquet", 200, 20); + + ASSERT_THAT(builder.DeletedFile(*spec_, pos_delete1), IsOk()); + ASSERT_THAT(builder.DeletedFile(*spec_, pos_delete2), IsOk()); + ASSERT_THAT(builder.DeletedFile(*spec_, eq_delete), IsOk()); + + auto summary = builder.Build(); + + EXPECT_EQ(summary[SnapshotSummaryFields::kRemovedDeleteFiles], "3"); + EXPECT_EQ(summary[SnapshotSummaryFields::kRemovedPosDeleteFiles], "2"); + EXPECT_EQ(summary[SnapshotSummaryFields::kRemovedEqDeleteFiles], "1"); + EXPECT_EQ(summary[SnapshotSummaryFields::kRemovedPosDeletes], "25"); + EXPECT_EQ(summary[SnapshotSummaryFields::kRemovedEqDeletes], "20"); + EXPECT_EQ(summary[SnapshotSummaryFields::kRemovedFileSize], "450"); +} + +// Test partition summary limit +TEST_F(SnapshotSummaryBuilderTest, PartitionSummaryLimit) { + SnapshotSummaryBuilder builder; + builder.SetPartitionSummaryLimit(2); + + auto file1 = CreateDataFile("/path/to/data1.parquet", 100, 10); + auto file2 = CreateDataFile("/path/to/data2.parquet", 200, 20); + auto file3 = CreateDataFile("/path/to/data3.parquet", 300, 30); + + ASSERT_THAT(builder.AddedFile(*spec_, file1), IsOk()); + ASSERT_THAT(builder.AddedFile(*spec_, file2), IsOk()); + ASSERT_THAT(builder.AddedFile(*spec_, file3), IsOk()); + + auto summary = builder.Build(); + + // With unpartitioned spec, all files go to the same partition ("") + // So we should have changed partition count + EXPECT_EQ(summary[SnapshotSummaryFields::kChangedPartitionCountProp], "1"); + + // Partition summaries should be included because 1 <= 2 + EXPECT_EQ(summary[SnapshotSummaryFields::kPartitionSummaryProp], "true"); +} + +// Test merge with duplicates +TEST_F(SnapshotSummaryBuilderTest, MergeWithDuplicates) { + SnapshotSummaryBuilder builder1; + SnapshotSummaryBuilder builder2; + + builder1.IncrementDuplicateDeletes(5); + builder2.IncrementDuplicateDeletes(3); + + auto file1 = CreateDataFile("/path/to/data1.parquet", 500, 50); + auto file2 = CreateDataFile("/path/to/data2.parquet", 1000, 100); + + ASSERT_THAT(builder1.AddedFile(*spec_, file1), IsOk()); + ASSERT_THAT(builder2.AddedFile(*spec_, file2), IsOk()); + + builder1.Merge(builder2); + + auto summary = builder1.Build(); + + EXPECT_EQ(summary[SnapshotSummaryFields::kDeletedDuplicatedFiles], "8"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedDataFiles], "2"); + EXPECT_EQ(summary[SnapshotSummaryFields::kAddedRecords], "150"); +} + +// Test only custom properties +TEST_F(SnapshotSummaryBuilderTest, OnlyCustomProperties) { + SnapshotSummaryBuilder builder; + + builder.Set("operation", "append"); + builder.Set("engine-name", "iceberg-cpp"); + + auto summary = builder.Build(); + + EXPECT_EQ(summary["operation"], "append"); + EXPECT_EQ(summary["engine-name"], "iceberg-cpp"); +} + +} // namespace iceberg From 7f72a4fd700bc664b117981440f4af0c6b878a77 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Sat, 17 Jan 2026 10:20:51 +0800 Subject: [PATCH 2/2] fix: cpp-linter warning comparison between signed and unsigned --- src/iceberg/snapshot.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/iceberg/snapshot.cc b/src/iceberg/snapshot.cc index b2f8e80a7..cdbdd4529 100644 --- a/src/iceberg/snapshot.cc +++ b/src/iceberg/snapshot.cc @@ -508,8 +508,9 @@ std::unordered_map SnapshotSummaryBuilder::Build() con SnapshotSummaryFields::kChangedPartitionCountProp, partition_metrics_.size()); // Add partition summaries if enabled - if (trust_partition_metrics_ && static_cast(partition_metrics_.size()) <= - max_changed_partitions_for_summaries_) { + if (trust_partition_metrics_ && max_changed_partitions_for_summaries_ >= 0 && + partition_metrics_.size() <= + static_cast(max_changed_partitions_for_summaries_)) { SetIf(!partition_metrics_.empty(), builder, SnapshotSummaryFields::kPartitionSummaryProp, "true"); for (const auto& [key, metrics] : partition_metrics_) {