diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml
index e6c579946cd..9a07cd281a2 100644
--- a/.github/workflows/fuzz.yml
+++ b/.github/workflows/fuzz.yml
@@ -13,102 +13,17 @@ on:
   workflow_dispatch: { }
 
 jobs:
+  # ============================================================================
+  # IO Fuzzer
+  # ============================================================================
   io_fuzz:
     name: "IO Fuzz"
-    timeout-minutes: 230 # almost 4 hours
-    runs-on:
-      - runs-on=${{ github.run_id }}
-      - family=m8g.large
-      - image=ubuntu24-full-arm64
-      - disk=large
-      - extras=s3-cache
-      - tag=io-fuzz
-    outputs:
-      crashes_found: ${{ steps.check.outputs.crashes_found }}
-      first_crash_name: ${{ steps.check.outputs.first_crash_name }}
-      artifact_url: ${{ steps.upload_artifacts.outputs.artifact-url }}
-    steps:
-      - uses: runs-on/action@v2
-        with:
-          sccache: s3
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/setup-rust
-        with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
-          toolchain: nightly
-      - name: Install llvm
-        uses: aminya/setup-cpp@v1
-        with:
-          compiler: llvm
-      - name: Install cargo fuzz
-        run: cargo install --locked cargo-fuzz
-      - name: Restore corpus
-        shell: bash
-        run: |
-          aws s3api head-object --bucket vortex-fuzz-corpus --key "io_corpus.tar.zst" --query ETag --output text > current_etag
-          aws s3 cp s3://vortex-fuzz-corpus/io_corpus.tar.zst .
-          tar -xf io_corpus.tar.zst
-        env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
-          AWS_REGION: "us-east-1"
-          AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com"
-      - name: Run fuzzing target
-        id: fuzz
-        run: |
-          RUSTFLAGS="--cfg vortex_nightly" RUST_BACKTRACE=1 cargo +nightly fuzz run --release --debug-assertions file_io -- -max_total_time=7200 -rss_limit_mb=0 2>&1 | tee fuzz_output.log
-        continue-on-error: true
-      - name: Check for crashes
-        id: check
-        run: |
-          if [ -d "fuzz/artifacts" ] && [ "$(ls -A fuzz/artifacts 2>/dev/null)" ]; then
-            echo "crashes_found=true" >> $GITHUB_OUTPUT
-
-            # Get the first crash file only
-            FIRST_CRASH=$(find fuzz/artifacts -type f \( -name "crash-*" -o -name "leak-*" -o -name "timeout-*" -o -name "oom-*" \) | head -1)
-
-            if [ -n "$FIRST_CRASH" ]; then
-              echo "first_crash=$FIRST_CRASH" >> $GITHUB_OUTPUT
-              echo "first_crash_name=$(basename $FIRST_CRASH)" >> $GITHUB_OUTPUT
-
-              # Count all crashes for reporting
-              CRASH_COUNT=$(find fuzz/artifacts -type f \( -name "crash-*" -o -name "leak-*" -o -name "timeout-*" -o -name "oom-*" \) | wc -l)
-              echo "crash_count=$CRASH_COUNT" >> $GITHUB_OUTPUT
-              echo "Found $CRASH_COUNT crash(es), will process first: $(basename $FIRST_CRASH)"
-            fi
-          else
-            echo "crashes_found=false" >> $GITHUB_OUTPUT
-            echo "crash_count=0" >> $GITHUB_OUTPUT
-            echo "No crashes found"
-          fi
-      - name: Archive crash artifacts
-        id: upload_artifacts
-        if: steps.check.outputs.crashes_found == 'true'
-        uses: actions/upload-artifact@v5
-        with:
-          name: io-fuzzing-crash-artifacts
-          path: fuzz/artifacts
-          retention-days: 30
-      - name: Archive fuzzer output log
-        if: steps.check.outputs.crashes_found == 'true'
-        uses: actions/upload-artifact@v5
-        with:
-          name: io-fuzzing-logs
-          path: fuzz_output.log
-          retention-days: 30
-      - name: Persist corpus
-        shell: bash
-        run: |
-          tar -acf io_corpus.tar.zst fuzz/corpus/file_io
-          aws s3api put-object --bucket vortex-fuzz-corpus --key "io_corpus.tar.zst" --body io_corpus.tar.zst --checksum-algorithm CRC32 --if-match "$(cat current_etag)"
-        env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
-          AWS_REGION: "us-east-1"
-          AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com"
-      - name: Fail job if fuzz run found a bug
-        if: steps.check.outputs.crashes_found == 'true'
-        run: exit 1
+    uses: ./.github/workflows/run-fuzzer.yml
+    with:
+      fuzz_target: file_io
+    secrets:
+      R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
+      R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
 
   report-io-fuzz-failures:
     name: "Report IO Fuzz Failures"
@@ -124,8 +39,8 @@ jobs:
       fuzz_target: file_io
      crash_file: ${{ needs.io_fuzz.outputs.first_crash_name }}
      artifact_url: ${{ needs.io_fuzz.outputs.artifact_url }}
-      artifact_name: io-fuzzing-crash-artifacts
-      logs_artifact_name: io-fuzzing-logs
+      artifact_name: file_io-crash-artifacts
+      logs_artifact_name: file_io-logs
       branch: ${{ github.ref_name }}
       commit: ${{ github.sha }}
     secrets:
@@ -146,102 +61,17 @@ jobs:
       issue_number: ${{ needs.report-io-fuzz-failures.outputs.issue_number }}
     secrets: inherit
 
+  # ============================================================================
+  # Array Operations Fuzzer
+  # ============================================================================
   ops_fuzz:
     name: "Array Operations Fuzz"
-    timeout-minutes: 230 # almost 4 hours
-    runs-on:
-      - runs-on=${{ github.run_id }}
-      - family=m8g.large
-      - image=ubuntu24-full-arm64
-      - disk=large
-      - extras=s3-cache
-      - tag=ops-fuzz
-    outputs:
-      crashes_found: ${{ steps.check.outputs.crashes_found }}
-      first_crash_name: ${{ steps.check.outputs.first_crash_name }}
-      artifact_url: ${{ steps.upload_artifacts.outputs.artifact-url }}
-    steps:
-      - uses: runs-on/action@v2
-        with:
-          sccache: s3
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/setup-rust
-        with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
-          toolchain: nightly
-      - name: Install llvm
-        uses: aminya/setup-cpp@v1
-        with:
-          compiler: llvm
-      - name: Install cargo fuzz
-        run: cargo install --locked cargo-fuzz
-      - name: Restore corpus
-        shell: bash
-        run: |
-          aws s3api head-object --bucket vortex-fuzz-corpus --key "array_ops_corpus.tar.zst" --query ETag --output text > current_etag
-          aws s3 cp s3://vortex-fuzz-corpus/array_ops_corpus.tar.zst .
-          tar -xf array_ops_corpus.tar.zst
-        env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
-          AWS_REGION: "us-east-1"
-          AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com"
-      - name: Run fuzzing target
-        id: fuzz
-        run: |
-          RUSTFLAGS="--cfg vortex_nightly" RUST_BACKTRACE=1 cargo +nightly fuzz run --release --debug-assertions array_ops -- -max_total_time=7200 -rss_limit_mb=0 2>&1 | tee fuzz_output.log
-        continue-on-error: true
-      - name: Check for crashes
-        id: check
-        run: |
-          if [ -d "fuzz/artifacts" ] && [ "$(ls -A fuzz/artifacts 2>/dev/null)" ]; then
-            echo "crashes_found=true" >> $GITHUB_OUTPUT
-
-            # Get the first crash file only
-            FIRST_CRASH=$(find fuzz/artifacts -type f \( -name "crash-*" -o -name "leak-*" -o -name "timeout-*" -o -name "oom-*" \) | head -1)
-
-            if [ -n "$FIRST_CRASH" ]; then
-              echo "first_crash=$FIRST_CRASH" >> $GITHUB_OUTPUT
-              echo "first_crash_name=$(basename $FIRST_CRASH)" >> $GITHUB_OUTPUT
-
-              # Count all crashes for reporting
-              CRASH_COUNT=$(find fuzz/artifacts -type f \( -name "crash-*" -o -name "leak-*" -o -name "timeout-*" -o -name "oom-*" \) | wc -l)
-              echo "crash_count=$CRASH_COUNT" >> $GITHUB_OUTPUT
-              echo "Found $CRASH_COUNT crash(es), will process first: $(basename $FIRST_CRASH)"
-            fi
-          else
-            echo "crashes_found=false" >> $GITHUB_OUTPUT
-            echo "crash_count=0" >> $GITHUB_OUTPUT
-            echo "No crashes found"
-          fi
-      - name: Archive crash artifacts
-        id: upload_artifacts
-        if: steps.check.outputs.crashes_found == 'true'
-        uses: actions/upload-artifact@v5
-        with:
-          name: operations-fuzzing-crash-artifacts
-          path: fuzz/artifacts
-          retention-days: 30
-      - name: Archive fuzzer output log
-        if: steps.check.outputs.crashes_found == 'true'
-        uses: actions/upload-artifact@v5
-        with:
-          name: ops-fuzzing-logs
-          path: fuzz_output.log
-          retention-days: 30
-      - name: Persist corpus
-        shell: bash
-        run: |
-          tar -acf array_ops_corpus.tar.zst fuzz/corpus/array_ops
-          aws s3api put-object --bucket vortex-fuzz-corpus --key "array_ops_corpus.tar.zst" --body array_ops_corpus.tar.zst --checksum-algorithm CRC32 --if-match "$(cat current_etag)"
-        env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
-          AWS_REGION: "us-east-1"
-          AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com"
-      - name: Fail job if fuzz run found a bug
-        if: steps.check.outputs.crashes_found == 'true'
-        run: exit 1
+    uses: ./.github/workflows/run-fuzzer.yml
+    with:
+      fuzz_target: array_ops
+    secrets:
+      R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
+      R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
 
   report-ops-fuzz-failures:
     name: "Report Array Operations Fuzz Failures"
@@ -257,10 +87,22 @@
       fuzz_target: array_ops
       crash_file: ${{ needs.ops_fuzz.outputs.first_crash_name }}
       artifact_url: ${{ needs.ops_fuzz.outputs.artifact_url }}
-      artifact_name: operations-fuzzing-crash-artifacts
-      logs_artifact_name: ops-fuzzing-logs
+      artifact_name: array_ops-crash-artifacts
+      logs_artifact_name: array_ops-logs
       branch: ${{ github.ref_name }}
       commit: ${{ github.sha }}
     secrets:
       claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
       gh_token: ${{ secrets.GITHUB_TOKEN }}
+
+  # ============================================================================
+  # Compress Roundtrip Fuzzer
+  # ============================================================================
+  compress_fuzz:
+    name: "Compress Roundtrip Fuzz"
+    uses: ./.github/workflows/run-fuzzer.yml
+    with:
+      fuzz_target: compress_roundtrip
+    secrets:
+      R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
+      R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
diff --git a/.github/workflows/run-fuzzer.yml b/.github/workflows/run-fuzzer.yml
new file mode 100644
index 00000000000..302555d7b54
--- /dev/null
+++ b/.github/workflows/run-fuzzer.yml
@@ -0,0 +1,169 @@
+name: Run Fuzzer
+
+on:
+  workflow_call:
+    inputs:
+      fuzz_target:
+        description: "The cargo fuzz target name (e.g., file_io, array_ops, compress_roundtrip)"
+        required: true
+        type: string
+      max_time:
+        description: "Maximum fuzzing time in seconds"
+        required: false
+        type: number
+        default: 7200
+    outputs:
+      crashes_found:
+        description: "Whether crashes were found"
+        value: ${{ jobs.fuzz.outputs.crashes_found }}
+      first_crash_name:
+        description: "Name of the first crash file"
+        value: ${{ jobs.fuzz.outputs.first_crash_name }}
+      artifact_url:
+        description: "URL of the uploaded crash artifacts"
+        value: ${{ jobs.fuzz.outputs.artifact_url }}
+    secrets:
+      R2_FUZZ_ACCESS_KEY_ID:
+        required: true
+      R2_FUZZ_SECRET_ACCESS_KEY:
+        required: true
+
+jobs:
+  fuzz:
+    name: "Run ${{ inputs.fuzz_target }}"
+    timeout-minutes: 230 # almost 4 hours
+    runs-on:
+      - runs-on=${{ github.run_id }}
+      - family=m8g.large
+      - image=ubuntu24-full-arm64
+      - disk=large
+      - extras=s3-cache
+      - tag=${{ inputs.fuzz_target }}-fuzz
+    outputs:
+      crashes_found: ${{ steps.check.outputs.crashes_found }}
+      first_crash_name: ${{ steps.check.outputs.first_crash_name }}
+      artifact_url: ${{ steps.upload_artifacts.outputs.artifact-url }}
+    env:
+      AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
+      AWS_REGION: "us-east-1"
+      AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com"
+    steps:
+      - uses: runs-on/action@v2
+        with:
+          sccache: s3
+
+      - uses: actions/checkout@v6
+
+      - uses: ./.github/actions/setup-rust
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          toolchain: nightly
+
+      - name: Install llvm
+        uses: aminya/setup-cpp@v1
+        with:
+          compiler: llvm
+
+      - name: Install cargo fuzz
+        run: cargo install --locked cargo-fuzz
+
+      - name: Restore corpus
+        shell: bash
+        run: |
+          CORPUS_KEY="${{ inputs.fuzz_target }}_corpus.tar.zst"
+          CORPUS_DIR="fuzz/corpus/${{ inputs.fuzz_target }}"
+
+          # Try to get ETag for optimistic locking on upload
+          if aws s3api head-object --bucket vortex-fuzz-corpus --key "$CORPUS_KEY" --query ETag --output text > current_etag 2>/dev/null; then
+            echo "Found existing corpus at s3://vortex-fuzz-corpus/$CORPUS_KEY"
+          else
+            echo ""
+            echo "=========================================="
+            echo "WARNING: No existing corpus found for ${{ inputs.fuzz_target }}"
+            echo "This is expected for new fuzzers. Starting with empty corpus."
+            echo "=========================================="
+            echo ""
+            echo '""' > current_etag
+          fi
+
+          # Try to download corpus
+          if aws s3 cp "s3://vortex-fuzz-corpus/$CORPUS_KEY" . 2>/dev/null; then
+            echo "Downloaded corpus successfully"
+            tar -xf "$CORPUS_KEY"
+          else
+            echo "Creating empty corpus directory"
+            mkdir -p "$CORPUS_DIR"
+          fi
+
+      - name: Run fuzzing target
+        id: fuzz
+        run: |
+          RUSTFLAGS="--cfg vortex_nightly" RUST_BACKTRACE=1 \
+            cargo +nightly fuzz run --release --debug-assertions \
+            ${{ inputs.fuzz_target }} -- \
+            -max_total_time=${{ inputs.max_time }} -rss_limit_mb=0 \
+            2>&1 | tee fuzz_output.log
+        continue-on-error: true
+
+      - name: Check for crashes
+        id: check
+        run: |
+          if [ -d "fuzz/artifacts" ] && [ "$(ls -A fuzz/artifacts 2>/dev/null)" ]; then
+            echo "crashes_found=true" >> $GITHUB_OUTPUT
+
+            # Get the first crash file only
+            FIRST_CRASH=$(find fuzz/artifacts -type f \( -name "crash-*" -o -name "leak-*" -o -name "timeout-*" -o -name "oom-*" \) | head -1)
+
+            if [ -n "$FIRST_CRASH" ]; then
+              echo "first_crash=$FIRST_CRASH" >> $GITHUB_OUTPUT
+              echo "first_crash_name=$(basename $FIRST_CRASH)" >> $GITHUB_OUTPUT
+
+              # Count all crashes for reporting
+              CRASH_COUNT=$(find fuzz/artifacts -type f \( -name "crash-*" -o -name "leak-*" -o -name "timeout-*" -o -name "oom-*" \) | wc -l)
+              echo "crash_count=$CRASH_COUNT" >> $GITHUB_OUTPUT
+              echo "Found $CRASH_COUNT crash(es), will process first: $(basename $FIRST_CRASH)"
+            fi
+          else
+            echo "crashes_found=false" >> $GITHUB_OUTPUT
+            echo "crash_count=0" >> $GITHUB_OUTPUT
+            echo "No crashes found"
+          fi
+
+      - name: Archive crash artifacts
+        id: upload_artifacts
+        if: steps.check.outputs.crashes_found == 'true'
+        uses: actions/upload-artifact@v5
+        with:
+          name: ${{ inputs.fuzz_target }}-crash-artifacts
+          path: fuzz/artifacts
+          retention-days: 30
+
+      - name: Archive fuzzer output log
+        if: steps.check.outputs.crashes_found == 'true'
+        uses: actions/upload-artifact@v5
+        with:
+          name: ${{ inputs.fuzz_target }}-logs
+          path: fuzz_output.log
+          retention-days: 30
+
+      - name: Persist corpus
+        shell: bash
+        run: |
+          CORPUS_KEY="${{ inputs.fuzz_target }}_corpus.tar.zst"
+          CORPUS_DIR="fuzz/corpus/${{ inputs.fuzz_target }}"
+
+          tar -acf "$CORPUS_KEY" "$CORPUS_DIR"
+
+          ETAG=$(cat current_etag)
+          if [ "$ETAG" = '""' ] || [ -z "$ETAG" ]; then
+            # New corpus, no ETag check needed
+            aws s3api put-object --bucket vortex-fuzz-corpus --key "$CORPUS_KEY" --body "$CORPUS_KEY" --checksum-algorithm CRC32
+          else
+            # Existing corpus, use optimistic locking
+            aws s3api put-object --bucket vortex-fuzz-corpus --key "$CORPUS_KEY" --body "$CORPUS_KEY" --checksum-algorithm CRC32 --if-match "$ETAG"
+          fi
+
+      - name: Fail job if fuzz run found a bug
+        if: steps.check.outputs.crashes_found == 'true'
+        run: exit 1
diff --git a/Cargo.lock b/Cargo.lock
index 25c73dfe189..45af95c18eb 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -10577,6 +10577,7 @@ dependencies = [
  "vortex-io",
  "vortex-layout",
  "vortex-mask",
+ "vortex-runend",
  "vortex-scalar",
  "vortex-session",
  "vortex-utils",
@@ -10769,6 +10770,7 @@ dependencies = [
 name = "vortex-runend"
 version = "0.1.0"
 dependencies = [
+ "arbitrary",
  "arrow-array 57.2.0",
  "codspeed-divan-compat",
  "itertools 0.14.0",
diff --git a/encodings/runend/Cargo.toml b/encodings/runend/Cargo.toml
index d88e20b481d..9535c84cec7 100644
--- a/encodings/runend/Cargo.toml
+++ b/encodings/runend/Cargo.toml
@@ -14,6 +14,7 @@ rust-version = { workspace = true }
 version = { workspace = true }
 
 [dependencies]
+arbitrary = { workspace = true, optional = true }
 arrow-array = { workspace = true, optional = true }
 itertools = { workspace = true }
 num-traits = { workspace = true }
@@ -37,6 +38,7 @@ rstest = { workspace = true }
 vortex-array = { workspace = true, features = ["_test-harness"] }
 
 [features]
+arbitrary = ["dep:arbitrary", "vortex-array/arbitrary"]
 arrow = ["dep:arrow-array"]
 
 [[bench]]
diff --git a/encodings/runend/src/arbitrary.rs b/encodings/runend/src/arbitrary.rs
new file mode 100644
index 00000000000..d70d4a5d4bc
--- /dev/null
+++ b/encodings/runend/src/arbitrary.rs
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use arbitrary::Arbitrary;
+use arbitrary::Result;
+use arbitrary::Unstructured;
+use vortex_array::IntoArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::arbitrary::ArbitraryArray;
+use vortex_array::validity::Validity;
+use vortex_buffer::Buffer;
+use vortex_dtype::DType;
+use vortex_dtype::Nullability;
+use vortex_dtype::PType;
+use vortex_error::VortexExpect;
+
+use crate::RunEndArray;
+
+/// A wrapper type to implement `Arbitrary` for `RunEndArray`.
+#[derive(Clone, Debug)]
+pub struct ArbitraryRunEndArray(pub RunEndArray);
+
+impl<'a> Arbitrary<'a> for ArbitraryRunEndArray {
+    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
+        // RunEnd supports Bool or Primitive types for values
+        // Pick a random primitive type for values
+        let ptype: PType = u.arbitrary()?;
+        let nullability: Nullability = u.arbitrary()?;
+        let dtype = DType::Primitive(ptype, nullability);
+        Self::with_dtype(u, &dtype, None)
+    }
+}
+
+impl ArbitraryRunEndArray {
+    /// Generate an arbitrary RunEndArray with the given dtype for values.
+    ///
+    /// The dtype must be a primitive or boolean type.
+    pub fn with_dtype(u: &mut Unstructured, dtype: &DType, len: Option<usize>) -> Result<Self> {
+        // Number of runs (values/ends pairs)
+        let num_runs = u.int_in_range(0..=20)?;
+
+        if num_runs == 0 {
+            // Empty RunEndArray
+            let ends = PrimitiveArray::from_iter(Vec::<u64>::new()).into_array();
+            let values = ArbitraryArray::arbitrary_with(u, Some(0), dtype)?.0;
+            let runend_array = RunEndArray::try_new(ends, values)
+                .vortex_expect("Empty RunEndArray creation should succeed");
+            return Ok(ArbitraryRunEndArray(runend_array));
+        }
+
+        // Generate arbitrary values for each run
+        let values = ArbitraryArray::arbitrary_with(u, Some(num_runs), dtype)?.0;
+
+        // Generate strictly increasing ends
+        // Each end must be > previous end, and first end must be >= 1
+        let ends = random_strictly_sorted_ends(u, num_runs, len)?;
+
+        let runend_array = RunEndArray::try_new(ends, values)
+            .vortex_expect("RunEndArray creation should succeed in arbitrary impl");
+
+        Ok(ArbitraryRunEndArray(runend_array))
+    }
+}
+
+/// Generate a strictly sorted array of run ends.
+///
+/// Returns an array of `num_runs` strictly increasing unsigned integers.
+/// If `target_len` is provided, the last end will be exactly that value.
+fn random_strictly_sorted_ends(
+    u: &mut Unstructured,
+    num_runs: usize,
+    target_len: Option<usize>,
+) -> Result<vortex_array::ArrayRef> {
+    // Choose a random unsigned PType for ends
+    let ends_ptype = *u.choose(&[PType::U8, PType::U16, PType::U32, PType::U64])?;
+
+    // Generate strictly increasing values
+    // Start from 0, increment by at least 1 each time
+    let mut ends: Vec<u64> = Vec::with_capacity(num_runs);
+    let mut current: u64 = 0;
+
+    for i in 0..num_runs {
+        // Each run must have at least length 1, so increment by at least 1
+        let increment = match (i == num_runs - 1, target_len) {
+            (true, Some(target)) => {
+                // Last element should reach target_len
+                let target = target as u64;
+                if target > current {
+                    target - current
+                } else {
+                    1
+                }
+            }
+            _ => {
+                // Random increment between 1 and 10
+                u.int_in_range(1..=10)?
+            }
+        };
+        current += increment;
+        ends.push(current);
+    }
+
+    // Convert to the chosen PType
+    // The values are bounded: max is num_runs (20) * max_increment (10) = 200
+    // This fits in all unsigned types
+    let ends_array = match ends_ptype {
+        PType::U8 => {
+            let ends_typed: Vec<u8> = ends
+                .iter()
+                .map(|&e| u8::try_from(e).vortex_expect("end value fits in u8"))
+                .collect();
+            PrimitiveArray::new(Buffer::copy_from(ends_typed), Validity::NonNullable).into_array()
+        }
+        PType::U16 => {
+            let ends_typed: Vec<u16> = ends
+                .iter()
+                .map(|&e| u16::try_from(e).vortex_expect("end value fits in u16"))
+                .collect();
+            PrimitiveArray::new(Buffer::copy_from(ends_typed), Validity::NonNullable).into_array()
+        }
+        PType::U32 => {
+            let ends_typed: Vec<u32> = ends
+                .iter()
+                .map(|&e| u32::try_from(e).vortex_expect("end value fits in u32"))
+                .collect();
+            PrimitiveArray::new(Buffer::copy_from(ends_typed), Validity::NonNullable).into_array()
+        }
+        PType::U64 => {
+            PrimitiveArray::new(Buffer::copy_from(ends), Validity::NonNullable).into_array()
+        }
+        _ => unreachable!("Only unsigned integer types are valid for ends"),
+    };
+
+    Ok(ends_array)
+}
diff --git a/encodings/runend/src/lib.rs b/encodings/runend/src/lib.rs
index bb6e6a43359..589b16e2c65 100644
--- a/encodings/runend/src/lib.rs
+++ b/encodings/runend/src/lib.rs
@@ -1,6 +1,10 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+#[cfg(feature = "arbitrary")]
+mod arbitrary;
+#[cfg(feature = "arbitrary")]
+pub use arbitrary::ArbitraryRunEndArray;
 pub use array::*;
 pub use iter::trimmed_ends_iter;
 
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index 44ba530a428..a9e2119bdb4 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -39,6 +39,7 @@ vortex-error = { workspace = true }
 vortex-io = { workspace = true }
 vortex-layout = { workspace = true }
 vortex-mask = { workspace = true }
+vortex-runend = { workspace = true, features = ["arbitrary"] }
 vortex-scalar = { workspace = true, features = ["arbitrary"] }
 vortex-session = { workspace = true }
 vortex-utils = { workspace = true }
@@ -73,3 +74,11 @@ name = "array_ops_wasm"
 path = "fuzz_targets/array_ops_wasm.rs"
 test = false
 required-features = ["wasmfuzz"]
+
+[[bin]]
+bench = false
+doc = false
+name = "compress_roundtrip"
+path = "fuzz_targets/compress_roundtrip.rs"
+test = false
+required-features = ["native"]
diff --git a/fuzz/fuzz_targets/compress_roundtrip.rs b/fuzz/fuzz_targets/compress_roundtrip.rs
new file mode 100644
index 00000000000..22e2ee80a8e
--- /dev/null
+++ b/fuzz/fuzz_targets/compress_roundtrip.rs
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![no_main]
+#![allow(clippy::unwrap_used, clippy::result_large_err)]
+
+use libfuzzer_sys::Corpus;
+use libfuzzer_sys::fuzz_target;
+use vortex_error::vortex_panic;
+use vortex_fuzz::FuzzCompressRoundtrip;
+use vortex_fuzz::run_compress_roundtrip;
+
+fuzz_target!(|fuzz: FuzzCompressRoundtrip| -> Corpus {
+    match run_compress_roundtrip(fuzz) {
+        Ok(true) => Corpus::Keep,
+        Ok(false) => Corpus::Reject,
+        Err(e) => {
+            vortex_panic!("{e}");
+        }
+    }
+});
diff --git a/fuzz/src/compress.rs b/fuzz/src/compress.rs
new file mode 100644
index 00000000000..da132e7cd95
--- /dev/null
+++ b/fuzz/src/compress.rs
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Fuzzer module for testing compressed encoding canonicalization.
+//!
+//! This module generates arbitrary instances of compressed encodings (DictArray, etc.),
+//! then verifies that `to_canonical()` works and produces correct `len` and `dtype`.
+
+use arbitrary::Arbitrary;
+use arbitrary::Unstructured;
+use vortex_array::Array;
+use vortex_array::ArrayRef;
+use vortex_array::IntoArray;
+use vortex_array::arrays::ArbitraryConstantArray;
+use vortex_array::arrays::ArbitraryDictArray;
+use vortex_runend::ArbitraryRunEndArray;
+
+/// Which compressed encoding to generate.
+#[derive(Debug, Clone, Copy)]
+pub enum EncodingKind {
+    Dict,
+    Constant,
+    RunEnd,
+}
+
+impl<'a> Arbitrary<'a> for EncodingKind {
+    fn arbitrary(u: &mut Unstructured<'a>) -> arbitrary::Result<Self> {
+        match u.int_in_range(0..=2)? {
+            0 => Ok(EncodingKind::Dict),
+            1 => Ok(EncodingKind::Constant),
+            2 => Ok(EncodingKind::RunEnd),
+            _ => unreachable!(),
+        }
+    }
+}
+
+/// Input for the compressed encoding canonicalization fuzzer.
+#[derive(Debug)]
+pub struct FuzzCompressRoundtrip {
+    pub array: ArrayRef,
+}
+
+impl<'a> Arbitrary<'a> for FuzzCompressRoundtrip {
+    fn arbitrary(u: &mut Unstructured<'a>) -> arbitrary::Result<Self> {
+        let kind: EncodingKind = u.arbitrary()?;
+
+        let array = match kind {
+            EncodingKind::Dict => ArbitraryDictArray::arbitrary(u)?.0.into_array(),
+            EncodingKind::Constant => ArbitraryConstantArray::arbitrary(u)?.0.into_array(),
+            EncodingKind::RunEnd => ArbitraryRunEndArray::arbitrary(u)?.0.into_array(),
+        };
+
+        Ok(FuzzCompressRoundtrip { array })
+    }
+}
+
+/// Run the compressed encoding canonicalization fuzzer.
+///
+/// Returns:
+/// - `Ok(true)` - keep in corpus
+/// - `Ok(false)` - reject from corpus
+/// - `Err(_)` - a bug was found
+#[allow(clippy::result_large_err)]
+pub fn run_compress_roundtrip(fuzz: FuzzCompressRoundtrip) -> crate::error::VortexFuzzResult {
+    use crate::error::Backtrace;
+    use crate::error::VortexFuzzError;
+
+    let FuzzCompressRoundtrip { array } = fuzz;
+
+    // Store original properties
+    let original_len = array.len();
+    let original_dtype = array.dtype().clone();
+
+    // Try to canonicalize - this is the main thing we're testing
+    let canonical = match array.to_canonical() {
+        Ok(c) => c,
+        Err(e) => {
+            // Canonicalization failed - this is a bug
+            return Err(VortexFuzzError::VortexError(e, Backtrace::capture()));
+        }
+    };
+
+    let canonical_array: ArrayRef = canonical.into_array();
+
+    // Verify dtype is preserved
+    if &original_dtype != canonical_array.dtype() {
+        return Err(VortexFuzzError::DTypeMismatch(
+            array,
+            canonical_array,
+            0,
+            Backtrace::capture(),
+        ));
+    }
+
+    // Verify len is preserved
+    if original_len != canonical_array.len() {
+        return Err(VortexFuzzError::LengthMismatch(
+            original_len,
+            canonical_array.len(),
+            array,
+            canonical_array,
+            0,
+            Backtrace::capture(),
+        ));
+    }
+
+    Ok(true)
+}
diff --git a/fuzz/src/lib.rs b/fuzz/src/lib.rs
index ad3904dc298..dd0b3022642 100644
--- a/fuzz/src/lib.rs
+++ b/fuzz/src/lib.rs
@@ -4,6 +4,7 @@
 #![allow(clippy::use_debug)]
 
 mod array;
+pub mod compress;
 pub mod error;
 
 // File module only available for native builds (requires vortex-file which uses tokio)
@@ -15,6 +16,8 @@ pub use array::ExpectedValue;
 pub use array::FuzzArrayAction;
 pub use array::run_fuzz_action;
 pub use array::sort_canonical_array;
+pub use compress::FuzzCompressRoundtrip;
+pub use compress::run_compress_roundtrip;
 
 #[cfg(not(target_arch = "wasm32"))]
 pub use file::FuzzFileAction;
diff --git a/vortex-array/src/arrays/arbitrary.rs b/vortex-array/src/arrays/arbitrary.rs
index 64da1c49778..180f8c7db4b 100644
--- a/vortex-array/src/arrays/arbitrary.rs
+++ b/vortex-array/src/arrays/arbitrary.rs
@@ -328,7 +328,11 @@ fn random_bool(
     Ok(BoolArray::from_bit_buffer(BitBuffer::from(v), validity).into_array())
 }
 
-fn random_validity(u: &mut Unstructured, nullability: Nullability, len: usize) -> Result<Validity> {
+pub fn random_validity(
+    u: &mut Unstructured,
+    nullability: Nullability,
+    len: usize,
+) -> Result<Validity> {
     match nullability {
         Nullability::NonNullable => Ok(Validity::NonNullable),
         Nullability::Nullable => Ok(match u.int_in_range(0..=2)? {
diff --git a/vortex-array/src/arrays/constant/arbitrary.rs b/vortex-array/src/arrays/constant/arbitrary.rs
new file mode 100644
index 00000000000..fb3cadc4788
--- /dev/null
+++ b/vortex-array/src/arrays/constant/arbitrary.rs
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use arbitrary::Arbitrary;
+use arbitrary::Result;
+use arbitrary::Unstructured;
+use vortex_dtype::DType;
+use vortex_scalar::arbitrary::random_scalar;
+
+use super::ConstantArray;
+
+/// A wrapper type to implement `Arbitrary` for `ConstantArray`.
+#[derive(Clone, Debug)]
+pub struct ArbitraryConstantArray(pub ConstantArray);
+
+impl<'a> Arbitrary<'a> for ArbitraryConstantArray {
+    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
+        let dtype: DType = u.arbitrary()?;
+        Self::with_dtype(u, &dtype, None)
+    }
+}
+
+impl ArbitraryConstantArray {
+    /// Generate an arbitrary ConstantArray with the given dtype.
+    pub fn with_dtype(u: &mut Unstructured, dtype: &DType, len: Option<usize>) -> Result<Self> {
+        let scalar = random_scalar(u, dtype)?;
+        let len = len.unwrap_or(u.int_in_range(0..=100)?);
+        Ok(ArbitraryConstantArray(ConstantArray::new(scalar, len)))
+    }
+}
diff --git a/vortex-array/src/arrays/constant/mod.rs b/vortex-array/src/arrays/constant/mod.rs
index ddc4a79dbf3..079566bf567 100644
--- a/vortex-array/src/arrays/constant/mod.rs
+++ b/vortex-array/src/arrays/constant/mod.rs
@@ -1,6 +1,11 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+#[cfg(feature = "arbitrary")]
+mod arbitrary;
+#[cfg(feature = "arbitrary")]
+pub use arbitrary::ArbitraryConstantArray;
+
 mod array;
 pub use array::ConstantArray;
 
diff --git a/vortex-array/src/arrays/dict/arbitrary.rs b/vortex-array/src/arrays/dict/arbitrary.rs
new file mode 100644
index 00000000000..4e44fb0dd6c
--- /dev/null
+++ b/vortex-array/src/arrays/dict/arbitrary.rs
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use arbitrary::Arbitrary;
+use arbitrary::Result;
+use arbitrary::Unstructured;
+use num_traits::NumCast;
+use vortex_buffer::Buffer;
+use vortex_dtype::DType;
+use vortex_dtype::NativePType;
+use vortex_dtype::Nullability;
+use vortex_dtype::PType;
+use vortex_error::VortexExpect;
+
+use super::DictArray;
+use crate::ArrayRef;
+use crate::IntoArray;
+use crate::arrays::PrimitiveArray;
+use crate::arrays::arbitrary::ArbitraryArray;
+use crate::arrays::arbitrary::random_validity;
+
+/// A wrapper type to implement `Arbitrary` for `DictArray`.
+#[derive(Clone, Debug)]
+pub struct ArbitraryDictArray(pub DictArray);
+
+impl<'a> Arbitrary<'a> for ArbitraryDictArray {
+    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
+        let dtype: DType = u.arbitrary()?;
+        Self::with_dtype(u, &dtype, None)
+    }
+}
+
+impl ArbitraryDictArray {
+    /// Generate an arbitrary DictArray with the given dtype for values.
+    pub fn with_dtype(u: &mut Unstructured, dtype: &DType, len: Option<usize>) -> Result<Self> {
+        // Generate the number of unique values (dictionary size)
+        let values_len = u.int_in_range(1..=20)?;
+        // Generate values array with the given dtype
+        let values = ArbitraryArray::arbitrary_with(u, Some(values_len), dtype)?.0;
+
+        // Generate codes that index into the values
+        let codes_len = len.unwrap_or(u.int_in_range(0..=100)?);
+
+        // Determine the minimum PType that can represent all indices (max index is values_len - 1)
+        let min_codes_ptype = PType::min_unsigned_ptype_for_value((values_len - 1) as u64);
+
+        // Choose a random PType at least as wide as the minimum
+        let valid_ptypes: &[PType] = match min_codes_ptype {
+            PType::U8 => &[PType::U8, PType::U16, PType::U32, PType::U64],
+            PType::U16 => &[PType::U16, PType::U32, PType::U64],
+            PType::U32 => &[PType::U32, PType::U64],
+            PType::U64 => &[PType::U64],
+            _ => unreachable!(),
+        };
+        let codes_ptype = *u.choose(valid_ptypes)?;
+
+        // Generate codes with optional nullability
+        let codes_nullable: Nullability = u.arbitrary()?;
+        let codes = match codes_ptype {
+            PType::U8 => random_codes::<u8>(u, codes_len, values_len, codes_nullable)?,
+            PType::U16 => random_codes::<u16>(u, codes_len, values_len, codes_nullable)?,
+            PType::U32 => random_codes::<u32>(u, codes_len, values_len, codes_nullable)?,
+            PType::U64 => random_codes::<u64>(u, codes_len, values_len, codes_nullable)?,
+            _ => unreachable!(),
+        };
+
+        Ok(ArbitraryDictArray(
+            DictArray::try_new(codes, values)
+                .vortex_expect("DictArray creation should succeed in arbitrary impl"),
+        ))
+    }
+}
+
+/// Generate random codes for a DictArray with a specific unsigned integer type.
+fn random_codes<T>(
+    u: &mut Unstructured,
+    len: usize,
+    max_value: usize,
+    nullability: Nullability,
+) -> Result<ArrayRef>
+where
+    T: NativePType + NumCast,
+{
+    let codes: Vec<T> = (0..len)
+        .map(|_| {
+            let idx = u.int_in_range(0..=max_value - 1)?;
+            // max_value is bounded by T::MAX in the caller, so conversion always succeeds
+            Ok(T::from(idx).vortex_expect("value within type bounds"))
+        })
+        .collect::<Result<Vec<T>>>()?;
+    let validity = random_validity(u, nullability, len)?;
+    Ok(PrimitiveArray::new(Buffer::copy_from(codes), validity).into_array())
+}
diff --git a/vortex-array/src/arrays/dict/mod.rs b/vortex-array/src/arrays/dict/mod.rs
index 2123f498864..3a448e763c7 100644
--- a/vortex-array/src/arrays/dict/mod.rs
+++ b/vortex-array/src/arrays/dict/mod.rs
@@ -6,6 +6,11 @@
 //! Expose a [DictArray] which is zero-copy equivalent to Arrow's
 //! [DictionaryArray](https://docs.rs/arrow/latest/arrow/array/struct.DictionaryArray.html).
 
+#[cfg(feature = "arbitrary")]
+mod arbitrary;
+#[cfg(feature = "arbitrary")]
+pub use arbitrary::ArbitraryDictArray;
+
 mod array;
 pub use array::*;
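Local reproduction sketch (not part of the patch above): the reusable workflow's fuzz step reduces to the cargo-fuzz invocation below. It assumes a nightly toolchain with cargo-fuzz installed and a checkout of the repository; the 60-second budget is illustrative, whereas CI passes max_time (default 7200 seconds).

    # Install the fuzzing driver once (same as the workflow's "Install cargo fuzz" step)
    cargo install --locked cargo-fuzz

    # Run the new compress_roundtrip target with a short, illustrative time budget
    RUSTFLAGS="--cfg vortex_nightly" RUST_BACKTRACE=1 \
      cargo +nightly fuzz run --release --debug-assertions compress_roundtrip -- \
      -max_total_time=60 -rss_limit_mb=0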