From 77608d43a30a9940f358e7dfcc7dcafc741e3d8b Mon Sep 17 00:00:00 2001 From: Onur Satici Date: Mon, 12 Jan 2026 15:58:24 +0000 Subject: [PATCH 01/12] io bench Signed-off-by: Onur Satici --- Cargo.lock | 2 + vortex-bench/Cargo.toml | 2 + vortex-bench/src/bin/scan_io_bench.rs | 600 ++++++++++++++++++++++++++ vortex-layout/src/layout.rs | 25 ++ vortex-scan/src/scan_builder.rs | 2 +- 5 files changed, 630 insertions(+), 1 deletion(-) create mode 100644 vortex-bench/src/bin/scan_io_bench.rs diff --git a/Cargo.lock b/Cargo.lock index b619866ea58..2e63f32b6c9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10152,6 +10152,7 @@ dependencies = [ "mimalloc", "noodles-bgzf", "noodles-vcf", + "object_store", "parking_lot", "parquet 57.2.0", "rand 0.9.2", @@ -10173,6 +10174,7 @@ dependencies = [ "url", "uuid", "vortex", + "vortex-scan", ] [[package]] diff --git a/vortex-bench/Cargo.toml b/vortex-bench/Cargo.toml index aee90d34b77..8ee74f66689 100644 --- a/vortex-bench/Cargo.toml +++ b/vortex-bench/Cargo.toml @@ -32,6 +32,7 @@ humansize = { workspace = true } indicatif = { workspace = true, features = ["futures"] } itertools = { workspace = true } mimalloc = { workspace = true } +object_store = { workspace = true, features = ["aws", "http", "fs"] } noodles-bgzf = { workspace = true, features = ["async"] } noodles-vcf = { workspace = true, features = ["async"] } parking_lot = { workspace = true } @@ -64,3 +65,4 @@ vortex = { workspace = true, features = [ "zstd", "unstable_encodings", ] } +vortex-scan = { workspace = true } diff --git a/vortex-bench/src/bin/scan_io_bench.rs b/vortex-bench/src/bin/scan_io_bench.rs new file mode 100644 index 00000000000..356d9902b25 --- /dev/null +++ b/vortex-bench/src/bin/scan_io_bench.rs @@ -0,0 +1,600 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::path::Path; +use std::path::PathBuf; +use std::time::Instant; + +use anyhow::Result; +use clap::Parser; +use clap::ValueEnum; +use futures::StreamExt; +use futures::TryStreamExt; +use object_store::ObjectStore; +use object_store::ObjectStoreScheme; +use object_store::aws::AmazonS3Builder; +use object_store::http::HttpBuilder; +use object_store::local::LocalFileSystem; +use object_store::path::Path as ObjectStorePath; +use url::Url; +use vortex::array::Array; +use vortex::array::MaskFuture; +use vortex::array::expr::Expression; +use vortex::array::expr::col; +use vortex::array::expr::eq; +use vortex::array::expr::gt; +use vortex::array::expr::gt_eq; +use vortex::array::expr::lit; +use vortex::array::expr::lt; +use vortex::array::expr::lt_eq; +use vortex::array::expr::not_eq; +use vortex::array::expr::root; +use vortex::array::expr::select; +use vortex::dtype::FieldNames; +use vortex::error::VortexResult; +use vortex::error::vortex_err; +use vortex::file::OpenOptionsSessionExt; +use vortex::layout::collect_segment_ids; +use vortex::layout::LayoutReader; +use vortex::mask::Mask; +use vortex::metrics::VortexMetrics; +use parking_lot::Mutex; +use vortex_bench::SESSION; +use std::sync::atomic::AtomicBool; +use std::sync::atomic::Ordering; +use tracing_subscriber::EnvFilter; +use vortex_scan::ScanBuilder; + +#[derive(Parser, Debug)] +#[command(version, about = "Benchmark Vortex scans over local files vs object stores")] +struct Args { + /// File path, directory, or object store URL (e.g. 
file:/..., s3://bucket/path, https://host/path) + #[arg(long)] + source: String, + /// Use object_store even for file: URLs + #[arg(long, default_value_t = false)] + force_object_store: bool, + /// Run a predefined scan shape. + #[arg(long, value_enum)] + preset: Option, + /// Projection field names (comma-separated). + #[arg(long, value_delimiter = ',')] + projection: Option>, + /// Filter column name. + #[arg(long)] + filter_col: Option, + /// Filter operator. + #[arg(long, value_enum)] + filter_op: Option, + /// Filter literal value (integer). + #[arg(long)] + filter_value: Option, + /// Filter literal type. + #[arg(long, value_enum, default_value_t = LiteralType::I64)] + filter_type: LiteralType, + /// Number of scan iterations. + #[arg(long, default_value_t = 1)] + iterations: usize, + /// Scan concurrency (tasks per thread). + #[arg(long, default_value_t = 4)] + concurrency: usize, + /// Max files scanned in parallel (file-level readahead). + #[arg(long, default_value_t = 1)] + file_concurrency: usize, + /// Reopen the file for each iteration to avoid caching effects. + #[arg(long, default_value_t = false)] + reopen: bool, + /// Which scan path to use. + #[arg(long, value_enum, default_value_t = ScanMode::Full)] + mode: ScanMode, + /// Only read segments and drop buffers (skip decode/projection). + #[arg(long, default_value_t = false)] + io_only: bool, + /// Only prune whole segments (no intra-segment pruning on CPU). + #[arg(long, default_value_t = false)] + prune_segments: bool, +} + +#[derive(ValueEnum, Clone, Debug)] +enum ScanMode { + /// Read segments only (no decode). + Io, + /// Decode arrays without filter evaluation. + Decode, + /// Decode arrays with full filter/projection evaluation. + Full, +} + +#[derive(ValueEnum, Clone, Debug)] +enum Preset { + /// ClickBench query #2: AdvEngineID != 0, projecting AdvEngineID. 
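+    /// Roughly ClickBench's `SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0`, except
+    /// the filter column is projected instead of counted (1-based numbering assumed).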
+ Clickbench2, +} + +#[derive(ValueEnum, Clone, Debug)] +enum FilterOp { + Eq, + Neq, + Gt, + Gte, + Lt, + Lte, +} + +#[derive(ValueEnum, Clone, Debug, Copy)] +enum LiteralType { + I16, + I32, + I64, +} + +#[tokio::main] +async fn main() -> Result<()> { + tracing_subscriber::fmt() + .with_env_filter(EnvFilter::from_default_env()) + .init(); + + let args = Args::parse(); + let mode = if args.io_only { ScanMode::Io } else { args.mode.clone() }; + + let (projection, filter) = build_scan_exprs(&args)?; + let metrics = VortexMetrics::new_with_tags([("bench", "scan-io")]); + let read_bytes = metrics.counter("vortex.io.read.total_size"); + + let targets = resolve_targets(&args).await?; + let cached_files = if args.reopen { + None + } else { + Some(std::sync::Arc::new( + open_all_targets(&targets, metrics.clone(), args.file_concurrency).await?, + )) + }; + let mut total_rows = 0usize; + let mut total_elapsed = 0.0f64; + let mut total_bytes = 0i64; + let mut total_first_latency = 0.0f64; + let mut total_first_bytes = 0i64; + + for _ in 0..args.iterations { + read_bytes.clear(); + + let start = Instant::now(); + let bytes_before = read_bytes.count(); + let first_seen = std::sync::Arc::new(AtomicBool::new(false)); + let first_info = std::sync::Arc::new(Mutex::new(None::<(f64, i64)>)); + + let rows = futures::stream::iter(targets.iter().enumerate()) + .map(|(idx, target)| { + let cached_files = cached_files.clone(); + let projection = projection.clone(); + let filter = filter.clone(); + let metrics = metrics.clone(); + let read_bytes = read_bytes.clone(); + let first_seen = first_seen.clone(); + let first_info = first_info.clone(); + let mode = mode.clone(); + async move { + let file = match &cached_files { + Some(files) => files[idx].clone(), + None => open_vortex_file_for_target(target, metrics.clone()).await?, + }; + + if args.prune_segments + && let Some(filter) = filter.as_ref() + && file.can_prune(filter)? + { + return Ok::<_, anyhow::Error>(0); + } + + if matches!(mode, ScanMode::Io) { + read_all_segments(&file, args.concurrency).await?; + if !first_seen.load(Ordering::Relaxed) + && !first_seen.swap(true, Ordering::Relaxed) + { + let latency = start.elapsed().as_secs_f64(); + let bytes = read_bytes.count() - bytes_before; + *first_info.lock() = Some((latency, bytes)); + } + let file_rows = usize::try_from(file.row_count()) + .map_err(|_| anyhow::anyhow!("row_count exceeds usize"))?; + drop(file); + return Ok::<_, anyhow::Error>(file_rows); + } + + let (scan_projection, scan_filter, bypass_filter) = match mode { + ScanMode::Decode => { + let scan_filter = if args.prune_segments { + filter.clone() + } else { + None + }; + (root(), scan_filter, true) + } + ScanMode::Full => (projection.clone(), filter.clone(), false), + ScanMode::Io => unreachable!("io-only handled above"), + }; + + let layout_reader = file.layout_reader()?; + let layout_reader = if args.prune_segments || bypass_filter { + std::sync::Arc::new(BenchLayoutReader::new( + layout_reader, + args.prune_segments, + bypass_filter, + )) as std::sync::Arc + } else { + layout_reader + }; + + let scan = ScanBuilder::new(SESSION.clone(), layout_reader) + .with_metrics(metrics.clone()) + .with_projection(scan_projection) + .with_some_filter(scan_filter) + .with_concurrency(args.concurrency) + .map(|array| Ok(array.len())); + + let mut stream = scan.into_stream()?; + let mut file_rows = 0usize; + while let Some(rows) = stream.try_next().await? 
{ + if !first_seen.load(Ordering::Relaxed) + && !first_seen.swap(true, Ordering::Relaxed) + { + let latency = start.elapsed().as_secs_f64(); + let bytes = read_bytes.count() - bytes_before; + *first_info.lock() = Some((latency, bytes)); + } + file_rows += rows; + } + + drop(file); + Ok::<_, anyhow::Error>(file_rows) + } + }) + .buffer_unordered(args.file_concurrency.max(1)) + .try_fold(0usize, |rows, file_rows| async move { Ok(rows + file_rows) }) + .await?; + + let elapsed = start.elapsed().as_secs_f64(); + let bytes = read_bytes.count(); + + total_rows += rows; + total_elapsed += elapsed; + total_bytes += bytes; + let (iter_first_latency, iter_first_bytes) = + first_info.lock().unwrap_or((elapsed, read_bytes.count() - bytes_before)); + total_first_latency += iter_first_latency; + total_first_bytes += iter_first_bytes; + + } + + let avg_elapsed = total_elapsed / args.iterations as f64; + let avg_bytes = total_bytes as f64 / args.iterations as f64; + let avg_first_latency = total_first_latency / args.iterations as f64; + let avg_first_bytes = total_first_bytes as f64 / args.iterations as f64; + let steady_bytes = (avg_bytes - avg_first_bytes).max(0.0); + let steady_time = (avg_elapsed - avg_first_latency).max(0.0); + let total_mb_s = if avg_elapsed > 0.0 { + avg_bytes / (1024.0 * 1024.0) / avg_elapsed + } else { + 0.0 + }; + let steady_mb_s = if steady_time > 0.0 { + steady_bytes / (1024.0 * 1024.0) / steady_time + } else { + 0.0 + }; + + println!("files={}", targets.len()); + println!("rows={}", total_rows / args.iterations); + println!("avg_time_s={:.3}", avg_elapsed); + println!("avg_bytes={:.0}", avg_bytes); + println!("avg_mb_s={:.2}", total_mb_s); + println!("avg_first_latency_ms={:.2}", avg_first_latency * 1000.0); + println!("steady_mb_s={:.2}", steady_mb_s); + + Ok(()) +} + +fn build_scan_exprs(args: &Args) -> VortexResult<(Expression, Option)> { + if let Some(preset) = &args.preset { + return build_preset_exprs(preset); + } + + let projection = match &args.projection { + Some(fields) if !fields.is_empty() => { + let names = FieldNames::from_iter(fields.iter().map(|s| s.as_str())); + select(names, root()) + } + _ => root(), + }; + + let filter = match (&args.filter_col, &args.filter_op, args.filter_value) { + (Some(col_name), Some(op), Some(value)) => { + let lhs = col(col_name.as_str()); + let rhs = match args.filter_type { + LiteralType::I16 => lit( + i16::try_from(value).map_err(|_| vortex_err!("filter_value does not fit in i16"))?, + ), + LiteralType::I32 => lit( + i32::try_from(value).map_err(|_| vortex_err!("filter_value does not fit in i32"))?, + ), + LiteralType::I64 => lit(value), + }; + Some(apply_filter_op(op.clone(), lhs, rhs)) + } + _ => None, + }; + + Ok((projection, filter)) +} + +fn build_preset_exprs(preset: &Preset) -> VortexResult<(Expression, Option)> { + match preset { + Preset::Clickbench2 => { + let projection = select(["AdvEngineID"], root()); + let filter = not_eq(col("AdvEngineID"), lit(0_i16)); + Ok((projection, Some(filter))) + } + } +} + +fn apply_filter_op(op: FilterOp, lhs: Expression, rhs: Expression) -> Expression { + match op { + FilterOp::Eq => eq(lhs, rhs), + FilterOp::Neq => not_eq(lhs, rhs), + FilterOp::Gt => gt(lhs, rhs), + FilterOp::Gte => gt_eq(lhs, rhs), + FilterOp::Lt => lt(lhs, rhs), + FilterOp::Lte => lt_eq(lhs, rhs), + } +} + +#[derive(Clone)] +enum ScanTarget { + Local(PathBuf), + ObjectStore { + store: std::sync::Arc, + path: ObjectStorePath, + }, +} + +async fn resolve_targets(args: &Args) -> Result> { + let source = &args.source; + 
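+    // Anything that parses as a URL goes through object_store (except file: URLs,
+    // unless --force-object-store); bare paths fall back to the local filesystem.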
+ if let Ok(url) = Url::parse(source) { + if url.scheme() == "file" && !args.force_object_store { + let path = url + .to_file_path() + .map_err(|_| anyhow::anyhow!("Invalid file URL: {source}"))?; + return Ok(resolve_local_targets(&path)); + } + + let (scheme, store, path) = object_store_from_url(source)?; + if is_prefix(source) { + if matches!(scheme, ObjectStoreScheme::Http) { + anyhow::bail!("HTTP object stores do not support listing prefixes"); + } + let mut entries = store.list(Some(&path)); + let mut targets = Vec::new(); + while let Some(entry) = entries.try_next().await? { + targets.push(ScanTarget::ObjectStore { + store: store.clone(), + path: entry.location.clone(), + }); + } + return Ok(targets); + } + + return Ok(vec![ScanTarget::ObjectStore { + store, + path, + }]); + } + + let path = PathBuf::from(source); + Ok(resolve_local_targets(&path)) +} + +fn resolve_local_targets(path: &Path) -> Vec { + if path.is_dir() { + let mut entries = match std::fs::read_dir(path) { + Ok(entries) => entries + .filter_map(|entry| entry.ok()) + .map(|entry| entry.path()) + .filter(|entry| entry.extension().is_some_and(|e| e == "vortex")) + .collect::>(), + Err(_) => Vec::new(), + }; + entries.sort(); + entries.into_iter().map(ScanTarget::Local).collect() + } else { + vec![ScanTarget::Local(path.to_path_buf())] + } +} + +fn is_prefix(source: &str) -> bool { + source.ends_with('/') +} + +async fn open_vortex_file_for_target( + target: &ScanTarget, + metrics: VortexMetrics, +) -> Result { + let session = SESSION.clone(); + match target { + ScanTarget::Local(path) => Ok(session + .open_options() + .with_metrics(metrics) + .open(path.clone()) + .await?), + ScanTarget::ObjectStore { store, path } => { + let path_str = path.to_string(); + Ok(session + .open_options() + .with_metrics(metrics) + .open_object_store(store, &path_str) + .await?) + } + } +} + +async fn open_all_targets( + targets: &[ScanTarget], + metrics: VortexMetrics, + concurrency: usize, +) -> Result> { + let mut files = vec![None; targets.len()]; + let results = futures::stream::iter(targets.iter().enumerate()) + .map(|(idx, target)| { + let metrics = metrics.clone(); + async move { + let file = open_vortex_file_for_target(target, metrics).await?; + Ok::<_, anyhow::Error>((idx, file)) + } + }) + .buffer_unordered(concurrency.max(1)) + .try_collect::>() + .await?; + + for (idx, file) in results { + files[idx] = Some(file); + } + + files + .into_iter() + .map(|file| file.ok_or_else(|| anyhow::anyhow!("file open missing"))) + .collect() +} + +fn object_store_from_url( + url_str: &str, +) -> Result<(ObjectStoreScheme, std::sync::Arc, ObjectStorePath)> { + let url = Url::parse(url_str)?; + let (scheme, path) = ObjectStoreScheme::parse(&url).map_err(object_store::Error::from)?; + let store: std::sync::Arc = match scheme { + ObjectStoreScheme::Local => std::sync::Arc::new(LocalFileSystem::default()), + ObjectStoreScheme::AmazonS3 => { + std::sync::Arc::new(AmazonS3Builder::from_env().with_url(url_str).build()?) 
+ } + ObjectStoreScheme::Http => std::sync::Arc::new( + HttpBuilder::new() + .with_url(&url[..url::Position::BeforePath]) + .build()?, + ), + otherwise => anyhow::bail!("unsupported object store scheme: {otherwise:?}"), + }; + + Ok((scheme, store, path)) +} + +async fn read_all_segments( + file: &vortex::file::VortexFile, + concurrency: usize, +) -> Result<()> { + let layout = file.footer().layout().clone(); + let segment_ids = collect_segment_ids(&layout)?; + let segment_source = file.segment_source(); + + futures::stream::iter(segment_ids) + .map(|segment_id| { + let segment_source = segment_source.clone(); + async move { + let buffer = segment_source.request(segment_id).await?; + drop(buffer); + Ok::<_, anyhow::Error>(()) + } + }) + .buffer_unordered(concurrency.max(1)) + .try_collect::>() + .await?; + + Ok(()) +} + +#[derive(Clone)] +struct BenchLayoutReader { + inner: std::sync::Arc, + segment_pruning: bool, + bypass_filter: bool, +} + +impl BenchLayoutReader { + fn new( + inner: std::sync::Arc, + segment_pruning: bool, + bypass_filter: bool, + ) -> Self { + Self { + inner, + segment_pruning, + bypass_filter, + } + } +} + +impl LayoutReader for BenchLayoutReader { + fn name(&self) -> &std::sync::Arc { + self.inner.name() + } + + fn dtype(&self) -> &vortex::dtype::DType { + self.inner.dtype() + } + + fn row_count(&self) -> u64 { + self.inner.row_count() + } + + fn register_splits( + &self, + field_mask: &[vortex::dtype::FieldMask], + row_range: &std::ops::Range, + splits: &mut std::collections::BTreeSet, + ) -> VortexResult<()> { + self.inner.register_splits(field_mask, row_range, splits) + } + + fn pruning_evaluation( + &self, + row_range: &std::ops::Range, + expr: &Expression, + mask: Mask, + ) -> VortexResult { + if !self.segment_pruning { + return self.inner.pruning_evaluation(row_range, expr, mask); + } + + let len = mask.len(); + let fut = self.inner.pruning_evaluation(row_range, expr, mask)?; + Ok(MaskFuture::new(len, async move { + let mask = fut.await?; + if mask.all_false() { + Ok(mask) + } else { + Ok(Mask::new_true(len)) + } + })) + } + + fn filter_evaluation( + &self, + row_range: &std::ops::Range, + expr: &Expression, + mask: MaskFuture, + ) -> VortexResult { + if self.bypass_filter { + Ok(mask) + } else { + self.inner.filter_evaluation(row_range, expr, mask) + } + } + + fn projection_evaluation( + &self, + row_range: &std::ops::Range, + expr: &Expression, + mask: MaskFuture, + ) -> VortexResult>> + { + self.inner.projection_evaluation(row_range, expr, mask) + } +} diff --git a/vortex-layout/src/layout.rs b/vortex-layout/src/layout.rs index 368d7c6bdc8..f654eb612e1 100644 --- a/vortex-layout/src/layout.rs +++ b/vortex-layout/src/layout.rs @@ -23,6 +23,7 @@ use crate::LayoutReaderRef; use crate::VTable; use crate::display::DisplayLayoutTree; use crate::display::display_tree_with_segment_sizes; +use crate::layouts::flat::FlatVTable; use crate::segments::SegmentId; use crate::segments::SegmentSource; @@ -230,6 +231,30 @@ impl dyn Layout + '_ { } } +/// Collect all segment IDs that should be fetched for a layout tree. 
+pub fn collect_segment_ids(layout: &LayoutRef) -> VortexResult> { + let mut segment_ids = Vec::new(); + collect_segments_to_fetch(layout, &mut segment_ids)?; + segment_ids.sort(); + segment_ids.dedup(); + Ok(segment_ids) +} + +fn collect_segments_to_fetch(layout: &LayoutRef, segment_ids: &mut Vec) -> VortexResult<()> { + if let Some(flat_layout) = layout.as_opt::() { + if flat_layout.array_tree().is_none() { + segment_ids.push(flat_layout.segment_id()); + } + } else { + segment_ids.extend(layout.segment_ids()); + } + + for child in layout.children()? { + collect_segments_to_fetch(&child, segment_ids)?; + } + Ok(()) +} + /// Display the encoding, dtype, row count, and segment IDs of this layout. impl Display for dyn Layout + '_ { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { diff --git a/vortex-scan/src/scan_builder.rs b/vortex-scan/src/scan_builder.rs index a8b562d134b..7772eb68ea5 100644 --- a/vortex-scan/src/scan_builder.rs +++ b/vortex-scan/src/scan_builder.rs @@ -646,7 +646,7 @@ mod test { let mut values = Vec::new(); for chunk in &mut iter { - values.push(chunk?.to_primitive().into_buffer::()[0]); + values.push(chunk?.to_primitive().as_slice::()[0]); } assert_eq!(calls.load(Ordering::Relaxed), 1); From deea96b8fe79787bbb6f6ee6641b820be6f8d898 Mon Sep 17 00:00:00 2001 From: Onur Satici Date: Fri, 16 Jan 2026 13:55:26 +0000 Subject: [PATCH 02/12] clippy Signed-off-by: Onur Satici --- vortex-bench/src/bin/scan_io_bench.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-bench/src/bin/scan_io_bench.rs b/vortex-bench/src/bin/scan_io_bench.rs index 356d9902b25..86f61c720f2 100644 --- a/vortex-bench/src/bin/scan_io_bench.rs +++ b/vortex-bench/src/bin/scan_io_bench.rs @@ -424,7 +424,7 @@ async fn open_vortex_file_for_target( ScanTarget::Local(path) => Ok(session .open_options() .with_metrics(metrics) - .open(path.clone()) + .open_path(path) .await?), ScanTarget::ObjectStore { store, path } => { let path_str = path.to_string(); From 0170ff1ec54e40e2c02c98efcd6d449bd2cebf81 Mon Sep 17 00:00:00 2001 From: Onur Satici Date: Fri, 16 Jan 2026 14:56:56 +0000 Subject: [PATCH 03/12] repeats don't wait for the prev iteration Signed-off-by: Onur Satici --- vortex-bench/src/bin/scan_io_bench.rs | 55 ++++++++++----------------- 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/vortex-bench/src/bin/scan_io_bench.rs b/vortex-bench/src/bin/scan_io_bench.rs index 86f61c720f2..1a85004688f 100644 --- a/vortex-bench/src/bin/scan_io_bench.rs +++ b/vortex-bench/src/bin/scan_io_bench.rs @@ -149,22 +149,17 @@ async fn main() -> Result<()> { open_all_targets(&targets, metrics.clone(), args.file_concurrency).await?, )) }; - let mut total_rows = 0usize; - let mut total_elapsed = 0.0f64; - let mut total_bytes = 0i64; - let mut total_first_latency = 0.0f64; - let mut total_first_bytes = 0i64; - - for _ in 0..args.iterations { - read_bytes.clear(); - - let start = Instant::now(); - let bytes_before = read_bytes.count(); - let first_seen = std::sync::Arc::new(AtomicBool::new(false)); - let first_info = std::sync::Arc::new(Mutex::new(None::<(f64, i64)>)); - - let rows = futures::stream::iter(targets.iter().enumerate()) - .map(|(idx, target)| { + read_bytes.clear(); + + let start = Instant::now(); + let bytes_before = read_bytes.count(); + let first_seen = std::sync::Arc::new(AtomicBool::new(false)); + let first_info = std::sync::Arc::new(Mutex::new(None::<(f64, i64)>)); + let targets = targets.clone(); + + let rows = futures::stream::iter(0..args.iterations) + .flat_map(|_| 
futures::stream::iter(targets.clone().into_iter().enumerate())) + .map(|(idx, target)| { let cached_files = cached_files.clone(); let projection = projection.clone(); let filter = filter.clone(); @@ -176,7 +171,7 @@ async fn main() -> Result<()> { async move { let file = match &cached_files { Some(files) => files[idx].clone(), - None => open_vortex_file_for_target(target, metrics.clone()).await?, + None => open_vortex_file_for_target(&target, metrics.clone()).await?, }; if args.prune_segments @@ -253,23 +248,15 @@ async fn main() -> Result<()> { .try_fold(0usize, |rows, file_rows| async move { Ok(rows + file_rows) }) .await?; - let elapsed = start.elapsed().as_secs_f64(); - let bytes = read_bytes.count(); - - total_rows += rows; - total_elapsed += elapsed; - total_bytes += bytes; - let (iter_first_latency, iter_first_bytes) = - first_info.lock().unwrap_or((elapsed, read_bytes.count() - bytes_before)); - total_first_latency += iter_first_latency; - total_first_bytes += iter_first_bytes; - - } + let elapsed = start.elapsed().as_secs_f64(); + let bytes = read_bytes.count(); + let (first_latency, first_bytes) = + first_info.lock().unwrap_or((elapsed, read_bytes.count() - bytes_before)); - let avg_elapsed = total_elapsed / args.iterations as f64; - let avg_bytes = total_bytes as f64 / args.iterations as f64; - let avg_first_latency = total_first_latency / args.iterations as f64; - let avg_first_bytes = total_first_bytes as f64 / args.iterations as f64; + let avg_elapsed = elapsed / args.iterations as f64; + let avg_bytes = bytes as f64 / args.iterations as f64; + let avg_first_latency = first_latency / args.iterations as f64; + let avg_first_bytes = first_bytes as f64 / args.iterations as f64; let steady_bytes = (avg_bytes - avg_first_bytes).max(0.0); let steady_time = (avg_elapsed - avg_first_latency).max(0.0); let total_mb_s = if avg_elapsed > 0.0 { @@ -284,7 +271,7 @@ async fn main() -> Result<()> { }; println!("files={}", targets.len()); - println!("rows={}", total_rows / args.iterations); + println!("rows={}", rows / args.iterations); println!("avg_time_s={:.3}", avg_elapsed); println!("avg_bytes={:.0}", avg_bytes); println!("avg_mb_s={:.2}", total_mb_s); From 11fad54609518fa5b13b25f089bc2ad7c4204a3a Mon Sep 17 00:00:00 2001 From: Onur Satici Date: Tue, 20 Jan 2026 11:23:09 +0000 Subject: [PATCH 04/12] cuda pinned buffer pool Signed-off-by: Onur Satici --- Cargo.lock | 1 + vortex-cuda/Cargo.toml | 1 + vortex-cuda/src/lib.rs | 3 + vortex-cuda/src/pinned.rs | 121 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 126 insertions(+) create mode 100644 vortex-cuda/src/pinned.rs diff --git a/Cargo.lock b/Cargo.lock index 2e63f32b6c9..10c402b80a1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10268,6 +10268,7 @@ dependencies = [ "async-trait", "criterion", "cudarc", + "parking_lot", "tokio", "tracing", "vortex-array", diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml index cf822f1ec02..1aa405f8d58 100644 --- a/vortex-cuda/Cargo.toml +++ b/vortex-cuda/Cargo.toml @@ -19,6 +19,7 @@ workspace = true [dependencies] async-trait = { workspace = true } cudarc = { workspace = true } +parking_lot = { workspace = true } tracing = { workspace = true } vortex-array = { workspace = true } vortex-buffer = { workspace = true } diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs index ca798939493..1d87c8b8dc8 100644 --- a/vortex-cuda/src/lib.rs +++ b/vortex-cuda/src/lib.rs @@ -6,12 +6,15 @@ pub mod executor; mod for_; mod kernel; +pub mod pinned; mod session; use std::process::Command; pub 
use executor::CudaExecutionCtx; pub use executor::CudaKernelEvents; +pub use pinned::PinnedByteBuffer; +pub use pinned::PinnedByteBufferPool; use for_::ForExecutor; pub use session::CudaSession; diff --git a/vortex-cuda/src/pinned.rs b/vortex-cuda/src/pinned.rs new file mode 100644 index 00000000000..6e88cb4ec97 --- /dev/null +++ b/vortex-cuda/src/pinned.rs @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; + +use cudarc::driver::CudaContext; +use cudarc::driver::PinnedHostSlice; +use parking_lot::Mutex; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_utils::aliases::hash_map::HashMap; + +/// A page-locked host buffer allocated by CUDA. +/// +/// This is intended as a staging buffer for H2D transfers. Contents are uninitialized after +/// allocation. +pub struct PinnedByteBuffer { + inner: PinnedHostSlice, +} + +impl PinnedByteBuffer { + /// Allocate a pinned host buffer with uninitialized contents. + /// + /// # Safety + /// The returned buffer's contents are uninitialized. The caller must initialize before read. + pub unsafe fn uninit(ctx: &Arc, len: usize) -> VortexResult { + let inner = unsafe { + ctx.alloc_pinned::(len) + .map_err(|e| vortex_err!("failed to allocate pinned host buffer: {e}"))? + }; + Ok(Self { inner }) + } + + /// Returns the length of the buffer in bytes. + pub fn len(&self) -> usize { + self.inner.len() + } + + /// Returns true if the buffer is empty. + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + /// Returns the buffer as an immutable slice. + pub fn as_slice(&self) -> VortexResult<&[u8]> { + self.inner + .as_slice() + .map_err(|e| vortex_err!("failed to access pinned host buffer: {e}")) + } + + /// Returns the buffer as a mutable slice. + pub fn as_mut_slice(&mut self) -> VortexResult<&mut [u8]> { + self.inner + .as_mut_slice() + .map_err(|e| vortex_err!("failed to access pinned host buffer: {e}")) + } + + /// Returns a raw pointer to the buffer. + pub fn as_ptr(&self) -> VortexResult<*const u8> { + self.inner + .as_ptr() + .map_err(|e| vortex_err!("failed to access pinned host buffer: {e}")) + } + + /// Returns a mutable raw pointer to the buffer. + pub fn as_mut_ptr(&mut self) -> VortexResult<*mut u8> { + self.inner + .as_mut_ptr() + .map_err(|e| vortex_err!("failed to access pinned host buffer: {e}")) + } + + /// Returns the CUDA context that owns this allocation. + pub fn context(&self) -> &Arc { + self.inner.context() + } +} + +/// A simple pinned host buffer pool keyed by allocation size. +pub struct PinnedByteBufferPool { + ctx: Arc, + max_keep_per_size: usize, + buckets: Mutex>>, +} + +impl PinnedByteBufferPool { + /// Create a new pool with default limits. + pub fn new(ctx: Arc) -> Self { + Self::with_limits(ctx, 4) + } + + /// Create a new pool with a maximum number of cached buffers per size. + pub fn with_limits(ctx: Arc, max_keep_per_size: usize) -> Self { + Self { + ctx, + max_keep_per_size: max_keep_per_size.max(1), + buckets: Mutex::new(HashMap::new()), + } + } + + /// Acquire a pinned buffer of the given size in bytes. + pub fn get(&self, len: usize) -> VortexResult { + let mut buckets = self.buckets.lock(); + if let Some(bucket) = buckets.get_mut(&len) + && let Some(buf) = bucket.pop() + { + return Ok(buf); + } + unsafe { PinnedByteBuffer::uninit(&self.ctx, len) } + } + + /// Return a buffer to the pool. 
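+    ///
+    /// If the pool already holds `max_keep_per_size` buffers of this size, the buffer is
+    /// dropped (freeing its pinned allocation) instead of being cached. Illustrative
+    /// round-trip (not compiled here; assumes a valid `ctx: Arc<CudaContext>`):
+    ///
+    /// ```ignore
+    /// let pool = PinnedByteBufferPool::new(ctx);
+    /// let mut buf = pool.get(1 << 20)?;   // 1 MiB pinned staging buffer
+    /// buf.as_mut_slice()?.fill(0);        // contents are uninitialized until written
+    /// pool.put(buf)?;                     // cache the buffer for reuse
+    /// ```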
+ pub fn put(&self, buf: PinnedByteBuffer) -> VortexResult<()> { + let len = buf.len(); + let mut buckets = self.buckets.lock(); + let bucket = buckets.entry(len).or_default(); + if bucket.len() < self.max_keep_per_size { + bucket.push(buf); + } + Ok(()) + } +} From df4563990f5ea71aa6fddc0394950347d242a2fc Mon Sep 17 00:00:00 2001 From: Onur Satici Date: Tue, 20 Jan 2026 12:53:06 +0000 Subject: [PATCH 05/12] WriteTarget + read_at_into Signed-off-by: Onur Satici --- Cargo.lock | 2 + vortex-cuda/Cargo.toml | 1 + vortex-cuda/src/lib.rs | 1 + vortex-cuda/src/pinned.rs | 140 +++++++++++++++++++++++++++++ vortex-io/Cargo.toml | 1 + vortex-io/src/file/object_store.rs | 71 +++++++++++++++ vortex-io/src/file/std_file.rs | 20 +++++ vortex-io/src/lib.rs | 2 + vortex-io/src/read.rs | 61 +++++++++++++ vortex-io/src/write_target.rs | 36 ++++++++ 10 files changed, 335 insertions(+) create mode 100644 vortex-io/src/write_target.rs diff --git a/Cargo.lock b/Cargo.lock index 10c402b80a1..06e13137e03 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10266,6 +10266,7 @@ name = "vortex-cuda" version = "0.1.0" dependencies = [ "async-trait", + "bytes", "criterion", "cudarc", "parking_lot", @@ -10604,6 +10605,7 @@ dependencies = [ "tempfile", "tokio", "tracing", + "vortex-array", "vortex-buffer", "vortex-error", "vortex-metrics", diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml index 1aa405f8d58..ce3372b587c 100644 --- a/vortex-cuda/Cargo.toml +++ b/vortex-cuda/Cargo.toml @@ -18,6 +18,7 @@ workspace = true [dependencies] async-trait = { workspace = true } +bytes = { workspace = true } cudarc = { workspace = true } parking_lot = { workspace = true } tracing = { workspace = true } diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs index 1d87c8b8dc8..6ad59745fd4 100644 --- a/vortex-cuda/src/lib.rs +++ b/vortex-cuda/src/lib.rs @@ -15,6 +15,7 @@ pub use executor::CudaExecutionCtx; pub use executor::CudaKernelEvents; pub use pinned::PinnedByteBuffer; pub use pinned::PinnedByteBufferPool; +pub use pinned::PooledPinnedBuffer; use for_::ForExecutor; pub use session::CudaSession; diff --git a/vortex-cuda/src/pinned.rs b/vortex-cuda/src/pinned.rs index 6e88cb4ec97..5b5cbd28dc7 100644 --- a/vortex-cuda/src/pinned.rs +++ b/vortex-cuda/src/pinned.rs @@ -3,11 +3,14 @@ use std::sync::Arc; +use bytes::Bytes; use cudarc::driver::CudaContext; use cudarc::driver::PinnedHostSlice; use parking_lot::Mutex; +use vortex_buffer::ByteBuffer; use vortex_error::VortexResult; use vortex_error::vortex_err; +use vortex_error::vortex_panic; use vortex_utils::aliases::hash_map::HashMap; /// A page-locked host buffer allocated by CUDA. @@ -118,4 +121,141 @@ impl PinnedByteBufferPool { } Ok(()) } + + /// Get a pooled pinned buffer that will be returned to the pool on drop. + pub fn get_pooled(self: &Arc, len: usize) -> VortexResult { + let inner = self.get(len)?; + Ok(PooledPinnedBuffer { + inner: Some(inner), + pool: self.clone(), + }) + } +} + +/// A pinned buffer that is returned to its pool when dropped. +/// +/// This wrapper owns a [`PinnedByteBuffer`] and ensures it gets returned to the +/// [`PinnedByteBufferPool`] when the buffer is no longer needed. This enables efficient +/// buffer reuse for I/O operations. +pub struct PooledPinnedBuffer { + inner: Option, + pool: Arc, +} + +impl PooledPinnedBuffer { + /// Create a new pooled buffer. + pub fn new(inner: PinnedByteBuffer, pool: Arc) -> Self { + Self { + inner: Some(inner), + pool, + } + } + + /// Returns the length of the buffer in bytes. 
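+    ///
+    /// Returns 0 if the buffer has already been consumed.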
+ pub fn len(&self) -> usize { + self.inner.as_ref().map(|b| b.len()).unwrap_or(0) + } + + /// Returns true if the buffer is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the buffer as a mutable slice. + /// + /// # Panics + /// + /// Panics if the buffer has already been consumed or if the CUDA context is invalid. + pub fn as_mut_slice(&mut self) -> &mut [u8] { + let inner = self + .inner + .as_mut() + .unwrap_or_else(|| vortex_panic!("buffer already consumed")); + inner + .as_mut_slice() + .unwrap_or_else(|e| vortex_panic!("failed to access pinned host buffer: {e}")) + } + + /// Convert this pooled buffer into a [`ByteBuffer`]. + /// + /// The returned buffer will return the underlying pinned memory to the pool when dropped. + /// This enables zero-copy conversion to the standard Vortex buffer type while maintaining + /// pool-based memory reuse. + pub fn into_byte_buffer(mut self) -> ByteBuffer { + let inner = self + .inner + .take() + .unwrap_or_else(|| vortex_panic!("buffer already consumed")); + let len = inner.len(); + let pool = self.pool.clone(); + + // Create a wrapper that will return the buffer to the pool on drop + let wrapper = PooledPinnedBufferOwner::new(inner, pool); + + // Use Bytes::from_owner to create a Bytes that owns the wrapper + let bytes = Bytes::from_owner(wrapper); + + // The ByteBuffer should have the full length + assert_eq!(bytes.len(), len); + + ByteBuffer::from(bytes) + } +} + +impl Drop for PooledPinnedBuffer { + fn drop(&mut self) { + if let Some(inner) = self.inner.take() { + // Return the buffer to the pool, ignoring errors + drop(self.pool.put(inner)); + } + } +} + +/// Internal wrapper that owns a PinnedByteBuffer and returns it to the pool on drop. +/// +/// This is used by `Bytes::from_owner` to manage the lifecycle of pooled pinned buffers. +struct PooledPinnedBufferOwner { + // We use Option so we can take the buffer out in Drop + inner: Mutex>, + // Cached pointer and length for AsRef implementation + ptr: *const u8, + len: usize, + pool: Arc, +} + +// SAFETY: The pinned buffer is allocated by CUDA and is safe to send across threads. +// The pointer is derived from the buffer and remains valid as long as the buffer exists. +unsafe impl Send for PooledPinnedBufferOwner {} +unsafe impl Sync for PooledPinnedBufferOwner {} + +impl PooledPinnedBufferOwner { + fn new(inner: PinnedByteBuffer, pool: Arc) -> Self { + let ptr = inner + .as_ptr() + .unwrap_or_else(|e| vortex_panic!("failed to get pointer to pinned buffer: {e}")); + let len = inner.len(); + Self { + inner: Mutex::new(Some(inner)), + ptr, + len, + pool, + } + } +} + +impl AsRef<[u8]> for PooledPinnedBufferOwner { + fn as_ref(&self) -> &[u8] { + // SAFETY: The pointer and length were captured when the buffer was created + // and remain valid as long as this struct exists (buffer is in the Mutex). 
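+        // This assumes cudarc's pinned allocation is stable: moving the `PinnedHostSlice`
+        // handle does not move the underlying page-locked memory.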
+ unsafe { std::slice::from_raw_parts(self.ptr, self.len) } + } +} + +impl Drop for PooledPinnedBufferOwner { + fn drop(&mut self) { + // Take the buffer out and return it to the pool + if let Some(buffer) = self.inner.lock().take() { + drop(self.pool.put(buffer)); + } + } } diff --git a/vortex-io/Cargo.toml b/vortex-io/Cargo.toml index cef1c69e351..5a98bd28528 100644 --- a/vortex-io/Cargo.toml +++ b/vortex-io/Cargo.toml @@ -35,6 +35,7 @@ handle = "1.0.2" tokio = { workspace = true, features = ["io-util", "rt", "sync"] } tracing = { workspace = true } vortex-buffer = { workspace = true } +vortex-array = { workspace = true } vortex-error = { workspace = true } vortex-metrics = { workspace = true } vortex-session = { workspace = true } diff --git a/vortex-io/src/file/object_store.rs b/vortex-io/src/file/object_store.rs index 0d09cbdcd2b..30d8ba593f4 100644 --- a/vortex-io/src/file/object_store.rs +++ b/vortex-io/src/file/object_store.rs @@ -13,6 +13,7 @@ use object_store::GetRange; use object_store::GetResultPayload; use object_store::ObjectStore; use object_store::path::Path as ObjectPath; +use vortex_array::buffer::BufferHandle; use vortex_buffer::Alignment; use vortex_buffer::ByteBuffer; use vortex_buffer::ByteBufferMut; @@ -22,6 +23,7 @@ use vortex_error::vortex_ensure; use crate::CoalesceConfig; use crate::VortexReadAt; +use crate::WriteTarget; #[cfg(not(target_arch = "wasm32"))] use crate::file::std_file::read_exact_at; use crate::runtime::Handle; @@ -165,4 +167,73 @@ impl VortexReadAt for ObjectStoreSource { }) .boxed() } + + fn read_at_into( + &self, + offset: u64, + mut target: Box, + ) -> BoxFuture<'static, VortexResult> { + let store = self.store.clone(); + let path = self.path.clone(); + let handle = self.handle.clone(); + let length = target.len(); + let range = offset..(offset + length as u64); + + Compat::new(async move { + let response = store + .get_opts( + &path, + GetOptions { + range: Some(GetRange::Bounded(range.clone())), + ..Default::default() + }, + ) + .await?; + + match response.payload { + #[cfg(not(target_arch = "wasm32"))] + GetResultPayload::File(file, _) => { + target = handle + .spawn_blocking(move || { + let mut target = target; + read_exact_at(&file, target.as_mut_slice(), range.start)?; + Ok::<_, io::Error>(target) + }) + .await + .map_err(io::Error::other)?; + } + #[cfg(target_arch = "wasm32")] + GetResultPayload::File(..) 
=> { + unreachable!("File payload not supported on wasm32") + } + GetResultPayload::Stream(mut byte_stream) => { + let mut filled = 0usize; + while let Some(bytes) = byte_stream.next().await { + let bytes = bytes?; + let end = filled + bytes.len(); + vortex_ensure!( + end <= length, + "Object store stream returned more bytes than expected (expected {} bytes, got at least {} bytes, range: {:?})", + length, + end, + range + ); + target.as_mut_slice()[filled..end].copy_from_slice(&bytes); + filled = end; + } + + vortex_ensure!( + filled == length, + "Object store stream returned {} bytes but expected {} bytes (range: {:?})", + filled, + length, + range + ); + } + } + + Ok(target.into_handle()) + }) + .boxed() + } } diff --git a/vortex-io/src/file/std_file.rs b/vortex-io/src/file/std_file.rs index 56abd56eb60..3db3e4c24a5 100644 --- a/vortex-io/src/file/std_file.rs +++ b/vortex-io/src/file/std_file.rs @@ -15,6 +15,7 @@ use std::sync::Arc; use futures::FutureExt; use futures::future::BoxFuture; +use vortex_array::buffer::BufferHandle; use vortex_buffer::Alignment; use vortex_buffer::ByteBuffer; use vortex_buffer::ByteBufferMut; @@ -22,6 +23,7 @@ use vortex_error::VortexResult; use crate::CoalesceConfig; use crate::VortexReadAt; +use crate::WriteTarget; use crate::runtime::Handle; /// Read exactly `buffer.len()` bytes from `file` starting at `offset`. @@ -122,4 +124,22 @@ impl VortexReadAt for FileReadAdapter { } .boxed() } + + fn read_at_into( + &self, + offset: u64, + mut target: Box, + ) -> BoxFuture<'static, VortexResult> { + let file = self.file.clone(); + let handle = self.handle.clone(); + async move { + handle + .spawn_blocking(move || { + read_exact_at(&file, target.as_mut_slice(), offset)?; + Ok(target.into_handle()) + }) + .await + } + .boxed() + } } diff --git a/vortex-io/src/lib.rs b/vortex-io/src/lib.rs index 6a08c821c8f..029c8ea3bc8 100644 --- a/vortex-io/src/lib.rs +++ b/vortex-io/src/lib.rs @@ -15,6 +15,7 @@ pub use limit::*; #[cfg(feature = "object_store")] pub use object_store::*; pub use read::*; +pub use write_target::*; pub use write::*; pub mod file; @@ -24,6 +25,7 @@ mod limit; #[cfg(feature = "object_store")] mod object_store; mod read; +mod write_target; pub mod runtime; pub mod session; #[cfg(feature = "tokio")] diff --git a/vortex-io/src/read.rs b/vortex-io/src/read.rs index ed8ddecad39..1e235844568 100644 --- a/vortex-io/src/read.rs +++ b/vortex-io/src/read.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use futures::FutureExt; use futures::future::BoxFuture; +use vortex_array::buffer::BufferHandle; use vortex_buffer::Alignment; use vortex_buffer::ByteBuffer; use vortex_error::VortexExpect; @@ -15,6 +16,8 @@ use vortex_metrics::Histogram; use vortex_metrics::Timer; use vortex_metrics::VortexMetrics; +use crate::WriteTarget; + /// Configuration for coalescing nearby I/O requests into single operations. #[derive(Clone, Copy, Debug)] pub struct CoalesceConfig { @@ -81,6 +84,24 @@ pub trait VortexReadAt: Send + Sync + 'static { length: usize, alignment: Alignment, ) -> BoxFuture<'static, VortexResult>; + + /// Read into a pre-allocated target buffer. + /// + /// The default implementation reads into a temporary buffer and copies into the target. 
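+    ///
+    /// Implementations backed by local files or object stores can override this to write
+    /// directly into the target and skip the copy. Illustrative usage (not compiled here;
+    /// assumes `reader: impl VortexReadAt` and a zeroing `ByteBufferMut::zeroed`
+    /// constructor):
+    ///
+    /// ```ignore
+    /// let target = Box::new(ByteBufferMut::zeroed(4096));
+    /// let handle = reader.read_at_into(0, target).await?;
+    /// ```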
+ fn read_at_into( + &self, + offset: u64, + mut target: Box, + ) -> BoxFuture<'static, VortexResult> { + let len = target.len(); + let read_fut = self.read_at(offset, len, Alignment::none()); + async move { + let data = read_fut.await?; + target.as_mut_slice().copy_from_slice(data.as_ref()); + Ok(target.into_handle()) + } + .boxed() + } } impl VortexReadAt for Arc { @@ -108,6 +129,14 @@ impl VortexReadAt for Arc { ) -> BoxFuture<'static, VortexResult> { self.as_ref().read_at(offset, length, alignment) } + + fn read_at_into( + &self, + offset: u64, + target: Box, + ) -> BoxFuture<'static, VortexResult> { + self.as_ref().read_at_into(offset, target) + } } impl VortexReadAt for Arc { @@ -136,6 +165,14 @@ impl VortexReadAt for Arc { self.as_ref().read_at(offset, length, alignment) } + fn read_at_into( + &self, + offset: u64, + target: Box, + ) -> BoxFuture<'static, VortexResult> { + self.as_ref().read_at_into(offset, target) + } + // fn drive(self: Arc, requests: BoxStream<'static, IoRequest>) -> BoxFuture<'static, ()> { // // Delegate to the inner implementation's drive // let inner: Arc = Arc::clone(&self); @@ -176,6 +213,30 @@ impl VortexReadAt for ByteBuffer { } .boxed() } + + fn read_at_into( + &self, + offset: u64, + mut target: Box, + ) -> BoxFuture<'static, VortexResult> { + let buffer = self.clone(); + async move { + let start = usize::try_from(offset).vortex_expect("start too big for usize"); + let end = usize::try_from(offset + target.len() as u64) + .vortex_expect("end too big for usize"); + if end > buffer.len() { + vortex_bail!( + "Requested range {}..{} out of bounds for buffer of length {}", + start, + end, + buffer.len() + ); + } + target.as_mut_slice().copy_from_slice(&buffer.as_ref()[start..end]); + Ok(target.into_handle()) + } + .boxed() + } } /// A wrapper that instruments a [`VortexReadAt`] with metrics. diff --git a/vortex-io/src/write_target.rs b/vortex-io/src/write_target.rs new file mode 100644 index 00000000000..7ee4685335b --- /dev/null +++ b/vortex-io/src/write_target.rs @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::buffer::BufferHandle; +use vortex_buffer::ByteBufferMut; + +/// A destination for I/O reads that can be finalized into a [`BufferHandle`]. +pub trait WriteTarget: Send + 'static { + /// Returns the buffer as a mutable slice. + fn as_mut_slice(&mut self) -> &mut [u8]; + + /// Returns the length of the buffer in bytes. + fn len(&self) -> usize; + + /// Returns true if the buffer is empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Finalize the target into a buffer handle. 
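+    ///
+    /// Consumes the target; the `ByteBufferMut` implementation freezes the buffer into an
+    /// immutable host `BufferHandle` without copying.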
+ fn into_handle(self: Box) -> BufferHandle; +} + +impl WriteTarget for ByteBufferMut { + fn as_mut_slice(&mut self) -> &mut [u8] { + self.as_mut() + } + + fn len(&self) -> usize { + ByteBufferMut::len(self) + } + + fn into_handle(self: Box) -> BufferHandle { + BufferHandle::new_host(self.freeze()) + } +} From cf0312b0ce2c7a27a89b457a68da7a3d85da170b Mon Sep 17 00:00:00 2001 From: Onur Satici Date: Tue, 20 Jan 2026 13:26:39 +0000 Subject: [PATCH 06/12] BufferAllocator Signed-off-by: Onur Satici --- vortex-file/src/open.rs | 25 +++++++++++++----- vortex-file/src/segments/source.rs | 42 +++++++++++++++++++++++++++--- vortex-io/src/allocator.rs | 25 ++++++++++++++++++ vortex-io/src/lib.rs | 2 ++ 4 files changed, 85 insertions(+), 9 deletions(-) create mode 100644 vortex-io/src/allocator.rs diff --git a/vortex-file/src/open.rs b/vortex-file/src/open.rs index 03232856531..6a8b23aecef 100644 --- a/vortex-file/src/open.rs +++ b/vortex-file/src/open.rs @@ -12,6 +12,7 @@ use vortex_dtype::DType; use vortex_error::VortexError; use vortex_error::VortexExpect; use vortex_error::VortexResult; +use vortex_io::BufferAllocator; use vortex_io::InstrumentedReadAt; use vortex_io::VortexReadAt; use vortex_io::session::RuntimeSessionExt; @@ -53,6 +54,8 @@ pub struct VortexOpenOptions { footer: Option