From 194c9648284871b04d52caa3fadc9c9f3d41dd23 Mon Sep 17 00:00:00 2001
From: Andrew Duffy
Date: Mon, 19 Jan 2026 16:47:56 -0800
Subject: [PATCH 1/3] feat: stats pruning for LIKE filters

Signed-off-by: Andrew Duffy
---
 vortex-array/src/expr/exprs/like.rs |  93 +++++++++++++++++++++++
 vortex-array/src/expr/exprs/mod.rs  |   1 +
 vortex-array/src/expr/exprs/utf8.rs | 113 ++++++++++++++++++++++++++++
 3 files changed, 207 insertions(+)
 create mode 100644 vortex-array/src/expr/exprs/utf8.rs

diff --git a/vortex-array/src/expr/exprs/like.rs b/vortex-array/src/expr/exprs/like.rs
index 43ea22226b3..3d83bf429c9 100644
--- a/vortex-array/src/expr/exprs/like.rs
+++ b/vortex-array/src/expr/exprs/like.rs
@@ -22,9 +22,17 @@ use crate::expr::ChildName;
 use crate::expr::ExecutionArgs;
 use crate::expr::ExprId;
 use crate::expr::Expression;
+use crate::expr::Literal;
+use crate::expr::StatsCatalog;
 use crate::expr::VTable;
 use crate::expr::VTableExt;
 use crate::expr::and;
+use crate::expr::exprs::utf8::increment_utf8;
+use crate::expr::gt;
+use crate::expr::gt_eq;
+use crate::expr::lit;
+use crate::expr::lt;
+use crate::expr::or;
 
 /// Expression that performs SQL LIKE pattern matching.
 pub struct Like;
@@ -153,6 +161,69 @@ impl VTable for Like {
     fn is_null_sensitive(&self, _instance: &Self::Options) -> bool {
         false
     }
+
+    fn stat_falsification(
+        &self,
+        like_opts: &LikeOptions,
+        expr: &Expression,
+        catalog: &dyn StatsCatalog,
+    ) -> Option<Expression> {
+        // Attempt to do min/max pruning for LIKE 'exact' or LIKE 'prefix%'
+
+        // Don't attempt to handle ilike or negated like
+        if like_opts.negated || like_opts.case_insensitive {
+            return None;
+        }
+
+        // Extract the pattern out
+        let pat = expr.child(1).as_::<Literal>();
+
+        // LIKE NULL is nonsensical, don't try to handle it
+        let Some(pat_str) = pat.as_utf8().value() else {
+            return None;
+        };
+
+        let src = expr.child(0).clone();
+        let src_min = src.stat_min(catalog)?;
+        let src_max = src.stat_max(catalog)?;
+
+        match LikeVariant::from_str(&pat_str)? {
+            LikeVariant::Exact(text) => {
+                // col LIKE 'exact' ==> col.min > 'exact' || col.max < 'exact'
+                Some(or(gt(src_min, lit(text)), lt(src_max, lit(text))))
+            }
+            LikeVariant::Prefix(prefix) => {
+                // col LIKE 'prefix%' ==> col.max < 'prefix' || col.min >= 'prefiy'
+                let succ = increment_utf8(prefix)?;
+
+                Some(or(gt_eq(src_min, lit(succ)), lt(src_max, lit(prefix))))
+            }
+        }
+    }
+}
+
+/// Variants of the LIKE filter that we know how to turn into a stats pruning predicate.
+#[derive(Debug, PartialEq)]
+enum LikeVariant<'a> {
+    Exact(&'a str),
+    Prefix(&'a str),
+}
+
+impl<'a> LikeVariant<'a> {
+    /// Parse a LIKE pattern; the literal text ends at the first `%`, `_` or `\` escape.
+    fn from_str(string: &str) -> Option<LikeVariant<'_>> {
+        let Some(wildcard_pos) = string.find(['%', '_', '\\']) else {
+            return Some(LikeVariant::Exact(string));
+        };
+
+        // No literal prefix to prune on if the pattern starts with a wildcard or an escape.
+        if wildcard_pos == 0 {
+            return None;
+        }
+
+        let prefix = &string[..wildcard_pos];
+        Some(LikeVariant::Prefix(prefix))
+    }
 }
 
 pub fn like(child: Expression, pattern: Expression) -> Expression {
@@ -203,6 +274,7 @@ mod tests {
     use crate::ToCanonical;
     use crate::arrays::BoolArray;
     use crate::expr::exprs::get_item::get_item;
+    use crate::expr::exprs::like::LikeVariant;
     use crate::expr::exprs::like::like;
     use crate::expr::exprs::like::not_ilike;
     use crate::expr::exprs::literal::lit;
@@ -243,4 +315,25 @@ mod tests {
         let expr2 = not_ilike(root(), lit("test*"));
         assert_eq!(expr2.to_string(), "$ not ilike \"test*\"");
     }
+
+    #[test]
+    fn test_like_variant() {
+        // Supported patterns
+        assert_eq!(
+            LikeVariant::from_str("simple"),
+            Some(LikeVariant::Exact("simple"))
+        );
+        assert_eq!(
+            LikeVariant::from_str("prefix%"),
+            Some(LikeVariant::Prefix("prefix"))
+        );
+        assert_eq!(
+            LikeVariant::from_str("first%rest_stuff"),
+            Some(LikeVariant::Prefix("first"))
+        );
+
+        // Unsupported patterns
+        assert_eq!(LikeVariant::from_str("%suffix"), None);
+        assert_eq!(LikeVariant::from_str("_pattern"), None);
+    }
 }
diff --git 
a/vortex-array/src/expr/exprs/mod.rs b/vortex-array/src/expr/exprs/mod.rs
index c606b53f5a0..dfe16c93c6b 100644
--- a/vortex-array/src/expr/exprs/mod.rs
+++ b/vortex-array/src/expr/exprs/mod.rs
@@ -17,6 +17,7 @@ pub(crate) mod operators;
 pub(crate) mod pack;
 pub(crate) mod root;
 pub(crate) mod select;
+mod utf8;
 
 pub use between::*;
 pub use binary::*;
diff --git a/vortex-array/src/expr/exprs/utf8.rs b/vortex-array/src/expr/exprs/utf8.rs
new file mode 100644
index 00000000000..8b03b20ef18
--- /dev/null
+++ b/vortex-array/src/expr/exprs/utf8.rs
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Licensed to Apache Software Foundation (ASF)
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This file is derived from DataFusion code:
+// https://github.com/apache/datafusion/blob/d90d0746d64bf6e91a81b3ec6954369bd0851bb2/datafusion/pruning/src/pruning_predicate.rs
+
+/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
+/// This makes it so that the returned string will always compare greater than the input string
+/// or any other string with the same prefix.
+/// This is necessary since the statistics may have been truncated: if we have a min statistic
+/// of "fo" that may have originally been "foz" or anything else with the prefix "fo".
+/// E.g. `increment_utf8("foo") > "foo"` and `increment_utf8("foo") > "fooz"`.
+/// In this example `increment_utf8("foo") == "fop"`.
+pub(crate) fn increment_utf8(data: &str) -> Option<String> {
+    // Helper function to check if a character is valid to use
+    fn is_valid_unicode(c: char) -> bool {
+        let cp = c as u32;
+
+        // Filter out BMP non-characters (https://www.unicode.org/versions/corrigendum9.html)
+        if [0xFFFE, 0xFFFF].contains(&cp) || (0xFDD0..=0xFDEF).contains(&cp) {
+            return false;
+        }
+
+        // Defensive only: values past the last code point (U+10FFFF) can never be a `char`
+        if cp >= 0x110000 {
+            return false;
+        }
+
+        true
+    }
+
+    // Convert string to vector of code points
+    let mut code_points: Vec<char> = data.chars().collect();
+
+    // Work backwards through code points
+    for idx in (0..code_points.len()).rev() {
+        let original = code_points[idx] as u32;
+
+        // Try incrementing the code point; `from_u32` rejects surrogates and out-of-range values
+        if let Some(next_char) = char::from_u32(original + 1)
+            && is_valid_unicode(next_char)
+        {
+            code_points[idx] = next_char;
+            // truncate the string to the current index
+            code_points.truncate(idx + 1);
+            return Some(code_points.into_iter().collect());
+        }
+    }
+
+    None
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::expr::exprs::utf8::increment_utf8;
+
+    #[test]
+    fn test_increment_utf8() {
+        // Basic ASCII
+        assert_eq!(increment_utf8("abc").unwrap(), "abd");
+        assert_eq!(increment_utf8("abz").unwrap(), "ab{");
+
+        // Test around ASCII 127 (DEL)
+        assert_eq!(increment_utf8("~").unwrap(), "\u{7f}"); // 126 -> 127
+        assert_eq!(increment_utf8("\u{7f}").unwrap(), "\u{80}"); // 127 -> 128
+
+        // Test 2-byte UTF-8 sequences
+        assert_eq!(increment_utf8("ß").unwrap(), "à"); // U+00DF -> U+00E0
+
+        // Test 3-byte UTF-8 sequences
+        assert_eq!(increment_utf8("℣").unwrap(), "ℤ"); // U+2123 -> U+2124
+
+        // Test at UTF-8 boundaries
+        assert_eq!(increment_utf8("\u{7FF}").unwrap(), "\u{800}"); // 2-byte to 3-byte boundary
+        assert_eq!(increment_utf8("\u{FFFF}").unwrap(), "\u{10000}"); // 3-byte to 4-byte boundary
+
+        // Test that if we can't increment we return None
+        assert!(increment_utf8("").is_none());
+        assert!(increment_utf8("\u{10FFFF}").is_none()); // U+10FFFF is the max code point
+
+        // Test that if we can't increment the last character we do the previous one and truncate
+        assert_eq!(increment_utf8("a\u{10FFFF}").unwrap(), "b");
+
+        // Test surrogate pair range (0xD800..=0xDFFF)
+        assert_eq!(increment_utf8("a\u{D7FF}").unwrap(), "b");
+        assert!(increment_utf8("\u{D7FF}").is_none());
+
+        // Test non-characters range (0xFDD0..=0xFDEF)
+        assert_eq!(increment_utf8("a\u{FDCF}").unwrap(), "b");
+        assert!(increment_utf8("\u{FDCF}").is_none());
+
+        // Test the last code point (U+10FFFF): nothing above it exists to increment to
+        assert_eq!(increment_utf8("a\u{10FFFF}").unwrap(), "b");
+        assert!(increment_utf8("\u{10FFFF}").is_none()); // Can't increment past max valid codepoint
+    }
+}

From f05164b54c1988744c06bb008a8bc629ea5442d7 Mon Sep 17 00:00:00 2001
From: Andrew Duffy
Date: Mon, 19 Jan 2026 17:14:33 -0800
Subject: [PATCH 2/3] clippy

Signed-off-by: Andrew Duffy
---
 vortex-array/src/expr/exprs/like.rs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/vortex-array/src/expr/exprs/like.rs b/vortex-array/src/expr/exprs/like.rs
index 3d83bf429c9..5b48dca7c15 100644
--- a/vortex-array/src/expr/exprs/like.rs
+++ b/vortex-array/src/expr/exprs/like.rs
@@ -179,9 +179,7 @@ impl VTable for Like {
         let pat = expr.child(1).as_::<Literal>();
 
         // LIKE NULL is nonsensical, don't try to handle it
-        let Some(pat_str) = pat.as_utf8().value() else {
-            return None;
-        };
+        let pat_str = pat.as_utf8().value()?;
 
         let src = expr.child(0).clone();
         let src_min = src.stat_min(catalog)?;

From fbff6c016080019d16f58d34dc5e31428ec59a2d Mon Sep 17 00:00:00 2001
From: Andrew Duffy
Date: Mon, 19 Jan 2026 17:15:16 -0800
Subject: [PATCH 3/3] ignore typos 
in vendored file Signed-off-by: Andrew Duffy --- _typos.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_typos.toml b/_typos.toml index b2af33e423b..a3b41745771 100644 --- a/_typos.toml +++ b/_typos.toml @@ -8,7 +8,7 @@ extend-ignore-re = [ ] [files] -extend-exclude = ["/vortex-bench/**", "/docs/references.bib", "benchmarks/**"] +extend-exclude = ["/vortex-bench/**", "/docs/references.bib", "benchmarks/**", "/vortex-array/src/expr/exprs/utf8.rs"] [type.py] extend-ignore-identifiers-re = [