Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion _typos.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ extend-ignore-re = [
]

[files]
extend-exclude = ["/vortex-bench/**", "/docs/references.bib", "benchmarks/**"]
extend-exclude = ["/vortex-bench/**", "/docs/references.bib", "benchmarks/**", "/vortex-array/src/expr/exprs/utf8.rs"]

[type.py]
extend-ignore-identifiers-re = [
Expand Down
91 changes: 91 additions & 0 deletions vortex-array/src/expr/exprs/like.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,17 @@ use crate::expr::ChildName;
use crate::expr::ExecutionArgs;
use crate::expr::ExprId;
use crate::expr::Expression;
use crate::expr::Literal;
use crate::expr::StatsCatalog;
use crate::expr::VTable;
use crate::expr::VTableExt;
use crate::expr::and;
use crate::expr::exprs::utf8::increment_utf8;
use crate::expr::gt;
use crate::expr::gt_eq;
use crate::expr::lit;
use crate::expr::lt;
use crate::expr::or;

/// Expression that performs SQL LIKE pattern matching.
///
/// This is a unit struct carrying no data of its own; its behavior comes from
/// the `VTable` implementation provided for it in this file.
pub struct Like;
Expand Down Expand Up @@ -153,6 +161,67 @@ impl VTable for Like {
/// Always returns `false`; the supplied options are ignored.
// NOTE(review): presumably this tells the expression framework that LIKE needs
// no special treatment of null inputs — confirm against the `VTable` trait docs.
fn is_null_sensitive(&self, _instance: &Self::Options) -> bool {
false
}

/// Builds a min/max pruning predicate for `col LIKE 'exact'` and
/// `col LIKE 'prefix%'`.
///
/// The returned expression is phrased over the column's min/max statistics and
/// is meant to hold only when no value in the column can match the pattern.
///
/// Returns `None` (no pruning) when:
/// - the LIKE is negated or case-insensitive,
/// - the pattern literal is null,
/// - the min or max statistic cannot be produced from `catalog`,
/// - the pattern shape is not one `LikeVariant` understands, or
/// - the prefix has no UTF-8 successor.
fn stat_falsification(
    &self,
    like_opts: &LikeOptions,
    expr: &Expression,
    catalog: &dyn StatsCatalog,
) -> Option<Expression> {
    // Only plain `LIKE` is supported: `ILIKE` and `NOT LIKE` are left alone.
    if like_opts.negated || like_opts.case_insensitive {
        return None;
    }

    // The second child holds the pattern.
    // NOTE(review): `as_` looks like it expects a `Literal` here — confirm what
    // happens for a non-literal pattern expression.
    let pattern_scalar = expr.child(1).as_::<Literal>();

    // A null pattern (`LIKE NULL`) is nonsensical, so bail out on it.
    let pattern = pattern_scalar.as_utf8().value()?;

    // The first child is the value being matched; both bounds are required.
    let column = expr.child(0).clone();
    let column_min = column.stat_min(catalog)?;
    let column_max = column.stat_max(catalog)?;

    let falsifier = match LikeVariant::from_str(&pattern)? {
        LikeVariant::Exact(text) => {
            // col LIKE 'exact' is impossible when 'exact' lies outside [min, max]:
            //   col.min > 'exact' || col.max < 'exact'
            let min_above_text = gt(column_min, lit(text));
            let max_below_text = lt(column_max, lit(text));
            or(min_above_text, max_below_text)
        }
        LikeVariant::Prefix(prefix) => {
            // col LIKE 'prefix%' is impossible when the whole range sits at or
            // beyond the prefix's successor, or entirely before the prefix:
            //   col.min >= 'prefiy' || col.max < 'prefix'
            let successor = increment_utf8(prefix)?;
            let min_past_prefix = gt_eq(column_min, lit(successor));
            let max_before_prefix = lt(column_max, lit(prefix));
            or(min_past_prefix, max_before_prefix)
        }
    };

    Some(falsifier)
}
}

/// Variants of the LIKE filter that we know how to turn into a stats pruning predicate.
#[derive(Debug, PartialEq)]
enum LikeVariant<'a> {
    /// The pattern contains no wildcard; the payload is the whole pattern.
    Exact(&'a str),
    /// The pattern has a non-empty literal prefix before its first wildcard;
    /// the payload is that prefix (anything after the wildcard is ignored).
    Prefix(&'a str),
}

impl<'a> LikeVariant<'a> {
    /// Parse a LIKE pattern string into its relevant variant.
    ///
    /// Returns:
    /// - `Some(Exact(pattern))` when the pattern has no `%` or `_`,
    /// - `Some(Prefix(prefix))` when the first wildcard is not at position 0,
    /// - `None` when the pattern starts with a wildcard or contains a
    ///   backslash.
    ///
    /// Returning `None` is always safe: it only means no pruning is attempted.
    fn from_str(string: &'a str) -> Option<Self> {
        // A backslash would make the text before the first `%`/`_` differ from
        // the text the pattern actually matches (e.g. `foo\%bar` matches the
        // value `foo%bar`, whose prefix is not `foo\`). Deriving bounds from it
        // could prune data that matches, so decline to classify such patterns.
        // NOTE(review): assumes the LIKE evaluator treats `\` as an escape
        // character, as Arrow's `like` kernel does — confirm.
        if string.contains('\\') {
            return None;
        }

        match string.find(['%', '_']) {
            // No wildcard at all: the pattern is an equality test.
            None => Some(LikeVariant::Exact(string)),
            // Can't handle a wildcard in the front: there is no prefix to bound.
            Some(0) => None,
            // `find` returns a byte offset on a char boundary, so slicing is safe.
            Some(wildcard_pos) => Some(LikeVariant::Prefix(&string[..wildcard_pos])),
        }
    }
}

pub fn like(child: Expression, pattern: Expression) -> Expression {
Expand Down Expand Up @@ -203,6 +272,7 @@ mod tests {
use crate::ToCanonical;
use crate::arrays::BoolArray;
use crate::expr::exprs::get_item::get_item;
use crate::expr::exprs::like::LikeVariant;
use crate::expr::exprs::like::like;
use crate::expr::exprs::like::not_ilike;
use crate::expr::exprs::literal::lit;
Expand Down Expand Up @@ -243,4 +313,25 @@ mod tests {
let expr2 = not_ilike(root(), lit("test*"));
assert_eq!(expr2.to_string(), "$ not ilike \"test*\"");
}

#[test]
fn test_like_variant() {
    // Each entry pairs a LIKE pattern with the variant it should parse to;
    // `None` marks patterns that are not supported for pruning.
    let cases = [
        // Supported patterns
        ("simple", Some(LikeVariant::Exact("simple"))),
        ("prefix%", Some(LikeVariant::Prefix("prefix"))),
        ("first%rest_stuff", Some(LikeVariant::Prefix("first"))),
        // Unsupported patterns
        ("%suffix", None),
        ("_pattern", None),
    ];

    for (pattern, expected) in cases {
        assert_eq!(LikeVariant::from_str(pattern), expected);
    }
}
}
1 change: 1 addition & 0 deletions vortex-array/src/expr/exprs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ pub(crate) mod operators;
pub(crate) mod pack;
pub(crate) mod root;
pub(crate) mod select;
mod utf8;

pub use between::*;
pub use binary::*;
Expand Down
113 changes: 113 additions & 0 deletions vortex-array/src/expr/exprs/utf8.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Licensed to Apache Software Foundation (ASF)

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// This file is derived from DataFusion code:
// https://github.com/apache/datafusion/blob/d90d0746d64bf6e91a81b3ec6954369bd0851bb2/datafusion/pruning/src/pruning_predicate.rs

/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
///
/// The returned string always compares greater than the input string and
/// greater than any other string that starts with the input.
/// This is necessary since the statistics may have been truncated: if we have a min statistic
/// of "fo" that may have originally been "foz" or anything else with the prefix "fo".
///
/// E.g. `increment_utf8("foo") > "foo"` and `increment_utf8("foo") > "fooz"`.
/// In this example `increment_utf8("foo") == "fop"`.
///
/// The last code point is bumped by one. If that is not possible (the next
/// scalar value does not exist, or is a filtered non-character) the code point
/// is dropped and the one before it is tried, and so on. `None` is returned
/// when no code point can be bumped, which includes the empty string.
pub(crate) fn increment_utf8(data: &str) -> Option<String> {
    // Non-characters in the Basic Multilingual Plane that are never produced
    // (https://www.unicode.org/versions/corrigendum9.html).
    fn is_filtered_noncharacter(c: char) -> bool {
        matches!(c, '\u{FDD0}'..='\u{FDEF}' | '\u{FFFE}' | '\u{FFFF}')
    }

    // Work backwards through the code points, keeping byte offsets so the
    // unchanged head of the string can be reused by slicing.
    data.char_indices().rev().find_map(|(idx, original)| {
        // `char::from_u32` yields `None` for surrogates and for values past
        // U+10FFFF, so no separate upper-bound check is needed.
        let next_char = char::from_u32(original as u32 + 1)
            .filter(|candidate| !is_filtered_noncharacter(*candidate))?;

        // Keep everything before this code point, then append its successor;
        // anything after it is truncated away.
        let mut incremented = String::with_capacity(idx + next_char.len_utf8());
        incremented.push_str(&data[..idx]);
        incremented.push(next_char);
        Some(incremented)
    })
}

#[cfg(test)]
mod tests {
    use crate::expr::exprs::utf8::increment_utf8;

    #[test]
    fn test_increment_utf8() {
        // Inputs that can be incremented, paired with the expected result.
        let incrementable = [
            // Basic ASCII
            ("abc", "abd"),
            ("abz", "ab{"),
            // Around ASCII 127 (DEL): 126 -> 127 and 127 -> 128
            ("~", "\u{7f}"),
            ("\u{7f}", "\u{80}"),
            // 2-byte UTF-8 sequence: U+00DF -> U+00E0
            ("ß", "à"),
            // 3-byte UTF-8 sequence: U+2123 -> U+2124
            ("℣", "ℤ"),
            // Encoded-length boundaries: 2-byte to 3-byte, 3-byte to 4-byte
            ("\u{7FF}", "\u{800}"),
            ("\u{FFFF}", "\u{10000}"),
            // When the last character can't be incremented, the previous one
            // is incremented and the rest is truncated:
            // - U+10FFFF is the max code point
            ("a\u{10FFFF}", "b"),
            // - U+D7FF is followed by the surrogate range (0xD800..=0xDFFF)
            ("a\u{D7FF}", "b"),
            // - U+FDCF is followed by the non-character range (0xFDD0..=0xFDEF)
            ("a\u{FDCF}", "b"),
        ];
        for (input, expected) in incrementable {
            assert_eq!(increment_utf8(input).unwrap(), expected);
        }

        // Inputs for which no incremented string exists.
        let not_incrementable = [
            // Nothing to increment
            "",
            // Can't increment past the max valid code point
            "\u{10FFFF}",
            // Next value is a surrogate and there is no earlier character
            "\u{D7FF}",
            // Next value is a non-character and there is no earlier character
            "\u{FDCF}",
        ];
        for input in not_incrementable {
            assert!(increment_utf8(input).is_none());
        }
    }
}
Loading