diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 4c3ab66424f7..0fe6606abc95 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1252,9 +1252,11 @@ dependencies = [ "datafusion-common", "datafusion-execution", "datafusion-expr", + "datafusion-physical-expr", "hex", "itertools", "log", + "regex", ] [[package]] diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs index c857202c237e..cea701492910 100644 --- a/datafusion/core/tests/dataframe/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe/dataframe_functions.rs @@ -468,7 +468,7 @@ async fn test_fn_regexp_match() -> Result<()> { #[tokio::test] #[cfg(feature = "unicode_expressions")] async fn test_fn_regexp_replace() -> Result<()> { - let expr = regexp_replace(vec![col("a"), lit("[a-z]"), lit("x"), lit("g")]); + let expr = regexp_replace(col("a"), lit("[a-z]"), lit("x"), lit("g")); let expected = [ "+----------------------------------------------------------+", diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 6b3f2b956dfd..9cc7b4e855cb 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -218,9 +218,6 @@ pub enum BuiltinScalarFunction { OctetLength, /// random Random, - /// regexp_match - /// regexp_replace - RegexpReplace, /// repeat Repeat, /// replace @@ -417,7 +414,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::MD5 => Volatility::Immutable, BuiltinScalarFunction::OctetLength => Volatility::Immutable, BuiltinScalarFunction::Radians => Volatility::Immutable, - BuiltinScalarFunction::RegexpReplace => Volatility::Immutable, BuiltinScalarFunction::Repeat => Volatility::Immutable, BuiltinScalarFunction::Replace => Volatility::Immutable, BuiltinScalarFunction::Reverse => Volatility::Immutable, @@ -674,9 +670,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Pi => Ok(Float64), BuiltinScalarFunction::Random => Ok(Float64), BuiltinScalarFunction::Uuid => Ok(Utf8), - BuiltinScalarFunction::RegexpReplace => { - utf8_to_str_type(&input_expr_types[0], "regexp_replace") - } BuiltinScalarFunction::Repeat => { utf8_to_str_type(&input_expr_types[0], "repeat") } @@ -1161,14 +1154,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Replace | BuiltinScalarFunction::Translate => { Signature::one_of(vec![Exact(vec![Utf8, Utf8, Utf8])], self.volatility()) } - BuiltinScalarFunction::RegexpReplace => Signature::one_of( - vec![ - Exact(vec![Utf8, Utf8, Utf8]), - Exact(vec![Utf8, Utf8, Utf8, Utf8]), - ], - self.volatility(), - ), - BuiltinScalarFunction::Pi => Signature::exact(vec![], self.volatility()), BuiltinScalarFunction::Random => Signature::exact(vec![], self.volatility()), BuiltinScalarFunction::Uuid => Signature::exact(vec![], self.volatility()), @@ -1398,9 +1383,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::SubstrIndex => &["substr_index", "substring_index"], BuiltinScalarFunction::FindInSet => &["find_in_set"], - // regex functions - BuiltinScalarFunction::RegexpReplace => &["regexp_replace"], - // time/date functions BuiltinScalarFunction::Now => &["now"], BuiltinScalarFunction::CurrentDate => &["current_date", "today"], diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index ec53fd4ef1de..78546dddd589 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -822,11 +822,6 @@ nary_scalar_expr!( rpad, "fill up a string to the length by appending the characters" ); -nary_scalar_expr!( - RegexpReplace, - regexp_replace, - "replace strings that match a regular expression" -); nary_scalar_expr!( Btrim, btrim, @@ -1314,21 +1309,6 @@ mod test { test_scalar_expr!(Ltrim, ltrim, string); test_scalar_expr!(MD5, md5, string); test_scalar_expr!(OctetLength, octet_length, string); - test_nary_scalar_expr!( - RegexpReplace, - regexp_replace, - string, - pattern, - replacement - ); - test_nary_scalar_expr!( - RegexpReplace, - regexp_replace, - string, - pattern, - replacement, - flags - ); test_scalar_expr!(Replace, replace, string, from, to); test_scalar_expr!(Repeat, repeat, string, count); test_scalar_expr!(Reverse, reverse, string); diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 5ae436506c96..e14682820cc8 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -55,10 +55,11 @@ chrono = { workspace = true } datafusion-common = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } +datafusion-physical-expr = { workspace = true, default-features = true } hex = { version = "0.4", optional = true } itertools = { workspace = true } log = { workspace = true } - +regex = { version = "1.8" } [dev-dependencies] criterion = "0.5" rand = { workspace = true } diff --git a/datafusion/functions/benches/regx.rs b/datafusion/functions/benches/regx.rs index 390676f8f249..153cd4efe2c8 100644 --- a/datafusion/functions/benches/regx.rs +++ b/datafusion/functions/benches/regx.rs @@ -17,17 +17,18 @@ extern crate criterion; -use std::sync::Arc; - use arrow_array::builder::StringBuilder; use arrow_array::{ArrayRef, StringArray}; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use datafusion_functions::regex::regexplike::regexp_like; use datafusion_functions::regex::regexpmatch::regexp_match; +use datafusion_functions::regex::regexpreplace::regexp_replace; use rand::distributions::Alphanumeric; use rand::rngs::ThreadRng; use rand::seq::SliceRandom; use rand::Rng; +use std::iter; +use std::sync::Arc; fn data(rng: &mut ThreadRng) -> StringArray { let mut data: Vec = vec![]; for _ in 0..1000 { @@ -101,6 +102,42 @@ fn criterion_benchmark(c: &mut Criterion) { ) }) }); + + c.bench_function("regexp_match_1000", |b| { + let mut rng = rand::thread_rng(); + let data = Arc::new(data(&mut rng)) as ArrayRef; + let regex = Arc::new(regex(&mut rng)) as ArrayRef; + let flags = Arc::new(flags(&mut rng)) as ArrayRef; + + b.iter(|| { + black_box( + regexp_match::(&[data.clone(), regex.clone(), flags.clone()]) + .expect("regexp_match should work on valid values"), + ) + }) + }); + + c.bench_function("regexp_replace_1000", |b| { + let mut rng = rand::thread_rng(); + let data = Arc::new(data(&mut rng)) as ArrayRef; + let regex = Arc::new(regex(&mut rng)) as ArrayRef; + let flags = Arc::new(flags(&mut rng)) as ArrayRef; + let replacement = + Arc::new(StringArray::from_iter_values(iter::repeat("XX").take(1000))) + as ArrayRef; + + b.iter(|| { + black_box( + regexp_replace::(&[ + data.clone(), + regex.clone(), + replacement.clone(), + flags.clone(), + ]) + .expect("regexp_replace should work on valid values"), + ) + }) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/datafusion/functions/src/regex/mod.rs b/datafusion/functions/src/regex/mod.rs index 1e0c7799c6a5..5c12d4559e74 100644 --- a/datafusion/functions/src/regex/mod.rs +++ b/datafusion/functions/src/regex/mod.rs @@ -19,10 +19,15 @@ pub mod regexplike; pub mod regexpmatch; - +pub mod regexpreplace; // create UDFs make_udf_function!(regexpmatch::RegexpMatchFunc, REGEXP_MATCH, regexp_match); make_udf_function!(regexplike::RegexpLikeFunc, REGEXP_LIKE, regexp_like); +make_udf_function!( + regexpreplace::RegexpReplaceFunc, + REGEXP_REPLACE, + regexp_replace +); export_functions!(( regexp_match, input_arg1 input_arg2, @@ -31,4 +36,4 @@ export_functions!(( regexp_like, input_arg1 input_arg2, "Returns true if a has at least one match in a string,false otherwise." -)); +),(regexp_replace, arg1 arg2 arg3 arg4, "Replaces substrings in a string that match")); diff --git a/datafusion/physical-expr/src/regex_expressions.rs b/datafusion/functions/src/regex/regexpreplace.rs similarity index 73% rename from datafusion/physical-expr/src/regex_expressions.rs rename to datafusion/functions/src/regex/regexpreplace.rs index 99e6597dad82..d90996e04b3f 100644 --- a/datafusion/physical-expr/src/regex_expressions.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -15,112 +15,112 @@ // specific language governing permissions and limitations // under the License. -// Some of these functions reference the Postgres documentation -// or implementation to ensure compatibility and are subject to -// the Postgres license. - -//! Regex expressions - -use std::sync::{Arc, OnceLock}; - -use arrow::array::{ - new_null_array, Array, ArrayDataBuilder, ArrayRef, BufferBuilder, GenericStringArray, - OffsetSizeTrait, +//! Regx expressions +use arrow::array::new_null_array; +use arrow::array::ArrayDataBuilder; +use arrow::array::BufferBuilder; +use arrow::array::GenericStringArray; +use arrow::array::{Array, ArrayRef, OffsetSizeTrait}; +use arrow::datatypes::DataType; +use datafusion_common::exec_err; +use datafusion_common::plan_err; +use datafusion_common::ScalarValue; +use datafusion_common::{ + cast::as_generic_string_array, internal_err, DataFusionError, Result, }; -use hashbrown::HashMap; +use datafusion_expr::ColumnarValue; +use datafusion_expr::TypeSignature::*; +use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use datafusion_physical_expr::functions::Hint; use regex::Regex; +use std::any::Any; +use std::collections::HashMap; +use std::sync::Arc; +use std::sync::OnceLock; +#[derive(Debug)] +pub(super) struct RegexpReplaceFunc { + signature: Signature, +} +impl RegexpReplaceFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![ + Exact(vec![Utf8, Utf8, Utf8]), + Exact(vec![Utf8, Utf8, Utf8, Utf8]), + ], + Volatility::Immutable, + ), + } + } +} -use datafusion_common::{arrow_datafusion_err, exec_err, plan_err}; -use datafusion_common::{cast::as_generic_string_array, DataFusionError, Result}; -use datafusion_expr::{ColumnarValue, ScalarFunctionImplementation}; +impl ScalarUDFImpl for RegexpReplaceFunc { + fn as_any(&self) -> &dyn Any { + self + } -use crate::functions::{ - make_scalar_function_inner, make_scalar_function_with_hints, Hint, -}; + fn name(&self) -> &str { + "regexp_replace" + } -/// Get the first argument from the given string array. -/// -/// Note: If the array is empty or the first argument is null, -/// then calls the given early abort function. -macro_rules! fetch_string_arg { - ($ARG:expr, $NAME:expr, $T:ident, $EARLY_ABORT:ident) => {{ - let array = as_generic_string_array::($ARG)?; - if array.len() == 0 || array.is_null(0) { - return $EARLY_ABORT(array); + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + use DataType::*; + Ok(match &arg_types[0] { + LargeUtf8 | LargeBinary => LargeUtf8, + Utf8 | Binary => Utf8, + Null => Null, + Dictionary(_, t) => match **t { + LargeUtf8 | LargeBinary => LargeUtf8, + Utf8 | Binary => Utf8, + Null => Null, + _ => { + return plan_err!( + "the regexp_replace can only accept strings but got {:?}", + **t + ); + } + }, + other => { + return plan_err!( + "The regexp_replace function can only accept strings. Got {other}" + ); + } + }) + } + fn invoke(&self, args: &[ColumnarValue]) -> Result { + let len = args + .iter() + .fold(Option::::None, |acc, arg| match arg { + ColumnarValue::Scalar(_) => acc, + ColumnarValue::Array(a) => Some(a.len()), + }); + + let is_scalar = len.is_none(); + let result = regexp_replace_func(args); + if is_scalar { + // If all inputs are scalar, keeps output as scalar + let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0)); + result.map(ColumnarValue::Scalar) } else { - array.value(0) + result.map(ColumnarValue::Array) } - }}; + } } - -/// Extract a specific group from a string column, using a regular expression. -/// -/// The full list of supported features and syntax can be found at -/// -/// -/// Supported flags can be found at -/// -/// -/// # Examples -/// -/// ```ignore -/// # use datafusion::prelude::*; -/// # use datafusion::error::Result; -/// # #[tokio::main] -/// # async fn main() -> Result<()> { -/// let ctx = SessionContext::new(); -/// let df = ctx.read_csv("tests/data/regex.csv", CsvReadOptions::new()).await?; -/// -/// // use the regexp_match function to test col 'values', -/// // against patterns in col 'patterns' without flags -/// let df = df.with_column( -/// "a", -/// regexp_match(vec![col("values"), col("patterns")]) -/// )?; -/// // use the regexp_match function to test col 'values', -/// // against patterns in col 'patterns' with flags -/// let df = df.with_column( -/// "b", -/// regexp_match(vec![col("values"), col("patterns"), col("flags")]), -/// )?; -/// -/// // literals can be used as well with dataframe calls -/// let df = df.with_column( -/// "c", -/// regexp_match(vec![lit("foobarbequebaz"), lit("(bar)(beque)")]), -/// )?; -/// -/// df.show().await?; -/// -/// # Ok(()) -/// # } -/// ``` -pub fn regexp_match(args: &[ArrayRef]) -> Result { - match args.len() { - 2 => { - let values = as_generic_string_array::(&args[0])?; - let regex = as_generic_string_array::(&args[1])?; - arrow_string::regexp::regexp_match(values, regex, None) - .map_err(|e| arrow_datafusion_err!(e)) - } - 3 => { - let values = as_generic_string_array::(&args[0])?; - let regex = as_generic_string_array::(&args[1])?; - let flags = as_generic_string_array::(&args[2])?; - - if flags.iter().any(|s| s == Some("g")) { - return plan_err!("regexp_match() does not support the \"global\" option") - } - - arrow_string::regexp::regexp_match(values, regex, Some(flags)) - .map_err(|e| arrow_datafusion_err!(e)) +fn regexp_replace_func(args: &[ColumnarValue]) -> Result { + match args[0].data_type() { + DataType::Utf8 => specialize_regexp_replace::(args), + DataType::LargeUtf8 => specialize_regexp_replace::(args), + other => { + internal_err!("Unsupported data type {other:?} for function regexp_replace") } - other => exec_err!( - "regexp_match was called with {other} arguments. It requires at least 2 and at most 3." - ), } } - /// replace POSIX capture groups (like \1) with Rust Regex group (like ${1}) /// used by regexp_replace fn regex_replace_posix_groups(replacement: &str) -> String { @@ -283,7 +283,20 @@ fn _regexp_replace_early_abort( // Also acts like an early abort mechanism when the input array is empty. Ok(new_null_array(input_array.data_type(), input_array.len())) } - +/// Get the first argument from the given string array. +/// +/// Note: If the array is empty or the first argument is null, +/// then calls the given early abort function. +macro_rules! fetch_string_arg { + ($ARG:expr, $NAME:expr, $T:ident, $EARLY_ABORT:ident) => {{ + let array = as_generic_string_array::($ARG)?; + if array.len() == 0 || array.is_null(0) { + return $EARLY_ABORT(array); + } else { + array.value(0) + } + }}; +} /// Special cased regex_replace implementation for the scenario where /// the pattern, replacement and flags are static (arrays that are derived /// from scalars). This means we can skip regex caching system and basically @@ -358,7 +371,7 @@ fn _regexp_replace_static_pattern_replace( /// on the given set of arguments. pub fn specialize_regexp_replace( args: &[ColumnarValue], -) -> Result { +) -> Result { // This will serve as a dispatch table where we can // leverage it in order to determine whether the scalarity // of the given set of arguments fits a better specialized @@ -371,7 +384,13 @@ pub fn specialize_regexp_replace( // it is not available, we'll claim that it is scalar. matches!(args.get(3), Some(ColumnarValue::Scalar(_)) | None), ); - + let len = args + .iter() + .fold(Option::::None, |acc, arg| match arg { + ColumnarValue::Scalar(_) => acc, + ColumnarValue::Array(a) => Some(a.len()), + }); + let inferred_length = len.unwrap_or(1); match ( is_source_scalar, is_pattern_scalar, @@ -391,92 +410,46 @@ pub fn specialize_regexp_replace( // we will create many regexes and it is best to use the implementation // that caches it. If there are no flags, we can simply ignore it here, // and let the specialized function handle it. - (_, true, true, true) => Ok(make_scalar_function_with_hints( - _regexp_replace_static_pattern_replace::, - vec![ + (_, true, true, true) => { + let hints = [ Hint::Pad, Hint::AcceptsSingular, Hint::AcceptsSingular, Hint::AcceptsSingular, - ], - )), + ]; + let args = args + .iter() + .zip(hints.iter().chain(std::iter::repeat(&Hint::Pad))) + .map(|(arg, hint)| { + // Decide on the length to expand this scalar to depending + // on the given hints. + let expansion_len = match hint { + Hint::AcceptsSingular => 1, + Hint::Pad => inferred_length, + }; + arg.clone().into_array(expansion_len) + }) + .collect::>>()?; + _regexp_replace_static_pattern_replace::(&args) + } // If there are no specialized implementations, we'll fall back to the // generic implementation. - (_, _, _, _) => Ok(make_scalar_function_inner(regexp_replace::)), + (_, _, _, _) => { + let args = args + .iter() + .map(|arg| arg.clone().into_array(inferred_length)) + .collect::>>()?; + regexp_replace::(&args) + } } } - #[cfg(test)] mod tests { use arrow::array::*; - use datafusion_common::ScalarValue; - use super::*; - #[test] - fn test_case_sensitive_regexp_match() { - let values = StringArray::from(vec!["abc"; 5]); - let patterns = - StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"]); - - let elem_builder: GenericStringBuilder = GenericStringBuilder::new(); - let mut expected_builder = ListBuilder::new(elem_builder); - expected_builder.values().append_value("a"); - expected_builder.append(true); - expected_builder.append(false); - expected_builder.values().append_value("b"); - expected_builder.append(true); - expected_builder.append(false); - expected_builder.append(false); - let expected = expected_builder.finish(); - - let re = regexp_match::(&[Arc::new(values), Arc::new(patterns)]).unwrap(); - - assert_eq!(re.as_ref(), &expected); - } - - #[test] - fn test_case_insensitive_regexp_match() { - let values = StringArray::from(vec!["abc"; 5]); - let patterns = - StringArray::from(vec!["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"]); - let flags = StringArray::from(vec!["i"; 5]); - - let elem_builder: GenericStringBuilder = GenericStringBuilder::new(); - let mut expected_builder = ListBuilder::new(elem_builder); - expected_builder.values().append_value("a"); - expected_builder.append(true); - expected_builder.values().append_value("a"); - expected_builder.append(true); - expected_builder.values().append_value("b"); - expected_builder.append(true); - expected_builder.values().append_value("b"); - expected_builder.append(true); - expected_builder.append(false); - let expected = expected_builder.finish(); - - let re = - regexp_match::(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)]) - .unwrap(); - - assert_eq!(re.as_ref(), &expected); - } - - #[test] - fn test_unsupported_global_flag_regexp_match() { - let values = StringArray::from(vec!["abc"]); - let patterns = StringArray::from(vec!["^(a)"]); - let flags = StringArray::from(vec!["g"]); - - let re_err = - regexp_match::(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)]) - .expect_err("unsupported flag should have failed"); - - assert_eq!(re_err.strip_backtrace(), "Error during planning: regexp_match() does not support the \"global\" option"); - } - #[test] fn test_static_pattern_regexp_replace() { let values = StringArray::from(vec!["abc"; 5]); @@ -587,39 +560,6 @@ mod tests { ); } - #[test] - fn test_regexp_can_specialize_all_cases() { - macro_rules! make_scalar { - () => { - ColumnarValue::Scalar(ScalarValue::Utf8(Some("foo".to_string()))) - }; - } - - macro_rules! make_array { - () => { - ColumnarValue::Array( - Arc::new(StringArray::from(vec!["bar"; 2])) as ArrayRef - ) - }; - } - - for source in [make_scalar!(), make_array!()] { - for pattern in [make_scalar!(), make_array!()] { - for replacement in [make_scalar!(), make_array!()] { - for flags in [Some(make_scalar!()), Some(make_array!()), None] { - let mut args = - vec![source.clone(), pattern.clone(), replacement.clone()]; - if let Some(flags) = flags { - args.push(flags.clone()); - } - let regex_func = specialize_regexp_replace::(&args); - assert!(regex_func.is_ok()); - } - } - } - } - } - #[test] fn test_static_pattern_regexp_replace_with_null_buffers() { let values = StringArray::from(vec![ diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 80edb91c741c..0dd6fd2a0710 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -90,10 +90,6 @@ name = "in_list" harness = false name = "make_date" -[[bench]] -harness = false -name = "regexp" - [[bench]] harness = false name = "to_char" diff --git a/datafusion/physical-expr/benches/regexp.rs b/datafusion/physical-expr/benches/regexp.rs deleted file mode 100644 index 32acd6ca8f28..000000000000 --- a/datafusion/physical-expr/benches/regexp.rs +++ /dev/null @@ -1,115 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -extern crate criterion; - -use std::iter; -use std::sync::Arc; - -use arrow_array::builder::StringBuilder; -use arrow_array::{ArrayRef, StringArray}; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use datafusion_physical_expr::regex_expressions::{regexp_match, regexp_replace}; -use rand::distributions::Alphanumeric; -use rand::rngs::ThreadRng; -use rand::seq::SliceRandom; -use rand::Rng; -fn data(rng: &mut ThreadRng) -> StringArray { - let mut data: Vec = vec![]; - for _ in 0..1000 { - data.push( - rng.sample_iter(&Alphanumeric) - .take(7) - .map(char::from) - .collect(), - ); - } - - StringArray::from(data) -} - -fn regex(rng: &mut ThreadRng) -> StringArray { - let samples = vec![ - ".*([A-Z]{1}).*".to_string(), - "^(A).*".to_string(), - r#"[\p{Letter}-]+"#.to_string(), - r#"[\p{L}-]+"#.to_string(), - "[a-zA-Z]_[a-zA-Z]{2}".to_string(), - ]; - let mut data: Vec = vec![]; - for _ in 0..1000 { - data.push(samples.choose(rng).unwrap().to_string()); - } - - StringArray::from(data) -} - -fn flags(rng: &mut ThreadRng) -> StringArray { - let samples = vec![Some("i".to_string()), Some("im".to_string()), None]; - let mut sb = StringBuilder::new(); - for _ in 0..1000 { - let sample = samples.choose(rng).unwrap(); - if sample.is_some() { - sb.append_value(sample.clone().unwrap()); - } else { - sb.append_null(); - } - } - - sb.finish() -} - -fn criterion_benchmark(c: &mut Criterion) { - c.bench_function("regexp_match_1000", |b| { - let mut rng = rand::thread_rng(); - let data = Arc::new(data(&mut rng)) as ArrayRef; - let regex = Arc::new(regex(&mut rng)) as ArrayRef; - let flags = Arc::new(flags(&mut rng)) as ArrayRef; - - b.iter(|| { - black_box( - regexp_match::(&[data.clone(), regex.clone(), flags.clone()]) - .expect("regexp_match should work on valid values"), - ) - }) - }); - - c.bench_function("regexp_replace_1000", |b| { - let mut rng = rand::thread_rng(); - let data = Arc::new(data(&mut rng)) as ArrayRef; - let regex = Arc::new(regex(&mut rng)) as ArrayRef; - let flags = Arc::new(flags(&mut rng)) as ArrayRef; - let replacement = - Arc::new(StringArray::from_iter_values(iter::repeat("XX").take(1000))) - as ArrayRef; - - b.iter(|| { - black_box( - regexp_replace::(&[ - data.clone(), - regex.clone(), - replacement.clone(), - flags.clone(), - ]) - .expect("regexp_replace should work on valid values"), - ) - }) - }); -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 81013882ad89..abc80a75c2b9 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -117,14 +117,6 @@ macro_rules! invoke_on_array_if_regex_expressions_feature_flag { }; } -#[cfg(feature = "regex_expressions")] -macro_rules! invoke_on_columnar_value_if_regex_expressions_feature_flag { - ($FUNC:ident, $T:tt, $NAME:expr) => {{ - use crate::regex_expressions; - regex_expressions::$FUNC::<$T> - }}; -} - #[cfg(not(feature = "regex_expressions"))] macro_rules! invoke_on_columnar_value_if_regex_expressions_feature_flag { ($FUNC:ident, $T:tt, $NAME:expr) => { @@ -158,7 +150,7 @@ macro_rules! invoke_if_unicode_expressions_feature_flag { } #[derive(Debug, Clone, Copy)] -pub(crate) enum Hint { +pub enum Hint { /// Indicates the argument needs to be padded if it is scalar Pad, /// Indicates the argument can be converted to an array of length 1 @@ -552,31 +544,6 @@ pub fn create_physical_fun( _ => unreachable!(), }, }), - BuiltinScalarFunction::RegexpReplace => { - Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let specializer_func = invoke_on_columnar_value_if_regex_expressions_feature_flag!( - specialize_regexp_replace, - i32, - "regexp_replace" - ); - let func = specializer_func(args)?; - func(args) - } - DataType::LargeUtf8 => { - let specializer_func = invoke_on_columnar_value_if_regex_expressions_feature_flag!( - specialize_regexp_replace, - i64, - "regexp_replace" - ); - let func = specializer_func(args)?; - func(args) - } - other => exec_err!( - "Unsupported data type {other:?} for function regexp_replace" - ), - }) - } BuiltinScalarFunction::Repeat => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function_inner(string_expressions::repeat::)(args) @@ -1677,131 +1644,6 @@ mod tests { Int32, Int32Array ); - #[cfg(feature = "regex_expressions")] - test_function!( - RegexpReplace, - &[lit("Thomas"), lit(".[mN]a."), lit("M"),], - Ok(Some("ThM")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "regex_expressions")] - test_function!( - RegexpReplace, - &[lit("foobarbaz"), lit("b.."), lit("X"),], - Ok(Some("fooXbaz")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "regex_expressions")] - test_function!( - RegexpReplace, - &[lit("foobarbaz"), lit("b.."), lit("X"), lit("g"),], - Ok(Some("fooXX")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "regex_expressions")] - test_function!( - RegexpReplace, - &[lit("foobarbaz"), lit("b(..)"), lit("X\\1Y"), lit("g"),], - Ok(Some("fooXarYXazY")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "regex_expressions")] - test_function!( - RegexpReplace, - &[ - lit(ScalarValue::Utf8(None)), - lit("b(..)"), - lit("X\\1Y"), - lit("g"), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "regex_expressions")] - test_function!( - RegexpReplace, - &[ - lit("foobarbaz"), - lit(ScalarValue::Utf8(None)), - lit("X\\1Y"), - lit("g"), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "regex_expressions")] - test_function!( - RegexpReplace, - &[ - lit("foobarbaz"), - lit("b(..)"), - lit(ScalarValue::Utf8(None)), - lit("g"), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "regex_expressions")] - test_function!( - RegexpReplace, - &[ - lit("foobarbaz"), - lit("b(..)"), - lit("X\\1Y"), - lit(ScalarValue::Utf8(None)), - ], - Ok(None), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "regex_expressions")] - test_function!( - RegexpReplace, - &[lit("ABCabcABC"), lit("(abc)"), lit("X"), lit("gi"),], - Ok(Some("XXX")), - &str, - Utf8, - StringArray - ); - #[cfg(feature = "regex_expressions")] - test_function!( - RegexpReplace, - &[lit("ABCabcABC"), lit("(abc)"), lit("X"), lit("i"),], - Ok(Some("XabcABC")), - &str, - Utf8, - StringArray - ); - #[cfg(not(feature = "regex_expressions"))] - test_function!( - RegexpReplace, - &[ - lit("foobarbaz"), - lit("b.."), - lit("X"), - ], - internal_err!( - "function regexp_replace requires compilation with feature flag: regex_expressions." - ), - &str, - Utf8, - StringArray - ); test_function!( Repeat, &[lit("Pg"), lit(ScalarValue::Int64(Some(4))),], diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index 41d36d8bcbed..125da4a2b9c2 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -32,8 +32,6 @@ pub mod math_expressions; mod partitioning; mod physical_expr; pub mod planner; -#[cfg(feature = "regex_expressions")] -pub mod regex_expressions; mod scalar_function; mod sort_expr; pub mod sort_properties; diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index c47b9abadb0e..6700877f2610 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -587,7 +587,7 @@ enum ScalarFunction { // 36 was NullIf OctetLength = 37; Random = 38; - RegexpReplace = 39; + // 39 was RegexpReplace Repeat = 40; Replace = 41; Reverse = 42; diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index c9be1bb7f3e5..3415574c1547 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22356,7 +22356,6 @@ impl serde::Serialize for ScalarFunction { Self::Md5 => "MD5", Self::OctetLength => "OctetLength", Self::Random => "Random", - Self::RegexpReplace => "RegexpReplace", Self::Repeat => "Repeat", Self::Replace => "Replace", Self::Reverse => "Reverse", @@ -22485,7 +22484,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "MD5", "OctetLength", "Random", - "RegexpReplace", "Repeat", "Replace", "Reverse", @@ -22643,7 +22641,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "MD5" => Ok(ScalarFunction::Md5), "OctetLength" => Ok(ScalarFunction::OctetLength), "Random" => Ok(ScalarFunction::Random), - "RegexpReplace" => Ok(ScalarFunction::RegexpReplace), "Repeat" => Ok(ScalarFunction::Repeat), "Replace" => Ok(ScalarFunction::Replace), "Reverse" => Ok(ScalarFunction::Reverse), diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 4d19b79a3b2c..c12d04f33f9e 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2676,7 +2676,7 @@ pub enum ScalarFunction { /// 36 was NullIf OctetLength = 37, Random = 38, - RegexpReplace = 39, + /// 39 was RegexpReplace Repeat = 40, Replace = 41, Reverse = 42, @@ -2817,7 +2817,6 @@ impl ScalarFunction { ScalarFunction::Md5 => "MD5", ScalarFunction::OctetLength => "OctetLength", ScalarFunction::Random => "Random", - ScalarFunction::RegexpReplace => "RegexpReplace", ScalarFunction::Repeat => "Repeat", ScalarFunction::Replace => "Replace", ScalarFunction::Reverse => "Reverse", @@ -2940,7 +2939,6 @@ impl ScalarFunction { "MD5" => Some(Self::Md5), "OctetLength" => Some(Self::OctetLength), "Random" => Some(Self::Random), - "RegexpReplace" => Some(Self::RegexpReplace), "Repeat" => Some(Self::Repeat), "Replace" => Some(Self::Replace), "Reverse" => Some(Self::Reverse), diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index aee53849c806..fb056c78291a 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -61,12 +61,12 @@ use datafusion_expr::{ left, levenshtein, ln, log, log10, log2, logical_plan::{PlanType, StringifiedPlan}, lower, lpad, ltrim, md5, nanvl, now, octet_length, overlay, pi, power, radians, - random, regexp_replace, repeat, replace, reverse, right, round, rpad, rtrim, sha224, - sha256, sha384, sha512, signum, sin, sinh, split_part, sqrt, starts_with, - string_to_array, strpos, struct_fun, substr, substr_index, substring, tan, tanh, - to_hex, translate, trim, trunc, upper, uuid, AggregateFunction, Between, BinaryExpr, - BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, - GetIndexedField, GroupingSet, + random, repeat, replace, reverse, right, round, rpad, rtrim, sha224, sha256, sha384, + sha512, signum, sin, sinh, split_part, sqrt, starts_with, string_to_array, strpos, + struct_fun, substr, substr_index, substring, tan, tanh, to_hex, translate, trim, + trunc, upper, uuid, AggregateFunction, Between, BinaryExpr, BuiltInWindowFunction, + BuiltinScalarFunction, Case, Cast, Expr, GetFieldAccess, GetIndexedField, + GroupingSet, GroupingSet::GroupingSets, JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, WindowFrameBound, WindowFrameUnits, @@ -530,7 +530,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Left => Self::Left, ScalarFunction::Lpad => Self::Lpad, ScalarFunction::Random => Self::Random, - ScalarFunction::RegexpReplace => Self::RegexpReplace, ScalarFunction::Repeat => Self::Repeat, ScalarFunction::Replace => Self::Replace, ScalarFunction::Reverse => Self::Reverse, @@ -1703,12 +1702,6 @@ pub fn parse_expr( .map(|expr| parse_expr(expr, registry, codec)) .collect::, _>>()?, )), - ScalarFunction::RegexpReplace => Ok(regexp_replace( - args.to_owned() - .iter() - .map(|expr| parse_expr(expr, registry, codec)) - .collect::, _>>()?, - )), ScalarFunction::Btrim => Ok(btrim( args.to_owned() .iter() diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index a4e9fd423bbf..96f750f3d22a 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1508,7 +1508,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Lpad => Self::Lpad, BuiltinScalarFunction::Random => Self::Random, BuiltinScalarFunction::Uuid => Self::Uuid, - BuiltinScalarFunction::RegexpReplace => Self::RegexpReplace, BuiltinScalarFunction::Repeat => Self::Repeat, BuiltinScalarFunction::Replace => Self::Replace, BuiltinScalarFunction::Reverse => Self::Reverse,