Skip to content

Commit

Permalink
Port regexp_replace functions and related tests (#9454)
Browse files (browse the repository at this point in the history)
* Port regexp_replace functions and related tests

* porting tests

* delete files

* change config

* adding dependency

* change tests

* fix

* optimize code

* remove unused
  • Loading branch information
Lordworms authored Mar 5, 2024
1 parent 64f998f commit f755626
Show file tree
Hide file tree
Showing 17 changed files with 202 additions and 547 deletions.
2 changes: 2 additions & 0 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion datafusion/core/tests/dataframe/dataframe_functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,7 @@ async fn test_fn_regexp_match() -> Result<()> {
#[tokio::test]
#[cfg(feature = "unicode_expressions")]
async fn test_fn_regexp_replace() -> Result<()> {
let expr = regexp_replace(vec![col("a"), lit("[a-z]"), lit("x"), lit("g")]);
let expr = regexp_replace(col("a"), lit("[a-z]"), lit("x"), lit("g"));

let expected = [
"+----------------------------------------------------------+",
Expand Down
18 changes: 0 additions & 18 deletions datafusion/expr/src/built_in_function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -218,9 +218,6 @@ pub enum BuiltinScalarFunction {
OctetLength,
/// random
Random,
/// regexp_match
/// regexp_replace
RegexpReplace,
/// repeat
Repeat,
/// replace
Expand Down Expand Up @@ -417,7 +414,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::MD5 => Volatility::Immutable,
BuiltinScalarFunction::OctetLength => Volatility::Immutable,
BuiltinScalarFunction::Radians => Volatility::Immutable,
BuiltinScalarFunction::RegexpReplace => Volatility::Immutable,
BuiltinScalarFunction::Repeat => Volatility::Immutable,
BuiltinScalarFunction::Replace => Volatility::Immutable,
BuiltinScalarFunction::Reverse => Volatility::Immutable,
Expand Down Expand Up @@ -674,9 +670,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Pi => Ok(Float64),
BuiltinScalarFunction::Random => Ok(Float64),
BuiltinScalarFunction::Uuid => Ok(Utf8),
BuiltinScalarFunction::RegexpReplace => {
utf8_to_str_type(&input_expr_types[0], "regexp_replace")
}
BuiltinScalarFunction::Repeat => {
utf8_to_str_type(&input_expr_types[0], "repeat")
}
Expand Down Expand Up @@ -1161,14 +1154,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Replace | BuiltinScalarFunction::Translate => {
Signature::one_of(vec![Exact(vec![Utf8, Utf8, Utf8])], self.volatility())
}
BuiltinScalarFunction::RegexpReplace => Signature::one_of(
vec![
Exact(vec![Utf8, Utf8, Utf8]),
Exact(vec![Utf8, Utf8, Utf8, Utf8]),
],
self.volatility(),
),

BuiltinScalarFunction::Pi => Signature::exact(vec![], self.volatility()),
BuiltinScalarFunction::Random => Signature::exact(vec![], self.volatility()),
BuiltinScalarFunction::Uuid => Signature::exact(vec![], self.volatility()),
Expand Down Expand Up @@ -1398,9 +1383,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::SubstrIndex => &["substr_index", "substring_index"],
BuiltinScalarFunction::FindInSet => &["find_in_set"],

// regex functions
BuiltinScalarFunction::RegexpReplace => &["regexp_replace"],

// time/date functions
BuiltinScalarFunction::Now => &["now"],
BuiltinScalarFunction::CurrentDate => &["current_date", "today"],
Expand Down
20 changes: 0 additions & 20 deletions datafusion/expr/src/expr_fn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -822,11 +822,6 @@ nary_scalar_expr!(
rpad,
"fill up a string to the length by appending the characters"
);
nary_scalar_expr!(
RegexpReplace,
regexp_replace,
"replace strings that match a regular expression"
);
nary_scalar_expr!(
Btrim,
btrim,
Expand Down Expand Up @@ -1314,21 +1309,6 @@ mod test {
test_scalar_expr!(Ltrim, ltrim, string);
test_scalar_expr!(MD5, md5, string);
test_scalar_expr!(OctetLength, octet_length, string);
test_nary_scalar_expr!(
RegexpReplace,
regexp_replace,
string,
pattern,
replacement
);
test_nary_scalar_expr!(
RegexpReplace,
regexp_replace,
string,
pattern,
replacement,
flags
);
test_scalar_expr!(Replace, replace, string, from, to);
test_scalar_expr!(Repeat, repeat, string, count);
test_scalar_expr!(Reverse, reverse, string);
Expand Down
3 changes: 2 additions & 1 deletion datafusion/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,11 @@ chrono = { workspace = true }
datafusion-common = { workspace = true }
datafusion-execution = { workspace = true }
datafusion-expr = { workspace = true }
datafusion-physical-expr = { workspace = true, default-features = true }
hex = { version = "0.4", optional = true }
itertools = { workspace = true }
log = { workspace = true }

regex = { version = "1.8" }
[dev-dependencies]
criterion = "0.5"
rand = { workspace = true }
Expand Down
41 changes: 39 additions & 2 deletions datafusion/functions/benches/regx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,18 @@

extern crate criterion;

use std::sync::Arc;

use arrow_array::builder::StringBuilder;
use arrow_array::{ArrayRef, StringArray};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_functions::regex::regexplike::regexp_like;
use datafusion_functions::regex::regexpmatch::regexp_match;
use datafusion_functions::regex::regexpreplace::regexp_replace;
use rand::distributions::Alphanumeric;
use rand::rngs::ThreadRng;
use rand::seq::SliceRandom;
use rand::Rng;
use std::iter;
use std::sync::Arc;
fn data(rng: &mut ThreadRng) -> StringArray {
let mut data: Vec<String> = vec![];
for _ in 0..1000 {
Expand Down Expand Up @@ -101,6 +102,42 @@ fn criterion_benchmark(c: &mut Criterion) {
)
})
});

c.bench_function("regexp_match_1000", |b| {
let mut rng = rand::thread_rng();
let data = Arc::new(data(&mut rng)) as ArrayRef;
let regex = Arc::new(regex(&mut rng)) as ArrayRef;
let flags = Arc::new(flags(&mut rng)) as ArrayRef;

b.iter(|| {
black_box(
regexp_match::<i32>(&[data.clone(), regex.clone(), flags.clone()])
.expect("regexp_match should work on valid values"),
)
})
});

c.bench_function("regexp_replace_1000", |b| {
let mut rng = rand::thread_rng();
let data = Arc::new(data(&mut rng)) as ArrayRef;
let regex = Arc::new(regex(&mut rng)) as ArrayRef;
let flags = Arc::new(flags(&mut rng)) as ArrayRef;
let replacement =
Arc::new(StringArray::from_iter_values(iter::repeat("XX").take(1000)))
as ArrayRef;

b.iter(|| {
black_box(
regexp_replace::<i32>(&[
data.clone(),
regex.clone(),
replacement.clone(),
flags.clone(),
])
.expect("regexp_replace should work on valid values"),
)
})
});
}

criterion_group!(benches, criterion_benchmark);
Expand Down
9 changes: 7 additions & 2 deletions datafusion/functions/src/regex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,15 @@
pub mod regexplike;
pub mod regexpmatch;

pub mod regexpreplace;
// create UDFs
make_udf_function!(regexpmatch::RegexpMatchFunc, REGEXP_MATCH, regexp_match);
make_udf_function!(regexplike::RegexpLikeFunc, REGEXP_LIKE, regexp_like);
make_udf_function!(
regexpreplace::RegexpReplaceFunc,
REGEXP_REPLACE,
regexp_replace
);
export_functions!((
regexp_match,
input_arg1 input_arg2,
Expand All @@ -31,4 +36,4 @@ export_functions!((
regexp_like,
input_arg1 input_arg2,
"Returns true if a has at least one match in a string,false otherwise."
));
),(regexp_replace, arg1 arg2 arg3 arg4, "Replaces substrings in a string that match"));
Loading

0 comments on commit f755626

Please sign in to comment.