Skip to content

Commit

Permalink
[CALCITE-6310] Add REGEXP_REPLACE function (enabled in PostgreSQL lib…
Browse files Browse the repository at this point in the history
…rary)

PostgreSQL allows use of the flags string at any parameter after the replacement string
  • Loading branch information
jduo authored and normanj-bitquill committed Jul 10, 2024
1 parent dad9073 commit 0bf0a2b
Show file tree
Hide file tree
Showing 5 changed files with 148 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,14 @@ static RelDataType deriveTypeSplit(SqlOperatorBinding operatorBinding,
@LibraryOperator(libraries = {BIG_QUERY, MYSQL, ORACLE, REDSHIFT})
public static final SqlFunction REGEXP_REPLACE = new SqlRegexpReplaceFunction();

/** The PostgreSQL variant of the
 * "REGEXP_REPLACE(value, regexp, rep [, pos [, occurrence]] [, matchType])"
 * function. Replaces all substrings of value that match regexp with
 * {@code rep} and returns modified value.
 *
 * <p>Unlike {@link #REGEXP_REPLACE}, whose {@code matchType} may only follow
 * {@code occurrence}, this variant also accepts {@code matchType} directly
 * after {@code rep} or directly after {@code pos}. */
@LibraryOperator(libraries = {POSTGRESQL})
public static final SqlFunction PG_REGEXP_REPLACE = new SqlPgRegexpReplaceFunction();


/** The "REGEXP_SUBSTR(value, regexp[, position[, occurrence]])" function.
* Returns the substring in value that matches the regexp. Returns NULL if there is no match. */
@LibraryOperator(libraries = {BIG_QUERY})
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to you under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.calcite.sql.fun;

import org.apache.calcite.sql.SqlCallBinding;
import org.apache.calcite.sql.type.OperandTypes;
import org.apache.calcite.sql.type.SqlTypeFamily;

import java.util.ArrayList;
import java.util.List;

/**
 * The PostgreSQL variant of
 * {@code REGEXP_REPLACE(source_string, pattern, replacement
 * [, pos [, occurrence]] [, match_type])}.
 *
 * <p>Searches for a regular expression pattern and replaces every occurrence of the
 * pattern with the specified string. It differs from the standard REGEXP_REPLACE in
 * that the {@code match_type} (flags) string may be supplied as the argument at
 * index 3, 4 or 5, i.e. directly after {@code replacement}, after {@code pos}, or
 * after {@code occurrence}. Whether the argument at index 3 or 4 is treated as an
 * integer ({@code pos} / {@code occurrence}) or as the flags is decided from the
 * operand's actual type; once the flags have been supplied no further argument is
 * accepted.
 */
public class SqlPgRegexpReplaceFunction extends SqlRegexpReplaceFunction {

  /**
   * Returns the allowed signature of this function rendered with the given
   * operator name.
   *
   * @param opNameToUse operator name to place in front of the operand list
   * @return the signature text
   */
  @Override public String getAllowedSignatures(String opNameToUse) {
    return opNameToUse + "(VARCHAR, VARCHAR, VARCHAR [, INTEGER [, INTEGER]] [, VARCHAR])";
  }

  /**
   * Checks the operands of a call against the PostgreSQL signature.
   *
   * @param callBinding the call whose operands are checked
   * @param throwOnFailure whether to throw a validation error instead of
   *     returning {@code false} when the operands do not match
   * @return whether the operand types are valid for this function
   */
  @Override public boolean checkOperandTypes(SqlCallBinding callBinding,
      boolean throwOnFailure) {
    final int operandCount = callBinding.getOperandCount();
    assert operandCount >= 3;
    if (operandCount == 3) {
      return OperandTypes.STRING_STRING_STRING
          .checkOperandTypes(callBinding, throwOnFailure);
    }

    // The first three operands (source, pattern, replacement) are always strings.
    final List<SqlTypeFamily> families = new ArrayList<>(operandCount);
    families.add(SqlTypeFamily.STRING);
    families.add(SqlTypeFamily.STRING);
    families.add(SqlTypeFamily.STRING);

    // Index 3 is either the start position or the flags.
    // Index 4 is either the occurrence or the flags.
    // Index 5 can only be the flags.
    // The flags are always the last argument, so stop collecting families as soon
    // as they are seen; any operand left over is reported as a signature mismatch
    // below.
    for (int i = 3; i < operandCount; i++) {
      if (i == 5 || SqlTypeFamily.STRING.contains(callBinding.getOperandType(i))) {
        families.add(SqlTypeFamily.STRING);
        break;
      }
      families.add(SqlTypeFamily.INTEGER);
    }

    if (families.size() != operandCount) {
      // Extra operands were supplied after the flags string.
      if (throwOnFailure) {
        throw callBinding.newValidationSignatureError();
      }
      // Reject explicitly rather than relying on the family checker to notice
      // that the number of families differs from the number of operands.
      return false;
    }
    return OperandTypes.family(families.toArray(new SqlTypeFamily[0]))
        .checkOperandTypes(callBinding, throwOnFailure);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ public SqlRegexpReplaceFunction() {
return SqlOperandCountRanges.between(3, 6);
}

/**
 * Returns the allowed signature of this function rendered with the given
 * operator name: three mandatory character operands, optionally followed by
 * two integer operands and a trailing character operand, each nested inside
 * the previous optional one.
 */
@Override public String getAllowedSignatures(String opNameToUse) {
  final String operandList =
      "(VARCHAR, VARCHAR, VARCHAR [, INTEGER [, INTEGER [, VARCHAR]]])";
  return opNameToUse + operandList;
}

@Override public boolean checkOperandTypes(SqlCallBinding callBinding,
boolean throwOnFailure) {
final int operandCount = callBinding.getOperandCount();
Expand All @@ -59,16 +63,20 @@ public SqlRegexpReplaceFunction() {
families.add(SqlTypeFamily.STRING);
families.add(SqlTypeFamily.STRING);
for (int i = 3; i < operandCount; i++) {
// The arguments at index 3 (start position) and index 4 (occurrence) must be integers.
// The argument at index 5, if present, is the match-type flags string.
if (i == 3) {
families.add(SqlTypeFamily.INTEGER);
}
if (i == 4) {
} else if (i == 4) {
families.add(SqlTypeFamily.INTEGER);
}
if (i == 5) {
} else if (i == 5) {
families.add(SqlTypeFamily.STRING);
}
}

return OperandTypes.family(families.toArray(new SqlTypeFamily[0]))
.checkOperandTypes(callBinding, throwOnFailure);
}
Expand Down
45 changes: 45 additions & 0 deletions core/src/test/java/org/apache/calcite/test/SqlValidatorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -11715,6 +11715,51 @@ private void checkCustomColumnResolving(String table) {
.columnType("VARCHAR NOT NULL");
}

/** Validates the operand shapes accepted and rejected by REGEXP_REPLACE when
 * it is resolved from the PostgreSQL operator library. */
@Test void testPgRegexpReplace() {
  final SqlOperatorTable pgOperators = operatorTableFor(SqlLibrary.POSTGRESQL);

  // Accepted shapes: the flags string may follow the replacement, the position,
  // or the occurrence.
  final String[] acceptedCalls = {
      "REGEXP_REPLACE('a b c', 'a', 'X')",
      "REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 2)",
      "REGEXP_REPLACE('abc def GHI', '[a-z]+', 'X', 'c')",
      "REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 1, 3)",
      "REGEXP_REPLACE('abc def GHI', '[a-z]+', 'X', 1, 'c')",
      "REGEXP_REPLACE('abc def GHI', '[a-z]+', 'X', 1, 3, 'c')",
  };
  for (String call : acceptedCalls) {
    expr(call)
        .withOperatorTable(pgOperators)
        .columnType("VARCHAR NOT NULL");
  }

  // Implicit type coercion: a NULL literal in any of the first three positions
  // yields a nullable VARCHAR result.
  final String[] nullableCalls = {
      "REGEXP_REPLACE(null, '(-)', '###')",
      "REGEXP_REPLACE('100-200', null, '###')",
      "REGEXP_REPLACE('100-200', '(-)', null)",
  };
  for (String call : nullableCalls) {
    expr(call)
        .withOperatorTable(pgOperators)
        .columnType("VARCHAR");
  }

  // A string argument at index 3 or later must be the flags parameter, so no
  // further argument may follow it.
  final String[] rejectedCalls = {
      "^REGEXP_REPLACE('abc def GHI', '[a-z]+', 'X', 'c', 1)^",
      "^REGEXP_REPLACE('abc def GHI', '[a-z]+', 'X', 'c', 'c')^",
      "^REGEXP_REPLACE('abc def GHI', '[a-z]+', 'X', 1, 'c', 'c')^",
  };
  for (String call : rejectedCalls) {
    expr(call)
        .withOperatorTable(pgOperators)
        .fails("Cannot apply 'REGEXP_REPLACE' to arguments of type .*");
  }
}

@Test void testInvalidFunctionCall() {
final SqlOperatorTable operatorTable =
MockSqlOperatorTable.standard().extend();
Expand Down
3 changes: 2 additions & 1 deletion site/_docs/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -2847,7 +2847,8 @@ In the following:
| b | REGEXP_EXTRACT_ALL(string, regexp) | Returns an array of all substrings in *string* that match the *regexp*. Returns an empty array if there is no match
| b | REGEXP_INSTR(string, regexp [, position [, occurrence [, occurrence_position]]]) | Returns the lowest 1-based position of the substring in *string* that matches the *regexp*, starting search at *position* (default 1), and until locating the nth *occurrence* (default 1). Setting occurrence_position (default 0) to 1 returns the end position of substring + 1. Returns 0 if there is no match
| m o p r s | REGEXP_LIKE(string, regexp [, flags]) | Equivalent to `string1 RLIKE string2` with an optional parameter for search flags. Supported flags are: <ul><li>i: case-insensitive matching</li><li>c: case-sensitive matching</li><li>n: newline-sensitive matching</li><li>s: non-newline-sensitive matching</li><li>m: multi-line</li></ul>
| b m o r | REGEXP_REPLACE(string, regexp, rep [, pos [, occurrence [, matchType]]]) | Replaces all substrings of *string* that match *regexp* with *rep* at the starting *pos* in expr (if omitted, the default is 1), *occurrence* specifies which occurrence of a match to search for (if omitted, the default is 1), *matchType* specifies how to perform matching
| b m o | REGEXP_REPLACE(string, regexp, rep [, pos [, occurrence [, matchType]]]) | Replaces all substrings of *string* that match *regexp* with *rep* at the starting *pos* in expr (if omitted, the default is 1), *occurrence* specifies which occurrence of a match to search for (if omitted, the default is 1), *matchType* specifies how to perform matching
| p | REGEXP_REPLACE(string, regexp, rep [, pos [, occurrence]] [, matchType]) | Replaces all substrings of *string* that match *regexp* with *rep*, starting the search at *pos* (if omitted, the default is 1); *occurrence* specifies which occurrence of a match to search for (if omitted, the default is 1); *matchType* specifies how to perform matching and may be given directly after *rep*, after *pos*, or after *occurrence*
| b | REGEXP_SUBSTR(string, regexp [, position [, occurrence]]) | Synonym for REGEXP_EXTRACT
| b m p r s | REPEAT(string, integer) | Returns a string consisting of *string* repeated *integer* times; returns an empty string if *integer* is less than 1
| b m | REVERSE(string) | Returns *string* with the order of the characters reversed
Expand Down

0 comments on commit 0bf0a2b

Please sign in to comment.