Skip to content

Commit

Permalink
Fix REPLACE function with replace patterns (#1755)
Browse files Browse the repository at this point in the history
In the replacement string of the `REPLACE` function, QLever now interprets a dollar sign followed by a group number (`$1`, `$2`, ...) as a reference to the corresponding capture group of the match. For example, `REPLACE("aabc", "(a+)b", "$1def$1")` now correctly results in "aadefaac".
This fix is implemented by manually converting the replacement string to the format that Google's RE2 library expects, in which capture groups are referenced as `\n` instead of `$n`.
Fixes #1664
  • Loading branch information
RobinTF authored Feb 5, 2025
1 parent 6c0e792 commit 1dbe2af
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 1 deletion.
49 changes: 49 additions & 0 deletions src/engine/sparqlExpressions/SparqlExpressionValueGetters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,55 @@ std::optional<std::string> StringValueGetter::operator()(
}
}

// ____________________________________________________________________________
// Resolve `id` to a string exactly like `StringValueGetter` does and, if a
// string was obtained, translate it into the rewrite syntax expected by re2.
// An empty optional from the base getter is passed through unchanged.
std::optional<std::string> ReplacementStringGetter::operator()(
    Id id, const EvaluationContext* context) const {
  std::optional<std::string> plain = StringValueGetter::operator()(id, context);
  if (plain.has_value()) {
    return convertToReplacementString(plain.value());
  }
  // Nothing to convert, forward the empty optional.
  return plain;
}

// ____________________________________________________________________________
// Translate the content of the literal or IRI `s` into the rewrite syntax
// expected by re2. The evaluation context is not needed for this overload.
std::optional<std::string> ReplacementStringGetter::operator()(
    const LiteralOrIri& s, const EvaluationContext*) const {
  // View onto the content of `s`; it is only used within this call.
  const std::string_view content = asStringViewUnsafe(s.getContent());
  return convertToReplacementString(content);
}

// ____________________________________________________________________________
// Convert a replacement string in the XPath/SPARQL syntax (as used by the
// `REPLACE` function) into the rewrite syntax of re2:
//
//   `$`   becomes `\`   (so the group reference `$1` becomes `\1`),
//   `\$`  becomes `$`   (an escaped dollar sign is a literal `$`),
//   `\\`  becomes `\\`  (an escaped backslash is ONE literal backslash),
//   `\`   becomes `\\`  (a stray backslash is kept as a literal backslash),
//
// and all other characters are copied unchanged. The escape `\\` is consumed
// as a pair. Otherwise its second backslash would be interpreted again, such
// that `\\` would yield two backslashes and `\\$1` would lose its group
// reference.
std::string ReplacementStringGetter::convertToReplacementString(
    std::string_view view) {
  std::string result;
  // Rough estimate of the size of the result string.
  result.reserve(view.size());
  for (size_t i = 0; i < view.size(); ++i) {
    const char c = view[i];
    if (c == '$') {
      // Re2 uses \1, \2, ... for backreferences, so we change $ to \.
      result.push_back('\\');
      continue;
    }
    if (c != '\\') {
      result.push_back(c);
      continue;
    }
    // `c` is a backslash, so its meaning depends on the next character.
    const bool hasNext = i + 1 < view.size();
    if (hasNext && view[i + 1] == '$') {
      // "\$" is unescaped to "$".
      result.push_back('$');
      ++i;
      continue;
    }
    // Both an escaped backslash ("\\") and a stray backslash denote a single
    // literal backslash, which has to be written as "\\" for re2.
    result.append(2, '\\');
    if (hasNext && view[i + 1] == '\\') {
      // Skip the second backslash of the escape, it was handled above.
      ++i;
    }
  }
  return result;
}

// ____________________________________________________________________________
template <auto isSomethingFunction, auto prefix>
Id IsSomethingValueGetter<isSomethingFunction, prefix>::operator()(
Expand Down
16 changes: 16 additions & 0 deletions src/engine/sparqlExpressions/SparqlExpressionValueGetters.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,22 @@ struct StringValueGetter : Mixin<StringValueGetter> {
return std::string(asStringViewUnsafe(s.getContent()));
}
};
// Similar to `StringValueGetter`, but additionally preprocesses the obtained
// strings so that they can be used by re2 as replacement strings. For example
// '$1 \abc \$' becomes '\1 \\abc $', where the former variant is the syntax
// used in SPARQL queries and the latter represents the format that re2
// expects.
struct ReplacementStringGetter : StringValueGetter,
Mixin<ReplacementStringGetter> {
// The `operator()` overloads declared below would hide the ones inherited
// from `Mixin`, so make the latter visible again.
using Mixin<ReplacementStringGetter>::operator();
// Obtain the string for the given `ValueId` via `StringValueGetter` and
// convert it to the re2 replacement format. If no string can be obtained, the
// empty optional is returned unchanged.
std::optional<std::string> operator()(ValueId,
const EvaluationContext*) const;

// Convert the content of the literal or IRI `s` to the re2 replacement
// format.
std::optional<std::string> operator()(const LiteralOrIri& s,
const EvaluationContext*) const;

private:
// Perform the actual conversion from the SPARQL replacement syntax (`$1`,
// `\$`) to the syntax that re2 expects (`\1`, `$`), see the example above.
static std::string convertToReplacementString(std::string_view view);
};

// Boolean value getter that checks whether the given `Id` is a `ValueId` of the
// given `datatype`.
Expand Down
2 changes: 1 addition & 1 deletion src/engine/sparqlExpressions/StringExpressions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ using StrBeforeExpression =

using ReplaceExpression =
StringExpressionImpl<3, decltype(replaceImpl), RegexValueGetter,
StringValueGetter>;
ReplacementStringGetter>;

// CONCAT
class ConcatExpression : public detail::VariadicExpression {
Expand Down
10 changes: 10 additions & 0 deletions test/SparqlExpressionTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1288,6 +1288,16 @@ TEST(SparqlExpression, ReplaceExpression) {
idOrLitOrStringVec({"null", "Xs", "zwei", "drei", U, U}),
std::tuple{idOrLitOrStringVec({"null", "eins", "zwei", "drei", U, U}),
IdOrLiteralOrIri{lit("e.[a-z]")}, IdOrLiteralOrIri{lit("X")}});
// A regex with replacement with substitutions
checkReplace(idOrLitOrStringVec({R"("$1 \\2 A \\bc")", R"("$1 \\2 DE \\f")"}),
std::tuple{idOrLitOrStringVec({"Abc", "DEf"}),
IdOrLiteralOrIri{lit("([A-Z]+)")},
IdOrLiteralOrIri{lit(R"("\\$1 \\2 $1 \\")")}});

checkReplace(idOrLitOrStringVec({"truebc", "truef"}),
std::tuple{idOrLitOrStringVec({"Abc", "DEf"}),
IdOrLiteralOrIri{lit("([A-Z]+)")},
IdOrLiteralOrIri{Id::makeFromBool(true)}});

// Case-insensitive matching using the hack for google regex:
checkReplace(idOrLitOrStringVec({"null", "xxns", "zwxx", "drxx"}),
Expand Down

0 comments on commit 1dbe2af

Please sign in to comment.