From b58cea6c5f328fec20a4175e608113c6ea247715 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Tue, 4 Feb 2025 11:50:41 +0100 Subject: [PATCH 1/5] Fix regex replacement strings --- .../SparqlExpressionValueGetters.cpp | 49 +++++++++++++++++++ .../SparqlExpressionValueGetters.h | 14 ++++++ .../sparqlExpressions/StringExpressions.cpp | 2 +- test/SparqlExpressionTest.cpp | 6 +++ 4 files changed, 70 insertions(+), 1 deletion(-) diff --git a/src/engine/sparqlExpressions/SparqlExpressionValueGetters.cpp b/src/engine/sparqlExpressions/SparqlExpressionValueGetters.cpp index 3ed18e7b99..932e111537 100644 --- a/src/engine/sparqlExpressions/SparqlExpressionValueGetters.cpp +++ b/src/engine/sparqlExpressions/SparqlExpressionValueGetters.cpp @@ -90,6 +90,55 @@ std::optional StringValueGetter::operator()( } } +// ____________________________________________________________________________ +std::optional ReplacementStringGetter::operator()( + Id id, const EvaluationContext* context) const { + std::optional originalString = + StringValueGetter::operator()(id, context); + if (!originalString.has_value()) { + return originalString; + } + return convertToReplacementString(originalString.value()); +} + +// ____________________________________________________________________________ +std::optional ReplacementStringGetter::operator()( + const LiteralOrIri& s, const EvaluationContext*) const { + return convertToReplacementString(asStringViewUnsafe(s.getContent())); +} + +// ____________________________________________________________________________ +std::string ReplacementStringGetter::convertToReplacementString( + std::string_view view) { + std::string result; + // Rough estimate of the size of the result string. + result.reserve(view.size()); + for (size_t i = 0; i < view.size(); i++) { + char c = view.at(i); + switch (c) { + case '$': + // "$$" is unescaped to "$" + if (i + 1 < view.size() && view.at(i + 1) == '$') { + result.push_back(c); + i++; + } else { + // Re2 used \1, \2, ... for backreferences, so we change $ to \. + result.push_back('\\'); + } + break; + case '\\': + // Escape existing backslashes. + result.push_back(c); + result.push_back(c); + break; + default: + result.push_back(c); + break; + } + } + return result; +} + // ____________________________________________________________________________ template Id IsSomethingValueGetter::operator()( diff --git a/src/engine/sparqlExpressions/SparqlExpressionValueGetters.h b/src/engine/sparqlExpressions/SparqlExpressionValueGetters.h index 88dbc2d825..95bbf8f4a8 100644 --- a/src/engine/sparqlExpressions/SparqlExpressionValueGetters.h +++ b/src/engine/sparqlExpressions/SparqlExpressionValueGetters.h @@ -140,6 +140,20 @@ struct StringValueGetter : Mixin { return std::string(asStringViewUnsafe(s.getContent())); } }; +// Similar to `StringValueGetter`, but correctly escapes and unescapes string +// sequences. +struct ReplacementStringGetter : StringValueGetter, + Mixin { + using Mixin::operator(); + std::optional operator()(ValueId, + const EvaluationContext*) const; + + std::optional operator()(const LiteralOrIri& s, + const EvaluationContext*) const; + + private: + static std::string convertToReplacementString(std::string_view view); +}; // Boolean value getter that checks whether the given `Id` is a `ValueId` of the // given `datatype`. diff --git a/src/engine/sparqlExpressions/StringExpressions.cpp b/src/engine/sparqlExpressions/StringExpressions.cpp index 3378ae7fdb..be97ea2b16 100644 --- a/src/engine/sparqlExpressions/StringExpressions.cpp +++ b/src/engine/sparqlExpressions/StringExpressions.cpp @@ -357,7 +357,7 @@ using StrBeforeExpression = using ReplaceExpression = StringExpressionImpl<3, decltype(replaceImpl), RegexValueGetter, - StringValueGetter>; + ReplacementStringGetter>; // CONCAT class ConcatExpression : public detail::VariadicExpression { diff --git a/test/SparqlExpressionTest.cpp b/test/SparqlExpressionTest.cpp index 71a24908a2..a13c30e794 100644 --- a/test/SparqlExpressionTest.cpp +++ b/test/SparqlExpressionTest.cpp @@ -1242,6 +1242,12 @@ TEST(SparqlExpression, ReplaceExpression) { idOrLitOrStringVec({"null", "Xs", "zwei", "drei", U, U}), std::tuple{idOrLitOrStringVec({"null", "eins", "zwei", "drei", U, U}), IdOrLiteralOrIri{lit("e.[a-z]")}, IdOrLiteralOrIri{lit("X")}}); + // A regex with replacement with substitutions + checkReplace( + idOrLitOrStringVec({"\"$1 \\\\2 A \\\\bc\"", "\"$1 \\\\2 DE \\\\f\""}), + std::tuple{idOrLitOrStringVec({"Abc", "DEf"}), + IdOrLiteralOrIri{lit("([A-Z]+)")}, + IdOrLiteralOrIri{lit("\"$$1 \\\\2 $1 \\\\\"")}}); // Case-insensitive matching using the hack for google regex: checkReplace(idOrLitOrStringVec({"null", "xxns", "zwxx", "drxx"}), From 24af25460c14d06b2c0395c2a36c5eb14e913984 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Tue, 4 Feb 2025 16:24:17 +0100 Subject: [PATCH 2/5] Test non string replacement --- test/SparqlExpressionTest.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/SparqlExpressionTest.cpp b/test/SparqlExpressionTest.cpp index a13c30e794..dd6c5b75e8 100644 --- a/test/SparqlExpressionTest.cpp +++ b/test/SparqlExpressionTest.cpp @@ -1249,6 +1249,11 @@ TEST(SparqlExpression, ReplaceExpression) { IdOrLiteralOrIri{lit("([A-Z]+)")}, IdOrLiteralOrIri{lit("\"$$1 \\\\2 $1 \\\\\"")}}); + checkReplace(idOrLitOrStringVec({"truebc", "truef"}), + std::tuple{idOrLitOrStringVec({"Abc", "DEf"}), + IdOrLiteralOrIri{lit("([A-Z]+)")}, + IdOrLiteralOrIri{Id::makeFromBool(true)}}); + // Case-insensitive matching using the hack for google regex: checkReplace(idOrLitOrStringVec({"null", "xxns", "zwxx", "drxx"}), std::tuple{idOrLitOrStringVec({"null", "eIns", "zwEi", "drei"}), From 8fcbe4f3dfe0ded277a7191f6cbbc70eba6b2b68 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Tue, 4 Feb 2025 16:30:08 +0100 Subject: [PATCH 3/5] Escape $ with \ instead of other $ according to SPARQL spec --- .../SparqlExpressionValueGetters.cpp | 18 +++++++++--------- test/SparqlExpressionTest.cpp | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/engine/sparqlExpressions/SparqlExpressionValueGetters.cpp b/src/engine/sparqlExpressions/SparqlExpressionValueGetters.cpp index 932e111537..8eda73d065 100644 --- a/src/engine/sparqlExpressions/SparqlExpressionValueGetters.cpp +++ b/src/engine/sparqlExpressions/SparqlExpressionValueGetters.cpp @@ -117,20 +117,20 @@ std::string ReplacementStringGetter::convertToReplacementString( char c = view.at(i); switch (c) { case '$': - // "$$" is unescaped to "$" + // Re2 used \1, \2, ... for backreferences, so we change $ to \. + result.push_back('\\'); + break; + case '\\': + // "\$" is unescaped to "$" if (i + 1 < view.size() && view.at(i + 1) == '$') { - result.push_back(c); + result.push_back('$'); i++; } else { - // Re2 used \1, \2, ... for backreferences, so we change $ to \. - result.push_back('\\'); + // Escape existing backslashes. + result.push_back(c); + result.push_back(c); } break; - case '\\': - // Escape existing backslashes. - result.push_back(c); - result.push_back(c); - break; default: result.push_back(c); break; diff --git a/test/SparqlExpressionTest.cpp b/test/SparqlExpressionTest.cpp index dd6c5b75e8..ef2193fc28 100644 --- a/test/SparqlExpressionTest.cpp +++ b/test/SparqlExpressionTest.cpp @@ -1247,7 +1247,7 @@ TEST(SparqlExpression, ReplaceExpression) { idOrLitOrStringVec({"\"$1 \\\\2 A \\\\bc\"", "\"$1 \\\\2 DE \\\\f\""}), std::tuple{idOrLitOrStringVec({"Abc", "DEf"}), IdOrLiteralOrIri{lit("([A-Z]+)")}, - IdOrLiteralOrIri{lit("\"$$1 \\\\2 $1 \\\\\"")}}); + IdOrLiteralOrIri{lit("\"\\\\$1 \\\\2 $1 \\\\\"")}}); checkReplace(idOrLitOrStringVec({"truebc", "truef"}), std::tuple{idOrLitOrStringVec({"Abc", "DEf"}), From cde19ea23dade01b6fe060384d1f98194f829a75 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Tue, 4 Feb 2025 18:19:39 +0100 Subject: [PATCH 4/5] Use raw string literal --- test/SparqlExpressionTest.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/SparqlExpressionTest.cpp b/test/SparqlExpressionTest.cpp index ef2193fc28..344a46fe83 100644 --- a/test/SparqlExpressionTest.cpp +++ b/test/SparqlExpressionTest.cpp @@ -1243,11 +1243,10 @@ TEST(SparqlExpression, ReplaceExpression) { std::tuple{idOrLitOrStringVec({"null", "eins", "zwei", "drei", U, U}), IdOrLiteralOrIri{lit("e.[a-z]")}, IdOrLiteralOrIri{lit("X")}}); // A regex with replacement with substitutions - checkReplace( - idOrLitOrStringVec({"\"$1 \\\\2 A \\\\bc\"", "\"$1 \\\\2 DE \\\\f\""}), - std::tuple{idOrLitOrStringVec({"Abc", "DEf"}), - IdOrLiteralOrIri{lit("([A-Z]+)")}, - IdOrLiteralOrIri{lit("\"\\\\$1 \\\\2 $1 \\\\\"")}}); + checkReplace(idOrLitOrStringVec({R"("$1 \\2 A \\bc")", R"("$1 \\2 DE \\f")"}), + std::tuple{idOrLitOrStringVec({"Abc", "DEf"}), + IdOrLiteralOrIri{lit("([A-Z]+)")}, + IdOrLiteralOrIri{lit(R"("\\$1 \\2 $1 \\")")}}); checkReplace(idOrLitOrStringVec({"truebc", "truef"}), std::tuple{idOrLitOrStringVec({"Abc", "DEf"}), From 69b38ae54f00f5ee92627b7dbf52aa2b759a18c6 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Tue, 4 Feb 2025 18:22:28 +0100 Subject: [PATCH 5/5] Clarify comment --- src/engine/sparqlExpressions/SparqlExpressionValueGetters.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/engine/sparqlExpressions/SparqlExpressionValueGetters.h b/src/engine/sparqlExpressions/SparqlExpressionValueGetters.h index 95bbf8f4a8..bea3a60e3a 100644 --- a/src/engine/sparqlExpressions/SparqlExpressionValueGetters.h +++ b/src/engine/sparqlExpressions/SparqlExpressionValueGetters.h @@ -140,8 +140,10 @@ struct StringValueGetter : Mixin { return std::string(asStringViewUnsafe(s.getContent())); } }; -// Similar to `StringValueGetter`, but correctly escapes and unescapes string -// sequences. +// Similar to `StringValueGetter`, but correctly preprocesses strings so that +// they can be used by re2 as replacement strings. So '$1 \abc \$' becomes +// '\1 \\abc $', where the former variant is valid in the SPARQL standard and +// the latter represents the format that re2 expects. struct ReplacementStringGetter : StringValueGetter, Mixin { using Mixin::operator();