Unescape Unicode sequences in the SPARQL parser (#1770)

This PR ensures that Unicode codepoint escape sequences (`\uXXXX` and `\UXXXXXXXX`) are unescaped before the query string is passed to ANTLR for the actual parsing step (see [the SPARQL 1.1 specification](https://www.w3.org/TR/sparql11-query/#codepointEscape) for details). UTF-16 surrogate pairs are handled correctly. In addition, the ctre dependency is upgraded to v3.9.0 in order to use `search_all` (the non-deprecated variant of `range`).
ad-freiburg · Feb 7, 2025 · d7a70c7 · d7a70c7
1 parent 3e7df42
commit d7a70c7
Show file tree

Hide file tree

Showing 6 changed files with 200 additions and 4 deletions.
diff --git a/.codespellrc b/.codespellrc
@@ -3,6 +3,6 @@
 skip = .git*,.codespellrc,*.pdf,generated
 check-hidden = true
 # Ignore mixedCase variables, lines with latin, lines with codespell-ignore pragma, etc
-ignore-regex = \b([A-Z]*[a-z]+[A-Z][a-zA-Z]*)\b|.*(Lorem ipsum|eleifend|feugait|codespell-ignore).*
+ignore-regex = \b([A-Z]*[a-z]+[A-Z][a-zA-Z]*)\b|.*(Lorem ipsum|eleifend|feugait|codespell-ignore).*|https?://\S+
 # alph - is used frequently in tests, just ignore altogether
 ignore-words-list = ser,alph,inbetween,interm
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -231,7 +231,7 @@ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
 FetchContent_Declare(
         ctre
         GIT_REPOSITORY https://github.com/hanickadot/compile-time-regular-expressions.git
-        GIT_TAG b3d7788b559e34d985c8530c3e0e7260b67505a6 # v3.8.1
+        GIT_TAG eb9577aae3515d14e6c5564f9aeb046d2e7c1124 # v3.9.0
 )
 
 ################################

diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp
@@ -4,6 +4,11 @@
 
 #include "SparqlParserHelpers.h"
 
+#include <unicode/unistr.h>
+
+#include <charconv>
+#include <ctre-unicode.hpp>
+
 #include "sparqlParser/generated/SparqlAutomaticLexer.h"
 
 namespace sparqlParserHelpers {
@@ -13,7 +18,8 @@ using std::string;
 ParserAndVisitor::ParserAndVisitor(
     std::string input,
     SparqlQleverVisitor::DisableSomeChecksOnlyForTesting disableSomeChecks)
-    : input_{std::move(input)}, visitor_{{}, disableSomeChecks} {
+    : input_{unescapeUnicodeSequences(std::move(input))},
+      visitor_{{}, disableSomeChecks} {
   // The default in ANTLR is to log all errors to the console and to continue
   // the parsing. We need to turn parse errors into exceptions instead to
   // propagate them to the user.
@@ -30,4 +36,87 @@ ParserAndVisitor::ParserAndVisitor(
     : ParserAndVisitor{std::move(input), disableSomeChecks} {
   visitor_.setPrefixMapManually(std::move(prefixes));
 }
+
+// _____________________________________________________________________________
+// Replaces every codepoint escape sequence (`\uXXXX` or `\UXXXXXXXX`) in
+// `input` by the UTF-8 encoding of the code point it denotes and returns the
+// result. Two directly adjacent `\uXXXX` escapes that form a UTF-16 surrogate
+// pair are combined into a single supplementary code point. Throws an
+// `InvalidSparqlQueryException` if a surrogate is unpaired or written in the
+// 8-digit form, or if an escape denotes a value greater than U+10FFFF. If
+// `input` contains no escape sequence, it is returned unchanged.
+std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
+  // Largest value that is a valid Unicode code point.
+  constexpr uint32_t maxCodePoint = 0x10FFFF;
+
+  std::string_view view{input};
+  std::string output;
+  bool noEscapeSequenceFound = true;
+  // Offset in `view` directly after the most recently processed escape.
+  size_t lastPos = 0;
+  // Pending high surrogate that still waits for its low surrogate (0 if none).
+  UChar32 highSurrogate = 0;
+
+  // Throw an `InvalidSparqlQueryException` with `message` unless `condition`
+  // holds.
+  auto throwUnless = [](bool condition, std::string_view message) {
+    if (!condition) {
+      throw InvalidSparqlQueryException{
+          absl::StrCat("Error in unicode escape sequence. ", message)};
+    }
+  };
+
+  for (const auto& match :
+       ctre::search_all<R"(\\U[0-9A-Fa-f]{8}|\\u[0-9A-Fa-f]{4})">(view)) {
+    if (noEscapeSequenceFound) {
+      output.reserve(input.size());
+      noEscapeSequenceFound = false;
+    }
+    // The unescaped text between the previous escape and the current one.
+    auto inBetweenPart =
+        view.substr(lastPos, match.data() - (view.data() + lastPos));
+
+    throwUnless(
+        inBetweenPart.empty() || highSurrogate == 0,
+        "A high surrogate must be directly followed by a low surrogate.");
+
+    output += inBetweenPart;
+    lastPos = match.data() + match.size() - view.data();
+
+    // Strip the two-character prefix (`\u` or `\U`), leaving the hex digits.
+    auto hexValue = match.to_view();
+    hexValue.remove_prefix(std::string_view{"\\U"}.size());
+
+    // Parse into an unsigned 32-bit integer: eight hex digits can be as large
+    // as 0xFFFFFFFF, which does not fit into the signed `UChar32` and would
+    // make `std::from_chars` report `result_out_of_range`.
+    uint32_t parsedValue = 0;
+    const char* hexEnd = hexValue.data() + hexValue.size();
+    auto result = std::from_chars(hexValue.data(), hexEnd, parsedValue, 16);
+    AD_CORRECTNESS_CHECK(result.ec == std::errc{} && result.ptr == hexEnd);
+    AD_CORRECTNESS_CHECK(
+        hexValue.size() == 8 || hexValue.size() == 4,
+        "Unicode escape sequences must be either 8 or 4 characters long.");
+
+    // Larger values are not valid code points and cannot be encoded, so they
+    // are reported as an error in the query instead of being passed on.
+    throwUnless(parsedValue <= maxCodePoint,
+                "Code points must not be greater than U+10FFFF.");
+    auto codePoint = static_cast<UChar32>(parsedValue);
+
+    bool isFullCodePoint = hexValue.size() == 8;
+
+    // See https://symbl.cc/en/unicode/blocks/high-surrogates/ for more
+    // information.
+    if (U16_IS_LEAD(codePoint)) {
+      throwUnless(!isFullCodePoint,
+                  "Surrogates should not be encoded as full code points.");
+      throwUnless(
+          highSurrogate == 0,
+          "A high surrogate cannot be followed by another high surrogate.");
+      // Remember the high surrogate; output is only produced once the
+      // matching low surrogate has been seen.
+      highSurrogate = codePoint;
+      continue;
+    } else if (U16_IS_TRAIL(codePoint)) {
+      throwUnless(!isFullCodePoint,
+                  "Surrogates should not be encoded as full code points.");
+      throwUnless(highSurrogate != 0,
+                  "A low surrogate cannot be the first surrogate.");
+      codePoint = U16_GET_SUPPLEMENTARY(highSurrogate, codePoint);
+      highSurrogate = 0;
+    } else {
+      throwUnless(
+          highSurrogate == 0,
+          "A high surrogate cannot be followed by a regular code point.");
+    }
+
+    icu::UnicodeString helper{codePoint};
+    helper.toUTF8String(output);
+  }
+
+  // Avoid redundant copy if no escape sequences were found.
+  if (noEscapeSequenceFound) {
+    return input;
+  }
+
+  // A high surrogate that was the last escape sequence never got its partner.
+  throwUnless(highSurrogate == 0,
+              "A high surrogate must be followed by a low surrogate.");
+
+  output += view.substr(lastPos);
+  return output;
+}
 }  // namespace sparqlParserHelpers
diff --git a/src/parser/SparqlParserHelpers.h b/src/parser/SparqlParserHelpers.h
@@ -35,6 +35,10 @@ struct ParserAndVisitor {
   ad_utility::antlr_utility::ThrowingErrorListener<InvalidSparqlQueryException>
       errorListener_{};
 
+  // Unescapes Unicode escape sequences of the forms `\UXXXXXXXX` and `\uXXXX`
+  // (e.g. `\U0001F46A` and `\u00E9`) in the input string before the actual
+  // parsing begins, as the SPARQL standard mandates.
+  static std::string unescapeUnicodeSequences(std::string input);
+
  public:
   SparqlAutomaticParser parser_{&tokens_};
   SparqlQleverVisitor visitor_;

diff --git a/src/util/StringUtilsImpl.h b/src/util/StringUtilsImpl.h
@@ -74,7 +74,7 @@ std::string insertThousandSeparator(const std::string_view str,
       "])(?<digit>\\d{4,})"};
   auto parseIterator = std::begin(str);
   ql::ranges::for_each(
-      ctre::range<regexPatDigitSequence>(str),
+      ctre::search_all<regexPatDigitSequence>(str),
       [&parseIterator, &ostream, &insertSeparator](const auto& match) {
         /*
         The digit sequence, that must be transformed. Note: The string view

diff --git a/test/SparqlParserTest.cpp b/test/SparqlParserTest.cpp
@@ -23,6 +23,7 @@ auto lit = ad_utility::testing::tripleComponentLiteral;
 auto iri = ad_utility::testing::iri;
 }  // namespace
 
+// _____________________________________________________________________________
 TEST(ParserTest, testParse) {
   {
     auto pq = SparqlParser::parseQuery("SELECT ?x WHERE {?x ?y ?z}");
@@ -698,6 +699,7 @@ TEST(ParserTest, testParse) {
   }
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, testFilterWithoutDot) {
   ParsedQuery pq = SparqlParser::parseQuery(
       "PREFIX fb: <http://rdf.freebase.com/ns/>\n"
@@ -726,6 +728,7 @@ TEST(ParserTest, testFilterWithoutDot) {
   ASSERT_EQ("(?1 != fb:m.018mts)", filters[2].expression_.getDescriptor());
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, testExpandPrefixes) {
   ParsedQuery pq = SparqlParser::parseQuery(
       "PREFIX : <http://rdf.myprefix.com/>\n"
@@ -754,6 +757,7 @@ TEST(ParserTest, testExpandPrefixes) {
   ASSERT_EQ(0, pq._limitOffset._offset);
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, testLiterals) {
   ParsedQuery pq = SparqlParser::parseQuery(
       "PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> SELECT * WHERE { "
@@ -772,6 +776,7 @@ TEST(ParserTest, testLiterals) {
   ASSERT_EQ(DateYearOrDuration{Date(2000, 1, 1, -1)}, c._triples[1].o_);
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, testSolutionModifiers) {
   {
     ParsedQuery pq =
@@ -993,6 +998,7 @@ TEST(ParserTest, testSolutionModifiers) {
   }
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, testGroupByAndAlias) {
   ParsedQuery pq = SparqlParser::parseQuery(
       "SELECT (COUNT(?a) as ?count) WHERE { ?b <rel> ?a } GROUP BY ?b");
@@ -1008,6 +1014,7 @@ TEST(ParserTest, testGroupByAndAlias) {
   EXPECT_THAT(pq, m::GroupByVariables({Var{"?b"}}));
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, Bind) {
   ParsedQuery pq =
       SparqlParser::parseQuery("SELECT ?a WHERE { BIND (10 - 5 as ?a) . }");
@@ -1020,6 +1027,7 @@ TEST(ParserTest, Bind) {
   ASSERT_EQ(bind._expression.getDescriptor(), "10 - 5");
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, Order) {
   {
     ParsedQuery pq =
@@ -1101,6 +1109,7 @@ TEST(ParserTest, Order) {
    */
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, Group) {
   {
     ParsedQuery pq = SparqlParser::parseQuery(
@@ -1165,6 +1174,7 @@ TEST(ParserTest, Group) {
   }
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, LanguageFilterPostProcessing) {
   {
     ParsedQuery q = SparqlParser::parseQuery(
@@ -1246,3 +1256,96 @@ TEST(ParserTest, LanguageFilterPostProcessing) {
         triples[2]);
   }
 }
+
+// _____________________________________________________________________________
+namespace {
+// Return the string representation (`asString()`) of the first triple of the
+// first graph pattern inside the root graph pattern of `q`. The `at(0)`
+// lookups are bounds-checked, so a query without such a triple makes them
+// throw.
+std::string getFirstTriple(const ParsedQuery& q) {
+  const auto& firstPattern = q._rootGraphPattern._graphPatterns.at(0);
+  const auto& triples = firstPattern.getBasic()._triples;
+  return triples.at(0).asString();
+}
+}  // namespace
+
+// _____________________________________________________________________________
+TEST(ParserTest, HandlesBasicUnicodeEscapeSequences) {
+  // Parse `query` and return the string representation of its first triple.
+  auto firstTripleOf = [](const char* query) {
+    return getFirstTriple(SparqlParser::parseQuery(query));
+  };
+
+  // 4-digit and 8-digit escapes inside a string literal.
+  EXPECT_EQ(
+      firstTripleOf(
+          R"(SELECT * WHERE { ?s <http://a.example/p1> '\u0080\u07FF\u0800\u0FFF\u1000\uCFFF\uD000\uD7FF\uE000\uFFFD\U00010000\U0003FFFD\U00040000\U000FFFFD\U00100000\U0010FFFD'})"),
+      "{s: ?s, p: <http://a.example/p1>, o: "
+      "\"\u0080\u07FF\u0800\u0FFF\u1000\uCFFF\uD000\uD7FF\uE000\uFFFD"
+      "\U00010000\U0003FFFD\U00040000\U000FFFFD\U00100000\U0010FFFD\"}");
+
+  // A single 8-digit escape with lowercase hex digits.
+  EXPECT_EQ(firstTripleOf(R"(SELECT * WHERE { ?s ?p "\U0001f46a" . })"),
+            "{s: ?s, p: ?p, o: \"\U0001f46a\"}");
+
+  // Escapes inside a prefix name, an IRI and a prefixed name.
+  EXPECT_EQ(
+      firstTripleOf(
+          R"(PREFIX \u03B1: <http://example.com/\u00E9fg> SELECT * WHERE { ?s ?p α\u003Aba . })"),
+      "{s: ?s, p: ?p, o: <http://example.com/éfgba>}");
+
+  // Mixed 4-digit and 8-digit escapes in an IRI, and an escaped space that
+  // separates the predicate from the object.
+  EXPECT_EQ(
+      firstTripleOf(
+          R"(SELECT * WHERE { <http://example.com/\U0001F937\U0001F3FD\u200D\U00002642\ufe0F> ?p\u00201. })"),
+      "{s: <http://example.com/🤷🏽‍♂️>, p: ?p, o: 1}");
+
+  // Ensure that we don't unescape twice: `\u005C` becomes a backslash, and
+  // the resulting `\u2764` is not allowed inside a literal.
+  EXPECT_THROW(
+      SparqlParser::parseQuery(R"(SELECT * WHERE { "\u005Cu2764" ?p 1. })"),
+      InvalidSparqlQueryException);
+}
+
+// _____________________________________________________________________________
+TEST(ParserTest, HandlesSurrogatesCorrectly) {
+  using ::testing::HasSubstr;
+
+  // Expect that parsing `query` throws an `InvalidSparqlQueryException` whose
+  // message contains `expectedMessage`.
+  auto expectParseError = [](const char* query, const char* expectedMessage) {
+    AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE(SparqlParser::parseQuery(query),
+                                          HasSubstr(expectedMessage),
+                                          InvalidSparqlQueryException);
+  };
+
+  // Well-formed surrogate pairs are combined into single code points.
+  ParsedQuery validQuery = SparqlParser::parseQuery(
+      R"(SELECT * WHERE { "\uD83E\udD37\uD83C\uDFFD\u200D\u2642\uFE0F" ?p 1. })");
+  EXPECT_EQ(getFirstTriple(validQuery), "{s: \"🤷🏽‍♂️\", p: ?p, o: 1}");
+
+  // Text between the two halves of a surrogate pair.
+  expectParseError(
+      R"(SELECT * WHERE { ?s ?p '\uD83C \uDFFD' })",
+      "A high surrogate must be directly followed by a low surrogate.");
+
+  // A high surrogate without any following escape sequence.
+  expectParseError(R"(SELECT * WHERE { ?s ?p '\uD800' })",
+                   "A high surrogate must be followed by a low surrogate.");
+
+  // A high surrogate written in the 8-digit form.
+  expectParseError(R"(SELECT * WHERE { ?s ?p '\U0000D800' })",
+                   "Surrogates should not be encoded as full code points.");
+
+  // Two high surrogates in a row.
+  expectParseError(
+      R"(SELECT * WHERE { ?s ?p '\uD800\uD800' })",
+      "A high surrogate cannot be followed by another high surrogate.");
+
+  // A low surrogate written in the 8-digit form.
+  expectParseError(R"(SELECT * WHERE { ?s ?p '\U0000DFFD' })",
+                   "Surrogates should not be encoded as full code points.");
+
+  // A low surrogate without a preceding high surrogate.
+  expectParseError(R"(SELECT * WHERE { ?s ?p '\uDFFD' })",
+                   "A low surrogate cannot be the first surrogate.");
+
+  // A high surrogate followed by a non-surrogate code point.
+  expectParseError(
+      R"(SELECT * WHERE { ?s ?p '\uD800\u0020' })",
+      "A high surrogate cannot be followed by a regular code point.");
+
+  // Note: Mixing escaped and unescaped surrogates is not allowed. That is
+  // just weird, and the C++ compiler rightfully won't compile strings like
+  // these:
+  // SELECT * WHERE { ?s ?p '\\uD83C\uDFFD' }
+  // SELECT * WHERE { ?s ?p '\uD83C\\uDFFD' }
+
+  // So writing unit tests for these cases is not possible without creating
+  // semi-invalid UTF-8 strings.
+}