Skip to content

Commit

Permalink
Unescape Unicode sequences in the SPARQL parser (#1770)
Browse files Browse the repository at this point in the history
This PR makes sure escape sequences are applied before passing the string to ANTLR for the real parsing step (see [the SPARQL 1.1 specification](https://www.w3.org/TR/sparql11-query/#codepointEscape) for details). UTF-16 surrogate pairs are correctly handled. Also the ctre version is incremented to use `search_all` (non-deprecated variant of `range`).
  • Loading branch information
RobinTF authored Feb 7, 2025
1 parent 3e7df42 commit d7a70c7
Show file tree
Hide file tree
Showing 6 changed files with 200 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .codespellrc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
skip = .git*,.codespellrc,*.pdf,generated
check-hidden = true
# Ignore mixedCase variables, lines with latin, lines with codespell-ignore pragma, etc
ignore-regex = \b([A-Z]*[a-z]+[A-Z][a-zA-Z]*)\b|.*(Lorem ipsum|eleifend|feugait|codespell-ignore).*
ignore-regex = \b([A-Z]*[a-z]+[A-Z][a-zA-Z]*)\b|.*(Lorem ipsum|eleifend|feugait|codespell-ignore).*|https?://\S+
# alph - is used frequently in tests, just ignore altogether
ignore-words-list = ser,alph,inbetween,interm
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
FetchContent_Declare(
ctre
GIT_REPOSITORY https://github.com/hanickadot/compile-time-regular-expressions.git
GIT_TAG b3d7788b559e34d985c8530c3e0e7260b67505a6 # v3.8.1
GIT_TAG eb9577aae3515d14e6c5564f9aeb046d2e7c1124 # v3.9.0
)

################################
Expand Down
91 changes: 90 additions & 1 deletion src/parser/SparqlParserHelpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@

#include "SparqlParserHelpers.h"

#include <unicode/unistr.h>

#include <charconv>
#include <ctre-unicode.hpp>

#include "sparqlParser/generated/SparqlAutomaticLexer.h"

namespace sparqlParserHelpers {
Expand All @@ -13,7 +18,8 @@ using std::string;
ParserAndVisitor::ParserAndVisitor(
std::string input,
SparqlQleverVisitor::DisableSomeChecksOnlyForTesting disableSomeChecks)
: input_{std::move(input)}, visitor_{{}, disableSomeChecks} {
: input_{unescapeUnicodeSequences(std::move(input))},
visitor_{{}, disableSomeChecks} {
// The default in ANTLR is to log all errors to the console and to continue
// the parsing. We need to turn parse errors into exceptions instead to
// propagate them to the user.
Expand All @@ -30,4 +36,87 @@ ParserAndVisitor::ParserAndVisitor(
: ParserAndVisitor{std::move(input), disableSomeChecks} {
visitor_.setPrefixMapManually(std::move(prefixes));
}

// _____________________________________________________________________________
std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
std::string_view view{input};
std::string output;
bool noEscapeSequenceFound = true;
size_t lastPos = 0;
UChar32 highSurrogate = 0;

auto throwError = [](bool condition, std::string_view message) {
if (!condition) {
throw InvalidSparqlQueryException{
absl::StrCat("Error in unicode escape sequence. ", message)};
}
};

for (const auto& match :
ctre::search_all<R"(\\U[0-9A-Fa-f]{8}|\\u[0-9A-Fa-f]{4})">(view)) {
if (noEscapeSequenceFound) {
output.reserve(input.size());
noEscapeSequenceFound = false;
}
auto inBetweenPart =
view.substr(lastPos, match.data() - (view.data() + lastPos));

throwError(
inBetweenPart.empty() || highSurrogate == 0,
"A high surrogate must be directly followed by a low surrogate.");

output += inBetweenPart;
lastPos = match.data() + match.size() - view.data();

auto hexValue = match.to_view();
hexValue.remove_prefix(std::string_view{"\\U"}.size());

UChar32 codePoint;
auto result = std::from_chars(
hexValue.data(), hexValue.data() + hexValue.size(), codePoint, 16);
AD_CORRECTNESS_CHECK(result.ec == std::errc{});
AD_CORRECTNESS_CHECK(
hexValue.size() == 8 || hexValue.size() == 4,
"Unicode escape sequences must be either 8 or 4 characters long.");

bool isFullCodePoint = hexValue.size() == 8;

// See https://symbl.cc/en/unicode/blocks/high-surrogates/ for more
// information.
if (U16_IS_LEAD(codePoint)) {
throwError(!isFullCodePoint,
"Surrogates should not be encoded as full code points.");
throwError(
highSurrogate == 0,
"A high surrogate cannot be followed by another high surrogate.");
highSurrogate = codePoint;
continue;
} else if (U16_IS_TRAIL(codePoint)) {
throwError(!isFullCodePoint,
"Surrogates should not be encoded as full code points.");
throwError(highSurrogate != 0,
"A low surrogate cannot be the first surrogate.");
codePoint = U16_GET_SUPPLEMENTARY(highSurrogate, codePoint);
highSurrogate = 0;
} else {
throwError(
highSurrogate == 0,
"A high surrogate cannot be followed by a regular code point.");
}

icu::UnicodeString helper{codePoint};
helper.toUTF8String(output);
}

// Avoid redundant copy if no escape sequences were found.
if (noEscapeSequenceFound) {
return input;
}

throwError(highSurrogate == 0,
"A high surrogate must be followed by a low surrogate.");

output += view.substr(lastPos);
return output;
}
} // namespace sparqlParserHelpers
4 changes: 4 additions & 0 deletions src/parser/SparqlParserHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ struct ParserAndVisitor {
ad_utility::antlr_utility::ThrowingErrorListener<InvalidSparqlQueryException>
errorListener_{};

// Unescapes Unicode escape sequences like \U0001F46A and \u00E9 in the input
// string before the actual parsing begins, as the SPARQL standard mandates.
static std::string unescapeUnicodeSequences(std::string input);

public:
SparqlAutomaticParser parser_{&tokens_};
SparqlQleverVisitor visitor_;
Expand Down
2 changes: 1 addition & 1 deletion src/util/StringUtilsImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ std::string insertThousandSeparator(const std::string_view str,
"])(?<digit>\\d{4,})"};
auto parseIterator = std::begin(str);
ql::ranges::for_each(
ctre::range<regexPatDigitSequence>(str),
ctre::search_all<regexPatDigitSequence>(str),
[&parseIterator, &ostream, &insertSeparator](const auto& match) {
/*
The digit sequence, that must be transformed. Note: The string view
Expand Down
103 changes: 103 additions & 0 deletions test/SparqlParserTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ auto lit = ad_utility::testing::tripleComponentLiteral;
auto iri = ad_utility::testing::iri;
} // namespace

// _____________________________________________________________________________
TEST(ParserTest, testParse) {
{
auto pq = SparqlParser::parseQuery("SELECT ?x WHERE {?x ?y ?z}");
Expand Down Expand Up @@ -698,6 +699,7 @@ TEST(ParserTest, testParse) {
}
}

// _____________________________________________________________________________
TEST(ParserTest, testFilterWithoutDot) {
ParsedQuery pq = SparqlParser::parseQuery(
"PREFIX fb: <http://rdf.freebase.com/ns/>\n"
Expand Down Expand Up @@ -726,6 +728,7 @@ TEST(ParserTest, testFilterWithoutDot) {
ASSERT_EQ("(?1 != fb:m.018mts)", filters[2].expression_.getDescriptor());
}

// _____________________________________________________________________________
TEST(ParserTest, testExpandPrefixes) {
ParsedQuery pq = SparqlParser::parseQuery(
"PREFIX : <http://rdf.myprefix.com/>\n"
Expand Down Expand Up @@ -754,6 +757,7 @@ TEST(ParserTest, testExpandPrefixes) {
ASSERT_EQ(0, pq._limitOffset._offset);
}

// _____________________________________________________________________________
TEST(ParserTest, testLiterals) {
ParsedQuery pq = SparqlParser::parseQuery(
"PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> SELECT * WHERE { "
Expand All @@ -772,6 +776,7 @@ TEST(ParserTest, testLiterals) {
ASSERT_EQ(DateYearOrDuration{Date(2000, 1, 1, -1)}, c._triples[1].o_);
}

// _____________________________________________________________________________
TEST(ParserTest, testSolutionModifiers) {
{
ParsedQuery pq =
Expand Down Expand Up @@ -993,6 +998,7 @@ TEST(ParserTest, testSolutionModifiers) {
}
}

// _____________________________________________________________________________
TEST(ParserTest, testGroupByAndAlias) {
ParsedQuery pq = SparqlParser::parseQuery(
"SELECT (COUNT(?a) as ?count) WHERE { ?b <rel> ?a } GROUP BY ?b");
Expand All @@ -1008,6 +1014,7 @@ TEST(ParserTest, testGroupByAndAlias) {
EXPECT_THAT(pq, m::GroupByVariables({Var{"?b"}}));
}

// _____________________________________________________________________________
TEST(ParserTest, Bind) {
ParsedQuery pq =
SparqlParser::parseQuery("SELECT ?a WHERE { BIND (10 - 5 as ?a) . }");
Expand All @@ -1020,6 +1027,7 @@ TEST(ParserTest, Bind) {
ASSERT_EQ(bind._expression.getDescriptor(), "10 - 5");
}

// _____________________________________________________________________________
TEST(ParserTest, Order) {
{
ParsedQuery pq =
Expand Down Expand Up @@ -1101,6 +1109,7 @@ TEST(ParserTest, Order) {
*/
}

// _____________________________________________________________________________
TEST(ParserTest, Group) {
{
ParsedQuery pq = SparqlParser::parseQuery(
Expand Down Expand Up @@ -1165,6 +1174,7 @@ TEST(ParserTest, Group) {
}
}

// _____________________________________________________________________________
TEST(ParserTest, LanguageFilterPostProcessing) {
{
ParsedQuery q = SparqlParser::parseQuery(
Expand Down Expand Up @@ -1246,3 +1256,96 @@ TEST(ParserTest, LanguageFilterPostProcessing) {
triples[2]);
}
}

// _____________________________________________________________________________
namespace {
// Return the string representation of the first triple of the first graph
// pattern in the root graph pattern of `q`. The first graph pattern is
// accessed via `getBasic()`; both element accesses use the bounds-checked
// `at(0)`.
std::string getFirstTriple(const ParsedQuery& q) {
  const auto& firstPattern = q._rootGraphPattern._graphPatterns.at(0);
  const auto& triples = firstPattern.getBasic()._triples;
  return triples.at(0).asString();
}
} // namespace

// _____________________________________________________________________________
// Check that `\uXXXX` and `\UXXXXXXXX` escape sequences in the query text are
// replaced by the characters they denote, no matter where in the query they
// occur.
TEST(ParserTest, HandlesBasicUnicodeEscapeSequences) {
// Escapes inside a string literal, using both the four-digit and the
// eight-digit form. The expected string uses the same C++ escapes.
ParsedQuery q1 = SparqlParser::parseQuery(
R"(SELECT * WHERE { ?s <http://a.example/p1> '\u0080\u07FF\u0800\u0FFF\u1000\uCFFF\uD000\uD7FF\uE000\uFFFD\U00010000\U0003FFFD\U00040000\U000FFFFD\U00100000\U0010FFFD'})");
EXPECT_EQ(getFirstTriple(q1),
"{s: ?s, p: <http://a.example/p1>, o: "
"\"\u0080\u07FF\u0800\u0FFF\u1000\uCFFF\uD000\uD7FF\uE000\uFFFD"
"\U00010000\U0003FFFD\U00040000\U000FFFFD\U00100000\U0010FFFD\"}");

// A single supplementary code point written in the eight-digit form.
ParsedQuery q2 =
SparqlParser::parseQuery(R"(SELECT * WHERE { ?s ?p "\U0001f46a" . })");
EXPECT_EQ(getFirstTriple(q2), "{s: ?s, p: ?p, o: \"\U0001f46a\"}");

// Escapes outside of literals: `\u03B1` (α) as the prefix name, `\u00E9` (é)
// inside the prefix IRI, and `\u003A` (the colon) inside the prefixed name.
ParsedQuery q3 = SparqlParser::parseQuery(
R"(PREFIX \u03B1: <http://example.com/\u00E9fg> SELECT * WHERE { ?s ?p α\u003Aba . })");
EXPECT_EQ(getFirstTriple(q3),
"{s: ?s, p: ?p, o: <http://example.com/éfgba>}");

// Escapes inside an IRI that together form a single emoji sequence, and
// `\u0020` (a space) used as the separator between `?p` and `1`.
ParsedQuery q4 = SparqlParser::parseQuery(
R"(SELECT * WHERE { <http://example.com/\U0001F937\U0001F3FD\u200D\U00002642\ufe0F> ?p\u00201. })");
EXPECT_EQ(getFirstTriple(q4),
"{s: <http://example.com/🤷🏽‍♂️>, p: ?p, o: 1}");

// Ensure we don't double-unescape, \u sequences are not allowed in literals
// (`\u005C` is the backslash, so a single unescaping pass leaves the text
// `\u2764` inside the literal).
EXPECT_THROW(
SparqlParser::parseQuery(R"(SELECT * WHERE { "\u005Cu2764" ?p 1. })"),
InvalidSparqlQueryException);
}

// _____________________________________________________________________________
// Check that UTF-16 surrogate pairs written as two consecutive `\uXXXX`
// escapes are combined into one character, and that every malformed use of
// surrogates is rejected with an `InvalidSparqlQueryException` that carries
// the expected message.
TEST(ParserTest, HandlesSurrogatesCorrectly) {
using SP = SparqlParser;
using ::testing::HasSubstr;
// Two well-formed surrogate pairs followed by regular four-digit escapes.
ParsedQuery q = SP::parseQuery(
R"(SELECT * WHERE { "\uD83E\udD37\uD83C\uDFFD\u200D\u2642\uFE0F" ?p 1. })");
EXPECT_EQ(getFirstTriple(q), "{s: \"🤷🏽‍♂️\", p: ?p, o: 1}");

// A space between the high and the low surrogate.
AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE(
SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\uD83C \uDFFD' })"),
HasSubstr(
"A high surrogate must be directly followed by a low surrogate."),
InvalidSparqlQueryException);

// A high surrogate that is the last escape sequence in the query.
AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE(
SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\uD800' })"),
HasSubstr("A high surrogate must be followed by a low surrogate."),
InvalidSparqlQueryException);

// A high surrogate written in the eight-digit form.
AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE(
SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\U0000D800' })"),
HasSubstr("Surrogates should not be encoded as full code points."),
InvalidSparqlQueryException);

// Two high surrogates in a row.
AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE(
SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\uD800\uD800' })"),
HasSubstr(
"A high surrogate cannot be followed by another high surrogate."),
InvalidSparqlQueryException);

// A low surrogate written in the eight-digit form.
AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE(
SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\U0000DFFD' })"),
HasSubstr("Surrogates should not be encoded as full code points."),
InvalidSparqlQueryException);

// A low surrogate without a preceding high surrogate.
AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE(
SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\uDFFD' })"),
HasSubstr("A low surrogate cannot be the first surrogate."),
InvalidSparqlQueryException);

// A high surrogate followed by the escape of a non-surrogate code point.
AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE(
SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\uD800\u0020' })"),
HasSubstr("A high surrogate cannot be followed by a regular code point."),
InvalidSparqlQueryException);

// Note: We don't allow mixing escaped and unescaped surrogates, that's just
// weird and the C++ compiler rightfully won't compile strings like these:
// SELECT * WHERE { ?s ?p '\\uD83C\uDFFD' }
// SELECT * WHERE { ?s ?p '\uD83C\\uDFFD' }

// So writing unit tests for these cases is not possible without creating
// semi-invalid UTF-8 strings.
}

0 comments on commit d7a70c7

Please sign in to comment.