diff --git a/.clang-tidy b/.clang-tidy index 813d1d16..ab1f0c1e 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -84,7 +84,7 @@ Checks: > WarningsAsErrors: "*" -HeaderFilterRegex: "\\./(archive|azm|browser|css|css2|dom|engine|etest|geom|gfx|html|html2|idna|img|js|layout|net|os|protocol|render|style|tui|type|unicode|uri|url|util|wasm)/" +HeaderFilterRegex: "\\./(archive|azm|browser|css|css2|dom|engine|etest|geom|gfx|html|html2|idna|img|js|json|layout|net|os|protocol|render|style|tui|type|unicode|uri|url|util|wasm)/" CheckOptions: # bugprone-suspicious-stringview-data-usage diff --git a/.gitlint b/.gitlint index fa3074a3..a6c8844b 100644 --- a/.gitlint +++ b/.gitlint @@ -4,4 +4,4 @@ ignore=body-is-missing # TODO(robinlinden): Better way of documenting and setting this up. # Each commit must start with the main area it affects. [title-match-regex] -regex=^(archive|azm|browser|bzl|css|css2|dom|dom2|engine|etest|geom|gfx|html|html2|idna|img|js|layout|net|os|protocol|render|style|tui|type|unicode|uri|url|util|wasm|all|build|ci|deps|doc|meta)(/.*|\+.*)?: +regex=^(archive|azm|browser|bzl|css|css2|dom|dom2|engine|etest|geom|gfx|html|html2|idna|img|js|json|layout|net|os|protocol|render|style|tui|type|unicode|uri|url|util|wasm|all|build|ci|deps|doc|meta)(/.*|\+.*)?: diff --git a/html2/BUILD b/html2/BUILD index 73c5d8f5..374ed6b8 100644 --- a/html2/BUILD +++ b/html2/BUILD @@ -58,13 +58,10 @@ cc_binary( ], "//conditions:default": [], }), - tags = ["no-cross"], - # simdjson seems to blow up qemu when we run our aarch64 crosscompiled - # tests. deps = [ ":html2", "//etest", - "@simdjson", + "//json", ], ) @@ -92,9 +89,12 @@ genrule( ":html5lib_test_runner", test, ], - # simdjson seems to blow up qemu when we run our aarch64 crosscompiled - # tests. - tags = ["no-cross"], + target_compatible_with = select({ + # TODO(robinlinden): Investigate why we fail to open the test files when + # running as a wasi binary. + "@platforms//os:wasi": ["@platforms//:incompatible"], + "//conditions:default": [], + }), ) for test in [ "@html5lib-tests//:tokenizer/contentModelFlags.test", "@html5lib-tests//:tokenizer/domjs.test", diff --git a/html2/html5lib_test.cpp b/html2/html5lib_test.cpp index 1bf24f0d..586ad413 100644 --- a/html2/html5lib_test.cpp +++ b/html2/html5lib_test.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Robin Lindén +// SPDX-FileCopyrightText: 2024-2025 Robin Lindén // // SPDX-License-Identifier: BSD-2-Clause @@ -6,12 +6,14 @@ #include "html2/tokenizer.h" #include "etest/etest2.h" - -#include // IWYU pragma: keep +#include "json/json.h" #include +#include #include +#include #include +#include #include #include #include @@ -83,28 +85,26 @@ std::pair, std::vector> tokenize( return {std::move(tokens), std::move(errors)}; } -// NOLINTBEGIN(clang-analyzer-unix.Errno): Problem in simdjson that probably doesn't affect us. -// NOLINTBEGIN(misc-include-cleaner): What you're meant to include from -// simdjson depends on things like the architecture you're compiling for. -// This is handled automagically with detection macros inside simdjson. -std::vector to_html2_tokens(simdjson::ondemand::array tokens) { - constexpr auto kGetOptionalStr = [](simdjson::ondemand::value v) -> std::optional { - if (v.is_null()) { - return std::nullopt; +std::vector to_html2_tokens(json::Array const &tokens) { + constexpr auto kGetOptionalStr = [](json::Value const &v) -> std::optional { + if (auto const *str = std::get_if(&v)) { + return *str; } - return std::string{v.get_string().value()}; + return std::nullopt; }; std::vector result; - for (auto token : tokens) { - auto it = token.begin().value(); - auto kind = (*it).get_string().value(); + for (auto const &token : tokens.values) { + assert(std::holds_alternative(token)); + auto const &t = std::get(token); + auto it = t.values.begin(); + auto const &kind = std::get((*it)); if (kind == "DOCTYPE") { - auto name = kGetOptionalStr((*++it).value()); - auto public_id = kGetOptionalStr((*++it).value()); - auto system_id = kGetOptionalStr((*++it).value()); + auto name = kGetOptionalStr((*++it)); + auto public_id = kGetOptionalStr((*++it)); + auto system_id = kGetOptionalStr((*++it)); // The json has "correctness" instead of "force quirks", so we negate it. - auto force_quirks = !(*++it).value().get_bool().value(); + auto force_quirks = !(std::get(*++it)); result.emplace_back(html2::DoctypeToken{ std::move(name), std::move(public_id), @@ -115,22 +115,22 @@ std::vector to_html2_tokens(simdjson::ondemand::array tokens) { } if (kind == "Comment") { - result.emplace_back(html2::CommentToken{std::string{(*++it).value().get_string().value()}}); + result.emplace_back(html2::CommentToken{std::get(*++it)}); continue; } if (kind == "StartTag") { - html2::StartTagToken start{std::string{(*++it).value().get_string().value()}}; - auto attrs = (*++it).value().get_object().value(); - for (auto attr : attrs) { + html2::StartTagToken start{std::get(*++it)}; + auto attrs = std::get(*++it); + for (auto const &attr : attrs.values) { start.attributes.push_back({ - std::string{attr.unescaped_key().value()}, - std::string{attr.value().get_string().value()}, + std::string{attr.first}, + std::string{std::get(attr.second)}, }); } - if (++it != simdjson::ondemand::array_iterator{}) { - start.self_closing = (*it).value().get_bool().value(); + if (++it != t.values.end()) { + start.self_closing = std::get(*it); } result.emplace_back(std::move(start)); @@ -138,12 +138,12 @@ std::vector to_html2_tokens(simdjson::ondemand::array tokens) { } if (kind == "EndTag") { - result.emplace_back(html2::EndTagToken{std::string{(*++it).value().get_string().value()}}); + result.emplace_back(html2::EndTagToken{std::get(*++it)}); continue; } if (kind == "Character") { - auto characters = (*++it).value().get_string().value(); + auto const &characters = std::get(*++it); for (auto c : characters) { result.emplace_back(html2::CharacterToken{c}); } @@ -370,8 +370,8 @@ std::optional to_parse_error(std::string_view error_name) { return std::nullopt; } -std::optional to_error(simdjson::ondemand::value error) { - auto code = error["code"].get_string().value(); +std::optional to_error(json::Object const &error) { + auto code = std::get(error.at("code")); if (code == "control-character-in-input-stream" || code == "noncharacter-in-input-stream") { // TODO(robinlinden): Handle. std::cerr << "Unhandled error: " << code << '\n'; @@ -380,18 +380,18 @@ std::optional to_error(simdjson::ondemand::value error) { auto parse_error = to_parse_error(code); assert(parse_error.has_value()); - auto line = error["line"].get_uint64().value(); - auto col = error["col"].get_uint64().value(); + auto line = std::get(error.at("line")); + auto col = std::get(error.at("col")); return Error{ parse_error.value(), {static_cast(line), static_cast(col)}, }; } -std::optional> to_errors(simdjson::ondemand::array errors) { +std::optional> to_errors(json::Array const &errors) { std::vector result; - for (auto error : errors) { - auto maybe_error = to_error(error.value()); + for (auto const &error : errors.values) { + auto maybe_error = to_error(std::get(error)); if (!maybe_error.has_value()) { return std::nullopt; } @@ -410,35 +410,43 @@ int main(int argc, char **argv) { return 1; } - auto json = simdjson::padded_string::load(argv[1]); - if (json.error() != simdjson::SUCCESS) { - std::cerr << "Error loading test file: " << json.error() << '\n'; + std::ifstream test_file{argv[1], std::fstream::in | std::fstream::binary}; + if (!test_file) { + std::cerr << "Failed to open test file '" << argv[1] << "'\n"; + return 1; + } + + std::string test_bytes{std::istreambuf_iterator(test_file), std::istreambuf_iterator()}; + + auto json = json::parse(test_bytes); + if (!json) { + std::cerr << "Error loading test file.\n"; return 1; } etest::Suite s; - simdjson::ondemand::parser parser; - simdjson::ondemand::document doc = parser.iterate(json); - auto tests = doc.find_field("tests").get_array().value(); - for (auto test : tests) { - auto name = test["description"].get_string().value(); + auto const &doc = std::get(*json); + auto const &tests = std::get(doc.at("tests")); + for (auto const &v : tests.values) { + auto const &test = std::get(v); + auto name = std::get(test.at("description")); // TOOD(robinlinden): Don't skip these. - if (test["doubleEscaped"].error() == simdjson::SUCCESS) { + if (test.contains("doubleEscaped")) { continue; } std::vector initial_states{html2::State::Data}; - if (test["initialStates"].error() == simdjson::SUCCESS) { + if (auto it = test.find("initialStates"); it != test.values.end()) { initial_states.clear(); - auto state_names = test["initialStates"].get_array().value(); - for (auto state_name : state_names) { - auto state = to_state(state_name.get_string().value()); + auto state_names = std::get(it->second); + for (auto const &state_name : state_names.values) { + auto state = to_state(std::get(state_name)); if (!state.has_value()) { - std::cerr << "Unhandled state: " << state_name.get_string().value() << '\n'; + std::cerr << "Unhandled state: " << std::get(state_name) << '\n'; return 1; } @@ -447,22 +455,22 @@ int main(int argc, char **argv) { } std::optional last_start_tag; - if (test["lastStartTag"].error() == simdjson::SUCCESS) { - last_start_tag = test["lastStartTag"].get_string().value(); + if (auto it = test.find("lastStartTag"); it != test.values.end()) { + last_start_tag = std::get(it->second); } - auto in = test["input"].get_string().value(); + auto in = std::get(test.at("input")); // TOOD(robinlinden): Don't skip these. // See: https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream if (in.contains('\r')) { continue; } - auto out_tokens = to_html2_tokens(test["output"].get_array().value()); + auto out_tokens = to_html2_tokens(std::get(test.at("output"))); std::vector out_errors; - if (test["errors"].error() == simdjson::SUCCESS) { - auto maybe_errors = to_errors(test["errors"].get_array().value()); + if (auto it = test.find("errors"); it != test.values.end()) { + auto maybe_errors = to_errors(std::get(it->second)); if (!maybe_errors.has_value()) { continue; } @@ -482,5 +490,3 @@ int main(int argc, char **argv) { return s.run(); } -// NOLINTEND(misc-include-cleaner) -// NOLINTEND(clang-analyzer-unix.Errno) diff --git a/json/BUILD b/json/BUILD new file mode 100644 index 00000000..7d25fb35 --- /dev/null +++ b/json/BUILD @@ -0,0 +1,38 @@ +load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test") +load("@rules_fuzzing//fuzzing:cc_defs.bzl", "cc_fuzz_test") +load("//bzl:copts.bzl", "HASTUR_COPTS", "HASTUR_FUZZ_PLATFORMS") + +cc_library( + name = "json", + hdrs = glob(["*.h"]), + copts = HASTUR_COPTS, + visibility = ["//visibility:public"], + deps = [ + "//unicode:util", + "//util:from_chars", + ], +) + +[cc_test( + name = src.removesuffix(".cpp"), + size = "small", + srcs = [src], + copts = HASTUR_COPTS, + deps = [ + ":json", + "//etest", + ], +) for src in glob( + include = ["*_test.cpp"], + exclude = ["*_fuzz_test.cpp"], +)] + +[cc_fuzz_test( + name = src.removesuffix(".cpp"), + size = "small", + testonly = True, + srcs = [src], + copts = HASTUR_COPTS, + target_compatible_with = HASTUR_FUZZ_PLATFORMS, + deps = [":%s" % src.removesuffix("_fuzz_test.cpp")], +) for src in glob(["*_fuzz_test.cpp"])] diff --git a/json/json.h b/json/json.h new file mode 100644 index 00000000..cdff6a69 --- /dev/null +++ b/json/json.h @@ -0,0 +1,501 @@ +// SPDX-FileCopyrightText: 2025 Robin Lindén +// +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef JSON_JSON_H_ +#define JSON_JSON_H_ + +#include "unicode/util.h" +#include "util/from_chars.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace json { + +struct Null { + constexpr bool operator==(Null const &) const = default; +}; + +struct Array; +struct Object; + +using Value = std::variant; + +struct Array { + std::vector values; + inline bool operator==(Array const &) const; +}; + +struct Object { + std::vector> values; + inline bool operator==(Object const &) const; + + [[nodiscard]] constexpr Value const &at(std::string_view key) const { + auto it = std::ranges::find(values, key, &decltype(values)::value_type::first); + assert(it != values.end()); + return it->second; + } + + [[nodiscard]] constexpr decltype(values)::const_iterator find(std::string_view key) const { + return std::ranges::find(values, key, &decltype(values)::value_type::first); + } + + [[nodiscard]] constexpr bool contains(std::string_view key) const { return find(key) != values.end(); } +}; + +// TODO(robinlinden): Clang 17 and 18 crash if these are = default. Clang 19 is fine. +inline bool Array::operator==(Array const &v) const { + return values == v.values; +} + +inline bool Object::operator==(Object const &v) const { + return values == v.values; +} + +// TODO(robinlinden): Make things more constexpr once we've dropped libc++ 17, 18. +// https://www.json.org/json-en.html +class Parser { +public: + explicit constexpr Parser(std::string_view json) : json_{json} {} + + constexpr bool is_eof() const { return pos_ >= json_.size(); } + + constexpr bool is_whitespace(char c) const { + switch (c) { + case 0x09: // '\t' + case 0x0A: // '\n' + case 0x0D: // '\r' + case 0x20: // ' ' + return true; + default: + return false; + } + } + + constexpr bool is_whitespace(std::optional c) const { return c && is_whitespace(*c); } + + constexpr std::optional peek() const { + if (is_eof()) { + return std::nullopt; + } + + return json_[pos_]; + } + + [[nodiscard]] constexpr std::optional consume() { + if (is_eof()) { + return std::nullopt; + } + + return json_[pos_++]; + } + + constexpr void skip_whitespace() { + while (!is_eof() && is_whitespace(peek())) { + std::ignore = consume(); + } + } + + std::optional parse() { + auto v = parse_value(); + skip_whitespace(); + + if (!is_eof()) { + return std::nullopt; + } + + return v; + } + + // NOLINTNEXTLINE(misc-no-recursion) + std::optional parse_value() { + skip_whitespace(); + auto c = peek(); + if (!c) { + return std::nullopt; + } + + if (*c == '-' || (*c >= '0' && *c <= '9')) { + return parse_number(); + } + + switch (*c) { + case '"': + return parse_string(); + case 't': + return parse_true(); + case 'f': + return parse_false(); + case 'n': + return parse_null(); + case '[': + return parse_array(); + case '{': + return parse_object(); + default: + return std::nullopt; + } + } + + std::optional parse_number() { + std::string number; + if (auto c = peek(); c == '-') { + number.push_back('-'); + std::ignore = consume(); + } + + if (auto c = peek(); c == '0') { + number.push_back('0'); + std::ignore = consume(); + } else if (c >= '1' && c <= '9') { + assert(c.has_value()); // clang-tidy 19 needs some help here. + number.push_back(*c); + std::ignore = consume(); + + for (c = peek(); c >= '0' && c <= '9'; c = peek()) { + assert(c.has_value()); // clang-tidy 19 needs some help here. + number.push_back(*c); + std::ignore = consume(); + } + } else { + return std::nullopt; + } + + bool is_floating_point = false; + if (peek() == '.') { + number.push_back('.'); + std::ignore = consume(); + is_floating_point = true; + + auto c = peek(); + if (!c || *c < '0' || *c > '9') { + return std::nullopt; + } + + number.push_back(*c); + std::ignore = consume(); + + while ((c = peek())) { + if (*c >= '0' && *c <= '9') { + number.push_back(*c); + std::ignore = consume(); + continue; + } + + break; + } + } + + if (auto c = peek(); c == 'e' || c == 'E') { + number.push_back(*c); + std::ignore = consume(); + is_floating_point = true; + + if (c = peek(); c == '+' || c == '-') { + number.push_back(*c); + std::ignore = consume(); + } + + if (c = peek(); !c || *c < '0' || *c > '9') { + return std::nullopt; + } + + number.push_back(*c); + std::ignore = consume(); + + while ((c = peek())) { + if (*c >= '0' && *c <= '9') { + number.push_back(*c); + std::ignore = consume(); + continue; + } + + break; + } + } + + if (!is_floating_point) { + std::int64_t value{}; + if (auto [p, ec] = std::from_chars(number.data(), number.data() + number.size(), value); + ec != std::errc{} || p != number.data() + number.size()) { + return std::nullopt; + } + + return Value{value}; + } + + double value{}; + if (auto [p, ec] = util::from_chars(number.data(), number.data() + number.size(), value); + ec != std::errc{} || p != number.data() + number.size()) { + return std::nullopt; + } + + return Value{value}; + } + + // NOLINTNEXTLINE(misc-no-recursion) + std::optional parse_object() { + std::ignore = consume(); // '{' + skip_whitespace(); + + if (peek() == '}') { + std::ignore = consume(); + return Object{}; + } + + Object object; + while (true) { + skip_whitespace(); + + auto key = parse_string(); + if (!key) { + return std::nullopt; + } + + skip_whitespace(); + if (consume() != ':') { + return std::nullopt; + } + + auto value = parse_value(); + if (!value) { + return std::nullopt; + } + + object.values.emplace_back(std::get(*std::move(key)), *std::move(value)); + skip_whitespace(); + + auto c = peek(); + if (!c) { + return std::nullopt; + } + + if (*c == ',') { + std::ignore = consume(); + continue; + } + + if (*c == '}') { + std::ignore = consume(); + return object; + } + + return std::nullopt; + } + } + + // NOLINTNEXTLINE(misc-no-recursion) + std::optional parse_array() { + std::ignore = consume(); // '[' + skip_whitespace(); + + if (peek() == ']') { + std::ignore = consume(); + return Array{}; + } + + Array array; + while (true) { + auto v = parse_value(); + if (!v) { + return std::nullopt; + } + + array.values.push_back(*std::move(v)); + skip_whitespace(); + + auto c = peek(); + if (!c) { + return std::nullopt; + } + + if (*c == ',') { + std::ignore = consume(); + continue; + } + + if (*c == ']') { + std::ignore = consume(); + return array; + } + + return std::nullopt; + } + } + + std::optional parse_true() { + std::ignore = consume(); // 't' + auto r = consume(); + auto u = consume(); + auto e = consume(); + if (r != 'r' || u != 'u' || e != 'e') { + return std::nullopt; + } + + return Value{true}; + } + + std::optional parse_false() { + std::ignore = consume(); // 'f' + auto a = consume(); + auto l = consume(); + auto s = consume(); + auto e = consume(); + if (a != 'a' || l != 'l' || s != 's' || e != 'e') { + return std::nullopt; + } + + return Value{false}; + } + + std::optional parse_null() { + std::ignore = consume(); // 'n' + auto u = consume(); + auto l1 = consume(); + auto l2 = consume(); + if (u != 'u' || l1 != 'l' || l2 != 'l') { + return std::nullopt; + } + + return Value{Null{}}; + } + + std::optional parse_string() { + std::string value; + if (consume() != '"') { + return std::nullopt; + } + + while (auto c = consume()) { + if (*c == '"') { + return value; + } + + if (*c == '\\') { + auto escaped = consume(); + if (!escaped) { + return std::nullopt; + } + + switch (*escaped) { + case '"': + value.push_back('"'); + break; + case '\\': + value.push_back('\\'); + break; + case '/': + value.push_back('/'); + break; + case 'b': + value.push_back('\b'); + break; + case 'f': + value.push_back('\f'); + break; + case 'n': + value.push_back('\n'); + break; + case 'r': + value.push_back('\r'); + break; + case 't': + value.push_back('\t'); + break; + case 'u': { + auto code_unit = parse_utf16_escaped_hex(); + if (!code_unit) { + return std::nullopt; + } + + if (unicode::is_high_surrogate(*code_unit)) { + if (consume() != '\\' || consume() != 'u') { + return std::nullopt; + } + + auto low_surrogate = parse_utf16_escaped_hex(); + if (!low_surrogate || !unicode::is_low_surrogate(*low_surrogate)) { + return std::nullopt; + } + + auto code_point = unicode::utf16_surrogate_pair_to_code_point(*code_unit, *low_surrogate); + if (!code_point) { + return std::nullopt; + } + + auto utf8 = unicode::to_utf8(*code_point); + // The only error-checking in to_utf8 is to make sure + // that the code point isn't too large. Not possible + // when there are only 4 digits. + assert(!utf8.empty()); + + value += utf8; + break; + } + + auto utf8 = unicode::utf16_to_utf8(*code_unit); + if (!utf8) { + return std::nullopt; + } + + value += *utf8; + break; + } + default: + return std::nullopt; + } + + continue; + } + + value.push_back(*c); + } + + return std::nullopt; + } + + // This *only* parses the 4 hex digits after the \u. + std::optional parse_utf16_escaped_hex() { + std::string hex; + for (int i = 0; i < 4; ++i) { + auto hex_digit = consume(); + if (!hex_digit) { + return std::nullopt; + } + + hex.push_back(*hex_digit); + } + + std::uint16_t code_unit{}; + if (auto [p, ec] = std::from_chars(hex.data(), hex.data() + hex.size(), code_unit, 16); + ec != std::errc{} || p != hex.data() + hex.size()) { + return std::nullopt; + } + + return code_unit; + } + +private: + std::string_view json_; + std::size_t pos_{0}; +}; + +inline std::optional parse(std::string_view json) { + return Parser{json}.parse(); +} + +} // namespace json + +#endif diff --git a/json/json_fuzz_test.cpp b/json/json_fuzz_test.cpp new file mode 100644 index 00000000..ae403209 --- /dev/null +++ b/json/json_fuzz_test.cpp @@ -0,0 +1,17 @@ +// SPDX-FileCopyrightText: 2025 Robin Lindén +// +// SPDX-License-Identifier: BSD-2-Clause + +#include "json/json.h" + +#include +#include +#include +#include + +extern "C" int LLVMFuzzerTestOneInput(std::uint8_t const *data, std::size_t size); + +extern "C" int LLVMFuzzerTestOneInput(std::uint8_t const *data, std::size_t size) { + std::ignore = json::parse(std::string_view{reinterpret_cast(data), size}); + return 0; +} diff --git a/json/json_test.cpp b/json/json_test.cpp new file mode 100644 index 00000000..22933dae --- /dev/null +++ b/json/json_test.cpp @@ -0,0 +1,146 @@ +// SPDX-FileCopyrightText: 2025 Robin Lindén +// +// SPDX-License-Identifier: BSD-2-Clause + +#include "json/json.h" + +#include "etest/etest2.h" + +#include +#include + +int main() { + using json::Value; + etest::Suite s{}; + + s.add_test("bad input", [](etest::IActions &a) { + a.expect_eq(json::parse(""), std::nullopt); + a.expect_eq(json::parse(","), std::nullopt); + }); + + s.add_test("string", [](etest::IActions &a) { + a.expect_eq(json::parse(R"("hello")"), json::Value{"hello"}); + a.expect_eq(json::parse(R"( "hello" )"), json::Value{"hello"}); + a.expect_eq(json::parse("\t\n\r \"hello\"\t\n\r "), json::Value{"hello"}); + a.expect_eq(json::parse(R"("hello",)"), std::nullopt); + a.expect_eq(json::parse(R"("")"), json::Value{""}); + a.expect_eq(json::parse(R"("hello)"), std::nullopt); + a.expect_eq(json::parse(R"(")"), std::nullopt); + }); + + s.add_test("string, escapes", [](etest::IActions &a) { + a.expect_eq(json::parse(R"("hello\n")"), json::Value{"hello\n"}); + a.expect_eq(json::parse(R"("hello\"")"), json::Value{"hello\""}); + a.expect_eq(json::parse(R"("hello\\")"), json::Value{"hello\\"}); + a.expect_eq(json::parse(R"("hello\/")"), json::Value{"hello/"}); + a.expect_eq(json::parse(R"("hello\b")"), json::Value{"hello\b"}); + a.expect_eq(json::parse(R"("hello\f")"), json::Value{"hello\f"}); + a.expect_eq(json::parse(R"("hello\r")"), json::Value{"hello\r"}); + a.expect_eq(json::parse(R"("hello\t")"), json::Value{"hello\t"}); + a.expect_eq(json::parse(R"("hello\u0041")"), json::Value{"helloA"}); + a.expect_eq(json::parse(R"("hello\u004120")"), json::Value{"helloA20"}); + + a.expect_eq(json::parse(R"("hello\u")"), std::nullopt); + a.expect_eq(json::parse(R"("hello\u004")"), std::nullopt); + a.expect_eq(json::parse(R"("hello\u004G")"), std::nullopt); + + a.expect_eq(json::parse(R"("hello\p")"), std::nullopt); + + // Surrogates. + a.expect_eq(json::parse(R"("\uD852\uDF62")"), json::Value{"𤭢"}); + a.expect_eq(json::parse(R"("\uD83D")"), std::nullopt); + a.expect_eq(json::parse(R"("\uDE00")"), std::nullopt); + }); + + s.add_test("true", [](etest::IActions &a) { + a.expect_eq(json::parse("true"), json::Value{true}); + a.expect_eq(json::parse("tru0"), std::nullopt); + a.expect_eq(json::parse("tr00"), std::nullopt); + a.expect_eq(json::parse("t000"), std::nullopt); + a.expect_eq(json::parse("true!"), std::nullopt); + }); + + s.add_test("false", [](etest::IActions &a) { + a.expect_eq(json::parse("false"), json::Value{false}); + a.expect_eq(json::parse("fals0"), std::nullopt); + a.expect_eq(json::parse("fal00"), std::nullopt); + a.expect_eq(json::parse("fa000"), std::nullopt); + a.expect_eq(json::parse("f0000"), std::nullopt); + a.expect_eq(json::parse("false!"), std::nullopt); + }); + + s.add_test("null", [](etest::IActions &a) { + a.expect_eq(json::parse("null"), json::Value{json::Null{}}); + a.expect_eq(json::parse("nul0"), std::nullopt); + a.expect_eq(json::parse("nu00"), std::nullopt); + a.expect_eq(json::parse("n000"), std::nullopt); + a.expect_eq(json::parse("null!"), std::nullopt); + }); + + s.add_test("array", [](etest::IActions &a) { + a.expect_eq(json::parse("[]"), Value{json::Array{}}); + a.expect_eq(json::parse("[ ]"), Value{json::Array{}}); + a.expect_eq(json::parse(R"(["1"])"), Value{json::Array{{Value{"1"}}}}); + a.expect_eq(json::parse(R"([null, true, "hello", false, []])"), + Value{json::Array{ + {Value{json::Null{}}, Value{true}, Value{"hello"}, Value{false}, Value{json::Array{}}}, + }}); + + a.expect_eq(json::parse("["), std::nullopt); + a.expect_eq(json::parse("[blah"), std::nullopt); + a.expect_eq(json::parse("[null"), std::nullopt); + a.expect_eq(json::parse("[null,"), std::nullopt); + }); + + s.add_test("object", [](etest::IActions &a) { + a.expect_eq(json::parse("{}"), Value{json::Object{}}); + a.expect_eq(json::parse("{ }"), Value{json::Object{}}); + a.expect_eq(json::parse(R"({"key": "value"})"), Value{json::Object{{{"key", Value{"value"}}}}}); + a.expect_eq(json::parse(R"({"key": "value", "key2": "value2"})"), + Value{json::Object{{{"key", Value{"value"}}, {"key2", Value{"value2"}}}}}); + a.expect_eq(json::parse(R"({"key": true, "key2": "value2", "key3": false})"), + Value{json::Object{{{"key", Value{true}}, {"key2", Value{"value2"}}, {"key3", Value{false}}}}}); + + a.expect_eq(json::parse(R"({"key": {"key": "value"}})"), + Value{json::Object{{{"key", Value{json::Object{{{"key", Value{"value"}}}}}}}}}); + + a.expect_eq(json::parse("{"), std::nullopt); + a.expect_eq(json::parse("{blah"), std::nullopt); + a.expect_eq(json::parse("{null"), std::nullopt); + a.expect_eq(json::parse(R"({"key")"), std::nullopt); + a.expect_eq(json::parse(R"({"key":)"), std::nullopt); + a.expect_eq(json::parse(R"({"key":asdf)"), std::nullopt); + a.expect_eq(json::parse(R"({"key":true)"), std::nullopt); + a.expect_eq(json::parse(R"({"key":true,)"), std::nullopt); + a.expect_eq(json::parse(R"({"key":true})"), Value{json::Object{{{"key", Value{true}}}}}); + }); + + s.add_test("object helpers", [](etest::IActions &a) { + json::Object o{{{"key", Value{"value"}}}}; + + a.expect(o.contains("key")); + a.expect_eq(o.at("key"), Value{"value"}); + a.expect_eq(o.find("key"), std::ranges::find(o.values, "key", &decltype(o.values)::value_type::first)); + a.expect_eq(o.find("blah"), std::ranges::find(o.values, "end", &decltype(o.values)::value_type::first)); + }); + + s.add_test("numbers", [](etest::IActions &a) { + a.expect_eq(json::parse("0"), Value{0}); + a.expect_eq(json::parse("1"), Value{1}); + a.expect_eq(json::parse("123"), Value{123}); + a.expect_eq(json::parse("123.456"), Value{123.456}); + a.expect_eq(json::parse("-0"), Value{-0}); + a.expect_eq(json::parse("-1"), Value{-1}); + a.expect_eq(json::parse("-123"), Value{-123}); + a.expect_eq(json::parse("-123.456"), Value{-123.456}); + a.expect_eq(json::parse("0.123"), Value{0.123}); + a.expect_eq(json::parse("0.123e4"), Value{0.123e4}); + a.expect_eq(json::parse("0.123e-4"), Value{0.123e-4}); + a.expect_eq(json::parse("0.123e+4"), Value{0.123e+4}); + + a.expect_eq(json::parse("0.123e456"), std::nullopt); // out-of-range + a.expect_eq(json::parse("123."), std::nullopt); + }); + + return s.run(); +} diff --git a/unicode/util.h b/unicode/util.h index c797b377..98b47065 100644 --- a/unicode/util.h +++ b/unicode/util.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2022-2024 Robin Lindén +// SPDX-FileCopyrightText: 2022-2025 Robin Lindén // // SPDX-License-Identifier: BSD-2-Clause @@ -65,9 +65,33 @@ constexpr std::string to_utf8(std::uint32_t code_point) { } } +constexpr bool is_high_surrogate(std::uint32_t code_point) { + return code_point >= 0xD800 && code_point <= 0xDBFF; +} + +constexpr bool is_low_surrogate(std::uint32_t code_point) { + return code_point >= 0xDC00 && code_point <= 0xDFFF; +} + // https://infra.spec.whatwg.org/#surrogate constexpr bool is_surrogate(std::uint32_t code_point) { - return code_point >= 0xD800 && code_point <= 0xDFFF; + return is_high_surrogate(code_point) || is_low_surrogate(code_point); +} + +constexpr std::optional utf16_surrogate_pair_to_code_point(std::uint16_t high, std::uint16_t low) { + if (!is_high_surrogate(high) || !is_low_surrogate(low)) { + return std::nullopt; + } + + return 0x10000 + ((high & 0x3FF) << 10) + (low & 0x3FF); +} + +constexpr std::optional utf16_to_utf8(std::uint16_t code_unit) { + if (is_surrogate(code_unit)) { + return std::nullopt; + } + + return to_utf8(code_unit); } // https://infra.spec.whatwg.org/#noncharacter diff --git a/unicode/util_test.cpp b/unicode/util_test.cpp index 490181e5..1da8e31f 100644 --- a/unicode/util_test.cpp +++ b/unicode/util_test.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2022-2024 Robin Lindén +// SPDX-FileCopyrightText: 2022-2025 Robin Lindén // // SPDX-License-Identifier: BSD-2-Clause @@ -53,10 +53,26 @@ int main() { s.add_test("is_surrogate", [](etest::IActions &a) { a.expect(!is_surrogate(0xD799)); + a.expect(!is_high_surrogate(0xD799)); + a.expect(!is_low_surrogate(0xD799)); + a.expect(is_surrogate(0xD800)); // First leading surrogate. a.expect(is_surrogate(0xDBFF)); // Last leading surrogate. + a.expect(is_high_surrogate(0xD800)); + a.expect(is_high_surrogate(0xDBFF)); + + a.expect(!is_low_surrogate(0xD800)); + a.expect(!is_low_surrogate(0xDBFF)); + a.expect(is_surrogate(0xDC00)); // First trailing surrogate. a.expect(is_surrogate(0xDFFF)); // Last trailing surrogate. + a.expect(is_low_surrogate(0xDC00)); + a.expect(is_low_surrogate(0xDFFF)); + + a.expect(!is_high_surrogate(0xDC00)); + a.expect(!is_high_surrogate(0xDFFF)); + a.expect(!is_high_surrogate(0xE000)); + a.expect(!is_low_surrogate(0xE000)); a.expect(!is_surrogate(0xE000)); }); @@ -128,5 +144,41 @@ int main() { a.expect_eq(into_code_points("\xe1\xa8\x9f"sv), std::vector{0x1a1f}); }); + s.add_test("utf16_surrogate_pair_to_code_point", [](etest::IActions &a) { + a.expect_eq(utf16_surrogate_pair_to_code_point(0xD800, 0xDC00), 0x10000U); + a.expect_eq(utf16_surrogate_pair_to_code_point(0xDBFF, 0xDFFF), 0x10FFFFU); + + // Invalid pairs. + a.expect_eq(utf16_surrogate_pair_to_code_point(0xD800, 0xDBFF), std::nullopt); + a.expect_eq(utf16_surrogate_pair_to_code_point(0xDBFF, 0xDBFF), std::nullopt); + a.expect_eq(utf16_surrogate_pair_to_code_point(0xDC00, 0xD800), std::nullopt); + a.expect_eq(utf16_surrogate_pair_to_code_point(0xDFFF, 0xDBFF), std::nullopt); + + // Non-surrogate pairs. + a.expect_eq(utf16_surrogate_pair_to_code_point(0x0000, 0x0000), std::nullopt); + a.expect_eq(utf16_surrogate_pair_to_code_point(0x0000, 0xFFFF), std::nullopt); + a.expect_eq(utf16_surrogate_pair_to_code_point(0xFFFF, 0x0000), std::nullopt); + a.expect_eq(utf16_surrogate_pair_to_code_point(0xFFFF, 0xFFFF), std::nullopt); + a.expect_eq(utf16_surrogate_pair_to_code_point(0x42, 0x42), std::nullopt); + }); + + s.add_test("utf16_to_utf8", [](etest::IActions &a) { + a.expect_eq(utf16_to_utf8(0x002f), "/"sv); + + a.expect_eq(utf16_to_utf8(0x00a3), "£"sv); + a.expect_eq(utf16_to_utf8(0x07f9), "߹"sv); + + a.expect_eq(utf16_to_utf8(0x0939), "ह"sv); + a.expect_eq(utf16_to_utf8(0x20ac), "€"sv); + a.expect_eq(utf16_to_utf8(0xd55c), "한"sv); + a.expect_eq(utf16_to_utf8(0xfffd), "�"sv); + + // Lone surrogates. + a.expect_eq(utf16_to_utf8(0xD800), std::nullopt); + a.expect_eq(utf16_to_utf8(0xDBFF), std::nullopt); + a.expect_eq(utf16_to_utf8(0xDC00), std::nullopt); + a.expect_eq(utf16_to_utf8(0xDFFF), std::nullopt); + }); + return s.run(); }