From 71dff132862108f2620c5b7b14a3bc65bb7b6096 Mon Sep 17 00:00:00 2001 From: Robin Linden Date: Sat, 14 Oct 2023 13:08:58 +0200 Subject: [PATCH 1/3] ci: Add a Python code formatting job --- .github/workflows/ci.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c4abf84e..63d447f1 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -356,6 +356,15 @@ jobs: - run: pip install gitlint==0.19.1 - run: gitlint --commits origin/master.. + # https://github.com/psf/black + black: + runs-on: ubuntu-22.04 + timeout-minutes: 30 + steps: + - uses: actions/checkout@v4 + - run: pip install black==23.11.0 + - run: black --check --diff . + concurrency: group: ${{ github.head_ref || github.run_id }} cancel-in-progress: true From 68ffafb271947a64af4fe66b73928231950748d2 Mon Sep 17 00:00:00 2001 From: Robin Linden Date: Mon, 11 Mar 2024 22:12:45 +0100 Subject: [PATCH 2/3] ci: Add a Python type-checking job --- .github/workflows/ci.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 63d447f1..9c60861d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -365,6 +365,19 @@ jobs: - run: pip install black==23.11.0 - run: black --check --diff . + # https://github.com/python/mypy + mypy: + runs-on: ubuntu-22.04 + timeout-minutes: 30 + steps: + - uses: actions/checkout@v4 + # mypy gets upset about \ in f-strings if Python is too old. + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - run: pip install mypy==1.9.0 + - run: mypy --strict $(find . 
-name "*.py") + concurrency: group: ${{ github.head_ref || github.run_id }} cancel-in-progress: true From 4ca9f900f74870969084aabd2dd4f0600f4782f6 Mon Sep 17 00:00:00 2001 From: Robin Linden Date: Sat, 14 Oct 2023 12:42:10 +0200 Subject: [PATCH 3/3] idna: Process the unicode UTS46-mappings into something usable in C++ --- WORKSPACE | 9 +- idna/BUILD | 16 ++- idna/idna_data_processor.py | 278 ++++++++++++++++++++++++++++++++++++ idna/idna_data_test.cpp | 21 +++ 4 files changed, 322 insertions(+), 2 deletions(-) create mode 100644 idna/idna_data_processor.py create mode 100644 idna/idna_data_test.cpp diff --git a/WORKSPACE b/WORKSPACE index dd66b98e..297ed5b9 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,4 +1,4 @@ -load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive", "http_file") # Bazel # ========================================================= @@ -150,6 +150,13 @@ http_archive( url = "https://github.com/unicode-org/icu/archive/refs/tags/release-74-2.tar.gz", ) +# https://www.unicode.org/Public/idna/ +http_file( + name = "idna_mapping_table", + sha256 = "402cbd285f1f952fcd0834b63541d54f69d3d8f1b8f8599bf71a1a14935f82c4", + url = "https://www.unicode.org/Public/idna/15.1.0/IdnaMappingTable.txt", +) + # https://github.com/ocornut/imgui http_archive( name = "imgui", # MIT diff --git a/idna/BUILD b/idna/BUILD index 9496c830..af7ad657 100644 --- a/idna/BUILD +++ b/idna/BUILD @@ -1,9 +1,23 @@ load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test") +load("@rules_python//python:defs.bzl", "py_binary") load("//bzl:copts.bzl", "HASTUR_COPTS") +py_binary( + name = "idna_data_processor", + srcs = ["idna_data_processor.py"], +) + +genrule( + name = "generate_idna_data", + srcs = ["@idna_mapping_table//file"], + outs = ["idna_data.h"], + cmd = "$(location :idna_data_processor) $(location @idna_mapping_table//file) >$@", + tools = [":idna_data_processor"], +) + cc_library( name = "idna", - hdrs = 
glob(["*.h"]), + hdrs = [":generate_idna_data"] + glob(["*.h"]), copts = HASTUR_COPTS, visibility = ["//visibility:public"], deps = ["//util:unicode"], diff --git a/idna/idna_data_processor.py b/idna/idna_data_processor.py new file mode 100644 index 00000000..1c6792cb --- /dev/null +++ b/idna/idna_data_processor.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 + +# SPDX-FileCopyrightText: 2023-2024 Robin Lindén +# +# SPDX-License-Identifier: BSD-2-Clause + +import dataclasses +import sys +import textwrap +import typing + + +@dataclasses.dataclass +class Disallowed: + pass + + +@dataclasses.dataclass +class DisallowedStd3Valid: + pass + + +@dataclasses.dataclass +class DisallowedStd3Mapped: + maps_to: typing.List[int] + + @staticmethod + def from_string(s: str) -> "DisallowedStd3Mapped": + return DisallowedStd3Mapped([int(n, base=16) for n in s.split(" ")]) + + +@dataclasses.dataclass +class Ignored: + pass + + +@dataclasses.dataclass +class Mapped: + maps_to: typing.List[int] + + @staticmethod + def from_string(s: str) -> "Mapped": + return Mapped([int(n, base=16) for n in s.split(" ")]) + + +@dataclasses.dataclass +class Deviation: + maps_to: typing.List[int] + + @staticmethod + def from_string(s: str) -> "Deviation": + return Deviation([int(n, base=16) for n in s.split(" ") if len(n) > 0]) + + +@dataclasses.dataclass +class Valid: + pass + + +@dataclasses.dataclass +class ValidNv8: + pass + + +@dataclasses.dataclass +class ValidXv8: + pass + + +Mapping = ( + Disallowed + | DisallowedStd3Valid + | DisallowedStd3Mapped + | Ignored + | Mapped + | Deviation + | Valid + | ValidNv8 + | ValidXv8 +) + + +class IDNA: + def __init__(self) -> None: + # List of each code point starting a new mapping. I.e. if code point 1 + # and 2 are disallowed, and 3 is valid, this list will be + # `[(1, Disallowed), (3, Valid)]`. 
+ self.mappings: typing.List[tuple[int, Mapping]] = [] + + # https://www.unicode.org/reports/tr46/#Table_Data_File_Fields + @staticmethod + def from_table(table_rows: typing.List[str]) -> "IDNA": + mappings: typing.List[tuple[int, Mapping]] = [] + for row in table_rows: + # Drop the trailing comment about what code point this is. + row = row.split("#")[0].strip() + + cols = [col.strip() for col in row.split(";")] + # Some rows are blank or just a comment. + if len(cols) <= 1: + continue + + code_point = int(cols[0].split("..")[0].lstrip("0") or "0", 16) + status = cols[1] + if status == "disallowed": + assert len(cols) == 2 + if len(mappings) > 0 and isinstance(mappings[-1][1], Disallowed): + continue + mappings.append((code_point, Disallowed())) + elif status == "disallowed_STD3_valid": + assert len(cols) == 2 + if len(mappings) > 0 and isinstance( + mappings[-1][1], DisallowedStd3Valid + ): + continue + mappings.append((code_point, DisallowedStd3Valid())) + elif status == "disallowed_STD3_mapped": + assert len(cols) == 3 + if len(mappings) > 0 and mappings[-1][ + 1 + ] == DisallowedStd3Mapped.from_string(cols[2]): + continue + mappings.append((code_point, DisallowedStd3Mapped.from_string(cols[2]))) + elif status == "ignored": + assert len(cols) == 2 + if len(mappings) > 0 and isinstance(mappings[-1][1], Ignored): + continue + mappings.append((code_point, Ignored())) + elif status == "mapped": + assert len(cols) == 3 + if len(mappings) > 0 and mappings[-1][1] == Mapped.from_string(cols[2]): + continue + mappings.append((code_point, Mapped.from_string(cols[2]))) + elif status == "deviation": + assert len(cols) == 3 + if len(mappings) > 0 and mappings[-1][1] == Deviation.from_string( + cols[2] + ): + continue + mappings.append((code_point, Deviation.from_string(cols[2]))) + elif status == "valid" and len(cols) == 2: + if len(mappings) > 0 and isinstance(mappings[-1][1], Valid): + continue + mappings.append((code_point, Valid())) + elif status == "valid" and 
len(cols) == 4 and cols[3] == "NV8": + if len(mappings) > 0 and isinstance(mappings[-1][1], ValidNv8): + continue + mappings.append((code_point, ValidNv8())) + elif status == "valid" and len(cols) == 4 and cols[3] == "XV8": + if len(mappings) > 0 and isinstance(mappings[-1][1], ValidXv8): + continue + mappings.append((code_point, ValidXv8())) + else: + raise Exception(f"Unable to parse data: {cols}") + + idna = IDNA() + idna.mappings = mappings + return idna + + +def to_cxx_variant(a: Mapping) -> str: + if isinstance(a, Disallowed): + return "Disallowed{}" + elif isinstance(a, DisallowedStd3Valid): + return "DisallowedStd3Valid{}" + elif isinstance(a, DisallowedStd3Mapped): + mapping = "".join(f"\\U{c:08X}" for c in a.maps_to) + return f'DisallowedStd3Mapped{{"{mapping}"}}' + elif isinstance(a, Ignored): + return "Ignored{}" + elif isinstance(a, Mapped): + mapping = "".join(f"\\U{c:08X}" for c in a.maps_to) + return f'Mapped{{"{mapping}"}}' + elif isinstance(a, Deviation): + mapping = "".join(f"\\U{c:08X}" for c in a.maps_to) + return f'Deviation{{"{mapping}"}}' + elif isinstance(a, Valid): + return "Valid{}" + elif isinstance(a, ValidNv8): + return "ValidNv8{}" + elif isinstance(a, ValidXv8): + return "ValidXv8{}" + else: + raise Exception(f"Unknown mapping: {a}") + + +if __name__ == "__main__": + if len(sys.argv) != 2: + print( + f"Usage: {sys.argv[0]} <IdnaMappingTable.txt>", + file=sys.stderr, + ) + sys.exit(1) + + with open(sys.argv[1], encoding="utf-8") as table: + idna = IDNA.from_table(table.readlines()) + + sys.stdout.buffer.write( + textwrap.dedent( + f"""\ + // SPDX-FileCopyrightText: 2023-2024 Robin Lindén + // + // SPDX-License-Identifier: BSD-2-Clause + + // This file is generated. Do not touch it. 
+ + #ifndef IDNA_IDNA_DATA_H_ + #define IDNA_IDNA_DATA_H_ + // clang-format off + + #include <array> + #include <string_view> + #include <utility> + #include <variant> + + namespace idna::uts46 {{ + + struct Disallowed {{ + constexpr bool operator==(Disallowed const &) const = default; + }}; + + struct DisallowedStd3Valid {{ + constexpr bool operator==(DisallowedStd3Valid const &) const = default; + }}; + + struct DisallowedStd3Mapped {{ + std::string_view maps_to; + constexpr bool operator==(DisallowedStd3Mapped const &) const = default; + }}; + + struct Ignored {{ + constexpr bool operator==(Ignored const &) const = default; + }}; + + struct Mapped {{ + std::string_view maps_to; + constexpr bool operator==(Mapped const &) const = default; + }}; + + struct Deviation {{ + std::string_view maps_to; + constexpr bool operator==(Deviation const &) const = default; + }}; + + struct Valid {{ + constexpr bool operator==(Valid const &) const = default; + }}; + + struct ValidNv8 {{ + constexpr bool operator==(ValidNv8 const &) const = default; + }}; + + struct ValidXv8 {{ + constexpr bool operator==(ValidXv8 const &) const = default; + }}; + + using Mapping = std::variant< + Disallowed, + DisallowedStd3Valid, + DisallowedStd3Mapped, + Ignored, + Mapped, + Deviation, + Valid, + ValidNv8, + ValidXv8>; + + constexpr std::array<std::pair<char32_t, Mapping>, {len(idna.mappings)}> kMappings{{{{ + {",\n ".join("{" + str(c[0]) + ", Mapping{" + to_cxx_variant(c[1]) + "}}" for c in idna.mappings)} + }}}}; + + }} // namespace idna::uts46 + + // clang-format on + #endif + """ + ).encode() + ) diff --git a/idna/idna_data_test.cpp b/idna/idna_data_test.cpp new file mode 100644 index 00000000..46280c1d --- /dev/null +++ b/idna/idna_data_test.cpp @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: 2024 Robin Lindén +// +// SPDX-License-Identifier: BSD-2-Clause + +#include "idna/idna_data.h" + +#include "etest/etest2.h" + +using Entry = decltype(idna::uts46::kMappings)::value_type; + +// https://www.unicode.org/reports/tr46/#IDNA_Mapping_Table +int main() { + 
etest::Suite s{}; + + s.add_test("everything before ascii '-' is ~disallowed", [](etest::IActions &a) { + a.expect_eq(idna::uts46::kMappings.at(0), Entry{char32_t{0}, idna::uts46::DisallowedStd3Valid{}}); + a.expect_eq(idna::uts46::kMappings.at(1), Entry{char32_t{'-'}, idna::uts46::Valid{}}); + }); + + return s.run(); +}