Skip to content

Commit

Permalink
idna: Process the unicode UTS46-mappings into something usable in C++
Browse files Browse the repository at this point in the history
  • Loading branch information
robinlinden committed Mar 12, 2024
1 parent 68ffafb commit 4ca9f90
Show file tree
Hide file tree
Showing 4 changed files with 322 additions and 2 deletions.
9 changes: 8 additions & 1 deletion WORKSPACE
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive", "http_file")

# Bazel
# =========================================================
Expand Down Expand Up @@ -150,6 +150,13 @@ http_archive(
url = "https://github.com/unicode-org/icu/archive/refs/tags/release-74-2.tar.gz",
)

# https://www.unicode.org/Public/idna/
# Fetches the single IdnaMappingTable.txt data file for version 15.1.0. The
# download is pinned by its sha256 so the build fails if the file changes.
http_file(
    name = "idna_mapping_table",
    sha256 = "402cbd285f1f952fcd0834b63541d54f69d3d8f1b8f8599bf71a1a14935f82c4",
    url = "https://www.unicode.org/Public/idna/15.1.0/IdnaMappingTable.txt",
)

# https://github.com/ocornut/imgui
http_archive(
name = "imgui", # MIT
Expand Down
16 changes: 15 additions & 1 deletion idna/BUILD
Original file line number Diff line number Diff line change
@@ -1,9 +1,23 @@
load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
load("@rules_python//python:defs.bzl", "py_binary")
load("//bzl:copts.bzl", "HASTUR_COPTS")

# Script that turns the IDNA mapping table text file into a C++ header written
# to stdout. Used as a build tool by the genrule in this package.
py_binary(
    name = "idna_data_processor",
    srcs = ["idna_data_processor.py"],
)

# Produces idna_data.h by running :idna_data_processor on the downloaded
# mapping table and redirecting the tool's stdout into the output file ($@).
genrule(
    name = "generate_idna_data",
    srcs = ["@idna_mapping_table//file"],
    outs = ["idna_data.h"],
    cmd = "$(location :idna_data_processor) $(location @idna_mapping_table//file) >$@",
    tools = [":idna_data_processor"],
)

cc_library(
name = "idna",
hdrs = glob(["*.h"]),
hdrs = [":generate_idna_data"] + glob(["*.h"]),
copts = HASTUR_COPTS,
visibility = ["//visibility:public"],
deps = ["//util:unicode"],
Expand Down
278 changes: 278 additions & 0 deletions idna/idna_data_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
#!/usr/bin/env python3

# SPDX-FileCopyrightText: 2023-2024 Robin Lindén <[email protected]>
#
# SPDX-License-Identifier: BSD-2-Clause

import dataclasses
import sys
import textwrap
import typing


@dataclasses.dataclass
class Disallowed:
    """Marker for table rows whose status is ``disallowed``; carries no data."""


@dataclasses.dataclass
class DisallowedStd3Valid:
    """Marker for rows whose status is ``disallowed_STD3_valid``; carries no data."""


@dataclasses.dataclass
class DisallowedStd3Mapped:
    """Mapping for rows whose status is ``disallowed_STD3_mapped``."""

    # Replacement code points, in order, as integers.
    maps_to: typing.List[int]

    @staticmethod
    def from_string(s: str) -> "DisallowedStd3Mapped":
        """Parse whitespace-separated hexadecimal code points, e.g. ``"0020 0308"``.

        Runs of whitespace between code points are tolerated.

        Raises:
            ValueError: if ``s`` holds no code points, or a token is not
                valid hexadecimal.
        """
        # str.split() without arguments collapses repeated whitespace, so a
        # doubled space no longer yields an empty token that int() rejects.
        tokens = s.split()
        if not tokens:
            raise ValueError(f"Expected at least one code point, got {s!r}")
        return DisallowedStd3Mapped([int(token, 16) for token in tokens])


@dataclasses.dataclass
class Ignored:
    """Marker for table rows whose status is ``ignored``; carries no data."""


@dataclasses.dataclass
class Mapped:
    """Mapping for rows whose status is ``mapped``."""

    # Replacement code points, in order, as integers.
    maps_to: typing.List[int]

    @staticmethod
    def from_string(s: str) -> "Mapped":
        """Parse whitespace-separated hexadecimal code points, e.g. ``"0073 0073"``.

        Runs of whitespace between code points are tolerated.

        Raises:
            ValueError: if ``s`` holds no code points, or a token is not
                valid hexadecimal.
        """
        # str.split() without arguments collapses repeated whitespace, so a
        # doubled space no longer yields an empty token that int() rejects.
        tokens = s.split()
        if not tokens:
            raise ValueError(f"Expected at least one code point, got {s!r}")
        return Mapped([int(token, 16) for token in tokens])


@dataclasses.dataclass
class Deviation:
    """Mapping for rows whose status is ``deviation``."""

    # Replacement code points, in order, as integers; may be empty.
    maps_to: typing.List[int]

    @staticmethod
    def from_string(s: str) -> "Deviation":
        """Parse space-separated hexadecimal code points.

        Empty tokens are skipped, so an empty string yields an empty list.
        """
        code_points = []
        for token in s.split(" "):
            if token:
                code_points.append(int(token, base=16))
        return Deviation(code_points)


@dataclasses.dataclass
class Valid:
    """Marker for two-column table rows whose status is ``valid``; carries no data."""


@dataclasses.dataclass
class ValidNv8:
    """Marker for ``valid`` rows whose fourth column is ``NV8``; carries no data."""


@dataclasses.dataclass
class ValidXv8:
    """Marker for ``valid`` rows whose fourth column is ``XV8``; carries no data."""


# Union of every mapping-status class defined above. It is only used in type
# annotations in this file. Combining classes with `|` at runtime requires
# Python 3.10 or newer.
Mapping = (
    Disallowed
    | DisallowedStd3Valid
    | DisallowedStd3Mapped
    | Ignored
    | Mapped
    | Deviation
    | Valid
    | ValidNv8
    | ValidXv8
)


class IDNA:
    """Parsed IDNA mapping table, collapsed into runs of identical mappings."""

    def __init__(self) -> None:
        # One entry per code point at which the mapping changes. For example,
        # if code points 1 and 2 are disallowed and 3 is valid, the list is
        # `[(1, Disallowed), (3, Valid)]`.
        self.mappings: typing.List[tuple[int, Mapping]] = []

    @staticmethod
    def _parse_mapping(cols: typing.List[str]) -> "Mapping":
        """Build the mapping object described by one split, stripped table row.

        ``cols[1]`` is the status; the number of columns each status may have
        is checked with assertions, except for ``valid`` where it selects the
        variant. Raises ``Exception`` for any row that matches no known form.
        """
        status = cols[1]
        column_count = len(cols)

        if status == "disallowed":
            assert column_count == 2
            return Disallowed()
        if status == "disallowed_STD3_valid":
            assert column_count == 2
            return DisallowedStd3Valid()
        if status == "disallowed_STD3_mapped":
            assert column_count == 3
            return DisallowedStd3Mapped.from_string(cols[2])
        if status == "ignored":
            assert column_count == 2
            return Ignored()
        if status == "mapped":
            assert column_count == 3
            return Mapped.from_string(cols[2])
        if status == "deviation":
            assert column_count == 3
            return Deviation.from_string(cols[2])
        if status == "valid":
            if column_count == 2:
                return Valid()
            if column_count == 4 and cols[3] == "NV8":
                return ValidNv8()
            if column_count == 4 and cols[3] == "XV8":
                return ValidXv8()

        raise Exception(f"Unable to parse data: {cols}")

    # https://www.unicode.org/reports/tr46/#Table_Data_File_Fields
    @staticmethod
    def from_table(table_rows: typing.List[str]) -> "IDNA":
        """Parse the lines of the mapping table into an ``IDNA`` instance.

        A row is only recorded when its mapping differs from the most recently
        recorded one, so consecutive rows with equal mappings form a single
        entry keyed by the first code point of the run.
        """
        runs: typing.List[tuple[int, Mapping]] = []

        for line in table_rows:
            # Everything after '#' is a comment naming the code point.
            data = line.split("#")[0].strip()
            cols = [field.strip() for field in data.split(";")]

            # Blank and comment-only lines split into at most one column.
            if len(cols) <= 1:
                continue

            # Only the first code point of a `first..last` range is needed.
            range_start = cols[0].split("..")[0]
            code_point = int(range_start.lstrip("0") or "0", 16)

            mapping = IDNA._parse_mapping(cols)

            # Same mapping as the current run: extend it by recording nothing.
            if runs and runs[-1][1] == mapping:
                continue
            runs.append((code_point, mapping))

        parsed = IDNA()
        parsed.mappings = runs
        return parsed


def to_cxx_variant(a: Mapping) -> str:
    """Render a mapping as a C++ braced initializer for the same-named struct.

    Data-less mappings become e.g. ``Valid{}``. Mappings carrying code points
    become e.g. ``Mapped{"\\u0073\\u0073"}``, with every code point written as
    a universal-character-name escape inside a C++ string literal.

    Raises:
        Exception: if ``a`` is not one of the known mapping types.
    """

    def string_literal(code_points: typing.List[int]) -> str:
        # A C++ `\u` escape consumes exactly four hex digits, so a code point
        # above U+FFFF written as `\u10428` would be read as U+1042 followed by
        # a literal '8'. Those need the eight-digit `\U` form instead.
        escapes = (
            f"\\u{cp:04X}" if cp <= 0xFFFF else f"\\U{cp:08X}"
            for cp in code_points
        )
        return '"' + "".join(escapes) + '"'

    if isinstance(a, Disallowed):
        return "Disallowed{}"
    if isinstance(a, DisallowedStd3Valid):
        return "DisallowedStd3Valid{}"
    if isinstance(a, DisallowedStd3Mapped):
        return f"DisallowedStd3Mapped{{{string_literal(a.maps_to)}}}"
    if isinstance(a, Ignored):
        return "Ignored{}"
    if isinstance(a, Mapped):
        return f"Mapped{{{string_literal(a.maps_to)}}}"
    if isinstance(a, Deviation):
        return f"Deviation{{{string_literal(a.maps_to)}}}"
    if isinstance(a, Valid):
        return "Valid{}"
    if isinstance(a, ValidNv8):
        return "ValidNv8{}"
    if isinstance(a, ValidXv8):
        return "ValidXv8{}"
    raise Exception(f"Unknown mapping: {a}")


# Script entry point: read the mapping table named on the command line and
# write the generated C++ header to stdout.
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print(
            f"Usage: {sys.argv[0]} <IdnaMappingTable.txt>",
            file=sys.stderr,
        )
        sys.exit(1)

    # Decode explicitly as UTF-8 so parsing does not depend on the locale's
    # default encoding.
    with open(sys.argv[1], encoding="utf-8") as table:
        idna = IDNA.from_table(table.readlines())

    # Built outside the f-string below: a backslash inside an f-string
    # replacement field is a SyntaxError before Python 3.12.
    rendered_mappings = ",\n ".join(
        "{" + str(c[0]) + ", Mapping{" + to_cxx_variant(c[1]) + "}}"
        for c in idna.mappings
    )

    # Written as bytes through stdout's underlying buffer, so the output is
    # UTF-8 (the default for str.encode) whatever stdout's text encoding is.
    # In the template, doubled braces are literal C++ braces.
    sys.stdout.buffer.write(
        textwrap.dedent(
            f"""\
// SPDX-FileCopyrightText: 2023-2024 Robin Lindén <[email protected]>
//
// SPDX-License-Identifier: BSD-2-Clause
// This file is generated. Do not touch it.
#ifndef IDNA_IDNA_DATA_H_
#define IDNA_IDNA_DATA_H_
// clang-format off
#include <array>
#include <string_view>
#include <variant>
#include <utility>
namespace idna::uts46 {{
struct Disallowed {{
constexpr bool operator==(Disallowed const &) const = default;
}};
struct DisallowedStd3Valid {{
constexpr bool operator==(DisallowedStd3Valid const &) const = default;
}};
struct DisallowedStd3Mapped {{
std::string_view maps_to;
constexpr bool operator==(DisallowedStd3Mapped const &) const = default;
}};
struct Ignored {{
constexpr bool operator==(Ignored const &) const = default;
}};
struct Mapped {{
std::string_view maps_to;
constexpr bool operator==(Mapped const &) const = default;
}};
struct Deviation {{
std::string_view maps_to;
constexpr bool operator==(Deviation const &) const = default;
}};
struct Valid {{
constexpr bool operator==(Valid const &) const = default;
}};
struct ValidNv8 {{
constexpr bool operator==(ValidNv8 const &) const = default;
}};
struct ValidXv8 {{
constexpr bool operator==(ValidXv8 const &) const = default;
}};
using Mapping = std::variant<
Disallowed,
DisallowedStd3Valid,
DisallowedStd3Mapped,
Ignored,
Mapped,
Deviation,
Valid,
ValidNv8,
ValidXv8>;
constexpr std::array<std::pair<char32_t, Mapping>, {len(idna.mappings)}> kMappings{{{{
{rendered_mappings}
}}}};
}} // namespace idna::uts46
// clang-format on
#endif
"""
        ).encode()
    )
21 changes: 21 additions & 0 deletions idna/idna_data_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// SPDX-FileCopyrightText: 2024 Robin Lindén <[email protected]>
//
// SPDX-License-Identifier: BSD-2-Clause

#include "idna/idna_data.h"

#include "etest/etest2.h"

using Entry = decltype(idna::uts46::kMappings)::value_type;

// https://www.unicode.org/reports/tr46/#IDNA_Mapping_Table
int main() {
etest::Suite s{};

s.add_test("everything before ascii '-' is ~disallowed", [](etest::IActions &a) {
a.expect_eq(idna::uts46::kMappings.at(0), Entry{char32_t{0}, idna::uts46::DisallowedStd3Valid{}});
a.expect_eq(idna::uts46::kMappings.at(1), Entry{char32_t{'-'}, idna::uts46::Valid{}});
});

return s.run();
}

0 comments on commit 4ca9f90

Please sign in to comment.