Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

idna: Generate the mapping table needed for uts46 #728

Merged
merged 3 commits into from
Mar 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,28 @@ jobs:
- run: pip install gitlint==0.19.1
- run: gitlint --commits origin/master..

# https://github.com/psf/black
# Formatting gate: `--check` makes black report files it would reformat
# instead of rewriting them, and `--diff` prints the changes it would make.
black:
runs-on: ubuntu-22.04
timeout-minutes: 30
steps:
- uses: actions/checkout@v4
# Installed at an exact, pinned version.
- run: pip install black==23.11.0
- run: black --check --diff .

# https://github.com/python/mypy
# Type-checks every *.py file found below the checkout root in strict mode.
mypy:
runs-on: ubuntu-22.04
timeout-minutes: 30
steps:
- uses: actions/checkout@v4
# mypy gets upset about \ in f-strings if Python is too old: a backslash
# inside an f-string replacement field is only accepted from Python 3.12 on.
- uses: actions/setup-python@v5
with:
python-version: "3.12"
- run: pip install mypy==1.9.0
- run: mypy --strict $(find . -name "*.py")

# Runs are grouped by github.head_ref, falling back to the run id when
# head_ref is empty; a newer run in a group cancels the one still in progress.
concurrency:
group: ${{ github.head_ref || github.run_id }}
cancel-in-progress: true
9 changes: 8 additions & 1 deletion WORKSPACE
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive", "http_file")

# Bazel
# =========================================================
Expand Down Expand Up @@ -150,6 +150,13 @@ http_archive(
url = "https://github.com/unicode-org/icu/archive/refs/tags/release-74-2.tar.gz",
)

# https://www.unicode.org/Public/idna/
# The IDNA mapping table for Unicode 15.1.0, fetched as a single file and
# checked against the sha256 below. Referenced from BUILD files as
# @idna_mapping_table//file.
http_file(
name = "idna_mapping_table",
sha256 = "402cbd285f1f952fcd0834b63541d54f69d3d8f1b8f8599bf71a1a14935f82c4",
url = "https://www.unicode.org/Public/idna/15.1.0/IdnaMappingTable.txt",
)

# https://github.com/ocornut/imgui
http_archive(
name = "imgui", # MIT
Expand Down
16 changes: 15 additions & 1 deletion idna/BUILD
Original file line number Diff line number Diff line change
@@ -1,9 +1,23 @@
load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
load("@rules_python//python:defs.bzl", "py_binary")
load("//bzl:copts.bzl", "HASTUR_COPTS")

# Script that reads an IdnaMappingTable.txt given as its only argument and
# writes a C++ header to stdout.
py_binary(
name = "idna_data_processor",
srcs = ["idna_data_processor.py"],
)

# Runs :idna_data_processor on the downloaded mapping table and captures its
# stdout as idna_data.h.
genrule(
name = "generate_idna_data",
srcs = ["@idna_mapping_table//file"],
outs = ["idna_data.h"],
cmd = "$(location :idna_data_processor) $(location @idna_mapping_table//file) >$@",
tools = [":idna_data_processor"],
)

cc_library(
name = "idna",
hdrs = glob(["*.h"]),
hdrs = [":generate_idna_data"] + glob(["*.h"]),
copts = HASTUR_COPTS,
visibility = ["//visibility:public"],
deps = ["//util:unicode"],
Expand Down
278 changes: 278 additions & 0 deletions idna/idna_data_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
#!/usr/bin/env python3

# SPDX-FileCopyrightText: 2023-2024 Robin Lindén <[email protected]>
#
# SPDX-License-Identifier: BSD-2-Clause

import dataclasses
import sys
import textwrap
import typing


@dataclasses.dataclass
class Disallowed:
    """Table status `disallowed`; carries no data."""


@dataclasses.dataclass
class DisallowedStd3Valid:
    """Table status `disallowed_STD3_valid`; carries no data."""


@dataclasses.dataclass
class DisallowedStd3Mapped:
    """Table status `disallowed_STD3_mapped` together with its replacement."""

    # Replacement code points, in table order.
    maps_to: typing.List[int]

    @staticmethod
    def from_string(s: str) -> "DisallowedStd3Mapped":
        """Parse a single-space-separated run of hexadecimal code points.

        Raises:
            ValueError: if any token (including an empty one) is not hex.
        """
        code_points: typing.List[int] = []
        for hex_token in s.split(" "):
            code_points.append(int(hex_token, 16))
        return DisallowedStd3Mapped(maps_to=code_points)


@dataclasses.dataclass
class Ignored:
    """Table status `ignored`; carries no data."""


@dataclasses.dataclass
class Mapped:
    """Table status `mapped` together with its replacement."""

    # Replacement code points, in table order.
    maps_to: typing.List[int]

    @staticmethod
    def from_string(s: str) -> "Mapped":
        """Parse a single-space-separated run of hexadecimal code points.

        Raises:
            ValueError: if any token (including an empty one) is not hex.
        """
        hex_tokens = s.split(" ")
        return Mapped(maps_to=[int(token, 16) for token in hex_tokens])


@dataclasses.dataclass
class Deviation:
    """Table status `deviation` together with its (possibly empty) replacement."""

    # Replacement code points, in table order; may be empty.
    maps_to: typing.List[int]

    @staticmethod
    def from_string(s: str) -> "Deviation":
        """Parse space-separated hexadecimal code points, skipping empty tokens.

        An empty string therefore yields an empty replacement instead of
        raising.
        """
        code_points: typing.List[int] = []
        for hex_token in s.split(" "):
            if hex_token:
                code_points.append(int(hex_token, 16))
        return Deviation(maps_to=code_points)


@dataclasses.dataclass
class Valid:
    """Table status `valid` with no extra columns; carries no data."""


@dataclasses.dataclass
class ValidNv8:
    """Table status `valid` whose fourth column is `NV8`; carries no data."""


@dataclasses.dataclass
class ValidXv8:
    """Table status `valid` whose fourth column is `XV8`; carries no data."""


# Union of every status class a table row can be parsed into. The order here
# matches the alternatives of the std::variant in the generated C++ header.
Mapping = (
Disallowed
| DisallowedStd3Valid
| DisallowedStd3Mapped
| Ignored
| Mapped
| Deviation
| Valid
| ValidNv8
| ValidXv8
)


class IDNA:
    """Run-length view of the IDNA mapping table."""

    def __init__(self) -> None:
        # One entry per code point at which the mapping changes. E.g. if code
        # points 1 and 2 are disallowed and 3 is valid, this holds
        # `[(1, Disallowed), (3, Valid)]`.
        self.mappings: typing.List[tuple[int, Mapping]] = []

    # https://www.unicode.org/reports/tr46/#Table_Data_File_Fields
    @staticmethod
    def from_table(table_rows: typing.List[str]) -> "IDNA":
        """Build an IDNA from the lines of an IdnaMappingTable.txt.

        A row whose mapping equals that of the most recently stored entry is
        folded into it, so only the first code point of each run is kept.

        Raises:
            AssertionError: if a row has an unexpected number of columns for
                its status.
            Exception: if the status (or its extra columns) is not recognised.
        """
        result = IDNA()
        for raw_row in table_rows:
            # Everything after '#' is a comment describing the code point.
            data = raw_row.split("#")[0].strip()
            fields = [field.strip() for field in data.split(";")]

            # Blank and comment-only rows leave at most one (empty) field.
            if len(fields) <= 1:
                continue

            # For a range `XXXX..YYYY` only the first code point matters.
            first_code_point = int(fields[0].split("..")[0].lstrip("0") or "0", 16)
            status = fields[1]

            entry: Mapping
            if status == "disallowed":
                assert len(fields) == 2
                entry = Disallowed()
            elif status == "disallowed_STD3_valid":
                assert len(fields) == 2
                entry = DisallowedStd3Valid()
            elif status == "disallowed_STD3_mapped":
                assert len(fields) == 3
                entry = DisallowedStd3Mapped.from_string(fields[2])
            elif status == "ignored":
                assert len(fields) == 2
                entry = Ignored()
            elif status == "mapped":
                assert len(fields) == 3
                entry = Mapped.from_string(fields[2])
            elif status == "deviation":
                assert len(fields) == 3
                entry = Deviation.from_string(fields[2])
            elif status == "valid" and len(fields) == 2:
                entry = Valid()
            elif status == "valid" and len(fields) == 4 and fields[3] == "NV8":
                entry = ValidNv8()
            elif status == "valid" and len(fields) == 4 and fields[3] == "XV8":
                entry = ValidXv8()
            else:
                raise Exception(f"Unable to parse data: {fields}")

            # Dataclass equality is same-class-and-same-fields, so this one
            # check covers both the data-less and the data-carrying statuses.
            if result.mappings and result.mappings[-1][1] == entry:
                continue
            result.mappings.append((first_code_point, entry))

        return result


def to_cxx_variant(a: Mapping) -> str:
    """Render a mapping as the C++ initializer of the matching uts46 struct.

    Statuses without data become `Name{}`. Statuses carrying replacement code
    points become `Name{"..."}`, where the string literal spells every code
    point as a universal-character-name.

    Raises:
        Exception: if `a` is not one of the known mapping classes.
    """

    def cxx_string_literal(code_points: typing.List[int]) -> str:
        # In C++ `\u` is followed by exactly four hex digits, so anything above
        # U+FFFF needs the eight-digit `\U` form. Otherwise e.g. U+10428 would
        # be read as U+1042 followed by the character '8'.
        escaped = "".join(
            f"\\u{c:04X}" if c <= 0xFFFF else f"\\U{c:08X}" for c in code_points
        )
        return f'"{escaped}"'

    if isinstance(a, Disallowed):
        return "Disallowed{}"
    elif isinstance(a, DisallowedStd3Valid):
        return "DisallowedStd3Valid{}"
    elif isinstance(a, DisallowedStd3Mapped):
        return f"DisallowedStd3Mapped{{{cxx_string_literal(a.maps_to)}}}"
    elif isinstance(a, Ignored):
        return "Ignored{}"
    elif isinstance(a, Mapped):
        return f"Mapped{{{cxx_string_literal(a.maps_to)}}}"
    elif isinstance(a, Deviation):
        return f"Deviation{{{cxx_string_literal(a.maps_to)}}}"
    elif isinstance(a, Valid):
        return "Valid{}"
    elif isinstance(a, ValidNv8):
        return "ValidNv8{}"
    elif isinstance(a, ValidXv8):
        return "ValidXv8{}"
    else:
        raise Exception(f"Unknown mapping: {a}")


# Command-line entry point: read the mapping table named by the only argument
# and write the generated C++ header to stdout.
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print(
            f"Usage: {sys.argv[0]} <IdnaMappingTable.txt>",
            file=sys.stderr,
        )
        sys.exit(1)

    # Read as UTF-8 explicitly instead of relying on the locale's default.
    with open(sys.argv[1], encoding="utf-8") as table:
        idna = IDNA.from_table(table.readlines())

    # Joined outside of any f-string: a backslash inside an f-string
    # replacement field is a syntax error before Python 3.12.
    entries = ",\n    ".join(
        "{" + str(code_point) + ", Mapping{" + to_cxx_variant(mapping) + "}}"
        for code_point, mapping in idna.mappings
    )

    # The template is dedented *before* the values are substituted so that the
    # multi-line `entries` text can't influence the margin dedent() removes.
    # Literal braces are doubled because the template goes through format().
    # The table is `inline constexpr` so that translation units including the
    # header share one copy instead of each getting its own.
    template = textwrap.dedent(
        """\
        // SPDX-FileCopyrightText: 2023-2024 Robin Lindén <[email protected]>
        //
        // SPDX-License-Identifier: BSD-2-Clause

        // This file is generated. Do not touch it.

        #ifndef IDNA_IDNA_DATA_H_
        #define IDNA_IDNA_DATA_H_
        // clang-format off

        #include <array>
        #include <string_view>
        #include <variant>
        #include <utility>

        namespace idna::uts46 {{

        struct Disallowed {{
            constexpr bool operator==(Disallowed const &) const = default;
        }};

        struct DisallowedStd3Valid {{
            constexpr bool operator==(DisallowedStd3Valid const &) const = default;
        }};

        struct DisallowedStd3Mapped {{
            std::string_view maps_to;
            constexpr bool operator==(DisallowedStd3Mapped const &) const = default;
        }};

        struct Ignored {{
            constexpr bool operator==(Ignored const &) const = default;
        }};

        struct Mapped {{
            std::string_view maps_to;
            constexpr bool operator==(Mapped const &) const = default;
        }};

        struct Deviation {{
            std::string_view maps_to;
            constexpr bool operator==(Deviation const &) const = default;
        }};

        struct Valid {{
            constexpr bool operator==(Valid const &) const = default;
        }};

        struct ValidNv8 {{
            constexpr bool operator==(ValidNv8 const &) const = default;
        }};

        struct ValidXv8 {{
            constexpr bool operator==(ValidXv8 const &) const = default;
        }};

        using Mapping = std::variant<
                Disallowed,
                DisallowedStd3Valid,
                DisallowedStd3Mapped,
                Ignored,
                Mapped,
                Deviation,
                Valid,
                ValidNv8,
                ValidXv8>;

        inline constexpr std::array<std::pair<char32_t, Mapping>, {count}> kMappings{{{{
            {entries}
        }}}};

        }} // namespace idna::uts46

        // clang-format on
        #endif
        """
    )

    header = template.format(count=len(idna.mappings), entries=entries)
    sys.stdout.buffer.write(header.encode())
21 changes: 21 additions & 0 deletions idna/idna_data_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// SPDX-FileCopyrightText: 2024 Robin Lindén <[email protected]>
//
// SPDX-License-Identifier: BSD-2-Clause

#include "idna/idna_data.h"

#include "etest/etest2.h"

// Element type of the generated table: a (first code point, mapping) pair.
using Entry = decltype(idna::uts46::kMappings)::value_type;

// https://www.unicode.org/reports/tr46/#IDNA_Mapping_Table
int main() {
etest::Suite s{};

s.add_test("everything before ascii '-' is ~disallowed", [](etest::IActions &a) {
a.expect_eq(idna::uts46::kMappings.at(0), Entry{char32_t{0}, idna::uts46::DisallowedStd3Valid{}});
a.expect_eq(idna::uts46::kMappings.at(1), Entry{char32_t{'-'}, idna::uts46::Valid{}});
});

return s.run();
}