Skip to content

Commit

Permalink
idna: Process the unicode UTS46-mappings into something usable in C++
Browse files Browse the repository at this point in the history
  • Loading branch information
robinlinden committed Oct 14, 2023
1 parent dcff638 commit 61136fe
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 2 deletions.
9 changes: 8 additions & 1 deletion WORKSPACE
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive", "http_file")

# Bazel
# =========================================================
Expand Down Expand Up @@ -156,6 +156,13 @@ http_archive(
url = "https://github.com/unicode-org/icu/releases/download/release-73-2/icu4c-73_2-data-bin-l.zip",
)

# https://www.unicode.org/Public/idna/
# Fetches the IDNA mapping table (version 15.1.0) as a single file. It is
# consumed by //idna:generate_idna_data via @idna_mapping_table//file.
http_file(
name = "idna_mapping_table",
# Pins the exact file contents; must be updated together with the url.
sha256 = "402cbd285f1f952fcd0834b63541d54f69d3d8f1b8f8599bf71a1a14935f82c4",
url = "https://www.unicode.org/Public/idna/15.1.0/IdnaMappingTable.txt",
)

# https://github.com/ocornut/imgui
http_archive(
name = "imgui", # MIT
Expand Down
16 changes: 15 additions & 1 deletion idna/BUILD
Original file line number Diff line number Diff line change
@@ -1,9 +1,23 @@
load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
load("@rules_python//python:defs.bzl", "py_binary")
load("//bzl:copts.bzl", "HASTUR_COPTS")

# Build-time tool that converts IdnaMappingTable.txt into a C++ header.
# It is run by the :generate_idna_data genrule.
py_binary(
name = "idna_data_processor",
srcs = ["idna_data_processor.py"],
)

# Generates idna_data.h by running :idna_data_processor on the downloaded
# IDNA mapping table.
genrule(
name = "generate_idna_data",
srcs = ["@idna_mapping_table//file"],
outs = ["idna_data.h"],
# The processor prints the header to stdout, so redirect it into the output ($@).
cmd = "$(location :idna_data_processor) $(location @idna_mapping_table//file) >$@",
tools = [":idna_data_processor"],
)

cc_library(
name = "idna",
hdrs = glob(["*.h"]),
hdrs = [":generate_idna_data"] + glob(["*.h"]),
copts = HASTUR_COPTS,
visibility = ["//visibility:public"],
deps = ["//util:unicode"],
Expand Down
124 changes: 124 additions & 0 deletions idna/idna_data_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/usr/bin/env python3

# SPDX-FileCopyrightText: 2023 Robin Lindén <[email protected]>
#
# SPDX-License-Identifier: BSD-2-Clause

import sys
import textwrap
import typing


def expand_code_point_ranges(ranges: typing.Iterable[str]) -> list[int]:
    """Expand hexadecimal code point specifiers into individual code points.

    Each specifier is either a single code point ("00AD") or an inclusive
    range written as "START..END" ("0041..005A"), in hexadecimal.

    Args:
        ranges: The specifiers to expand.

    Returns:
        Every code point covered by `ranges` as an integer, in the order the
        specifiers were iterated.

    Raises:
        ValueError: If a specifier can't be parsed as hexadecimal.
    """
    expanded: list[int] = []
    for r in ranges:
        first, separator, last = r.partition("..")
        if separator:
            # The end of a range is inclusive, hence the + 1.
            expanded.extend(range(int(first, 16), int(last, 16) + 1))
        else:
            expanded.append(int(first, 16))
    return expanded


class IDNA:
def __init__(self):
self.disallowed = set()
self.disallowed_std3_valid = set()
self.disallowed_std3_mapped = dict()
self.ignored = set()
self.mapped = dict()
self.deviation = dict()
self.valid = set()

# Valid in UTS46, but not in IDNA2008.
self.valid_nv8 = set()
self.valid_xv8 = set()

# https://www.unicode.org/reports/tr46/#Table_Data_File_Fields
@staticmethod
def from_table(table_rows: [str]) -> typing.Self:
idna = IDNA()
for row in table_rows:
# Drop the trailing comment about what code point this is.
row = row.split("#")[0].strip()

cols = [col.strip() for col in row.split(";")]
# Some rows are blank or just a comment.
if len(cols) <= 1:
continue

action = cols[1]
if action == "disallowed":
assert len(cols) == 2
idna.disallowed.add(cols[0])
elif action == "disallowed_STD3_valid":
assert len(cols) == 2
idna.disallowed_std3_valid.add(cols[0])
elif action == "disallowed_STD3_mapped":
assert len(cols) == 3
idna.disallowed_std3_mapped[cols[0]] = cols[2]
elif action == "ignored":
assert len(cols) == 2
idna.ignored.add(cols[0])
elif action == "mapped":
assert len(cols) == 3
idna.mapped[cols[0]] = cols[2]
elif action == "deviation":
assert len(cols) == 3
idna.deviation[cols[0]] = cols[2]
elif action == "valid" and len(cols) == 2:
idna.valid.add(cols[0])
elif action == "valid" and len(cols) == 4 and cols[3] == "NV8":
idna.valid_nv8.add(cols[0])
elif action == "valid" and len(cols) == 4 and cols[3] == "XV8":
idna.valid_xv8.add(cols[0])
else:
raise Exception(f"Unable to parse data: {cols}")

idna.disallowed = expand_code_point_ranges(idna.disallowed)
idna.ignored = expand_code_point_ranges(idna.ignored)
return idna


def generate_header(idna: "IDNA") -> str:
    """Render the C++ header exposing the processed IDNA data.

    Args:
        idna: Table data whose `disallowed` and `ignored` attributes are
            sequences of integer code points.

    Returns:
        The complete header text, ending in a newline.
    """
    disallowed = ", ".join(hex(c) for c in idna.disallowed)
    ignored = ", ".join(hex(c) for c in idna.ignored)
    return textwrap.dedent(
        f"""\
        // SPDX-FileCopyrightText: 2023 Robin Lindén <[email protected]>
        //
        // SPDX-License-Identifier: BSD-2-Clause
        // This file is generated. Do not touch it.
        #ifndef IDNA_IDNA_DATA_H_
        #define IDNA_IDNA_DATA_H_
        // clang-format off
        #include <array>
        namespace idna::uts46 {{
        constexpr std::array<char32_t, {len(idna.disallowed)}> disallowed = {{
        {disallowed}
        }};
        constexpr std::array<char32_t, {len(idna.ignored)}> ignored = {{
        {ignored}
        }};
        }} // namespace idna::uts46
        // clang-format on
        #endif
        """
    )


def main(argv: list[str]) -> int:
    """Convert the mapping table named in `argv` into a header on stdout.

    Args:
        argv: The full command line, program name included.

    Returns:
        The process exit code: 0 on success, 1 on incorrect usage.
    """
    if len(argv) != 2:
        print(
            f"Usage: {argv[0]} <IdnaMappingTable.txt>",
            file=sys.stderr,
        )
        return 1

    # Be explicit about the encoding so parsing doesn't depend on the locale.
    with open(argv[1], encoding="utf-8") as table:
        idna = IDNA.from_table(table)

    # The header contains non-ASCII text, and stdout is redirected to a file
    # by the build, so don't let the locale decide how it's encoded.
    sys.stdout.reconfigure(encoding="utf-8")
    print(generate_header(idna))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
2 changes: 2 additions & 0 deletions idna/punycode_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

#include "idna/punycode.h"

#include "idna/idna_data.h" // FIXME(robinlinden): Hack to check if the header builds.

#include "etest/etest2.h"
#include "util/unicode.h"

Expand Down

0 comments on commit 61136fe

Please sign in to comment.