Skip to content

Commit

Permalink
Merge pull request #92 from LaurenzV/3.0.0
Browse files Browse the repository at this point in the history
Sync with 3.0.0.
  • Loading branch information
RazrFalcon authored Feb 7, 2024
2 parents 4a7642e + c8db555 commit 0f0b732
Show file tree
Hide file tree
Showing 19 changed files with 346 additions and 126 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ exclude = ["benches/", "tests/"]
bitflags = "2.4.1"
bytemuck = { version = "1.5", features = ["extern_crate_alloc"] }
smallvec = "1.6"
unicode-bidi-mirroring = "0.1"
unicode-ccc = "0.1.2"
unicode-bidi-mirroring = "0.2"
unicode-ccc = "0.2"
unicode-properties = { version = "0.1.0", default-features = false, features = ["general-category"] }
unicode-script = "0.5.2"
libm = { version = "0.2.2", optional = true }
Expand Down
177 changes: 177 additions & 0 deletions scripts/gen-arabic-table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
#!/usr/bin/env python3

# Based on harfbuzz/src/gen-arabic-table.py

import os
import urllib.request

# Unicode 14.0.0 data files consumed by this generator.
DEPENDENCIES = [
    "ArabicShaping.txt",
    "UnicodeData.txt",
    "Blocks.txt",
]

# Download each data file once; reuse a local copy on later runs.
for data_file in DEPENDENCIES:
    if os.path.exists(data_file):
        continue
    urllib.request.urlretrieve("https://unicode.org/Public/14.0.0/ucd/" + data_file, data_file)

files = [open(x, encoding="utf-8") for x in DEPENDENCIES]

# Remember the two header lines of ArabicShaping.txt and Blocks.txt
# (UnicodeData.txt has none). Note the readline() calls here advance
# the file positions on purpose.
headers = [
    [files[0].readline(), files[0].readline()],
    [files[2].readline(), files[2].readline()],
    ["UnicodeData.txt does not have a header."],
]
# Skip the rest of ArabicShaping.txt's prose preamble, which ends with
# a long run of '#' characters.
while "##################" not in files[0].readline():
    pass

# Codepoint -> Unicode block name, filled by read_blocks().
blocks = {}


def read_blocks(f):
    """Populate the global ``blocks`` map from a Blocks.txt stream.

    Each data line has the form ``XXXX..YYYY; Block Name`` (or a single
    codepoint ``XXXX; Block Name``); ``#`` starts a comment. Every
    codepoint in the range is mapped to its block name.
    """
    global blocks
    for raw in f:
        # Drop any trailing comment; partition keeps the text before '#'.
        text, _, _ = raw.partition("#")

        parts = [p.strip() for p in text.split(";")]
        if len(parts) == 1:
            # Blank or comment-only line: no ';' separator present.
            continue

        bounds = parts[0].split("..")
        lo = int(bounds[0], 16)
        hi = lo if len(bounds) == 1 else int(bounds[1], 16)

        name = parts[1]

        for cp in range(lo, hi + 1):
            blocks[cp] = name


def print_joining_table(f):
    """Read ArabicShaping.txt from *f* and print Rust source to stdout.

    Emits a ``JOINING_TABLE`` constant, one ``JOINING_OFFSET_0X....``
    constant per compacted codepoint range, and a ``joining_type()``
    lookup function. Requires the global ``blocks`` map to have been
    filled by read_blocks() first.
    """
    # NOTE(review): this body was recovered from a whitespace-mangled
    # copy; the indentation was reconstructed, and runs of spaces inside
    # the printed string literals may have been collapsed by the same
    # mangling — confirm against the checked-in generated Rust file.

    # Map each codepoint to its joining type/group name.
    values = {}
    for line in f:
        if line[0] == "#":
            continue

        fields = [x.strip() for x in line.split(";")]
        if len(fields) == 1:
            continue

        u = int(fields[0], 16)

        # ALAPH and DALATH RISH are the two joining groups kept as
        # distinct values; everything else is keyed by joining type.
        if fields[3] in ["ALAPH", "DALATH RISH"]:
            value = "JOINING_GROUP_" + fields[3].replace(" ", "_")
        else:
            value = "JOINING_TYPE_" + fields[2]
        values[u] = value

    # Abbreviate each name to the initials of the words after the
    # "JOINING_*_" prefix (e.g. ..._DALATH_RISH -> "DR"); the assert
    # guarantees the abbreviations stay unambiguous.
    short_value = {}
    for value in sorted(set([v for v in values.values()] + ["JOINING_TYPE_X"])):
        short = "".join(x[0] for x in value.split("_")[2:])
        assert short not in short_value.values()

        short_value[value] = short

    uu = sorted(values.keys())
    num = len(values)  # NOTE(review): unused — kept from upstream
    all_blocks = set([blocks[u] for u in uu])

    # Compact the codepoints into ranges, merging neighbours whose gap
    # is at most 1 + 16 * 5 so that small holes don't split the table.
    last = -100000
    ranges = []
    for u in uu:
        if u - last <= 1 + 16 * 5:
            ranges[-1][-1] = u
        else:
            ranges.append([u, u])
        last = u

    print("#[rustfmt::skip]")
    print("pub const JOINING_TABLE: &[JoiningType] = &[")
    last_block = None
    offset = 0  # running index of the first entry of each range

    join_offsets = []

    for start, end in ranges:
        join_offsets.append(
            "const JOINING_OFFSET_0X%04X: usize = %d;" % (start, offset)
        )

        for u in range(start, end + 1):
            block = blocks.get(u, last_block)
            # Codepoints absent from ArabicShaping.txt are fillers (X).
            value = values.get(u, "JOINING_TYPE_X")

            # Emit a block-name comment whenever the Unicode block
            # changes (or at the start of a range).
            if block != last_block or u == start:
                if u != start:
                    print()
                if block in all_blocks:
                    print("\n /* %s */" % block)
                else:
                    print("\n /* FILLER */")
                last_block = block
                # Pad a partial first row so entries stay column-aligned
                # with full 32-per-row rows.
                if u % 32 != 0:
                    print()
                    print(" /* %04X */" % (u // 32 * 32), " " * (u % 32), end="")

            # Start a fresh row every 32 codepoints.
            if u % 32 == 0:
                print()
                print(" /* %04X */ " % u, end="")

            val = short_value[value]

            # JOINING_TYPE_C (join-causing) is emitted as D (dual):
            # the shaper treats the two identically.
            if val == "C":
                val = "D"

            print("%s," % val, end="")
        print()

        offset += end - start + 1
    print("];")
    print()

    for offset in join_offsets:
        print(offset)

    # Emit the lookup function: dispatch on the top bits of the
    # codepoint, then range-check against each compacted range.
    page_bits = 12
    print()
    print("pub fn joining_type(u: char) -> JoiningType {")
    print(" let u = u as u32;")
    print(" match u >> %d {" % page_bits)
    # Only pages containing a range start or end can hold entries.
    pages = set(
        [u >> page_bits for u in [s for s, e in ranges] + [e for s, e in ranges]]
    )
    for p in sorted(pages):
        print(" 0x%0X => {" % p)
        for start, end in ranges:
            if p not in [start >> page_bits, end >> page_bits]:
                continue
            offset = "JOINING_OFFSET_0X%04X" % start
            print(" if (0x%04X..=0x%04X).contains(&u) {" % (start, end))
            print(
                " return JOINING_TABLE[u as usize - 0x%04X + %s]"
                % (start, offset)
            )
            print(" }")
        print(" }")
    print(" _ => {}")
    print(" }")
    print()
    print(" X")  # fallback: non-joining
    print("}")
    print()


# Emit the generated Rust source to stdout: a regeneration warning,
# the enum variants the table body refers to, then the table itself.
print("// WARNING: this file was generated by ../scripts/gen-arabic-table.py")
print()
print(
    "use super::arabic::JoiningType::{self, GroupAlaph as A, GroupDalathRish as DR, D, L, R, T, U, X};"
)
print()

# files[2] is Blocks.txt; files[0] is ArabicShaping.txt (its header and
# preamble were already consumed at module load).
read_blocks(files[2])
print_joining_table(files[0])
3 changes: 1 addition & 2 deletions scripts/gen-indic-table.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import os
import sys
import urllib.request

DEPENDENCIES = [
Expand All @@ -15,7 +14,7 @@

for dep in DEPENDENCIES:
if not os.path.exists(dep):
urllib.request.urlretrieve('https://unicode.org/Public/12.0.0/ucd/' + dep, dep)
urllib.request.urlretrieve('https://unicode.org/Public/14.0.0/ucd/' + dep, dep)

ALLOWED_SINGLES = [0x00A0, 0x25CC]
ALLOWED_BLOCKS = [
Expand Down
4 changes: 2 additions & 2 deletions scripts/gen-unicode-norm-table.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import urllib.request
import os

URL = 'https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt'
URL = 'https://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt'
FILE_NAME = 'UnicodeData.txt'


Expand All @@ -22,7 +22,7 @@ def hex_to_char_rs(c):
print('//! The current implementation is not the fastest one. Just good enough.')
print()
print('#[allow(dead_code)]')
print('pub const UNICODE_VERSION: (u8, u8, u8) = (13, 0, 0);')
print('pub const UNICODE_VERSION: (u8, u8, u8) = (14, 0, 0);')
print()
print('// Rust support `Option<char>` layout optimization, so it will take only 4 bytes.')
print('pub const DECOMPOSITION_TABLE: &[(char, char, Option<char>)] = &[')
Expand Down
2 changes: 1 addition & 1 deletion scripts/gen-universal-table.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
for f in files:
if not os.path.exists(f):
urllib.request.urlretrieve(
'https://unicode.org/Public/13.0.0/ucd/' + f, f)
'https://unicode.org/Public/14.0.0/ucd/' + f, f)

files = [io.open(x, encoding='utf-8') for x in files]

Expand Down
2 changes: 1 addition & 1 deletion scripts/gen-vowel-constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import urllib.request

if not os.path.exists('Scripts.txt'):
urllib.request.urlretrieve('https://unicode.org/Public/12.0.0/ucd/Scripts.txt', 'Scripts.txt')
urllib.request.urlretrieve('https://unicode.org/Public/14.0.0/ucd/Scripts.txt', 'Scripts.txt')

with io.open('Scripts.txt', encoding='utf-8') as f:
scripts_header = [f.readline() for i in range(2)]
Expand Down
11 changes: 10 additions & 1 deletion src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,10 @@ impl Direction {

// Unicode-13.0 additions
script::CHORASMIAN |
script::YEZIDI => {
script::YEZIDI |

// Unicode-14.0 additions
script::OLD_UYGHUR => {
Some(Direction::RightToLeft)
}

Expand Down Expand Up @@ -442,6 +445,12 @@ pub mod script {
pub const DIVES_AKURU: Script = Script::from_bytes(b"Diak");
pub const KHITAN_SMALL_SCRIPT: Script = Script::from_bytes(b"Kits");
pub const YEZIDI: Script = Script::from_bytes(b"Yezi");
// Since 14.0
pub const CYPRO_MINOAN: Script = Script::from_bytes(b"Cpmn");
pub const OLD_UYGHUR: Script = Script::from_bytes(b"Ougr");
pub const TANGSA: Script = Script::from_bytes(b"Tnsa");
pub const TOTO: Script = Script::from_bytes(b"Toto");
pub const VITHKUQI: Script = Script::from_bytes(b"Vith");

// https://github.com/harfbuzz/harfbuzz/issues/1162
pub const MYANMAR_ZAWGYI: Script = Script::from_bytes(b"Qaag");
Expand Down
Loading

0 comments on commit 0f0b732

Please sign in to comment.