Skip to content

Commit

Permalink
Merge pull request #92 from LaurenzV/3.0.0
Browse files Browse the repository at this point in the history
Sync with 3.0.0.
  • Loading branch information
RazrFalcon authored Feb 7, 2024
2 parents 4a7642e + c8db555 commit 0f0b732
Show file tree
Hide file tree
Showing 19 changed files with 346 additions and 126 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ exclude = ["benches/", "tests/"]
bitflags = "2.4.1"
bytemuck = { version = "1.5", features = ["extern_crate_alloc"] }
smallvec = "1.6"
unicode-bidi-mirroring = "0.1"
unicode-ccc = "0.1.2"
unicode-bidi-mirroring = "0.2"
unicode-ccc = "0.2"
unicode-properties = { version = "0.1.0", default-features = false, features = ["general-category"] }
unicode-script = "0.5.2"
libm = { version = "0.2.2", optional = true }
Expand Down
177 changes: 177 additions & 0 deletions scripts/gen-arabic-table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
#!/usr/bin/env python3

# Based on harfbuzz/src/gen-arabic-table.py

import os
import urllib.request

# Unicode 14.0.0 data files consumed by this generator.
DEPENDENCIES = [
    "ArabicShaping.txt",
    "UnicodeData.txt",
    "Blocks.txt",
]

# Download each data file once; reuse a local copy on later runs.
for data_file in DEPENDENCIES:
    if os.path.exists(data_file):
        continue
    urllib.request.urlretrieve("https://unicode.org/Public/14.0.0/ucd/" + data_file, data_file)

files = [open(x, encoding="utf-8") for x in DEPENDENCIES]

# Remember the two header lines of ArabicShaping.txt and Blocks.txt
# (UnicodeData.txt has none). Note the readline() calls here advance
# the file positions on purpose.
headers = [
    [files[0].readline(), files[0].readline()],
    [files[2].readline(), files[2].readline()],
    ["UnicodeData.txt does not have a header."],
]
# Skip the rest of ArabicShaping.txt's prose preamble, which ends with
# a long run of '#' characters.
while "##################" not in files[0].readline():
    pass

# Codepoint -> Unicode block name, filled by read_blocks().
blocks = {}


def read_blocks(f):
    """Populate the global ``blocks`` map from a Blocks.txt stream.

    Each data line has the form ``XXXX..YYYY; Block Name`` (or a single
    codepoint ``XXXX; Block Name``); ``#`` starts a comment. Every
    codepoint in the range is mapped to its block name.
    """
    global blocks
    for raw in f:
        # Drop any trailing comment; partition keeps the text before '#'.
        text, _, _ = raw.partition("#")

        parts = [p.strip() for p in text.split(";")]
        if len(parts) == 1:
            # Blank or comment-only line: no ';' separator present.
            continue

        bounds = parts[0].split("..")
        lo = int(bounds[0], 16)
        hi = lo if len(bounds) == 1 else int(bounds[1], 16)

        name = parts[1]

        for cp in range(lo, hi + 1):
            blocks[cp] = name


def print_joining_table(f):
    """Read ArabicShaping.txt from *f* and print Rust source to stdout.

    Emits a ``JOINING_TABLE`` constant, one ``JOINING_OFFSET_0X....``
    constant per compacted codepoint range, and a ``joining_type()``
    lookup function. Requires the global ``blocks`` map to have been
    filled by read_blocks() first.
    """
    # NOTE(review): this body was recovered from a whitespace-mangled
    # copy; the indentation was reconstructed, and runs of spaces inside
    # the printed string literals may have been collapsed by the same
    # mangling — confirm against the checked-in generated Rust file.

    # Map each codepoint to its joining type/group name.
    values = {}
    for line in f:
        if line[0] == "#":
            continue

        fields = [x.strip() for x in line.split(";")]
        if len(fields) == 1:
            continue

        u = int(fields[0], 16)

        # ALAPH and DALATH RISH are the two joining groups kept as
        # distinct values; everything else is keyed by joining type.
        if fields[3] in ["ALAPH", "DALATH RISH"]:
            value = "JOINING_GROUP_" + fields[3].replace(" ", "_")
        else:
            value = "JOINING_TYPE_" + fields[2]
        values[u] = value

    # Abbreviate each name to the initials of the words after the
    # "JOINING_*_" prefix (e.g. ..._DALATH_RISH -> "DR"); the assert
    # guarantees the abbreviations stay unambiguous.
    short_value = {}
    for value in sorted(set([v for v in values.values()] + ["JOINING_TYPE_X"])):
        short = "".join(x[0] for x in value.split("_")[2:])
        assert short not in short_value.values()

        short_value[value] = short

    uu = sorted(values.keys())
    num = len(values)  # NOTE(review): unused — kept from upstream
    all_blocks = set([blocks[u] for u in uu])

    # Compact the codepoints into ranges, merging neighbours whose gap
    # is at most 1 + 16 * 5 so that small holes don't split the table.
    last = -100000
    ranges = []
    for u in uu:
        if u - last <= 1 + 16 * 5:
            ranges[-1][-1] = u
        else:
            ranges.append([u, u])
        last = u

    print("#[rustfmt::skip]")
    print("pub const JOINING_TABLE: &[JoiningType] = &[")
    last_block = None
    offset = 0  # running index of the first entry of each range

    join_offsets = []

    for start, end in ranges:
        join_offsets.append(
            "const JOINING_OFFSET_0X%04X: usize = %d;" % (start, offset)
        )

        for u in range(start, end + 1):
            block = blocks.get(u, last_block)
            # Codepoints absent from ArabicShaping.txt are fillers (X).
            value = values.get(u, "JOINING_TYPE_X")

            # Emit a block-name comment whenever the Unicode block
            # changes (or at the start of a range).
            if block != last_block or u == start:
                if u != start:
                    print()
                if block in all_blocks:
                    print("\n /* %s */" % block)
                else:
                    print("\n /* FILLER */")
                last_block = block
                # Pad a partial first row so entries stay column-aligned
                # with full 32-per-row rows.
                if u % 32 != 0:
                    print()
                    print(" /* %04X */" % (u // 32 * 32), " " * (u % 32), end="")

            # Start a fresh row every 32 codepoints.
            if u % 32 == 0:
                print()
                print(" /* %04X */ " % u, end="")

            val = short_value[value]

            # JOINING_TYPE_C (join-causing) is emitted as D (dual):
            # the shaper treats the two identically.
            if val == "C":
                val = "D"

            print("%s," % val, end="")
        print()

        offset += end - start + 1
    print("];")
    print()

    for offset in join_offsets:
        print(offset)

    # Emit the lookup function: dispatch on the top bits of the
    # codepoint, then range-check against each compacted range.
    page_bits = 12
    print()
    print("pub fn joining_type(u: char) -> JoiningType {")
    print(" let u = u as u32;")
    print(" match u >> %d {" % page_bits)
    # Only pages containing a range start or end can hold entries.
    pages = set(
        [u >> page_bits for u in [s for s, e in ranges] + [e for s, e in ranges]]
    )
    for p in sorted(pages):
        print(" 0x%0X => {" % p)
        for start, end in ranges:
            if p not in [start >> page_bits, end >> page_bits]:
                continue
            offset = "JOINING_OFFSET_0X%04X" % start
            print(" if (0x%04X..=0x%04X).contains(&u) {" % (start, end))
            print(
                " return JOINING_TABLE[u as usize - 0x%04X + %s]"
                % (start, offset)
            )
            print(" }")
        print(" }")
    print(" _ => {}")
    print(" }")
    print()
    print(" X")  # fallback: non-joining
    print("}")
    print()


# Emit the generated Rust source to stdout: a regeneration warning,
# the enum variants the table body refers to, then the table itself.
print("// WARNING: this file was generated by ../scripts/gen-arabic-table.py")
print()
print(
    "use super::arabic::JoiningType::{self, GroupAlaph as A, GroupDalathRish as DR, D, L, R, T, U, X};"
)
print()

# files[2] is Blocks.txt; files[0] is ArabicShaping.txt (its header and
# preamble were already consumed at module load).
read_blocks(files[2])
print_joining_table(files[0])
3 changes: 1 addition & 2 deletions scripts/gen-indic-table.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import os
import sys
import urllib.request

DEPENDENCIES = [
Expand All @@ -15,7 +14,7 @@

for dep in DEPENDENCIES:
if not os.path.exists(dep):
urllib.request.urlretrieve('https://unicode.org/Public/12.0.0/ucd/' + dep, dep)
urllib.request.urlretrieve('https://unicode.org/Public/14.0.0/ucd/' + dep, dep)

ALLOWED_SINGLES = [0x00A0, 0x25CC]
ALLOWED_BLOCKS = [
Expand Down
4 changes: 2 additions & 2 deletions scripts/gen-unicode-norm-table.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import urllib.request
import os

URL = 'https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt'
URL = 'https://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt'
FILE_NAME = 'UnicodeData.txt'


Expand All @@ -22,7 +22,7 @@ def hex_to_char_rs(c):
print('//! The current implementation is not the fastest one. Just good enough.')
print()
print('#[allow(dead_code)]')
print('pub const UNICODE_VERSION: (u8, u8, u8) = (13, 0, 0);')
print('pub const UNICODE_VERSION: (u8, u8, u8) = (14, 0, 0);')
print()
print('// Rust support `Option<char>` layout optimization, so it will take only 4 bytes.')
print('pub const DECOMPOSITION_TABLE: &[(char, char, Option<char>)] = &[')
Expand Down
2 changes: 1 addition & 1 deletion scripts/gen-universal-table.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
for f in files:
if not os.path.exists(f):
urllib.request.urlretrieve(
'https://unicode.org/Public/13.0.0/ucd/' + f, f)
'https://unicode.org/Public/14.0.0/ucd/' + f, f)

files = [io.open(x, encoding='utf-8') for x in files]

Expand Down
2 changes: 1 addition & 1 deletion scripts/gen-vowel-constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import urllib.request

if not os.path.exists('Scripts.txt'):
urllib.request.urlretrieve('https://unicode.org/Public/12.0.0/ucd/Scripts.txt', 'Scripts.txt')
urllib.request.urlretrieve('https://unicode.org/Public/14.0.0/ucd/Scripts.txt', 'Scripts.txt')

with io.open('Scripts.txt', encoding='utf-8') as f:
scripts_header = [f.readline() for i in range(2)]
Expand Down
11 changes: 10 additions & 1 deletion src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,10 @@ impl Direction {

// Unicode-13.0 additions
script::CHORASMIAN |
script::YEZIDI => {
script::YEZIDI |

// Unicode-14.0 additions
script::OLD_UYGHUR => {
Some(Direction::RightToLeft)
}

Expand Down Expand Up @@ -442,6 +445,12 @@ pub mod script {
pub const DIVES_AKURU: Script = Script::from_bytes(b"Diak");
pub const KHITAN_SMALL_SCRIPT: Script = Script::from_bytes(b"Kits");
pub const YEZIDI: Script = Script::from_bytes(b"Yezi");
// Since 14.0
pub const CYPRO_MINOAN: Script = Script::from_bytes(b"Cpmn");
pub const OLD_UYGHUR: Script = Script::from_bytes(b"Ougr");
pub const TANGSA: Script = Script::from_bytes(b"Tnsa");
pub const TOTO: Script = Script::from_bytes(b"Toto");
pub const VITHKUQI: Script = Script::from_bytes(b"Vith");

// https://github.com/harfbuzz/harfbuzz/issues/1162
pub const MYANMAR_ZAWGYI: Script = Script::from_bytes(b"Qaag");
Expand Down
Loading

0 comments on commit 0f0b732

Please sign in to comment.