forked from magiclen/unicode-blocks
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgen-blocks.awk
68 lines (57 loc) · 1.73 KB
/
gen-blocks.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Script to regenerate src/unicode_blocks.rs from Blocks.txt
#
# Usage awk -f gen-blocks.awk Blocks.txt > src/unicode_blocks.rs
#
# where Blocks.txt is fetched from:
# https://www.unicode.org/Public/UNIDATA/Blocks.txt
BEGIN { FS=";" }
/^# Blocks-[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+\.txt/ {
pos = match($0, "[[:digit:]]+\\.[[:digit:]]+\\.[[:digit:]]+");
if (pos < 1) {
printf("unable to extract version from: %s\n", $0);
exit(1);
}
version = substr($0, RSTART, RLENGTH);
print "// The dataset is from https://www.unicode.org/Public/UNIDATA/Blocks.txt"
print ""
print "use crate::UnicodeBlock;"
print ""
printf("pub const VERSION: &str = \"%s\";\n", version);
idx = 1
}
# Only process non-comment lines
/^[[:digit:]A-F]/ {
split($1, range, "\\.\\.");
start = range[1];
end = range[2];
name = $2
# trim leading spaces
if (match(name, "^ +") > 0) {
name = substr(name, RLENGTH + 1);
}
# build Rust constant name
const = name
gsub("[ -]", "_", const);
const = toupper(const);
printf("pub const %s: UnicodeBlock = UnicodeBlock {\n", const);
printf(" name: \"%s\", start: 0x%s, end: 0x%s\n", name, start, end);
print "};"
# skip blocks that end in _SURROGATES as these are not valid as Unicode
# escapes in Rust.
if (match(const, "_SURROGATES$") < 1) {
match_line = sprintf(" '\\u{%s}'..='\\u{%s}' => Some(%s),", start, end, const);
match_lines[idx++] = match_line;
}
}
END {
print ""
print "/// Given a character, determine what unicode block contains it."
print "pub fn find_unicode_block(c: char) -> Option<UnicodeBlock> {"
print " match c {"
for (i = 1; i < idx; i++) {
print match_lines[i];
}
print " _ => None,"
print " }"
print "}"
}