-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrun.sh
190 lines (173 loc) · 9.14 KB
/
run.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# run.sh - worker script to generate sorted list of unicode strings
#
# This script is written to be compatible with very old Linux
# distributions, both RHEL and Ubuntu, as well as the latest
# available versions. This requires significant care around what
# features of bash and perl are used in the script. This script
# has been tested with RHEL5 using bash v3.2.25 and perl v5.8.8
#
# The script requires internet access because it downloads the Unicode
# spec directly from unicode.org which is then used to generate every
# valid code point. For each code point, a large number of carefully
# built strings are generated. See the main README for more information
# about this.
#
# This script is entirely self-contained so that it can be easily cut
# and pasted to any system and then it can be executed to generate a
# sorted file directly on that system.
#
# The script generates two outputs. First (and most important) is a file
# named unicode-${UNICODE_VERS}-chars-sorted-glibc-${GLIBC_VERS}.txt which
# contains the sorted list of strings. Second, the direct "stdout" of the
# script is intended to be captured. It shows additional diagnostic
# information, such as the output of dpkg and rpm queries, execution timestamps,
# the version of the operating system, the AMI used (if applicable), etc.
#
# trace every command (-x) and abort on the first unhandled failure (-e)
set -x -e
# make sure that locale is set to en_US (utf8)
export LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8
date
# print information about the system to stdout
# work from the directory that holds this script; quoted so that a path
# containing spaces does not get word-split
cd "$(dirname "$0")"
pwd
# package queries: a missing "which" target is fine, because a failure on the
# left side of "&&" does not trigger "set -e"
which dpkg && dpkg -l libc6 locales
which rpm && rpm -qa|grep -E '(glibc|langpack)'
# the EC2 metadata endpoint only answers on AWS. bound the wait and tolerate a
# failure, otherwise "set -e" would abort the whole run on any other host
SOURCE_AMI=$(curl -s --connect-timeout 5 --max-time 10 http://169.254.169.254/latest/meta-data/ami-id || true)
# /etc/issue is not guaranteed to exist; an empty value is better than aborting
OS_VERS=$(cat /etc/issue 2>/dev/null || true)
# glibc package version, used later in the name of the output file
which dpkg && GLIBC_VERS="$(dpkg -l libc6|awk '/libc6/{print$3}')"
which rpm && GLIBC_VERS="$(rpm -q glibc --queryformat '%{version}-%{release}')"
# print whichever OS release files are present
[ -f /etc/os-release ] && cat /etc/os-release
[ -f /etc/system-release ] && cat /etc/system-release
[ -f /etc/system-release-cpe ] && cat /etc/system-release-cpe
# directly download unicode spec, will use this to generate all legal code points
UNICODE_VERS="14"
# -f makes curl exit non-zero on an HTTP error (404, 5xx) instead of saving the
#    server error page as UnicodeData.txt, so "set -e" stops the run before bad
#    data reaches the generator
# -k disables certificate verification
#    NOTE(review): presumably needed for the stale CA bundles on very old
#    distributions - confirm before removing
# -O saves the file under its remote name (UnicodeData.txt)
time curl -fkO "https://www.unicode.org/Public/${UNICODE_VERS}.0.0/ucd/UnicodeData.txt"
# this perl program will read the unicode spec source and use it to output each
# legal code point. for each code point, we output all the strings specified
# in the main README. note that the output is split into multiple files of 500k
# strings each. those files can be sorted in parallel to leverage multiple CPUs for
# increased performance.
#
# perl switches: -n loops over every input line, -a together with -F';' splits
# each line on semicolons into @F, and -CO writes STDOUT as UTF-8.
#
# the number at the end of each print line is the pattern id. its first digit
# is the length of the generated string in characters (N99 is the code point
# repeated N times). patterns 304, 324, 404 and 424 therefore use TWO spaces,
# in line with the doubled BB/OO/33/.. variants next to them; with a single
# space they would only repeat patterns 204, 214, 334 and 354.
#
# IMPORTANT: make sure to keep this block in sync with README and table.sh
#
time perl -naF';' -CO -e'
use utf8;
# pr(CODEPOINT): print every test string for one numeric code point
sub pr {
  # length 1
  print chr($_[0]) . "\n"; # 199
  # length 2: code point followed by one filler character
  print chr($_[0]) . "B\n"; # 200
  print chr($_[0]) . "O\n"; # 201
  print chr($_[0]) . "3\n"; # 202
  print chr($_[0]) . ".\n"; # 203
  print chr($_[0]) . " \n"; # 204
  print chr($_[0]) . "様\n"; # 205
  print chr($_[0]) . "ク\n"; # 206
  # length 2: one filler character followed by the code point
  print "B" . chr($_[0]) . "\n"; # 210
  print "O" . chr($_[0]) . "\n"; # 211
  print "3" . chr($_[0]) . "\n"; # 212
  print "." . chr($_[0]) . "\n"; # 213
  print " " . chr($_[0]) . "\n"; # 214
  print "様" . chr($_[0]) . "\n"; # 215
  print "ク" . chr($_[0]) . "\n"; # 216
  print chr($_[0]) . chr($_[0]) . "\n"; # 299
  # length 3: code point followed by a doubled filler
  print chr($_[0]) . "BB\n"; # 300
  print chr($_[0]) . "OO\n"; # 301
  print chr($_[0]) . "33\n"; # 302
  print chr($_[0]) . "..\n"; # 303
  print chr($_[0]) . "  \n"; # 304
  print chr($_[0]) . "様様\n"; # 305
  print chr($_[0]) . "クク\n"; # 306
  # length 3: code point between two fillers
  print "B" . chr($_[0]) . "B\n"; # 310
  print "O" . chr($_[0]) . "O\n"; # 311
  print "3" . chr($_[0]) . "3\n"; # 312
  print "." . chr($_[0]) . ".\n"; # 313
  print " " . chr($_[0]) . " \n"; # 314
  print "様" . chr($_[0]) . "様\n"; # 315
  print "ク" . chr($_[0]) . "ク\n"; # 316
  # length 3: doubled filler followed by the code point
  print "BB" . chr($_[0]) . "\n"; # 320
  print "OO" . chr($_[0]) . "\n"; # 321
  print "33" . chr($_[0]) . "\n"; # 322
  print ".." . chr($_[0]) . "\n"; # 323
  print "  " . chr($_[0]) . "\n"; # 324
  print "様様" . chr($_[0]) . "\n"; # 325
  print "クク" . chr($_[0]) . "\n"; # 326
  # length 3: doubled code point followed by one filler
  print chr($_[0]) . chr($_[0]) . "B\n"; # 330
  print chr($_[0]) . chr($_[0]) . "O\n"; # 331
  print chr($_[0]) . chr($_[0]) . "3\n"; # 332
  print chr($_[0]) . chr($_[0]) . ".\n"; # 333
  print chr($_[0]) . chr($_[0]) . " \n"; # 334
  print chr($_[0]) . chr($_[0]) . "様\n"; # 335
  print chr($_[0]) . chr($_[0]) . "ク\n"; # 336
  # length 3: one filler between two code points
  print chr($_[0]) . "B" . chr($_[0]) . "\n"; # 340
  print chr($_[0]) . "O" . chr($_[0]) . "\n"; # 341
  print chr($_[0]) . "3" . chr($_[0]) . "\n"; # 342
  print chr($_[0]) . "." . chr($_[0]) . "\n"; # 343
  print chr($_[0]) . " " . chr($_[0]) . "\n"; # 344
  print chr($_[0]) . "様" . chr($_[0]) . "\n"; # 345
  print chr($_[0]) . "ク" . chr($_[0]) . "\n"; # 346
  # length 3: one filler followed by a doubled code point
  print "B" . chr($_[0]) . chr($_[0]) . "\n"; # 350
  print "O" . chr($_[0]) . chr($_[0]) . "\n"; # 351
  print "3" . chr($_[0]) . chr($_[0]) . "\n"; # 352
  print "." . chr($_[0]) . chr($_[0]) . "\n"; # 353
  print " " . chr($_[0]) . chr($_[0]) . "\n"; # 354
  print "様" . chr($_[0]) . chr($_[0]) . "\n"; # 355
  print "ク" . chr($_[0]) . chr($_[0]) . "\n"; # 356
  print "3B" . chr($_[0]) . "\n"; # 380
  print chr($_[0]) . chr($_[0]) . chr($_[0]) . "\n"; # 399
  # length 4: doubled code point followed by a doubled filler
  print chr($_[0]) . chr($_[0]) . "BB\n"; # 400
  print chr($_[0]) . chr($_[0]) . "OO\n"; # 401
  print chr($_[0]) . chr($_[0]) . "33\n"; # 402
  print chr($_[0]) . chr($_[0]) . "..\n"; # 403
  print chr($_[0]) . chr($_[0]) . "  \n"; # 404
  print chr($_[0]) . chr($_[0]) . "様様\n"; # 405
  print chr($_[0]) . chr($_[0]) . "クク\n"; # 406
  # length 4: doubled code point between two fillers
  print "B" . chr($_[0]) . chr($_[0]) . "B\n"; # 410
  print "O" . chr($_[0]) . chr($_[0]) . "O\n"; # 411
  print "3" . chr($_[0]) . chr($_[0]) . "3\n"; # 412
  print "." . chr($_[0]) . chr($_[0]) . ".\n"; # 413
  print " " . chr($_[0]) . chr($_[0]) . " \n"; # 414
  print "様" . chr($_[0]) . chr($_[0]) . "様\n"; # 415
  print "ク" . chr($_[0]) . chr($_[0]) . "ク\n"; # 416
  # length 4: doubled filler followed by a doubled code point
  print "BB" . chr($_[0]) . chr($_[0]) . "\n"; # 420
  print "OO" . chr($_[0]) . chr($_[0]) . "\n"; # 421
  print "33" . chr($_[0]) . chr($_[0]) . "\n"; # 422
  print ".." . chr($_[0]) . chr($_[0]) . "\n"; # 423
  print "  " . chr($_[0]) . chr($_[0]) . "\n"; # 424
  print "様様" . chr($_[0]) . chr($_[0]) . "\n"; # 425
  print "クク" . chr($_[0]) . chr($_[0]) . "\n"; # 426
  print "3B" . chr($_[0]) . "B\n"; # 480
  print "3B-" . chr($_[0]) . "\n"; # 481
  print chr($_[0]) . chr($_[0]) . chr($_[0]) . chr($_[0]) . "\n"; # 499
  # length 5: mixed patterns with tab, hyphen, emoji and symbols
  print "BB" . chr($_[0]) . chr($_[0]) . "\t\n"; # 580
  print "\tBB" . chr($_[0]) . chr($_[0]) . "\n"; # 581
  print "BB-" . chr($_[0]) . chr($_[0]) . "\n"; # 582
  print "🙂👍" . chr($_[0]) . "❤™\n"; # 583
  print chr($_[0]) . chr($_[0]) . ".33\n"; # 584
  print "3B-" . chr($_[0]) . "B\n"; # 585
  print chr($_[0]) . chr($_[0]) . chr($_[0]) . chr($_[0]) . chr($_[0]) . "\n"; # 599
}
# $F[0] is the code point in hex and $F[2] the general category
if(/<control>/){next}; # skip control characters
if($F[2] eq "Cs"){next}; # skip surrogates
# a range is listed as a First line and a Last line: remember the start on
# First, then expand every code point of the range on Last
if(/ First>/){$fi=hex("0x".$F[0]);next}; # generate blocks
if(/ Last>/){$la=hex("0x".$F[0]);for($fi..$la){pr($_)};next};
pr(hex("0x".$F[0])) # generate individual characters
' UnicodeData.txt |split -l500000 - _base-characters
# write counts of strings (lines) to stdout
wc _base-characters*
# write locale to stdout, so that we have proof it was correct
locale
# this logic (which sorts files in parallel and waits for all to complete) was written
# to be backwards-compatible to bash 3.2.25
date
# start one background sort per chunk and remember each PID. the shell glob
# replaces parsing the output of ls, and the file names are quoted
SORT_PIDS=""
for FILE in _base-characters*; do
  sort "$FILE" -o "_s$FILE" &
  SORT_PIDS="$SORT_PIDS $!"
done
jobs
# a bare "wait" always returns 0, which would hide a failed sort (for example
# a full disk) and let the merge run on incomplete data. waiting on each PID
# returns that job's exit status, so "set -e" aborts the script on a failure
for PID in $SORT_PIDS; do
  wait "$PID"
done
date
# note that "sort -m" expects the input files to all be pre-sorted
time sort -m _s_base-characters* -o "unicode-${UNICODE_VERS}-chars-sorted-glibc-${GLIBC_VERS}.txt"
# cleanup: remove the unsorted chunks, the sorted chunks and the downloaded spec
rm -v \
  _base-characters* \
  _s_base-characters* \
  UnicodeData.txt
# report file sizes and the final number of strings (lines) on stdout; the
# total can be cross-checked against the count printed before sorting
ls -ltr
wc unicode-${UNICODE_VERS}-chars-sorted-glibc-${GLIBC_VERS}.txt
# the simple two-line test that started it all; run it as well so that its
# result is part of the captured stdout
# cf. https://wiki.postgresql.org/wiki/Locale_data_changes
{ echo "1-1"; echo "11"; } | LC_COLLATE=en_US.UTF-8 sort
date