-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathcharsets.js
111 lines (111 loc) · 3.38 KB
/
charsets.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
// We use this to generate a charset normalization table from iconv's charsets:
// `charsets.txt` is extracted from node-iconv's README.md.
var fs = require('fs');
// Raw text of `charsets.txt`. The path is relative, so Node resolves it against
// the current working directory: run this script from the folder holding it.
var text = fs.readFileSync('charsets.txt', 'utf-8');
// One element per input line; both CRLF and LF line endings are accepted.
var lines = text.split(/\r\n|\n/);
// Maps each group heading to the array of charsets listed beneath it.
var groups = {};
// Heading most recently seen while reading `lines` (undefined until the first).
var group;
// Appends the charset name(s) described by `charset` to `charsets`.
//
// A plain entry such as `UTF-8` is appended as is. An entry ending in a brace
// group, such as `ISO-8859-{1,2,15}` or `Mac{Roman,Greek}`, is expanded into one
// name per suffix (`ISO-8859-1`, `ISO-8859-2`, ...). Whitespace inside the
// braces is ignored.
//
// Entries containing the word "experimental" are skipped entirely.
//
// Throws an Error if the `}` is not the last character of the entry, or if the
// suffix list is not two or more all-numeric or all-alphabetic suffixes.
function expandCharset(charset, charsets) {
  // The parentheses here are a regex group, not literal characters, so this
  // matches "experimental" anywhere in the entry, case-insensitively.
  if (/(experimental)/i.test(charset)) return;

  var braceStart = charset.indexOf('{');
  if (braceStart < 0) return pushCharset(charset, charsets);

  // The first `}` must also be the final character of the entry.
  var braceEnd = charset.indexOf('}');
  if (braceEnd !== charset.length - 1) {
    throw new Error('bad closing } position: ' + charset);
  }

  var stem = charset.slice(0, braceStart);
  var list = charset.slice(braceStart + 1, braceEnd).replace(/\s/g, '');

  // Suffixes may be all digits or all letters, never a mixture.
  var isNumericList = /^(\d+,)+\d+$/.test(list);
  var isAlphaList = /^([a-z]+,)+[a-z]+$/i.test(list);
  if (!(isNumericList || isAlphaList)) {
    throw new Error('bad suffixes: ' + charset);
  }

  var parts = list.split(',');
  for (var i = 0; i < parts.length; i++) {
    pushCharset(stem + parts[i], charsets);
  }
}
// Appends `charset` to the end of the `charsets` array. No de-duplication is
// done here.
function pushCharset(charset, charsets) {
  charsets[charsets.length] = charset;
}
// Splits one comma-separated line of charsets and hands each non-empty,
// trimmed entry to `expandCharset`, which appends the resulting name(s) to
// `charsets`.
//
// A brace group such as `ISO-8859-{1,2}` contains commas of its own, so the
// whole `{...}` span is kept together as part of a single entry instead of
// being split at its inner commas.
//
// Throws an Error if a `{` has no matching `}`, or if a `}` is followed by
// anything other than optional whitespace and then a comma or the end of the
// line. (Text directly after `}` used to be skipped or mis-split silently.)
function splitCharsets(line, charsets) {
  function push(charset) {
    var trimmed = charset.trim();
    if (trimmed) expandCharset(trimmed, charsets);
  }
  var start = 0;
  var index = 0;
  var length = line.length;
  while (index < length) {
    var char = line[index];
    if (char === ',') {
      push(line.slice(start, index));
      index = start = index + 1;
    } else if (char === '{') {
      var close = line.indexOf('}', index + 1);
      if (close === -1) throw new Error('closing } not found: ' + line);
      // Find what terminates this entry: only whitespace may sit between the
      // `}` and the separating comma (or the end of the line).
      var next = close + 1;
      while (next < length && /\s/.test(line[next])) next++;
      if (next < length && line[next] !== ',') {
        throw new Error('unexpected text after }: ' + line);
      }
      push(line.slice(start, close + 1));
      // Resume just past the comma (or past the end of the line).
      index = start = next + 1;
    } else {
      index++;
    }
  }
  if (start < length) push(line.slice(start, length));
}
// Read `charsets.txt` line by line. A line that starts at column 0 is a group
// heading; a line that starts with whitespace lists charsets belonging to the
// most recent heading. Blank lines are ignored.
lines.forEach(
  function(line) {
    var entry = line.trim();
    if (entry === '') return;

    var isHeading = !/^\s/.test(line);
    if (isHeading) {
      group = entry;
      return;
    }

    // An indented line before any heading has no group to belong to.
    if (!group) throw new Error('unknown group: ' + line);
    if (!groups.hasOwnProperty(group)) groups[group] = [];
    splitCharsets(entry, groups[group]);
  }
);
// Flatten every group into one sorted list of charsets, leaving out the groups
// whose heading says they are defined "in terms of" something else.
var charsets = Object.keys(groups)
  .filter(
    function(heading) {
      if (/Full Unicode, in terms of/i.test(heading)) return false;
      if (/Locale dependent, in terms of/i.test(heading)) return false;
      return true;
    }
  )
  .reduce(
    function(all, heading) {
      return all.concat(groups[heading]);
    },
    []
  )
  .sort();
// Build the normalization table: each key is a charset name upper-cased with
// everything except A-Z and 0-9 removed, and the value is the name as listed.
// Two names that normalize to the same key are treated as a fatal error.
var canon = {};
charsets.forEach(
  function(name) {
    var normalized = name.toUpperCase().replace(/[^A-Z0-9]/g, '');
    if (canon.hasOwnProperty(normalized)) {
      throw new Error(
        'duplicate charset keys: ' + canon[normalized] + ' vs ' + name
      );
    }
    canon[normalized] = name;
  }
);
// Hand-maintained aliases, added after the generated entries. These are plain
// assignments, so they bypass the duplicate-key check and would overwrite a
// generated entry that had the same key.
//
// Outlook 2007 uses "automatic message encoding" and can get it wrong.
// Specifically, Microsoft uses "ks_c_5601-1987" as an alias for CP949 (Korean).
// See: http://docs.activestate.com/activeperl/5.8/lib/Encode/KR.html
[
  'BKSC56011987',
  'KSC56011987',
  'UHC',
  'WIN949',
  'WINDOWS949',
  'XUHC',
  'XWIN949',
  'XWINDOWS949'
].forEach(
  function(alias) {
    canon[alias] = 'CP949';
  }
);
// ANSI_X3.110-1983 is an ancient character set almost identical to ISO-8859-1.
// NOTE(review): 'ISO88591' has no punctuation and looks like a normalized key
// rather than a charset name such as 'ISO-8859-1' - confirm that whatever
// consumes this table accepts it as a value.
canon['ANSIX31101983'] = 'ISO88591';
// Render the table as an object literal and print it to stdout: one
// `KEY: 'value'` entry per line, keys in sorted order, and no trailing comma
// after the last entry.
var entries = Object.keys(canon).sort().map(
  function(key) {
    return " " + key + ": '" + canon[key] + "'";
  }
);
var string = '{\n';
if (entries.length > 0) string += entries.join(',\n') + '\n';
string += '};';
console.log(string);