-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathdigest.py
169 lines (145 loc) · 4.9 KB
/
digest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import argparse
import gzip
def edit_dist(s1, s2, lim=0):
last = list(range(len(s1) + 1))
this = [0] * len(last)
for j in range(0, len(s2)):
this[0] = j + 1
for i in range(1, len(this)):
this[i] = min(last[i] + 1,
this[i - 1] + 1,
last[i - 1] + int(s2[j] != s1[i-1]))
if lim and min(this) >= lim:
return min(this)
last, this = this, last
return last[-1]
def build_common(digest, min_dist):
common = {}
prefixes_missing = set(prefixes)
n = 1
out = ''
for line in open('data/1gram_common.csv', encoding='utf-8', errors='replace'):
word_orig = line.split()[0]
word = word_orig.lower()
if word in common:
continue
prefix = word[:3]
if prefix in prefixes:
if min_dist and any(edit_dist(word, k, min_dist) < min_dist for k in common):
continue
if prefix in prefixes_missing:
prefixes_missing.remove(prefix)
common[word] = n
out += word_orig + '\n'
n += 1
if n & 0xff == 0:
print(word, n)
if prefixes_missing:
print(('unable to find %d prefixes in input!: %s'
% (len(prefixes_missing), ' '.join(sorted(prefixes_missing)))))
digest.write('%s\n' % n)
digest.write(out)
print('words:', n)
return common
def build_edges(common):
edges = [[] for _ in range(len(common) + 1)]
last_a = ''
prefix_transitions = set()
for line in gzip.GzipFile('data/2gram.csv.gz'):
parts = line.decode('utf8').lower().split()
if len(parts) == 3:
a, b, count = parts
if a not in common or b not in common:
continue
prefix_a = a[:3]
prefix_b = b[:3]
prefix_transitions.add(prefix_a + prefix_b)
if a != last_a:
if last_a:
if a[0] != last_a[0]:
print('!', last_a)
last_a = a
edges[common[a]].append(common[b])
print('Attested prefix-prefix combinations:', end=' ')
print('%.2f%%' % (100.0 * len(prefix_transitions) /
(1.0 * len(prefixes) * len(prefixes))))
return edges
def encode(l):
''' pack list of monotonically increasing positive integers into a string
replace elements with the difference from the previous element minus one,
runs of zeros with special 'zero repeat' characters (RLE for zeros),
otherwise encode as printable base-32 varints
>>> encode([1, 2, 3, 5, 8, 13, 21])
'bABDG'
'''
enc = ''
last_num = 0
zero_run = 0
for num in l:
delta = num - last_num - 1
assert delta >= 0, "input must be strictly increasing positive integers"
last_num = num
if delta == 0:
zero_run += 1
continue
while zero_run:
enc += chr(0x60 + min(0x1f, zero_run) - 1)
zero_run = max(0, zero_run - 0x1f)
while True:
enc += chr((0x40 if delta < 0x20 else 0x20) | (delta & 0x1f))
delta >>= 5
if not delta:
break
while zero_run:
enc += chr(0x60 + min(0x1f, zero_run) - 1)
zero_run = max(0, zero_run - 0x1f)
return enc
def decode(enc):
''' inverse of encode
>>> deencode('bABDG')
[1, 2, 3, 5, 8, 13, 21]
'''
enc_ind = 0
dec = []
last_num = 0
zero_run = 0
while enc_ind < len(enc) or zero_run:
delta = 0
delta_ind = 0
if zero_run:
zero_run -= 1
else:
val = ord(enc[enc_ind])
if val >= 0x60:
zero_run = val & 0x1f
delta_ind += 1
else:
while True:
val = ord(enc[enc_ind + delta_ind])
delta |= (val & 0x1f) << (5 * delta_ind)
delta_ind += 1
if val & 0x40:
break
enc_ind += delta_ind
num = last_num + delta + 1
last_num = num
dec.append(num)
return dec
for l in ([1, 2, 3, 5], [1], [1, 2, 3, 5, 6, 8, 9, 10, 11, 12, 3000000], list(range(1, 500))):
assert decode(encode(l)) == l
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--prefixes', default='data/prefixes.txt')
parser.add_argument('--output', default='wordlist_bigrams.txt')
parser.add_argument('--min_dist', default=0, type=int)
options = parser.parse_args()
prefixes = set()
for line in open(options.prefixes):
prefixes.add(line.split()[0])
digest = open(options.output, 'w')
common = build_common(digest, options.min_dist)
edges = build_edges(common)
for n, out in enumerate(edges):
digest.write(encode(sorted(set(out))))
digest.write('\n')
digest.close()