forked from kyunghyuncho/DefGen2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess_alldefs.py
111 lines (89 loc) · 2.4 KB
/
preprocess_alldefs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import cPickle as pkl
import numpy
import sys
from collections import OrderedDict
from nltk.tokenize import wordpunct_tokenize
######
# Command-line interface:
#   argv[1] input pickle of {headword: [definition, ...]}
#   argv[2] pickle of word embeddings (headword -> vector)
#   argv[3] output pickle path for the collected (x, y) pairs
#   argv[4] output pickle path for the word dictionary
#   argv[5] (optional) previously saved dictionary to extend
input_file = sys.argv[1]
embedding_file = sys.argv[2]
output_file = sys.argv[3]
dictionary_file = sys.argv[4]
# False signals "build a fresh dictionary from scratch".
existing_dict = sys.argv[5] if len(sys.argv) > 5 else False
######
def main():
with open(input_file, 'rb') as f:
wn_defs = pkl.load(f)
print 'Loading w2v...',
with open(embedding_file, 'rb') as f:
w2v = pkl.load(f)
print 'Done'
# build dictionary
print 'Building a dictionary...',
wordcounts = OrderedDict()
n_defs = 0
for kk, vv in wn_defs.iteritems():
if kk in w2v:
vec = w2v[kk]
elif kk.lower() in w2v:
vec = w2v[kk.lower()]
else:
continue
for dd in vv:
n_defs += 1
words = wordpunct_tokenize(dd.strip())
for ww in words:
if ww not in wordcounts:
wordcounts[ww] = 1
else:
wordcounts[ww] += 1
words = wordcounts.keys()
counts = wordcounts.values()
sorted_idx = numpy.argsort(counts)
if existing_dict:
with open(existing_dict) as inp:
worddict = cPickle.load(inp)
maxval = max(worddict.values())
counter = 0
for idx, sidx in enumerate(sorted_idx[::-1]):
if not words[sidx] in worddict:
counter +=1
worddict[words[sidx]] = maxval + counter
else:
continue
else:
worddict = OrderedDict()
for idx, sidx in enumerate(sorted_idx[::-1]):
worddict[words[sidx]] = idx+2
with open(dictionary_file, 'wb') as f:
pkl.dump(worddict, f)
print 'Done'
x = [None] * n_defs
y = [None] * n_defs
print 'Collection begins...'
ii = 0
for kk, vv in wn_defs.iteritems():
if kk in w2v:
vec = w2v[kk]
elif kk.lower() in w2v:
vec = w2v[kk.lower()]
else:
continue
for dd in vv:
words = wordpunct_tokenize(dd.strip())
seq = [worddict[w] for w in words]
x[ii] = vec
y[ii] = seq
ii += 1
if numpy.mod(ii, 1000):
print ii,'/',n_defs,','
print 'Done'
print 'Saving...',
with open(output_file, 'wb') as f:
pkl.dump(x,f)
pkl.dump(y,f)
print 'Done'
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()