# -*- coding: utf-8 -*-
"""
Convert articles from a Wikipedia dump to (sparse) vectors. The input is a
bz2-compressed dump of Wikipedia articles, in XML format.
This script was built on the one provided in gensim:
`gensim.scripts.make_wikicorpus`
"""
from gensim.models import TfidfModel, LsiModel
from gensim.corpora import Dictionary, WikiCorpus, MmCorpus
from gensim import similarities
from gensim import utils
import time
import sys
import logging
import os


def formatTime(seconds):
    """
    Takes a number of elapsed seconds and returns a string in the format h:mm.
    """
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return "%d:%02d" % (h, m)

# ======== main ========
# Main entry point for the script.
# This little check has to do with the multiprocessing module (which is used
# by WikiCorpus). Without it, the code will spawn infinite processes and hang!
if __name__ == '__main__':

    # Set up logging.
    # This little snippet is to fix an issue with qtconsole that you may or
    # may not have... Without this, I don't see any logs in Spyder.
    # Source: http://stackoverflow.com/questions/24259952/logging-module-does-not-print-in-ipython
    root = logging.getLogger()
    for handler in root.handlers[:]:
        root.removeHandler(handler)

    # Create a logger.
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    # Set the timestamp format to just hours, minutes, and seconds (no ms).
    #
    # Record the log to a file 'log.txt'--there are just under 5,000 lines of
    # logging statements, so I've chosen to write these to a file instead of
    # to the console. It's safe to have the log file open while the script is
    # running, so you can check progress that way if you'd like.
    logging.basicConfig(filename='log.txt', format='%(asctime)s : %(levelname)s : %(message)s', datefmt='%H:%M:%S')
    logging.root.setLevel(level=logging.INFO)

    # Download this file to get the latest wikipedia dump:
    # https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
    # On Jan 18th, 2017 it was ~13GB
    dump_file = './data/enwiki-latest-pages-articles.xml.bz2'
    # ======== STEP 1: Build Dictionary =========
    # The first step is to parse through all of Wikipedia and identify all of
    # the unique words that we want to have in our dictionary.
    # This is a long process--it took 3.2 hrs. on my Intel Core i7 4770.
    if True:

        # Create an empty dictionary.
        dictionary = Dictionary()

        # Create the WikiCorpus object. This doesn't do any processing yet
        # since we've supplied the dictionary.
        wiki = WikiCorpus(dump_file, dictionary=dictionary)

        print('Parsing Wikipedia to build Dictionary...')
        sys.stdout.flush()

        t0 = time.time()

        # Now it's time to parse all of Wikipedia and build the dictionary.
        # This is a long process, 3.2 hrs. on my Intel i7 4770. It will update
        # you every 10,000 documents.
        #
        # wiki.get_texts() will only return articles which pass a couple of
        # filters that weed out stubs, redirects, etc. If you included all of
        # those, Wikipedia is more like ~17M articles.
        #
        # For each article, it's going to add the words in the article to the
        # dictionary.
        #
        # If you look inside add_documents, you'll see that it calls doc2bow--
        # this generates a bag-of-words vector, but we're not keeping it. The
        # dictionary isn't finalized until all of the articles have been
        # scanned, so we don't know the right mapping of words to ids yet.
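        #
        # (As a quick illustration of doc2bow: given a dictionary that already
        # contains the tokens, dictionary.doc2bow(['human', 'interface', 'human'])
        # returns (token_id, count) pairs such as [(0, 2), (1, 1)]--the actual
        # ids depend on the dictionary.)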
        #
        # You can use the prune_at parameter to prevent the dictionary from
        # growing too large during this process, but I think it's interesting
        # to see the total count of unique tokens before pruning.
        dictionary.add_documents(wiki.get_texts(), prune_at=None)

        print(' Building dictionary took %s' % formatTime(time.time() - t0))
        print(' %d unique tokens before pruning.' % len(dictionary))
        sys.stdout.flush()

        keep_words = 100000

        # The initial dictionary is huge (~8.75M words in my Wikipedia dump),
        # so let's filter it down. We want to keep the words that are neither
        # very rare nor overly common. To do this, we will keep only words
        # that appear in at least 20 articles, but not more than 10% of all
        # documents. Finally, we'll also put a hard limit on the dictionary
        # size and just keep the 'keep_words' most frequent words.
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=keep_words)

        # Write out the dictionary to disk.
        # For my run, this file is 769KB when compressed.
        # TODO -- This text format lets you peruse it, but you can
        # compress it better as binary...
        wiki.dictionary.save_as_text('./data/dictionary.txt.bz2')
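
        # As a quick sanity check (illustrative only--token ids and counts
        # will vary with your dump), you could reload the filtered dictionary
        # and poke at it:
        #
        #   d = Dictionary.load_from_text('./data/dictionary.txt.bz2')
        #   print(len(d))       # <= keep_words (100,000)
        #   print(d[0])         # the token mapped to id 0
        #   print(d.dfs[0])     # how many documents that token appeared in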
    else:
        # Nothing to do here.
        print('')
    # ======== STEP 2: Convert Articles To Bag-of-words ========
    # Now that we have our finalized dictionary, we can create bag-of-words
    # representations for the Wikipedia articles. This means taking another
    # pass over the Wikipedia dump!
    if True:

        # Load the dictionary if you're just running this section.
        dictionary = Dictionary.load_from_text('./data/dictionary.txt.bz2')
        wiki = WikiCorpus(dump_file, dictionary=dictionary)

        # Turn on metadata so that wiki.get_texts() returns the article titles.
        wiki.metadata = True

        print('\nConverting to bag of words...')
        sys.stdout.flush()

        t0 = time.time()

        # Generate bag-of-words vectors (term-document frequency matrix) and
        # write these directly to disk.
        # On my machine, this took 3.53 hrs.
        # By setting metadata = True, this will also record all of the article
        # titles into a separate pickle file, 'bow.mm.metadata.cpickle'
        MmCorpus.serialize('./data/bow.mm', wiki, metadata=True, progress_cnt=10000)

        print(' Conversion to bag-of-words took %s' % formatTime(time.time() - t0))
        sys.stdout.flush()

        # Load the article titles back.
        id_to_titles = utils.unpickle('./data/bow.mm.metadata.cpickle')

        # Create the reverse mapping, from article title to index.
        titles_to_id = {}

        # For each article...
        for at in id_to_titles.items():
            # `at` is (index, (pageid, article_title)), e.g., (0, ('12', 'Anarchism')).
            # at[1][1] is the article title.
            # The pageid is unused.
            titles_to_id[at[1][1]] = at[0]

        # Store the resulting map.
        utils.pickle(titles_to_id, './data/titles_to_id.pickle')
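
        # Illustrative lookup (assuming 'Anarchism' made it through the
        # article filters, as in the example tuple above):
        #
        #   titles_to_id['Anarchism']   # -> 0, its row index into bow.mm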

        # We're done with the article titles so free up their memory.
        del id_to_titles
        del titles_to_id

        # To clean up some memory, we can delete our original dictionary and
        # wiki objects, and load back the dictionary directly from the file.
        del dictionary
        del wiki

        # Load the dictionary back from disk.
        # (0.86sec on my machine loading from an SSD)
        dictionary = Dictionary.load_from_text('./data/dictionary.txt.bz2')

        # Load the bag-of-words vectors back from disk.
        # (0.8sec on my machine loading from an SSD)
        corpus_bow = MmCorpus('./data/bow.mm')

    # If we previously completed this step, just load the pieces we need.
    else:
        print('\nLoading the bag-of-words corpus from disk.')

        # Load the bag-of-words vectors back from disk.
        # (0.8sec on my machine loading from an SSD)
        corpus_bow = MmCorpus('./data/bow.mm')

    # ======== STEP 3: Learn tf-idf model ========
    # At this point, we're all done with the original Wikipedia text, and we
    # just have our bag-of-words representation.
    # Now we can look at the word frequencies and document frequencies to
    # build a tf-idf model which we'll use in the next step.
    if True:
        print('\nLearning tf-idf model from data...')
        t0 = time.time()

        # Build a Tfidf Model from the bag-of-words dataset.
        # This took 47 min. on my machine.
        # TODO - Why not normalize?
        model_tfidf = TfidfModel(corpus_bow, id2word=dictionary, normalize=False)

        print(' Building tf-idf model took %s' % formatTime(time.time() - t0))
        model_tfidf.save('./data/tfidf.tfidf_model')
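
        # Applying the learned model to a single bag-of-words vector is just
        # indexing (illustrative; corpus_bow[0] is whatever article happens
        # to be first in the corpus):
        #
        #   vec_tfidf = model_tfidf[corpus_bow[0]]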

    # If we previously completed this step, just load the pieces we need.
    else:
        print('\nLoading the tf-idf model from disk.')
        model_tfidf = TfidfModel.load('./data/tfidf.tfidf_model')

    # ======== STEP 4: Convert articles to tf-idf ========
    # We've learned the word statistics and built a tf-idf model; now it's time
    # to apply it and convert the vectors to the tf-idf representation.
    if True:
        print('\nApplying tf-idf model to all vectors...')
        t0 = time.time()

        # Apply the tf-idf model to all of the vectors.
        # This took 1hr. and 40min. on my machine.
        # The resulting corpus file is large--17.9 GB for me.
        MmCorpus.serialize('./data/corpus_tfidf.mm', model_tfidf[corpus_bow], progress_cnt=10000)

        print(' Applying tf-idf model took %s' % formatTime(time.time() - t0))
    else:
        # Nothing to do here.
        print('')

    # ======== STEP 5: Train LSI on the articles ========
    # Learn an LSI model from the tf-idf vectors.
    if True:

        # The number of topics to use.
        num_topics = 300

        # Load the tf-idf corpus back from disk.
        corpus_tfidf = MmCorpus('./data/corpus_tfidf.mm')

        # Train LSI.
        print('\nLearning LSI model from the tf-idf vectors...')
        t0 = time.time()

        # Build the LSI model.
        # This took 2 hrs. and 7 min. on my machine.
        model_lsi = LsiModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary)

        print(' Building LSI model took %s' % formatTime(time.time() - t0))

        # Write out the LSI model to disk.
        # The LSI model is big, but not as big as the corpus.
        # The largest piece is the projection matrix:
        #   100,000 words x 300 topics x 8 bytes per value x (1MB / 2^20 bytes) = ~229MB
        # This is saved as `lsi.lsi_model.projection.u.npy`.
        model_lsi.save('./data/lsi.lsi_model')
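
        # To get a feel for what LSI learned, you could print a few topics
        # (illustrative--the topic contents depend entirely on your dump):
        #
        #   model_lsi.print_topics(5)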

    # If we previously completed this step, just load the pieces we need.
    else:
        # Load the tf-idf corpus and trained LSI model back from disk.
        corpus_tfidf = MmCorpus('./data/corpus_tfidf.mm')
        model_lsi = LsiModel.load('./data/lsi.lsi_model')

    # ========= STEP 6: Convert articles to LSI with index ========
    # Transform the corpus to LSI space and index it.
    if True:
        print('\nApplying LSI model to all vectors...')
        t0 = time.time()

        # You could apply the LSI model to all of the tf-idf vectors and
        # write them to disk as an MmCorpus, but this is huge--33.2GB.
        #MmCorpus.serialize('./data/corpus_lsi.mm', model_lsi[corpus_tfidf], progress_cnt=10000)

        # Instead, we'll convert the vectors to LSI and store them as a dense
        # matrix, all in one step.
        index = similarities.MatrixSimilarity(model_lsi[corpus_tfidf], num_features=num_topics)
        index.save('./data/lsi_index.mm')

        print(' Applying LSI model took %s' % formatTime(time.time() - t0))
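
        # Once the index exists, a similarity query is just a matter of pushing
        # a new document through the same pipeline (illustrative sketch;
        # 'query_tokens' stands in for your own tokenized, lowercased text):
        #
        #   query_bow = dictionary.doc2bow(query_tokens)
        #   query_lsi = model_lsi[model_tfidf[query_bow]]
        #   sims = index[query_lsi]   # cosine similarity against every article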