categories_script.py
from queue import PriorityQueue
import pickle
import sys

import pandas as pd
top_cat = sys.argv[1]
print("FOR TOP CATEGORY={}".format(top_cat))
class Queue:
    """Priority queue of category names, ordered by depth from the root."""
    def __init__(self, first):
        self.queue = PriorityQueue()
        self.queue.put((0, first))
        self.seen = set([first])
        self.n_searched = 0
        self.res = []

    def next(self):
        self.n_searched += 1
        if self.queue.empty():
            print("empty queue")
            raise ValueError
        return self.queue.get()

    def add_list(self, priority_number, l):
        for e in l:
            if e not in self.seen and isinstance(e, str):
                self.seen.add(e)
                self.queue.put((priority_number + 1, e))
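
# Results are checkpointed below both on Ctrl-C and when the queue empties,
# so a long crawl can be interrupted without losing work.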
def save_values(queue, top_cat):
    """Pickle the seen set, the pending queue and the collected results."""
    with open('queue_seen_{}.pkl'.format(top_cat), 'wb+') as f:
        pickle.dump(queue.seen, f)
    with open('queue_queue_{}.pkl'.format(top_cat), 'wb+') as f:
        pickle.dump(queue.queue.queue, f)
    with open('queue_res_{}.pkl'.format(top_cat), 'wb+') as f:
        pickle.dump(queue.res, f)
    print("OK for pickles, n_searched={}".format(queue.n_searched))
pages_index = pd.read_csv('../wiki_data/seded_index.txt',
                          sep='#', header=None)
categories = pages_index[pages_index[2].astype(str).apply(lambda x: x.startswith('Catégorie:'))]
# index by page key so that lookups are fast
cat_index = categories.set_index(1, drop=True)
# strip the "Catégorie:" prefix from the name
# and replace spaces with _, as Wikipedia does
cat_index[2] = cat_index[2].apply(lambda x: x[10:])
cat_index[2] = cat_index[2].apply(lambda x: x.replace(" ", "_"))
cat_indices = cat_index.index
# the real pages (anything without a namespace prefix such as "Catégorie:")
non_cat = pages_index[~pages_index[2].apply(lambda x: ":" in x if isinstance(x, str) else True)]
non_cat_index = non_cat.set_index(1, drop=True)
non_cat_indices = non_cat_index.index
links = pd.read_csv('../wiki_data/links.csv', sep='#', header=None)
links_index = links.set_index(0, drop=True)[1]
# hashing the strings allows much faster comparison than string equality
hashed_links_index = links_index.apply(hash)
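# Note: hash() can collide, so the equality test in the loop below may in
# rare cases match a different string; an exact re-check against cat
# (e.g. sub[sub == cat]) would rule that out.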
queue = Queue(top_cat)
k = 0
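
# Breadth-first loop: pop the shallowest unexplored category, record the
# non-category pages that point at it (in the Wikipedia dump these appear to
# be the category's member pages) and enqueue the categories that point at it
# (its sub-categories) one level deeper. Runs until the queue empties or the
# user hits Ctrl-C.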
try:
    while True:
        priority, cat = queue.next()
        sub = links_index[hashed_links_index == hash(cat)]
        indices = cat_indices.intersection(sub.index)
        page_indices = non_cat_indices.intersection(sub.index)
        for ix, row in non_cat_index.loc[page_indices].iterrows():
            queue.res.append((ix, row[0], row[2], top_cat, priority))
        queue.add_list(priority, cat_index.loc[indices][2].values)
        if not k % 100:
            print("looking for={} queue_size={} n_searched={} priority={}".format(
                cat, len(queue.seen), queue.n_searched, priority))
        k += 1
except KeyboardInterrupt:
    save_values(queue, top_cat)
except ValueError:
    print("############ Empty queue")
    save_values(queue, top_cat)
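
# Usage sketch ("Physique" is a hypothetical example; the expected spelling
# of the category name depends on how ../wiki_data/links.csv stores it):
#   python categories_script.py Physique
# Reloading a checkpoint afterwards:
#   import pickle
#   with open('queue_res_Physique.pkl', 'rb') as f:
#       res = pickle.load(f)  # entries are (key, column 0, title, top_cat, depth)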