#!/usr/bin/env python3
#+-----------------------------------------------------------------------+
#| Copyright (C) 2020 George Z. Zachos |
#+-----------------------------------------------------------------------+
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Contact Information:
# Name: George Z. Zachos
# Email: gzzachos_at_gmail.com
import os
import sys
import time
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import threading

########################
# Function definitions #
########################

# Print message to STDERR.
def perror(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)
    sys.stderr.flush()

# Article titles starting with a '.' (e.g. '.NET_Framework') would be stored
# as hidden files, so this function replaces a leading '.' with '__dot__'.
def canonicalize(filename):
    if filename.startswith('.'):
        return '__dot__' + filename[1:]
    return filename
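
# For example, canonicalize('.NET_Framework') returns '__dot__NET_Framework',
# while filenames without a leading dot are returned unchanged.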

# Read seeds from a text file.
def read_seeds():
    seeds = []
    try:
        with open(seeds_filename, mode='r', encoding='utf-8') as f:
            for line in f:
                seeds.append(line.strip())
        return seeds
    except OSError as ose:
        perror(seeds_filename + ': ' + ose.strerror)
        sys.exit(ose.errno)
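
# The seed file read by read_seeds() is expected to contain one article path
# per line, relative to url_prefix, e.g. '/wiki/Computer_science' (an
# illustrative path, not necessarily present in crawler-seeds.txt).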

# Parse an HTML document and add to the crawl frontier any hyperlinks not
# already in it. Articles named {ISO,IEC,IEEE}_* and 802.* are excluded to
# allow a larger variety of articles, as they have many near-identical
# variants, e.g. IEEE_802.11{ac,ad,af,ah,ai,ax,ay,be}.
# Return: 1) True if no more hyperlinks need to be extracted
#         2) True if parsing was successful
def expand_frontier(html_text):
    limit_reached = False
    success = True
    title = '<unknown>'
    try:
        soup = BeautifulSoup(html_text, 'html.parser')
        links = soup.find('div', id='mw-content-text').find_all('a')
        title = soup.find('h1', id="firstHeading").string
        for link in links:
            href = str(link.get('href'))
            path_tokens = href.strip('/').split('/')
            if href.startswith('/wiki/') and len(path_tokens) == 2 \
                    and not ('#' in href or ':' in href) \
                    and not ('ISO_' in href or 'IEEE_' in href) \
                    and not ('802.' in href or 'IEC_' in href):
                if href not in crawl_frontier:
                    print('Adding \'%s\' to frontier' % (href))
                    crawl_frontier.append(href)
                    if len(crawl_frontier) == article_limit:
                        limit_reached = True
                        break
    except Exception:
        success = False
        perror('Error parsing article \'%s\' for hyperlinks' % (title))
    return limit_reached, success
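
# Illustrative examples of the href filter in expand_frontier() (not taken
# from a real page):
#   '/wiki/Hash_table'                       -> accepted
#   '/wiki/Category:Data_structures'         -> rejected (contains ':')
#   '/wiki/Hash_table#Collision_resolution'  -> rejected (contains '#')
#   '/wiki/IEEE_802.11ax'                    -> rejected (ISO/IEC/IEEE/802.* excluded)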

# Download an article and parse it to extract more hyperlinks.
# Return: 1) True if no more hyperlinks need to be extracted
#         2) True if hyperlink extraction was successful
def extract_hrefs_from_article(href):
    download_attempts = 0
    limit_reached = success = False
    while download_attempts <= max_downld_retries:
        try:
            url = url_prefix + href
            print('Parsing \'%s\'' % (url))
            req = requests.get(url)
            if req.status_code != 200:
                raise Exception('Status code: ' + str(req.status_code))
            limit_reached, success = expand_frontier(req.text)
            break
        except Exception as e:
            perror('Error extracting hrefs from: \'%s\'' % (url))
        download_attempts += 1
        time.sleep(5)
    return limit_reached, success

# Build crawl frontier using input seeds.
def build_crawl_frontier(seeds):
    global crawl_frontier
    crawl_frontier = seeds
    webpages_parsed = 0
    for href in crawl_frontier:
        limit_reached, success = extract_hrefs_from_article(href)
        if success:
            webpages_parsed += 1
        if limit_reached:
            break
    return crawl_frontier, webpages_parsed
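
# Note: build_crawl_frontier() iterates over crawl_frontier while
# expand_frontier() appends to it, which is what lets the loop keep crawling
# newly discovered articles until article_limit is reached or the list is
# exhausted.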

# Write the collected URLs to urls.txt.
def write_urls_tofile(article_hrefs):
    try:
        with open(repo_path + 'urls.txt', mode='w', encoding='utf-8') as outfile:
            for href in article_hrefs:
                outfile.write(href + '\n')
    except OSError as ose:
        perror('Cannot write \'' + repo_path + 'urls.txt\': ' + ose.strerror)
        sys.exit(ose.errno)

# Download an article and save the raw HTML file.
# Return the number of downloaded articles (0 or 1).
def download_article(href):
    download_attempts = 0
    while download_attempts <= max_downld_retries:
        try:
            url = url_prefix + href
            filename = canonicalize(href.split('/')[-1]) + '.html'
            print('Downloading \'%s\' -> \'%s\'' % (url, filename))
            req = requests.get(url)
            if req.status_code != 200:
                raise RequestException()
            with open(repo_path + filename, mode='w', encoding='utf-8') as outfile:
                outfile.write(req.text)
            return 1  # Downloaded one article
        except RequestException as e:
            # perror(e)
            perror('Error downloading: \'%s\' [attempt %d/%d]' %
                   (url, download_attempts + 1, max_downld_retries + 1))
        except OSError as ose:
            perror('Error writing: \'%s\' -> \'%s\': %s [attempt %d/%d]' %
                   (url, repo_path + filename, ose.strerror,
                    download_attempts + 1, max_downld_retries + 1))
        download_attempts += 1
        time.sleep(1)
    return 0  # No article was stored
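
# download_article() attempts each article at most max_downld_retries + 1
# times (the initial try plus the retries), pausing 1 second between attempts.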

# Statically assign work to each thread.
def calculate_chunk(article_hrefs, tid, num_threads):
    num_hrefs = len(article_hrefs)
    chunksize = num_hrefs // num_threads
    remainder = num_hrefs % num_threads
    if remainder != 0:  # hrefs cannot be divided into equal chunks
        if chunksize == 0:  # num_hrefs < num_threads
            chunksize = 1
            lb = tid
            if tid >= num_hrefs:
                return (-1, -1)
        else:  # num_hrefs > num_threads
            # The first 'remainder' threads get one more href than the rest
            if tid < remainder:
                chunksize += 1
                lb = tid * chunksize
            else:
                lb = remainder * (chunksize + 1) + (tid - remainder) * chunksize
    else:  # Chunksize is the same for all threads
        lb = tid * chunksize
    return (lb, lb + chunksize)
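
# Example: 10 hrefs across 4 threads gives chunksize=2 and remainder=2, so the
# threads are assigned the half-open ranges [0,3), [3,6), [6,8) and [8,10).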

# Perform download of assigned chunk.
def download(article_hrefs, tid, num_threads):
    global total_downloads
    local_downloads = 0
    # Scrape articles assigned to me
    # print("TID %3d: [%d-%d)" % (tid, lb, ub))
    for href in article_hrefs:
        local_downloads += download_article(href)
    # Update total_downloads using synchronization to avoid race conditions
    # perror('down:tid: %d' % (local_downloads))
    with tlock:
        total_downloads += local_downloads
    # print('Thread %3d is exiting...' % (tid))

def multithreaded_download(article_hrefs):
    thread_list = []
    # Create threads
    for i in range(num_threads):
        lb, ub = calculate_chunk(article_hrefs, i, num_threads)
        if lb == -1 and ub == -1:  # No work to be assigned
            continue
        arg_list = (article_hrefs[lb:ub], i, num_threads)
        thread = threading.Thread(target=download, args=arg_list)
        thread_list.append(thread)
        thread.start()
    # Join threads
    for thread in thread_list:
        thread.join()
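
# Note: multithreaded_download() gives each thread its own slice of
# article_hrefs, so no locking is needed during the downloads themselves; only
# the final update of total_downloads in download() is protected by tlock.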

def print_stats(webpages_parsed, frontier_build_time, download_time):
    print('\n############################## STATS ######################################')
    print('Extracted %d hyperlinks from %d articles in %.2f sec' %
          (article_limit, webpages_parsed, frontier_build_time))
    print('Downloaded %d/%d articles in %.2f sec using %d threads [%d processors]'
          % (total_downloads, article_limit, download_time, num_threads, num_processors))
    print('###########################################################################\n')

def print_startup_info():
    print('############################## INFO ##############################')
    print('Raw HTML files will be stored in: \'%s\'' % (repo_path))
    print('##################################################################\n')

def main():
    print_startup_info()
    seeds = read_seeds()
    t0 = time.time()
    article_hrefs, webpages_parsed = build_crawl_frontier(seeds)
    t1 = time.time()
    write_urls_tofile(article_hrefs)
    t2 = time.time()
    multithreaded_download(article_hrefs)
    t3 = time.time()
    frontier_build_time = t1 - t0
    download_time = t3 - t2
    print_stats(webpages_parsed, frontier_build_time, download_time)

###############
# Global data #
###############

repo_path = './repository/'          # Where downloaded HTML files will be stored
url_prefix = 'https://en.wikipedia.org'
seeds_filename = 'crawler-seeds.txt' # Crawler seeds
article_limit = 6000                 # Number of articles to download
num_processors = os.cpu_count()
num_threads = num_processors * 16    # Number of threads used during downloading
max_downld_retries = 3               # How many times (at most) to retry downloading an article
total_downloads = 0                  # How many articles were downloaded by all threads
tlock = threading.Lock()             # Protects access to total_downloads

if __name__ == '__main__':
    main()