# broken_link_search.py
import argparse
import csv

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess

# HTTP status codes handled by the spider, with their standard meanings.
RESPONSE_CODES = {
    200: "OK",
    301: "Moved Permanently",
    302: "Found",
    400: "Bad Request",
    401: "Unauthorized",
    403: "Forbidden",
    404: "Not Found",
}
# Names of the CSV fields
FIELDS = [
    'origin_url',            # 0
    'origin_status_code',    # 1
    'status_description',    # 2
    'outbound_anchor_text',  # 3
    'outbound_hyperlink',    # 4
    'outbound_status_code',  # 5
]
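
# A row of the raw output CSV therefore looks like (values are illustrative,
# not taken from a real crawl):
#   https://www.gilderlehrman.org/,200,OK,History Now,https://www.gilderlehrman.org/history-now,
# The final outbound_status_code column stays empty in the raw file; CSV_URLs
# fills it in, but only in the broken_*.csv output and only for broken links.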


class GLI_Spider(CrawlSpider):
    """
    Searches the Gilder Lehrman website
    for all hyperlinks and page statuses.
    """
    # Parameters for this CrawlSpider
    name = 'broken_links_from_homepage'
    allowed_domains = ['gilderlehrman.org']
    start_urls = [
        'https://www.gilderlehrman.org/',
        'https://www.gilderlehrman.org/news/'
    ]
    # Let these statuses reach the callback instead of being filtered out
    handle_httpstatus_list = [
        200,
        301,
        302,
        400,
        401,
        403,
        404
    ]
    # How and where the spider will crawl
    rules = [
        Rule(LinkExtractor(allow_domains='gilderlehrman.org'),
             callback='parse_info',
             follow=True)
    ]
    # Another link extractor -- not for crawling but for finding all
    # hyperlinks on a given page (unique=False keeps duplicate links so
    # every anchor on the page is reported).
    le = LinkExtractor(
        allow_domains='gilderlehrman.org',
        unique=False
    )

    def parse_info(self, response):
        """
        Defines what the spider will extract
        from a page that it visits.
        """
        # Extract information from the response
        status = response.status
        # Fall back to a generic label for any status not in the table
        desc = RESPONSE_CODES.get(status, "Unknown")
        internal_links = self.le.extract_links(response)
        # OUTPUT for the CSV
        # If the page is not working:
        if status >= 400:
            yield {
                # Collect only the url and status for this page
                FIELDS[0]: response.url,
                FIELDS[1]: status,
                FIELDS[2]: desc,
            }
        # If the page is working:
        else:
            for out_link in internal_links:
                # Also collect all outbound links on this page
                yield {
                    FIELDS[0]: response.url,
                    FIELDS[1]: status,
                    FIELDS[2]: desc,
                    FIELDS[3]: format_for_csv(out_link.text.strip()),
                    FIELDS[4]: remove_bookmarks(out_link.url)
                }
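
# For a healthy page, each item yielded above becomes one CSV row; an
# illustrative (made-up) example:
#   {'origin_url': 'https://www.gilderlehrman.org/news/',
#    'origin_status_code': 200,
#    'status_description': 'OK',
#    'outbound_anchor_text': 'Read more',
#    'outbound_hyperlink': 'https://www.gilderlehrman.org/some-article'}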


def remove_bookmarks(hyperlink):
    """
    Strips the fragment identifier (anything after a '#') from a hyperlink.
    """
    if '#' in hyperlink:
        index = hyperlink.find('#')
        hyperlink = hyperlink[:index]
    return hyperlink


def format_for_csv(description):
    """
    Removes commas and newlines from text that should
    be a standalone cell in the output CSV.
    """
    split_desc = str(description).split(sep=',')
    split_desc = " ".join(split_desc).splitlines()
    return " ".join(split_desc)
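
# Quick, illustrative checks of the two helpers above (inputs are made up):
#   remove_bookmarks('https://www.gilderlehrman.org/history#top')
#       -> 'https://www.gilderlehrman.org/history'
#   format_for_csv('Read more\nabout history')
#       -> 'Read more about history'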


class CSV_URLs:
    """
    Processes the raw results of the web crawl and outputs
    a new CSV file of the broken links.

    One limitation of Scrapy is that it is not always able to report
    the origin page by which it arrived at a broken page. This class
    resolves the issue by essentially reversing the direction of the
    directed graph, so that the destination pages now point back to
    their origins. The ultimate output of this process is a new CSV
    file exclusively containing the broken links.
    """
    def __init__(self, fname):
        self.filename = fname
        self.broken_pages = dict()
        self.find_broken_pages()
        self.broken_links = list()
        self.find_broken_links()
        self.rewrite_csv()

    def find_broken_pages(self):
        """
        Scans the csv file for any pages with a status code
        of 400 or greater.
        """
        # Open the CSV file and prepare it for reading
        with open(self.filename, newline='') as link_list:
            link_reader = csv.DictReader(link_list, delimiter=',')
            # Find all broken pages in the csv
            for row in link_reader:
                status = int(row[FIELDS[1]])
                # Treat every 4xx (and higher) status as a broken page
                if status >= 400:
                    self.broken_pages[row[FIELDS[0]]] = status

    def find_broken_links(self):
        """
        Scans the csv file for any outbound links that
        lead to a broken page.
        """
        with open(self.filename, newline='') as link_list:
            link_reader = csv.DictReader(link_list, delimiter=',')
            # Find all broken links in the csv
            for row in link_reader:
                if row[FIELDS[4]] in self.broken_pages:
                    self.broken_links.append({
                        FIELDS[0]: row[FIELDS[0]],
                        FIELDS[1]: row[FIELDS[1]],
                        FIELDS[2]: row[FIELDS[2]],
                        FIELDS[3]: row[FIELDS[3]],
                        FIELDS[4]: row[FIELDS[4]],
                        FIELDS[5]: self.broken_pages[row[FIELDS[4]]],
                    })

    def rewrite_csv(self):
        """
        Only pages with broken links persist in the
        final version of the output csv.
        """
        with open("broken_" + self.filename, 'w', newline='') as broken_link_list:
            bll_writer = csv.DictWriter(broken_link_list, FIELDS)
            bll_writer.writeheader()
            for entry in self.broken_links:
                # Each entry already carries exactly the FIELDS keys
                bll_writer.writerow(entry)
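
# Illustrative sketch of what CSV_URLs does (rows are made up, not real crawl
# output). Given a raw row
#   origin_url=/teachers, origin_status_code=200, ..., outbound_hyperlink=/old-lesson
# and a crawled page /old-lesson that returned 404, find_broken_pages() records
# {'/old-lesson': 404}, find_broken_links() matches the raw row against it, and
# rewrite_csv() emits that row with outbound_status_code=404 into broken_<fname>.csv.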


def main():
    # GET ARGUMENTS
    parser = argparse.ArgumentParser(
        description='Crawl gilderlehrman.org and report broken links')
    # Parameters: filename and number of pages to search
    parser.add_argument('--fname', dest='fname', help='Name of output file')
    parser.add_argument('--number', dest='number', help='Number of pages to search')
    args = parser.parse_args()
    # Parse the arguments
    if args.fname is not None:
        links_fname = args.fname + ".csv"
    else:
        links_fname = 'gli_hyperlinks.csv'
    if args.number is not None and args.number.isdigit():
        num_searches = int(args.number)
    else:
        num_searches = 10000
    # Initialize settings for the web crawl
    process = CrawlerProcess(settings={
        'FEEDS': {
            links_fname: {
                'format': 'csv',
                'fields': FIELDS,
                'overwrite': True,
            },
        },
        'CLOSESPIDER_PAGECOUNT': num_searches,
        'LOG_LEVEL': 'CRITICAL',
        # Crawl breadth-first: positive DEPTH_PRIORITY plus FIFO queues
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
    })
    # Start the web crawl
    process.crawl(GLI_Spider)
    process.start()
    # For all broken pages, find the source hyperlink
    CSV_URLs(links_fname)


if __name__ == "__main__":
    main()
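
# Example invocation (assumed; adjust the output name and page budget to taste):
#   python broken_link_search.py --fname gli_links --number 500
# This writes gli_links.csv after crawling roughly 500 pages, then
# broken_gli_links.csv containing only the rows whose outbound link points
# at a broken page.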