webCrawlerWithLatencyTimer.py
'''
Created on Sep 5, 2015
@author: tlisowski
'''
from lxml import html
import requests
import os
import csv
import json
from threading import Thread, Lock
from datetime import datetime, timedelta
import socket
from ipwhois import IPWhois
import urllib3.contrib.pyopenssl
import time
class WebContentCrawler(object):
'''
Web Crawler for web page content analysis system
'''
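    #On-disk layout produced by this class (derived from the store methods below):
    #  web/<year_month_day_hour>/<domain>.html  - fetched page content
    #  ip/<year_month_day_hour>/<domain>.json   - whois data for the resolved IP
    #  webLatency.csv / ipWhoIsLatency.csv      - per-domain fetch and whois latencies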
    '''
    Constructor
    @param webPageListPath: path to .csv file containing the Alexa Top 1M web pages
    '''
    def __init__(self, webPageListPath):
'''
Constructor
'''
self.webpageListPath=webPageListPath;
#Max threads that can be active at one time
self.threadMax=175;
#number of active threads
self.activeThreads=0;
#number of completed threads
self.completedThreads=0;
#mutex for synchronization
self.mutex=Lock();
#mutex to examine latencies
self.webContentLatencyMutex=Lock();
self.ipWhoIsLatencyMutex=Lock();
self.ipWhoIsLatencyFile=open('ipWhoIsLatency.csv','a');
self.webContentLatencyFile=open('webLatency.csv','a');
#total pages to be fetched
self.totalPages=0;
#fix for SSL issues with python library
urllib3.contrib.pyopenssl.inject_into_urllib3();
return;
def __del__(self):
self.ipWhoIsLatencyFile.close();
self.webContentLatencyFile.close();
    '''
    getIPandWhoIsData maps a domain to an IP and gets whois data for that IP address
    @param url: domain to resolve and get whois data for
    @return: dictionary containing the domain as key and whois info as value
    '''
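    #Illustrative return shape (hypothetical values; exact keys depend on the ipwhois version):
    #  {'example.com': {'asn': '15133', 'nets': [...], 'resolved_IP': '93.184.216.34'}}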
    def getIPandWhoIsData(self,url):
        try:
            ip=socket.gethostbyname(url);
            obj=IPWhois(ip);
            #time only the whois lookup itself
            timeBefore=time.time();
            whoIsDict=obj.lookup();
            timeAfter=time.time();
            latency=timeAfter-timeBefore;
            #'with' releases the mutex even if the write raises
            with self.ipWhoIsLatencyMutex:
                self.ipWhoIsLatencyFile.write(url+','+str(latency)+'\n');
            whoIsDict['resolved_IP']=ip;
            return {url : whoIsDict};
        except Exception:
            #resolution or lookup failed; contribute nothing for this domain
            return dict();
    '''
    storeIP stores the accumulated IP whois data as .json files
    @param ipDict: dictionary of domains and whois data
    @param timePath: directory path to store the json files in
    '''
    def storeIP(self,ipDict,timePath):
        if not os.path.exists(timePath):
            os.makedirs(timePath);
        for key in ipDict:
            try:
                f= open(timePath+key+'.json','a');
                json.dump(ipDict[key],f);
                f.close();
            except Exception:
                #discard any partially written file for this domain
                if os.path.exists(timePath+key+'.json'):
                    os.remove(timePath+key+'.json');
        return;
    '''
    fetchDomTree gets the web page for a given url
    @param url: url of the page to fetch
    @return: dictionary mapping the url to its web page content as a DOM tree
    '''
    def fetchDomTree(self,url):
        try:
            #try plain HTTP first
            completeUrl='http://'+url;
            timeBefore=time.time();
            #stream=True returns once headers arrive, so the latency measures time to first response
            page= requests.get(completeUrl,stream=True,timeout=3);
            timeAfter=time.time();
            latency=timeAfter-timeBefore;
            tree= html.fromstring(page.content);
            with self.webContentLatencyMutex:
                self.webContentLatencyFile.write(url+','+str(latency)+'\n');
        except Exception:
            try:
                #fall back to HTTPS, skipping certificate verification
                completeUrl='https://'+url;
                timeBefore=time.time();
                page= requests.get(completeUrl,stream=True,timeout=3,verify=False);
                timeAfter=time.time();
                latency=timeAfter-timeBefore;
                tree= html.fromstring(page.content);
                with self.webContentLatencyMutex:
                    self.webContentLatencyFile.write(url+','+str(latency)+'\n');
            except Exception:
                return dict();
        return {url : tree}
    '''
    fetchAndUpdateDict fetches data for a domain and updates the accumulation dictionaries
    @param url: domain (url) to fetch data for
    @param urlTreeDict: dictionary containing accumulated url and web content data
    @param ipdict: dictionary containing whois data for resolved IPs
    '''
    def fetchAndUpdateDict(self,url,urlTreeDict,ipdict):
        tempDict= self.fetchDomTree(url);
        tempIPDict=self.getIPandWhoIsData(url);
        #update the shared dictionaries and thread counters under the mutex
        with self.mutex:
            urlTreeDict.update(tempDict);
            ipdict.update(tempIPDict);
            self.completedThreads=self.completedThreads+1;
            self.activeThreads=self.activeThreads-1;
        return;
    '''
    storeTree stores each entry of the domain and web content dictionary as a .html file
    @param urlDomTreeDictionary: dictionary containing web content data
    @param timePath: directory to store content in
    '''
    def storeTree(self,urlDomTreeDictionary,timePath):
        if not os.path.exists(timePath):
            os.makedirs(timePath);
        for key in urlDomTreeDictionary:
            try:
                f= open(timePath+key+'.html','a');
                f.write(html.tostring(urlDomTreeDictionary[key]));
                f.close();
            except Exception:
                #discard any partially written file for this domain
                if os.path.exists(timePath+key+'.html'):
                    os.remove(timePath+key+'.html');
        return;
'''
loadTreeFromMem loads a stored webpage as a DOM tree
@param path: file to load webpage from
@return: DOM tree of stored web page
'''
def loadTreeFromMem(self,path):
f= open(path,'r');
tree=html.fromstring(f.read());
f.close();
return tree;
'''
loadIpDataFromMem loads whois IP data from a stored .json file
@param path: file to load whois data from
@return: dictionary containing ipwhois data
'''
def loadIpDataFromMem(self,path):
f=open(path,'r');
ipDict=json.load(f);
f.close();
return ipDict;
    '''
    fetchAndStoreTopPages fetches a range of pages from the Alexa Top 1M and stores them in a directory named for the fetch time
    @param startPage: page rank to start the fetch from
    @param numPages: number of pages to fetch
    @param fetchTime: datetime of the fetch (used to name the storage directories)
    '''
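    #Example (hypothetical values): fetch the top 100 pages for the current hour:
    #  crawler.fetchAndStoreTopPages(1, 100, datetime.now())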
    def fetchAndStoreTopPages(self,startPage,numPages,fetchTime):
        self.totalPages=numPages;
        storeCheckPoint=0;
        storeInterval=self.threadMax;
        #zero-pad so directory names sort lexicographically in chronological order
        timeString='%04d_%02d_%02d_%02d/' % (fetchTime.year,fetchTime.month,fetchTime.day,fetchTime.hour);
        f= open(self.webpageListPath,'r');
        csvReader=csv.reader(f,delimiter=',');
        urlTreeDict={};
        iPDict={};
        newCheck=storeCheckPoint*storeInterval+storeInterval;
        checkPoint= min(newCheck,numPages);
        startPage=startPage-1;
        #skip ahead to the starting page in the csv file
        while startPage>0:
            next(csvReader);
            startPage-=1;
        #start fetching data
        while True:
            self.mutex.acquire();
            #if the active thread limit is not reached, spawn another fetching thread
            if self.activeThreads<self.threadMax and self.completedThreads+self.activeThreads < self.totalPages:
                row=next(csvReader);
                t= Thread(target=self.fetchAndUpdateDict, args=(row[1],urlTreeDict,iPDict));
                #run in background
                t.daemon=True;
                t.start();
                self.activeThreads=self.activeThreads+1;
            #if at a storing checkpoint, flush the accumulated data to disk
            if self.completedThreads>=checkPoint:
                self.storeTree(urlTreeDict, 'web/'+timeString);
                urlTreeDict.clear();
                self.storeIP(iPDict, 'ip/'+timeString);
                iPDict.clear();
                storeCheckPoint=storeCheckPoint+1;
                newCheck=storeCheckPoint*storeInterval+storeInterval;
                checkPoint= min(newCheck,numPages);
            #if all pages are fetched, release the mutex and exit the loop
            if self.completedThreads==self.totalPages:
                self.mutex.release();
                break;
            self.mutex.release();
        #reset class variables for the next call
        self.completedThreads=0;
        self.activeThreads=0;
        f.close();
        return;
    '''
    getPagesInTime returns historic page content and whois data for the resolved ip
    @param url: domain (url) to fetch content for
    @param startDate: beginning date to get data from, in the form [year,month,day,hour]
    @param differenceFormat: amount of time to go back from the start date for the historic fetch,
    in the form [weeks,days,hours]
    @return: dictionary containing whois data and page content for the selected domain
    '''
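    #Example (hypothetical values): compare google.com as fetched on 2015-09-05 14:00
    #against the fetch from one day earlier:
    #  crawler.getPagesInTime('google.com', [2015, 9, 5, 14], [0, 1, 0])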
    def getPagesInTime(self,url,startDate,differenceFormat):
        #differenceFormat is [weeks,days,hours]; startDate is [year,month,day,hour]
        for i in differenceFormat:
            if not isinstance(i,int):
                print 'ERROR: all differenceFormat values must be ints';
                return list();
        dateDiff=timedelta(hours=differenceFormat[2],days=differenceFormat[1],weeks=differenceFormat[0]);
        #zero-pad to match the directory naming used by fetchAndStoreTopPages
        startDateString='%04d_%02d_%02d_%02d' % (startDate[0],startDate[1],startDate[2],startDate[3]);
        endDate=datetime(startDate[0],startDate[1],startDate[2],startDate[3])-dateDiff;
        endDateString='%04d_%02d_%02d_%02d' % (endDate.year,endDate.month,endDate.day,endDate.hour);
        if not (os.path.exists('web/') and os.path.exists('ip/')):
            print 'ERROR: storage directories do not exist';
            return list();
        dirList=sorted(os.listdir('web/'),reverse=True);
        if len(dirList)<2:
            #not enough fetches stored to compute a difference
            return list();
        actualStartTime='';
        actualEndTime='';
        #look for the stored fetch dates closest to the start and end times
        for i in range(0,len(dirList)):
            if actualStartTime=='':
                if startDateString >= dirList[i]:
                    actualStartTime=dirList[i];
                    #skip ahead so the end date gets a strictly older fetch
                    continue;
            if actualEndTime=='':
                if endDateString>= dirList[i]:
                    actualEndTime=dirList[i];
                    continue;
            if actualStartTime!='' and actualEndTime!='':
                break;
        #corner case: requested times are older than any stored fetch
        if actualStartTime=='' or actualEndTime=='':
            if actualStartTime=='' or actualStartTime==dirList[-1]:
                #the start time is always set first, so set both to the two oldest fetches
                actualStartTime=dirList[-2];
                actualEndTime=dirList[-1];
            else:
                #only the end time remains unset; use the oldest fetch
                actualEndTime=dirList[-1];
        startPage=self.loadTreeFromMem('web/'+actualStartTime+'/'+url+'.html');
        startIpData=self.loadIpDataFromMem('ip/'+actualStartTime+'/'+url+'.json');
        endPage=self.loadTreeFromMem('web/'+actualEndTime+'/'+url+'.html');
        endIpData=self.loadIpDataFromMem('ip/'+actualEndTime+'/'+url+'.json');
        returnDict=dict();
        returnDict['start']=[actualStartTime,startPage,startIpData];
        returnDict['past']=[actualEndTime,endPage,endIpData];
        return returnDict;
    def processPage(self,url):
        #stub: hook for per-page content analysis (not yet implemented)
        return;
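#A minimal usage sketch (hypothetical file name 'top-1m.csv' in the Alexa
#rank,domain format): fetch the top 50 pages, then compare google.com
#against a fetch from one day earlier.
if __name__ == '__main__':
    crawler = WebContentCrawler('top-1m.csv');
    now = datetime.now();
    crawler.fetchAndStoreTopPages(1, 50, now);
    history = crawler.getPagesInTime('google.com', [now.year, now.month, now.day, now.hour], [0, 1, 0]);
    if history:
        print 'start fetch: ' + history['start'][0];
        print 'past fetch: ' + history['past'][0];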