-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmini_spider.py
155 lines (124 loc) · 4.83 KB
/
mini_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# -*- coding: utf-8 -*-
import base64
import binascii
import configparser
import logging
import os
import urllib
import urllib.parse
from queue import Queue

import requests
import rsa

import util
from crawl_thread import WeiboCrawlerThread
class LoginSina(object):
    """Perform the Sina Weibo SSO login flow and return session cookies.

    Flow: base64-encode the username ("su"), fetch server parameters
    (servertime / nonce / pubkey) from the prelogin endpoint, RSA-encrypt
    the password with them, POST the login form, then follow the returned
    cross-domain ticket URL to obtain weibo.com cookies.
    """

    def __init__(self, username, password, config):
        # Credentials are supplied by the caller (read from config.ini).
        self.username = username
        self.password = password
        # Reuse the configured User-Agent for every request in the flow.
        self.headers = {'User-Agent': config['Main']['User-Agent']}

    def get_su(self):
        """Return the 'su' form field: base64(url-quoted username), as str."""
        quoted = urllib.parse.quote_plus(self.username)
        return base64.b64encode(quoted.encode("utf-8")).decode("utf-8")

    def get_server_data(self, su):
        """Fetch prelogin parameters (servertime, nonce, pubkey, ...) as a dict.

        :param su: the encoded username produced by get_su().
        """
        pre_url = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack" \
                  "&su={}&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.18)&_= "
        pre_data_res = requests.get(pre_url.format(su), headers=self.headers)
        # SECURITY NOTE(review): eval() on a network response executes whatever
        # the server sends. The payload is JSON wrapped in a JS callback, so
        # json.loads on the stripped text would be safer — confirm the payload
        # is strict JSON before switching. Kept as eval to preserve behavior.
        server_data = eval(pre_data_res.content.decode("utf-8").replace(
            "sinaSSOController.preloginCallBack", ''))
        return server_data

    def get_password(self, password, server_time, nonce, pubkey):
        """RSA-encrypt the password with the server-supplied parameters.

        Returns the hex-encoded ciphertext (bytes) for the 'sp' form field.
        """
        rsaPublickey = int(pubkey, 16)  # pubkey modulus arrives as hex text
        key = rsa.PublicKey(rsaPublickey, 65537)  # 65537: standard public exponent
        # Sina's scheme: encrypt "servertime\tnonce\npassword".
        message = str(server_time) + '\t' + str(nonce) + '\n' + str(password)
        passwd = rsa.encrypt(message.encode("utf-8"), key)
        return binascii.b2a_hex(passwd)

    def get_cookies(self):
        """Run the whole login flow and return a requests CookieJar for weibo.com."""
        su = self.get_su()
        d = self.get_server_data(su)
        postdata = {
            'entry': 'sso',
            'gateway': '1',
            'from': 'null',
            'savestate': '0',
            'useticket': '0',
            'pagerefer': 'http://login.sina.com.cn/sso/logout.php?entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout'
                         '.php%3Fbackurl',
            'vsnf': '1',
            'su': su,
            'service': 'sso',
            'servertime': d['servertime'],
            'nonce': d['nonce'],
            'pwencode': 'rsa2',
            'rsakv': '1330428213',
            'sp': self.get_password(self.password, d['servertime'], d['nonce'], d['pubkey']),
            'sr': '1366*768',
            'encoding': 'UTF-8',
            'cdult': '3',
            'domain': 'sina.com.cn',
            'prelt': '27',
            'returntype': 'TEXT'
        }
        login_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
        res = requests.post(login_url, data=postdata, headers=self.headers)
        # Parse the response once (the original eval'd res.text twice).
        # SECURITY NOTE(review): same eval-of-untrusted-data concern as above.
        login_result = eval(res.text)
        cross_url = login_result['crossDomainUrlList'][0]
        # Everything from 'ticket' onward is the credential for the next hop.
        ticket = cross_url[cross_url.find('ticket'):]
        new_url = 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack' \
                  '&{}&retcode=0'.format(
            ticket)
        return requests.get(new_url).cookies
def get_query_list(query_num, off_set, config):
    """Placeholder query builder; always yields an empty list.

    NOTE(review): main() uses util.get_query_list instead — this stub
    appears unused; confirm against other callers before removing.
    """
    return []
def main():
    """Entry point: read config, set up logging, log in, run crawl rounds.

    Each round builds a queue of queries from util.get_query_list, fans
    crawler threads out over it, and repeats until max_num queries have
    been dispatched or no queries remain.
    """
    config = configparser.ConfigParser()
    config.read('config.ini')

    # Create the download directory if it does not exist yet.
    dir_name = os.path.join(os.getcwd(), config['Main']['download_dir_name'])
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

    # Log DEBUG and above both to the file 'log' and to the console.
    logger = logging.getLogger('logger')
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh = logging.FileHandler('log')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)

    # Log in once and share the session cookies with every crawler thread.
    username = config['Weibo']['username']
    password = config['Weibo']['password']
    ls = LoginSina(username, password, config)
    cookie = ls.get_cookies()

    cnt = 0  # total queries dispatched so far
    # Hoist loop-invariant config parsing out of the loop.
    max_num = int(config['Main']['max_num'])
    crawler_num = int(config['Main']['crawler_num'])
    while cnt < max_num:
        # A freshly constructed Queue is always empty, so fill it
        # unconditionally (the original's empty() guard was always true).
        idQueue = Queue()
        query_list = util.get_query_list(cnt, config)
        for q in query_list:
            idQueue.put(q)
        # All queries consumed: nothing left to crawl.
        if idQueue.empty():
            break
        # Launch one crawler thread per configured slot, then wait for all.
        threadList = []
        for idx in range(crawler_num):
            name = 'crawler-{}'.format(idx)
            thread = WeiboCrawlerThread(name, idQueue, config, logger, cookie)
            thread.start()
            threadList.append(thread)
        for thread in threadList:
            thread.join()
        cnt += len(query_list)
        # Lazy %-style args so formatting is skipped when the level is off.
        logger.info("%s queries have been conducted. ", cnt)
    logger.info("Main thread quit. ")
# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()