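"""Scrapy spider that walks Baidu search results for donation-acceptance
notices ("接受 捐赠 公告"), extracts each result's title, URL, and host,
and hands them to the local ``downloader.NoticeDownloader`` helper.
Crawling stops once ten already-downloaded notices have been seen.
"""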
import json
import random
import time

import scrapy
from scrapy.http import Response

from downloader import NoticeDownloader

# START_URL = 'https://www.baidu.com/s?ie=utf-8&mod=1&isbd=1&isid=bc2f909300079a0e&ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=%E6%8E%A5%E5%8F%97%20%E6%8D%90%E8%B5%A0%20%E5%85%AC%E5%91%8A&oq=%25E6%258E%25A5%25E6%2594%25B6%2520%25E6%258D%2590%25E8%25B5%25A0%2520%25E5%2585%25AC%25E5%2591%258A&rsv_pq=eded3efd00091080&rsv_t=9ef98hp0HSiUm0bNbdA84SEEpwePky%2BBs%2FjCdnuez7JYqJgE%2FgZJeXl0%2Ff4&rqlang=cn&rsv_enter=0&rsv_dl=tb&inputT=14413&si=gov.cn&ct=2097152&gpc=stf%3D1580299698%2C1580386098%7Cstftype%3D1&bs=%E6%8E%A5%E5%8F%97%20%E6%8D%90%E8%B5%A0%20%E5%85%AC%E5%91%8A&rsv_sid=undefined&_ss=1&clist=8812c631e533d725%098812c631e533d725&hsug=&f4s=1&csor=2&_cr1=42814'
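# The wd parameter decodes to the Baidu query `intitle:接受 捐赠 公告`
# ("notice of accepting donations"); the gpc=stf%3D<start>%2C<end> part
# appears to restrict results to a Unix-timestamp date range.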
START_URL = 'https://www.baidu.com/s?ie=utf-8&mod=1&isbd=1&isid=e073477000175be2&wd=intitle%3A%E6%8E%A5%E5%8F%97%20%E6%8D%90%E8%B5%A0%20%E5%85%AC%E5%91%8A&pn=0&oq=intitle%3A%E6%8E%A5%E5%8F%97%20%E6%8D%90%E8%B5%A0%20%E5%85%AC%E5%91%8A&ie=utf-8&rsv_idx=1&rsv_pq=e073477000175be2&rsv_t=145ejrjh88UrFNYml03NHWT29F56s0Ypgo3gKNFiy1su3V3MwByPiNXWtEw&gpc=stf%3D1580486400%2C1580572798%7Cstftype%3D2&tfflag=1&bs=intitle%3A%E6%8E%A5%E5%8F%97%20%E6%8D%90%E8%B5%A0%20%E5%85%AC%E5%91%8A&rsv_sid=undefined&_ss=1&clist=89eed845b2bd4677&hsug=&f4s=1&csor=0&_cr1=35370'

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0',
}


class NoticeSpider(scrapy.Spider):
    name = 'notice-spider'
    start_urls = [START_URL]
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': headers,
    }

    def parse(self, response):
        """
        :type response: Response
        """
        duplicate_cnt = 0
        for result in response.css('div.result .f13'):
            # Each Baidu result embeds its metadata as JSON in the
            # 'data-tools' attribute.
            item = result.css('::attr(data-tools)').get()
            if not item:
                continue
            item = json.loads(item)
            title = item['title']
            url = item['url']
            host = result.css('.c-showurl ::text').get()
            try:
                host = host.split('/', 1)[0]
            except AttributeError:  # host is None when the selector matches nothing
                host = ''
            if 'nor-src-wrap' in host:
                continue  # Skip Baijiahao entries...
            print(title, host)
            # NoticeDownloader is a local helper; a non-None get_status()
            # is taken to mean this notice was downloaded before.
            nd = NoticeDownloader(url, title, host)
            if nd.get_status() is not None:
                duplicate_cnt += 1
                if duplicate_cnt == 10:
                    # Ten already-downloaded notices: assume we have caught
                    # up with earlier crawls and stop.
                    print('Crawling Finished!')
                    return
        # The last 'a.n' link is "next page" (the first may be "previous page").
        next_page = response.css('a.n ::attr(href)').getall()
        next_page = next_page[-1] if next_page else None
        if next_page:
            # Random delay to stay under Baidu's rate limiting.
            time.sleep(random.randint(1, 5))
            yield response.follow(next_page, self.parse)
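

# A minimal way to run this spider, assuming Scrapy is installed and
# downloader.py sits next to this file:
#
#     scrapy runspider crawler.py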