-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclass_12_crawler_preliminary.py
54 lines (42 loc) · 1.64 KB
/
class_12_crawler_preliminary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import os
import re

import requests
class MyCrawler:
    """Minimal three-step crawler: download a page, extract fields, save them.

    The extracted records are written to ``filename`` as UTF-8 text, one
    record per line with tab-separated fields, preceded by a header line.
    """

    # Column titles written as the first line of the output file when the
    # caller of ``save`` does not pass its own header.
    DEFAULT_HEADER = ('链接', '标题', '播放量', '弹幕数', 'UP主', '综合得分')

    def __init__(self, filename, timeout=10):
        """Remember the output path and the HTTP settings.

        Args:
            filename: Path of the file that ``save`` (over)writes.
            timeout: Seconds to wait for the server before ``download``
                gives up. Without a timeout ``requests`` may block forever.
        """
        self.filename = filename
        self.timeout = timeout
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        }

    # Step 1: fetch -- download the web page.
    def download(self, url):
        """Return the body of ``url`` as text.

        Raises:
            requests.RequestException: On connection failure, timeout, or
                an HTTP 4xx/5xx response.
        """
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        # Fail loudly on an error status instead of running the regex over
        # an error page and silently saving an empty result.
        response.raise_for_status()
        return response.text

    # Step 2: extract -- pull the wanted fields out of the page.
    def extract(self, content, pattern):
        """Return every match of the regex ``pattern`` in ``content``.

        The result is whatever ``re.findall`` returns: a list of tuples when
        the pattern has several capture groups, a list of strings otherwise.
        """
        return re.findall(pattern, content)

    # Step 3: save -- write the extracted records to disk.
    def save(self, info, header=None):
        """Write ``info`` to ``self.filename`` as tab-separated lines.

        Args:
            info: Iterable of records; each record is a sequence of strings,
                or a single string for one-column results.
            header: Column titles for the first line. Defaults to
                ``DEFAULT_HEADER``.
        """
        if header is None:
            header = self.DEFAULT_HEADER

        # Create the target directory on first use so that a fresh checkout
        # does not fail with FileNotFoundError.
        parent = os.path.dirname(self.filename)
        if parent:
            os.makedirs(parent, exist_ok=True)

        with open(self.filename, 'w', encoding='utf-8') as f:
            f.write('\t'.join(header) + '\n')
            for res in info:
                # With zero or one capture group re.findall yields plain
                # strings; joining a string would put a tab between every
                # character, so wrap it as a one-field record.
                if isinstance(res, str):
                    res = (res,)
                f.write('\t'.join(res) + '\n')

    def crawler(self, url, pattern):
        """Run the whole pipeline: download ``url``, extract, then save."""
        content = self.download(url)
        info = self.extract(content, pattern)
        self.save(info)
# Alternative example (disabled): crawl the zol.com.cn cell-phone hot list.
# filename = './crawler-media/mobile.txt'
# url = 'https://wap.zol.com.cn/top/cell_phone/hot.html'
# pattern = r'<p class="pro-info-name f28">(.*?)</p>[\S\s]*?<span class="pro-info-price f24">(.*?)</span>'
# mycrawler = MyCrawler(filename=filename)
# mycrawler.crawler(url, pattern)

# Crawl the bilibili popular ranking page.
filename = './crawler-media/bilibili_rank.txt'
url = 'https://www.bilibili.com/v/popular/rank/all'
# Raw string so that the regex escapes (\/, \s, \d) reach the re module
# unchanged instead of being flagged as invalid string escape sequences.
# The pattern has six capture groups, one per column of the saved file.
pattern = r'<div class="info"><a href="\/\/(.*?)" target="_blank" class="title">(.*?)<\/a>.*?<\/i>\s+(.*?)\s+.*?<\/i>\s+(.*?)\s+.*?<\/i>\s+(.*?)\s+.*?<div class="pts"><div>(\d+)<\/div>'
mycrawler = MyCrawler(filename=filename)
content = mycrawler.download(url)
info = mycrawler.extract(content, pattern)
mycrawler.save(info)
# Equivalent single call: mycrawler.crawler(url, pattern)