crawler.py
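"""Crawl Weibo posts from the m.weibo.cn search API.

For a given search keyword, each results page is fetched in its own worker
process, post fields are extracted with jsonpath, cleaned, merged into a
single pandas DataFrame, de-duplicated by Weibo id, and written to a CSV file.
"""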
import datetime  # convert GMT timestamps to a standard format
import multiprocessing as mp  # process pool for parallel crawling
import os
import re  # regular expressions for stripping HTML tags
import pandas as pd  # read/write CSV files
import requests  # send HTTP requests
from jsonpath import jsonpath  # extract fields from JSON data
import config


def trans_time(v_str):
    """Convert a GMT-style timestamp to the standard "%Y-%m-%d %H:%M:%S" format."""
    gmt_format = "%a %b %d %H:%M:%S +0800 %Y"
    time_array = datetime.datetime.strptime(v_str, gmt_format)
    ret_time = time_array.strftime("%Y-%m-%d %H:%M:%S")
    return ret_time
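# Example: trans_time("Mon Jul 18 10:00:00 +0800 2022") -> "2022-07-18 10:00:00"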


def get_weibo_list(v_keyword, page):
    """
    Crawl one page of Weibo posts from the search results.
    :param v_keyword: search keyword
    :param page: page number to crawl
    :return: DataFrame with one row per post
    """
    # Request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62",
        "accept": "application/json, text/plain, */*",
        "accept-encoding": "gzip, deflate, br",
    }
print("===开始爬取第{}页微博==".format(page))
# 请求地址
url = "https://m.weibo.cn/api/container/getIndex"
# 请求参数
params = {
"containerid": "100103type=1&q={}".format(v_keyword),
"page_type": "searchall",
"page": page,
}
# 发送请求
r = requests.get(url, headers=headers, params=params)
# 解析json数据
cards = r.json()["data"]["cards"]
# 微博内容
text_list = jsonpath(cards, "$..mblog.text")
# 微博内容-正则表达式数据清洗
dr = re.compile(r"<[^>]+>", re.S)
text2_list = []
if not text_list: # 如果未获取到微博内容,进入下一轮循环
print("{}页爬取失败".format(page))
return pd.DataFrame(
columns=["页码", "微博id", "微博bid", "微博作者", "发布时间", "微博内容", "转发数", "评论数", "点赞数"]
)
    # The empty case was handled above, so text_list is a non-empty list here
    for text in text_list:
        text2 = dr.sub("", text)  # strip HTML tags from the post text
        text2_list.append(text2)
    # Creation time
    time_list = jsonpath(cards, "$..mblog.created_at")
    time_list = [trans_time(v_str=i) for i in time_list]
    # Author
    author_list = jsonpath(cards, "$..mblog.user.screen_name")
    # Weibo id
    id_list = jsonpath(cards, "$..mblog.id")
    # Weibo bid
    bid_list = jsonpath(cards, "$..mblog.bid")
    # Repost count
    reposts_count_list = jsonpath(cards, "$..mblog.reposts_count")
    # Comment count
    comments_count_list = jsonpath(cards, "$..mblog.comments_count")
    # Like count
    attitudes_count_list = jsonpath(cards, "$..mblog.attitudes_count")
    # Collect the lists into a DataFrame
    df = pd.DataFrame(
        {
            "页码": [page] * len(id_list),
            "微博id": id_list,
            "微博bid": bid_list,
            "微博作者": author_list,
            "发布时间": time_list,
            "微博内容": text2_list,
            "转发数": reposts_count_list,
            "评论数": comments_count_list,
            "点赞数": attitudes_count_list,
        }
    )
    return df
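# For a quick single-page check without the process pool, something like
# get_weibo_list("keyword", 1) can be run directly ("keyword" is a placeholder).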


def crawler(max_page, search_keyword):
    v_weibo_file = "dataset/test/微博清单_{}_前{}页.csv".format(search_keyword, max_page)
    # Make sure the output directory exists, and remove any previous result file
    os.makedirs(os.path.dirname(v_weibo_file), exist_ok=True)
    if os.path.exists(v_weibo_file):
        os.remove(v_weibo_file)
        print("File already existed and was removed: {}".format(v_weibo_file))
    res_l = []  # holds the async results returned by the pool
    po = mp.Pool(10)  # process pool with 10 workers
    for page in range(max_page):
        res = po.apply_async(
            get_weibo_list, (search_keyword, page + 1)
        )  # get_weibo_list does the actual crawling
        res_l.append(res)
    po.close()  # stop accepting new tasks
    po.join()  # wait for all workers to finish; must come after close()
    print("Crawling finished, merging the results...")
    df = pd.DataFrame(
        columns=["页码", "微博id", "微博bid", "微博作者", "发布时间", "微博内容", "转发数", "评论数", "点赞数"]
    )  # accumulator for the per-page results
    for res in res_l:
        res_ = pd.DataFrame(res.get())
        df = pd.concat([df, res_])  # merge each page's result into one DataFrame
    # Data cleaning: drop duplicate posts by Weibo id
    df.drop_duplicates(subset=["微博id"], inplace=True, keep="first")
    # Save to a CSV file
    df.to_csv(v_weibo_file, index=False, encoding="utf-8")
    print("Data saved")
    return df


if __name__ == "__main__":
    # Number of pages to crawl
    max_page = config.max_page
    # Search keyword
    search_keyword = config.search_keyword
    # Run the crawler and save the results
    df = crawler(max_page, search_keyword)
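# Note: config.py is expected to provide max_page and search_keyword, e.g.
# (illustrative values only):
#   max_page = 10
#   search_keyword = "关键词"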