# AnimeScraper.py — scrapes a user's collected-anime list from bangumi.tv
# (186 lines, 5.47 KB in the original repository)
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import requests
from enum import Enum
import time
import mysql.connector
# Random desktop User-Agent per run to make the scraper look like a browser.
ua = UserAgent()
headers={"User-Agent": ua.random}  # shared headers for every HTTP request below
# Pre-built URLs for the 12 pages of this user's "collected" anime list.
urls = [f"https://bangumi.tv/anime/list/556647/collect?page={i}" for i in range(1, 13)]
bangumiPgUrls = []  # filled by loadLinks()/loadPageUrls() with per-anime page URLs
totalPgNum = 12  # total list pages (hard-coded; a dynamic probe exists, commented out, in loadPage1)
currentPgNum = 1  # page cursor used (and reset) by loadAllPages1()
class INFOBOX(Enum):
    """Identifiers for infobox fields (currently only the episode count)."""

    episodes = 1
def soupUrl(url):
    """Fetch *url* and parse the response into a BeautifulSoup tree.

    :param url: URL to download
    :return: BeautifulSoup instance parsed with the lxml parser
    """
    # A timeout keeps one stalled server from hanging the whole scrape
    # forever (requests has no timeout by default).
    source = requests.get(url, headers=headers, timeout=30)
    # bangumi.tv serves UTF-8; force it so CJK text decodes correctly.
    source.encoding = "utf-8"
    return BeautifulSoup(source.text, "lxml")
def fkSpace(strs):
    """Strip leading/trailing whitespace from every string in *strs*.

    :param strs: iterable of strings
    :return: new list of stripped strings (the input is no longer mutated)
    """
    # Comprehension instead of an index loop that rewrote the caller's
    # list in place; all visible callers only use the return value.
    return [s.strip() for s in strs]
def toDate(s):
    """Convert a CJK date such as ``2017年1月5日`` to ``2017-1-5``.

    :param s: date string in ``yyyy年mm月dd日`` form (text without those
        characters passes through unchanged)
    :return: string with 年/月 replaced by ``-`` and 日 removed
    """
    # str.translate performs all three edits in one pass, and the
    # parameter no longer shadows the builtin ``str``.
    return s.translate(str.maketrans("年月", "--", "日"))
# Maps a short command keyword to the exact label text of the matching
# <span> inside a Bangumi page's infobox (the trailing space is part of
# the label and must be kept for the string match in
# getBangumiPgInfoBoxData to succeed).
commands = {"episodes":"话数: ", "studio":"动画制作: ", "original":"原作: ",
            "director":"导演: ", "official":"官方网站: "}
def getBangumiPgInfoBoxData(url, command):
    """Extract one infobox field from a single anime's Bangumi page.

    :param url: URL of the anime's own page
    :param command: a key of ``commands``
        1. "episodes": episode count
        2. "studio": animation studio
        (plus "original", "director", "official")
    :return: the field's text, "未知" when the field is absent,
        or None for an unrecognized command
    """
    cmd = commands.get(command)
    if cmd is None:
        print("Invalid Command!")
        return
    soup = soupUrl(url)
    infoBox = soup.find("ul", id="infobox")
    # Guard against a missing infobox (layout change / blocked request),
    # which previously crashed with AttributeError.
    if infoBox is None:
        return "未知"
    label = infoBox.find("span", string=cmd)
    if label is None:
        return "未知"
    # The value follows the label inside the same <li>, separated by a space.
    return label.parent.text.split(" ")[1]
def printItems(items):
    """Print every anime entry found in one Bangumi list page.

    :param items: list of the page's anime <li> tags
    :return: None
    """
    for entry in items:
        heading = entry.find("h3")
        pageUrl = "https://bangumi.tv" + entry.find("a")["href"]
        # /s/ is the small cover, /c/ the large one.
        coverUrl = "https:" + entry.find("img", class_="cover")["src"].replace("/s/", "/c/")
        info = fkSpace(entry.find("p").text.split("/"))[:3]
        if "年" in info[0]:
            # Summary line starts with the air date, so there is no episode
            # count in it; fetch that from the anime's own page instead.
            episodes = getBangumiPgInfoBoxData(pageUrl, "episodes")
            date = toDate(info[0])
        else:
            episodes = info[0].replace("话", "")
            date = toDate(info[1])
        original = info[2] if len(info) > 2 else "没显示"
        chineseName = heading.find("a").text
        small = heading.find("small")
        japaneseName = small.text if small is not None else chineseName
        print(f"中文名:{chineseName}")
        print(f"日文名:{japaneseName}")
        print(f"首播日期:{date}")
        print(f"集数:{episodes}")
        # This "original" comes from the summary line and can actually be the
        # director or something else; use getBangumiPgInfoBoxData() when
        # accuracy matters.
        print(f"原作:{original}")
        print(f"封面链接:{coverUrl}")
        print(f"Bangumi页面:{pageUrl}")
    return
def loadLinks(items):
    """loadPageUrls HELPER.

    Append the full bangumi.tv page URL of every anime in *items*
    to the module-level ``bangumiPgUrls`` list.

    :param items: list of the page's anime <li> tags
    :return: None
    """
    bangumiPgUrls.extend(
        "https://bangumi.tv" + tag.find("a")["href"] for tag in items
    )
    return
def loadPageUrls(pgUrl):
    """loadAllPagesUrls HELPER.

    Load one page of the Bangumi anime list and collect its anime
    page links into ``bangumiPgUrls``.

    :param pgUrl: URL of the list page
    :return: None
    """
    listItems = soupUrl(pgUrl).find("ul", class_="browserFull").findAll("li")
    loadLinks(listItems)
    return
def loadPage1(pgNum):
    """Load one page of the Bangumi anime list and print its entries.

    :param pgNum: 1-based page number
    :return: None
    """
    pgUrl = "https://bangumi.tv/anime/list/556647/collect?page=" + str(pgNum)
    # Reuse soupUrl() instead of duplicating its request/parse boilerplate.
    soup = soupUrl(pgUrl)
    itemHolder = soup.find("ul", class_="browserFull")
    items = itemHolder.findAll("li")
    # NOTE(review): the total page count could be probed dynamically from
    # span.p_edge (splitting on "\xa0") if the hard-coded totalPgNum ever drifts.
    printItems(items)
    return
def loadAllPages1():
    """Load and print every page of the Bangumi anime list.

    Fast, but the "original work" field may be inaccurate
    (see printItems for why).

    :return: None
    """
    # Iterate with a local counter instead of bumping and then resetting
    # the module-level currentPgNum cursor — the net effect is identical
    # and no global state is mutated.
    for pgNum in range(1, totalPgNum + 1):
        loadPage1(pgNum)
    return
def loadAllPagesUrls():
    """Load every anime's page URL from the whole list into ``bangumiPgUrls``.

    :return: None
    """
    # The previous "global totalPgNum" declaration was dead: this function
    # never assigns (or even reads) it — it iterates the prebuilt urls list.
    for listPgUrl in urls:
        loadPageUrls(listPgUrl)
    return
# Rough timing harness: measures the wall-clock time of a single page
# fetch. Uncomment loadAllPages1() to time a full scrape instead.
start = time.time()
#loadAllPages1()
# NOTE(review): no timeout on this request — consider adding one so a
# stalled server cannot hang the script at import time.
source = requests.get("https://bangumi.tv/anime/list/556647/collect?page=3", headers=headers)
end = time.time()
print(end-start)