# sakura_like.py
from bs4 import BeautifulSoup
import requests
import re
import time
import os
from urllib.request import urlretrieve

host = 'halihali2.com'
list_host = '121.4.190.96:9991'  # listing service address; not used in this file
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}
# Per-season JS file that maps episode numbers to candidate video URLs.
js_url = 'http://d.gqyy8.com:8077/ne2/s%s.js?1622904201'
# In-memory cache of season info, keyed by season id (string).
cached = {}
def get_resp(url, with_try=(1, 3, 5, 10, 30)):
    """GET a URL, retrying after each delay (in seconds) in with_try before giving up."""
    try:
        resp = requests.get(url, headers=headers)
    except Exception as e:
        print('connection error')
        if len(with_try) == 0:
            raise e
        time.sleep(with_try[0])
        return get_resp(url, with_try[1:])
    resp.encoding = 'utf-8'
    return resp
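
# Retry semantics, for reference: with the defaults above, a flaky URL is
# retried after 1, 3, 5, 10, and 30 seconds before the exception propagates;
# get_resp(url, with_try=()) fails fast with no retries.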
def get_season_info(season_id):
    """Scrape metadata and the episode list for one season; returns None if unavailable."""
    season_id = str(season_id)
    if season_id in cached:
        return cached[season_id]
    resp = get_resp('http://%s/acg/%s/' % (host, season_id))
    # "该链接已失效|无此片源" = "this link has expired" / "no such source"
    if re.search(r'该链接已失效|无此片源', resp.text):
        return None
    page = BeautifulSoup(resp.text, features='html.parser')
    season = {}
    season['id'] = season_id
    info_div = page.select('div.wrap > div.content.mb.clearfix > div.info')[0]
    season['name'] = list(info_div.select('dl > dt.name')[0].strings)[0]
    strings = list(info_div.select('dl')[0].strings)
    for i in range(len(strings)):
        if re.search(r'地区', strings[i]):    # 地区 = region
            season['area'] = strings[i + 1]
        elif re.search(r'年代', strings[i]):  # 年代 = year
            season['age'] = strings[i + 1]
    tag_a = info_div.select('dl > dd > a')
    season['tags'] = []
    for a in tag_a:
        season['tags'].append(a.text)
    img = page.select('div.wrap > div.content.mb.clearfix > div.pic > img')[0]
    season['cover'] = img['data-original'].replace('http://', 'https://')
    # Second text node of the description block.
    season['evaluate'] = list(info_div.select('.desd .des2')[0].strings)[1]
    season['episodes'] = []
    tag_a = page.select('#stab_1_71 > ul > li > a')
    resp = get_resp(js_url % season_id)
    videos = find_videos(resp.text)
    for a in tag_a:
        if re.search(r'备', a.text):  # skip backup (备用) source links
            continue
        num = int(re.findall(r'/(\d+)\.html$', a['href'])[0])
        if num not in videos:
            continue
        # Keep only episodes that have at least one direct .mp4 URL.
        if not re.search(r'\.mp4', str(videos[num])):
            continue
        for url in videos[num]:
            if re.search(r'\.mp4', url):
                video = url
                break
        season['episodes'].append({
            'title': a.text,
            'video': video,
            'id': '%s-%d' % (season_id, num),
            'name': season['name'] + ': ' + a.text,
        })
    cached[season_id] = season
    return season if season['episodes'] else None
def get_episode_info(epid):
    """Look up a single episode by its '<season_id>-<num>' id; returns None if not found."""
    season_id, ep_num = epid.split('-')
    season = get_season_info(season_id)
    if season is None:
        return None
    for ep in season['episodes']:
        if ep['id'] == epid:
            return ep
def find_videos(js):
    """Parse the season JS file into {episode number: [candidate URLs]}."""
    videos = {}
    pattern = r'%s\[(\d+)\]="(.*?),.*?"'
    # Primary sources live in playarr[num] = "url,...".
    for num, url in re.findall(pattern % 'playarr', js):
        url = url.replace('http://', 'https://')
        videos[int(num)] = [url]
    # Alternate sources live in numbered arrays: playarr_1, playarr_2, ...
    arr_num = 1
    while re.findall(pattern % ('playarr_%d' % arr_num), js):
        for num, url in re.findall(pattern % ('playarr_%d' % arr_num), js):
            url = url.replace('http://', 'https://')
            if int(num) in videos:
                videos[int(num)].append(url)
            else:
                videos[int(num)] = [url]
        arr_num += 1
    return videos
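
# Illustration only: a synthetic JS snippet in the shape the regex above
# expects (an assumed format, not captured from the real site):
#
#   playarr[1]="https://example.com/ep1.mp4,extra";
#   playarr_1[1]="https://example.com/ep1-alt.m3u8,extra";
#
# On that input, find_videos would return
#   {1: ['https://example.com/ep1.mp4', 'https://example.com/ep1-alt.m3u8']}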
def download(url, path):
    """Download a video into <path>/video.mp4, creating the directory if needed."""
    if not os.path.exists(path):
        os.makedirs(path)
    filename = os.path.join(path, 'video.mp4')
    print('start download from %s to %s' % (url, filename))
    # Note: urlretrieve does not send the custom `headers` used elsewhere.
    urlretrieve(url, filename)
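
# A minimal usage sketch tying the pieces together. The season id '12345'
# is a placeholder, not a known id on the site.
if __name__ == '__main__':
    season = get_season_info('12345')  # hypothetical season id
    if season is None:
        print('season unavailable')
    else:
        print(season['name'], season['tags'])
        for ep in season['episodes']:
            print(ep['id'], ep['title'], ep['video'])
        # Download the first episode into ./downloads/<episode id>/video.mp4
        first = season['episodes'][0]
        download(first['video'], os.path.join('downloads', first['id']))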