-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathcraw_pic.py
145 lines (127 loc) · 4.96 KB
/
craw_pic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# coding=utf-8
"""
******** 整站抓取图片,使用gevent+requests异步下载方式 ********
*1.lxml包xpath工具获取url,其中xpath从chrome开发者工具获取
*2.使requests下载
*3.正则爬取url所有图片
*4.协程处理图片下载任务
*5.加入代理proxies
*6.自动打开文件夹窗口展示图片
requirements:
- requests (2.11.1)
- lxml (3.6.4)
- gevent (1.1.2)
- tqdm (4.23.0)
"""
import re
import os
import time
import requests
from tqdm import tqdm
from lxml import etree
import gevent
from gevent import monkey
# -------- Directory (under the current working directory) where images are stored --------
BASEPATH = '{}/49vvpic/'.format(os.getcwd())
# Landing page that is fetched to discover the site's current root URL.
# (http://qwx2.com/index.html can be requested with curl to find a new ORIGIN_URL.)
ORIGIN_URL = 'http://www.gf3s.com/404.html?/'
# SOCKS5 proxy settings handed to requests; any other tunnel works as long as
# it listens on local port 1080.
PROXIES = {
    'http': 'socks5://127.0.0.1:1080',
    'https': 'socks5://127.0.0.1:1080',
}
TIMEOUT = 10  # request timeout, in seconds
def get_urls(root_url, numth_url):
    """Collect the album links listed on one index page.

    Args:
        root_url: Site root that every ``href`` found on the page is appended to.
        numth_url: URL of the index page to scan.

    Returns:
        A ``(count, pairs)`` tuple where ``pairs`` is a list of
        ``(title, album_url)`` tuples and ``count`` is ``len(pairs)``.

    Raises:
        requests.RequestException: If the page cannot be fetched within
            ``TIMEOUT`` seconds.
    """
    # Bound the request (as download() already does) so a stalled server
    # cannot hang the crawl forever.
    res = requests.get(numth_url, timeout=TIMEOUT)
    selector = etree.HTML(res.content)
    anchors = selector.xpath(
        "//*[@id='gotop']/div[2]/div/div/div[3]/div/ul/li/a")
    urls = []
    for node in anchors:
        # Read the title and the link from the SAME node. Zipping two
        # independent XPath result lists silently mis-pairs every later entry
        # as soon as one anchor has no text (or more than one text node).
        title = ''.join(node.xpath('text()'))
        href = node.get('href')
        if title and href:
            urls.append((title, root_url + href))
    return (len(urls), urls)
def get_pic_url(url):
    """Return the image URLs found on one album page.

    Args:
        url: URL of the album page.

    Returns:
        A list with at most 20 image URLs, in page order.

    Raises:
        requests.RequestException: If the page cannot be fetched within
            ``TIMEOUT`` seconds.
    """
    # Bound the request (as download() already does) so a stalled server
    # cannot hang the crawl forever.
    res = requests.get(url, timeout=TIMEOUT)
    # Images sit in markup shaped like: <a href=""><img src="" alt="" /></a></P>
    pic_urls = re.findall(
        '<a href=".*?"><img src="(.*?)" alt=".*?" /></a></P>',
        res.text, re.S)
    # Keep at most 20 per page because pages carry many duplicate images.
    # A plain slice already returns shorter lists unchanged.
    return pic_urls[:20]
def download(title, url, proxy_on=False):
    """Download one image and write it into the album directory.

    Args:
        title: Directory prefix (ending with a path separator) that the file
            name is appended to.
        url: Image URL. Percent-encoded dots (``%2E`` / ``%2e``) are decoded
            before the request is made.
        proxy_on: When true, the request goes through ``PROXIES``.

    The file name is built from the third-from-last URL component, the
    current timestamp, and the URL's extension.
    """
    proxies = PROXIES if proxy_on else {}
    # Turn every percent-encoded dot back into '.'. The character class
    # already covers both cases, so no flag is needed. Flags must never be
    # passed positionally here: the 4th positional parameter of re.sub is
    # ``count``, so re.IGNORECASE (== 2) would cap the replacements at two.
    url = re.sub(r'%2[eE]', '.', url)
    res = requests.get(url, proxies=proxies, timeout=TIMEOUT)
    print('正在下载图片:' + url)
    # Raw string avoids the invalid '\.' escape warning.
    name = re.split(r'\.|/+', url)
    filesavepath = title + name[-3] + str(time.time()) + '.' + name[-1]
    with open(filesavepath, 'wb') as f:
        f.write(res.content)
def choose(root_url):
    """Ask the user which category and page to crawl and build the index URL.

    Both prompts repeat until the answer is valid. An empty answer selects the
    default: the ``asia`` category for the first prompt and page 1 for the
    second.

    Args:
        root_url: Site root the index path is appended to.

    Returns:
        The URL of the chosen index page. Page 1 is ``index.html``; page N
        (N > 1) is ``index-N.html``.
    """
    areas = ['asia', 'oumei', 'zipai', 'meitui', 'cartoon', '']
    while True:
        area = input(
            '亚洲:asia,欧美:oumei,自拍:zipai,美腿:meitui,动漫:cartoon\n爬取什么区? 请输入选项:')
        if area in areas:
            if area == '':  # default category
                area = 'asia'
            break
        print('错误,请重新输入!')
    while True:
        number = input('爬取第几页?:')
        # ''.isdigit() is False, so the empty answer has to be accepted
        # explicitly; otherwise the "default to the first page" handling
        # below can never be reached.
        if number == '' or number.isdigit():
            break
        print('错误,请重新输入!')
    # First page is "index", second is "index-2", third is "index-3", ...
    number = '' if number in ('1', '') else '-' + number
    return root_url + 'AAtupian/AAtb/' + area + '/index' + number + '.html'
def do_work(BASEPATH, urls, proxy_on=False):
    """Download every album in ``urls`` into its own sub-directory.

    Args:
        BASEPATH: Root directory prefix (expected to end with '/') under which
            one sub-directory per album is created.
        urls: Iterable of ``(album_title, album_page_url)`` pairs.
        proxy_on: Passed through to ``download``.
    """
    urls_dict = dict(urls)
    for pic_name, album_url in tqdm(urls_dict.items()):  # tqdm shows progress
        # A title that contains a path separator would point into a missing
        # nested directory and make the directory creation fail, so flatten it.
        safe_name = pic_name.replace('/', '_').replace(os.sep, '_')
        title = BASEPATH + safe_name + '/'  # one sub-directory per album
        print("\n" + "-----------" * 6)
        try:
            # Titles appear to look like "name (20P)"; the number before "P)"
            # is shown as the picture count.
            print('\n开始下载:{0}, 图片数:{1}张'.format(
                pic_name, pic_name.split("P)")[0].split("(")[-1]))
        except Exception:
            # Deliberate best-effort: a problem while displaying the banner
            # must not abort the download itself.
            pass
        pic_urls = get_pic_url(album_url)  # image URLs found by regex
        if pic_urls:
            # makedirs also creates a missing parent and tolerates an
            # already existing directory.
            os.makedirs(title, exist_ok=True)
        # Download all images of this album concurrently.
        gevent.joinall([gevent.spawn(download, title, pic_url, proxy_on)
                        for pic_url in pic_urls])
        print('\n下载完成!!')
def craw():
    """Run one interactive crawl: find the site root, ask what to fetch, download.

    Steps:
        1. Fetch ``ORIGIN_URL`` and extract the site's current root URL.
        2. Ask the user for a category and page via ``choose``.
        3. Fetch the album list, retrying a limited number of times when the
           page yields no links.
        4. Create ``BASEPATH``, open it in the file manager and download all
           albums through the proxy.

    Exits the process (status 0) when the network is unreachable, the root URL
    cannot be found, or no album links are obtained after all retries.
    """
    import subprocess  # local import: only needed to open the file manager

    max_attempts = 10  # upper bound for re-fetching an index page with 0 links
    try:
        # The root URL is read from a landing page so the script keeps working
        # when the site moves. The timeout keeps a dead host from hanging us.
        r = requests.get(ORIGIN_URL, allow_redirects=True, timeout=TIMEOUT)
    except Exception as e:
        print('网络链接失败!请检查网络:{}\n'.format(e))
        # SystemExit instead of exit(): exit() is a helper injected by the
        # site module for interactive sessions.
        raise SystemExit(0)
    root_url = re.findall(
        '<div class="zhuli">.*?<a href=".*?">(.*?)</a>', r.text, re.S)
    print('root_url:', root_url)
    if not root_url:
        print('网页已挂,请更改ORIGIN_URL。')
        raise SystemExit(0)
    # Let the user pick the category and page (first page, second page, ...).
    numth_url = choose(root_url[0])
    print("正在爬取:{}".format(numth_url))
    # The index page sometimes returns 0 links, so retry. The retry is bounded
    # and paced so a genuinely empty page cannot cause an endless,
    # delay-free request loop.
    for _ in range(max_attempts):
        (numofurl, urls) = get_urls(root_url[0], numth_url)
        if numofurl != 0:
            break
        time.sleep(1)
    else:
        print('多次重试后仍未获取到图集链接,请稍后再试或更换页码。')
        raise SystemExit(0)
    # Create the root download directory.
    os.makedirs(BASEPATH, exist_ok=True)
    # Open the folder in the file manager. An argument list (no shell) keeps
    # paths with spaces or shell metacharacters intact.
    try:
        subprocess.call(['nautilus', BASEPATH])
    except OSError:
        print('无法打开文件夹窗口(未找到nautilus),继续下载。')
    # Start downloading with the proxy enabled.
    do_work(BASEPATH, urls, True)
if __name__ == '__main__':
    # Apply gevent's monkey patches before crawling so the greenlets spawned
    # in do_work() run against the patched standard library.
    # NOTE(review): gevent's documentation recommends patching as early as
    # possible, before other modules are imported; here requests is already
    # imported at module top - confirm this ordering is safe.
    monkey.patch_all()
    craw()