
Commit

Add 邪恶漫画吧
turgenevivan committed Apr 23, 2017
1 parent a35d355 commit 65f2f74
Showing 3 changed files with 113 additions and 1 deletion.
1 change: 1 addition & 0 deletions README.md
@@ -2,3 +2,4 @@
python spiders

[01. 爱丝](https://github.com/x-spiders/aiss-spider)
[02. 邪恶漫画吧]()
2 changes: 1 addition & 1 deletion aiss-spider/README.md
@@ -9,7 +9,7 @@

At this point, the image files are generated under the `data/` directory.

There are more than 20,000 images in total. By default they are downloaded with 10 concurrent processes; at a network speed of 3M/s the download takes about 20 minutes. When it finishes it looks like this:
There are more than 20,000 (28,349) images in total. By default they are downloaded with 10 concurrent processes; at a network speed of 3M/s the download takes about 20 minutes. When it finishes it looks like this:

![](assets/download.png?raw=true)

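The README text above says the full aiss-spider set is 28,349 images saved under `data/`. A quick way to check that a run actually finished is to count the files under that directory. This is only a small sketch: the `data/` location is taken from the README, while the counting itself is generic Python:

import os

# count the image files aiss-spider saved under data/
total = sum(len(files) for _, _, files in os.walk('data'))
print 'found %d downloaded images (the README expects 28349)' % total
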
111 changes: 111 additions & 0 deletions xeall/xeall.py
@@ -0,0 +1,111 @@
# -*- coding:utf8 -*-

'''
Spider for http://www.xieemanhuaba.com/xieedontaitu/
Dependencies: sudo -H pip install requests beautifulsoup4 lxml
'''

import os                         # path handling, makedirs
import requests                   # HTTP requests
import urllib                     # urlretrieve, for downloading files
from bs4 import BeautifulSoup     # HTML parsing
import re                         # regular expressions
from multiprocessing import Pool  # process pool for concurrent downloads
import time

CURR_DIR = os.path.dirname(os.path.abspath(__file__))
FOLDER = 'xeba1'  # downloaded images are saved under this folder next to the script
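# For example, an image URL ending in foo.gif (hypothetical name) would be saved
# by download_url() below as <directory of this script>/xeba1/foo.gif.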

def download_url(url):
    """Download a single image, skipping files that already exist."""
    if len(url) < 1: return
    path = os.path.join(CURR_DIR, FOLDER, os.path.basename(url))
    print url, path
    mkdir(os.path.dirname(path))
    if os.path.exists(path): return
    urllib.urlretrieve(url, path)

def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)

def fixUrl(url):
    # turn a site-relative link into an absolute URL
    return 'http://www.xieemanhuaba.com' + url

def get_image_url(soup):
    # the animated image sits in <li id="imgshow">: alt holds the title, src the URL
    item = soup.find('li', id='imgshow')
    img_url = ''
    try:
        title, img_url = item.img.get('alt'), item.img.get('src')
        print title, img_url
    except Exception, e:
        print 'failed to parse image: %s' % e.message
    return img_url

def get_pagelist(url, soup):
    """Return the URLs of the remaining pages of a chapter."""
    ret = []
    pagelist = soup.find('ul', class_='pagelist').find_all('li')
    if len(pagelist):
        # the first <li> of the pager holds the page count; pull the number out with a regex
        pagecount = re.findall(r"\d+", pagelist[0].a.text)[0]
        pagecount = int(pagecount)
        print 'number of sub-pages:', pagecount
        baseurl = url.replace('.html', '')
        for index in xrange(2, pagecount+1):
            nexturl = '%s_%d.html' % (baseurl, index)
            ret.append(nexturl)
    return ret
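
# Example of what get_pagelist() returns (hypothetical chapter URL; the "_<n>.html"
# pattern is exactly the one built above): a 5-page chapter at
# http://www.xieemanhuaba.com/xieedontaitu/123.html would give back
# ['http://www.xieemanhuaba.com/xieedontaitu/123_2.html', ..., '..._5.html']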

def get_all_img_urls():
    """Collect the image URL of every page, following next-chapter links."""
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}

    url = 'http://www.xieemanhuaba.com/xieedontaitu/'
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')
    item = soup.find('div', class_='kg')
    count = int(item.span.text)
    print 'found %d comic chapters.' % count

    title = item.a.get('title')
    url = fixUrl(item.a.get('href'))
    print title, url

    urls = []

    for x in xrange(1, count+1):
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.text, 'lxml')
        # the animated image on the chapter's first page
        urls.append(get_image_url(soup))

        # the animated images on the chapter's remaining sub-pages
        pagelist = get_pagelist(url, soup)
        for page in pagelist:
            print 'page', page
            html = requests.get(page, headers=headers)
            soup = BeautifulSoup(html.text, 'lxml')
            urls.append(get_image_url(soup))

        # extract the link to the next chapter (second match of the pattern embedded in the page)
        url = re.findall(r"var str = \S+<a href='(\S+)'", html.text)[1]
        url = fixUrl(url)
    return urls

def download(urls, processes=10):
    """Download all images concurrently with a process pool."""
    print 'start downloading all images'
    start_time = time.time()
    pool = Pool(processes)
    for img_url in urls:
        pool.apply_async(download_url, (img_url,))

    pool.close()
    pool.join()
    print 'download finished in %s seconds' % (time.time() - start_time)


if __name__ == '__main__':
    btime = time.time()
    download(get_all_img_urls())
    print 'total elapsed time:', time.time() - btime
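
Because download_url() skips any file that already exists under xeba1/, an interrupted run can be resumed by calling the same two functions again. A minimal sketch, assuming xeall.py is on the import path (with the __main__ guard above, importing it does not start a download):

from xeall import get_all_img_urls, download

# already-downloaded files are skipped inside download_url(),
# so this only fetches what is still missing
download(get_all_img_urls(), processes=10)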
