From 65f2f748c6acb643b7c9f3de31b490ddba2a576e Mon Sep 17 00:00:00 2001
From: Jianren Yin
Date: Mon, 24 Apr 2017 01:06:41 +0800
Subject: [PATCH] Add 邪恶漫画吧
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md             |   1 +
 aiss-spider/README.md |   2 +-
 xeall/xeall.py        | 111 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 113 insertions(+), 1 deletion(-)
 create mode 100755 xeall/xeall.py

diff --git a/README.md b/README.md
index f73d4c0..72ba5fa 100644
--- a/README.md
+++ b/README.md
@@ -2,3 +2,4 @@ python spiders
 
 [01. 爱丝](https://github.com/x-spiders/aiss-spider)
+[02. 邪恶漫画吧]()
diff --git a/aiss-spider/README.md b/aiss-spider/README.md
index f7586e9..5c787cf 100644
--- a/aiss-spider/README.md
+++ b/aiss-spider/README.md
@@ -9,7 +9,7 @@
 
 At this point, the image files are generated under the `data/` directory.
 
-There are over 20,000 images in total; with the default of 10 concurrent download processes and a network speed of 3 MB/s, the download takes about 20 minutes. When it finishes, it looks like this:
+There are 28,349 images in total; with the default of 10 concurrent download processes and a network speed of 3 MB/s, the download takes about 20 minutes. When it finishes, it looks like this:
 
 ![](assets/download.png?raw=true)
 
diff --git a/xeall/xeall.py b/xeall/xeall.py
new file mode 100755
index 0000000..1ae9931
--- /dev/null
+++ b/xeall/xeall.py
@@ -0,0 +1,111 @@
+# -*- coding:utf8 -*-
+
+'''
+Spider for http://www.xieemanhuaba.com/xieedontaitu/
+
+Dependencies (Python 2):
+    sudo -H pip install requests beautifulsoup4 lxml
+'''
+
+import os                        # path, makedirs
+import requests                  # HTTP requests
+import urllib                    # urlretrieve, for downloading files
+from bs4 import BeautifulSoup    # HTML parsing
+import re                        # regular expressions
+import thread
+import multiprocessing
+from multiprocessing import Pool
+import time
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+FOLDER = 'xeba1'
+
+def download_url(url):
+    if len(url) < 1: return
+    path = os.path.join(CURR_DIR, FOLDER, os.path.basename(url))
+    print url, path
+    mkdir(os.path.dirname(path))
+    if os.path.exists(path): return
+    urllib.urlretrieve(url, path)
+
+def mkdir(path):
+    if not os.path.exists(path):
+        os.makedirs(path)
+
+def fixUrl(url):
+    # the site uses relative links; prepend the host
+    return 'http://www.xieemanhuaba.com' + url
+
+def get_image_url(soup):
+    item = soup.find('li', id='imgshow')
+    img_url = ''
+    try:
+        title, img_url = item.img.get('alt'), item.img.get('src')
+        print title, img_url
+    except Exception as e:
+        print 'failed to parse image: %s' % e
+    return img_url
+
+def get_pagelist(url, soup):
+    ret = []
+    pagelist = soup.find('ul', class_='pagelist').find_all('li')
+    if len(pagelist):
+        # extract the sub-page count with a regular expression
+        pagecount = int(re.findall(r"\d+", pagelist[0].a.text)[0])
+        print 'sub-page count:', pagecount
+        baseurl = url.replace('.html', '')
+        for index in xrange(2, pagecount + 1):
+            nexturl = '%s_%d.html' % (baseurl, index)
+            ret.append(nexturl)
+    return ret
+
+def get_all_img_urls():
+    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
+
+    url = 'http://www.xieemanhuaba.com/xieedontaitu/'
+    html = requests.get(url, headers=headers)
+    soup = BeautifulSoup(html.text, 'lxml')
+    item = soup.find('div', class_='kg')
+    count = int(item.span.text)
+    print 'found %d comic chapters.' % count
+
+    title = item.a.get('title')
+    url = fixUrl(item.a.get('href'))
+    print title, url
+
+    urls = []
+
+    for x in xrange(1, count + 1):
+        html = requests.get(url, headers=headers)
+        soup = BeautifulSoup(html.text, 'lxml')
+        # collect the animated image on the chapter page itself
+        urls.append(get_image_url(soup))
+
+        # collect the animated images on the chapter's sub-pages
+        pagelist = get_pagelist(url, soup)
+        for page in pagelist:
+            print 'page', page
+            html = requests.get(page, headers=headers)
+            soup = BeautifulSoup(html.text, 'lxml')
+            urls.append(get_image_url(soup))
+
+        # extract the link to the next chapter
+        url = re.findall("var str = \S+
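
The patch is truncated here: the hunk header promises 111 lines for xeall/xeall.py, but the diff cuts off inside get_all_img_urls, so the driver code at the bottom of the file is missing. Since the script already imports multiprocessing.Pool and defines a pool-friendly download_url, the missing tail presumably maps the collected URLs over a worker pool. A minimal sketch of what that tail could look like, assuming (hypothetically) that get_all_img_urls() returns the urls list it builds, a main() entry point, and the same 10-process default as the sibling aiss-spider:

def main():
    # gather every image URL first, then download them concurrently
    urls = get_all_img_urls()
    pool = Pool(processes=10)      # assumed pool size, mirroring aiss-spider's default
    pool.map(download_url, urls)   # download_url already skips files present on disk
    pool.close()
    pool.join()

if __name__ == '__main__':
    main()

This is a sketch under the stated assumptions, not a reconstruction of the missing lines; note that Pool.map requires download_url to be a top-level function, which it is.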