From e07784559a7c34e9797fececa09777f36e5a7428 Mon Sep 17 00:00:00 2001
From: yinjimmy
Date: Wed, 23 May 2018 21:35:28 +0800
Subject: [PATCH] fix url join and png file checking.

---
 aiss-spider/download_pictures.py | 16 ++++++++++++++--
 aiss-spider/run.py               |  5 ++++-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/aiss-spider/download_pictures.py b/aiss-spider/download_pictures.py
index 522e781..662cf17 100644
--- a/aiss-spider/download_pictures.py
+++ b/aiss-spider/download_pictures.py
@@ -3,6 +3,7 @@
 import json
 import requests
 import time
+from urlparse import urljoin
 
 
 def get_info():
@@ -24,7 +25,8 @@ def get_info_imgs(info,host='http://aiss-1254466972.costj.myqcloud.com/picture')
         issue = item["issue"]
         pictureCount = item["pictureCount"]
         for pic_idx in range(pictureCount):
-            url = "%s/%s/%s/%s.jpg" % (host, catalog, issue, pic_idx)
+            url = "%s/%s/%s.jpg" % (catalog, issue, pic_idx)
+            url = urljoin(host, url)
             directory = os.path.join("data", name, "%s-%s" % (issue, nickname))
             filepath = os.path.join(directory, "%s.jpg" % pic_idx)
             # 每张图片一组,包含 图片url,所在目录,存储路径
@@ -45,12 +47,22 @@ def setup_download_dir(directory):
 
 from multiprocessing import Process, Queue, Pool
 
+def check_png(filepath):
+    from PIL import Image
+    try:
+        Image.open(filepath)
+        return True
+    except IOError:
+        # filename not an image file
+        return False
+    return False
 
 def download_one(img):
     """ 下载一张图片 """
     url, directory, filepath = img
     # 如果文件已经存在,放弃下载
-    if os.path.exists(filepath):
+    # if os.path.exists(filepath):
+    if check_png(filepath):
         print('exists:', filepath)
         return
 
diff --git a/aiss-spider/run.py b/aiss-spider/run.py
index 3c687bb..cd28b05 100644
--- a/aiss-spider/run.py
+++ b/aiss-spider/run.py
@@ -10,6 +10,9 @@ info = get_info()
 # 获取每张图片的url,存储文件夹,本地文件名
 imgs = get_info_imgs(info,host=fetch_picture_url_header())
 
-# test imgs = imgs[-2:]
+
+# test
+# imgs = imgs[-11:]
+#
 # 以10个进程并发下载图片
 download(imgs, processes=10)