Skip to content

Commit

Permalink
fix url join and png file checking.
Browse files Browse the repository at this point in the history
  • Loading branch information
turgenevivan committed May 23, 2018
1 parent 93c4018 commit e077845
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 3 deletions.
16 changes: 14 additions & 2 deletions aiss-spider/download_pictures.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json
import requests
import time
from urlparse import urljoin


def get_info():
Expand All @@ -24,7 +25,8 @@ def get_info_imgs(info,host='http://aiss-1254466972.costj.myqcloud.com/picture')
issue = item["issue"]
pictureCount = item["pictureCount"]
for pic_idx in range(pictureCount):
url = "%s/%s/%s/%s.jpg" % (host, catalog, issue, pic_idx)
url = "%s/%s/%s.jpg" % (catalog, issue, pic_idx)
url = urljoin(host, url)
directory = os.path.join("data", name, "%s-%s" % (issue, nickname))
filepath = os.path.join(directory, "%s.jpg" % pic_idx)
# 每张图片一组,包含 图片url,所在目录,存储路径
Expand All @@ -45,12 +47,22 @@ def setup_download_dir(directory):

from multiprocessing import Process, Queue, Pool

def check_png(filepath):
    """Return True if ``filepath`` exists and PIL can identify it as an image.

    Despite the name, no PNG-specific check is made: any format that
    ``PIL.Image.open`` recognises is accepted (the caller in this file passes
    ``.jpg`` paths).  Only the image header is parsed; pixel data is not
    decoded, so a file whose body is truncated can still be reported as valid.

    :param filepath: path of the candidate image file on disk.
    :return: True when the file can be opened and identified as an image,
        False when it is missing, unreadable, or not an image.
    """
    # Open the file ourselves so the handle is closed deterministically.
    # ``Image.open(path)`` keeps its own handle open for lazy loading, which
    # leaked one descriptor per already-downloaded file.
    try:
        handle = open(filepath, "rb")
    except IOError:
        # Missing or unreadable file: nothing to validate.
        return False

    with handle:
        # Imported lazily, as before, so PIL is only needed once there is
        # actually a file to inspect.
        from PIL import Image
        try:
            Image.open(handle)
        except IOError:
            # PIL could not identify the content as an image.
            return False
    return True

def download_one(img):
""" 下载一张图片 """
url, directory, filepath = img
# 如果文件已经存在,放弃下载
if os.path.exists(filepath):
# if os.path.exists(filepath):
if check_png(filepath):
print('exists:', filepath)
return

Expand Down
5 changes: 4 additions & 1 deletion aiss-spider/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
info = get_info()
# 获取每张图片的url,存储文件夹,本地文件名
imgs = get_info_imgs(info,host=fetch_picture_url_header())
# test imgs = imgs[-2:]

# test
# imgs = imgs[-11:]
#
# 以10个进程并发下载图片
download(imgs, processes=10)

0 comments on commit e077845

Please sign in to comment.