# coding: utf-8
import argparse

parser = argparse.ArgumentParser(description='Download some images with a query')
parser.add_argument('--query', dest='query', required=True,
                    help='Query to search.')
parser.add_argument('--n', dest='n', type=int, default=100,
                    help='Number of items to return.')
args = parser.parse_args()
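
# Example invocation (assumes a keys.json next to the script; see below):
#   python scrape.py --query "golden retriever" --n 50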

# In[1]:
import json

# Load the Google API credentials from keys.json.
with open('keys.json', 'r') as f:
    keys = json.load(f)
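
# keys.json is assumed to hold a Custom Search API key and engine id, e.g.:
#   {"developerKey": "<Google API key>", "cx": "<search engine id>"}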

# In[68]:
from googleapiclient.discovery import build


def image_search(query, start=1):
    """Run one image query against the Custom Search API (10 results per page)."""
    service = build("customsearch", "v1",
                    developerKey=keys['developerKey'])
    return service.cse().list(
        q=query,
        filter="1",   # filter out duplicate results
        start=start,  # 1-based index of the first result to return
        cx=keys['cx'],
        searchType='image',
    ).execute()
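
# Illustrative response shape (fields as documented for the JSON API):
#   {'items': [{'link': 'https://example.com/img.jpg', ...}, ...]}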

def images_from(res):
    """Extract the image URLs from one response; 'items' is absent when empty."""
    return [item['link'] for item in res.get('items', [])]
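
# e.g. images_from(image_search('kittens')) -> up to 10 image URLs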

# In[72]:
from time import sleep


def query_images(q, n=100):
    """Page through results, 10 at a time, until at least n have been requested."""
    res_list = []
    queried = 0
    while queried < n:
        res = image_search(q, start=queried + 1)
        res_list.append(res)
        queried += 10
        sleep(1)  # be polite to the API between pages
    return res_list
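
# Note: the Custom Search JSON API only serves the first 100 results of a
# query, so values of n above 100 will fail once start exceeds that window.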

# In[74]:
import requests


def load_image_bytes(url):
    """Download the raw bytes of an image, raising on HTTP errors."""
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return resp.content
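
# Optional: decode the downloaded bytes with Pillow (assumes Pillow is installed):
#   from io import BytesIO
#   from PIL import Image
#   img = Image.open(BytesIO(load_image_bytes(img_urls[0])))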

# In[75]:
query_res = query_images(args.query, n=args.n)
url_lists = [images_from(res) for res in query_res]
img_urls = [url for url_list in url_lists for url in url_list]
print(img_urls)

# In[76]:
def filename(url):
    """Use the last path component of the URL as the file name."""
    return url.split('/')[-1]

# In[79]:
from os import mkdir
from os.path import exists, join
from datetime import datetime

# Save each run into <query>/<ISO timestamp>/ so runs don't overwrite each other.
if not exists(args.query):
    mkdir(args.query)
out_dir = join(args.query, datetime.now().isoformat())
mkdir(out_dir)

for i, url in enumerate(img_urls):
    try:
        print(url)
        fn = str(i) + '_' + filename(url)  # index prefix avoids name collisions
        path = join(out_dir, fn)
        print(path)
        img = load_image_bytes(url)
        print('downloaded')
        with open(path, 'wb') as f:
            f.write(img)
    except Exception as e:
        print('download failed for', url, '-', e)

# Keep the raw API responses next to the images for later inspection.
path = join(out_dir, 'query-results.json')
with open(path, 'w') as f:
    f.write(json.dumps(query_res))
print('done', path)