This repository has been archived by the owner on Apr 30, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsetup_dataset.py
115 lines (87 loc) · 3.98 KB
/
setup_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import json
import logging
import os
import urllib.request
import imageio
import numpy as np
from numpy import ndarray
# --- Logging -----------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('setup_datasets')

# --- Local layout ------------------------------------------------------------
# Everything the script produces or downloads is addressed relative to here.
dataset_folder = '../dataset/'
img_folder = f'{dataset_folder}img/'

# --- Remote sources (iamcal/emoji-data) --------------------------------------
# Sprite sheets are fetched through the repository's "raw/master" path.
img_repo_url = 'https://github.com/iamcal/emoji-data/' 'raw/master/'

# Metadata files are fetched from raw.githubusercontent.com.
categories_json = 'categories.json'
categories_repo_url = f'https://raw.githubusercontent.com/iamcal/emoji-data/master/{categories_json}'
emoji_json = 'emoji.json'
# NOTE: despite its name, this holds the direct URL of emoji.json.
emoji_repo_url = f'https://raw.githubusercontent.com/iamcal/emoji-data/master/{emoji_json}'

# --- Dataset axes ------------------------------------------------------------
# One sprite sheet exists per (company, resolution) pair.
companies = ['apple', 'facebook', 'google', 'messenger', 'twitter']
resolutions = [16, 20, 32, 64]
def _fetch_if_missing(url: str, destination: str, label: str) -> None:
    """Download *url* to *destination* unless that file already exists.

    *label* is the short name used in the log message.
    """
    if os.path.exists(destination):
        return
    logger.info(label + ' not found, downloading...')
    urllib.request.urlretrieve(url, destination)


def download_dataset(img_folder: str) -> None:
    """Fetch every sprite sheet and metadata file that is not yet on disk.

    One ``sheet_<company>_<resolution>.png`` is downloaded into *img_folder*
    for each combination of the module-level ``companies`` and
    ``resolutions``.  ``categories.json`` and ``emoji.json`` are downloaded
    into the module-level ``dataset_folder``, which is where the ``__main__``
    section of this script reads them from.  Files that already exist are
    left untouched.

    Args:
        img_folder: Directory that receives the sprite sheets.  It must
            already exist; a trailing path separator is optional.

    Raises:
        urllib.error.URLError: If a download fails.
    """
    logger.info('downloading dataset...')

    for company in companies:
        for resolution in resolutions:
            filename = '_'.join(['sheet', company, str(resolution)]) + '.png'
            _fetch_if_missing(
                img_repo_url + filename,
                os.path.join(img_folder, filename),
                filename,
            )

    # The metadata must land next to the rest of the dataset: previously it
    # was saved in the current working directory while the loader looked for
    # it under dataset_folder, so a fresh run could never find it.
    _fetch_if_missing(categories_repo_url, dataset_folder + categories_json, categories_json)
    _fetch_if_missing(emoji_repo_url, dataset_folder + emoji_json, emoji_json)

    logger.info('done')
def png_to_dataset(png: ndarray, resolution: int) -> ndarray:
    """Cut a sprite sheet into its individual, non-transparent emoji tiles.

    The sheet is read as a grid of cells that are ``resolution + 2`` pixels
    apart, with the first tile starting at pixel offset 1 on both axes
    (presumably a 1-pixel border around every emoji -- confirm against the
    sheet format).  Tiles are visited column by column, top to bottom within
    each column.

    Args:
        png: Image array of shape ``(height, width, channels)``.  The last
            channel is treated as alpha.
        resolution: Side length, in pixels, of one emoji tile.

    Returns:
        Array stacking every kept tile, each of shape
        ``(resolution, resolution, channels)``.  Tiles whose last channel is
        entirely zero are dropped, as are truncated tiles at the edge of a
        sheet whose size is not a whole number of cells.  When nothing is
        kept, an empty array is returned.
    """
    step = resolution + 2
    height, width, _ = png.shape

    emojis = []
    # Rows must be bounded by the height and columns by the width.  Mixing
    # the two only works on square sheets; on any other shape it produces
    # empty slices.
    for left in range(1, width, step):
        for top in range(1, height, step):
            tile = png[top:top + resolution, left:left + resolution]
            if tile.shape[:2] != (resolution, resolution):
                continue  # truncated tile at the sheet border
            if np.any(tile[:, :, -1] != 0):
                emojis.append(tile)
    return np.array(emojis)
def _read_json(path):
    """Return the parsed contents of the JSON file at *path*.

    The file is opened inside a ``with`` block so its handle is closed as
    soon as parsing finishes, and it is decoded as UTF-8 regardless of the
    platform's default encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


if __name__ == "__main__":
    # Creating the image folder also creates the dataset folder above it.
    os.makedirs(img_folder, exist_ok=True)

    download_dataset(img_folder)

    categories_data = _read_json(dataset_folder + 'categories.json')
    # Removing skin tones as it won't have samples (assumes it is the last
    # key of categories.json -- TODO confirm against the upstream file).
    categories_names = list(categories_data.keys())[:-1]
    emoji_data = _read_json(dataset_folder + 'emoji.json')

    # Persist the label vocabularies so class indices can be decoded later.
    with open(dataset_folder + 'companies_names.json', 'w') as f:
        json.dump(companies, f)

    with open(dataset_folder + 'categories_names.json', 'w') as f:
        json.dump(categories_names, f)

    logger.info('generating datasets...')

    for resolution in resolutions:
        dataset = []
        classes = []

        # Cells are spaced resolution + 2 pixels apart, first tile at offset 1.
        cell = resolution + 2

        # One sprite sheet per company, in the same order as `companies`, so
        # that a position in this list is the company's class index.
        companies_pngs = [
            imageio.imread(img_folder + '_'.join(['sheet', company, str(resolution)]) + '.png')
            for company in companies
        ]

        for element in emoji_data:
            # Only keep emojis that every company provides an image for.
            if not all(element['has_img_' + company] for company in companies):
                continue

            # sheet_x selects the column and sheet_y the row of the cell;
            # image arrays are indexed [row, column].
            top = 1 + element['sheet_y'] * cell
            left = 1 + element['sheet_x'] * cell
            category_index = categories_names.index(element['category'])

            for company_index, sheet in enumerate(companies_pngs):
                emoji = sheet[top:top + resolution, left:left + resolution, :]
                dataset.append(emoji)
                classes.append([company_index, category_index])

        dataset = np.array(dataset)
        dataset = dataset / 255.0  # scale 8-bit channel values to [0, 1]
        classes = np.array(classes)

        np.save(dataset_folder + 'emojis_' + str(resolution) + '.npy', dataset.astype(np.float32))
        # The labels do not depend on the resolution, so each pass rewrites
        # this file with the same content.
        np.save(dataset_folder + 'emojis_classes' + '.npy', classes)

    logger.info('done')