Showing 7 changed files with 676 additions and 0 deletions.
@@ -0,0 +1 @@
# Crawling for Favicons
Empty file.
@@ -0,0 +1,17 @@
import json

from scrapy.pipelines.files import FilesPipeline


class ExtendedFilePipeline(FilesPipeline):
    """FilesPipeline that additionally logs download metadata to a JSON-lines file."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.info_file = open('scraped/info.jsonl', 'w')

    def process_item(self, item, spider):
        # Return the parent pipeline's result so the item keeps flowing through Scrapy.
        return super().process_item(item, spider)

    def item_completed(self, results, item, info):
        item = super().item_completed(results, item, info)

        # results is a list of (success, file_info) pairs; log only the successful downloads.
        json.dump([file_info for ok, file_info in results if ok], self.info_file)
        self.info_file.write('\n')
        return item
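For reference, a minimal sketch of reading the metadata file back. It assumes the keys Scrapy's FilesPipeline reports for each downloaded file (typically 'url', 'path' and 'checksum', with 'path' relative to FILES_STORE); the exact set of keys depends on the Scrapy version:

import json

# Illustrative reader for scraped/info.jsonl: each line is a JSON array with the
# file_info dicts of one crawled page (successful downloads only).
with open("scraped/info.jsonl") as fh:
    for line in fh:
        if not line.strip():
            continue
        for entry in json.loads(line):
            print(entry["url"], "->", entry["path"])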
@@ -0,0 +1,89 @@
import os
import json
from io import BytesIO

import cairosvg
import imagehash
from PIL import Image
from tqdm import tqdm


# Icons are normalised to this edge length (pixels).
target_size = 32

# Perceptual hashes of icons already written, used to skip duplicates.
known_hashes = set()


def save(image, path, downscale=False):
    """Write the icon as PNG unless a perceptually identical one was saved before."""
    if downscale:
        image = image.resize((target_size, target_size))

    img_hash = imagehash.average_hash(image)
    if img_hash in known_hashes:
        return

    known_hashes.add(img_hash)
    image.save(os.path.join("scraped/filtered/", path + '.png'))


if __name__ == "__main__":
    os.makedirs("scraped/filtered", exist_ok=True)

    # Each line of info.jsonl holds the download metadata for one crawled page.
    for i, line in tqdm(enumerate(open("scraped/info.jsonl"))):
        if not line.strip():
            continue
        entries = json.loads(line)

        biggest_image = None
        biggest_size = None

        for entry in entries:
            path = entry['path']

            if path.endswith('.svg'):
                # Rasterise SVG icons straight to the target size.
                try:
                    bytestring = cairosvg.svg2png(url=os.path.join("scraped", path),
                                                  parent_width=target_size,
                                                  parent_height=target_size)
                    image = Image.open(BytesIO(bytestring)).convert('RGBA')
                except Exception:
                    continue
            else:
                try:
                    image = Image.open(os.path.join("scraped", path)).convert('RGBA')
                except Exception:
                    continue

            width, height = image.size

            # Only keep square icons that are at least target_size pixels wide.
            if width != height or width < target_size:
                continue

            if width == target_size:
                # An exact-size icon wins immediately; no downscaling needed.
                biggest_image = None
                save(image, str(i))
                break

            # Otherwise remember the largest candidate seen for this page.
            if biggest_size is None or width > biggest_size:
                biggest_size = width
                biggest_image = image

        if biggest_image is not None:
            save(biggest_image, str(i), downscale=True)
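The deduplication in save() relies on imagehash.average_hash, a perceptual hash, so visually identical icons served by many different domains collapse into a single saved file. A minimal sketch of that behaviour, using illustrative solid-colour images rather than real favicons:

from PIL import Image
import imagehash

# Two visually identical images produce the same average hash,
# so the second one would be skipped by save() above.
a = imagehash.average_hash(Image.new("RGBA", (32, 32), (255, 0, 0, 255)))
b = imagehash.average_hash(Image.new("RGBA", (32, 32), (255, 0, 0, 255)))
assert a == b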
Large diffs are not rendered by default.
@@ -0,0 +1,108 @@
import csv
import re
from urllib.parse import urljoin

import scrapy
from scrapy.crawler import CrawlerProcess


# Matches a whole <link ...> tag and captures its attribute string.
linkTagRe = re.compile(r'<link ([^>]+)>')

# Matches one attribute such as rel="icon" (single or double quotes).
propRe = re.compile(r'(?P<prop>\w+)=(?P<quote>["\'])(?P<value>[^"\']*)(?P=quote)')


class FavLink:
    """Parsed representation of a <link> tag that may point at a favicon."""

    def __init__(self):
        self.is_icon = False
        self.rel = None
        self.href = None
        self.sizes = None

    def __str__(self):
        return '<link rel="{}" href="{}" sizes="{}">'.format(self.rel, self.href, self.sizes)

    @classmethod
    def parse(cls, tag):
        favLink = cls()

        for match in propRe.finditer(tag):
            prop = match.group('prop')
            value = match.group('value')

            if prop == 'rel':
                favLink.rel = value
                if value in ('icon', 'mask-icon') or 'icon' in value.split(' '):
                    favLink.is_icon = True

            if prop == 'href':
                favLink.href = value

            if prop == 'sizes':
                favLink.sizes = value

        return favLink


class IconItem(scrapy.Item):
    file_urls = scrapy.Field()
    files = scrapy.Field()


class IconSpider(scrapy.Spider):
    name = "icon-gan"

    def start_requests(self):
        # top-1m.csv is expected to contain (rank, domain) rows without a header.
        with open("top-1m.csv", "r") as csv_file:
            reader = csv.reader(csv_file)
            for i, domain in reader:
                yield scrapy.Request("https://{}".format(domain), self.parse)

    def parse(self, response):
        url = response.url

        item = IconItem()
        # Always try the conventional /favicon.ico location.
        item['file_urls'] = [urljoin(url, 'favicon.ico')]

        # Collect every <link> whose rel attribute mentions "icon".
        for tag in response.xpath('//link[contains(@rel, "icon")]').getall():
            match = linkTagRe.match(tag)

            if match is None:
                continue

            linkTag = FavLink.parse(match.group(1))

            if linkTag.is_icon and linkTag.href:
                item['file_urls'].append(urljoin(url, linkTag.href))

        return item


if __name__ == "__main__":
    process = CrawlerProcess(settings={
        'ROBOTSTXT_OBEY': True,
        'USER_AGENT': 'Icon GAN',
        'ROBOTSTXT_USER_AGENT': 'Icon GAN',
        'ITEM_PIPELINES': {
            'files.ExtendedFilePipeline': 1
        },
        'FILES_STORE': 'scraped',
        'MEDIA_ALLOW_REDIRECTS': True,
        'AUTOTHROTTLE_ENABLED': True,

        'LOG_LEVEL': 'CRITICAL',

        # Speed-ups for crawling a large domain list.
        'CONCURRENT_REQUESTS': 256,
        'DOWNLOAD_DELAY': 0,
        'DOWNLOAD_TIMEOUT': 15,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'RETRY_ENABLED': False,
    })
    process.crawl(IconSpider)
    process.start()
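The spider reads top-1m.csv as plain (rank, domain) rows without a header, as in the common "top 1 million sites" lists. A minimal sketch for generating a tiny stand-in file for a test run; the domains here are arbitrary examples, not part of the original list:

import csv

# Write a three-row top-1m.csv in the (rank, domain) format the spider expects.
with open("top-1m.csv", "w", newline="") as fh:
    writer = csv.writer(fh)
    for rank, domain in enumerate(["python.org", "wikipedia.org", "example.com"], start=1):
        writer.writerow([rank, domain])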