Commit: favicon scraping

Akatuoro committed Sep 28, 2020
1 parent de59319 commit a2cd5b1
Showing 7 changed files with 676 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -15,6 +15,9 @@ convertlist.txt
.vscode
dist
models
scraped

.scrapy

# Created by https://www.gitignore.io/api/node,macos,jupyternotebooks
# Edit at https://www.gitignore.io/?templates=node,macos,jupyternotebooks
1 change: 1 addition & 0 deletions scrapper/README.md
@@ -0,0 +1 @@
# Crawling for Favicons
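The crawler in main.py reads domains from a top-1m.csv ranking file (one rank,domain pair per row) and downloads each site's favicons into scraped/, logging download metadata through the ExtendedFilePipeline in files.py; filter.py then rasterizes, size-filters, and deduplicates the icons into scraped/filtered/. A plausible invocation, assuming both scripts are run from the scrapper directory and scraped/filtered/ already exists: python main.py followed by python filter.py.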
Empty file added scrapper/__init__.py
17 changes: 17 additions & 0 deletions scrapper/files.py
@@ -0,0 +1,17 @@
import json
from scrapy.pipelines.files import FilesPipeline


class ExtendedFilePipeline(FilesPipeline):
    """FilesPipeline that additionally logs each item's successful downloads
    as one JSON line in scraped/info.jsonl."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.info_file = open('scraped/info.jsonl', 'w')

    def item_completed(self, results, item, info):
        item = super().item_completed(results, item, info)

        # results is a list of (success, file_info) pairs; keep only the
        # metadata dicts of downloads that actually succeeded.
        json.dump([meta for ok, meta in results if ok], self.info_file)
        self.info_file.write('\n')
        return item

    def close_spider(self, spider):
        self.info_file.close()
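Each crawled site thus contributes one JSON array per line to scraped/info.jsonl. A hypothetical line is sketched below; the url, path, and checksum keys are what Scrapy's FilesPipeline reports for a successful download, while the values here are made up:

[{"url": "https://example.com/favicon.ico", "path": "full/0f1e2d3c4b5a69788796a5b4c3d2e1f0a1b2c3d4.ico", "checksum": "d41d8cd98f00b204e9800998ecf8427e"}]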
89 changes: 89 additions & 0 deletions scrapper/filter.py
@@ -0,0 +1,89 @@
import os
import json
from io import BytesIO

import cairosvg
import imagehash
from PIL import Image
from tqdm import tqdm

target_size = 32

known_hashes = set()


def save(image, path, downscale=False):
    """Save an icon to scraped/filtered/, skipping perceptual duplicates."""
    if downscale:
        image = image.resize((target_size, target_size))

    img_hash = imagehash.average_hash(image)
    if img_hash in known_hashes:
        return

    known_hashes.add(img_hash)
    image.save(os.path.join("scraped/filtered/", path + '.png'))


if __name__ == "__main__":
    for i, line in tqdm(enumerate(open("scraped/info.jsonl"))):
        if not line.strip():
            continue
        entries = json.loads(line)

        biggest_image = None
        biggest_size = None

        for entry in entries:
            path = entry['path']

            if path.endswith('.svg'):
                # Rasterize SVGs at the target size.
                try:
                    bytestring = cairosvg.svg2png(url=os.path.join("scraped", path),
                                                  parent_width=target_size,
                                                  parent_height=target_size)
                    image = Image.open(BytesIO(bytestring)).convert('RGBA')
                except Exception:
                    continue
            else:
                try:
                    image = Image.open(os.path.join("scraped", path)).convert('RGBA')
                except Exception:
                    continue

            width, height = image.size

            # Keep only square icons at least as large as the target size.
            if width != height or width < target_size:
                continue

            if width == target_size:
                # An exact-size icon wins outright; discard any larger candidate.
                biggest_image = None
                save(image, str(i))
                break

            if biggest_size is None or width > biggest_size:
                biggest_size = width
                biggest_image = image

        if biggest_image is not None:
            save(biggest_image, str(i), downscale=True)
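The dedupe in save() relies on exact equality of imagehash's 64-bit average hash, so only visually near-identical icons collapse into one. A small sketch of the comparison, with hypothetical file names:

from PIL import Image
import imagehash

# average_hash reduces each image to a 64-bit perceptual hash.
a = imagehash.average_hash(Image.open("scraped/filtered/0.png"))
b = imagehash.average_hash(Image.open("scraped/filtered/1.png"))

print(a == b)  # the exact-equality test save() uses via the known_hashes set
print(a - b)   # Hamming distance; small values mean visually similar icons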
458 changes: 458 additions & 0 deletions scrapper/inspect.ipynb


108 changes: 108 additions & 0 deletions scrapper/main.py
@@ -0,0 +1,108 @@
import csv
import re
from urllib.parse import urljoin
import scrapy
from scrapy.crawler import CrawlerProcess


linkTagRe = re.compile(r'<link ([^>]+)>')

propRe = re.compile(r'(?P<prop>\w+)=(?P<quote>["\'])(?P<value>[^"\']*)(?P=quote)')


class FavLink:
    """A parsed <link> tag, reduced to the attributes relevant for favicons."""

    def __init__(self):
        self.is_icon = False
        self.rel = None
        self.href = None
        self.sizes = None

    def __str__(self):
        return '<link rel="{}" href="{}" sizes="{}">'.format(self.rel, self.href, self.sizes)

    @classmethod
    def parse(cls, tag):
        favLink = cls()

        for match in propRe.finditer(tag):
            prop = match.group('prop')
            value = match.group('value')

            if prop == 'rel':
                favLink.rel = value
                # rel may hold several space-separated tokens, e.g. "shortcut icon".
                if value in ('icon', 'mask-icon') or 'icon' in value.split(' '):
                    favLink.is_icon = True

            if prop == 'href':
                favLink.href = value

            if prop == 'sizes':
                favLink.sizes = value

        return favLink


class IconItem(scrapy.Item):
    file_urls = scrapy.Field()
    files = scrapy.Field()


class IconSpider(scrapy.Spider):
    name = "icon-gan"

    def start_requests(self):
        with open("top-1m.csv", "r") as csv_file:
            reader = csv.reader(csv_file)
            for _rank, domain in reader:
                yield scrapy.Request("https://{}".format(domain), self.parse)

    def parse(self, response):
        url = response.url

        item = IconItem()
        # Always try the well-known default location at the site root.
        item['file_urls'] = [urljoin(url, '/favicon.ico')]

        for tag in response.xpath('//link[contains(@rel, "icon")]').getall():
            match = linkTagRe.match(tag)

            if match is None:
                continue

            linkTag = FavLink.parse(match.group(1))

            if linkTag.is_icon and linkTag.href:
                item['file_urls'].append(urljoin(url, linkTag.href))

        return item


if __name__ == "__main__":
    process = CrawlerProcess(settings={
        'ROBOTSTXT_OBEY': True,
        'USER_AGENT': 'Icon GAN',
        'ROBOTSTXT_USER_AGENT': 'Icon GAN',
        'ITEM_PIPELINES': {
            'files.ExtendedFilePipeline': 1
        },
        'FILES_STORE': 'scraped',
        'MEDIA_ALLOW_REDIRECTS': True,
        'AUTOTHROTTLE_ENABLED': True,

        'LOG_LEVEL': 'CRITICAL',

        # Speed-ups: high concurrency, no delay, short timeout, no retries.
        'CONCURRENT_REQUESTS': 256,
        'DOWNLOAD_DELAY': 0,
        'DOWNLOAD_TIMEOUT': 15,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'RETRY_ENABLED': False,
    })
    process.crawl(IconSpider)
    process.start()
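The spider re-parses each matched <link> tag with linkTagRe and propRe; Scrapy's selectors can also read the same attributes directly. A behavior-adjacent sketch of parse() without the regex round-trip (an alternative, not the committed code):

    def parse(self, response):
        item = IconItem()
        item['file_urls'] = [urljoin(response.url, '/favicon.ico')]

        for link in response.xpath('//link[contains(@rel, "icon")]'):
            rel = link.attrib.get('rel', '')
            href = link.attrib.get('href')
            # Token check mirrors FavLink.parse: rel can be e.g. "shortcut icon".
            if href and (rel == 'mask-icon' or 'icon' in rel.split()):
                item['file_urls'].append(urljoin(response.url, href))

        return item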
