Commit: favicon scraping

Akatuoro committed Sep 28, 2020
1 parent de59319 commit a2cd5b1
Showing 7 changed files with 676 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -15,6 +15,9 @@ convertlist.txt
.vscode
dist
models
scraped

.scrapy

# Created by https://www.gitignore.io/api/node,macos,jupyternotebooks
# Edit at https://www.gitignore.io/?templates=node,macos,jupyternotebooks
1 change: 1 addition & 0 deletions scrapper/README.md
@@ -0,0 +1 @@
# Crawling for Favicons
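The crawler in main.py reads domains from a top-1m.csv ranking file (one rank,domain pair per row) and downloads each site's favicons into scraped/, logging download metadata through the ExtendedFilePipeline in files.py; filter.py then rasterizes, size-filters, and deduplicates the icons into scraped/filtered/. A plausible invocation, assuming both scripts are run from the scrapper directory and scraped/filtered/ already exists: python main.py followed by python filter.py.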
Empty file added scrapper/__init__.py
17 changes: 17 additions & 0 deletions scrapper/files.py
@@ -0,0 +1,17 @@
import json
from scrapy.pipelines.files import FilesPipeline


class ExtendedFilePipeline(FilesPipeline):
    """FilesPipeline that additionally logs each item's successful downloads
    as one JSON line in scraped/info.jsonl."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.info_file = open('scraped/info.jsonl', 'w')

    def item_completed(self, results, item, info):
        item = super().item_completed(results, item, info)

        # results is a list of (success, file_info) pairs; keep only the
        # metadata dicts of downloads that actually succeeded.
        json.dump([meta for ok, meta in results if ok], self.info_file)
        self.info_file.write('\n')
        return item

    def close_spider(self, spider):
        self.info_file.close()
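Each crawled site thus contributes one JSON array per line to scraped/info.jsonl. A hypothetical line is sketched below; the url, path, and checksum keys are what Scrapy's FilesPipeline reports for a successful download, while the values here are made up:

[{"url": "https://example.com/favicon.ico", "path": "full/0f1e2d3c4b5a69788796a5b4c3d2e1f0a1b2c3d4.ico", "checksum": "d41d8cd98f00b204e9800998ecf8427e"}]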
89 changes: 89 additions & 0 deletions scrapper/filter.py
@@ -0,0 +1,89 @@
import os
import json
from io import BytesIO

import cairosvg
import imagehash
from PIL import Image
from tqdm import tqdm

target_size = 32

known_hashes = set()


def save(image, path, downscale=False):
    """Save an icon to scraped/filtered/, skipping perceptual duplicates."""
    if downscale:
        image = image.resize((target_size, target_size))

    img_hash = imagehash.average_hash(image)
    if img_hash in known_hashes:
        return

    known_hashes.add(img_hash)
    image.save(os.path.join("scraped/filtered/", path + '.png'))


if __name__ == "__main__":
    for i, line in tqdm(enumerate(open("scraped/info.jsonl"))):
        if not line.strip():
            continue
        entries = json.loads(line)

        biggest_image = None
        biggest_size = None

        for entry in entries:
            path = entry['path']

            if path.endswith('.svg'):
                # Rasterize SVGs at the target size.
                try:
                    bytestring = cairosvg.svg2png(url=os.path.join("scraped", path),
                                                  parent_width=target_size,
                                                  parent_height=target_size)
                    image = Image.open(BytesIO(bytestring)).convert('RGBA')
                except Exception:
                    continue
            else:
                try:
                    image = Image.open(os.path.join("scraped", path)).convert('RGBA')
                except Exception:
                    continue

            width, height = image.size

            # Keep only square icons at least as large as the target size.
            if width != height or width < target_size:
                continue

            if width == target_size:
                # An exact-size icon wins outright; discard any larger candidate.
                biggest_image = None
                save(image, str(i))
                break

            if biggest_size is None or width > biggest_size:
                biggest_size = width
                biggest_image = image

        if biggest_image is not None:
            save(biggest_image, str(i), downscale=True)
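The dedupe in save() relies on exact equality of imagehash's 64-bit average hash, so only visually near-identical icons collapse into one. A small sketch of the comparison, with hypothetical file names:

from PIL import Image
import imagehash

# average_hash reduces each image to a 64-bit perceptual hash.
a = imagehash.average_hash(Image.open("scraped/filtered/0.png"))
b = imagehash.average_hash(Image.open("scraped/filtered/1.png"))

print(a == b)  # the exact-equality test save() uses via the known_hashes set
print(a - b)   # Hamming distance; small values mean visually similar icons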
458 changes: 458 additions & 0 deletions scrapper/inspect.ipynb


108 changes: 108 additions & 0 deletions scrapper/main.py
@@ -0,0 +1,108 @@
import csv
import re
from urllib.parse import urljoin
import scrapy
from scrapy.crawler import CrawlerProcess


linkTagRe = re.compile(r'<link ([^>]+)>')

propRe = re.compile(r'(?P<prop>\w+)=(?P<quote>["\'])(?P<value>[^"\']*)(?P=quote)')


class FavLink:
    """A parsed <link> tag, reduced to the attributes relevant for favicons."""

    def __init__(self):
        self.is_icon = False
        self.rel = None
        self.href = None
        self.sizes = None

    def __str__(self):
        return '<link rel="{}" href="{}" sizes="{}">'.format(self.rel, self.href, self.sizes)

    @classmethod
    def parse(cls, tag):
        favLink = cls()

        for match in propRe.finditer(tag):
            prop = match.group('prop')
            value = match.group('value')

            if prop == 'rel':
                favLink.rel = value
                # rel may hold several space-separated tokens, e.g. "shortcut icon".
                if value in ('icon', 'mask-icon') or 'icon' in value.split(' '):
                    favLink.is_icon = True

            if prop == 'href':
                favLink.href = value

            if prop == 'sizes':
                favLink.sizes = value

        return favLink


class IconItem(scrapy.Item):
    file_urls = scrapy.Field()
    files = scrapy.Field()


class IconSpider(scrapy.Spider):
    name = "icon-gan"

    def start_requests(self):
        with open("top-1m.csv", "r") as csv_file:
            reader = csv.reader(csv_file)
            for _rank, domain in reader:
                yield scrapy.Request("https://{}".format(domain), self.parse)

    def parse(self, response):
        url = response.url

        item = IconItem()
        # Always try the well-known default location at the site root.
        item['file_urls'] = [urljoin(url, '/favicon.ico')]

        for tag in response.xpath('//link[contains(@rel, "icon")]').getall():
            match = linkTagRe.match(tag)

            if match is None:
                continue

            linkTag = FavLink.parse(match.group(1))

            if linkTag.is_icon and linkTag.href:
                item['file_urls'].append(urljoin(url, linkTag.href))

        return item


if __name__ == "__main__":
    process = CrawlerProcess(settings={
        'ROBOTSTXT_OBEY': True,
        'USER_AGENT': 'Icon GAN',
        'ROBOTSTXT_USER_AGENT': 'Icon GAN',
        'ITEM_PIPELINES': {
            'files.ExtendedFilePipeline': 1
        },
        'FILES_STORE': 'scraped',
        'MEDIA_ALLOW_REDIRECTS': True,
        'AUTOTHROTTLE_ENABLED': True,

        'LOG_LEVEL': 'CRITICAL',

        # Speed-ups: high concurrency, no delay, short timeout, no retries.
        'CONCURRENT_REQUESTS': 256,
        'DOWNLOAD_DELAY': 0,
        'DOWNLOAD_TIMEOUT': 15,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'RETRY_ENABLED': False,
    })
    process.crawl(IconSpider)
    process.start()
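The spider re-parses each matched <link> tag with linkTagRe and propRe; Scrapy's selectors can also read the same attributes directly. A behavior-adjacent sketch of parse() without the regex round-trip (an alternative, not the committed code):

    def parse(self, response):
        item = IconItem()
        item['file_urls'] = [urljoin(response.url, '/favicon.ico')]

        for link in response.xpath('//link[contains(@rel, "icon")]'):
            rel = link.attrib.get('rel', '')
            href = link.attrib.get('href')
            # Token check mirrors FavLink.parse: rel can be e.g. "shortcut icon".
            if href and (rel == 'mask-icon' or 'icon' in rel.split()):
                item['file_urls'].append(urljoin(response.url, href))

        return item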
