PyAr · frapell · Dec 16, 2021 · Dec 17, 2021 · Jan 26, 2022 · Jan 26, 2022
diff --git a/config.py b/config.py
@@ -123,6 +123,9 @@
 LOG_TITLES = DIR_TEMP + "/titles.txt"
 LOG_LOCALE = DIR_TEMP + "/locale.txt"
 
+# Maximum filename length
+IMG_MAX_NAME_LEN = 240
+
 # prefix for URL of local images
 IMAGES_URL_PREFIX = "/images/"
 

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -9,3 +9,4 @@ progress
 pytest
 pytest-mock
 lxml
+python-magic
diff --git a/src/images/embed.py b/src/images/embed.py
@@ -17,19 +17,28 @@
 """Embed pre-selected images in HTML source."""
 
 import logging
+import magic
 import os
 
 import bs4
 
 import config
 
 logger = logging.getLogger('images.embed')
+mimetype = magic.Magic(mime=True)
 
 
 def image_is_embeddable(imgpath, imgsize):
     """Decide if given image will be embedded in HTML source."""
+    result = False
     _, ext = os.path.splitext(imgpath)
-    return ext.lower() == '.svg' and imgsize < 40960
+    if ext.lower() == '.svg' and imgsize < 40960:
+        # Do not trust the file extension alone: confirm the detected MIME type is SVG
+        if os.path.exists(imgpath):
+            mt = mimetype.from_file(imgpath)
+            if mt.startswith('image/svg'):
+                result = True
+    return result
 
 
 class _EmbedImages:

diff --git a/src/images/extract.py b/src/images/extract.py
@@ -252,6 +252,27 @@ def replace(tag):
             logger.warning("Unsupported image with GET args. Won't be included: %s", dsk_url)
             return None, None
 
+        # Make sure the dsk_url filename length is below the filesystem limit
+        limit = config.IMG_MAX_NAME_LEN
+        basedir, filename = os.path.split(dsk_url)
+        name, ext = os.path.splitext(filename)
+        if len(filename) > limit:
+            # We cannot simply get the [:limit] part of the name, since we
+            # cannot know if we will have conflicts with other image names,
+            # so we'll split the filename into subfolders.
+            # e.g. with a limit of 5: superbigfilename.png -> super/bigfi/lenam/e.png
+            logger.debug("Filename too long for %r", dsk_url)
+            new_split_name = []
+            # step by `limit` so an exact multiple doesn't yield an empty last chunk
+            for start in range(0, len(name), limit):
+                new_part = name[start:start + limit]
+                # str.replace returns a new string; the result must be kept
+                new_part = new_part.replace('.', '').replace('-', '').replace('/', '')
+                if new_part:
+                    new_split_name.append(new_part)
+            if new_split_name:
+                dsk_url = os.path.join(basedir, *new_split_name) + ext
+
         logger.debug("web url: %r, dsk_url %r", web_url, dsk_url)
 
         # Replace the width and height by a querystring in the src of the image

diff --git a/src/images/scale.py b/src/images/scale.py
@@ -105,7 +105,7 @@ def run(verbose, src):
                 logger.debug("Rescaling to %d%% image %s", scale, dskurl)
             if scale == 100:
                 done_now[dskurl] = 100
-                if embed_enabled and image_is_embeddable(dskurl, imgsize):
+                if embed_enabled and image_is_embeddable(frompath, imgsize):
                     # don't copy image, leave it out of image blocks, it will
                     # be embedded from original location (without any reduction)
                     images_embed.add(dskurl)

diff --git a/src/scraping/css.py b/src/scraping/css.py
@@ -35,6 +35,7 @@
 Reference: https://www.mediawiki.org/wiki/API:Styling_content
 """
 
+import html
 import logging
 import functools
 import os
@@ -163,7 +164,7 @@ def _module_names(self):
 
         unique_names = set()
         for link in raw_links:
-            url = urllib.parse.urlparse(link)
+            url = urllib.parse.urlparse(html.unescape(link))
             query = dict(urllib.parse.parse_qsl(url.query))
             names = query.get('modules')
             if not names:

diff --git a/src/scraping/scraper.py b/src/scraping/scraper.py
@@ -130,11 +130,16 @@ def fetch_html(url):
         try:
             req = urllib.request.Request(url, headers=REQUEST_HEADERS)
             resp = urllib.request.urlopen(req, timeout=60)
-            compressedstream = io.BytesIO(resp.read())
+            resp_content = resp.read()
+            compressedstream = io.BytesIO(resp_content)
             gzipper = gzip.GzipFile(fileobj=compressedstream)
             html = gzipper.read().decode('utf-8')
             return html
 
+        except gzip.BadGzipFile:
+            # response content is uncompressed
+            return resp_content.decode('utf-8')
+
         except Exception as err:
             if isinstance(err, urllib.error.HTTPError) and err.code == 404:
                 raise FetchingError("Failed with HTTPError 404 on url %r", url)
-Original file line number
+Diff line change
@@ Expand Up / @@ -9,3 +9,4 @@ progress @@
     pytest
     pytest-mock
     lxml
+    python-magic