diff --git a/config.py b/config.py
index db37c2fc..6eec8a7b 100644
--- a/config.py
+++ b/config.py
@@ -123,6 +123,9 @@
 LOG_TITLES = DIR_TEMP + "/titles.txt"
 LOG_LOCALE = DIR_TEMP + "/locale.txt"
 
+# Maximum filename length
+IMG_MAX_NAME_LEN = 240
+
 # prefix for URL of local images
 IMAGES_URL_PREFIX = "/images/"
 
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 80ea8629..e8636eac 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -9,3 +9,4 @@ progress
 pytest
 pytest-mock
 lxml
+python-magic
diff --git a/src/images/embed.py b/src/images/embed.py
index e4e7562d..7434511d 100644
--- a/src/images/embed.py
+++ b/src/images/embed.py
@@ -17,6 +17,7 @@
 """Embed pre-selected images in HTML source."""
 
 import logging
+import magic
 import os
 
 import bs4
@@ -24,12 +25,20 @@
 import config
 
 logger = logging.getLogger('images.embed')
+mimetype = magic.Magic(mime=True)
 
 
 def image_is_embeddable(imgpath, imgsize):
     """Decide if given image will be embedded in HTML source."""
+    result = False
     _, ext = os.path.splitext(imgpath)
-    return ext.lower() == '.svg' and imgsize < 40960
+    if ext.lower() == '.svg' and imgsize < 40960:
+        # Do not assume an image is an SVG based only on its file extension
+        if os.path.exists(imgpath):
+            mt = mimetype.from_file(imgpath)
+            if mt.startswith('image/svg'):
+                result = True
+    return result
 
 
 class _EmbedImages:
diff --git a/src/images/extract.py b/src/images/extract.py
index 208edc79..c064182b 100644
--- a/src/images/extract.py
+++ b/src/images/extract.py
@@ -252,6 +252,25 @@ def replace(tag):
            logger.warning("Unsupported image with GET args. Won't be included: %s", dsk_url)
            return None, None

+        # Make sure the dsk_url length is below the filesystem limit
+        limit = config.IMG_MAX_NAME_LEN
+        basedir, filename = os.path.split(dsk_url)
+        name, ext = os.path.splitext(filename)
+        if len(filename) > limit:
+            # We cannot simply keep the first `limit` characters of the
+            # name: the truncated name could collide with other image
+            # names. Instead, split the filename into subfolders, e.g.
+            # (with a limit of 5) superbigfilename.png -> super/bigfi/lenam/e.png
+            logger.debug("Filename too long for %r", dsk_url)
+            new_split_name = []
+            for i in range((len(name) - 1) // limit + 1):
+                new_part = name[i * limit:(i + 1) * limit]
+                new_part = new_part.replace('.', '').replace('-', '').replace('/', '')
+                new_split_name.append(new_part)
+            new_dir = os.path.join(*new_split_name)
+            dsk_url = os.path.join(basedir, new_dir)
+            dsk_url += ext
+
        logger.debug("web url: %r, dsk_url %r", web_url, dsk_url)

        # Replace the width and height by a querystring in the src of the image
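
For reference, a self-contained sketch of the splitting scheme applied in extract.py above; split_long_name and the toy limit are illustrative only, not part of the patch:

import os

def split_long_name(dsk_url, limit):
    """Sketch: spread an over-long filename across nested subfolders."""
    basedir, filename = os.path.split(dsk_url)
    name, ext = os.path.splitext(filename)
    if len(filename) <= limit:
        return dsk_url
    # chunks of at most `limit` chars, stripping separators like the patch does
    parts = [
        name[i * limit:(i + 1) * limit].replace('.', '').replace('-', '').replace('/', '')
        for i in range((len(name) - 1) // limit + 1)
    ]
    return os.path.join(basedir, *parts) + ext

# With a toy limit of 5 (the real limit is config.IMG_MAX_NAME_LEN, 240):
print(split_long_name("imgs/superbigfilename.png", 5))
# -> imgs/super/bigfi/lenam/e.png
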
diff --git a/src/images/scale.py b/src/images/scale.py
index 8c6e9d08..659b69e6 100644
--- a/src/images/scale.py
+++ b/src/images/scale.py
@@ -105,7 +105,7 @@ def run(verbose, src):
             logger.debug("Rescaling to %d%% image %s", scale, dskurl)
             if scale == 100:
                 done_now[dskurl] = 100
-                if embed_enabled and image_is_embeddable(dskurl, imgsize):
+                if embed_enabled and image_is_embeddable(frompath, imgsize):
                     # don't copy image, leave it out of image blocks, it will
                     # be embedded from original location (without any reduction)
                     images_embed.add(dskurl)
diff --git a/src/scraping/css.py b/src/scraping/css.py
index 0f1433b7..4e85ad3f 100644
--- a/src/scraping/css.py
+++ b/src/scraping/css.py
@@ -35,6 +35,7 @@
 Reference: https://www.mediawiki.org/wiki/API:Styling_content
 """
 
+import html
 import logging
 import functools
 import os
@@ -163,7 +164,7 @@ def _module_names(self):
         unique_names = set()
 
         for link in raw_links:
-            url = urllib.parse.urlparse(link)
+            url = urllib.parse.urlparse(html.unescape(link))
             query = dict(urllib.parse.parse_qsl(url.query))
             names = query.get('modules')
             if not names:
diff --git a/src/scraping/scraper.py b/src/scraping/scraper.py
index 062d4335..13973859 100755
--- a/src/scraping/scraper.py
+++ b/src/scraping/scraper.py
@@ -130,11 +130,16 @@ def fetch_html(url):
     try:
         req = urllib.request.Request(url, headers=REQUEST_HEADERS)
         resp = urllib.request.urlopen(req, timeout=60)
-        compressedstream = io.BytesIO(resp.read())
+        resp_content = resp.read()
+        compressedstream = io.BytesIO(resp_content)
         gzipper = gzip.GzipFile(fileobj=compressedstream)
         html = gzipper.read().decode('utf-8')
         return html
+    except gzip.BadGzipFile:
+        # the response content is not gzip-compressed
+        return resp_content.decode('utf-8')
+
     except Exception as err:
         if isinstance(err, urllib.error.HTTPError) and err.code == 404:
             raise FetchingError("Failed with HTTPError 404 on url %r", url)
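
And a minimal sketch of the gunzip-or-fallback pattern that fetch_html now follows; the function name and URL handling are placeholders, and note that gzip.BadGzipFile exists since Python 3.8 (older versions raise OSError instead):

import gzip
import io
import urllib.request

def fetch_body(url):
    """Sketch: decode a possibly-gzipped HTTP response body."""
    raw = urllib.request.urlopen(url, timeout=60).read()
    try:
        # works when the server honored Accept-Encoding: gzip
        return gzip.GzipFile(fileobj=io.BytesIO(raw)).read().decode('utf-8')
    except gzip.BadGzipFile:
        # the server sent the body uncompressed
        return raw.decode('utf-8')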