
Commit 5519f60
Merge branch 'main' into issue-244
7h3Rabbit committed Mar 15, 2024
2 parents 96a2a2a + 3673acb commit 5519f60
Showing 3 changed files with 77 additions and 2 deletions.
default.py: 2 additions & 0 deletions
@@ -149,6 +149,8 @@ def main(argv):
        from engines.csv import read_sites, add_site, delete_site
    elif (file_ending == ".xml"):  # https://example.com/sitemap.xml
        from engines.sitemap import read_sites, add_site, delete_site
    elif (file_long_ending == ".xml.gz"):  # https://example.com/sitemap.xml.gz
        from engines.sitemap import read_sites, add_site, delete_site
    elif file_long_ending == ".result":
        from engines.sitespeed_result import read_sites, add_site, delete_site
    elif file_long_ending == ".webprf":
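The new branch dispatches on file_long_ending rather than file_ending, since ".xml.gz" spans two suffixes and a plain extension check would only see ".gz". A minimal sketch of how such a double ending can be derived (illustrative only; the helper below is not the repository's code):

from pathlib import PurePosixPath
from urllib.parse import urlparse

def long_ending(url):
    # Join the last two suffixes so 'sitemap.xml.gz' yields '.xml.gz'
    # while 'sitemap.xml' still yields '.xml'.
    suffixes = PurePosixPath(urlparse(url).path).suffixes
    return ''.join(suffixes[-2:])

print(long_ending('https://example.com/sitemap.xml.gz'))  # .xml.gz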
engines/sitemap.py: 34 additions & 2 deletions
@@ -4,22 +4,54 @@
import config
from tests.utils import *
import re
import gzip
import io


def read_sites(input_sitemap_url, input_skip, input_take):
    if input_sitemap_url.endswith('.xml'):
        sitemap_content = httpRequestGetContent(input_sitemap_url, True, True)
        return read_sites_from_xml(sitemap_content, input_skip, input_take)
    elif input_sitemap_url.endswith('.xml.gz'):
        # unpack the gzipped sitemap; fetch raw bytes rather than text
        sitemap_content = httpRequestGetContent(input_sitemap_url, True, False)
        gzip_io = io.BytesIO(sitemap_content)
        with gzip.GzipFile(fileobj=gzip_io, mode='rb') as gzip_file:
            gzip_content = gzip_file.read()
        sitemap_content = gzip_content.decode('utf-8')
        return read_sites_from_xml(sitemap_content, input_skip, input_take)
    else:
        sites = list()
        return sites
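The gzipped branch downloads the raw bytes (the third argument, use_text_instead_of_content, set to False) and inflates them in memory via io.BytesIO plus gzip.GzipFile. For reference, the standard library offers a one-step equivalent; a sketch, assuming sitemap_content holds the gzipped bytes fetched above:

import gzip

# One-step in-memory decompression of the downloaded sitemap bytes.
sitemap_content = gzip.decompress(sitemap_content).decode('utf-8')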

def read_sites_from_xml(sitemap_content, input_skip, input_take):
    sites = list()

    # do we have sitemaps in our sitemap?...
    is_recursive = '<sitemap>' in sitemap_content

    regex = r"<loc>(?P<itemurl>[^<]+)<"
    matches = re.finditer(regex, sitemap_content, re.MULTILINE)

    current_index = 0
    for match in matches:
        if not use_item(current_index, input_skip, input_take):
            current_index += 1
            continue

        item_url = match.group('itemurl')

        if is_recursive:
            tmp_sites = read_sites(item_url, input_skip, input_take)
            current_index += len(tmp_sites)
            sites.extend(tmp_sites)
        else:
            content_type = get_content_type(item_url, config.cache_time_delta)
            if content_type is None or 'html' not in content_type:
                print('- skipping index {0} because it is of type: {1}'.format(
                    current_index, content_type))
                current_index += 1
                continue
            sites.append([current_index, item_url])
            current_index += 1
    return sites
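To illustrate the <loc> extraction, here is a two-entry sitemap run through read_sites_from_xml (a sketch, assuming use_item accepts both indexes for these skip/take values and both URLs report an HTML Content-Type, since non-HTML entries are skipped):

sample = (
    '<?xml version="1.0" encoding="UTF-8"?>'
    '<urlset>'
    '<url><loc>https://example.com/</loc></url>'
    '<url><loc>https://example.com/about</loc></url>'
    '</urlset>'
)
sites = read_sites_from_xml(sample, 0, 10)
# Expected result under the assumptions above:
# [[0, 'https://example.com/'], [1, 'https://example.com/about']]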
tests/utils.py: 41 additions & 0 deletions
@@ -229,6 +229,47 @@ def httpRequestGetContent(url, allow_redirects=False, use_text_instead_of_content
        pass
    return ''

def get_content_type(url, cache_time_delta):
    headers = get_url_headers(url, cache_time_delta)

    has_content_type_header = 'Content-Type' in headers
    if has_content_type_header:
        return headers['Content-Type']
    return None
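get_content_type surfaces only the Content-Type header, returning None when the header is absent or the request failed; a hypothetical call mirroring its use in engines/sitemap.py:

content_type = get_content_type('https://example.com/page', config.cache_time_delta)
if content_type is None or 'html' not in content_type:
    print('not an HTML page, skipping')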

def get_url_headers(url, cache_time_delta):
    """Try to fetch the response headers for a URL, using the local cache when fresh.
    Attributes: url, the URL to fetch headers for
    """
    try:
        key = url.replace('https://', 'heads://').replace('http://', 'head://')

        content = get_cache_file(
            key, True, cache_time_delta)
        if content is not None:
            headers = json.loads(content)
            return headers

        headers = {'user-agent': useragent}
        response = requests.head(url, allow_redirects=True,
                                 headers=headers, timeout=request_timeout*2)

        time.sleep(5)

        headers = dict(response.headers)
        nice_headers = json.dumps(headers, indent=3)
        set_cache_file(key, nice_headers, True)
        return headers
    except ssl.CertificateError as error:
        print('Info: Certificate error. {0}'.format(error.reason))
        return dict()
    except requests.exceptions.SSLError:
        return dict()
    except requests.exceptions.ConnectionError:
        return dict()
    except Exception:
        return dict()
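The cache key rewrites the URL scheme so cached HEAD responses never collide with cached GET bodies for the same URL:

# 'https://example.com/robots.txt' is cached under:
key = 'https://example.com/robots.txt'.replace(
    'https://', 'heads://').replace('http://', 'head://')
print(key)  # heads://example.com/robots.txt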

def has_redirect(url):
    """Trying to fetch the response content

