Commit 310a02e

some research of sitemaps

7h3Rabbit committed Mar 18, 2024
1 parent e12388d commit 310a02e

Showing 3 changed files with 107 additions and 22 deletions.

65 changes: 49 additions & 16 deletions engines/sitemap.py
@@ -7,24 +7,37 @@
 import gzip
 import io
 
 
 def read_sites(input_sitemap_url, input_skip, input_take):
-    if input_sitemap_url.endswith('.xml'):
-        sitemap_content = httpRequestGetContent(input_sitemap_url, True, True)
-        return read_sites_from_xml(sitemap_content, input_skip, input_take)
-    elif input_sitemap_url.endswith('.xml.gz'):
+    ignore_none_html = True
+    return read_sitemap(input_sitemap_url, input_skip, input_take, ignore_none_html)
+
+def read_sitemap(input_sitemap_url, input_skip, input_take, ignore_none_html):
+    # TODO: handle CDATA-wrapped loc values, for example:
+    # <loc><![CDATA[https://melanomforeningen.se/post-sitemap.xml]]></loc>
+
+    # TODO: every field is CDATA-wrapped at https://melanomforeningen.se/post-sitemap.xml:
+    # <url>
+    #   <loc><![CDATA[https://melanomforeningen.se/nyheter/]]></loc>
+    #   <lastmod><![CDATA[2024-01-26T11:22:43+00:00]]></lastmod>
+    #   <changefreq><![CDATA[weekly]]></changefreq>
+    #   <priority><![CDATA[0.7]]></priority>
+    #   <image:image>
+    #     <image:loc><![CDATA[https://melanomforeningen.se/wp-content/uploads/newspapers-444447_1280.jpg]]></image:loc>
+    #   </image:image>
+    # </url>
+
+    if input_sitemap_url.endswith('.xml.gz'):
         # unpack gzipped sitemap
         sitemap_content = httpRequestGetContent(input_sitemap_url, True, False)
         gzip_io = io.BytesIO(sitemap_content)
         with gzip.GzipFile(fileobj=gzip_io, mode='rb') as gzip_file:
             gzip_content = gzip_file.read()
-            sitemap_content = gzip_content.decode('utf-8')
-        return read_sites_from_xml(sitemap_content, input_skip, input_take)
+            sitemap_content = gzip_content.decode('utf-8', 'ignore')
+        return read_sitemap_xml(sitemap_content, input_skip, input_take, ignore_none_html)
     else:
-        sites = list()
-        return sites
+        sitemap_content = httpRequestGetContent(input_sitemap_url, True, True)
+        return read_sitemap_xml(sitemap_content, input_skip, input_take, ignore_none_html)
 
-def read_sites_from_xml(sitemap_content, input_skip, input_take):
+def read_sitemap_xml(sitemap_content, input_skip, input_take, ignore_none_html):
     sites = list()
 
     # do we have sitemaps in our sitemap?...
@@ -41,17 +54,37 @@ def read_sites_from_xml(sitemap_content, input_skip, input_take):
             continue
 
         item_url = match.group('itemurl')
+        # TODO: validate URL encoding (example: https://www.gotene.se/webdav/files/Centrumhuset/Kultur, turism & fritid/Biblioteket/hemsidefilm/loss_teckensprak.html)
+        item_url = item_url.replace(' ', '%20')
 
         if is_recursive:
-            tmp_sites = read_sites(item_url, input_skip, input_take)
+            tmp_sites = read_sitemap(item_url, input_skip, input_take, ignore_none_html)
             current_index += len(tmp_sites)
             sites.extend(tmp_sites)
         else:
-            content_type = get_content_type(item_url, config.cache_time_delta)
-            if 'html' not in content_type:
-                print('- skipping index {0} because it is of type: {1}'.format(current_index, content_type))
-                current_index += 1
-                continue
+            if ignore_none_html:
+                item_type = 'html'
+                parsed_item_url = urlparse(item_url)
+                tmp = os.path.splitext(parsed_item_url.path)[1].strip('.').lower()
+                ext_len = len(tmp)
+                if ext_len <= 11 and ext_len >= 2:
+                    item_type = tmp
+
+                if 'html' != item_type and 'htm' != item_type:
+                    print('- skipping because it is of type: {0}'.format(item_type))
+                    # current_index += 1
+                    continue
+
+                item_content_type = get_content_type(item_url, config.cache_time_delta)
+                print('content-type', item_content_type)
+                if item_content_type == 401:
+                    print('- skipping because it is of status-code: {0}'.format(item_content_type))
+                    continue
+                elif item_content_type is not None and 'html' not in item_content_type:
+                    print('- skipping because it is of content-type: {0}'.format(item_content_type))
+                    # current_index += 1
+                    continue
 
             sites.append([current_index, item_url])
             current_index += 1
     return sites
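Side note (not part of the commit): the CDATA TODOs above could be handled by unwrapping <![CDATA[...]]> sections before the URL is used. A minimal sketch, assuming the sitemap XML keeps being scanned as raw text the way read_sitemap_xml does; strip_cdata is a hypothetical helper name:

    import re

    CDATA_PATTERN = re.compile(r'<!\[CDATA\[(.*?)\]\]>', re.DOTALL)

    def strip_cdata(value):
        # Return the text inside a CDATA wrapper, or the value
        # unchanged when no wrapper is present.
        match = CDATA_PATTERN.search(value)
        if match:
            return match.group(1).strip()
        return value

    # strip_cdata('<![CDATA[https://melanomforeningen.se/nyheter/]]>')
    # -> 'https://melanomforeningen.se/nyheter/'

Applied to each captured itemurl, this would make CDATA-style sitemaps such as the melanomforeningen.se one parse the same as plain ones.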
50 changes: 44 additions & 6 deletions tests/standard_files.py
@@ -7,6 +7,7 @@
 import config
 from tests.utils import *
 import gettext
+from engines.sitemap import read_sitemap
 _local = gettext.gettext
 
 # DEFAULTS
@@ -100,19 +101,56 @@ def validate_sitemap(_, _local, robots_content, has_robots_txt):
     else:
         return_dict['sitemap'] = 'ok'
 
-    smap_pos = robots_content.lower().find('sitemap')
-    smaps = robots_content[smap_pos:].split('\n')
+    regex = r"sitemap\:(?P<url>[^\n]+)"
     found_smaps = []
-    for line in smaps:
-        if 'sitemap:' in line.lower():
-            found_smaps.append(
-                line.lower().replace('sitemap:', '').strip())
+    matches = re.finditer(regex, robots_content, re.MULTILINE | re.IGNORECASE)
+    for match in matches:
+        sitemap_url = match.group('url').strip()
+        found_smaps.append(sitemap_url)
 
     return_dict["num_sitemaps"] = len(found_smaps)
 
+    # NOTE: https://internetverkstan.se/ has a styled sitemap
+    # TODO: https://internetverkstan.se/ has _1_ entry
+
     if len(found_smaps) > 0:
         return_dict["sitemaps"] = found_smaps
+
+        print('found sitemaps = ', found_smaps)
+
+        sitemap_items = read_sitemap(found_smaps[0], -1, -1, False)
+
+        print(found_smaps[0])
+        # print('\tTotal of Items = ', len(sitemap_items))
+
+        item_types = {}
+        type_spread = {}
+        for item in sitemap_items:
+            item_type = 'webpage'
+            item_url = item[1]
+            # TODO: validate URL encoding (example: https://www.gotene.se/webdav/files/Centrumhuset/Kultur, turism & fritid/Biblioteket/hemsidefilm/loss_teckensprak.html)
+            parsed_item_url = urlparse(item_url)
+            tmp = os.path.splitext(parsed_item_url.path)[1].strip('.').lower()
+            ext_len = len(tmp)
+            # print('ext', tmp)
+            if ext_len <= 11 and ext_len >= 2:
+                item_type = tmp
+            elif parsed_item_url.path.startswith('/download/'):
+                item_type = 'unknown-in-download'
+
+            if item_type not in item_types:
+                item_types[item_type] = list()
+            item_types[item_type].append(item_url)
+
+        item_type_keys = sorted(list(item_types.keys()))
+        type_spread['total'] = len(sitemap_items)
+        for key in item_type_keys:
+            type_spread[key] = len(item_types[key])
+
+        nice_items = json.dumps(type_spread, indent=14)
+        print('\tsitemap[distribution of types]', nice_items)
 
         smap_content = httpRequestGetContent(found_smaps[0], True)
 
         if not is_sitemap(smap_content):

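Side note (not from the commit): a self-contained sketch of the two new pieces in validate_sitemap, the case-insensitive regex over robots.txt and the extension-based type distribution. The sample data is invented for illustration:

    import json
    import os
    import re
    from urllib.parse import urlparse

    robots_content = (
        "User-agent: *\n"
        "SITEMAP: https://example.com/Sitemap-News.xml\n"
        "sitemap: https://example.com/sitemap.xml.gz\n"
    )

    # Same pattern as the commit: matches regardless of casing but,
    # unlike the old line-splitting code, keeps the URL's own casing.
    regex = r"sitemap\:(?P<url>[^\n]+)"
    found_smaps = [m.group('url').strip()
                   for m in re.finditer(regex, robots_content,
                                        re.MULTILINE | re.IGNORECASE)]
    print(found_smaps)
    # ['https://example.com/Sitemap-News.xml', 'https://example.com/sitemap.xml.gz']

    # Extension sniffing as in the loop above: 2-11 characters after the
    # final dot count as a file extension, everything else is a 'webpage'.
    sitemap_items = [
        'https://example.com/nyheter/',
        'https://example.com/files/report.pdf',
        'https://example.com/files/photo.jpg',
    ]
    item_types = {}
    for item_url in sitemap_items:
        item_type = 'webpage'
        ext = os.path.splitext(urlparse(item_url).path)[1].strip('.').lower()
        if 2 <= len(ext) <= 11:
            item_type = ext
        item_types.setdefault(item_type, []).append(item_url)

    type_spread = {'total': len(sitemap_items)}
    for key in sorted(item_types):
        type_spread[key] = len(item_types[key])
    print(json.dumps(type_spread, indent=14))
    # prints {"total": 3, "jpg": 1, "pdf": 1, "webpage": 1}, indented
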
14 changes: 14 additions & 0 deletions tests/utils.py
@@ -238,8 +238,14 @@ def httpRequestGetContent(url, allow_redirects=False, use_text_instead_of_conten
         return ''
 
 def get_content_type(url, cache_time_delta):
+    print(url)
     headers = get_url_headers(url, cache_time_delta)
+
+    if headers.get('status-code') == 401:
+        return 401
+
+    print('\t- headers =', headers)
+
     if 'Content-Type' in headers:
         return headers['Content-Type']
     if 'content-type' in headers:
@@ -264,10 +270,18 @@ def get_url_headers(url, cache_time_delta):
         headers = {'user-agent': useragent}
         a = requests.head(url, allow_redirects=True,
                           headers=headers, timeout=request_timeout*2)
+
+        print('\t- status =', a.status_code)
+
+        if a.status_code == 401:
+            return {
+                'status-code': a.status_code
+            }
+
         time.sleep(5)
 
         headers = dict(a.headers)
+        headers['status-code'] = a.status_code
         nice_headers = json.dumps(headers, indent=3)
         set_cache_file(key, nice_headers, True)
         return headers
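Side note (not from the commit): the point of the new 'status-code' entry is that a HEAD request answered with 401 short-circuits the whole chain — get_url_headers returns only the status, get_content_type turns that into a bare 401, and read_sitemap_xml skips the URL. A minimal sketch of that shape, with head_status as a hypothetical stand-in for get_url_headers (no caching, no custom user agent):

    import requests

    def head_status(url, timeout=10):
        # HEAD request that follows redirects and records the status
        # code alongside the response headers, like get_url_headers does.
        response = requests.head(url, allow_redirects=True, timeout=timeout)
        headers = dict(response.headers)
        headers['status-code'] = response.status_code
        return headers

    headers = head_status('https://example.com/sitemap.xml')
    if headers.get('status-code') == 401:
        print('- skipping, authentication required')
    else:
        print(headers.get('Content-Type'))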
