Commit 310a02e

some research of sitemaps

7h3Rabbit committed Mar 18, 2024
1 parent e12388d commit 310a02e

Showing 3 changed files with 107 additions and 22 deletions.

65 changes: 49 additions & 16 deletions engines/sitemap.py
@@ -7,24 +7,37 @@
 import gzip
 import io
 
 
 def read_sites(input_sitemap_url, input_skip, input_take):
-    if input_sitemap_url.endswith('.xml'):
-        sitemap_content = httpRequestGetContent(input_sitemap_url, True, True)
-        return read_sites_from_xml(sitemap_content, input_skip, input_take)
-    elif input_sitemap_url.endswith('.xml.gz'):
+    ignore_none_html = True
+    return read_sitemap(input_sitemap_url, input_skip, input_take, ignore_none_html)
+
+def read_sitemap(input_sitemap_url, input_skip, input_take, ignore_none_html):
+    # TODO: handle CDATA-wrapped loc values, for example:
+    # <loc><![CDATA[https://melanomforeningen.se/post-sitemap.xml]]></loc>
+
+    # TODO: every field is CDATA-wrapped at https://melanomforeningen.se/post-sitemap.xml:
+    # <url>
+    #   <loc><![CDATA[https://melanomforeningen.se/nyheter/]]></loc>
+    #   <lastmod><![CDATA[2024-01-26T11:22:43+00:00]]></lastmod>
+    #   <changefreq><![CDATA[weekly]]></changefreq>
+    #   <priority><![CDATA[0.7]]></priority>
+    #   <image:image>
+    #     <image:loc><![CDATA[https://melanomforeningen.se/wp-content/uploads/newspapers-444447_1280.jpg]]></image:loc>
+    #   </image:image>
+    # </url>
+
+    if input_sitemap_url.endswith('.xml.gz'):
         # unpack gzipped sitemap
         sitemap_content = httpRequestGetContent(input_sitemap_url, True, False)
         gzip_io = io.BytesIO(sitemap_content)
         with gzip.GzipFile(fileobj=gzip_io, mode='rb') as gzip_file:
             gzip_content = gzip_file.read()
-            sitemap_content = gzip_content.decode('utf-8')
-        return read_sites_from_xml(sitemap_content, input_skip, input_take)
+            sitemap_content = gzip_content.decode('utf-8', 'ignore')
+        return read_sitemap_xml(sitemap_content, input_skip, input_take, ignore_none_html)
     else:
-        sites = list()
-        return sites
+        sitemap_content = httpRequestGetContent(input_sitemap_url, True, True)
+        return read_sitemap_xml(sitemap_content, input_skip, input_take, ignore_none_html)
 
-def read_sites_from_xml(sitemap_content, input_skip, input_take):
+def read_sitemap_xml(sitemap_content, input_skip, input_take, ignore_none_html):
     sites = list()
 
     # do we have sitemaps in our sitemap?...
@@ -41,17 +54,37 @@ def read_sites_from_xml(sitemap_content, input_skip, input_take):
             continue
 
         item_url = match.group('itemurl')
+        # TODO: validate URL encoding (example: https://www.gotene.se/webdav/files/Centrumhuset/Kultur, turism & fritid/Biblioteket/hemsidefilm/loss_teckensprak.html)
+        item_url = item_url.replace(' ', '%20')
 
         if is_recursive:
-            tmp_sites = read_sites(item_url, input_skip, input_take)
+            tmp_sites = read_sitemap(item_url, input_skip, input_take, ignore_none_html)
             current_index += len(tmp_sites)
             sites.extend(tmp_sites)
         else:
-            content_type = get_content_type(item_url, config.cache_time_delta)
-            if 'html' not in content_type:
-                print('- skipping index {0} because it is of type: {1}'.format(current_index, content_type))
-                current_index += 1
-                continue
+            if ignore_none_html:
+                item_type = 'html'
+                parsed_item_url = urlparse(item_url)
+                tmp = os.path.splitext(parsed_item_url.path)[1].strip('.').lower()
+                ext_len = len(tmp)
+                if ext_len <= 11 and ext_len >= 2:
+                    item_type = tmp
+
+                if 'html' != item_type and 'htm' != item_type:
+                    print('- skipping because it is of type: {0}'.format(item_type))
+                    # current_index += 1
+                    continue
+
+                item_content_type = get_content_type(item_url, config.cache_time_delta)
+                print('content-type', item_content_type)
+                if item_content_type == 401:
+                    print('- skipping because it is of status-code: {0}'.format(item_content_type))
+                    continue
+                elif item_content_type is not None and 'html' not in item_content_type:
+                    print('- skipping because it is of content-type: {0}'.format(item_content_type))
+                    # current_index += 1
+                    continue
 
             sites.append([current_index, item_url])
             current_index += 1
     return sites
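Side note (not part of the commit): the CDATA TODOs above could be handled by unwrapping <![CDATA[...]]> sections before the URL is used. A minimal sketch, assuming the sitemap XML keeps being scanned as raw text the way read_sitemap_xml does; strip_cdata is a hypothetical helper name:

    import re

    CDATA_PATTERN = re.compile(r'<!\[CDATA\[(.*?)\]\]>', re.DOTALL)

    def strip_cdata(value):
        # Return the text inside a CDATA wrapper, or the value
        # unchanged when no wrapper is present.
        match = CDATA_PATTERN.search(value)
        if match:
            return match.group(1).strip()
        return value

    # strip_cdata('<![CDATA[https://melanomforeningen.se/nyheter/]]>')
    # -> 'https://melanomforeningen.se/nyheter/'

Applied to each captured itemurl, this would make CDATA-style sitemaps such as the melanomforeningen.se one parse the same as plain ones.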
50 changes: 44 additions & 6 deletions tests/standard_files.py
@@ -7,6 +7,7 @@
 import config
 from tests.utils import *
 import gettext
+from engines.sitemap import read_sitemap
 _local = gettext.gettext
 
 # DEFAULTS
@@ -100,19 +101,56 @@ def validate_sitemap(_, _local, robots_content, has_robots_txt):
     else:
         return_dict['sitemap'] = 'ok'
 
-    smap_pos = robots_content.lower().find('sitemap')
-    smaps = robots_content[smap_pos:].split('\n')
+    regex = r"sitemap\:(?P<url>[^\n]+)"
     found_smaps = []
-    for line in smaps:
-        if 'sitemap:' in line.lower():
-            found_smaps.append(
-                line.lower().replace('sitemap:', '').strip())
+    matches = re.finditer(regex, robots_content, re.MULTILINE | re.IGNORECASE)
+    for match in matches:
+        sitemap_url = match.group('url').strip()
+        found_smaps.append(sitemap_url)
 
     return_dict["num_sitemaps"] = len(found_smaps)
 
+    # NOTE: https://internetverkstan.se/ has a styled sitemap
+    # TODO: https://internetverkstan.se/ has _1_ entry
+
     if len(found_smaps) > 0:
         return_dict["sitemaps"] = found_smaps
+
+        print('found sitemaps = ', found_smaps)
+
+        sitemap_items = read_sitemap(found_smaps[0], -1, -1, False)
+
+        print(found_smaps[0])
+        # print('\tTotal of Items = ', len(sitemap_items))
+
+        item_types = {}
+        type_spread = {}
+        for item in sitemap_items:
+            item_type = 'webpage'
+            item_url = item[1]
+            # TODO: validate URL encoding (example: https://www.gotene.se/webdav/files/Centrumhuset/Kultur, turism & fritid/Biblioteket/hemsidefilm/loss_teckensprak.html)
+            parsed_item_url = urlparse(item_url)
+            tmp = os.path.splitext(parsed_item_url.path)[1].strip('.').lower()
+            ext_len = len(tmp)
+            # print('ext', tmp)
+            if ext_len <= 11 and ext_len >= 2:
+                item_type = tmp
+            elif parsed_item_url.path.startswith('/download/'):
+                item_type = 'unknown-in-download'
+
+            if item_type not in item_types:
+                item_types[item_type] = list()
+            item_types[item_type].append(item_url)
+
+        item_type_keys = sorted(list(item_types.keys()))
+        type_spread['total'] = len(sitemap_items)
+        for key in item_type_keys:
+            type_spread[key] = len(item_types[key])
+
+        nice_items = json.dumps(type_spread, indent=14)
+        print('\tsitemap[distribution of types]', nice_items)
 
         smap_content = httpRequestGetContent(found_smaps[0], True)
 
         if not is_sitemap(smap_content):

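Side note (not from the commit): a self-contained sketch of the two new pieces in validate_sitemap, the case-insensitive regex over robots.txt and the extension-based type distribution. The sample data is invented for illustration:

    import json
    import os
    import re
    from urllib.parse import urlparse

    robots_content = (
        "User-agent: *\n"
        "SITEMAP: https://example.com/Sitemap-News.xml\n"
        "sitemap: https://example.com/sitemap.xml.gz\n"
    )

    # Same pattern as the commit: matches regardless of casing but,
    # unlike the old line-splitting code, keeps the URL's own casing.
    regex = r"sitemap\:(?P<url>[^\n]+)"
    found_smaps = [m.group('url').strip()
                   for m in re.finditer(regex, robots_content,
                                        re.MULTILINE | re.IGNORECASE)]
    print(found_smaps)
    # ['https://example.com/Sitemap-News.xml', 'https://example.com/sitemap.xml.gz']

    # Extension sniffing as in the loop above: 2-11 characters after the
    # final dot count as a file extension, everything else is a 'webpage'.
    sitemap_items = [
        'https://example.com/nyheter/',
        'https://example.com/files/report.pdf',
        'https://example.com/files/photo.jpg',
    ]
    item_types = {}
    for item_url in sitemap_items:
        item_type = 'webpage'
        ext = os.path.splitext(urlparse(item_url).path)[1].strip('.').lower()
        if 2 <= len(ext) <= 11:
            item_type = ext
        item_types.setdefault(item_type, []).append(item_url)

    type_spread = {'total': len(sitemap_items)}
    for key in sorted(item_types):
        type_spread[key] = len(item_types[key])
    print(json.dumps(type_spread, indent=14))
    # prints {"total": 3, "jpg": 1, "pdf": 1, "webpage": 1}, indented
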
14 changes: 14 additions & 0 deletions tests/utils.py
@@ -238,8 +238,14 @@ def httpRequestGetContent(url, allow_redirects=False, use_text_instead_of_conten
         return ''
 
 def get_content_type(url, cache_time_delta):
+    print(url)
     headers = get_url_headers(url, cache_time_delta)
+
+    if headers.get('status-code') == 401:
+        return 401
+
+    print('\t- headers =', headers)
+
     if 'Content-Type' in headers:
         return headers['Content-Type']
     if 'content-type' in headers:
@@ -264,10 +270,18 @@ def get_url_headers(url, cache_time_delta):
         headers = {'user-agent': useragent}
         a = requests.head(url, allow_redirects=True,
                           headers=headers, timeout=request_timeout*2)
+
+        print('\t- status =', a.status_code)
+
+        if a.status_code == 401:
+            return {
+                'status-code': a.status_code
+            }
+
         time.sleep(5)
 
         headers = dict(a.headers)
+        headers['status-code'] = a.status_code
         nice_headers = json.dumps(headers, indent=3)
         set_cache_file(key, nice_headers, True)
         return headers
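Side note (not from the commit): the point of the new 'status-code' entry is that a HEAD request answered with 401 short-circuits the whole chain — get_url_headers returns only the status, get_content_type turns that into a bare 401, and read_sitemap_xml skips the URL. A minimal sketch of that shape, with head_status as a hypothetical stand-in for get_url_headers (no caching, no custom user agent):

    import requests

    def head_status(url, timeout=10):
        # HEAD request that follows redirects and records the status
        # code alongside the response headers, like get_url_headers does.
        response = requests.head(url, allow_redirects=True, timeout=timeout)
        headers = dict(response.headers)
        headers['status-code'] = response.status_code
        return headers

    headers = head_status('https://example.com/sitemap.xml')
    if headers.get('status-code') == 401:
        print('- skipping, authentication required')
    else:
        print(headers.get('Content-Type'))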
