
Commit 5519f60
Merge branch 'main' into issue-244
7h3Rabbit committed Mar 15, 2024
2 parents 96a2a2a + 3673acb commit 5519f60
Showing 3 changed files with 77 additions and 2 deletions.
default.py: 2 additions & 0 deletions
@@ -149,6 +149,8 @@ def main(argv):
        from engines.csv import read_sites, add_site, delete_site
    elif (file_ending == ".xml"):  # https://example.com/sitemap.xml
        from engines.sitemap import read_sites, add_site, delete_site
    elif (file_long_ending == ".xml.gz"):  # https://example.com/sitemap.xml.gz
        from engines.sitemap import read_sites, add_site, delete_site
    elif file_long_ending == ".result":
        from engines.sitespeed_result import read_sites, add_site, delete_site
    elif file_long_ending == ".webprf":
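The new branch dispatches on file_long_ending rather than file_ending, since ".xml.gz" spans two suffixes and a plain extension check would only see ".gz". A minimal sketch of how such a double ending can be derived (illustrative only; the helper below is not the repository's code):

from pathlib import PurePosixPath
from urllib.parse import urlparse

def long_ending(url):
    # Join the last two suffixes so 'sitemap.xml.gz' yields '.xml.gz'
    # while 'sitemap.xml' still yields '.xml'.
    suffixes = PurePosixPath(urlparse(url).path).suffixes
    return ''.join(suffixes[-2:])

print(long_ending('https://example.com/sitemap.xml.gz'))  # .xml.gz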
engines/sitemap.py: 34 additions & 2 deletions
@@ -4,22 +4,54 @@
import config
from tests.utils import *
import re
import gzip
import io


def read_sites(input_sitemap_url, input_skip, input_take):
    if input_sitemap_url.endswith('.xml'):
        sitemap_content = httpRequestGetContent(input_sitemap_url, True, True)
        return read_sites_from_xml(sitemap_content, input_skip, input_take)
    elif input_sitemap_url.endswith('.xml.gz'):
        # unpack the gzipped sitemap; fetch raw bytes rather than text
        sitemap_content = httpRequestGetContent(input_sitemap_url, True, False)
        gzip_io = io.BytesIO(sitemap_content)
        with gzip.GzipFile(fileobj=gzip_io, mode='rb') as gzip_file:
            gzip_content = gzip_file.read()
        sitemap_content = gzip_content.decode('utf-8')
        return read_sites_from_xml(sitemap_content, input_skip, input_take)
    else:
        sites = list()
        return sites
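The gzipped branch downloads the raw bytes (the third argument, use_text_instead_of_content, set to False) and inflates them in memory via io.BytesIO plus gzip.GzipFile. For reference, the standard library offers a one-step equivalent; a sketch, assuming sitemap_content holds the gzipped bytes fetched above:

import gzip

# One-step in-memory decompression of the downloaded sitemap bytes.
sitemap_content = gzip.decompress(sitemap_content).decode('utf-8')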

def read_sites_from_xml(sitemap_content, input_skip, input_take):
    sites = list()

    # do we have sitemaps in our sitemap?...
    is_recursive = '<sitemap>' in sitemap_content

    regex = r"<loc>(?P<itemurl>[^<]+)<"
    matches = re.finditer(regex, sitemap_content, re.MULTILINE)

    current_index = 0
    for match in matches:
        if not use_item(current_index, input_skip, input_take):
            current_index += 1
            continue

        item_url = match.group('itemurl')

        if is_recursive:
            tmp_sites = read_sites(item_url, input_skip, input_take)
            current_index += len(tmp_sites)
            sites.extend(tmp_sites)
        else:
            content_type = get_content_type(item_url, config.cache_time_delta)
            if content_type is None or 'html' not in content_type:
                print('- skipping index {0} because it is of type: {1}'.format(
                    current_index, content_type))
                current_index += 1
                continue
            sites.append([current_index, item_url])
            current_index += 1
    return sites
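To illustrate the <loc> extraction, here is a two-entry sitemap run through read_sites_from_xml (a sketch, assuming use_item accepts both indexes for these skip/take values and both URLs report an HTML Content-Type, since non-HTML entries are skipped):

sample = (
    '<?xml version="1.0" encoding="UTF-8"?>'
    '<urlset>'
    '<url><loc>https://example.com/</loc></url>'
    '<url><loc>https://example.com/about</loc></url>'
    '</urlset>'
)
sites = read_sites_from_xml(sample, 0, 10)
# Expected result under the assumptions above:
# [[0, 'https://example.com/'], [1, 'https://example.com/about']]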
tests/utils.py: 41 additions & 0 deletions
@@ -229,6 +229,47 @@ def httpRequestGetContent(url, allow_redirects=False, use_text_instead_of_content
        pass
    return ''

def get_content_type(url, cache_time_delta):
    headers = get_url_headers(url, cache_time_delta)

    has_content_type_header = 'Content-Type' in headers
    if has_content_type_header:
        return headers['Content-Type']
    return None
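get_content_type surfaces only the Content-Type header, returning None when the header is absent or the request failed; a hypothetical call mirroring its use in engines/sitemap.py:

content_type = get_content_type('https://example.com/page', config.cache_time_delta)
if content_type is None or 'html' not in content_type:
    print('not an HTML page, skipping')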

def get_url_headers(url, cache_time_delta):
    """Try to fetch the response headers for a URL, using the local cache when fresh.
    Attributes: url, the URL to fetch headers for
    """
    try:
        key = url.replace('https://', 'heads://').replace('http://', 'head://')

        content = get_cache_file(
            key, True, cache_time_delta)
        if content is not None:
            headers = json.loads(content)
            return headers

        headers = {'user-agent': useragent}
        response = requests.head(url, allow_redirects=True,
                                 headers=headers, timeout=request_timeout*2)

        time.sleep(5)

        headers = dict(response.headers)
        nice_headers = json.dumps(headers, indent=3)
        set_cache_file(key, nice_headers, True)
        return headers
    except ssl.CertificateError as error:
        print('Info: Certificate error. {0}'.format(error.reason))
        return dict()
    except requests.exceptions.SSLError:
        return dict()
    except requests.exceptions.ConnectionError:
        return dict()
    except Exception:
        return dict()
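The cache key rewrites the URL scheme so cached HEAD responses never collide with cached GET bodies for the same URL:

# 'https://example.com/robots.txt' is cached under:
key = 'https://example.com/robots.txt'.replace(
    'https://', 'heads://').replace('http://', 'head://')
print(key)  # heads://example.com/robots.txt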

def has_redirect(url):
    """Trying to fetch the response content

