diff --git a/.gitignore b/.gitignore index 88aa4b6..e138b05 100644 --- a/.gitignore +++ b/.gitignore @@ -31,12 +31,6 @@ _site/ *.sqlite3 !sample/sample.sqlite3 -# Crawl files, except for the sample crawl # -############################################ -*.warc -*.warc.gz -!sample/crawl.warc.gz - # OS generated files # ###################### .DS_Store diff --git a/crawler/management/commands/warc_to_csv.py b/crawler/management/commands/warc_to_csv.py deleted file mode 100644 index 882842a..0000000 --- a/crawler/management/commands/warc_to_csv.py +++ /dev/null @@ -1,104 +0,0 @@ -import csv - -import djclick as click - -from crawler.models import Component, Error, Link, Page, Redirect -from crawler.reader import generate_instances - - -@click.command() -@click.argument("warc", type=click.File("rb")) -@click.option( - "--pages-csv", - type=click.File("w", encoding="utf-8-sig"), - default="pages.csv", - show_default=True, -) -@click.option( - "--errors-csv", - type=click.File("w", encoding="utf-8-sig"), - default="errors.csv", - show_default=True, -) -@click.option( - "--redirects-csv", - type=click.File("w", encoding="utf-8-sig"), - default="redirects.csv", - show_default=True, -) -@click.option( - "--links-csv", - type=click.File("w", encoding="utf-8-sig"), - default="links.csv", - show_default=True, -) -@click.option( - "--components-csv", - type=click.File("w", encoding="utf-8-sig"), - default="components.csv", - show_default=True, -) -@click.option( - "--max-pages", type=int, help="Maximum number of pages to read from the archive" -) -def command( - warc, pages_csv, errors_csv, redirects_csv, links_csv, components_csv, max_pages -): - writers = { - model: csv.writer(model_csv, csv.QUOTE_ALL) - for model, model_csv in { - Page: pages_csv, - Error: errors_csv, - Redirect: redirects_csv, - Link: links_csv, - Component: components_csv, - }.items() - } - - for instance in generate_instances(warc, max_pages=max_pages): - if isinstance(instance, Page): - writers[Page].writerow( - [ - instance.timestamp, - instance.url, - instance.title, - instance.language, - ] - ) - - for component in instance.components.all(): - writers[Component].writerow( - [ - instance.url, - component.class_name, - ] - ) - - for link in instance.links.all(): - writers[Link].writerow( - [ - instance.url, - link.href, - ] - ) - elif isinstance(instance, Error): - writers[Error].writerow( - [ - instance.timestamp, - instance.url, - instance.status_code, - instance.referrer, - ] - ) - elif isinstance(instance, Redirect): - writers[Redirect].writerow( - [ - instance.timestamp, - instance.url, - instance.status_code, - instance.referrer, - instance.location, - ] - ) - else: - raise ValueError(instance) diff --git a/crawler/management/commands/warc_to_db.py b/crawler/management/commands/warc_to_db.py deleted file mode 100644 index 43f348c..0000000 --- a/crawler/management/commands/warc_to_db.py +++ /dev/null @@ -1,76 +0,0 @@ -import os -import os.path -from collections import defaultdict - -from django.db import connections -from django.conf import settings -from django.core.management import call_command -from django.test import override_settings - -import djclick as click - -from crawler.reader import generate_instances -from crawler.writer import DatabaseWriter - - -@click.command() -@click.argument("warc", type=click.File("rb")) -@click.argument("db_filename", type=click.Path()) -@click.option( - "--max-pages", type=int, help="Maximum number of pages to read from the archive" -) -@click.option( - "--recreate", - is_flag=True, - show_default=True, - default=False, - help="Recreate database file if it already exists.", -) -@click.option( - "--noinput", - "--no-input", - is_flag=True, - default=False, - help="Do not prompt the user for input of any kind.", -) -@click.option( - "--multiple-domains/--no-multiple-domains", - is_flag=True, - show_default=True, - default=False, - help="Limit pages to the first domain seen.", -) -def command(warc, db_filename, max_pages, recreate, noinput, multiple_domains): - if os.path.exists(db_filename): - if not recreate: - if noinput: - raise click.ClickException( - f"File {db_filename} already exists, use --recreate to recreate." - ) - - click.confirm( - f"File {db_filename} already exists, do you wish to recreate?", - abort=True, - ) - - os.remove(db_filename) - - db_alias = "warc_to_db" - - connections.databases[db_alias] = { - "ENGINE": "django.db.backends.sqlite3", - "NAME": db_filename, - } - - click.echo("Creating empty database tables...") - call_command("migrate", database=db_alias, app_label="crawler", run_syncdb=True) - - click.echo("Reading WARC content into database tables...") - writer = DatabaseWriter(db_alias) - - for instance in generate_instances( - warc, max_pages=max_pages, single_domain_only=not multiple_domains - ): - writer.write(instance) - - writer.analyze() diff --git a/crawler/reader.py b/crawler/reader.py deleted file mode 100644 index a2eb866..0000000 --- a/crawler/reader.py +++ /dev/null @@ -1,189 +0,0 @@ -import os.path -import re -from urllib.parse import urlparse, urlunparse - -import click -import lxml.html -from warcio.archiveiterator import ArchiveIterator - -from crawler.models import Component, Error, Link, Page, Redirect - - -WHITESPACE = re.compile(r"\s+") - - -COMPONENT_SEARCH = re.compile(r"(?:(?:class=\")|\s)((?:o|m|a)-[\w\-]*)") - - -def read_warc_records(warc, silent=False): - iterator = ArchiveIterator(warc) - progress_bar = None - progress_last = 0 - warc_request = None - - if not silent: - file_size = os.path.getsize(warc.name) - progress_bar = click.progressbar(length=file_size) - - for warc_record in iterator: - if warc_record.rec_type == "request": - warc_request = warc_record - else: - if warc_record.rec_type == "response": - yield warc_request, warc_record - - warc_request = None - - if progress_bar: - progress_current = iterator.fh.tell() - progress_step = progress_current - progress_last - progress_bar.update(progress_step) - progress_last = progress_current - - -def get_body(tree): - body = tree.find("./body") - - if body is not None: - drop_element_selectors = [ - ".o-header", - ".o-footer", - ".skip-nav", - "img", - "script", - "style", - ] - - for drop_element_selector in drop_element_selectors: - for element in body.cssselect(drop_element_selector): - element.drop_tree() - - return body - - -def make_instance_from_warc_record( - warc_request, warc_response, seen_urls, limit_domain -): - url = warc_response.rec_headers.get_header("WARC-Target-URI") - - # Skip non-HTTP responses (e.g. DNS lookups). - if not warc_response.http_headers: - return - - # This code is needed because, surprisingly, WARCs may contain multiple - # records pointing to the same URL. This can happen if multiple redirects - # or relative links point to the same target URL. We only want to generate - # records for each URL a single time, so we keep a record of which ones - # we've already seen. - if url in seen_urls: - return - - seen_urls.add(url) - - status_code = int(warc_response.http_headers.get_statuscode()) - content_type = warc_response.http_headers.get_header("Content-Type") - timestamp = warc_response.rec_headers.get_header("WARC-Date") - - if warc_request: - referrer = warc_request.http_headers.get_header("Referer") - else: - referrer = None - - if status_code >= 300: - if status_code < 400: - location = warc_response.http_headers.get("Location") - return Redirect( - timestamp=timestamp, - url=url, - status_code=status_code, - referrer=referrer, - location=location, - ) - else: - return Error( - timestamp=timestamp, url=url, status_code=status_code, referrer=referrer - ) - - return - - if 200 != status_code: - raise ValueError(f"Unexpected status code {status_code} for {url}") - - if not content_type: - raise ValueError(f"Missing content type for {url}") - - if not content_type.startswith("text/html"): - return - - if limit_domain and not url.startswith(limit_domain): - return - - html = warc_response.content_stream().read().decode("utf-8") - tree = lxml.html.fromstring(html) - title_tag = tree.find(".//title") - title = title_tag.text.strip() if title_tag is not None else None - language = tree.find(".").get("lang") - - if title is None: - return - - body = get_body(tree) - - if body is not None: - text = WHITESPACE.sub(" ", body.text_content()).strip() - else: - text = None - - page = Page( - timestamp=timestamp, - url=url, - title=title, - language=language, - html=html, - text=text, - ) - - hrefs = set( - href - for element, attribute, href, pos in body.iterlinks() - if "a" == element.tag and "href" == attribute - ) - - page.links = [Link(href=href) for href in sorted(hrefs)] - - body_html = lxml.etree.tostring(body, encoding="unicode") - - class_names = set(COMPONENT_SEARCH.findall(body_html)) - page.components = [ - Component(class_name=class_name) for class_name in sorted(class_names) - ] - - return page - - -def generate_instances(warc, max_pages=None, single_domain_only=True, silent=False): - page_count = 0 - seen_urls = set() - limit_domain = None - - for warc_request, warc_response in read_warc_records(warc, silent=silent): - instance = make_instance_from_warc_record( - warc_request, warc_response, seen_urls, limit_domain - ) - - if not instance: - continue - - yield instance - - if isinstance(instance, Page): - page_count += 1 - - if max_pages and page_count >= max_pages: - break - - if single_domain_only and not limit_domain: - parsed = urlparse(instance.url) - limit_domain = urlunparse( - (parsed.scheme, parsed.netloc, "/", "", "", "") - ) diff --git a/crawler/wpull_plugin.py b/crawler/wpull_plugin.py index 2c932d3..0e27cea 100644 --- a/crawler/wpull_plugin.py +++ b/crawler/wpull_plugin.py @@ -69,7 +69,7 @@ def deactivate(self): self.db_writer.analyze() def init_db(self): - db_alias = "warc_to_db" + db_alias = "crawler" connections.databases[db_alias] = { "ENGINE": "django.db.backends.sqlite3", diff --git a/list_qs_params.py b/list_qs_params.py deleted file mode 100755 index 48749b7..0000000 --- a/list_qs_params.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 - -# Given a pages.csv file generated by "read_warc.py dump_csvs", output a list -# of query string parameters used across all page URLs. Useful for creating a -# proper --reject-regex parameter for wget, for example: -# -# ./list_qs_params.py pages.csv \ -# --ignore-param=page \ -# --ignore-param=ext_url \ -# --ignore-param=signature \ -# --wget-reject-regex -# -# This command will dump out a list of query string parameters, excluding -# "page", "ext_url", and "signature" in a format compatible with -# wget --reject-regex: -# -# CatID=|NavCode=|_gl=|activity_type=|... -# -# See relevant wget documentation at: -# -# https://www.gnu.org/software/wget/manual/html_node/Recursive-Accept_002fReject-Options.html -import argparse -import csv -from itertools import chain -from operator import itemgetter -from urllib.parse import parse_qs, urlparse - - -def list_qs_params(pages_csv, ignore_param, wget_reject_regex): - reader = csv.reader(pages_csv) - urls = list(map(itemgetter(1), reader)) - params = sorted(set(chain(*(parse_qs(urlparse(url).query).keys() for url in urls)))) - params = [p for p in params if p not in ignore_param] - - if wget_reject_regex: - print("|".join(f"{param}=" for param in params)) - else: - print(params) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument("pages_csv", type=argparse.FileType("r", encoding="utf-8")) - parser.add_argument("--ignore-param", nargs="*", default=[]) - parser.add_argument("--wget-reject-regex", action="store_true") - - args = parser.parse_args() - - list_qs_params(**vars(args)) diff --git a/requirements/base.txt b/requirements/base.txt index 91679fe..cc5e6c2 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -9,7 +9,6 @@ django-modelcluster==5.3 djangorestframework==3.13.1 djangorestframework-csv==2.1.1 lxml==4.9.1 -warcio==1.7.4 whitenoise==5.3.0 wpull==2.0.1 diff --git a/wget_crawl.sh b/wget_crawl.sh deleted file mode 100755 index b8d4300..0000000 --- a/wget_crawl.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/usr/bin/env bash - -# Recursively crawl a website and save its HTML locally. -# -# Example usage: -# -# ./wget_crawl.sh [-d depth] https://www.consumerfinance.gov/ -# -# Optionally specify -d depth to limit the crawl depth. - -# If a command fails, stop executing this script and return its error code. -set -e - -depth=0 - -while getopts ":d:" opt; do - case $opt in - d ) - depth="$OPTARG"; - number_regex='^[0-9]+$' - if ! [[ $depth =~ $number_regex ]] ; then - echo "Crawl depth must be a number." 1>&2 - exit 1 - fi - ;; - \? ) - echo "Invalid option: -$OPTARG." 1>&2 - exit 1 - ;; - : ) - echo "Invalid option: -$OPTARG requires an argument." 1>&2 - exit 1 - ;; - esac -done - -shift $((OPTIND -1)) - -url=$1 - -if [ -z "$url" ]; then - echo "Must specify URL to crawl." - exit 1 -fi - -echo "Starting crawl at $url." - -domain=$url -domain="${domain#http://}" -domain="${domain#https://}" -domain="${domain%%:*}" -domain="${domain%%\?*}" -domain="${domain%%/*}" -echo "Limiting crawl to domain $domain." - -if [ $depth -ne 0 ]; then - echo "Limiting crawl to depth $depth." -fi - -# Crawl into a temporary directory to avoid potential unexpected overwriting -# due to use of --trust-server-names. -# See https://nvd.nist.gov/vuln/detail/CVE-2010-2252. -tmp_dir=$(mktemp -d -t wget-$(date +%Y-%m-%d-%H-%M-%S)-XXXXXXXX) -echo "Working in $tmp_dir." - -pushd "$tmp_dir" > /dev/null - -time wget \ - --domains="$domain" \ - --no-verbose \ - --delete-after \ - --no-directories \ - --warc-file=crawl \ - --warc-cdx=on \ - --warc-tempdir="$tmp_dir" \ - --execute robots=off \ - --wait=0.5 \ - --random-wait \ - --ignore-case \ - --no-hsts \ - --reject '*.css,*.csv,*.do,*.doc,*.docx,*.epub,*.gif,*.ico,*.jpg,*.js,*.json,*.mp3,*.pdf,*.png,*.pptx,*.py,*.r,*.sas,*.sps,*.svg,*.tmp,*.txt,*.wav,*.webmanifest,*.woff,*.woff2,*.xls,*xlsx,*.xml,*.zip' \ - --reject-regex "CatID=|NavCode=|_gl=|activity_type=|authors=|book=|categories=|chartType=|charttype=|clhx=|dateInterval=|date_received_min=|dateinterval=|entx=|ext_url=|fdx=|filter1_topics=|filter2_topics=|form-id=|gib=|gpl=|grade_level=|has_narrative=|hltx=|hous=|houx=|insi=|insl=|inst=|iped=|issue=|language=|lens=|mta=|oid=|othg=|othr=|othx=|parl=|pelg=|perl=|pid=|ppl=|product=|prvf=|prvi=|prvl=|q=|regs=|retx=|schg=|school_subject=|searchField=|search_field=|searchfield=|signature=|size=|sort=|stag=|subl=|tab=|taxx=|title=|topic=|topics=|totl=|tran=|trnx=|tuit=|unsl=|utm_campaign=|utm_medium=|utm_source=|wkst=" \ - --recursive \ - --level="$depth" \ - --user-agent="crawsqueal" \ - "$url" 2>&1 | tee wget.log - -popd > /dev/null - -# Copy back log and WARC file from temporary directory. -cp "$tmp_dir"/wget.log . -cp "$tmp_dir"/crawl.{warc.gz,cdx} . - -# Clean up temporary directory. -rm -rf "$tmp_dir"