diff --git a/.github/workflows/check-all-lists.yml b/.github/workflows/check-all-lists.yml
index fb6682c83..70e87b092 100644
--- a/.github/workflows/check-all-lists.yml
+++ b/.github/workflows/check-all-lists.yml
@@ -11,5 +11,8 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
+      - name: Install hatch
+        run: pip install hatch
+      - name: Run lint lists against all lists
-      - run: python scripts/lint-lists.py lists/
+        run: hatch run lint-lists lists/
diff --git a/docs/spec.md b/docs/spec.md
new file mode 100644
index 000000000..812629619
--- /dev/null
+++ b/docs/spec.md
@@ -0,0 +1,102 @@
+## Test Lists v1 data format
+
+The goal of this section is to outline the current data format of the test
+lists.
+
+Ideally this data format spec would also be enriched with some additional
+notes on the existing pain points and the current limitations.
+
+### v1 data format
+
+The test lists are broken down into CSV files, which are named as follows:
+* `global.csv` for the test list that applies to all countries
+* `[country_code].csv` for country-specific lists, where `country_code` is the
+  lowercase
+  [ISO 3166](https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes)
+  alpha-2 country code. The only exception is `cis`, which is used for the
+  list covering Commonwealth of Independent States nations.
+
+Each CSV file contains the following columns:
+
+* `url` - Full URL of the resource, which must match the following regular expression:
+```
+re.compile(
+    r'^(?:http)s?://'  # http:// or https://
+    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
+    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
+    r'(?::\d+)?'  # optional port
+    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
+```
+* `category_code` - Category code (see the v1 category codes below)
+* `category_description` - Description of the category
+* `date_added` - [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) date of when the entry was added to the list, in the format `YYYY-MM-DD`
+* `source` - opaque string representing the name of the person that added the entry to the list
+* `notes` - opaque string with notes about the entry
+
+### v1 category codes
+
+| Category description | Code |
+| --- | --- |
+| Alcohol & Drugs | ALDR |
+| Religion | REL |
+| Pornography | PORN |
+| Provocative Attire | PROV |
+| Political Criticism | POLR |
+| Human Rights Issues | HUMR |
+| Environment | ENV |
+| Terrorism and Militants | MILX |
+| Hate Speech | HATE |
+| News Media | NEWS |
+| Sex Education | XED |
+| Public Health | PUBH |
+| Gambling | GMB |
+| Anonymization and circumvention tools | ANON |
+| Online Dating | DATE |
+| Social Networking | GRP |
+| LGBT | LGBT |
+| File-sharing | FILE |
+| Hacking Tools | HACK |
+| Communication Tools | COMT |
+| Media sharing | MMED |
+| Hosting and Blogging Platforms | HOST |
+| Search Engines | SRCH |
+| Gaming | GAME |
+| Culture | CULTR |
+| Economics | ECON |
+| Government | GOVT |
+| E-commerce | COMM |
+| Control content | CTRL |
+| Intergovernmental Organizations | IGO |
+| Miscellaneous content | MISC |
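+
+Putting the column rules together, here is a minimal Python sketch (shown
+purely as an illustration, not part of the lists tooling; the sample row is
+made up) that checks a hypothetical v1 row against the URL regular
+expression and the `date_added` format described above:
+
+```python
+import datetime
+import re
+
+VALID_URL = re.compile(
+    r'^(?:http)s?://'
+    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
+    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
+    r'(?::\d+)?'
+    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
+
+# A hypothetical v1 row: url, category_code, category_description,
+# date_added, source, notes.
+row = ["https://www.example.com/", "CULTR", "Culture", "2017-04-12", "ooni", ""]
+url, category_code, category_description, date_added, source, notes = row
+
+assert VALID_URL.match(url)
+# date_added must be a valid date in the YYYY-MM-DD format
+parsed = datetime.datetime.strptime(date_added, "%Y-%m-%d").date()
+assert parsed.isoformat() == date_added
+```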
+
+## v1.5 data format
+
+The goal of the v1.5 data format is to come up with an incremental set of
+changes to the list formats, such that it's possible to relatively easily
+backport changes from upstream while we work on fully migrating over to the
+new format.
+
+Ideally it would include only the addition of new columns, without any
+dramatic changes, to minimize the likelihood of conflicts when changes are
+merged from upstream.
+
+The v1.5 columns are the same as in v1, with the `notes` column redefined:
+
+* `url` - Full URL of the resource
+* `category_code` - Category code (see the v1 category codes above)
+* `category_description` - Description of the category
+* `date_added` - ISO 8601 date of when the entry was added
+* `source` - string representing the name of the person that added the entry
+* `notes` - a JSON string representing metadata for the URL (see URL Meta below)
+
+### URL Meta
+
+URL Meta is a JSON-encoded metadata column that expresses metadata related
+to a URL that is relevant to analysts performing data analysis.
+
+It should be extensible without needing to add new columns (adding or
+changing columns has the potential of breaking CSV parsers).
+
+This field is optional: parsers should not expect it to be present, nor
+expect it to contain any of the specific keys defined below.
+
+Defined keys:
+* `notes`: value coming from the existing notes column
+* `context_*`: values representing context that's specific to the URL
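+
+As an example, migrating a hypothetical v1 entry into the v1.5 format could
+produce the following `notes` value (the legacy notes string and the
+`context_region` key are made-up illustrations; `context_region` is not a
+key defined by this spec, it only shows how `context_*` extensibility works):
+
+```python
+import json
+
+legacy_notes = "added during 2019 review"  # hypothetical v1 notes value
+url_meta = {
+    "notes": legacy_notes,
+    # hypothetical context_* key, shown only to illustrate extensibility
+    "context_region": "north",
+}
+# The v1.5 notes column holds the JSON-encoded object.
+notes_column = json.dumps(url_meta)
+
+# Parsers must tolerate the metadata being absent or keys being missing.
+parsed = json.loads(notes_column) if notes_column.startswith("{") else {}
+print(parsed.get("notes", ""))
+```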
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 000000000..b6c2c72ca
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,65 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "test-lists"
+dynamic = ["version"]
+description = ''
+readme = "README.md"
+requires-python = ">=3.8"
+license = "MPL-2.0"
+keywords = []
+authors = [{ name = "Arturo Filastò", email = "arturo@filasto.net" }]
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3.8",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: Implementation :: CPython",
+  "Programming Language :: Python :: Implementation :: PyPy",
+]
+dependencies = []
+
+[project.urls]
+Documentation = "https://github.com/ooni/test-lists#readme"
+Issues = "https://github.com/ooni/test-lists/issues"
+Source = "https://github.com/ooni/test-lists"
+
+[tool.hatch.version]
+path = "src/test_lists/__about__.py"
+
+[tool.hatch.envs.default]
+dependencies = ["coverage[toml]>=6.5", "pytest", "click"]
+path = ".venv/"
+
+[tool.hatch.envs.default.scripts]
+lint-lists = "python -m test_lists.cli lint-lists {args}"
+test = "pytest {args:tests}"
+test-cov = "coverage run -m pytest {args:tests}"
+cov-report = ["- coverage combine", "coverage report"]
+cov = ["test-cov", "cov-report"]
+
+[[tool.hatch.envs.all.matrix]]
+python = ["3.8", "3.9", "3.10", "3.11", "3.12"]
+
+[tool.hatch.envs.types]
+dependencies = ["mypy>=1.0.0"]
+[tool.hatch.envs.types.scripts]
+check = "mypy --install-types --non-interactive {args:src/test_lists tests}"
+
+[tool.coverage.run]
+source_pkgs = ["test_lists", "tests"]
+branch = true
+parallel = true
+omit = ["src/test_lists/__about__.py"]
+
+[tool.coverage.paths]
+test_lists = ["src/test_lists", "*/test-lists/src/test_lists"]
+tests = ["tests", "*/test-lists/tests"]
+
+[tool.coverage.report]
+exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
diff --git a/scripts/lint-lists.py b/scripts/lint-lists.py
deleted file mode 100755
index b7784c06b..000000000
--- a/scripts/lint-lists.py
+++ /dev/null
@@ -1,263 +0,0 @@
-#!/usr/bin/env python - -from __future__ import print_function - -import argparse -import datetime -import os -import re -import sys -import csv -from glob import glob - -try: - from urlparse import urlparse -except: - from urllib.parse import urlparse - -VALID_URL = regex = re.compile( - r'^(?:http)s?://' # http:// or https:// - r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain... - r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip - r'(?::\d+)?' # optional port - r'(?:/?|[/?]\S+)$', re.IGNORECASE) - -BAD_CHARS = ["\r", "\n", "\t", "\\"] - -CATEGORY_CODES = {} -COUNTRY_CODES = {} - -NEW_CATEGORY_CODES = "00-LEGEND-new_category_codes.csv" -LEGACY_CATEGORY_CODES = "00-LEGEND-category_codes.csv" -COUNTRY_CODES = "00-LEGEND-country_codes.csv" - -def is_valid_date(d): - try: - if datetime.datetime.strptime(d, "%Y-%m-%d").date().isoformat() == d: - return True - except Exception: - pass - return False - -class TestListError(object): - name = 'Test List Error' - def __init__(self, csv_path, line_number): - self.csv_path = csv_path - self.line_number = line_number - - def print(self): - print('{} (line {}): {}'.format( - self.csv_path, self.line_number, self.name - )) - -class TestListErrorWithValue(TestListError): - def __init__(self, value, csv_path, line_number, details=None): - super(TestListErrorWithValue, self).__init__(csv_path, line_number) - self.value = value - self.details = details - - def print(self): - msg = '{} (line {}): {} "{}"'.format( - self.csv_path, self.line_number, self.name, self.value - ) - if self.details: - msg += ' ({})'.format(self.details) - print(msg) - -class InvalidHeader(TestListError): - name = 'Invalid Header' - -class InvalidColumnNumber(TestListError): - name = 'Invalid Column Number' - -class InvalidURL(TestListErrorWithValue): - name = 'Invalid URL' - -class InvalidNotes(TestListErrorWithValue): - name = 'Invalid Notes' - -class InvalidSource(TestListErrorWithValue): - name = 'Invalid Source' - -class DuplicateURL(TestListErrorWithValue): - name = 'Duplicate URL' - -class InvalidCategoryCode(TestListErrorWithValue): - name = 'Invalid Category Code' - -class InvalidCategoryDesc(TestListErrorWithValue): - name = 'Invalid Category Description' - -class InvalidDate(TestListErrorWithValue): - name = 'Invalid Date' - -class DuplicateURLWithGlobalList(TestListErrorWithValue): - name = "Duplicate URL between Local List and Global List" - -def get_legacy_description_code(row): - return row[1], row[0] - -def get_new_description_code(row): - return row[0], row[1] - -def load_categories(path, get_description_code=get_new_description_code): - code_map = {} - with open(path, 'r') as in_file: - reader = csv.reader(in_file, delimiter=',') - next(reader) # skip header - for row in reader: - desc, code = get_description_code(row) - code_map[code] = desc - return code_map - -def load_global_list(path): - check_list = set() - with open(path, 'r') as in_file: - reader = csv.reader(in_file, delimiter=',') - for idx, row in enumerate(reader): - if idx != 0 and (len(row) == 6): - check_list.add(row[0]) - return check_list - - -ERR_NOSLASH = "No trailing slash" - -def check(url): - if not VALID_URL.match(url): - return "No match" - elif any([c in url for c in BAD_CHARS]): - return "Bad chars" - elif url != url.strip(): - return "Extra spaces at ends" - elif urlparse(url).path == "": - return ERR_NOSLASH - - -def main(lists_path, fix_duplicates=False, fix_slash=False): - all_errors = [] - total_urls = 0 - total_countries = 0 - CATEGORY_CODES = load_categories( - os.path.join(lists_path, NEW_CATEGORY_CODES), - get_new_description_code - ) - header = ['url', 'category_code', 'category_description', - 'date_added', 'source', 'notes'] - # preload the global list to check against looking for dupes - global_urls_bag = 
load_global_list(os.path.join(lists_path, "global.csv")) - for csv_path in glob(os.path.join(lists_path, "*")): - if os.path.basename(csv_path).startswith('00-'): - continue - if not csv_path.endswith('.csv'): - continue - with open(csv_path, 'r', encoding='utf-8') as in_file: - reader = csv.reader(in_file, delimiter=',') - first_line = next(reader) - if first_line != header: - errors.append( - InvalidHeader(csv_path, 0) - ) - urls_bag = set() - errors = [] - rows = [] - duplicates = 0 - without_slash = 0 - idx = -1 - for idx, row in enumerate(reader): - if len(row) != 6: - errors.append( - InvalidColumnNumber(csv_path, idx+2) - ) - continue - url, cat_code, cat_desc, date_added, source, notes = row - err = check(url) - if err: - errors.append( - InvalidURL(url, csv_path, idx+2, details=err) - ) - if err == ERR_NOSLASH: - without_slash += 1 - row[0] = row[0] + "/" - if os.path.basename(csv_path) != "global.csv": - if url in global_urls_bag: - errors.append( - DuplicateURLWithGlobalList(url, csv_path, idx+2) - ) - if fix_duplicates: - duplicates += 1 - continue - - try: - cat_description = CATEGORY_CODES[cat_code] - except KeyError: - errors.append( - InvalidCategoryCode(cat_code, csv_path, idx+2) - ) - if cat_description != cat_desc: - errors.append( - InvalidCategoryDesc(cat_desc, csv_path, idx+2) - ) - if url in urls_bag: - if not fix_duplicates: - errors.append( - DuplicateURL(url, csv_path, idx+2) - ) - duplicates += 1 - continue - if not is_valid_date(date_added): - errors.append( - InvalidDate(date_added, csv_path, idx+2) - ) - if any([c in notes for c in BAD_CHARS]): - errors.append( - InvalidNotes(notes, csv_path, idx+2) - ) - if any([c in source for c in BAD_CHARS]): - errors.append( - InvalidSource(source, csv_path, idx+2) - ) - urls_bag.add(url) - rows.append(row) - print('* {}'.format(csv_path)) - print(' {} URLs'.format(idx+1)) - print(' {} Errors'.format(len(errors))) - all_errors += errors - total_urls += idx+1 - total_countries += 1 - - if fix_slash and without_slash > 0: - print('Fixing slash in %s' % csv_path) - rows.insert(0, header) - with open(csv_path + '.fixed', 'w') as out_file: - csv_writer = csv.writer(out_file, quoting=csv.QUOTE_MINIMAL, lineterminator='\n') - csv_writer.writerows(rows) - os.rename(csv_path + '.fixed', csv_path) - - if fix_duplicates and duplicates > 0: - rows.sort(key=lambda x: x[0].split('//')[1]) - rows.insert(0, header) - with open(csv_path + '.fixed', 'w') as out_file: - csv_writer = csv.writer(out_file, quoting=csv.QUOTE_MINIMAL, lineterminator='\n') - csv_writer.writerows(rows) - print('Sorting %s - Found %d duplicates' % (csv_path, duplicates)) - os.rename(csv_path + '.fixed', csv_path) - - print('----------') - print('Analyzed {} URLs in {} countries'.format(total_urls, total_countries)) - if len(all_errors) == 0: - print('ALL OK') - sys.exit(0) - - print("{} errors present".format(len(all_errors))) - for error in all_errors: - error.print() - sys.exit(1) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Check that the test lists are OK') - parser.add_argument('lists_path', metavar='LISTS_PATH', help='path to the test list') - parser.add_argument('--fix-duplicates', action='store_true') - parser.add_argument('--fix-slash', action='store_true') - - args = parser.parse_args() - main(args.lists_path, fix_duplicates=args.fix_duplicates, fix_slash=args.fix_slash) diff --git a/src/test_lists/__about__.py b/src/test_lists/__about__.py new file mode 100644 index 000000000..201bf4834 --- /dev/null +++ 
b/src/test_lists/__about__.py
@@ -0,0 +1,4 @@
+# SPDX-FileCopyrightText: 2024-present Arturo Filastò
+#
+# SPDX-License-Identifier: MPL-2.0
+__version__ = "0.0.1"
diff --git a/src/test_lists/__init__.py b/src/test_lists/__init__.py
new file mode 100644
index 000000000..1614e8c8e
--- /dev/null
+++ b/src/test_lists/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2024-present Arturo Filastò
+#
+# SPDX-License-Identifier: MPL-2.0
diff --git a/src/test_lists/cli.py b/src/test_lists/cli.py
new file mode 100644
index 000000000..a99f45c1b
--- /dev/null
+++ b/src/test_lists/cli.py
@@ -0,0 +1,43 @@
+import click
+
+from .lint_lists import lint_lists
+
+
+@click.group()
+def cli():
+    """Test Lists management command line tool."""
+    pass
+
+
+@cli.command("lint-lists")
+@click.argument("lists_path", type=click.Path())
+@click.option("--fix-duplicates", is_flag=True, help="Fix duplicates in the test list")
+@click.option("--fix-slash", is_flag=True, help="Fix slashes in the test list")
+@click.option(
+    "--fix-notes", is_flag=True, help="Fix notes field by converting it to JSON"
+)
+@click.option(
+    "--force-update", is_flag=True, help="Force updating of the test list formats"
+)
+def cli_lint_lists(lists_path, fix_duplicates, fix_slash, fix_notes, force_update):
+    """
+    Check that the test lists are OK.
+
+    Args:
+        lists_path (str): Path to the test list.
+        fix_duplicates (bool): Option to fix duplicates in the test list.
+        fix_slash (bool): Option to fix slashes in the test list.
+        fix_notes (bool): Option to fix notes in the test list.
+        force_update (bool): Option to force update of the test list. Useful to change quoting.
+    """
+    lint_lists(
+        lists_path,
+        fix_duplicates=fix_duplicates,
+        fix_slash=fix_slash,
+        fix_notes=fix_notes,
+        force_update=force_update,
+    )
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/src/test_lists/lint_lists.py b/src/test_lists/lint_lists.py
new file mode 100755
index 000000000..a89c19d41
--- /dev/null
+++ b/src/test_lists/lint_lists.py
@@ -0,0 +1,325 @@
+import datetime
+import os
+import re
+import sys
+import csv
+import json
+from glob import glob
+
+from urllib.parse import urlparse
+
+VALID_URL = re.compile(
+    r"^(?:http)s?://"  # http:// or https://
+    r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"  # domain...
+    r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
+    r"(?::\d+)?"  # optional port
+    r"(?:/?|[/?]\S+)$",
+    re.IGNORECASE,
+)
+
+BAD_CHARS = ["\r", "\n", "\t", "\\"]
+
+
+NEW_CATEGORY_CODES = "00-LEGEND-new_category_codes.csv"
+LEGACY_CATEGORY_CODES = "00-LEGEND-category_codes.csv"
+COUNTRY_CODES = "00-LEGEND-country_codes.csv"
+
+
+def is_valid_date(d):
+    try:
+        if datetime.datetime.strptime(d, "%Y-%m-%d").date().isoformat() == d:
+            return True
+    except Exception:
+        pass
+    return False
+
+
+class TestListError(object):
+    name = "Test List Error"
+
+    def __init__(self, csv_path, line_number):
+        self.csv_path = csv_path
+        self.line_number = line_number
+
+    def print(self):
+        print("{} (line {}): {}".format(self.csv_path, self.line_number, self.name))
+
+
+class TestListErrorWithValue(TestListError):
+    def __init__(self, value, csv_path, line_number, details=None):
+        super(TestListErrorWithValue, self).__init__(csv_path, line_number)
+        self.value = value
+        self.details = details
+
+    def print(self):
+        msg = '{} (line {}): {} "{}"'.format(
+            self.csv_path, self.line_number, self.name, self.value
+        )
+        if self.details:
+            msg += " ({})".format(self.details)
+        print(msg)
+
+
+class InvalidHeader(TestListError):
+    name = "Invalid Header"
+
+
+class InvalidColumnNumber(TestListError):
+    name = "Invalid Column Number"
+
+
+class InvalidURL(TestListErrorWithValue):
+    name = "Invalid URL"
+
+
+class InvalidNotes(TestListErrorWithValue):
+    name = "Invalid Notes"
+
+
+class InvalidSource(TestListErrorWithValue):
+    name = "Invalid Source"
+
+
+class DuplicateURL(TestListErrorWithValue):
+    name = "Duplicate URL"
+
+
+class InvalidCategoryCode(TestListErrorWithValue):
+    name = "Invalid Category Code"
+
+
+class InvalidCategoryDesc(TestListErrorWithValue):
+    name = "Invalid Category Description"
+
+
+class InvalidDate(TestListErrorWithValue):
+    name = "Invalid Date"
+
+
+class DuplicateURLWithGlobalList(TestListErrorWithValue):
+    name = "Duplicate URL between Local List and Global List"
+
+
+def get_legacy_description_code(row):
+    return row[1], row[0]
+
+
+def get_new_description_code(row):
+    return row[0], row[1]
+
+
+def load_categories(path, get_description_code=get_new_description_code):
+    code_map = {}
+    with open(path, "r") as in_file:
+        reader = csv.reader(in_file, delimiter=",")
+        next(reader)  # skip header
+        for row in reader:
+            desc, code = get_description_code(row)
+            code_map[code] = desc
+    return code_map
+
+
+def load_global_list(path, quotechar):
+    check_list = set()
+    with open(path, "r") as in_file:
+        reader = csv.reader(in_file, delimiter=",", quotechar=quotechar)
+        for idx, row in enumerate(reader):
+            if idx != 0 and (len(row) == 6):
+                check_list.add(row[0])
+    return check_list
+
+
+VALID_NOTES_JSON_KEYS = ["notes"]
+
+
+def validate_notes_keys(notes):
+    for k in notes.keys():
+        # context_* keys carry URL-specific context and are valid per the
+        # v1.5 URL Meta spec
+        assert k in VALID_NOTES_JSON_KEYS or k.startswith(
+            "context_"
+        ), f"invalid notes key {k}"
+
+
+ERR_NOSLASH = "No trailing slash"
+
+
+def check(url):
+    if not VALID_URL.match(url):
+        return "No match"
+    elif any([c in url for c in BAD_CHARS]):
+        return "Bad chars"
+    elif url != url.strip():
+        return "Extra spaces at ends"
+    elif urlparse(url).path == "":
+        return ERR_NOSLASH
+
+
+TEST_LISTS_HEADER = [
+    "url",
+    "category_code",
+    "category_description",
+    "date_added",
+    "source",
+    "notes",
+]
+
+
+class TestListProcessor:
+    def __init__(
+        self,
+        csv_path,
+        category_codes,
+        global_urls_bag,
+        writer_quotechar,
+        fix_duplicates=False,
+        fix_slash=False,
+        fix_notes=False,
+    ):
+        self.csv_path = csv_path
+        self.global_urls_bag = global_urls_bag
+        self.fix_duplicates = fix_duplicates
+        self.fix_slash = fix_slash
+        self.fix_notes = fix_notes
+        self.category_codes = category_codes
+
+        self.rows = [TEST_LISTS_HEADER]
+        self.urls_bag = set()
+        self.errors = []
+        self.idx = -1
+        self.writer_quotechar = writer_quotechar
+
+    def open(self):
+        return open(self.csv_path, "r", encoding="utf-8")
+
+    def process_row(self, idx, row):
+        if len(row) != 6:
+            self.errors.append(InvalidColumnNumber(self.csv_path, idx + 2))
+            return
+
+        url, cat_code, cat_desc, date_added, source, notes = row
+        err = check(url)
+        if err:
+            self.errors.append(InvalidURL(url, self.csv_path, idx + 2, details=err))
+            if err == ERR_NOSLASH and self.fix_slash:
+                row[0] = row[0] + "/"
+
+        if notes.startswith("{"):
+            try:
+                d = json.loads(notes)
+                validate_notes_keys(d)
+            except AssertionError as exc:
+                # report the offending notes value, with the assertion
+                # message as details
+                self.errors.append(
+                    InvalidNotes(notes, self.csv_path, idx + 2, details=str(exc))
+                )
+            except json.decoder.JSONDecodeError:
+                self.errors.append(
+                    InvalidNotes(notes, self.csv_path, idx + 2, details="invalid JSON")
+                )
+        elif self.fix_notes:
+            row[5] = json.dumps({"notes": notes})
+
+        if os.path.basename(self.csv_path) != "global.csv":
+            if url in self.global_urls_bag:
+                self.errors.append(
+                    DuplicateURLWithGlobalList(url, self.csv_path, idx + 2)
+                )
+                if self.fix_duplicates:
+                    return
+
+        try:
+            cat_description = self.category_codes[cat_code]
+        except KeyError:
+            cat_description = None
+            self.errors.append(InvalidCategoryCode(cat_code, self.csv_path, idx + 2))
+        if cat_description is not None and cat_description != cat_desc:
+            self.errors.append(InvalidCategoryDesc(cat_desc, self.csv_path, idx + 2))
+        if url in self.urls_bag:
+            if not self.fix_duplicates:
+                self.errors.append(DuplicateURL(url, self.csv_path, idx + 2))
+            return
+
+        if not is_valid_date(date_added):
+            self.errors.append(InvalidDate(date_added, self.csv_path, idx + 2))
+        if any([c in notes for c in BAD_CHARS]):
+            self.errors.append(InvalidNotes(notes, self.csv_path, idx + 2))
+        if any([c in source for c in BAD_CHARS]):
+            self.errors.append(InvalidSource(source, self.csv_path, idx + 2))
+        self.urls_bag.add(url)
+
+        self.rows.append(row)
+
+    def write_fixed(self):
+        with open(self.csv_path + ".fixed", "w") as out_file:
+            csv_writer = csv.writer(
+                out_file,
+                quoting=csv.QUOTE_MINIMAL,
+                quotechar=self.writer_quotechar,
+                lineterminator="\n",
+            )
+            csv_writer.writerows(self.rows)
+        os.rename(self.csv_path + ".fixed", self.csv_path)
+
+
+def lint_lists(
+    lists_path,
+    fix_duplicates=False,
+    fix_slash=False,
+    fix_notes=False,
+    force_update=False,
+    reader_quotechar='"',
+    writer_quotechar='"',
+):
+    all_errors = []
+    total_urls = 0
+    total_countries = 0
+
+    category_codes = load_categories(
+        os.path.join(lists_path, NEW_CATEGORY_CODES), get_new_description_code
+    )
+
+    # preload the global list to check against looking for dupes
+    global_urls_bag = load_global_list(
+        os.path.join(lists_path, "global.csv"), quotechar=reader_quotechar
+    )
+    for csv_path in glob(os.path.join(lists_path, "*")):
+        if os.path.basename(csv_path).startswith("00-"):
+            continue
+        if not csv_path.endswith(".csv"):
+            continue
+        processor = TestListProcessor(
+            csv_path,
+            category_codes=category_codes,
+            global_urls_bag=global_urls_bag,
+            fix_duplicates=fix_duplicates,
+            fix_notes=fix_notes,
+            fix_slash=fix_slash,
+            writer_quotechar=writer_quotechar,
+        )
+        with processor.open() as in_file:
+            reader = csv.reader(in_file, delimiter=",", quotechar=reader_quotechar)
+            first_line = next(reader)
+            if first_line != TEST_LISTS_HEADER:
+                processor.errors.append(InvalidHeader(csv_path, 0))
+
+            idx = -1  # keep the URL count below correct for lists with no rows
+            for idx, row in enumerate(reader):
+                processor.process_row(idx, row)
+
+        if fix_slash or fix_duplicates or fix_notes or force_update:
+            processor.write_fixed()
+
+        print(f"* {processor.csv_path}")
+        print(f"  {idx+1} URLs")
+        print(f"  {len(processor.errors)} Errors")
+
+        all_errors += processor.errors
+        total_urls += idx + 1
+        total_countries += 1
+
+    print("----------")
+    print("Analyzed {} URLs in {} countries".format(total_urls, total_countries))
+    if len(all_errors) == 0:
+        print("ALL OK")
+        sys.exit(0)
+
+    print("{} errors present".format(len(all_errors)))
+    for error in all_errors:
+        error.print()
+    sys.exit(1)
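
As a quick sanity check of the new entry point, here is a minimal pytest sketch (a hypothetical `tests/test_cli.py`, not part of this diff) that drives the Click CLI against a throwaway lists directory; the legend filename and column header mirror the constants in `lint_lists.py`:

```python
from pathlib import Path

from click.testing import CliRunner

from test_lists.cli import cli

HEADER = "url,category_code,category_description,date_added,source,notes\n"


def test_lint_lists_ok(tmp_path: Path):
    # Legend file read by load_categories(): a header row, then description,code.
    (tmp_path / "00-LEGEND-new_category_codes.csv").write_text(
        "category_description,category_code\nCulture,CULTR\n"
    )
    # A single well-formed global list entry.
    (tmp_path / "global.csv").write_text(
        HEADER + "https://www.example.com/,CULTR,Culture,2017-04-12,ooni,\n"
    )
    result = CliRunner().invoke(cli, ["lint-lists", str(tmp_path)])
    assert result.exit_code == 0, result.output
```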