From 2976781f553bd4582d08be1d579ed54d9633c6ed Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sat, 19 Oct 2024 21:29:49 +1100 Subject: [PATCH 01/23] implement quality checker implement the quality checker that reports errors for - wrong escape characters - wrong starting letters - presence of non-utf-8 characters and reports warning for - duplicate entries - same full forms - same abbreviations - outdated 'Manage' abbreviation --- scripts/check_quality.py | 90 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 scripts/check_quality.py diff --git a/scripts/check_quality.py b/scripts/check_quality.py new file mode 100644 index 0000000..dcf599b --- /dev/null +++ b/scripts/check_quality.py @@ -0,0 +1,90 @@ +import os +import re +import sys +import itertools +import csv + +# Path to the journals folder (change this path accordingly) +JOURNALS_FOLDER_PATH = "./journals/" + +# Error tracking +def error(message): + print(f"ERROR: {message}") + sys.exit(1) + +# Warning tracking +def warning(message): + print(f"WARN: {message}") + +# Check if non-UTF8 characters are present in the file +def check_non_utf8_characters(filepath): + try: + with open(filepath, 'r', encoding='utf-8') as f: + f.read() + except UnicodeDecodeError: + error(f"File {filepath} contains non-UTF8 characters") + +# Check if there are wrong escape characters in abbreviation entries +def check_wrong_escape(filepath): + with open(filepath, 'r', encoding='utf-8') as f: + reader = csv.reader(f) + for line_number, row in enumerate(reader, start=1): + for field in row: + if re.search(r"[a-zA-Z]*\\[,\"]", field): + error(f"Wrong escape character found in file {filepath} at line {line_number}: {field}") + +# Check for wrong beginning letters in journal abbreviations +def check_wrong_beginning_letters(filepath): + with open(filepath, 'r', encoding='utf-8') as f: + reader = csv.reader(f) + for line_number, row in enumerate(reader, start=1): + if row[0].startswith("\""): + 
error(f"Wrong beginning letter found in file {filepath} at line {line_number}: {row[0]}") + +# Check for duplicate entries +def check_duplicates(filepath): + entries = set() + with open(filepath, 'r', encoding='utf-8') as f: + reader = csv.reader(f) + for line_number, row in enumerate(reader, start=1): + line = ','.join(row) + if line in entries: + warning(f"Duplicate entry found in file {filepath} at line {line_number}: {line}") + else: + entries.add(line) + +# Check if abbreviation and full form are the same +def check_full_form_identical_to_abbreviation(filepath): + with open(filepath, 'r', encoding='utf-8') as f: + reader = csv.reader(f) + for line_number, row in enumerate(reader, start=1): + if len(row) == 2 and row[0].strip() == row[1].strip(): + warning(f"Abbreviation is the same as full form in file {filepath} at line {line_number}: {row[0]}") + +# Check for outdated abbreviations +def check_outdated_abbreviations(filepath): + with open(filepath, 'r', encoding='utf-8') as f: + reader = csv.reader(f) + for line_number, row in enumerate(reader, start=1): + if "Manage." in row and "Manag." not in row: + warning(f"Outdated abbreviation used in file {filepath} at line {line_number}: {','.join(row)}") + +if __name__ == "__main__": + if not os.path.exists(JOURNALS_FOLDER_PATH): + print("Journals folder not found. 
Please make sure the path is correct.") + sys.exit(1) + + # Iterate through all CSV files in the journals folder + for filename in os.listdir(JOURNALS_FOLDER_PATH): + if filename.endswith(".csv"): + filepath = os.path.join(JOURNALS_FOLDER_PATH, filename) + + # Run the checks + check_non_utf8_characters(filepath) + check_wrong_escape(filepath) + check_wrong_beginning_letters(filepath) + check_duplicates(filepath) + check_full_form_identical_to_abbreviation(filepath) + check_outdated_abbreviations(filepath) + + print("Quality check completed.") \ No newline at end of file From 0ebac9db4e0c67f407a8e8b480e3ed994cff2d82 Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sat, 19 Oct 2024 21:29:51 +1100 Subject: [PATCH 02/23] Create quality_checker.py --- scripts/quality_checker.py | 100 +++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 scripts/quality_checker.py diff --git a/scripts/quality_checker.py b/scripts/quality_checker.py new file mode 100644 index 0000000..e433b30 --- /dev/null +++ b/scripts/quality_checker.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 + +""" +Python script for checking multiple quality aspects of .csv journal abbreviation files. +This script enforces conventions to ensure that abbreviations of journal titles meet quality standards. + +The script performs the following checks: +1. Checks for wrong escape sequences. +2. Checks for incorrect beginning letters. +3. Checks for non-UTF8 characters. +4. Checks for duplicate entries (full names and abbreviations). +5. Checks if abbreviations match full names (for one-word titles). +6. Checks for outdated abbreviations. + +The script will print out issues found and exit with a failure code if any issues are detected. +The script does NOT automatically fix these errors. Corrections must be done manually. + +The script will automatically run whenever there is a push to the main branch of the +abbreviations repo (abbrv.jabref.org) using GitHub Actions. 
+""" + +import os +import itertools +import csv +import re +import sys + +# Define paths and file collections +PATH_TO_JOURNALS = "./journals/" +fileNames = next(itertools.islice(os.walk(PATH_TO_JOURNALS), 0, None))[2] + +# Error collections +errors = [] + +# Utility functions for checking conditions +def is_utf8(text): + try: + text.encode('utf-8') + return True + except UnicodeEncodeError: + return False + +def check_abbreviation_duplicates(full_name, abbrev, seen_full_names, seen_abbrevs): + if full_name in seen_full_names or abbrev in seen_abbrevs: + return True + return False + +def is_outdated_abbreviation(abbrev): + # Add a basic rule to detect outdated abbreviations (e.g., "Manage." instead of "Manag.") + outdated_patterns = [r"Manage\.\b"] + for pattern in outdated_patterns: + if re.search(pattern, abbrev): + return True + return False + +# Perform checks +for file in fileNames: + if file.endswith(".csv"): + with open(PATH_TO_JOURNALS + file, mode='r', encoding='utf-8') as f: + reader = csv.reader(f) + seen_full_names = set() + seen_abbrevs = set() + + for row_index, row in enumerate(reader, start=1): + if len(row) < 2: + continue # Skip rows without expected data + + full_name, abbrev = row[0], row[1] + + # Check for escaped ampersands + if '\\&' in full_name or '\\&' in abbrev: + errors.append(f"Escaped ampersand in file {file}, row {row_index}") + + # Check for non-UTF8 characters + if not is_utf8(full_name) or not is_utf8(abbrev): + errors.append(f"Non-UTF8 character in file {file}, row {row_index}") + + # Check for duplicate entries + if check_abbreviation_duplicates(full_name, abbrev, seen_full_names, seen_abbrevs): + errors.append(f"Duplicate entry in file {file}, row {row_index}") + + # Check if abbreviation matches full form for one-word titles + if full_name.strip().lower() == abbrev.strip().lower(): + errors.append(f"Full form matches abbreviation in file {file}, row {row_index}") + + # Check for outdated abbreviations + if 
is_outdated_abbreviation(abbrev): + errors.append(f"Outdated abbreviation in file {file}, row {row_index}") + + # Update seen sets + seen_full_names.add(full_name) + seen_abbrevs.add(abbrev) + +# Print errors and exit with failure code if any issues found +if errors: + error_message = "Quality check failed:\n" + "\n".join(errors) + print(error_message, file=sys.stderr) + sys.exit(1) +else: + print("Quality check passed. No issues found.") From 5c3f483f835159ad9ee69a8c34160ef5b8d146e2 Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sat, 19 Oct 2024 21:58:58 +1100 Subject: [PATCH 03/23] fix early stop in check_quality.py prevent the script from stopping by error-triggered exit --- scripts/check_quality.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/scripts/check_quality.py b/scripts/check_quality.py index dcf599b..1bc0a71 100644 --- a/scripts/check_quality.py +++ b/scripts/check_quality.py @@ -7,10 +7,10 @@ # Path to the journals folder (change this path accordingly) JOURNALS_FOLDER_PATH = "./journals/" +errors = [] # Error tracking def error(message): - print(f"ERROR: {message}") - sys.exit(1) + errors.append(f"ERROR: {message}") # Warning tracking def warning(message): @@ -31,7 +31,7 @@ def check_wrong_escape(filepath): for line_number, row in enumerate(reader, start=1): for field in row: if re.search(r"[a-zA-Z]*\\[,\"]", field): - error(f"Wrong escape character found in file {filepath} at line {line_number}: {field}") + error(f"Wrong escape character found in {filepath} at line {line_number}: {field}") # Check for wrong beginning letters in journal abbreviations def check_wrong_beginning_letters(filepath): @@ -39,19 +39,19 @@ def check_wrong_beginning_letters(filepath): reader = csv.reader(f) for line_number, row in enumerate(reader, start=1): if row[0].startswith("\""): - error(f"Wrong beginning letter found in file {filepath} at line {line_number}: {row[0]}") + error(f"Wrong beginning letter found in {filepath} 
at line {line_number}: {row[0]}") # Check for duplicate entries def check_duplicates(filepath): - entries = set() + entries = {} with open(filepath, 'r', encoding='utf-8') as f: reader = csv.reader(f) for line_number, row in enumerate(reader, start=1): line = ','.join(row) if line in entries: - warning(f"Duplicate entry found in file {filepath} at line {line_number}: {line}") + warning(f"Duplicate found in {filepath} at line {line_number}: {line}, first instance seen at line {entries[line]}") else: - entries.add(line) + entries[line] = line_number # Check if abbreviation and full form are the same def check_full_form_identical_to_abbreviation(filepath): @@ -59,7 +59,7 @@ def check_full_form_identical_to_abbreviation(filepath): reader = csv.reader(f) for line_number, row in enumerate(reader, start=1): if len(row) == 2 and row[0].strip() == row[1].strip(): - warning(f"Abbreviation is the same as full form in file {filepath} at line {line_number}: {row[0]}") + warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}") # Check for outdated abbreviations def check_outdated_abbreviations(filepath): @@ -67,7 +67,7 @@ def check_outdated_abbreviations(filepath): reader = csv.reader(f) for line_number, row in enumerate(reader, start=1): if "Manage." in row and "Manag." 
not in row: - warning(f"Outdated abbreviation used in file {filepath} at line {line_number}: {','.join(row)}") + warning(f"Outdated abbreviation used in {filepath} at line {line_number}: {','.join(row)}") if __name__ == "__main__": if not os.path.exists(JOURNALS_FOLDER_PATH): @@ -87,4 +87,10 @@ def check_outdated_abbreviations(filepath): check_full_form_identical_to_abbreviation(filepath) check_outdated_abbreviations(filepath) - print("Quality check completed.") \ No newline at end of file + # Print all errors at the end + if errors: + for err in errors: + print(err) + sys.exit(1) + else: + print("Quality check completed with no errors.") \ No newline at end of file From 7229041327a370c1b2056fb6ddce2a5b32009e1e Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sat, 19 Oct 2024 23:15:44 +1100 Subject: [PATCH 04/23] Deploy checker on GitHub Action - ignore single-name journals with same abbreviation as full name - generate error summary and deploy checker on GitHub Action --- .github/workflows/quality-check.yml | 39 +++++++++++++++++++++++++++++ scripts/check_quality.py | 26 ++++++++++++++----- 2 files changed, 59 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/quality-check.yml diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml new file mode 100644 index 0000000..0589b0a --- /dev/null +++ b/.github/workflows/quality-check.yml @@ -0,0 +1,39 @@ +name: Quality Check + +on: [push, pull_request] + +jobs: + quality-check: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + pip install -r requirements.txt + + - name: Run Quality Check + run: | + python quality_checker.py + continue-on-error: true + + - name: Upload Quality Check Summary + if: always() + uses: actions/upload-artifact@v2 + with: + name: quality-check-summary + path: 
quality_check_summary.txt + + - name: Add Quality Check Summary to Job + if: always() + run: | + echo "Generating GitHub Actions job summary..." + echo 'Quality Check Summary:' >> $GITHUB_STEP_SUMMARY + cat quality_check_summary.txt >> $GITHUB_STEP_SUMMARY diff --git a/scripts/check_quality.py b/scripts/check_quality.py index 1bc0a71..7be2fab 100644 --- a/scripts/check_quality.py +++ b/scripts/check_quality.py @@ -6,15 +6,16 @@ # Path to the journals folder (change this path accordingly) JOURNALS_FOLDER_PATH = "./journals/" - +SUMMARY_FILE_PATH = "./check_quality_summary.txt" errors = [] +warnings = [] # Error tracking def error(message): errors.append(f"ERROR: {message}") # Warning tracking def warning(message): - print(f"WARN: {message}") + warnings.append(f"WARN: {message}") # Check if non-UTF8 characters are present in the file def check_non_utf8_characters(filepath): @@ -58,7 +59,7 @@ def check_full_form_identical_to_abbreviation(filepath): with open(filepath, 'r', encoding='utf-8') as f: reader = csv.reader(f) for line_number, row in enumerate(reader, start=1): - if len(row) == 2 and row[0].strip() == row[1].strip(): + if len(row) == 2 and row[0].strip() == row[1].strip() and ' ' in row[0].strip(): warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}") # Check for outdated abbreviations @@ -87,10 +88,23 @@ def check_outdated_abbreviations(filepath): check_full_form_identical_to_abbreviation(filepath) check_outdated_abbreviations(filepath) - # Print all errors at the end + # Write the summary to a file + with open(SUMMARY_FILE_PATH, 'w') as summary_file: + if errors or warnings: + summary_file.write("Quality Check Summary:\n") + if errors: + summary_file.write("\nErrors:\n") + for err in errors: + summary_file.write(f"{err}\n") + if warnings: + summary_file.write("\nWarnings:\n") + for warn in warnings: + summary_file.write(f"{warn}\n") + else: + summary_file.write("Quality check completed with no errors or 
warnings.\n") + + # Print summary and set exit code if errors: - for err in errors: - print(err) sys.exit(1) else: print("Quality check completed with no errors.") \ No newline at end of file From 215ee88c794c5a44a8ba4c793e3d41695cdf7798 Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sat, 19 Oct 2024 23:29:45 +1100 Subject: [PATCH 05/23] solve deprecation change upload-artifact@v2 to @v3 --- .github/workflows/quality-check.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml index 0589b0a..7437f1c 100644 --- a/.github/workflows/quality-check.yml +++ b/.github/workflows/quality-check.yml @@ -17,16 +17,16 @@ jobs: - name: Install dependencies run: | - pip install -r requirements.txt + pip install -r requirements.txt || true # Only if you have dependencies listed here - name: Run Quality Check run: | python quality_checker.py - continue-on-error: true + continue-on-error: true # Continue if there are warnings/errors, so we can log the output - name: Upload Quality Check Summary if: always() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: quality-check-summary path: quality_check_summary.txt @@ -34,6 +34,5 @@ jobs: - name: Add Quality Check Summary to Job if: always() run: | - echo "Generating GitHub Actions job summary..." 
- echo 'Quality Check Summary:' >> $GITHUB_STEP_SUMMARY + echo "### Quality Check Summary" >> $GITHUB_STEP_SUMMARY cat quality_check_summary.txt >> $GITHUB_STEP_SUMMARY From 7c2ca05f5872ee30ed3ed8f7f479331e599ac304 Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sat, 19 Oct 2024 23:35:32 +1100 Subject: [PATCH 06/23] solve file not found error --- .github/workflows/quality-check.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml index 7437f1c..56ecf56 100644 --- a/.github/workflows/quality-check.yml +++ b/.github/workflows/quality-check.yml @@ -21,15 +21,15 @@ jobs: - name: Run Quality Check run: | - python quality_checker.py + python ./scripts/check_quality.py continue-on-error: true # Continue if there are warnings/errors, so we can log the output - name: Upload Quality Check Summary if: always() uses: actions/upload-artifact@v3 with: - name: quality-check-summary - path: quality_check_summary.txt + name: check-quality-summary + path: ./check_quality_summary.txt - name: Add Quality Check Summary to Job if: always() From 6c02bba43aab58357a4e9dcc4130fa773fda354e Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sat, 19 Oct 2024 23:37:28 +1100 Subject: [PATCH 07/23] fix mismatched file name fix mismatched quality check file name --- .github/workflows/quality-check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml index 56ecf56..2108c5b 100644 --- a/.github/workflows/quality-check.yml +++ b/.github/workflows/quality-check.yml @@ -35,4 +35,4 @@ jobs: if: always() run: | echo "### Quality Check Summary" >> $GITHUB_STEP_SUMMARY - cat quality_check_summary.txt >> $GITHUB_STEP_SUMMARY + cat check_quality_summary.txt >> $GITHUB_STEP_SUMMARY From 9989eb73deeafacc08bb9a711d88564f1235b64f Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sun, 20 Oct 2024 00:31:28 +1100 Subject: [PATCH 08/23] 
format output structure provide better visualisation of error/warning output --- .github/workflows/quality-check.yml | 4 -- scripts/check_quality.py | 75 +++++++++++++++++++++-------- 2 files changed, 55 insertions(+), 24 deletions(-) diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml index 2108c5b..bd5193d 100644 --- a/.github/workflows/quality-check.yml +++ b/.github/workflows/quality-check.yml @@ -15,10 +15,6 @@ jobs: with: python-version: '3.x' - - name: Install dependencies - run: | - pip install -r requirements.txt || true # Only if you have dependencies listed here - - name: Run Quality Check run: | python ./scripts/check_quality.py diff --git a/scripts/check_quality.py b/scripts/check_quality.py index 7be2fab..6c5559e 100644 --- a/scripts/check_quality.py +++ b/scripts/check_quality.py @@ -9,13 +9,27 @@ SUMMARY_FILE_PATH = "./check_quality_summary.txt" errors = [] warnings = [] + +# Error and Warning Counts +error_counts = { + 'ERROR Wrong Escape': 0, + 'ERROR Wrong Starting Letter': 0, + 'ERROR Non-UTF8': 0 +} +warning_counts = { + 'WARN Duplicate FullName/Abbreviation': 0, + 'WARN Same Abbreviation as Full Name': 0, + 'WARN Outdated Manage Abbreviation': 0 +} # Error tracking -def error(message): - errors.append(f"ERROR: {message}") +def error(message, error_type): + errors.append((error_type, f"ERROR: {message}")) + error_counts[error_type] += 1 # Warning tracking -def warning(message): - warnings.append(f"WARN: {message}") +def warning(message, warning_type): + warnings.append((warning_type, f"WARN: {message}")) + warning_counts[warning_type] += 1 # Check if non-UTF8 characters are present in the file def check_non_utf8_characters(filepath): @@ -23,7 +37,7 @@ def check_non_utf8_characters(filepath): with open(filepath, 'r', encoding='utf-8') as f: f.read() except UnicodeDecodeError: - error(f"File {filepath} contains non-UTF8 characters") + error(f"File {filepath} contains non-UTF8 characters", 'ERROR Non-UTF8') # 
Check if there are wrong escape characters in abbreviation entries def check_wrong_escape(filepath): @@ -32,7 +46,7 @@ def check_wrong_escape(filepath): for line_number, row in enumerate(reader, start=1): for field in row: if re.search(r"[a-zA-Z]*\\[,\"]", field): - error(f"Wrong escape character found in {filepath} at line {line_number}: {field}") + error(f"Wrong escape character found in {filepath} at line {line_number}: {field}", 'ERROR Wrong Escape') # Check for wrong beginning letters in journal abbreviations def check_wrong_beginning_letters(filepath): @@ -40,7 +54,7 @@ def check_wrong_beginning_letters(filepath): reader = csv.reader(f) for line_number, row in enumerate(reader, start=1): if row[0].startswith("\""): - error(f"Wrong beginning letter found in {filepath} at line {line_number}: {row[0]}") + error(f"Wrong beginning letter found in {filepath} at line {line_number}: {row[0]}", 'ERROR Wrong Starting Letter') # Check for duplicate entries def check_duplicates(filepath): @@ -50,7 +64,7 @@ def check_duplicates(filepath): for line_number, row in enumerate(reader, start=1): line = ','.join(row) if line in entries: - warning(f"Duplicate found in {filepath} at line {line_number}: {line}, first instance seen at line {entries[line]}") + warning(f"Duplicate found in {filepath} at line {line_number}: {line}, first instance seen at line {entries[line]}", 'WARN Duplicate FullName/Abbreviation') else: entries[line] = line_number @@ -60,7 +74,7 @@ def check_full_form_identical_to_abbreviation(filepath): reader = csv.reader(f) for line_number, row in enumerate(reader, start=1): if len(row) == 2 and row[0].strip() == row[1].strip() and ' ' in row[0].strip(): - warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}") + warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}", 'WARN Same Abbreviation as Full Name') # Check for outdated abbreviations def check_outdated_abbreviations(filepath): 
@@ -68,7 +82,7 @@ def check_outdated_abbreviations(filepath): reader = csv.reader(f) for line_number, row in enumerate(reader, start=1): if "Manage." in row and "Manag." not in row: - warning(f"Outdated abbreviation used in {filepath} at line {line_number}: {','.join(row)}") + warning(f"Outdated abbreviation used in {filepath} at line {line_number}: {','.join(row)}", 'WARN Outdated Manage Abbreviation') if __name__ == "__main__": if not os.path.exists(JOURNALS_FOLDER_PATH): @@ -89,19 +103,40 @@ def check_outdated_abbreviations(filepath): check_outdated_abbreviations(filepath) # Write the summary to a file + total_issues = sum(error_counts.values()) + sum(warning_counts.values()) with open(SUMMARY_FILE_PATH, 'w') as summary_file: + # Write summary table with vertical headers + summary_file.write(f"Total: {total_issues}\n") + summary_file.write(f"ERROR Wrong Escape: {error_counts['ERROR Wrong Escape']}\n") + summary_file.write(f"ERROR Wrong Starting Letter: {error_counts['ERROR Wrong Starting Letter']}\n") + summary_file.write(f"ERROR Non-UTF8: {error_counts['ERROR Non-UTF8']}\n") + summary_file.write(f"WARN Duplicate FullName/Abbreviation: {warning_counts['WARN Duplicate FullName/Abbreviation']}\n") + summary_file.write(f"WARN Same Abbreviation as Full Name: {warning_counts['WARN Same Abbreviation as Full Name']}\n") + summary_file.write(f"WARN Outdated Manage Abbreviation: {warning_counts['WARN Outdated Manage Abbreviation']}\n") + + # Write detailed errors and warnings if errors or warnings: - summary_file.write("Quality Check Summary:\n") - if errors: - summary_file.write("\nErrors:\n") - for err in errors: - summary_file.write(f"{err}\n") - if warnings: - summary_file.write("\nWarnings:\n") - for warn in warnings: - summary_file.write(f"{warn}\n") + summary_file.write("\nQuality Check Summary:\n") + for subtitle in [ + 'ERROR Wrong Escape', + 'ERROR Wrong Starting Letter', + 'ERROR Non-UTF8', + 'WARN Duplicate FullName/Abbreviation', + 'WARN Same Abbreviation as 
Full Name', + 'WARN Outdated Manage Abbreviation' + ]: + # Write subtitle and corresponding messages + filtered_errors = [err for err_type, err in errors if err_type == subtitle] + filtered_warnings = [warn for warn_type, warn in warnings if warn_type == subtitle] + if filtered_errors or filtered_warnings: + count = len(filtered_errors) + len(filtered_warnings) + summary_file.write(f"\n{subtitle}: with {count} instances\n") + for err in filtered_errors: + summary_file.write(f"{err}\n") + for warn in filtered_warnings: + summary_file.write(f"{warn}\n") else: - summary_file.write("Quality check completed with no errors or warnings.\n") + summary_file.write("\nQuality check completed with no errors or warnings.\n") # Print summary and set exit code if errors: From c4f615f82b2c827d7b148b68d60595f633fc3d7d Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sun, 20 Oct 2024 05:17:17 +1100 Subject: [PATCH 09/23] consolidate quality check functions - enhance invalid escape character check - group full name duplication and abbreviation duplication into same warning - ignore articles and preposition in check wrong beginning letters --- scripts/check_quality.py | 69 ++++++++++++++++++++++++++++++++-------- 1 file changed, 55 insertions(+), 14 deletions(-) diff --git a/scripts/check_quality.py b/scripts/check_quality.py index 6c5559e..dea1557 100644 --- a/scripts/check_quality.py +++ b/scripts/check_quality.py @@ -35,38 +35,79 @@ def warning(message, warning_type): def check_non_utf8_characters(filepath): try: with open(filepath, 'r', encoding='utf-8') as f: - f.read() - except UnicodeDecodeError: - error(f"File {filepath} contains non-UTF8 characters", 'ERROR Non-UTF8') + for line_number, line in enumerate(f, start=1): + try: + line.encode('utf-8') + except UnicodeEncodeError as e: + error(f"Non-UTF8 character found in {filepath} at line {line_number}: {e}", 'ERROR Non-UTF8') + except UnicodeDecodeError as e: + error(f"File {filepath} contains non-UTF-8 characters: {e}", 'ERROR 
Non-UTF8') # Check if there are wrong escape characters in abbreviation entries def check_wrong_escape(filepath): + valid_escapes = {'\\', '\n', '\t', '\r', '\"'} with open(filepath, 'r', encoding='utf-8') as f: reader = csv.reader(f) for line_number, row in enumerate(reader, start=1): for field in row: - if re.search(r"[a-zA-Z]*\\[,\"]", field): - error(f"Wrong escape character found in {filepath} at line {line_number}: {field}", 'ERROR Wrong Escape') + matches = re.findall(r"\\.", field) + for match in matches: + if match not in valid_escapes: + error(f"Wrong escape character found in {filepath} at line {line_number}: {field}", 'ERROR Wrong Escape') # Check for wrong beginning letters in journal abbreviations def check_wrong_beginning_letters(filepath): + # Words that are typically ignored when creating abbreviations + ignore_words = {'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'la', 'el', 'le', 'et'} + with open(filepath, 'r', encoding='utf-8') as f: reader = csv.reader(f) for line_number, row in enumerate(reader, start=1): - if row[0].startswith("\""): - error(f"Wrong beginning letter found in {filepath} at line {line_number}: {row[0]}", 'ERROR Wrong Starting Letter') + if len(row) < 2: # Skip if row doesn't have both full name and abbreviation + continue + + full_name = row[0].strip() + abbreviation = row[1].strip() + + # Skip empty entries + if not full_name or not abbreviation: + continue + + # Get significant words from full name (ignoring articles, prepositions, etc.) 
+ full_words = [word for word in full_name.split() + if word.lower() not in ignore_words] + abbrev_words = abbreviation.split() + + # Skip if either is empty after filtering + if not full_words or not abbrev_words: + continue + + # Check if abbreviation starts with the same letter as the first significant word + if not abbrev_words[0].lower().startswith(full_words[0][0].lower()): + error(f"Wrong beginning letter found in {filepath} at line {line_number} " f"Full: '{full_name}', Abbrev: '{abbreviation}')", + 'ERROR Wrong Starting Letter') + + # Check for duplicate entries def check_duplicates(filepath): - entries = {} + full_name_entries = {} + abbreviation_entries = {} with open(filepath, 'r', encoding='utf-8') as f: reader = csv.reader(f) for line_number, row in enumerate(reader, start=1): - line = ','.join(row) - if line in entries: - warning(f"Duplicate found in {filepath} at line {line_number}: {line}, first instance seen at line {entries[line]}", 'WARN Duplicate FullName/Abbreviation') + if len(row) < 2: + continue + + full_name = row[0].strip() + abbreviation = row[1].strip() + + # Check for duplicate full names or abbreviations + if full_name in full_name_entries or abbreviation in abbreviation_entries: + warning(f"Duplicate found in {filepath} at line {line_number}: Full Name: '{full_name}', Abbreviation: '{abbreviation}', first instance seen at line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}", 'WARN Duplicate FullName/Abbreviation') else: - entries[line] = line_number + full_name_entries[full_name] = line_number + abbreviation_entries[abbreviation] = line_number # Check if abbreviation and full form are the same def check_full_form_identical_to_abbreviation(filepath): @@ -104,9 +145,9 @@ def check_outdated_abbreviations(filepath): # Write the summary to a file total_issues = sum(error_counts.values()) + sum(warning_counts.values()) - with open(SUMMARY_FILE_PATH, 'w') as summary_file: + with open(SUMMARY_FILE_PATH, 'w', 
encoding='utf-8') as summary_file: # Write summary table with vertical headers - summary_file.write(f"Total: {total_issues}\n") + summary_file.write(f"Total vulnerabilities: {total_issues}\n") summary_file.write(f"ERROR Wrong Escape: {error_counts['ERROR Wrong Escape']}\n") summary_file.write(f"ERROR Wrong Starting Letter: {error_counts['ERROR Wrong Starting Letter']}\n") summary_file.write(f"ERROR Non-UTF8: {error_counts['ERROR Non-UTF8']}\n") From 56110cbd76c0d66afd49c984f14cc2c34c20a1c8 Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sun, 20 Oct 2024 05:45:14 +1100 Subject: [PATCH 10/23] test GitHub Action make quality check action exit with code 1 if errors are present --- .github/workflows/quality-check.yml | 9 +-- scripts/check_quality.py | 86 ++++++++++------------------- 2 files changed, 33 insertions(+), 62 deletions(-) diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml index bd5193d..b654b6e 100644 --- a/.github/workflows/quality-check.yml +++ b/.github/workflows/quality-check.yml @@ -16,6 +16,7 @@ jobs: python-version: '3.x' - name: Run Quality Check + id: quality_check run: | python ./scripts/check_quality.py continue-on-error: true # Continue if there are warnings/errors, so we can log the output @@ -27,8 +28,8 @@ jobs: name: check-quality-summary path: ./check_quality_summary.txt - - name: Add Quality Check Summary to Job - if: always() + - name: Fail on Errors + if: steps.quality_check.outcome == 'failure' run: | - echo "### Quality Check Summary" >> $GITHUB_STEP_SUMMARY - cat check_quality_summary.txt >> $GITHUB_STEP_SUMMARY + echo "Quality check failed due to errors." 
+ exit 1 diff --git a/scripts/check_quality.py b/scripts/check_quality.py index dea1557..01c7159 100644 --- a/scripts/check_quality.py +++ b/scripts/check_quality.py @@ -57,36 +57,11 @@ def check_wrong_escape(filepath): # Check for wrong beginning letters in journal abbreviations def check_wrong_beginning_letters(filepath): - # Words that are typically ignored when creating abbreviations - ignore_words = {'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'la', 'el', 'le', 'et'} - with open(filepath, 'r', encoding='utf-8') as f: reader = csv.reader(f) for line_number, row in enumerate(reader, start=1): - if len(row) < 2: # Skip if row doesn't have both full name and abbreviation - continue - - full_name = row[0].strip() - abbreviation = row[1].strip() - - # Skip empty entries - if not full_name or not abbreviation: - continue - - # Get significant words from full name (ignoring articles, prepositions, etc.) - full_words = [word for word in full_name.split() - if word.lower() not in ignore_words] - abbrev_words = abbreviation.split() - - # Skip if either is empty after filtering - if not full_words or not abbrev_words: - continue - - # Check if abbreviation starts with the same letter as the first significant word - if not abbrev_words[0].lower().startswith(full_words[0][0].lower()): - error(f"Wrong beginning letter found in {filepath} at line {line_number} " f"Full: '{full_name}', Abbrev: '{abbreviation}')", - 'ERROR Wrong Starting Letter') - + if row[0].startswith("\""): + error(f"Wrong beginning letter found in {filepath} at line {line_number}: {row[0]}", 'ERROR Wrong Starting Letter') # Check for duplicate entries @@ -146,41 +121,36 @@ def check_outdated_abbreviations(filepath): # Write the summary to a file total_issues = sum(error_counts.values()) + sum(warning_counts.values()) with open(SUMMARY_FILE_PATH, 'w', encoding='utf-8') as summary_file: - # Write summary table with vertical headers - summary_file.write(f"Total 
vulnerabilities: {total_issues}\n") - summary_file.write(f"ERROR Wrong Escape: {error_counts['ERROR Wrong Escape']}\n") - summary_file.write(f"ERROR Wrong Starting Letter: {error_counts['ERROR Wrong Starting Letter']}\n") - summary_file.write(f"ERROR Non-UTF8: {error_counts['ERROR Non-UTF8']}\n") - summary_file.write(f"WARN Duplicate FullName/Abbreviation: {warning_counts['WARN Duplicate FullName/Abbreviation']}\n") - summary_file.write(f"WARN Same Abbreviation as Full Name: {warning_counts['WARN Same Abbreviation as Full Name']}\n") - summary_file.write(f"WARN Outdated Manage Abbreviation: {warning_counts['WARN Outdated Manage Abbreviation']}\n") + # Write summary table with visual symbols + summary_file.write("# Quality Check Summary Report\n\n") + summary_file.write("| Status | Count |\n") + summary_file.write("| ------------- | ----- |\n") + summary_file.write(f"| πŸ” Total Issues | {total_issues} |\n") + summary_file.write(f"| ❌ Errors Found | {sum(error_counts.values())} |\n") + summary_file.write(f"| ⚠️ Warnings Found | {sum(warning_counts.values())} |\n\n") # Write detailed errors and warnings if errors or warnings: - summary_file.write("\nQuality Check Summary:\n") - for subtitle in [ - 'ERROR Wrong Escape', - 'ERROR Wrong Starting Letter', - 'ERROR Non-UTF8', - 'WARN Duplicate FullName/Abbreviation', - 'WARN Same Abbreviation as Full Name', - 'WARN Outdated Manage Abbreviation' - ]: - # Write subtitle and corresponding messages - filtered_errors = [err for err_type, err in errors if err_type == subtitle] - filtered_warnings = [warn for warn_type, warn in warnings if warn_type == subtitle] - if filtered_errors or filtered_warnings: - count = len(filtered_errors) + len(filtered_warnings) - summary_file.write(f"\n{subtitle}: with {count} instances\n") - for err in filtered_errors: - summary_file.write(f"{err}\n") - for warn in filtered_warnings: - summary_file.write(f"{warn}\n") + summary_file.write("## Errors per Input File\n\n") + files = set([msg.split(' 
in ')[1].split(' at ')[0] for _, msg in errors + warnings]) + for file in files: + summary_file.write(f"### Issues in file `{file}`\n") + file_errors = [msg for err_type, msg in errors if file in msg] + file_warnings = [msg for warn_type, msg in warnings if file in msg] + if file_errors: + summary_file.write("#### Errors:\n") + for err in file_errors: + summary_file.write(f"- {err.split('ERROR: ')[1]}\n") + if file_warnings: + summary_file.write("#### Warnings:\n") + for warn in file_warnings: + summary_file.write(f"- {warn.split('WARN: ')[1]}\n") + summary_file.write("\n") else: - summary_file.write("\nQuality check completed with no errors or warnings.\n") + summary_file.write("Quality check completed with no errors or warnings.\n") # Print summary and set exit code - if errors: - sys.exit(1) + if sum(error_counts.values()) > 0: + sys.exit(1) # Fail with an exit code if errors are found else: - print("Quality check completed with no errors.") \ No newline at end of file + sys.exit(0) # Exit successfully if no errors \ No newline at end of file From ba6169f6d3ee87d408f747044ba7307d95689fc5 Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sun, 20 Oct 2024 05:54:45 +1100 Subject: [PATCH 11/23] print error messages on Git Action print out error and warning messages on GitHub Action under quality check --- .github/workflows/quality-check.yml | 4 +- scripts/check_quality.py | 94 +++++++++++++++++++---------- 2 files changed, 66 insertions(+), 32 deletions(-) diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml index b654b6e..7606113 100644 --- a/.github/workflows/quality-check.yml +++ b/.github/workflows/quality-check.yml @@ -27,7 +27,9 @@ jobs: with: name: check-quality-summary path: ./check_quality_summary.txt - + - name: Print Errors and Warnings Summary + if: failure() + run: cat check_quality_summary.txt - name: Fail on Errors if: steps.quality_check.outcome == 'failure' run: | diff --git a/scripts/check_quality.py 
b/scripts/check_quality.py index 01c7159..bbbf940 100644 --- a/scripts/check_quality.py +++ b/scripts/check_quality.py @@ -57,11 +57,35 @@ def check_wrong_escape(filepath): # Check for wrong beginning letters in journal abbreviations def check_wrong_beginning_letters(filepath): + # Words that are typically ignored when creating abbreviations + ignore_words = {'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'la', 'el', 'le', 'et'} + with open(filepath, 'r', encoding='utf-8') as f: reader = csv.reader(f) for line_number, row in enumerate(reader, start=1): - if row[0].startswith("\""): - error(f"Wrong beginning letter found in {filepath} at line {line_number}: {row[0]}", 'ERROR Wrong Starting Letter') + if len(row) < 2: # Skip if row doesn't have both full name and abbreviation + continue + + full_name = row[0].strip() + abbreviation = row[1].strip() + + # Skip empty entries + if not full_name or not abbreviation: + continue + + # Get significant words from full name (ignoring articles, prepositions, etc.) 
+ full_words = [word for word in full_name.split() + if word.lower() not in ignore_words] + abbrev_words = abbreviation.split() + + # Skip if either is empty after filtering + if not full_words or not abbrev_words: + continue + + # Check if abbreviation starts with the same letter as the first significant word + if not abbrev_words[0].lower().startswith(full_words[0][0].lower()): + error(f"Wrong beginning letter found in {filepath} at line {line_number} " f"Full: '{full_name}', Abbrev: '{abbreviation}')", + 'ERROR Wrong Starting Letter') # Check for duplicate entries @@ -120,36 +144,44 @@ def check_outdated_abbreviations(filepath): # Write the summary to a file total_issues = sum(error_counts.values()) + sum(warning_counts.values()) + summary_output = [] + + summary_output.append("# Quality Check Summary Report\n") + summary_output.append("| Status | Count |\n") + summary_output.append("| ------------- | ----- |\n") + summary_output.append(f"| πŸ” Total Issues | {total_issues} |\n") + summary_output.append(f"| ❌ Errors Found | {sum(error_counts.values())} |\n") + summary_output.append(f"| ⚠️ Warnings Found | {sum(warning_counts.values())} |\n\n") + + # Write detailed errors and warnings + if errors or warnings: + summary_output.append("## Errors per Input File\n\n") + files = set([msg.split(' in ')[1].split(' at ')[0] for _, msg in errors + warnings]) + for file in files: + summary_output.append(f"### Issues in file `{file}`\n") + file_errors = [msg for err_type, msg in errors if file in msg] + file_warnings = [msg for warn_type, msg in warnings if file in msg] + if file_errors: + summary_output.append("#### Errors:\n") + for err in file_errors: + summary_output.append(f"- {err.split('ERROR: ')[1]}\n") + if file_warnings: + summary_output.append("#### Warnings:\n") + for warn in file_warnings: + summary_output.append(f"- {warn.split('WARN: ')[1]}\n") + summary_output.append("\n") + else: + summary_output.append("Quality check completed with no errors or 
warnings.\n") + + # Write to summary file with open(SUMMARY_FILE_PATH, 'w', encoding='utf-8') as summary_file: - # Write summary table with visual symbols - summary_file.write("# Quality Check Summary Report\n\n") - summary_file.write("| Status | Count |\n") - summary_file.write("| ------------- | ----- |\n") - summary_file.write(f"| πŸ” Total Issues | {total_issues} |\n") - summary_file.write(f"| ❌ Errors Found | {sum(error_counts.values())} |\n") - summary_file.write(f"| ⚠️ Warnings Found | {sum(warning_counts.values())} |\n\n") - - # Write detailed errors and warnings - if errors or warnings: - summary_file.write("## Errors per Input File\n\n") - files = set([msg.split(' in ')[1].split(' at ')[0] for _, msg in errors + warnings]) - for file in files: - summary_file.write(f"### Issues in file `{file}`\n") - file_errors = [msg for err_type, msg in errors if file in msg] - file_warnings = [msg for warn_type, msg in warnings if file in msg] - if file_errors: - summary_file.write("#### Errors:\n") - for err in file_errors: - summary_file.write(f"- {err.split('ERROR: ')[1]}\n") - if file_warnings: - summary_file.write("#### Warnings:\n") - for warn in file_warnings: - summary_file.write(f"- {warn.split('WARN: ')[1]}\n") - summary_file.write("\n") - else: - summary_file.write("Quality check completed with no errors or warnings.\n") - - # Print summary and set exit code + summary_file.writelines(summary_output) + + # Print the summary to console + for line in summary_output: + print(line, end='') + + # Set exit code based on errors if sum(error_counts.values()) > 0: sys.exit(1) # Fail with an exit code if errors are found else: From 8918f171747c30f5a0ba2499b8f45913f6291791 Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sun, 20 Oct 2024 05:56:58 +1100 Subject: [PATCH 12/23] fixed path for check_quality_summary.txt --- .github/workflows/quality-check.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/quality-check.yml 
b/.github/workflows/quality-check.yml index 7606113..b43f9d0 100644 --- a/.github/workflows/quality-check.yml +++ b/.github/workflows/quality-check.yml @@ -27,9 +27,11 @@ jobs: with: name: check-quality-summary path: ./check_quality_summary.txt + - name: Print Errors and Warnings Summary if: failure() - run: cat check_quality_summary.txt + run: cat ./check_quality_summary.txt + - name: Fail on Errors if: steps.quality_check.outcome == 'failure' run: | From 238cfd6c3449a329c826f52fac41ebb8f0d2fbaf Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sun, 20 Oct 2024 06:04:36 +1100 Subject: [PATCH 13/23] test force print error summary --- .github/workflows/quality-check.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml index b43f9d0..7114560 100644 --- a/.github/workflows/quality-check.yml +++ b/.github/workflows/quality-check.yml @@ -29,9 +29,9 @@ jobs: path: ./check_quality_summary.txt - name: Print Errors and Warnings Summary - if: failure() + if: always() run: cat ./check_quality_summary.txt - + - name: Fail on Errors if: steps.quality_check.outcome == 'failure' run: | From 9cc5622e8145dbda2baed34652fbc9267be374ed Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sun, 20 Oct 2024 06:09:10 +1100 Subject: [PATCH 14/23] remove force print error summary Removed force print error summary in quality-check.yml since default GitHub Action console could not accommodate the size of error/warning summary. 
Partial error message can be seen in Run Quality Check --- .github/workflows/quality-check.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml index 7114560..b654b6e 100644 --- a/.github/workflows/quality-check.yml +++ b/.github/workflows/quality-check.yml @@ -28,10 +28,6 @@ jobs: name: check-quality-summary path: ./check_quality_summary.txt - - name: Print Errors and Warnings Summary - if: always() - run: cat ./check_quality_summary.txt - - name: Fail on Errors if: steps.quality_check.outcome == 'failure' run: | From f2970a4c6f73e23ba4323319f867de1cb22d9ba0 Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sun, 20 Oct 2024 06:26:01 +1100 Subject: [PATCH 15/23] Delete quality_checker.py deleted a redundant quality checker --- scripts/quality_checker.py | 100 ------------------------------------- 1 file changed, 100 deletions(-) delete mode 100644 scripts/quality_checker.py diff --git a/scripts/quality_checker.py b/scripts/quality_checker.py deleted file mode 100644 index e433b30..0000000 --- a/scripts/quality_checker.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python3 - -""" -Python script for checking multiple quality aspects of .csv journal abbreviation files. -This script enforces conventions to ensure that abbreviations of journal titles meet quality standards. - -The script performs the following checks: -1. Checks for wrong escape sequences. -2. Checks for incorrect beginning letters. -3. Checks for non-UTF8 characters. -4. Checks for duplicate entries (full names and abbreviations). -5. Checks if abbreviations match full names (for one-word titles). -6. Checks for outdated abbreviations. - -The script will print out issues found and exit with a failure code if any issues are detected. -The script does NOT automatically fix these errors. Corrections must be done manually. 
- -The script will automatically run whenever there is a push to the main branch of the -abbreviations repo (abbrv.jabref.org) using GitHub Actions. -""" - -import os -import itertools -import csv -import re -import sys - -# Define paths and file collections -PATH_TO_JOURNALS = "./journals/" -fileNames = next(itertools.islice(os.walk(PATH_TO_JOURNALS), 0, None))[2] - -# Error collections -errors = [] - -# Utility functions for checking conditions -def is_utf8(text): - try: - text.encode('utf-8') - return True - except UnicodeEncodeError: - return False - -def check_abbreviation_duplicates(full_name, abbrev, seen_full_names, seen_abbrevs): - if full_name in seen_full_names or abbrev in seen_abbrevs: - return True - return False - -def is_outdated_abbreviation(abbrev): - # Add a basic rule to detect outdated abbreviations (e.g., "Manage." instead of "Manag.") - outdated_patterns = [r"Manage\.\b"] - for pattern in outdated_patterns: - if re.search(pattern, abbrev): - return True - return False - -# Perform checks -for file in fileNames: - if file.endswith(".csv"): - with open(PATH_TO_JOURNALS + file, mode='r', encoding='utf-8') as f: - reader = csv.reader(f) - seen_full_names = set() - seen_abbrevs = set() - - for row_index, row in enumerate(reader, start=1): - if len(row) < 2: - continue # Skip rows without expected data - - full_name, abbrev = row[0], row[1] - - # Check for escaped ampersands - if '\\&' in full_name or '\\&' in abbrev: - errors.append(f"Escaped ampersand in file {file}, row {row_index}") - - # Check for non-UTF8 characters - if not is_utf8(full_name) or not is_utf8(abbrev): - errors.append(f"Non-UTF8 character in file {file}, row {row_index}") - - # Check for duplicate entries - if check_abbreviation_duplicates(full_name, abbrev, seen_full_names, seen_abbrevs): - errors.append(f"Duplicate entry in file {file}, row {row_index}") - - # Check if abbreviation matches full form for one-word titles - if full_name.strip().lower() == abbrev.strip().lower(): 
- errors.append(f"Full form matches abbreviation in file {file}, row {row_index}") - - # Check for outdated abbreviations - if is_outdated_abbreviation(abbrev): - errors.append(f"Outdated abbreviation in file {file}, row {row_index}") - - # Update seen sets - seen_full_names.add(full_name) - seen_abbrevs.add(abbrev) - -# Print errors and exit with failure code if any issues found -if errors: - error_message = "Quality check failed:\n" + "\n".join(errors) - print(error_message, file=sys.stderr) - sys.exit(1) -else: - print("Quality check passed. No issues found.") From 34577b5c880a7b03edada84b88a10dca4c9a7c5f Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sun, 20 Oct 2024 17:55:43 +1100 Subject: [PATCH 16/23] refine check_wrong_beginning_letters function the function now considers abbreviations valid if they are similar to full text while not strictly having the same starting letters as the full names --- scripts/check_quality.py | 125 ++++++++++++++++++++++++++++++++------- 1 file changed, 102 insertions(+), 23 deletions(-) diff --git a/scripts/check_quality.py b/scripts/check_quality.py index bbbf940..c7839de 100644 --- a/scripts/check_quality.py +++ b/scripts/check_quality.py @@ -58,34 +58,113 @@ def check_wrong_escape(filepath): # Check for wrong beginning letters in journal abbreviations def check_wrong_beginning_letters(filepath): # Words that are typically ignored when creating abbreviations - ignore_words = {'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'la', 'el', 'le', 'et'} + ignore_words = { + 'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', + 'la', 'el', 'le', 'et', 'der', 'die', 'das', 'dem', 'und', 'fΓΌr' # Articles in multiple languages + } + # Special cases for abbreviations + special_cases = { + 'proceedings': ['p', 'proc'], + 'or': ['or'], + 'spie': ['spie'], + 'notes': ['notes'] + } + + def clean_text(text): + # Remove special characters except periods (important for compound 
abbreviations) + # and normalize spaces + cleaned = re.sub(r'[^\w\s\.]', ' ', text) + return ' '.join(filter(None, cleaned.lower().split())) + + def split_compound_abbrev(abbrev): + # Split abbreviation that might contain compound parts (e.g., "Nat.forsch") + parts = [] + for part in abbrev.split(): + # Split on periods but keep them with the preceding part + subparts = [sp for sp in re.split(r'(?<=\.)(?=[^\.])', part) if sp] + parts.extend(subparts) + return parts + + def get_significant_words(text): + # Split text into words and filter out ignore words + return [w for w in clean_text(text).split() if w.lower() not in ignore_words] + + def is_compound_word_match(full_word, abbrev_part): + # Handle compound word abbreviations (e.g., "Nat.forsch" matching "Naturforschenden") + if '.' in abbrev_part: + # Split the compound abbreviation + abbrev_subparts = abbrev_part.split('.') + # Get the first few characters of the full word to match against first part + word_start = full_word[:len(abbrev_subparts[0])] + + # For the second part (if exists), try to find it within the remaining word + if len(abbrev_subparts) > 1 and abbrev_subparts[1]: + remaining_word = full_word[len(abbrev_subparts[0]):] + return (word_start.lower() == abbrev_subparts[0].lower() and + abbrev_subparts[1].lower() in remaining_word.lower()) + + return word_start.lower() == abbrev_subparts[0].lower() + return False + + def is_valid_abbreviation(full_name, abbrev): + # Clean and split both strings + full_words = get_significant_words(full_name) + abbrev_parts = split_compound_abbrev(clean_text(abbrev)) + + # Handle cases where abbreviation is the same as full name + if clean_text(full_name) == clean_text(abbrev): + return True + + # Handle special cases + for special_word, valid_abbrevs in special_cases.items(): + if special_word in full_words: + if any(va in abbrev_parts for va in valid_abbrevs): + return True + + # Track matched parts and their positions + matched_parts = 0 + used_full_words = set() + 
+ for abbrev_part in abbrev_parts: + found_match = False + + # Try matching against each full word + for i, full_word in enumerate(full_words): + if i in used_full_words: + continue + + # Check for compound word match + if is_compound_word_match(full_word, abbrev_part): + found_match = True + matched_parts += 1 + used_full_words.add(i) + break + + # Check for regular abbreviation patterns + elif (full_word.lower().startswith(abbrev_part.lower()) or + (len(abbrev_part) >= 2 and abbrev_part[0].lower() == full_word[0].lower())): + found_match = True + matched_parts += 1 + used_full_words.add(i) + break + + # Consider the abbreviation valid if we matched most parts + min_required_matches = max(1, len(abbrev_parts) * 0.5) + return matched_parts >= min_required_matches + with open(filepath, 'r', encoding='utf-8') as f: reader = csv.reader(f) for line_number, row in enumerate(reader, start=1): - if len(row) < 2: # Skip if row doesn't have both full name and abbreviation - continue + if len(row) >= 2: + full_name = row[0].strip() + abbreviation = row[1].strip() - full_name = row[0].strip() - abbreviation = row[1].strip() - - # Skip empty entries - if not full_name or not abbreviation: - continue - - # Get significant words from full name (ignoring articles, prepositions, etc.) 
- full_words = [word for word in full_name.split() - if word.lower() not in ignore_words] - abbrev_words = abbreviation.split() - - # Skip if either is empty after filtering - if not full_words or not abbrev_words: - continue - - # Check if abbreviation starts with the same letter as the first significant word - if not abbrev_words[0].lower().startswith(full_words[0][0].lower()): - error(f"Wrong beginning letter found in {filepath} at line {line_number} " f"Full: '{full_name}', Abbrev: '{abbreviation}')", - 'ERROR Wrong Starting Letter') + if not is_valid_abbreviation(full_name, abbreviation): + error( + f"Abbrev mismatch full name pattern in {filepath} at line {line_number}:" f"Full: '{full_name}', " f"Abbrev: '{abbreviation}'", + 'ERROR Wrong Starting Letter' + ) # Check for duplicate entries From c335da0d03b86413e00641a774e98a218ad1a303 Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sun, 27 Oct 2024 04:00:02 +1100 Subject: [PATCH 17/23] Improve quality checker efficiency Content of each abbreviation csv is now loaded into memory once and used by check functions, instead of being read multiple times upon function calls --- scripts/check_quality.py | 135 ++++++++++++++++++++------------------- 1 file changed, 71 insertions(+), 64 deletions(-) diff --git a/scripts/check_quality.py b/scripts/check_quality.py index c7839de..f271196 100644 --- a/scripts/check_quality.py +++ b/scripts/check_quality.py @@ -21,6 +21,7 @@ 'WARN Same Abbreviation as Full Name': 0, 'WARN Outdated Manage Abbreviation': 0 } + # Error tracking def error(message, error_type): errors.append((error_type, f"ERROR: {message}")) @@ -31,32 +32,44 @@ def warning(message, warning_type): warnings.append((warning_type, f"WARN: {message}")) warning_counts[warning_type] += 1 -# Check if non-UTF8 characters are present in the file -def check_non_utf8_characters(filepath): +# Perform all checks on the file's content +def perform_checks(filepath, rows): + check_non_utf8_characters(filepath, rows) + 
check_wrong_escape(filepath, rows) + check_wrong_beginning_letters(filepath, rows) + check_duplicates(filepath, rows) + check_full_form_identical_to_abbreviation(filepath, rows) + check_outdated_abbreviations(filepath, rows) + +# Load the content of a CSV file into memory once +def load_csv_content(filepath): try: with open(filepath, 'r', encoding='utf-8') as f: - for line_number, line in enumerate(f, start=1): - try: - line.encode('utf-8') - except UnicodeEncodeError as e: - error(f"Non-UTF8 character found in {filepath} at line {line_number}: {e}", 'ERROR Non-UTF8') + return list(csv.reader(f)) except UnicodeDecodeError as e: error(f"File {filepath} contains non-UTF-8 characters: {e}", 'ERROR Non-UTF8') + return [] + +# Check if non-UTF8 characters are present in the file +def check_non_utf8_characters(filepath, rows): + for line_number, row in enumerate(rows, start=1): + try: + str(row).encode('utf-8') + except UnicodeEncodeError as e: + error(f"Non-UTF8 character found in {filepath} at line {line_number}: {e}", 'ERROR Non-UTF8') # Check if there are wrong escape characters in abbreviation entries -def check_wrong_escape(filepath): +def check_wrong_escape(filepath, rows): valid_escapes = {'\\', '\n', '\t', '\r', '\"'} - with open(filepath, 'r', encoding='utf-8') as f: - reader = csv.reader(f) - for line_number, row in enumerate(reader, start=1): - for field in row: - matches = re.findall(r"\\.", field) - for match in matches: - if match not in valid_escapes: - error(f"Wrong escape character found in {filepath} at line {line_number}: {field}", 'ERROR Wrong Escape') + for line_number, row in enumerate(rows, start=1): + for field in row: + matches = re.findall(r"\\.", field) + for match in matches: + if match not in valid_escapes: + error(f"Wrong escape character found in {filepath} at line {line_number}: {field}", 'ERROR Wrong Escape') # Check for wrong beginning letters in journal abbreviations -def check_wrong_beginning_letters(filepath): +def 
check_wrong_beginning_letters(filepath, rows): # Words that are typically ignored when creating abbreviations ignore_words = { 'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', @@ -153,73 +166,67 @@ def is_valid_abbreviation(full_name, abbrev): min_required_matches = max(1, len(abbrev_parts) * 0.5) return matched_parts >= min_required_matches - with open(filepath, 'r', encoding='utf-8') as f: - reader = csv.reader(f) - for line_number, row in enumerate(reader, start=1): - if len(row) >= 2: - full_name = row[0].strip() - abbreviation = row[1].strip() - - if not is_valid_abbreviation(full_name, abbreviation): - error( - f"Abbrev mismatch full name pattern in {filepath} at line {line_number}:" f"Full: '{full_name}', " f"Abbrev: '{abbreviation}'", - 'ERROR Wrong Starting Letter' - ) + for line_number, row in enumerate(rows, start=1): + if len(row) >= 2: + full_name = row[0].strip() + abbreviation = row[1].strip() + + if not is_valid_abbreviation(full_name, abbreviation): + error( + f"Abbrev mismatch full name pattern in {filepath} at line {line_number}:" + f"\nFull: '{full_name}'," + f"\nAbbrev: '{abbreviation}'", + 'ERROR Wrong Starting Letter') # Check for duplicate entries -def check_duplicates(filepath): +def check_duplicates(filepath, rows): full_name_entries = {} abbreviation_entries = {} - with open(filepath, 'r', encoding='utf-8') as f: - reader = csv.reader(f) - for line_number, row in enumerate(reader, start=1): - if len(row) < 2: - continue - full_name = row[0].strip() - abbreviation = row[1].strip() - - # Check for duplicate full names or abbreviations - if full_name in full_name_entries or abbreviation in abbreviation_entries: - warning(f"Duplicate found in {filepath} at line {line_number}: Full Name: '{full_name}', Abbreviation: '{abbreviation}', first instance seen at line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}", 'WARN Duplicate FullName/Abbreviation') - else: - 
full_name_entries[full_name] = line_number - abbreviation_entries[abbreviation] = line_number + for line_number, row in enumerate(rows, start=1): + if len(row) < 2: + continue + + full_name = row[0].strip() + abbreviation = row[1].strip() + + # Check for duplicate full names or abbreviations + if full_name in full_name_entries or abbreviation in abbreviation_entries: + warning(f"Duplicate found in {filepath} at line {line_number}: Full Name: '{full_name}', Abbreviation: '{abbreviation}', first instance seen at line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}", 'WARN Duplicate FullName/Abbreviation') + else: + full_name_entries[full_name] = line_number + abbreviation_entries[abbreviation] = line_number # Check if abbreviation and full form are the same -def check_full_form_identical_to_abbreviation(filepath): - with open(filepath, 'r', encoding='utf-8') as f: - reader = csv.reader(f) - for line_number, row in enumerate(reader, start=1): - if len(row) == 2 and row[0].strip() == row[1].strip() and ' ' in row[0].strip(): - warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}", 'WARN Same Abbreviation as Full Name') +def check_full_form_identical_to_abbreviation(filepath, rows): + for line_number, row in enumerate(rows, start=1): + if len(row) == 2 and row[0].strip() == row[1].strip() and ' ' in row[0].strip(): + warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}", 'WARN Same Abbreviation as Full Name') # Check for outdated abbreviations -def check_outdated_abbreviations(filepath): - with open(filepath, 'r', encoding='utf-8') as f: - reader = csv.reader(f) - for line_number, row in enumerate(reader, start=1): - if "Manage." in row and "Manag." 
not in row: - warning(f"Outdated abbreviation used in {filepath} at line {line_number}: {','.join(row)}", 'WARN Outdated Manage Abbreviation') +def check_outdated_abbreviations(filepath, rows): + for line_number, row in enumerate(rows, start=1): + if "Manage." in row and "Manag." not in row: + warning(f"Outdated abbreviation used in {filepath} at line {line_number}: {','.join(row)}", 'WARN Outdated Manage Abbreviation') +# Main entry point if __name__ == "__main__": if not os.path.exists(JOURNALS_FOLDER_PATH): print("Journals folder not found. Please make sure the path is correct.") sys.exit(1) - + # Iterate through all CSV files in the journals folder for filename in os.listdir(JOURNALS_FOLDER_PATH): if filename.endswith(".csv"): filepath = os.path.join(JOURNALS_FOLDER_PATH, filename) - # Run the checks - check_non_utf8_characters(filepath) - check_wrong_escape(filepath) - check_wrong_beginning_letters(filepath) - check_duplicates(filepath) - check_full_form_identical_to_abbreviation(filepath) - check_outdated_abbreviations(filepath) + # Load the CSV content once + rows = load_csv_content(filepath) + + # Run all checks on the loaded data + if rows: + perform_checks(filepath, rows) # Write the summary to a file total_issues = sum(error_counts.values()) + sum(warning_counts.values()) From 66de966e79b78a2656123d6791b949aa2f20e46e Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sun, 27 Oct 2024 04:12:05 +1100 Subject: [PATCH 18/23] Write Summary to GitHub Action Attempt to write issue report to GITHUB_STEP_SUMMARY --- .github/workflows/quality-check.yml | 11 ++++------- scripts/check_quality.py | 12 +++++++++++- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml index b654b6e..f95cb54 100644 --- a/.github/workflows/quality-check.yml +++ b/.github/workflows/quality-check.yml @@ -18,15 +18,12 @@ jobs: - name: Run Quality Check id: quality_check run: | - python 
./scripts/check_quality.py - continue-on-error: true # Continue if there are warnings/errors, so we can log the output + python ./scripts/quality_checker.py - - name: Upload Quality Check Summary + - name: Upload Logs to GitHub Summary if: always() - uses: actions/upload-artifact@v3 - with: - name: check-quality-summary - path: ./check_quality_summary.txt + run: | + cat ./check_quality_summary.txt >> $GITHUB_STEP_SUMMARY - name: Fail on Errors if: steps.quality_check.outcome == 'failure' diff --git a/scripts/check_quality.py b/scripts/check_quality.py index f271196..772887b 100644 --- a/scripts/check_quality.py +++ b/scripts/check_quality.py @@ -22,6 +22,13 @@ 'WARN Outdated Manage Abbreviation': 0 } +# After generating the summary, write to the GITHUB_STEP_SUMMARY file if available +def write_to_github_summary(): + github_summary_path = os.getenv('GITHUB_STEP_SUMMARY') + if github_summary_path: + with open(github_summary_path, 'w', encoding='utf-8') as summary_file: + summary_file.writelines(summary_output) + # Error tracking def error(message, error_type): errors.append((error_type, f"ERROR: {message}")) @@ -259,7 +266,7 @@ def check_outdated_abbreviations(filepath, rows): else: summary_output.append("Quality check completed with no errors or warnings.\n") - # Write to summary file + # Write the summary to a file with open(SUMMARY_FILE_PATH, 'w', encoding='utf-8') as summary_file: summary_file.writelines(summary_output) @@ -267,6 +274,9 @@ def check_outdated_abbreviations(filepath, rows): for line in summary_output: print(line, end='') + # Write to GitHub Actions summary, if available + write_to_github_summary() + # Set exit code based on errors if sum(error_counts.values()) > 0: sys.exit(1) # Fail with an exit code if errors are found From 7bc3629eab60ba35ba026e68a35ca56939bdd847 Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sun, 27 Oct 2024 04:18:21 +1100 Subject: [PATCH 19/23] Fix File Name Error Fix quality checker name error --- 
.github/workflows/quality-check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml index f95cb54..0b48567 100644 --- a/.github/workflows/quality-check.yml +++ b/.github/workflows/quality-check.yml @@ -18,7 +18,7 @@ jobs: - name: Run Quality Check id: quality_check run: | - python ./scripts/quality_checker.py + python ./scripts/check_quality.py - name: Upload Logs to GitHub Summary if: always() From 19d121359668b0b15ff212c931557fb2e25c065e Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sun, 27 Oct 2024 04:26:20 +1100 Subject: [PATCH 20/23] Try uploading large error report as Artifact Try uploading large error report as artifact --- .github/workflows/quality-check.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml index 0b48567..b0ac078 100644 --- a/.github/workflows/quality-check.yml +++ b/.github/workflows/quality-check.yml @@ -20,13 +20,16 @@ jobs: run: | python ./scripts/check_quality.py - - name: Upload Logs to GitHub Summary + - name: Upload Quality Check Summary as Artifact if: always() - run: | - cat ./check_quality_summary.txt >> $GITHUB_STEP_SUMMARY + uses: actions/upload-artifact@v3 + with: + name: check-quality-summary + path: ./check_quality_summary.txt - name: Fail on Errors if: steps.quality_check.outcome == 'failure' run: | echo "Quality check failed due to errors."
exit 1 + From 512c4c423f7ac594f48a80979d1e7f8f1f761a1f Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sun, 27 Oct 2024 04:38:24 +1100 Subject: [PATCH 21/23] Attempt shorten error/warning message Shorten error/warning message for smaller summary size --- scripts/check_quality.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/check_quality.py b/scripts/check_quality.py index 772887b..aea484b 100644 --- a/scripts/check_quality.py +++ b/scripts/check_quality.py @@ -73,7 +73,7 @@ def check_wrong_escape(filepath, rows): matches = re.findall(r"\\.", field) for match in matches: if match not in valid_escapes: - error(f"Wrong escape character found in {filepath} at line {line_number}: {field}", 'ERROR Wrong Escape') + error(f"Wrong escape in {filepath} line {line_number}: {field}", 'ERROR Wrong Escape') # Check for wrong beginning letters in journal abbreviations def check_wrong_beginning_letters(filepath, rows): @@ -180,9 +180,9 @@ def is_valid_abbreviation(full_name, abbrev): if not is_valid_abbreviation(full_name, abbreviation): error( - f"Abbrev mismatch full name pattern in {filepath} at line {line_number}:" + f"Wrong abbreviation in {filepath} line {line_number}:" f"\nFull: '{full_name}'," - f"\nAbbrev: '{abbreviation}'", + f"\nAbbr: '{abbreviation}'", 'ERROR Wrong Starting Letter') @@ -200,7 +200,7 @@ def check_duplicates(filepath, rows): # Check for duplicate full names or abbreviations if full_name in full_name_entries or abbreviation in abbreviation_entries: - warning(f"Duplicate found in {filepath} at line {line_number}: Full Name: '{full_name}', Abbreviation: '{abbreviation}', first instance seen at line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}", 'WARN Duplicate FullName/Abbreviation') + warning(f"Duplicate in {filepath} line {line_number}: Full: '{full_name}', Abbr: '{abbreviation}', first seen in line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}", 
'WARN Duplicate FullName/Abbreviation') else: full_name_entries[full_name] = line_number abbreviation_entries[abbreviation] = line_number @@ -209,13 +209,13 @@ def check_duplicates(filepath, rows): def check_full_form_identical_to_abbreviation(filepath, rows): for line_number, row in enumerate(rows, start=1): if len(row) == 2 and row[0].strip() == row[1].strip() and ' ' in row[0].strip(): - warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}", 'WARN Same Abbreviation as Full Name') + warning(f"Abbr same as Full in {filepath} line {line_number}: {row[0]}", 'WARN Same Abbreviation as Full Name') # Check for outdated abbreviations def check_outdated_abbreviations(filepath, rows): for line_number, row in enumerate(rows, start=1): if "Manage." in row and "Manag." not in row: - warning(f"Outdated abbreviation used in {filepath} at line {line_number}: {','.join(row)}", 'WARN Outdated Manage Abbreviation') + warning(f"Outdated abbr in {filepath} line {line_number}: {','.join(row)}", 'WARN Outdated Manage Abbreviation') # Main entry point if __name__ == "__main__": @@ -242,9 +242,9 @@ def check_outdated_abbreviations(filepath, rows): summary_output.append("# Quality Check Summary Report\n") summary_output.append("| Status | Count |\n") summary_output.append("| ------------- | ----- |\n") - summary_output.append(f"| πŸ” Total Issues | {total_issues} |\n") - summary_output.append(f"| ❌ Errors Found | {sum(error_counts.values())} |\n") - summary_output.append(f"| ⚠️ Warnings Found | {sum(warning_counts.values())} |\n\n") + summary_output.append(f"| πŸ” Total | {total_issues} |\n") + summary_output.append(f"| ❌ Errors | {sum(error_counts.values())} |\n") + summary_output.append(f"| ⚠️ Warnings | {sum(warning_counts.values())} |\n\n") # Write detailed errors and warnings if errors or warnings: From c19931b4262038946cf3dd7884f7436260e8e0c9 Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sun, 27 Oct 2024 04:42:54 +1100 Subject: [PATCH 
22/23] Revert "Attempt shorten error/warning message" This reverts commit 512c4c423f7ac594f48a80979d1e7f8f1f761a1f. --- scripts/check_quality.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/check_quality.py b/scripts/check_quality.py index aea484b..772887b 100644 --- a/scripts/check_quality.py +++ b/scripts/check_quality.py @@ -73,7 +73,7 @@ def check_wrong_escape(filepath, rows): matches = re.findall(r"\\.", field) for match in matches: if match not in valid_escapes: - error(f"Wrong escape in {filepath} line {line_number}: {field}", 'ERROR Wrong Escape') + error(f"Wrong escape character found in {filepath} at line {line_number}: {field}", 'ERROR Wrong Escape') # Check for wrong beginning letters in journal abbreviations def check_wrong_beginning_letters(filepath, rows): @@ -180,9 +180,9 @@ def is_valid_abbreviation(full_name, abbrev): if not is_valid_abbreviation(full_name, abbreviation): error( - f"Wrong abbreviation in {filepath} line {line_number}:" + f"Abbrev mismatch full name pattern in {filepath} at line {line_number}:" f"\nFull: '{full_name}'," - f"\nAbbr: '{abbreviation}'", + f"\nAbbrev: '{abbreviation}'", 'ERROR Wrong Starting Letter') @@ -200,7 +200,7 @@ def check_duplicates(filepath, rows): # Check for duplicate full names or abbreviations if full_name in full_name_entries or abbreviation in abbreviation_entries: - warning(f"Duplicate in {filepath} line {line_number}: Full: '{full_name}', Abbr: '{abbreviation}', first seen in line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}", 'WARN Duplicate FullName/Abbreviation') + warning(f"Duplicate found in {filepath} at line {line_number}: Full Name: '{full_name}', Abbreviation: '{abbreviation}', first instance seen at line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}", 'WARN Duplicate FullName/Abbreviation') else: full_name_entries[full_name] = line_number abbreviation_entries[abbreviation] = 
line_number @@ -209,13 +209,13 @@ def check_duplicates(filepath, rows): def check_full_form_identical_to_abbreviation(filepath, rows): for line_number, row in enumerate(rows, start=1): if len(row) == 2 and row[0].strip() == row[1].strip() and ' ' in row[0].strip(): - warning(f"Abbr same as Full in {filepath} line {line_number}: {row[0]}", 'WARN Same Abbreviation as Full Name') + warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}", 'WARN Same Abbreviation as Full Name') # Check for outdated abbreviations def check_outdated_abbreviations(filepath, rows): for line_number, row in enumerate(rows, start=1): if "Manage." in row and "Manag." not in row: - warning(f"Outdated abbr in {filepath} line {line_number}: {','.join(row)}", 'WARN Outdated Manage Abbreviation') + warning(f"Outdated abbreviation used in {filepath} at line {line_number}: {','.join(row)}", 'WARN Outdated Manage Abbreviation') # Main entry point if __name__ == "__main__": @@ -242,9 +242,9 @@ def check_outdated_abbreviations(filepath, rows): summary_output.append("# Quality Check Summary Report\n") summary_output.append("| Status | Count |\n") summary_output.append("| ------------- | ----- |\n") - summary_output.append(f"| πŸ” Total | {total_issues} |\n") - summary_output.append(f"| ❌ Errors | {sum(error_counts.values())} |\n") - summary_output.append(f"| ⚠️ Warnings | {sum(warning_counts.values())} |\n\n") + summary_output.append(f"| πŸ” Total Issues | {total_issues} |\n") + summary_output.append(f"| ❌ Errors Found | {sum(error_counts.values())} |\n") + summary_output.append(f"| ⚠️ Warnings Found | {sum(warning_counts.values())} |\n\n") # Write detailed errors and warnings if errors or warnings: From c4e51609b48a4c3bc0fc770ddb182d9129e86282 Mon Sep 17 00:00:00 2001 From: Philip Cai Date: Sun, 27 Oct 2024 06:49:57 +1100 Subject: [PATCH 23/23] Attempt reduce error summary size Shorten message and provide a more efficient error summary --- scripts/check_quality.py 
| 508 +++++++++++++++++++-------------------- 1 file changed, 253 insertions(+), 255 deletions(-) diff --git a/scripts/check_quality.py b/scripts/check_quality.py index 772887b..a7f7f6c 100644 --- a/scripts/check_quality.py +++ b/scripts/check_quality.py @@ -3,282 +3,280 @@ import sys import itertools import csv +from collections import defaultdict # Path to the journals folder (change this path accordingly) JOURNALS_FOLDER_PATH = "./journals/" SUMMARY_FILE_PATH = "./check_quality_summary.txt" -errors = [] -warnings = [] - -# Error and Warning Counts -error_counts = { - 'ERROR Wrong Escape': 0, - 'ERROR Wrong Starting Letter': 0, - 'ERROR Non-UTF8': 0 -} -warning_counts = { - 'WARN Duplicate FullName/Abbreviation': 0, - 'WARN Same Abbreviation as Full Name': 0, - 'WARN Outdated Manage Abbreviation': 0 -} - -# After generating the summary, write to the GITHUB_STEP_SUMMARY file if available -def write_to_github_summary(): - github_summary_path = os.getenv('GITHUB_STEP_SUMMARY') - if github_summary_path: - with open(github_summary_path, 'w', encoding='utf-8') as summary_file: - summary_file.writelines(summary_output) - -# Error tracking -def error(message, error_type): - errors.append((error_type, f"ERROR: {message}")) - error_counts[error_type] += 1 - -# Warning tracking -def warning(message, warning_type): - warnings.append((warning_type, f"WARN: {message}")) - warning_counts[warning_type] += 1 - -# Perform all checks on the file's content -def perform_checks(filepath, rows): - check_non_utf8_characters(filepath, rows) - check_wrong_escape(filepath, rows) - check_wrong_beginning_letters(filepath, rows) - check_duplicates(filepath, rows) - check_full_form_identical_to_abbreviation(filepath, rows) - check_outdated_abbreviations(filepath, rows) - -# Load the content of a CSV file into memory once -def load_csv_content(filepath): - try: - with open(filepath, 'r', encoding='utf-8') as f: - return list(csv.reader(f)) - except UnicodeDecodeError as e: - error(f"File 
{filepath} contains non-UTF-8 characters: {e}", 'ERROR Non-UTF8') - return [] - -# Check if non-UTF8 characters are present in the file -def check_non_utf8_characters(filepath, rows): - for line_number, row in enumerate(rows, start=1): - try: - str(row).encode('utf-8') - except UnicodeEncodeError as e: - error(f"Non-UTF8 character found in {filepath} at line {line_number}: {e}", 'ERROR Non-UTF8') - -# Check if there are wrong escape characters in abbreviation entries -def check_wrong_escape(filepath, rows): - valid_escapes = {'\\', '\n', '\t', '\r', '\"'} - for line_number, row in enumerate(rows, start=1): - for field in row: - matches = re.findall(r"\\.", field) - for match in matches: - if match not in valid_escapes: - error(f"Wrong escape character found in {filepath} at line {line_number}: {field}", 'ERROR Wrong Escape') - -# Check for wrong beginning letters in journal abbreviations -def check_wrong_beginning_letters(filepath, rows): - # Words that are typically ignored when creating abbreviations - ignore_words = { - 'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', - 'la', 'el', 'le', 'et', 'der', 'die', 'das', 'dem', 'und', 'fΓΌr' # Articles in multiple languages - } - - # Special cases for abbreviations - special_cases = { - 'proceedings': ['p', 'proc'], - 'or': ['or'], - 'spie': ['spie'], - 'notes': ['notes'] - } - - def clean_text(text): - # Remove special characters except periods (important for compound abbreviations) - # and normalize spaces - cleaned = re.sub(r'[^\w\s\.]', ' ', text) - return ' '.join(filter(None, cleaned.lower().split())) - - def split_compound_abbrev(abbrev): - # Split abbreviation that might contain compound parts (e.g., "Nat.forsch") - parts = [] - for part in abbrev.split(): - # Split on periods but keep them with the preceding part - subparts = [sp for sp in re.split(r'(?<=\.)(?=[^\.])', part) if sp] - parts.extend(subparts) - return parts - - def get_significant_words(text): - # Split text into 
words and filter out ignore words - return [w for w in clean_text(text).split() if w.lower() not in ignore_words] - - def is_compound_word_match(full_word, abbrev_part): - # Handle compound word abbreviations (e.g., "Nat.forsch" matching "Naturforschenden") - if '.' in abbrev_part: - # Split the compound abbreviation - abbrev_subparts = abbrev_part.split('.') - # Get the first few characters of the full word to match against first part - word_start = full_word[:len(abbrev_subparts[0])] - - # For the second part (if exists), try to find it within the remaining word - if len(abbrev_subparts) > 1 and abbrev_subparts[1]: - remaining_word = full_word[len(abbrev_subparts[0]):] - return (word_start.lower() == abbrev_subparts[0].lower() and - abbrev_subparts[1].lower() in remaining_word.lower()) - - return word_start.lower() == abbrev_subparts[0].lower() - return False - def is_valid_abbreviation(full_name, abbrev): - # Clean and split both strings - full_words = get_significant_words(full_name) - abbrev_parts = split_compound_abbrev(clean_text(abbrev)) +class QualityChecker: + def __init__(self): + # Use defaultdict to avoid key existence checks + self.error_counts = defaultdict(int) + self.warning_counts = defaultdict(int) + # Store issues by file for more efficient grouping + self.issues_by_file = defaultdict(lambda: {'errors': [], 'warnings': []}) - # Handle cases where abbreviation is the same as full name - if clean_text(full_name) == clean_text(abbrev): - return True - - # Handle special cases - for special_word, valid_abbrevs in special_cases.items(): - if special_word in full_words: - if any(va in abbrev_parts for va in valid_abbrevs): - return True - - # Track matched parts and their positions - matched_parts = 0 - used_full_words = set() + def error(self, filepath, message, error_type): + self.error_counts[error_type] += 1 + # Remove filepath from message if it's included + message = message.replace(f"in {filepath} ", "") + full_message = f"{error_type}: 
{message}" + self.issues_by_file[filepath]['errors'].append(full_message) + + def warning(self, filepath, message, warning_type): + self.warning_counts[warning_type] += 1 + # Remove filepath from message if it's included + message = message.replace(f"in {filepath} ", "") + full_message = f"{warning_type}: {message}" + self.issues_by_file[filepath]['warnings'].append(full_message) + + def write_summary(self, summary_lines): + # Write to file in a single operation + with open(SUMMARY_FILE_PATH, 'w', encoding='utf-8') as summary_file: + summary_file.writelines(summary_lines) + + # Print to console in chunks + for line in summary_lines: + print(line, end='') + + # Write to GitHub Actions summary if available + github_summary_path = os.getenv('GITHUB_STEP_SUMMARY') + if github_summary_path: + with open(github_summary_path, 'w', encoding='utf-8') as summary_file: + summary_file.writelines(summary_lines) + + def check_non_utf8_characters(self, filepath, rows): + for line_number, row in enumerate(rows, start=1): + try: + str(row).encode('utf-8') + except UnicodeEncodeError as e: + self.error( + filepath, + f"at line {line_number}: {e}", + 'ERROR Non-UTF8' + ) + + def check_wrong_escape(self, filepath, rows): + valid_escapes = {'\\', '\n', '\t', '\r', '\"'} + for line_number, row in enumerate(rows, start=1): + for field in row: + matches = re.findall(r"\\.", field) + for match in matches: + if match not in valid_escapes: + self.error( + filepath, + f"at line {line_number}: {field}", + 'ERROR Wrong Escape' + ) + + def check_wrong_beginning_letters(self, filepath, rows): + # Words that are typically ignored when creating abbreviations + ignore_words = { + 'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', + 'la', 'el', 'le', 'et', 'der', 'die', 'das', 'dem', 'und', 'fΓΌr' + } - for abbrev_part in abbrev_parts: - found_match = False - - # Try matching against each full word - for i, full_word in enumerate(full_words): - if i in used_full_words: - 
continue + # Special cases for abbreviations + special_cases = { + 'proceedings': ['p', 'proc'], + 'or': ['or'], + 'spie': ['spie'], + 'notes': ['notes'] + } + + def clean_text(text): + cleaned = re.sub(r'[^\w\s\.]', ' ', text) + return ' '.join(filter(None, cleaned.lower().split())) + + def split_compound_abbrev(abbrev): + parts = [] + for part in abbrev.split(): + subparts = [sp for sp in re.split(r'(?<=\.)(?=[^\.])', part) if sp] + parts.extend(subparts) + return parts + + def get_significant_words(text): + return [w for w in clean_text(text).split() if w.lower() not in ignore_words] + + def is_compound_word_match(full_word, abbrev_part): + if '.' in abbrev_part: + abbrev_subparts = abbrev_part.split('.') + word_start = full_word[:len(abbrev_subparts[0])] - # Check for compound word match - if is_compound_word_match(full_word, abbrev_part): - found_match = True - matched_parts += 1 - used_full_words.add(i) - break + if len(abbrev_subparts) > 1 and abbrev_subparts[1]: + remaining_word = full_word[len(abbrev_subparts[0]):] + return (word_start.lower() == abbrev_subparts[0].lower() and + abbrev_subparts[1].lower() in remaining_word.lower()) - # Check for regular abbreviation patterns - elif (full_word.lower().startswith(abbrev_part.lower()) or - (len(abbrev_part) >= 2 and abbrev_part[0].lower() == full_word[0].lower())): - found_match = True - matched_parts += 1 - used_full_words.add(i) - break - - # Consider the abbreviation valid if we matched most parts - min_required_matches = max(1, len(abbrev_parts) * 0.5) - return matched_parts >= min_required_matches - - for line_number, row in enumerate(rows, start=1): - if len(row) >= 2: - full_name = row[0].strip() - abbreviation = row[1].strip() + return word_start.lower() == abbrev_subparts[0].lower() + return False + + def is_valid_abbreviation(full_name, abbrev): + full_words = get_significant_words(full_name) + abbrev_parts = split_compound_abbrev(clean_text(abbrev)) - if not is_valid_abbreviation(full_name, 
abbreviation): - error( - f"Abbrev mismatch full name pattern in {filepath} at line {line_number}:" - f"\nFull: '{full_name}'," - f"\nAbbrev: '{abbreviation}'", - 'ERROR Wrong Starting Letter') + if clean_text(full_name) == clean_text(abbrev): + return True + + for special_word, valid_abbrevs in special_cases.items(): + if special_word in full_words: + if any(va in abbrev_parts for va in valid_abbrevs): + return True + matched_parts = 0 + used_full_words = set() + + for abbrev_part in abbrev_parts: + found_match = False + + for i, full_word in enumerate(full_words): + if i in used_full_words: + continue + + if is_compound_word_match(full_word, abbrev_part): + found_match = True + matched_parts += 1 + used_full_words.add(i) + break + + elif (full_word.lower().startswith(abbrev_part.lower()) or + (len(abbrev_part) >= 2 and abbrev_part[0].lower() == full_word[0].lower())): + found_match = True + matched_parts += 1 + used_full_words.add(i) + break + + min_required_matches = max(1, len(abbrev_parts) * 0.5) + return matched_parts >= min_required_matches + + for line_number, row in enumerate(rows, start=1): + if len(row) >= 2: + full_name = row[0].strip() + abbreviation = row[1].strip() + + if not is_valid_abbreviation(full_name, abbreviation): + self.error( + filepath, + f"at line {line_number} Full: '{full_name}', Abbr: '{abbreviation}'", + 'ERROR Wrong Abbreviation' + ) -# Check for duplicate entries -def check_duplicates(filepath, rows): - full_name_entries = {} - abbreviation_entries = {} + def check_duplicates(self, filepath, rows): + full_name_entries = {} + abbreviation_entries = {} - for line_number, row in enumerate(rows, start=1): - if len(row) < 2: - continue + for line_number, row in enumerate(rows, start=1): + if len(row) < 2: + continue - full_name = row[0].strip() - abbreviation = row[1].strip() + full_name = row[0].strip() + abbreviation = row[1].strip() + + if full_name in full_name_entries or abbreviation in abbreviation_entries: + self.warning( + 
filepath, + f"at line {line_number} Full: '{full_name}', Abbr: '{abbreviation}', first seen in line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}", + 'WARN Duplicate FullName/Abbreviation' + ) + else: + full_name_entries[full_name] = line_number + abbreviation_entries[abbreviation] = line_number + + def check_full_form_identical_to_abbreviation(self, filepath, rows): + for line_number, row in enumerate(rows, start=1): + if len(row) == 2 and row[0].strip() == row[1].strip() and ' ' in row[0].strip(): + self.warning( + filepath, + f"at line {line_number}: {row[0]}", + 'WARN Same Abbrev. as Full Name' + ) + + def check_outdated_abbreviations(self, filepath, rows): + for line_number, row in enumerate(rows, start=1): + if "Manage." in row and "Manag." not in row: + self.warning( + filepath, + f"at line {line_number}: {','.join(row)}", + 'WARN Outdated Manage Abbreviation' + ) + + def perform_checks(self, filepath, rows): + self.check_non_utf8_characters(filepath, rows) + self.check_wrong_escape(filepath, rows) + self.check_wrong_beginning_letters(filepath, rows) + self.check_duplicates(filepath, rows) + self.check_full_form_identical_to_abbreviation(filepath, rows) + self.check_outdated_abbreviations(filepath, rows) + + def generate_summary(self): + total_issues = sum(self.error_counts.values()) + sum(self.warning_counts.values()) - # Check for duplicate full names or abbreviations - if full_name in full_name_entries or abbreviation in abbreviation_entries: - warning(f"Duplicate found in {filepath} at line {line_number}: Full Name: '{full_name}', Abbreviation: '{abbreviation}', first instance seen at line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}", 'WARN Duplicate FullName/Abbreviation') + # Pre-allocate list with estimated size + summary_lines = [] + summary_lines.extend([ + "# Quality Check Summary Report\n", + "| Status | Count |\n", + "| ------------- | ----- |\n", + f"| πŸ” Total Issues | 
{total_issues} |\n", + f"| ❌ Errors Found | {sum(self.error_counts.values())} |\n", + f"| ⚠️ Warnings Found | {sum(self.warning_counts.values())} |\n\n" + ]) + + # Add detailed error/warning counts + if self.error_counts: + summary_lines.append("## Error Counts\n") + for error_type, count in sorted(self.error_counts.items()): + summary_lines.append(f"- {error_type}: {count}\n") + summary_lines.append("\n") + + if self.warning_counts: + summary_lines.append("## Warning Counts\n") + for warning_type, count in sorted(self.warning_counts.items()): + summary_lines.append(f"- {warning_type}: {count}\n") + summary_lines.append("\n") + + if self.issues_by_file: + summary_lines.append("## Issues per Input File\n\n") + for filepath, issues in sorted(self.issues_by_file.items()): + summary_lines.append(f"### Issues in file `{filepath}`\n") + if issues['errors']: + summary_lines.append("#### Errors:\n") + summary_lines.extend(f"- {err}\n" for err in sorted(issues['errors'])) + + if issues['warnings']: + summary_lines.append("#### Warnings:\n") + summary_lines.extend(f"- {warn}\n" for warn in sorted(issues['warnings'])) + + summary_lines.append("\n") else: - full_name_entries[full_name] = line_number - abbreviation_entries[abbreviation] = line_number - -# Check if abbreviation and full form are the same -def check_full_form_identical_to_abbreviation(filepath, rows): - for line_number, row in enumerate(rows, start=1): - if len(row) == 2 and row[0].strip() == row[1].strip() and ' ' in row[0].strip(): - warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}", 'WARN Same Abbreviation as Full Name') - -# Check for outdated abbreviations -def check_outdated_abbreviations(filepath, rows): - for line_number, row in enumerate(rows, start=1): - if "Manage." in row and "Manag." 
not in row: - warning(f"Outdated abbreviation used in {filepath} at line {line_number}: {','.join(row)}", 'WARN Outdated Manage Abbreviation') - -# Main entry point -if __name__ == "__main__": + summary_lines.append("Quality check completed with no errors or warnings.\n") + + return summary_lines +def main(): if not os.path.exists(JOURNALS_FOLDER_PATH): print("Journals folder not found. Please make sure the path is correct.") sys.exit(1) - # Iterate through all CSV files in the journals folder + checker = QualityChecker() + + # Process all files for filename in os.listdir(JOURNALS_FOLDER_PATH): if filename.endswith(".csv"): filepath = os.path.join(JOURNALS_FOLDER_PATH, filename) - - # Load the CSV content once - rows = load_csv_content(filepath) - - # Run all checks on the loaded data - if rows: - perform_checks(filepath, rows) + try: + with open(filepath, 'r', encoding='utf-8') as f: + rows = list(csv.reader(f)) + checker.perform_checks(filepath, rows) + except UnicodeDecodeError as e: + checker.error(filepath, f"File contains non-UTF8 characters: {e}", 'ERROR Non-UTF8') + + # Generate and write summary + summary_lines = checker.generate_summary() + checker.write_summary(summary_lines) - # Write the summary to a file - total_issues = sum(error_counts.values()) + sum(warning_counts.values()) - summary_output = [] - - summary_output.append("# Quality Check Summary Report\n") - summary_output.append("| Status | Count |\n") - summary_output.append("| ------------- | ----- |\n") - summary_output.append(f"| πŸ” Total Issues | {total_issues} |\n") - summary_output.append(f"| ❌ Errors Found | {sum(error_counts.values())} |\n") - summary_output.append(f"| ⚠️ Warnings Found | {sum(warning_counts.values())} |\n\n") - - # Write detailed errors and warnings - if errors or warnings: - summary_output.append("## Errors per Input File\n\n") - files = set([msg.split(' in ')[1].split(' at ')[0] for _, msg in errors + warnings]) - for file in files: - summary_output.append(f"### 
Issues in file `{file}`\n") - file_errors = [msg for err_type, msg in errors if file in msg] - file_warnings = [msg for warn_type, msg in warnings if file in msg] - if file_errors: - summary_output.append("#### Errors:\n") - for err in file_errors: - summary_output.append(f"- {err.split('ERROR: ')[1]}\n") - if file_warnings: - summary_output.append("#### Warnings:\n") - for warn in file_warnings: - summary_output.append(f"- {warn.split('WARN: ')[1]}\n") - summary_output.append("\n") - else: - summary_output.append("Quality check completed with no errors or warnings.\n") - - # Write the summary to a file - with open(SUMMARY_FILE_PATH, 'w', encoding='utf-8') as summary_file: - summary_file.writelines(summary_output) - - # Print the summary to console - for line in summary_output: - print(line, end='') - - # Write to GitHub Actions summary, if available - write_to_github_summary() - - # Set exit code based on errors - if sum(error_counts.values()) > 0: - sys.exit(1) # Fail with an exit code if errors are found - else: - sys.exit(0) # Exit successfully if no errors \ No newline at end of file + # Exit with appropriate code + sys.exit(1 if sum(checker.error_counts.values()) > 0 else 0) + +if __name__ == "__main__": + main() \ No newline at end of file