From 2976781f553bd4582d08be1d579ed54d9633c6ed Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sat, 19 Oct 2024 21:29:49 +1100
Subject: [PATCH 01/23] implement quality checker

implement the quality checker that reports errors for
- wrong escape characters
- wrong starting letters
- presence of non-utf-8 characters
and reports warnings for
- duplicate entries
- same full forms
- same abbreviations
- outdated 'Manage' abbreviation
---
 scripts/check_quality.py | 90 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 scripts/check_quality.py

diff --git a/scripts/check_quality.py b/scripts/check_quality.py
new file mode 100644
index 0000000..dcf599b
--- /dev/null
+++ b/scripts/check_quality.py
@@ -0,0 +1,90 @@
+import os
+import re
+import sys
+import itertools
+import csv
+
+# Path to the journals folder (change this path accordingly)
+JOURNALS_FOLDER_PATH = "./journals/"
+
+# Error tracking
+def error(message):
+    print(f"ERROR: {message}")
+    sys.exit(1)
+
+# Warning tracking
+def warning(message):
+    print(f"WARN: {message}")
+
+# Check if non-UTF8 characters are present in the file
+def check_non_utf8_characters(filepath):
+    try:
+        with open(filepath, 'r', encoding='utf-8') as f:
+            f.read()
+    except UnicodeDecodeError:
+        error(f"File {filepath} contains non-UTF8 characters")
+
+# Check if there are wrong escape characters in abbreviation entries
+def check_wrong_escape(filepath):
+    with open(filepath, 'r', encoding='utf-8') as f:
+        reader = csv.reader(f)
+        for line_number, row in enumerate(reader, start=1):
+            for field in row:
+                if re.search(r"[a-zA-Z]*\\[,\"]", field):
+                    error(f"Wrong escape character found in file {filepath} at line {line_number}: {field}")
+
+# Check for wrong beginning letters in journal abbreviations
+def check_wrong_beginning_letters(filepath):
+    with open(filepath, 'r', encoding='utf-8') as f:
+        reader = csv.reader(f)
+        for line_number, row in enumerate(reader, start=1):
+            if row[0].startswith("\""):
+                error(f"Wrong beginning letter found in file {filepath} at line {line_number}: {row[0]}")
+
+# Check for duplicate entries
+def check_duplicates(filepath):
+    entries = set()
+    with open(filepath, 'r', encoding='utf-8') as f:
+        reader = csv.reader(f)
+        for line_number, row in enumerate(reader, start=1):
+            line = ','.join(row)
+            if line in entries:
+                warning(f"Duplicate entry found in file {filepath} at line {line_number}: {line}")
+            else:
+                entries.add(line)
+
+# Check if abbreviation and full form are the same
+def check_full_form_identical_to_abbreviation(filepath):
+    with open(filepath, 'r', encoding='utf-8') as f:
+        reader = csv.reader(f)
+        for line_number, row in enumerate(reader, start=1):
+            if len(row) == 2 and row[0].strip() == row[1].strip():
+                warning(f"Abbreviation is the same as full form in file {filepath} at line {line_number}: {row[0]}")
+
+# Check for outdated abbreviations
+def check_outdated_abbreviations(filepath):
+    with open(filepath, 'r', encoding='utf-8') as f:
+        reader = csv.reader(f)
+        for line_number, row in enumerate(reader, start=1):
+            if "Manage." in row and "Manag." not in row:
+                warning(f"Outdated abbreviation used in file {filepath} at line {line_number}: {','.join(row)}")
+
+if __name__ == "__main__":
+    if not os.path.exists(JOURNALS_FOLDER_PATH):
+        print("Journals folder not found. Please make sure the path is correct.")
+        sys.exit(1)
+    
+    # Iterate through all CSV files in the journals folder
+    for filename in os.listdir(JOURNALS_FOLDER_PATH):
+        if filename.endswith(".csv"):
+            filepath = os.path.join(JOURNALS_FOLDER_PATH, filename)
+            
+            # Run the checks
+            check_non_utf8_characters(filepath)
+            check_wrong_escape(filepath)
+            check_wrong_beginning_letters(filepath)
+            check_duplicates(filepath)
+            check_full_form_identical_to_abbreviation(filepath)
+            check_outdated_abbreviations(filepath)
+    
+    print("Quality check completed.")
\ No newline at end of file

From 0ebac9db4e0c67f407a8e8b480e3ed994cff2d82 Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sat, 19 Oct 2024 21:29:51 +1100
Subject: [PATCH 02/23] Create quality_checker.py

---
 scripts/quality_checker.py | 100 +++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 scripts/quality_checker.py

diff --git a/scripts/quality_checker.py b/scripts/quality_checker.py
new file mode 100644
index 0000000..e433b30
--- /dev/null
+++ b/scripts/quality_checker.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+
+"""
+Python script for checking multiple quality aspects of .csv journal abbreviation files.
+This script enforces conventions to ensure that abbreviations of journal titles meet quality standards.
+
+The script performs the following checks:
+1. Checks for wrong escape sequences.
+2. Checks for incorrect beginning letters.
+3. Checks for non-UTF8 characters.
+4. Checks for duplicate entries (full names and abbreviations).
+5. Checks if abbreviations match full names (for one-word titles).
+6. Checks for outdated abbreviations.
+
+The script will print out issues found and exit with a failure code if any issues are detected.
+The script does NOT automatically fix these errors. Corrections must be done manually.
+
+The script will automatically run whenever there is a push to the main branch of the
+abbreviations repo (abbrv.jabref.org) using GitHub Actions.
+"""
+
+import os
+import itertools
+import csv
+import re
+import sys
+
+# Define paths and file collections
+PATH_TO_JOURNALS = "./journals/"
+fileNames = next(itertools.islice(os.walk(PATH_TO_JOURNALS), 0, None))[2]
+
+# Error collections
+errors = []
+
+# Utility functions for checking conditions
+def is_utf8(text):
+    try:
+        text.encode('utf-8')
+        return True
+    except UnicodeEncodeError:
+        return False
+
+def check_abbreviation_duplicates(full_name, abbrev, seen_full_names, seen_abbrevs):
+    if full_name in seen_full_names or abbrev in seen_abbrevs:
+        return True
+    return False
+
+def is_outdated_abbreviation(abbrev):
+    # Add a basic rule to detect outdated abbreviations (e.g., "Manage." instead of "Manag.")
+    outdated_patterns = [r"Manage\.\b"]
+    for pattern in outdated_patterns:
+        if re.search(pattern, abbrev):
+            return True
+    return False
+
+# Perform checks
+for file in fileNames:
+    if file.endswith(".csv"):
+        with open(PATH_TO_JOURNALS + file, mode='r', encoding='utf-8') as f:
+            reader = csv.reader(f)
+            seen_full_names = set()
+            seen_abbrevs = set()
+
+            for row_index, row in enumerate(reader, start=1):
+                if len(row) < 2:
+                    continue  # Skip rows without expected data
+                
+                full_name, abbrev = row[0], row[1]
+
+                # Check for escaped ampersands
+                if '\\&' in full_name or '\\&' in abbrev:
+                    errors.append(f"Escaped ampersand in file {file}, row {row_index}")
+
+                # Check for non-UTF8 characters
+                if not is_utf8(full_name) or not is_utf8(abbrev):
+                    errors.append(f"Non-UTF8 character in file {file}, row {row_index}")
+
+                # Check for duplicate entries
+                if check_abbreviation_duplicates(full_name, abbrev, seen_full_names, seen_abbrevs):
+                    errors.append(f"Duplicate entry in file {file}, row {row_index}")
+
+                # Check if abbreviation matches full form for one-word titles
+                if full_name.strip().lower() == abbrev.strip().lower():
+                    errors.append(f"Full form matches abbreviation in file {file}, row {row_index}")
+
+                # Check for outdated abbreviations
+                if is_outdated_abbreviation(abbrev):
+                    errors.append(f"Outdated abbreviation in file {file}, row {row_index}")
+
+                # Update seen sets
+                seen_full_names.add(full_name)
+                seen_abbrevs.add(abbrev)
+
+# Print errors and exit with failure code if any issues found
+if errors:
+    error_message = "Quality check failed:\n" + "\n".join(errors)
+    print(error_message, file=sys.stderr)
+    sys.exit(1)
+else:
+    print("Quality check passed. No issues found.")

From 5c3f483f835159ad9ee69a8c34160ef5b8d146e2 Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sat, 19 Oct 2024 21:58:58 +1100
Subject: [PATCH 03/23] fix early stop in check_quality.py

prevent the script from exiting on the first error; collect all errors and report them at the end
---
 scripts/check_quality.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/scripts/check_quality.py b/scripts/check_quality.py
index dcf599b..1bc0a71 100644
--- a/scripts/check_quality.py
+++ b/scripts/check_quality.py
@@ -7,10 +7,10 @@
 # Path to the journals folder (change this path accordingly)
 JOURNALS_FOLDER_PATH = "./journals/"
 
+errors = []
 # Error tracking
 def error(message):
-    print(f"ERROR: {message}")
-    sys.exit(1)
+    errors.append(f"ERROR: {message}")
 
 # Warning tracking
 def warning(message):
@@ -31,7 +31,7 @@ def check_wrong_escape(filepath):
         for line_number, row in enumerate(reader, start=1):
             for field in row:
                 if re.search(r"[a-zA-Z]*\\[,\"]", field):
-                    error(f"Wrong escape character found in file {filepath} at line {line_number}: {field}")
+                    error(f"Wrong escape character found in {filepath} at line {line_number}: {field}")
 
 # Check for wrong beginning letters in journal abbreviations
 def check_wrong_beginning_letters(filepath):
@@ -39,19 +39,19 @@ def check_wrong_beginning_letters(filepath):
         reader = csv.reader(f)
         for line_number, row in enumerate(reader, start=1):
             if row[0].startswith("\""):
-                error(f"Wrong beginning letter found in file {filepath} at line {line_number}: {row[0]}")
+                error(f"Wrong beginning letter found in {filepath} at line {line_number}: {row[0]}")
 
 # Check for duplicate entries
 def check_duplicates(filepath):
-    entries = set()
+    entries = {}
     with open(filepath, 'r', encoding='utf-8') as f:
         reader = csv.reader(f)
         for line_number, row in enumerate(reader, start=1):
             line = ','.join(row)
             if line in entries:
-                warning(f"Duplicate entry found in file {filepath} at line {line_number}: {line}")
+                warning(f"Duplicate found in {filepath} at line {line_number}: {line}, first instance seen at line {entries[line]}")
             else:
-                entries.add(line)
+                entries[line] = line_number
 
 # Check if abbreviation and full form are the same
 def check_full_form_identical_to_abbreviation(filepath):
@@ -59,7 +59,7 @@ def check_full_form_identical_to_abbreviation(filepath):
         reader = csv.reader(f)
         for line_number, row in enumerate(reader, start=1):
             if len(row) == 2 and row[0].strip() == row[1].strip():
-                warning(f"Abbreviation is the same as full form in file {filepath} at line {line_number}: {row[0]}")
+                warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}")
 
 # Check for outdated abbreviations
 def check_outdated_abbreviations(filepath):
@@ -67,7 +67,7 @@ def check_outdated_abbreviations(filepath):
         reader = csv.reader(f)
         for line_number, row in enumerate(reader, start=1):
             if "Manage." in row and "Manag." not in row:
-                warning(f"Outdated abbreviation used in file {filepath} at line {line_number}: {','.join(row)}")
+                warning(f"Outdated abbreviation used in {filepath} at line {line_number}: {','.join(row)}")
 
 if __name__ == "__main__":
     if not os.path.exists(JOURNALS_FOLDER_PATH):
@@ -87,4 +87,10 @@ def check_outdated_abbreviations(filepath):
             check_full_form_identical_to_abbreviation(filepath)
             check_outdated_abbreviations(filepath)
     
-    print("Quality check completed.")
\ No newline at end of file
+    # Print all errors at the end
+    if errors:
+        for err in errors:
+            print(err)
+        sys.exit(1)
+    else:
+        print("Quality check completed with no errors.")
\ No newline at end of file

From 7229041327a370c1b2056fb6ddce2a5b32009e1e Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sat, 19 Oct 2024 23:15:44 +1100
Subject: [PATCH 04/23] Deploy checker on GitHub Action

- ignore single-word journal names whose abbreviation is the same as the full name
- generate error summary and deploy checker on GitHub Action
---
 .github/workflows/quality-check.yml | 39 +++++++++++++++++++++++++++++
 scripts/check_quality.py            | 26 ++++++++++++++-----
 2 files changed, 59 insertions(+), 6 deletions(-)
 create mode 100644 .github/workflows/quality-check.yml

diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml
new file mode 100644
index 0000000..0589b0a
--- /dev/null
+++ b/.github/workflows/quality-check.yml
@@ -0,0 +1,39 @@
+name: Quality Check
+
+on: [push, pull_request]
+
+jobs:
+  quality-check:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v2
+
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.x'
+
+    - name: Install dependencies
+      run: |
+        pip install -r requirements.txt
+
+    - name: Run Quality Check
+      run: |
+        python quality_checker.py
+      continue-on-error: true
+
+    - name: Upload Quality Check Summary
+      if: always()
+      uses: actions/upload-artifact@v2
+      with:
+        name: quality-check-summary
+        path: quality_check_summary.txt
+
+    - name: Add Quality Check Summary to Job
+      if: always()
+      run: |
+        echo "Generating GitHub Actions job summary..."
+        echo 'Quality Check Summary:' >> $GITHUB_STEP_SUMMARY
+        cat quality_check_summary.txt >> $GITHUB_STEP_SUMMARY
diff --git a/scripts/check_quality.py b/scripts/check_quality.py
index 1bc0a71..7be2fab 100644
--- a/scripts/check_quality.py
+++ b/scripts/check_quality.py
@@ -6,15 +6,16 @@
 
 # Path to the journals folder (change this path accordingly)
 JOURNALS_FOLDER_PATH = "./journals/"
-
+SUMMARY_FILE_PATH = "./check_quality_summary.txt"
 errors = []
+warnings = []
 # Error tracking
 def error(message):
     errors.append(f"ERROR: {message}")
 
 # Warning tracking
 def warning(message):
-    print(f"WARN: {message}")
+    warnings.append(f"WARN: {message}")
 
 # Check if non-UTF8 characters are present in the file
 def check_non_utf8_characters(filepath):
@@ -58,7 +59,7 @@ def check_full_form_identical_to_abbreviation(filepath):
     with open(filepath, 'r', encoding='utf-8') as f:
         reader = csv.reader(f)
         for line_number, row in enumerate(reader, start=1):
-            if len(row) == 2 and row[0].strip() == row[1].strip():
+            if len(row) == 2 and row[0].strip() == row[1].strip() and ' ' in row[0].strip():
                 warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}")
 
 # Check for outdated abbreviations
@@ -87,10 +88,23 @@ def check_outdated_abbreviations(filepath):
             check_full_form_identical_to_abbreviation(filepath)
             check_outdated_abbreviations(filepath)
     
-    # Print all errors at the end
+    # Write the summary to a file
+    with open(SUMMARY_FILE_PATH, 'w') as summary_file:
+        if errors or warnings:
+            summary_file.write("Quality Check Summary:\n")
+            if errors:
+                summary_file.write("\nErrors:\n")
+                for err in errors:
+                    summary_file.write(f"{err}\n")
+            if warnings:
+                summary_file.write("\nWarnings:\n")
+                for warn in warnings:
+                    summary_file.write(f"{warn}\n")
+        else:
+            summary_file.write("Quality check completed with no errors or warnings.\n")
+
+    # Print summary and set exit code
     if errors:
-        for err in errors:
-            print(err)
         sys.exit(1)
     else:
         print("Quality check completed with no errors.")
\ No newline at end of file

From 215ee88c794c5a44a8ba4c793e3d41695cdf7798 Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sat, 19 Oct 2024 23:29:45 +1100
Subject: [PATCH 05/23] solve deprecation

change upload-artifact@v2 to @v3
---
 .github/workflows/quality-check.yml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml
index 0589b0a..7437f1c 100644
--- a/.github/workflows/quality-check.yml
+++ b/.github/workflows/quality-check.yml
@@ -17,16 +17,16 @@ jobs:
 
     - name: Install dependencies
       run: |
-        pip install -r requirements.txt
+        pip install -r requirements.txt || true  # Only if you have dependencies listed here
 
     - name: Run Quality Check
       run: |
         python quality_checker.py
-      continue-on-error: true
+      continue-on-error: true  # Continue if there are warnings/errors, so we can log the output
 
     - name: Upload Quality Check Summary
       if: always()
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v3
       with:
         name: quality-check-summary
         path: quality_check_summary.txt
@@ -34,6 +34,5 @@ jobs:
     - name: Add Quality Check Summary to Job
       if: always()
       run: |
-        echo "Generating GitHub Actions job summary..."
-        echo 'Quality Check Summary:' >> $GITHUB_STEP_SUMMARY
+        echo "### Quality Check Summary" >> $GITHUB_STEP_SUMMARY
         cat quality_check_summary.txt >> $GITHUB_STEP_SUMMARY

From 7c2ca05f5872ee30ed3ed8f7f479331e599ac304 Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sat, 19 Oct 2024 23:35:32 +1100
Subject: [PATCH 06/23] solve file not found error

---
 .github/workflows/quality-check.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml
index 7437f1c..56ecf56 100644
--- a/.github/workflows/quality-check.yml
+++ b/.github/workflows/quality-check.yml
@@ -21,15 +21,15 @@ jobs:
 
     - name: Run Quality Check
       run: |
-        python quality_checker.py
+        python ./scripts/check_quality.py
       continue-on-error: true  # Continue if there are warnings/errors, so we can log the output
 
     - name: Upload Quality Check Summary
       if: always()
       uses: actions/upload-artifact@v3
       with:
-        name: quality-check-summary
-        path: quality_check_summary.txt
+        name: check-quality-summary
+        path: ./check_quality_summary.txt
 
     - name: Add Quality Check Summary to Job
       if: always()

From 6c02bba43aab58357a4e9dcc4130fa773fda354e Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sat, 19 Oct 2024 23:37:28 +1100
Subject: [PATCH 07/23] fix mismatched file name

fix mismatched quality check file name
---
 .github/workflows/quality-check.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml
index 56ecf56..2108c5b 100644
--- a/.github/workflows/quality-check.yml
+++ b/.github/workflows/quality-check.yml
@@ -35,4 +35,4 @@ jobs:
       if: always()
       run: |
         echo "### Quality Check Summary" >> $GITHUB_STEP_SUMMARY
-        cat quality_check_summary.txt >> $GITHUB_STEP_SUMMARY
+        cat check_quality_summary.txt >> $GITHUB_STEP_SUMMARY

From 9989eb73deeafacc08bb9a711d88564f1235b64f Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sun, 20 Oct 2024 00:31:28 +1100
Subject: [PATCH 08/23] format output structure

provide better visualisation of error/warning output
---
 .github/workflows/quality-check.yml |  4 --
 scripts/check_quality.py            | 75 +++++++++++++++++++++--------
 2 files changed, 55 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml
index 2108c5b..bd5193d 100644
--- a/.github/workflows/quality-check.yml
+++ b/.github/workflows/quality-check.yml
@@ -15,10 +15,6 @@ jobs:
       with:
         python-version: '3.x'
 
-    - name: Install dependencies
-      run: |
-        pip install -r requirements.txt || true  # Only if you have dependencies listed here
-
     - name: Run Quality Check
       run: |
         python ./scripts/check_quality.py
diff --git a/scripts/check_quality.py b/scripts/check_quality.py
index 7be2fab..6c5559e 100644
--- a/scripts/check_quality.py
+++ b/scripts/check_quality.py
@@ -9,13 +9,27 @@
 SUMMARY_FILE_PATH = "./check_quality_summary.txt"
 errors = []
 warnings = []
+
+# Error and Warning Counts
+error_counts = {
+    'ERROR Wrong Escape': 0,
+    'ERROR Wrong Starting Letter': 0,
+    'ERROR Non-UTF8': 0
+}
+warning_counts = {
+    'WARN Duplicate FullName/Abbreviation': 0,
+    'WARN Same Abbreviation as Full Name': 0,
+    'WARN Outdated Manage Abbreviation': 0
+}
 # Error tracking
-def error(message):
-    errors.append(f"ERROR: {message}")
+def error(message, error_type):
+    errors.append((error_type, f"ERROR: {message}"))
+    error_counts[error_type] += 1
 
 # Warning tracking
-def warning(message):
-    warnings.append(f"WARN: {message}")
+def warning(message, warning_type):
+    warnings.append((warning_type, f"WARN: {message}"))
+    warning_counts[warning_type] += 1
 
 # Check if non-UTF8 characters are present in the file
 def check_non_utf8_characters(filepath):
@@ -23,7 +37,7 @@ def check_non_utf8_characters(filepath):
         with open(filepath, 'r', encoding='utf-8') as f:
             f.read()
     except UnicodeDecodeError:
-        error(f"File {filepath} contains non-UTF8 characters")
+        error(f"File {filepath} contains non-UTF8 characters", 'ERROR Non-UTF8')
 
 # Check if there are wrong escape characters in abbreviation entries
 def check_wrong_escape(filepath):
@@ -32,7 +46,7 @@ def check_wrong_escape(filepath):
         for line_number, row in enumerate(reader, start=1):
             for field in row:
                 if re.search(r"[a-zA-Z]*\\[,\"]", field):
-                    error(f"Wrong escape character found in {filepath} at line {line_number}: {field}")
+                    error(f"Wrong escape character found in {filepath} at line {line_number}: {field}", 'ERROR Wrong Escape')
 
 # Check for wrong beginning letters in journal abbreviations
 def check_wrong_beginning_letters(filepath):
@@ -40,7 +54,7 @@ def check_wrong_beginning_letters(filepath):
         reader = csv.reader(f)
         for line_number, row in enumerate(reader, start=1):
             if row[0].startswith("\""):
-                error(f"Wrong beginning letter found in {filepath} at line {line_number}: {row[0]}")
+                error(f"Wrong beginning letter found in {filepath} at line {line_number}: {row[0]}", 'ERROR Wrong Starting Letter')
 
 # Check for duplicate entries
 def check_duplicates(filepath):
@@ -50,7 +64,7 @@ def check_duplicates(filepath):
         for line_number, row in enumerate(reader, start=1):
             line = ','.join(row)
             if line in entries:
-                warning(f"Duplicate found in {filepath} at line {line_number}: {line}, first instance seen at line {entries[line]}")
+                warning(f"Duplicate found in {filepath} at line {line_number}: {line}, first instance seen at line {entries[line]}", 'WARN Duplicate FullName/Abbreviation')
             else:
                 entries[line] = line_number
 
@@ -60,7 +74,7 @@ def check_full_form_identical_to_abbreviation(filepath):
         reader = csv.reader(f)
         for line_number, row in enumerate(reader, start=1):
             if len(row) == 2 and row[0].strip() == row[1].strip() and ' ' in row[0].strip():
-                warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}")
+                warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}", 'WARN Same Abbreviation as Full Name')
 
 # Check for outdated abbreviations
 def check_outdated_abbreviations(filepath):
@@ -68,7 +82,7 @@ def check_outdated_abbreviations(filepath):
         reader = csv.reader(f)
         for line_number, row in enumerate(reader, start=1):
             if "Manage." in row and "Manag." not in row:
-                warning(f"Outdated abbreviation used in {filepath} at line {line_number}: {','.join(row)}")
+                warning(f"Outdated abbreviation used in {filepath} at line {line_number}: {','.join(row)}", 'WARN Outdated Manage Abbreviation')
 
 if __name__ == "__main__":
     if not os.path.exists(JOURNALS_FOLDER_PATH):
@@ -89,19 +103,40 @@ def check_outdated_abbreviations(filepath):
             check_outdated_abbreviations(filepath)
     
     # Write the summary to a file
+    total_issues = sum(error_counts.values()) + sum(warning_counts.values())
     with open(SUMMARY_FILE_PATH, 'w') as summary_file:
+        # Write summary table with vertical headers
+        summary_file.write(f"Total: {total_issues}\n")
+        summary_file.write(f"ERROR Wrong Escape: {error_counts['ERROR Wrong Escape']}\n")
+        summary_file.write(f"ERROR Wrong Starting Letter: {error_counts['ERROR Wrong Starting Letter']}\n")
+        summary_file.write(f"ERROR Non-UTF8: {error_counts['ERROR Non-UTF8']}\n")
+        summary_file.write(f"WARN Duplicate FullName/Abbreviation: {warning_counts['WARN Duplicate FullName/Abbreviation']}\n")
+        summary_file.write(f"WARN Same Abbreviation as Full Name: {warning_counts['WARN Same Abbreviation as Full Name']}\n")
+        summary_file.write(f"WARN Outdated Manage Abbreviation: {warning_counts['WARN Outdated Manage Abbreviation']}\n")
+
+        # Write detailed errors and warnings
         if errors or warnings:
-            summary_file.write("Quality Check Summary:\n")
-            if errors:
-                summary_file.write("\nErrors:\n")
-                for err in errors:
-                    summary_file.write(f"{err}\n")
-            if warnings:
-                summary_file.write("\nWarnings:\n")
-                for warn in warnings:
-                    summary_file.write(f"{warn}\n")
+            summary_file.write("\nQuality Check Summary:\n")
+            for subtitle in [
+                'ERROR Wrong Escape', 
+                'ERROR Wrong Starting Letter', 
+                'ERROR Non-UTF8',
+                'WARN Duplicate FullName/Abbreviation',
+                'WARN Same Abbreviation as Full Name',
+                'WARN Outdated Manage Abbreviation'
+            ]:
+                # Write subtitle and corresponding messages
+                filtered_errors = [err for err_type, err in errors if err_type == subtitle]
+                filtered_warnings = [warn for warn_type, warn in warnings if warn_type == subtitle]
+                if filtered_errors or filtered_warnings:
+                    count = len(filtered_errors) + len(filtered_warnings)
+                    summary_file.write(f"\n{subtitle}: with {count} instances\n")
+                    for err in filtered_errors:
+                        summary_file.write(f"{err}\n")
+                    for warn in filtered_warnings:
+                        summary_file.write(f"{warn}\n")
         else:
-            summary_file.write("Quality check completed with no errors or warnings.\n")
+            summary_file.write("\nQuality check completed with no errors or warnings.\n")
 
     # Print summary and set exit code
     if errors:

From c4f615f82b2c827d7b148b68d60595f633fc3d7d Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sun, 20 Oct 2024 05:17:17 +1100
Subject: [PATCH 09/23] consolidate quality check functions

- enhance invalid escape character check
- group full name duplication and abbreviation duplication into the same warning
- ignore articles and prepositions when checking for wrong beginning letters
---
 scripts/check_quality.py | 69 ++++++++++++++++++++++++++++++++--------
 1 file changed, 55 insertions(+), 14 deletions(-)

diff --git a/scripts/check_quality.py b/scripts/check_quality.py
index 6c5559e..dea1557 100644
--- a/scripts/check_quality.py
+++ b/scripts/check_quality.py
@@ -35,38 +35,79 @@ def warning(message, warning_type):
 def check_non_utf8_characters(filepath):
     try:
         with open(filepath, 'r', encoding='utf-8') as f:
-            f.read()
-    except UnicodeDecodeError:
-        error(f"File {filepath} contains non-UTF8 characters", 'ERROR Non-UTF8')
+            for line_number, line in enumerate(f, start=1):
+                try:
+                    line.encode('utf-8')
+                except UnicodeEncodeError as e:
+                    error(f"Non-UTF8 character found in {filepath} at line {line_number}: {e}", 'ERROR Non-UTF8')
+    except UnicodeDecodeError as e:
+        error(f"File {filepath} contains non-UTF-8 characters: {e}", 'ERROR Non-UTF8')
 
 # Check if there are wrong escape characters in abbreviation entries
 def check_wrong_escape(filepath):
+    valid_escapes = {'\\', '\n', '\t', '\r', '\"'}
     with open(filepath, 'r', encoding='utf-8') as f:
         reader = csv.reader(f)
         for line_number, row in enumerate(reader, start=1):
             for field in row:
-                if re.search(r"[a-zA-Z]*\\[,\"]", field):
-                    error(f"Wrong escape character found in {filepath} at line {line_number}: {field}", 'ERROR Wrong Escape')
+                matches = re.findall(r"\\.", field)
+                for match in matches:
+                    if match not in valid_escapes:
+                        error(f"Wrong escape character found in {filepath} at line {line_number}: {field}", 'ERROR Wrong Escape')
 
 # Check for wrong beginning letters in journal abbreviations
 def check_wrong_beginning_letters(filepath):
+    # Words that are typically ignored when creating abbreviations
+    ignore_words = {'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'la', 'el', 'le', 'et'}
+    
     with open(filepath, 'r', encoding='utf-8') as f:
         reader = csv.reader(f)
         for line_number, row in enumerate(reader, start=1):
-            if row[0].startswith("\""):
-                error(f"Wrong beginning letter found in {filepath} at line {line_number}: {row[0]}", 'ERROR Wrong Starting Letter')
+            if len(row) < 2:  # Skip if row doesn't have both full name and abbreviation
+                continue
+                
+            full_name = row[0].strip()
+            abbreviation = row[1].strip()
+            
+            # Skip empty entries
+            if not full_name or not abbreviation:
+                continue
+            
+            # Get significant words from full name (ignoring articles, prepositions, etc.)
+            full_words = [word for word in full_name.split() 
+                         if word.lower() not in ignore_words]
+            abbrev_words = abbreviation.split()
+            
+            # Skip if either is empty after filtering
+            if not full_words or not abbrev_words:
+                continue
+            
+            # Check if abbreviation starts with the same letter as the first significant word
+            if not abbrev_words[0].lower().startswith(full_words[0][0].lower()):
+                error(f"Wrong beginning letter found in {filepath} at line {line_number} " f"Full: '{full_name}', Abbrev: '{abbreviation}')", 
+                      'ERROR Wrong Starting Letter')
+
+
 
 # Check for duplicate entries
 def check_duplicates(filepath):
-    entries = {}
+    full_name_entries = {}
+    abbreviation_entries = {}
     with open(filepath, 'r', encoding='utf-8') as f:
         reader = csv.reader(f)
         for line_number, row in enumerate(reader, start=1):
-            line = ','.join(row)
-            if line in entries:
-                warning(f"Duplicate found in {filepath} at line {line_number}: {line}, first instance seen at line {entries[line]}", 'WARN Duplicate FullName/Abbreviation')
+            if len(row) < 2:
+                continue
+
+            full_name = row[0].strip()
+            abbreviation = row[1].strip()
+            
+            # Check for duplicate full names or abbreviations
+            if full_name in full_name_entries or abbreviation in abbreviation_entries:
+                warning(f"Duplicate found in {filepath} at line {line_number}: Full Name: '{full_name}', Abbreviation: '{abbreviation}', first instance seen at line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}", 'WARN Duplicate FullName/Abbreviation')
             else:
-                entries[line] = line_number
+                full_name_entries[full_name] = line_number
+                abbreviation_entries[abbreviation] = line_number
 
 # Check if abbreviation and full form are the same
 def check_full_form_identical_to_abbreviation(filepath):
@@ -104,9 +145,9 @@ def check_outdated_abbreviations(filepath):
     
     # Write the summary to a file
     total_issues = sum(error_counts.values()) + sum(warning_counts.values())
-    with open(SUMMARY_FILE_PATH, 'w') as summary_file:
+    with open(SUMMARY_FILE_PATH, 'w', encoding='utf-8') as summary_file:
         # Write summary table with vertical headers
-        summary_file.write(f"Total: {total_issues}\n")
+        summary_file.write(f"Total vulnerabilities: {total_issues}\n")
         summary_file.write(f"ERROR Wrong Escape: {error_counts['ERROR Wrong Escape']}\n")
         summary_file.write(f"ERROR Wrong Starting Letter: {error_counts['ERROR Wrong Starting Letter']}\n")
         summary_file.write(f"ERROR Non-UTF8: {error_counts['ERROR Non-UTF8']}\n")

From 56110cbd76c0d66afd49c984f14cc2c34c20a1c8 Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sun, 20 Oct 2024 05:45:14 +1100
Subject: [PATCH 10/23] test GitHub Action

make quality check action exit with code 1 if errors are present
---
 .github/workflows/quality-check.yml |  9 +--
 scripts/check_quality.py            | 86 ++++++++++-------------------
 2 files changed, 33 insertions(+), 62 deletions(-)

diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml
index bd5193d..b654b6e 100644
--- a/.github/workflows/quality-check.yml
+++ b/.github/workflows/quality-check.yml
@@ -16,6 +16,7 @@ jobs:
         python-version: '3.x'
 
     - name: Run Quality Check
+      id: quality_check
       run: |
         python ./scripts/check_quality.py
       continue-on-error: true  # Continue if there are warnings/errors, so we can log the output
@@ -27,8 +28,8 @@ jobs:
         name: check-quality-summary
         path: ./check_quality_summary.txt
 
-    - name: Add Quality Check Summary to Job
-      if: always()
+    - name: Fail on Errors
+      if: steps.quality_check.outcome == 'failure'
       run: |
-        echo "### Quality Check Summary" >> $GITHUB_STEP_SUMMARY
-        cat check_quality_summary.txt >> $GITHUB_STEP_SUMMARY
+        echo "Quality check failed due to errors."
+        exit 1
diff --git a/scripts/check_quality.py b/scripts/check_quality.py
index dea1557..01c7159 100644
--- a/scripts/check_quality.py
+++ b/scripts/check_quality.py
@@ -57,36 +57,11 @@ def check_wrong_escape(filepath):
 
 # Check for wrong beginning letters in journal abbreviations
 def check_wrong_beginning_letters(filepath):
-    # Words that are typically ignored when creating abbreviations
-    ignore_words = {'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'la', 'el', 'le', 'et'}
-    
     with open(filepath, 'r', encoding='utf-8') as f:
         reader = csv.reader(f)
         for line_number, row in enumerate(reader, start=1):
-            if len(row) < 2:  # Skip if row doesn't have both full name and abbreviation
-                continue
-                
-            full_name = row[0].strip()
-            abbreviation = row[1].strip()
-            
-            # Skip empty entries
-            if not full_name or not abbreviation:
-                continue
-            
-            # Get significant words from full name (ignoring articles, prepositions, etc.)
-            full_words = [word for word in full_name.split() 
-                         if word.lower() not in ignore_words]
-            abbrev_words = abbreviation.split()
-            
-            # Skip if either is empty after filtering
-            if not full_words or not abbrev_words:
-                continue
-            
-            # Check if abbreviation starts with the same letter as the first significant word
-            if not abbrev_words[0].lower().startswith(full_words[0][0].lower()):
-                error(f"Wrong beginning letter found in {filepath} at line {line_number} " f"Full: '{full_name}', Abbrev: '{abbreviation}')", 
-                      'ERROR Wrong Starting Letter')
-
+            if row[0].startswith("\""):
+                error(f"Wrong beginning letter found in {filepath} at line {line_number}: {row[0]}", 'ERROR Wrong Starting Letter')
 
 
 # Check for duplicate entries
@@ -146,41 +121,36 @@ def check_outdated_abbreviations(filepath):
     # Write the summary to a file
     total_issues = sum(error_counts.values()) + sum(warning_counts.values())
     with open(SUMMARY_FILE_PATH, 'w', encoding='utf-8') as summary_file:
-        # Write summary table with vertical headers
-        summary_file.write(f"Total vulnerabilities: {total_issues}\n")
-        summary_file.write(f"ERROR Wrong Escape: {error_counts['ERROR Wrong Escape']}\n")
-        summary_file.write(f"ERROR Wrong Starting Letter: {error_counts['ERROR Wrong Starting Letter']}\n")
-        summary_file.write(f"ERROR Non-UTF8: {error_counts['ERROR Non-UTF8']}\n")
-        summary_file.write(f"WARN Duplicate FullName/Abbreviation: {warning_counts['WARN Duplicate FullName/Abbreviation']}\n")
-        summary_file.write(f"WARN Same Abbreviation as Full Name: {warning_counts['WARN Same Abbreviation as Full Name']}\n")
-        summary_file.write(f"WARN Outdated Manage Abbreviation: {warning_counts['WARN Outdated Manage Abbreviation']}\n")
+        # Write summary table with visual symbols
+        summary_file.write("# Quality Check Summary Report\n\n")
+        summary_file.write("| Status        | Count |\n")
+        summary_file.write("| ------------- | ----- |\n")
+        summary_file.write(f"| 🔍 Total Issues      | {total_issues}   |\n")
+        summary_file.write(f"| ❌ Errors Found      | {sum(error_counts.values())}    |\n")
+        summary_file.write(f"| ⚠️ Warnings Found    | {sum(warning_counts.values())}   |\n\n")
 
         # Write detailed errors and warnings
         if errors or warnings:
-            summary_file.write("\nQuality Check Summary:\n")
-            for subtitle in [
-                'ERROR Wrong Escape', 
-                'ERROR Wrong Starting Letter', 
-                'ERROR Non-UTF8',
-                'WARN Duplicate FullName/Abbreviation',
-                'WARN Same Abbreviation as Full Name',
-                'WARN Outdated Manage Abbreviation'
-            ]:
-                # Write subtitle and corresponding messages
-                filtered_errors = [err for err_type, err in errors if err_type == subtitle]
-                filtered_warnings = [warn for warn_type, warn in warnings if warn_type == subtitle]
-                if filtered_errors or filtered_warnings:
-                    count = len(filtered_errors) + len(filtered_warnings)
-                    summary_file.write(f"\n{subtitle}: with {count} instances\n")
-                    for err in filtered_errors:
-                        summary_file.write(f"{err}\n")
-                    for warn in filtered_warnings:
-                        summary_file.write(f"{warn}\n")
+            summary_file.write("## Errors per Input File\n\n")
+            files = set([msg.split(' in ')[1].split(' at ')[0] for _, msg in errors + warnings])
+            for file in files:
+                summary_file.write(f"### Issues in file `{file}`\n")
+                file_errors = [msg for err_type, msg in errors if file in msg]
+                file_warnings = [msg for warn_type, msg in warnings if file in msg]
+                if file_errors:
+                    summary_file.write("#### Errors:\n")
+                    for err in file_errors:
+                        summary_file.write(f"- {err.split('ERROR: ')[1]}\n")
+                if file_warnings:
+                    summary_file.write("#### Warnings:\n")
+                    for warn in file_warnings:
+                        summary_file.write(f"- {warn.split('WARN: ')[1]}\n")
+                summary_file.write("\n")
         else:
-            summary_file.write("\nQuality check completed with no errors or warnings.\n")
+            summary_file.write("Quality check completed with no errors or warnings.\n")
 
     # Print summary and set exit code
-    if errors:
-        sys.exit(1)
+    if sum(error_counts.values()) > 0:
+        sys.exit(1)  # Fail with an exit code if errors are found
     else:
-        print("Quality check completed with no errors.")
\ No newline at end of file
+        sys.exit(0)  # Exit successfully if no errors
\ No newline at end of file

From ba6169f6d3ee87d408f747044ba7307d95689fc5 Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sun, 20 Oct 2024 05:54:45 +1100
Subject: [PATCH 11/23] print error messages on GitHub Action

print out error and warning messages on GitHub Action under quality check
---
 .github/workflows/quality-check.yml |  4 +-
 scripts/check_quality.py            | 94 +++++++++++++++++++----------
 2 files changed, 66 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml
index b654b6e..7606113 100644
--- a/.github/workflows/quality-check.yml
+++ b/.github/workflows/quality-check.yml
@@ -27,7 +27,9 @@ jobs:
       with:
         name: check-quality-summary
         path: ./check_quality_summary.txt
-
+    - name: Print Errors and Warnings Summary
+      if: failure()
+      run: cat check_quality_summary.txt
     - name: Fail on Errors
       if: steps.quality_check.outcome == 'failure'
       run: |
diff --git a/scripts/check_quality.py b/scripts/check_quality.py
index 01c7159..bbbf940 100644
--- a/scripts/check_quality.py
+++ b/scripts/check_quality.py
@@ -57,11 +57,35 @@ def check_wrong_escape(filepath):
 
 # Check for wrong beginning letters in journal abbreviations
 def check_wrong_beginning_letters(filepath):
+    # Words that are typically ignored when creating abbreviations
+    ignore_words = {'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'la', 'el', 'le', 'et'}
+    
     with open(filepath, 'r', encoding='utf-8') as f:
         reader = csv.reader(f)
         for line_number, row in enumerate(reader, start=1):
-            if row[0].startswith("\""):
-                error(f"Wrong beginning letter found in {filepath} at line {line_number}: {row[0]}", 'ERROR Wrong Starting Letter')
+            if len(row) < 2:  # Skip if row doesn't have both full name and abbreviation
+                continue
+                
+            full_name = row[0].strip()
+            abbreviation = row[1].strip()
+            
+            # Skip empty entries
+            if not full_name or not abbreviation:
+                continue
+            
+            # Get significant words from full name (ignoring articles, prepositions, etc.)
+            full_words = [word for word in full_name.split() 
+                         if word.lower() not in ignore_words]
+            abbrev_words = abbreviation.split()
+            
+            # Skip if either is empty after filtering
+            if not full_words or not abbrev_words:
+                continue
+            
+            # Check if abbreviation starts with the same letter as the first significant word
+            if not abbrev_words[0].lower().startswith(full_words[0][0].lower()):
+                error(f"Wrong beginning letter found in {filepath} at line {line_number} " f"Full: '{full_name}', Abbrev: '{abbreviation}')", 
+                      'ERROR Wrong Starting Letter')
 
 
 # Check for duplicate entries
@@ -120,36 +144,44 @@ def check_outdated_abbreviations(filepath):
     
     # Write the summary to a file
     total_issues = sum(error_counts.values()) + sum(warning_counts.values())
+    summary_output = []
+
+    summary_output.append("# Quality Check Summary Report\n")
+    summary_output.append("| Status        | Count |\n")
+    summary_output.append("| ------------- | ----- |\n")
+    summary_output.append(f"| 🔍 Total Issues      | {total_issues}   |\n")
+    summary_output.append(f"| ❌ Errors Found      | {sum(error_counts.values())}    |\n")
+    summary_output.append(f"| ⚠️ Warnings Found    | {sum(warning_counts.values())}   |\n\n")
+
+    # Write detailed errors and warnings
+    if errors or warnings:
+        summary_output.append("## Errors per Input File\n\n")
+        files = set([msg.split(' in ')[1].split(' at ')[0] for _, msg in errors + warnings])
+        for file in files:
+            summary_output.append(f"### Issues in file `{file}`\n")
+            file_errors = [msg for err_type, msg in errors if file in msg]
+            file_warnings = [msg for warn_type, msg in warnings if file in msg]
+            if file_errors:
+                summary_output.append("#### Errors:\n")
+                for err in file_errors:
+                    summary_output.append(f"- {err.split('ERROR: ')[1]}\n")
+            if file_warnings:
+                summary_output.append("#### Warnings:\n")
+                for warn in file_warnings:
+                    summary_output.append(f"- {warn.split('WARN: ')[1]}\n")
+            summary_output.append("\n")
+    else:
+        summary_output.append("Quality check completed with no errors or warnings.\n")
+
+    # Write to summary file
     with open(SUMMARY_FILE_PATH, 'w', encoding='utf-8') as summary_file:
-        # Write summary table with visual symbols
-        summary_file.write("# Quality Check Summary Report\n\n")
-        summary_file.write("| Status        | Count |\n")
-        summary_file.write("| ------------- | ----- |\n")
-        summary_file.write(f"| 🔍 Total Issues      | {total_issues}   |\n")
-        summary_file.write(f"| ❌ Errors Found      | {sum(error_counts.values())}    |\n")
-        summary_file.write(f"| ⚠️ Warnings Found    | {sum(warning_counts.values())}   |\n\n")
-
-        # Write detailed errors and warnings
-        if errors or warnings:
-            summary_file.write("## Errors per Input File\n\n")
-            files = set([msg.split(' in ')[1].split(' at ')[0] for _, msg in errors + warnings])
-            for file in files:
-                summary_file.write(f"### Issues in file `{file}`\n")
-                file_errors = [msg for err_type, msg in errors if file in msg]
-                file_warnings = [msg for warn_type, msg in warnings if file in msg]
-                if file_errors:
-                    summary_file.write("#### Errors:\n")
-                    for err in file_errors:
-                        summary_file.write(f"- {err.split('ERROR: ')[1]}\n")
-                if file_warnings:
-                    summary_file.write("#### Warnings:\n")
-                    for warn in file_warnings:
-                        summary_file.write(f"- {warn.split('WARN: ')[1]}\n")
-                summary_file.write("\n")
-        else:
-            summary_file.write("Quality check completed with no errors or warnings.\n")
-
-    # Print summary and set exit code
+        summary_file.writelines(summary_output)
+
+    # Print the summary to console
+    for line in summary_output:
+        print(line, end='')
+
+    # Set exit code based on errors
     if sum(error_counts.values()) > 0:
         sys.exit(1)  # Fail with an exit code if errors are found
     else:

From 8918f171747c30f5a0ba2499b8f45913f6291791 Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sun, 20 Oct 2024 05:56:58 +1100
Subject: [PATCH 12/23] fixed path for check_quality_summary.txt

---
 .github/workflows/quality-check.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml
index 7606113..b43f9d0 100644
--- a/.github/workflows/quality-check.yml
+++ b/.github/workflows/quality-check.yml
@@ -27,9 +27,11 @@ jobs:
       with:
         name: check-quality-summary
         path: ./check_quality_summary.txt
+
     - name: Print Errors and Warnings Summary
       if: failure()
-      run: cat check_quality_summary.txt
+      run: cat ./check_quality_summary.txt
+      
     - name: Fail on Errors
       if: steps.quality_check.outcome == 'failure'
       run: |

From 238cfd6c3449a329c826f52fac41ebb8f0d2fbaf Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sun, 20 Oct 2024 06:04:36 +1100
Subject: [PATCH 13/23] test force print error summary

---
 .github/workflows/quality-check.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml
index b43f9d0..7114560 100644
--- a/.github/workflows/quality-check.yml
+++ b/.github/workflows/quality-check.yml
@@ -29,9 +29,9 @@ jobs:
         path: ./check_quality_summary.txt
 
     - name: Print Errors and Warnings Summary
-      if: failure()
+      if: always()
       run: cat ./check_quality_summary.txt
-      
+
     - name: Fail on Errors
       if: steps.quality_check.outcome == 'failure'
       run: |

From 9cc5622e8145dbda2baed34652fbc9267be374ed Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sun, 20 Oct 2024 06:09:10 +1100
Subject: [PATCH 14/23] remove force print error summary

Removed the forced printing of the error summary in quality-check.yml because the default GitHub Actions console could not accommodate the size of the error/warning summary. A partial error message can still be seen in the Run Quality Check step.
---
 .github/workflows/quality-check.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml
index 7114560..b654b6e 100644
--- a/.github/workflows/quality-check.yml
+++ b/.github/workflows/quality-check.yml
@@ -28,10 +28,6 @@ jobs:
         name: check-quality-summary
         path: ./check_quality_summary.txt
 
-    - name: Print Errors and Warnings Summary
-      if: always()
-      run: cat ./check_quality_summary.txt
-
     - name: Fail on Errors
       if: steps.quality_check.outcome == 'failure'
       run: |

From f2970a4c6f73e23ba4323319f867de1cb22d9ba0 Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sun, 20 Oct 2024 06:26:01 +1100
Subject: [PATCH 15/23] Delete quality_checker.py

deleted a redundant quality checker
---
 scripts/quality_checker.py | 100 -------------------------------------
 1 file changed, 100 deletions(-)
 delete mode 100644 scripts/quality_checker.py

diff --git a/scripts/quality_checker.py b/scripts/quality_checker.py
deleted file mode 100644
index e433b30..0000000
--- a/scripts/quality_checker.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Python script for checking multiple quality aspects of .csv journal abbreviation files.
-This script enforces conventions to ensure that abbreviations of journal titles meet quality standards.
-
-The script performs the following checks:
-1. Checks for wrong escape sequences.
-2. Checks for incorrect beginning letters.
-3. Checks for non-UTF8 characters.
-4. Checks for duplicate entries (full names and abbreviations).
-5. Checks if abbreviations match full names (for one-word titles).
-6. Checks for outdated abbreviations.
-
-The script will print out issues found and exit with a failure code if any issues are detected.
-The script does NOT automatically fix these errors. Corrections must be done manually.
-
-The script will automatically run whenever there is a push to the main branch of the
-abbreviations repo (abbrv.jabref.org) using GitHub Actions.
-"""
-
-import os
-import itertools
-import csv
-import re
-import sys
-
-# Define paths and file collections
-PATH_TO_JOURNALS = "./journals/"
-fileNames = next(itertools.islice(os.walk(PATH_TO_JOURNALS), 0, None))[2]
-
-# Error collections
-errors = []
-
-# Utility functions for checking conditions
-def is_utf8(text):
-    try:
-        text.encode('utf-8')
-        return True
-    except UnicodeEncodeError:
-        return False
-
-def check_abbreviation_duplicates(full_name, abbrev, seen_full_names, seen_abbrevs):
-    if full_name in seen_full_names or abbrev in seen_abbrevs:
-        return True
-    return False
-
-def is_outdated_abbreviation(abbrev):
-    # Add a basic rule to detect outdated abbreviations (e.g., "Manage." instead of "Manag.")
-    outdated_patterns = [r"Manage\.\b"]
-    for pattern in outdated_patterns:
-        if re.search(pattern, abbrev):
-            return True
-    return False
-
-# Perform checks
-for file in fileNames:
-    if file.endswith(".csv"):
-        with open(PATH_TO_JOURNALS + file, mode='r', encoding='utf-8') as f:
-            reader = csv.reader(f)
-            seen_full_names = set()
-            seen_abbrevs = set()
-
-            for row_index, row in enumerate(reader, start=1):
-                if len(row) < 2:
-                    continue  # Skip rows without expected data
-                
-                full_name, abbrev = row[0], row[1]
-
-                # Check for escaped ampersands
-                if '\\&' in full_name or '\\&' in abbrev:
-                    errors.append(f"Escaped ampersand in file {file}, row {row_index}")
-
-                # Check for non-UTF8 characters
-                if not is_utf8(full_name) or not is_utf8(abbrev):
-                    errors.append(f"Non-UTF8 character in file {file}, row {row_index}")
-
-                # Check for duplicate entries
-                if check_abbreviation_duplicates(full_name, abbrev, seen_full_names, seen_abbrevs):
-                    errors.append(f"Duplicate entry in file {file}, row {row_index}")
-
-                # Check if abbreviation matches full form for one-word titles
-                if full_name.strip().lower() == abbrev.strip().lower():
-                    errors.append(f"Full form matches abbreviation in file {file}, row {row_index}")
-
-                # Check for outdated abbreviations
-                if is_outdated_abbreviation(abbrev):
-                    errors.append(f"Outdated abbreviation in file {file}, row {row_index}")
-
-                # Update seen sets
-                seen_full_names.add(full_name)
-                seen_abbrevs.add(abbrev)
-
-# Print errors and exit with failure code if any issues found
-if errors:
-    error_message = "Quality check failed:\n" + "\n".join(errors)
-    print(error_message, file=sys.stderr)
-    sys.exit(1)
-else:
-    print("Quality check passed. No issues found.")

From 34577b5c880a7b03edada84b88a10dca4c9a7c5f Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sun, 20 Oct 2024 17:55:43 +1100
Subject: [PATCH 16/23] refine check_wrong_beginning_letters function

The function now considers an abbreviation valid if it is similar to the full name, even when it does not strictly start with the same letters as the full name.
---
 scripts/check_quality.py | 125 ++++++++++++++++++++++++++++++++-------
 1 file changed, 102 insertions(+), 23 deletions(-)

diff --git a/scripts/check_quality.py b/scripts/check_quality.py
index bbbf940..c7839de 100644
--- a/scripts/check_quality.py
+++ b/scripts/check_quality.py
@@ -58,34 +58,113 @@ def check_wrong_escape(filepath):
 # Check for wrong beginning letters in journal abbreviations
 def check_wrong_beginning_letters(filepath):
     # Words that are typically ignored when creating abbreviations
-    ignore_words = {'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'la', 'el', 'le', 'et'}
+    ignore_words = {
+        'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 
+        'la', 'el', 'le', 'et', 'der', 'die', 'das', 'dem', 'und', 'für'  # Articles in multiple languages
+    }
     
+    # Special cases for abbreviations
+    special_cases = {
+        'proceedings': ['p', 'proc'],
+        'or': ['or'],
+        'spie': ['spie'],
+        'notes': ['notes']
+    }
+
+    def clean_text(text):
+        # Remove special characters except periods (important for compound abbreviations)
+        # and normalize spaces
+        cleaned = re.sub(r'[^\w\s\.]', ' ', text)
+        return ' '.join(filter(None, cleaned.lower().split()))
+
+    def split_compound_abbrev(abbrev):
+        # Split abbreviation that might contain compound parts (e.g., "Nat.forsch")
+        parts = []
+        for part in abbrev.split():
+            # Split on periods but keep them with the preceding part
+            subparts = [sp for sp in re.split(r'(?<=\.)(?=[^\.])', part) if sp]
+            parts.extend(subparts)
+        return parts
+
+    def get_significant_words(text):
+        # Split text into words and filter out ignore words
+        return [w for w in clean_text(text).split() if w.lower() not in ignore_words]
+
+    def is_compound_word_match(full_word, abbrev_part):
+        # Handle compound word abbreviations (e.g., "Nat.forsch" matching "Naturforschenden")
+        if '.' in abbrev_part:
+            # Split the compound abbreviation
+            abbrev_subparts = abbrev_part.split('.')
+            # Get the first few characters of the full word to match against first part
+            word_start = full_word[:len(abbrev_subparts[0])]
+            
+            # For the second part (if exists), try to find it within the remaining word
+            if len(abbrev_subparts) > 1 and abbrev_subparts[1]:
+                remaining_word = full_word[len(abbrev_subparts[0]):]
+                return (word_start.lower() == abbrev_subparts[0].lower() and 
+                       abbrev_subparts[1].lower() in remaining_word.lower())
+            
+            return word_start.lower() == abbrev_subparts[0].lower()
+        return False
+
+    def is_valid_abbreviation(full_name, abbrev):
+        # Clean and split both strings
+        full_words = get_significant_words(full_name)
+        abbrev_parts = split_compound_abbrev(clean_text(abbrev))
+        
+        # Handle cases where abbreviation is the same as full name
+        if clean_text(full_name) == clean_text(abbrev):
+            return True
+
+        # Handle special cases
+        for special_word, valid_abbrevs in special_cases.items():
+            if special_word in full_words:
+                if any(va in abbrev_parts for va in valid_abbrevs):
+                    return True
+
+        # Track matched parts and their positions
+        matched_parts = 0
+        used_full_words = set()
+        
+        for abbrev_part in abbrev_parts:
+            found_match = False
+            
+            # Try matching against each full word
+            for i, full_word in enumerate(full_words):
+                if i in used_full_words:
+                    continue
+                
+                # Check for compound word match
+                if is_compound_word_match(full_word, abbrev_part):
+                    found_match = True
+                    matched_parts += 1
+                    used_full_words.add(i)
+                    break
+                
+                # Check for regular abbreviation patterns
+                elif (full_word.lower().startswith(abbrev_part.lower()) or
+                      (len(abbrev_part) >= 2 and abbrev_part[0].lower() == full_word[0].lower())):
+                    found_match = True
+                    matched_parts += 1
+                    used_full_words.add(i)
+                    break
+
+        # Consider the abbreviation valid if we matched most parts
+        min_required_matches = max(1, len(abbrev_parts) * 0.5)
+        return matched_parts >= min_required_matches
+
     with open(filepath, 'r', encoding='utf-8') as f:
         reader = csv.reader(f)
         for line_number, row in enumerate(reader, start=1):
-            if len(row) < 2:  # Skip if row doesn't have both full name and abbreviation
-                continue
+            if len(row) >= 2:
+                full_name = row[0].strip()
+                abbreviation = row[1].strip()
                 
-            full_name = row[0].strip()
-            abbreviation = row[1].strip()
-            
-            # Skip empty entries
-            if not full_name or not abbreviation:
-                continue
-            
-            # Get significant words from full name (ignoring articles, prepositions, etc.)
-            full_words = [word for word in full_name.split() 
-                         if word.lower() not in ignore_words]
-            abbrev_words = abbreviation.split()
-            
-            # Skip if either is empty after filtering
-            if not full_words or not abbrev_words:
-                continue
-            
-            # Check if abbreviation starts with the same letter as the first significant word
-            if not abbrev_words[0].lower().startswith(full_words[0][0].lower()):
-                error(f"Wrong beginning letter found in {filepath} at line {line_number} " f"Full: '{full_name}', Abbrev: '{abbreviation}')", 
-                      'ERROR Wrong Starting Letter')
+                if not is_valid_abbreviation(full_name, abbreviation):
+                    error(
+                        f"Abbrev mismatch full name pattern in {filepath} at line {line_number}:" f"Full: '{full_name}', " f"Abbrev: '{abbreviation}'",
+                        'ERROR Wrong Starting Letter'
+                    )
 
 
 # Check for duplicate entries

From c335da0d03b86413e00641a774e98a218ad1a303 Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sun, 27 Oct 2024 04:00:02 +1100
Subject: [PATCH 17/23] Improve quality checker efficiency

Content of each abbreviation csv is now loaded into memory once and used by check functions, instead of being read multiple times upon function calls
---
 scripts/check_quality.py | 135 ++++++++++++++++++++-------------------
 1 file changed, 71 insertions(+), 64 deletions(-)

diff --git a/scripts/check_quality.py b/scripts/check_quality.py
index c7839de..f271196 100644
--- a/scripts/check_quality.py
+++ b/scripts/check_quality.py
@@ -21,6 +21,7 @@
     'WARN Same Abbreviation as Full Name': 0,
     'WARN Outdated Manage Abbreviation': 0
 }
+
 # Error tracking
 def error(message, error_type):
     errors.append((error_type, f"ERROR: {message}"))
@@ -31,32 +32,44 @@ def warning(message, warning_type):
     warnings.append((warning_type, f"WARN: {message}"))
     warning_counts[warning_type] += 1
 
-# Check if non-UTF8 characters are present in the file
-def check_non_utf8_characters(filepath):
+# Perform all checks on the file's content
+def perform_checks(filepath, rows):
+    check_non_utf8_characters(filepath, rows)
+    check_wrong_escape(filepath, rows)
+    check_wrong_beginning_letters(filepath, rows)
+    check_duplicates(filepath, rows)
+    check_full_form_identical_to_abbreviation(filepath, rows)
+    check_outdated_abbreviations(filepath, rows)
+
+# Load the content of a CSV file into memory once
+def load_csv_content(filepath):
     try:
         with open(filepath, 'r', encoding='utf-8') as f:
-            for line_number, line in enumerate(f, start=1):
-                try:
-                    line.encode('utf-8')
-                except UnicodeEncodeError as e:
-                    error(f"Non-UTF8 character found in {filepath} at line {line_number}: {e}", 'ERROR Non-UTF8')
+            return list(csv.reader(f))
     except UnicodeDecodeError as e:
         error(f"File {filepath} contains non-UTF-8 characters: {e}", 'ERROR Non-UTF8')
+        return []
+
+# Check if non-UTF8 characters are present in the file
+def check_non_utf8_characters(filepath, rows):
+    for line_number, row in enumerate(rows, start=1):
+        try:
+            str(row).encode('utf-8')
+        except UnicodeEncodeError as e:
+            error(f"Non-UTF8 character found in {filepath} at line {line_number}: {e}", 'ERROR Non-UTF8')
 
 # Check if there are wrong escape characters in abbreviation entries
-def check_wrong_escape(filepath):
+def check_wrong_escape(filepath, rows):
     valid_escapes = {'\\', '\n', '\t', '\r', '\"'}
-    with open(filepath, 'r', encoding='utf-8') as f:
-        reader = csv.reader(f)
-        for line_number, row in enumerate(reader, start=1):
-            for field in row:
-                matches = re.findall(r"\\.", field)
-                for match in matches:
-                    if match not in valid_escapes:
-                        error(f"Wrong escape character found in {filepath} at line {line_number}: {field}", 'ERROR Wrong Escape')
+    for line_number, row in enumerate(rows, start=1):
+        for field in row:
+            matches = re.findall(r"\\.", field)
+            for match in matches:
+                if match not in valid_escapes:
+                    error(f"Wrong escape character found in {filepath} at line {line_number}: {field}", 'ERROR Wrong Escape')
 
 # Check for wrong beginning letters in journal abbreviations
-def check_wrong_beginning_letters(filepath):
+def check_wrong_beginning_letters(filepath, rows):
     # Words that are typically ignored when creating abbreviations
     ignore_words = {
         'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 
@@ -153,73 +166,67 @@ def is_valid_abbreviation(full_name, abbrev):
         min_required_matches = max(1, len(abbrev_parts) * 0.5)
         return matched_parts >= min_required_matches
 
-    with open(filepath, 'r', encoding='utf-8') as f:
-        reader = csv.reader(f)
-        for line_number, row in enumerate(reader, start=1):
-            if len(row) >= 2:
-                full_name = row[0].strip()
-                abbreviation = row[1].strip()
-                
-                if not is_valid_abbreviation(full_name, abbreviation):
-                    error(
-                        f"Abbrev mismatch full name pattern in {filepath} at line {line_number}:" f"Full: '{full_name}', " f"Abbrev: '{abbreviation}'",
-                        'ERROR Wrong Starting Letter'
-                    )
+    for line_number, row in enumerate(rows, start=1):
+        if len(row) >= 2:
+            full_name = row[0].strip()
+            abbreviation = row[1].strip()
+            
+            if not is_valid_abbreviation(full_name, abbreviation):
+                error(
+                    f"Abbrev mismatch full name pattern in {filepath} at line {line_number}:"
+                    f"\nFull: '{full_name}',"
+                    f"\nAbbrev: '{abbreviation}'",
+                    'ERROR Wrong Starting Letter')
 
 
 # Check for duplicate entries
-def check_duplicates(filepath):
+def check_duplicates(filepath, rows):
     full_name_entries = {}
     abbreviation_entries = {}
-    with open(filepath, 'r', encoding='utf-8') as f:
-        reader = csv.reader(f)
-        for line_number, row in enumerate(reader, start=1):
-            if len(row) < 2:
-                continue
 
-            full_name = row[0].strip()
-            abbreviation = row[1].strip()
-            
-            # Check for duplicate full names or abbreviations
-            if full_name in full_name_entries or abbreviation in abbreviation_entries:
-                warning(f"Duplicate found in {filepath} at line {line_number}: Full Name: '{full_name}', Abbreviation: '{abbreviation}', first instance seen at line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}", 'WARN Duplicate FullName/Abbreviation')
-            else:
-                full_name_entries[full_name] = line_number
-                abbreviation_entries[abbreviation] = line_number
+    for line_number, row in enumerate(rows, start=1):
+        if len(row) < 2:
+            continue
+
+        full_name = row[0].strip()
+        abbreviation = row[1].strip()
+        
+        # Check for duplicate full names or abbreviations
+        if full_name in full_name_entries or abbreviation in abbreviation_entries:
+            warning(f"Duplicate found in {filepath} at line {line_number}: Full Name: '{full_name}', Abbreviation: '{abbreviation}', first instance seen at line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}", 'WARN Duplicate FullName/Abbreviation')
+        else:
+            full_name_entries[full_name] = line_number
+            abbreviation_entries[abbreviation] = line_number
 
 # Check if abbreviation and full form are the same
-def check_full_form_identical_to_abbreviation(filepath):
-    with open(filepath, 'r', encoding='utf-8') as f:
-        reader = csv.reader(f)
-        for line_number, row in enumerate(reader, start=1):
-            if len(row) == 2 and row[0].strip() == row[1].strip() and ' ' in row[0].strip():
-                warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}", 'WARN Same Abbreviation as Full Name')
+def check_full_form_identical_to_abbreviation(filepath, rows):
+    for line_number, row in enumerate(rows, start=1):
+        if len(row) == 2 and row[0].strip() == row[1].strip() and ' ' in row[0].strip():
+            warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}", 'WARN Same Abbreviation as Full Name')
 
 # Check for outdated abbreviations
-def check_outdated_abbreviations(filepath):
-    with open(filepath, 'r', encoding='utf-8') as f:
-        reader = csv.reader(f)
-        for line_number, row in enumerate(reader, start=1):
-            if "Manage." in row and "Manag." not in row:
-                warning(f"Outdated abbreviation used in {filepath} at line {line_number}: {','.join(row)}", 'WARN Outdated Manage Abbreviation')
+def check_outdated_abbreviations(filepath, rows):
+    for line_number, row in enumerate(rows, start=1):
+        if "Manage." in row and "Manag." not in row:
+            warning(f"Outdated abbreviation used in {filepath} at line {line_number}: {','.join(row)}", 'WARN Outdated Manage Abbreviation')
 
+# Main entry point
 if __name__ == "__main__":
     if not os.path.exists(JOURNALS_FOLDER_PATH):
         print("Journals folder not found. Please make sure the path is correct.")
         sys.exit(1)
-    
+
     # Iterate through all CSV files in the journals folder
     for filename in os.listdir(JOURNALS_FOLDER_PATH):
         if filename.endswith(".csv"):
             filepath = os.path.join(JOURNALS_FOLDER_PATH, filename)
             
-            # Run the checks
-            check_non_utf8_characters(filepath)
-            check_wrong_escape(filepath)
-            check_wrong_beginning_letters(filepath)
-            check_duplicates(filepath)
-            check_full_form_identical_to_abbreviation(filepath)
-            check_outdated_abbreviations(filepath)
+            # Load the CSV content once
+            rows = load_csv_content(filepath)
+
+            # Run all checks on the loaded data
+            if rows:
+                perform_checks(filepath, rows)
     
     # Write the summary to a file
     total_issues = sum(error_counts.values()) + sum(warning_counts.values())

From 66de966e79b78a2656123d6791b949aa2f20e46e Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sun, 27 Oct 2024 04:12:05 +1100
Subject: [PATCH 18/23] Write Summary to GitHub Action

Attempt to write issue report to GITHUB_STEP_SUMMARY
---
 .github/workflows/quality-check.yml | 11 ++++-------
 scripts/check_quality.py            | 12 +++++++++++-
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml
index b654b6e..f95cb54 100644
--- a/.github/workflows/quality-check.yml
+++ b/.github/workflows/quality-check.yml
@@ -18,15 +18,12 @@ jobs:
     - name: Run Quality Check
       id: quality_check
       run: |
-        python ./scripts/check_quality.py
-      continue-on-error: true  # Continue if there are warnings/errors, so we can log the output
+        python ./scripts/quality_checker.py
 
-    - name: Upload Quality Check Summary
+    - name: Upload Logs to GitHub Summary
       if: always()
-      uses: actions/upload-artifact@v3
-      with:
-        name: check-quality-summary
-        path: ./check_quality_summary.txt
+      run: |
+        cat ./check_quality_summary.txt >> $GITHUB_STEP_SUMMARY
 
     - name: Fail on Errors
       if: steps.quality_check.outcome == 'failure'
diff --git a/scripts/check_quality.py b/scripts/check_quality.py
index f271196..772887b 100644
--- a/scripts/check_quality.py
+++ b/scripts/check_quality.py
@@ -22,6 +22,13 @@
     'WARN Outdated Manage Abbreviation': 0
 }
 
+# After generating the summary, write to the GITHUB_STEP_SUMMARY file if available
+def write_to_github_summary():
+    github_summary_path = os.getenv('GITHUB_STEP_SUMMARY')
+    if github_summary_path:
+        with open(github_summary_path, 'w', encoding='utf-8') as summary_file:
+            summary_file.writelines(summary_output)
+
 # Error tracking
 def error(message, error_type):
     errors.append((error_type, f"ERROR: {message}"))
@@ -259,7 +266,7 @@ def check_outdated_abbreviations(filepath, rows):
     else:
         summary_output.append("Quality check completed with no errors or warnings.\n")
 
-    # Write to summary file
+    # Write the summary to a file
     with open(SUMMARY_FILE_PATH, 'w', encoding='utf-8') as summary_file:
         summary_file.writelines(summary_output)
 
@@ -267,6 +274,9 @@ def check_outdated_abbreviations(filepath, rows):
     for line in summary_output:
         print(line, end='')
 
+    # Write to GitHub Actions summary, if available
+    write_to_github_summary()
+    
     # Set exit code based on errors
     if sum(error_counts.values()) > 0:
         sys.exit(1)  # Fail with an exit code if errors are found

From 7bc3629eab60ba35ba026e68a35ca56939bdd847 Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sun, 27 Oct 2024 04:18:21 +1100
Subject: [PATCH 19/23] Fix File Name Error

Fix quality checker name error
---
 .github/workflows/quality-check.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml
index f95cb54..0b48567 100644
--- a/.github/workflows/quality-check.yml
+++ b/.github/workflows/quality-check.yml
@@ -18,7 +18,7 @@ jobs:
     - name: Run Quality Check
       id: quality_check
       run: |
-        python ./scripts/quality_checker.py
+        python ./scripts/check_quality.py
 
     - name: Upload Logs to GitHub Summary
       if: always()

From 19d121359668b0b15ff212c931557fb2e25c065e Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sun, 27 Oct 2024 04:26:20 +1100
Subject: [PATCH 20/23] Try uploading large error report as Artifact

Try uploading large error report as artifact
---
 .github/workflows/quality-check.yml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml
index 0b48567..b0ac078 100644
--- a/.github/workflows/quality-check.yml
+++ b/.github/workflows/quality-check.yml
@@ -20,13 +20,16 @@ jobs:
       run: |
         python ./scripts/check_quality.py
 
-    - name: Upload Logs to GitHub Summary
+    - name: Upload Quality Check Summary as Artifact
       if: always()
-      run: |
-        cat ./check_quality_summary.txt >> $GITHUB_STEP_SUMMARY
+      uses: actions/upload-artifact@v3
+      with:
+        name: check-quality-summary
+        path: ./check_quality_summary.txt
 
     - name: Fail on Errors
       if: steps.quality_check.outcome == 'failure'
       run: |
         echo "Quality check failed due to errors."
         exit 1
+

From 512c4c423f7ac594f48a80979d1e7f8f1f761a1f Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sun, 27 Oct 2024 04:38:24 +1100
Subject: [PATCH 21/23] Attempt shorten error/warning message

Shorten error/warning message for smaller summary size
---
 scripts/check_quality.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/scripts/check_quality.py b/scripts/check_quality.py
index 772887b..aea484b 100644
--- a/scripts/check_quality.py
+++ b/scripts/check_quality.py
@@ -73,7 +73,7 @@ def check_wrong_escape(filepath, rows):
             matches = re.findall(r"\\.", field)
             for match in matches:
                 if match not in valid_escapes:
-                    error(f"Wrong escape character found in {filepath} at line {line_number}: {field}", 'ERROR Wrong Escape')
+                    error(f"Wrong escape in {filepath} line {line_number}: {field}", 'ERROR Wrong Escape')
 
 # Check for wrong beginning letters in journal abbreviations
 def check_wrong_beginning_letters(filepath, rows):
@@ -180,9 +180,9 @@ def is_valid_abbreviation(full_name, abbrev):
             
             if not is_valid_abbreviation(full_name, abbreviation):
                 error(
-                    f"Abbrev mismatch full name pattern in {filepath} at line {line_number}:"
+                    f"Wrong abbreviation in {filepath} line {line_number}:"
                     f"\nFull: '{full_name}',"
-                    f"\nAbbrev: '{abbreviation}'",
+                    f"\nAbbr: '{abbreviation}'",
                     'ERROR Wrong Starting Letter')
 
 
@@ -200,7 +200,7 @@ def check_duplicates(filepath, rows):
         
         # Check for duplicate full names or abbreviations
         if full_name in full_name_entries or abbreviation in abbreviation_entries:
-            warning(f"Duplicate found in {filepath} at line {line_number}: Full Name: '{full_name}', Abbreviation: '{abbreviation}', first instance seen at line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}", 'WARN Duplicate FullName/Abbreviation')
+            warning(f"Duplicate in {filepath} line {line_number}: Full: '{full_name}', Abbr: '{abbreviation}', first seen in line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}", 'WARN Duplicate FullName/Abbreviation')
         else:
             full_name_entries[full_name] = line_number
             abbreviation_entries[abbreviation] = line_number
@@ -209,13 +209,13 @@ def check_duplicates(filepath, rows):
 def check_full_form_identical_to_abbreviation(filepath, rows):
     for line_number, row in enumerate(rows, start=1):
         if len(row) == 2 and row[0].strip() == row[1].strip() and ' ' in row[0].strip():
-            warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}", 'WARN Same Abbreviation as Full Name')
+            warning(f"Abbr same as Full in {filepath} line {line_number}: {row[0]}", 'WARN Same Abbreviation as Full Name')
 
 # Check for outdated abbreviations
 def check_outdated_abbreviations(filepath, rows):
     for line_number, row in enumerate(rows, start=1):
         if "Manage." in row and "Manag." not in row:
-            warning(f"Outdated abbreviation used in {filepath} at line {line_number}: {','.join(row)}", 'WARN Outdated Manage Abbreviation')
+            warning(f"Outdated abbr in {filepath} line {line_number}: {','.join(row)}", 'WARN Outdated Manage Abbreviation')
 
 # Main entry point
 if __name__ == "__main__":
@@ -242,9 +242,9 @@ def check_outdated_abbreviations(filepath, rows):
     summary_output.append("# Quality Check Summary Report\n")
     summary_output.append("| Status        | Count |\n")
     summary_output.append("| ------------- | ----- |\n")
-    summary_output.append(f"| 🔍 Total Issues      | {total_issues}   |\n")
-    summary_output.append(f"| ❌ Errors Found      | {sum(error_counts.values())}    |\n")
-    summary_output.append(f"| ⚠️ Warnings Found    | {sum(warning_counts.values())}   |\n\n")
+    summary_output.append(f"| 🔍 Total       | {total_issues}   |\n")
+    summary_output.append(f"| ❌ Errors      | {sum(error_counts.values())}    |\n")
+    summary_output.append(f"| ⚠️ Warnings    | {sum(warning_counts.values())}   |\n\n")
 
     # Write detailed errors and warnings
     if errors or warnings:

From c19931b4262038946cf3dd7884f7436260e8e0c9 Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sun, 27 Oct 2024 04:42:54 +1100
Subject: [PATCH 22/23] Revert "Attempt shorten error/warning message"

This reverts commit 512c4c423f7ac594f48a80979d1e7f8f1f761a1f.
---
 scripts/check_quality.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/scripts/check_quality.py b/scripts/check_quality.py
index aea484b..772887b 100644
--- a/scripts/check_quality.py
+++ b/scripts/check_quality.py
@@ -73,7 +73,7 @@ def check_wrong_escape(filepath, rows):
             matches = re.findall(r"\\.", field)
             for match in matches:
                 if match not in valid_escapes:
-                    error(f"Wrong escape in {filepath} line {line_number}: {field}", 'ERROR Wrong Escape')
+                    error(f"Wrong escape character found in {filepath} at line {line_number}: {field}", 'ERROR Wrong Escape')
 
 # Check for wrong beginning letters in journal abbreviations
 def check_wrong_beginning_letters(filepath, rows):
@@ -180,9 +180,9 @@ def is_valid_abbreviation(full_name, abbrev):
             
             if not is_valid_abbreviation(full_name, abbreviation):
                 error(
-                    f"Wrong abbreviation in {filepath} line {line_number}:"
+                    f"Abbrev mismatch full name pattern in {filepath} at line {line_number}:"
                     f"\nFull: '{full_name}',"
-                    f"\nAbbr: '{abbreviation}'",
+                    f"\nAbbrev: '{abbreviation}'",
                     'ERROR Wrong Starting Letter')
 
 
@@ -200,7 +200,7 @@ def check_duplicates(filepath, rows):
         
         # Check for duplicate full names or abbreviations
         if full_name in full_name_entries or abbreviation in abbreviation_entries:
-            warning(f"Duplicate in {filepath} line {line_number}: Full: '{full_name}', Abbr: '{abbreviation}', first seen in line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}", 'WARN Duplicate FullName/Abbreviation')
+            warning(f"Duplicate found in {filepath} at line {line_number}: Full Name: '{full_name}', Abbreviation: '{abbreviation}', first instance seen at line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}", 'WARN Duplicate FullName/Abbreviation')
         else:
             full_name_entries[full_name] = line_number
             abbreviation_entries[abbreviation] = line_number
@@ -209,13 +209,13 @@ def check_duplicates(filepath, rows):
 def check_full_form_identical_to_abbreviation(filepath, rows):
     for line_number, row in enumerate(rows, start=1):
         if len(row) == 2 and row[0].strip() == row[1].strip() and ' ' in row[0].strip():
-            warning(f"Abbr same as Full in {filepath} line {line_number}: {row[0]}", 'WARN Same Abbreviation as Full Name')
+            warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}", 'WARN Same Abbreviation as Full Name')
 
 # Check for outdated abbreviations
 def check_outdated_abbreviations(filepath, rows):
     for line_number, row in enumerate(rows, start=1):
         if "Manage." in row and "Manag." not in row:
-            warning(f"Outdated abbr in {filepath} line {line_number}: {','.join(row)}", 'WARN Outdated Manage Abbreviation')
+            warning(f"Outdated abbreviation used in {filepath} at line {line_number}: {','.join(row)}", 'WARN Outdated Manage Abbreviation')
 
 # Main entry point
 if __name__ == "__main__":
@@ -242,9 +242,9 @@ def check_outdated_abbreviations(filepath, rows):
     summary_output.append("# Quality Check Summary Report\n")
     summary_output.append("| Status        | Count |\n")
     summary_output.append("| ------------- | ----- |\n")
-    summary_output.append(f"| 🔍 Total       | {total_issues}   |\n")
-    summary_output.append(f"| ❌ Errors      | {sum(error_counts.values())}    |\n")
-    summary_output.append(f"| ⚠️ Warnings    | {sum(warning_counts.values())}   |\n\n")
+    summary_output.append(f"| 🔍 Total Issues      | {total_issues}   |\n")
+    summary_output.append(f"| ❌ Errors Found      | {sum(error_counts.values())}    |\n")
+    summary_output.append(f"| ⚠️ Warnings Found    | {sum(warning_counts.values())}   |\n\n")
 
     # Write detailed errors and warnings
     if errors or warnings:

From c4e51609b48a4c3bc0fc770ddb182d9129e86282 Mon Sep 17 00:00:00 2001
From: Philip Cai <u7588991@anu.edu.au>
Date: Sun, 27 Oct 2024 06:49:57 +1100
Subject: [PATCH 23/23] Attempt to reduce error summary size

Shorten message and provide a more efficient error summary
---
 scripts/check_quality.py | 508 +++++++++++++++++++--------------------
 1 file changed, 253 insertions(+), 255 deletions(-)

diff --git a/scripts/check_quality.py b/scripts/check_quality.py
index 772887b..a7f7f6c 100644
--- a/scripts/check_quality.py
+++ b/scripts/check_quality.py
@@ -3,282 +3,280 @@
 import sys
 import itertools
 import csv
+from collections import defaultdict
 
 # Path to the journals folder (change this path accordingly)
 JOURNALS_FOLDER_PATH = "./journals/"
 SUMMARY_FILE_PATH = "./check_quality_summary.txt"
-errors = []
-warnings = []
-
-# Error and Warning Counts
-error_counts = {
-    'ERROR Wrong Escape': 0,
-    'ERROR Wrong Starting Letter': 0,
-    'ERROR Non-UTF8': 0
-}
-warning_counts = {
-    'WARN Duplicate FullName/Abbreviation': 0,
-    'WARN Same Abbreviation as Full Name': 0,
-    'WARN Outdated Manage Abbreviation': 0
-}
-
-# After generating the summary, write to the GITHUB_STEP_SUMMARY file if available
-def write_to_github_summary():
-    github_summary_path = os.getenv('GITHUB_STEP_SUMMARY')
-    if github_summary_path:
-        with open(github_summary_path, 'w', encoding='utf-8') as summary_file:
-            summary_file.writelines(summary_output)
-
-# Error tracking
-def error(message, error_type):
-    errors.append((error_type, f"ERROR: {message}"))
-    error_counts[error_type] += 1
-
-# Warning tracking
-def warning(message, warning_type):
-    warnings.append((warning_type, f"WARN: {message}"))
-    warning_counts[warning_type] += 1
-
-# Perform all checks on the file's content
-def perform_checks(filepath, rows):
-    check_non_utf8_characters(filepath, rows)
-    check_wrong_escape(filepath, rows)
-    check_wrong_beginning_letters(filepath, rows)
-    check_duplicates(filepath, rows)
-    check_full_form_identical_to_abbreviation(filepath, rows)
-    check_outdated_abbreviations(filepath, rows)
-
-# Load the content of a CSV file into memory once
-def load_csv_content(filepath):
-    try:
-        with open(filepath, 'r', encoding='utf-8') as f:
-            return list(csv.reader(f))
-    except UnicodeDecodeError as e:
-        error(f"File {filepath} contains non-UTF-8 characters: {e}", 'ERROR Non-UTF8')
-        return []
-
-# Check if non-UTF8 characters are present in the file
-def check_non_utf8_characters(filepath, rows):
-    for line_number, row in enumerate(rows, start=1):
-        try:
-            str(row).encode('utf-8')
-        except UnicodeEncodeError as e:
-            error(f"Non-UTF8 character found in {filepath} at line {line_number}: {e}", 'ERROR Non-UTF8')
-
-# Check if there are wrong escape characters in abbreviation entries
-def check_wrong_escape(filepath, rows):
-    valid_escapes = {'\\', '\n', '\t', '\r', '\"'}
-    for line_number, row in enumerate(rows, start=1):
-        for field in row:
-            matches = re.findall(r"\\.", field)
-            for match in matches:
-                if match not in valid_escapes:
-                    error(f"Wrong escape character found in {filepath} at line {line_number}: {field}", 'ERROR Wrong Escape')
-
-# Check for wrong beginning letters in journal abbreviations
-def check_wrong_beginning_letters(filepath, rows):
-    # Words that are typically ignored when creating abbreviations
-    ignore_words = {
-        'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 
-        'la', 'el', 'le', 'et', 'der', 'die', 'das', 'dem', 'und', 'für'  # Articles in multiple languages
-    }
-    
-    # Special cases for abbreviations
-    special_cases = {
-        'proceedings': ['p', 'proc'],
-        'or': ['or'],
-        'spie': ['spie'],
-        'notes': ['notes']
-    }
-
-    def clean_text(text):
-        # Remove special characters except periods (important for compound abbreviations)
-        # and normalize spaces
-        cleaned = re.sub(r'[^\w\s\.]', ' ', text)
-        return ' '.join(filter(None, cleaned.lower().split()))
-
-    def split_compound_abbrev(abbrev):
-        # Split abbreviation that might contain compound parts (e.g., "Nat.forsch")
-        parts = []
-        for part in abbrev.split():
-            # Split on periods but keep them with the preceding part
-            subparts = [sp for sp in re.split(r'(?<=\.)(?=[^\.])', part) if sp]
-            parts.extend(subparts)
-        return parts
-
-    def get_significant_words(text):
-        # Split text into words and filter out ignore words
-        return [w for w in clean_text(text).split() if w.lower() not in ignore_words]
-
-    def is_compound_word_match(full_word, abbrev_part):
-        # Handle compound word abbreviations (e.g., "Nat.forsch" matching "Naturforschenden")
-        if '.' in abbrev_part:
-            # Split the compound abbreviation
-            abbrev_subparts = abbrev_part.split('.')
-            # Get the first few characters of the full word to match against first part
-            word_start = full_word[:len(abbrev_subparts[0])]
-            
-            # For the second part (if exists), try to find it within the remaining word
-            if len(abbrev_subparts) > 1 and abbrev_subparts[1]:
-                remaining_word = full_word[len(abbrev_subparts[0]):]
-                return (word_start.lower() == abbrev_subparts[0].lower() and 
-                       abbrev_subparts[1].lower() in remaining_word.lower())
-            
-            return word_start.lower() == abbrev_subparts[0].lower()
-        return False
 
-    def is_valid_abbreviation(full_name, abbrev):
-        # Clean and split both strings
-        full_words = get_significant_words(full_name)
-        abbrev_parts = split_compound_abbrev(clean_text(abbrev))
+class QualityChecker:
+    def __init__(self):
+        # Use defaultdict to avoid key existence checks
+        self.error_counts = defaultdict(int)
+        self.warning_counts = defaultdict(int)
+        # Store issues by file for more efficient grouping
+        self.issues_by_file = defaultdict(lambda: {'errors': [], 'warnings': []})
         
-        # Handle cases where abbreviation is the same as full name
-        if clean_text(full_name) == clean_text(abbrev):
-            return True
-
-        # Handle special cases
-        for special_word, valid_abbrevs in special_cases.items():
-            if special_word in full_words:
-                if any(va in abbrev_parts for va in valid_abbrevs):
-                    return True
-
-        # Track matched parts and their positions
-        matched_parts = 0
-        used_full_words = set()
+    def error(self, filepath, message, error_type):
+        self.error_counts[error_type] += 1
+        # Remove filepath from message if it's included
+        message = message.replace(f"in {filepath} ", "")
+        full_message = f"{error_type}: {message}"
+        self.issues_by_file[filepath]['errors'].append(full_message)
+
+    def warning(self, filepath, message, warning_type):
+        self.warning_counts[warning_type] += 1
+        # Remove filepath from message if it's included
+        message = message.replace(f"in {filepath} ", "")
+        full_message = f"{warning_type}: {message}"
+        self.issues_by_file[filepath]['warnings'].append(full_message)
+
+    def write_summary(self, summary_lines):
+        # Write to file in a single operation
+        with open(SUMMARY_FILE_PATH, 'w', encoding='utf-8') as summary_file:
+            summary_file.writelines(summary_lines)
+
+        # Print to console in chunks
+        for line in summary_lines:
+            print(line, end='')
+
+        # Write to GitHub Actions summary if available
+        github_summary_path = os.getenv('GITHUB_STEP_SUMMARY')
+        if github_summary_path:
+            with open(github_summary_path, 'w', encoding='utf-8') as summary_file:
+                summary_file.writelines(summary_lines)
+
+    def check_non_utf8_characters(self, filepath, rows):
+        for line_number, row in enumerate(rows, start=1):
+            try:
+                str(row).encode('utf-8')
+            except UnicodeEncodeError as e:
+                self.error(
+                    filepath, 
+                    f"at line {line_number}: {e}",
+                    'ERROR Non-UTF8'
+                )
+
+    def check_wrong_escape(self, filepath, rows):
+        valid_escapes = {'\\', '\n', '\t', '\r', '\"'}
+        for line_number, row in enumerate(rows, start=1):
+            for field in row:
+                matches = re.findall(r"\\.", field)
+                for match in matches:
+                    if match not in valid_escapes:
+                        self.error(
+                            filepath,
+                            f"at line {line_number}: {field}",
+                            'ERROR Wrong Escape'
+                        )
+
+    def check_wrong_beginning_letters(self, filepath, rows):
+        # Words that are typically ignored when creating abbreviations
+        ignore_words = {
+            'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 
+            'la', 'el', 'le', 'et', 'der', 'die', 'das', 'dem', 'und', 'für'
+        }
         
-        for abbrev_part in abbrev_parts:
-            found_match = False
-            
-            # Try matching against each full word
-            for i, full_word in enumerate(full_words):
-                if i in used_full_words:
-                    continue
+        # Special cases for abbreviations
+        special_cases = {
+            'proceedings': ['p', 'proc'],
+            'or': ['or'],
+            'spie': ['spie'],
+            'notes': ['notes']
+        }
+
+        def clean_text(text):
+            cleaned = re.sub(r'[^\w\s\.]', ' ', text)
+            return ' '.join(filter(None, cleaned.lower().split()))
+
+        def split_compound_abbrev(abbrev):
+            parts = []
+            for part in abbrev.split():
+                subparts = [sp for sp in re.split(r'(?<=\.)(?=[^\.])', part) if sp]
+                parts.extend(subparts)
+            return parts
+
+        def get_significant_words(text):
+            return [w for w in clean_text(text).split() if w.lower() not in ignore_words]
+
+        def is_compound_word_match(full_word, abbrev_part):
+            if '.' in abbrev_part:
+                abbrev_subparts = abbrev_part.split('.')
+                word_start = full_word[:len(abbrev_subparts[0])]
                 
-                # Check for compound word match
-                if is_compound_word_match(full_word, abbrev_part):
-                    found_match = True
-                    matched_parts += 1
-                    used_full_words.add(i)
-                    break
+                if len(abbrev_subparts) > 1 and abbrev_subparts[1]:
+                    remaining_word = full_word[len(abbrev_subparts[0]):]
+                    return (word_start.lower() == abbrev_subparts[0].lower() and 
+                           abbrev_subparts[1].lower() in remaining_word.lower())
                 
-                # Check for regular abbreviation patterns
-                elif (full_word.lower().startswith(abbrev_part.lower()) or
-                      (len(abbrev_part) >= 2 and abbrev_part[0].lower() == full_word[0].lower())):
-                    found_match = True
-                    matched_parts += 1
-                    used_full_words.add(i)
-                    break
-
-        # Consider the abbreviation valid if we matched most parts
-        min_required_matches = max(1, len(abbrev_parts) * 0.5)
-        return matched_parts >= min_required_matches
-
-    for line_number, row in enumerate(rows, start=1):
-        if len(row) >= 2:
-            full_name = row[0].strip()
-            abbreviation = row[1].strip()
+                return word_start.lower() == abbrev_subparts[0].lower()
+            return False
+
+        def is_valid_abbreviation(full_name, abbrev):
+            full_words = get_significant_words(full_name)
+            abbrev_parts = split_compound_abbrev(clean_text(abbrev))
             
-            if not is_valid_abbreviation(full_name, abbreviation):
-                error(
-                    f"Abbrev mismatch full name pattern in {filepath} at line {line_number}:"
-                    f"\nFull: '{full_name}',"
-                    f"\nAbbrev: '{abbreviation}'",
-                    'ERROR Wrong Starting Letter')
+            if clean_text(full_name) == clean_text(abbrev):
+                return True
+
+            for special_word, valid_abbrevs in special_cases.items():
+                if special_word in full_words:
+                    if any(va in abbrev_parts for va in valid_abbrevs):
+                        return True
 
+            matched_parts = 0
+            used_full_words = set()
+            
+            for abbrev_part in abbrev_parts:
+                found_match = False
+                
+                for i, full_word in enumerate(full_words):
+                    if i in used_full_words:
+                        continue
+                    
+                    if is_compound_word_match(full_word, abbrev_part):
+                        found_match = True
+                        matched_parts += 1
+                        used_full_words.add(i)
+                        break
+                    
+                    elif (full_word.lower().startswith(abbrev_part.lower()) or
+                          (len(abbrev_part) >= 2 and abbrev_part[0].lower() == full_word[0].lower())):
+                        found_match = True
+                        matched_parts += 1
+                        used_full_words.add(i)
+                        break
+
+            min_required_matches = max(1, len(abbrev_parts) * 0.5)
+            return matched_parts >= min_required_matches
+
+        for line_number, row in enumerate(rows, start=1):
+            if len(row) >= 2:
+                full_name = row[0].strip()
+                abbreviation = row[1].strip()
+                
+                if not is_valid_abbreviation(full_name, abbreviation):
+                    self.error(
+                        filepath,
+                        f"at line {line_number} Full: '{full_name}', Abbr: '{abbreviation}'",
+                        'ERROR Wrong Abbreviation'
+                    )
 
-# Check for duplicate entries
-def check_duplicates(filepath, rows):
-    full_name_entries = {}
-    abbreviation_entries = {}
+    def check_duplicates(self, filepath, rows):
+        full_name_entries = {}
+        abbreviation_entries = {}
 
-    for line_number, row in enumerate(rows, start=1):
-        if len(row) < 2:
-            continue
+        for line_number, row in enumerate(rows, start=1):
+            if len(row) < 2:
+                continue
 
-        full_name = row[0].strip()
-        abbreviation = row[1].strip()
+            full_name = row[0].strip()
+            abbreviation = row[1].strip()
+            
+            if full_name in full_name_entries or abbreviation in abbreviation_entries:
+                self.warning(
+                    filepath,
+                    f"at line {line_number} Full: '{full_name}', Abbr: '{abbreviation}', first seen in line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}",
+                    'WARN Duplicate FullName/Abbreviation'
+                )
+            else:
+                full_name_entries[full_name] = line_number
+                abbreviation_entries[abbreviation] = line_number
+
+    def check_full_form_identical_to_abbreviation(self, filepath, rows):
+        for line_number, row in enumerate(rows, start=1):
+            if len(row) == 2 and row[0].strip() == row[1].strip() and ' ' in row[0].strip():
+                self.warning(
+                    filepath,
+                    f"at line {line_number}: {row[0]}",
+                    'WARN Same Abbrev. as Full Name'
+                )
+
+    def check_outdated_abbreviations(self, filepath, rows):
+        for line_number, row in enumerate(rows, start=1):
+            if "Manage." in row and "Manag." not in row:
+                self.warning(
+                    filepath,
+                    f"at line {line_number}: {','.join(row)}",
+                    'WARN Outdated Manage Abbreviation'
+                )
+
+    def perform_checks(self, filepath, rows):
+        self.check_non_utf8_characters(filepath, rows)
+        self.check_wrong_escape(filepath, rows)
+        self.check_wrong_beginning_letters(filepath, rows)
+        self.check_duplicates(filepath, rows)
+        self.check_full_form_identical_to_abbreviation(filepath, rows)
+        self.check_outdated_abbreviations(filepath, rows)
+
+    def generate_summary(self):
+        total_issues = sum(self.error_counts.values()) + sum(self.warning_counts.values())
         
-        # Check for duplicate full names or abbreviations
-        if full_name in full_name_entries or abbreviation in abbreviation_entries:
-            warning(f"Duplicate found in {filepath} at line {line_number}: Full Name: '{full_name}', Abbreviation: '{abbreviation}', first instance seen at line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}", 'WARN Duplicate FullName/Abbreviation')
+        # Pre-allocate list with estimated size
+        summary_lines = []
+        summary_lines.extend([
+            "# Quality Check Summary Report\n",
+            "| Status        | Count |\n",
+            "| ------------- | ----- |\n",
+            f"| 🔍 Total Issues      | {total_issues}   |\n",
+            f"| ❌ Errors Found      | {sum(self.error_counts.values())}    |\n",
+            f"| ⚠️ Warnings Found    | {sum(self.warning_counts.values())}   |\n\n"
+        ])
+
+        # Add detailed error/warning counts
+        if self.error_counts:
+            summary_lines.append("## Error Counts\n")
+            for error_type, count in sorted(self.error_counts.items()):
+                summary_lines.append(f"- {error_type}: {count}\n")
+            summary_lines.append("\n")
+
+        if self.warning_counts:
+            summary_lines.append("## Warning Counts\n")
+            for warning_type, count in sorted(self.warning_counts.items()):
+                summary_lines.append(f"- {warning_type}: {count}\n")
+            summary_lines.append("\n")
+
+        if self.issues_by_file:
+            summary_lines.append("## Issues per Input File\n\n")
+            for filepath, issues in sorted(self.issues_by_file.items()):
+                summary_lines.append(f"### Issues in file `{filepath}`\n")
+                if issues['errors']:
+                    summary_lines.append("#### Errors:\n")
+                    summary_lines.extend(f"- {err}\n" for err in sorted(issues['errors']))
+                
+                if issues['warnings']:
+                    summary_lines.append("#### Warnings:\n")
+                    summary_lines.extend(f"- {warn}\n" for warn in sorted(issues['warnings']))
+                
+                summary_lines.append("\n")
         else:
-            full_name_entries[full_name] = line_number
-            abbreviation_entries[abbreviation] = line_number
-
-# Check if abbreviation and full form are the same
-def check_full_form_identical_to_abbreviation(filepath, rows):
-    for line_number, row in enumerate(rows, start=1):
-        if len(row) == 2 and row[0].strip() == row[1].strip() and ' ' in row[0].strip():
-            warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}", 'WARN Same Abbreviation as Full Name')
-
-# Check for outdated abbreviations
-def check_outdated_abbreviations(filepath, rows):
-    for line_number, row in enumerate(rows, start=1):
-        if "Manage." in row and "Manag." not in row:
-            warning(f"Outdated abbreviation used in {filepath} at line {line_number}: {','.join(row)}", 'WARN Outdated Manage Abbreviation')
-
-# Main entry point
-if __name__ == "__main__":
+            summary_lines.append("Quality check completed with no errors or warnings.\n")
+
+        return summary_lines
+def main():
     if not os.path.exists(JOURNALS_FOLDER_PATH):
         print("Journals folder not found. Please make sure the path is correct.")
         sys.exit(1)
 
-    # Iterate through all CSV files in the journals folder
+    checker = QualityChecker()
+    
+    # Process all files
     for filename in os.listdir(JOURNALS_FOLDER_PATH):
         if filename.endswith(".csv"):
             filepath = os.path.join(JOURNALS_FOLDER_PATH, filename)
-            
-            # Load the CSV content once
-            rows = load_csv_content(filepath)
-
-            # Run all checks on the loaded data
-            if rows:
-                perform_checks(filepath, rows)
+            try:
+                with open(filepath, 'r', encoding='utf-8') as f:
+                    rows = list(csv.reader(f))
+                    checker.perform_checks(filepath, rows)
+            except UnicodeDecodeError as e:
+                checker.error(filepath, f"File contains non-UTF8 characters: {e}", 'ERROR Non-UTF8')
+
+    # Generate and write summary
+    summary_lines = checker.generate_summary()
+    checker.write_summary(summary_lines)
     
-    # Write the summary to a file
-    total_issues = sum(error_counts.values()) + sum(warning_counts.values())
-    summary_output = []
-
-    summary_output.append("# Quality Check Summary Report\n")
-    summary_output.append("| Status        | Count |\n")
-    summary_output.append("| ------------- | ----- |\n")
-    summary_output.append(f"| 🔍 Total Issues      | {total_issues}   |\n")
-    summary_output.append(f"| ❌ Errors Found      | {sum(error_counts.values())}    |\n")
-    summary_output.append(f"| ⚠️ Warnings Found    | {sum(warning_counts.values())}   |\n\n")
-
-    # Write detailed errors and warnings
-    if errors or warnings:
-        summary_output.append("## Errors per Input File\n\n")
-        files = set([msg.split(' in ')[1].split(' at ')[0] for _, msg in errors + warnings])
-        for file in files:
-            summary_output.append(f"### Issues in file `{file}`\n")
-            file_errors = [msg for err_type, msg in errors if file in msg]
-            file_warnings = [msg for warn_type, msg in warnings if file in msg]
-            if file_errors:
-                summary_output.append("#### Errors:\n")
-                for err in file_errors:
-                    summary_output.append(f"- {err.split('ERROR: ')[1]}\n")
-            if file_warnings:
-                summary_output.append("#### Warnings:\n")
-                for warn in file_warnings:
-                    summary_output.append(f"- {warn.split('WARN: ')[1]}\n")
-            summary_output.append("\n")
-    else:
-        summary_output.append("Quality check completed with no errors or warnings.\n")
-
-    # Write the summary to a file
-    with open(SUMMARY_FILE_PATH, 'w', encoding='utf-8') as summary_file:
-        summary_file.writelines(summary_output)
-
-    # Print the summary to console
-    for line in summary_output:
-        print(line, end='')
-
-    # Write to GitHub Actions summary, if available
-    write_to_github_summary()
-    
-    # Set exit code based on errors
-    if sum(error_counts.values()) > 0:
-        sys.exit(1)  # Fail with an exit code if errors are found
-    else:
-        sys.exit(0)  # Exit successfully if no errors
\ No newline at end of file
+    # Exit with appropriate code
+    sys.exit(1 if sum(checker.error_counts.values()) > 0 else 0)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file