TLDR-709 TLDR-714 update text and classifier extraction benchmark (#456)

ispras · Jun 20, 2024 · 4cb3df2 · 4cb3df2
1 parent d2d309d
commit 4cb3df2
Show file tree

Hide file tree

Showing 16 changed files with 1,163 additions and 1,123 deletions.
diff --git a/dedoc/data_structures/unstructured_document.py b/dedoc/data_structures/unstructured_document.py
@@ -28,3 +28,6 @@ def __init__(self,
         self.attachments = attachments
         self.warnings = warnings if warnings else []
         self.metadata = metadata if metadata is not None else {}
+
+    def get_text(self) -> str:
+        """
+        Join all lines of the document with LineWithMeta.join and return the text (``line`` attribute) of the result.
+        """
+        joined_line = LineWithMeta.join(self.lines)
+        return joined_line.line
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py
@@ -59,5 +59,9 @@ def __init__(self) -> None:
     def load_dataset(self, csv_path: str, image_path: str, batch_size: int = 4) -> DataLoader:
         """
         Create a shuffled DataLoader over a DatasetImageOrient dataset and remember the dataset size in self.amount.

         :param csv_path: passed to DatasetImageOrient as csv_file
         :param image_path: passed to DatasetImageOrient as root_dir
         :param batch_size: batch size of the returned loader
         :return: DataLoader with shuffle=True and num_workers=2
         """
         trainset = DatasetImageOrient(csv_file=csv_path, root_dir=image_path, transform=self.transform)
         trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)
+        self.amount = len(trainset)  # number of samples, returned later by __len__
 
         return trainloader
+
+    def __len__(self) -> int:
+        """
+        Return the number of samples in the dataset most recently loaded by load_dataset.
+
+        The size is stored in self.amount by load_dataset; when nothing has been loaded yet
+        0 is returned instead of failing with AttributeError.
+        """
+        return getattr(self, "amount", 0)
diff --git a/resources/benchmarks/orient_classifier_scores.txt b/resources/benchmarks/orient_classifier_scores.txt
@@ -0,0 +1,25 @@
+
+Orientation predictions:
++-------+-----------+--------+-------+-------+
+| Class | Precision | Recall |  F1   | Count |
++=======+===========+========+=======+=======+
+| 0     | 0.998     | 1      | 0.999 | 537   |
++-------+-----------+--------+-------+-------+
+| 90    | 1         | 0.998  | 0.999 | 537   |
++-------+-----------+--------+-------+-------+
+| 180   | 1         | 0.998  | 0.999 | 537   |
++-------+-----------+--------+-------+-------+
+| 270   | 0.998     | 1      | 0.999 | 537   |
++-------+-----------+--------+-------+-------+
+| AVG   | 0.999     | 0.999  | 0.999 | None  |
++-------+-----------+--------+-------+-------+
+Column predictions:
++-------+-----------+--------+-------+-------+
+| Class | Precision | Recall |  F1   | Count |
++=======+===========+========+=======+=======+
+| 1     | 1         | 0.999  | 0.999 | 1692  |
++-------+-----------+--------+-------+-------+
+| 2     | 0.996     | 1      | 0.998 | 456   |
++-------+-----------+--------+-------+-------+
+| AVG   | 0.999     | 0.999  | 0.999 | None  |
++-------+-----------+--------+-------+-------+
diff --git a/resources/benchmarks/tesseract_benchmark.txt b/resources/benchmarks/tesseract_benchmark.txt
diff --git a/resources/benchmarks/tesseract_benchmark_Correction.SAGE_CORRECTION.txt b/resources/benchmarks/tesseract_benchmark_Correction.SAGE_CORRECTION.txt
diff --git a/resources/benchmarks/tesseract_benchmark_Correction.WITHOUT_CORRECTION.txt b/resources/benchmarks/tesseract_benchmark_Correction.WITHOUT_CORRECTION.txt
diff --git a/resources/benchmarks/tesseract_benchmark_sage-correction.txt b/resources/benchmarks/tesseract_benchmark_sage-correction.txt
diff --git a/resources/benchmarks/tesseract_benchmark_textblob-correction.txt b/resources/benchmarks/tesseract_benchmark_textblob-correction.txt
diff --git a/scripts/benchmark_tl_correctness.py b/scripts/benchmark_tl_correctness.py
@@ -14,6 +14,15 @@
 os.makedirs(path_result, exist_ok=True)
 path_result = os.path.join(path_result, "benchmarks_tl_correctness.json")
 
+"""
+Experiments are available -> https://github.com/alexander1999-hub/txt_layer_correctness/tree/main :
+    * generating synthetic incorrect text
+    * comparing different classification models
+    * comparing different input textual features: TF-IDF and custom features
+    * comparing on real data of correct/incorrect texts with GT using Levenshtein distance (available on Confluence -> dataset page)
+Here (in this script) we calculate the accuracy of the selected model (XGBoost on custom features) on real data without GT. The data are PDFs with a textual layer.
+"""
+
 host = "http://localhost:1231"
 param_dist_errors = namedtuple("Param", ("total_file_size", "total_incorrect_files", "failed"))
 

diff --git a/scripts/tesseract_benchmark/ocr_correction.py b/scripts/tesseract_benchmark/ocr_correction.py
diff --git a/scripts/tesseract_benchmark/requirements.txt b/scripts/tesseract_benchmark/requirements.txt
diff --git a/scripts/tesseract_benchmark/text_blob_correction.py b/scripts/tesseract_benchmark/text_blob_correction.py
diff --git a/scripts/text_extraction_benchmark/analyze_ocr_errors.py b/scripts/text_extraction_benchmark/analyze_ocr_errors.py
@@ -0,0 +1,74 @@
+import os
+import re
+from typing import List, Tuple
+
+from texttable import Texttable
+
+
+def __parse_ocr_errors(lines: List[str]) -> List:
+    """
+    Extract the OCR error rows that follow the first "Errors   Marked   Correct-Generated" header line.
+
+    Lines after the header that do not look like an error row (blank or malformed lines) are skipped.
+
+    :param lines: lines of the report part that contains the header
+    :return: list of rows [errors, correct, generated], all values are strings
+    :raises IndexError: if the header line is absent from lines
+    """
+    header_line_nums = [line_num for line_num, line in enumerate(lines) if "Errors   Marked   Correct-Generated" in line]
+    first_row_num = header_line_nums[0] + 1  # rows start right after the first occurrence of the header
+
+    ocr_errors = []
+    for line in lines[first_row_num:]:
+        # example line: " 2        0   { 6}-{б}"
+        errors_match = re.search(r"(\d+)", line)
+        chars_match = re.search(r"{(.*)}-{(.*)}", line)
+        if errors_match is None or chars_match is None:
+            # not an error row (e.g. an empty line): skip it instead of failing with IndexError
+            continue
+        ocr_errors.append([errors_match.group(1), chars_match.group(1), chars_match.group(2)])
+
+    return ocr_errors
+
+
+def __parse_symbol_info(lines: List[str]) -> Tuple[List, int]:
+    """
+    Extract per-symbol statistics that follow the last "Count   Missed   %Right" header line.
+
+    Lines after the header that yield no values (e.g. empty lines) are skipped.
+
+    :param lines: lines of the report
+    :return: tuple of
+        - rows (lists of strings; the last item is the symbol text without the surrounding braces)
+          sorted by the second column (missed) in descending order;
+        - index of the header line in lines
+    :raises IndexError: if the header line is absent from lines
+    """
+    header_line_nums = [line_num for line_num, line in enumerate(lines) if "Count   Missed   %Right" in line]
+    start_block_line = header_line_nums[-1]  # the last occurrence of the header is used
+
+    symbols_info = []
+    for line in lines[start_block_line + 1:]:
+        # example line: "1187       11    99.07   {<\n>}"
+        row_values = [value.strip() for value in re.findall(r"\d+.\d*|{\S+|\W+}", line)]
+        if not row_values:
+            # nothing was recognized in the line (e.g. it is empty): skip it instead of failing with IndexError
+            continue
+        row_values[-1] = row_values[-1][1:-1]  # get symbol value (drop the surrounding braces)
+        symbols_info.append(row_values)
+
+    # Sort errors by missed, rows with the largest value go first
+    symbols_info.sort(key=lambda row: int(row[1]), reverse=True)
+
+    return symbols_info, start_block_line
+
+
+def get_summary_symbol_error(path_reports: str) -> Texttable:
+    """
+    Build a table of OCR errors grouped by symbol from the report files of a directory.
+
+    The accsum executable (looked up one directory above the directory of this script) is run with all files
+    of path_reports as arguments, its output is saved to accsum_report.txt in the parent directory of
+    path_reports (an existing file is overwritten) and then parsed.
+
+    :param path_reports: directory with the report files
+    :return: table with columns "Symbol" and "Cnt Errors & Correct-Generated"; symbols without errors are omitted
+    :raises OSError: if the accsum executable cannot be started
+    :raises IndexError: if the accsum output does not contain the expected section headers
+    """
+    import subprocess  # local import keeps this function self-contained
+
+    # 1 - call accsum for get summary of all reports
+    accuracy_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "accsum"))
+    accsum_report_path = os.path.join(path_reports, "..", "accsum_report.txt")
+    file_reports = [os.path.join(path_reports, f) for f in os.listdir(path_reports) if os.path.isfile(os.path.join(path_reports, f))]
+
+    # "wb" truncates a report left from a previous run.
+    # The command is passed as a list (no shell), so paths with spaces or shell metacharacters are handled correctly.
+    with open(accsum_report_path, "wb") as report_file:
+        subprocess.run([accuracy_script_path, *file_reports], stdout=report_file)
+
+    # 2 - parse report info
+    with open(accsum_report_path, "r") as f:
+        lines = f.readlines()
+
+    symbols_info, start_symbol_block_line = __parse_symbol_info(lines)
+    # OCR errors are searched only in the part of the report located before the symbols block
+    ocr_errors = __parse_ocr_errors(lines[:start_symbol_block_line - 1])
+
+    # 3 - calculate ocr errors for a symbol
+    # to ignore errors with long text (len > 3) or without text; the filter does not depend on a symbol, so it is applied once
+    short_ocr_errors = [ocr_err for ocr_err in ocr_errors if ocr_err[-1] != "" and len(ocr_err[-2]) <= 3 and len(ocr_err[-1]) <= 3]
+
+    ocr_errors_by_symbol = {}
+    for symbol_info in symbols_info:
+        symbol = symbol_info[-1]
+        # an error belongs to the symbol if the symbol occurs in the "correct" text of the error
+        ocr_errors_by_symbol[symbol] = [
+            f"{ocr_err[0]} & <{ocr_err[1]}> -> <{ocr_err[2]}>" for ocr_err in short_ocr_errors if symbol in ocr_err[-2]
+        ]
+
+    # 4 - create table with OCR errors
+    ocr_err_by_symbol_table = Texttable()
+    title = [["Symbol", "Cnt Errors & Correct-Generated"]]
+    ocr_err_by_symbol_table.add_rows(title)
+    for symbol, value in ocr_errors_by_symbol.items():
+        if value:
+            ocr_err_by_symbol_table.add_row([symbol, value])
+
+    return ocr_err_by_symbol_table
diff --git a/scripts/text_extraction_benchmark/text_correction/sage_corrector.py b/scripts/text_extraction_benchmark/text_correction/sage_corrector.py
@@ -0,0 +1,41 @@
+import os
+
+import torch
+from sage.spelling_correction import AvailableCorrectors
+from sage.spelling_correction import RuM2M100ModelForSpellingCorrection
+
+
+"""
+Install sage library (for ocr correction step):
+git clone https://github.com/ai-forever/sage.git
+cd sage
+pip install .
+pip install -r requirements.txt
+
+Note: sage uses 5.2 Gb of GPU memory.
+"""
+
+
+class SageCorrector:
+    """
+    Text corrector that wraps the RuM2M100 spelling correction model of the sage library.
+    """
+
+    def __init__(self, cache_dir: str, use_gpu: bool = True) -> None:
+        """
+        :param cache_dir: directory where the "result_corrected" subdirectory is created (if it doesn't exist)
+        :param use_gpu: move the model to the "cuda:0" device when CUDA is available
+        """
+        corrected_path = os.path.join(cache_dir, "result_corrected")
+        os.makedirs(corrected_path, exist_ok=True)
+        self.corrected_path = corrected_path
+
+        model_name = AvailableCorrectors.m2m100_1B.value
+        self.corrector = RuM2M100ModelForSpellingCorrection.from_pretrained(model_name)  # 4.49 Gb model (pytorch_model.bin)
+        self._init_device(use_gpu)
+
+    def _init_device(self, use_gpu: bool) -> None:
+        """
+        Move the model to "cuda:0" if CUDA is available and requested, and print the device in use.
+        """
+        cuda_selected = torch.cuda.is_available() and use_gpu
+        if not cuda_selected:
+            print("use CPU")
+            return
+
+        self.corrector.model.to(torch.device("cuda:0"))
+        print("use CUDA")
+
+    def correction(self, text: str) -> str:
+        """
+        Correct the text line by line.
+
+        :param text: text whose lines are separated by "\\n"
+        :return: text made of the first correction result of each line, joined with "\\n"
+        """
+        return "\n".join(self.corrector.correct(source_line)[0] for source_line in text.split("\n"))