diff --git a/dedoc/data_structures/unstructured_document.py b/dedoc/data_structures/unstructured_document.py index 29e82917..94197e2e 100644 --- a/dedoc/data_structures/unstructured_document.py +++ b/dedoc/data_structures/unstructured_document.py @@ -28,3 +28,6 @@ def __init__(self, self.attachments = attachments self.warnings = warnings if warnings else [] self.metadata = metadata if metadata is not None else {} + + def get_text(self) -> str: + return LineWithMeta.join(self.lines).line diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py index b3b0790b..245234c7 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/dataset_executor.py @@ -59,5 +59,9 @@ def __init__(self) -> None: def load_dataset(self, csv_path: str, image_path: str, batch_size: int = 4) -> DataLoader: trainset = DatasetImageOrient(csv_file=csv_path, root_dir=image_path, transform=self.transform) trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2) + self.amount = len(trainset) return trainloader + + def __len__(self) -> int: + return self.amount diff --git a/resources/benchmarks/orient_classifier_scores.txt b/resources/benchmarks/orient_classifier_scores.txt new file mode 100644 index 00000000..9fe55d01 --- /dev/null +++ b/resources/benchmarks/orient_classifier_scores.txt @@ -0,0 +1,25 @@ + +Orientation predictions: ++-------+-----------+--------+-------+-------+ +| Class | Precision | Recall | F1 | Count | ++=======+===========+========+=======+=======+ +| 0 | 0.998 | 1 | 0.999 | 537 | ++-------+-----------+--------+-------+-------+ +| 90 | 1 | 0.998 | 0.999 | 537 | ++-------+-----------+--------+-------+-------+ +| 180 | 1 | 0.998 | 0.999 | 537 | 
++-------+-----------+--------+-------+-------+ +| 270 | 0.998 | 1 | 0.999 | 537 | ++-------+-----------+--------+-------+-------+ +| AVG | 0.999 | 0.999 | 0.999 | None | ++-------+-----------+--------+-------+-------+ +Column predictions: ++-------+-----------+--------+-------+-------+ +| Class | Precision | Recall | F1 | Count | ++=======+===========+========+=======+=======+ +| 1 | 1 | 0.999 | 0.999 | 1692 | ++-------+-----------+--------+-------+-------+ +| 2 | 0.996 | 1 | 0.998 | 456 | ++-------+-----------+--------+-------+-------+ +| AVG | 0.999 | 0.999 | 0.999 | None | ++-------+-----------+--------+-------+-------+ \ No newline at end of file diff --git a/resources/benchmarks/tesseract_benchmark.txt b/resources/benchmarks/tesseract_benchmark.txt deleted file mode 100644 index fd980a45..00000000 --- a/resources/benchmarks/tesseract_benchmark.txt +++ /dev/null @@ -1,256 +0,0 @@ -Tesseract version is 5.0.0 -Table 1 - Accuracy for each file -+---------------+---------------------+-------+-----------------+--------------+ -| Dataset | Image name | --psm | Amount of words | Accuracy OCR | -+===============+=====================+=======+=================+==============+ -| english-words | Kaspersky | 6 | 111 | 99.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | USB | 6 | 4 | 85.700 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | words1 | 6 | 19 | 100 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | words2 | 6 | 9 | 100 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | words3 | 6 | 9 | 100 | -+---------------+---------------------+-------+-----------------+--------------+ -| others | Zaklyuchenie_nevrol | 4 | 525 | 83.800 | -| | oga_00 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| others | 
Zaklyuchenie_nevrol | 4 | 241 | 88.800 | -| | oga_01 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| others | napalm_doc_2_2_6 | 4 | 124 | 86.100 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 695 | 99.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 696 | 99.700 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 699 | 99.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | article_multiline | 4 | 471 | 100 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | fstek17_00 | 4 | 192 | 95.300 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | fstek17_01 | 4 | 332 | 99.700 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | law_image | 4 | 182 | 99.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | napalm_doc_13_2 | 4 | 243 | 97.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukaz_prezidenta_1 | 4 | 264 | 99.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukodeksrf_00 | 4 | 287 | 99.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukodeksrf_01 | 4 | 340 | 99.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 146 | 95.700 | -| | 0 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 276 | 99.600 | -| | 1 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 
with_applications_0 | 4 | 165 | 98.800 | -| | 2 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 90 | 99.600 | -| | 3 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_00 | 4 | 78 | 97.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_01 | 4 | 296 | 98.300 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_02 | 4 | 309 | 98.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_03 | 4 | 337 | 98.700 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_04 | 4 | 257 | 96.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_05 | 4 | 238 | 98.300 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_06 | 4 | 219 | 93.500 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_07 | 4 | 233 | 98.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_08 | 4 | 284 | 97.200 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_09 | 4 | 154 | 97.500 | -+---------------+---------------------+-------+-----------------+--------------+ - -Table 2 - AVG by each type of symbols: -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| Datase | ASCII_ | ASCII_ | ASCII_ | ASCII_ | Latin1 | Cyrill | Amoun | AVG A | -| t | Spacin | Specia | Digits | Upperc | _Speci | ic | t of | ccura | -| | g_Char | l_Symb | | ase_Ch | al_Sym | | words | cy | -| | s | ols | | ars | bols | | | | -+========+========+========+========+========+========+========+=======+=======+ -| englis | 100 | 99.333 | 100 | 0 | 0 | 94.540 | 
152 | 97.06 | -| h- | | | | | | | | 0 | -| words | | | | | | | | | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| others | 90.967 | 77.400 | 89.533 | 0 | 0 | 86.433 | 890 | 86.23 | -| | | | | | | | | 3 | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| tz-npa | 99.268 | 91.064 | 92.076 | 0 | 0 | 99.480 | 7483 | 98.39 | -| | | | | | | | | 6 | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ - -Table 3 -OCR error by symbol: -+--------+---------------------------------------------------------------------+ -| Symbol | Cnt Errors & Correct-Generated | -+========+=====================================================================+ -| | ['3 & -> ', '2 & < 6> -> <б>', '2 & < > -> <__>', "2 & | -| | <1 > -> <'>", '2 & <и > -> <н>'] | -+--------+---------------------------------------------------------------------+ -| . | ['5 & <.> -> <,>', '3 & <3.> -> < De>', '3 & -> ', '2 & | -| | <6.> -> ', '2 & <г.> -> <Г>'] | -+--------+---------------------------------------------------------------------+ -| , | ['66 & <,> -> <.>', '3 & <ва,> -> <нь>'] | -+--------+---------------------------------------------------------------------+ -| 1 | ['6 & <1> -> <|>', '4 & <1С> -> ', "3 & <1> -> <'>", '3 & <№1> | -| | -> ', '3 & <№1»> -> ', "2 & <1 > -> <'>", '2 & <1C> -> | -| | ', '2 & <1C> -> <С>', '2 & <1> -> ', '1 & <1> -> <Г>', '1 & | -| | <1> -> <Т>'] | -+--------+---------------------------------------------------------------------+ -| е | ['6 & <е> -> <с>', '2 & <не> -> ', '2 & <ре> -> <с>', '1 & <е> | -| | -> <а>'] | -+--------+---------------------------------------------------------------------+ -| н | ['2 & <н> -> <и>', '2 & <не> -> ', '1 & <н> -> <й>', '1 & <н> | -| | -> <п>'] | -+--------+---------------------------------------------------------------------+ -| и | ['3 & <ти> -> < TH>', '3 & <тип> -> ', '2 & <и > -> <н>', '2 & | -| | <ис> -> <не>'] | 
-+--------+---------------------------------------------------------------------+ -| а | ['3 & <ва,> -> <нь>'] | -+--------+---------------------------------------------------------------------+ -| о | ['2 & <то> -> ', '1 & <о> -> <0>'] | -+--------+---------------------------------------------------------------------+ -| т | ['7 & <т> -> <г>', '4 & <т> -> < г>', '3 & <ти> -> < TH>', '3 & | -| | <тип> -> ', '2 & <то> -> '] | -+--------+---------------------------------------------------------------------+ -| 2 | ['2 & <28> -> ', '2 & <28> -> <ИР>', '2 & <28> -> <Я >'] | -+--------+---------------------------------------------------------------------+ -| л | ['2 & <л> -> <п>'] | -+--------+---------------------------------------------------------------------+ -| С | ['6 & <СЗВ> -> ', '4 & <1С> -> ', '4 & <ОС> -> ', '3 & | -| | <С> -> ', '2 & <СА> -> ', '1 & <С> -> <—>'] | -+--------+---------------------------------------------------------------------+ -| 3 | ['3 & <3.> -> < De>', '1 & <3> -> '] | -+--------+---------------------------------------------------------------------+ -| г | ['2 & <г.> -> <Г>', '2 & <г> -> <т >', '2 & <г> -> <т>', '2 & <гр> | -| | -> ', '2 & <гр> -> <тв>'] | -+--------+---------------------------------------------------------------------+ -| N | ['22 & -> <М>'] | -+--------+---------------------------------------------------------------------+ -| в | ['3 & <ва,> -> <нь>', '1 & <в> -> <В>', '1 & <в> -> <п>'] | -+--------+---------------------------------------------------------------------+ -| р | ['2 & <гр> -> ', '2 & <гр> -> <тв>', '2 & <ре> -> <с>'] | -+--------+---------------------------------------------------------------------+ -| Н | ['6 & <Н> -> <* П>', '6 & <Н> -> <° >', '3 & <Н> -> <¢ П>', '2 & | -| | <ЕН> -> <ек>', '2 & <Н> -> <. 
>', '2 & <Н> -> <И>'] | -+--------+---------------------------------------------------------------------+ -| с | ['2 & <ис> -> <не>', '1 & <с> -> ', '1 & <с> -> <©>', '1 & <с> | -| | -> <е>'] | -+--------+---------------------------------------------------------------------+ -| А | ['2 & <СА> -> '] | -+--------+---------------------------------------------------------------------+ -| И | ['3 & <И> -> ', '1 & <И> -> <Й>', '1 & <И> -> <Н>', '1 & <И> | -| | -> <П>'] | -+--------+---------------------------------------------------------------------+ -| д | ['3 & <д> -> <л>'] | -+--------+---------------------------------------------------------------------+ -| Е | ['2 & <ЕН> -> <ек>'] | -+--------+---------------------------------------------------------------------+ -| О | ['4 & <ОС> -> ', '2 & <ВО> -> <Ю>', '2 & <Об> -> <06>', '1 & | -| | <О> -> <о>'] | -+--------+---------------------------------------------------------------------+ -| П | ['1 & <П> -> <И>'] | -+--------+---------------------------------------------------------------------+ -| Т | ['4 & <Т> -> <Г>', '3 & <МРТ> -> ', '3 & <ТЗР> -> '] | -+--------+---------------------------------------------------------------------+ -| п | ['3 & <тип> -> ', '2 & <п> -> <и>', '2 & <п> -> <н>'] | -+--------+---------------------------------------------------------------------+ -| В | ['6 & <СЗВ> -> ', '2 & <ВЗ> -> <Ръ>', '2 & <ВО> -> <Ю>'] | -+--------+---------------------------------------------------------------------+ -| 0 | ['3 & <608> -> '] | -+--------+---------------------------------------------------------------------+ -| - | ['3 & <-> -> <=>', '1 & <-> -> <|>'] | -+--------+---------------------------------------------------------------------+ -| 6 | ['3 & <608> -> ', '2 & < 6> -> <б>', '2 & <6.> -> '] | -+--------+---------------------------------------------------------------------+ -| I | ['3 & -> ', '3 & -> <Ш>', '3 & -> <УП>', '1 | -| | & -> <|>'] | 
-+--------+---------------------------------------------------------------------+ -| М | ['3 & <МРТ> -> '] | -+--------+---------------------------------------------------------------------+ -| Р | ['3 & <МРТ> -> ', '3 & <ТЗР> -> '] | -+--------+---------------------------------------------------------------------+ -| б | ['2 & <Об> -> <06>'] | -+--------+---------------------------------------------------------------------+ -| 5 | ['2 & <75> -> <#2>'] | -+--------+---------------------------------------------------------------------+ -| ; | ['8 & <;> -> <:>'] | -+--------+---------------------------------------------------------------------+ -| ь | ['2 & <ь> -> < Ь>'] | -+--------+---------------------------------------------------------------------+ -| 8 | ['3 & <608> -> ', '2 & <28> -> ', '2 & <28> -> <ИР>', '2 & | -| | <28> -> <Я >'] | -+--------+---------------------------------------------------------------------+ -| E | ['6 & -> <ЕВР>'] | -+--------+---------------------------------------------------------------------+ -| З | ['6 & <СЗВ> -> ', '3 & <БЗ> -> <653>', '3 & <ТЗР> -> ', | -| | '2 & <ВЗ> -> <Ръ>'] | -+--------+---------------------------------------------------------------------+ -| 7 | ['2 & <75> -> <#2>'] | -+--------+---------------------------------------------------------------------+ -| ц | ['1 & <ц> -> <щ>'] | -+--------+---------------------------------------------------------------------+ -| ч | ['1 & <ч> -> <з>'] | -+--------+---------------------------------------------------------------------+ -| C | ['2 & <1C> -> ', '2 & <1C> -> <С>', '2 & -> <С>'] | -+--------+---------------------------------------------------------------------+ -| Б | ['3 & <БЗ> -> <653>'] | -+--------+---------------------------------------------------------------------+ -| Д | ['1 & <Д> -> <З>'] | -+--------+---------------------------------------------------------------------+ -| й | ['1 & <й> -> <:>'] | 
-+--------+---------------------------------------------------------------------+ -| Ц | ['1 & <Ц> -> <Т>'] | -+--------+---------------------------------------------------------------------+ -| P | ['6 & -> <ЕВР>'] | -+--------+---------------------------------------------------------------------+ -| R | ['6 & -> <ЕВР>'] | -+--------+---------------------------------------------------------------------+ -| a | ['4 & -> <на>', '1 & -> <а>'] | -+--------+---------------------------------------------------------------------+ -| G | ['2 & -> <С>'] | -+--------+---------------------------------------------------------------------+ -| H | ['4 & -> <на>'] | -+--------+---------------------------------------------------------------------+ -| V | ['3 & -> <УП>'] | -+--------+---------------------------------------------------------------------+ -| m | ['2 & -> '] | -+--------+---------------------------------------------------------------------+ -| | | ['1 & <|> -> <1>'] | -+--------+---------------------------------------------------------------------+ -| № | ['3 & <№1> -> ', '3 & <№1»> -> '] | -+--------+---------------------------------------------------------------------+ -| Ю | ['2 & <Ю> -> <1О>'] | -+--------+---------------------------------------------------------------------+ -| Y | ['1 & -> <У>'] | -+--------+---------------------------------------------------------------------+ -| _ | ['1 & <_> -> < >'] | -+--------+---------------------------------------------------------------------+ -| c | ['1 & -> <с>'] | -+--------+---------------------------------------------------------------------+ -| d | ['1 & -> <4>'] | -+--------+---------------------------------------------------------------------+ -| o | ['2 & -> '] | -+--------+---------------------------------------------------------------------+ -| y | ['1 & -> <у>'] | -+--------+---------------------------------------------------------------------+ -| » | ['3 & <№1»> -> '] | 
-+--------+---------------------------------------------------------------------+ -| щ | ['1 & <щ> -> <ш>'] | -+--------+---------------------------------------------------------------------+ -| ‚ | ['2 & <‚> -> <_,>'] | -+--------+---------------------------------------------------------------------+ \ No newline at end of file diff --git a/resources/benchmarks/tesseract_benchmark_Correction.SAGE_CORRECTION.txt b/resources/benchmarks/tesseract_benchmark_Correction.SAGE_CORRECTION.txt new file mode 100644 index 00000000..1cc5782f --- /dev/null +++ b/resources/benchmarks/tesseract_benchmark_Correction.SAGE_CORRECTION.txt @@ -0,0 +1,473 @@ +Tesseract version is 5.0.0 +Correction step: Correction.SAGE_CORRECTION + +Table 1 - Accuracy for each file ++---------------+----------------+--------------+---------------+--------------+ +| Dataset | Image name | OCR language | Amount of | Accuracy OCR | +| | | | words | | ++===============+================+==============+===============+==============+ +| english-words | Kaspersky | rus+eng | 111 | 99.300 | ++---------------+----------------+--------------+---------------+--------------+ +| english-words | USB | rus+eng | 4 | 0 | ++---------------+----------------+--------------+---------------+--------------+ +| english-words | words1 | rus+eng | 19 | 100 | ++---------------+----------------+--------------+---------------+--------------+ +| english-words | words2 | rus+eng | 9 | 100 | ++---------------+----------------+--------------+---------------+--------------+ +| english-words | words3 | rus+eng | 9 | 100 | ++---------------+----------------+--------------+---------------+--------------+ +| low_quality | VKR_5 | rus | 68 | 50.700 | ++---------------+----------------+--------------+---------------+--------------+ +| others | Zaklyuchenie_n | rus | 525 | 83.200 | +| | evrologa_00 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| others | Zaklyuchenie_n | rus | 241 | 87.100 | +| | 
evrologa_01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| others | napalm_doc_2_2 | rus | 124 | 85.100 | +| | _6 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 05df9bb8-88bf- | rus | 301 | 99.300 | +| | 4bae-8eb4-dcce | | | | +| | 4961e588-2 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 05df9bb8-88bf- | rus | 230 | 97.400 | +| | 4bae-8eb4-dcce | | | | +| | 4961e588-3 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 1.620e+14 | rus | 695 | 99.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 1.620e+14 | rus | 696 | 99.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 1.620e+14 | rus | 699 | 99.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 30dc613d-a791- | rus | 155 | 88.700 | +| | 4097-9d8f-b8df | | | | +| | fc00f879-01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 30dc613d-a791- | rus | 266 | 97.700 | +| | 4097-9d8f-b8df | | | | +| | fc00f879-02 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 30dc613d-a791- | rus | 307 | 95.800 | +| | 4097-9d8f-b8df | | | | +| | fc00f879-03 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 30dc613d-a791- | rus | 343 | 96.900 | +| | 4097-9d8f-b8df | | | | +| | fc00f879-04 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 482b19a1-6f22- | rus | 262 | 98.100 | +| | 4ed1-99c8-88a4 | | | | +| | f5ef18f8-001 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 482b19a1-6f22- | 
rus | 236 | 92.300 | +| | 4ed1-99c8-88a4 | | | | +| | f5ef18f8-002 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 5bb5625f-7765- | rus | 188 | 92.200 | +| | 48e3-ae49-4e4e | | | | +| | 974c9902-01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 5bb5625f-7765- | rus | 59 | 94.900 | +| | 48e3-ae49-4e4e | | | | +| | 974c9902-02 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 6d3e9329-9716- | rus | 151 | 98.700 | +| | 4024-89d1-1d48 | | | | +| | b93ee790-01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 6d3e9329-9716- | rus | 243 | 97.800 | +| | 4024-89d1-1d48 | | | | +| | b93ee790-02 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 6d3e9329-9716- | rus | 322 | 97.900 | +| | 4024-89d1-1d48 | | | | +| | b93ee790-03 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | LAW_11 | rus | 194 | 88.700 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_11 | rus | 76 | 94.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_21 | rus | 61 | 97.500 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_22 | rus | 278 | 98.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_23 | rus | 277 | 98 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_24 | rus | 288 | 99.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_25 | rus | 347 | 99.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr 
| TZ_26 | rus | 192 | 99 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_27 | rus | 173 | 98 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_28 | rus | 133 | 99.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_29 | rus | 182 | 99.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_30 | rus | 178 | 98.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_31 | rus | 37 | 97.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_32 | rus | 221 | 99.700 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_33 | rus | 312 | 95.900 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_34 | rus | 83 | 92.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_35 | rus | 355 | 97.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_1 | rus | 86 | 99.300 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_2 | rus | 87 | 98.900 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_3 | rus | 89 | 95.700 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_4 | rus | 89 | 90.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_6 | rus | 117 | 99.100 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | article_multil | rus | 471 | 99.900 | +| | ine | | | | 
++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | fstek17_00 | rus | 192 | 92.900 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | fstek17_01 | rus | 332 | 99.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | law_image | rus | 182 | 99.100 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | napalm_doc_13_ | rus | 243 | 96.900 | +| | 2 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ukodeksrf_00 | rus | 287 | 99 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ukodeksrf_01 | rus | 340 | 99.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | with_applicati | rus | 146 | 94.700 | +| | ons_00 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | with_applicati | rus | 276 | 98.600 | +| | ons_01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | with_applicati | rus | 93 | 99 | +| | ons_02 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | with_applicati | rus | 54 | 99.600 | +| | ons_03 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_00 | rus | 78 | 96.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_01 | rus | 296 | 96.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_02 | rus | 309 | 98.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_03 | rus | 337 | 96.700 | 
++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_04 | rus | 257 | 77.300 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_05 | rus | 238 | 97.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_06 | rus | 219 | 94.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_07 | rus | 233 | 95.300 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_08 | rus | 284 | 98.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_09 | rus | 154 | 95.900 | ++---------------+----------------+--------------+---------------+--------------+ + +Table 2 - AVG by each type of symbols: ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| Datase | ASCII_ | ASCII_ | ASCII_ | ASCII_ | Latin1 | Cyrill | Amoun | AVG A | +| t | Spacin | Specia | Digits | Upperc | _Speci | ic | t of | ccura | +| | g_Char | l_Symb | | ase_Ch | al_Sym | | words | cy | +| | s | ols | | ars | bols | | | | ++========+========+========+========+========+========+========+=======+=======+ +| englis | 79.820 | 66 | 50 | 0 | 0 | 80 | 152 | 79.86 | +| h- | | | | | | | | 0 | +| words | | | | | | | | | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| low_qu | 92.600 | 60 | 46.100 | 0 | 0 | 78.200 | 68 | 50.70 | +| ality | | | | | | | | 0 | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| others | 89.933 | 76.967 | 87.167 | 0 | 0 | 87.100 | 890 | 85.13 | +| | | | | | | | | 3 | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| tz- | 97.920 | 91.678 | 94.608 | 0 | 0 | 99.100 | 14029 | 96.77 | +| npa- | | | | | | | | 3 | +| vkr | | | | | | | | | 
++--------+--------+--------+--------+--------+--------+--------+-------+-------+ + +Table 3 -OCR error by symbol: ++--------+---------------------------------------------------------------------+ +| Symbol | Cnt Errors & Correct-Generated | ++========+=====================================================================+ +| | ['8 & <№ > -> ', '4 & <* > -> <4.>', '3 & <— 1> -> <19>', '3 & | +| | <— П> -> <И>', '2 & < 3> -> <З>', '2 & < г> -> <.>', '2 & < г> -> | +| | <К>', '2 & < г> -> <т>', '2 & < п> -> <тн>', '2 & < —> -> <0>', '2 | +| | & < ‚> -> <,>', "2 & <1 > -> <'>", '2 & <8 > -> <Р>', '2 & <; > -> | +| | <.>', '2 & <и > -> <н>', '2 & <й > -> <ст>', '1 & < > -> <(>', '1 & | +| | < > -> '] | ++--------+---------------------------------------------------------------------+ +| . | ['10 & <.> -> <,>', '3 & <3.1> -> <ЗЛА>', '3 & <3Г.> -> <5>', '2 & | +| | <8.> -> <$>', '2 & <8.> -> <5>', '2 & -> <9>', '2 & <г.> -> | +| | <ГТ>', '1 & <.> -> <:>'] | ++--------+---------------------------------------------------------------------+ +| — | ['7 & <—> -> <->', '3 & <— 1> -> <19>', '3 & <— П> -> <И>', '2 & < | +| | —> -> <0>'] | ++--------+---------------------------------------------------------------------+ +| № | ['170 & <№> -> ', '8 & <№ > -> ', '3 & <№17> -> <ДК>', '3 & | +| | <№> -> ', '3 & <№> -> ', '1 & <№> -> <и>'] | ++--------+---------------------------------------------------------------------+ +| 1 | ['4 & <1C> -> ', '4 & <1> -> <3>', "3 & <1> -> <'>", '3 & <3.1> | +| | -> <ЗЛА>', '3 & <31"> -> < А>', '3 & <— 1> -> <19>', '3 & <№17> -> | +| | <ДК>', "2 & <1 > -> <'>", '2 & <11> -> <И>', '2 & <1C> -> <С>', '2 | +| | & <1> -> <5>', '1 & <1> -> ', '1 & <1> -> <(>', '1 & <1> -> | +| | <2>', '1 & <1> -> <4>', '1 & <1> -> <Г>', '1 & <1> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| , | ['69 & <,> -> <.>', '3 & <ва,> -> <нь>', '2 & <Ш,> -> <П.>', '1 & | +| | <,> -> <;>'] | 
++--------+---------------------------------------------------------------------+ +| е | ['2 & <е-> -> <ав>', '2 & <е-> -> <им>', '2 & <е-> -> <уд>', '2 & | +| | <е> -> <га>', '2 & <е> -> <и>', '2 & <е> -> <ё>', '2 & <ле> -> | +| | <У>', '1 & <е> -> <й>', '1 & <е> -> <о>'] | ++--------+---------------------------------------------------------------------+ +| и | ['3 & <и> -> <е>', '2 & <и > -> <н>', '2 & <и> -> <И>', '2 & <из> | +| | -> <по>', '2 & <ис> -> <не>', '2 & <си> -> <ен>', '1 & <и> -> <В>', | +| | '1 & <и> -> <Н>', '1 & <и> -> <а>', '1 & <и> -> <н>', '1 & <и> -> | +| | <ь>'] | ++--------+---------------------------------------------------------------------+ +| а | ['8 & <а> -> <о>', '3 & <ва,> -> <нь>', '2 & <ав> -> <ыс>', '2 & | +| | <па> -> <те>', '1 & <а> -> <Б>', '1 & <а> -> <б>', '1 & <а> -> | +| | <у>'] | ++--------+---------------------------------------------------------------------+ +| н | ['3 & <льн> -> <з>', '3 & <нс> -> <эро>', '2 & <н> -> <п>', '1 & | +| | <н> -> <й>'] | ++--------+---------------------------------------------------------------------+ +| о | ['4 & <по> -> <на>', '3 & <фок> -> <М>', '2 & <о-> -> <ым>', '2 & | +| | <от> -> <и>', '1 & <о> -> <у>', '1 & <о> -> <я>'] | ++--------+---------------------------------------------------------------------+ +| т | ['2 & <от> -> <и>', '2 & <рт> -> <й>', '2 & <т> -> < >', '2 & <т> | +| | -> <1>', '2 & <т> -> <Д>', '2 & <т> -> <г>', '2 & <т> -> <ин>', '2 | +| | & <эт> -> <Юг>', '1 & <т> -> <б>', '1 & <т> -> <л>', '1 & <т> -> | +| | <н>'] | ++--------+---------------------------------------------------------------------+ +| - | ['6 & <-> -> <мы>', '4 & <-> -> <го>', '3 & <-> -> < и >', '3 & <-> | +| | -> <м>', '3 & <-> -> <ния>', '3 & <-> -> <тов>', '3 & <-> -> | +| | <тых>', '2 & <-> -> <»>', '2 & <-> -> <ия>', '2 & <-> -> <ки>', '2 | +| | & <-> -> <ли>', '2 & <-> -> <ма>', '2 & <-> -> <мо>', '2 & <-> -> | +| | <ры>', '2 & <-> -> <сы>', '2 & <-> -> <ы>', '2 & <е-> -> <ав>', '2 | +| | & <е-> 
-> <им>', '2 & <е-> -> <уд>', '2 & <о-> -> <ым>', '1 & <-> | +| | -> <ь>'] | ++--------+---------------------------------------------------------------------+ +| р | ['3 & <гр> -> <тав>', '2 & <р> -> <ол>', '2 & <рт> -> <й>', '2 & | +| | <эр> -> <ци>', '1 & <р> -> <Р>', '1 & <р> -> <д>'] | ++--------+---------------------------------------------------------------------+ +| с | ['3 & <нс> -> <эро>', '2 & <(с> -> <С>', '2 & <ис> -> <не>', '2 & | +| | <с> -> <ез>', '2 & <с> -> <ец>', '2 & <си> -> <ен>', '1 & <с> -> | +| | <е>'] | ++--------+---------------------------------------------------------------------+ +| 3 | ['3 & <3.1> -> <ЗЛА>', '3 & <31"> -> < А>', '3 & <3Г.> -> <5>', '2 | +| | & < 3> -> <З>', '1 & <3> -> <1>', '1 & <3> -> <5>'] | ++--------+---------------------------------------------------------------------+ +| И | ['2 & <И> -> <АН>', '1 & <И> -> <В>', '1 & <И> -> <Й>', '1 & <И> -> | +| | <Н>'] | ++--------+---------------------------------------------------------------------+ +| 2 | ['2 & <28> -> <ИР>', '2 & <28> -> <Я>', '1 & <2> -> <1>', '1 & <2> | +| | -> <3>'] | ++--------+---------------------------------------------------------------------+ +| в | ['3 & <ва,> -> <нь>', '2 & <ав> -> <ыс>', '2 & <в> -> <по>', '1 & | +| | <в> -> <м>'] | ++--------+---------------------------------------------------------------------+ +| л | ['3 & <льн> -> <з>', '2 & <ле> -> <У>', '1 & <л> -> <Д>', '1 & <л> | +| | -> <Л>', '1 & <л> -> <д>', '1 & <л> -> <т>'] | ++--------+---------------------------------------------------------------------+ +| 6 | ['1 & <6> -> <5>'] | ++--------+---------------------------------------------------------------------+ +| г | ['3 & <гр> -> <тав>', '2 & < г> -> <.>', '2 & < г> -> <К>', '2 & < | +| | г> -> <т>', '2 & <г.> -> <ГТ>', '2 & <г> -> <т>', '1 & <г> -> <Г>'] | ++--------+---------------------------------------------------------------------+ +| А | ['3 & <МАЯ> -> <сам>'] | 
++--------+---------------------------------------------------------------------+ +| д | ['1 & <д> -> <з>', '1 & <д> -> <л>', '1 & <д> -> <п>', '1 & <д> -> | +| | <ц>'] | ++--------+---------------------------------------------------------------------+ +| E | ['36 & -> <ЕВР>', '6 & -> <ЕКР>', '6 & -> <УЕВ>', | +| | '3 & -> <ЕЕР>', '1 & -> <Е>'] | ++--------+---------------------------------------------------------------------+ +| О | ['2 & <О> -> <СЯ>', '2 & <ПО> -> <по>', '1 & <О> -> <Ю>', '1 & <О> | +| | -> <о>'] | ++--------+---------------------------------------------------------------------+ +| б | ['2 & <б> -> <6>', '1 & <б> -> <ш>'] | ++--------+---------------------------------------------------------------------+ +| Н | ['2 & <Н> -> <ЕМ>', '1 & <Н> -> <И>'] | ++--------+---------------------------------------------------------------------+ +| N | ['23 & -> <М>'] | ++--------+---------------------------------------------------------------------+ +| Е | ['2 & <ЕМ> -> <Ш>', '1 & <Е> -> <в>'] | ++--------+---------------------------------------------------------------------+ +| 4 | ['1 & <4> -> <6>', '1 & <4> -> <7>'] | ++--------+---------------------------------------------------------------------+ +| у | ['5 & <у> -> <ы>'] | ++--------+---------------------------------------------------------------------+ +| п | ['4 & <по> -> <на>', '2 & < п> -> <тн>', '2 & <п> -> <нн>', '2 & | +| | <па> -> <те>', '1 & <п> -> <к>'] | ++--------+---------------------------------------------------------------------+ +| Т | ['3 & <Т> -> <Г>'] | ++--------+---------------------------------------------------------------------+ +| P | ['36 & -> <ЕВР>', '6 & -> <ЕКР>'] | ++--------+---------------------------------------------------------------------+ +| Р | ['1 & <Р> -> <р>'] | ++--------+---------------------------------------------------------------------+ +| 0 | ['2 & <08> -> <9Ф>'] | ++--------+---------------------------------------------------------------------+ +| R | 
['36 & -> <ЕВР>', '6 & -> <ЕКР>'] | ++--------+---------------------------------------------------------------------+ +| м | ['3 & <мы> -> <«СП>', '2 & <мы> -> <ру>', '1 & <м> -> <й>'] | ++--------+---------------------------------------------------------------------+ +| ы | ['3 & <мы> -> <«СП>', '2 & <мы> -> <ру>', '1 & <ы> -> <а>', '1 & | +| | <ы> -> <б>'] | ++--------+---------------------------------------------------------------------+ +| я | ['2 & <яз> -> <л>', '1 & <я> -> <а>'] | ++--------+---------------------------------------------------------------------+ +| I | ['3 & -> <Ш>', '3 & -> <ТТХ>', '3 & -> <УП>', '2 & | +| | -> <1>', '2 & -> <”>'] | ++--------+---------------------------------------------------------------------+ +| C | ['7 & -> <С>', '6 & -> <С.>', '4 & <1C> -> ', '2 & | +| | <1C> -> <С>', '2 & -> <ОС>'] | ++--------+---------------------------------------------------------------------+ +| й | ['3 & <й> -> <е>', '2 & <й > -> <ст>', '2 & <й> -> <го>', '2 & <й> | +| | -> <е >', '2 & <й> -> <е:>'] | ++--------+---------------------------------------------------------------------+ +| П | ['3 & <П> -> <ИР >', '3 & <— П> -> <И>', '2 & <ПО> -> <по>', '1 & | +| | <П> -> <И>', '1 & <П> -> <К>'] | ++--------+---------------------------------------------------------------------+ +| 7 | ['3 & <№17> -> <ДК>', '1 & <7> -> <1>'] | ++--------+---------------------------------------------------------------------+ +| М | ['3 & <МАЯ> -> <сам>', '2 & <ЕМ> -> <Ш>'] | ++--------+---------------------------------------------------------------------+ +| 8 | ['2 & <08> -> <9Ф>', '2 & <28> -> <ИР>', '2 & <28> -> <Я>', '2 & <8 | +| | > -> <Р>', '2 & <8.> -> <$>', '2 & <8.> -> <5>'] | ++--------+---------------------------------------------------------------------+ +| r | ['2 & -> <Ка>', '2 & -> <ки>', '2 & -> <н>', '1 & | +| | -> <г>'] | ++--------+---------------------------------------------------------------------+ +| ь | ['3 & <льн> -> <з>', '1 & <ь> -> <т>'] | 
++--------+---------------------------------------------------------------------+ +| o | ['3 & <(no> -> <по>', '3 & -> <2о>'] | ++--------+---------------------------------------------------------------------+ +| u | ['2 & -> <Ка>', '2 & -> <ки>', '2 & -> <н>'] | ++--------+---------------------------------------------------------------------+ +| з | ['2 & <из> -> <по>', '2 & <яз> -> <л>', '1 & <з> -> <3>'] | ++--------+---------------------------------------------------------------------+ +| к | ['3 & <фок> -> <М>', '1 & <к> -> <1>', '1 & <к> -> <с>'] | ++--------+---------------------------------------------------------------------+ +| : | ['6 & -> <С.>', '5 & <:> -> <.>'] | ++--------+---------------------------------------------------------------------+ +| ; | ['9 & <;> -> <:>', '2 & <; > -> <.>'] | ++--------+---------------------------------------------------------------------+ +| ч | ['1 & <ч> -> <д>'] | ++--------+---------------------------------------------------------------------+ +| a | ['4 & -> <на>', '2 & -> <На>', '2 & -> <а>'] | ++--------+---------------------------------------------------------------------+ +| В | ['2 & <ВЗ> -> <РИ>'] | ++--------+---------------------------------------------------------------------+ +| ц | ['1 & <ц> -> <«>', '1 & <ц> -> <С>', '1 & <ц> -> <щ>'] | ++--------+---------------------------------------------------------------------+ +| _ | ['1 & <_> -> <Х>'] | ++--------+---------------------------------------------------------------------+ +| Б | ['2 & <БЗ> -> <53>'] | ++--------+---------------------------------------------------------------------+ +| w | ['3 & -> ', '3 & -> <ув>'] | ++--------+---------------------------------------------------------------------+ +| d | ['3 & -> <рар>', '1 & -> <4>'] | ++--------+---------------------------------------------------------------------+ +| e | ['2 & -> <Не>'] | ++--------+---------------------------------------------------------------------+ +| O | ['2 & -> <ОС>'] | 
++--------+---------------------------------------------------------------------+ +| Д | ['1 & <Д> -> <З>'] | ++--------+---------------------------------------------------------------------+ +| З | ['2 & <БЗ> -> <53>', '2 & <ВЗ> -> <РИ>', '1 & <З> -> <У>'] | ++--------+---------------------------------------------------------------------+ +| Ц | ['1 & <Ц> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| Я | ['3 & <МАЯ> -> <сам>'] | ++--------+---------------------------------------------------------------------+ +| " | ['3 & <31"> -> < А>', '2 & <""> -> <с>'] | ++--------+---------------------------------------------------------------------+ +| D | ['3 & -> <ЕЕР>', '2 & -> <П>'] | ++--------+---------------------------------------------------------------------+ +| f | ['3 & -> <рар>'] | ++--------+---------------------------------------------------------------------+ +| ( | ['3 & <(no> -> <по>', '2 & <(с> -> <С>'] | ++--------+---------------------------------------------------------------------+ +| A | ['2 & -> <$А>'] | ++--------+---------------------------------------------------------------------+ +| H | ['4 & -> <на>', '3 & -> <КНМ>', '2 & -> <На>', '2 & | +| | -> <Не>'] | ++--------+---------------------------------------------------------------------+ +| V | ['3 & -> <УП>', '2 & -> <”>'] | ++--------+---------------------------------------------------------------------+ +| b | ['1 & -> <Ь>'] | ++--------+---------------------------------------------------------------------+ +| g | ['3 & -> <2о>'] | ++--------+---------------------------------------------------------------------+ +| n | ['3 & <(no> -> <по>'] | ++--------+---------------------------------------------------------------------+ +| p | ['3 & -> <рар>'] | ++--------+---------------------------------------------------------------------+ +| | | ['1 & <|> -> <1>'] | ++--------+---------------------------------------------------------------------+ +| Г | 
['3 & <3Г.> -> <5>'] | ++--------+---------------------------------------------------------------------+ +| Ю | ['2 & <Ю> -> <1 >'] | ++--------+---------------------------------------------------------------------+ +| ш | ['1 & <ш> -> <с>', '1 & <ш> -> <щ>'] | ++--------+---------------------------------------------------------------------+ +| * | ['4 & <* > -> <4.>'] | ++--------+---------------------------------------------------------------------+ +| B | ['6 & -> <УЕВ>'] | ++--------+---------------------------------------------------------------------+ +| F | ['3 & -> <ЕЕР>'] | ++--------+---------------------------------------------------------------------+ +| S | ['2 & -> <$А>'] | ++--------+---------------------------------------------------------------------+ +| э | ['2 & <эр> -> <ци>', '2 & <эт> -> <Юг>'] | ++--------+---------------------------------------------------------------------+ +| ю | ['1 & <ю> -> <у>'] | ++--------+---------------------------------------------------------------------+ +| G | ['2 & -> <С>'] | ++--------+---------------------------------------------------------------------+ +| M | ['3 & -> <КНМ>'] | ++--------+---------------------------------------------------------------------+ +| c | ['2 & -> <9>'] | ++--------+---------------------------------------------------------------------+ +| v | ['3 & -> <2о>'] | ++--------+---------------------------------------------------------------------+ +| » | ['3 & <»> -> <22%>'] | ++--------+---------------------------------------------------------------------+ +| Х | ['1 & <Х> -> <Д>'] | ++--------+---------------------------------------------------------------------+ +| Ш | ['2 & <Ш,> -> <П.>'] | ++--------+---------------------------------------------------------------------+ +| ф | ['3 & <фок> -> <М>'] | ++--------+---------------------------------------------------------------------+ +| ‚ | ['2 & < ‚> -> <,>'] | 
++--------+---------------------------------------------------------------------+ +| L | ['2 & -> <ГХ>'] | ++--------+---------------------------------------------------------------------+ +| W | ['6 & -> <УЕВ>'] | ++--------+---------------------------------------------------------------------+ +| X | ['3 & -> <ТТХ>', '2 & -> <ГХ>'] | ++--------+---------------------------------------------------------------------+ +| y | ['2 & -> <П>', '1 & -> <у>'] | ++--------+---------------------------------------------------------------------+ +| K | ['3 & -> <КНМ>'] | ++--------+---------------------------------------------------------------------+ +| Y | ['1 & -> <У>'] | ++--------+---------------------------------------------------------------------+ +| ₁ | ['1 & <₁> -> <1>'] | ++--------+---------------------------------------------------------------------+ \ No newline at end of file diff --git a/resources/benchmarks/tesseract_benchmark_Correction.WITHOUT_CORRECTION.txt b/resources/benchmarks/tesseract_benchmark_Correction.WITHOUT_CORRECTION.txt new file mode 100644 index 00000000..e4b6a15a --- /dev/null +++ b/resources/benchmarks/tesseract_benchmark_Correction.WITHOUT_CORRECTION.txt @@ -0,0 +1,443 @@ +Tesseract version is 5.0.0 +Correction step: Correction.WITHOUT_CORRECTION + +Table 1 - Accuracy for each file ++---------------+----------------+--------------+---------------+--------------+ +| Dataset | Image name | OCR language | Amount of | Accuracy OCR | +| | | | words | | ++===============+================+==============+===============+==============+ +| english-words | Kaspersky | rus+eng | 111 | 99.600 | ++---------------+----------------+--------------+---------------+--------------+ +| english-words | USB | rus+eng | 4 | 0 | ++---------------+----------------+--------------+---------------+--------------+ +| english-words | words1 | rus+eng | 19 | 100 | ++---------------+----------------+--------------+---------------+--------------+ +| english-words | words2 
| rus+eng | 9 | 100 | ++---------------+----------------+--------------+---------------+--------------+ +| english-words | words3 | rus+eng | 9 | 100 | ++---------------+----------------+--------------+---------------+--------------+ +| low_quality | VKR_5 | rus | 68 | 51.600 | ++---------------+----------------+--------------+---------------+--------------+ +| others | Zaklyuchenie_n | rus | 525 | 83.800 | +| | evrologa_00 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| others | Zaklyuchenie_n | rus | 241 | 88.600 | +| | evrologa_01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| others | napalm_doc_2_2 | rus | 124 | 86.300 | +| | _6 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 05df9bb8-88bf- | rus | 301 | 99.600 | +| | 4bae-8eb4-dcce | | | | +| | 4961e588-2 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 05df9bb8-88bf- | rus | 230 | 97.700 | +| | 4bae-8eb4-dcce | | | | +| | 4961e588-3 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 1.620e+14 | rus | 695 | 99.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 1.620e+14 | rus | 696 | 99.700 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 1.620e+14 | rus | 699 | 99.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 30dc613d-a791- | rus | 155 | 92.500 | +| | 4097-9d8f-b8df | | | | +| | fc00f879-01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 30dc613d-a791- | rus | 266 | 99.300 | +| | 4097-9d8f-b8df | | | | +| | fc00f879-02 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 
30dc613d-a791- | rus | 307 | 97.400 | +| | 4097-9d8f-b8df | | | | +| | fc00f879-03 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 30dc613d-a791- | rus | 343 | 99.600 | +| | 4097-9d8f-b8df | | | | +| | fc00f879-04 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 482b19a1-6f22- | rus | 262 | 99.900 | +| | 4ed1-99c8-88a4 | | | | +| | f5ef18f8-001 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 482b19a1-6f22- | rus | 236 | 94.100 | +| | 4ed1-99c8-88a4 | | | | +| | f5ef18f8-002 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 5bb5625f-7765- | rus | 188 | 95.100 | +| | 48e3-ae49-4e4e | | | | +| | 974c9902-01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 5bb5625f-7765- | rus | 59 | 95.200 | +| | 48e3-ae49-4e4e | | | | +| | 974c9902-02 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 6d3e9329-9716- | rus | 151 | 99.400 | +| | 4024-89d1-1d48 | | | | +| | b93ee790-01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 6d3e9329-9716- | rus | 243 | 98.100 | +| | 4024-89d1-1d48 | | | | +| | b93ee790-02 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | 6d3e9329-9716- | rus | 322 | 98.700 | +| | 4024-89d1-1d48 | | | | +| | b93ee790-03 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | LAW_11 | rus | 194 | 91.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_11 | rus | 76 | 95 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_21 | rus | 61 | 98.500 | 
++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_22 | rus | 278 | 99.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_23 | rus | 277 | 98 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_24 | rus | 288 | 99.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_25 | rus | 347 | 99.900 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_26 | rus | 192 | 99.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_27 | rus | 173 | 98.900 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_28 | rus | 133 | 99.900 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_29 | rus | 182 | 99.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_30 | rus | 178 | 99.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_31 | rus | 37 | 99.300 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_32 | rus | 221 | 100 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_33 | rus | 312 | 96.700 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_34 | rus | 83 | 92.700 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | TZ_35 | rus | 355 | 98.800 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_1 | rus | 86 | 99.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_2 
| rus | 87 | 99.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_3 | rus | 89 | 95.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_4 | rus | 89 | 91.900 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | VKR_6 | rus | 117 | 99.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | article_multil | rus | 471 | 100 | +| | ine | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | fstek17_00 | rus | 192 | 95.700 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | fstek17_01 | rus | 332 | 99.500 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | law_image | rus | 182 | 99.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | napalm_doc_13_ | rus | 243 | 97.400 | +| | 2 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ukodeksrf_00 | rus | 287 | 99 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ukodeksrf_01 | rus | 340 | 99.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | with_applicati | rus | 146 | 96.100 | +| | ons_00 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | with_applicati | rus | 276 | 99.400 | +| | ons_01 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | with_applicati | rus | 93 | 99.300 | +| | ons_02 | | | | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | with_applicati | rus | 54 | 99.800 | +| | ons_03 | | | | 
++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_00 | rus | 78 | 97.100 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_01 | rus | 296 | 97 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_02 | rus | 309 | 98.300 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_03 | rus | 337 | 97.600 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_04 | rus | 257 | 78.200 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_05 | rus | 238 | 98.300 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_06 | rus | 219 | 95.300 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_07 | rus | 233 | 95.900 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_08 | rus | 284 | 98.400 | ++---------------+----------------+--------------+---------------+--------------+ +| tz-npa-vkr | ТЗ_09 | rus | 154 | 95.700 | ++---------------+----------------+--------------+---------------+--------------+ + +Table 2 - AVG by each type of symbols: ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| Datase | ASCII_ | ASCII_ | ASCII_ | ASCII_ | Latin1 | Cyrill | Amoun | AVG A | +| t | Spacin | Specia | Digits | Upperc | _Speci | ic | t of | ccura | +| | g_Char | l_Symb | | ase_Ch | al_Sym | | words | cy | +| | s | ols | | ars | bols | | | | ++========+========+========+========+========+========+========+=======+=======+ +| englis | 80 | 66 | 50 | 0 | 0 | 80 | 152 | 79.92 | +| h- | | | | | | | | 0 | +| words | | | | | | | | | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| low_qu 
| 89.700 | 70 | 46.100 | 0 | 0 | 75.900 | 68 | 51.60 | +| ality | | | | | | | | 0 | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| others | 90.833 | 77.267 | 87.167 | 0 | 0 | 87.100 | 890 | 86.23 | +| | | | | | | | | 3 | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ +| tz- | 98.292 | 93.183 | 94.602 | 0 | 0 | 99.164 | 14029 | 97.54 | +| npa- | | | | | | | | 1 | +| vkr | | | | | | | | | ++--------+--------+--------+--------+--------+--------+--------+-------+-------+ + +Table 3 -OCR error by symbol: ++--------+---------------------------------------------------------------------+ +| Symbol | Cnt Errors & Correct-Generated | ++========+=====================================================================+ +| | ['4 & <* > -> <.>', '3 & <(с > -> <С>', '3 & <— 1> -> <19>', '3 & | +| | <— П> -> <И>', '3 & <— н> -> <и>', '2 & < 6> -> <б>', '2 & < > -> | +| | <__>', '2 & < г> -> <.т>', '2 & < г> -> <т>', "2 & <1 > -> <'>", '2 | +| | & <8 > -> <Р>', '2 & <; > -> <.>', '2 & -> <№>', '2 & <е > -> | +| | <в>', '2 & <и > -> <н>', '2 & <й > -> <ст>', '1 & < > -> <_>'] | ++--------+---------------------------------------------------------------------+ +| . 
| ['10 & <.> -> <,>', '3 & <3.1> -> <ЗЛА>', '3 & <3Г.> -> <5>', '2 & | +| | <.3> -> < >', '2 & <1.> -> <„>', '2 & <8.> -> <$>', '2 & <8.> -> | +| | <5>', '2 & -> <|9>', '2 & <г.> -> <Г>', '1 & <.> -> <:>'] | ++--------+---------------------------------------------------------------------+ +| — | ['8 & <—> -> <_>', '7 & <—> -> <->', '4 & <—> -> <=>', '3 & <— 1> | +| | -> <19>', '3 & <— П> -> <И>', '3 & <— н> -> <и>'] | ++--------+---------------------------------------------------------------------+ +| 1 | ['4 & <1C> -> ', "3 & <1> -> <'>", '3 & <1> -> <|>', '3 & <3.1> | +| | -> <ЗЛА>', '3 & <— 1> -> <19>', "2 & <1 > -> <'>", '2 & <1.> -> | +| | <„>', '2 & <11> -> <И>', '2 & <1C> -> <С>', '2 & <1> -> <[>', '1 & | +| | <1> -> ', '1 & <1> -> <(>', '1 & <1> -> <4>', '1 & <1> -> <\\>', | +| | '1 & <1> -> <Г>', '1 & <1> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| , | ['69 & <,> -> <.>', '3 & <ва,> -> <нь>', '2 & <Ш,> -> <П.>', '1 & | +| | <,> -> <;>', '1 & <,> -> <‚>'] | ++--------+---------------------------------------------------------------------+ +| е | ['5 & <е> -> <с>', '3 & <ект> -> <тн>', '2 & <е > -> <в>', '2 & | +| | <е-> -> <.>', '2 & <е> -> <а>', '2 & <ем> -> <вы>', '2 & <ен> -> | +| | <ая>', '2 & <ле> -> <ыи>'] | ++--------+---------------------------------------------------------------------+ +| н | ['3 & <на-> -> ', '3 & <— н> -> <и>', '2 & <ен> -> <ая>', '2 & | +| | <н> -> <и>', '1 & <н> -> <в>', '1 & <н> -> <й>', '1 & <н> -> <п>'] | ++--------+---------------------------------------------------------------------+ +| и | ['4 & <ис> -> <не>', '3 & <цио> -> <с>', '2 & <Ди> -> <по>', '2 & | +| | <и > -> <н>', '2 & <и> -> <н>', '2 & <из> -> <по>', '1 & <и> -> | +| | <И>', '1 & <и> -> <я>'] | ++--------+---------------------------------------------------------------------+ +| а | ['3 & <аво> -> <ыс>', '3 & <ва,> -> <нь>', '3 & <на-> -> ', '2 & | +| | <ав> -> <иы>'] | 
++--------+---------------------------------------------------------------------+ +| о | ['3 & <аво> -> <ыс>', '3 & <ор> -> <ель>', '3 & <цио> -> <с>', '2 & | +| | <по> -> <иб>', '2 & <фо> -> <уп>'] | ++--------+---------------------------------------------------------------------+ +| т | ['7 & <т> -> <г>', '4 & <т> -> < г>', '3 & <ект> -> <тн>', '2 & | +| | <рт> -> <й>', '2 & <т> -> <1>', '2 & <эт> -> <уг>'] | ++--------+---------------------------------------------------------------------+ +| с | ['7 & <с> -> <е>', '4 & <ис> -> <не>', '3 & <(с > -> <С>', '1 & <с> | +| | -> <©>', '1 & <с> -> <з>'] | ++--------+---------------------------------------------------------------------+ +| р | ['3 & <ор> -> <ель>', '2 & <гр> -> <тв>', '2 & <рт> -> <й>', '2 & | +| | <эр> -> <ци>', '1 & <р> -> <Р>', '1 & <р> -> <й>'] | ++--------+---------------------------------------------------------------------+ +| 3 | ['3 & <3.1> -> <ЗЛА>', '3 & <3Г.> -> <5>', '2 & <.3> -> < >', '1 & | +| | <3> -> <5>', '1 & <3> -> <З>'] | ++--------+---------------------------------------------------------------------+ +| И | ['1 & <И> -> <Й>', '1 & <И> -> <Н>', '1 & <И> -> <П>', '1 & <И> -> | +| | <и>'] | ++--------+---------------------------------------------------------------------+ +| в | ['3 & <аво> -> <ыс>', '3 & <ва,> -> <нь>', '2 & <ав> -> <иы>', '1 & | +| | <в> -> <п>'] | ++--------+---------------------------------------------------------------------+ +| л | ['5 & <л> -> <п>', '3 & <глу> -> <по>', '2 & <ле> -> <ыи>', '1 & | +| | <л> -> <д>', '1 & <л> -> <и>'] | ++--------+---------------------------------------------------------------------+ +| 2 | ['2 & <28> -> <ИР>', '2 & <28> -> <Я >', '1 & <2> -> <1>', '1 & <2> | +| | -> <3>'] | ++--------+---------------------------------------------------------------------+ +| 6 | ['2 & < 6> -> <б>', '1 & <6> -> <5>'] | ++--------+---------------------------------------------------------------------+ +| д | ['4 & <д> -> <л>', '3 & <д> -> <; ц>', '2 & 
<д> -> <; >'] | ++--------+---------------------------------------------------------------------+ +| О | ['2 & <ВО> -> <Ю>', '2 & <ПО> -> <по>', '1 & <О> -> <С>', '1 & <О> | +| | -> <о>'] | ++--------+---------------------------------------------------------------------+ +| г | ['3 & <глу> -> <по>', '2 & < г> -> <.т>', '2 & < г> -> <т>', '2 & | +| | <г.> -> <Г>', '2 & <г> -> ', '2 & <гр> -> <тв>', '1 & <г> -> < | +| | >', '1 & <г> -> <т>'] | ++--------+---------------------------------------------------------------------+ +| E | ['39 & -> <ЕВР>', '6 & -> <ЕКР>', '3 & -> <2ЕЮ>', | +| | '3 & -> <ЕЕР>', '1 & -> <Е>'] | ++--------+---------------------------------------------------------------------+ +| Н | ['2 & <ЕН> -> <ек>', '2 & <Н> -> <И>'] | ++--------+---------------------------------------------------------------------+ +| п | ['5 & <п> -> <и>', '2 & <п> -> <н>', '2 & <п> -> <нн>', '2 & <по> | +| | -> <иб>', '1 & <п> -> <т>'] | ++--------+---------------------------------------------------------------------+ +| N | ['23 & -> <М>', '2 & -> <№>'] | ++--------+---------------------------------------------------------------------+ +| Е | ['2 & <ЕН> -> <ек>', '1 & <Е> -> <в>'] | ++--------+---------------------------------------------------------------------+ +| б | ['2 & <б> -> <6>'] | ++--------+---------------------------------------------------------------------+ +| Т | ['4 & <Т> -> <Г>'] | ++--------+---------------------------------------------------------------------+ +| P | ['39 & -> <ЕВР>', '6 & -> <ЕКР>'] | ++--------+---------------------------------------------------------------------+ +| Р | ['1 & <Р> -> <з>'] | ++--------+---------------------------------------------------------------------+ +| R | ['39 & -> <ЕВР>', '6 & -> <ЕКР>'] | ++--------+---------------------------------------------------------------------+ +| у | ['3 & <глу> -> <по>'] | ++--------+---------------------------------------------------------------------+ +| 0 | ['2 & <08> -> <9Ф>'] 
| ++--------+---------------------------------------------------------------------+ +| П | ['3 & <— П> -> <И>', '2 & <П> -> <И>', '2 & <ПО> -> <по>', '1 & <П> | +| | -> <Г>'] | ++--------+---------------------------------------------------------------------+ +| я | ['2 & <я> -> <го>', '1 & <я> -> <л>'] | ++--------+---------------------------------------------------------------------+ +| I | ['3 & -> <Ш>', '3 & -> <130>', '3 & -> <УП>', '2 | +| | & -> <ТХ>', '2 & -> <1>', '1 & -> <|>'] | ++--------+---------------------------------------------------------------------+ +| м | ['2 & <ем> -> <вы>'] | ++--------+---------------------------------------------------------------------+ +| C | ['7 & -> <С>', '6 & -> <С.>', '4 & <1C> -> ', '2 & | +| | <1C> -> <С>', '2 & -> <ОС>'] | ++--------+---------------------------------------------------------------------+ +| - | ['3 & <на-> -> ', '2 & <е-> -> <.>', '1 & <-> -> < >', '1 & <-> | +| | -> <|>'] | ++--------+---------------------------------------------------------------------+ +| 7 | ['2 & <75> -> <#>', '1 & <7> -> <1>', '1 & <7> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| 5 | ['2 & <75> -> <#>'] | ++--------+---------------------------------------------------------------------+ +| 8 | ['2 & <08> -> <9Ф>', '2 & <28> -> <ИР>', '2 & <28> -> <Я >', '2 & | +| | <8 > -> <Р>', '2 & <8.> -> <$>', '2 & <8.> -> <5>'] | ++--------+---------------------------------------------------------------------+ +| r | ['4 & -> <га>', '4 & -> <ги>', '1 & -> <г>'] | ++--------+---------------------------------------------------------------------+ +| й | ['2 & <й > -> <ст>', '2 & <й> -> <го>', '1 & <й> -> <:>'] | ++--------+---------------------------------------------------------------------+ +| В | ['2 & <ВЗ> -> <Ръ>', '2 & <ВО> -> <Ю>'] | ++--------+---------------------------------------------------------------------+ +| o | ['3 & <(no> -> <по>', '3 & -> <2оу>'] | 
++--------+---------------------------------------------------------------------+ +| u | ['4 & -> <га>', '4 & -> <ги>'] | ++--------+---------------------------------------------------------------------+ +| з | ['2 & <из> -> <по>', '1 & <з> -> <3>'] | ++--------+---------------------------------------------------------------------+ +| ; | ['9 & <;> -> <:>', '2 & <; > -> <.>'] | ++--------+---------------------------------------------------------------------+ +| ч | ['2 & <ч> -> <пр>', '1 & <ч> -> <з>'] | ++--------+---------------------------------------------------------------------+ +| : | ['6 & -> <С.>', '5 & <:> -> <.>'] | ++--------+---------------------------------------------------------------------+ +| a | ['4 & -> <на>', '2 & -> <На>', '2 & -> <а>'] | ++--------+---------------------------------------------------------------------+ +| к | ['3 & <ект> -> <тн>', '1 & <к> -> <ш>'] | ++--------+---------------------------------------------------------------------+ +| Б | ['2 & <БЗ> -> <53>', '1 & <Б> -> <Ъ>'] | ++--------+---------------------------------------------------------------------+ +| ц | ['3 & <цио> -> <с>', '1 & <ц> -> <п>', '1 & <ц> -> <ш>', '1 & <ц> | +| | -> <щ>'] | ++--------+---------------------------------------------------------------------+ +| w | ['3 & -> <ууу>'] | ++--------+---------------------------------------------------------------------+ +| d | ['3 & -> <рар>', '1 & -> <4>'] | ++--------+---------------------------------------------------------------------+ +| e | ['2 & -> <Не>'] | ++--------+---------------------------------------------------------------------+ +| O | ['3 & -> <130>', '2 & -> <ОС>'] | ++--------+---------------------------------------------------------------------+ +| Д | ['2 & <Ди> -> <по>', '1 & <Д> -> <З>'] | ++--------+---------------------------------------------------------------------+ +| З | ['2 & <БЗ> -> <53>', '2 & <ВЗ> -> <Ръ>', '1 & <З> -> <У>'] | 
++--------+---------------------------------------------------------------------+ +| Ц | ['1 & <Ц> -> <Т>'] | ++--------+---------------------------------------------------------------------+ +| D | ['3 & -> <2ЕЮ>', '3 & -> <ЕЕР>', '2 & -> <Пу>'] | ++--------+---------------------------------------------------------------------+ +| f | ['3 & -> <рар>'] | ++--------+---------------------------------------------------------------------+ +| ( | ['3 & <(no> -> <по>', '3 & <(с > -> <С>'] | ++--------+---------------------------------------------------------------------+ +| A | ['2 & -> <$А>'] | ++--------+---------------------------------------------------------------------+ +| H | ['4 & -> <на>', '3 & -> <КНМ>', '2 & -> <На>', '2 & | +| | -> <Не>'] | ++--------+---------------------------------------------------------------------+ +| V | ['3 & -> <УП>'] | ++--------+---------------------------------------------------------------------+ +| b | ['1 & -> <Ь>'] | ++--------+---------------------------------------------------------------------+ +| g | ['3 & -> <2оу>'] | ++--------+---------------------------------------------------------------------+ +| n | ['3 & <(no> -> <по>'] | ++--------+---------------------------------------------------------------------+ +| p | ['3 & -> <рар>'] | ++--------+---------------------------------------------------------------------+ +| Г | ['3 & <3Г.> -> <5>'] | ++--------+---------------------------------------------------------------------+ +| Ю | ['2 & <Ю> -> <1О>'] | ++--------+---------------------------------------------------------------------+ +| * | ['4 & <* > -> <.>'] | ++--------+---------------------------------------------------------------------+ +| F | ['3 & -> <2ЕЮ>', '3 & -> <ЕЕР>'] | ++--------+---------------------------------------------------------------------+ +| S | ['3 & -> <130>', '2 & -> <$А>'] | ++--------+---------------------------------------------------------------------+ +| « | ['1 & <«> -> <<>'] | 
++--------+---------------------------------------------------------------------+ +| » | ['2 & <»> -> <2%>'] | ++--------+---------------------------------------------------------------------+ +| ш | ['1 & <ш> -> <щ>'] | ++--------+---------------------------------------------------------------------+ +| э | ['2 & <эр> -> <ци>', '2 & <эт> -> <уг>'] | ++--------+---------------------------------------------------------------------+ +| ю | ['1 & <ю> -> <о>'] | ++--------+---------------------------------------------------------------------+ +| G | ['2 & -> <С>'] | ++--------+---------------------------------------------------------------------+ +| M | ['3 & -> <КНМ>'] | ++--------+---------------------------------------------------------------------+ +| _ | ['1 & <_> -> < >'] | ++--------+---------------------------------------------------------------------+ +| c | ['2 & -> <|9>'] | ++--------+---------------------------------------------------------------------+ +| v | ['3 & -> <2оу>'] | ++--------+---------------------------------------------------------------------+ +| | | ['1 & <|> -> <1>'] | ++--------+---------------------------------------------------------------------+ +| ф | ['2 & <фо> -> <уп>'] | ++--------+---------------------------------------------------------------------+ +| L | ['2 & -> <ГХ>'] | ++--------+---------------------------------------------------------------------+ +| X | ['2 & -> <ТХ>', '2 & -> <ГХ>'] | ++--------+---------------------------------------------------------------------+ +| y | ['2 & -> <Пу>', '1 & -> <у>'] | ++--------+---------------------------------------------------------------------+ +| Ш | ['2 & <Ш,> -> <П.>'] | ++--------+---------------------------------------------------------------------+ +| щ | ['1 & <щ> -> <ш>'] | ++--------+---------------------------------------------------------------------+ +| № | ['1 & <№> -> <ы>'] | ++--------+---------------------------------------------------------------------+ +| K | ['3 
& -> <КНМ>'] | ++--------+---------------------------------------------------------------------+ +| Y | ['1 & -> <У>'] | ++--------+---------------------------------------------------------------------+ +| ₁ | ['1 & <₁> -> <1>'] | ++--------+---------------------------------------------------------------------+ \ No newline at end of file diff --git a/resources/benchmarks/tesseract_benchmark_sage-correction.txt b/resources/benchmarks/tesseract_benchmark_sage-correction.txt deleted file mode 100644 index f75ea71e..00000000 --- a/resources/benchmarks/tesseract_benchmark_sage-correction.txt +++ /dev/null @@ -1,359 +0,0 @@ -Tesseract version is 5.0.0 -Correction step: _sage-correction - -Table 1 - Accuracy for each file -+---------------+---------------------+-------+-----------------+--------------+ -| Dataset | Image name | --psm | Amount of words | Accuracy OCR | -+===============+=====================+=======+=================+==============+ -| english-words | Kaspersky | 6 | 111 | 99.300 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | USB | 6 | 4 | 80.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | words1 | 6 | 19 | 100 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | words2 | 6 | 9 | 100 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | words3 | 6 | 9 | 100 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 0 | 4 | 315 | 94.500 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 1 | 4 | 308 | 94.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 2 | 4 | 238 | 96.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 3 | 4 | 313 | 96.900 | 
-+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 4 | 4 | 218 | 94.100 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 5 | 4 | 291 | 94 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 6 | 4 | 268 | 95.200 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 7 | 4 | 390 | 95.100 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 8 | 4 | 117 | 94 | -+---------------+---------------------+-------+-----------------+--------------+ -| low_quality | 9 | 4 | 294 | 97.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| others | Zaklyuchenie_nevrol | 4 | 525 | 83 | -| | oga_00 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| others | Zaklyuchenie_nevrol | 4 | 241 | 87 | -| | oga_01 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| others | napalm_doc_2_2_6 | 4 | 124 | 85 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 695 | 99.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 696 | 99.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 699 | 99.400 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | article_multiline | 4 | 471 | 99.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | fstek17_00 | 4 | 192 | 92.300 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | fstek17_01 | 4 | 332 | 99.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| 
tz-npa | law_image | 4 | 182 | 99.500 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | napalm_doc_13_2 | 4 | 243 | 96.700 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukaz_prezidenta_1 | 4 | 264 | 98.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukodeksrf_00 | 4 | 287 | 99.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukodeksrf_01 | 4 | 340 | 99.500 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 146 | 94.400 | -| | 0 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 276 | 98.800 | -| | 1 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 165 | 98.500 | -| | 2 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 90 | 99.400 | -| | 3 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_00 | 4 | 78 | 97.400 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_01 | 4 | 296 | 98 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_02 | 4 | 309 | 98.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_03 | 4 | 337 | 98.300 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_04 | 4 | 257 | 96.300 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_05 | 4 | 238 | 97.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_06 | 4 | 219 | 93.500 
| -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_07 | 4 | 233 | 98.500 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_08 | 4 | 284 | 95.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_09 | 4 | 154 | 97.600 | -+---------------+---------------------+-------+-----------------+--------------+ - -Table 2 - AVG by each type of symbols: -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| Datase | ASCII_ | ASCII_ | ASCII_ | ASCII_ | Latin1 | Cyrill | Amoun | AVG A | -| t | Spacin | Specia | Digits | Upperc | _Speci | ic | t of | ccura | -| | g_Char | l_Symb | | ase_Ch | al_Sym | | words | cy | -| | s | ols | | ars | bols | | | | -+========+========+========+========+========+========+========+=======+=======+ -| englis | 94.820 | 99.333 | 100 | 0 | 0 | 94.540 | 152 | 96.04 | -| h- | | | | | | | | 0 | -| words | | | | | | | | | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| low_qu | 99.190 | 75.340 | 94.544 | 0 | 0 | 97.640 | 2752 | 95.29 | -| ality | | | | | | | | 0 | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| others | 89.767 | 77.100 | 89.533 | 0 | 0 | 86.433 | 890 | 85 | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| tz-npa | 98.956 | 90.920 | 92.104 | 0 | 0 | 99.488 | 7483 | 97.92 | -| | | | | | | | | 0 | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ - -Table 3 -OCR error by symbol: -+--------+---------------------------------------------------------------------+ -| Symbol | Cnt Errors & Correct-Generated | -+========+=====================================================================+ -| | ['3 & <. 
№> -> < No>', '2 & < 2> -> ', '2 & < г> -> <К>', '2 & < | -| | ‚> -> <,>', "2 & <1 > -> <'>", '2 & <и > -> <н>', '2 & <№ > -> | -| | '] | -+--------+---------------------------------------------------------------------+ -| . | ['54 & <.> -> <,>', '3 & <. №> -> < No>', '3 & <3.> -> < De>', '3 & | -| | <В.В> -> ', '2 & <Г.> -> <С>', '2 & <г.> -> <ГТ>', '2 & <п.> -> | -| | <,>'] | -+--------+---------------------------------------------------------------------+ -| , | ['80 & <,> -> <.>', '3 & <ва,> -> <нь>', '1 & <,> -> <»>'] | -+--------+---------------------------------------------------------------------+ -| е | ['6 & <не> -> ', '4 & <е> -> <ё>', '3 & <все> -> <Ко>', '3 & | -| | <ге> -> <Кри>', '3 & <е-> -> <бов>', '3 & <е> -> <а>', '3 & <цев> | -| | -> ', '3 & <че-> -> <и»>', '2 & <е> -> <и>', '2 & <е> -> | -| | <ми>', '2 & <е> -> <с>', '2 & <ее> -> ', '2 & <ле> -> <У>', '1 | -| | & <е> -> <Е>', '1 & <е> -> <о>'] | -+--------+---------------------------------------------------------------------+ -| о | ['6 & <то> -> ', '3 & <По> -> ', '3 & <Про> -> <Ис>', '3 & | -| | <но> -> ', '3 & <она> -> ', '3 & <под> -> ', '3 & | -| | <фок> -> <М>', '2 & <во> -> <за>', '2 & <до> -> ', '2 & <до> -> | -| | ', '2 & <о> -> <ак>', '2 & <о> -> <у>', '2 & <об> -> <бы>', '2 | -| | & <по> -> <10>', '2 & <то> -> ', '1 & <о> -> <в>', '1 & <о> -> | -| | <я>'] | -+--------+---------------------------------------------------------------------+ -| а | ['5 & <а> -> <о>', '4 & <на> -> ', '3 & <Нам> -> ', '3 & | -| | <а> -> <ёту>', '3 & <ва,> -> <нь>', '3 & <на> -> <под>', '3 & <она> | -| | -> ', '3 & <рак> -> <Ли>', '3 & <сан> -> <еви>', '3 & <так> -> | -| | ', '2 & <Ла> -> <А>', '2 & <а> -> <ся>', '2 & <ва> -> <к>', '2 | -| | & <на> -> ', '1 & <а> -> <Б>', '1 & <а> -> <е>', '1 & <а> -> | -| | <у>', '1 & <а> -> <ы>', '1 & <а> -> <ь>'] | -+--------+---------------------------------------------------------------------+ -| н | ['6 & <не> -> ', '4 & <на> -> ', '3 & <на> -> <под>', '3 & | -| | 
<но> -> ', '3 & <она> -> ', '3 & <сан> -> <еви>', '2 & | -| | <йн> -> <ем>', '2 & <н> -> <п>', '2 & <на> -> ', '2 & <нк> -> | -| | <х>', '2 & <ны> -> <им>', '1 & <н> -> <Н>', '1 & <н> -> <и>', '1 & | -| | <н> -> <й>', '1 & <н> -> <л>', '1 & <н> -> <м>', '1 & <н> -> <ф>'] | -+--------+---------------------------------------------------------------------+ -| и | ['4 & <и> -> <е>', '3 & <ив> -> <ьюж>', '3 & <тип> -> ', '3 & | -| | <ции> -> <узы>', '2 & <и > -> <н>', '2 & <и> -> <10>', '2 & <и> -> | -| | <ей>', '2 & <и> -> <мм>', '2 & <ис> -> <не>', '2 & <их> -> ', | -| | '2 & <их> -> ', '2 & <си> -> <ен>', '1 & <и> -> <В>', '1 & <и> | -| | -> <а>', '1 & <и> -> <с>', '1 & <и> -> <ь>'] | -+--------+---------------------------------------------------------------------+ -| - | ['8 & <-> -> <но>', '6 & <-> -> <ния>', '5 & <-> -> <в>', '3 & <-> | -| | -> <жья>', '3 & <-> -> <ков>', '3 & <-> -> <нил>', '3 & <-> -> | -| | <щим>', '3 & <е-> -> <бов>', '3 & <че-> -> <и»>', '2 & <-> -> | -| | <ве>', '2 & <-> -> <да>', '2 & <-> -> <ие>', '2 & <-> -> <ко>', '2 | -| | & <-> -> <ли>', '2 & <-> -> <м">', '2 & <-> -> <м>', '2 & <-> -> | -| | <мо>', '2 & <-> -> <ны>', '2 & <-> -> <ры>', '2 & <-> -> <ых>', '2 | -| | & <-> -> <“>', '2 & <у-> -> <ем>', '2 & <ы-> -> <им>', '2 & <ы-> -> | -| | <ём>', '1 & <-> -> <">', '1 & <-> -> <»>', '1 & <-> -> <д>', '1 & | -| | <-> -> <л>', '1 & <-> -> <н>', '1 & <-> -> <ы>'] | -+--------+---------------------------------------------------------------------+ -| 1 | ["4 & <1> -> <'>", '4 & <1С> -> ', '3 & <1> -> <3>', '3 & <№1> | -| | -> ', '3 & <№1»> -> ', "2 & <1 > -> <'>", '2 & <1C> -> | -| | ', '2 & <1C> -> <С>', '2 & <1> -> <2>', '2 & <1> -> ', '1 & | -| | <1> -> ', '1 & <1> -> <5>', '1 & <1> -> <Г>', '1 & <1> -> <С>', | -| | '1 & <1> -> <Т>'] | -+--------+---------------------------------------------------------------------+ -| № | ['94 & <№> -> ', '6 & <№> -> ', '3 & <. 
№> -> < No>', '3 & | -| | <№1> -> ', '3 & <№1»> -> ', '2 & <№ > -> '] | -+--------+---------------------------------------------------------------------+ -| в | ['4 & <в> -> <6>', '3 & <ва,> -> <нь>', '3 & <все> -> <Ко>', '3 & | -| | <ив> -> <ьюж>', '3 & <ств> -> <У н>', '3 & <цев> -> ', '2 & | -| | <в> -> <«В>', '2 & <в> -> <зм>', '2 & <в> -> <м>', '2 & <в> -> | -| | <по>', '2 & <ва> -> <к>', '2 & <во> -> <за>', '1 & <в> -> ', '1 | -| | & <в> -> <В>', '1 & <в> -> <г>', '1 & <в> -> <н>'] | -+--------+---------------------------------------------------------------------+ -| с | ['3 & <все> -> <Ко>', '3 & <сан> -> <еви>', '3 & <ств> -> <У н>', | -| | '2 & <ис> -> <не>', '2 & <с> -> <Не>', '2 & <с> -> <От>', '2 & <си> | -| | -> <ен>', '1 & <с> -> ', '1 & <с> -> <б>', '1 & <с> -> <н>'] | -+--------+---------------------------------------------------------------------+ -| т | ['6 & <то> -> ', '3 & <ств> -> <У н>', '3 & <так> -> ', '3 | -| | & <тип> -> ', '2 & <т> -> <г>', '2 & <то> -> ', '1 & <т> | -| | -> <Д>', '1 & <т> -> <Т>', '1 & <т> -> <м>'] | -+--------+---------------------------------------------------------------------+ -| л | ['2 & <зл> -> <им>', '2 & <ле> -> <У>', '1 & <л> -> ', '1 & <л> | -| | -> <Л>', '1 & <л> -> <д>', '1 & <л> -> <т>'] | -+--------+---------------------------------------------------------------------+ -| р | ['3 & <Про> -> <Ис>', '3 & <гр> -> <тав>', '3 & <рак> -> <Ли>', '2 | -| | & <гр> -> ', '2 & <р> -> <ал>'] | -+--------+---------------------------------------------------------------------+ -| 2 | ['2 & < 2> -> ', '2 & <28> -> ', '2 & <28> -> <ИР>', '2 & | -| | <28> -> <Я>'] | -+--------+---------------------------------------------------------------------+ -| д | ['3 & <д> -> <Пен>', '3 & <под> -> ', '2 & <до> -> ', '2 & | -| | <до> -> ', '1 & <д> -> <Т>', '1 & <д> -> <Ц>'] | -+--------+---------------------------------------------------------------------+ -| г | ['3 & <ге> -> <Кри>', '3 & <гр> -> <тав>', '2 & < г> -> <К>', '2 & | -| 
| <г.> -> <ГТ>', '2 & <г> -> <т>', '2 & <гр> -> '] | -+--------+---------------------------------------------------------------------+ -| 3 | ['3 & <3.> -> < De>', '1 & <3> -> <">', '1 & <3> -> '] | -+--------+---------------------------------------------------------------------+ -| С | ['6 & <СЗВ> -> ', '4 & <1С> -> ', '3 & <ОС> -> ', '3 | -| | & <С> -> ', '2 & <ОС> -> '] | -+--------+---------------------------------------------------------------------+ -| N | ['22 & -> <М>'] | -+--------+---------------------------------------------------------------------+ -| А | ['2 & <А> -> ', '2 & <А> -> <Ли>'] | -+--------+---------------------------------------------------------------------+ -| И | ['2 & <И> -> <АН>', '1 & <И> -> <В>', '1 & <И> -> <Й>'] | -+--------+---------------------------------------------------------------------+ -| п | ['3 & <под> -> ', '3 & <тип> -> ', '2 & <п.> -> <,>', '2 | -| | & <п> -> <и >', '2 & <п> -> <л>', '2 & <по> -> <10>', '1 & <п> -> | -| | <П>'] | -+--------+---------------------------------------------------------------------+ -| к | ['3 & <рак> -> <Ли>', '3 & <так> -> ', '3 & <фок> -> <М>', '2 | -| | & <нк> -> <х>'] | -+--------+---------------------------------------------------------------------+ -| у | ['3 & <у> -> <ы>', '2 & <у-> -> <ем>'] | -+--------+---------------------------------------------------------------------+ -| Н | ['3 & <Нам> -> ', '2 & <Н> -> <ЕМ>', '1 & <Н> -> <И>'] | -+--------+---------------------------------------------------------------------+ -| Е | ['2 & <ЕМ> -> <Ш>'] | -+--------+---------------------------------------------------------------------+ -| О | ['3 & <ОС> -> ', '2 & <ОС> -> ', '2 & <Об> -> <06>', '1 & | -| | <О> -> ', '1 & <О> -> <Ю>', '1 & <О> -> <о>'] | -+--------+---------------------------------------------------------------------+ -| П | ['3 & <По> -> ', '3 & <Про> -> <Ис>', '2 & <П> -> <И>', '1 & | -| | <П> -> <К>', '1 & <П> -> <п>'] | 
-+--------+---------------------------------------------------------------------+ -| б | ['3 & <"б"> -> <“8”>', '2 & <Об> -> <06>', '2 & <б> -> <«Л>', '2 & | -| | <об> -> <бы>'] | -+--------+---------------------------------------------------------------------+ -| ы | ['2 & <ны> -> <им>', '2 & <ы-> -> <им>', '2 & <ы-> -> <ём>', '1 & | -| | <ы> -> <б>', '1 & <ы> -> <е>'] | -+--------+---------------------------------------------------------------------+ -| ; | ['9 & <;> -> <:>', '1 & <;> -> <,>', '1 & <;> -> <.>'] | -+--------+---------------------------------------------------------------------+ -| Т | ['3 & <МРТ> -> ', '3 & <Т> -> <Г>', '3 & <ТЗР> -> '] | -+--------+---------------------------------------------------------------------+ -| м | ['3 & <Нам> -> '] | -+--------+---------------------------------------------------------------------+ -| В | ['6 & <СЗВ> -> ', '3 & <В.В> -> ', '2 & <ВЗ> -> <РИ>'] | -+--------+---------------------------------------------------------------------+ -| 0 | ['3 & <608> -> '] | -+--------+---------------------------------------------------------------------+ -| I | ['3 & -> <Ш>', '3 & -> <УП>', '1 & -> '] | -+--------+---------------------------------------------------------------------+ -| М | ['3 & <МРТ> -> ', '2 & <ЕМ> -> <Ш>'] | -+--------+---------------------------------------------------------------------+ -| 6 | ['3 & <608> -> '] | -+--------+---------------------------------------------------------------------+ -| Р | ['3 & <МРТ> -> ', '3 & <ТЗР> -> '] | -+--------+---------------------------------------------------------------------+ -| ц | ['3 & <цев> -> ', '3 & <ции> -> <узы>', '2 & <ц> -> <С>', '1 & | -| | <ц> -> <щ>'] | -+--------+---------------------------------------------------------------------+ -| Л | ['2 & <Ла> -> <А>'] | -+--------+---------------------------------------------------------------------+ -| 5 | ['2 & <75> -> <2>'] | 
-+--------+---------------------------------------------------------------------+ -| з | ['2 & <зл> -> <им>'] | -+--------+---------------------------------------------------------------------+ -| 8 | ['3 & <608> -> ', '2 & <28> -> ', '2 & <28> -> <ИР>', '2 & | -| | <28> -> <Я>'] | -+--------+---------------------------------------------------------------------+ -| й | ['2 & <й> -> <е:>', '2 & <йн> -> <ем>'] | -+--------+---------------------------------------------------------------------+ -| " | ['3 & <"б"> -> <“8”>', '2 & <"> -> <“>', '1 & <"> -> <”>'] | -+--------+---------------------------------------------------------------------+ -| 7 | ['2 & <75> -> <2>'] | -+--------+---------------------------------------------------------------------+ -| E | ['3 & -> <ЕВР>'] | -+--------+---------------------------------------------------------------------+ -| З | ['6 & <СЗВ> -> ', '3 & <БЗ> -> <653>', '3 & <ТЗР> -> ', | -| | '2 & <ВЗ> -> <РИ>'] | -+--------+---------------------------------------------------------------------+ -| ч | ['3 & <че-> -> <и»>'] | -+--------+---------------------------------------------------------------------+ -| : | ['2 & <:> -> '] | -+--------+---------------------------------------------------------------------+ -| [ | ['2 & <[> -> <(>'] | -+--------+---------------------------------------------------------------------+ -| ] | ['2 & <]> -> <)>'] | -+--------+---------------------------------------------------------------------+ -| 4 | ['1 & <4> -> <“>'] | -+--------+---------------------------------------------------------------------+ -| C | ['2 & <1C> -> ', '2 & <1C> -> <С>', '2 & -> <С>'] | -+--------+---------------------------------------------------------------------+ -| Б | ['3 & <БЗ> -> <653>'] | -+--------+---------------------------------------------------------------------+ -| Д | ['1 & <Д> -> <З>'] | -+--------+---------------------------------------------------------------------+ -| | | ['1 & <|> -> <1>'] | 
-+--------+---------------------------------------------------------------------+ -| Ц | ['1 & <Ц> -> <Т>'] | -+--------+---------------------------------------------------------------------+ -| ш | ['2 & <ш> -> <«Ч>', '1 & <ш> -> <ч>'] | -+--------+---------------------------------------------------------------------+ -| P | ['3 & -> <ЕВР>'] | -+--------+---------------------------------------------------------------------+ -| R | ['3 & -> <ЕВР>'] | -+--------+---------------------------------------------------------------------+ -| a | ['4 & -> <на>', '1 & -> <а>'] | -+--------+---------------------------------------------------------------------+ -| х | ['2 & <их> -> ', '2 & <их> -> '] | -+--------+---------------------------------------------------------------------+ -| — | ['1 & <—> -> <->'] | -+--------+---------------------------------------------------------------------+ -| G | ['2 & -> <С>'] | -+--------+---------------------------------------------------------------------+ -| H | ['4 & -> <на>', '2 & -> <Из>'] | -+--------+---------------------------------------------------------------------+ -| V | ['3 & -> <УП>'] | -+--------+---------------------------------------------------------------------+ -| m | ['2 & -> '] | -+--------+---------------------------------------------------------------------+ -| ф | ['3 & <фок> -> <М>', '1 & <ф> -> <Ф>'] | -+--------+---------------------------------------------------------------------+ -| ю | ['1 & <ю> -> <у>'] | -+--------+---------------------------------------------------------------------+ -| c | ['2 & -> <со>', '1 & -> <с>'] | -+--------+---------------------------------------------------------------------+ -| o | ['2 & -> <со>', '2 & -> '] | -+--------+---------------------------------------------------------------------+ -| Ю | ['2 & <Ю> -> <1 >'] | -+--------+---------------------------------------------------------------------+ -| ‚ | ['2 & < ‚> -> <,>'] | 
-+--------+---------------------------------------------------------------------+ -| Y | ['1 & -> <У>'] | -+--------+---------------------------------------------------------------------+ -| _ | ['1 & <_> -> <Х>'] | -+--------+---------------------------------------------------------------------+ -| d | ['1 & -> <4>'] | -+--------+---------------------------------------------------------------------+ -| e | ['2 & -> <Из>'] | -+--------+---------------------------------------------------------------------+ -| x | ['1 & -> <х>'] | -+--------+---------------------------------------------------------------------+ -| y | ['1 & -> <у>'] | -+--------+---------------------------------------------------------------------+ -| » | ['3 & <№1»> -> '] | -+--------+---------------------------------------------------------------------+ -| Г | ['2 & <Г.> -> <С>'] | -+--------+---------------------------------------------------------------------+ \ No newline at end of file diff --git a/resources/benchmarks/tesseract_benchmark_textblob-correction.txt b/resources/benchmarks/tesseract_benchmark_textblob-correction.txt deleted file mode 100644 index 2de957a5..00000000 --- a/resources/benchmarks/tesseract_benchmark_textblob-correction.txt +++ /dev/null @@ -1,318 +0,0 @@ -Tesseract version is 4.1.1 -Correction step: _textblob-correction - -Table 1 - Accuracy for each file -+---------------+---------------------+-------+-----------------+--------------+ -| Dataset | Image name | --psm | Amount of words | Accuracy OCR | -+===============+=====================+=======+=================+==============+ -| english-words | Kaspersky | 6 | 111 | 73.400 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | USB | 6 | 4 | 47.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | words1 | 6 | 19 | 66.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| 
english-words | words2 | 6 | 9 | 72.700 | -+---------------+---------------------+-------+-----------------+--------------+ -| english-words | words3 | 6 | 9 | 61.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| others | Zaklyuchenie_nevrol | 4 | 525 | 80.200 | -| | oga_00 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| others | Zaklyuchenie_nevrol | 4 | 241 | 87 | -| | oga_01 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| others | napalm_doc_2_2_6 | 4 | 124 | 84.400 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 695 | 98.100 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 696 | 98.100 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | 1.620e+14 | 4 | 699 | 97.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | article_multiline | 4 | 471 | 98.400 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | fstek17_00 | 4 | 192 | 91.700 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | fstek17_01 | 4 | 332 | 97.500 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | law_image | 4 | 182 | 99 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | napalm_doc_13_2 | 4 | 243 | 95.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukaz_prezidenta_1 | 4 | 264 | 97.100 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukodeksrf_00 | 4 | 287 | 98.200 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ukodeksrf_01 | 4 | 340 | 
97.200 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 146 | 94.900 | -| | 0 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 276 | 98.700 | -| | 1 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 165 | 98.700 | -| | 2 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | with_applications_0 | 4 | 90 | 99.100 | -| | 3 | | | | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_00 | 4 | 78 | 91.900 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_01 | 4 | 296 | 94.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_02 | 4 | 309 | 96.700 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_03 | 4 | 337 | 95.500 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_04 | 4 | 257 | 94.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_05 | 4 | 238 | 96.600 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_06 | 4 | 219 | 95.800 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_07 | 4 | 233 | 96.400 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_08 | 4 | 284 | 94.300 | -+---------------+---------------------+-------+-----------------+--------------+ -| tz-npa | ТЗ_09 | 4 | 154 | 93.700 | -+---------------+---------------------+-------+-----------------+--------------+ - -Table 2 - AVG by each type of symbols: 
-+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| Datase | ASCII_ | ASCII_ | ASCII_ | ASCII_ | Latin1 | Cyrill | Amoun | AVG A | -| t | Spacin | Specia | Digits | Upperc | _Speci | ic | t of | ccura | -| | g_Char | l_Symb | | ase_Ch | al_Sym | | words | cy | -| | s | ols | | ars | bols | | | | -+========+========+========+========+========+========+========+=======+=======+ -| englis | 100 | 99.333 | 100 | 0 | 0 | 60.680 | 152 | 64.48 | -| h- | | | | | | | | 0 | -| words | | | | | | | | | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| others | 90.767 | 80.167 | 90.700 | 0 | 0 | 83.400 | 890 | 83.86 | -| | | | | | | | | 7 | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ -| tz-npa | 99.328 | 91.692 | 85.916 | 0 | 0 | 97.300 | 7483 | 96.42 | -| | | | | | | | | 4 | -+--------+--------+--------+--------+--------+--------+--------+-------+-------+ - -Table 3 -OCR error by symbol: -+--------+---------------------------------------------------------------------+ -| Symbol | Cnt Errors & Correct-Generated | -+========+=====================================================================+ -| о | ['198 & <по> -> ', '118 & <от> -> ', '46 & <об> -> ', | -| | '12 & <во> -> ', '12 & <то> -> ', '10 & <до> -> ', '8 & | -| | <со> -> ', '4 & <По> -> ', '4 & <Со> -> ', '4 & <но> -> | -| | ', '4 & <он> -> ', '3 & <и о> -> ', '2 & <го> -> ', | -| | '2 & <по> -> '] | -+--------+---------------------------------------------------------------------+ -| н | ['246 & <на> -> ', '92 & <не> -> ', '4 & <но> -> ', '4 | -| | & <он> -> ', '2 & <нa> -> ', '2 & <н> -> <и>', '2 & <на> -> | -| | '] | -+--------+---------------------------------------------------------------------+ -| а | ['246 & <на> -> ', '56 & <за> -> ', '6 & <На> -> ', '4 | -| | & <За> -> ', '2 & <на> -> ', '2 & <ра> -> '] | -+--------+---------------------------------------------------------------------+ -| е | ['92 & <не> -> 
', '12 & <ед> -> ', '6 & <ее> -> ', '4 & | -| | <Не> -> ', '3 & <е> -> <с>', '3 & <пер> -> ', '2 & <же> -> | -| | ', '2 & <те> -> ', '1 & <е> -> <а>'] | -+--------+---------------------------------------------------------------------+ -| т | ['118 & <от> -> ', '36 & <ст> -> ', '12 & <то> -> ', '6 | -| | & <т> -> <г>', '6 & <ти> -> < of>', '3 & <От > -> ', '3 & <тип> | -| | -> ', '2 & <рт> -> ', '2 & <те> -> '] | -+--------+---------------------------------------------------------------------+ -| п | ['198 & <по> -> ', '3 & <Тип> -> ', '3 & <пер> -> ', | -| | '3 & <тип> -> ', '2 & <п> -> <и>', '2 & <по> -> ', '1 & | -| | <п> -> <н>'] | -+--------+---------------------------------------------------------------------+ -| | ['3 & <От > -> ', '3 & <и о> -> ', '3 & <с 6> -> ', '2 | -| | & <. > -> < ‘>', "2 & <1 > -> <'>", '2 & -> <№>'] | -+--------+---------------------------------------------------------------------+ -| 1 | ['104 & <1С> -> ', '18 & <1C> -> ', '8 & <1С> -> ', '4 | -| | & <11> -> <И>', '4 & <1C> -> ', "3 & <1> -> <'>", "2 & <1 > -> | -| | <'>", '2 & <1C> -> ', '2 & <1C> -> <С>', '2 & <1> -> <|>', '2 & | -| | <31> -> ', '1 & <1> -> <\\>'] | -+--------+---------------------------------------------------------------------+ -| и | ['34 & <из> -> ', '32 & <их> -> ', '12 & <им> -> ', '6 | -| | & <ти> -> < of>', '3 & <Тип> -> ', '3 & <и о> -> ', '3 & | -| | <тип> -> ', '2 & <ис> -> <не>'] | -+--------+---------------------------------------------------------------------+ -| С | ['104 & <1С> -> ', '8 & <1С> -> ', '8 & <ОС> -> ', '4 & | -| | <Со> -> ', '3 & <НДС> -> ', '2 & <ДС> -> ', '2 & <ЮС> | -| | -> <1О>', '1 & <С> -> <—>'] | -+--------+---------------------------------------------------------------------+ -| , | ['64 & <,> -> <.>', '6 & <ПО,> -> ', '1 & <,> -> <;>'] | -+--------+---------------------------------------------------------------------+ -| . | ['3 & <.> -> <,>', '3 & <3.> -> < He>', '2 & <. 
> -> < ‘>', '2 & | -| | <г.> -> <Г>'] | -+--------+---------------------------------------------------------------------+ -| с | ['36 & <ст> -> ', '8 & <со> -> ', '4 & <см> -> ', '3 & | -| | <с 6> -> ', '3 & <ься> -> < by>', '2 & <ис> -> <не>', '1 & <с> | -| | -> ', '1 & <с> -> <©>', '1 & <с> -> <е>'] | -+--------+---------------------------------------------------------------------+ -| з | ['56 & <за> -> ', '34 & <из> -> '] | -+--------+---------------------------------------------------------------------+ -| О | ['20 & <ПО> -> ', '14 & <Об> -> ', '8 & <ДО> -> ', '8 & | -| | <ОС> -> ', '6 & <ПО,> -> ', '4 & <АО> -> ', '4 & <ЛО> | -| | -> ', '4 & <МО> -> ', '3 & <От > -> '] | -+--------+---------------------------------------------------------------------+ -| л | ['6 & <для> -> ', '6 & <мл> -> ', '3 & <для> -> <11>', '3 & | -| | <для> -> ', '3 & <л> -> <п>', '2 & <Эл> -> ', '2 & <ул> -> | -| | '] | -+--------+---------------------------------------------------------------------+ -| б | ['46 & <об> -> ', '14 & <Об> -> '] | -+--------+---------------------------------------------------------------------+ -| д | ['12 & <ед> -> ', '10 & <до> -> ', '6 & <для> -> ', '3 & | -| | <д> -> <л>', '3 & <для> -> <11>', '3 & <для> -> ', '1 & <д> -> | -| | <2>'] | -+--------+---------------------------------------------------------------------+ -| З | ['56 & <ФЗ> -> ', '4 & <За> -> ', '3 & <БЗ> -> <653>', '3 & | -| | <ТЗР> -> ', '2 & <ВЗ> -> <Ръ>'] | -+--------+---------------------------------------------------------------------+ -| в | ['12 & <во> -> ', '1 & <в> -> ', '1 & <в> -> <В>', '1 & <в> | -| | -> <п>'] | -+--------+---------------------------------------------------------------------+ -| Ф | ['56 & <ФЗ> -> ', '12 & <РФ> -> ', '2 & <ФД> -> '] | -+--------+---------------------------------------------------------------------+ -| м | ['12 & <им> -> ', '12 & <мг> -> ', '6 & <мл> -> ', '4 & | -| | <см> -> ', '2 & <мм> -> ', '2 & <мы> -> '] | 
-+--------+---------------------------------------------------------------------+ -| г | ['12 & <мг> -> ', '2 & <г.> -> <Г>', '2 & <г> -> <т >', '2 & | -| | <г> -> <т>', '2 & <го> -> ', '2 & <гр> -> ', '2 & <гр> -> | -| | <тв>', '1 & <г> -> '] | -+--------+---------------------------------------------------------------------+ -| р | ['3 & <пер> -> ', '2 & <гр> -> ', '2 & <гр> -> <тв>', '2 & | -| | <ра> -> ', '2 & <рт> -> ', '2 & <ры> -> '] | -+--------+---------------------------------------------------------------------+ -| П | ['20 & <ПО> -> ', '6 & <ПО,> -> ', '4 & <По> -> ', '1 | -| | & <П> -> <И>'] | -+--------+---------------------------------------------------------------------+ -| Н | ['6 & <Н> -> <* П>', '6 & <На> -> ', '4 & <Не> -> ', '3 & | -| | <Н> -> <И>', '3 & <НДС> -> ', '2 & <ЕН> -> <ек>', '2 & <НБ> -> | -| | ', '1 & <Н> -> <П>'] | -+--------+---------------------------------------------------------------------+ -| 2 | ['4 & <28> -> ', '2 & <28> -> <Я >'] | -+--------+---------------------------------------------------------------------+ -| N | ['22 & -> <М>', '2 & -> <№>'] | -+--------+---------------------------------------------------------------------+ -| E | ['45 & -> <ЕВР>', '3 & -> <ЕКР>'] | -+--------+---------------------------------------------------------------------+ -| А | ['4 & <АО> -> ', '2 & <АД> -> '] | -+--------+---------------------------------------------------------------------+ -| a | ['6 & -> ', '2 & -> ', '2 & <нa> -> ', '1 & | -| | -> <а>'] | -+--------+---------------------------------------------------------------------+ -| И | ['4 & <ИБ> -> ', '2 & <ИТ> -> ', '1 & <И> -> <Н>'] | -+--------+---------------------------------------------------------------------+ -| я | ['6 & <для> -> ', '3 & <для> -> <11>', '3 & <для> -> ', '3 & | -| | <ься> -> < by>'] | -+--------+---------------------------------------------------------------------+ -| 3 | ['3 & <3.> -> < He>', '2 & <31> -> ', '1 & <3> -> '] | 
-+--------+---------------------------------------------------------------------+ -| P | ['45 & -> <ЕВР>', '3 & -> <ЕКР>'] | -+--------+---------------------------------------------------------------------+ -| R | ['45 & -> <ЕВР>', '3 & -> <ЕКР>', '3 & -> <ОСК>'] | -+--------+---------------------------------------------------------------------+ -| Д | ['12 & <БД> -> ', '8 & <ДО> -> ', '4 & <ЕД> -> ', '3 & | -| | <НДС> -> ', '2 & <АД> -> ', '2 & <ДС> -> ', '2 & <ФД> | -| | -> ', '1 & <Д> -> <З>'] | -+--------+---------------------------------------------------------------------+ -| e | ['2 & -> '] | -+--------+---------------------------------------------------------------------+ -| Е | ['4 & <ЕД> -> ', '2 & <ЕН> -> <ек>'] | -+--------+---------------------------------------------------------------------+ -| C | ['18 & <1C> -> ', '4 & <1C> -> ', '3 & -> <ОСК>', '2 | -| | & <1C> -> ', '2 & <1C> -> <С>', '2 & -> ', '2 & -> | -| | '] | -+--------+---------------------------------------------------------------------+ -| Р | ['12 & <РФ> -> ', '3 & <ТЗР> -> '] | -+--------+---------------------------------------------------------------------+ -| х | ['32 & <их> -> '] | -+--------+---------------------------------------------------------------------+ -| I | ['3 & -> <Ш>', '3 & -> <130>', '3 & -> ', '2 | -| | & -> ', '2 & -> <1>', '1 & -> <|>'] | -+--------+---------------------------------------------------------------------+ -| Б | ['12 & <БД> -> ', '4 & <ИБ> -> ', '3 & <БЗ> -> <653>', '2 & | -| | <НБ> -> '] | -+--------+---------------------------------------------------------------------+ -| Т | ['3 & <ТЗР> -> ', '3 & <Тип> -> ', '2 & <ИТ> -> ', '1 | -| | & <Т> -> <Г>'] | -+--------+---------------------------------------------------------------------+ -| 0 | ['3 & <608> -> '] | -+--------+---------------------------------------------------------------------+ -| М | ['4 & <МО> -> '] | -+--------+---------------------------------------------------------------------+ -| 
у | ['2 & <ул> -> '] | -+--------+---------------------------------------------------------------------+ -| 6 | ['3 & <608> -> ', '3 & <с 6> -> '] | -+--------+---------------------------------------------------------------------+ -| Л | ['4 & <ЛО> -> '] | -+--------+---------------------------------------------------------------------+ -| ь | ['3 & <ься> -> < by>'] | -+--------+---------------------------------------------------------------------+ -| - | ['1 & <-> -> <—>'] | -+--------+---------------------------------------------------------------------+ -| u | ['3 & -> '] | -+--------+---------------------------------------------------------------------+ -| ; | ['9 & <;> -> <:>'] | -+--------+---------------------------------------------------------------------+ -| В | ['2 & <ВЗ> -> <Ръ>'] | -+--------+---------------------------------------------------------------------+ -| ы | ['2 & <мы> -> ', '2 & <ры> -> '] | -+--------+---------------------------------------------------------------------+ -| c | ['1 & -> <с>'] | -+--------+---------------------------------------------------------------------+ -| p | ['2 & -> '] | -+--------+---------------------------------------------------------------------+ -| ц | ['1 & <ц> -> <щ>'] | -+--------+---------------------------------------------------------------------+ -| 5 | ['2 & <75> -> <#2>'] | -+--------+---------------------------------------------------------------------+ -| 8 | ['4 & <28> -> ', '3 & <608> -> ', '2 & <28> -> <Я >'] | -+--------+---------------------------------------------------------------------+ -| O | ['3 & -> <130>', '3 & -> <ОСК>', '2 & -> '] | -+--------+---------------------------------------------------------------------+ -| S | ['3 & -> <130>'] | -+--------+---------------------------------------------------------------------+ -| ч | ['1 & <ч> -> <з>'] | -+--------+---------------------------------------------------------------------+ -| K | ['3 & -> <КНМ>'] | 
-+--------+---------------------------------------------------------------------+ -| d | ['2 & -> ', '1 & -> <4>'] | -+--------+---------------------------------------------------------------------+ -| й | ['1 & <й> -> <:>'] | -+--------+---------------------------------------------------------------------+ -| 7 | ['2 & <75> -> <#2>'] | -+--------+---------------------------------------------------------------------+ -| H | ['6 & -> ', '3 & -> <КНМ>', '2 & -> '] | -+--------+---------------------------------------------------------------------+ -| V | ['3 & -> '] | -+--------+---------------------------------------------------------------------+ -| Ц | ['1 & <Ц> -> <Т>'] | -+--------+---------------------------------------------------------------------+ -| M | ['3 & -> <КНМ>'] | -+--------+---------------------------------------------------------------------+ -| № | ['6 & <№> -> '] | -+--------+---------------------------------------------------------------------+ -| G | ['2 & -> <С>'] | -+--------+---------------------------------------------------------------------+ -| | | ['1 & <|> -> <1>'] | -+--------+---------------------------------------------------------------------+ -| « | ['3 & <«_»> -> '] | -+--------+---------------------------------------------------------------------+ -| » | ['3 & <«_»> -> '] | -+--------+---------------------------------------------------------------------+ -| Э | ['2 & <Эл> -> '] | -+--------+---------------------------------------------------------------------+ -| Ю | ['2 & <ЮС> -> <1О>'] | -+--------+---------------------------------------------------------------------+ -| ж | ['2 & <же> -> '] | -+--------+---------------------------------------------------------------------+ -| X | ['2 & -> '] | -+--------+---------------------------------------------------------------------+ -| Y | ['1 & -> <У>'] | -+--------+---------------------------------------------------------------------+ -| _ | ['3 & <«_»> -> '] | 
-+--------+---------------------------------------------------------------------+ -| — | ['1 & <—> -> <->'] | -+--------+---------------------------------------------------------------------+ \ No newline at end of file diff --git a/scripts/benchmark_tl_correctness.py b/scripts/benchmark_tl_correctness.py index d959a1f4..2538cdef 100644 --- a/scripts/benchmark_tl_correctness.py +++ b/scripts/benchmark_tl_correctness.py @@ -14,6 +14,15 @@ os.makedirs(path_result, exist_ok=True) path_result = os.path.join(path_result, "benchmarks_tl_correctness.json") +""" +Experiments are available -> https://github.com/alexander1999-hub/txt_layer_correctness/tree/main : + * generating synthetic incorrect text + * comparing different classification models + * comparing different input textual features: TF-IDF and custom features + * comparing on real data of correct/incorrect texts with GT using the Levenshtein distance (available on Confluence -> dataset page) +Here (in this script) we calculate the accuracy of the selected model (XGBoost on custom features) on real data without GT (the data are PDFs with a textual layer). +""" + host = "http://localhost:1231" param_dist_errors = namedtuple("Param", ("total_file_size", "total_incorrect_files", "failed")) diff --git a/scripts/tesseract_benchmark/ocr_correction.py b/scripts/tesseract_benchmark/ocr_correction.py deleted file mode 100644 index 89fb87a1..00000000 --- a/scripts/tesseract_benchmark/ocr_correction.py +++ /dev/null @@ -1,41 +0,0 @@ -import os -from typing import Tuple - -import torch -from sage.spelling_correction import AvailableCorrectors -from sage.spelling_correction import RuM2M100ModelForSpellingCorrection -from sage.spelling_correction.corrector import Corrector - -""" -Install sage library (for ocr correction step): -git clone https://github.com/ai-forever/sage.git -cd sage -pip install . -pip install -r requirements.txt - -Note: sage use 5.2 Gb GPU ......
-""" -USE_GPU = True - - -def correction(model: Corrector, ocr_text: str) -> str: - - corrected_lines = [] - for line in ocr_text.split("\n"): - corrected_lines.append(model.correct(line)[0]) - corrected_text = "\n".join(corrected_lines) - - return corrected_text - - -def init_correction_step(cache_dir: str) -> Tuple[Corrector, str]: - - corrected_path = os.path.join(cache_dir, "result_corrected") - os.makedirs(corrected_path, exist_ok=True) - corrector = RuM2M100ModelForSpellingCorrection.from_pretrained(AvailableCorrectors.m2m100_1B.value) # 4.49 Gb model (pytorch_model.bin) - if torch.cuda.is_available() and USE_GPU: - corrector.model.to(torch.device("cuda:0")) - print("use CUDA") - else: - print("use CPU") - return corrector, corrected_path diff --git a/scripts/tesseract_benchmark/requirements.txt b/scripts/tesseract_benchmark/requirements.txt deleted file mode 100644 index 5ef9a438..00000000 --- a/scripts/tesseract_benchmark/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -textblob==0.17.1 \ No newline at end of file diff --git a/scripts/tesseract_benchmark/text_blob_correction.py b/scripts/tesseract_benchmark/text_blob_correction.py deleted file mode 100644 index 73e8d70e..00000000 --- a/scripts/tesseract_benchmark/text_blob_correction.py +++ /dev/null @@ -1,9 +0,0 @@ -from textblob import TextBlob - - -class TextBlobCorrector: - def __init__(self) -> None: - return - - def correct(self, text: str) -> str: - return str(TextBlob(text).correct()) diff --git a/scripts/text_extraction_benchmark/analyze_ocr_errors.py b/scripts/text_extraction_benchmark/analyze_ocr_errors.py new file mode 100644 index 00000000..4ffd4697 --- /dev/null +++ b/scripts/text_extraction_benchmark/analyze_ocr_errors.py @@ -0,0 +1,74 @@ +import os +import re +from typing import List, Tuple + +from texttable import Texttable + + +def __parse_ocr_errors(lines: List[str]) -> List: + ocr_errors = [] + matched_errors = [(line_num, line) for line_num, line in enumerate(lines) if "Errors Marked 
Correct-Generated" in line][0] + for line in lines[matched_errors[0] + 1:]: + # example line: " 2 0 { 6}-{б}" + errors = re.findall(r"(\d+)", line)[0] + chars = re.findall(r"{(.*)}-{(.*)}", line)[0] + ocr_errors.append([errors, chars[0], chars[1]]) + + return ocr_errors + + +def __parse_symbol_info(lines: List[str]) -> Tuple[List, int]: + symbols_info = [] + matched_symbols = [(line_num, line) for line_num, line in enumerate(lines) if "Count Missed %Right" in line][-1] + start_block_line = matched_symbols[0] + + for line in lines[start_block_line + 1:]: + # example line: "1187 11 99.07 {<\n>}" + row_values = [value.strip() for value in re.findall(r"\d+.\d*|{\S+|\W+}", line)] + row_values[-1] = row_values[-1][1:-1] # get symbol value + symbols_info.append(row_values) + # Sort errors + symbols_info = sorted(symbols_info, key=lambda row: int(row[1]), reverse=True) # by missed + + return symbols_info, start_block_line + + +def get_summary_symbol_error(path_reports: str) -> Texttable: + # 1 - call accsum for get summary of all reports + accuracy_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "accsum")) + + if os.path.exists(f"{path_reports}/../accsum_report.txt"): + os.remove(f"{path_reports}/../accsum_report.txt") + + file_reports = " ".join([os.path.join(path_reports, f) for f in os.listdir(path_reports) if os.path.isfile(os.path.join(path_reports, f))]) + + command = f"{accuracy_script_path} {file_reports} >> {path_reports}/../accsum_report.txt" + os.system(command) + accsum_report_path = os.path.join(path_reports, "..", "accsum_report.txt") + + # 2 - parse report info + with open(accsum_report_path, "r") as f: + lines = f.readlines() + + symbols_info, start_symbol_block_line = __parse_symbol_info(lines) + ocr_errors = __parse_ocr_errors(lines[:start_symbol_block_line - 1]) + + # 3 - calculate ocr errors for a symbol + ocr_errors_by_symbol = {} + for symbol_info in symbols_info: + ocr_errors_by_symbol[symbol_info[-1]] = [] + for ocr_err 
in ocr_errors: + if ocr_err[-1] == "" or len(ocr_err[-2]) > 3 or len(ocr_err[-1]) > 3: # to ignore errors with long text (len > 3) or without text + continue + if symbol_info[-1] in ocr_err[-2]: + ocr_errors_by_symbol[symbol_info[-1]].append(f"{ocr_err[0]} & <{ocr_err[1]}> -> <{ocr_err[2]}>") + + # 4 - create table with OCR errors + ocr_err_by_symbol_table = Texttable() + title = [["Symbol", "Cnt Errors & Correct-Generated"]] + ocr_err_by_symbol_table.add_rows(title) + for symbol, value in ocr_errors_by_symbol.items(): + if len(value) != 0: + ocr_err_by_symbol_table.add_row([symbol, value]) + + return ocr_err_by_symbol_table diff --git a/scripts/text_extraction_benchmark/text_correction/sage_corrector.py b/scripts/text_extraction_benchmark/text_correction/sage_corrector.py new file mode 100644 index 00000000..58d28d2e --- /dev/null +++ b/scripts/text_extraction_benchmark/text_correction/sage_corrector.py @@ -0,0 +1,41 @@ +import os + +import torch +from sage.spelling_correction import AvailableCorrectors +from sage.spelling_correction import RuM2M100ModelForSpellingCorrection + + +""" +Install sage library (for ocr correction step): +git clone https://github.com/ai-forever/sage.git +cd sage +pip install . +pip install -r requirements.txt + +Note: sage use 5.2 Gb GPU ...... 
+""" + + +class SageCorrector: + + def __init__(self, cache_dir: str, use_gpu: bool = True) -> None: + self.corrected_path = os.path.join(cache_dir, "result_corrected") + os.makedirs(self.corrected_path, exist_ok=True) + + self.corrector = RuM2M100ModelForSpellingCorrection.from_pretrained(AvailableCorrectors.m2m100_1B.value) # 4.49 Gb model (pytorch_model.bin) + self._init_device(use_gpu) + + def _init_device(self, use_gpu: bool) -> None: + if torch.cuda.is_available() and use_gpu: + self.corrector.model.to(torch.device("cuda:0")) + print("use CUDA") + else: + print("use CPU") + + def correction(self, text: str) -> str: + corrected_lines = [] + for line in text.split("\n"): + corrected_lines.append(self.corrector.correct(line)[0]) + corrected_text = "\n".join(corrected_lines) + + return corrected_text diff --git a/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py b/scripts/text_extraction_benchmark/text_extraction_benchmarks.py similarity index 60% rename from scripts/tesseract_benchmark/calc_tesseract_benchmarks.py rename to scripts/text_extraction_benchmark/text_extraction_benchmarks.py index 07895d0d..ea1a000c 100644 --- a/scripts/tesseract_benchmark/calc_tesseract_benchmarks.py +++ b/scripts/text_extraction_benchmark/text_extraction_benchmarks.py @@ -2,29 +2,28 @@ import re import time import zipfile +from enum import Enum from typing import Dict, List, Tuple -import cv2 import numpy as np import pytesseract import wget from texttable import Texttable from dedoc.config import get_config -from scripts.tesseract_benchmark.ocr_correction import correction, init_correction_step -from scripts.tesseract_benchmark.text_blob_correction import TextBlobCorrector +from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader +from scripts.text_extraction_benchmark.analyze_ocr_errors import get_summary_symbol_error -WITHOUT_CORRECTION = "" -SAGE_CORRECTION = "_sage-correction" -TEXT_BLOB_CORRECTION = "_textblob-correction" +correction = 
Enum("Correction", ["SAGE_CORRECTION", "WITHOUT_CORRECTION"]) -USE_CORRECTION_OCR = TEXT_BLOB_CORRECTION +USE_CORRECTION_OCR = correction.WITHOUT_CORRECTION +reader = PdfImageReader() -def _call_tesseract(image: np.ndarray, language: str, psm: int = 3) -> str: - config = f"--psm {psm}" - text = pytesseract.image_to_string(image, lang=language, output_type=pytesseract.Output.DICT, config=config)["text"] - return text + +def _get_text_from_image(path: str, language: str) -> str: + document = reader.read(file_path=path, parameters={"language": language}) + return document.get_text() def _init_statistics_by_dataset(statistics: Dict, dataset_name: str) -> Dict: @@ -60,7 +59,6 @@ def _update_statistics_by_dataset(statistics: Dict, dataset: str, accuracy_path: acc_percent = re.findall(r"\d+\.\d+", matched[0])[0][:-1] statistic["Accuracy"].append(float(acc_percent)) statistic["Amount of words"].append(word_cnt) - statistic["ASCII_Spacing_Characters"] = _update_statistics_by_symbol_kind(statistic["ASCII_Spacing_Characters"], "ASCII Spacing Characters", lines) statistic["ASCII_Special_Symbols"] = _update_statistics_by_symbol_kind(statistic["ASCII_Special_Symbols"], "ASCII Special Symbols", lines) statistic["ASCII_Digits"] = _update_statistics_by_symbol_kind(statistic["ASCII_Digits"], "ASCII Digits", lines) @@ -89,77 +87,8 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List: ] -def __parse_symbol_info(lines: List[str]) -> Tuple[List, int]: - symbols_info = [] - matched_symbols = [(line_num, line) for line_num, line in enumerate(lines) if "Count Missed %Right" in line][-1] - start_block_line = matched_symbols[0] - - for line in lines[start_block_line + 1:]: - # example line: "1187 11 99.07 {<\n>}" - row_values = [value.strip() for value in re.findall(r"\d+.\d*|{\S+|\W+}", line)] - row_values[-1] = row_values[-1][1:-1] # get symbol value - symbols_info.append(row_values) - # Sort errors - symbols_info = sorted(symbols_info, key=lambda row: int(row[1]), 
reverse=True) # by missed - - return symbols_info, start_block_line - - -def __parse_ocr_errors(lines: List[str]) -> List: - ocr_errors = [] - matched_errors = [(line_num, line) for line_num, line in enumerate(lines) if "Errors Marked Correct-Generated" in line][0] - for line in lines[matched_errors[0] + 1:]: - # example line: " 2 0 { 6}-{б}" - errors = re.findall(r"(\d+)", line)[0] - chars = re.findall(r"{(.*)}-{(.*)}", line)[0] - ocr_errors.append([errors, chars[0], chars[1]]) - - return ocr_errors - - -def __get_summary_symbol_error(path_reports: str) -> Texttable: - # 1 - call accsum for get summary of all reports - accuracy_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "accsum")) - - if os.path.exists(f"{path_reports}/../accsum_report.txt"): - os.remove(f"{path_reports}/../accsum_report.txt") - - file_reports = " ".join([os.path.join(path_reports, f) for f in os.listdir(path_reports) if os.path.isfile(os.path.join(path_reports, f))]) - - command = f"{accuracy_script_path} {file_reports} >> {path_reports}/../accsum_report.txt" - os.system(command) - accsum_report_path = os.path.join(path_reports, "..", "accsum_report.txt") - - # 2 - parse report info - with open(accsum_report_path, "r") as f: - lines = f.readlines() - - symbols_info, start_symbol_block_line = __parse_symbol_info(lines) - ocr_errors = __parse_ocr_errors(lines[:start_symbol_block_line - 1]) - - # 3 - calculate ocr errors according to a symbol - ocr_errors_by_symbol = {} - for symbol_info in symbols_info: - ocr_errors_by_symbol[symbol_info[-1]] = [] - for ocr_err in ocr_errors: - if ocr_err[-1] == "" or len(ocr_err[-2]) > 3 or len(ocr_err[-1]) > 3: # to ignore errors with long text (len > 3) or without text - continue - if symbol_info[-1] in ocr_err[-2]: - ocr_errors_by_symbol[symbol_info[-1]].append(f"{ocr_err[0]} & <{ocr_err[1]}> -> <{ocr_err[2]}>") - - # 4 - create table with OCR errors - ocr_err_by_symbol_table = Texttable() - title = [["Symbol", "Cnt Errors & 
Correct-Generated"]] - ocr_err_by_symbol_table.add_rows(title) - for symbol, value in ocr_errors_by_symbol.items(): - if len(value) != 0: - ocr_err_by_symbol_table.add_row([symbol, value]) - - return ocr_err_by_symbol_table - - def __create_statistic_tables(statistics: dict, accuracy_values: List) -> Tuple[Texttable, Texttable]: - accs = [["Dataset", "Image name", "--psm", "Amount of words", "Accuracy OCR"]] + accs = [["Dataset", "Image name", "OCR language", "Amount of words", "Accuracy OCR"]] accs_common = [ [ "Dataset", "ASCII_Spacing_Chars", "ASCII_Special_Symbols", "ASCII_Digits", "ASCII_Uppercase_Chars", "Latin1_Special_Symbols", "Cyrillic", @@ -198,13 +127,9 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c result_dir = os.path.join(cache_dir, "result_ocr") os.makedirs(result_dir, exist_ok=True) - corrector, corrected_path = None, None - if USE_CORRECTION_OCR == SAGE_CORRECTION: - corrector, corrected_path = init_correction_step(cache_dir) - elif USE_CORRECTION_OCR == TEXT_BLOB_CORRECTION: - corrector = TextBlobCorrector() - corrected_path = os.path.join(cache_dir, "result_corrected") - os.makedirs(corrected_path, exist_ok=True) + if USE_CORRECTION_OCR == correction.SAGE_CORRECTION: + from scripts.text_extraction_benchmark.text_correction.sage_corrector import SageCorrector + corrector = SageCorrector(cache_dir=cache_dir, use_gpu=True) with zipfile.ZipFile(benchmark_data_path, "r") as arch_file: names_dirs = [member.filename for member in arch_file.infolist() if member.file_size > 0] @@ -228,10 +153,10 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c os.remove(accuracy_path) tmp_gt_path = os.path.join(result_dir, f"{img_name}_gt.txt") - tmp_ocr_path = os.path.join(result_dir, f"{img_name}_ocr.txt") + result_ocr_filepath = os.path.join(result_dir, f"{img_name}_ocr.txt") try: - with arch_file.open(gt_path) as gt_file, open(tmp_gt_path, "wb") as tmp_gt_file, open(tmp_ocr_path, "w") as 
tmp_ocr_file: + with arch_file.open(gt_path) as gt_file, open(tmp_gt_path, "wb") as tmp_gt_file, open(result_ocr_filepath, "w") as result_ocr_file: gt_text = gt_file.read().decode("utf-8") word_cnt = len(gt_text.split()) @@ -240,28 +165,27 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c tmp_gt_file.close() arch_file.extract(imgs_path, result_dir) - image = cv2.imread(result_dir + "/" + imgs_path) - # call ocr - psm = 6 if dataset_name == "english-words" else 4 - text = _call_tesseract(image, "rus+eng", psm=psm) - tmp_ocr_file.write(text) - tmp_ocr_file.close() + # 1 - call reader + language = "rus+eng" if dataset_name == "english-words" else "rus" + text = _get_text_from_image(path=os.path.join(result_dir, imgs_path), language=language) + result_ocr_file.write(text) + result_ocr_file.close() - # call correction step + # 2 - call correction step time_b = time.time() - if USE_CORRECTION_OCR in (SAGE_CORRECTION, TEXT_BLOB_CORRECTION): - corrected_text = correction(corrector, text) if USE_CORRECTION_OCR == SAGE_CORRECTION else corrector.correct(text) - tmp_corrected_path = os.path.join(corrected_path, f"{img_name}_ocr.txt") - with open(tmp_corrected_path, "w") as tmp_corrected_file: - tmp_corrected_file.write(corrected_text) - calculate_accuracy_script(tmp_gt_path, tmp_corrected_path, accuracy_path) - else: - calculate_accuracy_script(tmp_gt_path, tmp_ocr_path, accuracy_path) + if USE_CORRECTION_OCR == correction.SAGE_CORRECTION: + corrected_text = corrector.correction(text) + result_ocr_filepath = os.path.join(corrector.corrected_path, f"{img_name}_ocr.txt") + with open(result_ocr_filepath, "w") as tmp_corrected_file: + tmp_corrected_file.write(corrected_text) correction_times.append(time.time() - time_b) + + # 3 - calculate accuracy from GTs and result texts + calculate_accuracy_script(tmp_gt_path, result_ocr_filepath, accuracy_path) statistics = _update_statistics_by_dataset(statistics, dataset_name, accuracy_path, word_cnt) - 
accuracy_values.append([dataset_name, base_name, psm, word_cnt, statistics[dataset_name]["Accuracy"][-1]]) + accuracy_values.append([dataset_name, base_name, language, word_cnt, statistics[dataset_name]["Accuracy"][-1]]) except Exception as ex: print(ex) @@ -274,6 +198,7 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c if __name__ == "__main__": base_zip = "data_tesseract_benchmarks" + output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks")) cache_dir = os.path.join(get_config()["intermediate_data_path"], "tesseract_data") os.makedirs(cache_dir, exist_ok=True) @@ -282,7 +207,7 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c benchmark_data_path = os.path.join(cache_dir, f"{base_zip}.zip") if not os.path.isfile(benchmark_data_path): - wget.download("https://at.ispras.ru/owncloud/index.php/s/wMyKioKInYITpYT/download", benchmark_data_path) + wget.download("https://at.ispras.ru/owncloud/index.php/s/gByenPIMlo0K7Gf/download", benchmark_data_path) print(f"Benchmark data downloaded to {benchmark_data_path}") else: print(f"Use cached benchmark data from {benchmark_data_path}") @@ -290,9 +215,9 @@ def __calculate_ocr_reports(cache_dir_accuracy: str, benchmark_data_path: str, c table_common, table_accuracy_per_image = __calculate_ocr_reports(cache_dir_accuracy, benchmark_data_path, cache_dir) - table_errors = __get_summary_symbol_error(path_reports=cache_dir_accuracy) + table_errors = get_summary_symbol_error(path_reports=cache_dir_accuracy) - with open(os.path.join(output_dir, f"tesseract_benchmark{USE_CORRECTION_OCR}.txt"), "w") as res_file: + with open(os.path.join(output_dir, f"tesseract_benchmark_{USE_CORRECTION_OCR}.txt"), "w") as res_file: res_file.write(f"Tesseract version is {pytesseract.get_tesseract_version()}\n") res_file.write(f"Correction step: {USE_CORRECTION_OCR}\n") res_file.write("\nTable 1 - Accuracy for each file\n") diff --git 
a/scripts/train/train_acc_orientation_classifier.py b/scripts/train/train_eval_orientation_classifier.py similarity index 67% rename from scripts/train/train_acc_orientation_classifier.py rename to scripts/train/train_eval_orientation_classifier.py index 43ba4737..abd558b6 100644 --- a/scripts/train/train_acc_orientation_classifier.py +++ b/scripts/train/train_eval_orientation_classifier.py @@ -3,10 +3,14 @@ from time import time from typing import List +import numpy as np import torch +from sklearn.metrics import precision_recall_fscore_support +from texttable import Texttable from torch import nn from torch import optim from torch.utils.data import DataLoader +from tqdm import tqdm from dedoc.config import get_config from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier @@ -16,6 +20,7 @@ checkpoint_path_save = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "efficient_net_b0_fixed.pth")) checkpoint_path_load = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "efficient_net_b0_fixed.pth")) checkpoint_path = "../../resources" +output_dir = os.path.abspath(os.path.join(checkpoint_path, "benchmarks")) parser.add_argument("-t", "--train", type=bool, help="run for train model", default=False) parser.add_argument("-s", "--checkpoint_save", help="Path to checkpoint for save or load", default=checkpoint_path_save) @@ -25,10 +30,16 @@ args = parser.parse_args() BATCH_SIZE = 1 -ON_GPU = False +ON_GPU = True +""" +Input data are available from our confluence (closed data). +First, you need to generate the full train/test data (all orientations of the source documents) using scripts/gen_dataset.py +Then, you can use this script. 
+""" -def accuracy_step(data_executor: DataLoaderImageOrient, net_executor: ColumnsOrientationClassifier) -> None: + +def evaluation_step(data_executor: DataLoaderImageOrient, net_executor: ColumnsOrientationClassifier) -> None: """ Function calculates accuracy for the trained model :param data_executor: Extractor Data from path @@ -47,10 +58,22 @@ def accuracy_step(data_executor: DataLoaderImageOrient, net_executor: ColumnsOri print(f"GroundTruth: orientation {orientation}, columns {columns}") - calc_accuracy_by_classes(testloader, data_executor.classes, net_executor, batch_size=1) + evaluation(testloader, data_executor.classes, net_executor) + + +def print_metrics(precision: np.array, recall: np.array, f1: np.array, cnt: np.array, avg: np.array, classes: List[str]) -> Texttable: + table = Texttable() + + table.header(["Class", "Precision", "Recall", "F1", "Count"]) + for i, name_class in enumerate(classes): + table.add_row([name_class, precision[i], recall[i], f1[i], cnt[i]]) + table.add_row(["AVG", avg[0], avg[1], avg[2], "None"]) -def calc_accuracy_by_classes(testloader: DataLoader, classes: List, classifier: ColumnsOrientationClassifier, batch_size: int = 1) -> None: + return table + + +def evaluation(testloader: DataLoader, classes: List, classifier: ColumnsOrientationClassifier) -> None: """ Function calculates accuracy ba each class :param testloader: DataLoader @@ -59,43 +82,47 @@ def calc_accuracy_by_classes(testloader: DataLoader, classes: List, classifier: :param batch_size: size of batch :return: """ - class_correct = list(0. for _ in range(len(classes))) - class_total = list(0. 
for _ in range(len(classes))) + orientation_pred, orientation_true = [], [] + column_pred, column_true = [], [] + time_predict = 0 cnt_predict = 0 with torch.no_grad(): - for data in testloader: + for data in tqdm(testloader): images, orientation, columns = data["image"], data["orientation"], data["columns"] - time_begin = time() + time_begin = time() outputs = classifier.net(images.float().to(classifier.device)) time_predict += time() - time_begin cnt_predict += len(images) + # first 2 classes mean columns number # last 4 classes mean orientation columns_out, orientation_out = outputs[:, :2], outputs[:, 2:] _, columns_predicted = torch.max(columns_out, 1) _, orientation_predicted = torch.max(orientation_out, 1) - orientation_c = (orientation_predicted == orientation.to(classifier.device)).squeeze() - columns_c = (columns_predicted == columns.to(classifier.device)).squeeze() - - for i in range(batch_size): - orientation_i = orientation[i] - columns_i = columns[i] - orientation_bool_predict = orientation_c.item() if batch_size == 1 else orientation_c[i].item() - columns_bool_predict = columns_c.item() if batch_size == 1 else columns_c[i].item() - class_correct[2 + orientation_i] += orientation_bool_predict - class_total[2 + orientation_i] += 1 - class_correct[columns_i] += orientation_bool_predict - class_total[columns_i] += 1 - if not orientation_bool_predict or not columns_bool_predict: - print( - f'{data["image_name"][i]} predict as \norientation: {classes[2 + orientation_predicted[i]]} \ncolumns: {classes[columns_predicted[i]]}' - ) - - for i in range(len(classes)): - print(f"Accuracy of {classes[i]:5s} : {100 * class_correct[i] / class_total[i] if class_total[i] != 0 else 0:2d} %") + orientation_pred.append(classes[2 + orientation_predicted.squeeze().item()]) + orientation_true.append(classes[2 + orientation.to(classifier.device).squeeze().item()]) + + column_pred.append(classes[columns_predicted.squeeze().item()]) + 
column_true.append(classes[columns.to(classifier.device).squeeze().item()]) + + with open(os.path.join(output_dir, "orient_classifier_scores.txt"), "w") as benchmark_file: + orient_metrics = precision_recall_fscore_support(orientation_true, orientation_pred, average=None, labels=classes[2:]) + orient_avg = precision_recall_fscore_support(orientation_true, orientation_pred, average="weighted") + table = print_metrics(*orient_metrics, orient_avg, classes[2:]) + print(table.draw()) + benchmark_file.write("\nOrientation predictions:\n") + benchmark_file.write(table.draw()) + + column_metrics = precision_recall_fscore_support(column_true, column_pred, average=None, labels=classes[:2]) + column_avg = precision_recall_fscore_support(column_true, column_pred, average="weighted") + table = print_metrics(*column_metrics, column_avg, classes[:2]) + print(table.draw()) + benchmark_file.write("\nColumn predictions:\n") + benchmark_file.write(table.draw()) + print(f"=== AVG Time predict {time_predict / cnt_predict}") @@ -167,8 +194,8 @@ def train_step(data_executor: DataLoaderImageOrient, classifier: ColumnsOrientat if __name__ == "__main__": config = get_config() data_executor = DataLoaderImageOrient() - net = ColumnsOrientationClassifier(on_gpu=True, checkpoint_path=checkpoint_path if not args.train else "", config=config) + net = ColumnsOrientationClassifier(on_gpu=ON_GPU, checkpoint_path=checkpoint_path if not args.train else "", config=config) if args.train: train_step(data_executor, net) else: - accuracy_step(data_executor, net) + evaluation_step(data_executor, net)