Benchmarks before changes

ispras · Nov 13, 2023 · 3524442 · 3524442
1 parent e6abe72
commit 3524442
Show file tree

Hide file tree

Showing 4 changed files with 229 additions and 1 deletion.
diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -58,6 +58,7 @@ def __init__(self, config: dict) -> None:
         """
         :param config: configuration of the reader, e.g. logger for logging
         """
+        config["n_jobs"] = config.get("n_jobs", 1)
         self.table_recognizer = TableRecognizer(config=config)
         self.metadata_extractor = LineMetadataExtractor(config=config)
         self.config = config

diff --git a/dedoc/scripts/benchmark_pdf_attachments.py b/dedoc/scripts/benchmark_pdf_attachments.py
@@ -0,0 +1,134 @@
+import json
+import os
+import shutil
+import tempfile
+import zipfile
+from collections import OrderedDict
+from typing import Tuple
+
+import wget
+
+from dedoc.attachments_extractors import AbstractAttachmentsExtractor, PDFAttachmentsExtractor
+from dedoc.config import get_config
+from dedoc.data_structures import AttachedFile
+from dedoc.readers import BaseReader, PdfTabbyReader, PdfTxtlayerReader
+
+
+def get_reader_attachments(reader: BaseReader, input_dir: str, attachments_dir: str) -> dict:
+    os.makedirs(attachments_dir)
+    result_dict = OrderedDict()
+
+    for file_name in sorted(os.listdir(input_dir)):
+        if not file_name.endswith("pdf") or file_name == "large.pdf":
+            continue
+
+        attachment_names = []
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            file_path = os.path.join(tmp_dir, file_name)
+            shutil.copy(os.path.join(input_dir, file_name), file_path)
+            document = reader.read(file_path, parameters={"with_attachments": "true"})
+            os.remove(file_path)
+
+            file_attachments_dir = os.path.join(attachments_dir, file_name.replace(".", "_"))
+            os.makedirs(file_attachments_dir)
+
+            png_files, json_files = 0, 0
+            for attachment in document.attachments:
+                if os.path.isfile(attachment.tmp_file_path):
+                    attachment_name, png_files, json_files = _get_attachment_name(attachment, png_files, json_files)
+                    shutil.copy(attachment.tmp_file_path, os.path.join(file_attachments_dir, attachment_name))
+                    attachment_names.append(attachment_name)
+
+        print(f"{file_name}: {len(attachment_names)} attachments, {len(document.attachments)} in result")
+        result_dict[file_name] = sorted(attachment_names)
+
+    return result_dict
+
+
+def get_attachments(attachments_extractor: AbstractAttachmentsExtractor, input_dir: str, attachments_dir: str) -> dict:
+    os.makedirs(attachments_dir)
+    result_dict = OrderedDict()
+
+    for file_name in sorted(os.listdir(input_dir)):
+        if not file_name.endswith("pdf"):
+            continue
+
+        attachment_names = []
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            file_path = os.path.join(tmp_dir, file_name)
+            shutil.copy(os.path.join(input_dir, file_name), file_path)
+            attachments = attachments_extractor.get_attachments(tmpdir=tmp_dir, filename=file_name, parameters={})
+            os.remove(file_path)
+
+            file_attachments_dir = os.path.join(attachments_dir, file_name.replace(".", "_"))
+            os.makedirs(file_attachments_dir)
+
+            png_files, json_files = 0, 0
+            for attachment in attachments:
+                if os.path.isfile(attachment.tmp_file_path):
+                    attachment_name, png_files, json_files = _get_attachment_name(attachment, png_files, json_files)
+                    shutil.copy(attachment.tmp_file_path, os.path.join(file_attachments_dir, attachment_name))
+                    attachment_names.append(attachment_name)
+
+        print(f"{file_name}: {len(attachment_names)} attachments, {len(attachments)} in result")
+        result_dict[file_name] = sorted(attachment_names)
+
+    return result_dict
+
+
+def _get_attachment_name(attachment: AttachedFile, png_files: int, json_files: int) -> Tuple[str, int, int]:
+    attachment_name = attachment.original_name
+    if attachment_name.endswith(".png"):
+        png_files += 1
+        attachment_name = f"{png_files}.png"
+    if attachment_name.endswith(".json"):
+        json_files += 1
+        attachment_name = f"{json_files}.json"
+    return attachment_name, png_files, json_files
+
+
+if __name__ == "__main__":
+    data_url = "https://at.ispras.ru/owncloud/index.php/s/EoczXGwWzai8ztN/download"
+    data_dir = os.path.join(get_config()["intermediate_data_path"], "benchmark_pdf_attachments")
+
+    if not os.path.isdir(data_dir):
+        os.makedirs(data_dir)
+        archive_path = os.path.join(data_dir, "with_attachments.zip")
+        wget.download(data_url, archive_path)
+        with zipfile.ZipFile(archive_path, "r") as zip_ref:
+            zip_ref.extractall(data_dir)
+        os.remove(archive_path)
+
+        print(f"Benchmark data downloaded to {data_dir}")
+    else:
+        print(f"Use cached benchmark data from {data_dir}")
+
+    in_dir = os.path.join(data_dir, "with_attachments")
+    out_dir = os.path.join(in_dir, "extracted_attachments")
+
+    if os.path.exists(out_dir):
+        shutil.rmtree(out_dir)
+    os.makedirs(out_dir)
+
+    benchmarks_dict = {}
+
+    print("Get tabby attachments")
+    tabby_reader = PdfTabbyReader(config={})
+    tabby_out_dir = os.path.join(out_dir, "tabby")
+    benchmarks_dict["tabby"] = get_reader_attachments(reader=tabby_reader, input_dir=in_dir, attachments_dir=tabby_out_dir)
+
+    print("Get pdfminer attachments")
+    pdfminer_reader = PdfTxtlayerReader(config={})
+    pdfminer_out_dir = os.path.join(out_dir, "pdfminer")
+    benchmarks_dict["pdfminer"] = get_reader_attachments(reader=pdfminer_reader, input_dir=in_dir, attachments_dir=pdfminer_out_dir)
+
+    print("Get common attachments")
+    common_out_dir = os.path.join(out_dir, "common")
+    pdf_attachments_extractor = PDFAttachmentsExtractor(config={})
+    benchmarks_dict["common"] = get_attachments(attachments_extractor=pdf_attachments_extractor, input_dir=in_dir, attachments_dir=common_out_dir)
+
+    json_out_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks"))
+    with open(os.path.join(json_out_dir, "benchmark_pdf_attachments.json"), "w") as f:
+        json.dump(benchmarks_dict, f, ensure_ascii=False, indent=2)
+
+    print(f"Attachments were extracted to {out_dir}")
diff --git a/docs/source/tutorials/add_new_doc_type.rst b/docs/source/tutorials/add_new_doc_type.rst
@@ -175,7 +175,8 @@ You should implement the following methods:
 For each line, you need to add its text, metadata, hierarchy level (if exists) and annotations (if exist).
 For tables, you need to add a list of rows (each row is a list of table cells) and metadata.
 You can use :ref:`dedoc_data_structures` to learn more about all the described structures.
-We use PyPDF2 to extract the text and tabula to extract tables. They must be added to ``requirements.txt`` of the project.
+We use `PyPDF2 <https://pypdf2.readthedocs.io>`_ to extract the text and `tabula <https://tabula-py.readthedocs.io>`_ to extract tables.
+They must be added to ``requirements.txt`` of the project.
 We use class ``PdfAttachmentsExtractor`` for attachments extraction (it was mentioned before).
 It must be added to the reader's constructor and used in ``read`` method.
 

diff --git a/resources/benchmarks/benchmark_pdf_attachments.json b/resources/benchmarks/benchmark_pdf_attachments.json
@@ -0,0 +1,92 @@
+{
+  "tabby": {
+    "Document635.pdf": [
+      "1.json",
+      "2.json"
+    ],
+    "example_with_attachments_depth_1.pdf": [
+      "1.json",
+      "attachment.txt",
+      "example_with_table4.jpg",
+      "header_test.pdf",
+      "header_test.pdf"
+    ],
+    "example_with_images.xlsx.pdf": [],
+    "with_attachments_0.docx.pdf": [],
+    "with_attachments_1.docx.pdf": [],
+    "with_attachments_1.pptx.pdf": [],
+    "with_attachments_2.docx.pdf": [],
+    "with_attachments_2.pptx.pdf": [],
+    "with_attachments_3.pdf": []
+  },
+  "pdfminer": {
+    "Document635.pdf": [
+      "1.json",
+      "1.png",
+      "2.json",
+      "2.png"
+    ],
+    "example_with_attachments_depth_1.pdf": [
+      "1.json",
+      "attachment.txt",
+      "example_with_table4.jpg",
+      "header_test.pdf",
+      "header_test.pdf"
+    ],
+    "example_with_images.xlsx.pdf": [
+      "1.png",
+      "2.png"
+    ],
+    "with_attachments_0.docx.pdf": [
+      "1.png",
+      "2.png",
+      "3.png",
+      "4.png"
+    ],
+    "with_attachments_1.docx.pdf": [
+      "1.png",
+      "2.png",
+      "3.png"
+    ],
+    "with_attachments_1.pptx.pdf": [
+      "1.png",
+      "2.png",
+      "3.png"
+    ],
+    "with_attachments_2.docx.pdf": [
+      "1.png",
+      "2.png"
+    ],
+    "with_attachments_2.pptx.pdf": [],
+    "with_attachments_3.pdf": [
+      "1.png",
+      "2.png",
+      "3.png",
+      "4.png",
+      "5.png",
+      "6.png",
+      "7.png"
+    ]
+  },
+  "common": {
+    "Document635.pdf": [
+      "1.json",
+      "2.json"
+    ],
+    "example_with_attachments_depth_1.pdf": [
+      "1.json",
+      "attachment.txt",
+      "example_with_table4.jpg",
+      "header_test.pdf",
+      "header_test.pdf"
+    ],
+    "example_with_images.xlsx.pdf": [],
+    "large.pdf": [],
+    "with_attachments_0.docx.pdf": [],
+    "with_attachments_1.docx.pdf": [],
+    "with_attachments_1.pptx.pdf": [],
+    "with_attachments_2.docx.pdf": [],
+    "with_attachments_2.pptx.pdf": [],
+    "with_attachments_3.pdf": []
+  }
+}