Skip to content

Commit

Permalink
Benchmarks before changes
Browse files Browse the repository at this point in the history
  • Loading branch information
NastyBoget committed Nov 13, 2023
1 parent e6abe72 commit 3524442
Show file tree
Hide file tree
Showing 4 changed files with 229 additions and 1 deletion.
1 change: 1 addition & 0 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def __init__(self, config: dict) -> None:
"""
:param config: configuration of the reader, e.g. logger for logging
"""
config["n_jobs"] = config.get("n_jobs", 1)
self.table_recognizer = TableRecognizer(config=config)
self.metadata_extractor = LineMetadataExtractor(config=config)
self.config = config
Expand Down
134 changes: 134 additions & 0 deletions dedoc/scripts/benchmark_pdf_attachments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import json
import os
import shutil
import tempfile
import zipfile
from collections import OrderedDict
from typing import Tuple

import wget

from dedoc.attachments_extractors import AbstractAttachmentsExtractor, PDFAttachmentsExtractor
from dedoc.config import get_config
from dedoc.data_structures import AttachedFile
from dedoc.readers import BaseReader, PdfTabbyReader, PdfTxtlayerReader


def get_reader_attachments(reader: BaseReader, input_dir: str, attachments_dir: str) -> dict:
    """
    Read every benchmark PDF with the given reader and save the attachments found in the resulting document.

    Files of ``input_dir`` whose name does not end with ``pdf`` are ignored, as well as ``large.pdf``.
    Each remaining file is copied into a temporary directory and read from there with ``with_attachments`` enabled.
    Attachments that exist on disk are copied to ``<attachments_dir>/<file name with dots replaced by underscores>``.

    NOTE(review): attachments that get the same saved name are copied to the same path, so the later one overwrites
    the earlier one on disk, while both names are still listed in the result.

    :param reader: reader used to parse the files, its ``read`` result must have the ``attachments`` attribute
    :param input_dir: directory with the benchmark PDF files
    :param attachments_dir: directory for the extracted attachments, it is created here and must not exist yet
    :return: ordered mapping from the file name to the sorted list of names of the saved attachments
    """
    os.makedirs(attachments_dir)
    extracted = OrderedDict()
    pdf_names = [name for name in sorted(os.listdir(input_dir)) if name.endswith("pdf") and name != "large.pdf"]

    for pdf_name in pdf_names:
        saved_names = []

        # the reader works on a copy of the file placed in a disposable directory
        # (presumably because attachments are saved next to the file being read - TODO confirm)
        with tempfile.TemporaryDirectory() as work_dir:
            pdf_copy = os.path.join(work_dir, pdf_name)
            shutil.copy(os.path.join(input_dir, pdf_name), pdf_copy)
            document = reader.read(pdf_copy, parameters={"with_attachments": "true"})
            os.remove(pdf_copy)

            target_dir = os.path.join(attachments_dir, pdf_name.replace(".", "_"))
            os.makedirs(target_dir)

            png_count = json_count = 0
            for attached in document.attachments:
                # attachments without a file on disk are counted only in the "in result" part of the report
                if not os.path.isfile(attached.tmp_file_path):
                    continue

                saved_name, png_count, json_count = _get_attachment_name(attached, png_count, json_count)
                shutil.copy(attached.tmp_file_path, os.path.join(target_dir, saved_name))
                saved_names.append(saved_name)

        print(f"{pdf_name}: {len(saved_names)} attachments, {len(document.attachments)} in result")
        extracted[pdf_name] = sorted(saved_names)

    return extracted


def get_attachments(attachments_extractor: AbstractAttachmentsExtractor, input_dir: str, attachments_dir: str) -> dict:
    """
    Run the attachments extractor on every benchmark PDF and save the attachments it returns.

    Files of ``input_dir`` whose name does not end with ``pdf`` are ignored.
    Each remaining file is copied into a temporary directory, and the extractor is called on that copy with empty parameters.
    Attachments that exist on disk are copied to ``<attachments_dir>/<file name with dots replaced by underscores>``.

    NOTE(review): attachments that get the same saved name are copied to the same path, so the later one overwrites
    the earlier one on disk, while both names are still listed in the result.

    :param attachments_extractor: extractor whose ``get_attachments`` method is called for each file
    :param input_dir: directory with the benchmark PDF files
    :param attachments_dir: directory for the extracted attachments, it is created here and must not exist yet
    :return: ordered mapping from the file name to the sorted list of names of the saved attachments
    """
    os.makedirs(attachments_dir)
    extracted = OrderedDict()
    pdf_names = [name for name in sorted(os.listdir(input_dir)) if name.endswith("pdf")]

    for pdf_name in pdf_names:
        saved_names = []

        # the extractor is pointed at a copy of the file inside a disposable directory
        with tempfile.TemporaryDirectory() as work_dir:
            pdf_copy = os.path.join(work_dir, pdf_name)
            shutil.copy(os.path.join(input_dir, pdf_name), pdf_copy)
            found = attachments_extractor.get_attachments(tmpdir=work_dir, filename=pdf_name, parameters={})
            os.remove(pdf_copy)

            target_dir = os.path.join(attachments_dir, pdf_name.replace(".", "_"))
            os.makedirs(target_dir)

            png_count = json_count = 0
            for attached in found:
                # attachments without a file on disk are counted only in the "in result" part of the report
                if not os.path.isfile(attached.tmp_file_path):
                    continue

                saved_name, png_count, json_count = _get_attachment_name(attached, png_count, json_count)
                shutil.copy(attached.tmp_file_path, os.path.join(target_dir, saved_name))
                saved_names.append(saved_name)

        print(f"{pdf_name}: {len(saved_names)} attachments, {len(found)} in result")
        extracted[pdf_name] = sorted(saved_names)

    return extracted


def _get_attachment_name(attachment: AttachedFile, png_files: int, json_files: int) -> Tuple[str, int, int]:
attachment_name = attachment.original_name
if attachment_name.endswith(".png"):
png_files += 1
attachment_name = f"{png_files}.png"
if attachment_name.endswith(".json"):
json_files += 1
attachment_name = f"{json_files}.json"
return attachment_name, png_files, json_files


if __name__ == "__main__":
    # Benchmark entry point:
    # 1. download and unpack the benchmark data (only if the data directory does not exist yet);
    # 2. extract attachments with the tabby reader, the pdfminer (txtlayer) reader and the attachments extractor;
    # 3. save the names of the extracted attachments to resources/benchmarks/benchmark_pdf_attachments.json.
    data_url = "https://at.ispras.ru/owncloud/index.php/s/EoczXGwWzai8ztN/download"
    data_dir = os.path.join(get_config()["intermediate_data_path"], "benchmark_pdf_attachments")

    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)
        archive_path = os.path.join(data_dir, "with_attachments.zip")
        try:
            wget.download(data_url, archive_path)
            with zipfile.ZipFile(archive_path, "r") as zip_ref:
                zip_ref.extractall(data_dir)
            os.remove(archive_path)
        except BaseException:
            # the existence of data_dir is the only cache marker: if a failed or interrupted download left it behind,
            # the next run would take the "cached" branch and work with missing or partial data
            shutil.rmtree(data_dir, ignore_errors=True)
            raise

        print(f"Benchmark data downloaded to {data_dir}")
    else:
        print(f"Use cached benchmark data from {data_dir}")

    in_dir = os.path.join(data_dir, "with_attachments")
    out_dir = os.path.join(in_dir, "extracted_attachments")

    # results of the previous run are dropped, the functions below create their output directories themselves
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)

    benchmarks_dict = {}

    print("Get tabby attachments")
    tabby_reader = PdfTabbyReader(config={})
    tabby_out_dir = os.path.join(out_dir, "tabby")
    benchmarks_dict["tabby"] = get_reader_attachments(reader=tabby_reader, input_dir=in_dir, attachments_dir=tabby_out_dir)

    print("Get pdfminer attachments")
    pdfminer_reader = PdfTxtlayerReader(config={})
    pdfminer_out_dir = os.path.join(out_dir, "pdfminer")
    benchmarks_dict["pdfminer"] = get_reader_attachments(reader=pdfminer_reader, input_dir=in_dir, attachments_dir=pdfminer_out_dir)

    print("Get common attachments")
    common_out_dir = os.path.join(out_dir, "common")
    pdf_attachments_extractor = PDFAttachmentsExtractor(config={})
    benchmarks_dict["common"] = get_attachments(attachments_extractor=pdf_attachments_extractor, input_dir=in_dir, attachments_dir=common_out_dir)

    json_out_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks"))
    os.makedirs(json_out_dir, exist_ok=True)
    # ensure_ascii=False writes non-ASCII names as is, so the encoding is fixed explicitly:
    # with the locale-dependent default the result could differ between machines or fail with UnicodeEncodeError
    with open(os.path.join(json_out_dir, "benchmark_pdf_attachments.json"), "w", encoding="utf-8") as f:
        json.dump(benchmarks_dict, f, ensure_ascii=False, indent=2)

    print(f"Attachments were extracted to {out_dir}")
3 changes: 2 additions & 1 deletion docs/source/tutorials/add_new_doc_type.rst
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,8 @@ You should implement the following methods:
For each line, you need to add its text, metadata, hierarchy level (if exists) and annotations (if exist).
For tables, you need to add a list of rows (each row is a list of table cells) and metadata.
You can use :ref:`dedoc_data_structures` to learn more about all the described structures.
We use PyPDF2 to extract the text and tabula to extract tables. They must be added to ``requirements.txt`` of the project.
We use `PyPDF2 <https://pypdf2.readthedocs.io>`_ to extract the text and `tabula <https://tabula-py.readthedocs.io>`_ to extract tables.
They must be added to ``requirements.txt`` of the project.
We use class ``PdfAttachmentsExtractor`` for attachments extraction (it was mentioned before).
It must be added to the reader's constructor and used in ``read`` method.

Expand Down
92 changes: 92 additions & 0 deletions resources/benchmarks/benchmark_pdf_attachments.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
{
"tabby": {
"Document635.pdf": [
"1.json",
"2.json"
],
"example_with_attachments_depth_1.pdf": [
"1.json",
"attachment.txt",
"example_with_table4.jpg",
"header_test.pdf",
"header_test.pdf"
],
"example_with_images.xlsx.pdf": [],
"with_attachments_0.docx.pdf": [],
"with_attachments_1.docx.pdf": [],
"with_attachments_1.pptx.pdf": [],
"with_attachments_2.docx.pdf": [],
"with_attachments_2.pptx.pdf": [],
"with_attachments_3.pdf": []
},
"pdfminer": {
"Document635.pdf": [
"1.json",
"1.png",
"2.json",
"2.png"
],
"example_with_attachments_depth_1.pdf": [
"1.json",
"attachment.txt",
"example_with_table4.jpg",
"header_test.pdf",
"header_test.pdf"
],
"example_with_images.xlsx.pdf": [
"1.png",
"2.png"
],
"with_attachments_0.docx.pdf": [
"1.png",
"2.png",
"3.png",
"4.png"
],
"with_attachments_1.docx.pdf": [
"1.png",
"2.png",
"3.png"
],
"with_attachments_1.pptx.pdf": [
"1.png",
"2.png",
"3.png"
],
"with_attachments_2.docx.pdf": [
"1.png",
"2.png"
],
"with_attachments_2.pptx.pdf": [],
"with_attachments_3.pdf": [
"1.png",
"2.png",
"3.png",
"4.png",
"5.png",
"6.png",
"7.png"
]
},
"common": {
"Document635.pdf": [
"1.json",
"2.json"
],
"example_with_attachments_depth_1.pdf": [
"1.json",
"attachment.txt",
"example_with_table4.jpg",
"header_test.pdf",
"header_test.pdf"
],
"example_with_images.xlsx.pdf": [],
"large.pdf": [],
"with_attachments_0.docx.pdf": [],
"with_attachments_1.docx.pdf": [],
"with_attachments_1.pptx.pdf": [],
"with_attachments_2.docx.pdf": [],
"with_attachments_2.pptx.pdf": [],
"with_attachments_3.pdf": []
}
}

0 comments on commit 3524442

Please sign in to comment.