Skip to content

Commit

Permalink
Fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
NastyBoget committed Nov 14, 2023
1 parent bc05ab8 commit af709ae
Show file tree
Hide file tree
Showing 11 changed files with 79 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __init__(self, *, config: dict) -> None:
self.pdf_txtlayer_reader = PdfTxtlayerReader(config=config)
self.pdf_tabby_reader = PdfTabbyReader(config=config)
self.pdf_image_reader = PdfImageReader(config=config)
self.txtlayer_detector = TxtLayerDetector(pdf_reader=self.pdf_tabby_reader, config=config)
self.txtlayer_detector = TxtLayerDetector(pdf_txtlayer_reader=self.pdf_txtlayer_reader, pdf_tabby_reader=self.pdf_tabby_reader, config=config)

self.config = config
self.logger = config.get("logger", logging.getLogger())
Expand Down
19 changes: 15 additions & 4 deletions dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,21 @@
from dedoc.data_structures import LineWithMeta
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier import TxtlayerClassifier
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
from dedoc.utils.pdf_utils import get_pdf_page_count

PdfTxtlayerParameters = namedtuple("PdfTxtlayerParameters", ["is_correct_text_layer", "is_first_page_correct"])


class TxtLayerDetector:

def __init__(self, pdf_reader: PdfTabbyReader, *, config: dict) -> None:
def __init__(self, pdf_txtlayer_reader: PdfTxtlayerReader, pdf_tabby_reader: PdfTabbyReader, *, config: dict) -> None:
self.config = config
self.logger = config.get("logger", logging.getLogger())

self.txtlayer_classifier = TxtlayerClassifier(config=config)
self.pdf_tabby_reader = pdf_reader
self.pdf_txtlayer_reader = pdf_txtlayer_reader
self.pdf_tabby_reader = pdf_tabby_reader

def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters:
"""
Expand All @@ -39,8 +42,16 @@ def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters:

def __get_lines_for_predict(self, path: str, parameters: dict) -> List[LineWithMeta]:
parameters_copy = deepcopy(parameters)
parameters_copy["pages"] = "1:10"
document = self.pdf_tabby_reader.read(path, parameters=parameters_copy)
parameters_copy["pages"] = "1:8" # two batches for pdf_txtlayer_reader
parameters_copy["need_pdf_table_analysis"] = "false"
num_pages = get_pdf_page_count(path)
if num_pages is None or num_pages >= 50:
# TODO remove this when TLDR-518 is done
document = self.pdf_txtlayer_reader.read(path, parameters=parameters_copy)
else:
# tabby reader reads the whole document regardless of the "pages" parameter
# still, it's faster to use tabby for documents with fewer than 50 pages
document = self.pdf_tabby_reader.read(path, parameters=parameters_copy)

return document.lines

Expand Down
24 changes: 14 additions & 10 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,21 +105,25 @@ def __extract(self, path: str, parameters: dict, warnings: list)\
page_count = math.inf if page_count is None else page_count
first_page, last_page = get_param_page_slice(parameters)

if (first_page is not None and first_page >= page_count) or (last_page is not None and first_page >= last_page):
return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata

# in java tabby reader page numeration starts with 1, end_page is included
first_tabby_page = first_page + 1 if first_page is not None else 1
last_tabby_page = None if last_page is not None and last_page > page_count else last_page
document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page)

if (first_page is not None and first_page > 0) or (last_page is not None and last_page < page_count):
empty_page_limit = (first_page is not None and first_page >= page_count) or (last_page is not None and first_page >= last_page)
partial_page_limit = (first_page is not None and first_page > 0) or (last_page is not None and last_page < page_count)
if empty_page_limit or partial_page_limit:
warnings.append("The document is partially parsed")
document_metadata = dict(first_page=first_page)
if last_page is not None:
document_metadata["last_page"] = last_page

for page in document.get("pages", []):
if empty_page_limit:
return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata

# in java tabby reader page numeration starts with 1, end_page is included
# first_tabby_page = first_page + 1 if first_page is not None else 1
# last_tabby_page = None if last_page is not None and last_page > page_count else last_page
# document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page) TODO TLDR-518

document = self.__process_pdf(path=path)
pages = document.get("pages", [])
for page in pages[first_page:last_page]:
page_lines = self.__get_lines_with_location(page, file_hash)
if page_lines:
all_lines.extend(page_lines)
Expand Down
1 change: 1 addition & 0 deletions tests/api_tests/test_api_format_pdf_page_limit.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def test_auto_text_layer(self) -> None:

def test_tabby_layer(self) -> None:
# Runs both page-limit helpers of this test class with the "tabby" reader option:
# first the limit check with check_partially=True, then the out-of-limit check.
self.__check_limit("tabby", check_partially=True)
self.__check_out_of_limit("tabby")

def test_auto_tabby(self) -> None:
self.__check_limit("auto_tabby", check_partially=True)
Expand Down
4 changes: 3 additions & 1 deletion tests/api_tests/test_api_format_pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import unittest
from typing import List

from dedoc.data_structures import AttachAnnotation
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation
Expand Down Expand Up @@ -212,13 +213,14 @@ def test_pdf_with_tables(self) -> None:

def test_pdf_annotations(self) -> None:
file_name = "Document635.pdf"
result = self._send_request(file_name, data=dict(pdf_with_text_layer="tabby"))
result = self._send_request(file_name, data=dict(pdf_with_text_layer="tabby", with_attachments=True))
content = result["content"]["structure"]["subparagraphs"]
annotations = content[0]["annotations"]
annotation_names = {annotation["name"] for annotation in annotations}
self.assertIn(BoldAnnotation.name, annotation_names)
self.assertIn(SpacingAnnotation.name, annotation_names)
self.assertIn(BBoxAnnotation.name, annotation_names)
self.assertIn(AttachAnnotation.name, annotation_names)

def test_tables_with_merged_cells(self) -> None:
file_name = "big_table_with_merged_cells.pdf"
Expand Down
6 changes: 4 additions & 2 deletions tests/api_tests/test_api_misc_with_attachments.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,10 @@ def test_attachments_pmi_document(self) -> None:

attachments = result["attachments"]

self.assertEqual(attachments[0]["metadata"]["file_type"], "application/json")
self.assertEqual(attachments[1]["metadata"]["file_type"], "application/json")
self.assertEqual(attachments[0]["metadata"]["file_type"], "image/png")
self.assertEqual(attachments[1]["metadata"]["file_type"], "image/png")
self.assertEqual(attachments[2]["metadata"]["file_type"], "application/json")
self.assertEqual(attachments[3]["metadata"]["file_type"], "application/json")

def test_need_content_analysis(self) -> None:
file_name = "pdf_with_text_layer/Document635.pdf"
Expand Down
42 changes: 41 additions & 1 deletion tests/api_tests/test_api_misc_with_images_refs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

class TestApiImageRefs(AbstractTestApiDocReader):

data_directory_path = os.path.join(AbstractTestApiDocReader.data_directory_path, "docx")
data_directory_path = os.path.join(AbstractTestApiDocReader.data_directory_path, "with_attachments")

def test_docx_with_images(self) -> None:
file_name = "docx_with_images.docx"
Expand Down Expand Up @@ -58,6 +58,46 @@ def test_docx_with_images_from_mac(self) -> None:
image_paragraph = content["subparagraphs"][5]
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image3.png"])

def test_pdf_pdfminer_images_refs(self) -> None:
    """
    Send a PDF with pdf_with_text_layer="true" and attachments enabled, then check that
    the expected annotations are "attachment" annotations pointing at returned attachment uids.
    """
    request_data = dict(with_attachments=True, structure_type="linear", pdf_with_text_layer="true")
    result = self._send_request("with_attachments_1.docx.pdf", request_data)
    structure = result["content"]["structure"]

    attachment_uids = {attachment["metadata"]["uid"] for attachment in result["attachments"]}
    self.assertEqual(len(attachment_uids), 3)

    # (subparagraph index, annotation index) positions that must hold an attachment annotation
    expected_positions = ((0, -1), (3, -2), (3, -1))
    for paragraph_idx, annotation_idx in expected_positions:
        attach_annotation = structure["subparagraphs"][paragraph_idx]["annotations"][annotation_idx]
        self.assertEqual(attach_annotation["name"], "attachment")
        self.assertIn(attach_annotation["value"], attachment_uids)

def test_pdf_tabby_images_refs(self) -> None:
    """
    Send a PDF with pdf_with_text_layer="tabby" and attachments enabled, then check that
    the expected annotations are "attachment" annotations pointing at returned attachment uids.
    """
    request_data = dict(with_attachments=True, structure_type="linear", pdf_with_text_layer="tabby")
    result = self._send_request("with_attachments_1.docx.pdf", request_data)
    structure = result["content"]["structure"]

    attachment_uids = {attachment["metadata"]["uid"] for attachment in result["attachments"]}
    self.assertEqual(len(attachment_uids), 3)

    # (subparagraph index, annotation index) positions that must hold an attachment annotation
    expected_positions = ((2, -1), (4, -2), (4, -1))
    for paragraph_idx, annotation_idx in expected_positions:
        attach_annotation = structure["subparagraphs"][paragraph_idx]["annotations"][annotation_idx]
        self.assertEqual(attach_annotation["name"], "attachment")
        self.assertIn(attach_annotation["value"], attachment_uids)

def __check_image_paragraph(self, image_paragraph: dict, image_uid: str) -> None:
text = image_paragraph["text"]
image_annotations = image_paragraph["annotations"]
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
Binary file not shown.

0 comments on commit af709ae

Please sign in to comment.