Fix tests

ispras · Nov 14, 2023 · af709ae · af709ae
1 parent bc05ab8
commit af709ae
Show file tree

Hide file tree

Showing 11 changed files with 79 additions and 19 deletions.
diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py
@@ -38,7 +38,7 @@ def __init__(self, *, config: dict) -> None:
         self.pdf_txtlayer_reader = PdfTxtlayerReader(config=config)
         self.pdf_tabby_reader = PdfTabbyReader(config=config)
         self.pdf_image_reader = PdfImageReader(config=config)
-        self.txtlayer_detector = TxtLayerDetector(pdf_reader=self.pdf_tabby_reader, config=config)
+        self.txtlayer_detector = TxtLayerDetector(pdf_txtlayer_reader=self.pdf_txtlayer_reader, pdf_tabby_reader=self.pdf_tabby_reader, config=config)
 
         self.config = config
         self.logger = config.get("logger", logging.getLogger())

diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py
@@ -6,18 +6,21 @@
 from dedoc.data_structures import LineWithMeta
 from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier import TxtlayerClassifier
 from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
+from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
+from dedoc.utils.pdf_utils import get_pdf_page_count
 
 PdfTxtlayerParameters = namedtuple("PdfTxtlayerParameters", ["is_correct_text_layer", "is_first_page_correct"])
 
 
 class TxtLayerDetector:
 
-    def __init__(self, pdf_reader: PdfTabbyReader, *, config: dict) -> None:
+    def __init__(self, pdf_txtlayer_reader: PdfTxtlayerReader, pdf_tabby_reader: PdfTabbyReader, *, config: dict) -> None:
         self.config = config
         self.logger = config.get("logger", logging.getLogger())
 
         self.txtlayer_classifier = TxtlayerClassifier(config=config)
-        self.pdf_tabby_reader = pdf_reader
+        self.pdf_txtlayer_reader = pdf_txtlayer_reader
+        self.pdf_tabby_reader = pdf_tabby_reader
 
     def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters:
         """
@@ -39,8 +42,16 @@ def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters:
 
     def __get_lines_for_predict(self, path: str, parameters: dict) -> List[LineWithMeta]:
         parameters_copy = deepcopy(parameters)
-        parameters_copy["pages"] = "1:10"
-        document = self.pdf_tabby_reader.read(path, parameters=parameters_copy)
+        parameters_copy["pages"] = "1:8"  # two batches for pdf_txtlayer_reader
+        parameters_copy["need_pdf_table_analysis"] = "false"
+        num_pages = get_pdf_page_count(path)
+        if num_pages is None or num_pages >= 50:
+            # TODO remove this when TLDR-518 is done
+            document = self.pdf_txtlayer_reader.read(path, parameters=parameters_copy)
+        else:
+            # tabby reader reads the whole document regardless of the "pages" parameter,
+            # but it is still faster to use tabby for documents with fewer than 50 pages
+            document = self.pdf_tabby_reader.read(path, parameters=parameters_copy)
 
         return document.lines
 

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
@@ -105,21 +105,25 @@ def __extract(self, path: str, parameters: dict, warnings: list)\
         page_count = math.inf if page_count is None else page_count
         first_page, last_page = get_param_page_slice(parameters)
 
-        if (first_page is not None and first_page >= page_count) or (last_page is not None and first_page >= last_page):
-            return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata
-
-        # in java tabby reader page numeration starts with 1, end_page is included
-        first_tabby_page = first_page + 1 if first_page is not None else 1
-        last_tabby_page = None if last_page is not None and last_page > page_count else last_page
-        document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page)
-
-        if (first_page is not None and first_page > 0) or (last_page is not None and last_page < page_count):
+        empty_page_limit = (first_page is not None and first_page >= page_count) or (last_page is not None and first_page >= last_page)
+        partial_page_limit = (first_page is not None and first_page > 0) or (last_page is not None and last_page < page_count)
+        if empty_page_limit or partial_page_limit:
             warnings.append("The document is partially parsed")
             document_metadata = dict(first_page=first_page)
             if last_page is not None:
                 document_metadata["last_page"] = last_page
 
-        for page in document.get("pages", []):
+            if empty_page_limit:
+                return all_lines, all_tables, all_tables_on_images, all_attached_images, document_metadata
+
+        # in java tabby reader page numeration starts with 1, end_page is included
+        # first_tabby_page = first_page + 1 if first_page is not None else 1
+        # last_tabby_page = None if last_page is not None and last_page > page_count else last_page
+        # document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page) TODO TLDR-518
+
+        document = self.__process_pdf(path=path)
+        pages = document.get("pages", [])
+        for page in pages[first_page:last_page]:
             page_lines = self.__get_lines_with_location(page, file_hash)
             if page_lines:
                 all_lines.extend(page_lines)

diff --git a/tests/api_tests/test_api_format_pdf_page_limit.py b/tests/api_tests/test_api_format_pdf_page_limit.py
@@ -34,6 +34,7 @@ def test_auto_text_layer(self) -> None:
 
     def test_tabby_layer(self) -> None:
         self.__check_limit("tabby", check_partially=True)
+        self.__check_out_of_limit("tabby")
 
     def test_auto_tabby(self) -> None:
         self.__check_limit("auto_tabby", check_partially=True)

diff --git a/tests/api_tests/test_api_format_pdf_tabby_reader.py b/tests/api_tests/test_api_format_pdf_tabby_reader.py
@@ -2,6 +2,7 @@
 import unittest
 from typing import List
 
+from dedoc.data_structures import AttachAnnotation
 from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
 from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
 from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation
@@ -212,13 +213,14 @@ def test_pdf_with_tables(self) -> None:
 
     def test_pdf_annotations(self) -> None:
         file_name = "Document635.pdf"
-        result = self._send_request(file_name, data=dict(pdf_with_text_layer="tabby"))
+        result = self._send_request(file_name, data=dict(pdf_with_text_layer="tabby", with_attachments=True))
         content = result["content"]["structure"]["subparagraphs"]
         annotations = content[0]["annotations"]
         annotation_names = {annotation["name"] for annotation in annotations}
         self.assertIn(BoldAnnotation.name, annotation_names)
         self.assertIn(SpacingAnnotation.name, annotation_names)
         self.assertIn(BBoxAnnotation.name, annotation_names)
+        self.assertIn(AttachAnnotation.name, annotation_names)
 
     def test_tables_with_merged_cells(self) -> None:
         file_name = "big_table_with_merged_cells.pdf"

diff --git a/tests/api_tests/test_api_misc_with_attachments.py b/tests/api_tests/test_api_misc_with_attachments.py
@@ -50,8 +50,10 @@ def test_attachments_pmi_document(self) -> None:
 
         attachments = result["attachments"]
 
-        self.assertEqual(attachments[0]["metadata"]["file_type"], "application/json")
-        self.assertEqual(attachments[1]["metadata"]["file_type"], "application/json")
+        self.assertEqual(attachments[0]["metadata"]["file_type"], "image/png")
+        self.assertEqual(attachments[1]["metadata"]["file_type"], "image/png")
+        self.assertEqual(attachments[2]["metadata"]["file_type"], "application/json")
+        self.assertEqual(attachments[3]["metadata"]["file_type"], "application/json")
 
     def test_need_content_analysis(self) -> None:
         file_name = "pdf_with_text_layer/Document635.pdf"

diff --git a/tests/api_tests/test_api_misc_with_images_refs.py b/tests/api_tests/test_api_misc_with_images_refs.py
@@ -5,7 +5,7 @@
 
 class TestApiImageRefs(AbstractTestApiDocReader):
 
-    data_directory_path = os.path.join(AbstractTestApiDocReader.data_directory_path, "docx")
+    data_directory_path = os.path.join(AbstractTestApiDocReader.data_directory_path, "with_attachments")
 
     def test_docx_with_images(self) -> None:
         file_name = "docx_with_images.docx"
@@ -58,6 +58,46 @@ def test_docx_with_images_from_mac(self) -> None:
         image_paragraph = content["subparagraphs"][5]
         self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid["image3.png"])
 
+    def test_pdf_pdfminer_images_refs(self) -> None:
+        file_name = "with_attachments_1.docx.pdf"
+        result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear", pdf_with_text_layer="true"))
+        structure = result["content"]["structure"]
+
+        attachment_uids = {attachment["metadata"]["uid"] for attachment in result["attachments"]}
+        self.assertEqual(len(attachment_uids), 3)
+
+        attach_annotation = structure["subparagraphs"][0]["annotations"][-1]
+        self.assertEqual(attach_annotation["name"], "attachment")
+        self.assertIn(attach_annotation["value"], attachment_uids)
+
+        attach_annotation = structure["subparagraphs"][3]["annotations"][-2]
+        self.assertEqual(attach_annotation["name"], "attachment")
+        self.assertIn(attach_annotation["value"], attachment_uids)
+
+        attach_annotation = structure["subparagraphs"][3]["annotations"][-1]
+        self.assertEqual(attach_annotation["name"], "attachment")
+        self.assertIn(attach_annotation["value"], attachment_uids)
+
+    def test_pdf_tabby_images_refs(self) -> None:
+        file_name = "with_attachments_1.docx.pdf"
+        result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear", pdf_with_text_layer="tabby"))
+        structure = result["content"]["structure"]
+
+        attachment_uids = {attachment["metadata"]["uid"] for attachment in result["attachments"]}
+        self.assertEqual(len(attachment_uids), 3)
+
+        attach_annotation = structure["subparagraphs"][2]["annotations"][-1]
+        self.assertEqual(attach_annotation["name"], "attachment")
+        self.assertIn(attach_annotation["value"], attachment_uids)
+
+        attach_annotation = structure["subparagraphs"][4]["annotations"][-2]
+        self.assertEqual(attach_annotation["name"], "attachment")
+        self.assertIn(attach_annotation["value"], attachment_uids)
+
+        attach_annotation = structure["subparagraphs"][4]["annotations"][-1]
+        self.assertEqual(attach_annotation["name"], "attachment")
+        self.assertIn(attach_annotation["value"], attachment_uids)
+
     def __check_image_paragraph(self, image_paragraph: dict, image_uid: str) -> None:
         text = image_paragraph["text"]
         image_annotations = image_paragraph["annotations"]

diff --git a/tests/data/docx/doc_with_images.docx → ...ata/with_attachments/doc_with_images.docx b/tests/data/docx/doc_with_images.docx → ...ata/with_attachments/doc_with_images.docx
diff --git a/tests/data/docx/docx_with_images.docx → ...ta/with_attachments/docx_with_images.docx b/tests/data/docx/docx_with_images.docx → ...ta/with_attachments/docx_with_images.docx
diff --git a/tests/data/docx/odt_with_images.odt → ...data/with_attachments/odt_with_images.odt b/tests/data/docx/odt_with_images.odt → ...data/with_attachments/odt_with_images.odt
diff --git a/tests/data/with_attachments/with_attachments_1.docx.pdf b/tests/data/with_attachments/with_attachments_1.docx.pdf