ispras · alexander1999-hub · May 30, 2024 · May 21, 2024 · May 23, 2024 · May 24, 2024
diff --git a/dedoc/readers/docx_reader/data_structures/docx_document.py b/dedoc/readers/docx_reader/data_structures/docx_document.py
@@ -1,14 +1,11 @@
 import hashlib
 import logging
-import os
 import re
-import zipfile
 from collections import defaultdict
-from typing import List, Optional
+from typing import List
 
 from bs4 import BeautifulSoup, Tag
 
-from dedoc.common.exceptions.bad_file_error import BadFileFormatError
 from dedoc.data_structures.attached_file import AttachedFile
 from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
 from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
@@ -19,6 +16,7 @@
 from dedoc.readers.docx_reader.line_with_meta_converter import LineWithMetaConverter
 from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
 from dedoc.readers.docx_reader.styles_extractor import StylesExtractor
+from dedoc.utils.office_utils import get_bs_from_zip
 from dedoc.utils.utils import calculate_file_hash
 
 
@@ -28,8 +26,8 @@ def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.L
         self.path = path
         self.attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments}
 
-        self.document_bs_tree = self.__get_bs_tree("word/document.xml")
-        self.document_bs_tree = self.__get_bs_tree("word/document2.xml") if self.document_bs_tree is None else self.document_bs_tree
+        self.document_bs_tree = get_bs_from_zip(self.path, "word/document.xml")
+        self.document_bs_tree = get_bs_from_zip(self.path, "word/document2.xml") if self.document_bs_tree is None else self.document_bs_tree
         self.body = self.document_bs_tree.body if self.document_bs_tree else None
         self.paragraph_maker = self.__get_paragraph_maker()
 
@@ -39,8 +37,8 @@ def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.L
         self.lines = self.__get_lines()
 
     def __get_paragraph_maker(self) -> ParagraphMaker:
-        styles_extractor = StylesExtractor(self.__get_bs_tree("word/styles.xml"), self.logger)
-        num_tree = self.__get_bs_tree("word/numbering.xml")
+        styles_extractor = StylesExtractor(get_bs_from_zip(self.path, "word/styles.xml"), self.logger)
+        num_tree = get_bs_from_zip(self.path, "word/numbering.xml")
         numbering_extractor = NumberingExtractor(num_tree, styles_extractor) if num_tree else None
         styles_extractor.numbering_extractor = numbering_extractor
 
@@ -49,8 +47,8 @@ def __get_paragraph_maker(self) -> ParagraphMaker:
             path_hash=calculate_file_hash(path=self.path),
             styles_extractor=styles_extractor,
             numbering_extractor=numbering_extractor,
-            footnote_extractor=FootnoteExtractor(self.__get_bs_tree("word/footnotes.xml")),
-            endnote_extractor=FootnoteExtractor(self.__get_bs_tree("word/endnotes.xml"), key="endnote")
+            footnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/footnotes.xml")),
+            endnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/endnotes.xml"), key="endnote")
         )
 
     def __get_lines(self) -> List[LineWithMeta]:
@@ -120,23 +118,6 @@ def __paragraphs2lines(self, image_refs: dict, table_refs: dict, diagram_refs: d
 
         return lines_with_meta
 
-    def __get_bs_tree(self, filename: str) -> Optional[BeautifulSoup]:
-        """
-        Gets xml bs tree from the given file inside the self.path.
-        :param filename: name of file to extract the tree
-        :return: BeautifulSoup tree or None if file wasn't found
-        """
-        try:
-            with zipfile.ZipFile(self.path) as document:
-                content = document.read(filename)
-                content = re.sub(br"\n[\t ]*", b"", content)
-                soup = BeautifulSoup(content, "xml")
-                return soup
-        except KeyError:
-            return None
-        except zipfile.BadZipFile:
-            raise BadFileFormatError(f"Bad docx file:\n file_name = {os.path.basename(self.path)}. Seems docx is broken")
-
     def __handle_table_xml(self, xml: Tag, table_refs: dict) -> None:
         table = DocxTable(xml, self.paragraph_maker)
         self.tables.append(table.to_table())
@@ -150,9 +131,9 @@ def __handle_table_xml(self, xml: Tag, table_refs: dict) -> None:
             table_refs[len(self.paragraph_list) - 1].append(table_uid)
 
     def __handle_images_xml(self, xmls: List[Tag], image_refs: dict) -> None:
-        rels = self.__get_bs_tree("word/_rels/document.xml.rels")
+        rels = get_bs_from_zip(self.path, "word/_rels/document.xml.rels")
         if rels is None:
-            rels = self.__get_bs_tree("word/_rels/document2.xml.rels")
+            rels = get_bs_from_zip(self.path, "word/_rels/document2.xml.rels")
 
         images_rels = dict()
         for rel in rels.find_all("Relationship"):

diff --git a/dedoc/readers/pptx_reader/numbering_extractor.py b/dedoc/readers/pptx_reader/numbering_extractor.py
@@ -0,0 +1,51 @@
+class NumberingExtractor:
+    """
+    This class is used to compute numbering text for items of automatically numbered lists.
+    For example: "1. ", "(b) ", "IV) "
+    """
+    def __init__(self) -> None:
+        # Mapping according to the ST_TextAutonumberScheme: numbering type -> the first symbol of the sequence
+        # NOTE we ignore chinese, japanese, hindi, thai
+        self.numbering_types = dict(
+            arabic="1",  # 1, 2, 3, ..., 10, 11, 12, ...
+            alphaLc="a",  # a, b, c, ..., y, z, aa, bb, cc, ..., yy, zz, aaa, bbb, ccc, ...
+            alphaUc="A",  # A, B, C, ..., Y, Z, AA, BB, CC, ..., YY, ZZ, AAA, BBB, CCC, ...
+            romanLc="i",  # i, ii, iii, iv, ..., xviii, xix, xx, xxi, ...
+            romanUc="I"  # I, II, III, IV, ..., XVIII, XIX, XX, XXI, ...
+        )
+
+        self.numbering_formatting = dict(
+            ParenBoth="({}) ",
+            ParenR="{}) ",
+            Period="{}. ",
+            Plain="{} "
+        )
+
+        self.combined_types = {
+            num_type + num_formatting: (num_type, num_formatting) for num_type in self.numbering_types for num_formatting in self.numbering_formatting
+        }
+        # Values go in descending order, subtractive pairs (e.g. 4 -> "iv") are included to get the standard roman notation
+        self.roman_mapping = [(1000, "m"), (900, "cm"), (500, "d"), (400, "cd"), (100, "c"), (90, "xc"), (50, "l"), (40, "xl"),
+                              (10, "x"), (9, "ix"), (5, "v"), (4, "iv"), (1, "i")]
+
+    def get_text(self, numbering: str, shift: int) -> str:
+        """
+        Computes the item of the list sequence.
+        :param numbering: type of the numbering, e.g. "arabicPeriod", unknown types are handled as "arabicPeriod"
+        :param shift: zero-based shift from the beginning of list numbering (0 corresponds to "1", "a", "i")
+        :return: string representation of the numbering item with a space at the end, e.g. "4. ", "(d) ", "iv) "
+        """
+        num_type, num_formatting = self.combined_types.get(numbering, ("arabic", "Period"))
+        if num_type in ("alphaLc", "alphaUc"):
+            shift1, shift2 = shift % 26, shift // 26 + 1  # the letter is repeated after each pass through the alphabet: z -> aa -> bb
+            num_char = chr(ord(self.numbering_types[num_type]) + shift1) * shift2
+        elif num_type in ("romanLc", "romanUc"):
+            num_char, value = "", shift + 1  # roman numerals have no zero, so the first item (shift=0) is "i"
+            for number, letters in self.roman_mapping:
+                cnt, value = divmod(value, number)
+                num_char += letters * cnt
+            num_char = num_char.upper() if num_type == "romanUc" else num_char
+        else:
+            num_char = str(int(self.numbering_types["arabic"]) + shift)
+
+        return self.numbering_formatting[num_formatting].format(num_char)
diff --git a/dedoc/readers/pptx_reader/paragraph.py b/dedoc/readers/pptx_reader/paragraph.py
@@ -0,0 +1,73 @@
+from bs4 import Tag
+
+from dedoc.data_structures import AlignmentAnnotation, BoldAnnotation, HierarchyLevel, ItalicAnnotation, LineMetadata, LineWithMeta, SizeAnnotation, \
+    StrikeAnnotation, SubscriptAnnotation, SuperscriptAnnotation, UnderlinedAnnotation
+from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor
+from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor
+from dedoc.utils.annotation_merger import AnnotationMerger
+
+
+class PptxParagraph:
+    """
+    This class corresponds to one textual paragraph of some entity, e.g. shape or table cell (tag <a:p>).
+    """
+    def __init__(self, xml: Tag, numbering_extractor: NumberingExtractor, properties_extractor: PropertiesExtractor) -> None:
+        """
+        :param xml: tag of the paragraph
+        :param numbering_extractor: extractor used to compute the numbering text for items of numbered lists
+        :param properties_extractor: extractor used to get properties of the paragraph and its runs
+        """
+        self.xml = xml
+        # type of the automatic numbering (tag buAutoNum), None if the paragraph doesn't contain this tag
+        self.numbered_list_type = self.xml.buAutoNum.get("type", "arabicPeriod") if self.xml.buAutoNum else None
+        # attribute "lvl" is increased by one, so the level is equal to 1 when the attribute or the whole pPr tag is absent
+        self.level = int(self.xml.pPr.get("lvl", 0)) + 1 if self.xml.pPr else 1
+        self.numbering_extractor = numbering_extractor
+        self.properties_extractor = properties_extractor
+        self.annotation_merger = AnnotationMerger()
+        # annotation name -> annotation class, run properties are read by these names in get_line_with_meta
+        annotations = [BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation]
+        self.dict2annotation = {annotation.name: annotation for annotation in annotations}
+
+    def get_line_with_meta(self, page_id: int, line_id: int, is_title: bool, shift: int = 0) -> LineWithMeta:
+        """
+        Converts the paragraph to the line with meta.
+        The text consists of the list numbering or the bullet symbol (for list items), texts of all runs (tags <a:r>) and a line break at the end.
+
+        :param page_id: page identifier for the line metadata
+        :param line_id: line identifier for the line metadata
+        :param is_title: if True, the line is a header regardless of the paragraph properties
+        :param shift: shift from the beginning of list numbering, it is used only for items of numbered lists
+        :return: line with hierarchy level (header, list item or raw text), merged annotations of runs and alignment annotation
+        """
+        text = ""
+        paragraph_properties = self.properties_extractor.get_properties(self.xml.pPr, level=self.level)
+        hierarchy_level = HierarchyLevel.create_raw_text()
+
+        if is_title or paragraph_properties.title:
+            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.header, level_1=1, level_2=self.level, can_be_multiline=False)
+        elif self.numbered_list_type:  # numbered list
+            text += self.numbering_extractor.get_text(self.numbered_list_type, shift)
+            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.list_item, level_1=2, level_2=self.level, can_be_multiline=False)
+        elif self.xml.buChar:  # bullet list
+            text += self.xml.buChar["char"] + " "
+            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.list_item, level_1=3, level_2=self.level, can_be_multiline=False)
+
+        annotations = []
+        if self.xml.r:
+            for run in self.xml.find_all("a:r"):
+                start = len(text)
+                # Only the own text of each "t" child is added (not the text of the whole run),
+                # so the text of other children can't get into the line and nothing is added twice
+                text += "".join(child.text for child in run if child.name == "t")
+
+                run_properties = self.properties_extractor.get_properties(run.rPr, level=self.level, properties=paragraph_properties)
+                annotations.append(SizeAnnotation(start=start, end=len(text), value=str(run_properties.size)))
+                for property_name, annotation_class in self.dict2annotation.items():
+                    if getattr(run_properties, property_name):
+                        annotations.append(annotation_class(start=start, end=len(text), value="True"))
+
+        text = f"{text}\n"
+        annotations = self.annotation_merger.merge_annotations(annotations, text)
+        annotations.append(AlignmentAnnotation(start=0, end=len(text), value=paragraph_properties.alignment))
+        return LineWithMeta(text, metadata=LineMetadata(page_id=page_id, line_id=line_id, tag_hierarchy_level=hierarchy_level), annotations=annotations)
diff --git a/dedoc/readers/pptx_reader/pptx_reader.py b/dedoc/readers/pptx_reader/pptx_reader.py
@@ -1,20 +1,20 @@
+import zipfile
 from typing import Dict, List, Optional
 
-from bs4 import BeautifulSoup
-from pptx import Presentation
-from pptx.shapes.graphfrm import GraphicFrame
-from pptx.shapes.picture import Picture
-from pptx.slide import Slide
+from bs4 import BeautifulSoup, Tag
 
 from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor
 from dedoc.data_structures import AttachAnnotation, Table, TableAnnotation
-from dedoc.data_structures.cell_with_meta import CellWithMeta
 from dedoc.data_structures.line_metadata import LineMetadata
 from dedoc.data_structures.line_with_meta import LineWithMeta
-from dedoc.data_structures.table_metadata import TableMetadata
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
 from dedoc.extensions import recognized_extensions, recognized_mimes
 from dedoc.readers.base_reader import BaseReader
+from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor
+from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor
+from dedoc.readers.pptx_reader.shape import PptxShape
+from dedoc.readers.pptx_reader.table import PptxTable
+from dedoc.utils.office_utils import get_bs_from_zip
 from dedoc.utils.parameter_utils import get_param_with_attachments
 
 
@@ -27,6 +27,7 @@ class PptxReader(BaseReader):
     def __init__(self, *, config: Optional[dict] = None) -> None:
         super().__init__(config=config, recognized_extensions=recognized_extensions.pptx_like_format, recognized_mimes=recognized_mimes.pptx_like_format)
         self.attachments_extractor = PptxAttachmentsExtractor(config=self.config)
+        self.numbering_extractor = NumberingExtractor()
 
     def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
         """
@@ -36,55 +37,73 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
         with_attachments = get_param_with_attachments(parameters)
         attachments = self.attachments_extractor.extract(file_path=file_path, parameters=parameters) if with_attachments else []
         attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments}
-
-        prs = Presentation(file_path)
-        lines, tables = [], []
-
-        for page_id, slide in enumerate(prs.slides, start=1):
-            images_rels = self.__get_slide_images_rels(slide)
-
-            for paragraph_id, shape in enumerate(slide.shapes, start=1):
-
-                if shape.has_text_frame:
-                    lines.append(LineWithMeta(line=f"{shape.text}\n", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id)))
-
-                if shape.has_table:
-                    self.__add_table(lines, tables, page_id, paragraph_id, shape)
-
-                if with_attachments and hasattr(shape, "image"):
+        images_rels = self.__get_slide_images_rels(file_path)
+        properties_extractor = PropertiesExtractor(file_path)
+
+        slide_xml_list = self.__get_slides_bs(file_path, xml_prefix="ppt/slides/slide", xml_postfix=".xml")
+        lines = []
+        tables = []
+
+        for slide_id, slide_xml in enumerate(slide_xml_list):
+            shape_tree_xml = slide_xml.spTree
+
+            is_first_shape = True
+            for tag in shape_tree_xml:
+                if tag.name == "sp":
+                    if not tag.txBody:
+                        continue
+
+                    shape = PptxShape(tag, page_id=slide_id, init_line_id=len(lines), numbering_extractor=self.numbering_extractor,
+                                      properties_extractor=properties_extractor, is_title=is_first_shape)
+                    shape_lines = shape.get_lines()
+                    lines.extend(shape_lines)
+                    if is_first_shape and len(shape_lines) > 0:
+                        is_first_shape = False
+
+                elif tag.tbl:
+                    self.__add_table(lines=lines, tables=tables, page_id=slide_id, table_xml=tag.tbl, properties_extractor=properties_extractor)
+                elif tag.name == "pic" and tag.blip:
                     if len(lines) == 0:
-                        lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id)))
-                    self.__add_attach_annotation(lines[-1], shape, attachment_name2uid, images_rels)
+                        lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=slide_id, line_id=0)))
+                    image_rel_id = str(slide_id) + tag.blip.get("r:embed", "")
+                    self.__add_attach_annotation(lines[-1], image_rel_id, attachment_name2uid, images_rels)
 
         return UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=[])
 
-    def __add_table(self, lines: List[LineWithMeta], tables: List[Table], page_id: int, paragraph_id: int, shape: GraphicFrame) -> None:
-        cells = [
-            [CellWithMeta(lines=[LineWithMeta(line=cell.text, metadata=LineMetadata(page_id=page_id, line_id=0))]) for cell in row.cells]
-            for row in shape.table.rows
-        ]
-        table = Table(cells=cells, metadata=TableMetadata(page_id=page_id))
+    def __get_slides_bs(self, path: str, xml_prefix: str, xml_postfix: str) -> List[BeautifulSoup]:
+        # archive members named <xml_prefix><number><xml_postfix> are parsed in ascending order of <number>
+        with zipfile.ZipFile(path) as archive:
+            names = [name for name in archive.namelist() if name.startswith(xml_prefix) and name.endswith(xml_postfix)]
+        names.sort(key=lambda name: int(name[len(xml_prefix):-len(xml_postfix)]))
+        # the numeric key is used because the lexicographic order would put "slide10" before "slide2"
+        return [get_bs_from_zip(path, name, remove_spaces=True) for name in names]
 
-        if len(lines) == 0:
-            lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id)))
-        lines[-1].annotations.append(TableAnnotation(start=0, end=len(lines[-1]), name=table.metadata.uid))
-        tables.append(table)
-
-    def __get_slide_images_rels(self, slide: Slide) -> Dict[str, str]:
-        rels = BeautifulSoup(slide.part.rels.xml, "xml")
+    def __get_slide_images_rels(self, path: str) -> Dict[str, str]:
+        """
+        return mapping: {position of the slide rels file in the sorted list + Id of the relationship -> image name without the "../media/" prefix}
+        """
+        rels_xml_list = self.__get_slides_bs(path, xml_prefix="ppt/slides/_rels/slide", xml_postfix=".xml.rels")
         images_dir = "../media/"
 
         images_rels = dict()
-        for rel in rels.find_all("Relationship"):
-            if rel["Target"].startswith(images_dir):
-                images_rels[rel["Id"]] = rel["Target"][len(images_dir):]
+        for slide_id, rels_xml in enumerate(rels_xml_list):  # NOTE(review): read() looks keys up by the index in the slides list - assumes each slide has a rels file, confirm
+            for rel in rels_xml.find_all("Relationship"):
+                if rel["Target"].startswith(images_dir):
+                    images_rels[str(slide_id) + rel["Id"]] = rel["Target"][len(images_dir):]
 
         return images_rels
 
-    def __add_attach_annotation(self, line: LineWithMeta, shape: Picture, attachment_name2uid: dict, images_rels: dict) -> None:
+    def __add_table(self, lines: List[LineWithMeta], tables: List[Table], page_id: int, table_xml: Tag, properties_extractor: PropertiesExtractor) -> None:
+        # the table is referenced from the last read line, an empty line is created if no lines were read before the table
+        new_table = PptxTable(table_xml, page_id, self.numbering_extractor, properties_extractor).to_table()
+        if not lines:
+            lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=0)))
+        lines[-1].annotations.append(TableAnnotation(start=0, end=len(lines[-1]), name=new_table.metadata.uid))
+        tables.append(new_table)
+
+    def __add_attach_annotation(self, line: LineWithMeta, image_rel_id: str, attachment_name2uid: dict, images_rels: dict) -> None:
         try:
-            image_rels_id = shape.element.blip_rId
-            image_name = images_rels[image_rels_id]
+            image_name = images_rels[image_rel_id]
             image_uid = attachment_name2uid[image_name]
             line.annotations.append(AttachAnnotation(start=0, end=len(line), attach_uid=image_uid))
         except KeyError as e: