Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLDR-669 pptx fixes #448

Merged
merged 9 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 10 additions & 29 deletions dedoc/readers/docx_reader/data_structures/docx_document.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
import hashlib
import logging
import os
import re
import zipfile
from collections import defaultdict
from typing import List, Optional
from typing import List

from bs4 import BeautifulSoup, Tag

from dedoc.common.exceptions.bad_file_error import BadFileFormatError
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
Expand All @@ -19,6 +16,7 @@
from dedoc.readers.docx_reader.line_with_meta_converter import LineWithMetaConverter
from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
from dedoc.readers.docx_reader.styles_extractor import StylesExtractor
from dedoc.utils.office_utils import get_bs_from_zip
from dedoc.utils.utils import calculate_file_hash


Expand All @@ -28,8 +26,8 @@ def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.L
self.path = path
self.attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments}

self.document_bs_tree = self.__get_bs_tree("word/document.xml")
self.document_bs_tree = self.__get_bs_tree("word/document2.xml") if self.document_bs_tree is None else self.document_bs_tree
self.document_bs_tree = get_bs_from_zip(self.path, "word/document.xml")
self.document_bs_tree = get_bs_from_zip(self.path, "word/document2.xml") if self.document_bs_tree is None else self.document_bs_tree
self.body = self.document_bs_tree.body if self.document_bs_tree else None
self.paragraph_maker = self.__get_paragraph_maker()

Expand All @@ -39,8 +37,8 @@ def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.L
self.lines = self.__get_lines()

def __get_paragraph_maker(self) -> ParagraphMaker:
styles_extractor = StylesExtractor(self.__get_bs_tree("word/styles.xml"), self.logger)
num_tree = self.__get_bs_tree("word/numbering.xml")
styles_extractor = StylesExtractor(get_bs_from_zip(self.path, "word/styles.xml"), self.logger)
num_tree = get_bs_from_zip(self.path, "word/numbering.xml")
numbering_extractor = NumberingExtractor(num_tree, styles_extractor) if num_tree else None
styles_extractor.numbering_extractor = numbering_extractor

Expand All @@ -49,8 +47,8 @@ def __get_paragraph_maker(self) -> ParagraphMaker:
path_hash=calculate_file_hash(path=self.path),
styles_extractor=styles_extractor,
numbering_extractor=numbering_extractor,
footnote_extractor=FootnoteExtractor(self.__get_bs_tree("word/footnotes.xml")),
endnote_extractor=FootnoteExtractor(self.__get_bs_tree("word/endnotes.xml"), key="endnote")
footnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/footnotes.xml")),
endnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/endnotes.xml"), key="endnote")
)

def __get_lines(self) -> List[LineWithMeta]:
Expand Down Expand Up @@ -120,23 +118,6 @@ def __paragraphs2lines(self, image_refs: dict, table_refs: dict, diagram_refs: d

return lines_with_meta

def __get_bs_tree(self, filename: str) -> Optional[BeautifulSoup]:
    """
    Parse one xml member of the archive located at self.path.

    :param filename: name of the archive member to read, e.g. "word/document.xml"
    :return: BeautifulSoup tree of the member, None if the archive has no such member
    :raises BadFileFormatError: if the file at self.path is not a valid zip archive
    """
    try:
        with zipfile.ZipFile(self.path) as archive:
            raw_xml = archive.read(filename)
            # drop line breaks together with the indentation that follows them before parsing
            compact_xml = re.sub(br"\n[\t ]*", b"", raw_xml)
            return BeautifulSoup(compact_xml, "xml")
    except KeyError:
        # ZipFile.read raises KeyError when the requested member is absent
        return None
    except zipfile.BadZipFile:
        raise BadFileFormatError(f"Bad docx file:\n file_name = {os.path.basename(self.path)}. Seems docx is broken")

def __handle_table_xml(self, xml: Tag, table_refs: dict) -> None:
table = DocxTable(xml, self.paragraph_maker)
self.tables.append(table.to_table())
Expand All @@ -150,9 +131,9 @@ def __handle_table_xml(self, xml: Tag, table_refs: dict) -> None:
table_refs[len(self.paragraph_list) - 1].append(table_uid)

def __handle_images_xml(self, xmls: List[Tag], image_refs: dict) -> None:
rels = self.__get_bs_tree("word/_rels/document.xml.rels")
rels = get_bs_from_zip(self.path, "word/_rels/document.xml.rels")
if rels is None:
rels = self.__get_bs_tree("word/_rels/document2.xml.rels")
rels = get_bs_from_zip(self.path, "word/_rels/document2.xml.rels")

images_rels = dict()
for rel in rels.find_all("Relationship"):
Expand Down
51 changes: 51 additions & 0 deletions dedoc/readers/pptx_reader/numbering_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
class NumberingExtractor:
    """
    This class is used to compute numbering text for list items.
    For example: "1.", (i), "○"
    """
    def __init__(self) -> None:
        # Mapping according to the ST_TextAutonumberScheme
        # NOTE we ignore chinese, japanese, hindi, thai
        self.numbering_types = dict(
            arabic="1",  # 1, 2, 3, ..., 10, 11, 12, ...
            alphaLc="a",  # a, b, c, ..., y, z, aa, bb, cc, ..., yy, zz, aaa, bbb, ccc, ...
            alphaUc="A",  # A, B, C, ..., Y, Z, AA, BB, CC, ..., YY, ZZ, AAA, BBB, CCC, ...
            romanLc="i",  # i, ii, iii, iv, ..., xviii, xix, xx, xxi, ...
            romanUc="I"  # I, II, III, IV, ..., XVIII, XIX, XX, XXI, ...
        )

        self.numbering_formatting = dict(
            ParenBoth="({}) ",
            ParenR="{}) ",
            Period="{}. ",
            Plain="{} "
        )

        # e.g. "arabicPeriod" -> ("arabic", "Period")
        self.combined_types = {
            num_type + num_formatting: (num_type, num_formatting) for num_type in self.numbering_types for num_formatting in self.numbering_formatting
        }
        # Descending (value, letters) pairs; the subtractive pairs (cm, cd, xc, xl, ix, iv) are required
        # to obtain "iv" instead of "iiii", "ix" instead of "viiii" and so on
        self.roman_mapping = [
            (1000, "m"), (900, "cm"), (500, "d"), (400, "cd"), (100, "c"), (90, "xc"),
            (50, "l"), (40, "xl"), (10, "x"), (9, "ix"), (5, "v"), (4, "iv"), (1, "i")
        ]

    def get_text(self, numbering: str, shift: int) -> str:
        """
        Computes the next item of the list sequence.

        :param numbering: type of the numbering, e.g. "arabicPeriod"; unknown types are treated as "arabicPeriod"
        :param shift: zero-based shift from the beginning of list numbering (0 gives "1", "a", "i" etc.)
        :return: string representation of the next numbering item (with formatting and trailing space)
        """
        num_type, num_formatting = self.combined_types.get(numbering, ("arabic", "Period"))

        if num_type in ("alphaLc", "alphaUc"):
            # after "z" the letter is repeated: aa, bb, ..., zz, aaa, ...
            letter_index, repeat = shift % 26, shift // 26 + 1
            num_char = chr(ord(self.numbering_types[num_type]) + letter_index) * repeat
        elif num_type in ("romanLc", "romanUc"):
            # shift is zero-based while roman numerals start from one
            num_char = self._to_roman(shift + 1)
            if num_type == "romanUc":
                num_char = num_char.upper()
        else:
            num_char = str(int(self.numbering_types["arabic"]) + shift)

        return self.numbering_formatting[num_formatting].format(num_char)

    def _to_roman(self, value: int) -> str:
        """
        Converts a positive integer to a lowercase roman numeral.

        :param value: number to convert
        :return: lowercase roman numeral, empty string for non-positive values
        """
        if value < 1:
            return ""

        parts = []
        for number, letters in self.roman_mapping:
            count, value = divmod(value, number)
            parts.append(letters * count)
        return "".join(parts)
55 changes: 55 additions & 0 deletions dedoc/readers/pptx_reader/paragraph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from bs4 import Tag

from dedoc.data_structures import AlignmentAnnotation, BoldAnnotation, HierarchyLevel, ItalicAnnotation, LineMetadata, LineWithMeta, SizeAnnotation, \
StrikeAnnotation, SubscriptAnnotation, SuperscriptAnnotation, UnderlinedAnnotation
from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor
from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor
from dedoc.utils.annotation_merger import AnnotationMerger


class PptxParagraph:
    """
    One textual paragraph (tag <a:p>) that belongs to some entity, e.g. to a shape or to a table cell.
    """
    def __init__(self, xml: Tag, numbering_extractor: NumberingExtractor, properties_extractor: PropertiesExtractor) -> None:
        """
        :param xml: tree of the paragraph
        :param numbering_extractor: helper that renders the numbering text of numbered list items
        :param properties_extractor: helper that computes paragraph and run properties
        """
        self.xml = xml

        # numbering scheme name if the paragraph is an auto-numbered list item, None otherwise
        auto_numbering = self.xml.buAutoNum
        self.numbered_list_type = auto_numbering.get("type", "arabicPeriod") if auto_numbering else None

        # the "lvl" attribute is shifted by one, paragraphs without properties get level 1
        paragraph_pr = self.xml.pPr
        if paragraph_pr:
            self.level = int(paragraph_pr.get("lvl", 0)) + 1
        else:
            self.level = 1

        self.numbering_extractor = numbering_extractor
        self.properties_extractor = properties_extractor
        self.annotation_merger = AnnotationMerger()

        # annotation name -> annotation class, the names are looked up as attributes of the run properties
        self.dict2annotation = {
            annotation_cls.name: annotation_cls
            for annotation_cls in (BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation)
        }

    def get_line_with_meta(self, page_id: int, line_id: int, is_title: bool, shift: int = 0) -> LineWithMeta:
        """
        Build a line with metadata and annotations from the paragraph.

        :param page_id: page identifier stored in the line metadata
        :param line_id: line identifier stored in the line metadata
        :param is_title: if True, the line is marked as a header regardless of the paragraph properties
        :param shift: shift from the beginning of the numbered list, it is used only for numbered list items
        :return: line whose text ends with a line break
        """
        paragraph_properties = self.properties_extractor.get_properties(self.xml.pPr, level=self.level)
        hierarchy_level = HierarchyLevel.create_raw_text()
        text = ""

        if is_title or paragraph_properties.title:
            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.header, level_1=1, level_2=self.level, can_be_multiline=False)
        elif self.numbered_list_type:
            # numbered list item: the line starts with the computed numbering
            text = self.numbering_extractor.get_text(self.numbered_list_type, shift)
            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.list_item, level_1=2, level_2=self.level, can_be_multiline=False)
        elif self.xml.buChar:
            # bullet list item: the line starts with the bullet character
            text = self.xml.buChar["char"] + " "
            hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.list_item, level_1=3, level_2=self.level, can_be_multiline=False)

        annotations = []
        runs = self.xml.find_all("a:r") if self.xml.r else []
        for run in runs:
            start = len(text)
            # the whole run text is appended once per child named "t"
            text_children_number = sum(1 for child in run if child.name == "t")
            text += run.text * text_children_number
            end = len(text)

            run_properties = self.properties_extractor.get_properties(run.rPr, level=self.level, properties=paragraph_properties)
            annotations.append(SizeAnnotation(start=start, end=end, value=str(run_properties.size)))
            annotations.extend(
                annotation_cls(start=start, end=end, value="True")
                for property_name, annotation_cls in self.dict2annotation.items()
                if getattr(run_properties, property_name)
            )

        text = f"{text}\n"
        merged_annotations = self.annotation_merger.merge_annotations(annotations, text)
        merged_annotations.append(AlignmentAnnotation(start=0, end=len(text), value=paragraph_properties.alignment))
        metadata = LineMetadata(page_id=page_id, line_id=line_id, tag_hierarchy_level=hierarchy_level)
        return LineWithMeta(text, metadata=metadata, annotations=merged_annotations)
107 changes: 63 additions & 44 deletions dedoc/readers/pptx_reader/pptx_reader.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
import zipfile
from typing import Dict, List, Optional

from bs4 import BeautifulSoup
from pptx import Presentation
from pptx.shapes.graphfrm import GraphicFrame
from pptx.shapes.picture import Picture
from pptx.slide import Slide
from bs4 import BeautifulSoup, Tag

from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor
from dedoc.data_structures import AttachAnnotation, Table, TableAnnotation
from dedoc.data_structures.cell_with_meta import CellWithMeta
from dedoc.data_structures.line_metadata import LineMetadata
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.table_metadata import TableMetadata
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.extensions import recognized_extensions, recognized_mimes
from dedoc.readers.base_reader import BaseReader
from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor
from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor
from dedoc.readers.pptx_reader.shape import PptxShape
from dedoc.readers.pptx_reader.table import PptxTable
from dedoc.utils.office_utils import get_bs_from_zip
from dedoc.utils.parameter_utils import get_param_with_attachments


Expand All @@ -27,6 +27,7 @@ class PptxReader(BaseReader):
def __init__(self, *, config: Optional[dict] = None) -> None:
    """
    Register pptx-like extensions and mime types in the base reader and create the helper extractors.

    :param config: optional configuration passed to the base reader
    """
    super().__init__(
        config=config,
        recognized_extensions=recognized_extensions.pptx_like_format,
        recognized_mimes=recognized_mimes.pptx_like_format,
    )
    # the attachments extractor gets self.config rather than the raw `config` argument
    self.attachments_extractor = PptxAttachmentsExtractor(config=self.config)
    self.numbering_extractor = NumberingExtractor()

def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
"""
Expand All @@ -36,55 +37,73 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
with_attachments = get_param_with_attachments(parameters)
attachments = self.attachments_extractor.extract(file_path=file_path, parameters=parameters) if with_attachments else []
attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments}

prs = Presentation(file_path)
lines, tables = [], []

for page_id, slide in enumerate(prs.slides, start=1):
images_rels = self.__get_slide_images_rels(slide)

for paragraph_id, shape in enumerate(slide.shapes, start=1):

if shape.has_text_frame:
lines.append(LineWithMeta(line=f"{shape.text}\n", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id)))

if shape.has_table:
self.__add_table(lines, tables, page_id, paragraph_id, shape)

if with_attachments and hasattr(shape, "image"):
images_rels = self.__get_slide_images_rels(file_path)
properties_extractor = PropertiesExtractor(file_path)

slide_xml_list = self.__get_slides_bs(file_path, xml_prefix="ppt/slides/slide", xml_postfix=".xml")
lines = []
tables = []

for slide_id, slide_xml in enumerate(slide_xml_list):
shape_tree_xml = slide_xml.spTree

is_first_shape = True
for tag in shape_tree_xml:
if tag.name == "sp":
if not tag.txBody:
continue

shape = PptxShape(tag, page_id=slide_id, init_line_id=len(lines), numbering_extractor=self.numbering_extractor,
properties_extractor=properties_extractor, is_title=is_first_shape)
shape_lines = shape.get_lines()
lines.extend(shape_lines)
if is_first_shape and len(shape_lines) > 0:
is_first_shape = False

elif tag.tbl:
self.__add_table(lines=lines, tables=tables, page_id=slide_id, table_xml=tag.tbl, properties_extractor=properties_extractor)
elif tag.name == "pic" and tag.blip:
if len(lines) == 0:
lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id)))
self.__add_attach_annotation(lines[-1], shape, attachment_name2uid, images_rels)
lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=slide_id, line_id=0)))
image_rel_id = str(slide_id) + tag.blip.get("r:embed", "")
self.__add_attach_annotation(lines[-1], image_rel_id, attachment_name2uid, images_rels)

return UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=[])

def __add_table(self, lines: List[LineWithMeta], tables: List[Table], page_id: int, paragraph_id: int, shape: GraphicFrame) -> None:
cells = [
[CellWithMeta(lines=[LineWithMeta(line=cell.text, metadata=LineMetadata(page_id=page_id, line_id=0))]) for cell in row.cells]
for row in shape.table.rows
]
table = Table(cells=cells, metadata=TableMetadata(page_id=page_id))
def __get_slides_bs(self, path: str, xml_prefix: str, xml_postfix: str) -> List[BeautifulSoup]:
    """
    Parse every archive member named <xml_prefix><number><xml_postfix> and return the trees ordered by the number.

    :param path: path to the zip archive (presentation file)
    :param xml_prefix: beginning of the member names to select, e.g. "ppt/slides/slide"
    :param xml_postfix: ending of the member names to select, e.g. ".xml"
    :return: list of parsed trees sorted by the integer between the prefix and the postfix
    """
    def member_number(member_name: str) -> int:
        # numeric sort key, so that e.g. "slide10.xml" goes after "slide2.xml"
        return int(member_name[len(xml_prefix):-len(xml_postfix)])

    with zipfile.ZipFile(path) as archive:
        matching_names = [name for name in archive.namelist() if name.startswith(xml_prefix) and name.endswith(xml_postfix)]

    return [get_bs_from_zip(path, name, remove_spaces=True) for name in sorted(matching_names, key=member_number)]

if len(lines) == 0:
lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=paragraph_id)))
lines[-1].annotations.append(TableAnnotation(start=0, end=len(lines[-1]), name=table.metadata.uid))
tables.append(table)

def __get_slide_images_rels(self, slide: Slide) -> Dict[str, str]:
rels = BeautifulSoup(slide.part.rels.xml, "xml")
def __get_slide_images_rels(self, path: str) -> Dict[str, str]:
"""
return mapping: {image Id -> image name}
"""
rels_xml_list = self.__get_slides_bs(path, xml_prefix="ppt/slides/_rels/slide", xml_postfix=".xml.rels")
images_dir = "../media/"

images_rels = dict()
for rel in rels.find_all("Relationship"):
if rel["Target"].startswith(images_dir):
images_rels[rel["Id"]] = rel["Target"][len(images_dir):]
for slide_id, rels_xml in enumerate(rels_xml_list):
for rel in rels_xml.find_all("Relationship"):
if rel["Target"].startswith(images_dir):
images_rels[str(slide_id) + rel["Id"]] = rel["Target"][len(images_dir):]

return images_rels

def __add_attach_annotation(self, line: LineWithMeta, shape: Picture, attachment_name2uid: dict, images_rels: dict) -> None:
def __add_table(self, lines: List[LineWithMeta], tables: List[Table], page_id: int, table_xml: Tag, properties_extractor: PropertiesExtractor) -> None:
    """
    Convert the table xml into a table, store it and reference it from the last line of the document.

    :param lines: lines extracted so far, the last one receives the table annotation (the list is modified in place)
    :param tables: tables extracted so far, the new table is appended to this list
    :param page_id: page identifier passed to the table and to the placeholder line
    :param table_xml: tree of the table
    :param properties_extractor: helper passed to the table for computing properties
    """
    parsed_table = PptxTable(table_xml, page_id, self.numbering_extractor, properties_extractor).to_table()

    if not lines:
        # there is no line to annotate yet, so an empty placeholder line is added
        lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=0)))

    anchor_line = lines[-1]
    anchor_line.annotations.append(TableAnnotation(start=0, end=len(anchor_line), name=parsed_table.metadata.uid))
    tables.append(parsed_table)

def __add_attach_annotation(self, line: LineWithMeta, image_rel_id: str, attachment_name2uid: dict, images_rels: dict) -> None:
try:
image_rels_id = shape.element.blip_rId
image_name = images_rels[image_rels_id]
image_name = images_rels[image_rel_id]
image_uid = attachment_name2uid[image_name]
line.annotations.append(AttachAnnotation(start=0, end=len(line), attach_uid=image_uid))
except KeyError as e:
Expand Down
Loading
Loading