Skip to content

Commit

Permalink
Fix tests and add new one
Browse files: browse the repository at this point in the history
  • Loading branch information
NastyBoget committed May 29, 2024
1 parent 50d1e18 commit 81a6801
Show file tree
Hide file tree
Showing 4 changed files with 141 additions and 4 deletions.
5 changes: 5 additions & 0 deletions dedoc/readers/pptx_reader/shape.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,18 @@ def get_lines(self) -> List[LineWithMeta]:

lines = []
numbering2shift = defaultdict(int)
prev_list_level = None

for line_id, paragraph_xml in enumerate(self.xml.find_all("a:p")):
paragraph = PptxParagraph(paragraph_xml, self.numbering_extractor, self.properties_extractor)

if paragraph.numbered_list_type:
if prev_list_level and paragraph.level > prev_list_level:
numbering2shift[(paragraph.numbered_list_type, paragraph.level)] = 0

shift = numbering2shift[(paragraph.numbered_list_type, paragraph.level)]
numbering2shift[(paragraph.numbered_list_type, paragraph.level)] += 1
prev_list_level = paragraph.level
else:
shift = 0

Expand Down
4 changes: 2 additions & 2 deletions dedoc/utils/office_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@

from dedoc.common.exceptions.bad_file_error import BadFileFormatError

spaces_regexp = re.compile(br"\n[\t ]*")


def get_bs_from_zip(zip_path: str, xml_path: str) -> Optional[BeautifulSoup]:
"""
Expand All @@ -23,6 +21,8 @@ def get_bs_from_zip(zip_path: str, xml_path: str) -> Optional[BeautifulSoup]:
with zipfile.ZipFile(zip_path) as document:
content = document.read(xml_path)
content = re.sub(br"\n[\t ]*", b"", content)
# remove spaces between tags, don't remove spaces inside text fields, e.g. <a:t> </a:t> for pptx
content = re.sub(br"(?<!<\w:t)>\s+<", b"><", content)
soup = BeautifulSoup(content, "xml")
return soup
except KeyError:
Expand Down
136 changes: 134 additions & 2 deletions tests/api_tests/test_api_format_pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,138 @@ def test_odp(self) -> None:
result = self._send_request(file_name, data=dict(structure_type="linear"))
self.__check_content(result["content"])

def test_structure_and_annotations(self) -> None:
# End-to-end check of the parsed "test-presentation.pptx": sends the file with
# with_attachments="True" and verifies the returned structure tree (headers, lists),
# text annotations, the single table with merged cells, and the attachments.
# NOTE(review): _send_request and _get_by_tree_path are defined outside this view;
# the dotted strings such as "0.2.1" are presumably child-index paths into the
# structure tree - confirm against the helper.
file_name = "test-presentation.pptx"
result = self._send_request(file_name, data=dict(with_attachments="True"))
structure = result["content"]["structure"]

# Test headers
# Node "0.0" must be a header "Title\n" with exactly one size annotation (50.0)
# and exactly one alignment annotation ("center").
node = self._get_by_tree_path(structure, "0.0")
self.assertEqual("Title\n", node["text"])
self.assertEqual("header", node["metadata"]["paragraph_type"])
annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "size"]
self.assertEqual(1, len(annotations))
self.assertEqual(50.0, float(annotations[0]["value"]))
annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "alignment"]
self.assertEqual(1, len(annotations))
self.assertEqual("center", annotations[0]["value"])
# Node "0.2" is a second header with the same text; only its text and type are checked.
node = self._get_by_tree_path(structure, "0.2")
self.assertEqual("Title\n", node["text"])
self.assertEqual("header", node["metadata"]["paragraph_type"])

# Test lists
# Numbered list under header "0.2": items "1."-"3." with nested lettered sub-lists.
# Both nested sub-lists are expected to start at "a." (under item 2 and again under item 3),
# and the last sub-item has a marker but no text ("a. \n").
self.assertEqual("list", self._get_by_tree_path(structure, "0.2.1")["metadata"]["paragraph_type"])
self.assertEqual("1. first item\n", self._get_by_tree_path(structure, "0.2.1.0")["text"])
self.assertEqual("2. second item\n", self._get_by_tree_path(structure, "0.2.1.1")["text"])
self.assertEqual("list", self._get_by_tree_path(structure, "0.2.1.1.0")["metadata"]["paragraph_type"])
self.assertEqual("a. subitem\n", self._get_by_tree_path(structure, "0.2.1.1.0.0")["text"])
self.assertEqual("3. third item\n", self._get_by_tree_path(structure, "0.2.1.2")["text"])
self.assertEqual("list", self._get_by_tree_path(structure, "0.2.1.2.0")["metadata"]["paragraph_type"])
self.assertEqual("a. \n", self._get_by_tree_path(structure, "0.2.1.2.0.0")["text"])

# Lists under node "0.3": a bullet list ("0.3.0") and an upper-case letter list ("0.3.1"),
# each with one nested level of bullet sub-items.
self.assertEqual("❏ first bullet item\n", self._get_by_tree_path(structure, "0.3.0.0")["text"])
self.assertEqual("❏ second bullet item\n", self._get_by_tree_path(structure, "0.3.0.1")["text"])
self.assertEqual("❏ subitem\n", self._get_by_tree_path(structure, "0.3.0.1.0.0")["text"])
self.assertEqual("A. first letter item\n", self._get_by_tree_path(structure, "0.3.1.0")["text"])
self.assertEqual("B. second letter item\n", self._get_by_tree_path(structure, "0.3.1.1")["text"])
self.assertEqual("○ first subitem\n", self._get_by_tree_path(structure, "0.3.1.1.0.0")["text"])
self.assertEqual("○ second subitem\n", self._get_by_tree_path(structure, "0.3.1.1.0.1")["text"])

# Test annotations
# Header "0.5": first size annotation is 30.0, first bold annotation is "True",
# first alignment annotation is "center".
# NOTE(review): the lists are indexed with [0] without a length assertion, so a missing
# annotation surfaces as IndexError rather than as an assertion failure.
node = self._get_by_tree_path(structure, "0.5")
self.assertEqual("Custom title\n", node["text"])
self.assertEqual("header", node["metadata"]["paragraph_type"])
annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "size"]
self.assertEqual(30.0, float(annotations[0]["value"]))
annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "bold"]
self.assertEqual("True", annotations[0]["value"])
annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "alignment"]
self.assertEqual("center", annotations[0]["value"])

# Node "0.5.0": the distinct size values must be exactly {18.0, 24.0, 10.0}, and each
# style annotation must be present with the exact start/end range listed below
# (start/end are presumably character offsets into the node text - confirm).
node = self._get_by_tree_path(structure, "0.5.0")
annotations = {float(annotation["value"]) for annotation in node["annotations"] if annotation["name"] == "size"}
self.assertSetEqual({18.0, 24.0, 10.0}, annotations)
self.assertIn({"start": 18, "end": 27, "name": "bold", "value": "True"}, node["annotations"])
self.assertIn({"start": 28, "end": 39, "name": "italic", "value": "True"}, node["annotations"])
self.assertIn({"start": 40, "end": 55, "name": "underlined", "value": "True"}, node["annotations"])
self.assertIn({"start": 56, "end": 67, "name": "strike", "value": "True"}, node["annotations"])
self.assertIn({"start": 68, "end": 79, "name": "superscript", "value": "True"}, node["annotations"])
self.assertIn({"start": 81, "end": 90, "name": "subscript", "value": "True"}, node["annotations"])

# Node "0.6": bold, italic, underlined and size cover the range 0-12, while the
# alignment annotation covers 0-13.
node = self._get_by_tree_path(structure, "0.6")
self.assertIn({"start": 0, "end": 12, "name": "bold", "value": "True"}, node["annotations"])
self.assertIn({"start": 0, "end": 12, "name": "italic", "value": "True"}, node["annotations"])
self.assertIn({"start": 0, "end": 12, "name": "underlined", "value": "True"}, node["annotations"])
self.assertIn({"start": 0, "end": 12, "name": "size", "value": "20.0"}, node["annotations"])
self.assertIn({"start": 0, "end": 13, "name": "alignment", "value": "right"}, node["annotations"])

# Test tables
# Exactly one table is expected; node "0.4" must carry a "table" annotation whose value
# equals the table's uid, and every row must have as many cells as the first row.
tables = result["content"]["tables"]
self.assertEqual(1, len(tables))
table = tables[0]
node = self._get_by_tree_path(structure, "0.4")
annotations = [annotation for annotation in node["annotations"] if annotation["name"] == "table"]
self.assertEqual(table["metadata"]["uid"], annotations[0]["value"])
column_number = len(table["cells"][0])
for table_row in table["cells"]:
self.assertEqual(column_number, len(table_row))

# Merged cells: the first cell of a merged group carries the rowspan/colspan and is visible;
# the following covered cell repeats the same text with rowspan == colspan == 1 and
# invisible == True.
# Horizontal merge in row 0: cells [0][0] (colspan 2) and [0][1].
cell = table["cells"][0][0]
self.assertEqual("Horizontally merged cells\n", cell["lines"][0]["text"])
self.assertEqual(1, cell["rowspan"])
self.assertEqual(2, cell["colspan"])
self.assertEqual(False, cell["invisible"])
cell = table["cells"][0][1]
self.assertEqual("Horizontally merged cells\n", cell["lines"][0]["text"])
self.assertEqual(1, cell["rowspan"])
self.assertEqual(1, cell["colspan"])
self.assertEqual(True, cell["invisible"])

# Vertical merge in column 2: cells [1][2] (rowspan 2) and [2][2].
cell = table["cells"][1][2]
self.assertEqual("Vertically merged cells\n", cell["lines"][0]["text"])
self.assertEqual(2, cell["rowspan"])
self.assertEqual(1, cell["colspan"])
self.assertEqual(False, cell["invisible"])
cell = table["cells"][2][2]
self.assertEqual("Vertically merged cells\n", cell["lines"][0]["text"])
self.assertEqual(1, cell["rowspan"])
self.assertEqual(1, cell["colspan"])
self.assertEqual(True, cell["invisible"])

# Vertical merge in column 0: cells [2][0] (rowspan 2) and [3][0].
cell = table["cells"][2][0]
self.assertEqual("Vertically merged cells 2\n", cell["lines"][0]["text"])
self.assertEqual(2, cell["rowspan"])
self.assertEqual(1, cell["colspan"])
self.assertEqual(False, cell["invisible"])
cell = table["cells"][3][0]
self.assertEqual("Vertically merged cells 2\n", cell["lines"][0]["text"])
self.assertEqual(1, cell["rowspan"])
self.assertEqual(1, cell["colspan"])
self.assertEqual(True, cell["invisible"])

# Horizontal merge in row 3: cell [3][2] (colspan 3); only the first covered cell [3][3]
# is checked here.
cell = table["cells"][3][2]
self.assertEqual("Horizontally merged cells 2\n", cell["lines"][0]["text"])
self.assertEqual(1, cell["rowspan"])
self.assertEqual(3, cell["colspan"])
self.assertEqual(False, cell["invisible"])
cell = table["cells"][3][3]
self.assertEqual("Horizontally merged cells 2\n", cell["lines"][0]["text"])
self.assertEqual(1, cell["rowspan"])
self.assertEqual(1, cell["colspan"])
self.assertEqual(True, cell["invisible"])

# Test attachments
# Three attachments are expected; the "attachment" annotation values on nodes "0.6"
# (first two values) and "0.8.0" (first value) must be uids of returned attachments.
self.assertEqual(3, len(result["attachments"]))
attachment_uids = {attachment["metadata"]["uid"] for attachment in result["attachments"]}
node = self._get_by_tree_path(structure, "0.6")
annotations = [annotation["value"] for annotation in node["annotations"] if annotation["name"] == "attachment"]
self.assertIn(annotations[0], attachment_uids)
self.assertIn(annotations[1], attachment_uids)
node = self._get_by_tree_path(structure, "0.8.0")
self.assertEqual("Text text\n", node["text"])
annotations = [annotation["value"] for annotation in node["annotations"] if annotation["name"] == "attachment"]
self.assertIn(annotations[0], attachment_uids)

def __check_content(self, content: dict) -> None:
subparagraphs = content["structure"]["subparagraphs"]
self.assertEqual("A long time ago in a galaxy far far away", subparagraphs[0]["text"].strip())
Expand All @@ -31,8 +163,8 @@ def __check_content(self, content: dict) -> None:
self.assertEqual("This is simple table", subparagraphs[3]["text"].strip())

table = content["tables"][0]
self.assertListEqual(["", "Header1", "Header2", "Header3"], self._get_text_of_row(table["cells"][0]))
self.assertListEqual(["Some content", "A", "B", "C"], self._get_text_of_row(table["cells"][1]))
self.assertListEqual(["", "Header1\n", "Header2\n", "Header3\n"], self._get_text_of_row(table["cells"][0]))
self.assertListEqual(["Some content\n", "A\n", "B\n", "C\n"], self._get_text_of_row(table["cells"][1]))

table_annotations = [ann for ann in subparagraphs[2]["annotations"] if ann["name"] == TableAnnotation.name]
self.assertEqual(1, len(table_annotations))
Expand Down
Binary file added tests/data/pptx/test-presentation.pptx
Binary file not shown.

0 comments on commit 81a6801

Please sign in to comment.