Skip to content

Commit

Permalink
TLDR-695 additional node page_id (#459)
Browse files Browse the repository at this point in the history
  • Loading branch information
NastyBoget authored Jun 19, 2024
1 parent 5102112 commit d2d309d
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 6 deletions.
38 changes: 32 additions & 6 deletions dedoc/data_structures/tree_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,20 +51,18 @@ def create(lines: List[LineWithMeta] = None) -> "TreeNode":
"""
page_id = 0 if len(lines) == 0 else min((line.metadata.page_id for line in lines))
line_id = 0 if len(lines) == 0 else min((line.metadata.line_id for line in lines))
metadata = LineMetadata(page_id=page_id, line_id=line_id, hierarchy_level=HierarchyLevel.create_root())

texts = (line.line for line in lines)
annotations = []
text_length = 0
for line in lines:
annotations.extend(TreeNode.__shift_annotations(line=line, text_length=text_length))
TreeNode.__add_additional_page_id(start=text_length, metadata=metadata, other_line=line)

text_length += len(line.line)
text = "".join(texts)
return TreeNode("0",
text,
annotations=annotations,
metadata=LineMetadata(page_id=page_id, line_id=line_id, hierarchy_level=HierarchyLevel.create_root()),
subparagraphs=[],
parent=None)
return TreeNode("0", text, annotations=annotations, metadata=metadata, subparagraphs=[], parent=None)

def add_child(self, line: LineWithMeta) -> "TreeNode":
"""
Expand Down Expand Up @@ -93,6 +91,7 @@ def add_text(self, line: LineWithMeta) -> None:
text_length = len(self.text)
new_annotations = self.__shift_annotations(line, text_length)

self.__add_additional_page_id(start=len(self.text), metadata=self.metadata, other_line=line)
self.text += line.line
self.annotations.extend(new_annotations)

Expand Down Expand Up @@ -124,3 +123,30 @@ def merge_annotations(self) -> None:
node.annotations = merger.merge_annotations(node.annotations, node.text)
for sub_node in node.subparagraphs:
stack.append(sub_node)

@staticmethod
def __add_additional_page_id(start: int, metadata: LineMetadata, other_line: LineWithMeta) -> None:
    """
    Adds additional page_id metadata for multi-page nodes.

    If the node is located on several pages, its metadata will contain the "additional_page_ids" attribute
    with a list of dicts:
    {
        start: start index of the text on the next page,
        end: end index (not included),
        page_id: page id, where this textual part (node_text[start:end]) is located
    }
    Lines located on the node's own page (metadata.page_id) are not recorded.
    Consecutive lines from the same additional page extend the last dict instead of adding a new one.

    :param start: index in the node text at which the text of other_line begins
    :param metadata: metadata of the node, it is modified in place
    :param other_line: line whose text is appended to the node
    """
    other_page_id = other_line.metadata.page_id
    if metadata.page_id == other_page_id:
        return

    end = start + len(other_line.line)
    # A missing attribute and an empty list both mean "nothing recorded yet";
    # the truthiness check below avoids indexing [-1] on an empty list.
    page_spans = getattr(metadata, "additional_page_ids", None)

    if page_spans and page_spans[-1]["page_id"] == other_page_id:
        # The line continues the page of the last recorded span, so only its end is moved
        page_spans[-1]["end"] = end
        return

    new_span = {"start": start, "end": end, "page_id": other_page_id}
    if page_spans is None:
        metadata.additional_page_ids = [new_span]
    else:
        page_spans.append(new_span)
14 changes: 14 additions & 0 deletions tests/api_tests/test_api_misc_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,20 @@ def test_tree_structure(self) -> None:
self.assertEqual("Пример документа", nodes[0]["text"].split("\n")[0])
self.assertEqual("1.2.1. Поясним за непонятное", nodes[1]["subparagraphs"][0]["text"].strip())

def test_page_id_tree_structure(self) -> None:
    """
    Check the "additional_page_ids" metadata of the first subparagraph of the tree root.

    Each entry must carry consecutive page ids starting from 1 and the expected [start, end) span;
    the text of each span must not start with a line break and must end with one.
    """
    file_name = os.path.join("..", "pdf_with_text_layer", "test_page_id.pdf")
    result = self._send_request(file_name, data={"structure_type": "tree"})
    node = result["content"]["structure"]["subparagraphs"][0]
    additional_page_ids = node["metadata"]["additional_page_ids"]

    # Offsets in node["text"] where the page changes; consecutive pairs give the expected spans
    page_change_positions = [2135, 4270, 6405, 8540, 10675, 12810, 13323]
    expected_spans = list(zip(page_change_positions, page_change_positions[1:]))

    # Without this check an empty or truncated list passes vacuously,
    # and a longer one fails with an unrelated IndexError
    self.assertEqual(len(expected_spans), len(additional_page_ids))

    for page_id, ((start, end), additional_page_id) in enumerate(zip(expected_spans, additional_page_ids), start=1):
        self.assertEqual(page_id, additional_page_id["page_id"])
        self.assertEqual(start, additional_page_id["start"])
        self.assertEqual(end, additional_page_id["end"])
        page_text = node["text"][start:end]
        self.assertFalse(page_text.startswith("\n"))
        self.assertTrue(page_text.endswith("\n"))

def test_incorrect_structure(self) -> None:
file_name = "example.docx"
_ = self._send_request(file_name, data={"structure_type": "bagel"}, expected_code=400)
Binary file added tests/data/pdf_with_text_layer/test_page_id.pdf
Binary file not shown.

0 comments on commit d2d309d

Please sign in to comment.