Skip to content

Commit

Permalink
refactor(paragraph_finder): improve xy_mode handling and add special character support
Browse files Browse the repository at this point in the history

- reformat xy_mode parameter for better readability
- handle specific characters with unique layout requirements
- correct xy_mode logic for topleft and bottomright cases
  • Loading branch information
awwaawwa committed Jan 20, 2025
1 parent 2c3e141 commit 94708e9
Showing 1 changed file with 20 additions and 10 deletions.
30 changes: 20 additions & 10 deletions yadt/document_il/midend/paragraph_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,18 @@ def get_layout(
self,
char: PdfCharacter,
page: Page,
xy_mode: Union[Literal["topleft"], Literal["bottomright"], Literal["middle"]] = 'middle',
xy_mode: Union[
Literal["topleft"], Literal["bottomright"], Literal["middle"]
] = "middle",
):
# 这几个符号,解析出来的大小经常只有实际大小的一点点。
if xy_mode != 'bottomright' and char.char_unicode in [
"∑︁",
# 来源于 arXiv:2310.18608v2 第九页公式大括号
"(cid:17)",
"(cid:16)",
]:
return self.get_layout(char, page, "bottomright")
# current layouts
# {
# "title",
Expand All @@ -225,13 +235,13 @@ def get_layout(
"title",
]
char_box = char.box
if xy_mode == 'topleft':
if xy_mode == "topleft":
char_x = char_box.x
char_y = char_box.y
elif xy_mode == 'bottomright':
char_x = char_box.x2
char_y = char_box.y2
elif xy_mode == 'middle':
elif xy_mode == "bottomright":
char_x = char_box.x2
char_y = char_box.y
elif xy_mode == "middle":
char_x = (char_box.x + char_box.x2) / 2
char_y = (char_box.y + char_box.y2) / 2
else:
Expand All @@ -253,10 +263,10 @@ def get_layout(
if layout_name in matching_layouts:
return matching_layouts[layout_name]

if xy_mode == 'middle':
return self.get_layout(char, page, 'topleft')
if xy_mode == 'topleft':
return self.get_layout(char, page, 'bottomright')
if xy_mode == "middle":
return self.get_layout(char, page, "topleft")
if xy_mode == "topleft":
return self.get_layout(char, page, "bottomright")
return None

def create_line(self, chars: list[PdfCharacter]) -> PdfParagraphComposition:
Expand Down

0 comments on commit 94708e9

Please sign in to comment.