Skip to content

Commit

Permalink
feat(midend): add formula height ignore logic and improve layout hand…
Browse files Browse the repository at this point in the history
…ling

- introduce formular_height_ignore_char function to exclude specific characters from height calculations
- update update_formula_data method to use formular_height_ignore_char for min_y and max_y calculations
- adjust should_new_line logic in Layout class to consider formular_height_ignore_char
  • Loading branch information
awwaawwa committed Jan 20, 2025
1 parent d34f0fd commit cd02c93
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 14 deletions.
4 changes: 2 additions & 2 deletions yadt/document_il/backend/pdf_creater.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,9 +223,9 @@ def write(self, translation_config: TranslationConfig):
mono_out_path,
garbage=3,
deflate=True,
clean=True,
clean=not translation_config.debug,
deflate_fonts=True,
linear=True,
linear=not translation_config.debug,
)
if translation_config.debug:
pdf.save(
Expand Down
20 changes: 17 additions & 3 deletions yadt/document_il/midend/paragraph_finder.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
from typing import Union, Literal

from yadt.document_il import (
Box,
Expand Down Expand Up @@ -196,6 +197,7 @@ def get_layout(
self,
char: PdfCharacter,
page: Page,
xy_mode: Union[Literal["topleft"], Literal["bottomright"], Literal["middle"]] = 'middle',
):
# current layouts
# {
Expand Down Expand Up @@ -223,9 +225,17 @@ def get_layout(
"title",
]
char_box = char.box
char_x = (char_box.x + char_box.x2) / 2
char_y = (char_box.y + char_box.y2) / 2

if xy_mode == 'topleft':
char_x = char_box.x
char_y = char_box.y
elif xy_mode == 'bottomright':
char_x = char_box.x2
char_y = char_box.y2
elif xy_mode == 'middle':
char_x = (char_box.x + char_box.x2) / 2
char_y = (char_box.y + char_box.y2) / 2
else:
raise ValueError(f"Invalid xy_mode: {xy_mode}")
# 按照优先级顺序检查每种布局
matching_layouts = {}
for layout in page.page_layout:
Expand All @@ -243,6 +253,10 @@ def get_layout(
if layout_name in matching_layouts:
return matching_layouts[layout_name]

if xy_mode == 'middle':
return self.get_layout(char, page, 'topleft')
if xy_mode == 'topleft':
return self.get_layout(char, page, 'bottomright')
return None

def create_line(self, chars: list[PdfCharacter]) -> PdfParagraphComposition:
Expand Down
17 changes: 15 additions & 2 deletions yadt/document_il/midend/styles_and_formulas.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from yadt.document_il.utils.layout_helper import (
get_char_unicode_string,
is_same_style,
formular_height_ignore_char,
)
from yadt.translation_config import TranslationConfig

Expand Down Expand Up @@ -380,9 +381,21 @@ def create_composition(

def update_formula_data(self, formula: PdfFormula):
    """Recompute ``formula.box`` from the boxes of the formula's characters.

    The horizontal extent always spans every character in
    ``formula.pdf_character``. The vertical extent only spans characters for
    which ``formular_height_ignore_char`` is false; when every character is
    ignored, all characters are used for the vertical extent as well.

    Args:
        formula: The formula whose ``box`` attribute is overwritten.

    Raises:
        ValueError: If ``formula.pdf_character`` is empty (raised by ``min``).
    """
    chars = formula.pdf_character
    min_x = min(char.box.x for char in chars)
    max_x = max(char.box.x2 for char in chars)

    # Evaluate the ignore predicate once per character. An empty result means
    # every character is ignored, so fall back to the full character list
    # rather than calling min()/max() on an empty sequence.
    height_chars = [
        char for char in chars if not formular_height_ignore_char(char)
    ] or chars
    min_y = min(char.box.y for char in height_chars)
    max_y = max(char.box.y2 for char in height_chars)

    formula.box = Box(min_x, min_y, max_x, max_y)

def is_formulas_font(self, font_name: str) -> bool:
Expand Down
10 changes: 7 additions & 3 deletions yadt/document_il/utils/fontmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,14 @@ def __init__(self, translation_config: TranslationConfig):
"SourceHanSansSC-Bold.ttf",
]
self.fonts = {
os.path.basename(file_name)
os.path.basename(file_name).split(".")[0]
.replace("-", "")
.lower(): pymupdf.Font(fontfile=get_cache_file_path(file_name))
for file_name in self.font_names
}
for k, v in self.fonts.items():
v.font_id = k
self.translation_config = translation_config
self.base_font_path = translation_config.font
self.fallback_font_path = get_cache_file_path("noto.ttf")
self.base_font = pymupdf.Font(fontfile=self.base_font_path)
Expand Down Expand Up @@ -64,14 +65,17 @@ def add_font(self, doc_zh: pymupdf.Document, il: il_version_1.Document):
font_list.extend(
[
(
os.path.basename(file_name).replace("-", "").lower(),
os.path.basename(file_name).split(".")[0]
.replace("-", "").lower(),
get_cache_file_path(file_name),
)
for file_name in self.font_names
]
)
font_id = {}
for page in doc_zh:
for i, page in enumerate(doc_zh):
if not self.translation_config.should_translate_page(i + 1):
continue
for font in font_list:
font_id[font[0]] = page.insert_font(font[0], font[1])
xreflen = doc_zh.xref_length()
Expand Down
18 changes: 16 additions & 2 deletions yadt/document_il/utils/layout_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,17 @@
)


def formular_height_ignore_char(char: PdfCharacter):
    """Tell whether ``char`` should be left out of height calculations.

    Returns ``True`` when the character has no ``pdf_character_id`` or when
    its ``char_unicode`` is one of the special-cased values listed below;
    otherwise returns ``False``.
    """
    if char.pdf_character_id is None:
        return True
    ignored_unicode = (
        "(",
        ")",
        # For now, assume cid:17 and cid:16 are special cases.
        # Taken from the formula braces on page 9 of arXiv:2310.18608v2.
        '(cid:17)',
        '(cid:16)',
    )
    return char.char_unicode in ignored_unicode


class Layout:
def __init__(self, id, name):
self.id = id
Expand All @@ -30,8 +41,11 @@ def is_newline(prev_char: PdfCharacter, curr_char: PdfCharacter) -> bool:
# 这里使用字符高度的一半作为阈值
char_height = curr_char.box.y2 - curr_char.box.y
should_new_line = curr_char.box.y2 < prev_char.box.y
if should_new_line:
pass
if should_new_line and (
formular_height_ignore_char(curr_char)
or formular_height_ignore_char(prev_char)
):
return False
return should_new_line


Expand Down
4 changes: 2 additions & 2 deletions yadt/high_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,9 +224,9 @@ def translate(translation_config: TranslationConfig):
)

# deepcopy
docs2 = xml_converter.deepcopy(docs)
# docs2 = xml_converter.deepcopy(docs)

pdf_creater = PDFCreater(original_pdf_path, docs2, translation_config)
pdf_creater = PDFCreater(original_pdf_path, docs, translation_config)

pdf_creater.write(translation_config)

Expand Down

0 comments on commit cd02c93

Please sign in to comment.