Skip to content

Commit

Permalink
feat(typesetting): add first line indent support and improve font mapping
Browse files Browse the repository at this point in the history

- add first_line_indent attribute to PdfParagraph
- update typesetting logic to handle first line indent
- improve font mapping and character length calculation
- update various files for consistency and formatting

build: bump project version to 0.0.1a19

- update pyproject.toml with new version number
  • Loading branch information
awwaawwa committed Jan 17, 2025
1 parent d610aaa commit 8486a0a
Show file tree
Hide file tree
Showing 10 changed files with 82 additions and 43 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "yadt"
version = "0.0.1a18"
version = "0.0.1a19"
description = "Yet Another Document Translator"
license = "AGPL-3.0"
readme = "README.md"
Expand Down
2 changes: 1 addition & 1 deletion yadt/document_il/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from yadt.document_il.il_version_1 import (
GraphicState1,
BaseOperations,
Box,
Cropbox,
Document,
GraphicState,
GraphicState1,
Mediabox,
Page,
PageLayout,
Expand Down
36 changes: 29 additions & 7 deletions yadt/document_il/backend/pdf_creater.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,13 @@ def render_graphic_state(
return
if graphic_state.stroking_color_space_name:
draw_op.append(
f"/{graphic_state.stroking_color_space_name}" f" CS \n".encode()
f"/{graphic_state.stroking_color_space_name}"
f" CS \n".encode()
)
if graphic_state.non_stroking_color_space_name:
draw_op.append(
f"/{graphic_state.non_stroking_color_space_name}" f" cs \n".encode()
f"/{graphic_state.non_stroking_color_space_name}"
f" cs \n".encode()
)
if graphic_state.ncolor is not None:
if len(graphic_state.ncolor) == 1:
Expand All @@ -69,7 +71,9 @@ def render_paragraph_to_char(
) -> list[il_version_1.PdfCharacter]:
chars = []
for composition in paragraph.pdf_paragraph_composition:
if not isinstance(composition.pdf_character, il_version_1.PdfCharacter):
if not isinstance(
composition.pdf_character, il_version_1.PdfCharacter
):
raise Exception(
f"Unknown composition type. "
f"This type only appears in the IL "
Expand Down Expand Up @@ -162,7 +166,9 @@ def write(self, translation_config: TranslationConfig):
# self.add_font(pdf, self.docs)
for page in self.docs.page:
available_font_list = self.get_available_font_list(pdf, page)
encoding_length_map = {f.font_id: f.encoding_length for f in page.pdf_font}
encoding_length_map = {
f.font_id: f.encoding_length for f in page.pdf_font
}
draw_op = BitStream()
# q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}
draw_op.append(b"q ")
Expand Down Expand Up @@ -194,7 +200,9 @@ def write(self, translation_config: TranslationConfig):
if font_id not in available_font_list:
continue
draw_op.append(b"q ")
self.render_graphic_state(draw_op, char.pdf_style.graphic_state)
self.render_graphic_state(
draw_op, char.pdf_style.graphic_state
)
if char.vertical:
draw_op.append(
f"BT /{font_id} {char_size:f} Tf 0 1 -1 0 {
Expand Down Expand Up @@ -224,7 +232,14 @@ def write(self, translation_config: TranslationConfig):
pdf[page.page_number].set_contents(op_container)
pdf.subset_fonts(fallback=False)
if not translation_config.no_mono:
pdf.save(mono_out_path, garbage=3, deflate=True, clean=True, deflate_fonts=True, linear=True)
pdf.save(
mono_out_path,
garbage=3,
deflate=True,
clean=True,
deflate_fonts=True,
linear=True,
)
if translation_config.debug:
pdf.save(
f"{mono_out_path}.decompressed.pdf",
Expand All @@ -241,7 +256,14 @@ def write(self, translation_config: TranslationConfig):
page_count = pdf.page_count
for id in range(page_count):
dual.move_page(page_count + id, id * 2 + 1)
dual.save(dual_out_path, garbage=3, deflate=True, clean=True, deflate_fonts=True, linear=True)
dual.save(
dual_out_path,
garbage=3,
deflate=True,
clean=True,
deflate_fonts=True,
linear=True,
)
if translation_config.debug:
dual.save(
f"{dual_out_path}.decompressed.pdf",
Expand Down
7 changes: 7 additions & 0 deletions yadt/document_il/il_version_1.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,13 @@ class Meta:
"type": "Attribute",
},
)
first_line_indent: Optional[bool] = field(
default=None,
metadata={
"name": "FirstLineIndent",
"type": "Attribute",
},
)


@dataclass
Expand Down
1 change: 1 addition & 0 deletions yadt/document_il/il_version_1.rnc
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ PDFParagraph =
attribute unicode { xsd:string },
attribute scale { xsd:float }?,
attribute vertical { xsd:boolean }?,
attribute FirstLineIndent { xsd:boolean }?,
Box,
PDFStyle,
PDFParagraphComposition*
Expand Down
5 changes: 5 additions & 0 deletions yadt/document_il/il_version_1.rng
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,11 @@
<data type="boolean"/>
</attribute>
</optional>
<optional>
<attribute name="FirstLineIndent">
<data type="boolean"/>
</attribute>
</optional>
<ref name="Box"/>
<ref name="PDFStyle"/>
<zeroOrMore>
Expand Down
1 change: 1 addition & 0 deletions yadt/document_il/il_version_1.xsd
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@
<xs:attribute name="unicode" use="required" type="xs:string"/>
<xs:attribute name="scale" type="xs:float"/>
<xs:attribute name="vertical" type="xs:boolean"/>
<xs:attribute name="FirstLineIndent" type="xs:boolean"/>
</xs:complexType>
</xs:element>
<xs:element name="pdfParagraphComposition">
Expand Down
45 changes: 18 additions & 27 deletions yadt/document_il/midend/paragraph_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@


class ParagraphFinder:
def update_paragraph_data(
self, paragraph: PdfParagraph, update_unicode=False
):
def update_paragraph_data(self, paragraph: PdfParagraph, update_unicode=False):
if not paragraph.pdf_paragraph_composition:
return

Expand Down Expand Up @@ -46,6 +44,15 @@ def update_paragraph_data(
paragraph.box = Box(min_x, min_y, max_x, max_y)
paragraph.vertical = chars[0].vertical

paragraph.first_line_indent = False
if (
paragraph.pdf_paragraph_composition[0].pdf_line
and paragraph.pdf_paragraph_composition[0].pdf_line.pdf_character[0].box.x
- paragraph.box.x
> 1
):
paragraph.first_line_indent = True

def update_line_data(self, line: PdfLine):
min_x = min(char.box.x for char in line.pdf_character)
min_y = min(char.box.y for char in line.pdf_character)
Expand Down Expand Up @@ -92,9 +99,7 @@ def create_paragraphs(self, page: Page) -> list[PdfParagraph]:
continue

# 检查是否需要开始新行
if current_line_chars and Layout.is_newline(
current_line_chars[-1], char
):
if current_line_chars and Layout.is_newline(current_line_chars[-1], char):
# 创建新行
if current_line_chars:
line = self.create_line(current_line_chars)
Expand All @@ -104,9 +109,7 @@ def create_paragraphs(self, page: Page) -> list[PdfParagraph]:
)
paragraphs.append(current_paragraph)
else:
current_paragraph.pdf_paragraph_composition.append(
line
)
current_paragraph.pdf_paragraph_composition.append(line)
self.update_paragraph_data(current_paragraph)
current_line_chars = []

Expand All @@ -115,9 +118,7 @@ def create_paragraphs(self, page: Page) -> list[PdfParagraph]:
if current_line_chars:
line = self.create_line(current_line_chars)
if current_paragraph is not None:
current_paragraph.pdf_paragraph_composition.append(
line
)
current_paragraph.pdf_paragraph_composition.append(line)
self.update_paragraph_data(current_paragraph)
else:
current_paragraph = PdfParagraph(
Expand Down Expand Up @@ -172,9 +173,7 @@ def process_paragraph_spacing(self, paragraph: PdfParagraph):
processed_chars.append(char)

# 移除尾随空格
while (
processed_chars and processed_chars[-1].char_unicode.isspace()
):
while processed_chars and processed_chars[-1].char_unicode.isspace():
processed_chars.pop()

if processed_chars: # 如果行内还有字符
Expand Down Expand Up @@ -246,18 +245,14 @@ def get_layout(

return None

def create_line(
self, chars: list[PdfCharacter]
) -> PdfParagraphComposition:
def create_line(self, chars: list[PdfCharacter]) -> PdfParagraphComposition:
assert chars

line = PdfLine(pdf_character=chars)
self.update_line_data(line)
return PdfParagraphComposition(pdf_line=line)

def calculate_median_line_width(
self, paragraphs: list[PdfParagraph]
) -> float:
def calculate_median_line_width(self, paragraphs: list[PdfParagraph]) -> float:
# 收集所有行的宽度
line_widths = []
for paragraph in paragraphs:
Expand All @@ -282,9 +277,7 @@ def process_independent_paragraphs(
i = 0
while i < len(paragraphs):
paragraph = paragraphs[i]
if (
len(paragraph.pdf_paragraph_composition) <= 1
): # 跳过只有一行的段落
if len(paragraph.pdf_paragraph_composition) <= 1: # 跳过只有一行的段落
i += 1
continue

Expand All @@ -297,9 +290,7 @@ def process_independent_paragraphs(

prev_line = prev_composition.pdf_line
prev_width = prev_line.box.x2 - prev_line.box.x
prev_text = "".join(
[c.char_unicode for c in prev_line.pdf_character]
)
prev_text = "".join([c.char_unicode for c in prev_line.pdf_character])

# 检查是否包含连续的点(至少 20 个)
# 如果有至少连续 20 个点,则代表这是目录条目
Expand Down
14 changes: 11 additions & 3 deletions yadt/document_il/midend/typesetting.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,7 @@ def _layout_typesetting_units(
box: Box,
scale: float,
line_spacing: float,
paragraph: il_version_1.PdfParagraph,
) -> tuple[list[TypesettingUnit], bool]:
"""布局排版单元。
Expand All @@ -396,7 +397,7 @@ def _layout_typesetting_units(
font_sizes.sort()
font_size = statistics.mode(font_sizes)

space_width = self.font_mapper.base_font.char_lengths(" ", font_size * scale)[0]
space_width = self.font_mapper.base_font.char_lengths("", font_size * scale)[0] * 0.5

# 计算平均行高
avg_height = (
Expand All @@ -416,6 +417,8 @@ def _layout_typesetting_units(
all_units_fit = True
last_unit: Optional[TypesettingUnit] = None

if paragraph.first_line_indent:
current_x += space_width * 4
# 遍历所有排版单元
for unit in typesetting_units:
# 计算当前单元在当前缩放下的尺寸
Expand Down Expand Up @@ -490,7 +493,7 @@ def retypeset(
while scale >= min_scale:
# 尝试布局排版单元
typeset_units, all_units_fit = self._layout_typesetting_units(
typesetting_units, box, scale, line_spacing
typesetting_units, box, scale, line_spacing, paragraph
)

# 如果所有单元都放得下,就完成排版
Expand Down Expand Up @@ -566,7 +569,12 @@ def create_typesetting_units(
[
TypesettingUnit(
unicode=char_unicode,
font=self.font_mapper.map(fonts[composition.pdf_same_style_unicode_characters.pdf_style.font_id], char_unicode),
font=self.font_mapper.map(
fonts[
composition.pdf_same_style_unicode_characters.pdf_style.font_id
],
char_unicode,
),
font_size=composition.pdf_same_style_unicode_characters.pdf_style.font_size,
style=composition.pdf_same_style_unicode_characters.pdf_style,
)
Expand Down
12 changes: 8 additions & 4 deletions yadt/document_il/utils/fontmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@ def __init__(self, translation_config: TranslationConfig):
"SourceHanSansSC-Bold.ttf",
]
self.fonts = {
os.path.basename(file_name).replace('-', '').lower(): pymupdf.Font(fontfile=get_cache_file_path(file_name))
os.path.basename(file_name).replace("-", "").lower(): pymupdf.Font(
fontfile=get_cache_file_path(file_name)
)
for file_name in self.font_names
}
for k,v in self.fonts.items():
for k, v in self.fonts.items():
v.font_id = k
self.base_font_path = translation_config.font
self.fallback_font_path = get_cache_file_path("noto.ttf")
Expand Down Expand Up @@ -61,8 +63,10 @@ def add_font(self, doc_zh: pymupdf.Document, il: il_version_1.Document):
]
font_list.extend(
[
(os.path.basename(file_name).replace(
'-', '').lower(), get_cache_file_path(file_name))
(
os.path.basename(file_name).replace("-", "").lower(),
get_cache_file_path(file_name),
)
for file_name in self.font_names
]
)
Expand Down

0 comments on commit 8486a0a

Please sign in to comment.