Skip to content

Commit

Permalink
feat: Add minimum text length translation parameter
Browse files (browse the repository at this point in the history)
- Introduce `--min-text-length` CLI option to skip translating short texts
- Set default minimum text length to 5 characters
- Update README.md with new CLI parameter documentation
- Implement text length check in translation process
  • Loading branch information
awwaawwa committed Feb 18, 2025
1 parent 361a51a commit a51faad
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 3 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ uv run babeldoc --bing --files example.pdf --files example2.pdf
- `--ignore-cache`: Ignore translation cache and force retranslation
- `--no-dual`: Do not output bilingual PDF files
- `--no-mono`: Do not output monolingual PDF files
- `--min-text-length`: Minimum text length, in characters, to translate; shorter texts are skipped and left untranslated (default: 5)
- `--openai`: Use OpenAI for translation (default: False)
- `--bing`: Use Bing for translation (default: False)
- `--google`: Use Google Translate for translation (default: False)
Expand Down
13 changes: 10 additions & 3 deletions babeldoc/document_il/midend/il_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import json
import logging
from pathlib import Path
from typing import List, Union

from tqdm import tqdm

Expand Down Expand Up @@ -179,7 +178,7 @@ class TranslateInput:
def __init__(
self,
unicode: str,
placeholders: List[Union[RichTextPlaceholder, FormulaPlaceholder]],
placeholders: list[RichTextPlaceholder | FormulaPlaceholder],
base_style: PdfStyle = None,
):
self.unicode = unicode
Expand Down Expand Up @@ -489,7 +488,8 @@ def remove_placeholder(text: str):
) and text.replace(" ", "") == "".join(
x.char_unicode for x in placeholder.composition.pdf_character
).replace(
" ", ""
" ",
"",
):
comp = PdfParagraphComposition(
pdf_same_style_characters=placeholder.composition,
Expand Down Expand Up @@ -547,6 +547,13 @@ def translate_paragraph(
tracker.set_input(translate_input.unicode)

text = translate_input.unicode

if len(text) < self.translation_config.min_text_length:
logger.debug(
f"Text too short to translate, skip. Text: {text}. Paragraph id: {paragraph.debug_id}.",
)
return

translated_text = self.translate_engine.translate(text)

tracker.set_output(translated_text)
Expand Down
7 changes: 7 additions & 0 deletions babeldoc/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,12 @@ def create_parser():
type=str,
help="Pages to translate. If not set, translate all pages. like: 1,2,1-,-3,3-5",
)
translation_params.add_argument(
"--min-text-length",
type=int,
default=5,
help="Minimum text length to translate (default: 5)",
)
translation_params.add_argument(
"--lang-in",
"-li",
Expand Down Expand Up @@ -421,6 +427,7 @@ async def main():
disable_rich_text_translate=args.disable_rich_text_translate,
enhance_compatibility=args.enhance_compatibility,
report_interval=args.report_interval,
min_text_length=args.min_text_length,
)

# Create progress handler
Expand Down
2 changes: 2 additions & 0 deletions babeldoc/translation_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def __init__(
disable_rich_text_translate: bool = False, # 是否禁用富文本翻译
enhance_compatibility: bool = False, # 增强兼容性模式
report_interval: float = 0.1, # Progress report interval in seconds
min_text_length: int = 5, # Minimum text length to translate
):
self.input_file = input_file
self.translator = translator
Expand All @@ -60,6 +61,7 @@ def __init__(
disable_rich_text_translate or enhance_compatibility
)
self.report_interval = report_interval
self.min_text_length = min_text_length
if progress_monitor:
if progress_monitor.cancel_event is None:
progress_monitor.cancel_event = threading.Event()
Expand Down

0 comments on commit a51faad

Please sign in to comment.