Skip to content

Commit

Permalink
Merge pull request #80 from funstory-ai/develop
Browse files Browse the repository at this point in the history
chore: update Ruff linting configuration and format code
  • Loading branch information
awwaawwa authored Feb 11, 2025
2 parents 3e4aff8 + 720cefc commit bd0a7fc
Show file tree
Hide file tree
Showing 27 changed files with 787 additions and 609 deletions.
47 changes: 43 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,45 @@ ignore = ["E203", "E261", "E501", "W503", "E741", "E501"]
max-line-length = 88

[tool.ruff]

src=["yadt"]
src = ["yadt"]
target-version = "py310"
show-fixes = true

lint.ignore = ["E203", "E261", "E501", "E741", "F841"]

[tool.ruff.format]
# Enable reformatting of code snippets in docstrings.
docstring-code-format = true

[tool.ruff.lint]
ignore = [
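"E203", # whitespace before punctuation (e.g. ':')
"E261", # at least two spaces before an inline comment
"E501", # line too long
"E741", # ambiguous variable name
"F841", # local variable assigned but never used
"C901", # function is too complex
"S101", # use of `assert` detected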
"SIM", # flake8-simplify
"ARG002", # unused argument
"S110", # `try`-`except`-`pass` detected, consider logging the exception
"B024", # abstract class without abstract methods
"S112", # `try`-`except`-`continue` detected, consider logging the exception

]
select = [
"E", # pycodestyle errors
"F", # Pyflakes
"N", # pep8-naming
"B", # flake8-bugbear
"I", # isort
"C", # mccabe (C90) and flake8-comprehensions (C4)
"UP", # pyupgrade
"S", # flake8-bandit
"A", # flake8-builtins
"COM", # flake8-commas
"ARG", # flake8-unused-arguments
"PTH", # flake8-use-pathlib (prefer pathlib)
]

[tool.ruff.lint.flake8-quotes]
docstring-quotes = "double"

Expand All @@ -70,6 +98,17 @@ force-single-line = true
[tool.ruff.lint.pydocstyle]
convention = "google"

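# Rule-specific settings
[tool.ruff.lint.mccabe]
max-complexity = 10 # cyclomatic-complexity threshold per function (note: C901 is listed in `ignore`, so this is currently not enforced)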

[tool.ruff.lint.per-file-ignores]
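"yadt/pdfinterp.py" = ["N"] # skip naming-convention checks
"tests/*" = ["S101"] # allow assert in test files
"**/__init__.py" = ["F401"] # allow unused imports
# Ignore S311 (non-cryptographic random generator); its use here is intentional
"yadt/document_il/midend/paragraph_finder.py" = ["S311"]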

[dependency-groups]
dev = [
"bumpver>=2024.1130",
Expand Down
6 changes: 3 additions & 3 deletions yadt/const.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from pathlib import Path

CACHE_FOLDER = os.path.join(os.path.expanduser("~"), ".cache", "yadt")
CACHE_FOLDER = Path.home() / ".cache" / "yadt"


def get_cache_file_path(filename):
return os.path.join(CACHE_FOLDER, filename)
return CACHE_FOLDER / filename
103 changes: 57 additions & 46 deletions yadt/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,26 @@
import logging
import re
import unicodedata
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
from pdfminer.converter import PDFConverter
from pdfminer.layout import LTChar, LTComponent, LTFigure, LTLine, LTPage, LTText
from pdfminer.layout import LTChar
from pdfminer.layout import LTComponent
from pdfminer.layout import LTFigure
from pdfminer.layout import LTLine
from pdfminer.layout import LTPage
from pdfminer.layout import LTText
from pdfminer.pdfcolor import PDFColorSpace
from pdfminer.pdffont import PDFCIDFont, PDFFont, PDFUnicodeNotDefined
from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
from pdfminer.utils import Matrix, apply_matrix_pt, bbox2str, matrix2str, mult_matrix
from pdfminer.pdffont import PDFCIDFont
from pdfminer.pdffont import PDFFont
from pdfminer.pdffont import PDFUnicodeNotDefined
from pdfminer.pdfinterp import PDFGraphicState
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.utils import Matrix
from pdfminer.utils import apply_matrix_pt
from pdfminer.utils import bbox2str
from pdfminer.utils import matrix2str
from pdfminer.utils import mult_matrix
from pymupdf import Font

from yadt.document_il.frontend.il_creater import ILCreater
Expand All @@ -22,7 +33,7 @@ class PDFConverterEx(PDFConverter):
def __init__(
self,
rsrcmgr: PDFResourceManager,
il_creater: ILCreater = None,
il_creater: ILCreater | None = None,
) -> None:
PDFConverter.__init__(self, rsrcmgr, None, "utf-8", 1, None)
self.il_creater = il_creater
Expand All @@ -34,12 +45,15 @@ def begin_page(self, page, ctm) -> None:
(x1, y1) = apply_matrix_pt(ctm, (x1, y1))
mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
self.il_creater.on_page_media_box(
mediabox[0], mediabox[1], mediabox[2], mediabox[3]
mediabox[0],
mediabox[1],
mediabox[2],
mediabox[3],
)
self.il_creater.on_page_number(page.pageno)
self.cur_item = LTPage(page.pageno, mediabox)

def end_page(self, page):
def end_page(self, _page) -> None:
# Overridden to return the instruction stream
return self.receive_layout(self.cur_item)

Expand All @@ -52,7 +66,8 @@ def begin_figure(self, name, bbox, matrix) -> None:
def end_figure(self, _: str) -> None:
# Overridden to return the instruction stream
fig = self.cur_item
assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
if not isinstance(self.cur_item, LTFigure):
raise ValueError(f"Unexpected item type: {type(self.cur_item)}")
self.cur_item = self._stack.pop()
self.cur_item.add(fig)
return self.receive_layout(fig)
Expand All @@ -71,7 +86,8 @@ def render_char(
# Overridden to set cid and font
try:
text = font.to_unichr(cid)
assert isinstance(text, str), str(type(text))
if not isinstance(text, str):
raise TypeError(f"Expected string, got {type(text)}")
except PDFUnicodeNotDefined:
text = self.handle_undefined_char(font, cid)
textwidth = font.char_width(cid)
Expand Down Expand Up @@ -117,7 +133,7 @@ def __init__(
rise: float,
text: str,
textwidth: float,
textdisp: Union[float, Tuple[Optional[float], float]],
textdisp: float | tuple[float | None, float],
ncs: PDFColorSpace,
graphicstate: PDFGraphicState,
xobj_id: int,
Expand Down Expand Up @@ -150,7 +166,7 @@ def __init__(
bbox_lower_left = (0, descent + rise)
bbox_upper_right = (self.adv, descent + rise + fontsize)
(a, b, c, d, e, f) = self.matrix
self.upright = 0 < a * d * scaling and b * c <= 0
self.upright = a * d * scaling > 0 and b * c <= 0
(x0, y0) = apply_matrix_pt(self.matrix, bbox_lower_left)
(x1, y1) = apply_matrix_pt(self.matrix, bbox_upper_right)
if x1 < x0:
Expand All @@ -165,14 +181,7 @@ def __init__(
return

def __repr__(self) -> str:
return "<{} {} matrix={} font={!r} adv={} text={!r}>".format(
self.__class__.__name__,
bbox2str(self.bbox),
matrix2str(self.matrix),
self.fontname,
self.adv,
self.get_text(),
)
return f"<{self.__class__.__name__} {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)} font={self.fontname!r} adv={self.adv} text={self.get_text()!r}>"

def get_text(self) -> str:
return self._text
Expand All @@ -193,19 +202,20 @@ class TranslateConverter(PDFConverterEx):
def __init__(
self,
rsrcmgr,
vfont: str = None,
vchar: str = None,
vfont: str | None = None,
vchar: str | None = None,
thread: int = 0,
layout={},
lang_in: str = "",
lang_out: str = "",
service: str = "",
layout: dict | None = None,
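lang_in: str = "", # parameter kept under its original name; noted as unused (no underscore prefix applied)
_lang_out: str = "", # renamed with a leading underscore to mark it unused
_service: str = "", # renamed with a leading underscore to mark it unused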
resfont: str = "",
noto: Font = None,
envs: Dict = None,
prompt: List = None,
il_creater: ILCreater = None,
) -> None:
noto: Font | None = None,
envs: dict | None = None,
_prompt: list | None = None, # renamed with a leading underscore to mark it unused
il_creater: ILCreater | None = None,
):
layout = layout or {}
super().__init__(rsrcmgr, il_creater)
self.vfont = vfont
self.vchar = vchar
Expand Down Expand Up @@ -384,9 +394,9 @@ def vflag(font: str, char: str): # 匹配公式(和角标)字体
varl.append(vlstk)
varf.append(vfix)
log.debug("\n==========[VSTACK]==========\n")
for id, v in enumerate(var): # 计算公式宽度
for var_id, v in enumerate(var): # 计算公式宽度
l = max([vch.x1 for vch in v]) - v[0].x0
log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > v{id} = {"".join([ch.get_text() for ch in v])}')
log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[var_id])} > v{var_id} = {"".join([ch.get_text() for ch in v])}')
vlen.append(l)

############################################################
Expand All @@ -399,37 +409,38 @@ def vflag(font: str, char: str): # 匹配公式(和角标)字体
# C. Typeset the new document
def raw_string(fcur: str, cstk: str): # 编码字符串
if fcur == 'noto':
return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk])
return "".join([f"{self.noto.has_glyph(ord(c)):04x}" for c in cstk])
elif isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度
return "".join(["%04x" % ord(c) for c in cstk])
return "".join([f"{ord(c):04x}" for c in cstk])
else:
return "".join(["%02x" % ord(c) for c in cstk])
return "".join([f"{ord(c):02x}" for c in cstk])

_x, _y = 0, 0
for id, new in enumerate(news):
x: float = pstk[id].x # 段落初始横坐标
y: float = pstk[id].y # 段落初始纵坐标
x0: float = pstk[id].x0 # 段落左边界
x1: float = pstk[id].x1 # 段落右边界
size: float = pstk[id].size # 段落字体大小
brk: bool = pstk[id].brk # 段落换行标记
for para_id, new in enumerate(news):
x: float = pstk[para_id].x # 段落初始横坐标
y: float = pstk[para_id].y # 段落初始纵坐标
x0: float = pstk[para_id].x0 # 段落左边界
x1: float = pstk[para_id].x1 # 段落右边界
size: float = pstk[para_id].size # 段落字体大小
brk: bool = pstk[para_id].brk # 段落换行标记
cstk: str = "" # 当前文字栈
fcur: str = None # 当前字体 ID
tx = x
fcur_ = fcur
ptr = 0
log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[id]} | {new}")
log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[para_id]} | {new}")
while ptr < len(new):
vy_regex = re.match(
r"\{\s*v([\d\s]+)\}", new[ptr:], re.IGNORECASE
r"\{\s*v([\d\s]+)\}", new[ptr:], re.IGNORECASE,
) # 匹配 {vn} 公式标记
mod = 0 # 文字修饰符
if vy_regex: # 加载公式
ptr += len(vy_regex.group(0))
try:
vid = int(vy_regex.group(1).replace(" ", ""))
adv = vlen[vid]
except Exception:
except Exception as e:
log.debug("Skipping formula placeholder due to: %s", e)
continue # 翻译器可能会自动补个越界的公式标记
if var[vid][-1].get_text() and unicodedata.category(var[vid][-1].get_text()[0]) in ["Lm", "Mn", "Sk"]: # 文字修饰符
mod = var[vid][-1].width
Expand Down
42 changes: 20 additions & 22 deletions yadt/document_il/__init__.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,23 @@
from yadt.document_il.il_version_1 import (
BaseOperations,
Box,
Cropbox,
Document,
GraphicState,
Mediabox,
Page,
PageLayout,
PdfCharacter,
PdfFigure,
PdfFont,
PdfFormula,
PdfLine,
PdfParagraph,
PdfParagraphComposition,
PdfRectangle,
PdfSameStyleCharacters,
PdfSameStyleUnicodeCharacters,
PdfStyle,
PdfXobject,
)
from yadt.document_il.il_version_1 import BaseOperations
from yadt.document_il.il_version_1 import Box
from yadt.document_il.il_version_1 import Cropbox
from yadt.document_il.il_version_1 import Document
from yadt.document_il.il_version_1 import GraphicState
from yadt.document_il.il_version_1 import Mediabox
from yadt.document_il.il_version_1 import Page
from yadt.document_il.il_version_1 import PageLayout
from yadt.document_il.il_version_1 import PdfCharacter
from yadt.document_il.il_version_1 import PdfFigure
from yadt.document_il.il_version_1 import PdfFont
from yadt.document_il.il_version_1 import PdfFormula
from yadt.document_il.il_version_1 import PdfLine
from yadt.document_il.il_version_1 import PdfParagraph
from yadt.document_il.il_version_1 import PdfParagraphComposition
from yadt.document_il.il_version_1 import PdfRectangle
from yadt.document_il.il_version_1 import PdfSameStyleCharacters
from yadt.document_il.il_version_1 import PdfSameStyleUnicodeCharacters
from yadt.document_il.il_version_1 import PdfStyle
from yadt.document_il.il_version_1 import PdfXobject

__all__ = [
"BaseOperations",
Expand Down
Loading

0 comments on commit bd0a7fc

Please sign in to comment.