diff --git a/assets/docs/sources/option_file_syntax.md b/assets/docs/sources/option_file_syntax.md index b59fcfb1..95bf91f0 100644 --- a/assets/docs/sources/option_file_syntax.md +++ b/assets/docs/sources/option_file_syntax.md @@ -191,6 +191,17 @@ plugins: zip_dir: D:/jmcomic/zip/ # 压缩文件存放的文件夹 delete_original_file: true # 压缩成功后,删除所有原文件和文件夹 + + # 删除重复文件插件 + # 参考 → [https://github.com/hect0x7/JMComic-Crawler-Python/issues/244] + - plugin: delete_duplicated_files + kwargs: + # limit: 必填,表示对md5出现次数的限制 + limit: 3 + # 如果文件的md5的出现次数 >= limit,是否要删除 + # 如果delete_original_file不配置,此插件只会打印信息,不会执行其他操作 + # 如果limit=1, delete_original_file=true 效果会是删除所有文件 + delete_original_file: true - plugin: send_qq_email # 发送qq邮件插件 kwargs: diff --git a/assets/option/option_workflow_download.yml b/assets/option/option_workflow_download.yml index 5b469916..4a201b5a 100644 --- a/assets/option/option_workflow_download.yml +++ b/assets/option/option_workflow_download.yml @@ -33,4 +33,4 @@ plugins: msg_to: ${EMAIL_TO} password: ${EMAIL_PASS} title: ${EMAIL_TITLE} - content: ${EMAIL_CONTENT} \ No newline at end of file + content: ${EMAIL_CONTENT} diff --git a/src/jmcomic/__init__.py b/src/jmcomic/__init__.py index c43f17a7..3ead2eb9 100644 --- a/src/jmcomic/__init__.py +++ b/src/jmcomic/__init__.py @@ -2,7 +2,7 @@ # 被依赖方 <--- 使用方 # config <--- entity <--- toolkit <--- client <--- option <--- downloader -__version__ = '2.5.11' +__version__ = '2.5.12' from .api import * from .jm_plugin import * diff --git a/src/jmcomic/jm_client_interface.py b/src/jmcomic/jm_client_interface.py index dcdfa03f..c8abace3 100644 --- a/src/jmcomic/jm_client_interface.py +++ b/src/jmcomic/jm_client_interface.py @@ -581,6 +581,6 @@ def is_given_type(self, ctype: Type['JmcomicClient']) -> bool: """ if isinstance(self, ctype): return True - if self.client_key == instance.client_key: + if self.client_key == ctype.client_key: return True return False diff --git a/src/jmcomic/jm_config.py b/src/jmcomic/jm_config.py index 3b0698c2..b8f7c5c3 100644 --- a/src/jmcomic/jm_config.py +++ b/src/jmcomic/jm_config.py @@ -109,14 +109,17 @@ class JmModuleConfig: DOMAIN_IMAGE_LIST = str_to_list(''' cdn-msp.jmapinodeudzn.net cdn-msp2.jmapinodeudzn.net + cdn-msp2.jmapiproxy3.cc + cdn-msp3.jmapinodeudzn.net ''') # 移动端API域名 DOMAIN_API_LIST = str_to_list(''' www.jmapinodeudzn.xyz - www.jmapinode.vip - www.jmapinode.biz + www.cdn-eldenringproxy.xyz + www.cdn-eldenringproxy.me + www.cdn-eldenringproxy.vip www.jmapinode.xyz ''') diff --git a/src/jmcomic/jm_option.py b/src/jmcomic/jm_option.py index 5c6c0a6d..2051c8ab 100644 --- a/src/jmcomic/jm_option.py +++ b/src/jmcomic/jm_option.py @@ -72,11 +72,9 @@ class DirRule: Detail = Union[JmAlbumDetail, JmPhotoDetail, None] RuleFunc = Callable[[Detail], str] - RuleSolver = Tuple[int, RuleFunc, str] + RuleSolver = Tuple[str, RuleFunc, str] RuleSolverList = List[RuleSolver] - rule_solver_cache: Dict[str, RuleSolver] = {} - def __init__(self, rule: str, base_dir=None): base_dir = JmcomicText.parse_to_abspath(base_dir) self.base_dir = base_dir @@ -100,6 +98,25 @@ def decide_image_save_dir(self, return fix_filepath('/'.join(path_ls), is_dir=True) + def decide_album_root_dir(self, album: JmAlbumDetail) -> str: + path_ls = [] + for solver in self.solver_list: + key, _, rule = solver + + if key != 'Bd' and key != 'A': + continue + + try: + ret = self.apply_rule_solver(album, None, solver) + except BaseException as e: + # noinspection PyUnboundLocalVariable + jm_log('dir_rule', f'路径规则"{rule}"的解析出错: {e}, album={album}') + raise e + + path_ls.append(str(ret)) + + return fix_filepath('/'.join(path_ls), is_dir=True) + def get_role_solver_list(self, rule_dsl: str, base_dir: str) -> RuleSolverList: """ 解析下载路径dsl,得到一个路径规则解析列表 @@ -111,7 +128,7 @@ def get_role_solver_list(self, rule_dsl: str, base_dir: str) -> RuleSolverList: for rule in rule_list: rule = rule.strip() if rule == 'Bd': - solver_ls.append((0, lambda _: base_dir, 'Bd')) + solver_ls.append(('Bd', lambda _: base_dir, 'Bd')) continue rule_solver = self.get_rule_solver(rule) @@ -137,24 +154,14 @@ def split_rule_dsl(self, rule_dsl: str) -> List[str]: @classmethod def get_rule_solver(cls, rule: str) -> Optional[RuleSolver]: - # 查找缓存 - if rule in cls.rule_solver_cache: - return cls.rule_solver_cache[rule] - # 检查dsl if not rule.startswith(('A', 'P')): return None - # Axxx or Pyyy - key = 1 if rule[0] == 'A' else 2 - def solve_func(detail): return fix_windir_name(str(DetailEntity.get_dirname(detail, rule[1:]))) - # 保存缓存 - rule_solver = (key, solve_func, rule) - cls.rule_solver_cache[rule] = rule_solver - return rule_solver + return rule[0], solve_func, rule @classmethod def apply_rule_solver(cls, album, photo, rule_solver: RuleSolver) -> str: @@ -168,11 +175,11 @@ def apply_rule_solver(cls, album, photo, rule_solver: RuleSolver) -> str: """ def choose_detail(key): - if key == 0: + if key == 'Bd': return None - if key == 1: + if key == 'A': return album - if key == 2: + if key == 'P': return photo key, func, _ = rule_solver diff --git a/src/jmcomic/jm_plugin.py b/src/jmcomic/jm_plugin.py index 29081c9e..68104284 100644 --- a/src/jmcomic/jm_plugin.py +++ b/src/jmcomic/jm_plugin.py @@ -1035,3 +1035,62 @@ def try_mark_photo_skip_and_log(self, photo: JmPhotoDetail, at_least_image_count @field_cache() # 单例 def build(cls, option: JmOption) -> 'JmOptionPlugin': return super().build(option) + + +class DeleteDuplicatedFilesPlugin(JmOptionPlugin): + """ + https://github.com/hect0x7/JMComic-Crawler-Python/issues/244 + """ + plugin_key = 'delete_duplicated_files' + + @classmethod + def calculate_md5(cls, file_path): + import hashlib + + """计算文件的MD5哈希值""" + hash_md5 = hashlib.md5() + with open(file_path, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() + + @classmethod + def find_duplicate_files(cls, root_folder): + """递归读取文件夹下所有文件并计算MD5出现次数""" + import os + from collections import defaultdict + md5_dict = defaultdict(list) + + for root, _, files in os.walk(root_folder): + for file in files: + file_path = os.path.join(root, file) + file_md5 = cls.calculate_md5(file_path) + md5_dict[file_md5].append(file_path) + + return md5_dict + + def invoke(self, + limit, + album=None, + downloader=None, + delete_original_file=True, + **kwargs, + ) -> None: + if album is None: + return + + self.delete_original_file = delete_original_file + # 获取到下载本子所在根目录 + root_folder = self.option.dir_rule.decide_album_root_dir(album) + self.find_duplicated_files_and_delete(limit, root_folder, album) + + def find_duplicated_files_and_delete(self, limit: int, root_folder: str, album: Optional[JmAlbumDetail] = None): + md5_dict = self.find_duplicate_files(root_folder) + # 打印MD5出现次数大于等于limit的文件 + for md5, paths in md5_dict.items(): + if len(paths) >= limit: + prefix = '' if album is None else f'({album.album_id}) ' + message = [prefix + f'MD5: {md5} 出现次数: {len(paths)}'] + \ + [f' {path}' for path in paths] + self.log('\n'.join(message)) + self.execute_deletion(paths) diff --git a/src/jmcomic/jm_toolkit.py b/src/jmcomic/jm_toolkit.py index 8be3c803..fca01f2e 100644 --- a/src/jmcomic/jm_toolkit.py +++ b/src/jmcomic/jm_toolkit.py @@ -707,7 +707,7 @@ def save_resp_img(cls, resp: Any, filepath: str, need_convert=True): 如果需要改变图片的文件格式,比如 .jpg → .png,则需要指定参数 neet_convert=True. 如果不需要改变图片的文件格式,使用 need_convert=False,可以跳过PIL解析图片,效率更高. - :param resp: HTTP响应对象 + :param resp: JmImageResp :param filepath: 图片文件路径 :param need_convert: 是否转换图片 """ @@ -746,7 +746,7 @@ def decode_and_save(cls, # 无需解密,直接保存 if num == 0: - img_src.save(decoded_save_path) + cls.save_image(img_src, decoded_save_path) return import math