Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

功能需求:过滤重复图片 #245

Merged
merged 8 commits into from
May 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions assets/docs/sources/option_file_syntax.md
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,17 @@ plugins:

zip_dir: D:/jmcomic/zip/ # 压缩文件存放的文件夹
delete_original_file: true # 压缩成功后,删除所有原文件和文件夹

# 删除重复文件插件
# 参考 → [https://github.com/hect0x7/JMComic-Crawler-Python/issues/244]
- plugin: delete_duplicated_files
kwargs:
# limit: 必填,表示对md5出现次数的限制
limit: 3
# 如果文件的md5的出现次数 >= limit,是否要删除
# 如果delete_original_file不配置,此插件只会打印信息,不会执行其他操作
# 如果limit=1, delete_original_file=true 效果会是删除所有文件
delete_original_file: true

- plugin: send_qq_email # 发送qq邮件插件
kwargs:
Expand Down
2 changes: 1 addition & 1 deletion assets/option/option_workflow_download.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,4 @@ plugins:
msg_to: ${EMAIL_TO}
password: ${EMAIL_PASS}
title: ${EMAIL_TITLE}
content: ${EMAIL_CONTENT}
content: ${EMAIL_CONTENT}
2 changes: 1 addition & 1 deletion src/jmcomic/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# 被依赖方 <--- 使用方
# config <--- entity <--- toolkit <--- client <--- option <--- downloader

__version__ = '2.5.11'
__version__ = '2.5.12'

from .api import *
from .jm_plugin import *
Comment on lines 7 to 8
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Replace star imports with explicit imports to improve code clarity and maintainability.

- from .api import *
- from .jm_plugin import *
+ from .api import SpecificClass1, SpecificClass2
+ from .jm_plugin import SpecificPlugin1, SpecificPlugin2

Committable suggestion was skipped due to low confidence.

Expand Down
2 changes: 1 addition & 1 deletion src/jmcomic/jm_client_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,6 @@ def is_given_type(self, ctype: Type['JmcomicClient']) -> bool:
"""
if isinstance(self, ctype):
return True
if self.client_key == instance.client_key:
if self.client_key == ctype.client_key:
return True
return False
7 changes: 5 additions & 2 deletions src/jmcomic/jm_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,14 +109,17 @@ class JmModuleConfig:
DOMAIN_IMAGE_LIST = str_to_list('''
cdn-msp.jmapinodeudzn.net
cdn-msp2.jmapinodeudzn.net
cdn-msp2.jmapiproxy3.cc
cdn-msp3.jmapinodeudzn.net
''')

# 移动端API域名
DOMAIN_API_LIST = str_to_list('''
www.jmapinodeudzn.xyz
www.jmapinode.vip
www.jmapinode.biz
www.cdn-eldenringproxy.xyz
www.cdn-eldenringproxy.me
www.cdn-eldenringproxy.vip
www.jmapinode.xyz
''')

Expand Down
43 changes: 25 additions & 18 deletions src/jmcomic/jm_option.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,9 @@ class DirRule:

Detail = Union[JmAlbumDetail, JmPhotoDetail, None]
RuleFunc = Callable[[Detail], str]
RuleSolver = Tuple[int, RuleFunc, str]
RuleSolver = Tuple[str, RuleFunc, str]
RuleSolverList = List[RuleSolver]

rule_solver_cache: Dict[str, RuleSolver] = {}

def __init__(self, rule: str, base_dir=None):
base_dir = JmcomicText.parse_to_abspath(base_dir)
self.base_dir = base_dir
Expand All @@ -100,6 +98,25 @@ def decide_image_save_dir(self,

return fix_filepath('/'.join(path_ls), is_dir=True)

def decide_album_root_dir(self, album: JmAlbumDetail) -> str:
    """
    Build the root directory path of the given album.

    Walks ``self.solver_list`` and keeps only the entries whose key is
    'Bd' or 'A'; every other entry is skipped. Each kept entry is applied
    through ``self.apply_rule_solver(album, None, solver)`` and its result
    becomes one path segment.

    :param album: album whose root directory is wanted
    :return: the segments joined with '/', passed through
        ``fix_filepath(..., is_dir=True)``
    :raises BaseException: anything raised while applying a solver is
        logged via ``jm_log`` and then re-raised
    """
    album_level_keys = ('Bd', 'A')
    segments = []

    for rule_solver in self.solver_list:
        key, _func, rule = rule_solver
        if key not in album_level_keys:
            continue

        try:
            segment = self.apply_rule_solver(album, None, rule_solver)
        except BaseException as e:
            # Record which rule failed before propagating the error.
            jm_log('dir_rule', f'路径规则"{rule}"的解析出错: {e}, album={album}')
            raise e

        segments.append(str(segment))

    joined = '/'.join(segments)
    return fix_filepath(joined, is_dir=True)

def get_role_solver_list(self, rule_dsl: str, base_dir: str) -> RuleSolverList:
"""
解析下载路径dsl,得到一个路径规则解析列表
Expand All @@ -111,7 +128,7 @@ def get_role_solver_list(self, rule_dsl: str, base_dir: str) -> RuleSolverList:
for rule in rule_list:
rule = rule.strip()
if rule == 'Bd':
solver_ls.append((0, lambda _: base_dir, 'Bd'))
solver_ls.append(('Bd', lambda _: base_dir, 'Bd'))
continue

rule_solver = self.get_rule_solver(rule)
Expand All @@ -137,24 +154,14 @@ def split_rule_dsl(self, rule_dsl: str) -> List[str]:

@classmethod
def get_rule_solver(cls, rule: str) -> Optional[RuleSolver]:
# 查找缓存
if rule in cls.rule_solver_cache:
return cls.rule_solver_cache[rule]

# 检查dsl
if not rule.startswith(('A', 'P')):
return None

# Axxx or Pyyy
key = 1 if rule[0] == 'A' else 2

def solve_func(detail):
return fix_windir_name(str(DetailEntity.get_dirname(detail, rule[1:])))

# 保存缓存
rule_solver = (key, solve_func, rule)
cls.rule_solver_cache[rule] = rule_solver
return rule_solver
return rule[0], solve_func, rule

@classmethod
def apply_rule_solver(cls, album, photo, rule_solver: RuleSolver) -> str:
Expand All @@ -168,11 +175,11 @@ def apply_rule_solver(cls, album, photo, rule_solver: RuleSolver) -> str:
"""

def choose_detail(key):
if key == 0:
if key == 'Bd':
return None
if key == 1:
if key == 'A':
return album
if key == 2:
if key == 'P':
return photo

key, func, _ = rule_solver
Expand Down
59 changes: 59 additions & 0 deletions src/jmcomic/jm_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -1035,3 +1035,62 @@ def try_mark_photo_skip_and_log(self, photo: JmPhotoDetail, at_least_image_count
@field_cache() # 单例
def build(cls, option: JmOption) -> 'JmOptionPlugin':
return super().build(option)


class DeleteDuplicatedFilesPlugin(JmOptionPlugin):
"""
https://github.com/hect0x7/JMComic-Crawler-Python/issues/244
"""
plugin_key = 'delete_duplicated_files'

@classmethod
def calculate_md5(cls, file_path):
    """
    Compute the MD5 hash of a file.

    The file is opened in binary mode and fed to the hash in 4096-byte
    chunks, so the whole file is never read into memory at once.

    :param file_path: path of the file to hash
    :return: the MD5 digest as a hexadecimal string
    :raises OSError: if the file cannot be opened or read
    """
    # The docstring must be the first statement of the function; placed
    # after the import it would just be a discarded string expression.
    import hashlib

    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        # iter(callable, sentinel) keeps reading until read() returns b"" (EOF).
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
Comment on lines +1046 to +1055
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ensure proper error handling in calculate_md5.

Consider adding error handling for file operations to prevent the application from crashing if the file is inaccessible or corrupt. Here's a suggested modification:

- with open(file_path, "rb") as f:
+ try:
+     with open(file_path, "rb") as f:
+         for chunk in iter(lambda: f.read(4096), b""):
+             hash_md5.update(chunk)
+ except IOError as e:
+     print(f"Error reading file {file_path}: {e}")
+     return None

Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

Suggested change
@classmethod
def calculate_md5(cls, file_path):
import hashlib
"""计算文件的MD5哈希值"""
hash_md5 = hashlib.md5()
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
@classmethod
def calculate_md5(cls, file_path):
import hashlib
"""计算文件的MD5哈希值"""
hash_md5 = hashlib.md5()
try:
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
except IOError as e:
print(f"Error reading file {file_path}: {e}")
return None
return hash_md5.hexdigest()


@classmethod
def find_duplicate_files(cls, root_folder):
    """
    Recursively group every file under ``root_folder`` by its MD5 hash.

    :param root_folder: directory to walk (sub-directories included)
    :return: a ``defaultdict(list)`` mapping each MD5 value, as returned by
        ``cls.calculate_md5``, to the list of file paths that produced it
    """
    import os
    from collections import defaultdict

    # Lazily enumerate the full path of every regular entry os.walk reports.
    every_file = (
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(root_folder)
        for filename in filenames
    )

    paths_by_md5 = defaultdict(list)
    for full_path in every_file:
        digest = cls.calculate_md5(full_path)
        paths_by_md5[digest].append(full_path)

    return paths_by_md5
Comment on lines +1057 to +1070
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Optimize file handling in find_duplicate_files.

Consider using pathlib for more robust and readable file path operations. Here's a suggested refactor:

- import os
+ from pathlib import Path

- for root, _, files in os.walk(root_folder):
+ for file_path in Path(root_folder).rglob('*'):
-    file_path = os.path.join(root, file)
-    file_md5 = cls.calculate_md5(file_path)
+    file_md5 = cls.calculate_md5(str(file_path))

Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

Suggested change
@classmethod
def find_duplicate_files(cls, root_folder):
"""递归读取文件夹下所有文件并计算MD5出现次数"""
import os
from collections import defaultdict
md5_dict = defaultdict(list)
for root, _, files in os.walk(root_folder):
for file in files:
file_path = os.path.join(root, file)
file_md5 = cls.calculate_md5(file_path)
md5_dict[file_md5].append(file_path)
return md5_dict
@classmethod
def find_duplicate_files(cls, root_folder):
"""递归读取文件夹下所有文件并计算MD5出现次数"""
from pathlib import Path
from collections import defaultdict
md5_dict = defaultdict(list)
for file_path in Path(root_folder).rglob('*'):
file_md5 = cls.calculate_md5(str(file_path))
md5_dict[file_md5].append(str(file_path))
return md5_dict


def invoke(self,
           limit,
           album=None,
           downloader=None,
           delete_original_file=True,
           **kwargs,
           ) -> None:
    """
    Plugin entry point: scan the album's root directory for duplicated files.

    :param limit: forwarded unchanged to ``find_duplicated_files_and_delete``
    :param album: the album being processed; when it is None nothing is done
    :param downloader: accepted but not used by this method
    :param delete_original_file: stored on ``self.delete_original_file``
        (defaults to True)
    :param kwargs: any extra plugin arguments; ignored here
    """
    if album is not None:
        self.delete_original_file = delete_original_file
        # Resolve the root directory the album was downloaded into.
        album_root = self.option.dir_rule.decide_album_root_dir(album)
        self.find_duplicated_files_and_delete(limit, album_root, album)

def find_duplicated_files_and_delete(self, limit: int, root_folder: str, album: Optional[JmAlbumDetail] = None):
    """
    Report every MD5 that occurs at least ``limit`` times under ``root_folder``.

    For each such MD5 one log message is emitted (the MD5, its occurrence
    count and the matching paths, one per line) and the matching paths are
    handed to ``self.execute_deletion``.

    :param limit: minimum number of files sharing an MD5 for it to be reported
    :param root_folder: directory passed to ``self.find_duplicate_files``
    :param album: optional album; when given, its ``album_id`` prefixes the log message
    """
    for md5, paths in self.find_duplicate_files(root_folder).items():
        occurrences = len(paths)
        if occurrences < limit:
            continue

        prefix = f'({album.album_id}) ' if album is not None else ''
        lines = [prefix + f'MD5: {md5} 出现次数: {occurrences}']
        lines.extend(f' {path}' for path in paths)

        self.log('\n'.join(lines))
        self.execute_deletion(paths)
Comment on lines +1040 to +1096
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Review the DeleteDuplicatedFilesPlugin class for potential improvements.

The DeleteDuplicatedFilesPlugin class has been well-implemented with methods to handle file duplication based on MD5 hashes. However, consider the following improvements for better error handling and performance:

  1. Error Handling in calculate_md5:
    Ensure that the method gracefully handles exceptions that may occur during file operations, such as a file being inaccessible.
- with open(file_path, "rb") as f:
+ try:
+     with open(file_path, "rb") as f:
+         for chunk in iter(lambda: f.read(4096), b""):
+             hash_md5.update(chunk)
+ except IOError as e:
+     print(f"Error reading file {file_path}: {e}")
+     return None
  2. Use of pathlib in find_duplicate_files:
    For better readability and robustness, consider using pathlib instead of os.path for file path manipulations.
- import os
+ from pathlib import Path

- for root, _, files in os.walk(root_folder):
+ for file_path in Path(root_folder).rglob('*'):
-    file_path = os.path.join(root, file)
-    file_md5 = cls.calculate_md5(file_path)
+    file_md5 = cls.calculate_md5(str(file_path))
  3. Clarification when album is None in invoke:
    The method returns immediately if album is None. It might be helpful to log a message or raise an exception to inform the user why the operation was not performed.
- if album is None:
+ if album is None:
+     self.log("No album provided, skipping duplicate file deletion.")
+     return

4 changes: 2 additions & 2 deletions src/jmcomic/jm_toolkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -707,7 +707,7 @@ def save_resp_img(cls, resp: Any, filepath: str, need_convert=True):
如果需要改变图片的文件格式,比如 .jpg → .png,则需要指定参数 need_convert=True.
如果不需要改变图片的文件格式,使用 need_convert=False,可以跳过PIL解析图片,效率更高.
:param resp: HTTP响应对象
:param resp: JmImageResp
:param filepath: 图片文件路径
:param need_convert: 是否转换图片
"""
Expand Down Expand Up @@ -746,7 +746,7 @@ def decode_and_save(cls,

# 无需解密,直接保存
if num == 0:
img_src.save(decoded_save_path)
cls.save_image(img_src, decoded_save_path)
return

import math
Expand Down
Loading