"""
Mangapark-DL: Downloads manga and converts to PDF for the site www.mangapark.com
Example:
Download chapter 20 for the manga Ajin Miura Tsuina
$ python3 main.py -m http://mangapark.me/manga/ajin-miura-tsuina/ -chapter 20
"""
import re
import os
import sys
import argparse
import urllib.request

import img2pdf
from bs4 import BeautifulSoup
from PIL import Image
from resizeimage import resizeimage


def parse_url_to_manga_info(url):
    """
    Extracts the title of a manga from a URL.

    :param url: a string that denotes the URL
    :return: the title of the manga
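
    Example (a small doctest sketch, using the URL from the module docstring):
    >>> parse_url_to_manga_info("http://mangapark.me/manga/ajin-miura-tsuina/")
    'ajin-miura-tsuina'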
"""
url = re.sub('http://', '', url)
url = re.sub('mangapark.me/manga/', '', url)
title = url.split("/")[0]
return title


def parse_url_to_chapter_info(url):
    """
    Extracts chapter info from the URL, namely the manga's title, version,
    chapter, and relative path.

    :param url: a string that denotes the URL
    :return: 4-tuple containing the manga's title, version, chapter and
        relative path (used as the download directory)
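
    Example (a doctest sketch; the version/chapter segments "s1"/"c20" are a
    hypothetical illustration of mangapark.me's URL layout):
    >>> parse_url_to_chapter_info("http://mangapark.me/manga/ajin-miura-tsuina/s1/c20")
    ('ajin-miura-tsuina', 's1', 'c20', 'ajin-miura-tsuina/s1/c20')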
"""
url = re.sub("http://", '', url)
url = re.sub("mangapark.me", '', url)
url = re.sub("/manga/", '', url)
if len(url.split("/")) == 3:
title, version, chapter = url.split("/")
elif len(url.split("/")) == 4:
title, _, version, chapter = url.split("/")
else:
raise ValueError("Couldn't parse URL")
return title, version, chapter, url


def ensure_directory_exist(directory):
    """
    Creates a directory if it doesn't exist yet.

    :param directory: directory file path
    :return: None
    """
    if not os.path.exists(directory):
        os.makedirs(directory)


def download_image(path):
    """
    Reads raw image data from the given source. Despite the name, this does
    not download anything: it reads a local file, or stdin when path is '-'.

    :param path: file path of the image, or '-' for stdin
    :return: raw image data as bytes
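
    Example (illustrative only; assumes a hypothetical local file, so it is
    not executed as a doctest):
    >>> raw = download_image("ajin-miura-tsuina/s1/c20/001.jpg")  # doctest: +SKIP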
"""
if path == '-':
raw_data = sys.stdin.buffer.read()
else:
try:
with open(path, "rb") as im:
raw_data = im.read()
except IsADirectoryError:
raise argparse.ArgumentTypeError(
"\"%s\" is a directory" % path)
except PermissionError:
raise argparse.ArgumentTypeError(
"\"%s\" permission denied" % path)
except FileNotFoundError:
raise argparse.ArgumentTypeError(
"\"%s\" does not exist" % path)
if len(raw_data) == 0:
raise argparse.ArgumentTypeError("\"%s\" is empty" % path)
return raw_data


def convert_to_pdf(os_dir, chapter, file_names):
    """
    Converts a collection of images to PDF format.

    :param os_dir: directory to save the PDF in
    :param chapter: title of the PDF
    :param file_names: images to construct the PDF from
    :return: None
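
    Example (illustrative only; the file names are hypothetical resized pages
    produced by resize(), so this is not executed as a doctest):
    >>> convert_to_pdf("ajin-miura-tsuina/s1/c20", "c20",
    ...                ["ajin-miura-tsuina/s1/c20/001.jpg.res"])  # doctest: +SKIP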
"""
print("Converting chapter %s to pdf..." % chapter)
pdf_bytes = None
try:
pdf_bytes = img2pdf.convert(*[download_image(path) for path in file_names])
except img2pdf.PdfTooLargeError:
# Sometimes the images are registered as having a dpi of 1.
# Because PDF has a limitation of 200 inches max per side, a
# special layout_fun has to be used, as to prevent an exception.
# default manga size 5"x7"
layout_fun = img2pdf.get_layout_fun(pagesize=(None, img2pdf.in_to_pt(7)),
imgsize=None, border=None,
fit=img2pdf.FitMode.into,
auto_orient=False)
pdf_bytes = img2pdf.convert(*[download_image(path) for path in file_names],
layout_fun=layout_fun)
file = open("%s/%s.pdf" % (os_dir, chapter), "wb")
file.write(pdf_bytes)
print("Conversion completed!")


def download_chapter(url, height):
    """
    Downloads the chapter at the given URL into the local file directory.

    :param url: string denoting the URL of the chapter
    :param height: int height (in pixels) to resize each page to, or None to
        keep the original size
    :return: None
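
    Example (illustrative only; performs network I/O, and the "s1"/"c20" URL
    segments are hypothetical):
    >>> download_chapter("http://mangapark.me/manga/ajin-miura-tsuina/s1/c20",
    ...                  None)  # doctest: +SKIP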
"""
title, _, chapter, os_dir = parse_url_to_chapter_info(url)
ensure_directory_exist(os_dir)
try:
page = urllib.request.urlopen(url)
except ValueError:
page = urllib.request.urlopen("http://mangapark.me" + url)
soup = BeautifulSoup(page, "html.parser")
imgs_wrappers = soup.find_all("a", {"class": "img-link"})
file_names = []
for i in imgs_wrappers:
img_url = strip_parameters_from_url(i.img['src'])
filename = img_url.split('/')[-1]
print("Downloading %s %s %s..." % (title, chapter, filename))
dir_filename = os_dir + "/" + os.path.basename(img_url)
urllib.request.urlretrieve(img_url, dir_filename)
new_dir_filename = resize(dir_filename, height)
file_names.append(new_dir_filename)
convert_to_pdf(os_dir, chapter, file_names)


def strip_parameters_from_url(url):
    """
    Parses the URL and strips away the query parameters.

    :param url: string URL with parameters
    :return: string URL without parameters
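
    Example (a doctest sketch with a hypothetical image URL):
    >>> strip_parameters_from_url("http://example.com/img/001.jpg?token=abc")
    'http://example.com/img/001.jpg'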
"""
return re.sub(r'\?.*', '', url)


def resize(filename, height=None):
    """
    Resizes the image to the given height, keeping the aspect ratio.

    :param filename: string path to the image file
    :param height: int target height in pixels, or None to skip resizing
    :return: new filename of the resized image
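
    Example (illustrative only; assumes a hypothetical local image file):
    >>> resize("ajin-miura-tsuina/s1/c20/001.jpg", 1024)  # doctest: +SKIP
    'ajin-miura-tsuina/s1/c20/001.jpg.res'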
"""
if height is None:
return filename
print("Resizing %s to %spx height..." % (filename, height))
with open(filename, 'r+b') as f:
with Image.open(f) as image:
cover = resizeimage.resize_height(image, height)
new_filename = filename + '.res'
cover.save(new_filename, image.format)
return new_filename


def download_manga(url, chapter=None, min_max=None, height=None):
    """
    Downloads the chapters of a manga.

    :param url: string URL of the manga
    :param chapter: chapter number to download; if no chapter is specified,
        the min_max parameter is used instead
    :param min_max: the inclusive range of chapters to download,
        e.g. [1, 10] -> chapters 1 to 10
    :param height: the height to which all images are resized (keeping the
        aspect ratio), or None to keep the original size
    :return: None
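
    Example (illustrative only; performs network I/O):
    >>> download_manga("http://mangapark.me/manga/ajin-miura-tsuina/",
    ...                min_max=[1, 10])  # doctest: +SKIP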
"""
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "html.parser")
streams = soup.find_all("div", {"class": "stream"})
stream_lens = []
for stream in streams:
chapters = stream.find_all("li")
stream_lens += [len(chapters)]
max_stream_len = max(stream_lens)
max_idx = stream_lens.index(max_stream_len)
best_stream = streams[max_idx]
chapters = best_stream.find_all("li")
for c in chapters[::-1]:
chapter_url = c.em.find_all("a")[-1]['href']
chapter_no = float(parse_url_to_chapter_info(chapter_url)[2][1:])
if chapter and chapter_no == chapter:
download_chapter(chapter_url, height)
break
elif min_max and min_max[0] <= chapter_no <= min_max[1]:
download_chapter(chapter_url, height)


def main():
    """
    Parses the command-line arguments and downloads the requested manga.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--manga-url', help="The url of the mangapark manga to download")
    parser.add_argument('-s', '--size', '--height', type=int,
                        help='Height to resize images to (it will keep the aspect ratio)')
    parser.add_argument('-c', '--chapter', help="The chapter number that you specifically want to download")
    parser.add_argument('-cs', '--chapters', nargs=2, help="An inclusive range of chapters you want to download")
    args = parser.parse_args()
    print(args)
    if args.manga_url is None:
        print("Please specify the URL of the manga on mangapark.me")
        return
    elif args.chapters is not None:
        assert isinstance(args.chapters, list)
        download_manga(args.manga_url, min_max=[float(x) for x in args.chapters], height=args.size)
    elif args.chapter is not None:
        # Parse as float so fractional chapter numbers (e.g. 20.5) also match.
        download_manga(args.manga_url, chapter=float(args.chapter), height=args.size)


if __name__ == "__main__":
    main()