2024 update #9

Open · wants to merge 6 commits into master
Binary file added .DS_Store
Binary file not shown.
Binary file added downloads/.DS_Store
Binary file not shown.
Binary file added finals/.DS_Store
Binary file not shown.
249 changes: 189 additions & 60 deletions main.py
@@ -5,6 +5,7 @@
Download chapter 20 for the manga Ajin Miura Tsuina
$ python3 main.py -m http://mangapark.me/manga/ajin-miura-tsuina/ -chapter 20
"""
# Import statements
import re
import os
import sys
@@ -13,8 +14,20 @@
import img2pdf
from bs4 import BeautifulSoup
from PIL import Image

# from resizeimage import resizeimage
from urllib.parse import urljoin, urlparse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import zipfile

# The downloads/ and finals/ output directories are created on demand in the functions below.

def parse_url_to_manga_info(url):
    """
@@ -28,23 +41,55 @@ def parse_url_to_manga_info(url):
    title = url.split("/")[0]
    return title

def download_image_with_headers(img_url, dir_filename, os_dir):
    # Ensure the 'downloads' folder exists
    os.makedirs("downloads", exist_ok=True)

    # Extract the filename from 'dir_filename' (may be a bare name or a path)
    filename = os.path.basename(dir_filename)

    # Join the target directory with the image filename
    dir_filename = os.path.join(os_dir, filename)

    # Set a User-Agent to mimic a real browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }

    req = urllib.request.Request(img_url, headers=headers)

    # Download the image with the custom headers
    with urllib.request.urlopen(req) as response, open(dir_filename, 'wb') as out_file:
        out_file.write(response.read())
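
# A minimal usage sketch for the helper above (editor's note): the URL and
# directory are hypothetical and only illustrate the expected call shape.
#
#     download_image_with_headers(
#         "https://example.com/pages/001.jpg",  # image to fetch
#         "001.jpg",                            # used only for its basename
#         "downloads",                          # target directory
#     )
#     # -> writes downloads/001.jpg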

def parse_url_to_chapter_info(url):
    """
    Extract manga info from the URL, namely: (title, version, chapter, url)
    :param url: a string that denotes the URL
    :return: 4-tuple containing the manga's title, version, chapter and url
    """
    url = url.strip()  # Remove leading/trailing whitespace
    parsed_url = urlparse(url)

    # Prepend the base URL if the URL is relative or missing a scheme
    if parsed_url.scheme == '' or parsed_url.netloc == '':
        url = 'https://mangapark.me/' + url.lstrip('/')

    # Strip the parts of the URL we don't need
    url = re.sub(r"^https?://", '', url)     # Remove http:// or https://
    url = re.sub(r"mangapark\.me", '', url)  # Remove the domain
    url = re.sub(r"/manga/", '', url)        # Remove the "/manga/" path segment
    url = url.lstrip("/")                    # Guard against a leftover leading slash

    # The remainder should be title/version/chapter, optionally with a stream id
    url_parts = url.split("/")
    if len(url_parts) == 3:
        title, version, chapter = url_parts
    elif len(url_parts) == 4:
        title, _, version, chapter = url_parts
    else:
        print("The URL in question was: ", url)
        raise ValueError("Couldn't parse URL")

    return title, version, chapter, url
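
# Worked example (editor's note; the chapter URL is illustrative, reusing the
# Ajin example from the module docstring):
#
#     title, version, chapter, path = parse_url_to_chapter_info(
#         "http://mangapark.me/manga/ajin-miura-tsuina/v1/c20"
#     )
#     # title == "ajin-miura-tsuina", version == "v1",
#     # chapter == "c20", path == "ajin-miura-tsuina/v1/c20"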
@@ -100,57 +145,93 @@ def convert_to_pdf(os_dir, chapter, file_names):

print("Converting chapter %s to pdf..." % chapter)

pdf_bytes = None

try:
pdf_bytes = img2pdf.convert(*[download_image(path) for path in file_names])
except img2pdf.PdfTooLargeError:
# Sometimes the images are registered as having a dpi of 1.
# Because PDF has a limitation of 200 inches max per side, a
# special layout_fun has to be used, as to prevent an exception.
# default manga size 5"x7"

layout_fun = img2pdf.get_layout_fun(pagesize=(None, img2pdf.in_to_pt(7)),
imgsize=None, border=None,
fit=img2pdf.FitMode.into,
auto_orient=False)
pdf_bytes = img2pdf.convert(*[download_image(path) for path in file_names],
layout_fun=layout_fun)

file = open("%s/%s.pdf" % (os_dir, chapter), "wb")
file.write(pdf_bytes)
print("Conversion completed!")

image_paths = [os.path.join(os_dir, os.path.basename(path)) for path in file_names]

# Debugging: Print the paths to verify correctness
# print("Image paths for PDF conversion:", image_paths)

# Ensure the 'downloads' folder exists
os.makedirs("downloads", exist_ok=True)
os.makedirs("finals", exist_ok=True)

# Convert images to PDF using img2pdf
pdf_bytes = img2pdf.convert(image_paths)
output_pdf = os.path.join("finals", f"chapter_{chapter}.pdf")

# Save the PDF file directly in the 'downloads' folder
with open(output_pdf, "wb") as f:
f.write(pdf_bytes)

print(f"PDF saved as {output_pdf}")

def zip_final_pdfs(output_zip_path="finals/final_pdfs.zip"):
    pdf_folder = "finals"  # Folder holding the generated PDFs
    with zipfile.ZipFile(output_zip_path, 'w') as zipf:
        for root, _, files in os.walk(pdf_folder):
            for file in files:
                if file.endswith(".pdf"):
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, pdf_folder)  # Preserve folder structure in the ZIP
                    zipf.write(file_path, arcname)
    print(f"PDFs zipped into {output_zip_path}")

def download_chapter(url, height):
    """
    Downloads the chapter specified by the URL into your file directory
    :param url: string denoting the URL
    :param height: int denoting the height to resize each downloaded image to
    :return: None
    """
    if not url.startswith("http"):
        url = "https://mangapark.me" + url

    title, _, chapter, os_dir = parse_url_to_chapter_info(url)
    os_dir = os.path.join("downloads", title, f"chapter_{chapter}")
    os.makedirs(os_dir, exist_ok=True)

    # Set up the Selenium WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run Chrome in headless mode (no UI)
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        driver.get(url)  # Open the manga page

        # Wait for the "Close" button on the ad pop-up to appear, then click it
        try:
            # Adjust the selector to target the "Close" button in the ad
            close_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//span[text()="Close"]'))
            )
            close_button.click()
            time.sleep(1)  # Give the page a moment to settle (optional)
        except Exception as e:
            print("No pop-up detected or failed to close pop-up:", e)

        # Parse the page after closing the ad
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Find the page images
        imgs_wrappers = soup.find_all("img", {"class": "w-full h-full"})
        file_names = []
        for i in imgs_wrappers:
            img_url = strip_parameters_from_url(i['src'])
            filename = img_url.split('/')[-1]
            print(f"Downloading {title} {chapter} {filename}...")
            dir_filename = os.path.join(os_dir, filename)

            # Use the custom download helper with browser-like headers
            download_image_with_headers(img_url, filename, os_dir)

            # Resize the image to the requested height
            new_dir_filename = resize(dir_filename, height)
            file_names.append(new_dir_filename)

        convert_to_pdf(os_dir, chapter, file_names)
    finally:
        driver.quit()  # Close the Selenium WebDriver
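
# Usage sketch (editor's note; the chapter URL is illustrative): launches
# headless Chrome, saves the page images under downloads/<title>/chapter_<n>/
# and writes the chapter PDF under finals/.
#
#     download_chapter("https://mangapark.me/manga/ajin-miura-tsuina/v1/c20", 1000)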


def strip_parameters_from_url(url):
@@ -193,26 +274,74 @@ def download_manga(url, chapter=None, min_max=None, height=None):
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, "html.parser")

    # streams = soup.find_all("div", {"class": "stream"})
    # if not streams:
    #     raise ValueError("No streams found on the page. Check the URL or website structure.")

    # stream_lens = []
    # for stream in streams:
    #     chapters = stream.find_all("li")
    #     stream_lens += [len(chapters)]

    # max_stream_len = max(stream_lens)
    # max_idx = stream_lens.index(max_stream_len)
    # best_stream = streams[max_idx]

    # Judging by the commented-out code above, there used to be a div called "stream"
    # that, on the chapter-select screen, would pick the optimal server. Today a
    # different server can be picked from within the chapter page, and over the past
    # 8 years stream quality has improved.

    # chapters = best_stream.find_all("li")

    chapter_divs = soup.find_all("div", {"class": "space-x-1"})
    chapters = []

    for div in chapter_divs:
        chapter_link = div.find("a", {"class": "link-hover"})
        if chapter_link and "href" in chapter_link.attrs:
            chapter_url = chapter_link["href"]
            chapter_text = chapter_link.text.strip()

            # Check if the text starts with "Ch." or "Chapter"
            if chapter_text.startswith("Ch.") or chapter_text.startswith("Chapter"):
                try:
                    # Extract the chapter number
                    if chapter_text.startswith("Ch."):
                        chapter_no = int(chapter_text[3:])  # "Ch." prefix on earlier chapters
                    elif chapter_text.startswith("Chapter"):
                        chapter_no = int(chapter_text[7:].split(":")[0])  # "Chapter <n>: <name>" on newer chapters
                    chapters.append((chapter_no, chapter_url))
                except ValueError:
                    print(f"Skipping invalid chapter number: {chapter_text}")
                    continue

    # The loop below was removed in this update; it is no longer needed with the
    # latest implementation.
    # for c in chapters[::-1]:
    #     chapter_url = c.em.find_all("a")[-1]['href']
    #     chapter_no = float(parse_url_to_chapter_info(chapter_url)[2][1:])
    #
    #     if chapter and chapter_no == chapter:
    #         download_chapter(chapter_url, height)
    #         break
    #     elif min_max and min_max[0] <= chapter_no <= min_max[1]:
    #         download_chapter(chapter_url, height)

    # Process each chapter based on the specified criteria
    for chapter_no, chapter_url in sorted(chapters, reverse=True):  # Sort by chapter number (descending)
        print(f"Processing Chapter {chapter_no}: {chapter_url}")

        if not chapter_url.startswith("https"):
            chapter_url = "https://mangapark.me" + chapter_url  # Add the base URL

        # Validate and clean up the URL
        chapter_url = chapter_url.replace("///", "//")  # Fix triple slashes if they exist

        if chapter and chapter_no == chapter:
            download_chapter(chapter_url, height)
            break
        elif min_max and min_max[0] <= chapter_no <= min_max[1]:
            download_chapter(chapter_url, height)

    zip_final_pdfs()
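
# Usage sketch (editor's note; mirrors the readme examples, values hypothetical):
#
#     download_manga("http://mangapark.me/manga/ajin-miura-tsuina/", chapter=20, height=1000)
#     download_manga("http://mangapark.me/manga/ajin-miura-tsuina/", min_max=(19, 22), height=300)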


def main():
5 changes: 4 additions & 1 deletion readme.md
@@ -6,10 +6,13 @@ A script to download your favourite mangas on mangapark.me and convert them to `

```
# Download chapter 20 for the manga Ajin Miura Tsuina
$ python3 main.py -m http://mangapark.me/manga/ajin-miura-tsuina/ -c 20 --size 1000

# Download chapters 19 to 22 for the manga Ajin Miura Tsuina very small
$ python3 main.py -m http://mangapark.me/manga/ajin-miura-tsuina/ --chapters 19 22 --size 300

# Download chapters 22 to 24 for the manga Ajin Miura Tsuina
$ python3 main.py -m http://mangapark.me/manga/ajin-miura-tsuina/ -cs 22 24
```

`--size` is optional for both ways of downloading. Without it, images are not resized.