2024 update #9

Open · wants to merge 6 commits into master
Binary file added .DS_Store
Binary file not shown.
Binary file added downloads/.DS_Store
Binary file not shown.
Binary file added finals/.DS_Store
Binary file not shown.
249 changes: 189 additions & 60 deletions main.py
@@ -5,6 +5,7 @@
Download chapter 20 for the manga Ajin Miura Tsuina
$ python3 main.py -m http://mangapark.me/manga/ajin-miura-tsuina/ -chapter 20
"""
# Import statements
import re
import os
import sys
@@ -13,8 +14,20 @@
import img2pdf
from bs4 import BeautifulSoup
from PIL import Image

# from resizeimage import resizeimage
from urllib.parse import urljoin, urlparse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import zipfile

# The downloads/ and finals/ output directories are created on demand in the functions below.

def parse_url_to_manga_info(url):
    """
@@ -28,23 +41,55 @@ def parse_url_to_manga_info(url):
    title = url.split("/")[0]
    return title

def download_image_with_headers(img_url, dir_filename, os_dir):
    # Ensure the 'downloads' folder exists
    os.makedirs("downloads", exist_ok=True)

    # Extract the filename from 'dir_filename' (may be a bare name or a path)
    filename = os.path.basename(dir_filename)

    # Join the target directory with the image filename
    dir_filename = os.path.join(os_dir, filename)

    # Set a User-Agent to mimic a real browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }

    req = urllib.request.Request(img_url, headers=headers)

    # Download the image with the custom headers
    with urllib.request.urlopen(req) as response, open(dir_filename, 'wb') as out_file:
        out_file.write(response.read())
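
# A minimal usage sketch for the helper above (editor's note): the URL and
# directory are hypothetical and only illustrate the expected call shape.
#
#     download_image_with_headers(
#         "https://example.com/pages/001.jpg",  # image to fetch
#         "001.jpg",                            # used only for its basename
#         "downloads",                          # target directory
#     )
#     # -> writes downloads/001.jpg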

def parse_url_to_chapter_info(url):
    """
    Extract manga info from the URL, namely: (title, version, chapter, url)
    :param url: a string that denotes the URL
    :return: 4-tuple containing the manga's title, version, chapter and url
    """
    url = url.strip()  # Remove leading/trailing whitespace
    parsed_url = urlparse(url)

    # Prepend the base URL if the URL is relative or missing a scheme
    if parsed_url.scheme == '' or parsed_url.netloc == '':
        url = 'https://mangapark.me/' + url.lstrip('/')

    # Strip the parts of the URL we don't need
    url = re.sub(r"^https?://", '', url)     # Remove http:// or https://
    url = re.sub(r"mangapark\.me", '', url)  # Remove the domain
    url = re.sub(r"/manga/", '', url)        # Remove the "/manga/" path segment
    url = url.lstrip("/")                    # Guard against a leftover leading slash

    # The remainder should be title/version/chapter, optionally with a stream id
    url_parts = url.split("/")
    if len(url_parts) == 3:
        title, version, chapter = url_parts
    elif len(url_parts) == 4:
        title, _, version, chapter = url_parts
    else:
        print("The URL in question was: ", url)
        raise ValueError("Couldn't parse URL")

    return title, version, chapter, url
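
# Worked example (editor's note; the chapter URL is illustrative, reusing the
# Ajin example from the module docstring):
#
#     title, version, chapter, path = parse_url_to_chapter_info(
#         "http://mangapark.me/manga/ajin-miura-tsuina/v1/c20"
#     )
#     # title == "ajin-miura-tsuina", version == "v1",
#     # chapter == "c20", path == "ajin-miura-tsuina/v1/c20"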
@@ -100,57 +145,93 @@ def convert_to_pdf(os_dir, chapter, file_names):

print("Converting chapter %s to pdf..." % chapter)

pdf_bytes = None

try:
pdf_bytes = img2pdf.convert(*[download_image(path) for path in file_names])
except img2pdf.PdfTooLargeError:
# Sometimes the images are registered as having a dpi of 1.
# Because PDF has a limitation of 200 inches max per side, a
# special layout_fun has to be used, as to prevent an exception.
# default manga size 5"x7"

layout_fun = img2pdf.get_layout_fun(pagesize=(None, img2pdf.in_to_pt(7)),
imgsize=None, border=None,
fit=img2pdf.FitMode.into,
auto_orient=False)
pdf_bytes = img2pdf.convert(*[download_image(path) for path in file_names],
layout_fun=layout_fun)

file = open("%s/%s.pdf" % (os_dir, chapter), "wb")
file.write(pdf_bytes)
print("Conversion completed!")

image_paths = [os.path.join(os_dir, os.path.basename(path)) for path in file_names]

# Debugging: Print the paths to verify correctness
# print("Image paths for PDF conversion:", image_paths)

# Ensure the 'downloads' folder exists
os.makedirs("downloads", exist_ok=True)
os.makedirs("finals", exist_ok=True)

# Convert images to PDF using img2pdf
pdf_bytes = img2pdf.convert(image_paths)
output_pdf = os.path.join("finals", f"chapter_{chapter}.pdf")

# Save the PDF file directly in the 'downloads' folder
with open(output_pdf, "wb") as f:
f.write(pdf_bytes)

print(f"PDF saved as {output_pdf}")

def zip_final_pdfs(output_zip_path="finals/final_pdfs.zip"):
    pdf_folder = "finals"  # Folder holding the generated PDFs
    with zipfile.ZipFile(output_zip_path, 'w') as zipf:
        for root, _, files in os.walk(pdf_folder):
            for file in files:
                if file.endswith(".pdf"):
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, pdf_folder)  # Preserve folder structure in the ZIP
                    zipf.write(file_path, arcname)
    print(f"PDFs zipped into {output_zip_path}")

def download_chapter(url, height):
    """
    Downloads the chapter specified by the URL into your file directory
    :param url: string denoting the URL
    :param height: int denoting the height to resize each downloaded image to
    :return: None
    """
    if not url.startswith("http"):
        url = "https://mangapark.me" + url

    title, _, chapter, os_dir = parse_url_to_chapter_info(url)
    os_dir = os.path.join("downloads", title, f"chapter_{chapter}")
    os.makedirs(os_dir, exist_ok=True)

    # Set up the Selenium WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run Chrome in headless mode (no UI)
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        driver.get(url)  # Open the manga page

        # Wait for the "Close" button on the ad pop-up to appear, then click it
        try:
            # Adjust the selector to target the "Close" button in the ad
            close_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//span[text()="Close"]'))
            )
            close_button.click()
            time.sleep(1)  # Give the page a moment to settle (optional)
        except Exception as e:
            print("No pop-up detected or failed to close pop-up:", e)

        # Parse the page after closing the ad
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Find the page images
        imgs_wrappers = soup.find_all("img", {"class": "w-full h-full"})
        file_names = []
        for i in imgs_wrappers:
            img_url = strip_parameters_from_url(i['src'])
            filename = img_url.split('/')[-1]
            print(f"Downloading {title} {chapter} {filename}...")
            dir_filename = os.path.join(os_dir, filename)

            # Use the custom download helper with browser-like headers
            download_image_with_headers(img_url, filename, os_dir)

            # Resize the image to the requested height
            new_dir_filename = resize(dir_filename, height)
            file_names.append(new_dir_filename)

        convert_to_pdf(os_dir, chapter, file_names)
    finally:
        driver.quit()  # Close the Selenium WebDriver
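
# Usage sketch (editor's note; the chapter URL is illustrative): launches
# headless Chrome, saves the page images under downloads/<title>/chapter_<n>/
# and writes the chapter PDF under finals/.
#
#     download_chapter("https://mangapark.me/manga/ajin-miura-tsuina/v1/c20", 1000)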


def strip_parameters_from_url(url):
@@ -193,26 +274,74 @@ def download_manga(url, chapter=None, min_max=None, height=None):
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, "html.parser")

    # streams = soup.find_all("div", {"class": "stream"})
    # if not streams:
    #     raise ValueError("No streams found on the page. Check the URL or website structure.")

    # stream_lens = []
    # for stream in streams:
    #     chapters = stream.find_all("li")
    #     stream_lens += [len(chapters)]

    # max_stream_len = max(stream_lens)
    # max_idx = stream_lens.index(max_stream_len)
    # best_stream = streams[max_idx]

    # Judging by the commented-out code above, there used to be a div called "stream"
    # that, on the chapter-select screen, would pick the optimal server. Today a
    # different server can be picked from within the chapter page, and over the past
    # 8 years stream quality has improved.

    # chapters = best_stream.find_all("li")

    chapter_divs = soup.find_all("div", {"class": "space-x-1"})
    chapters = []

    for div in chapter_divs:
        chapter_link = div.find("a", {"class": "link-hover"})
        if chapter_link and "href" in chapter_link.attrs:
            chapter_url = chapter_link["href"]
            chapter_text = chapter_link.text.strip()

            # Check if the text starts with "Ch." or "Chapter"
            if chapter_text.startswith("Ch.") or chapter_text.startswith("Chapter"):
                try:
                    # Extract the chapter number
                    if chapter_text.startswith("Ch."):
                        chapter_no = int(chapter_text[3:])  # "Ch." prefix on earlier chapters
                    elif chapter_text.startswith("Chapter"):
                        chapter_no = int(chapter_text[7:].split(":")[0])  # "Chapter <n>: <name>" on newer chapters
                    chapters.append((chapter_no, chapter_url))
                except ValueError:
                    print(f"Skipping invalid chapter number: {chapter_text}")
                    continue

    # The loop below was removed in this update; it is no longer needed with the
    # latest implementation.
    # for c in chapters[::-1]:
    #     chapter_url = c.em.find_all("a")[-1]['href']
    #     chapter_no = float(parse_url_to_chapter_info(chapter_url)[2][1:])
    #
    #     if chapter and chapter_no == chapter:
    #         download_chapter(chapter_url, height)
    #         break
    #     elif min_max and min_max[0] <= chapter_no <= min_max[1]:
    #         download_chapter(chapter_url, height)

    # Process each chapter based on the specified criteria
    for chapter_no, chapter_url in sorted(chapters, reverse=True):  # Sort by chapter number (descending)
        print(f"Processing Chapter {chapter_no}: {chapter_url}")

        if not chapter_url.startswith("https"):
            chapter_url = "https://mangapark.me" + chapter_url  # Add the base URL

        # Validate and clean up the URL
        chapter_url = chapter_url.replace("///", "//")  # Fix triple slashes if they exist

        if chapter and chapter_no == chapter:
            download_chapter(chapter_url, height)
            break
        elif min_max and min_max[0] <= chapter_no <= min_max[1]:
            download_chapter(chapter_url, height)

    zip_final_pdfs()
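
# Usage sketch (editor's note; mirrors the readme examples, values hypothetical):
#
#     download_manga("http://mangapark.me/manga/ajin-miura-tsuina/", chapter=20, height=1000)
#     download_manga("http://mangapark.me/manga/ajin-miura-tsuina/", min_max=(19, 22), height=300)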


def main():
5 changes: 4 additions & 1 deletion readme.md
@@ -6,10 +6,13 @@ A script to download your favourite mangas on mangapark.me and convert them to `

```
# Download chapter 20 for the manga Ajin Miura Tsuina
$ python3 main.py -m http://mangapark.me/manga/ajin-miura-tsuina/ -c 20 --size 1000

# Download chapters 19 to 22 for the manga Ajin Miura Tsuina very small
$ python3 main.py -m http://mangapark.me/manga/ajin-miura-tsuina/ --chapters 19 22 --size 300

# Download chapters 22 to 24 for the manga Ajin Miura Tsuina
$ python3 main.py -m http://mangapark.me/manga/ajin-miura-tsuina/ -cs 22 24
```

`--size` is optional for both ways of downloading. Without it, images are not resized.