diff --git a/main.py b/main.py
index 8af2f62..650f564 100755
--- a/main.py
+++ b/main.py
@@ -5,6 +5,7 @@ Download chapter 20 for the manga Ajin Miura Tsuina
 $ python3 main.py -m http://mangapark.me/manga/ajin-miura-tsuina/ -chapter 20
 """
+# Import statements
 import re
 import os
 import sys
@@ -13,8 +14,19 @@ import img2pdf
 from bs4 import BeautifulSoup
 from PIL import Image
-from resizeimage import resizeimage
+# from resizeimage import resizeimage
+from urllib.parse import urljoin, urlparse
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from webdriver_manager.chrome import ChromeDriverManager
+import time
+import zipfile
+
+# The "downloads" and "finals" output directories are created on demand below.

 def parse_url_to_manga_info(url):
     """
@@ -28,23 +41,56 @@ def parse_url_to_manga_info(url):
     title = url.split("/")[0]
     return title

+def download_image_with_headers(img_url, dir_filename, os_dir):
+    # Ensure the 'downloads' folder exists
+    os.makedirs("downloads", exist_ok=True)
+
+    # Reduce the provided path to a bare filename
+    filename = os.path.basename(dir_filename)
+
+    # Join the target folder with the image filename
+    dir_filename = os.path.join(os_dir, filename)
+
+    # Set a User-Agent to mimic a real browser
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
+    }
+
+    req = urllib.request.Request(img_url, headers=headers)
+
+    # Download the image with the custom headers
+    with urllib.request.urlopen(req) as response, open(dir_filename, 'wb') as out_file:
+        out_file.write(response.read())

 def parse_url_to_chapter_info(url):
     """
-    Extract manga info from the URL, namely: ()
+    Extract manga info from the URL, namely: (title, version, chapter, url)
     :param url: a string that denotes the URL
     :return: 4-tuple containing the manga's title, version, chapter and url
     """
+    # Normalise the URL before parsing it.
+    url = url.strip()  # Remove leading/trailing whitespace
+    parsed_url = urlparse(url)
+
+    # Fix the URL if it is relative or missing its scheme
+    if parsed_url.scheme == '' or parsed_url.netloc == '':
+        url = 'https://mangapark.me/' + url.lstrip('/')  # Prepend the base URL
+
+    # Clean up the URL by removing the parts we don't need
+    url = re.sub(r"^https?://", '', url)     # Remove http:// or https://
+    url = re.sub(r"mangapark\.me", '', url)  # Remove the domain
+    url = re.sub(r"/manga/", '', url)        # Remove the "/manga/" path segment
+
+    # Ensure the URL structure is correct
+    url_parts = url.split("/")
+    if len(url_parts) == 3:
+        title, version, chapter = url_parts
+    elif len(url_parts) == 4:
+        title, _, version, chapter = url_parts
-    url = re.sub("http://", '', url)
-    url = re.sub("mangapark.me", '', url)
-    url = re.sub("/manga/", '', url)
-
-    if len(url.split("/")) == 3:
-        title, version, chapter = url.split("/")
-    elif len(url.split("/")) == 4:
-        title, _, version, chapter = url.split("/")
     else:
+        print("The URL in question was:", url)
         raise ValueError("Couldn't parse URL")

     return title, version, chapter, url
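For reference, a quick sanity check of the new parser. This is a minimal sketch, assuming main.py is importable as a module; the URL below is a hypothetical example, and real mangapark.me paths may differ:

```python
# Sketch of what the new parse_url_to_chapter_info() returns.
# The URL is a made-up example, not taken from the site.
from main import parse_url_to_chapter_info

title, version, chapter, path = parse_url_to_chapter_info(
    "https://mangapark.me/manga/ajin-miura-tsuina/s1/c20"
)
# After stripping the scheme, domain and "/manga/" prefix, the remaining
# "ajin-miura-tsuina/s1/c20" splits into exactly three parts:
assert (title, version, chapter) == ("ajin-miura-tsuina", "s1", "c20")
```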
@@ -100,27 +145,35 @@ def convert_to_pdf(os_dir, chapter, file_names):
     print("Converting chapter %s to pdf..." % chapter)

-    pdf_bytes = None
-
-    try:
-        pdf_bytes = img2pdf.convert(*[download_image(path) for path in file_names])
-    except img2pdf.PdfTooLargeError:
-        # Sometimes the images are registered as having a dpi of 1.
-        # Because PDF has a limitation of 200 inches max per side, a
-        # special layout_fun has to be used, as to prevent an exception.
-        # default manga size 5"x7"
-
-        layout_fun = img2pdf.get_layout_fun(pagesize=(None, img2pdf.in_to_pt(7)),
-                                            imgsize=None, border=None,
-                                            fit=img2pdf.FitMode.into,
-                                            auto_orient=False)
-        pdf_bytes = img2pdf.convert(*[download_image(path) for path in file_names],
-                                    layout_fun=layout_fun)
-
-    file = open("%s/%s.pdf" % (os_dir, chapter), "wb")
-    file.write(pdf_bytes)
-    print("Conversion completed!")
-
+    image_paths = [os.path.join(os_dir, os.path.basename(path)) for path in file_names]
+
+    # Debugging: print the paths to verify correctness
+    # print("Image paths for PDF conversion:", image_paths)
+
+    # Ensure the output folders exist
+    os.makedirs("downloads", exist_ok=True)
+    os.makedirs("finals", exist_ok=True)
+
+    # Convert the images to PDF using img2pdf
+    pdf_bytes = img2pdf.convert(image_paths)
+    output_pdf = os.path.join("finals", f"chapter_{chapter}.pdf")
+
+    # Save the PDF into the 'finals' folder
+    with open(output_pdf, "wb") as f:
+        f.write(pdf_bytes)
+
+    print(f"PDF saved as {output_pdf}")
+
+def zip_final_pdfs(output_zip_path="finals/final_pdfs.zip"):
+    pdf_folder = "finals"  # The folder convert_to_pdf() writes into
+    with zipfile.ZipFile(output_zip_path, 'w') as zipf:
+        for root, _, files in os.walk(pdf_folder):
+            for file in files:
+                if file.endswith(".pdf"):
+                    file_path = os.path.join(root, file)
+                    arcname = os.path.relpath(file_path, pdf_folder)  # Preserve folder structure in the ZIP
+                    zipf.write(file_path, arcname)
+    print(f"PDFs zipped into {output_zip_path}")
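Note that this rewrite drops the old PdfTooLargeError fallback. If pages that report a bogus 1 dpi still turn up, the fallback could be reinstated around the new list-based convert call; a sketch, reusing the layout_fun parameters from the deleted code (untested against the new code path):

```python
# "image_paths" is the list built in convert_to_pdf() above;
# all img2pdf calls below come from the deleted fallback.
import img2pdf

try:
    pdf_bytes = img2pdf.convert(image_paths)
except img2pdf.PdfTooLargeError:
    # Some images register a dpi of 1, and PDF caps pages at 200 inches
    # per side, so force a 7-inch page height (default manga size 5"x7").
    layout_fun = img2pdf.get_layout_fun(pagesize=(None, img2pdf.in_to_pt(7)),
                                        imgsize=None, border=None,
                                        fit=img2pdf.FitMode.into,
                                        auto_orient=False)
    pdf_bytes = img2pdf.convert(image_paths, layout_fun=layout_fun)
```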
- """ + if not url.startswith("http"): + url = "https://mangapark.me" + url title, _, chapter, os_dir = parse_url_to_chapter_info(url) - ensure_directory_exist(os_dir) + os_dir = os.path.join("downloads", title, f"chapter_{chapter}") + os.makedirs(os_dir, exist_ok=True) + + # Set up Selenium WebDriver + options = webdriver.ChromeOptions() + options.add_argument("--headless") # Run Chrome in headless mode (no UI) + driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) + try: - page = urllib.request.urlopen(url) - except ValueError: - page = urllib.request.urlopen("http://mangapark.me" + url) + driver.get(url) # Open the manga page - soup = BeautifulSoup(page, "html.parser") - imgs_wrappers = soup.find_all("a", {"class": "img-link"}) - file_names = [] - for i in imgs_wrappers: - img_url = strip_parameters_from_url(i.img['src']) - filename = img_url.split('/')[-1] - print("Downloading %s %s %s..." % (title, chapter, filename)) - dir_filename = os_dir + "/" + os.path.basename(img_url) - urllib.request.urlretrieve(img_url, dir_filename) - new_dir_filename = resize(dir_filename, height) - file_names.append(new_dir_filename) + # Wait for the "Close" button to appear and click it + try: + # Adjust the selector to target the "Close" button in the ad + close_button = WebDriverWait(driver, 10).until( + EC.element_to_be_clickable((By.XPATH, '//span[text()="Close"]')) + ) + close_button.click() # Click the "Close" button + time.sleep(1) # Wait a bit after clicking (optional) + except Exception as e: + print("No pop-up detected or failed to close pop-up:", e) + + # Now parse the page after closing the ad + soup = BeautifulSoup(driver.page_source, "html.parser") + + # Find images after closing the pop-up + imgs_wrappers = soup.find_all("img", {"class": "w-full h-full"}) + file_names = [] + for i in imgs_wrappers: + img_url = strip_parameters_from_url(i['src']) + filename = img_url.split('/')[-1] + print(f"Downloading {title} {chapter} {filename}...") + dir_filename = os.path.join(os_dir, filename) + + # Use the custom download method with headers + download_image_with_headers(img_url, filename, os_dir) + + # If a resize function is defined, apply it + new_dir_filename = resize(dir_filename, height) + file_names.append(new_dir_filename) + + convert_to_pdf(os_dir, chapter, file_names) - convert_to_pdf(os_dir, chapter, file_names) + + finally: + driver.quit() # Close the Selenium WebDriver def strip_parameters_from_url(url): @@ -193,26 +274,74 @@ def download_manga(url, chapter=None, min_max=None, height=None): page = urllib.request.urlopen(url) soup = BeautifulSoup(page, "html.parser") - streams = soup.find_all("div", {"class": "stream"}) - stream_lens = [] - for stream in streams: - chapters = stream.find_all("li") - stream_lens += [len(chapters)] + # streams = soup.find_all("div", {"class": "stream"}) + # if not streams: + # raise ValueError("No streams found on the page. Check the URL or website structure.") + + # stream_lens = [] + # for stream in streams: + # chapters = stream.find_all("li") + # stream_lens += [len(chapters)] + + # max_stream_len = max(stream_lens) + # max_idx = stream_lens.index(max_stream_len) + # best_stream = streams[max_idx] + + # #judging by the above script, there used to be a div called stream that, in the chapter select screen, would select the optimal server. + # #today, it's within the chapter page where a different server can be picked; over the past 8 years stream quality is on the up. 
@@ -193,26 +274,74 @@ def download_manga(url, chapter=None, min_max=None, height=None):
     page = urllib.request.urlopen(url)
     soup = BeautifulSoup(page, "html.parser")

-    streams = soup.find_all("div", {"class": "stream"})
-    stream_lens = []
-    for stream in streams:
-        chapters = stream.find_all("li")
-        stream_lens += [len(chapters)]
+    # streams = soup.find_all("div", {"class": "stream"})
+    # if not streams:
+    #     raise ValueError("No streams found on the page. Check the URL or website structure.")
+
+    # stream_lens = []
+    # for stream in streams:
+    #     chapters = stream.find_all("li")
+    #     stream_lens += [len(chapters)]
+
+    # max_stream_len = max(stream_lens)
+    # max_idx = stream_lens.index(max_stream_len)
+    # best_stream = streams[max_idx]
+
+    # Judging by the code above, a "stream" div on the chapter-select page
+    # used to identify the optimal server. Today a different server can be
+    # picked from within the chapter page itself; stream quality has
+    # improved over the past eight years.
+
+    # chapters = best_stream.find_all("li")
+
+    chapter_divs = soup.find_all("div", {"class": "space-x-1"})
+    chapters = []
+
+    for div in chapter_divs:
+        chapter_link = div.find("a", {"class": "link-hover"})
+        if chapter_link and "href" in chapter_link.attrs:
+            chapter_url = chapter_link["href"]
+            chapter_text = chapter_link.text.strip()
+
+            # Check whether the text starts with "Ch." or "Chapter"
+            if chapter_text.startswith("Ch.") or chapter_text.startswith("Chapter"):
+                try:
+                    # Extract the chapter number
+                    if chapter_text.startswith("Ch."):
+                        chapter_no = int(chapter_text[3:])  # "Ch." on earlier chapters
+                    elif chapter_text.startswith("Chapter"):
+                        chapter_no = int(chapter_text[7:].split(":")[0])  # "Chapter N: title" on newer chapters
+
+                    chapters.append((chapter_no, chapter_url))
+                except ValueError:
+                    print(f"Skipping invalid chapter number: {chapter_text}")
+                    continue
+
+    # Removed in this update; no longer needed with the latest implementation:
+    # for c in chapters[::-1]:
+    #     chapter_url = c.em.find_all("a")[-1]['href']
+    #     chapter_no = float(parse_url_to_chapter_info(chapter_url)[2][1:])

+    #     if chapter and chapter_no == chapter:
+    #         download_chapter(chapter_url, height)
+    #         break
+    #     elif min_max and min_max[0] <= chapter_no <= min_max[1]:
+    #         download_chapter(chapter_url, height)

-    max_stream_len = max(stream_lens)
-    max_idx = stream_lens.index(max_stream_len)
-    best_stream = streams[max_idx]
+    # Process each chapter based on the specified criteria
+    for chapter_no, chapter_url in sorted(chapters, reverse=True):  # Sort by chapter number (descending)
+        print(f"Processing Chapter {chapter_no}: {chapter_url}")

-    chapters = best_stream.find_all("li")
-    for c in chapters[::-1]:
-        chapter_url = c.em.find_all("a")[-1]['href']
-        chapter_no = float(parse_url_to_chapter_info(chapter_url)[2][1:])
+        if not chapter_url.startswith("https"):
+            chapter_url = "https://mangapark.me" + chapter_url  # Add the base URL
+
+        # Validate and clean up the URL
+        chapter_url = chapter_url.replace("///", "//")  # Fix triple slashes if they exist

         if chapter and chapter_no == chapter:
             download_chapter(chapter_url, height)
             break
         elif min_max and min_max[0] <= chapter_no <= min_max[1]:
             download_chapter(chapter_url, height)
+    zip_final_pdfs()


 def main():
diff --git a/readme.md b/readme.md
index d649bd5..a86e23f 100644
--- a/readme.md
+++ b/readme.md
@@ -6,10 +6,13 @@ A script to download your favourite mangas on mangapark.me and convert them to `
 
 ```
 # Download chapter 20 for the manga Ajin Miura Tsuina
-$ python3 main.py -m http://mangapark.me/manga/ajin-miura-tsuina/ --chapter 20 --size 1000
+$ python3 main.py -m http://mangapark.me/manga/ajin-miura-tsuina/ -c 20 --size 1000
 
 # Download chapters 19 to 22 for the manga Ajin Miura Tsuina very small
 $ python3 main.py -m http://mangapark.me/manga/ajin-miura-tsuina/ --chapters 19 22 --size 300
+
+# Download chapters 22 to 24 for the manga Ajin Miura Tsuina
+$ python3 main.py -m http://mangapark.me/manga/ajin-miura-tsuina/ -cs 22 24
 ```
 
 `--size` is optional on both ways of downloading. Without it, it will not resize.
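The same downloads can also be driven from Python rather than the CLI. A minimal sketch, assuming main.py is importable as a module; the keyword arguments follow the download_manga signature shown in the diff above:

```python
# download_manga(url, chapter=None, min_max=None, height=None) per the diff.
from main import download_manga

# Single chapter, pages resized to 1000 px tall
download_manga("http://mangapark.me/manga/ajin-miura-tsuina/", chapter=20, height=1000)

# A range of chapters; download_manga zips the finals/ folder
# via zip_final_pdfs() when it finishes
download_manga("http://mangapark.me/manga/ajin-miura-tsuina/", min_max=(22, 24))
```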