diff --git a/scripts/android_import_sutta.py b/scripts/android_import_sutta.py
index c73ddc74ec..26a4286da1 100644
--- a/scripts/android_import_sutta.py
+++ b/scripts/android_import_sutta.py
@@ -330,7 +330,14 @@ def process_pdf(pdf_file):
     if parallels:
         parallels += "\n"
     print(f"Attempting to upload \"{filename}\" to Google Drive...")
-    filegid = upload_to_google_drive(pdf_file, filename=filename, folder_id=folder_id)
+    author = trans['author_uid'].replace('thanissaro','geoff').replace('-thera','').replace('mills','mills-laurence')
+    filegid = upload_to_google_drive(
+        pdf_file,
+        creator='LibraryUtils.SuttaUploader',
+        custom_properties={'sutta':sutta,'slug':slugfield,'translator':author},
+        filename=filename,
+        folder_id=folder_id,
+    )
     if not filegid:
         print("Failed to upload!")
         quit(1)
@@ -350,7 +357,7 @@ def process_pdf(pdf_file):
     blurb = f"\n\n{blurb}\n" if blurb else ""
     mdfile.write_text(f"""---
 title: "{title}"
-translator: {trans['author_uid'].replace('thanissaro','geoff').replace('-thera','').replace('mills','mills-laurence')}
+translator: {author}
 slug: "{slugfield}"{extra_fields}
 external_url: "{external_url}"
 {drive_links}:
diff --git a/scripts/archive/add-properties-to-link-docs.py b/scripts/archive/add-properties-to-link-docs.py
new file mode 100644
index 0000000000..89096d30c4
--- /dev/null
+++ b/scripts/archive/add-properties-to-link-docs.py
@@ -0,0 +1,60 @@
+#!/bin/python3
+
+from urllib.parse import urlparse, parse_qs
+
+import gdrive
+
+gfiles = gdrive.session().files()
+
+query = "mimeType='application/vnd.google-apps.document' and trashed=false and 'me' in owners"
+fields = "id,properties,name,size"
+
+for file in gdrive.all_files_matching(query, fields):
+    print(f"Analyzing '{file['name']}'...")
+    if int(file.get('size') or 1) > 3000:
+        print(" File too large to be a link file. Skipping")
+        continue
+    if file.get('properties',{}).get('url'):
+        print(" Has the metadata already. Skipping")
+        continue
+    doc = gfiles.export(
+        fileId=file['id'],
+        mimeType='text/html',
+    ).execute().decode('utf-8')
+    soup = gdrive.BeautifulSoup(doc, features='html.parser')
+    links, ps, h2s, h3s = [soup.find_all(foo) or [] for foo in ['a', 'p', 'h2', 'h3']]
+    if len(links) != 1:
+        # I don't want to handle multi-link docs
+        # and no-link docs are other things
+        print(" Doesn't appear to be a single-link doc")
+        continue
+    link = links[0].get('href')
+    link = parse_qs(urlparse(link).query).get('q', [''])[0]
+    if len(link) > 121:
+        link = gdrive.requests.get('http://tinyurl.com/api-create.php?url='+link).text
+    data = {'properties': {
+        'createdBy': 'LibraryUtils.LinkSaver',
+        'url': link,
+    }}
+    # if new-style doc, just add properties metadata
+    if len(ps) == len(h2s) and len(h2s) == 1 and len(h3s) == 0:
+        print(f" Saving '{link}' to document properties...")
+        gfiles.update(fileId=file['id'], body=data).execute()
+        continue
+    # if old-style doc, reformat to the new style and add metadata
+    ps_with_text = [p for p in ps if p.get_text() != ""]
+    if len(ps_with_text) == 2 and len(h2s) == len(h3s) and len(h3s) == 0:
+        title = ps_with_text[0].get_text()
+    elif len(ps_with_text) == 1 and ps_with_text[0].get_text() == link:
+        title = file['name']
+    else:
+        print(" Doesn't match any known link doc format. Skipping")
+        continue
+    data['mimeType'] = 'text/html'
+    html = f"""<h2>{title}</h2><p><a href="{link}">{link}</a></p>"""
{link}""" + print(" Updating style and adding metadata...") + gfiles.update( + fileId=file['id'], + body=data, + media_body=gdrive.string_to_media(html, 'text/html'), + ).execute() diff --git a/scripts/archive_site.py b/scripts/archive_site.py index 87b40efcd9..7fbab1020a 100644 --- a/scripts/archive_site.py +++ b/scripts/archive_site.py @@ -1,4 +1,4 @@ -"""Saves every page across the site to Archive.org's Wayback Machine""" +#!/bin/python3 import requests import xml.etree.ElementTree as XML diff --git a/scripts/archivedotorg.py b/scripts/archivedotorg.py index e542238210..11f9a05b2e 100644 --- a/scripts/archivedotorg.py +++ b/scripts/archivedotorg.py @@ -14,15 +14,14 @@ from tqdm import tqdm, trange except: print(" pip install tqdm") - quit(1) + exit(1) ARCHIVE_ORG_AUTH_FILE = '~/archive.org.auth' ARCHIVE_ORG_AUTH_PATH = Path(os.path.expanduser(ARCHIVE_ORG_AUTH_FILE)) if ARCHIVE_ORG_AUTH_PATH.exists(): ARCHIVE_ORG_AUTH = ARCHIVE_ORG_AUTH_PATH.read_text().strip() else: - print(f"Please make a new {ARCHIVE_ORG_AUTH_FILE} text file and put in it the information from https://archive.org/account/s3.php in the following format: \"LOW :\"") - quit(1) + raise RuntimeError(f"Please make a new {ARCHIVE_ORG_AUTH_FILE} text file and put in it the information from https://archive.org/account/s3.php in the following format: \"LOW :\"") ARCHIVEID_BLACKLIST = { "unehistoiredetou0000na", diff --git a/scripts/gdrive.py b/scripts/gdrive.py index b45e6da2fd..cdac1bd135 100644 --- a/scripts/gdrive.py +++ b/scripts/gdrive.py @@ -4,6 +4,8 @@ from pathlib import Path import requests import struct +from datetime import datetime +from math import floor from io import BytesIO from strutils import ( titlecase, @@ -13,11 +15,16 @@ file_info, prompt, approx_eq, + whitespace, + yt_url_to_plid_re, + yt_url_to_id_re, ) import pdfutils import json import re from functools import cache +from scrape_utils import extract_simplified_html_for_url +from archivedotorg import archive_urls try: import joblib from yaspin import yaspin @@ -27,9 +34,10 @@ from google_auth_oauthlib.flow import InstalledAppFlow from googleapiclient.discovery import build from googleapiclient.http import MediaIoBaseUpload, MediaFileUpload + from youtube_transcript_api import YouTubeTranscriptApi except: - print("pip install yaspin bs4 google google-api-python-client google_auth_oauthlib joblib") - quit(1) + print("pip install yaspin bs4 google google-api-python-client google_auth_oauthlib joblib youtube-transcript-api") + exit(1) # If modifying these scopes, have to login again. 
 SCOPES = ['https://www.googleapis.com/auth/drive','https://www.googleapis.com/auth/youtube.readonly']
@@ -125,10 +133,31 @@ def session():
 
 def youtube():
     return build('youtube', 'v3', credentials=google_credentials())
 
-@disk_memorizor.cache
-def get_ytvideo_snippet(ytid):
-    snippet = youtube().videos().list(id=ytid,part="snippet").execute().get("items")[0].get("snippet")
-    return {k: snippet[k] for k in ['title', 'description', 'tags'] if k in snippet}
+def get_ytvideo_snippets(ytids):
+    snippets = []
+    data = youtube().videos().list(id=','.join(ytids),part="snippet,topicDetails").execute().get("items", [])
+    data = {vid['id']: vid for vid in data}
+    for ytid in ytids:
+        vid = data.get(ytid,{})
+        ret = {k: vid['snippet'][k] for k in ['title', 'description', 'tags', 'thumbnails'] if k in vid['snippet']}
+        ret['contentDetails'] = vid.get('contentDetails')
+        snippets.append(ret)
+    return snippets
+
+def get_ytvideo_snippets_for_playlist(plid):
+    deets = youtube().playlistItems().list(
+        playlistId=plid,
+        part='snippet',
+        maxResults=100,
+    ).execute()
+    return [e['snippet'] for e in deets.get("items",[])]
+
+def get_ytplaylist_snippet(plid):
+    deets = youtube().playlists().list(
+        id=plid,
+        part='snippet',
+    ).execute()
+    return deets['items'][0]['snippet']
 
 @disk_memorizor.cache(cache_validation_callback=joblib.expires_after(days=28))
 def get_subfolders(folderid):
@@ -140,9 +169,16 @@
     ).execute()
     return childrenFoldersDict['files']
 
-def create_doc(filename=None, html=None, rtf=None, folder_id=None):
-    if html and rtf:
-        raise ValueError("Please specify either rtf or html. Not both.")
+def string_to_media(s, mimeType):
+    return MediaIoBaseUpload(
+        BytesIO(bytes(s, 'UTF-8')),
+        mimetype=mimeType,
+        resumable=True,
+    )
+
+def create_doc(filename=None, html=None, rtf=None, folder_id=None, creator=None, custom_properties: dict[str, str] = None):
+    if bool(html) == bool(rtf):
+        raise ValueError("Please specify either rtf OR html.")
     drive_service = session()
     metadata = {'mimeType': 'application/vnd.google-apps.document'}
     media = None
@@ -150,16 +186,24 @@
         metadata['name'] = filename
     if folder_id:
         metadata['parents'] = [folder_id]
+    if custom_properties:
+        metadata['properties'] = custom_properties
+    else:
+        metadata['properties'] = dict()
+    if 'createdBy' not in metadata['properties']:
+        metadata['properties']['createdBy'] = creator or 'LibraryUtils'
     if html:
-        media = MediaIoBaseUpload(BytesIO(bytes(html, 'UTF-8')), mimetype='text/html', resumable=True)
+        media = string_to_media(html, 'text/html')
     if rtf:
-        media = MediaIoBaseUpload(BytesIO(bytes(rtf, 'UTF-8')), mimetype='application/rtf', resumable=True)
+        media = string_to_media(rtf, 'application/rtf')
     return _perform_upload(metadata, media)
 
-def upload_to_google_drive(file_path, filename=None, folder_id=None):
+def upload_to_google_drive(file_path, creator=None, filename=None, folder_id=None, custom_properties: dict[str,str] = None):
     file_metadata = {'name': (filename or os.path.basename(file_path))}
     if folder_id:
         file_metadata['parents'] = [folder_id]
+    file_metadata['properties'] = custom_properties or dict()
+    file_metadata['properties']['createdBy'] = creator or 'LibraryUtils'
     media = MediaFileUpload(file_path, resumable=True)
     return _perform_upload(file_metadata, media)
 
@@ -234,6 +278,23 @@
     print(f" \"{file.get('name')}\" moved to {file.get('parents')}")
     return file
 
+def all_files_matching(query, fields, page_size=30):
+    files = session().files()
+    fields = f"files({fields}),nextPageToken"
+    params = {
+        'q': query,
+        'fields': fields,
+        'pageSize': page_size,
+    }
+    results = files.list(**params).execute()
+    for item in results.get('files', []):
+        yield item
+    while 'nextPageToken' in results:
+        params['pageToken'] = results['nextPageToken']
+        results = files.list(**params).execute()
+        for item in results.get('files', []):
+            yield item
+
 EXACT_MATCH_FIELDS = "files(id,mimeType,name,md5Checksum,originalFilename,size,parents)"
 
 def files_exactly_named(file_name):
@@ -347,8 +408,67 @@ def guess_link_title(url):
     except:
         return ""
 
+def make_link_doc_html(title, link):
+    ret = f"""<h2>{title}</h2><p><a href="{link}">{link}</a></p>"""
+    if 'youtu' in link:
+        if 'playlist' in link:
+            ret += make_ytplaylist_summary_html(yt_url_to_plid_re.search(link).groups()[0])
+        else:
+            vid = yt_url_to_id_re.search(link)
+            if vid:
+                ret += make_ytvideo_summary_html(vid.groups()[0])
+    else:
+        contents = extract_simplified_html_for_url(link)
+        if contents:
+            ret += f"<h2>Website Contents Preview (as of {datetime.now().strftime('%Y-%m-%d')})</h2>"
" + ret += contents + return ret + +def htmlify_ytdesc(description): + return description.replace('\n\n', '
') + +def _yt_thumbnail(snippet): + if 'high' in snippet['thumbnails']: + return snippet['thumbnails']['high']['url'] + return snippet['thumbnails']['default']['url'] + +def make_ytvideo_summary_html(vid): + snippet = get_ytvideo_snippets([vid])[0] + ret = "" + if snippet.get('description'): + desc = htmlify_ytdesc(snippet['description']) + ret += f"""

Video Description (from YouTube)

{desc}

""" + ret += f"""

Thumbnail

""" + if len(snippet.get('tags',[])) > 0: + ret += f"""

Video Tags

{snippet['tags']}

""" + transcript = None + try: + transcript = YouTubeTranscriptApi.get_transcript(vid) + except: + pass + if transcript: + ret += "

Video Subtitles

" + for line in transcript: + ret += f"""

{floor(line['start']/60)}:{round(line['start']%60):02d} {whitespace.sub(' ', line['text'])}

""" + return ret + +def make_ytplaylist_summary_html(ytplid): + ret = "" + plsnip = get_ytplaylist_snippet(ytplid) + desc = htmlify_ytdesc(plsnip.get('description', '')) + if desc: + ret += f"""

Description (from YouTube)

{desc}

""" + videos = get_ytvideo_snippets_for_playlist(ytplid) + if len(videos) > 0: + ret += "

Videos

" + for video in videos: + ret += f"""

{int(video['position'])+1}. {video['title']}

""" + ret += f"""

""" + return ret + if __name__ == "__main__": glink_gens = [] + urls_to_save = [] # a list of generator lambdas not direct links # so that we can defer doc creation to the end while True: @@ -358,11 +478,31 @@ def guess_link_title(url): if not link_to_id(link): if "youtu" in link: link = link.split("?si=")[0] + else: + urls_to_save.append(link) title = input_with_prefill("title: ", guess_link_title(link)) - glink_gens.append(lambda title=title, link=link: DOC_LINK.format(create_doc(title, html=f"""

{title}

{link}"""))) + if len(link) > 121: + with yaspin(text="Shortening long URL..."): + link = requests.get('http://tinyurl.com/api-create.php?url='+link).text + glink_gens.append( + lambda title=title, link=link: DOC_LINK.format( + create_doc( + filename=title, + html=make_link_doc_html(title, link), + custom_properties={ + "createdBy": "LibraryUtils.LinkSaver", + "url": link, + }, + ) + ) + ) else: glink_gens.append(lambda r=link: r) course = input_with_tab_complete("course: ", get_known_courses()) folders = get_gfolders_for_course(course) for glink_gen in glink_gens: move_gfile(glink_gen(), folders) + print("Files moved!") + if len(urls_to_save) > 0: + print("Ensuring URLs are saved to Archive.org...") + archive_urls(urls_to_save) diff --git a/scripts/refresh_link_docs.py b/scripts/refresh_link_docs.py new file mode 100644 index 0000000000..dea4c31705 --- /dev/null +++ b/scripts/refresh_link_docs.py @@ -0,0 +1,38 @@ +#!/bin/python3 + +from archivedotorg import archive_urls +import gdrive + +FIELDS = "id,properties,name,size" + +def regen_link_doc(docid, title=None, link=None): + if not (link and title): + fdata = gdrive.session().files().get(fileId=docid,fields=FIELDS).execute() + if not link: + link = fdata['properties']['url'] + if not title: + title = fdata['name'] + html = gdrive.make_link_doc_html(title, link) + gdrive.session().files().update( + fileId=docid, + body={'mimeType':'text/html'}, + media_body=gdrive.string_to_media(html, 'text/html'), + ).execute() + +if __name__ == '__main__': + QUERY = " and ".join([ + "mimeType='application/vnd.google-apps.document'", + "trashed=false", + "'me' in writers", + "properties has { key='createdBy' and value='LibraryUtils.LinkSaver' }", + ]) + urls = [] + for file in gdrive.all_files_matching(QUERY, FIELDS, page_size=2): + print(f"Regenerating '{file['name']}'...") + link = file['properties']['url'] + regen_link_doc(file['id'], title=file['name'], link=link) + if 'youtu' not in link: + urls.append(link) + print(f"Ensuring all {len(urls)} (non-YT) URLs are saved to Archive.org...") + archive_urls(urls) + diff --git a/scripts/requirements.txt b/scripts/requirements.txt index b8788467ec..8ff88f0c03 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -13,4 +13,5 @@ google google-api-python-client google_auth_oauthlib titlecase +trafilatura diff --git a/scripts/scrape_utils.py b/scripts/scrape_utils.py new file mode 100644 index 0000000000..b154224d5a --- /dev/null +++ b/scripts/scrape_utils.py @@ -0,0 +1,35 @@ +#!/bin/python3 + +try: + import trafilatura +except: + print(" pip install trafilatura") + exit(1) + +def extract_simplified_html_for_url(url): + downloaded_webpage = trafilatura.fetch_url(url) + parsed_document = trafilatura.extract( + downloaded_webpage, + output_format='xml', + include_images=True, + include_links=True, + include_comments=False, + favor_recall=True, + ) or '' + # convert trafilatura's silly xml back to html + html_doc = parsed_document\ + .replace(" target=\"", " href=\"")\ + .replace("", "")\ + .replace("","")\ + .replace("","")\ + .replace('',"
    ")\ + .replace('',"
      ")\ + .replace("","
    ")\ + .replace("","")\ + .replace("","") + return html_doc diff --git a/scripts/strutils.py b/scripts/strutils.py index e636a599d1..ac2687b0cc 100644 --- a/scripts/strutils.py +++ b/scripts/strutils.py @@ -48,6 +48,7 @@ def ANSI_MOVE_UP(n): abnormalchars = re.compile('[^\w\s]') sutta_id_re = re.compile(r'^([a-zA-Z]+)(\d+)[\.]?([-–\d]*)$') yt_url_to_id_re = re.compile(r'(?:youtube(?:-nocookie)?\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})') +yt_url_to_plid_re = re.compile(r'[&?]list=([^&]+)') HOSTNAME_BLACKLIST = { "www.questia.com",