diff --git a/scripts/android_import_sutta.py b/scripts/android_import_sutta.py
index c73ddc74ec..26a4286da1 100644
--- a/scripts/android_import_sutta.py
+++ b/scripts/android_import_sutta.py
@@ -330,7 +330,14 @@ def process_pdf(pdf_file):
if parallels:
parallels += "\n"
print(f"Attempting to upload \"{filename}\" to Google Drive...")
- filegid = upload_to_google_drive(pdf_file, filename=filename, folder_id=folder_id)
+ author = trans['author_uid'].replace('thanissaro','geoff').replace('-thera','').replace('mills','mills-laurence')
+ filegid = upload_to_google_drive(
+ pdf_file,
+ creator='LibraryUtils.SuttaUploader',
+ custom_properties={'sutta':sutta,'slug':slugfield,'translator':author},
+ filename=filename,
+ folder_id=folder_id,
+ )
if not filegid:
print("Failed to upload!")
quit(1)
@@ -350,7 +357,7 @@ def process_pdf(pdf_file):
blurb = f"\n\n{blurb}\n" if blurb else ""
mdfile.write_text(f"""---
title: "{title}"
-translator: {trans['author_uid'].replace('thanissaro','geoff').replace('-thera','').replace('mills','mills-laurence')}
+translator: {author}
slug: "{slugfield}"{extra_fields}
external_url: "{external_url}"
{drive_links}:
diff --git a/scripts/archive/add-properties-to-link-docs.py b/scripts/archive/add-properties-to-link-docs.py
new file mode 100644
index 0000000000..89096d30c4
--- /dev/null
+++ b/scripts/archive/add-properties-to-link-docs.py
@@ -0,0 +1,60 @@
+#!/bin/python3
+
+from urllib.parse import urlparse, parse_qs
+
+import gdrive
+
+gfiles = gdrive.session().files()
+
+query = "mimeType='application/vnd.google-apps.document' and trashed=false and 'me' in owners"
+fields = "id,properties,name,size"
+
+for file in gdrive.all_files_matching(query, fields):
+ print(f"Analyzing '{file['name']}'...")
+ if int(file.get('size') or 1) > 3000:
+ print(" File too large to be a link file. Skipping")
+ continue
+ if file.get('properties',{}).get('url'):
+ print(" Has the metadata already. Skipping")
+ continue
+ doc = gfiles.export(
+ fileId=file['id'],
+ mimeType='text/html',
+ ).execute().decode('utf-8')
+ soup = gdrive.BeautifulSoup(doc, features='html.parser')
+ links, ps, h2s, h3s = [soup.find_all(foo) or [] for foo in ['a', 'p', 'h2', 'h3']]
+ if len(links) != 1:
+ # I don't want to handle multi-link docs
+ # and no-link docs are other things
+ print(" Doesn't appear to be a single-link doc")
+ continue
+ link = links[0].get('href')
+ link = parse_qs(urlparse(link).query).get('q', [''])[0]
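+    # Drive custom properties cap each key+value pair at roughly 124 bytes, so shorten long URLs before storing them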
+ if len(link) > 121:
+ link = gdrive.requests.get('http://tinyurl.com/api-create.php?url='+link).text
+ data = {'properties': {
+ 'createdBy': 'LibraryUtils.LinkSaver',
+ 'url': link,
+ }}
+ # if new-style doc, just add properties metadata
+ if len(ps) == len(h2s) and len(h2s) == 1 and len(h3s) == 0:
+ print(f" Saving '{link}' to document properties...")
+ gfiles.update(fileId=file['id'], body=data).execute()
+ continue
+ # if old-style doc, reformat to the new style and add metadata
+ ps_with_text = [p for p in ps if p.get_text() != ""]
+ if len(ps_with_text) == 2 and len(h2s) == len(h3s) and len(h3s) == 0:
+ title = ps_with_text[0].get_text()
+ elif len(ps_with_text) == 1 and ps_with_text[0].get_text() == link:
+ title = file['name']
+ else:
+ print(" Doesn't match any known link doc format. Skipping")
+ continue
+ data['mimeType'] = 'text/html'
+ html = f"""
{title}
{link}"""
+ print(" Updating style and adding metadata...")
+ gfiles.update(
+ fileId=file['id'],
+ body=data,
+ media_body=gdrive.string_to_media(html, 'text/html'),
+ ).execute()
diff --git a/scripts/archive_site.py b/scripts/archive_site.py
index 87b40efcd9..7fbab1020a 100644
--- a/scripts/archive_site.py
+++ b/scripts/archive_site.py
@@ -1,4 +1,4 @@
-"""Saves every page across the site to Archive.org's Wayback Machine"""
+#!/bin/python3
import requests
import xml.etree.ElementTree as XML
diff --git a/scripts/archivedotorg.py b/scripts/archivedotorg.py
index e542238210..11f9a05b2e 100644
--- a/scripts/archivedotorg.py
+++ b/scripts/archivedotorg.py
@@ -14,15 +14,14 @@
from tqdm import tqdm, trange
except:
print(" pip install tqdm")
- quit(1)
+ exit(1)
ARCHIVE_ORG_AUTH_FILE = '~/archive.org.auth'
ARCHIVE_ORG_AUTH_PATH = Path(os.path.expanduser(ARCHIVE_ORG_AUTH_FILE))
if ARCHIVE_ORG_AUTH_PATH.exists():
ARCHIVE_ORG_AUTH = ARCHIVE_ORG_AUTH_PATH.read_text().strip()
else:
- print(f"Please make a new {ARCHIVE_ORG_AUTH_FILE} text file and put in it the information from https://archive.org/account/s3.php in the following format: \"LOW :\"")
- quit(1)
+ raise RuntimeError(f"Please make a new {ARCHIVE_ORG_AUTH_FILE} text file and put in it the information from https://archive.org/account/s3.php in the following format: \"LOW :\"")
ARCHIVEID_BLACKLIST = {
"unehistoiredetou0000na",
diff --git a/scripts/gdrive.py b/scripts/gdrive.py
index b45e6da2fd..cdac1bd135 100644
--- a/scripts/gdrive.py
+++ b/scripts/gdrive.py
@@ -4,6 +4,8 @@
from pathlib import Path
import requests
import struct
+from datetime import datetime
+from math import floor
from io import BytesIO
from strutils import (
titlecase,
@@ -13,11 +15,16 @@
file_info,
prompt,
approx_eq,
+ whitespace,
+ yt_url_to_plid_re,
+ yt_url_to_id_re,
)
import pdfutils
import json
import re
from functools import cache
+from scrape_utils import extract_simplified_html_for_url
+from archivedotorg import archive_urls
try:
import joblib
from yaspin import yaspin
@@ -27,9 +34,10 @@
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseUpload, MediaFileUpload
+ from youtube_transcript_api import YouTubeTranscriptApi
except:
- print("pip install yaspin bs4 google google-api-python-client google_auth_oauthlib joblib")
- quit(1)
+ print("pip install yaspin bs4 google google-api-python-client google_auth_oauthlib joblib youtube-transcript-api")
+ exit(1)
# If modifying these scopes, have to login again.
SCOPES = ['https://www.googleapis.com/auth/drive','https://www.googleapis.com/auth/youtube.readonly']
@@ -125,10 +133,31 @@ def session():
def youtube():
return build('youtube', 'v3', credentials=google_credentials())
-@disk_memorizor.cache
-def get_ytvideo_snippet(ytid):
- snippet = youtube().videos().list(id=ytid,part="snippet").execute().get("items")[0].get("snippet")
- return {k: snippet[k] for k in ['title', 'description', 'tags'] if k in snippet}
+def get_ytvideo_snippets(ytids):
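+    """Fetch snippet data for a batch of YouTube video IDs with a single videos().list call, preserving input order."""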
+ snippets = []
+ data = youtube().videos().list(id=','.join(ytids),part="snippet,topicDetails").execute().get("items", [])
+ data = {vid['id']: vid for vid in data}
+ for ytid in ytids:
+        vid = data.get(ytid, {})
+        snippet = vid.get('snippet', {})
+        ret = {k: snippet[k] for k in ['title', 'description', 'tags', 'thumbnails'] if k in snippet}
+ ret['contentDetails'] = vid.get('contentDetails')
+ snippets.append(ret)
+ return snippets
+
+def get_ytvideo_snippets_for_playlist(plid):
+ deets = youtube().playlistItems().list(
+ playlistId=plid,
+ part='snippet',
+        maxResults=50,  # playlistItems.list only accepts values up to 50
+ ).execute()
+ return [e['snippet'] for e in deets.get("items",[])]
+
+def get_ytplaylist_snippet(plid):
+ deets = youtube().playlists().list(
+ id=plid,
+ part='snippet',
+ ).execute()
+ return deets['items'][0]['snippet']
@disk_memorizor.cache(cache_validation_callback=joblib.expires_after(days=28))
def get_subfolders(folderid):
@@ -140,9 +169,16 @@ def get_subfolders(folderid):
).execute()
return childrenFoldersDict['files']
-def create_doc(filename=None, html=None, rtf=None, folder_id=None):
- if html and rtf:
- raise ValueError("Please specify either rtf or html. Not both.")
+def string_to_media(s, mimeType):
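+    """Wrap a string in a resumable MediaIoBaseUpload with the given MIME type."""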
+ return MediaIoBaseUpload(
+ BytesIO(bytes(s, 'UTF-8')),
+ mimetype=mimeType,
+ resumable=True,
+ )
+
+def create_doc(filename=None, html=None, rtf=None, folder_id=None, creator=None, custom_properties: dict[str, str] = None):
+ if bool(html) == bool(rtf):
+ raise ValueError("Please specify either rtf OR html.")
drive_service = session()
metadata = {'mimeType': 'application/vnd.google-apps.document'}
media = None
@@ -150,16 +186,24 @@ def create_doc(filename=None, html=None, rtf=None, folder_id=None):
metadata['name'] = filename
if folder_id:
metadata['parents'] = [folder_id]
+ if custom_properties:
+ metadata['properties'] = custom_properties
+ else:
+ metadata['properties'] = dict()
+ if 'createdBy' not in metadata['properties']:
+ metadata['properties']['createdBy'] = creator or 'LibraryUtils'
if html:
- media = MediaIoBaseUpload(BytesIO(bytes(html, 'UTF-8')), mimetype='text/html', resumable=True)
+ media = string_to_media(html, 'text/html')
if rtf:
- media = MediaIoBaseUpload(BytesIO(bytes(rtf, 'UTF-8')), mimetype='application/rtf', resumable=True)
+ media = string_to_media(rtf, 'application/rtf')
return _perform_upload(metadata, media)
-def upload_to_google_drive(file_path, filename=None, folder_id=None):
+def upload_to_google_drive(file_path, creator=None, filename=None, folder_id=None, custom_properties: dict[str,str] = None):
file_metadata = {'name': (filename or os.path.basename(file_path))}
if folder_id:
file_metadata['parents'] = [folder_id]
+ file_metadata['properties'] = custom_properties or dict()
+ file_metadata['properties']['createdBy'] = creator or 'LibraryUtils'
media = MediaFileUpload(file_path, resumable=True)
return _perform_upload(file_metadata, media)
@@ -234,6 +278,23 @@ def move_drive_file(file_id, folder_id, previous_parents=None):
print(f" \"{file.get('name')}\" moved to {file.get('parents')}")
return file
+def all_files_matching(query, fields, page_size=30):
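+    """Yield every Drive file matching `query`, requesting `fields` for each and following nextPageToken pagination."""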
+ files = session().files()
+ fields = f"files({fields}),nextPageToken"
+ params = {
+ 'q': query,
+ 'fields': fields,
+ 'pageSize': page_size,
+ }
+ results = files.list(**params).execute()
+ for item in results.get('files', []):
+ yield item
+ while 'nextPageToken' in results:
+ params['pageToken'] = results['nextPageToken']
+ results = files.list(**params).execute()
+ for item in results.get('files', []):
+ yield item
+
EXACT_MATCH_FIELDS = "files(id,mimeType,name,md5Checksum,originalFilename,size,parents)"
def files_exactly_named(file_name):
@@ -347,8 +408,67 @@ def guess_link_title(url):
except:
return ""
+def make_link_doc_html(title, link):
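+    """Build the HTML body for a link doc: the title plus whatever preview content can be gathered (YouTube metadata or a simplified page extract)."""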
+ ret = f"""{title}
"""
+ if 'youtu' in link:
+ if 'playlist' in link:
+ ret += make_ytplaylist_summary_html(yt_url_to_plid_re.search(link).groups()[0])
+ else:
+ vid = yt_url_to_id_re.search(link)
+ if vid:
+ ret += make_ytvideo_summary_html(vid.groups()[0])
+ else:
+ contents = extract_simplified_html_for_url(link)
+ if contents:
+ ret += f"Website Contents Preview (as of {datetime.now().strftime('%Y-%m-%d')})
"
+ ret += contents
+ return ret
+
+def htmlify_ytdesc(description):
+    return description.replace('\n\n', '<br><br>')
+
+def _yt_thumbnail(snippet):
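+    """Return the high-resolution thumbnail URL if available, falling back to the default thumbnail."""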
+ if 'high' in snippet['thumbnails']:
+ return snippet['thumbnails']['high']['url']
+ return snippet['thumbnails']['default']['url']
+
+def make_ytvideo_summary_html(vid):
+ snippet = get_ytvideo_snippets([vid])[0]
+ ret = ""
+ if snippet.get('description'):
+ desc = htmlify_ytdesc(snippet['description'])
+ ret += f"""Video Description (from YouTube)
{desc}
"""
+ ret += f"""Thumbnail
"""
+ if len(snippet.get('tags',[])) > 0:
+ ret += f"""Video Tags
{snippet['tags']}
"""
+ transcript = None
+ try:
+ transcript = YouTubeTranscriptApi.get_transcript(vid)
+ except:
+ pass
+ if transcript:
+ ret += "Video Subtitles
"
+ for line in transcript:
+ ret += f"""{floor(line['start']/60)}:{round(line['start']%60):02d} {whitespace.sub(' ', line['text'])}
"""
+ return ret
+
+def make_ytplaylist_summary_html(ytplid):
+ ret = ""
+ plsnip = get_ytplaylist_snippet(ytplid)
+ desc = htmlify_ytdesc(plsnip.get('description', ''))
+ if desc:
+ ret += f"""Description (from YouTube)
{desc}
"""
+ videos = get_ytvideo_snippets_for_playlist(ytplid)
+ if len(videos) > 0:
+ ret += "Videos
"
+ for video in videos:
+ ret += f""""""
+ ret += f""""""
+ return ret
+
if __name__ == "__main__":
glink_gens = []
+ urls_to_save = []
# a list of generator lambdas not direct links
# so that we can defer doc creation to the end
while True:
@@ -358,11 +478,31 @@ def guess_link_title(url):
if not link_to_id(link):
if "youtu" in link:
link = link.split("?si=")[0]
+ else:
+ urls_to_save.append(link)
title = input_with_prefill("title: ", guess_link_title(link))
-        glink_gens.append(lambda title=title, link=link: DOC_LINK.format(create_doc(title, html=f"""<h1><a href="{link}">{title}</a></h1>
-<p>{link}</p>""")))
+ if len(link) > 121:
+ with yaspin(text="Shortening long URL..."):
+ link = requests.get('http://tinyurl.com/api-create.php?url='+link).text
+ glink_gens.append(
+ lambda title=title, link=link: DOC_LINK.format(
+ create_doc(
+ filename=title,
+ html=make_link_doc_html(title, link),
+ custom_properties={
+ "createdBy": "LibraryUtils.LinkSaver",
+ "url": link,
+ },
+ )
+ )
+ )
else:
glink_gens.append(lambda r=link: r)
course = input_with_tab_complete("course: ", get_known_courses())
folders = get_gfolders_for_course(course)
for glink_gen in glink_gens:
move_gfile(glink_gen(), folders)
+ print("Files moved!")
+ if len(urls_to_save) > 0:
+ print("Ensuring URLs are saved to Archive.org...")
+ archive_urls(urls_to_save)
diff --git a/scripts/refresh_link_docs.py b/scripts/refresh_link_docs.py
new file mode 100644
index 0000000000..dea4c31705
--- /dev/null
+++ b/scripts/refresh_link_docs.py
@@ -0,0 +1,38 @@
+#!/bin/python3
+
+from archivedotorg import archive_urls
+import gdrive
+
+FIELDS = "id,properties,name,size"
+
+def regen_link_doc(docid, title=None, link=None):
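+    """Regenerate a link doc's HTML body, looking up its title and url from Drive if not supplied."""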
+ if not (link and title):
+ fdata = gdrive.session().files().get(fileId=docid,fields=FIELDS).execute()
+ if not link:
+ link = fdata['properties']['url']
+ if not title:
+ title = fdata['name']
+ html = gdrive.make_link_doc_html(title, link)
+ gdrive.session().files().update(
+ fileId=docid,
+ body={'mimeType':'text/html'},
+ media_body=gdrive.string_to_media(html, 'text/html'),
+ ).execute()
+
+if __name__ == '__main__':
+ QUERY = " and ".join([
+ "mimeType='application/vnd.google-apps.document'",
+ "trashed=false",
+ "'me' in writers",
+ "properties has { key='createdBy' and value='LibraryUtils.LinkSaver' }",
+ ])
+ urls = []
+ for file in gdrive.all_files_matching(QUERY, FIELDS, page_size=2):
+ print(f"Regenerating '{file['name']}'...")
+ link = file['properties']['url']
+ regen_link_doc(file['id'], title=file['name'], link=link)
+ if 'youtu' not in link:
+ urls.append(link)
+ print(f"Ensuring all {len(urls)} (non-YT) URLs are saved to Archive.org...")
+ archive_urls(urls)
+
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
index b8788467ec..8ff88f0c03 100644
--- a/scripts/requirements.txt
+++ b/scripts/requirements.txt
@@ -13,4 +13,5 @@ google
google-api-python-client
google_auth_oauthlib
titlecase
+trafilatura
diff --git a/scripts/scrape_utils.py b/scripts/scrape_utils.py
new file mode 100644
index 0000000000..b154224d5a
--- /dev/null
+++ b/scripts/scrape_utils.py
@@ -0,0 +1,35 @@
+#!/bin/python3
+
+try:
+ import trafilatura
+except:
+ print(" pip install trafilatura")
+ exit(1)
+
+def extract_simplified_html_for_url(url):
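+    """Fetch a URL and return a simplified HTML rendering of its main content using trafilatura."""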
+ downloaded_webpage = trafilatura.fetch_url(url)
+ parsed_document = trafilatura.extract(
+ downloaded_webpage,
+ output_format='xml',
+ include_images=True,
+ include_links=True,
+ include_comments=False,
+ favor_recall=True,
+ ) or ''
+ # convert trafilatura's silly xml back to html
+    html_doc = parsed_document\
+        .replace(" target=\"", " href=\"")\
+        .replace("[", "")\
+        .replace("<ref ","<a ")\
+        .replace("]","")\
+        .replace('</ref>',"</a>")\
+        .replace("- ","")\
+        .replace("<graphic ","<img ")
+ return html_doc
diff --git a/scripts/strutils.py b/scripts/strutils.py
index e636a599d1..ac2687b0cc 100644
--- a/scripts/strutils.py
+++ b/scripts/strutils.py
@@ -48,6 +48,7 @@ def ANSI_MOVE_UP(n):
abnormalchars = re.compile('[^\w\s]')
sutta_id_re = re.compile(r'^([a-zA-Z]+)(\d+)[\.]?([-–\d]*)$')
yt_url_to_id_re = re.compile(r'(?:youtube(?:-nocookie)?\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})')
+yt_url_to_plid_re = re.compile(r'[&?]list=([^&]+)')
HOSTNAME_BLACKLIST = {
"www.questia.com",