v2 link saving script

Khemarato Bhikkhu committed Jan 19, 2024
1 parent 2f75e1a commit b44051c
Showing 9 changed files with 300 additions and 19 deletions.
11 changes: 9 additions & 2 deletions scripts/android_import_sutta.py
@@ -330,7 +330,14 @@ def process_pdf(pdf_file):
if parallels:
parallels += "\n"
print(f"Attempting to upload \"{filename}\" to Google Drive...")
filegid = upload_to_google_drive(pdf_file, filename=filename, folder_id=folder_id)
author = trans['author_uid'].replace('thanissaro','geoff').replace('-thera','').replace('mills','mills-laurence')
filegid = upload_to_google_drive(
pdf_file,
creator='LibraryUtils.SuttaUploader',
custom_properties={'sutta':sutta,'slug':slugfield,'translator':author},
filename=filename,
folder_id=folder_id,
)
if not filegid:
print("Failed to upload!")
quit(1)
@@ -350,7 +357,7 @@ def process_pdf(pdf_file):
blurb = f"\n\n{blurb}\n<!---->" if blurb else ""
mdfile.write_text(f"""---
title: "{title}"
translator: {trans['author_uid'].replace('thanissaro','geoff').replace('-thera','').replace('mills','mills-laurence')}
translator: {author}
slug: "{slugfield}"{extra_fields}
external_url: "{external_url}"
{drive_links}:
60 changes: 60 additions & 0 deletions scripts/archive/add-properties-to-link-docs.py
@@ -0,0 +1,60 @@
#!/bin/python3

from urllib.parse import urlparse, parse_qs

import gdrive

gfiles = gdrive.session().files()

query = "mimeType='application/vnd.google-apps.document' and trashed=false and 'me' in owners"
fields = "id,properties,name,size"

for file in gdrive.all_files_matching(query, fields):
print(f"Analyzing '{file['name']}'...")
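    # Link docs are tiny; anything over ~3 KB is assumed to be a real document rather than a saved link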
if int(file.get('size') or 1) > 3000:
print(" File too large to be a link file. Skipping")
continue
if file.get('properties',{}).get('url'):
print(" Has the metadata already. Skipping")
continue
doc = gfiles.export(
fileId=file['id'],
mimeType='text/html',
).execute().decode('utf-8')
soup = gdrive.BeautifulSoup(doc, features='html.parser')
links, ps, h2s, h3s = [soup.find_all(foo) or [] for foo in ['a', 'p', 'h2', 'h3']]
if len(links) != 1:
# I don't want to handle multi-link docs
# and no-link docs are other things
print(" Doesn't appear to be a single-link doc")
continue
link = links[0].get('href')
    # The Docs HTML export wraps hrefs in a google.com/url?q=... redirect; unwrap it,
    # falling back to the raw href if it is not wrapped
    link = parse_qs(urlparse(link).query).get('q', [link])[0]
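    # Drive custom properties are capped at 124 bytes per key+value pair ('url' takes 3),
    # so longer links are shortened via TinyURL before being stored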
if len(link) > 121:
link = gdrive.requests.get('http://tinyurl.com/api-create.php?url='+link).text
data = {'properties': {
'createdBy': 'LibraryUtils.LinkSaver',
'url': link,
}}
# if new-style doc, just add properties metadata
if len(ps) == len(h2s) and len(h2s) == 1 and len(h3s) == 0:
print(f" Saving '{link}' to document properties...")
gfiles.update(fileId=file['id'], body=data).execute()
continue
# if old-style doc, reformat to the new style and add metadata
ps_with_text = [p for p in ps if p.get_text() != ""]
if len(ps_with_text) == 2 and len(h2s) == len(h3s) and len(h3s) == 0:
title = ps_with_text[0].get_text()
elif len(ps_with_text) == 1 and ps_with_text[0].get_text() == link:
title = file['name']
else:
print(" Doesn't match any known link doc format. Skipping")
continue
data['mimeType'] = 'text/html'
html = f"""<h2>{title}</h2><a href="{link}">{link}</a>"""
print(" Updating style and adding metadata...")
gfiles.update(
fileId=file['id'],
body=data,
media_body=gdrive.string_to_media(html, 'text/html'),
).execute()
2 changes: 1 addition & 1 deletion scripts/archive_site.py
@@ -1,4 +1,4 @@
"""Saves every page across the site to Archive.org's Wayback Machine"""
#!/bin/python3

import requests
import xml.etree.ElementTree as XML
5 changes: 2 additions & 3 deletions scripts/archivedotorg.py
@@ -14,15 +14,14 @@
from tqdm import tqdm, trange
except:
print(" pip install tqdm")
quit(1)
exit(1)
ARCHIVE_ORG_AUTH_FILE = '~/archive.org.auth'

ARCHIVE_ORG_AUTH_PATH = Path(os.path.expanduser(ARCHIVE_ORG_AUTH_FILE))
if ARCHIVE_ORG_AUTH_PATH.exists():
ARCHIVE_ORG_AUTH = ARCHIVE_ORG_AUTH_PATH.read_text().strip()
else:
print(f"Please make a new {ARCHIVE_ORG_AUTH_FILE} text file and put in it the information from https://archive.org/account/s3.php in the following format: \"LOW <accesskey>:<secretkey>\"")
quit(1)
raise RuntimeError(f"Please make a new {ARCHIVE_ORG_AUTH_FILE} text file and put in it the information from https://archive.org/account/s3.php in the following format: \"LOW <accesskey>:<secretkey>\"")

ARCHIVEID_BLACKLIST = {
"unehistoiredetou0000na",
166 changes: 153 additions & 13 deletions scripts/gdrive.py
@@ -4,6 +4,8 @@
from pathlib import Path
import requests
import struct
from datetime import datetime
from math import floor
from io import BytesIO
from strutils import (
titlecase,
@@ -13,11 +15,16 @@
file_info,
prompt,
approx_eq,
whitespace,
yt_url_to_plid_re,
yt_url_to_id_re,
)
import pdfutils
import json
import re
from functools import cache
from scrape_utils import extract_simplified_html_for_url
from archivedotorg import archive_urls
try:
import joblib
from yaspin import yaspin
@@ -27,9 +34,10 @@
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseUpload, MediaFileUpload
from youtube_transcript_api import YouTubeTranscriptApi
except:
print("pip install yaspin bs4 google google-api-python-client google_auth_oauthlib joblib")
quit(1)
print("pip install yaspin bs4 google google-api-python-client google_auth_oauthlib joblib youtube-transcript-api")
exit(1)

# If modifying these scopes, have to login again.
SCOPES = ['https://www.googleapis.com/auth/drive','https://www.googleapis.com/auth/youtube.readonly']
@@ -125,10 +133,31 @@ def session():
def youtube():
return build('youtube', 'v3', credentials=google_credentials())

@disk_memorizor.cache
def get_ytvideo_snippet(ytid):
snippet = youtube().videos().list(id=ytid,part="snippet").execute().get("items")[0].get("snippet")
return {k: snippet[k] for k in ['title', 'description', 'tags'] if k in snippet}
def get_ytvideo_snippets(ytids):
    snippets = []
    # Batch lookup; request contentDetails along with the snippet since it is stored below
    data = youtube().videos().list(id=','.join(ytids), part="snippet,contentDetails").execute().get("items", [])
    data = {vid['id']: vid for vid in data}
    for ytid in ytids:
        vid = data.get(ytid, {})
        snippet = vid.get('snippet', {})
        ret = {k: snippet[k] for k in ['title', 'description', 'tags', 'thumbnails'] if k in snippet}
        ret['contentDetails'] = vid.get('contentDetails')
        snippets.append(ret)
    return snippets

def get_ytvideo_snippets_for_playlist(plid):
    deets = youtube().playlistItems().list(
        playlistId=plid,
        part='snippet',
        maxResults=50,  # 50 is the largest page size the YouTube API accepts
    ).execute()
    return [e['snippet'] for e in deets.get("items", [])]

def get_ytplaylist_snippet(plid):
deets = youtube().playlists().list(
id=plid,
part='snippet',
).execute()
return deets['items'][0]['snippet']

@disk_memorizor.cache(cache_validation_callback=joblib.expires_after(days=28))
def get_subfolders(folderid):
@@ -140,26 +169,41 @@ def get_subfolders(folderid):
).execute()
return childrenFoldersDict['files']

def create_doc(filename=None, html=None, rtf=None, folder_id=None):
if html and rtf:
raise ValueError("Please specify either rtf or html. Not both.")
def string_to_media(s, mimeType):
return MediaIoBaseUpload(
BytesIO(bytes(s, 'UTF-8')),
mimetype=mimeType,
resumable=True,
)

def create_doc(filename=None, html=None, rtf=None, folder_id=None, creator=None, custom_properties: dict[str, str] = None):
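    # Exactly one of html or rtf must be supplied (the bool-equality test below rejects both-or-neither)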
if bool(html) == bool(rtf):
raise ValueError("Please specify either rtf OR html.")
drive_service = session()
metadata = {'mimeType': 'application/vnd.google-apps.document'}
media = None
if filename:
metadata['name'] = filename
if folder_id:
metadata['parents'] = [folder_id]
if custom_properties:
metadata['properties'] = custom_properties
else:
metadata['properties'] = dict()
if 'createdBy' not in metadata['properties']:
metadata['properties']['createdBy'] = creator or 'LibraryUtils'
if html:
media = MediaIoBaseUpload(BytesIO(bytes(html, 'UTF-8')), mimetype='text/html', resumable=True)
media = string_to_media(html, 'text/html')
if rtf:
media = MediaIoBaseUpload(BytesIO(bytes(rtf, 'UTF-8')), mimetype='application/rtf', resumable=True)
media = string_to_media(rtf, 'application/rtf')
return _perform_upload(metadata, media)

def upload_to_google_drive(file_path, filename=None, folder_id=None):
def upload_to_google_drive(file_path, creator=None, filename=None, folder_id=None, custom_properties: dict[str,str] = None):
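    # Tag every upload with searchable custom properties so later scripts can find and refresh these files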
file_metadata = {'name': (filename or os.path.basename(file_path))}
if folder_id:
file_metadata['parents'] = [folder_id]
file_metadata['properties'] = custom_properties or dict()
file_metadata['properties']['createdBy'] = creator or 'LibraryUtils'
media = MediaFileUpload(file_path, resumable=True)
return _perform_upload(file_metadata, media)

@@ -234,6 +278,23 @@ def move_drive_file(file_id, folder_id, previous_parents=None):
print(f" \"{file.get('name')}\" moved to {file.get('parents')}")
return file

def all_files_matching(query, fields, page_size=30):
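    # Generator over every Drive file matching the query, transparently following nextPageToken pagination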
files = session().files()
fields = f"files({fields}),nextPageToken"
params = {
'q': query,
'fields': fields,
'pageSize': page_size,
}
results = files.list(**params).execute()
for item in results.get('files', []):
yield item
while 'nextPageToken' in results:
params['pageToken'] = results['nextPageToken']
results = files.list(**params).execute()
for item in results.get('files', []):
yield item

EXACT_MATCH_FIELDS = "files(id,mimeType,name,md5Checksum,originalFilename,size,parents)"

def files_exactly_named(file_name):
@@ -347,8 +408,67 @@ def guess_link_title(url):
except:
return ""

def make_link_doc_html(title, link):
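    # Build the doc body: title and link, then either a YouTube summary (video or playlist) or a simplified preview of the page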
ret = f"""<h1>{title}</h1><h2><a href="{link}">{link}</a></h2>"""
if 'youtu' in link:
if 'playlist' in link:
ret += make_ytplaylist_summary_html(yt_url_to_plid_re.search(link).groups()[0])
else:
vid = yt_url_to_id_re.search(link)
if vid:
ret += make_ytvideo_summary_html(vid.groups()[0])
else:
contents = extract_simplified_html_for_url(link)
if contents:
ret += f"<h2>Website Contents Preview (as of {datetime.now().strftime('%Y-%m-%d')})</h2>"
ret += contents
return ret

def htmlify_ytdesc(description):
    # Collapse paragraph gaps to a single break, then convert any remaining newlines
    return description.replace('\n\n', '<br />').replace('\n', '<br />')

def _yt_thumbnail(snippet):
if 'high' in snippet['thumbnails']:
return snippet['thumbnails']['high']['url']
return snippet['thumbnails']['default']['url']

def make_ytvideo_summary_html(vid):
snippet = get_ytvideo_snippets([vid])[0]
ret = ""
if snippet.get('description'):
desc = htmlify_ytdesc(snippet['description'])
ret += f"""<h2>Video Description (from YouTube)</h2><p>{desc}</p>"""
ret += f"""<h2>Thumbnail</h2><p><img src="{_yt_thumbnail(snippet)}" /></p>"""
if len(snippet.get('tags',[])) > 0:
ret += f"""<h2>Video Tags</h2><p>{snippet['tags']}</p>"""
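    # Try to fetch the video's subtitles; many videos have no transcript available, so failures are ignored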
transcript = None
try:
transcript = YouTubeTranscriptApi.get_transcript(vid)
except:
pass
if transcript:
ret += "<h2>Video Subtitles</h2>"
for line in transcript:
ret += f"""<p><a href="https://youtu.be/{vid}?t={floor(line['start'])}">{floor(line['start']/60)}:{round(line['start']%60):02d}</a> {whitespace.sub(' ', line['text'])}</p>"""
return ret

def make_ytplaylist_summary_html(ytplid):
ret = ""
plsnip = get_ytplaylist_snippet(ytplid)
desc = htmlify_ytdesc(plsnip.get('description', ''))
if desc:
ret += f"""<h2>Description (from YouTube)</h2><p>{desc}</p>"""
videos = get_ytvideo_snippets_for_playlist(ytplid)
if len(videos) > 0:
ret += "<h2>Videos</h2>"
for video in videos:
ret += f"""<h3>{int(video['position'])+1}. <a href="https://youtu.be/{video['resourceId']['videoId']}">{video['title']}</a></h3>"""
ret += f"""<p><img src="{_yt_thumbnail(video)}" /></p>"""
return ret

if __name__ == "__main__":
glink_gens = []
urls_to_save = []
    # glink_gens holds generator lambdas rather than direct links,
    # so that doc creation can be deferred until the end
while True:
if not link_to_id(link):
if "youtu" in link:
link = link.split("?si=")[0]
else:
urls_to_save.append(link)
title = input_with_prefill("title: ", guess_link_title(link))
glink_gens.append(lambda title=title, link=link: DOC_LINK.format(create_doc(title, html=f"""<h2>{title}</h2><a href="{link}">{link}</a>""")))
if len(link) > 121:
with yaspin(text="Shortening long URL..."):
link = requests.get('http://tinyurl.com/api-create.php?url='+link).text
glink_gens.append(
lambda title=title, link=link: DOC_LINK.format(
create_doc(
filename=title,
html=make_link_doc_html(title, link),
custom_properties={
"createdBy": "LibraryUtils.LinkSaver",
"url": link,
},
)
)
)
else:
glink_gens.append(lambda r=link: r)
course = input_with_tab_complete("course: ", get_known_courses())
folders = get_gfolders_for_course(course)
for glink_gen in glink_gens:
move_gfile(glink_gen(), folders)
print("Files moved!")
if len(urls_to_save) > 0:
print("Ensuring URLs are saved to Archive.org...")
archive_urls(urls_to_save)
38 changes: 38 additions & 0 deletions scripts/refresh_link_docs.py
@@ -0,0 +1,38 @@
#!/bin/python3

from archivedotorg import archive_urls
import gdrive

FIELDS = "id,properties,name,size"

def regen_link_doc(docid, title=None, link=None):
if not (link and title):
fdata = gdrive.session().files().get(fileId=docid,fields=FIELDS).execute()
if not link:
link = fdata['properties']['url']
if not title:
title = fdata['name']
html = gdrive.make_link_doc_html(title, link)
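    # Overwrite the doc's body with the regenerated HTML; Drive converts it back into Google Doc content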
gdrive.session().files().update(
fileId=docid,
body={'mimeType':'text/html'},
media_body=gdrive.string_to_media(html, 'text/html'),
).execute()

if __name__ == '__main__':
QUERY = " and ".join([
"mimeType='application/vnd.google-apps.document'",
"trashed=false",
"'me' in writers",
"properties has { key='createdBy' and value='LibraryUtils.LinkSaver' }",
])
urls = []
for file in gdrive.all_files_matching(QUERY, FIELDS, page_size=2):
print(f"Regenerating '{file['name']}'...")
link = file['properties']['url']
regen_link_doc(file['id'], title=file['name'], link=link)
if 'youtu' not in link:
urls.append(link)
print(f"Ensuring all {len(urls)} (non-YT) URLs are saved to Archive.org...")
archive_urls(urls)
