Skip to content

Commit

Permalink
Add python script for refreshing the Drive Catalog
Browse files Browse the repository at this point in the history
  • Loading branch information
Khemarato Bhikkhu committed Aug 25, 2024
1 parent d7f6214 commit 151ae04
Show file tree
Hide file tree
Showing 5 changed files with 201 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/archive.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout the Code
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
ref: main
- name: Install Dependencies
Expand Down
19 changes: 19 additions & 0 deletions .github/workflows/catalogue.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Workflow that runs scripts/refresh_catalog_doc.py, either on demand or on a
# monthly schedule.
name: GDrive Cataloguer
on:
  # Allows the workflow to be started manually.
  workflow_dispatch:
  schedule:
    # Minute 50, hour 0, day-of-month 24, every month (GitHub evaluates cron in UTC).
    - cron: "50 0 24 * *"
jobs:
  gdrive:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout the Code
        uses: actions/checkout@v4
      - name: Install Dependencies
        # Installs the Python packages, then writes three repository secrets to
        # files in the runner's home directory.
        # NOTE(review): each secret is interpolated directly into printf's format
        # string, so a '%', backslash or matching quote character inside a secret
        # would be interpreted by printf/the shell -- confirm the secrets never
        # contain those, or pass them through `env:` and use printf '%s'.
        run: |
          pip install titlecase pyyaml pypdf tqdm yaspin bs4 google google-api-python-client google_auth_oauthlib joblib youtube-transcript-api
          printf '${{ secrets.GTOKEN }}' > ~/gtoken.json
          printf '${{ secrets.LIBRARY_UTILS_CLIENT_SECRET }}' > ~/library-utils-client-secret.json
          printf "${{ secrets.ARCHIVE_ORG_AUTH }}" > ~/archive.org.auth
      - name: Run Cataloguer
        run: python scripts/refresh_catalog_doc.py
6 changes: 3 additions & 3 deletions _layouts/default.html
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,9 @@
</head>
<body>
<a class="skip-to-main" href="#main">Skip to content</a>
<noscript>
<img src="https://{{ site.goatcounter }}.goatcounter.com/count?p=/{{ page.url }}&t={{ page.title | url_encode }}">
</noscript>

<header class="site-header{% if page.banner_info %} banner-img{% if page.next_courses %} fullsize{% elsif site.header_pages contains page.path %} smallsize{% endif %}{% if page.image_center_y %}" style="background-position-y: {{ page.image_center_y }};{% if page.image_center_x %} background-position-x: {{ page.image_center_x }};{% endif %}{% endif %}{% endif %}" role="banner">
<div class="wrapper"{% if page.banner_info %} style="backdrop-filter: blur(0.3px);"{% endif %}>
Expand Down Expand Up @@ -145,9 +148,6 @@ <h2 class="footer-heading">{{ site.title | escape }}</h2>
data-goatcounter-settings='{"allow_frame": true, "title": "{{ page.title | replace: '"', '\\"' }}"}'
async src="//gc.zgo.at/count.js">
</script>
<noscript>
<img src="https://{{ site.goatcounter }}.goatcounter.com/count?p=/{{ page.url }}&t={{ page.title | url_encode }}">
</noscript>
<script async src="/assets/js/goatclicker.js">
</script>
{%- endif -%}
Expand Down
20 changes: 14 additions & 6 deletions scripts/gdrive.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os.path
from pathlib import Path
import requests
import struct
import socket
from math import floor
from io import BytesIO, BufferedIOBase
from strutils import (
Expand All @@ -21,7 +21,6 @@
import pdfutils
import json
import re
import shutil
from functools import cache
from archivedotorg import archive_urls
try:
Expand Down Expand Up @@ -137,12 +136,17 @@ def google_credentials():

@cache
def session():
    """Return the Drive v3 API client, built once and memoized by ``@cache``.

    The first call also raises the process-wide default socket timeout,
    because some of our uploads take a while.
    """
    upload_timeout_seconds = 300
    socket.setdefaulttimeout(upload_timeout_seconds)
    credentials = google_credentials()
    return build('drive', 'v3', credentials=credentials)

@cache
def youtube():
    """Return the YouTube v3 API client, built once and memoized by ``@cache``."""
    credentials = google_credentials()
    return build('youtube', 'v3', credentials=credentials)

@cache
def docs():
    """Return the ``documents()`` resource of the Docs v1 API client (memoized)."""
    credentials = google_credentials()
    docs_service = build('docs', 'v1', credentials=credentials)
    return docs_service.documents()

def get_ytvideo_snippets(ytids):
snippets = []
if len(ytids) > 50:
Expand Down Expand Up @@ -204,7 +208,7 @@ def string_to_media(s, mimeType):
resumable=True,
)

def create_doc(filename=None, html=None, rtf=None, folder_id=None, creator=None, custom_properties: dict[str, str] = None):
def create_doc(filename=None, html=None, rtf=None, folder_id=None, creator=None, custom_properties: dict[str, str] = None, replace_doc=False):
if bool(html) == bool(rtf):
raise ValueError("Please specify either rtf OR html.")
drive_service = session()
Expand All @@ -224,7 +228,7 @@ def create_doc(filename=None, html=None, rtf=None, folder_id=None, creator=None,
media = string_to_media(html, 'text/html')
if rtf:
media = string_to_media(rtf, 'application/rtf')
return _perform_upload(metadata, media, verbose=False)
return _perform_upload(metadata, media, verbose=False, update_file=replace_doc)

def get_file_contents(fileid, verbose=True):
"""Downloads and returns the contents of fileid in a BytesIO buffer"""
Expand Down Expand Up @@ -266,11 +270,15 @@ def upload_to_google_drive(file_path, creator=None, filename=None, folder_id=Non
media = MediaFileUpload(file_path, resumable=True)
return _perform_upload(file_metadata, media, verbose=verbose)

def _perform_upload(file_metadata, media, verbose=True):
def _perform_upload(file_metadata, media, verbose=True, update_file=False):
drive_service = session()
try:
# Upload the file
request = drive_service.files().create(body=file_metadata, media_body=media)
request = None
if update_file:
request = drive_service.files().update(fileId=update_file, body=file_metadata, media_body=media)
else:
request = drive_service.files().create(body=file_metadata, media_body=media)
response = None
while response is None:
status, response = request.next_chunk()
Expand Down
164 changes: 164 additions & 0 deletions scripts/refresh_catalog_doc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
#!/bin/python

import gdrive
import datetime
from collections import defaultdict
from functools import cache
import hashlib

# Drive ID of the folder the catalog walk starts from.
ROOT_FOLDER = "1RJi6bEXa25zizGdsm5evCycYuY6a2D8r"
# Comma-separated file fields requested from gdrive for every listed item.
FIELDS = "id,name,mimeType,size,shortcutDetails,createdTime,webViewLink"
# MD5 hex digests of owner email addresses. Shortcut targets whose first owner
# hashes to one of these are skipped when folders are loaded.
MY_EMAILS = {
    'aee5188bd988b0ab263a6b3003831c6e',
    'e55371a7e1b97300ea623338dbcc0694',
    '3945098d73ac3a594febd2c87d357971',
    '3b654b6ccfb53f233fbd798415b62624',
    'b9083baac482b28ac374ebe1856bfefc',
    '7f519cc091d7690b440aa4db74141a94',
    'd97d9501979b0a1442b0482418509a84',
}

# Tag name plus attributes (without the angle brackets) used for every cell of
# the MIME-type summary table.
TD = 'td style="padding:5pt;"'

def md5(text):
    """Return the hexadecimal MD5 digest of ``text`` (encoded with ``str.encode()``)."""
    hasher = hashlib.md5()
    hasher.update(text.encode())
    return hasher.hexdigest()

def human_readable_size(bytes_size):
    """Format a byte count as a short string such as ``"1.50 KB"``.

    Zero is rendered as ``"0 B"``. Any other value is divided by 1024 until it
    drops below 922 (about 0.9 of the next unit) or the largest unit, PB, is
    reached, and is then printed with two decimals.
    """
    if bytes_size == 0:
        return "0 B"
    *smaller_units, largest_unit = ("B", "KB", "MB", "GB", "TB", "PB")
    remaining = float(bytes_size)
    for unit in smaller_units:
        # Stop as soon as the value no longer qualifies for the next unit up.
        if not remaining >= 922:
            return f"{remaining:.2f} {unit}"
        remaining /= 1024
    return f"{remaining:.2f} {largest_unit}"

def headerize(text, depth):
    """Wrap ``text`` in the HTML heading markup appropriate for ``depth``.

    Args:
        text: HTML fragment to use as the heading content (inserted verbatim).
        depth: Nesting level of the folder being rendered.

    Returns:
        ``''`` for depth < 1, a plain ``<hN>`` for depths 1-4, an ``<hN>``
        forced to 11pt for depths 5-6, and, since HTML has no h7+, an
        indented ``<p>`` prefixed with ``+`` for anything deeper.
    """
    if depth < 1:
        return ''
    if depth <= 4:
        return f"<h{depth}>{text}</h{depth}>"
    if depth <= 6:
        # CSS declarations are "property: value"; the previous "font-size=11pt"
        # was not a valid declaration and was therefore ignored.
        return f'<h{depth} style="font-size:11pt;">{text}</h{depth}>'
    space = "&nbsp;&nbsp;" * (depth - 1)
    return f"<p>{space}+ {text}</p>"

# Folder IDs for which a DriveFolder has already been constructed.
# DriveFolder.__init__ raises ValueError if the same ID shows up a second time.
seen_folders = set()

class DriveFolder:
    """A Drive folder loaded eagerly together with its entire subtree.

    Constructing an instance queries gdrive for the folder's children,
    resolves shortcuts, and recursively constructs a DriveFolder for every
    subfolder found.

    Attributes:
        name: Display name of the folder.
        id: Drive ID of the folder.
        createdTime: Creation timestamp string used to sort sibling folders.
        depth: Nesting level (the caller passes 0 for the root).
        files: File records (dicts) directly inside the folder, sorted by
            their 'createdTime'; each has an integer 'size'.
        subfolders: Child DriveFolder objects, sorted by createdTime.
    """

    def __init__(self, name: str, folderid: str, createdTime: str, depth: int) -> None:
        """Load the folder ``folderid`` and everything beneath it.

        Raises:
            ValueError: if ``folderid`` was already loaded by another
                DriveFolder (tracked in the module-level ``seen_folders``).
        """
        if folderid in seen_folders:
            raise ValueError(f"Folder already seen: {folderid}")
        seen_folders.add(folderid)
        print(f"Loading folder \"{name}\"...")
        self.name = name
        # Must be the Drive folder ID, not the `id` builtin: list_files()
        # builds the folder's hyperlink from this attribute.
        self.id = folderid
        self.createdTime = createdTime
        self.depth = depth
        self.files = []
        subfolders = []
        shortcuts = []
        query = f"trashed=false AND '{folderid}' in parents"
        for child in gdrive.all_files_matching(query, FIELDS):
            if child['mimeType'] == 'application/vnd.google-apps.folder':
                subfolders.append(child)
                continue
            if child['mimeType'] == 'application/vnd.google-apps.shortcut':
                shortcuts.append(child)
                continue
            # Items without a reported size count as 0 bytes.
            child['size'] = int(child.get('size', 0))
            self.files.append(child)
        if len(shortcuts) > 0:
            print(f" Resolving {len(shortcuts)} shortcut(s)...")
            # Index shortcuts by target ID once; setdefault keeps the first
            # shortcut when several point at the same target.
            shortcut_by_target = {}
            for candidate in shortcuts:
                shortcut_by_target.setdefault(candidate['shortcutDetails']['targetId'], candidate)
            for child in gdrive.batch_get_files_by_id(
                [c['shortcutDetails']['targetId'] for c in shortcuts],
                FIELDS+',owners'
            ):
                shortcut = shortcut_by_target[child['id']]
                owner = child['owners'][0]
                if md5(owner['emailAddress']) in MY_EMAILS:
                    print(f" Skipping {shortcut['name']}->{child['name']} because it's owned by me")
                    continue
                # List the target under the shortcut's name and creation time,
                # keeping the target's own values under the original* keys.
                child['originalName'] = child['name']
                child['originalCreatedTime'] = child['createdTime']
                child['name'] = shortcut['name']
                child['createdTime'] = shortcut['createdTime']
                if child['mimeType'] == 'application/vnd.google-apps.folder':
                    subfolders.append(child)
                else:
                    child['size'] = int(child.get('size', 0))
                    self.files.append(child)
        print(f" Got {len(self.files)} files and {len(subfolders)} subfolders")
        self.subfolders = []
        for child in subfolders:
            self.subfolders.append(DriveFolder(
                child['name'],
                child['id'],
                child['createdTime'],
                self.depth + 1,
            ))
        self.files = sorted(self.files, key=lambda f: f['createdTime'])
        self.subfolders = sorted(self.subfolders, key=lambda f: f.createdTime)

    @cache
    def total_size(self):
        """Return the summed 'size' of all files in this folder and its subtree."""
        return sum(f['size'] for f in self.files) + sum(f.total_size() for f in self.subfolders)

    @cache
    def total_count(self):
        """Return the number of files in this folder and its subtree."""
        return sum(f.total_count() for f in self.subfolders) + len(self.files)

    def file_count_by_mimetype(self):
        """Return ``{mimeType: {'size': int, 'count': int}}`` for the whole subtree.

        The result is a defaultdict, so looking up an unknown MIME type yields
        (and inserts) a zeroed entry.
        """
        ret = defaultdict(lambda: {'size': 0, 'count': 0})
        # Single pass over this folder's own files.
        for f in self.files:
            entry = ret[f['mimeType']]
            entry['size'] += f['size']
            entry['count'] += 1
        # Fold in the totals reported by each subfolder.
        for child in self.subfolders:
            for t, subcount in child.file_count_by_mimetype().items():
                ret[t]['count'] += subcount['count']
                ret[t]['size'] += subcount['size']
        return ret

    def list_files(self):
        """Return the HTML listing for this folder, its files, and its subfolders.

        The folder heading comes first (headerize() yields '' for depth 0),
        followed by one paragraph per file and then each subfolder's listing,
        all joined with newlines.
        """
        # NOTE(review): folder and file names are interpolated into the HTML
        # without escaping; names containing '<' or '&' may need html.escape().
        space = '&nbsp;&nbsp;'*self.depth
        ret = [headerize(
            f'<a href="{gdrive.FOLDER_LINK_PREFIX}{self.id}">{self.name}</a> <span style="color:#666666;">({human_readable_size(self.total_size())})</span>',
            self.depth,
        )]
        for child in self.files:
            ret.append(f"""<p>{space}- <a href="{child['webViewLink']}">{child['name']}</a></p>""")
        for child in self.subfolders:
            ret.append(child.list_files())
        return '\n'.join(ret)

# Script entry point: crawl the library, render the catalog as HTML, and
# upload it over the existing public Google Doc.
if __name__ == "__main__":

    # Constructing the root DriveFolder loads the whole tree recursively.
    # Depth 0 means headerize() emits no heading for the root itself.
    # NOTE(review): the createdTime passed for the root looks like a
    # placeholder, since the root has no siblings to be sorted against.
    root = DriveFolder("A Curated Buddhist G-Library", ROOT_FOLDER, "2019-01-01T00:00:00Z", 0)
    total_size = human_readable_size(root.total_size())
    total_count = root.total_count()
    print("\n==================\nFinished fetching data!\n==================\n")

    # The document body: a title, a generation timestamp in UTC, a per-MIME-type
    # summary table (one row per type plus a bold totals row), and then the
    # nested file listing produced by root.list_files().
    html = f"""<html>
<head><meta content="text/html; charset=UTF-8"></head>
<body class="doc-content">
<p class="title" style="font-size:26pt;padding-bottom:3pt;line-height:1.15;page-break-after:avoid;font-family:&quot;Arial&quot;;orphans:2;widows:2;text-align:left;"><span style="font-weight:400;text-decoration:none;vertical-align:baseline;font-size:26pt;font-family:&quot;Arial&quot;;font-style:normal">Buddhist G-Library Catalog</span></p>
<p>An automatically generated list of all the files in the Library.</p>
<p>Generated on {datetime.datetime.now(datetime.timezone.utc).strftime("%a, %d %b %Y %H:%M:%S GMT")}</p><p></p>
<p>In total, the library is {total_size} large and contains {total_count} files. They break down by MIME type as follows:</p>
<table><tr style="text-decoration:underline;"><{TD}>MIME Type</td><{TD}>Count</td><{TD}>Size</td></tr>
{"".join(f"<tr><{TD}>{t}</td><{TD}>{c['count']}</td><{TD}>{human_readable_size(c['size'])}</td></tr>" for t, c in root.file_count_by_mimetype().items())}
<tr style="font-weight:700;"><{TD}>Total</td><{TD}>{total_count}</td><{TD}>{total_size}</td></tr>
</table><p></p><h1>Files</h1><p></p>{root.list_files()}
</body>
</html>
"""

    print("Replacing public doc with new version...")
    # replace_doc is the ID of the existing document; gdrive.create_doc forwards
    # it so the upload updates that file instead of creating a new one.
    docid = gdrive.create_doc(
        html=html,
        creator="CatalogBuilder",
        replace_doc="1rGLm9Xh5de0e3hsMY2yyt97MWBuZJ1V1_q0jhGe7vpw",
    )
    print(f"Done! See https://docs.google.com/document/d/{docid}/edit")

0 comments on commit 151ae04

Please sign in to comment.