Add python script for refreshing the Drive Catalog

buddhist-uni · Aug 25, 2024 · 151ae04 · 151ae04
1 parent d7f6214
commit 151ae04
Show file tree

Hide file tree

Showing 5 changed files with 201 additions and 10 deletions.
diff --git a/.github/workflows/archive.yml b/.github/workflows/archive.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout the Code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           ref: main
       - name: Install Dependencies

diff --git a/.github/workflows/catalogue.yml b/.github/workflows/catalogue.yml
@@ -0,0 +1,19 @@
+# Workflow that runs scripts/refresh_catalog_doc.py, either on demand
+# (workflow_dispatch) or on the cron schedule below.
+name: GDrive Cataloguer
+on:
+  workflow_dispatch:
+  schedule:
+    # minute=50 hour=0 day-of-month=24: 00:50 on the 24th of every month.
+    - cron: "50 0 24 * *"
+jobs:
+  gdrive:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout the Code
+        uses: actions/checkout@v4
+      - name: Install Dependencies
+        # Installs the Python packages, then writes three repository secrets to
+        # files in the runner's home directory.
+        # NOTE(review): each secret is substituted directly into printf's
+        # *format* argument, so a '%' or backslash sequence inside a secret is
+        # interpreted by printf and a quote character would end the shell string
+        # early. Consider exposing the secrets through `env:` and writing them
+        # with printf '%s' "$VAR" instead -- confirm the stored secret format first.
+        run: |
+          pip install titlecase pyyaml pypdf tqdm yaspin bs4 google google-api-python-client google_auth_oauthlib joblib youtube-transcript-api
+          printf '${{ secrets.GTOKEN }}' > ~/gtoken.json
+          printf '${{ secrets.LIBRARY_UTILS_CLIENT_SECRET }}' > ~/library-utils-client-secret.json
+          printf "${{ secrets.ARCHIVE_ORG_AUTH }}" > ~/archive.org.auth
+      - name: Run Cataloguer
+        run: python scripts/refresh_catalog_doc.py
diff --git a/_layouts/default.html b/_layouts/default.html
@@ -84,6 +84,9 @@
 </head>
 <body>
 <a class="skip-to-main" href="#main">Skip to content</a>
+<noscript>
+  <img src="https://{{ site.goatcounter }}.goatcounter.com/count?p=/{{ page.url }}&t={{ page.title | url_encode }}">
+</noscript>
 
 <header class="site-header{% if page.banner_info %} banner-img{% if page.next_courses %} fullsize{% elsif site.header_pages contains page.path %} smallsize{% endif %}{% if page.image_center_y %}" style="background-position-y: {{ page.image_center_y }};{% if page.image_center_x %} background-position-x: {{ page.image_center_x }};{% endif %}{% endif %}{% endif %}" role="banner">
   <div class="wrapper"{% if page.banner_info %} style="backdrop-filter: blur(0.3px);"{% endif %}>
@@ -145,9 +148,6 @@ <h2 class="footer-heading">{{ site.title | escape }}</h2>
     data-goatcounter-settings='{"allow_frame": true, "title": "{{ page.title | replace: '"', '\\"' }}"}'
     async src="//gc.zgo.at/count.js">
   </script>
-  <noscript>
-    <img src="https://{{ site.goatcounter }}.goatcounter.com/count?p=/{{ page.url }}&t={{ page.title | url_encode }}">
-  </noscript>
   <script async src="/assets/js/goatclicker.js">
   </script>
 {%- endif -%}

diff --git a/scripts/gdrive.py b/scripts/gdrive.py
@@ -3,7 +3,7 @@
 import os.path
 from pathlib import Path
 import requests
-import struct
+import socket
 from math import floor
 from io import BytesIO, BufferedIOBase
 from strutils import (
@@ -21,7 +21,6 @@
 import pdfutils
 import json
 import re
-import shutil
 from functools import cache
 from archivedotorg import archive_urls
 try:
@@ -137,12 +136,17 @@ def google_credentials():
 
 @cache
 def session():
+    """Return the 'drive' v3 service object, built once and memoized by @cache.
+
+    The first call also sets the process-wide default socket timeout to 300
+    seconds; that default applies to sockets created afterwards.
+    """
+    socket.setdefaulttimeout(300) # some of our uploads take a while...
     return build('drive', 'v3', credentials=google_credentials())
 
 @cache
 def youtube():
+    """Return the 'youtube' v3 service object, built once and memoized by @cache."""
     return build('youtube', 'v3', credentials=google_credentials())
 
+@cache
+def docs():
+  """Return the documents() resource of the 'docs' v1 service, built once and memoized by @cache."""
+  return build('docs', 'v1', credentials=google_credentials()).documents()
+
 def get_ytvideo_snippets(ytids):
   snippets = []
   if len(ytids) > 50:
@@ -204,7 +208,7 @@ def string_to_media(s, mimeType):
     resumable=True,
   )
 
-def create_doc(filename=None, html=None, rtf=None, folder_id=None, creator=None, custom_properties: dict[str, str] = None):
+def create_doc(filename=None, html=None, rtf=None, folder_id=None, creator=None, custom_properties: dict[str, str] = None, replace_doc=False):
   if bool(html) == bool(rtf):
     raise ValueError("Please specify either rtf OR html.")
   drive_service = session()
@@ -224,7 +228,7 @@ def create_doc(filename=None, html=None, rtf=None, folder_id=None, creator=None,
     media = string_to_media(html, 'text/html')
   if rtf:
     media = string_to_media(rtf, 'application/rtf')
-  return _perform_upload(metadata, media, verbose=False)
+  return _perform_upload(metadata, media, verbose=False, update_file=replace_doc)
 
 def get_file_contents(fileid, verbose=True):
   """Downloads and returns the contents of fileid in a BytesIO buffer"""
@@ -266,11 +270,15 @@ def upload_to_google_drive(file_path, creator=None, filename=None, folder_id=Non
     media = MediaFileUpload(file_path, resumable=True)
     return _perform_upload(file_metadata, media, verbose=verbose)
 
-def _perform_upload(file_metadata, media, verbose=True):
+def _perform_upload(file_metadata, media, verbose=True, update_file=False):
     drive_service = session()
     try:
         # Upload the file
-        request = drive_service.files().create(body=file_metadata, media_body=media)
+        request = None
+        if update_file:
+          request = drive_service.files().update(fileId=update_file, body=file_metadata, media_body=media)
+        else:
+          request = drive_service.files().create(body=file_metadata, media_body=media)
         response = None
         while response is None:
             status, response = request.next_chunk()

diff --git a/scripts/refresh_catalog_doc.py b/scripts/refresh_catalog_doc.py
@@ -0,0 +1,164 @@
+#!/bin/python
+
+import gdrive
+import datetime
+from collections import defaultdict
+from functools import cache
+import hashlib
+
+# Drive ID of the folder the catalog walk starts from (passed to the root DriveFolder).
+ROOT_FOLDER = "1RJi6bEXa25zizGdsm5evCycYuY6a2D8r"
+# Drive file fields requested for every listed child and every resolved shortcut target.
+FIELDS = "id,name,mimeType,size,shortcutDetails,createdTime,webViewLink"
+# MD5 hex digests. The owner email of each shortcut target is hashed with md5()
+# and compared against this set; matching targets are left out of the catalog.
+MY_EMAILS = {
+  'aee5188bd988b0ab263a6b3003831c6e',
+  'e55371a7e1b97300ea623338dbcc0694',
+  '3945098d73ac3a594febd2c87d357971',
+  '3b654b6ccfb53f233fbd798415b62624',
+  'b9083baac482b28ac374ebe1856bfefc',
+  '7f519cc091d7690b440aa4db74141a94',
+  'd97d9501979b0a1442b0482418509a84',
+}
+
+# Opening-tag text (without the angle brackets) shared by every cell of the
+# MIME-type summary table.
+TD = 'td style="padding:5pt;"'
+
+def md5(text):
+  """Return the hexadecimal MD5 digest of `text`, encoded with str.encode()'s default (UTF-8)."""
+  digest = hashlib.md5()
+  digest.update(text.encode())
+  return digest.hexdigest()
+
+def human_readable_size(bytes_size):
+    """Format a byte count as a two-decimal number followed by a unit suffix.
+
+    Zero is rendered as "0 B". Any other value is divided by 1024 for as long
+    as it is at least 922 (roughly 0.9 of the next unit), so 922 bytes already
+    reads "0.90 KB". "PB" is the largest unit used.
+    """
+    if bytes_size == 0:
+        return "0 B"
+    suffixes = ("B", "KB", "MB", "GB", "TB", "PB")
+    last = len(suffixes) - 1
+    value = float(bytes_size)
+    for position, suffix in enumerate(suffixes):
+        # Stop at the first unit where the value is no longer >= 922, or at the last unit.
+        if position == last or not (value >= 922):
+            break
+        value /= 1024
+    return f"{value:.2f} {suffix}"
+
+def headerize(text, depth):
+  """Wrap `text` in the markup used for a folder heading at nesting `depth`.
+
+  - depth < 1: returns '' (no heading at all)
+  - depth 1-4: a plain <h1>..<h4>
+  - depth 5-6: an <h5>/<h6> with an inline 11pt font size
+  - deeper: a <p> indented with non-breaking spaces and prefixed with "+"
+
+  `text` is inserted as-is, so it may itself contain HTML.
+  """
+  if depth < 1:
+    return ''
+  if depth <= 4:
+    return f"<h{depth}>{text}</h{depth}>"
+  if depth <= 6:
+    # CSS declarations are written "property: value"; "font-size=11pt" is not
+    # a valid declaration.
+    return f'<h{depth} style="font-size:11pt;">{text}</h{depth}>'
+  space = "&nbsp;&nbsp;"*(depth-1)
+  return f"<p>{space}+ {text}</p>"
+
+# IDs of every folder a DriveFolder has been constructed for so far (module-wide).
+seen_folders = set()
+
+class DriveFolder:
+  """A Drive folder together with, recursively, everything beneath it.
+
+  Constructing an instance queries Drive (through the gdrive module) for the
+  folder's children and builds a DriveFolder for every subfolder, so creating
+  the root walks the entire tree.
+
+  Attributes set by the constructor:
+    name, createdTime, depth: the values passed in.
+    id: the Drive folder ID (`folderid`).
+    files: file dicts directly in this folder, each with an int 'size',
+      sorted by 'createdTime'.
+    subfolders: child DriveFolder objects, sorted by createdTime.
+  """
+  def __init__(self, name: str, folderid: str, createdTime: str, depth: int) -> None:
+    """Load the folder `folderid` and all of its descendants.
+
+    Raises:
+      ValueError: if a DriveFolder was already constructed for `folderid`.
+    """
+    if folderid in seen_folders:
+      raise ValueError(f"Folder already seen: {folderid}")
+    seen_folders.add(folderid)
+    print(f"Loading folder \"{name}\"...")
+    self.name = name
+    # Store the folder's Drive ID. Assigning the bare name `id` here would
+    # store the builtin function and break the folder link in list_files().
+    self.id = folderid
+    self.createdTime = createdTime
+    self.depth = depth
+    self.files = []
+    subfolders = []
+    shortcuts = []
+    query = f"trashed=false AND '{folderid}' in parents"
+    for child in gdrive.all_files_matching(query, FIELDS):
+      if child['mimeType'] == 'application/vnd.google-apps.folder':
+        subfolders.append(child)
+        continue
+      if child['mimeType'] == 'application/vnd.google-apps.shortcut':
+        shortcuts.append(child)
+        continue
+      # Items without a 'size' field are counted as 0 bytes.
+      child['size'] = int(child.get('size', 0))
+      self.files.append(child)
+    if shortcuts:
+      print(f"  Resolving {len(shortcuts)} shortcut(s)...")
+      # Index the shortcuts by target ID once. setdefault keeps the first
+      # shortcut when several point at the same target.
+      shortcut_by_target = {}
+      for s in shortcuts:
+        shortcut_by_target.setdefault(s['shortcutDetails']['targetId'], s)
+      for child in gdrive.batch_get_files_by_id(
+        [c['shortcutDetails']['targetId'] for c in shortcuts],
+        FIELDS+',owners'
+      ):
+        shortcut = shortcut_by_target[child['id']]
+        # Drive does not always populate `owners` or the owner's email address,
+        # so a missing value is treated as "not owned by me".
+        owners = child.get('owners') or [{}]
+        owner_email = owners[0].get('emailAddress', '')
+        if md5(owner_email) in MY_EMAILS:
+          print(f"  Skipping {shortcut['name']}->{child['name']} because it's owned by me")
+          continue
+        # List the target under the shortcut's name and creation time, keeping
+        # the target's own values under the original* keys.
+        child['originalName'] = child['name']
+        child['originalCreatedTime'] = child['createdTime']
+        child['name'] = shortcut['name']
+        child['createdTime'] = shortcut['createdTime']
+        if child['mimeType'] == 'application/vnd.google-apps.folder':
+          subfolders.append(child)
+        else:
+          child['size'] = int(child.get('size', 0))
+          self.files.append(child)
+    print(f"  Got {len(self.files)} files and {len(subfolders)} subfolders")
+    self.subfolders = []
+    for child in subfolders:
+      self.subfolders.append(DriveFolder(
+        child['name'],
+        child['id'],
+        child['createdTime'],
+        self.depth + 1,
+      ))
+    self.files = sorted(self.files, key=lambda f: f['createdTime'])
+    self.subfolders = sorted(self.subfolders, key=lambda f: f.createdTime)
+
+  @cache
+  def total_size(self):
+    """Return the summed 'size' of the files here and in all nested subfolders (memoized)."""
+    return sum(f['size'] for f in self.files) + sum(f.total_size() for f in self.subfolders)
+
+  @cache
+  def total_count(self):
+    """Return the number of files here and in all nested subfolders (memoized)."""
+    return sum(f.total_count() for f in self.subfolders) + len(self.files)
+
+  def file_count_by_mimetype(self):
+    """Return a defaultdict mapping MIME type to {'size': bytes, 'count': files}.
+
+    Covers the files directly in this folder plus those of every nested subfolder.
+    """
+    ret = defaultdict(lambda: {'size': 0, 'count': 0})
+    # One pass over the direct files. Keys are inserted in first-seen order,
+    # which is deterministic, unlike iterating a set of MIME-type strings.
+    for f in self.files:
+      entry = ret[f['mimeType']]
+      entry['size'] += f['size']
+      entry['count'] += 1
+    for child in self.subfolders:
+      for t, sub in child.file_count_by_mimetype().items():
+        ret[t]['count'] += sub['count']
+        ret[t]['size'] += sub['size']
+    return ret
+
+  def list_files(self):
+    """Return the HTML listing of this folder: heading, its files, then each subfolder's listing.
+
+    File and folder names are HTML-escaped because they come from Drive
+    metadata, including items owned by other people.
+    """
+    from html import escape  # function-scope import keeps this block self-contained
+    space = '&nbsp;&nbsp;'*self.depth
+    ret = [headerize(
+      f'<a href="{gdrive.FOLDER_LINK_PREFIX}{self.id}">{escape(self.name)}</a> <span style="color:#666666;">({human_readable_size(self.total_size())})</span>',
+      self.depth,
+    )]
+    for child in self.files:
+      ret.append(f"""<p>{space}- <a href="{child['webViewLink']}">{escape(child['name'])}</a></p>""")
+    for child in self.subfolders:
+      ret.append(child.list_files())
+    return '\n'.join(ret)
+
+if __name__ == "__main__":
+
+  # Walk the whole library starting at ROOT_FOLDER. The root is created at
+  # depth 0, for which headerize() returns '', so the root itself gets no
+  # heading in the listing. Its createdTime is a fixed literal, not a value
+  # fetched from Drive.
+  root = DriveFolder("A Curated Buddhist G-Library", ROOT_FOLDER, "2019-01-01T00:00:00Z", 0)
+  total_size = human_readable_size(root.total_size())
+  total_count = root.total_count()
+  print("\n==================\nFinished fetching data!\n==================\n")
+
+  # Document body: title, generation timestamp (UTC), a per-MIME-type summary
+  # table with a totals row, then the full nested file listing.
+  html = f"""<html>
+    <head><meta content="text/html; charset=UTF-8"></head>
+    <body class="doc-content">
+      <p class="title" style="font-size:26pt;padding-bottom:3pt;line-height:1.15;page-break-after:avoid;font-family:&quot;Arial&quot;;orphans:2;widows:2;text-align:left;"><span style="font-weight:400;text-decoration:none;vertical-align:baseline;font-size:26pt;font-family:&quot;Arial&quot;;font-style:normal">Buddhist G-Library Catalog</span></p>
+      <p>An automatically generated list of all the files in the Library.</p>
+      <p>Generated on {datetime.datetime.now(datetime.timezone.utc).strftime("%a, %d %b %Y %H:%M:%S GMT")}</p><p></p>
+      <p>In total, the library is {total_size} large and contains {total_count} files. They break down by MIME type as follows:</p>
+      <table><tr style="text-decoration:underline;"><{TD}>MIME Type</td><{TD}>Count</td><{TD}>Size</td></tr>
+      {"".join(f"<tr><{TD}>{t}</td><{TD}>{c['count']}</td><{TD}>{human_readable_size(c['size'])}</td></tr>" for t, c in root.file_count_by_mimetype().items())}
+      <tr style="font-weight:700;"><{TD}>Total</td><{TD}>{total_count}</td><{TD}>{total_size}</td></tr>
+      </table><p></p><h1>Files</h1><p></p>{root.list_files()}
+    </body>
+  </html>
+  """
+
+  print("Replacing public doc with new version...")
+  # replace_doc is forwarded by gdrive.create_doc as the file ID to update, so
+  # the existing document is overwritten instead of a new one being created.
+  # NOTE(review): assumes create_doc returns the document's ID; the upload
+  # helper's return value is not visible here -- confirm.
+  docid = gdrive.create_doc(
+    html=html,
+    creator="CatalogBuilder",
+    replace_doc="1rGLm9Xh5de0e3hsMY2yyt97MWBuZJ1V1_q0jhGe7vpw",
+  )
+  print(f"Done! See https://docs.google.com/document/d/{docid}/edit")