-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add python script for refreshing the Drive Catalog
- Loading branch information
Khemarato Bhikkhu
committed
Aug 25, 2024
1 parent
d7f6214
commit b8f1391
Showing
4 changed files
with
200 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Regenerates the Drive catalog document by running
# scripts/refresh_catalog_doc.py on a monthly schedule or on demand.
name: GDrive Cataloguer
on:
  # Allows the job to be started by hand from the Actions tab.
  workflow_dispatch:
  schedule:
    # Minute 50, hour 0, day-of-month 24: once a month.
    # GitHub Actions evaluates cron schedules in UTC.
    - cron: "50 0 24 * *"
jobs:
  gdrive:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout the Code
        # v4 replaces v3, which runs on the deprecated Node 16 runtime.
        uses: actions/checkout@v4
      # NOTE(review): each secret below is expanded straight into the script
      # text and used as the printf *format* string, so any "%" or backslash
      # in a secret is interpreted by printf and any unescaped double quote is
      # consumed by the shell. Left unchanged because the stored secrets may
      # have been escaped to suit this exact quoting - confirm before
      # switching to `env:` + `printf '%s' "$VAR"`.
      - name: Install Dependencies
        run: |
          pip install titlecase pyyaml pypdf tqdm yaspin bs4 google google-api-python-client google_auth_oauthlib joblib youtube-transcript-api
          printf "${{ secrets.GTOKEN }}" > ~/gtoken.json
          printf "${{ secrets.LIBRARY_UTILS_CLIENT_SECRET }}" > ~/library-utils-client-secret.json
          printf "${{ secrets.ARCHIVE_ORG_AUTH }}" > ~/archive.org.auth
      - name: Run Cataloguer
        run: python scripts/refresh_catalog_doc.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
#!/bin/python | ||
|
||
import gdrive | ||
import datetime | ||
from collections import defaultdict | ||
from functools import cache | ||
import hashlib | ||
|
||
# Drive ID of the folder the catalog walk starts from.
ROOT_FOLDER = "1RJi6bEXa25zizGdsm5evCycYuY6a2D8r"

# Comma-separated list of file fields requested with every Drive query.
FIELDS = ",".join((
    "id",
    "name",
    "mimeType",
    "size",
    "shortcutDetails",
    "createdTime",
    "webViewLink",
))

# MD5 hex digests of e-mail addresses. A shortcut target whose first owner's
# address hashes to one of these is skipped as "owned by me"; storing digests
# keeps the addresses themselves out of the source.
MY_EMAILS = {
    'aee5188bd988b0ab263a6b3003831c6e',
    'e55371a7e1b97300ea623338dbcc0694',
    '3945098d73ac3a594febd2c87d357971',
    '3b654b6ccfb53f233fbd798415b62624',
    'b9083baac482b28ac374ebe1856bfefc',
    '7f519cc091d7690b440aa4db74141a94',
    'd97d9501979b0a1442b0482418509a84',
}

# Contents of an opening table-cell tag (without the angle brackets), so that
# every cell in the generated table shares the same padding.
TD = 'td style="padding:5pt;"'
|
||
def md5(text):
    """Return the hexadecimal MD5 digest of *text* (encoded as UTF-8)."""
    hasher = hashlib.md5()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()
|
||
def human_readable_size(bytes_size):
    """Format a byte count as a short string such as ``"1.50 KB"``.

    Zero is rendered as ``"0 B"``; every other value is shown with two
    decimals in the first unit (B, KB, MB, GB, TB, PB) in which it is below
    922. Values too large for that are expressed in PB.
    """
    if bytes_size == 0:
        return "0 B"
    *smaller_units, largest_unit = ("B", "KB", "MB", "GB", "TB", "PB")
    value = float(bytes_size)
    for unit in smaller_units:
        # 922 is roughly 0.9 * 1024, so the switch to the next unit happens
        # slightly early ("0.90 KB" rather than "922.00 B").
        if not value >= 922:
            return f"{value:.2f} {unit}"
        value /= 1024
    return f"{value:.2f} {largest_unit}"
|
||
def headerize(text, depth):
    """Wrap *text* in the HTML heading markup appropriate for *depth*.

    Args:
        text: HTML fragment to use as the heading content (inserted as-is).
        depth: Nesting level of the heading.

    Returns:
        * ``''`` when *depth* is below 1 (no heading is emitted),
        * ``<h1>``..``<h4>`` for depths 1-4,
        * ``<h5>``/``<h6>`` forced to 11pt for depths 5-6,
        * a ``<p>`` prefixed with ``depth - 1`` spaces and ``"+ "`` for
          anything deeper, since HTML has no heading level beyond ``<h6>``.
    """
    if depth < 1:
        return ''
    if depth <= 4:
        return f"<h{depth}>{text}</h{depth}>"
    if depth <= 6:
        # CSS declarations are "property: value"; the previous
        # "font-size=11pt;" was invalid and therefore ignored.
        return f'<h{depth} style="font-size:11pt;">{text}</h{depth}>'
    # NOTE(review): runs of ordinary spaces normally collapse when HTML is
    # rendered - confirm this indentation survives the document import.
    space = " "*(depth-1)
    return f"<p>{space}+ {text}</p>"
|
||
# IDs of every folder a DriveFolder has been built for so far. Constructing a
# second DriveFolder for the same ID raises, so a folder is never walked twice.
seen_folders = set()


class DriveFolder:
    """A Drive folder loaded eagerly together with everything beneath it.

    Constructing an instance queries ``gdrive`` for the folder's children,
    resolves any shortcuts among them, and recursively builds a
    ``DriveFolder`` for every subfolder.

    Attributes set by ``__init__``:
        name: Display name of the folder.
        id: Drive ID of the folder.
        createdTime: Creation timestamp string used for sorting.
        depth: Nesting level (the root is created with 0).
        files: Child file records (dicts), sorted by ``createdTime``; each
            has an integer ``size``.
        subfolders: Child ``DriveFolder`` objects, sorted by ``createdTime``.
    """

    _FOLDER_MIMETYPE = 'application/vnd.google-apps.folder'
    _SHORTCUT_MIMETYPE = 'application/vnd.google-apps.shortcut'

    def __init__(self, name: str, folderid: str, createdTime: str, depth: int) -> None:
        """Load the folder *folderid* and, recursively, all of its subfolders.

        Raises:
            ValueError: if a DriveFolder was already created for *folderid*.
        """
        if folderid in seen_folders:
            raise ValueError(f"Folder already seen: {folderid}")
        seen_folders.add(folderid)
        print(f"Loading folder \"{name}\"...")
        self.name = name
        # Store the folder's own ID. This used to assign the builtin ``id``
        # function, which broke every folder link built by list_files().
        self.id = folderid
        self.createdTime = createdTime
        self.depth = depth
        self.files = []
        subfolders = []
        shortcuts = []
        query = f"trashed=false AND '{folderid}' in parents"
        for child in gdrive.all_files_matching(query, FIELDS):
            mimetype = child['mimeType']
            if mimetype == self._FOLDER_MIMETYPE:
                subfolders.append(child)
            elif mimetype == self._SHORTCUT_MIMETYPE:
                shortcuts.append(child)
            else:
                # Records without a 'size' entry are counted as 0 bytes.
                child['size'] = int(child.get('size', 0))
                self.files.append(child)
        if shortcuts:
            self._resolve_shortcuts(shortcuts, subfolders)
        print(f" Got {len(self.files)} files and {len(subfolders)} subfolders")
        loaded = [
            DriveFolder(child['name'], child['id'], child['createdTime'], self.depth + 1)
            for child in subfolders
        ]
        self.files = sorted(self.files, key=lambda f: f['createdTime'])
        self.subfolders = sorted(loaded, key=lambda f: f.createdTime)

    def _resolve_shortcuts(self, shortcuts, subfolders):
        """Replace shortcut records with their targets, filed as file or folder.

        Targets whose first owner's hashed e-mail is in ``MY_EMAILS`` are
        skipped. Every other target takes on the shortcut's name and creation
        time (the originals are kept under ``originalName`` and
        ``originalCreatedTime``) and is appended to *subfolders* or
        ``self.files``.
        """
        print(f" Resolving {len(shortcuts)} shortcut(s)...")
        # First shortcut wins when several point at the same target.
        shortcut_by_target = {}
        for shortcut in shortcuts:
            shortcut_by_target.setdefault(shortcut['shortcutDetails']['targetId'], shortcut)
        targets = gdrive.batch_get_files_by_id(
            [s['shortcutDetails']['targetId'] for s in shortcuts],
            FIELDS+',owners',
        )
        for child in targets:
            shortcut = shortcut_by_target[child['id']]
            owner = child['owners'][0]
            if md5(owner['emailAddress']) in MY_EMAILS:
                print(f" Skipping {shortcut['name']}->{child['name']} because it's owned by me")
                continue
            child['originalName'] = child['name']
            child['originalCreatedTime'] = child['createdTime']
            child['name'] = shortcut['name']
            child['createdTime'] = shortcut['createdTime']
            if child['mimeType'] == self._FOLDER_MIMETYPE:
                subfolders.append(child)
            else:
                child['size'] = int(child.get('size', 0))
                self.files.append(child)

    # NOTE: @cache keys on ``self`` and keeps every instance alive for the
    # life of the process. That is acceptable for a run-once script, and
    # keeping these as methods preserves the existing call syntax.
    @cache
    def total_size(self):
        """Return the combined size in bytes of all files in this subtree."""
        return sum(f['size'] for f in self.files) + sum(f.total_size() for f in self.subfolders)

    @cache
    def total_count(self):
        """Return the number of files in this subtree (folders not counted)."""
        return sum(f.total_count() for f in self.subfolders) + len(self.files)

    def file_count_by_mimetype(self):
        """Return per-MIME-type totals for this subtree.

        Returns:
            A ``defaultdict`` mapping each MIME type to
            ``{'size': total_bytes, 'count': number_of_files}``. Keys appear
            in order of first occurrence, so the result is deterministic.
        """
        ret = defaultdict(lambda: {'size': 0, 'count': 0})
        for f in self.files:
            totals = ret[f['mimeType']]
            totals['size'] += f['size']
            totals['count'] += 1
        for child in self.subfolders:
            for mimetype, subtotals in child.file_count_by_mimetype().items():
                ret[mimetype]['count'] += subtotals['count']
                ret[mimetype]['size'] += subtotals['size']
        return ret

    def list_files(self):
        """Return an HTML listing of this folder, its files and its subfolders.

        The folder is rendered through ``headerize`` as a link followed by its
        total size, then one ``<p>`` per file, then each subfolder's own
        listing. Names and links are HTML-escaped before insertion.
        """
        # Imported here so the block needs no change to the file's imports.
        from html import escape

        # NOTE(review): runs of ordinary spaces normally collapse when HTML is
        # rendered - confirm this indentation survives the document import.
        space = ' '*self.depth
        title = (
            f'<a href="{gdrive.FOLDER_LINK_PREFIX}{self.id}">{escape(self.name)}</a>'
            f' <span style="color:#666666;">({human_readable_size(self.total_size())})</span>'
        )
        ret = [headerize(title, self.depth)]
        for child in self.files:
            ret.append(
                f"""<p>{space}- <a href="{escape(child['webViewLink'])}">{escape(child['name'])}</a></p>"""
            )
        for child in self.subfolders:
            ret.append(child.list_files())
        return '\n'.join(ret)
|
||
def main() -> None:
    """Walk the library, render the catalog as HTML and publish it.

    Loads the whole folder tree under ``ROOT_FOLDER``, builds an HTML page
    with a per-MIME-type summary table and the full file listing, and hands
    it to ``gdrive.create_doc`` to replace the existing public document.
    """
    root = DriveFolder("A Curated Buddhist G-Library", ROOT_FOLDER, "2019-01-01T00:00:00Z", 0)
    total_size = human_readable_size(root.total_size())
    total_count = root.total_count()
    print("\n==================\nFinished fetching data!\n==================\n")

    generated_on = datetime.datetime.now(datetime.timezone.utc).strftime("%a, %d %b %Y %H:%M:%S GMT")
    # Sort by MIME type so the table is identical from one run to the next.
    mimetype_rows = "".join(
        f"<tr><{TD}>{t}</td><{TD}>{c['count']}</td><{TD}>{human_readable_size(c['size'])}</td></tr>"
        for t, c in sorted(root.file_count_by_mimetype().items(), key=lambda item: item[0])
    )

    # Two markup fixes relative to the previous template:
    #  * font-family is written without inner double quotes, which used to
    #    terminate the surrounding style="..." attribute early;
    #  * the <meta> tag carries http-equiv so the charset is actually declared.
    html = f"""<html>
<head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head>
<body class="doc-content">
<p class="title" style="font-size:26pt;padding-bottom:3pt;line-height:1.15;page-break-after:avoid;font-family:Arial;orphans:2;widows:2;text-align:left;"><span style="font-weight:400;text-decoration:none;vertical-align:baseline;font-size:26pt;font-family:Arial;font-style:normal">Buddhist G-Library Catalog</span></p>
<p>An automatically generated list of all the files in the Library.</p>
<p>Generated on {generated_on}</p><p></p>
<p>In total, the library is {total_size} large and contains {total_count} files. They break down by MIME type as follows:</p>
<table><tr style="text-decoration:underline;"><{TD}>MIME Type</td><{TD}>Count</td><{TD}>Size</td></tr>
{mimetype_rows}
<tr style="font-weight:700;"><{TD}>Total</td><{TD}>{total_count}</td><{TD}>{total_size}</td></tr>
</table><p></p><h1>Files</h1><p></p>{root.list_files()}
</body>
</html>
"""

    print("Replacing public doc with new version...")
    docid = gdrive.create_doc(
        html=html,
        creator="CatalogBuilder",
        replace_doc="1rGLm9Xh5de0e3hsMY2yyt97MWBuZJ1V1_q0jhGe7vpw",
    )
    print(f"Done! See https://docs.google.com/document/d/{docid}/edit")


if __name__ == "__main__":
    main()