From 5c87df1982500a0b55adb21f5df0f1f2985e2a87 Mon Sep 17 00:00:00 2001 From: J Muchovej Date: Tue, 19 Apr 2022 22:50:54 -0400 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Migrate=20to=20using=20`cms`=20cach?= =?UTF-8?q?ing,=20filters,=20and=20downloads?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original implementation relied heavily on lots of self-managed indexes, but `@jitl/notion-api` provides much of this out-of-the-box. Some speed improvements could be made, but overall, the use of caching may be useful in the future -- as I'd like to allow for dynamic block creation to avoid needing to manually assign templates. --- package.json | 2 +- src/commands/articles/clean.ts | 24 ++- src/commands/articles/sync.ts | 147 +++++++++++++----- src/commands/authors/clean.ts | 198 ++++++++----------------- src/commands/authors/sync.ts | 74 ++++++---- src/models/article.ts | 113 ++------------ src/models/author.ts | 120 ++------------- src/notion-cms.ts | 263 +++++++++++++++++++++++++++++++++ tsconfig.json | 4 +- yarn.lock | 46 +++--- 10 files changed, 542 insertions(+), 449 deletions(-) create mode 100644 src/notion-cms.ts diff --git a/package.json b/package.json index ba63309..09d1524 100644 --- a/package.json +++ b/package.json @@ -22,7 +22,7 @@ "dependencies": { "@citation-js/core": "^0.5.4", "@citation-js/plugin-bibtex": "^0.5.6", - "@jitl/notion-api": "^0.1.2", + "@jitl/notion-api": "0.2.1", "@notionhq/client": "^1.0.4", "@oclif/core": "^1.7.0", "@types/lodash": "^4.14.180", diff --git a/src/commands/articles/clean.ts b/src/commands/articles/clean.ts index fd6ba58..53b5238 100644 --- a/src/commands/articles/clean.ts +++ b/src/commands/articles/clean.ts @@ -1,31 +1,29 @@ -import {removeEmptyRelationOrMultiSelects} from "../../notion"; -import {ArticlesDB} from "../../config"; -import BaseCommand, {BaseArgTypes, BaseFlagTypes} from '../../base'; +import {ArticlesDB} from "../../config" +import BaseCommand, {BaseArgTypes, BaseFlagTypes} from "../../base" +import {archiveEmptyFilters, ArticleCMS, createCMS} from "../../notion-cms" export default class ArticlesClean extends BaseCommand { static summary: string = `Cleans up your Articles Database.` static description: string = `1. Removes dangling articles without authors.` - static args: BaseArgTypes = BaseCommand.args; - static flags: BaseFlagTypes = BaseCommand.flags; - static examples: string[] = BaseCommand.examples; + static args: BaseArgTypes = BaseCommand.args + static flags: BaseFlagTypes = BaseCommand.flags + static examples: string[] = BaseCommand.examples public async run(): Promise { await this.parse(ArticlesClean) - const articles: ArticlesDB = this.appConfig.databases.articles + const articlesCMS: ArticleCMS = createCMS( + this.config, this.appConfig, this.notion, "articles", + ) this.log(`Removing Articles with no Authors.`) - await archivePapersWithNoAuthors(this, articles, this.appConfig.authorType) + const noAuthorsFilter = articlesCMS.filter.authors.is_empty(true) + await archiveEmptyFilters(this, articlesCMS, noAuthorsFilter) this.log() this.log() // TODO implement deduplication } } - -const archivePapersWithNoAuthors = async (CLI: BaseCommand, articlesDB: ArticlesDB, propType: string): Promise => { - const {databaseID, authorRef} = articlesDB - await removeEmptyRelationOrMultiSelects(CLI, databaseID, authorRef, propType); -} diff --git a/src/commands/articles/sync.ts b/src/commands/articles/sync.ts index a9921b3..df83ede 100644 --- a/src/commands/articles/sync.ts +++ b/src/commands/articles/sync.ts @@ -1,58 +1,133 @@ -import _ from "lodash" -import {BibTeXForNotion} from "../../bibtex"; +import {ArticlesDB, AuthorsDB} from "../../config" +import BaseCommand, {BaseArgTypes, BaseFlagTypes} from "../../base" import { - BibTeXToNotion, - initArticleDB, - prepareBibTeXForNotion -} from "../../models/article" -import {initAuthorDB} from "../../models/author" -import {createEntries, diff, updateEntries} from "../../notion" -import {ArticlesDB, AuthorsDB} from "../../config"; -import BaseCommand, {BaseArgTypes, BaseFlagTypes} from '../../base'; + ArticleCMS, + ArticlePage, + AuthorCMS, + batchEntries, + createCMS, + Relation, +} from "../../notion-cms" +import _ from "lodash" +import {BibTeXToNotion, NotionArticle} from "../../models/article" +import {richTextAsPlainText} from "@jitl/notion-api" +import {performance} from "perf_hooks" export default class ArticlesSync extends BaseCommand { static summary: string = `Syncs your Articles Database with the local BibTeX file.` static description: string = `Strictly creates or updates articles based on the ID assigned by Paperpile.` - static args: BaseArgTypes = BaseCommand.args; - static flags: BaseFlagTypes = BaseCommand.flags; - static examples: string[] = BaseCommand.examples; + static args: BaseArgTypes = BaseCommand.args + static flags: BaseFlagTypes = BaseCommand.flags + static examples: string[] = BaseCommand.examples public async run(): Promise { await this.parse(ArticlesSync) - const articlesDB: ArticlesDB = this.appConfig.databases.articles - const authorsDB: AuthorsDB = this.appConfig.databases.authors + const articleCMS: ArticleCMS = createCMS(this.config, this.appConfig, this.notion, "articles") - const {notion: articles} = await initArticleDB(articlesDB.databaseID, this.config.cacheDir) + let authorCMS: AuthorCMS | undefined + if (this.appConfig.hasAuthorDB) { + authorCMS = createCMS(this.config, this.appConfig, this.notion, "authors") + } - const {authorIndex} = await initAuthorDB(authorsDB.databaseID, this.config.cacheDir) + const Status = this.appConfig.status.states + const parent = { + database_id: this.appConfig.databases.articles.databaseID, + } - const BibTeX = _.chain(this.BibTeX).reduce( - (obj: BibTeXForNotion, bib: any, key: string) => { - obj[key] = prepareBibTeXForNotion(bib, authorIndex, this.appConfig) - return obj - }, {} - ).value() + const toUpdate: { page_id: string, properties: NotionArticle }[] = [] + const toCreate: { parent: typeof parent, properties: NotionArticle }[] = [] - this.log(`Found ${_.keys(BibTeX).length} articles in BibTeX and ${_.keys(articles).length} on Notion...`) - const {toCreate, toUpdate} = diff(_.keys(BibTeX), _.keys(articles)) + const existingPages = await fetchDB(this.BibTeX, articleCMS) - let notionCreates = _.map(toCreate, (ID: string) => { - return BibTeX[ID] - }) - while (notionCreates.length > 0) { - notionCreates = await createEntries(this, notionCreates, BibTeXToNotion, articlesDB.databaseID) + let counter: number = 0 + let startTime: number = performance.now(), + endTime: number = performance.now() + for await (const [ID, article] of _.entries(this.BibTeX)) { + article.status = _.isNil(article.status) ? undefined : Status[article.status] + + let {authors} = article + if (authors && this.appConfig.hasAuthorDB && authorCMS) { + authors = await fetchAuthors(authors, authorCMS) + } + article.authors = authors?.filter((a: any) => a) + + const properties: NotionArticle = BibTeXToNotion(this.appConfig, article) + + const page: ArticlePage | undefined = existingPages[ID] + if (page) { + toUpdate.push({page_id: (page).content.id, properties}) + } else { + toCreate.push({parent, properties}) + } + + if (counter % 100 == 0) { + endTime = performance.now() + const time: string = `${(endTime - startTime) / 1000 / 60}min` + console.log(`Cumulative time: ~${time}.`) + } + + counter++ } - let notionUpdates = _.map(toUpdate, (ID: string) => { - const update = BibTeX[ID] - const {pageID} = articles[ID as string][0].frontmatter - return {pageID, ...update} + await batchEntries(this, toCreate, async (entry: typeof toCreate[0]) => { + await articleCMS.config.notion.pages.create(entry) }) - while (notionUpdates.length > 0) { - notionUpdates = await updateEntries(this, notionUpdates, BibTeXToNotion) + + await batchEntries(this, toUpdate, async (entry: typeof toUpdate[0]) => { + await articleCMS.config.notion.pages.update(entry) + }) + } +} + +type FetchedArticleDB = { + [name: string]: ArticlePage +} + +const fetchDB = async (BibTeX: any, cms: ArticleCMS): Promise => { + const db: FetchedArticleDB = {} + + const chunks = _.chain(BibTeX).keys().chunk(100).value() + let batchId = 1 + + for await (const batch of chunks) { + const filter = cms.filter.or( + ...batch.map((id: string) => cms.filter.ID.equals(id)), + ) + for await (const page of cms.query({filter})) { + const ID = richTextAsPlainText(page.frontmatter.ID) + db[ID] = page + } + batchId++ + } + + return db +} + +const fetchAuthors = async (authors: string[], cms: AuthorCMS): Promise => { + const filter = cms.filter.or( + ...authors.map((author: string) => cms.filter.or( + cms.filter.name.equals(author), cms.filter.aliases.contains(author), + )), + ) + + const sortKeys: number[] = [] + const relations: Relation["relation"] = [] + for await (const author of cms.query({filter: filter})) { + let {content: {id}, frontmatter: {name, aliases}} = author + + name = richTextAsPlainText(name) + aliases = richTextAsPlainText(aliases) + const index: number | undefined = [name, ...aliases.split(";")].map( + (alias: string): number => authors.indexOf(alias.trim()), + ).find((n: number): boolean => n > -1) + + if (index !== undefined) { + relations.push({id}) + sortKeys.push(index) } } + return sortKeys.map((index: number) => relations[index]) } diff --git a/src/commands/authors/clean.ts b/src/commands/authors/clean.ts index 22f2097..2e3a76d 100644 --- a/src/commands/authors/clean.ts +++ b/src/commands/authors/clean.ts @@ -1,20 +1,14 @@ +import {richTextAsPlainText} from "@jitl/notion-api" +import * as _ from "lodash" +import {AuthorsDB} from "../../config" +import BaseCommand, {BaseArgTypes, BaseFlagTypes} from "../../base" import { - asyncIterableToArray, - getPageTitle, - getPropertyValue, - iteratePaginatedAPI, - richTextAsPlainText -} from "@jitl/notion-api" -import { + archiveEmptyFilters, + AuthorCMS, batchEntries, - makeRelation, - Notion, + createCMS, Relation, - removeEmptyRelationOrMultiSelects -} from "../../notion" -import * as _ from "lodash" -import {AuthorsDB} from "../../config"; -import BaseCommand, {BaseArgTypes, BaseFlagTypes} from '../../base'; +} from "../../notion-cms" export default class AuthorsClean extends BaseCommand { static summary: string = `Cleans up your Authors Database.` @@ -23,160 +17,88 @@ export default class AuthorsClean extends BaseCommand { 1. Removes dangling authors with no articles. 2. Attempts to clean up and merge authors and aliases.` - static args: BaseArgTypes = BaseCommand.args; - static flags: BaseFlagTypes = BaseCommand.flags; - static examples: string[] = BaseCommand.examples; + static args: BaseArgTypes = BaseCommand.args + static flags: BaseFlagTypes = BaseCommand.flags + static examples: string[] = BaseCommand.examples public async run(): Promise { await this.parse(AuthorsClean) - if (!this.appConfig.databases.authors) { + if (!this.appConfig.hasAuthorDB) { this.error("You don't have an Authors database. Exiting.") this.exit(0) // analogous to this.exit(0), but keeps WebStorm from whining } - const authors: AuthorsDB = this.appConfig.databases.authors + const authorsCMS: AuthorCMS = createCMS( + this.config, this.appConfig, this.notion, "authors", + ) this.log(`Removing Authors with no Articles.`) - await archiveAuthorsWithNoArticles(this, authors) + const noArticlesFilter = authorsCMS.filter.articles.is_empty(true) + await archiveEmptyFilters(this, authorsCMS, noArticlesFilter) this.log() this.log() this.log(`Attempting de-duplication based on "Aliases".`) - await deduplicateAuthors(this, authors) + await deduplicateAuthors(this, authorsCMS) this.log() this.log() } } -const archiveAuthorsWithNoArticles = async (CLI: BaseCommand, authorsDB: AuthorsDB): Promise => { - const {databaseID, articleRef} = authorsDB - await removeEmptyRelationOrMultiSelects(CLI, databaseID, articleRef, "relation") -} +const deduplicateAuthors = async (CLI: BaseCommand, cms: AuthorCMS): Promise => { + const Nicks: { [id: string]: string } = {} + const Aliases: { [alias: string]: string } = {} + + const filter = cms.filter.aliases.is_not_empty(true) + for await (const author of cms.query({filter})) { + const {content: {id}, frontmatter: {name, aliases}} = author + const nick: string = richTextAsPlainText(name).trim() + const _aliases_: string[] = richTextAsPlainText(aliases) + .split(";") + .map((a: string) => a.trim()) + .filter((a: string) => a) + + Nicks[id] = nick -const deduplicateAuthors = async (CLI: BaseCommand, authorsDB: AuthorsDB) => { - const {databaseID, articleRef} = authorsDB - - const nonEmptyAliasIterable = iteratePaginatedAPI( - Notion.databases.query, - { - database_id: databaseID, - // @ts-ignore - filter: { - property: "Aliases", - rich_text: { - is_not_empty: true, - } - }, - page_size: 100, - }) - let nonEmptyAliasArray = await asyncIterableToArray(nonEmptyAliasIterable) - - const toUpdate: string[] = [] - const toArchive: string[] = [] - - const authorsWithAliases: { - [name: string]: string - } = {} - const pages: { - [pageID: string]: any, - } = {} - const articles: { - [pageID: string]: { id: string }[] - } = {} - - while (nonEmptyAliasArray.length > 0) { - nonEmptyAliasArray = await batchEntries( - CLI, - nonEmptyAliasArray, - async (entry: any): Promise => { - pages[entry.id] = entry - toUpdate.push(entry.id) - - const aliasesProperty = richTextAsPlainText(getPropertyValue(entry, { - name: "Aliases", - type: "rich_text" - })) - const nameProperty = richTextAsPlainText(getPageTitle(entry)) - - const aliases = [ - nameProperty, - ...aliasesProperty.split(";") - ].map(a => a.trim()).filter(a => a) - for (const alias of aliases) { - authorsWithAliases[alias] = entry.id - } - - const otherArticles = getPropertyValue(entry, { - name: articleRef, - type: "relation" - }) ?? [] - articles[entry.id] = [...(articles[entry.id] ?? []), ...otherArticles] - - }) + for (const alias of _aliases_) { + Aliases[alias] = id + } } - const queryForAuthor = async (name: string) => { - const iterable = iteratePaginatedAPI( - Notion.databases.query, - { - database_id: databaseID, - page_size: 100, - filter: {property: "Name", title: {equals: name}} + const articlesToCondense: { [name: string]: Relation["relation"] } = {} + const toArchive: { page_id: string, archived: true }[] = [] + for await (const [alias, pageID] of _.entries(Aliases)) { + const matches = cms.filter.name.equals(alias) + for await (const page of cms.query({filter: matches})) { + const {content: {id}, frontmatter: {name, articles}} = page + if (id === pageID && articles.length > 0) { + CLI.warn(`Skipping pageID = ${id}, ${Nicks[id]}`) + continue } - ) - return await asyncIterableToArray(iterable) - } - CLI.log() - for await (const [name, pageID] of _.entries(authorsWithAliases)) { - const result = await queryForAuthor(name) - if (result.length == 0 || result[0].id === pageID) - continue - - const notThisID = _.filter( - result, (entry) => entry.id != pageID - ).filter(a => a) - - for (const author of notThisID) { - toArchive.push(author.id) - // @ts-ignore - const otherArticles = getPropertyValue(author, { - name: articleRef, - type: "relation" - }) ?? [] - articles[pageID] = [...articles[pageID], ...otherArticles] + const Name = richTextAsPlainText(name) + CLI.log(`Collapsing ${Name}'s articles into ${Nicks[pageID]}...`) + const relations = _.get(articlesToCondense, pageID, []) + articlesToCondense[pageID] = [...relations, ...articles] + toArchive.push({page_id: id, archived: true}) } - console.log(`The alias of ${name} (${pageID}) will now have ${articles[pageID].length} articles.`) } - CLI.log() + + const toUpdate: { page_id: string, relation: Relation["relation"] }[] = _.map( + articlesToCondense, + (relation, page_id) => { + return {page_id, relation} + }, + ) CLI.log("Archiving detected duplicates...") - let notionArchive = _.map(toArchive, (id) => { - return {id} + await batchEntries(CLI, toArchive, async (entry: typeof toArchive[0]) => { + await cms.config.notion.pages.update(entry) }) - while (notionArchive.length > 0) { - notionArchive = await batchEntries(CLI, notionArchive, async (entry: any) => { - await Notion.pages.update({ - page_id: entry.id, - archived: true, - }) - }) - } CLI.log("Migrating duplicates' articles to correct alias...") - let notionUpdate = _.map(toUpdate, (id) => { - return {id, articles: articles[id]} + await batchEntries(CLI, toUpdate, async (entry: typeof toUpdate[0]) => { + await cms.config.notion.pages.update(entry) }) - while (notionUpdate.length > 0) { - notionUpdate = await batchEntries(CLI, notionUpdate, async (entry: any) => { - await Notion.pages.update({ - page_id: entry.id, - properties: { - [articleRef]: makeRelation(entry.articles) as Relation - }, - }) - }) - } - } diff --git a/src/commands/authors/sync.ts b/src/commands/authors/sync.ts index eedbd31..904f3fb 100644 --- a/src/commands/authors/sync.ts +++ b/src/commands/authors/sync.ts @@ -1,46 +1,70 @@ -import _ from "lodash" -import {createEntries, diff, updateEntries} from "../../notion" -import { - AuthorToNotion, - initAuthorDB, - prepareAuthorsForNotion -} from "../../models/author" import {AuthorsDB} from "../../config" -import BaseCommand, {BaseArgTypes, BaseFlagTypes} from '../../base'; +import BaseCommand, {BaseArgTypes, BaseFlagTypes} from "../../base" +import { + AuthorCMS, + AuthorIteratorResult, + AuthorPage, + batchEntries, + createCMS, +} from "../../notion-cms" +import {AuthorToNotion, prepareAuthorsForNotion} from "../../models/author" export default class AuthorsSync extends BaseCommand { static summary: string = `Syncs your Authors Database with the local BibTeX file.` static description: string = `Authors will be created if not present (or if they don't match a manually entered alias). Otherwise, Authors will have their name stripped of whitespace and articles consolidation based on matching Aliases.` - static args: BaseArgTypes = BaseCommand.args; - static flags: BaseFlagTypes = BaseCommand.flags; - static examples: string[] = BaseCommand.examples; + static args: BaseArgTypes = BaseCommand.args + static flags: BaseFlagTypes = BaseCommand.flags + static examples: string[] = BaseCommand.examples public async run(): Promise { await this.parse(AuthorsSync) - if (!this.appConfig.databases.authors) { + if (!this.appConfig.hasAuthorDB) { this.error("You don't have an Authors database. Exiting.") this.exit(0) // analogous to this.exit(0), but keeps WebStorm from whining } - const authors: AuthorsDB = this.appConfig.databases.authors - const {authorIndex} = await initAuthorDB(authors.databaseID, this.config.cacheDir) + const authorCMS: AuthorCMS = createCMS(this.config, this.appConfig, this.notion, "authors") - const {toCreate, toUpdate} = diff(this.BibTeXAuthors, _.keys(authorIndex)) + const parent = { + database_id: this.appConfig.databases.authors.databaseID, + } - let notionCreates = toCreate.map((author: string) => { - return prepareAuthorsForNotion(author, authorIndex, this.appConfig) - }) - while (notionCreates.length > 0) { - notionCreates = await createEntries(this, notionCreates, AuthorToNotion, authors.databaseID) + const toCreate = [] + + for await (const author of this.BibTeXAuthors) { + const filter = authorCMS.filter.or( + authorCMS.filter.name.equals(author), + authorCMS.filter.aliases.contains(author), + ) + const query: AuthorIteratorResult = await authorCMS.query({filter}).next() + const page: AuthorPage | undefined = query.value + if (!page) { + toCreate.push({ + parent, + properties: AuthorToNotion(prepareAuthorsForNotion(author)), + }) + } } - let notionUpdates = toUpdate.map((author: string) => { - return prepareAuthorsForNotion(author, authorIndex, this.appConfig) + await batchEntries(this, toCreate, async (entry: typeof toCreate[0]) => { + await authorCMS.config.notion.pages.create(entry) }) - while (notionUpdates.length > 0) { - notionUpdates = await updateEntries(this, notionUpdates, AuthorToNotion) - } + // const {toCreate, toUpdate} = diff(this.BibTeXAuthors, _.keys(authorIndex)) + + // let notionCreates = toCreate.map((author: string) => { + // return prepareAuthorsForNotion(author, authorIndex, this.appConfig) + // }) + // while (notionCreates.length > 0) { + // notionCreates = await createEntries(this, notionCreates, AuthorToNotion, authors.databaseID) + // } + + // let notionUpdates = toUpdate.map((author: string) => { + // return prepareAuthorsForNotion(author, authorIndex, this.appConfig) + // }) + // while (notionUpdates.length > 0) { + // notionUpdates = await updateEntries(this, notionUpdates, AuthorToNotion) + // } } } diff --git a/src/models/article.ts b/src/models/article.ts index 6600985..1378550 100644 --- a/src/models/article.ts +++ b/src/models/article.ts @@ -1,13 +1,6 @@ /* eslint-disable camelcase */ +import {StrictConfig} from "../config" import { - CMSPage, - getPageTitle, - getPropertyValue, - PageWithChildren, - richTextAsPlainText, -} from "@jitl/notion-api" -import { - Database, makeMultiSelect, makeRelation, makeRichText, @@ -15,73 +8,16 @@ import { makeTitle, makeURL, MultiSelect, - Notion, Relation, RichText, Select, Title, URL, -} from "../notion" -import * as path from "node:path" -import * as _ from "lodash" -import {AuthorIndex} from "./author"; -import {ConfigInterface} from "../config"; - -export class ArticlesDatabase extends Database { - constructor(id: string, cacheDir: string) { - super(CMSConfig, id, cacheDir) - } -} - -export type ArticleFrontmatter = { - authors: Relation - ID: RichText, - plain: { - title: string, - ID: string, - } -} - -export type ArticlePage = CMSPage; - -// @ts-ignore -const getFrontmatter = (page: PageWithChildren) => { - // const blocks = await getChildBlocks(Notion, page.id) - const title = getPageTitle(page) - // TOOD this needs to be changed based on whether there's an Author's database - const authors = getPropertyValue(page, {name: `Authors`, type: `relation`}) - const ID = getPropertyValue(page, {name: `ID`, type: `rich_text`}) - - const plain = { - title: richTextAsPlainText(title).trim(), - ID: richTextAsPlainText(ID).trim(), - } - - // TODO gather child block lengths - return {plain, authors, ID, pageID: page.id} -} - -export const CMSConfig = (cacheDir: string, databaseID: string) => { - return { - database_id: databaseID, - getFrontmatter, - notion: Notion, - visible: true, - // slug: "ID", - cache: { - directory: path.join(cacheDir, `articles`), - }, - assets: { - directory: path.join(cacheDir, `articles/assets`), - downloadExternalAssets: true, - }, - } -} +} from "../notion-cms" export type ArticleEntry = any - -type NotionEntry = { +export type NotionArticle = { Title: Title, ID: RichText, Status: Select, @@ -95,12 +31,13 @@ type NotionEntry = { URL?: URL, } -export const BibTeXToNotion = (bib: ArticleEntry) => { - const ID = makeRichText(bib.id) as RichText +export const BibTeXToNotion = (appConfig: StrictConfig, bib: ArticleEntry): NotionArticle => { + const ID = makeRichText(bib.id) const Title = makeTitle(bib.title) - const Status = makeSelect(bib.status) ?? makeSelect("❓ Unknown") + const Status = makeSelect(bib.status) ??