diff --git a/baker/SiteBaker.tsx b/baker/SiteBaker.tsx index 03bc0932af..a4a15932a5 100644 --- a/baker/SiteBaker.tsx +++ b/baker/SiteBaker.tsx @@ -574,7 +574,7 @@ export class SiteBaker { if (!this.bakeSteps.has("gdocPosts")) return // We don't need to call `load` on these, because we prefetch all attachments const publishedGdocs = await db - .getPublishedGdocPostsWithTags(knex) + .getPublishedGdocsWithTags(knex) .then((gdocs) => gdocs.map(gdocFromJSON)) const allParentTagArraysByChildName = diff --git a/baker/algolia/utils/pages.ts b/baker/algolia/utils/pages.ts index 8a68ee9d9a..09651735bd 100644 --- a/baker/algolia/utils/pages.ts +++ b/baker/algolia/utils/pages.ts @@ -16,6 +16,9 @@ import { DEFAULT_GDOC_FEATURED_IMAGE, DEFAULT_THUMBNAIL_FILENAME, DbEnrichedImage, + OwidGdocDataInsightInterface, + getFirstTwoSentencesFromString, + spansToUnformattedPlainText, } from "@ourworldindata/utils" import { formatPost } from "../../formatWordpressPost.js" import ReactDOMServer from "react-dom/server.js" @@ -45,6 +48,8 @@ import { CLOUDFLARE_IMAGES_URL, } from "../../../settings/clientSettings.js" import { logErrorAndMaybeCaptureInSentry } from "../../../serverUtils/errorLog.js" +import { getFirstBlockOfType } from "../../../site/gdocs/utils.js" +import { getPrefixedGdocPath } from "@ourworldindata/components" const computePageScore = (record: Omit): number => { const { importance, views_7d } = record @@ -161,9 +166,17 @@ async function generateWordpressRecords( } const getThumbnailUrl = ( - gdoc: OwidGdocPostInterface, + gdoc: OwidGdocPostInterface | OwidGdocDataInsightInterface, cloudflareImages: Record ): string => { + if (gdoc.content.type === OwidGdocType.DataInsight) { + const firstImage = getFirstBlockOfType(gdoc, "image") + const filename = firstImage?.smallFilename || firstImage?.filename + return filename && cloudflareImages[filename] + ? 
`${CLOUDFLARE_IMAGES_URL}/${cloudflareImages[filename].cloudflareId}/w=512` + : `${BAKED_BASE_URL}/${DEFAULT_GDOC_FEATURED_IMAGE}` + } + if (gdoc.content["deprecation-notice"]) { return `${BAKED_BASE_URL}/${ARCHVED_THUMBNAIL_FILENAME}` } @@ -188,13 +201,30 @@ const getThumbnailUrl = ( return `${CLOUDFLARE_IMAGES_URL}/${cloudflareId}/w=512` } +function getExcerptFromGdoc( + gdoc: OwidGdocPostInterface | OwidGdocDataInsightInterface +): string { + if (gdoc.content.type === OwidGdocType.DataInsight) { + const firstParagraph = getFirstBlockOfType(gdoc, "text") + + if (firstParagraph) { + const plaintext = spansToUnformattedPlainText(firstParagraph.value) + return getFirstTwoSentencesFromString(plaintext, 140) + } + + return "" + } else { + return gdoc.content.excerpt ?? "" + } +} + function generateGdocRecords( - gdocs: OwidGdocPostInterface[], + gdocs: (OwidGdocPostInterface | OwidGdocDataInsightInterface)[], pageviews: Record, cloudflareImagesByFilename: Record ): PageRecord[] { const getPostTypeAndImportance = ( - gdoc: OwidGdocPostInterface + gdoc: OwidGdocPostInterface | OwidGdocDataInsightInterface ): TypeAndImportance => { return match(gdoc.content.type) .with(OwidGdocType.Article, () => ({ @@ -216,6 +246,10 @@ function generateGdocRecords( type: "other" as const, importance: 0, })) + .with(OwidGdocType.DataInsight, () => ({ + type: "data-insight" as const, + importance: 0, + })) .exhaustive() } @@ -245,8 +279,9 @@ function generateGdocRecords( slug: gdoc.slug, title: gdoc.content.title || "", content: chunk, - views_7d: pageviews[`/${gdoc.slug}`]?.views_7d ?? 0, - excerpt: gdoc.content.excerpt, + views_7d: + pageviews[getPrefixedGdocPath("", gdoc)]?.views_7d ?? 0, + excerpt: getExcerptFromGdoc(gdoc), date: gdoc.publishedAt!.toISOString(), modifiedDate: ( gdoc.updatedAt ?? gdoc.publishedAt! 
@@ -267,9 +302,18 @@ function generateGdocRecords( // Generate records for countries, WP posts (not including posts that have been succeeded by Gdocs equivalents), and Gdocs export const getPagesRecords = async (knex: db.KnexReadonlyTransaction) => { const pageviews = await getAnalyticsPageviewsByUrlObj(knex) - const gdocs = await db - .getPublishedGdocPostsWithTags(knex) - .then((gdocs) => gdocs.map(gdocFromJSON) as OwidGdocPostInterface[]) + const gdocs = (await db + .getPublishedGdocsWithTags(knex, [ + OwidGdocType.Article, + OwidGdocType.LinearTopicPage, + OwidGdocType.TopicPage, + OwidGdocType.AboutPage, + OwidGdocType.DataInsight, + ]) + .then((gdocs) => gdocs.map(gdocFromJSON))) as ( + | OwidGdocPostInterface + | OwidGdocDataInsightInterface + )[] const publishedGdocsBySlug = keyBy(gdocs, "slug") const slugsWithPublishedGdocsSuccessors = diff --git a/db/db.ts b/db/db.ts index 211fe168cd..771ad17e66 100644 --- a/db/db.ts +++ b/db/db.ts @@ -445,8 +445,15 @@ export const getPublishedGdocPosts = async ( ).then((rows) => rows.map(parsePostsGdocsRow)) } -export const getPublishedGdocPostsWithTags = async ( - knex: KnexReadonlyTransaction +export const getPublishedGdocsWithTags = async ( + knex: KnexReadonlyTransaction, + // The traditional "post" types - doesn't include data insights, author pages, the homepage, etc. 
+ gdocTypes: OwidGdocType[] = [ + OwidGdocType.Article, + OwidGdocType.LinearTopicPage, + OwidGdocType.TopicPage, + OwidGdocType.AboutPage, + ] ): Promise => { return knexRaw( knex, @@ -477,17 +484,12 @@ export const getPublishedGdocPostsWithTags = async ( gxt.tagId = t.id WHERE g.published = 1 - AND g.type IN (:types) + AND g.type IN (:gdocTypes) AND g.publishedAt <= NOW() GROUP BY g.id ORDER BY g.publishedAt DESC`, { - types: [ - OwidGdocType.Article, - OwidGdocType.LinearTopicPage, - OwidGdocType.TopicPage, - OwidGdocType.AboutPage, - ], + gdocTypes, } ).then((rows) => rows.map(parsePostsGdocsWithTagsRow)) } diff --git a/db/model/Gdoc/GdocFactory.ts b/db/model/Gdoc/GdocFactory.ts index baa58daa4e..9c9573e5dd 100644 --- a/db/model/Gdoc/GdocFactory.ts +++ b/db/model/Gdoc/GdocFactory.ts @@ -43,7 +43,7 @@ import { knexRaw, KnexReadWriteTransaction, getImageMetadataByFilenames, - getPublishedGdocPostsWithTags, + getPublishedGdocsWithTags, getParentTagArraysByChildName, getBestBreadcrumbs, } from "../../db.js" @@ -490,7 +490,7 @@ export async function getLatestDataInsights( export async function getAndLoadPublishedGdocPosts( knex: KnexReadonlyTransaction ): Promise { - const rows = await getPublishedGdocPostsWithTags(knex) + const rows = await getPublishedGdocsWithTags(knex) const gdocs = await Promise.all( rows.map(async (row) => loadGdocFromGdocBase(knex, row)) ) diff --git a/packages/@ourworldindata/components/src/GdocsUtils.ts b/packages/@ourworldindata/components/src/GdocsUtils.ts index 9b59fd8628..874c861d51 100644 --- a/packages/@ourworldindata/components/src/GdocsUtils.ts +++ b/packages/@ourworldindata/components/src/GdocsUtils.ts @@ -45,7 +45,7 @@ export function convertHeadingTextToId(headingText: Span[]): string { return urlSlug(spansToUnformattedPlainText(headingText)) } -function _getPrefixedPath( +export function getPrefixedGdocPath( prefix: string, gdoc: { slug: string; content: { type?: OwidGdocType } } ): string { @@ -94,14 +94,14 @@ export const 
getBakePath = ( bakedSiteDir: string, gdoc: { slug: string; content: { type?: OwidGdocType } } ): string => { - return _getPrefixedPath(bakedSiteDir, gdoc) + return getPrefixedGdocPath(bakedSiteDir, gdoc) } export const getCanonicalUrl = ( baseUrl: string, gdoc: { slug: string; content: { type?: OwidGdocType } } ): string => { - return _getPrefixedPath(baseUrl, gdoc) + return getPrefixedGdocPath(baseUrl, gdoc) } export function getPageTitle(gdoc: OwidGdoc) { diff --git a/packages/@ourworldindata/components/src/index.ts b/packages/@ourworldindata/components/src/index.ts index ab0603ca19..639bc422b2 100644 --- a/packages/@ourworldindata/components/src/index.ts +++ b/packages/@ourworldindata/components/src/index.ts @@ -14,6 +14,7 @@ export { getUrlTarget, checkIsInternalLink, convertHeadingTextToId, + getPrefixedGdocPath, getBakePath, getCanonicalUrl, getPageTitle, diff --git a/packages/@ourworldindata/types/src/gdocTypes/ArchieMlComponents.ts b/packages/@ourworldindata/types/src/gdocTypes/ArchieMlComponents.ts index 50222fee90..f9d28f042d 100644 --- a/packages/@ourworldindata/types/src/gdocTypes/ArchieMlComponents.ts +++ b/packages/@ourworldindata/types/src/gdocTypes/ArchieMlComponents.ts @@ -1077,3 +1077,16 @@ export type OwidEnrichedGdocBlock = | EnrichedBlockHomepageIntro | EnrichedBlockLatestDataInsights | EnrichedBlockSocials + +/** + * A map of all possible block types, with the type as the key and the block type as the value + * e.g. + * { + * "text": EnrichedBlockText, + * "aside": EnrichedBlockAside, + * ... 
+ * } + */ +export type OwidEnrichedGdocBlockTypeMap = { + [K in OwidEnrichedGdocBlock as K["type"]]: K +} diff --git a/packages/@ourworldindata/types/src/index.ts b/packages/@ourworldindata/types/src/index.ts index 3a668ffb8e..f278b2a880 100644 --- a/packages/@ourworldindata/types/src/index.ts +++ b/packages/@ourworldindata/types/src/index.ts @@ -290,6 +290,7 @@ export { type EnrichedSocialLink, type RawBlockNarrativeChart, type EnrichedBlockNarrativeChart, + type OwidEnrichedGdocBlockTypeMap, } from "./gdocTypes/ArchieMlComponents.js" export { ChartConfigType, diff --git a/packages/@ourworldindata/utils/src/Util.ts b/packages/@ourworldindata/utils/src/Util.ts index 5209f0e670..46ba967f8e 100644 --- a/packages/@ourworldindata/utils/src/Util.ts +++ b/packages/@ourworldindata/utils/src/Util.ts @@ -1325,6 +1325,34 @@ export const removeAllWhitespace = (text: string): string => { return text.replace(/\s+|\n/g, "") } +/** + * Returns the first two sentences of `text` when together they fit within + * `maxChars`; otherwise returns the first `maxChars` characters of the + * trimmed text, with "..." appended if anything was cut off. + */ +export function getFirstTwoSentencesFromString( + text: string, + maxChars: number +): string { + const trimmedText = text.trim() + // A sentence ends in ".", "!", or "?" followed by whitespace or the end of + // the text, so that decimals like "3.5" are not split in two + const sentenceRegex = /[\s\S]+?[.!?]+(?=\s|$)/g + const sentences = trimmedText.match(sentenceRegex) || [trimmedText] + + // Try to return two full sentences if possible + if (sentences.length >= 2) { + // Matches keep their leading whitespace, so trim before joining + const [first, second] = sentences.slice(0, 2).map((s) => s.trim()) + const twoSentences = `${first} ${second}` + if (twoSentences.length <= maxChars) return twoSentences + } + + // Otherwise, truncate by maxChars with an ellipsis + const truncated = trimmedText.slice(0, maxChars).trim() + return truncated.length < trimmedText.length ? truncated + "..." 
: truncated +} + export function moveArrayItemToIndex( arr: Item[], fromIndex: number, diff --git a/packages/@ourworldindata/utils/src/index.ts b/packages/@ourworldindata/utils/src/index.ts index 0da9e6e3e7..f77cb4ad50 100644 --- a/packages/@ourworldindata/utils/src/index.ts +++ b/packages/@ourworldindata/utils/src/index.ts @@ -89,6 +89,7 @@ export { triggerDownloadFromBlob, triggerDownloadFromUrl, removeAllWhitespace, + getFirstTwoSentencesFromString, moveArrayItemToIndex, getIndexableKeys, retryPromise, diff --git a/site/gdocs/utils.ts b/site/gdocs/utils.ts index 0a046704d5..905bb424cf 100644 --- a/site/gdocs/utils.ts +++ b/site/gdocs/utils.ts @@ -16,6 +16,9 @@ import { SubNavId, OwidGdocDataInsightContent, OwidGdocLinkType, + OwidGdocDataInsightInterface, + OwidGdocPostInterface, + OwidEnrichedGdocBlockTypeMap, } from "@ourworldindata/types" import { formatAuthors, @@ -211,6 +214,25 @@ export const getTopSubnavigationParentItem = ( return subnavs[subnavId]?.[0] } +export function getFirstBlockOfType< + T extends keyof OwidEnrichedGdocBlockTypeMap, +>( + gdoc: OwidGdocPostInterface | OwidGdocDataInsightInterface, + type: T +): OwidEnrichedGdocBlockTypeMap[T] | undefined { + if (!gdoc.content.body) return undefined + for (const block of gdoc.content.body) { + let foundBlock: OwidEnrichedGdocBlockTypeMap[T] | undefined + traverseEnrichedBlock(block, (node) => { + if (!foundBlock && node.type === type) { + foundBlock = node as OwidEnrichedGdocBlockTypeMap[T] + } + }) + if (foundBlock) return foundBlock + } + return undefined +} + // Always use the smallFilename for old data insights, where two filenames were always provided // Doing this in code was simpler than migrating all the DI gdocs themselves // See https://github.com/owid/owid-grapher/issues/4416 diff --git a/site/search/searchTypes.ts b/site/search/searchTypes.ts index a50a7896dd..8aaad845c7 100644 --- a/site/search/searchTypes.ts +++ b/site/search/searchTypes.ts @@ -7,6 +7,7 @@ export type PageType = | 
"faq" | "article" | "other" + | "data-insight" export const pageTypeDisplayNames: Record = { about: "About", @@ -14,6 +15,7 @@ export const pageTypeDisplayNames: Record = { country: "Country", faq: "FAQ", article: "Article", + "data-insight": "Data Insight", other: "Topic", // this is a band-aid to avoid showing "Other" for items that we now largely consider to be "Topics". Caveat: some non-topic pages are still indexed as "other" (e.g. /jobs). See https://owid.slack.com/archives/C04N12KT6GY/p1693580177430049?thread_ts=1693336759.239919&cid=C04N12KT6GY }