🎉 add data insights to algolia pages index

owid · Feb 7, 2025 · ed997f4 · ed997f4
1 parent b56247a
commit ed997f4
Show file tree

Hide file tree

Showing 12 changed files with 130 additions and 23 deletions.
diff --git a/baker/SiteBaker.tsx b/baker/SiteBaker.tsx
@@ -574,7 +574,7 @@ export class SiteBaker {
         if (!this.bakeSteps.has("gdocPosts")) return
         // We don't need to call `load` on these, because we prefetch all attachments
         const publishedGdocs = await db
-            .getPublishedGdocPostsWithTags(knex)
+            .getPublishedGdocsWithTags(knex)
             .then((gdocs) => gdocs.map(gdocFromJSON))
 
         const allParentTagArraysByChildName =

diff --git a/baker/algolia/utils/pages.ts b/baker/algolia/utils/pages.ts
@@ -16,6 +16,9 @@ import {
     DEFAULT_GDOC_FEATURED_IMAGE,
     DEFAULT_THUMBNAIL_FILENAME,
     DbEnrichedImage,
+    OwidGdocDataInsightInterface,
+    getFirstTwoSentencesFromString,
+    spansToUnformattedPlainText,
 } from "@ourworldindata/utils"
 import { formatPost } from "../../formatWordpressPost.js"
 import ReactDOMServer from "react-dom/server.js"
@@ -45,6 +48,8 @@ import {
     CLOUDFLARE_IMAGES_URL,
 } from "../../../settings/clientSettings.js"
 import { logErrorAndMaybeCaptureInSentry } from "../../../serverUtils/errorLog.js"
+import { getFirstBlockOfType } from "../../../site/gdocs/utils.js"
+import { getPrefixedGdocPath } from "@ourworldindata/components"
 
 const computePageScore = (record: Omit<PageRecord, "score">): number => {
     const { importance, views_7d } = record
@@ -161,9 +166,17 @@ async function generateWordpressRecords(
 }
 
 const getThumbnailUrl = (
-    gdoc: OwidGdocPostInterface,
+    gdoc: OwidGdocPostInterface | OwidGdocDataInsightInterface,
     cloudflareImages: Record<string, DbEnrichedImage>
 ): string => {
+    if (gdoc.content.type === OwidGdocType.DataInsight) {
+        const firstImage = getFirstBlockOfType(gdoc, "image")
+        const filename = firstImage?.smallFilename || firstImage?.filename
+        return filename && cloudflareImages[filename]
+            ? `${CLOUDFLARE_IMAGES_URL}/${cloudflareImages[filename].cloudflareId}/w=512`
+            : `${BAKED_BASE_URL}/${DEFAULT_GDOC_FEATURED_IMAGE}`
+    }
+
     if (gdoc.content["deprecation-notice"]) {
         return `${BAKED_BASE_URL}/${ARCHVED_THUMBNAIL_FILENAME}`
     }
@@ -188,13 +201,30 @@ const getThumbnailUrl = (
     return `${CLOUDFLARE_IMAGES_URL}/${cloudflareId}/w=512`
 }
 
+/**
+ * Picks the excerpt to index for a gdoc.
+ *
+ * Data insights get an excerpt derived from their first "text" block: its
+ * spans are flattened to plain text and passed to
+ * `getFirstTwoSentencesFromString` with a 140 character limit. A data insight
+ * without any text block yields "". Every other gdoc type uses
+ * `content.excerpt`, falling back to "".
+ */
+function getExcerptFromGdoc(
+    gdoc: OwidGdocPostInterface | OwidGdocDataInsightInterface
+): string {
+    if (gdoc.content.type !== OwidGdocType.DataInsight) {
+        return gdoc.content.excerpt ?? ""
+    }
+
+    const leadingTextBlock = getFirstBlockOfType(gdoc, "text")
+    if (!leadingTextBlock) return ""
+
+    return getFirstTwoSentencesFromString(
+        spansToUnformattedPlainText(leadingTextBlock.value),
+        140
+    )
+}
+
 function generateGdocRecords(
-    gdocs: OwidGdocPostInterface[],
+    gdocs: (OwidGdocPostInterface | OwidGdocDataInsightInterface)[],
     pageviews: Record<string, RawPageview>,
     cloudflareImagesByFilename: Record<string, DbEnrichedImage>
 ): PageRecord[] {
     const getPostTypeAndImportance = (
-        gdoc: OwidGdocPostInterface
+        gdoc: OwidGdocPostInterface | OwidGdocDataInsightInterface
     ): TypeAndImportance => {
         return match(gdoc.content.type)
             .with(OwidGdocType.Article, () => ({
@@ -216,6 +246,10 @@ function generateGdocRecords(
                 type: "other" as const,
                 importance: 0,
             }))
+            .with(OwidGdocType.DataInsight, () => ({
+                type: "data-insight" as const,
+                importance: 0,
+            }))
             .exhaustive()
     }
 
@@ -245,8 +279,9 @@ function generateGdocRecords(
                 slug: gdoc.slug,
                 title: gdoc.content.title || "",
                 content: chunk,
-                views_7d: pageviews[`/${gdoc.slug}`]?.views_7d ?? 0,
-                excerpt: gdoc.content.excerpt,
+                views_7d:
+                    pageviews[getPrefixedGdocPath("", gdoc)]?.views_7d ?? 0,
+                excerpt: getExcerptFromGdoc(gdoc),
                 date: gdoc.publishedAt!.toISOString(),
                 modifiedDate: (
                     gdoc.updatedAt ?? gdoc.publishedAt!
@@ -267,9 +302,18 @@ function generateGdocRecords(
 // Generate records for countries, WP posts (not including posts that have been succeeded by Gdocs equivalents), and Gdocs
 export const getPagesRecords = async (knex: db.KnexReadonlyTransaction) => {
     const pageviews = await getAnalyticsPageviewsByUrlObj(knex)
-    const gdocs = await db
-        .getPublishedGdocPostsWithTags(knex)
-        .then((gdocs) => gdocs.map(gdocFromJSON) as OwidGdocPostInterface[])
+    const gdocs = (await db
+        .getPublishedGdocsWithTags(knex, [
+            OwidGdocType.Article,
+            OwidGdocType.LinearTopicPage,
+            OwidGdocType.TopicPage,
+            OwidGdocType.AboutPage,
+            OwidGdocType.DataInsight,
+        ])
+        .then((gdocs) => gdocs.map(gdocFromJSON))) as (
+        | OwidGdocPostInterface
+        | OwidGdocDataInsightInterface
+    )[]
 
     const publishedGdocsBySlug = keyBy(gdocs, "slug")
     const slugsWithPublishedGdocsSuccessors =

diff --git a/db/db.ts b/db/db.ts
@@ -445,8 +445,15 @@ export const getPublishedGdocPosts = async (
     ).then((rows) => rows.map(parsePostsGdocsRow))
 }
 
-export const getPublishedGdocPostsWithTags = async (
-    knex: KnexReadonlyTransaction
+export const getPublishedGdocsWithTags = async (
+    knex: KnexReadonlyTransaction,
+    // The traditional "post" types - doesn't include data insights, author pages, the homepage, etc.
+    gdocTypes: OwidGdocType[] = [
+        OwidGdocType.Article,
+        OwidGdocType.LinearTopicPage,
+        OwidGdocType.TopicPage,
+        OwidGdocType.AboutPage,
+    ]
 ): Promise<DBEnrichedPostGdocWithTags[]> => {
     return knexRaw<DBRawPostGdocWithTags>(
         knex,
@@ -477,17 +484,12 @@ export const getPublishedGdocPostsWithTags = async (
         gxt.tagId = t.id
     WHERE
         g.published = 1
-        AND g.type IN (:types)
+        AND g.type IN (:gdocTypes)
         AND g.publishedAt <= NOW()
     GROUP BY g.id
     ORDER BY g.publishedAt DESC`,
         {
-            types: [
-                OwidGdocType.Article,
-                OwidGdocType.LinearTopicPage,
-                OwidGdocType.TopicPage,
-                OwidGdocType.AboutPage,
-            ],
+            gdocTypes,
         }
     ).then((rows) => rows.map(parsePostsGdocsWithTagsRow))
 }

diff --git a/db/model/Gdoc/GdocFactory.ts b/db/model/Gdoc/GdocFactory.ts
@@ -43,7 +43,7 @@ import {
     knexRaw,
     KnexReadWriteTransaction,
     getImageMetadataByFilenames,
-    getPublishedGdocPostsWithTags,
+    getPublishedGdocsWithTags,
     getParentTagArraysByChildName,
     getBestBreadcrumbs,
 } from "../../db.js"
@@ -490,7 +490,7 @@ export async function getLatestDataInsights(
 export async function getAndLoadPublishedGdocPosts(
     knex: KnexReadonlyTransaction
 ): Promise<GdocPost[]> {
-    const rows = await getPublishedGdocPostsWithTags(knex)
+    const rows = await getPublishedGdocsWithTags(knex)
     const gdocs = await Promise.all(
         rows.map(async (row) => loadGdocFromGdocBase(knex, row))
     )

diff --git a/packages/@ourworldindata/components/src/GdocsUtils.ts b/packages/@ourworldindata/components/src/GdocsUtils.ts
@@ -45,7 +45,7 @@ export function convertHeadingTextToId(headingText: Span[]): string {
     return urlSlug(spansToUnformattedPlainText(headingText))
 }
 
-function _getPrefixedPath(
+export function getPrefixedGdocPath(
     prefix: string,
     gdoc: { slug: string; content: { type?: OwidGdocType } }
 ): string {
@@ -94,14 +94,14 @@ export const getBakePath = (
     bakedSiteDir: string,
     gdoc: { slug: string; content: { type?: OwidGdocType } }
 ): string => {
-    return _getPrefixedPath(bakedSiteDir, gdoc)
+    return getPrefixedGdocPath(bakedSiteDir, gdoc)
 }
 
 export const getCanonicalUrl = (
     baseUrl: string,
     gdoc: { slug: string; content: { type?: OwidGdocType } }
 ): string => {
-    return _getPrefixedPath(baseUrl, gdoc)
+    return getPrefixedGdocPath(baseUrl, gdoc)
 }
 
 export function getPageTitle(gdoc: OwidGdoc) {

diff --git a/packages/@ourworldindata/components/src/index.ts b/packages/@ourworldindata/components/src/index.ts
@@ -14,6 +14,7 @@ export {
     getUrlTarget,
     checkIsInternalLink,
     convertHeadingTextToId,
+    getPrefixedGdocPath,
     getBakePath,
     getCanonicalUrl,
     getPageTitle,

diff --git a/packages/@ourworldindata/types/src/gdocTypes/ArchieMlComponents.ts b/packages/@ourworldindata/types/src/gdocTypes/ArchieMlComponents.ts
@@ -1077,3 +1077,16 @@ export type OwidEnrichedGdocBlock =
     | EnrichedBlockHomepageIntro
     | EnrichedBlockLatestDataInsights
     | EnrichedBlockSocials
+
+/**
+ * A lookup table from each block's `type` discriminant to the matching member
+ * of the `OwidEnrichedGdocBlock` union, built by remapping the union's members
+ * to keys via `K["type"]`
+ * e.g.
+ * {
+ *   "text": EnrichedBlockText,
+ *   "aside": EnrichedBlockAside,
+ *    ...
+ * }
+ */
+export type OwidEnrichedGdocBlockTypeMap = {
+    [K in OwidEnrichedGdocBlock as K["type"]]: K
+}
diff --git a/packages/@ourworldindata/types/src/index.ts b/packages/@ourworldindata/types/src/index.ts
@@ -290,6 +290,7 @@ export {
     type EnrichedSocialLink,
     type RawBlockNarrativeChart,
     type EnrichedBlockNarrativeChart,
+    type OwidEnrichedGdocBlockTypeMap,
 } from "./gdocTypes/ArchieMlComponents.js"
 export {
     ChartConfigType,

diff --git a/packages/@ourworldindata/utils/src/Util.ts b/packages/@ourworldindata/utils/src/Util.ts
@@ -1325,6 +1325,27 @@ export const removeAllWhitespace = (text: string): string => {
     return text.replace(/\s+|\n/g, "")
 }
 
+/**
+ * Builds a short plain-text excerpt from `text`.
+ *
+ * Returns the first two sentences (joined by a single space) when there are
+ * at least two and together they fit within `maxChars`. Otherwise returns the
+ * trimmed text cut to `maxChars` characters, with "..." appended only if
+ * something was actually cut off.
+ *
+ * A sentence ends at a run of ".", "!" or "?" that is followed by whitespace
+ * or the end of the string, so decimals such as "3.5" are not split.
+ *
+ * @param text - the plain text to excerpt; surrounding whitespace is ignored
+ * @param maxChars - maximum length of the excerpt, not counting the "..."
+ * @returns the excerpt, or "" for empty or whitespace-only input
+ */
+export function getFirstTwoSentencesFromString(
+    text: string,
+    maxChars: number
+): string {
+    // Trim up front so stray leading/trailing whitespace never counts as
+    // truncated content when deciding whether to append the ellipsis
+    const trimmedText = text.trim()
+
+    // Each match starts at a non-whitespace character, so sentences carry no
+    // leading whitespace and can be joined with exactly one space
+    const sentenceRegex = /\S[\s\S]*?(?:[.!?]+(?=\s|$)|$)/g
+    const sentences = trimmedText.match(sentenceRegex) ?? []
+
+    // Try to return two full sentences if possible
+    if (sentences.length >= 2) {
+        const twoSentences = sentences.slice(0, 2).join(" ")
+        if (twoSentences.length <= maxChars) {
+            return twoSentences
+        }
+    }
+
+    // Otherwise, truncate by maxChars with an ellipsis
+    const truncated = trimmedText.slice(0, maxChars).trim()
+    return truncated.length < trimmedText.length
+        ? truncated + "..."
+        : truncated
+}
+
 export function moveArrayItemToIndex<Item>(
     arr: Item[],
     fromIndex: number,

diff --git a/packages/@ourworldindata/utils/src/index.ts b/packages/@ourworldindata/utils/src/index.ts
@@ -89,6 +89,7 @@ export {
     triggerDownloadFromBlob,
     triggerDownloadFromUrl,
     removeAllWhitespace,
+    getFirstTwoSentencesFromString,
     moveArrayItemToIndex,
     getIndexableKeys,
     retryPromise,

diff --git a/site/gdocs/utils.ts b/site/gdocs/utils.ts
@@ -16,6 +16,9 @@ import {
     SubNavId,
     OwidGdocDataInsightContent,
     OwidGdocLinkType,
+    OwidGdocDataInsightInterface,
+    OwidGdocPostInterface,
+    OwidEnrichedGdocBlockTypeMap,
 } from "@ourworldindata/types"
 import {
     formatAuthors,
@@ -211,6 +214,25 @@ export const getTopSubnavigationParentItem = (
     return subnavs[subnavId]?.[0]
 }
 
+/**
+ * Finds the first block of the requested `type` in a gdoc's body.
+ *
+ * Top-level blocks are handled in document order; each one is handed to
+ * `traverseEnrichedBlock`, and the first visited node whose `type` matches is
+ * kept. The search stops after the first top-level block that produced a
+ * match. Returns undefined when the gdoc has no body or nothing matches.
+ */
+export function getFirstBlockOfType<
+    T extends keyof OwidEnrichedGdocBlockTypeMap,
+>(
+    gdoc: OwidGdocPostInterface | OwidGdocDataInsightInterface,
+    type: T
+): OwidEnrichedGdocBlockTypeMap[T] | undefined {
+    type Match = OwidEnrichedGdocBlockTypeMap[T]
+
+    for (const topLevelBlock of gdoc.content.body ?? []) {
+        let match: Match | undefined
+        traverseEnrichedBlock(topLevelBlock, (node) => {
+            // Only the first matching node within this block is recorded
+            if (match === undefined && node.type === type) {
+                match = node as Match
+            }
+        })
+        if (match !== undefined) return match
+    }
+
+    return undefined
+}
+
 // Always use the smallFilename for old data insights, where two filenames were always provided
 // Doing this in code was simpler than migrating all the DI gdocs themselves
 // See https://github.com/owid/owid-grapher/issues/4416

diff --git a/site/search/searchTypes.ts b/site/search/searchTypes.ts
@@ -7,13 +7,15 @@ export type PageType =
     | "faq"
     | "article"
     | "other"
+    | "data-insight"
 
 export const pageTypeDisplayNames: Record<PageType, string> = {
     about: "About",
     topic: "Topic",
     country: "Country",
     faq: "FAQ",
     article: "Article",
+    "data-insight": "Data Insight",
     other: "Topic", // this is a band-aid to avoid showing "Other" for items that we now largely consider to be "Topics". Caveat: some non-topic pages are still indexed as "other" (e.g. /jobs). See https://owid.slack.com/archives/C04N12KT6GY/p1693580177430049?thread_ts=1693336759.239919&cid=C04N12KT6GY
 }