Skip to content

Commit

Permalink
🎉 add data insights to algolia pages index
Browse files Browse the repository at this point in the history
  • Loading branch information
ikesau committed Feb 7, 2025
1 parent b56247a commit ed997f4
Show file tree
Hide file tree
Showing 12 changed files with 130 additions and 23 deletions.
2 changes: 1 addition & 1 deletion baker/SiteBaker.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -574,7 +574,7 @@ export class SiteBaker {
if (!this.bakeSteps.has("gdocPosts")) return
// We don't need to call `load` on these, because we prefetch all attachments
const publishedGdocs = await db
.getPublishedGdocPostsWithTags(knex)
.getPublishedGdocsWithTags(knex)
.then((gdocs) => gdocs.map(gdocFromJSON))

const allParentTagArraysByChildName =
Expand Down
60 changes: 52 additions & 8 deletions baker/algolia/utils/pages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ import {
DEFAULT_GDOC_FEATURED_IMAGE,
DEFAULT_THUMBNAIL_FILENAME,
DbEnrichedImage,
OwidGdocDataInsightInterface,
getFirstTwoSentencesFromString,
spansToUnformattedPlainText,
} from "@ourworldindata/utils"
import { formatPost } from "../../formatWordpressPost.js"
import ReactDOMServer from "react-dom/server.js"
Expand Down Expand Up @@ -45,6 +48,8 @@ import {
CLOUDFLARE_IMAGES_URL,
} from "../../../settings/clientSettings.js"
import { logErrorAndMaybeCaptureInSentry } from "../../../serverUtils/errorLog.js"
import { getFirstBlockOfType } from "../../../site/gdocs/utils.js"
import { getPrefixedGdocPath } from "@ourworldindata/components"

const computePageScore = (record: Omit<PageRecord, "score">): number => {
const { importance, views_7d } = record
Expand Down Expand Up @@ -161,9 +166,17 @@ async function generateWordpressRecords(
}

const getThumbnailUrl = (
gdoc: OwidGdocPostInterface,
gdoc: OwidGdocPostInterface | OwidGdocDataInsightInterface,
cloudflareImages: Record<string, DbEnrichedImage>
): string => {
if (gdoc.content.type === OwidGdocType.DataInsight) {
const firstImage = getFirstBlockOfType(gdoc, "image")
const filename = firstImage?.smallFilename || firstImage?.filename
return filename && cloudflareImages[filename]
? `${CLOUDFLARE_IMAGES_URL}/${cloudflareImages[filename].cloudflareId}/w=512`
: `${BAKED_BASE_URL}/${DEFAULT_GDOC_FEATURED_IMAGE}`
}

if (gdoc.content["deprecation-notice"]) {
return `${BAKED_BASE_URL}/${ARCHVED_THUMBNAIL_FILENAME}`
}
Expand All @@ -188,13 +201,30 @@ const getThumbnailUrl = (
return `${CLOUDFLARE_IMAGES_URL}/${cloudflareId}/w=512`
}

/**
 * Returns the excerpt to index for a gdoc.
 *
 * Data insights get an excerpt derived from their first "text" block: its
 * spans are flattened to plain text and shortened with
 * `getFirstTwoSentencesFromString` (140 character budget). A data insight
 * without any text block yields an empty string.
 *
 * Every other gdoc type uses its authored `content.excerpt`, falling back to
 * an empty string when it is missing.
 */
function getExcerptFromGdoc(
    gdoc: OwidGdocPostInterface | OwidGdocDataInsightInterface
): string {
    // Anything that isn't a data insight carries its own excerpt
    if (gdoc.content.type !== OwidGdocType.DataInsight) {
        return gdoc.content.excerpt ?? ""
    }

    const leadTextBlock = getFirstBlockOfType(gdoc, "text")
    if (!leadTextBlock) return ""

    return getFirstTwoSentencesFromString(
        spansToUnformattedPlainText(leadTextBlock.value),
        140
    )
}

function generateGdocRecords(
gdocs: OwidGdocPostInterface[],
gdocs: (OwidGdocPostInterface | OwidGdocDataInsightInterface)[],
pageviews: Record<string, RawPageview>,
cloudflareImagesByFilename: Record<string, DbEnrichedImage>
): PageRecord[] {
const getPostTypeAndImportance = (
gdoc: OwidGdocPostInterface
gdoc: OwidGdocPostInterface | OwidGdocDataInsightInterface
): TypeAndImportance => {
return match(gdoc.content.type)
.with(OwidGdocType.Article, () => ({
Expand All @@ -216,6 +246,10 @@ function generateGdocRecords(
type: "other" as const,
importance: 0,
}))
.with(OwidGdocType.DataInsight, () => ({
type: "data-insight" as const,
importance: 0,
}))
.exhaustive()
}

Expand Down Expand Up @@ -245,8 +279,9 @@ function generateGdocRecords(
slug: gdoc.slug,
title: gdoc.content.title || "",
content: chunk,
views_7d: pageviews[`/${gdoc.slug}`]?.views_7d ?? 0,
excerpt: gdoc.content.excerpt,
views_7d:
pageviews[getPrefixedGdocPath("", gdoc)]?.views_7d ?? 0,
excerpt: getExcerptFromGdoc(gdoc),
date: gdoc.publishedAt!.toISOString(),
modifiedDate: (
gdoc.updatedAt ?? gdoc.publishedAt!
Expand All @@ -267,9 +302,18 @@ function generateGdocRecords(
// Generate records for countries, WP posts (not including posts that have been succeeded by Gdocs equivalents), and Gdocs
export const getPagesRecords = async (knex: db.KnexReadonlyTransaction) => {
const pageviews = await getAnalyticsPageviewsByUrlObj(knex)
const gdocs = await db
.getPublishedGdocPostsWithTags(knex)
.then((gdocs) => gdocs.map(gdocFromJSON) as OwidGdocPostInterface[])
const gdocs = (await db
.getPublishedGdocsWithTags(knex, [
OwidGdocType.Article,
OwidGdocType.LinearTopicPage,
OwidGdocType.TopicPage,
OwidGdocType.AboutPage,
OwidGdocType.DataInsight,
])
.then((gdocs) => gdocs.map(gdocFromJSON))) as (
| OwidGdocPostInterface
| OwidGdocDataInsightInterface
)[]

const publishedGdocsBySlug = keyBy(gdocs, "slug")
const slugsWithPublishedGdocsSuccessors =
Expand Down
20 changes: 11 additions & 9 deletions db/db.ts
Original file line number Diff line number Diff line change
Expand Up @@ -445,8 +445,15 @@ export const getPublishedGdocPosts = async (
).then((rows) => rows.map(parsePostsGdocsRow))
}

export const getPublishedGdocPostsWithTags = async (
knex: KnexReadonlyTransaction
export const getPublishedGdocsWithTags = async (
knex: KnexReadonlyTransaction,
// The traditional "post" types - doesn't include data insights, author pages, the homepage, etc.
gdocTypes: OwidGdocType[] = [
OwidGdocType.Article,
OwidGdocType.LinearTopicPage,
OwidGdocType.TopicPage,
OwidGdocType.AboutPage,
]
): Promise<DBEnrichedPostGdocWithTags[]> => {
return knexRaw<DBRawPostGdocWithTags>(
knex,
Expand Down Expand Up @@ -477,17 +484,12 @@ export const getPublishedGdocPostsWithTags = async (
gxt.tagId = t.id
WHERE
g.published = 1
AND g.type IN (:types)
AND g.type IN (:gdocTypes)
AND g.publishedAt <= NOW()
GROUP BY g.id
ORDER BY g.publishedAt DESC`,
{
types: [
OwidGdocType.Article,
OwidGdocType.LinearTopicPage,
OwidGdocType.TopicPage,
OwidGdocType.AboutPage,
],
gdocTypes,
}
).then((rows) => rows.map(parsePostsGdocsWithTagsRow))
}
Expand Down
4 changes: 2 additions & 2 deletions db/model/Gdoc/GdocFactory.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ import {
knexRaw,
KnexReadWriteTransaction,
getImageMetadataByFilenames,
getPublishedGdocPostsWithTags,
getPublishedGdocsWithTags,
getParentTagArraysByChildName,
getBestBreadcrumbs,
} from "../../db.js"
Expand Down Expand Up @@ -490,7 +490,7 @@ export async function getLatestDataInsights(
export async function getAndLoadPublishedGdocPosts(
knex: KnexReadonlyTransaction
): Promise<GdocPost[]> {
const rows = await getPublishedGdocPostsWithTags(knex)
const rows = await getPublishedGdocsWithTags(knex)
const gdocs = await Promise.all(
rows.map(async (row) => loadGdocFromGdocBase(knex, row))
)
Expand Down
6 changes: 3 additions & 3 deletions packages/@ourworldindata/components/src/GdocsUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ export function convertHeadingTextToId(headingText: Span[]): string {
return urlSlug(spansToUnformattedPlainText(headingText))
}

function _getPrefixedPath(
export function getPrefixedGdocPath(
prefix: string,
gdoc: { slug: string; content: { type?: OwidGdocType } }
): string {
Expand Down Expand Up @@ -94,14 +94,14 @@ export const getBakePath = (
bakedSiteDir: string,
gdoc: { slug: string; content: { type?: OwidGdocType } }
): string => {
return _getPrefixedPath(bakedSiteDir, gdoc)
return getPrefixedGdocPath(bakedSiteDir, gdoc)
}

export const getCanonicalUrl = (
baseUrl: string,
gdoc: { slug: string; content: { type?: OwidGdocType } }
): string => {
return _getPrefixedPath(baseUrl, gdoc)
return getPrefixedGdocPath(baseUrl, gdoc)
}

export function getPageTitle(gdoc: OwidGdoc) {
Expand Down
1 change: 1 addition & 0 deletions packages/@ourworldindata/components/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ export {
getUrlTarget,
checkIsInternalLink,
convertHeadingTextToId,
getPrefixedGdocPath,
getBakePath,
getCanonicalUrl,
getPageTitle,
Expand Down
13 changes: 13 additions & 0 deletions packages/@ourworldindata/types/src/gdocTypes/ArchieMlComponents.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1077,3 +1077,16 @@ export type OwidEnrichedGdocBlock =
| EnrichedBlockHomepageIntro
| EnrichedBlockLatestDataInsights
| EnrichedBlockSocials

/**
 * Lookup table from a block's `type` discriminant to the enriched block type
 * that carries it, derived from the `OwidEnrichedGdocBlock` union via key
 * remapping (so it cannot drift out of sync with the union), e.g.
 * {
 *   "text": EnrichedBlockText,
 *   "aside": EnrichedBlockAside,
 *   ...
 * }
 *
 * Indexing it (`OwidEnrichedGdocBlockTypeMap["text"]`) yields the block type
 * for that discriminant.
 */
export type OwidEnrichedGdocBlockTypeMap = {
[K in OwidEnrichedGdocBlock as K["type"]]: K
}
1 change: 1 addition & 0 deletions packages/@ourworldindata/types/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ export {
type EnrichedSocialLink,
type RawBlockNarrativeChart,
type EnrichedBlockNarrativeChart,
type OwidEnrichedGdocBlockTypeMap,
} from "./gdocTypes/ArchieMlComponents.js"
export {
ChartConfigType,
Expand Down
21 changes: 21 additions & 0 deletions packages/@ourworldindata/utils/src/Util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1325,6 +1325,27 @@ export const removeAllWhitespace = (text: string): string => {
return text.replace(/\s+|\n/g, "")
}

/**
 * Builds a short excerpt from `text`.
 *
 * If the text starts with two complete sentences that together fit within
 * `maxChars`, those two sentences are returned (each trimmed, joined by a
 * single space). Otherwise the trimmed text is returned as-is when it fits,
 * or cut at `maxChars` characters with "..." appended (so a truncated result
 * can be up to three characters longer than `maxChars`).
 *
 * A sentence ends at a run of ".", "!" or "?" that is followed by whitespace
 * or the end of the string. Requiring that break keeps decimals and
 * abbreviations such as "3.5%" intact, and treating the run as one unit keeps
 * "..." and "?!" attached to their sentence.
 *
 * @param text - The plain text to shorten.
 * @param maxChars - Character budget for the excerpt; negative values are
 *   treated as 0.
 * @returns The excerpt, or an empty string for empty/whitespace-only input.
 */
export function getFirstTwoSentencesFromString(
    text: string,
    maxChars: number
): string {
    const TERMINATORS = ".!?"
    const limit = Math.max(0, maxChars)

    // Single linear scan instead of a /[^.!?]+[.!?]/g match: that regex was
    // flagged by code scanning as polynomial on long terminator-free input.
    const sentences: string[] = []
    let sentenceStart = 0
    let hasContent = false
    let i = 0
    while (i < text.length && sentences.length < 2) {
        const char = text.charAt(i)
        if (!TERMINATORS.includes(char)) {
            if (char.trim() !== "") hasContent = true
            i++
            continue
        }

        // Consume the whole run of terminators ("...", "?!") as one unit
        let runEnd = i + 1
        while (
            runEnd < text.length &&
            TERMINATORS.includes(text.charAt(runEnd))
        ) {
            runEnd++
        }

        const isFollowedByBreak =
            runEnd === text.length || text.charAt(runEnd).trim() === ""
        // A run with nothing but whitespace before it (e.g. a leading "...")
        // is not a sentence of its own; it stays attached to what follows.
        if (hasContent && isFollowedByBreak) {
            sentences.push(text.slice(sentenceStart, runEnd).trim())
            sentenceStart = runEnd
            hasContent = false
        }
        i = runEnd
    }

    // Return two full sentences if possible
    if (sentences.length === 2) {
        const twoSentences = sentences.join(" ")
        if (twoSentences.length <= limit) {
            return twoSentences
        }
    }

    // Otherwise fall back to the (trimmed) text, truncating with an ellipsis
    // only when characters were actually cut off
    const trimmed = text.trim()
    if (trimmed.length <= limit) return trimmed

    // Don't cut between the two halves of a surrogate pair
    let cut = limit
    const lastCode = trimmed.charCodeAt(cut - 1)
    if (lastCode >= 0xd800 && lastCode <= 0xdbff) cut -= 1

    return trimmed.slice(0, cut).trim() + "..."
}

export function moveArrayItemToIndex<Item>(
arr: Item[],
fromIndex: number,
Expand Down
1 change: 1 addition & 0 deletions packages/@ourworldindata/utils/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ export {
triggerDownloadFromBlob,
triggerDownloadFromUrl,
removeAllWhitespace,
getFirstTwoSentencesFromString,
moveArrayItemToIndex,
getIndexableKeys,
retryPromise,
Expand Down
22 changes: 22 additions & 0 deletions site/gdocs/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ import {
SubNavId,
OwidGdocDataInsightContent,
OwidGdocLinkType,
OwidGdocDataInsightInterface,
OwidGdocPostInterface,
OwidEnrichedGdocBlockTypeMap,
} from "@ourworldindata/types"
import {
formatAuthors,
Expand Down Expand Up @@ -211,6 +214,25 @@ export const getTopSubnavigationParentItem = (
return subnavs[subnavId]?.[0]
}

/**
 * Finds the first enriched block of the requested `type` in a gdoc's body.
 *
 * Top-level body blocks are checked in order; each one is walked with
 * `traverseEnrichedBlock`, so matching nodes nested inside a block are found
 * too. Only the first match reported by the traversal is kept.
 *
 * @returns The first matching block, or `undefined` when the gdoc has no body
 *   or no block of that type.
 */
export function getFirstBlockOfType<
    T extends keyof OwidEnrichedGdocBlockTypeMap,
>(
    gdoc: OwidGdocPostInterface | OwidGdocDataInsightInterface,
    type: T
): OwidEnrichedGdocBlockTypeMap[T] | undefined {
    const body = gdoc.content.body
    if (!body) return undefined

    for (const topLevelBlock of body) {
        // Holds at most one element: the first match within this block
        const matches: OwidEnrichedGdocBlockTypeMap[T][] = []
        traverseEnrichedBlock(topLevelBlock, (node) => {
            if (matches.length === 0 && node.type === type) {
                matches.push(node as OwidEnrichedGdocBlockTypeMap[T])
            }
        })
        if (matches.length > 0) return matches[0]
    }

    return undefined
}

// Always use the smallFilename for old data insights, where two filenames were always provided
// Doing this in code was simpler than migrating all the DI gdocs themselves
// See https://github.com/owid/owid-grapher/issues/4416
Expand Down
2 changes: 2 additions & 0 deletions site/search/searchTypes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@ export type PageType =
| "faq"
| "article"
| "other"
| "data-insight"

/**
 * Human-readable label for each `PageType`.
 *
 * Typed as `Record<PageType, string>`, so adding a member to `PageType`
 * without adding a label here is a compile error.
 * Note that "other" is deliberately labelled "Topic" (see the inline comment).
 */
export const pageTypeDisplayNames: Record<PageType, string> = {
about: "About",
topic: "Topic",
country: "Country",
faq: "FAQ",
article: "Article",
"data-insight": "Data Insight",
other: "Topic", // this is a band-aid to avoid showing "Other" for items that we now largely consider to be "Topics". Caveat: some non-topic pages are still indexed as "other" (e.g. /jobs). See https://owid.slack.com/archives/C04N12KT6GY/p1693580177430049?thread_ts=1693336759.239919&cid=C04N12KT6GY
}

Expand Down

0 comments on commit ed997f4

Please sign in to comment.