Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 filter non-editorial content out of sitemap #2846

Merged
merged 3 commits into from
Nov 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 2 additions & 10 deletions baker/SiteBaker.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -288,16 +288,8 @@ export class SiteBaker {

private async bakePosts() {
if (!this.bakeSteps.has("wordpressPosts")) return
// In the backporting workflow, users create gdoc posts for existing WP posts. As long as these are not yet published,
// we still want to bake them from the WP posts. Once a user presses publish there, though, we want to stop
// baking them from the WordPress post. Here we fetch all the slugs of posts that have been published via gdocs
// and exclude them from the baking process.
const alreadyPublishedViaGdocsSlugs = await db.knexRaw(`-- sql
select slug from posts_with_gdoc_publish_status
where isGdocPublished = TRUE`)
const alreadyPublishedViaGdocsSlugsSet = new Set(
alreadyPublishedViaGdocsSlugs.map((row: any) => row.slug)
)
const alreadyPublishedViaGdocsSlugsSet =
await db.getSlugsWithPublishedGdocsSuccessors()

const postsApi = await wpdb.getPosts(
undefined,
Expand Down
18 changes: 13 additions & 5 deletions baker/algolia/indexToAlgolia.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import {
OwidGdocType,
type RawPageview,
Tag,
PostRestApi,
} from "@ourworldindata/utils"
import { formatPost } from "../formatWordpressPost.js"
import ReactDOMServer from "react-dom/server.js"
Expand Down Expand Up @@ -71,7 +72,7 @@ function generateChunksFromHtmlText(htmlString: string) {
}

async function generateWordpressRecords(
postsApi: wpdb.PostAPI[],
postsApi: PostRestApi[],
pageviews: Record<string, RawPageview>
): Promise<PageRecord[]> {
const getPostTypeAndImportance = (
Expand Down Expand Up @@ -187,11 +188,18 @@ const getPagesRecords = async () => {
const pageviews = await Pageview.getViewsByUrlObj()
const gdocs = await Gdoc.getPublishedGdocs()
const publishedGdocsBySlug = keyBy(gdocs, "slug")
const postsApi = await wpdb
.getPosts()
.then((posts) =>
posts.filter((post) => !publishedGdocsBySlug[`/${post.slug}`])
const slugsWithPublishedGdocsSuccessors =
await db.getSlugsWithPublishedGdocsSuccessors()
const postsApi = await wpdb.getPosts(undefined, (post) => {
// Two things can happen here:
// 1. There's a published Gdoc with the same slug
// 2. This post has a Gdoc successor (which might have a different slug)
// In either case, we don't want to index this WP post
return !(
publishedGdocsBySlug[post.slug] ||
slugsWithPublishedGdocsSuccessors.has(post.slug)
)
})

const countryRecords = generateCountryRecords(countries, pageviews)
const wordpressRecords = await generateWordpressRecords(postsApi, pageviews)
Expand Down
26 changes: 10 additions & 16 deletions baker/sitemap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import {
} from "../settings/serverSettings.js"
import { dayjs, countries, queryParamsToStr } from "@ourworldindata/utils"
import * as db from "../db/db.js"
import * as wpdb from "../db/wpdb.js"
import urljoin from "url-join"
import { countryProfileSpecs } from "../site/countryProfileProjects.js"
import { ExplorerAdminServer } from "../explorerAdminServer/ExplorerAdminServer.js"
Expand Down Expand Up @@ -57,20 +58,13 @@ const explorerToSitemapUrl = (program: ExplorerProgram): SitemapUrl[] => {
}

export const makeSitemap = async (explorerAdminServer: ExplorerAdminServer) => {
const posts = (await db
.knexTable("posts_with_gdoc_publish_status")
.where({ status: "publish", isGdocPublished: false })
.select("slug", "updated_at_in_wordpress")) as {
slug: string
updated_at_in_wordpress: Date
}[]
const gdocPosts = (await db
.knexTable(Gdoc.table)
.where({ published: true })
.select("slug", "updatedAt")) as {
slug: string
updatedAt: Date
}[]
const alreadyPublishedViaGdocsSlugsSet =
await db.getSlugsWithPublishedGdocsSuccessors()
const postsApi = await wpdb.getPosts(
undefined,
(postrow) => !alreadyPublishedViaGdocsSlugsSet.has(postrow.slug)
)
const gdocPosts = await Gdoc.getPublishedGdocs()
const charts = (await db
.knexTable(Chart.table)
.select(db.knexRaw(`updatedAt, config->>"$.slug" AS slug`))
Expand All @@ -94,9 +88,9 @@ export const makeSitemap = async (explorerAdminServer: ExplorerAdminServer) => {
})
)
.concat(
posts.map((p) => ({
postsApi.map((p) => ({
loc: urljoin(BAKED_BASE_URL, p.slug),
lastmod: dayjs(p.updated_at_in_wordpress).format("YYYY-MM-DD"),
lastmod: dayjs(p.modified_gmt).format("YYYY-MM-DD"),
}))
)
.concat(
Expand Down
18 changes: 18 additions & 0 deletions db/db.ts
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,21 @@ export const knexTable = (table: string): Knex.QueryBuilder =>
knexInstance().table(table)

/** Wraps `str` in a raw knex expression, using the instance returned by `knexInstance()`. */
export const knexRaw = (str: string): Knex.Raw => {
    const knex = knexInstance()
    return knex.raw(str)
}

/**
 * In the backporting workflow, users create gdoc posts for existing WP posts. As long as
 * such a gdoc is not yet published, we still want to bake the post from WordPress. Once a
 * user presses publish on the gdoc, though, we want to stop baking the WordPress version.
 *
 * This function fetches every `slug` in `posts_with_gdoc_publish_status` whose
 * `isGdocPublished` flag is TRUE, so callers can exclude those posts from baking.
 *
 * @returns a set of the matching slugs
 */
export const getSlugsWithPublishedGdocsSuccessors = async (): Promise<
    Set<string>
> => {
    const rawResult = await knexRaw(
        `-- sql
select slug from posts_with_gdoc_publish_status
where isGdocPublished = TRUE`
    )
    // The row list is the first element of the raw result
    // (presumably the driver's [rows, fields] pair — confirm against the db client in use).
    const rows: { slug: string }[] = rawResult[0]
    const slugs = new Set<string>()
    for (const { slug } of rows) slugs.add(slug)
    return slugs
}
22 changes: 2 additions & 20 deletions db/wpdb.ts
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ export const getPosts = async (
postTypes: string[] = [WP_PostType.Post, WP_PostType.Page],
filterFunc?: FilterFnPostRestApi,
limit?: number
): Promise<any[]> => {
): Promise<PostRestApi[]> => {
if (!isWordpressAPIEnabled) return []

const perPage = 20
Expand Down Expand Up @@ -720,27 +720,9 @@ export const getBlockContent = async (

return post.data?.wpBlock?.content ?? undefined
}
/**
 * Shape of a post object as accepted by `getFullPost`.
 *
 * NOTE(review): the field names (`date_gmt`, `rendered`, `featured_media`) look like the
 * WordPress REST API post schema — confirm against the endpoint queried by `getPosts`.
 */
export interface PostAPI {
id: number
type: WP_PostType
slug: string
title: {
rendered: string
}
// Timestamps are carried as strings; presumably GMT given the `_gmt` suffix — TODO confirm format.
date_gmt: string
modified_gmt: string
// Optional: may be absent on a post.
authors_name?: string[]
content: { rendered: string }
excerpt: { rendered: string }
featured_media_paths: {
medium_large: string
thumbnail: string
}
featured_media: number
}

export const getFullPost = async (
postApi: PostAPI,
postApi: PostRestApi,
excludeContent?: boolean
): Promise<FullPost> => ({
id: postApi.id,
Expand Down
37 changes: 37 additions & 0 deletions packages/@ourworldindata/utils/src/owidTypes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,43 @@ export interface PostRestApi {
latest?: boolean
}
}
id: number
date: string
date_gmt: string
guid: {
rendered: string
}
modified: string
modified_gmt: string

status: string
type: WP_PostType
link: string
title: {
rendered: string
}
content: {
rendered: string
protected: boolean
}
excerpt: {
rendered: string
protected: boolean
}
author: number
featured_media: number
comment_status: string
ping_status: string
sticky: boolean
template: string
format: string
categories: number[]
tags: any[]
authors_name: string[]
featured_media_paths: {
thumbnail: string
medium_large: string
}
}

export interface KeyInsight {
Expand Down
Loading