From 67d90b47a0e3fbdc342c5090bec1cb388bf33b7e Mon Sep 17 00:00:00 2001
From: Daniel Bachler
Date: Fri, 13 Oct 2023 19:24:27 +0200
Subject: [PATCH] :hammer: add tooling to get pageview data into local mysql

---
 Makefile                            | 31 ++++++-----
 db/refreshPageviewsFromDatasette.ts | 70 +++++++++++++++++++++++++++++
 package.json                        |  1 +
 3 files changed, 89 insertions(+), 13 deletions(-)
 create mode 100644 db/refreshPageviewsFromDatasette.ts

diff --git a/Makefile b/Makefile
index ab8c0b9b44c..b7b6bd1f4a2 100644
--- a/Makefile
+++ b/Makefile
@@ -20,23 +20,24 @@ help:
 	@echo 'Available commands:'
 	@echo
 	@echo '  GRAPHER ONLY'
-	@echo '  make up            start dev environment via docker-compose and tmux'
-	@echo '  make down          stop any services still running'
-	@echo '  make refresh       (while up) download a new grapher snapshot and update MySQL'
-	@echo '  make migrate       (while up) run any outstanding db migrations'
-	@echo '  make test          run full suite (except db tests) of CI checks including unit tests'
-	@echo '  make dbtest        run db test suite that needs a running mysql db'
-	@echo '  make svgtest       compare current rendering against reference SVGs'
+	@echo '  make up                 start dev environment via docker-compose and tmux'
+	@echo '  make down               stop any services still running'
+	@echo '  make refresh            (while up) download a new grapher snapshot and update MySQL'
+	@echo '  make refresh.pageviews  (while up) download and load pageviews from the private datasette instance'
+	@echo '  make migrate            (while up) run any outstanding db migrations'
+	@echo '  make test               run full suite (except db tests) of CI checks including unit tests'
+	@echo '  make dbtest             run db test suite that needs a running mysql db'
+	@echo '  make svgtest            compare current rendering against reference SVGs'
 	@echo
 	@echo '  GRAPHER + WORDPRESS (staff-only)'
-	@echo '  make up.full       start dev environment via docker-compose and tmux'
-	@echo '  make down.full     stop any services still running'
-	@echo '  make refresh.wp    download a new wordpress snapshot and update MySQL'
-	@echo '  make refresh.full  do a full MySQL update of both wordpress and grapher'
+	@echo '  make up.full            start dev environment via docker-compose and tmux'
+	@echo '  make down.full          stop any services still running'
+	@echo '  make refresh.wp         download a new wordpress snapshot and update MySQL'
+	@echo '  make refresh.full       do a full MySQL update of both wordpress and grapher'
 	@echo
 	@echo '  OPS (staff-only)'
-	@echo '  make deploy        Deploy your local site to production'
-	@echo '  make stage         Deploy your local site to staging'
+	@echo '  make deploy             Deploy your local site to production'
+	@echo '  make stage              Deploy your local site to staging'
 	@echo
 
 up: export DEBUG = 'knex:query'
@@ -132,6 +133,10 @@ refresh:
 	@echo '==> Updating grapher database'
 	@. ./.env && DATA_FOLDER=tmp-downloads ./devTools/docker/refresh-grapher-data.sh
 
+refresh.pageviews:
+	@echo '==> Refreshing pageviews'
+	yarn && yarn buildTsc && yarn refreshPageviews
+
 refresh.wp:
 	@echo '==> Downloading wordpress data'
 	./devTools/docker/download-wordpress-mysql.sh
diff --git a/db/refreshPageviewsFromDatasette.ts b/db/refreshPageviewsFromDatasette.ts
new file mode 100644
index 00000000000..80bd6f55e26
--- /dev/null
+++ b/db/refreshPageviewsFromDatasette.ts
@@ -0,0 +1,70 @@
+// Replaces the contents of the local MySQL `pageviews` table with the
+// pageviews CSV exported by the private datasette instance.
+import fetch from "node-fetch"
+import Papa from "papaparse"
+import * as db from "./db.js"
+
+const PAGEVIEWS_CSV_URL =
+    "http://datasette-private/owid/pageviews.csv?_size=max"
+
+// Number of columns a complete CSV row has; shorter/longer rows are dropped.
+const EXPECTED_COLUMN_COUNT = 5
+
+/**
+ * Downloads the pageviews CSV, parses it and replaces the rows of the
+ * `pageviews` table with the parsed ones.
+ *
+ * @throws if the download fails, the CSV has parse errors or no valid rows
+ * remain. In all of these cases the table is left untouched.
+ */
+async function downloadAndInsertCSV(): Promise<void> {
+    const response = await fetch(PAGEVIEWS_CSV_URL)
+
+    if (!response.ok) {
+        throw new Error(`Failed to fetch CSV: ${response.statusText}`)
+    }
+
+    const csvText = await response.text()
+    const parsedData = Papa.parse<Record<string, unknown>>(csvText, {
+        header: true,
+        // Drop the blank line after the trailing newline so it is not
+        // reported as a parse error and every real error can abort the run.
+        skipEmptyLines: true,
+    })
+
+    if (parsedData.errors.length > 0) {
+        console.error("Errors while parsing CSV:", parsedData.errors)
+        throw new Error("Failed to parse the pageviews CSV")
+    }
+
+    const onlyValidRows = parsedData.data.filter(
+        (row) => Object.keys(row).length === EXPECTED_COLUMN_COUNT
+    )
+
+    console.log("Parsed CSV data:", onlyValidRows.length, "rows")
+    console.log("Columns:", parsedData.meta.fields)
+
+    // Never wipe the existing table when there is nothing to replace it with.
+    if (onlyValidRows.length === 0) {
+        throw new Error("No valid pageview rows found in the CSV")
+    }
+
+    await db.knexRaw("TRUNCATE TABLE pageviews")
+
+    await db.knexInstance().batchInsert("pageviews", onlyValidRows)
+    console.log("CSV data inserted successfully!")
+}
+
+const main = async (): Promise<void> => {
+    try {
+        await downloadAndInsertCSV()
+    } catch (e) {
+        console.error(e)
+        // Signal the failure to `yarn refreshPageviews` / `make`.
+        process.exitCode = 1
+    } finally {
+        await db.closeTypeOrmAndKnexConnections()
+    }
+}
+
+void main()
diff --git a/package.json b/package.json
index b5c85515a7b..0edf209b6e3 100644
--- a/package.json
+++ b/package.json
@@ -23,6 +23,7 @@
         "fixPrettierChanged": "yarn pretty-quick --pattern \"**/*.{tsx,ts,jsx,js,json,md,html,css,scss,yml}\"",
         "runRegionsUpdater": "node --enable-source-maps ./itsJustJavascript/devTools/regionsUpdater/update.js",
         "runDbMigrations": "yarn typeorm migration:run -d itsJustJavascript/db/dataSource.js",
+        "refreshPageviews": "node --enable-source-maps ./itsJustJavascript/db/refreshPageviewsFromDatasette.js",
         "revertLastDbMigration": "yarn typeorm migration:revert -d itsJustJavascript/db/dataSource.js",
         "runPostUpdateHook": "node --enable-source-maps ./itsJustJavascript/baker/postUpdatedHook.js",
         "startAdminServer": "node --enable-source-maps ./itsJustJavascript/adminSiteServer/app.js",