Skip to content

Commit

Permalink
Merge branch 'main' into excess-aphid
Browse files Browse the repository at this point in the history
  • Loading branch information
max-ostapenko authored Jan 21, 2025
2 parents 315eb7b + 88387c6 commit f89088c
Show file tree
Hide file tree
Showing 4 changed files with 216 additions and 47 deletions.
47 changes: 36 additions & 11 deletions definitions/declarations/httparchive.js
Original file line number Diff line number Diff line change
@@ -1,17 +1,42 @@
const stagingTables = ['pages', 'requests', 'parsed_css']
for (const table of stagingTables) {
// Staging tables source: https://github.com/HTTPArchive/crawl/blob/main/crawl.py
['pages', 'requests', 'parsed_css'].forEach(table =>
declare({
schema: 'crawl_staging',
name: table
})
}
)

// Declares the `technologies` table in the `wappalyzer` schema; queries in
// this project reference it via ctx.ref('wappalyzer', 'technologies').
declare({
schema: 'wappalyzer',
name: 'technologies'
})
// See https://github.com/HTTPArchive/dataform/issues/43
//
// Assertion `corrupted_technology_values` (tagged `crawl_complete`).
// For the current month's staging pages, the query lists every detected
// technology whose
//   - name is not present in wappalyzer.technologies, or
//   - category is not present in wappalyzer.categories, or
//   - categories array is empty,
// together with the number of affected pages and up to three sample pages.
// Any returned row is a violation of the assertion.
//
// The Wappalyzer lookup tables are resolved through ctx.ref() (instead of
// hard-coded `wappalyzer.*` names) so they follow the same project/schema
// resolution and dependency tracking as the other references in this repo.
assert('corrupted_technology_values')
  .tags(['crawl_complete'])
  .query(ctx => `
SELECT
  date,
  client,
  tech,
  COUNT(DISTINCT page) AS cnt_pages,
  ARRAY_AGG(DISTINCT page LIMIT 3) AS sample_pages
FROM ${ctx.ref('crawl_staging', 'pages')} AS pages
LEFT JOIN pages.technologies AS tech
LEFT JOIN tech.categories AS category
WHERE
  date = '${constants.currentMonth}' AND
  (
    tech.technology NOT IN (SELECT DISTINCT name FROM ${ctx.ref('wappalyzer', 'technologies')})
    OR category NOT IN (SELECT DISTINCT name FROM ${ctx.ref('wappalyzer', 'categories')})
    OR ARRAY_LENGTH(tech.categories) = 0
  )
GROUP BY
  date,
  client,
  tech
ORDER BY cnt_pages DESC
`);

// Declares the `categories` table in the `wappalyzer` schema; queries in
// this project reference it via ctx.ref('wappalyzer', 'categories').
declare({
schema: 'wappalyzer',
name: 'categories'
})
// Wappalyzer tables source: https://github.com/HTTPArchive/wappalyzer/blob/main/.github/workflows/upload.yml
// Register each Wappalyzer table as a declaration in the `wappalyzer` schema.
for (const wappalyzerTable of ['technologies', 'categories']) {
  declare({ schema: 'wappalyzer', name: wappalyzerTable })
}
84 changes: 79 additions & 5 deletions definitions/output/crawl/pages.js
Original file line number Diff line number Diff line change
Expand Up @@ -52,23 +52,97 @@ publish('pages', {
DELETE FROM ${ctx.self()}
WHERE date = '${constants.currentMonth}' AND
client = 'desktop';
`).query(ctx => `
INSERT INTO ${ctx.self()}
SELECT
*
FROM ${ctx.ref('crawl_staging', 'pages')}
WHERE date = '${constants.currentMonth}' AND
client = 'desktop'
${constants.devRankFilter}
`).postOps(ctx => `
${constants.devRankFilter};
DELETE FROM ${ctx.self()}
WHERE date = '${constants.currentMonth}' AND
client = 'mobile';
INSERT INTO ${ctx.self()}
`).query(ctx => `
SELECT
*
FROM ${ctx.ref('crawl_staging', 'pages')}
WHERE date = '${constants.currentMonth}' AND
client = 'mobile'
${constants.devRankFilter}
`).postOps(ctx => `
CREATE TEMP TABLE technologies_cleaned AS (
WITH wappalyzer AS (
SELECT DISTINCT
name AS technology,
categories
FROM ${ctx.ref('wappalyzer', 'technologies')}
),
pages AS (
SELECT
client,
page,
tech.technology,
tech.categories,
tech.info
FROM ${ctx.self()} AS pages
LEFT JOIN pages.technologies AS tech
WHERE date = '${constants.currentMonth}' ${constants.devRankFilter}
),
-- Identify impacted pages
impacted_pages AS (
SELECT DISTINCT
client,
page
FROM pages
LEFT JOIN pages.categories AS category
WHERE
-- Technology is corrupted
technology NOT IN (SELECT DISTINCT technology FROM wappalyzer) OR
-- Technology's category is corrupted
CONCAT(technology, category) NOT IN (
SELECT DISTINCT
CONCAT(technology, category)
FROM wappalyzer
LEFT JOIN wappalyzer.categories AS category
)
),
-- Keep valid technologies and use correct categories
reconstructed_technologies AS (
SELECT
client,
page,
ARRAY_AGG(STRUCT(
pages.technology,
wappalyzer.categories,
pages.info
)) AS technologies
FROM pages
INNER JOIN impacted_pages
USING (client, page)
INNER JOIN wappalyzer
ON pages.technology = wappalyzer.technology
GROUP BY
client,
page
)
SELECT
client,
page,
technologies
FROM reconstructed_technologies
);
-- Update the crawl.pages table with the cleaned and restored technologies
UPDATE ${ctx.self()} AS pages
SET technologies = technologies_cleaned.technologies
FROM technologies_cleaned
WHERE pages.date = '${constants.currentMonth}' AND
pages.client = technologies_cleaned.client AND
pages.page = technologies_cleaned.page;
`)
60 changes: 38 additions & 22 deletions definitions/output/reports/cwv_tech_categories.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,53 +7,69 @@ publish('cwv_tech_categories', {
}).query(ctx => `
/* {"dataform_trigger": "report_cwv_tech_complete", "name": "categories", "type": "dict"} */
WITH pages AS (
SELECT
SELECT DISTINCT
client,
root_page,
technologies
FROM ${ctx.ref('crawl', 'pages')}
WHERE
date = '${pastMonth}' AND
client = 'mobile'
date = '${pastMonth}'
${constants.devRankFilter}
), categories AS (
),
category_descriptions AS (
SELECT
name AS category,
description
FROM ${ctx.ref('wappalyzer', 'categories')}
), category_stats AS (
),
category_stats AS (
SELECT
category,
COUNT(DISTINCT root_page) AS origins
FROM pages,
UNNEST(technologies) AS t,
UNNEST(t.categories) AS category
STRUCT(
COALESCE(MAX(IF(client = 'desktop', origins, 0))) AS desktop,
COALESCE(MAX(IF(client = 'mobile', origins, 0))) AS mobile
) AS origins
FROM (
SELECT
client,
category,
COUNT(DISTINCT root_page) AS origins
FROM pages
LEFT JOIN pages.technologies AS tech
LEFT JOIN tech.categories AS category
GROUP BY
client,
category
)
GROUP BY category
), technology_stats AS (
),
technology_stats AS (
SELECT
category,
technology,
COUNT(DISTINCT root_page) AS origins
FROM pages,
UNNEST(technologies) AS t,
UNNEST(t.categories) AS category
category_obj AS categories,
SUM(origins) AS total_origins
FROM ${ctx.ref('reports', 'cwv_tech_technologies')}
GROUP BY
category,
technology
technology,
categories
)
SELECT
category,
description,
category_stats.origins,
ARRAY_AGG(technology IGNORE NULLS ORDER BY technology_stats.origins DESC) AS technologies
origins,
ARRAY_AGG(technology IGNORE NULLS ORDER BY technology_stats.total_origins DESC) AS technologies
FROM category_stats
INNER JOIN technology_stats
USING (category)
LEFT JOIN categories
ON category_stats.category IN UNNEST(technology_stats.categories)
INNER JOIN category_descriptions
USING (category)
GROUP BY
category,
description,
origins
ORDER BY origins DESC
ORDER BY category ASC
`)
72 changes: 63 additions & 9 deletions definitions/output/reports/cwv_tech_technologies.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,73 @@ publish('cwv_tech_technologies', {
tags: ['crux_ready']
}).query(ctx => `
/* {"dataform_trigger": "report_cwv_tech_complete", "name": "technologies", "type": "dict"} */
WITH pages AS (
SELECT DISTINCT
client,
root_page,
tech.technology
FROM ${ctx.ref('crawl', 'pages')},
UNNEST(technologies) AS tech
WHERE
date = '${pastMonth}'
${constants.devRankFilter}
),
tech_origins AS (
SELECT
client,
technology,
COUNT(DISTINCT root_page) AS origins
FROM pages
GROUP BY
client,
technology
),
technologies AS (
SELECT
name AS technology,
description,
STRING_AGG(DISTINCT category, ', ' ORDER BY category ASC) AS category,
categories AS category_obj,
NULL AS similar_technologies
FROM ${ctx.ref('wappalyzer', 'technologies')},
UNNEST(categories) AS category
GROUP BY
technology,
description,
categories
),
total_pages AS (
SELECT
client,
COUNT(DISTINCT root_page) AS origins
FROM pages
GROUP BY client
)
SELECT
client,
app AS technology,
technology,
description,
category,
SPLIT(category, ",") AS category_obj,
category_obj,
similar_technologies,
origins
FROM tech_origins
INNER JOIN technologies
USING(technology)
UNION ALL
SELECT
client,
'ALL' AS technology,
NULL AS description,
NULL AS category,
NULL AS category_obj,
NULL AS similar_technologies,
origins
FROM ${ctx.ref('core_web_vitals', 'technologies')}
LEFT JOIN ${ctx.ref('wappalyzer', 'technologies')}
ON app = name
WHERE date = '${pastMonth}' AND
geo = 'ALL' AND
rank = 'ALL'
ORDER BY origins DESC
FROM total_pages
`)

0 comments on commit f89088c

Please sign in to comment.