From d5e2fbed0ee58277a6f7394d44dc24fdc728615c Mon Sep 17 00:00:00 2001 From: Enrico Ros Date: Tue, 17 Oct 2023 20:54:51 -0700 Subject: [PATCH] Browse: page loading service, using remote Puppeteer also: moved to tRPC (node) --- next.config.mjs | 5 + package-lock.json | 106 +++++++++++++++++++ package.json | 1 + src/modules/browse/browse.client.ts | 23 ++++ src/modules/browse/browse.router.ts | 158 ++++++++++++++++++++++++++++ src/server/api/trpc.router-node.ts | 2 + src/server/env.mjs | 3 + 7 files changed, 298 insertions(+) create mode 100644 src/modules/browse/browse.client.ts create mode 100644 src/modules/browse/browse.router.ts diff --git a/next.config.mjs b/next.config.mjs index 07909cf0ec..7ae446c09b 100644 --- a/next.config.mjs +++ b/next.config.mjs @@ -9,6 +9,11 @@ let nextConfig = { // }, // }, + // [puppeteer] https://github.com/puppeteer/puppeteer/issues/11052 + experimental: { + serverComponentsExternalPackages: ['puppeteer-core'], + }, + webpack: (config, _options) => { // @mui/joy: anything material gets redirected to Joy config.resolve.alias['@mui/material'] = '@mui/joy'; diff --git a/package-lock.json b/package-lock.json index 942b2db684..fc1f3615ac 100644 --- a/package-lock.json +++ b/package-lock.json @@ -46,6 +46,7 @@ "zustand": "~4.3.9" }, "devDependencies": { + "@cloudflare/puppeteer": "^0.0.5", "@types/node": "^20.10.0", "@types/plantuml-encoder": "^1.4.2", "@types/prismjs": "^1.26.3", @@ -277,6 +278,23 @@ "node": ">=6.9.0" } }, + "node_modules/@cloudflare/puppeteer": { + "version": "0.0.5", + "resolved": "https://registry.npmjs.org/@cloudflare/puppeteer/-/puppeteer-0.0.5.tgz", + "integrity": "sha512-K+DLUmDVSM5UNzFokSqie0LPIFAPvdkLKHWnx8Gmck/M41387aCyLlUjWIeUGV3QifSRwaxTRfeMpELQW0lDZg==", + "dev": true, + "dependencies": { + "debug": "4.3.4", + "devtools-protocol": "0.0.1019158", + "events": "3.3.0", + "stream": "0.0.2", + "url": "0.11.0", + "util": "0.12.5" + }, + "engines": { + "node": ">=14.1.0" + } + }, "node_modules/@dqbd/tiktoken": { 
"version": "1.0.7", "resolved": "https://registry.npmjs.org/@dqbd/tiktoken/-/tiktoken-1.0.7.tgz", @@ -2227,6 +2245,12 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/devtools-protocol": { + "version": "0.0.1019158", + "resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1019158.tgz", + "integrity": "sha512-wvq+KscQ7/6spEV7czhnZc9RM/woz1AY+/Vpd8/h2HFMwJSdTliu7f/yr1A6vDdJfKICZsShqsYpEQbdhg8AFQ==", + "dev": true + }, "node_modules/dir-glob": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/dir-glob/-/dir-glob-3.0.1.tgz", @@ -2306,6 +2330,15 @@ "safe-buffer": "~5.1.0" } }, + "node_modules/emitter-component": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/emitter-component/-/emitter-component-1.1.2.tgz", + "integrity": "sha512-QdXO3nXOzZB4pAjM0n6ZE+R9/+kPpECA/XSELIcc54NeYVnBqIk+4DFiBgK+8QbV3mdvTG6nedl7dTYgO+5wDw==", + "dev": true, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/emoji-regex": { "version": "9.2.2", "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz", @@ -2869,6 +2902,15 @@ "node": ">=0.10.0" } }, + "node_modules/events": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/events/-/events-3.3.0.tgz", + "integrity": "sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==", + "dev": true, + "engines": { + "node": ">=0.8.x" + } + }, "node_modules/eventsource-parser": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-1.1.1.tgz", @@ -3502,6 +3544,22 @@ "node": ">= 0.4" } }, + "node_modules/is-arguments": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-arguments/-/is-arguments-1.1.1.tgz", + "integrity": "sha512-8Q7EARjzEnKpt/PCD7e1cgUS0a6X8u5tdSiMqXhojOdoV9TsMsiO+9VLC5vAmO8N7/GmXn7yjR8qnA6bVAEzfA==", + "dev": true, + "dependencies": { + "call-bind": "^1.0.2", + "has-tostringtag": "^1.0.0" + 
}, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/is-array-buffer": { "version": "3.0.2", "resolved": "https://registry.npmjs.org/is-array-buffer/-/is-array-buffer-3.0.2.tgz", @@ -5500,6 +5558,16 @@ "node": ">=6" } }, + "node_modules/querystring": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/querystring/-/querystring-0.2.0.tgz", + "integrity": "sha512-X/xY82scca2tau62i9mDyU9K+I+djTMUsvwf7xnUX5GLvVzgJybOJf4Y6o9Zx3oJK/LSXg5tTZBjwzqVPaPO2g==", + "deprecated": "The querystring API is considered Legacy. new code should use the URLSearchParams API instead.", + "dev": true, + "engines": { + "node": ">=0.4.x" + } + }, "node_modules/queue-microtask": { "version": "1.2.3", "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", @@ -6053,6 +6121,15 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/stream": { + "version": "0.0.2", + "resolved": "https://registry.npmjs.org/stream/-/stream-0.0.2.tgz", + "integrity": "sha512-gCq3NDI2P35B2n6t76YJuOp7d6cN/C7Rt0577l91wllh0sY9ZBuw9KaSGqH/b0hzn3CWWJbpbW0W0WvQ1H/Q7g==", + "dev": true, + "dependencies": { + "emitter-component": "^1.1.1" + } + }, "node_modules/streamsearch": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-1.1.0.tgz", @@ -6619,6 +6696,22 @@ "punycode": "^2.1.0" } }, + "node_modules/url": { + "version": "0.11.0", + "resolved": "https://registry.npmjs.org/url/-/url-0.11.0.tgz", + "integrity": "sha512-kbailJa29QrtXnxgq+DdCEGlbTeYM2eJUxsz6vjZavrCYPMIFHMKQmSKYAIuUK2i7hgPm28a8piX5NTUtM/LKQ==", + "dev": true, + "dependencies": { + "punycode": "1.3.2", + "querystring": "0.2.0" + } + }, + "node_modules/url/node_modules/punycode": { + "version": "1.3.2", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-1.3.2.tgz", + "integrity": "sha512-RofWgt/7fL5wP1Y7fxE7/EmTLzQVnB0ycyibJ0OOHIlJqTNzglYFxVwETOcIoJqJmpDXJ9xImDv+Fq34F/d4Dw==", + "dev": 
true
+    },
     "node_modules/use-sync-external-store": {
       "version": "1.2.0",
       "resolved": "https://registry.npmjs.org/use-sync-external-store/-/use-sync-external-store-1.2.0.tgz",
@@ -6627,6 +6720,19 @@
         "react": "^16.8.0 || ^17.0.0 || ^18.0.0"
       }
     },
+    "node_modules/util": {
+      "version": "0.12.5",
+      "resolved": "https://registry.npmjs.org/util/-/util-0.12.5.tgz",
+      "integrity": "sha512-kZf/K6hEIrWHI6XqOFUiiMa+79wE/D8Q+NCNAWclkyg3b4d2k7s0QGepNjiABc+aR3N1PAyHL7p6UcLY6LmrnA==",
+      "dev": true,
+      "dependencies": {
+        "inherits": "^2.0.3",
+        "is-arguments": "^1.0.4",
+        "is-generator-function": "^1.0.7",
+        "is-typed-array": "^1.1.3",
+        "which-typed-array": "^1.1.2"
+      }
+    },
     "node_modules/util-deprecate": {
       "version": "1.0.2",
       "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
diff --git a/package.json b/package.json
index f977d9c065..0491da422e 100644
--- a/package.json
+++ b/package.json
@@ -50,6 +50,7 @@
     "zustand": "~4.3.9"
   },
   "devDependencies": {
+    "@cloudflare/puppeteer": "^0.0.5",
     "@types/node": "^20.10.0",
     "@types/plantuml-encoder": "^1.4.2",
     "@types/prismjs": "^1.26.3",
diff --git a/src/modules/browse/browse.client.ts b/src/modules/browse/browse.client.ts
new file mode 100644
index 0000000000..67b311bb90
--- /dev/null
+++ b/src/modules/browse/browse.client.ts
@@ -0,0 +1,27 @@
+import { apiAsyncNode } from '~/common/util/trpc.client';
+
+export const CmdRunBrowse: string[] = ['/browse'];
+
+
+/**
+ * Fetches the text content of a single page through the server-side browsing service.
+ * Failures are not thrown: they are returned as a 'Browsing error: ...' message string.
+ */
+export async function callBrowseFetchSinglePage(url: string): Promise<string> {
+  try {
+
+    const results = await apiAsyncNode.browse.fetchPages.mutate({
+      access: { dialect: 'browse-wss' },
+      subjects: [{ url }],
+    });
+
+    if (results.objects.length !== 1)
+      return `Browsing error: expected 1 result, got ${results.objects.length}`;
+
+    const firstResult = results.objects[0];
+    return !firstResult.error ? firstResult.content : `Browsing service error: ${firstResult.error}`;
+
+  } catch (error: any) {
+    return `Browsing error: ${error?.message || error?.toString() || 'Unknown fetch error'}`;
+  }
+}
diff --git a/src/modules/browse/browse.router.ts b/src/modules/browse/browse.router.ts
new file mode 100644
index 0000000000..a3d9962c42
--- /dev/null
+++ b/src/modules/browse/browse.router.ts
@@ -0,0 +1,180 @@
+import { z } from 'zod';
+import { TRPCError } from '@trpc/server';
+import { connect, TimeoutError } from '@cloudflare/puppeteer';
+
+import { createTRPCRouter, publicProcedure } from '~/server/api/trpc.server';
+import { env } from '~/server/env.mjs';
+
+
+// navigation (page load) timeout applied to the remote page, in milliseconds
+const WORKER_TIMEOUT = 10 * 1000; // 10 seconds
+
+
+// Input schemas
+
+const browseAccessSchema = z.object({
+  dialect: z.enum(['browse-wss']),
+  wssEndpoint: z.string().trim().optional(),
+  wssKey: z.string().trim().optional(),
+  userAgent: z.string().optional(),
+});
+
+const fetchPageInputSchema = z.object({
+  access: browseAccessSchema,
+  subjects: z.array(z.object({
+    url: z.string().url(),
+  })),
+});
+
+
+// Output schemas
+
+const fetchPageWorkerOutputSchema = z.object({
+  url: z.string(),
+  content: z.string(),
+  error: z.string().optional(),
+  stopReason: z.enum(['end', 'timeout', 'error']),
+  screenshot: z.object({
+    base64: z.string(),
+    width: z.number(),
+    height: z.number(),
+  }).optional(),
+});
+
+const fetchPagesOutputSchema = z.object({
+  objects: z.array(fetchPageWorkerOutputSchema),
+});
+
+
+/**
+ * tRPC router of the browsing service.
+ *  - fetchPages: loads the subject URLs in a remote browser, one after the other; a failure
+ *    on one URL is reported in that URL's result object and does not fail the whole call
+ */
+export const browseRouter = createTRPCRouter({
+
+  fetchPages: publicProcedure
+    .input(fetchPageInputSchema)
+    .output(fetchPagesOutputSchema)
+    .mutation(async ({ input: { access, subjects } }) => {
+      const results: FetchPageWorkerOutputSchema[] = [];
+
+      for (const subject of subjects) {
+        try {
+          results.push(await workerPuppeteer(access, subject.url));
+        } catch (error: any) {
+          results.push({
+            url: subject.url,
+            content: '',
+            error: error?.message || error?.toString() || 'Unknown fetch error',
+            stopReason: 'error',
+          });
+        }
+      }
+
+      return { objects: results };
+    }),
+
+});
+
+
+type BrowseAccessSchema = z.infer<typeof browseAccessSchema>;
+type FetchPageWorkerOutputSchema = z.infer<typeof fetchPageWorkerOutputSchema>;
+
+/**
+ * Loads one page in a remote browser (Puppeteer, connected over a wss:// endpoint) and
+ * collects its text content and a small screenshot.
+ *
+ * @param access browsing access; `wssEndpoint`, when set, is used instead of env.PUPPETEER_WSS_ENDPOINT
+ * @param targetUrl URL of the page to load
+ * @returns the result object; navigation problems are reported via `stopReason`/`error`, while
+ *   scraping and screenshot problems are only logged and leave `content`/`screenshot` at their initial values
+ * @throws TRPCError (BAD_REQUEST) when the endpoint is missing or does not start with wss://; errors
+ *   raised while connecting to the browser or opening the page are propagated as well
+ */
+async function workerPuppeteer(access: BrowseAccessSchema, targetUrl: string): Promise<FetchPageWorkerOutputSchema> {
+
+  // access
+  const browserWSEndpoint = (access.wssEndpoint || env.PUPPETEER_WSS_ENDPOINT || '').trim();
+  if (!browserWSEndpoint || !browserWSEndpoint.startsWith('wss://'))
+    throw new TRPCError({
+      code: 'BAD_REQUEST',
+      message: 'Invalid wss:// endpoint',
+    });
+
+  const result: FetchPageWorkerOutputSchema = {
+    url: targetUrl,
+    content: '(no content)',
+    error: undefined,
+    stopReason: 'error',
+    screenshot: undefined,
+  };
+
+  // [puppeteer] start the remote session
+  const browser = await connect({ browserWSEndpoint });
+  try {
+    const page = await browser.newPage();
+
+    // open url
+    try {
+      page.setDefaultNavigationTimeout(WORKER_TIMEOUT);
+      await page.goto(targetUrl);
+      result.stopReason = 'end';
+    } catch (error: any) {
+      const isExpected: boolean = error instanceof TimeoutError;
+      result.stopReason = isExpected ? 'timeout' : 'error';
+      if (!isExpected) {
+        // parentheses required: '+' binds tighter than '||', which made the fallbacks unreachable
+        result.error = '[Puppeteer] Loading issue: ' + (error?.message || error?.toString() || 'Unknown error');
+        console.error('workerPuppeteer: page.goto', error);
+      }
+    }
+
+    // transform the content of the page as text (also after a timeout, with whatever is there)
+    try {
+      if (result.stopReason !== 'error') {
+        result.content = await page.evaluate(() => {
+          // NOTE(review): Document.textContent is always null, so this fallback never yields text
+          const content = document.body.innerText || document.textContent;
+          if (!content)
+            throw new Error('No content');
+          return content;
+        });
+      }
+    } catch (error: any) {
+      console.error('workerPuppeteer: page.evaluate', error);
+    }
+
+    // get a screenshot of the page
+    try {
+      const width = 100;
+      const height = 100;
+      const scale = 0.1; // 10%
+
+      // large viewport, scaled down by deviceScaleFactor (presumably to yield a width x height image)
+      await page.setViewport({ width: width / scale, height: height / scale, deviceScaleFactor: scale });
+
+      result.screenshot = {
+        base64: await page.screenshot({
+          type: 'webp',
+          clip: { x: 0, y: 0, width: width / scale, height: height / scale },
+          encoding: 'base64',
+        }) as string,
+        width,
+        height,
+      };
+    } catch (error: any) {
+      console.error('workerPuppeteer: page.screenshot', error);
+    }
+
+  } finally {
+    // close the browser (important!), even when opening the page threw
+    try {
+      await browser.close();
+    } catch (error: any) {
+      console.error('workerPuppeteer: browser.close', error);
+    }
+  }
+
+  return result;
+}
diff --git a/src/server/api/trpc.router-node.ts b/src/server/api/trpc.router-node.ts
index 5f118fcd0e..e828ce98b7 100644
--- a/src/server/api/trpc.router-node.ts
+++ b/src/server/api/trpc.router-node.ts
@@ -1,11 +1,13 @@
 import { createTRPCRouter } from './trpc.server';
 
+import { browseRouter } from '~/modules/browse/browse.router';
 import { tradeRouter } from '~/modules/trade/server/trade.router';
 
 /**
  * Secondary rooter, and will be sitting on an NodeJS Runtime.
*/ export const appRouterNode = createTRPCRouter({ + browse: browseRouter, trade: tradeRouter, }); diff --git a/src/server/env.mjs b/src/server/env.mjs index 30f2227cb4..3216fb3620 100644 --- a/src/server/env.mjs +++ b/src/server/env.mjs @@ -41,6 +41,9 @@ export const env = createEnv({ // Google Custom Search GOOGLE_CLOUD_API_KEY: z.string().optional(), GOOGLE_CSE_ID: z.string().optional(), + + // Browsing Service + PUPPETEER_WSS_ENDPOINT: z.string().url().optional(), },