Skip to content

Commit

Permalink
Browse: page loading service, using remote Puppeteer
Browse files Browse the repository at this point in the history
also: moved to tRPC (node)
  • Loading branch information
enricoros committed Nov 24, 2023
1 parent 2dfa78f commit d5e2fbe
Show file tree
Hide file tree
Showing 7 changed files with 298 additions and 0 deletions.
5 changes: 5 additions & 0 deletions next.config.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ let nextConfig = {
// },
// },

// [puppeteer] https://github.com/puppeteer/puppeteer/issues/11052
experimental: {
serverComponentsExternalPackages: ['puppeteer-core'],
},

webpack: (config, _options) => {
// @mui/joy: anything material gets redirected to Joy
config.resolve.alias['@mui/material'] = '@mui/joy';
Expand Down
106 changes: 106 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
"zustand": "~4.3.9"
},
"devDependencies": {
"@cloudflare/puppeteer": "^0.0.5",
"@types/node": "^20.10.0",
"@types/plantuml-encoder": "^1.4.2",
"@types/prismjs": "^1.26.3",
Expand Down
23 changes: 23 additions & 0 deletions src/modules/browse/browse.client.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import { apiAsyncNode } from '~/common/util/trpc.client';

// Command strings associated with the browse feature.
// NOTE(review): presumably matched against chat slash-commands — confirm against the callers.
export const CmdRunBrowse: string[] = ['/browse'];


/**
 * Asks the server-side browse service to load one page and returns its text.
 *
 * This function never rejects: every failure is reported as a human-readable
 * string starting with "Browsing error:" or "Browsing service error:".
 *
 * @param url address of the page to load
 * @returns the page content on success, or an error description otherwise
 */
export async function callBrowseFetchSinglePage(url: string): Promise<string | null> {
  try {

    // one subject in, exactly one result expected out
    const response = await apiAsyncNode.browse.fetchPages.mutate({
      access: { dialect: 'browse-wss' },
      subjects: [{ url }],
    });

    const pages = response.objects;
    const pageCount = pages.length;
    if (pageCount !== 1)
      return `Browsing error: expected 1 result, got ${pageCount}`;

    // a per-page error reported by the service takes precedence over the content
    const page = pages[0];
    if (page.error)
      return `Browsing service error: ${page.error}`;
    return page.content;

  } catch (error: any) {
    // transport or unexpected failures are folded into the returned string
    const reason = error?.message || error?.toString() || 'Unknown fetch error';
    return `Browsing error: ${reason}`;
  }
}
158 changes: 158 additions & 0 deletions src/modules/browse/browse.router.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import { z } from 'zod';
import { TRPCError } from '@trpc/server';
import { connect, TimeoutError } from '@cloudflare/puppeteer';

import { createTRPCRouter, publicProcedure } from '~/server/api/trpc.server';
import { env } from '~/server/env.mjs';


// Timeout, in milliseconds, for loading a page: workerPuppeteer passes this value to
// page.setDefaultNavigationTimeout(). Change it to tune the page load and scrape timeout.
const WORKER_TIMEOUT = 10 * 1000; // 10 seconds


// Input schemas

// How to reach the remote browser.
// - dialect: only 'browse-wss' is accepted
// - wssEndpoint: optional, trimmed; when empty, workerPuppeteer falls back to env.PUPPETEER_WSS_ENDPOINT
// - wssKey, userAgent: accepted by the schema, but not read by workerPuppeteer in this file
const browseAccessSchema = z.object({
dialect: z.enum(['browse-wss']),
wssEndpoint: z.string().trim().optional(),
wssKey: z.string().trim().optional(),
userAgent: z.string().optional(),
});

// Request for fetchPages: the access configuration plus the list of pages to load.
// Each subject's url must pass zod's URL validation.
const fetchPageInputSchema = z.object({
access: browseAccessSchema,
subjects: z.array(z.object({
url: z.string().url(),
})),
});


// Output schemas

// Result of loading one page.
// - url: the url that was requested
// - content: the page text; '(no content)' when nothing was scraped, '' when the worker itself threw
// - error: set only when something went wrong (navigation timeouts do not set it)
// - stopReason: 'end' when navigation completed, 'timeout' when it timed out, 'error' otherwise
// - screenshot: optional base64-encoded image with the dimensions reported by the worker
const fetchPageWorkerOutputSchema = z.object({
url: z.string(),
content: z.string(),
error: z.string().optional(),
stopReason: z.enum(['end', 'timeout', 'error']),
screenshot: z.object({
base64: z.string(),
width: z.number(),
height: z.number(),
}).optional(),
});

// Response of fetchPages: one entry per requested subject, in request order.
const fetchPagesOutputSchema = z.object({
objects: z.array(fetchPageWorkerOutputSchema),
});


/**
 * tRPC router for the browse module.
 *
 * `fetchPages` loads every requested subject one after the other (not in
 * parallel) and always resolves: a subject whose worker throws is reported
 * as an entry with `stopReason: 'error'` instead of failing the whole call.
 */
export const browseRouter = createTRPCRouter({

  fetchPages: publicProcedure
    .input(fetchPageInputSchema)
    .output(fetchPagesOutputSchema)
    .mutation(async ({ input }) => {
      const { access, subjects } = input;
      const objects: FetchPageWorkerOutputSchema[] = [];

      for (const { url } of subjects) {
        let outcome: FetchPageWorkerOutputSchema;
        try {
          outcome = await workerPuppeteer(access, url);
        } catch (cause: any) {
          // convert a worker failure into a per-page error entry
          outcome = {
            url,
            content: '',
            error: cause?.message || cause?.toString() || 'Unknown fetch error',
            stopReason: 'error',
          };
        }
        objects.push(outcome);
      }

      return { objects };
    }),

});


// Static types inferred from the zod schemas, used to type the worker's parameters and result.
type BrowseAccessSchema = z.infer<typeof browseAccessSchema>;
type FetchPageWorkerOutputSchema = z.infer<typeof fetchPageWorkerOutputSchema>;

/**
 * Loads a single page in a remote Puppeteer browser, then extracts its text
 * and a small screenshot.
 *
 * Failures of the individual steps (navigation, text extraction, screenshot,
 * closing) are logged and reflected in the result rather than thrown.
 * The remote browser is always closed once a connection was established.
 *
 * @param access connection settings; `wssEndpoint` falls back to `env.PUPPETEER_WSS_ENDPOINT`
 * @param targetUrl the page to load
 * @returns the scrape result; `stopReason` is 'end', 'timeout' or 'error'
 * @throws TRPCError (BAD_REQUEST) when no `wss://` endpoint is configured;
 *         errors from `connect()` or `browser.newPage()` also propagate to the caller
 */
async function workerPuppeteer(access: BrowseAccessSchema, targetUrl: string): Promise<FetchPageWorkerOutputSchema> {

  // access: the request value wins over the server environment
  const browserWSEndpoint = (access.wssEndpoint || env.PUPPETEER_WSS_ENDPOINT || '').trim();
  if (!browserWSEndpoint || !browserWSEndpoint.startsWith('wss://'))
    throw new TRPCError({
      code: 'BAD_REQUEST',
      message: 'Invalid wss:// endpoint',
    });

  // pessimistic defaults, overwritten as the steps below succeed
  const result: FetchPageWorkerOutputSchema = {
    url: targetUrl,
    content: '(no content)',
    error: undefined,
    stopReason: 'error',
    screenshot: undefined,
  };

  // [puppeteer] start the remote session
  const browser = await connect({ browserWSEndpoint });
  try {
    // inside the try, so that a failure here still closes the remote browser
    const page = await browser.newPage();

    // open url
    try {
      page.setDefaultNavigationTimeout(WORKER_TIMEOUT);
      await page.goto(targetUrl);
      result.stopReason = 'end';
    } catch (error: any) {
      // a timeout is expected for slow pages: keep going and scrape what has loaded
      const isExpected: boolean = error instanceof TimeoutError;
      result.stopReason = isExpected ? 'timeout' : 'error';
      if (!isExpected) {
        // parenthesized: '+' binds tighter than '||', which would make the fallbacks unreachable
        result.error = '[Puppeteer] Loading issue: ' + (error?.message || error?.toString() || 'Unknown error');
        console.error('workerPuppeteer: page.goto', error);
      }
    }

    // transform the content of the page as text
    try {
      if (result.stopReason !== 'error') {
        result.content = await page.evaluate(() => {
          // NOTE(review): textContent of a Document node is null per the DOM spec, so this
          // fallback likely never yields text — consider document.body.textContent instead
          const content = document.body.innerText || document.textContent;
          if (!content)
            throw new Error('No content');
          return content;
        });
      }
    } catch (error: any) {
      // best-effort: the '(no content)' placeholder is kept
      console.error('workerPuppeteer: page.evaluate', error);
    }

    // get a screenshot of the page: render a (width / scale) square viewport at the
    // given device scale factor, and report it as width x height
    try {
      const width = 100;
      const height = 100;
      const scale = 0.1; // 10%

      await page.setViewport({ width: width / scale, height: height / scale, deviceScaleFactor: scale });

      result.screenshot = {
        base64: await page.screenshot({
          type: 'webp',
          clip: { x: 0, y: 0, width: width / scale, height: height / scale },
          encoding: 'base64',
        }) as string,
        width,
        height,
      };
    } catch (error: any) {
      // best-effort: the result simply carries no screenshot
      console.error('workerPuppeteer: page.screenshot', error);
    }

  } finally {
    // close the browser (important!) - on every path, to not leak remote sessions
    try {
      await browser.close();
    } catch (error: any) {
      console.error('workerPuppeteer: browser.close', error);
    }
  }

  return result;
}
2 changes: 2 additions & 0 deletions src/server/api/trpc.router-node.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import { createTRPCRouter } from './trpc.server';

import { browseRouter } from '~/modules/browse/browse.router';
import { tradeRouter } from '~/modules/trade/server/trade.router';

/**
* Secondary router, which will be sitting on a NodeJS Runtime.
* Mounts the browse and trade sub-routers.
*/
export const appRouterNode = createTRPCRouter({
browse: browseRouter,
trade: tradeRouter,
});

Expand Down
Loading

0 comments on commit d5e2fbe

Please sign in to comment.