chore: support got-scraping v4 (#233)
* chore: support got-scraping v4

* chore: i am in pain

* chore: bump crawlee

* chore: move to vitest

* chore: lint

* chore: son of a gun

* chore: hallelujah it passes

---------

Co-authored-by: Martin Adámek <[email protected]>
vladfrangu and B4nan authored Nov 27, 2023
1 parent 681e35b commit 661192a
Showing 34 changed files with 2,150 additions and 3,134 deletions.
24 changes: 0 additions & 24 deletions jest.config.js

This file was deleted.

4,678 changes: 1,831 additions & 2,847 deletions package-lock.json

Large diffs are not rendered by default.

19 changes: 9 additions & 10 deletions package.json
@@ -39,9 +39,9 @@
"clean": "turbo run clean",
"build": "turbo run build",
"ci:build": "turbo run build --cache-dir=\".turbo\"",
"test": "jest --silent",
"test": "vitest run --silent",
"test:e2e": "node test/e2e/run.mjs",
"coverage": "jest --coverage",
"coverage": "vitest --coverage",
"release": "npm run build && lerna publish from-package --contents dist",
"publish:next": "lerna publish --canary --preid beta --dist-tag next",
"release:next": "npm run build && npm run publish:next",
@@ -50,37 +50,36 @@
},
"devDependencies": {
"@apify/consts": "^2.20.0",
"@apify/eslint-config-ts": "^0.3.0",
"@apify/eslint-config-ts": "^0.4.1",
"@apify/input_secrets": "^1.1.32",
"@apify/tsconfig": "^0.1.0",
"@commitlint/config-conventional": "^18.0.0",
"@playwright/browser-chromium": "^1.39.0",
"@types/content-type": "^1.1.5",
"@types/fs-extra": "^11.0.1",
"@types/jest": "^29.5.3",
"@types/node": "^20.4.2",
"@types/rimraf": "^4.0.5",
"@types/semver": "^7.5.0",
"@types/tough-cookie": "^4.0.2",
"@types/ws": "^8.5.5",
"@typescript-eslint/eslint-plugin": "^6.2.0",
"@typescript-eslint/parser": "^6.2.0",
"@typescript-eslint/eslint-plugin": "^6.12.0",
"@typescript-eslint/parser": "^6.12.0",
"commitlint": "^18.0.0",
"crawlee": "^3.5.3",
"eslint": "^8.45.0",
"crawlee": "^3.6.1",
"eslint": "^8.54.0",
"fs-extra": "^11.1.1",
"gen-esm-wrapper": "^1.1.3",
"husky": "^8.0.3",
"jest": "^29.6.1",
"lerna": "^7.1.5",
"lint-staged": "^15.0.0",
"playwright": "^1.39.0",
"puppeteer": "^21.3.8",
"rimraf": "^5.0.1",
"ts-jest": "^29.1.1",
"ts-node": "^10.9.1",
"turbo": "1.10.16",
"typescript": "~5.2.0",
"vite-tsconfig-paths": "^4.2.1",
"vitest": "^0.34.6",
"ws": "^7.5.9"
},
"packageManager": "[email protected]"
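The deleted jest.config.js is not replaced by any file rendered in this view, so the Vitest setup presumably lives in a config that GitHub collapsed or that sits elsewhere in the repo. A minimal sketch of what a vitest.config.ts could look like here, assuming the vite-tsconfig-paths package added above is what resolves the monorepo's TypeScript path aliases:

// Hypothetical vitest.config.ts — an illustration, not part of this commit's rendered diff.
import tsconfigPaths from 'vite-tsconfig-paths';
import { defineConfig } from 'vitest/config';

export default defineConfig({
    plugins: [tsconfigPaths()],
    test: {
        // mirrors the --silent flag passed by the npm "test" script
        silent: true,
    },
});

With this in place, npm test maps to vitest run --silent (a single pass, unlike the watch-mode default of plain vitest), and npm run coverage maps to vitest --coverage.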
packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts
@@ -2,6 +2,7 @@ import { readFile } from 'node:fs/promises';
import { IncomingMessage } from 'node:http';
import { dirname } from 'node:path';
import { fileURLToPath, URL } from 'node:url';
+
import {
constants as scraperToolsConstants,
CrawlerSetupOptions,
@@ -26,6 +27,7 @@
} from '@crawlee/cheerio';
import { Actor, ApifyEnv } from 'apify';
import { load } from 'cheerio';
+
import { Input, ProxyRotation } from './consts.js';

const { SESSION_MAX_USAGE_COUNTS, META_KEY } = scraperToolsConstants;
@@ -146,7 +148,7 @@
this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);

// Proxy configuration
- this.proxyConfiguration = await Actor.createProxyConfiguration(this.input.proxyConfiguration);
+ this.proxyConfiguration = await Actor.createProxyConfiguration(this.input.proxyConfiguration) as any as ProxyConfiguration;
}

/**
@@ -240,7 +242,7 @@
});
}

- private _failedRequestHandler({ request }: CheerioCrawlingContext) {
+ private async _failedRequestHandler({ request }: CheerioCrawlingContext) {
const lastError = request.errorMessages[request.errorMessages.length - 1];
const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
log.error(`Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`);
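The cast as any as ProxyConfiguration appears in each scraper's crawler setup in this commit. A plausible reading — an assumption, the commit doesn't explain it — is that after the crawlee bump the ProxyConfiguration instance returned by apify's Actor.createProxyConfiguration() is no longer assignable to the ProxyConfiguration type the @crawlee/* packages expect, so the value is routed through any. A minimal sketch of the pattern, with the import source assumed:

// Illustration only; assumes ProxyConfiguration is re-exported by @crawlee/cheerio,
// as it is imported from @crawlee/playwright and @crawlee/puppeteer later in this diff.
import { CheerioCrawler, ProxyConfiguration } from '@crawlee/cheerio';
import { Actor } from 'apify';

async function buildCrawler() {
    // apify returns its own ProxyConfiguration class; the double cast bridges
    // the nominal mismatch with the @crawlee/cheerio option type.
    const proxyConfiguration = await Actor.createProxyConfiguration() as any as ProxyConfiguration;
    return new CheerioCrawler({ proxyConfiguration });
}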
1 change: 1 addition & 0 deletions packages/actor-scraper/cheerio-scraper/src/main.ts
@@ -1,4 +1,5 @@
import { runActor } from '@apify/scraper-tools';
+
import { CrawlerSetup } from './internals/crawler_setup.js';

runActor(CrawlerSetup);
22 changes: 12 additions & 10 deletions packages/actor-scraper/jsdom-scraper/src/internals/crawler_setup.ts
@@ -2,7 +2,14 @@ import { readFile } from 'node:fs/promises';
import { IncomingMessage } from 'node:http';
import { dirname } from 'node:path';
import { fileURLToPath, URL } from 'node:url';
- import { Actor, ApifyEnv } from 'apify';
+
+ import {
+ constants as scraperToolsConstants,
+ CrawlerSetupOptions,
+ createContext,
+ RequestMetadata,
+ tools,
+ } from '@apify/scraper-tools';
import {
AutoscaledPool,
JSDOMCrawler,
@@ -18,13 +25,8 @@
Dictionary,
Awaitable,
} from '@crawlee/jsdom';
- import {
- constants as scraperToolsConstants,
- CrawlerSetupOptions,
- createContext,
- RequestMetadata,
- tools,
- } from '@apify/scraper-tools';
+ import { Actor, ApifyEnv } from 'apify';
+
import { Input, ProxyRotation } from './consts.js';

const { SESSION_MAX_USAGE_COUNTS, META_KEY } = scraperToolsConstants;
@@ -145,7 +147,7 @@
this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);

// Proxy configuration
- this.proxyConfiguration = await Actor.createProxyConfiguration(this.input.proxyConfiguration);
+ this.proxyConfiguration = await Actor.createProxyConfiguration(this.input.proxyConfiguration) as any as ProxyConfiguration;
}

/**
@@ -241,7 +243,7 @@
});
}

- private _failedRequestHandler({ request }: JSDOMCrawlingContext) {
+ private async _failedRequestHandler({ request }: JSDOMCrawlingContext) {
const lastError = request.errorMessages[request.errorMessages.length - 1];
const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
log.error(`Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`);
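_failedRequestHandler gains an async keyword in each scraper even though its body stays synchronous. The likely reason — again an assumption, not stated in the commit — is that the updated crawlee typings expect the failedRequestHandler option to return a promise, so the method is declared async to satisfy the handler type. A standalone sketch of the same handler wired into a crawler:

// Illustrative wiring; the handler body mirrors the one in the diff above.
import { CheerioCrawler, CheerioCrawlingContext, log } from '@crawlee/cheerio';

async function failedRequestHandler({ request }: CheerioCrawlingContext) {
    const lastError = request.errorMessages[request.errorMessages.length - 1];
    const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
    log.error(`Request ${request.url} failed and will not be retried anymore. Last Error Message: ${errorMessage}`);
}

const crawler = new CheerioCrawler({
    requestHandler: async () => {},
    failedRequestHandler, // an async function returns a Promise, matching the expected handler type
});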
1 change: 1 addition & 0 deletions packages/actor-scraper/jsdom-scraper/src/main.ts
@@ -1,4 +1,5 @@
import { runActor } from '@apify/scraper-tools';
+
import { CrawlerSetup } from './internals/crawler_setup.js';

runActor(CrawlerSetup);
packages/actor-scraper/playwright-scraper/src/internals/crawler_setup.ts
@@ -1,8 +1,8 @@
import { readFile } from 'node:fs/promises';
import { dirname } from 'node:path';
import { fileURLToPath, URL } from 'node:url';
+
import { browserTools, constants as scraperToolsConstants, CrawlerSetupOptions, createContext, RequestMetadata, tools } from '@apify/scraper-tools';
- import { Actor, ApifyEnv } from 'apify';
import {
AutoscaledPool,
Dataset,
@@ -16,10 +16,13 @@
PlaywrightLaunchContext,
EnqueueLinksOptions,
log,
+ ProxyConfiguration,
} from '@crawlee/playwright';
import { Awaitable, Dictionary, sleep } from '@crawlee/utils';
- import playwright, { Response } from 'playwright';
+ import { Actor, ApifyEnv } from 'apify';
import { getInjectableScript } from 'idcac-playwright';
+ import playwright, { Response } from 'playwright';
+
import { Input, ProxyRotation } from './consts.js';

const SESSION_STORE_NAME = 'APIFY-PLAYWRIGHT-SCRAPER-SESSION-STORE';
@@ -180,7 +183,7 @@
maxConcurrency: this.input.maxConcurrency,
maxRequestRetries: this.input.maxRequestRetries,
maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
- proxyConfiguration: await Actor.createProxyConfiguration(this.input.proxyConfiguration),
+ proxyConfiguration: await Actor.createProxyConfiguration(this.input.proxyConfiguration) as any as ProxyConfiguration,
launchContext: {
useChrome: this.input.useChrome,
launcher: playwright[this.input.launcher],
@@ -255,7 +258,7 @@
});
}

- private _failedRequestHandler({ request }: PlaywrightCrawlingContext) {
+ private async _failedRequestHandler({ request }: PlaywrightCrawlingContext) {
const lastError = request.errorMessages[request.errorMessages.length - 1];
const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
log.error(`Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`);
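Much of the remaining churn in these files is import reordering rather than behavioural change: Node builtins first, a blank line, external packages sorted by name (@apify/*, @crawlee/*, apify, idcac-playwright, playwright/puppeteer), another blank line, then local modules. This looks like a consequence of the @apify/eslint-config-ts bump in package.json, though the exact lint rule isn't visible in this diff. The layout the files converge on, shown with a few representative imports:

// Ordering illustration only — read off the diffs above, not a complete file.
import { dirname } from 'node:path';
import { fileURLToPath } from 'node:url';

import { tools } from '@apify/scraper-tools';
import { PlaywrightCrawler } from '@crawlee/playwright';
import { Actor } from 'apify';
import playwright from 'playwright';

import { Input } from './consts.js';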
1 change: 1 addition & 0 deletions packages/actor-scraper/playwright-scraper/src/main.ts
@@ -1,4 +1,5 @@
import { runActor } from '@apify/scraper-tools';
+
import { CrawlerSetup } from './internals/crawler_setup.js';

runActor(CrawlerSetup);
packages/actor-scraper/puppeteer-scraper/src/internals/crawler_setup.ts
@@ -1,8 +1,8 @@
import { readFile } from 'node:fs/promises';
import { dirname } from 'node:path';
import { fileURLToPath, URL } from 'node:url';
+
import { browserTools, constants as scraperToolsConstants, CrawlerSetupOptions, createContext, RequestMetadata, tools } from '@apify/scraper-tools';
- import { Actor, ApifyEnv } from 'apify';
import {
AutoscaledPool,
Dataset,
@@ -16,10 +16,13 @@
PuppeteerCrawlerOptions,
EnqueueLinksOptions,
log,
+ ProxyConfiguration,
} from '@crawlee/puppeteer';
import { Awaitable, Dictionary, sleep } from '@crawlee/utils';
- import { HTTPResponse } from 'puppeteer';
+ import { Actor, ApifyEnv } from 'apify';
import { getInjectableScript } from 'idcac-playwright';
+ import { HTTPResponse } from 'puppeteer';
+
import { Input, ProxyRotation } from './consts.js';

const SESSION_STORE_NAME = 'APIFY-PUPPETEER-SCRAPER-SESSION-STORE';
@@ -178,7 +181,7 @@
maxConcurrency: this.input.maxConcurrency,
maxRequestRetries: this.input.maxRequestRetries,
maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
- proxyConfiguration: await Actor.createProxyConfiguration(this.input.proxyConfiguration),
+ proxyConfiguration: await Actor.createProxyConfiguration(this.input.proxyConfiguration) as any as ProxyConfiguration,
launchContext: {
useChrome: this.input.useChrome,
launchOptions: {
@@ -254,7 +257,7 @@
});
}

- private _failedRequestHandler({ request }: PuppeteerCrawlingContext) {
+ private async _failedRequestHandler({ request }: PuppeteerCrawlingContext) {
const lastError = request.errorMessages[request.errorMessages.length - 1];
const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
log.error(`Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`);
1 change: 1 addition & 0 deletions packages/actor-scraper/puppeteer-scraper/src/main.ts
@@ -1,4 +1,5 @@
import { runActor } from '@apify/scraper-tools';
+
import { CrawlerSetup } from './internals/crawler_setup.js';

runActor(CrawlerSetup);
packages/actor-scraper/web-scraper/src/internals/bundle.browser.ts
@@ -1,8 +1,7 @@
/* eslint-disable max-classes-per-file */

- import type { CrawlerSetupOptions, constants, RequestMetadata } from '@apify/scraper-tools';
import { Log } from '@apify/log';
- import type { ApifyEnv } from 'apify';
+ import type { CrawlerSetupOptions, constants, RequestMetadata } from '@apify/scraper-tools';
import type {
KeyValueStore,
RecordOptions,
@@ -12,6 +11,8 @@ import type {
RequestQueueOperationOptions,
Dictionary,
} from '@crawlee/puppeteer';
+ import type { ApifyEnv } from 'apify';
+
import { Input } from './consts';
import { GlobalStore } from './global_store';

@@ -139,11 +140,11 @@
this.waitFor = this.waitFor.bind(this);
}

- getValue<T>(...args: Parameters<KeyValueStore['getValue']>) {
+ async getValue<T>(...args: Parameters<KeyValueStore['getValue']>) {
return this[internalState].keyValueStore!.getValue(...args) as Promise<T>;
}

- setValue<T>(...args: Parameters<KeyValueStore['setValue']>) {
+ async setValue<T>(...args: Parameters<KeyValueStore['setValue']>) {
return this[internalState].keyValueStore!.setValue(...args as [key: string, value: T | null, options?: RecordOptions]);
}

packages/actor-scraper/web-scraper/src/internals/crawler_setup.ts
@@ -1,9 +1,9 @@
import { readFile } from 'node:fs/promises';
import { dirname } from 'node:path';
- import { fileURLToPath, URL } from 'node:url';
import { setTimeout } from 'node:timers/promises';
+ import { fileURLToPath, URL } from 'node:url';
+
import { browserTools, constants as scraperToolsConstants, CrawlerSetupOptions, createContext, tools } from '@apify/scraper-tools';
- import { Actor, ApifyEnv } from 'apify';
import {
AutoscaledPool,
Dataset,
@@ -18,12 +18,15 @@
log,
Awaitable,
Dictionary,
+ ProxyConfiguration,
} from '@crawlee/puppeteer';
+ import { Actor, ApifyEnv } from 'apify';
import contentType from 'content-type';
// @ts-expect-error no typings
import DevToolsServer from 'devtools-server';
- import { HTTPResponse, Page } from 'puppeteer';
import { getInjectableScript } from 'idcac-playwright';
+ import { HTTPResponse, Page } from 'puppeteer';
+
import { createBundle } from './bundle.browser.js';
import { BreakpointLocation, CHROME_DEBUGGER_PORT, Input, ProxyRotation, RunMode } from './consts.js';
import { GlobalStore } from './global_store.js';
@@ -205,7 +208,7 @@
maxConcurrency: this.isDevRun ? MAX_CONCURRENCY_IN_DEVELOPMENT : this.input.maxConcurrency,
maxRequestRetries: this.input.maxRequestRetries,
maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
- proxyConfiguration: await Actor.createProxyConfiguration(this.input.proxyConfiguration),
+ proxyConfiguration: await Actor.createProxyConfiguration(this.input.proxyConfiguration) as any as ProxyConfiguration,
browserPoolOptions: {
preLaunchHooks: [
async () => {
@@ -348,7 +351,7 @@
});
}

- private _failedRequestHandler({ request }: PuppeteerCrawlingContext) {
+ private async _failedRequestHandler({ request }: PuppeteerCrawlingContext) {
const lastError = request.errorMessages[request.errorMessages.length - 1];
const errorMessage = lastError ? lastError.split('\n')[0] : 'no error';
log.error(`Request ${request.url} failed and will not be retried anymore. Marking as failed.\nLast Error Message: ${errorMessage}`);
@@ -583,7 +586,7 @@
}

private async _injectBrowserHandles(page: Page, pageContext: PageContext) {
- const saveSnapshotP = browserTools.createBrowserHandle(page, () => browserTools.saveSnapshot({ page }));
+ const saveSnapshotP = browserTools.createBrowserHandle(page, async () => browserTools.saveSnapshot({ page }));
const skipLinksP = browserTools.createBrowserHandle(page, () => { pageContext.skipLinks = true; });
const globalStoreP = browserTools.createBrowserHandlesForObject(
page,
1 change: 1 addition & 0 deletions packages/actor-scraper/web-scraper/src/main.ts
@@ -1,4 +1,5 @@
import { runActor } from '@apify/scraper-tools';
+
import { CrawlerSetup } from './internals/crawler_setup.js';

runActor(CrawlerSetup);

0 comments on commit 661192a
