From e87eb1f2ccd9585f8d53cb03ec671cedf23a06b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Fri, 9 Dec 2022 15:46:25 +0100 Subject: [PATCH] fix: session.markBad() on requestHandler error (#1709) fixes #1635 Caused by [this](https://github.com/apify/crawlee/blob/5ff04faa85c3a6b6f02cd58a91b46b80610d8ae6/packages/browser-crawler/src/internals/browser-crawler.ts#L524). Oh, don't we all love random modifications to passed parameters... --- .../src/internals/basic-crawler.ts | 4 +-- test/e2e/session-rotation/actor/.gitignore | 7 +++++ test/e2e/session-rotation/actor/Dockerfile | 23 +++++++++++++++ test/e2e/session-rotation/actor/apify.json | 6 ++++ test/e2e/session-rotation/actor/main.js | 26 +++++++++++++++++ test/e2e/session-rotation/actor/package.json | 29 +++++++++++++++++++ test/e2e/session-rotation/test.mjs | 12 ++++++++ 7 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 test/e2e/session-rotation/actor/.gitignore create mode 100644 test/e2e/session-rotation/actor/Dockerfile create mode 100644 test/e2e/session-rotation/actor/apify.json create mode 100644 test/e2e/session-rotation/actor/main.js create mode 100644 test/e2e/session-rotation/actor/package.json create mode 100644 test/e2e/session-rotation/test.mjs diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 4a2719a883ee..91a27c8c58b1 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -943,7 +943,7 @@ export class BasicCrawler { + const crawler = new PlaywrightCrawler({ + maxRequestRetries: 10, + sessionPoolOptions: { + sessionOptions: { + maxErrorScore: 2, + }, + }, + requestHandler: async ({ session }) => { + const { id, usageCount, errorScore } = session; + Actor.pushData({ id, usageCount, errorScore }); + throw new Error('retry'); + }, + }); + + await crawler.run(['https://crawlee.dev/']); +}, mainOptions); 
diff --git a/test/e2e/session-rotation/actor/package.json b/test/e2e/session-rotation/actor/package.json new file mode 100644 index 000000000000..56132f00fa69 --- /dev/null +++ b/test/e2e/session-rotation/actor/package.json @@ -0,0 +1,29 @@ +{ + "name": "test-session-rotation", + "version": "0.0.1", + "description": "Session Test - Rotation", + "dependencies": { + "apify": "next", + "@apify/storage-local": "^2.1.0", + "@crawlee/basic": "file:./packages/basic-crawler", + "@crawlee/browser": "file:./packages/browser-crawler", + "@crawlee/browser-pool": "file:./packages/browser-pool", + "@crawlee/core": "file:./packages/core", + "@crawlee/memory-storage": "file:./packages/memory-storage", + "@crawlee/puppeteer": "file:./packages/puppeteer-crawler", + "@crawlee/types": "file:./packages/types", + "@crawlee/utils": "file:./packages/utils", + "puppeteer": "*" + }, + "overrides": { + "apify": { + "@crawlee/core": "file:./packages/core", + "@crawlee/utils": "file:./packages/utils" + } + }, + "scripts": { + "start": "node main.js" + }, + "type": "module", + "license": "ISC" +} diff --git a/test/e2e/session-rotation/test.mjs b/test/e2e/session-rotation/test.mjs new file mode 100644 index 000000000000..5ffd7300b384 --- /dev/null +++ b/test/e2e/session-rotation/test.mjs @@ -0,0 +1,12 @@ +import { initialize, getActorTestDir, runActor, expect } from '../tools.mjs'; + +const testActorDirname = getActorTestDir(import.meta.url); +await initialize(testActorDirname); + +const { datasetItems } = await runActor(testActorDirname, 4096); + +await expect(datasetItems.length === 11, 'Retried correct number of times'); +await expect( + datasetItems.map( + (session) => datasetItems.filter((s) => s.id === session.id), + ).every((x) => x.length <= 2), 'No session used more than two times');