Skip to content

Commit

Permalink
fix: session.markBad() on requestHandler error (#1709)
Browse files Browse the repository at this point in the history
  • Loading branch information
barjin authored Dec 9, 2022
1 parent f731144 commit e87eb1f
Show file tree
Hide file tree
Showing 7 changed files with 105 additions and 2 deletions.
4 changes: 2 additions & 2 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -943,7 +943,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext

// reclaim session if request finishes successfully
request.state = RequestState.DONE;
session?.markGood();
crawlingContext.session?.markGood();
} catch (err) {
try {
request.state = RequestState.ERROR_HANDLER;
Expand All @@ -964,7 +964,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
throw secondaryError;
}
// decrease the session score if the request fails (but the error handler did not throw)
session?.markBad();
crawlingContext.session?.markBad();
} finally {
await this._cleanupContext(crawlingContext);

Expand Down
7 changes: 7 additions & 0 deletions test/e2e/session-rotation/actor/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
.idea
.DS_Store
node_modules
package-lock.json
apify_storage
crawlee_storage
storage
23 changes: 23 additions & 0 deletions test/e2e/session-rotation/actor/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
FROM node:16 AS builder

COPY /packages ./packages
COPY /package*.json ./
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& npm update

FROM apify/actor-node-playwright-chrome:16-beta

RUN rm -r node_modules
COPY --from=builder /node_modules ./node_modules
COPY --from=builder /packages ./packages
COPY --from=builder /package*.json ./
COPY /apify.json ./
COPY /main.js ./

RUN echo "Installed NPM packages:" \
&& (npm list --only=prod --no-optional --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
6 changes: 6 additions & 0 deletions test/e2e/session-rotation/actor/apify.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"name": "test-session-rotation",
"version": "0.0",
"buildTag": "latest",
"env": null
}
26 changes: 26 additions & 0 deletions test/e2e/session-rotation/actor/main.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import { Actor } from 'apify';
import { PlaywrightCrawler } from '@crawlee/playwright';
import { ApifyStorageLocal } from '@apify/storage-local';

const mainOptions = {
exit: Actor.isAtHome(),
storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new ApifyStorageLocal() : undefined,
};

await Actor.main(async () => {
const crawler = new PlaywrightCrawler({
maxRequestRetries: 10,
sessionPoolOptions: {
sessionOptions: {
maxErrorScore: 2,
},
},
requestHandler: async ({ session }) => {
const { id, usageCount, errorScore } = session;
Actor.pushData({ id, usageCount, errorScore });

This comment has been minimized.

Copy link
@metalwarrior665

metalwarrior665 Dec 14, 2022

Member

@barjin Missing await?

throw new Error('retry');
},
});

await crawler.run(['https://crawlee.dev/']);
}, mainOptions);
29 changes: 29 additions & 0 deletions test/e2e/session-rotation/actor/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"name": "test-session-rotation",
"version": "0.0.1",
"description": "Session Test - Rotation",
"dependencies": {
"apify": "next",
"@apify/storage-local": "^2.1.0",
"@crawlee/basic": "file:./packages/basic-crawler",
"@crawlee/browser": "file:./packages/browser-crawler",
"@crawlee/browser-pool": "file:./packages/browser-pool",
"@crawlee/core": "file:./packages/core",
"@crawlee/memory-storage": "file:./packages/memory-storage",
"@crawlee/puppeteer": "file:./packages/puppeteer-crawler",
"@crawlee/types": "file:./packages/types",
"@crawlee/utils": "file:./packages/utils",
"puppeteer": "*"
},
"overrides": {
"apify": {
"@crawlee/core": "file:./packages/core",
"@crawlee/utils": "file:./packages/utils"
}
},
"scripts": {
"start": "node main.js"
},
"type": "module",
"license": "ISC"
}
12 changes: 12 additions & 0 deletions test/e2e/session-rotation/test.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import { initialize, getActorTestDir, runActor, expect } from '../tools.mjs';

const testActorDirname = getActorTestDir(import.meta.url);
await initialize(testActorDirname);

const { datasetItems } = await runActor(testActorDirname, 4096);

await expect(datasetItems.length === 11, 'Retried correct number of times');
await expect(
datasetItems.map(
(session) => datasetItems.filter((s) => s.id === session.id),
).every((x) => x.length <= 2), 'No session used more than three times');

0 comments on commit e87eb1f

Please sign in to comment.