From caee03fe3a43cc1d7a8d3f9e19b42df1bdb1c0aa Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Fri, 31 Jan 2025 13:22:05 +0200 Subject: [PATCH] fix: fix session management with retire (#947) ### Description - Fix cases where a session was not excluded from the pool after calling `session.retire()` inside a successful `request_handler`. --- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- .../crawlers/_basic/test_basic_crawler.py | 30 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index e2bd3c9f68..eaac5fbc5b 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1029,7 +1029,7 @@ async def __run_task_function(self) -> None: request.state = RequestState.DONE - if context.session: + if context.session and context.session.is_usable: context.session.mark_good() self._statistics.record_request_processing_finish(statistics_id) diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 4bf425a566..526fa09c8d 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -25,6 +25,7 @@ from crawlee.errors import SessionError, UserDefinedErrorHandlerError from crawlee.events._local_event_manager import LocalEventManager from crawlee.request_loaders import RequestList, RequestManagerTandem +from crawlee.sessions import SessionPool from crawlee.statistics import FinalStatistics from crawlee.storage_clients import MemoryStorageClient from crawlee.storage_clients._memory import DatasetClient @@ -1169,3 +1170,32 @@ async def handler(context: BasicCrawlingContext) -> None: await asyncio.gather(crawler_run_task, add_request_task) mocked_handler.assert_has_calls(expected_handler_calls) + + +@pytest.mark.parametrize( + ('retire'), 
[ + pytest.param(False, id='without retire'), + pytest.param(True, id='with retire'), + ], +) +async def test_session_retire_in_user_handler(*, retire: bool) -> None: + crawler = BasicCrawler(session_pool=SessionPool(max_pool_size=1)) + sessions = list[str]() + + @crawler.router.default_handler + async def handler(context: BasicCrawlingContext) -> None: + if context.session: + sessions.append(context.session.id) + + context.session.retire() if retire else None + + await context.add_requests(['http://b.com/']) + + await crawler.run(['http://a.com/']) + + # The session should differ if `retire` was called and match otherwise since pool size == 1 + if retire: + assert sessions[1] != sessions[0] + else: + assert sessions[1] == sessions[0]