Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add support use_incognito_pages for browser_launch_options in PlaywrightCrawler #941

Merged
merged 9 commits into from
Feb 5, 2025
Merged
26 changes: 19 additions & 7 deletions src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,20 +41,23 @@ def __init__(
browser: Browser,
*,
max_open_pages_per_browser: int = 20,
use_incognito_pages: bool = False,
header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR,
) -> None:
"""A default constructor.

Args:
browser: The browser instance to control.
max_open_pages_per_browser: The maximum number of pages that can be open at the same time.
use_incognito_pages: each page opens in its own context.
Mantisus marked this conversation as resolved.
Show resolved Hide resolved
header_generator: An optional `HeaderGenerator` instance used to generate and manage HTTP headers for
requests made by the browser. By default, a predefined header generator is used. Set to `None` to
disable automatic header modifications.
"""
self._browser = browser
self._max_open_pages_per_browser = max_open_pages_per_browser
self._header_generator = header_generator
self._use_incognito_pages = use_incognito_pages

self._browser_context: BrowserContext | None = None
self._pages = list[Page]()
Expand Down Expand Up @@ -115,13 +118,16 @@ async def new_page(
Raises:
ValueError: If the browser has reached the maximum number of open pages.
"""
if not self._browser_context:
self._browser_context = await self._create_browser_context(browser_new_context_options, proxy_info)

if not self.has_free_capacity:
raise ValueError('Cannot open more pages in this browser.')

page = await self._browser_context.new_page()
if self._use_incognito_pages:
page_context_options = self._create_context_options(browser_new_context_options, proxy_info)
page = await self._browser.new_page(**page_context_options)
else:
if not self._browser_context:
self._browser_context = await self._create_browser_context(browser_new_context_options, proxy_info)
page = await self._browser_context.new_page()
vdusek marked this conversation as resolved.
Show resolved Hide resolved

# Handle page close event
page.on(event='close', f=self._on_page_close)
Expand Down Expand Up @@ -153,10 +159,10 @@ def _on_page_close(self, page: Page) -> None:
"""Handle actions after a page is closed."""
self._pages.remove(page)

async def _create_browser_context(
def _create_context_options(
self, browser_new_context_options: Mapping[str, Any] | None = None, proxy_info: ProxyInfo | None = None
) -> BrowserContext:
"""Create a new browser context with the specified proxy settings."""
) -> Mapping[str, Any]:
"""Create context options for context and single pages with the specified proxy settings."""
if self._header_generator:
common_headers = self._header_generator.get_common_headers()
sec_ch_ua_headers = self._header_generator.get_sec_ch_ua_headers(browser_type=self.browser_type)
Expand All @@ -179,5 +185,11 @@ async def _create_browser_context(
username=proxy_info.username,
password=proxy_info.password,
)
return browser_new_context_options

async def _create_browser_context(
    self, browser_new_context_options: Mapping[str, Any] | None = None, proxy_info: ProxyInfo | None = None
) -> BrowserContext:
    """Open a new browser context configured with headers and proxy settings.

    Args:
        browser_new_context_options: Keyword arguments forwarded to Playwright's `new_context`.
        proxy_info: Optional proxy configuration merged into the context options.

    Returns:
        The newly created Playwright browser context.
    """
    # Assemble the final option mapping (headers + proxy) once, then hand it to Playwright.
    context_options = self._create_context_options(browser_new_context_options, proxy_info)
    return await self._browser.new_context(**context_options)
2 changes: 2 additions & 0 deletions src/crawlee/browsers/_playwright_browser_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def __init__(
self._browser_launch_options = default_launch_browser_options | (browser_launch_options or {})
self._browser_new_context_options = browser_new_context_options or {}
self._max_open_pages_per_browser = max_open_pages_per_browser
self._use_incognito_pages: bool = self._browser_launch_options.pop('use_incognito_pages', False)
vdusek marked this conversation as resolved.
Show resolved Hide resolved

self._playwright_context_manager = async_playwright()
self._playwright: Playwright | None = None
Expand Down Expand Up @@ -154,5 +155,6 @@ async def new_browser(self) -> PlaywrightBrowserController:

return PlaywrightBrowserController(
browser,
use_incognito_pages=self._use_incognito_pages,
max_open_pages_per_browser=self._max_open_pages_per_browser,
)
21 changes: 21 additions & 0 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import TYPE_CHECKING, Any, Callable

from pydantic import ValidationError
from yarl import URL

from crawlee import EnqueueStrategy, RequestTransformAction
from crawlee._request import Request, RequestOptions
Expand All @@ -22,6 +23,7 @@
if TYPE_CHECKING:
from collections.abc import AsyncGenerator, Awaitable, Mapping

from playwright.async_api import Page
from typing_extensions import Unpack

from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs
Expand Down Expand Up @@ -175,6 +177,9 @@ async def _navigate(
infinite_scroll and block_requests).
"""
async with context.page:
if context.session:
await self._set_cookies(context.page, context.request.url, context.session.cookies)

if context.request.headers:
await context.page.set_extra_http_headers(context.request.headers.model_dump())
# Navigate to the URL and get response.
Expand All @@ -186,6 +191,10 @@ async def _navigate(
# Set the loaded URL to the actual URL after redirection.
context.request.loaded_url = context.page.url

if context.session:
cookies = await self._get_cookies(context.page)
context.session.cookies.update(cookies)

async def enqueue_links(
*,
selector: str = 'a',
Expand Down Expand Up @@ -294,3 +303,15 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext],
hook: A coroutine function to be called before each navigation.
"""
self._pre_navigation_hooks.append(hook)

async def _get_cookies(self, page: Page) -> dict[str, str]:
"""Get the cookies from the page."""
cookies = await page.context.cookies()
return {cookie['name']: cookie['value'] for cookie in cookies if cookie.get('name') and cookie.get('value')}

async def _set_cookies(self, page: Page, url: str, cookies: dict[str, str]) -> None:
    """Install the given cookies into the page's browser context.

    Each cookie is scoped to the host of ``url`` with the root path.
    """
    host = URL(url).host
    cookie_records = []
    for name, value in cookies.items():
        cookie_records.append({'name': name, 'value': value, 'domain': host, 'path': '/'})
    await page.context.add_cookies(cookie_records)
4 changes: 2 additions & 2 deletions src/crawlee/sessions/_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __init__(
usage_count: int = 0,
max_usage_count: int = 50,
error_score: float = 0.0,
cookies: dict | None = None,
cookies: dict[str, str] | None = None,
blocked_status_codes: list | None = None,
) -> None:
"""A default constructor.
Expand Down Expand Up @@ -94,7 +94,7 @@ def user_data(self) -> dict:
return self._user_data

@property
def cookies(self) -> dict:
def cookies(self) -> dict[str, str]:
"""Get the cookies."""
return self._cookies

Expand Down
80 changes: 80 additions & 0 deletions tests/unit/crawlers/_playwright/test_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from typing import TYPE_CHECKING
from unittest import mock

import pytest

from crawlee import Glob, HttpHeaders, Request, RequestTransformAction
from crawlee._types import EnqueueStrategy
from crawlee.crawlers import PlaywrightCrawler
Expand All @@ -19,6 +21,7 @@
PW_FIREFOX_HEADLESS_DEFAULT_USER_AGENT,
)
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import SessionPool

if TYPE_CHECKING:
from yarl import URL
Expand Down Expand Up @@ -242,3 +245,80 @@ async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:
await crawler.run(['https://test.com'])

assert handler_data.get('proxy') == proxy_value


@pytest.mark.parametrize(
'use_incognito_pages',
[False, True],
ids=['without use_incognito_pages', 'with use_incognito_pages'],
)
janbuchar marked this conversation as resolved.
Show resolved Hide resolved
async def test_isolation_cookies(*, use_incognito_pages: bool, httpbin: URL) -> None:
    """Check cookie isolation between sessions with and without `use_incognito_pages`.

    A cookie is set for the first session; after that session is retired, a second (clean)
    session requests the cookie endpoint. With incognito pages the clean session must see no
    cookies; without them the shared Playwright context leaks the cookie to the clean session.
    """
    sessions_id: list[str] = []
    sessions_cookies: dict[str, dict[str, str]] = {}
    response_cookies: dict[str, dict[str, str]] = {}

    crawler = PlaywrightCrawler(
        session_pool=SessionPool(max_pool_size=1), browser_launch_options={'use_incognito_pages': use_incognito_pages}
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        if context.session:
            sessions_id.append(context.session.id)

        # Add to the queue the request that will be made by the session with the cookie
        await context.add_requests(
            [
                Request.from_url(str(httpbin.with_path('/cookies')), unique_key='1', label='cookies'),
            ]
        )

    @crawler.router.handler('cookies')
    async def cookies_handler(context: PlaywrightCrawlingContext) -> None:
        if context.session:
            # The second request ('2') is served by the fresh session created after retirement.
            if context.request.unique_key == '2':
                sessions_id.append(context.session.id)

            sessions_cookies[context.session.id] = context.session.cookies
            response_data = json.loads(await context.response.text())
            response_cookies[context.session.id] = response_data.get('cookies')
            context.session.retire()

            # The session with the cookie is retired. The next request should be made by a session without a cookie
            if context.request.unique_key == '1':
                await context.add_requests(
                    [
                        Request.from_url(str(httpbin.with_path('/cookies')), unique_key='2', label='cookies'),
                    ]
                )

    await crawler.run(
        [
            str(httpbin.with_path('/cookies/set').extend_query(a=1)),
        ]
    )

    assert len(sessions_cookies) == 2
    assert len(response_cookies) == 2

    cookie_session_id = sessions_id[0]
    clean_session_id = sessions_id[1]

    assert cookie_session_id != clean_session_id

    # When using `use_incognito_pages` there should be full cookie isolation
    if use_incognito_pages:
        # The initiated cookies must match in both the response and the session store
        assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}

        # For a clean session, the cookie should not be in the session store or in the response
        # This way we can be sure that no cookies are being leaked through the http client
        assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {}
    # Without `use_incognito_pages` we will have access to the session cookie,
    # but there will be a cookie leak via PlaywrightContext
    else:
        # The initiated cookies must match in both the response and the session store
        assert sessions_cookies[cookie_session_id] == response_cookies[cookie_session_id] == {'a': '1'}

        # PlaywrightContext makes cookies shared by all sessions that work with it.
        # So in this case a clean session contains the same cookies
        assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {'a': '1'}
Loading