From d0f6d56ed7b9878707fdb1655f2dbb3c42215bc6 Mon Sep 17 00:00:00 2001 From: kunwarsodhi Date: Wed, 18 Dec 2024 12:06:49 -0800 Subject: [PATCH 1/7] Added extensions support --- sdk/harambe/contrib/playwright/harness.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/sdk/harambe/contrib/playwright/harness.py b/sdk/harambe/contrib/playwright/harness.py index 2ff5ede..230a4a2 100644 --- a/sdk/harambe/contrib/playwright/harness.py +++ b/sdk/harambe/contrib/playwright/harness.py @@ -1,4 +1,6 @@ import json +import shutil +import tempfile from collections import defaultdict from contextlib import asynccontextmanager from typing import Any, AsyncGenerator, Awaitable, Callable, Optional, Sequence, cast @@ -39,6 +41,7 @@ async def playwright_harness( browser_type: Optional[BrowserType] = None, enable_clipboard: bool = False, launch_args: Sequence[str] = (), + extensions: Sequence[str] = (), **__: Any, ) -> AsyncGenerator[PageFactory, None]: """ @@ -47,13 +50,27 @@ async def playwright_harness( creation of HAR file, and stealth. """ async with async_playwright() as p: + extension_args = [] + user_data_dir = tempfile.mkdtemp() + browser_type = browser_type or "chromium" + + if extensions and browser_type == "chromium": + extension_paths = ",".join(extensions) + extension_args.extend( + [ + f"--disable-extensions-except={extension_paths}", + f"--load-extension={extension_paths}", + ] + ) + browser = await ( p.chromium.connect_over_cdp(endpoint_url=cdp_endpoint) if cdp_endpoint - else getattr(p, cast(str, browser_type or "chromium")).launch( + else getattr(p, cast(str, browser_type)).launch( headless=headless, args=[ *launch_args, + *extension_args, *( # Disables navigator.webdriver showing up ["--disable-blink-features=AutomationControlled"] @@ -135,3 +152,4 @@ async def page_factory(*_: Any, **__: Any) -> PlaywrightPage: finally: await ctx.close() await browser.close() + shutil.rmtree(user_data_dir, ignore_errors=True) From 819216bbcb4b06cf27b039edefae0a3b8a77b6cd Mon Sep 17 00:00:00 2001 From: kunwarsodhi Date: Wed, 18 Dec 2024 12:08:09 -0800 Subject: [PATCH 2/7] Updated pyproject file --- core/pyproject.toml | 2 +- sdk/pyproject.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core/pyproject.toml b/core/pyproject.toml index 47259d2..1cef0a9 100644 --- a/core/pyproject.toml +++ b/core/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harambe-core" -version = "0.56.0" +version = "0.57.0" description = "Core types for harambe SDK 🐒🍌" authors = [ { name = "Adam Watkins", email = "adam@reworkd.ai" } diff --git a/sdk/pyproject.toml b/sdk/pyproject.toml index 2feaca4..dd3b8c6 100644 --- a/sdk/pyproject.toml +++ b/sdk/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harambe-sdk" -version = "0.56.0" +version = "0.57.0" description = "Data extraction SDK for Playwright 🐒🍌" authors = [ { name = "Adam Watkins", email = "adam@reworkd.ai" } @@ -8,7 +8,7 @@ authors = [ requires-python = ">=3.11,<4.0" readme = "README.md" dependencies = [ - "harambe_core==0.56.0", + "harambe_core==0.57.0", "playwright==1.47.0", "beautifulsoup4==4.12.3", "requests==2.32.3", From 018819f329b773b888a27ead19a711cccd91bb3c Mon Sep 17 00:00:00 2001 From: kunwarsodhi Date: Wed, 18 Dec 2024 12:26:12 -0800 Subject: [PATCH 3/7] removed user data dir --- sdk/harambe/contrib/playwright/harness.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sdk/harambe/contrib/playwright/harness.py b/sdk/harambe/contrib/playwright/harness.py index 230a4a2..552d961 100644 --- a/sdk/harambe/contrib/playwright/harness.py +++ b/sdk/harambe/contrib/playwright/harness.py @@ -1,6 +1,7 @@ import json import shutil import tempfile + from collections import defaultdict from contextlib import asynccontextmanager from typing import Any, AsyncGenerator, Awaitable, Callable, Optional, Sequence, cast @@ -51,7 +52,6 @@ async def playwright_harness( """ async with async_playwright() as p: extension_args = [] - user_data_dir = tempfile.mkdtemp() browser_type = browser_type or "chromium" if extensions and browser_type == "chromium": @@ -151,5 +151,4 @@ async def page_factory(*_: Any, **__: Any) -> PlaywrightPage: await on_end(ctx) finally: await ctx.close() - await browser.close() - shutil.rmtree(user_data_dir, ignore_errors=True) + await browser.close() \ No newline at end of file From 4b3a6a1ed34497cbecb74502cacffb3060d997a1 Mon Sep 17 00:00:00 2001 From: kunwarsodhi Date: Wed, 18 Dec 2024 12:30:27 -0800 Subject: [PATCH 4/7] fixed formatting --- core/uv.lock | 2 +- sdk/harambe/contrib/playwright/harness.py | 2 +- sdk/uv.lock | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/core/uv.lock b/core/uv.lock index dd5a038..76b8fdb 100644 --- a/core/uv.lock +++ b/core/uv.lock @@ -115,7 +115,7 @@ wheels = [ [[package]] name = "harambe-core" -version = "0.56.0" +version = "0.57.0" source = { virtual = "." } dependencies = [ { name = "dateparser" }, diff --git a/sdk/harambe/contrib/playwright/harness.py b/sdk/harambe/contrib/playwright/harness.py index 552d961..df3108d 100644 --- a/sdk/harambe/contrib/playwright/harness.py +++ b/sdk/harambe/contrib/playwright/harness.py @@ -151,4 +151,4 @@ async def page_factory(*_: Any, **__: Any) -> PlaywrightPage: await on_end(ctx) finally: await ctx.close() - await browser.close() \ No newline at end of file + await browser.close() diff --git a/sdk/uv.lock b/sdk/uv.lock index 2f51499..fe43f07 100644 --- a/sdk/uv.lock +++ b/sdk/uv.lock @@ -428,7 +428,7 @@ wheels = [ [[package]] name = "harambe-core" -version = "0.56.0" +version = "0.57.0" source = { editable = "../core" } dependencies = [ { name = "dateparser" }, @@ -459,7 +459,7 @@ dev = [ [[package]] name = "harambe-sdk" -version = "0.56.0" +version = "0.57.0" source = { virtual = "." } dependencies = [ { name = "aiohttp" }, From 889871724ffdde6843d7fc67db962d61b966ef23 Mon Sep 17 00:00:00 2001 From: kunwarsodhi Date: Wed, 18 Dec 2024 12:42:04 -0800 Subject: [PATCH 5/7] Added to harness options type --- sdk/harambe/contrib/playwright/harness.py | 3 --- sdk/harambe/types.py | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/sdk/harambe/contrib/playwright/harness.py b/sdk/harambe/contrib/playwright/harness.py index df3108d..13c9edb 100644 --- a/sdk/harambe/contrib/playwright/harness.py +++ b/sdk/harambe/contrib/playwright/harness.py @@ -1,7 +1,4 @@ import json -import shutil -import tempfile - from collections import defaultdict from contextlib import asynccontextmanager from typing import Any, AsyncGenerator, Awaitable, Callable, Optional, Sequence, cast diff --git a/sdk/harambe/types.py b/sdk/harambe/types.py index f825183..34113dc 100644 --- a/sdk/harambe/types.py +++ b/sdk/harambe/types.py @@ -71,3 +71,4 @@ class HarnessOptions(TypedDict, total=False): disable_go_to_url: bool on_start: Optional[Callback] on_end: Optional[Callback] + extensions: Sequence[str] \ No newline at end of file From 0bc2416285fe352f427b4277e1688234750db50d Mon Sep 17 00:00:00 2001 From: kunwarsodhi Date: Wed, 18 Dec 2024 12:45:37 -0800 Subject: [PATCH 6/7] fixed formatting --- sdk/harambe/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/harambe/types.py b/sdk/harambe/types.py index 34113dc..46e73e8 100644 --- a/sdk/harambe/types.py +++ b/sdk/harambe/types.py @@ -71,4 +71,4 @@ class HarnessOptions(TypedDict, total=False): disable_go_to_url: bool on_start: Optional[Callback] on_end: Optional[Callback] - extensions: Sequence[str] \ No newline at end of file + extensions: Sequence[str] From 523b436e530e4a9b78f40e82b37c2f21efb8d16f Mon Sep 17 00:00:00 2001 From: kunwar-reworkd Date: Fri, 20 Dec 2024 14:29:17 -0800 Subject: [PATCH 7/7] =?UTF-8?q?=F0=9F=94=84=20Added=20ignore=20goto=20erro?= =?UTF-8?q?r=20flag=20(#104)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Added goto error flag * Updated version * Added callback * Defaulted function in params * converted to str/int only * Updated callback --- core/pyproject.toml | 2 +- core/uv.lock | 4 ++-- sdk/harambe/core.py | 7 ++++++- sdk/pyproject.toml | 4 ++-- sdk/test/test_e2e.py | 41 ++++++++++++++++++++++++++++++++++------- sdk/uv.lock | 6 +++--- 6 files changed, 48 insertions(+), 16 deletions(-) diff --git a/core/pyproject.toml b/core/pyproject.toml index 1cef0a9..6e9696d 100644 --- a/core/pyproject.toml +++ b/core/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harambe-core" -version = "0.57.0" +version = "0.58.0" description = "Core types for harambe SDK 🐒🍌" authors = [ { name = "Adam Watkins", email = "adam@reworkd.ai" } diff --git a/core/uv.lock b/core/uv.lock index 76b8fdb..4eb3c7e 100644 --- a/core/uv.lock +++ b/core/uv.lock @@ -115,7 +115,7 @@ wheels = [ [[package]] name = "harambe-core" -version = "0.57.0" +version = "0.58.0" source = { virtual = "." } dependencies = [ { name = "dateparser" }, @@ -502,7 +502,7 @@ name = "tzlocal" version = "5.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "tzdata", marker = "platform_system == 'Windows'" }, + { name = "tzdata", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/04/d3/c19d65ae67636fe63953b20c2e4a8ced4497ea232c43ff8d01db16de8dc0/tzlocal-5.2.tar.gz", hash = "sha256:8d399205578f1a9342816409cc1e46a93ebd5755e39ea2d85334bea911bf0e6e", size = 30201 } wheels = [ diff --git a/sdk/harambe/core.py b/sdk/harambe/core.py index e4bd5fc..d6f8fd7 100644 --- a/sdk/harambe/core.py +++ b/sdk/harambe/core.py @@ -64,6 +64,10 @@ from harambe.contrib import WebHarness, playwright_harness +async def default_callback(url: str, status: int): + raise GotoError(url, status) + + class AsyncScraper(Protocol): """ Protocol that all classed based scrapers should implement. @@ -449,6 +453,7 @@ async def run( harness: WebHarness = playwright_harness, evaluator: Optional[ExpressionEvaluator] = None, observer: Optional[OutputObserver | List[OutputObserver]] = None, + callback: Callable[[str, int], Awaitable[None]] = default_callback, **harness_options: Unpack[HarnessOptions], ) -> "SDK": """ @@ -492,7 +497,7 @@ async def run( if not harness_options.get("disable_go_to_url", False): response = await page.goto(url) if response.status >= 400: - raise GotoError(url, response.status) + await callback(url, response.status) elif isinstance(page, SoupPage): page.url = url await scraper(sdk, url, context) diff --git a/sdk/pyproject.toml b/sdk/pyproject.toml index dd3b8c6..605909e 100644 --- a/sdk/pyproject.toml +++ b/sdk/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harambe-sdk" -version = "0.57.0" +version = "0.58.0" description = "Data extraction SDK for Playwright 🐒🍌" authors = [ { name = "Adam Watkins", email = "adam@reworkd.ai" } @@ -8,7 +8,7 @@ authors = [ requires-python = ">=3.11,<4.0" readme = "README.md" dependencies = [ - "harambe_core==0.57.0", + "harambe_core==0.58.0", "playwright==1.47.0", "beautifulsoup4==4.12.3", "requests==2.32.3", diff --git a/sdk/test/test_e2e.py b/sdk/test/test_e2e.py index 7623aa5..423855e 100644 --- a/sdk/test/test_e2e.py +++ b/sdk/test/test_e2e.py @@ -633,22 +633,49 @@ async def scrape(sdk: SDK, url, context) -> None: @pytest.mark.parametrize("harness", [playwright_harness, soup_harness]) -async def test_403_status_on_goto(server, observer, harness): +async def test_403_status_on_goto_with_default_callback(server, observer, harness): url = f"{server}/403" async def scrape(sdk: SDK, current_url, context) -> None: - await sdk.save_data( - {"key": "this should't be saved as we're throwing an exception"} - ) + await sdk.save_data({"key": "this shouldn't be saved if GotoError is raised"}) with pytest.raises(GotoError): await SDK.run( - scrape, - url, + scraper=scrape, + url=url, harness=harness, schema={}, context={"status": "Open"}, observer=observer, ) - assert len(observer.data) == 0 + + +@pytest.mark.parametrize("harness", [playwright_harness, soup_harness]) +@pytest.mark.parametrize("goto_error_cb", ["custom"]) +async def test_403_status_on_goto_with_custom_callback( + server, observer, harness, goto_error_cb +): + url = f"{server}/403" + + async def scrape(sdk: SDK, current_url, context) -> None: + await sdk.save_data({"key": "this shouldn't be saved if GotoError is raised"}) + + async def custom_error_handler(url, status_code): + print(f"Handled {status_code} for {url} gracefully.") + + error_callback = custom_error_handler + await SDK.run( + scraper=scrape, + url=url, + harness=harness, + schema={}, + context={"status": "Open"}, + observer=observer, + callback=error_callback, + ) + + # Ensure data is saved when error is handled (either with custom or no callback) + assert len(observer.data) == 1 + assert observer.data[0]["key"] == "this shouldn't be saved if GotoError is raised" + assert observer.data[0]["__url"] == url diff --git a/sdk/uv.lock b/sdk/uv.lock index fe43f07..ca23389 100644 --- a/sdk/uv.lock +++ b/sdk/uv.lock @@ -428,7 +428,7 @@ wheels = [ [[package]] name = "harambe-core" -version = "0.57.0" +version = "0.58.0" source = { editable = "../core" } dependencies = [ { name = "dateparser" }, @@ -459,7 +459,7 @@ dev = [ [[package]] name = "harambe-sdk" -version = "0.57.0" +version = "0.58.0" source = { virtual = "." } dependencies = [ { name = "aiohttp" }, @@ -1053,7 +1053,7 @@ name = "tzlocal" version = "5.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "tzdata", marker = "platform_system == 'Windows'" }, + { name = "tzdata", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/04/d3/c19d65ae67636fe63953b20c2e4a8ced4497ea232c43ff8d01db16de8dc0/tzlocal-5.2.tar.gz", hash = "sha256:8d399205578f1a9342816409cc1e46a93ebd5755e39ea2d85334bea911bf0e6e", size = 30201 } wheels = [