diff --git a/core/pyproject.toml b/core/pyproject.toml index 1cef0a9..6e9696d 100644 --- a/core/pyproject.toml +++ b/core/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harambe-core" -version = "0.57.0" +version = "0.58.0" description = "Core types for harambe SDK 🐒🍌" authors = [ { name = "Adam Watkins", email = "adam@reworkd.ai" } diff --git a/core/uv.lock b/core/uv.lock index 76b8fdb..4eb3c7e 100644 --- a/core/uv.lock +++ b/core/uv.lock @@ -115,7 +115,7 @@ wheels = [ [[package]] name = "harambe-core" -version = "0.57.0" +version = "0.58.0" source = { virtual = "." } dependencies = [ { name = "dateparser" }, @@ -502,7 +502,7 @@ name = "tzlocal" version = "5.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "tzdata", marker = "platform_system == 'Windows'" }, + { name = "tzdata", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/04/d3/c19d65ae67636fe63953b20c2e4a8ced4497ea232c43ff8d01db16de8dc0/tzlocal-5.2.tar.gz", hash = "sha256:8d399205578f1a9342816409cc1e46a93ebd5755e39ea2d85334bea911bf0e6e", size = 30201 } wheels = [ diff --git a/sdk/harambe/core.py b/sdk/harambe/core.py index e4bd5fc..d6f8fd7 100644 --- a/sdk/harambe/core.py +++ b/sdk/harambe/core.py @@ -64,6 +64,10 @@ from harambe.contrib import WebHarness, playwright_harness +async def default_callback(url: str, status: int): + raise GotoError(url, status) + + class AsyncScraper(Protocol): """ Protocol that all classed based scrapers should implement. 
@@ -449,6 +453,7 @@ async def run( harness: WebHarness = playwright_harness, evaluator: Optional[ExpressionEvaluator] = None, observer: Optional[OutputObserver | List[OutputObserver]] = None, + callback: Callable[[str, int], Awaitable[None]] = default_callback, **harness_options: Unpack[HarnessOptions], ) -> "SDK": """ @@ -492,7 +497,7 @@ async def run( if not harness_options.get("disable_go_to_url", False): response = await page.goto(url) if response.status >= 400: - raise GotoError(url, response.status) + await callback(url, response.status) elif isinstance(page, SoupPage): page.url = url await scraper(sdk, url, context) diff --git a/sdk/pyproject.toml b/sdk/pyproject.toml index dd3b8c6..605909e 100644 --- a/sdk/pyproject.toml +++ b/sdk/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harambe-sdk" -version = "0.57.0" +version = "0.58.0" description = "Data extraction SDK for Playwright 🐒🍌" authors = [ { name = "Adam Watkins", email = "adam@reworkd.ai" } @@ -8,7 +8,7 @@ authors = [ requires-python = ">=3.11,<4.0" readme = "README.md" dependencies = [ - "harambe_core==0.57.0", + "harambe_core==0.58.0", "playwright==1.47.0", "beautifulsoup4==4.12.3", "requests==2.32.3", diff --git a/sdk/test/test_e2e.py b/sdk/test/test_e2e.py index 7623aa5..423855e 100644 --- a/sdk/test/test_e2e.py +++ b/sdk/test/test_e2e.py @@ -633,22 +633,49 @@ async def scrape(sdk: SDK, url, context) -> None: @pytest.mark.parametrize("harness", [playwright_harness, soup_harness]) -async def test_403_status_on_goto(server, observer, harness): +async def test_403_status_on_goto_with_default_callback(server, observer, harness): url = f"{server}/403" async def scrape(sdk: SDK, current_url, context) -> None: - await sdk.save_data( - {"key": "this should't be saved as we're throwing an exception"} - ) + await sdk.save_data({"key": "this shouldn't be saved if GotoError is raised"}) with pytest.raises(GotoError): await SDK.run( - scrape, - url, + scraper=scrape, + url=url, harness=harness, schema={}, 
context={"status": "Open"}, observer=observer, ) - assert len(observer.data) == 0 + + +@pytest.mark.parametrize("harness", [playwright_harness, soup_harness]) +@pytest.mark.parametrize("goto_error_cb", ["custom"]) +async def test_403_status_on_goto_with_custom_callback( + server, observer, harness, goto_error_cb +): + url = f"{server}/403" + + async def scrape(sdk: SDK, current_url, context) -> None: + await sdk.save_data({"key": "this shouldn't be saved if GotoError is raised"}) + + async def custom_error_handler(url, status_code): + print(f"Handled {status_code} for {url} gracefully.") + + error_callback = custom_error_handler + await SDK.run( + scraper=scrape, + url=url, + harness=harness, + schema={}, + context={"status": "Open"}, + observer=observer, + callback=error_callback, + ) + + # Ensure data is saved when a custom callback handles the error without raising + assert len(observer.data) == 1 + assert observer.data[0]["key"] == "this shouldn't be saved if GotoError is raised" + assert observer.data[0]["__url"] == url diff --git a/sdk/uv.lock b/sdk/uv.lock index fe43f07..ca23389 100644 --- a/sdk/uv.lock +++ b/sdk/uv.lock @@ -428,7 +428,7 @@ wheels = [ [[package]] name = "harambe-core" -version = "0.57.0" +version = "0.58.0" source = { editable = "../core" } dependencies = [ { name = "dateparser" }, @@ -459,7 +459,7 @@ dev = [ [[package]] name = "harambe-sdk" -version = "0.57.0" +version = "0.58.0" source = { virtual = "." } dependencies = [ { name = "aiohttp" }, @@ -1053,7 +1053,7 @@ name = "tzlocal" version = "5.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "tzdata", marker = "platform_system == 'Windows'" }, + { name = "tzdata", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/04/d3/c19d65ae67636fe63953b20c2e4a8ced4497ea232c43ff8d01db16de8dc0/tzlocal-5.2.tar.gz", hash = "sha256:8d399205578f1a9342816409cc1e46a93ebd5755e39ea2d85334bea911bf0e6e", size = 30201 } wheels = [