Skip to content

Commit

Permalink
Merge pull request #109 from reworkd/goto-callback-handler
Browse files Browse the repository at this point in the history
⚠️ Goto Error Handler
  • Loading branch information
BilalG1 authored Jan 17, 2025
2 parents 9b1cdbc + 6bab7e2 commit 8566526
Show file tree
Hide file tree
Showing 9 changed files with 34 additions and 30 deletions.
4 changes: 4 additions & 0 deletions core/harambe_core/errors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from typing import Any


async def default_error_callback(url: str, status: int, *args: Any) -> None:
    """Default ``goto`` error handler: fail by raising ``GotoError``.

    Args:
        url: The URL passed to ``page.goto``.
        status: The status code of the response.
        *args: Any extra positional arguments supplied by the caller (the
            SDK passes the response headers); accepted and ignored so that
            handlers written for either calling convention keep working.

    Raises:
        GotoError: Always, constructed from ``url`` and ``status``.
    """
    raise GotoError(url, status)


class HarambeException(Exception):
"""Base exception for all custom exceptions in Harambe."""

Expand Down
2 changes: 1 addition & 1 deletion core/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "harambe-core"
version = "0.59.2"
version = "0.59.3"
description = "Core types for harambe SDK 🐒🍌"
authors = [
{ name = "Adam Watkins", email = "[email protected]" }
Expand Down
6 changes: 3 additions & 3 deletions core/test/parser/test_type_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ def assert_is_iso_format(date_string):
)
def test_pydantic_type_date_validate_type_success(date_string):
parsed_date = ParserTypeDate.validate_type(date_string)
assert isinstance(
parsed_date, str
), f"Expected string for '{date_string}', got {parsed_date}"
assert isinstance(parsed_date, str), (
f"Expected string for '{date_string}', got {parsed_date}"
)
assert_is_iso_format(parsed_date)


Expand Down
1 change: 1 addition & 0 deletions sdk/harambe/contrib/soup/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ async def goto(self, url: str, **kwargs: Any) -> ResponseWithStatus:

class SoupResponseWithStatus:
status: int = res.status_code
headers: dict[str, str] = res.headers

return SoupResponseWithStatus()

Expand Down
1 change: 1 addition & 0 deletions sdk/harambe/contrib/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class ResponseWithStatus(Protocol):
"""Protocol for goto responses across all harnesses. Use minimal attributes required for current use cases."""

status: int
headers: dict[str, str]


class AbstractPage(Selectable[T], abc.ABC):
Expand Down
12 changes: 5 additions & 7 deletions sdk/harambe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
LocalStorage,
)
from harambe_core import SchemaParser, Schema
from harambe_core.errors import GotoError
from harambe_core.errors import default_error_callback
from harambe_core.normalize_url import normalize_url
from harambe_core.parser.expression import ExpressionEvaluator
from playwright.async_api import (
Expand All @@ -64,10 +64,6 @@
from harambe.contrib import WebHarness, playwright_harness


async def default_callback(url: str, status: int) -> None:
    """Raise ``GotoError`` for the given URL and status code.

    Args:
        url: The URL passed to ``page.goto``.
        status: The status code of the response.

    Raises:
        GotoError: Always, constructed from ``url`` and ``status``.
    """
    raise GotoError(url, status)


class AsyncScraper(Protocol):
"""
Protocol that all class-based scrapers should implement.
Expand Down Expand Up @@ -453,7 +449,9 @@ async def run(
harness: WebHarness = playwright_harness,
evaluator: Optional[ExpressionEvaluator] = None,
observer: Optional[OutputObserver | List[OutputObserver]] = None,
callback: Callable[[str, int], Awaitable[None]] = default_callback,
goto_error_handler: Callable[
[str, int, dict[str, str]], Awaitable[None]
] = default_error_callback,
**harness_options: Unpack[HarnessOptions],
) -> "SDK":
"""
Expand Down Expand Up @@ -497,7 +495,7 @@ async def run(
if not harness_options.get("disable_go_to_url", False):
response = await page.goto(url)
if response.status >= 400:
await callback(url, response.status)
await goto_error_handler(url, response.status, response.headers)
elif isinstance(page, SoupPage):
page.url = url
await scraper(sdk, url, context)
Expand Down
4 changes: 2 additions & 2 deletions sdk/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
[project]
name = "harambe-sdk"
version = "0.59.2"
version = "0.59.3"
description = "Data extraction SDK for Playwright 🐒🍌"
authors = [
{ name = "Adam Watkins", email = "[email protected]" }
]
requires-python = ">=3.11,<4.0"
readme = "README.md"
dependencies = [
"harambe_core==0.59.2",
"harambe_core==0.59.3",
"playwright==1.47.0",
"beautifulsoup4==4.12.3",
"requests==2.32.3",
Expand Down
16 changes: 8 additions & 8 deletions sdk/test/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,12 +279,12 @@ async def scraper(sdk: SDK, *args, **kwargs):

assert observer.data[0]["page_content"] == observer.data[1]["table_content"]
for text in ["Apple", "Orange", "Banana"]:
assert (
text in observer.data[0]["page_content"]
), f"{text} not in {observer.data[0]['page_content']}"
assert (
text in observer.data[1]["table_content"]
), f"{text} not in {observer.data[1]['table_content']}"
assert text in observer.data[0]["page_content"], (
f"{text} not in {observer.data[0]['page_content']}"
)
assert text in observer.data[1]["table_content"], (
f"{text} not in {observer.data[1]['table_content']}"
)


@pytest.mark.parametrize("harness", [soup_harness])
Expand Down Expand Up @@ -687,7 +687,7 @@ async def test_403_status_on_goto_with_custom_callback(
async def scrape(sdk: SDK, current_url, context) -> None:
await sdk.save_data({"key": "this shouldn't be saved if GotoError is raised"})

async def custom_error_handler(url, status_code):
async def custom_error_handler(url, status_code, *args):
print(f"Handled {status_code} for {url} gracefully.")

error_callback = custom_error_handler
Expand All @@ -698,7 +698,7 @@ async def custom_error_handler(url, status_code):
schema={},
context={"status": "Open"},
observer=observer,
callback=error_callback,
goto_error_handler=error_callback,
)

# Ensure data is saved when error is handled (either with custom or no callback)
Expand Down
18 changes: 9 additions & 9 deletions sdk/test/test_stealth.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ async def test_navigator_webdriver(async_page):
@pytest.mark.asyncio
async def test_user_agent(async_page):
user_agent = await async_page.evaluate("navigator.userAgent")
assert (
"headless" not in user_agent.lower()
), "User agent should not contain 'headless'"
assert "headless" not in user_agent.lower(), (
"User agent should not contain 'headless'"
)


@pytest.mark.asyncio
Expand All @@ -56,9 +56,9 @@ async def test_plugins(async_page):
@pytest.mark.asyncio
async def test_app_version(async_page):
app_version = await async_page.evaluate("navigator.appVersion")
assert (
"headless" not in app_version.lower()
), "App version should not contain 'headless'"
assert "headless" not in app_version.lower(), (
"App version should not contain 'headless'"
)


@pytest.mark.asyncio
Expand Down Expand Up @@ -88,6 +88,6 @@ async def test_connection_rtt(async_page):
}
""")

assert (
connection_rtt is not None and connection_rtt != 0
), "Connection RTT should exist and not be zero in non-headless browsers"
assert connection_rtt is not None and connection_rtt != 0, (
"Connection RTT should exist and not be zero in non-headless browsers"
)

0 comments on commit 8566526

Please sign in to comment.