Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
awtkns committed Dec 24, 2024
2 parents 076d269 + 523b436 commit fecb4fd
Show file tree
Hide file tree
Showing 8 changed files with 64 additions and 17 deletions.
2 changes: 1 addition & 1 deletion core/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "harambe-core"
version = "0.56.0"
version = "0.58.0"
description = "Core types for harambe SDK 🐒🍌"
authors = [
{ name = "Adam Watkins", email = "[email protected]" }
Expand Down
4 changes: 2 additions & 2 deletions core/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 15 additions & 1 deletion sdk/harambe/contrib/playwright/harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ async def playwright_harness(
browser_type: Optional[BrowserType] = None,
enable_clipboard: bool = False,
launch_args: Sequence[str] = (),
extensions: Sequence[str] = (),
**__: Any,
) -> AsyncGenerator[PageFactory, None]:
"""
Expand All @@ -47,13 +48,26 @@ async def playwright_harness(
creation of HAR file, and stealth.
"""
async with async_playwright() as p:
extension_args = []
browser_type = browser_type or "chromium"

if extensions and browser_type == "chromium":
extension_paths = ",".join(extensions)
extension_args.extend(
[
f"--disable-extensions-except={extension_paths}",
f"--load-extension={extension_paths}",
]
)

browser = await (
p.chromium.connect_over_cdp(endpoint_url=cdp_endpoint)
if cdp_endpoint
else getattr(p, cast(str, browser_type or "chromium")).launch(
else getattr(p, cast(str, browser_type)).launch(
headless=headless,
args=[
*launch_args,
*extension_args,
*(
# Disables navigator.webdriver showing up
["--disable-blink-features=AutomationControlled"]
Expand Down
7 changes: 6 additions & 1 deletion sdk/harambe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@
from harambe.contrib import WebHarness, playwright_harness


async def default_callback(url: str, status: int):
    """
    Default ``callback`` used by ``run``: report the failed navigation by raising.

    :param url: the URL that was requested
    :param status: the HTTP status code of the response
    :raises GotoError: always, constructed from ``url`` and ``status``
    """
    goto_error = GotoError(url, status)
    raise goto_error


class AsyncScraper(Protocol):
"""
Protocol that all class-based scrapers should implement.
Expand Down Expand Up @@ -449,6 +453,7 @@ async def run(
harness: WebHarness = playwright_harness,
evaluator: Optional[ExpressionEvaluator] = None,
observer: Optional[OutputObserver | List[OutputObserver]] = None,
callback: Callable[[str, int], Awaitable[None]] = default_callback,
**harness_options: Unpack[HarnessOptions],
) -> "SDK":
"""
Expand Down Expand Up @@ -492,7 +497,7 @@ async def run(
if not harness_options.get("disable_go_to_url", False):
response = await page.goto(url)
if response.status >= 400:
raise GotoError(url, response.status)
await callback(url, response.status)
elif isinstance(page, SoupPage):
page.url = url
await scraper(sdk, url, context)
Expand Down
1 change: 1 addition & 0 deletions sdk/harambe/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,4 @@ class HarnessOptions(TypedDict, total=False):
disable_go_to_url: bool
on_start: Optional[Callback]
on_end: Optional[Callback]
extensions: Sequence[str]
4 changes: 2 additions & 2 deletions sdk/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
[project]
name = "harambe-sdk"
version = "0.56.0"
version = "0.58.0"
description = "Data extraction SDK for Playwright 🐒🍌"
authors = [
{ name = "Adam Watkins", email = "[email protected]" }
]
requires-python = ">=3.11,<4.0"
readme = "README.md"
dependencies = [
"harambe_core==0.56.0",
"harambe_core==0.58.0",
"playwright==1.47.0",
"beautifulsoup4==4.12.3",
"requests==2.32.3",
Expand Down
41 changes: 34 additions & 7 deletions sdk/test/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,22 +633,49 @@ async def scrape(sdk: SDK, url, context) -> None:


@pytest.mark.parametrize("harness", [playwright_harness, soup_harness])
async def test_403_status_on_goto(server, observer, harness):
async def test_403_status_on_goto_with_default_callback(server, observer, harness):
url = f"{server}/403"

async def scrape(sdk: SDK, current_url, context) -> None:
await sdk.save_data(
{"key": "this should't be saved as we're throwing an exception"}
)
await sdk.save_data({"key": "this shouldn't be saved if GotoError is raised"})

with pytest.raises(GotoError):
await SDK.run(
scrape,
url,
scraper=scrape,
url=url,
harness=harness,
schema={},
context={"status": "Open"},
observer=observer,
)

assert len(observer.data) == 0


@pytest.mark.parametrize("harness", [playwright_harness, soup_harness])
@pytest.mark.parametrize("goto_error_cb", ["custom"])
async def test_403_status_on_goto_with_custom_callback(
    server, observer, harness, goto_error_cb
):
    """
    A 403 response handled by a non-raising custom callback must not abort the run.

    The scraper is expected to execute afterwards and its single row to reach
    the observer.
    """
    saved_value = "this shouldn't be saved if GotoError is raised"
    url = f"{server}/403"

    async def custom_error_handler(url, status_code):
        # Swallow the failure: log it and return instead of raising.
        print(f"Handled {status_code} for {url} gracefully.")

    async def scrape(sdk: SDK, current_url, context) -> None:
        await sdk.save_data({"key": saved_value})

    await SDK.run(
        scraper=scrape,
        url=url,
        callback=custom_error_handler,
        harness=harness,
        observer=observer,
        schema={},
        context={"status": "Open"},
    )

    # The custom callback did not raise, so exactly one row should have been
    # recorded for this URL.
    rows = observer.data
    assert len(rows) == 1
    only_row = rows[0]
    assert only_row["key"] == saved_value
    assert only_row["__url"] == url
6 changes: 3 additions & 3 deletions sdk/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit fecb4fd

Please sign in to comment.