diff --git a/core/pyproject.toml b/core/pyproject.toml index a569221..0d3e83d 100644 --- a/core/pyproject.toml +++ b/core/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harambe-core" -version = "0.58.0" +version = "0.59.0" description = "Core types for harambe SDK 🐒🍌" authors = [ { name = "Adam Watkins", email = "adam@reworkd.ai" } diff --git a/core/uv.lock b/core/uv.lock index b494d06..a6193f5 100644 --- a/core/uv.lock +++ b/core/uv.lock @@ -115,7 +115,7 @@ wheels = [ [[package]] name = "harambe-core" -version = "0.58.0" +version = "0.59.0" source = { virtual = "." } dependencies = [ { name = "dateparser" }, diff --git a/sdk/harambe/handlers.py b/sdk/harambe/handlers.py index e85845d..976c68f 100644 --- a/sdk/harambe/handlers.py +++ b/sdk/harambe/handlers.py @@ -1,4 +1,5 @@ import re +import base64 from abc import ABC from typing import Any, Literal, Self @@ -21,6 +22,10 @@ "*", ] +FAKE_IMAGE_BYTES = base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=" +) + class AbstractHandler(ABC): async def handle(self, route: Route) -> None: @@ -74,13 +79,15 @@ def captured_url(self) -> str | None: return self._new_pages[0] if self._new_pages else None -class UnnecessaryResourceHandler(AbstractHandler): +class UnnecessaryResourceHandler: async def handle(self, route: Route) -> None: resource_type = route.request.resource_type url = route.request.url - - if ( - resource_type in ["image", "media", "font"] + if resource_type in ["image", "media"]: + await route.fulfill(body=FAKE_IMAGE_BYTES, content_type="image/png") + return + elif ( + resource_type == "font" or re.match(r"^data:(image|audio|video)", url) or re.match(r"social-widget|tracking-script|ads", url) ): diff --git a/sdk/pyproject.toml b/sdk/pyproject.toml index 03171f2..3bc8dd6 100644 --- a/sdk/pyproject.toml +++ b/sdk/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harambe-sdk" -version = "0.58.0" +version = "0.59.0" description = "Data extraction SDK for Playwright 🐒🍌" authors = [ { name = "Adam Watkins", email = "adam@reworkd.ai" } @@ -8,7 +8,7 @@ authors = [ requires-python = ">=3.11,<4.0" readme = "README.md" dependencies = [ - "harambe_core==0.58.0", + "harambe_core==0.59.0", "playwright==1.47.0", "beautifulsoup4==4.12.3", "requests==2.32.3", diff --git a/sdk/uv.lock b/sdk/uv.lock index 2bc07bd..67838ed 100644 --- a/sdk/uv.lock +++ b/sdk/uv.lock @@ -428,7 +428,7 @@ wheels = [ [[package]] name = "harambe-core" -version = "0.58.0" +version = "0.59.0" source = { editable = "../core" } dependencies = [ { name = "dateparser" }, @@ -459,7 +459,7 @@ dev = [ [[package]] name = "harambe-sdk" -version = "0.58.0" +version = "0.59.0" source = { virtual = "." } dependencies = [ { name = "aiohttp" },