From 0f49a63a2d9e4c19435273b22fb3afbb8fdff6a6 Mon Sep 17 00:00:00 2001 From: kunwarsodhi Date: Tue, 31 Dec 2024 12:56:46 -0800 Subject: [PATCH 1/3] Added fake image response --- core/pyproject.toml | 2 +- core/uv.lock | 2 +- sdk/harambe/handlers.py | 12 +++++++++--- sdk/pyproject.toml | 4 ++-- sdk/uv.lock | 4 ++-- 5 files changed, 15 insertions(+), 9 deletions(-) diff --git a/core/pyproject.toml b/core/pyproject.toml index a569221..0d3e83d 100644 --- a/core/pyproject.toml +++ b/core/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harambe-core" -version = "0.58.0" +version = "0.59.0" description = "Core types for harambe SDK 🐒🍌" authors = [ { name = "Adam Watkins", email = "adam@reworkd.ai" } diff --git a/core/uv.lock b/core/uv.lock index b494d06..a6193f5 100644 --- a/core/uv.lock +++ b/core/uv.lock @@ -115,7 +115,7 @@ wheels = [ [[package]] name = "harambe-core" -version = "0.58.0" +version = "0.59.0" source = { virtual = "." } dependencies = [ { name = "dateparser" }, diff --git a/sdk/harambe/handlers.py b/sdk/harambe/handlers.py index e85845d..f58ac3d 100644 --- a/sdk/harambe/handlers.py +++ b/sdk/harambe/handlers.py @@ -1,4 +1,5 @@ import re +import base64 from abc import ABC from typing import Any, Literal, Self @@ -78,9 +79,14 @@ class UnnecessaryResourceHandler(AbstractHandler): async def handle(self, route: Route) -> None: resource_type = route.request.resource_type url = route.request.url - - if ( - resource_type in ["image", "media", "font"] + if resource_type in ["image", "media"]: + fake_img = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=" + await route.fulfill( + body=base64.b64decode(fake_img), content_type="image/png" + ) + return + elif ( + resource_type == "font" or re.match(r"^data:(image|audio|video)", url) or re.match(r"social-widget|tracking-script|ads", url) ): diff --git a/sdk/pyproject.toml b/sdk/pyproject.toml index 03171f2..3bc8dd6 100644 --- a/sdk/pyproject.toml +++ b/sdk/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harambe-sdk" -version = "0.58.0" +version = "0.59.0" description = "Data extraction SDK for Playwright 🐒🍌" authors = [ { name = "Adam Watkins", email = "adam@reworkd.ai" } @@ -8,7 +8,7 @@ authors = [ requires-python = ">=3.11,<4.0" readme = "README.md" dependencies = [ - "harambe_core==0.58.0", + "harambe_core==0.59.0", "playwright==1.47.0", "beautifulsoup4==4.12.3", "requests==2.32.3", diff --git a/sdk/uv.lock b/sdk/uv.lock index 2bc07bd..67838ed 100644 --- a/sdk/uv.lock +++ b/sdk/uv.lock @@ -428,7 +428,7 @@ wheels = [ [[package]] name = "harambe-core" -version = "0.58.0" +version = "0.59.0" source = { editable = "../core" } dependencies = [ { name = "dateparser" }, @@ -459,7 +459,7 @@ dev = [ [[package]] name = "harambe-sdk" -version = "0.58.0" +version = "0.59.0" source = { virtual = "." } dependencies = [ { name = "aiohttp" }, From 46791c4b81f88423d78e49a8f60d4b4be60ab941 Mon Sep 17 00:00:00 2001 From: kunwarsodhi Date: Thu, 2 Jan 2025 09:32:35 -0800 Subject: [PATCH 2/3] moved to constant --- sdk/harambe/handlers.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sdk/harambe/handlers.py b/sdk/harambe/handlers.py index f58ac3d..6ff91e1 100644 --- a/sdk/harambe/handlers.py +++ b/sdk/harambe/handlers.py @@ -22,6 +22,9 @@ "*", ] +FAKE_IMAGE_BYTES = base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=" +) class AbstractHandler(ABC): async def handle(self, route: Route) -> None: @@ -75,15 +78,12 @@ def captured_url(self) -> str | None: return self._new_pages[0] if self._new_pages else None -class UnnecessaryResourceHandler(AbstractHandler): +class UnnecessaryResourceHandler: async def handle(self, route: Route) -> None: resource_type = route.request.resource_type url = route.request.url if resource_type in ["image", "media"]: - fake_img = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=" - await route.fulfill( - body=base64.b64decode(fake_img), content_type="image/png" - ) + await route.fulfill( body=FAKE_IMAGE_BYTES, content_type="image/png" ) return elif ( resource_type == "font" From be9b650b3463887e99c541373e35946b8683e841 Mon Sep 17 00:00:00 2001 From: kunwarsodhi Date: Thu, 2 Jan 2025 09:35:05 -0800 Subject: [PATCH 3/3] Fixed formatting --- sdk/harambe/handlers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdk/harambe/handlers.py b/sdk/harambe/handlers.py index 6ff91e1..976c68f 100644 --- a/sdk/harambe/handlers.py +++ b/sdk/harambe/handlers.py @@ -26,6 +26,7 @@ "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=" ) + class AbstractHandler(ABC): async def handle(self, route: Route) -> None: raise NotImplementedError @@ -83,7 +84,7 @@ async def handle(self, route: Route) -> None: resource_type = route.request.resource_type url = route.request.url if resource_type in ["image", "media"]: - await route.fulfill( body=FAKE_IMAGE_BYTES, content_type="image/png" ) + await route.fulfill(body=FAKE_IMAGE_BYTES, content_type="image/png") return elif ( resource_type == "font"