From 5dab8ca2ae4394124919444f632b47a656940bb5 Mon Sep 17 00:00:00 2001 From: mohamedmamdouh22 Date: Wed, 22 Jan 2025 20:38:05 +0200 Subject: [PATCH 1/7] Refactor SoupPage's post method to handle different content types --- sdk/harambe/contrib/soup/impl.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/sdk/harambe/contrib/soup/impl.py b/sdk/harambe/contrib/soup/impl.py index 5a3b37b..cc1b776 100644 --- a/sdk/harambe/contrib/soup/impl.py +++ b/sdk/harambe/contrib/soup/impl.py @@ -10,6 +10,12 @@ from harambe.contrib.types import ResponseWithStatus +class SoupResponseWithStatus: + def __init__(self, status: int, body: Any): + self.status = status + self.body = body + + class SoupElementHandle(AbstractElementHandle, Selectable["SoupElementHandle"]): def __init__(self, tag: Tag) -> None: self._tag = tag @@ -92,13 +98,22 @@ async def post( self, url: str, data: dict[str, Any], + params: Optional[dict[str, Any]] = None, headers: Optional[HeaderTypes] = None, **kwargs: Any, ) -> Any: + content_type = ( + (headers or self._extra_headers or {}).get("Content-Type", "").lower() + ) + + processed_data = ( + json.dumps(data) if "application/json" in content_type else data + ) res = await self._session.post( url, headers=headers or self._extra_headers, - data=json.dumps(data), + data=processed_data, + params=params or {}, impersonate="chrome", **kwargs, ) @@ -107,22 +122,12 @@ async def post( self._url = res.url content_type = res.headers.get("Content-Type", "") - if "application/json" in content_type: - - class SoupResponseWithStatus: - status: int = res.status_code - body: dict[str, Any] = res.json() - - return SoupResponseWithStatus() + return SoupResponseWithStatus(res.status_code, res.text) self._soup = BeautifulSoup(res.text, "html.parser") - class SoupResponseWithStatus: - status: int = res.status_code - body: str = res.text - - return SoupResponseWithStatus() + return SoupResponseWithStatus(res.status_code, res.text) async def query_selector_all(self, selector: str) -> list[SoupElementHandle]: return SoupElementHandle.from_tags(self._soup.select(selector)) From 714ef22fbf25870007a71cafe51c63dd9b1e28a7 Mon Sep 17 00:00:00 2001 From: mohamedmamdouh22 Date: Thu, 23 Jan 2025 02:09:40 +0200 Subject: [PATCH 2/7] Bump version to 0.59.6 in pyproject.toml files --- core/pyproject.toml | 2 +- core/uv.lock | 2 +- sdk/pyproject.toml | 4 ++-- sdk/uv.lock | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/core/pyproject.toml b/core/pyproject.toml index d3a0df4..93c60e3 100644 --- a/core/pyproject.toml +++ b/core/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harambe-core" -version = "0.59.5" +version = "0.59.6" description = "Core types for harambe SDK 🐒🍌" authors = [ { name = "Adam Watkins", email = "adam@reworkd.ai" } diff --git a/core/uv.lock b/core/uv.lock index 6b8c419..232b1a5 100644 --- a/core/uv.lock +++ b/core/uv.lock @@ -115,7 +115,7 @@ wheels = [ [[package]] name = "harambe-core" -version = "0.59.5" +version = "0.59.6" source = { virtual = "." } dependencies = [ { name = "dateparser" }, diff --git a/sdk/pyproject.toml b/sdk/pyproject.toml index 8e61063..d1e3982 100644 --- a/sdk/pyproject.toml +++ b/sdk/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "harambe-sdk" -version = "0.59.5" +version = "0.59.6" description = "Data extraction SDK for Playwright 🐒🍌" authors = [ { name = "Adam Watkins", email = "adam@reworkd.ai" } @@ -8,7 +8,7 @@ authors = [ requires-python = ">=3.11,<4.0" readme = "README.md" dependencies = [ - "harambe_core==0.59.5", + "harambe_core==0.59.6", "playwright==1.47.0", "beautifulsoup4==4.12.3", "requests==2.32.3", diff --git a/sdk/uv.lock b/sdk/uv.lock index ebbac89..a1f9ef0 100644 --- a/sdk/uv.lock +++ b/sdk/uv.lock @@ -428,7 +428,7 @@ wheels = [ [[package]] name = "harambe-core" -version = "0.59.5" +version = "0.59.6" source = { editable = "../core" } dependencies = [ { name = "dateparser" }, @@ -459,7 +459,7 @@ dev = [ [[package]] name = "harambe-sdk" -version = "0.59.5" +version = "0.59.6" source = { virtual = "." } dependencies = [ { name = "aiohttp" }, From 207cbe6d4f7cf606ba1ca53858bfa5103558939d Mon Sep 17 00:00:00 2001 From: mohamedmamdouh22 Date: Thu, 23 Jan 2025 02:42:30 +0200 Subject: [PATCH 3/7] Refactor SoupPage's post method to return JSON response if content type is application/json --- sdk/harambe/contrib/soup/impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/harambe/contrib/soup/impl.py b/sdk/harambe/contrib/soup/impl.py index cc1b776..dfa95de 100644 --- a/sdk/harambe/contrib/soup/impl.py +++ b/sdk/harambe/contrib/soup/impl.py @@ -123,7 +123,7 @@ async def post( self._url = res.url content_type = res.headers.get("Content-Type", "") if "application/json" in content_type: - return SoupResponseWithStatus(res.status_code, res.text) + return SoupResponseWithStatus(res.status_code, res.json()) self._soup = BeautifulSoup(res.text, "html.parser") From c8d325cff6e42654bdde82304c81ac4efc5f0ac8 Mon Sep 17 00:00:00 2001 From: mohamedmamdouh22 Date: Thu, 23 Jan 2025 23:47:02 +0200 Subject: [PATCH 4/7] Refactor SoupPage's post method to handle different content types --- sdk/harambe/contrib/soup/impl.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/sdk/harambe/contrib/soup/impl.py b/sdk/harambe/contrib/soup/impl.py index dfa95de..9399d2a 100644 --- a/sdk/harambe/contrib/soup/impl.py +++ b/sdk/harambe/contrib/soup/impl.py @@ -4,18 +4,12 @@ import json # noinspection PyProtectedMember -from curl_cffi.requests import AsyncSession, HeaderTypes +from curl_cffi.requests import AsyncSession, HeaderTypes, Response from harambe.contrib.soup.tracing import Tracer from harambe.contrib.types import AbstractElementHandle, AbstractPage, Selectable from harambe.contrib.types import ResponseWithStatus -class SoupResponseWithStatus: - def __init__(self, status: int, body: Any): - self.status = status - self.body = body - - class SoupElementHandle(AbstractElementHandle, Selectable["SoupElementHandle"]): def __init__(self, tag: Tag) -> None: self._tag = tag @@ -101,7 +95,7 @@ async def post( params: Optional[dict[str, Any]] = None, headers: Optional[HeaderTypes] = None, **kwargs: Any, - ) -> Any: + ) -> Response: content_type = ( (headers or self._extra_headers or {}).get("Content-Type", "").lower() ) @@ -117,17 +111,19 @@ async def post( impersonate="chrome", **kwargs, ) + res.raise_for_status() + if self._tracer: self._tracer.log_request(res) self._url = res.url content_type = res.headers.get("Content-Type", "") if "application/json" in content_type: - return SoupResponseWithStatus(res.status_code, res.json()) + return res self._soup = BeautifulSoup(res.text, "html.parser") - return SoupResponseWithStatus(res.status_code, res.text) + return res async def query_selector_all(self, selector: str) -> list[SoupElementHandle]: return SoupElementHandle.from_tags(self._soup.select(selector)) From ed7e4748cbf422a8e86a3b9585475c87a44d0137 Mon Sep 17 00:00:00 2001 From: = <=> Date: Sat, 1 Feb 2025 01:38:06 +0200 Subject: [PATCH 5/7] Refactor assertions in tests for improved readability --- core/test/parser/test_type_date.py | 6 +++--- sdk/test/test_e2e.py | 12 ++++++------ sdk/test/test_stealth.py | 18 +++++++++--------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/core/test/parser/test_type_date.py b/core/test/parser/test_type_date.py index 54a3aad..4231605 100644 --- a/core/test/parser/test_type_date.py +++ b/core/test/parser/test_type_date.py @@ -31,9 +31,9 @@ def assert_is_iso_format(date_string): ) def test_pydantic_type_date_validate_type_success(date_string): parsed_date = ParserTypeDate.validate_type(date_string) - assert isinstance(parsed_date, str), ( - f"Expected string for '{date_string}', got {parsed_date}" - ) + assert isinstance( + parsed_date, str + ), f"Expected string for '{date_string}', got {parsed_date}" assert_is_iso_format(parsed_date) diff --git a/sdk/test/test_e2e.py b/sdk/test/test_e2e.py index dde486e..f0263a9 100644 --- a/sdk/test/test_e2e.py +++ b/sdk/test/test_e2e.py @@ -279,12 +279,12 @@ async def scraper(sdk: SDK, *args, **kwargs): assert observer.data[0]["page_content"] == observer.data[1]["table_content"] for text in ["Apple", "Orange", "Banana"]: - assert text in observer.data[0]["page_content"], ( - f"{text} not in {observer.data[0]['page_content']}" - ) - assert text in observer.data[1]["table_content"], ( - f"{text} not in {observer.data[1]['table_content']}" - ) + assert ( + text in observer.data[0]["page_content"] + ), f"{text} not in {observer.data[0]['page_content']}" + assert ( + text in observer.data[1]["table_content"] + ), f"{text} not in {observer.data[1]['table_content']}" @pytest.mark.parametrize("harness", [soup_harness]) diff --git a/sdk/test/test_stealth.py b/sdk/test/test_stealth.py index 026f411..1cf6c49 100644 --- a/sdk/test/test_stealth.py +++ b/sdk/test/test_stealth.py @@ -36,9 +36,9 @@ async def test_navigator_webdriver(async_page): @pytest.mark.asyncio async def test_user_agent(async_page): user_agent = await async_page.evaluate("navigator.userAgent") - assert "headless" not in user_agent.lower(), ( - "User agent should not contain 'headless'" - ) + assert ( + "headless" not in user_agent.lower() + ), "User agent should not contain 'headless'" @pytest.mark.asyncio @@ -56,9 +56,9 @@ async def test_plugins(async_page): @pytest.mark.asyncio async def test_app_version(async_page): app_version = await async_page.evaluate("navigator.appVersion") - assert "headless" not in app_version.lower(), ( - "App version should not contain 'headless'" - ) + assert ( + "headless" not in app_version.lower() + ), "App version should not contain 'headless'" @pytest.mark.asyncio @@ -88,6 +88,6 @@ async def test_connection_rtt(async_page): } """) - assert connection_rtt is not None and connection_rtt != 0, ( - "Connection RTT should exist and not be zero in non-headless browsers" - ) + assert ( + connection_rtt is not None and connection_rtt != 0 + ), "Connection RTT should exist and not be zero in non-headless browsers" From 0e6a865044e1d772bd41ec461178484a1755a08e Mon Sep 17 00:00:00 2001 From: = <=> Date: Sat, 1 Feb 2025 01:41:42 +0200 Subject: [PATCH 6/7] Reformat --- sdk/test/test_e2e.py | 12 ++++++------ sdk/test/test_stealth.py | 18 +++++++++--------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/sdk/test/test_e2e.py b/sdk/test/test_e2e.py index f0263a9..dde486e 100644 --- a/sdk/test/test_e2e.py +++ b/sdk/test/test_e2e.py @@ -279,12 +279,12 @@ async def scraper(sdk: SDK, *args, **kwargs): assert observer.data[0]["page_content"] == observer.data[1]["table_content"] for text in ["Apple", "Orange", "Banana"]: - assert ( - text in observer.data[0]["page_content"] - ), f"{text} not in {observer.data[0]['page_content']}" - assert ( - text in observer.data[1]["table_content"] - ), f"{text} not in {observer.data[1]['table_content']}" + assert text in observer.data[0]["page_content"], ( + f"{text} not in {observer.data[0]['page_content']}" + ) + assert text in observer.data[1]["table_content"], ( + f"{text} not in {observer.data[1]['table_content']}" + ) @pytest.mark.parametrize("harness", [soup_harness]) diff --git a/sdk/test/test_stealth.py b/sdk/test/test_stealth.py index 1cf6c49..026f411 100644 --- a/sdk/test/test_stealth.py +++ b/sdk/test/test_stealth.py @@ -36,9 +36,9 @@ async def test_navigator_webdriver(async_page): @pytest.mark.asyncio async def test_user_agent(async_page): user_agent = await async_page.evaluate("navigator.userAgent") - assert ( - "headless" not in user_agent.lower() - ), "User agent should not contain 'headless'" + assert "headless" not in user_agent.lower(), ( + "User agent should not contain 'headless'" + ) @pytest.mark.asyncio @@ -56,9 +56,9 @@ async def test_plugins(async_page): @pytest.mark.asyncio async def test_app_version(async_page): app_version = await async_page.evaluate("navigator.appVersion") - assert ( - "headless" not in app_version.lower() - ), "App version should not contain 'headless'" + assert "headless" not in app_version.lower(), ( + "App version should not contain 'headless'" + ) @pytest.mark.asyncio @@ -88,6 +88,6 @@ async def test_connection_rtt(async_page): } """) - assert ( - connection_rtt is not None and connection_rtt != 0 - ), "Connection RTT should exist and not be zero in non-headless browsers" + assert connection_rtt is not None and connection_rtt != 0, ( + "Connection RTT should exist and not be zero in non-headless browsers" + ) From 3e1fc1f1c4f25f1382f87e16589637c1bb2def56 Mon Sep 17 00:00:00 2001 From: = <=> Date: Sat, 1 Feb 2025 01:42:55 +0200 Subject: [PATCH 7/7] Reformat --- core/test/parser/test_type_date.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/test/parser/test_type_date.py b/core/test/parser/test_type_date.py index 4231605..54a3aad 100644 --- a/core/test/parser/test_type_date.py +++ b/core/test/parser/test_type_date.py @@ -31,9 +31,9 @@ def assert_is_iso_format(date_string): ) def test_pydantic_type_date_validate_type_success(date_string): parsed_date = ParserTypeDate.validate_type(date_string) - assert isinstance( - parsed_date, str - ), f"Expected string for '{date_string}', got {parsed_date}" + assert isinstance(parsed_date, str), ( + f"Expected string for '{date_string}', got {parsed_date}" + ) assert_is_iso_format(parsed_date)