From a7cf07b0fac40de186ddc88ddc6375eceec63a0b Mon Sep 17 00:00:00 2001 From: mohamedmamdouh22 Date: Thu, 19 Sep 2024 20:24:26 +0300 Subject: [PATCH 1/6] =?UTF-8?q?=F0=9F=94=A5feat:=20Add=20page-level=20even?= =?UTF-8?q?t=20handlers=20for=20dismissing=20dialogs=20and=20aborting=20ce?= =?UTF-8?q?rtain=20requests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- harambe/contrib/playwright/harness.py | 7 +++++++ harambe/core.py | 9 +++++++++ 2 files changed, 16 insertions(+) diff --git a/harambe/contrib/playwright/harness.py b/harambe/contrib/playwright/harness.py index da04198..5bb4c29 100644 --- a/harambe/contrib/playwright/harness.py +++ b/harambe/contrib/playwright/harness.py @@ -88,6 +88,13 @@ async def playwright_harness( async def page_factory(*_: Any, **__: Any) -> PlaywrightPage: page = await ctx.new_page() + page.on("dialog", lambda dialog: dialog.dismiss()) + await page.route( + "**/*", + lambda route, request: route.abort() + if request.url.startswith(("mailto:", "tel:")) + else route.continue_(), + ) if on_new_page: await on_new_page(page) if stealth: diff --git a/harambe/core.py b/harambe/core.py index abf64e4..4437e75 100644 --- a/harambe/core.py +++ b/harambe/core.py @@ -362,6 +362,15 @@ async def run( if not harness_options.get("disable_go_to_url", False): await page.goto(url) + await page.wait_for_load_state("domcontentloaded") + await page.evaluate(""" + document.addEventListener('click', (event) => { + const target = event.target.closest('a[href^="mailto:"], a[href^="tel:"]'); + if (target) { + event.preventDefault(); + } + }); + """) elif isinstance(page, SoupPage): page.url = url await scraper(sdk, url, context) From c1464ac6db90d85530e90ee0794554af9b796f0e Mon Sep 17 00:00:00 2001 From: mohamedmamdouh22 Date: Fri, 20 Sep 2024 02:06:40 +0300 Subject: [PATCH 2/6] =?UTF-8?q?feat:=20Add=20MailtoTelBlockerHandler=20to?= =?UTF-8?q?=20playwright=20harness=F0=9F=A6=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- harambe/contrib/playwright/harness.py | 9 ++------- harambe/handlers.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/harambe/contrib/playwright/harness.py b/harambe/contrib/playwright/harness.py index 5bb4c29..b4dee7a 100644 --- a/harambe/contrib/playwright/harness.py +++ b/harambe/contrib/playwright/harness.py @@ -6,7 +6,7 @@ from playwright_stealth import stealth_async from harambe.contrib.playwright.impl import PlaywrightPage -from harambe.handlers import UnnecessaryResourceHandler +from harambe.handlers import UnnecessaryResourceHandler, MailtoTelBlockerHandler from harambe.proxy import proxy_from_url from harambe.types import SetCookieParam, BrowserType @@ -85,16 +85,11 @@ async def playwright_harness( if abort_unnecessary_requests: await ctx.route("**/*", UnnecessaryResourceHandler().handle) + await ctx.route("**/*", MailtoTelBlockerHandler().handle) async def page_factory(*_: Any, **__: Any) -> PlaywrightPage: page = await ctx.new_page() page.on("dialog", lambda dialog: dialog.dismiss()) - await page.route( - "**/*", - lambda route, request: route.abort() - if request.url.startswith(("mailto:", "tel:")) - else route.continue_(), - ) if on_new_page: await on_new_page(page) if stealth: diff --git a/harambe/handlers.py b/harambe/handlers.py index e85845d..5cbc17e 100644 --- a/harambe/handlers.py +++ b/harambe/handlers.py @@ -88,3 +88,14 @@ async def handle(self, route: Route) -> None: return await route.fallback() + + +class MailtoTelBlockerHandler: + async def handle(self, route: Route) -> None: + request_url = route.request.url + + # Check if the request is a 'mailto:' or 'tel:' link and abort it + if request_url.startswith(("mailto:", "tel:")): + await route.abort("blockedbyclient") + else: + await route.continue_() From 311c950af730844cd497ed6f0714696ed1f9c8c7 Mon Sep 17 00:00:00 2001 From: mohamedmamdouh22 Date: Fri, 20 Sep 2024 02:08:01 +0300 Subject: [PATCH 3/6] refactor --- harambe/handlers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/harambe/handlers.py b/harambe/handlers.py index 5cbc17e..6d2f732 100644 --- a/harambe/handlers.py +++ b/harambe/handlers.py @@ -94,7 +94,6 @@ class MailtoTelBlockerHandler: async def handle(self, route: Route) -> None: request_url = route.request.url - # Check if the request is a 'mailto:' or 'tel:' link and abort it if request_url.startswith(("mailto:", "tel:")): await route.abort("blockedbyclient") else: From 7596e4190ee9cee183108a487168b6e0bc0cb85c Mon Sep 17 00:00:00 2001 From: mohamedmamdouh22 Date: Fri, 20 Sep 2024 02:36:49 +0300 Subject: [PATCH 4/6] refactor: Improve page load performance by lazy loading images and prevent mailto and tel links from triggering popups --- harambe/core.py | 20 ++++++++++---------- test/mock_html/emails.html | 23 +++++++++++++++++++++++ test/test_e2e.py | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 10 deletions(-) create mode 100644 test/mock_html/emails.html diff --git a/harambe/core.py b/harambe/core.py index 4437e75..e62f28a 100644 --- a/harambe/core.py +++ b/harambe/core.py @@ -359,18 +359,18 @@ async def run( ) if setup: await setup(sdk) - if not harness_options.get("disable_go_to_url", False): await page.goto(url) - await page.wait_for_load_state("domcontentloaded") - await page.evaluate(""" - document.addEventListener('click', (event) => { - const target = event.target.closest('a[href^="mailto:"], a[href^="tel:"]'); - if (target) { - event.preventDefault(); - } - }); - """) + if harness == playwright_harness: + await page.wait_for_load_state("domcontentloaded") + await page.evaluate(""" + document.addEventListener('click', (event) => { + const target = event.target.closest('a[href^="mailto:"], a[href^="tel:"]'); + if (target) { + event.preventDefault(); + } + }); + """) elif isinstance(page, SoupPage): page.url = url await scraper(sdk, url, context) diff --git a/test/mock_html/emails.html b/test/mock_html/emails.html new file mode 100644 index 0000000..990207e --- /dev/null +++ b/test/mock_html/emails.html @@ -0,0 +1,23 @@ + + + + + + Mock Page for Testing Mailto and Tel Blocking + + +

Test Mailto and Tel Links

+ +

+ Email Us +

+ +

+ Call Us +

+ +

+ Go to Example +

+ + diff --git a/test/test_e2e.py b/test/test_e2e.py index 706a6c9..9901f87 100644 --- a/test/test_e2e.py +++ b/test/test_e2e.py @@ -413,3 +413,35 @@ async def scraper(sdk: SDK, *args, **kwargs): headless=True, harness=harness, ) + + +@pytest.mark.parametrize("harness", [playwright_harness]) +async def test_email_popup_prevention(server, observer, harness): + @SDK.scraper("test", "detail", observer=observer) + async def scraper(sdk: SDK, *args, **kwargs) -> None: + page = sdk.page + dialog_opened = False + page.on("dialog", lambda dialog: dialog.dismiss() and set_dialog_flag()) + + def set_dialog_flag(): + nonlocal dialog_opened + dialog_opened = True + + mailto_link = await page.query_selector('a[href^="mailto:"]') + await mailto_link.click() + current_url = page.url + + assert not current_url.startswith( + "mailto:" + ), "Popup prevention failed, mailto link triggered." + assert ( + not dialog_opened + ), "A dialog (popup) was opened, but it should have been prevented." + + await SDK.run( + scraper=scraper, + url=f"{server}/emails", # This should be your mock page with mailto links + schema={}, + headless=True, + harness=harness, + ) From 105fe287206c3946099f6468f73a1d08d8cd51b9 Mon Sep 17 00:00:00 2001 From: mohamedmamdouh22 Date: Fri, 20 Sep 2024 03:01:58 +0300 Subject: [PATCH 5/6] refactor: Update email popup prevention logic in test_e2e.py --- test/test_e2e.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_e2e.py b/test/test_e2e.py index 9901f87..2f0e2b1 100644 --- a/test/test_e2e.py +++ b/test/test_e2e.py @@ -440,7 +440,7 @@ def set_dialog_flag(): await SDK.run( scraper=scraper, - url=f"{server}/emails", # This should be your mock page with mailto links + url=f"{server}/emails", schema={}, headless=True, harness=harness, From 4ccad856ffc36ce3b3cbad90c2939ccb52b41d92 Mon Sep 17 00:00:00 2001 From: mohamedmamdouh22 Date: Wed, 23 Oct 2024 17:59:02 +0300 Subject: [PATCH 6/6] Update version to 0.32.0 in pyproject.toml Fix string formatting in test_e2e.py --- pyproject.toml | 2 +- test/test_e2e.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bf72394..34c3265 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "harambe-sdk" -version = "0.31.1" +version = "0.32.0" description = "Data extraction SDK for Playwright 🐒🍌" authors = ["awtkns "] readme = "README.md" diff --git a/test/test_e2e.py b/test/test_e2e.py index be63457..d8e8e9a 100644 --- a/test/test_e2e.py +++ b/test/test_e2e.py @@ -437,7 +437,7 @@ async def scraper(sdk: SDK, *args, **kwargs): await SDK.run( scraper=scraper, - url=f"https://{local_storage_entry["domain"]}/", + url=f"https://{local_storage_entry['domain']}/", headless=True, harness=harness, schema={},