Skip to content

Commit

Permalink
🫡 Improve dynamic urls
Browse files Browse the repository at this point in the history
  • Loading branch information
awtkns committed Mar 1, 2024
1 parent 0bca7df commit d2a72bd
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 30 deletions.
24 changes: 9 additions & 15 deletions harambe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,14 @@ async def enqueue(self, *urls: URL, context: Optional[Context] = None) -> None:
)

async def paginate(
self, next_page: Callable[..., Awaitable[URL | ElementHandle | None]]
self,
next_page: Callable[..., Awaitable[URL | ElementHandle | None]],
sleep: int = 0,
) -> None:
"""
Navigate to the next page of a listing.
:param sleep: seconds to sleep for before continuing
:param next_page: async callable that returns the url or ElementHandle of the next page, or None
"""
try:
Expand All @@ -119,15 +122,14 @@ async def paginate(
await self.page.goto(next_url)

if next_url:
if sleep > 0:
await asyncio.sleep(sleep)
await self._scraper(self, next_url, self._context)
except: # noqa: E722
return

async def capture_url(
self,
clickable: ElementHandle,
resource_type: ResourceType = "document",
abort_on_match: bool = True,
self, clickable: ElementHandle, resource_type: ResourceType = "document"
) -> URL | None:
"""
Capture the url of a click event. This will click the element and return the url
Expand All @@ -136,23 +138,15 @@ async def capture_url(
:param clickable: the element to click
:param resource_type: the type of resource to capture
:param abort_on_match: whether to abort the request once a match is found
:return url: the url of the captured resource or None if no match was found
:raises ValueError: if the click opened more than one new page
"""

current_url = self.page.url

async with ResourceRequestHandler(
self.page, resource_type, abort_on_match
self.page, resource_type=resource_type
) as handler:
await clickable.click()

for page in self.page.context.pages:
if page.url != current_url:
await page.close()

return handler.matched_url
return handler.captured_url()

@staticmethod
async def run(
Expand Down
37 changes: 23 additions & 14 deletions harambe/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"websocket",
"manifest",
"other",
"*",
]


Expand All @@ -31,14 +32,14 @@ def __init__(
self,
page: Page,
resource_type: ResourceType,
abort_on_match: bool,
url_pattern: str = "**/*",
):
self.page = page
self.resource_type = resource_type
self.abort_on_match = abort_on_match
self.url_pattern = url_pattern
self.matched_requests = []
self.resource_type = resource_type

self._initial_pages = [p.url for p in page.context.pages]
self._new_pages = []

async def __aenter__(self):
await self.page.context.route(self.url_pattern, self.handle)
Expand All @@ -47,22 +48,30 @@ async def __aenter__(self):
async def __aexit__(self, exc_type, exc_val, exc_tb):
await self.page.context.unroute(self.url_pattern, self.handle)
await self.page.bring_to_front()
for page in self.page.context.pages:
if page.url not in self._initial_pages:
self._new_pages.append(page.url)
await page.close()

async def handle(self, route: Route) -> None:
if self.resource_type in route.request.resource_type:
self.matched_requests.append(route.request)
if self.abort_on_match:
await route.abort("blockedbyclient")
return
if (
self.resource_type == "*"
or self.resource_type in route.request.resource_type
):
await route.fulfill(
status=200,
content_type="text/plain",
body="Intercepted by the handler",
)
return

await route.fallback()

@property
def matched_url(self) -> str | None:
if len(self.matched_requests) > 1:
raise ValueError("More than one request matched")
def captured_url(self) -> str | None:
if len(self._new_pages) > 1:
raise ValueError("More than one page matched")

return self.matched_requests[0].url if self.matched_requests else None
return self._new_pages[0] if self._new_pages else None


class UnnecessaryResourceHandler(AbstractHandler):
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "harambe-sdk"
version = "0.7.0"
version = "0.8.0"
description = "Data extraction SDK for Playwright 🐒🍌"
authors = ["awtkns <[email protected]>"]
readme = "README.md"
Expand Down

0 comments on commit d2a72bd

Please sign in to comment.