-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
𖭦 Ability to override stage on enqueue #89
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[project] | ||
name = "harambe-core" | ||
version = "0.50.2" | ||
version = "0.51.0" | ||
description = "Core types for harambe SDK 🐒🍌" | ||
authors = [ | ||
{ name = "Adam Watkins", email = "[email protected]" } | ||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -138,6 +138,7 @@ async def enqueue( | |
*urls: URL | Awaitable[URL], | ||
context: Optional[Context] = None, | ||
options: Optional[Options] = None, | ||
stage: Optional[Stage] = None, | ||
) -> None: | ||
""" | ||
Enqueue url to be scraped. This will be passed to the on_enqueue callback. | ||
|
@@ -147,9 +148,11 @@ async def enqueue( | |
:param urls: urls to enqueue | ||
:param context: additional context to pass to the next run of the next stage/url | ||
:param options: job level options to pass to the next stage/url | ||
:param stage: the override stage to use for the next job. Will use the next stage in the sequence if not provided. | ||
""" | ||
context = context or {} | ||
options = options or {} | ||
stage = stage or get_next_stage(self._stage) | ||
context["__url"] = self.page.url | ||
|
||
for url in urls: | ||
|
@@ -160,7 +163,7 @@ async def enqueue( | |
normalize_url(url, self.page.url) if hasattr(self.page, "url") else url | ||
) | ||
await self._notify_observers( | ||
"on_queue_url", normalized_url, context, options | ||
"on_queue_url", normalized_url, context, options, stage | ||
) | ||
|
||
async def paginate( | ||
|
@@ -498,7 +501,7 @@ async def run_from_file( | |
:return: None: the scraper should save data to the database or file | ||
""" | ||
domain: str = getattr(scraper, "domain", "") | ||
stage: str = getattr(scraper, "stage", "") | ||
stage: Stage = getattr(scraper, "stage", "") | ||
headers: dict[str, str] = getattr(scraper, "extra_headers", {}) | ||
observer: Optional[OutputObserver] = getattr(scraper, "observer", None) | ||
|
||
|
@@ -507,19 +510,23 @@ async def run_from_file( | |
|
||
tracker = FileDataTracker(domain, stage) | ||
|
||
prev = "listing" if stage == "detail" else "category" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should remove this run from file / stage stuff here too |
||
file_path = tracker.get_storage_filepath(prev) | ||
previous_stage = get_previous_stage(stage) | ||
file_path = tracker.get_storage_filepath(previous_stage) | ||
|
||
if not file_path.exists(): | ||
raise ValueError( | ||
f"Could not find {file_path}." | ||
f" No listing data found for this domain. Run the listing scraper first." | ||
f" No {previous_stage} data found for this domain. Run the {previous_stage} scraper first." | ||
) | ||
|
||
listing_data = tracker.load_data(domain, prev) | ||
previous_stage_data = tracker.load_data(domain, previous_stage) | ||
async with playwright_harness(**harness_options) as page_factory: | ||
page = await page_factory() | ||
for listing in listing_data: | ||
for enqueue_data in [ | ||
enqueue_data | ||
for enqueue_data in previous_stage_data | ||
if enqueue_data["stage"] == stage | ||
]: | ||
sdk = SDK( | ||
page, | ||
domain=domain, | ||
|
@@ -533,11 +540,11 @@ async def run_from_file( | |
|
||
if headers: | ||
await page.set_extra_http_headers(headers) | ||
await page.goto(listing["url"]) | ||
await page.goto(enqueue_data["url"]) | ||
await scraper( | ||
sdk, | ||
listing["url"], | ||
listing["context"], | ||
enqueue_data["url"], | ||
enqueue_data["context"], | ||
) | ||
|
||
return sdk | ||
|
@@ -595,4 +602,20 @@ async def wrapper(sdk: "SDK", url: URL, context: Context) -> None: | |
return decorator | ||
|
||
|
||
def get_next_stage(previous_stage: Stage | None) -> Stage: | ||
if previous_stage == "parent_category": | ||
return "category" | ||
if previous_stage == "category": | ||
return "listing" | ||
return "detail" | ||
|
||
|
||
def get_previous_stage(stage: Stage | None) -> Stage: | ||
Comment on lines
+605
to
+613
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ideally api stuff |
||
if stage == "detail" or stage is None: | ||
return "listing" | ||
if stage == "listing": | ||
return "category" | ||
return "parent_category" | ||
|
||
|
||
PAGE_PDF_FILENAME = "reworkd_page_pdf.pdf" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,14 @@ | ||
[project] | ||
name = "harambe-sdk" | ||
version = "0.50.2" | ||
version = "0.51.0" | ||
description = "Data extraction SDK for Playwright 🐒🍌" | ||
authors = [ | ||
{ name = "Adam Watkins", email = "[email protected]" } | ||
] | ||
requires-python = ">=3.11,<4.0" | ||
readme = "README.md" | ||
dependencies = [ | ||
"harambe_core==0.50.2", | ||
"harambe_core==0.51.0", | ||
"pydantic==2.9.2", | ||
"playwright==1.47.0", | ||
"setuptools==73.0.0", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import pytest | ||
from harambe.core import get_next_stage | ||
from harambe.core import get_previous_stage | ||
from harambe.types import Stage | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"previous_stage,expected_stage", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think this should be introduced to harambe. The consumer of the observer should make this choice |
||
[ | ||
("parent_category", "category"), | ||
("category", "listing"), | ||
("listing", "detail"), | ||
("detail", "detail"), | ||
(None, "detail"), | ||
], | ||
) | ||
def test_get_next_stage(previous_stage: Stage, expected_stage: Stage): | ||
result = get_next_stage(previous_stage) | ||
assert result == expected_stage | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"stage,expected_stage", | ||
[ | ||
("parent_category", "parent_category"), | ||
("category", "parent_category"), | ||
("listing", "category"), | ||
("detail", "listing"), | ||
(None, "detail"), | ||
], | ||
) | ||
def test_get_previous_stage(stage: Stage, expected_stage: Stage): | ||
result = get_previous_stage(stage) | ||
assert result == expected_stage |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would recommend leaving the getnextstage outside of harambe (separation of concerns)