forked from trancethehuman/entities-extraction-web-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
36 lines (26 loc) · 1.07 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import asyncio
import pprint
from ai_extractor import extract
from schemas import SchemaNewsWebsites, ecommerce_schema
from scrape import ascrape_playwright
# TESTING
# Upper bound on how much scraped content is handed to the LLM. It is applied
# as a plain slice, so (assuming the scraper returns a str -- TODO confirm) it
# counts characters, not model tokens, and is only a rough proxy for the
# model's context window.
token_limit = 4000


async def scrape_with_playwright(url: str, tags, **kwargs):
    """Scrape *url*, truncate the result, and pretty-print the LLM extraction.

    Defined at module level (rather than inside the ``__main__`` guard) so the
    coroutine can be imported and reused by other modules.

    Args:
        url: Address of the page to scrape.
        tags: Passed straight through to ``ascrape_playwright``; presumably
            the HTML tag names to collect -- confirm against ``scrape.py``.
        **kwargs: Forwarded unchanged to ``extract`` (for example
            ``schema_pydantic``).

    Returns:
        None. The extracted content is written to stdout via ``pprint``.
    """
    html_content = await ascrape_playwright(url, tags)

    print("Extracting content with LLM")

    # Keep only the leading ``token_limit`` items of the scraped content so
    # the payload sent to the LLM stays bounded.
    html_content_fits_context_window_llm = html_content[:token_limit]

    extracted_content = extract(
        **kwargs,
        content=html_content_fits_context_window_llm,
    )

    pprint.pprint(extracted_content)


if __name__ == "__main__":
    # News sites mostly have <span> tags to scrape
    # Only wsj_url is passed to the run below.
    cnn_url = "https://www.cnn.com"
    wsj_url = "https://www.wsj.com"
    nyt_url = "https://www.nytimes.com/ca/"
    amazon_url = "https://www.amazon.ca/s?k=computers&crid=1LUXGQOD2ULFD&sprefix=%2Caps%2C94&ref=nb_sb_ss_recent_1_0_recent"

    # Scrape and Extract with LLM
    asyncio.run(
        scrape_with_playwright(
            url=wsj_url,
            tags=["span"],
            schema_pydantic=SchemaNewsWebsites,
        )
    )