From beb75c4eae73d4f5b230654fd687fa10bf20b201 Mon Sep 17 00:00:00 2001
From: Kenny Burawudi
Date: Mon, 11 Dec 2023 23:06:35 +0100
Subject: [PATCH] update scraper

Read connection settings from Airflow Variables instead of hardcoding
them, switch the database engine from MariaDB to PostgreSQL, derive the
dates to crawl from the scraped_games and calendar tables instead of a
fixed 1996-to-today range, and root the extract path at the EXTDISK
variable.
---
 play_by_play/extract.py | 15 ++++++++++-----
 scraper/crawler.py      | 40 ++++++++++++++++++++++++++++++++++------
 2 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/play_by_play/extract.py b/play_by_play/extract.py
index a80807d..f715f1c 100644
--- a/play_by_play/extract.py
+++ b/play_by_play/extract.py
@@ -1,11 +1,16 @@
 import json
 from sqlalchemy import create_engine
+from airflow.models import Variable
 
 
-def make_engine(user, pswd, db):
-    return create_engine("mariadb+mariadbconnector://"\
-        +user+":"\
-        +pswd+"@127.0.0.1:3306/"+db)
+def make_engine():
+    host = Variable.get('HOSTNAME')
+    db = Variable.get('NBA_DB')
+    port = Variable.get('PORT')
+    user = Variable.get('USER')
+    pswd = Variable.get('PSWD')
+
+    return create_engine(f"postgresql+psycopg2://{user}:{pswd}@{host}:{port}/{db}")
 
 
 def extract_application(html):
@@ -46,7 +51,7 @@ def extract():
     command += "as match_up from box_scores where match_up regexp 'vs' and game_id not in "
     command += "(select distinct game_id from play_by_plays) order by game_id limit 100;"
 
-    engine = make_engine(environ.get('USER'),environ.get('PSWD'),'nba')
+    engine = make_engine()
 
     df = pd.read_sql(command,engine)
 
diff --git a/scraper/crawler.py b/scraper/crawler.py
index 97d05f6..f9e18c1 100644
--- a/scraper/crawler.py
+++ b/scraper/crawler.py
@@ -3,15 +3,42 @@
 from datetime import datetime as dt, timedelta as td
 import json
 from os.path import join, expanduser
+import pandas as pd
+from sqlalchemy import create_engine
+from airflow.models import Variable
+
+
+def make_engine():
+    host = Variable.get('HOSTNAME')
+    db = Variable.get('NBA_DB')
+    port = Variable.get('PORT')
+    user = Variable.get('USER')
+    pswd = Variable.get('PSWD')
+
+    return create_engine(f"postgresql+psycopg2://{user}:{pswd}@{host}:{port}/{db}")
+
 
 def game_dates():
-    start_date = dt(1996,10,8)
-    end_date = dt.today()
+    engine = make_engine()
+    latest_scrape = pd.read_sql('SELECT max(game_date) as latest from scraped_games', engine)
+    latest_scrape = latest_scrape.loc[0,'latest']
+
+    if latest_scrape:
+        query = f'''SELECT game_date from calendar
+            where (game_date > to_date('{latest_scrape.strftime('%Y-%m-%d')}','YYYY-MM-DD'))
+            AND (to_date('{(latest_scrape + td(1)).strftime('%Y-%m-%d')}','YYYY-MM-DD') between quarter_from and quarter_to)'''
+    else:
+        query = f'''SELECT game_date from calendar
+            where (game_date > to_date('1996-11-01','YYYY-MM-DD'))
+            AND (to_date('1996-11-01','YYYY-MM-DD') between quarter_from and quarter_to)'''
+
+    df = pd.read_sql(query, engine)
+
+    engine.dispose()
 
-    days = (end_date - start_date).days
+    for i in df.itertuples():
+        yield 'https://www.nba.com/games?date='+dt.strftime(i.game_date, '%Y-%m-%d')
 
-    for i in range(days):
-        yield 'https://www.nba.com/games?date='+dt.strftime(start_date + td(i+1), '%Y-%m-%d')
 
 class GamesSpider(CrawlSpider):
     name = 'pbp-games'
@@ -23,8 +50,9 @@ class GamesSpider(CrawlSpider):
 
     def parse_page(self, response):
         items = response.css('script[type="application/json"]::text')
+        root = Variable.get('EXTDISK')
 
-        extract_path = expanduser(join('~','spark_apps','games'))
+        extract_path = expanduser(join(root,'spark_apps','games'))
 
         for i in items:
             to_write = json.loads(i.get())['props']['pageProps']
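
Note: the updated modules assume the Airflow Variables HOSTNAME, PORT,
NBA_DB, USER, PSWD, and EXTDISK already exist in the Airflow metadata
database. A minimal sketch of seeding them via Variable.set (the key
names come from the patch; every value below is an illustrative
assumption, and the script must run inside an initialized Airflow
environment):

    # seed_variables.py -- hypothetical helper, not part of this patch
    from airflow.models import Variable

    # Only the keys are taken from the patch; all values are placeholders.
    for key, value in {
        'HOSTNAME': '127.0.0.1',   # Postgres host (assumed)
        'PORT': '5432',            # default Postgres port (assumed)
        'NBA_DB': 'nba',           # db name extract.py used before this patch
        'USER': 'nba_user',        # placeholder credential
        'PSWD': 'change-me',       # placeholder credential
        'EXTDISK': '~',            # root for the spark_apps/games output (assumed)
    }.items():
        Variable.set(key, value)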