
update scraper
Blunt10K committed Dec 11, 2023
1 parent 068cd1d commit beb75c4
Showing 2 changed files with 44 additions and 11 deletions.
15 changes: 10 additions & 5 deletions play_by_play/extract.py
@@ -1,11 +1,16 @@
 import json
 from sqlalchemy import create_engine
+from airflow.models import Variable
 
-def make_engine(user, pswd, db):
-
-    return create_engine("mariadb+mariadbconnector://"\
-                         +user+":"\
-                         +pswd+"@127.0.0.1:3306/"+db)
+def make_engine():
+    host = Variable.get('HOSTNAME')
+    db = Variable.get('NBA_DB')
+    port = Variable.get('PORT')
+    user = Variable.get('USER')
+    pswd = Variable.get('PSWD')
+
+    return create_engine(f"postgresql+psycopg2://{user}:{pswd}@{host}:{port}/{db}")
 
 
 def extract_application(html):
@@ -46,7 +51,7 @@ def extract():
     command += "as match_up from box_scores where match_up regexp 'vs' and game_id not in "
     command += "(select distinct game_id from play_by_plays) order by game_id limit 100;"
 
-    engine = make_engine(environ.get('USER'),environ.get('PSWD'),'nba')
+    engine = make_engine()
 
     df = pd.read_sql(command,engine)
 
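The rewritten make_engine() pulls its connection settings from Airflow Variables instead of taking them as arguments. A minimal sketch of seeding those Variables, assuming a configured Airflow metadata database; the keys match the diff above, but every value here is a placeholder:

    from airflow.models import Variable

    # Seed the Variables that make_engine() reads. All values are placeholders;
    # in practice a secrets backend is a better home for PSWD than a Variable.
    Variable.set('HOSTNAME', 'localhost')   # hypothetical DB host
    Variable.set('NBA_DB', 'nba')           # hypothetical database name
    Variable.set('PORT', '5432')            # assumed default Postgres port
    Variable.set('USER', 'nba_etl')         # hypothetical role
    Variable.set('PSWD', 'change-me')       # placeholder secret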
40 changes: 34 additions & 6 deletions scraper/crawler.py
@@ -3,15 +3,42 @@
 from datetime import datetime as dt, timedelta as td
 import json
 from os.path import join, expanduser
+import pandas as pd
+from sqlalchemy import create_engine
+from airflow.models import Variable
+
+
+def make_engine():
+    host = Variable.get('HOSTNAME')
+    db = Variable.get('NBA_DB')
+    port = Variable.get('PORT')
+    user = Variable.get('USER')
+    pswd = Variable.get('PSWD')
+
+    return create_engine(f"postgresql+psycopg2://{user}:{pswd}@{host}:{port}/{db}")
 
 
 def game_dates():
-    start_date = dt(1996,10,8)
-    end_date = dt.today()
+    engine = make_engine()
+    latest_scrape = pd.read_sql('SELECT max(game_date) as latest from scraped_games', engine)
+    latest_scrape = latest_scrape.loc[0,'latest']
 
-    days = (end_date - start_date).days
+    if latest_scrape:
+        query = f'''SELECT game_date from calendar
+        where (game_date > to_date('{latest_scrape.strftime('%Y-%m-%d')}','YYYY-MM-DD'))
+        AND (to_date('{(latest_scrape + td(1)).strftime('%Y-%m-%d')}','YYYY-MM-DD') between quarter_from and quarter_to)'''
+    else:
+        query = f'''SELECT game_date from calendar
+        where (game_date > to_date('1996-11-01','YYYY-MM-DD'))
+        AND (to_date('1996-11-01','YYYY-MM-DD') between quarter_from and quarter_to)'''
 
-    for i in range(days):
-        yield 'https://www.nba.com/games?date='+dt.strftime(start_date + td(i+1), '%Y-%m-%d')
+    df = pd.read_sql(query, engine)
+
+    engine.dispose()
+
+    for i in df.itertuples():
+        yield 'https://www.nba.com/games?date='+dt.strftime(i.game_date, '%Y-%m-%d')
 
 
 class GamesSpider(CrawlSpider):
     name = 'pbp-games'
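For reference, this is the resume query game_dates() builds once scraped_games holds data. latest_scrape below is a stand-in date, not real scrape state:

    from datetime import datetime as dt, timedelta as td

    latest_scrape = dt(2023, 12, 10)  # hypothetical most recent scraped game_date

    query = f'''SELECT game_date from calendar
        where (game_date > to_date('{latest_scrape.strftime('%Y-%m-%d')}','YYYY-MM-DD'))
        AND (to_date('{(latest_scrape + td(1)).strftime('%Y-%m-%d')}','YYYY-MM-DD') between quarter_from and quarter_to)'''

    print(query)
    # SELECT game_date from calendar
    #     where (game_date > to_date('2023-12-10','YYYY-MM-DD'))
    #     AND (to_date('2023-12-11','YYYY-MM-DD') between quarter_from and quarter_to)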
@@ -23,8 +50,9 @@ class GamesSpider(CrawlSpider):

     def parse_page(self, response):
         items = response.css('script[type="application/json"]::text')
+        root = Variable.get('EXTDISK')
 
-        extract_path = expanduser(join('~','spark_apps','games'))
+        extract_path = expanduser(join(root,'spark_apps','games'))
 
         for i in items:
             to_write = json.loads(i.get())['props']['pageProps']
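With game_dates() now driving the URL list and EXTDISK resolving the output root, the spider runs as usual. A sketch of a programmatic run, assuming the repository layout shown in this diff and that the Airflow Variables above are set (the plain `scrapy crawl pbp-games` CLI is equivalent):

    from scrapy.crawler import CrawlerProcess

    from scraper.crawler import GamesSpider  # import path assumed from this repo's layout

    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(GamesSpider)
    process.start()  # blocks until the crawl finishes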
