
update scraper
Blunt10K committed Dec 11, 2023
1 parent 068cd1d commit beb75c4
Showing 2 changed files with 44 additions and 11 deletions.
15 changes: 10 additions & 5 deletions play_by_play/extract.py
@@ -1,11 +1,16 @@
 import json
 from sqlalchemy import create_engine
+from airflow.models import Variable
 
-def make_engine(user, pswd, db):
-
-    return create_engine("mariadb+mariadbconnector://"\
-                         +user+":"\
-                         +pswd+"@127.0.0.1:3306/"+db)
+def make_engine():
+    host = Variable.get('HOSTNAME')
+    db = Variable.get('NBA_DB')
+    port = Variable.get('PORT')
+    user = Variable.get('USER')
+    pswd = Variable.get('PSWD')
+
+    return create_engine(f"postgresql+psycopg2://{user}:{pswd}@{host}:{port}/{db}")
 
 
 def extract_application(html):
@@ -46,7 +51,7 @@ def extract():
     command += "as match_up from box_scores where match_up regexp 'vs' and game_id not in "
     command += "(select distinct game_id from play_by_plays) order by game_id limit 100;"
 
-    engine = make_engine(environ.get('USER'),environ.get('PSWD'),'nba')
+    engine = make_engine()
 
     df = pd.read_sql(command,engine)
 
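The rewritten make_engine() pulls its connection settings from Airflow Variables instead of taking them as arguments. A minimal sketch of seeding those Variables, assuming a configured Airflow metadata database; the keys match the diff above, but every value here is a placeholder:

    from airflow.models import Variable

    # Seed the Variables that make_engine() reads. All values are placeholders;
    # in practice a secrets backend is a better home for PSWD than a Variable.
    Variable.set('HOSTNAME', 'localhost')   # hypothetical DB host
    Variable.set('NBA_DB', 'nba')           # hypothetical database name
    Variable.set('PORT', '5432')            # assumed default Postgres port
    Variable.set('USER', 'nba_etl')         # hypothetical role
    Variable.set('PSWD', 'change-me')       # placeholder secret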
40 changes: 34 additions & 6 deletions scraper/crawler.py
@@ -3,15 +3,42 @@
 from datetime import datetime as dt, timedelta as td
 import json
 from os.path import join, expanduser
+import pandas as pd
+from sqlalchemy import create_engine
+from airflow.models import Variable
+
+
+def make_engine():
+    host = Variable.get('HOSTNAME')
+    db = Variable.get('NBA_DB')
+    port = Variable.get('PORT')
+    user = Variable.get('USER')
+    pswd = Variable.get('PSWD')
+
+    return create_engine(f"postgresql+psycopg2://{user}:{pswd}@{host}:{port}/{db}")
 
 
 def game_dates():
-    start_date = dt(1996,10,8)
-    end_date = dt.today()
+    engine = make_engine()
+    latest_scrape = pd.read_sql('SELECT max(game_date) as latest from scraped_games', engine)
+    latest_scrape = latest_scrape.loc[0,'latest']
 
-    days = (end_date - start_date).days
+    if latest_scrape:
+        query = f'''SELECT game_date from calendar
+        where (game_date > to_date('{latest_scrape.strftime('%Y-%m-%d')}','YYYY-MM-DD'))
+        AND (to_date('{(latest_scrape + td(1)).strftime('%Y-%m-%d')}','YYYY-MM-DD') between quarter_from and quarter_to)'''
+    else:
+        query = f'''SELECT game_date from calendar
+        where (game_date > to_date('1996-11-01','YYYY-MM-DD'))
+        AND (to_date('1996-11-01','YYYY-MM-DD') between quarter_from and quarter_to)'''
 
-    for i in range(days):
-        yield 'https://www.nba.com/games?date='+dt.strftime(start_date + td(i+1), '%Y-%m-%d')
+    df = pd.read_sql(query, engine)
+
+    engine.dispose()
+
+    for i in df.itertuples():
+        yield 'https://www.nba.com/games?date='+dt.strftime(i.game_date, '%Y-%m-%d')
 
 
 class GamesSpider(CrawlSpider):
     name = 'pbp-games'
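For reference, this is the resume query game_dates() builds once scraped_games holds data. latest_scrape below is a stand-in date, not real scrape state:

    from datetime import datetime as dt, timedelta as td

    latest_scrape = dt(2023, 12, 10)  # hypothetical most recent scraped game_date

    query = f'''SELECT game_date from calendar
        where (game_date > to_date('{latest_scrape.strftime('%Y-%m-%d')}','YYYY-MM-DD'))
        AND (to_date('{(latest_scrape + td(1)).strftime('%Y-%m-%d')}','YYYY-MM-DD') between quarter_from and quarter_to)'''

    print(query)
    # SELECT game_date from calendar
    #     where (game_date > to_date('2023-12-10','YYYY-MM-DD'))
    #     AND (to_date('2023-12-11','YYYY-MM-DD') between quarter_from and quarter_to)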
@@ -23,8 +50,9 @@ class GamesSpider(CrawlSpider):

     def parse_page(self, response):
         items = response.css('script[type="application/json"]::text')
+        root = Variable.get('EXTDISK')
 
-        extract_path = expanduser(join('~','spark_apps','games'))
+        extract_path = expanduser(join(root,'spark_apps','games'))
 
         for i in items:
             to_write = json.loads(i.get())['props']['pageProps']
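With game_dates() now driving the URL list and EXTDISK resolving the output root, the spider runs as usual. A sketch of a programmatic run, assuming the repository layout shown in this diff and that the Airflow Variables above are set (the plain `scrapy crawl pbp-games` CLI is equivalent):

    from scrapy.crawler import CrawlerProcess

    from scraper.crawler import GamesSpider  # import path assumed from this repo's layout

    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(GamesSpider)
    process.start()  # blocks until the crawl finishes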
