Skip to content

Commit

Permalink
add scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
Blunt10K committed May 13, 2023
1 parent 2bd85f9 commit 2f057a4
Showing 1 changed file with 30 additions and 0 deletions.
30 changes: 30 additions & 0 deletions scraper/scrape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import json
import os
from datetime import datetime as dt, timedelta as td

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

def game_dates(start_date=dt(2000, 10, 8), days=365):
    """Yield one nba.com games-listing URL per consecutive calendar day.

    The first URL is for the day *after* ``start_date`` and the last one is
    for ``start_date + days``, so exactly ``days`` URLs are produced. With
    the defaults this covers 2000-10-09 through 2001-10-08.

    Args:
        start_date: The day immediately before the first date yielded.
        days: How many consecutive dates to yield; ``0`` yields nothing.

    Yields:
        str: A URL of the form
        ``https://www.nba.com/games?date=YYYY-MM-DD``.
    """
    # Offsets start at 1 on purpose: ``start_date`` itself is never emitted.
    for offset in range(1, days + 1):
        day = start_date + td(days=offset)
        yield 'https://www.nba.com/games?date=' + day.strftime('%Y-%m-%d')

class GamesSpider(CrawlSpider):
    """Crawl nba.com daily game listings and dump each game's page data.

    The crawl starts from the URLs produced by ``game_dates()``, follows
    links matching the box-score pattern in ``rules``, and hands every
    matched page to ``parse_page``, which writes one JSON file per game
    into the ``games/`` directory.
    """

    name = 'pbp-games'
    allowed_domains = ['nba.com']
    start_urls = list(game_dates())
    # NOTE(review): Scrapy reads per-spider settings from a
    # ``custom_settings`` dict, not from bare class attributes, so this line
    # most likely has no effect on redirect handling. Left as-is to avoid
    # changing crawl behavior; confirm the intent and move it into
    # ``custom_settings = {'REDIRECT_ENABLED': False}`` if redirects really
    # should be disabled.
    REDIRECT_ENABLED = False

    # Raw string so ``\w`` / ``\d`` reach the regex engine without relying on
    # Python's handling of unrecognized escape sequences.
    rules = [
        Rule(
            LinkExtractor(allow=[r'\w+-vs-\w+-\d+/box-score#box-score']),
            callback='parse_page',
        )
    ]

    def parse_page(self, response):
        """Write the page's embedded JSON props to ``games/<gameId>.json``.

        Every ``<script type="application/json">`` element is inspected.
        Scripts whose payload is not valid JSON, or that lack the
        ``props -> pageProps -> playByPlay -> gameId`` path, are skipped so
        that one unrelated script does not abort processing of the page.

        Args:
            response: The response for a page matched by ``rules``.
        """
        for script in response.css('script[type="application/json"]::text'):
            try:
                page_props = json.loads(script.get())['props']['pageProps']
                game_id = page_props['playByPlay']['gameId']
            except (ValueError, KeyError, TypeError):
                # ValueError covers malformed JSON; KeyError/TypeError cover
                # payloads that do not have the expected nesting.
                self.logger.debug(
                    'Skipping JSON script without play-by-play data on %s',
                    response.url,
                )
                continue

            # The output directory may not exist on a fresh checkout.
            os.makedirs('games', exist_ok=True)
            out_path = os.path.join('games', f'{game_id}.json')
            with open(out_path, 'w', encoding='utf-8') as fp:
                json.dump(page_props, fp)

0 comments on commit 2f057a4

Please sign in to comment.