Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
Blunt10K committed Oct 13, 2023
1 parent 2f057a4 commit 16d52a2
Show file tree
Hide file tree
Showing 10 changed files with 47,108 additions and 175 deletions.
Binary file added .DS_Store
Binary file not shown.
10,458 changes: 10,458 additions & 0 deletions .ipynb_checkpoints/Untitled-checkpoint.ipynb

Large diffs are not rendered by default.

5,257 changes: 5,257 additions & 0 deletions Untitled.ipynb

Large diffs are not rendered by default.

Binary file added scraper/.DS_Store
Binary file not shown.
3,214 changes: 3,214 additions & 0 deletions scraper/.ipynb_checkpoints/Untitled-checkpoint.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
from scrapy.linkextractors import LinkExtractor
from datetime import datetime as dt, timedelta as td
import json
from os.path import join, expanduser

def game_dates():
start_date = dt(2000,10,8)
end_date = start_date + td(365)
start_date = dt(1996,10,8)
end_date = dt.today()

days = (end_date - start_date).days

Expand All @@ -23,8 +24,10 @@ class GamesSpider(CrawlSpider):
def parse_page(self, response):
items = response.css('script[type="application/json"]::text')

extract_path = expanduser(join('~','spark_apps','games'))

for i in items:
to_write = json.loads(i.get())['props']['pageProps']
fname = 'games/' + to_write['playByPlay']['gameId'] + '.json'
fname = join(extract_path, to_write['playByPlay']['gameId'] + '.json')
with open(fname, 'w') as fp:
json.dump(to_write, fp)
json.dump(to_write, fp)
1,218 changes: 1,218 additions & 0 deletions scraper/.ipynb_checkpoints/transform-checkpoint.ipynb

Large diffs are not rendered by default.

8,668 changes: 8,497 additions & 171 deletions scraper/Untitled.ipynb

Large diffs are not rendered by default.

33 changes: 33 additions & 0 deletions scraper/crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import json
from datetime import datetime as dt, timedelta as td
from os import makedirs
from os.path import join, expanduser

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

def game_dates(start_date=dt(1996, 10, 8), end_date=None):
    """Yield one nba.com daily game-listing URL per day in a date range.

    Parameters
    ----------
    start_date : datetime, optional
        First day of the range, *exclusive*: the first URL emitted is for
        the day after ``start_date``. Defaults to 1996-10-08 (the original
        hard-coded value, around the 1996-97 NBA season opener).
    end_date : datetime, optional
        Last day of the range, *inclusive*. Defaults to today.

    Yields
    ------
    str
        URLs of the form ``https://www.nba.com/games?date=YYYY-MM-DD``.
    """
    if end_date is None:
        end_date = dt.today()

    days = (end_date - start_date).days

    # i+1 preserves the original behavior: the first URL is the day
    # *after* start_date and the last URL is end_date itself.
    for i in range(days):
        yield 'https://www.nba.com/games?date=' + dt.strftime(start_date + td(i + 1), '%Y-%m-%d')

class GamesSpider(CrawlSpider):
    """Crawl nba.com daily game listings and archive play-by-play JSON.

    Starts from one listing URL per day (see ``game_dates``), follows
    box-score links, and for each matched page extracts the embedded
    Next.js JSON payload, writing it to ``~/spark_apps/games/<gameId>.json``.
    """
    name = 'pbp-games'
    allowed_domains = ['nba.com']
    start_urls = list(game_dates())
    # Historical dates can redirect; disable redirects to keep the crawl
    # on the exact per-day listing pages.
    REDIRECT_ENABLED = False

    # Raw string: '\w'/'\d' in a plain string are invalid escape sequences
    # (SyntaxWarning on modern Python, future SyntaxError). Pattern unchanged.
    rules = [Rule(LinkExtractor(allow=[r'\w+-vs-\w+-\d+/box-score#box-score']),
                  callback='parse_page')]

    def parse_page(self, response):
        """Write each ``application/json`` <script> payload on the page to disk.

        Parameters
        ----------
        response : scrapy Response
            A box-score page whose embedded JSON contains
            ``props.pageProps.playByPlay.gameId``.
        """
        items = response.css('script[type="application/json"]::text')

        extract_path = expanduser(join('~', 'spark_apps', 'games'))
        # open(..., 'w') does not create directories; without this the
        # first write fails when ~/spark_apps/games is missing.
        makedirs(extract_path, exist_ok=True)

        for item in items:
            to_write = json.loads(item.get())['props']['pageProps']
            fname = join(extract_path, to_write['playByPlay']['gameId'] + '.json')
            with open(fname, 'w') as fp:
                json.dump(to_write, fp)
18,424 changes: 18,424 additions & 0 deletions scraper/results.log

Large diffs are not rendered by default.

0 comments on commit 16d52a2

Please sign in to comment.