update

Blunt10K · Oct 13, 2023 · 16d52a2 · 16d52a2
1 parent 2f057a4
commit 16d52a2
Show file tree

Hide file tree

Showing 10 changed files with 47,108 additions and 175 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb
diff --git a/Untitled.ipynb b/Untitled.ipynb
diff --git a/scraper/.DS_Store b/scraper/.DS_Store
diff --git a/scraper/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/scraper/.ipynb_checkpoints/Untitled-checkpoint.ipynb
diff --git a/scraper/scrape.py → .../.ipynb_checkpoints/crawler-checkpoint.py b/scraper/scrape.py → .../.ipynb_checkpoints/crawler-checkpoint.py
@@ -2,10 +2,11 @@
 from scrapy.linkextractors import LinkExtractor
 from datetime import datetime as dt, timedelta as td
 import json
+from os.path import join, expanduser
 
 def game_dates():
-    start_date = dt(2000,10,8)
-    end_date = start_date + td(365)
+    start_date = dt(1996,10,8)
+    end_date = dt.today()
 
     days = (end_date - start_date).days
 
@@ -23,8 +24,10 @@ class GamesSpider(CrawlSpider):
     def parse_page(self, response):
         items = response.css('script[type="application/json"]::text')
 
+        extract_path = expanduser(join('~','spark_apps','games'))
+
         for i in items:
             to_write = json.loads(i.get())['props']['pageProps']
-            fname = 'games/' + to_write['playByPlay']['gameId'] + '.json'
+            fname = join(extract_path, to_write['playByPlay']['gameId'] + '.json')
             with open(fname, 'w') as fp:
-                json.dump(to_write, fp)
+                json.dump(to_write, fp)
diff --git a/scraper/.ipynb_checkpoints/transform-checkpoint.ipynb b/scraper/.ipynb_checkpoints/transform-checkpoint.ipynb
diff --git a/scraper/Untitled.ipynb b/scraper/Untitled.ipynb
diff --git a/scraper/crawler.py b/scraper/crawler.py
@@ -0,0 +1,33 @@
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from datetime import datetime as dt, timedelta as td
+import json
+from os.path import join, expanduser
+
+def game_dates():
+    """Yield one nba.com games-listing URL per calendar day.
+
+    The first URL is for the day after 1996-10-08 (the start date itself is
+    not emitted) and the last one is for the date returned by ``dt.today()``.
+    """
+    first_day = dt(1996, 10, 8)
+    span = (dt.today() - first_day).days
+
+    # Offsets run 1..span, so the start date is skipped and today is included.
+    for offset in range(1, span + 1):
+        day = first_day + td(offset)
+        yield 'https://www.nba.com/games?date=' + day.strftime('%Y-%m-%d')
+
+class GamesSpider(CrawlSpider):
+    """Crawl nba.com daily game listings and save each game's page data.
+
+    Start URLs come from ``game_dates()``. Links matching the box-score
+    pattern in ``rules`` are followed and handed to ``parse_page``, which
+    writes one JSON file per game, named after its ``gameId``.
+    """
+
+    name = 'pbp-games'
+    allowed_domains = ['nba.com']
+    start_urls = list(game_dates())
+
+    # Scrapy does not read settings from bare class attributes; per-spider
+    # overrides have to go through ``custom_settings``. The attribute is kept
+    # for backward compatibility and remains the single source of the value.
+    REDIRECT_ENABLED = False
+    custom_settings = {'REDIRECT_ENABLED': REDIRECT_ENABLED}
+
+    # Raw string: ``\w`` and ``\d`` are regex escapes, not string escapes
+    # (a non-raw literal triggers a SyntaxWarning on Python 3.12+).
+    rules = [
+        Rule(
+            LinkExtractor(allow=[r'\w+-vs-\w+-\d+/box-score#box-score']),
+            callback='parse_page',
+        ),
+    ]
+
+    def parse_page(self, response):
+        """Dump the ``pageProps`` payload of each embedded JSON script to disk.
+
+        Every ``<script type="application/json">`` block in the response is
+        parsed; the ``props.pageProps`` object is written to
+        ``~/spark_apps/games/<gameId>.json``, where ``gameId`` is read from
+        ``pageProps.playByPlay``. Blocks that are not valid JSON or lack
+        those keys are logged and skipped.
+        """
+        # Local import: the module only imports ``join``/``expanduser``.
+        from os import makedirs
+
+        items = response.css('script[type="application/json"]::text')
+
+        extract_path = expanduser(join('~', 'spark_apps', 'games'))
+        # open() does not create missing parent directories.
+        makedirs(extract_path, exist_ok=True)
+
+        for i in items:
+            try:
+                to_write = json.loads(i.get())['props']['pageProps']
+                game_id = to_write['playByPlay']['gameId']
+            except (ValueError, KeyError, TypeError):
+                # One unusable script block should not abort the rest of
+                # the page, so log it and move on.
+                self.logger.warning(
+                    'No play-by-play payload in JSON script on %s',
+                    response.url,
+                )
+                continue
+
+            fname = join(extract_path, game_id + '.json')
+            with open(fname, 'w') as fp:
+                json.dump(to_write, fp)
diff --git a/scraper/results.log b/scraper/results.log