-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmetrics.py
544 lines (446 loc) · 19.8 KB
/
metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
# -*- coding: utf-8 -*-
"""
This is experimental development to extend the functionality of the automated
bracket code that I've been working on for a few years.
I have written a set of utilities to scrape actual raw ratings data from a few
computer metrics. Currently supported are:
* Kenpom
* ESPN BPI
* Dokter Entropy
The scripts produce csv files with the names
| Metric | CSV File |
| -------------- | ------------------- |
| Kenpom | kenpom_YYYYMMDD.csv |
| Dokter Entropy | dokent_YYYYMMDD.csv |
| ESPN BPI | bpi_YYYYMMDD.csv |
The goal is to begin to incorporate this as a separate class. Eventually all
will be refactored into a coherent project structure. For now, goal is to get
a working automated bracket.
I am essentially starting from the code in bracket_picker.py and editing here.
"""
from urllib.request import urlretrieve
from openpyxl import Workbook, load_workbook
from scrape import download_kenpom, download_dokent, download_bpi, \
download_massey
import numpy as np
import pandas as pd
import csv
import datetime
import warnings
import os
warnings.filterwarnings('ignore')
def rank_calc(x, y) :
"""
Calculates the final ranking for teams.
Inputs
x: Computer ranking
y: Human Ranking
Outputs
If both rankings present,
Rank = (2 * x + y) / 3
If only computer ranking present, returns computer ranking.
If user prefers another formula, it must be in this format
"""
if np.isnan(y) :
return x
else:
return ((3 * x + y) / 4.)
def remove_seed(s):
# this is needed to clean up kenpom names after tourney begins
return (''.join([i for i in s if not i.isdigit()])).strip()
class Bracketeer(object):
"""
Downloads, aggregates data, selects final 68 teams for NCAA
Tournament following a simple average of computer and human
rankings.
"""
def __init__(self, csv_save_path = 'masseyratings.csv'):
self.save_path = csv_save_path
# List for seeding teams
self.seeds = np.array([
1,1,1,1,
2,2,2,2,
3,3,3,3,
4,4,4,4,
5,5,5,5,
6,6,6,6,
7,7,7,7,
8,8,8,8,
9,9,9,9,
10,10,10,10,
11,11,11,11,11,11,
12,12,12,12,
13,13,13,13,
14,14,14,14,
15,15,15,15,
16,16,16,16,16,16
])
self.download_csv()
self.parse_csv()
def download_csv(self) :
"""
Get the composite csv from masseyratings.com
stores in a folder, overwrites existing unless
name specified
"""
composite_csv = 'http://www.masseyratings.com/cb/compare.csv'
urlretrieve(composite_csv, self.save_path)
def parse_csv(self) :
"""
need to parse the csv
first row of interest starts at "team" in column 1
data starts two rows after
"""
data = []
with open(self.save_path, newline='') as csvfile:
reader = csv.reader(csvfile, delimiter = ',')
for row in reader:
data.append(row)
# Input Sanitization
# The csv file wasn't formatted in a way pandas could read.
# There is a header section followed by the data, but the header
# section is variable in length. We can simply iterate through
# until we see the 'Team' string indicating the start of the data
i = 0 # counter of lines until Team found
for row in data:
if row and row[0] == 'Team':
break
i = i + 1
# The header data contains the abbreviations and URLs for the
# various ranking systems in the dataset, might be useful later
self.header_data = data[:i]
team_data = data[i:]
column_names = team_data[0] # Dataset column names
# strip whitespace from the column names for normalization
for i in range(len(column_names)):
column_names[i] = column_names[i].lstrip()
# drop the data set in a pandas Dataframe
self.team_data_df = pd.DataFrame(
team_data[2:],
columns=column_names
)
# remove team name whitespace
self.team_data_df = self.team_data_df.apply(
lambda x: x.str.strip() if x.dtype == "object" else x)
def print_polls(self) :
"""
Prints available polls for convenience
"""
print(self.team_data_df.columns)
def get_conferences(self):
return pd.unique(self.team_data_df['Conf'])
def get_tourney_teams (self, comp_polls = None, rank_calc_func = None,
conf_winners = None, use_metrics = False, human_polls = True) :
"""
Analysis on the full dataset to derive the teams actually in the
tournament
Inputs:
comp_polls: List of strings representing subset of available polls
to pull from for computer rankings
rank_calc_func: Function to use for calculation weighted average
of computer and human polls for final rank calculation. If none,
default is used (weight for computer polls to human polls 3:1)
conf_winners: Dictionary where keys are conferences and values are
teams who have won the autobid for the specified conferences
use_metrics: Boolean. If true, attempts to pull data from csv files
representing the raw rating data for polls in comp_polls. If
false, uses rank data to aggregate
human_polls: Boolean. If true, uses human polls in final
computation. If false, ignores human polls
"""
# idea here: splitting off functionality to be more modular, but I want
# access to these in a few places. May delete some if needed
self.comp_polls = comp_polls # default: None
self.rank_calc_func = rank_calc_func # default: None
self.conf_winners = conf_winners # default: None
self.use_metrics = use_metrics # default: False
self.human_polls = human_polls # default: True
# Use a place holder dataframe for calculated means and ranks
# for all teams
summary_df = self.team_data_df[["Team","Conf"]]
if use_metrics is False:
try:
# returns dataframes with mean in df['mean']
comp_rankings = self.get_comp_rankings()
summary_df["comp_mean"] = comp_rankings['mean']
except Exception as e:
print('Exception occured:')
print('------------------')
raise e
elif use_metrics is True:
try:
comp_ratings = self.get_comp_ratings()
summary_df["comp_mean"] = comp_ratings['mean']
except Exception as e:
print('Exception occured:')
print('------------------')
raise e
else:
print('use_metrics must be True or False')
raise Exception('use_metrics must be True or False. Input was {}'\
.format(use_metrics))
if use_metrics is False and human_polls is True:
# basically right now I have no intention of including human ranks
# and then averaging with ranks from mean computer ratings. May
# change in future
try:
human_rankings = self.get_human_rankings()
summary_df["human_mean"] = human_rankings['mean']
except Exception as e:
print('Exception occured:')
print('------------------')
raise e
else:
# this will essentially make the final rankings calculation
# ignore any human poll inputs. Initializing a human rankings
# 'mean' filled with nans
human_rankings = pd.DataFrame(
data = {
'mean':np.full([self.team_data_df.shape[0],],np.nan)
},
columns = ['mean']
)
summary_df["human_mean"] = human_rankings['mean']
# Calculate the final rank using either the user-defined algorithm
# or the default
if rank_calc_func is None :
rank_calc_func = rank_calc
summary_df["final_rank"] = np.vectorize(rank_calc_func)(\
summary_df["comp_mean"], summary_df["human_mean"])
# Sort by calculated rank
if use_metrics is True:
summary_df.sort_values(by=['final_rank'],inplace=True,ascending=False)
else:
summary_df.sort_values(by=['final_rank'],inplace=True)
# ----
# Tourney rules dictate the winners of the conferences all have auto
# bids into the tourney. There are 32 of these. The remaining 36
# are chosen at large. We simply take the highest ranked team in each
# conference as an auto bid. For the final bracket, we accept
# a list of conference winners
# ----
# now that final rank is calculated, need to find auto-bids and
# at large
auto_bid_confs = pd.unique(summary_df['Conf'])
# Using groupby to grab the auto bids
auto_bid_teams = summary_df.groupby(['Conf']).head(1)
if conf_winners is not None:
auto_bid_teams = self._replace_auto_bid(conf_winners,auto_bid_teams)
else:
auto_bid_teams = auto_bid_teams['Team'].values
# slated for removal below:
# else :
# auto_bid_confs = pd.unique(summary_df['Conf'])
# # Using groupby to grab the auto bids
# auto_bid_teams = summary_df.groupby(['Conf']).head(1)['Team'].values
# and we can use ~isin now to get at larges
at_large_teams = summary_df[~summary_df['Team'].isin(auto_bid_teams)].head(36)['Team'].values
# all 68 teams in one array
all_68 = np.append(auto_bid_teams,at_large_teams)
self.auto_bid_teams = auto_bid_teams
self.at_large_teams = at_large_teams
self.all_68 = all_68
# First four next four
self._ffnf = summary_df[~summary_df['Team'].isin(auto_bid_teams)].iloc[36:44]['Team'].values
self.final_68 = summary_df[summary_df['Team'].isin(all_68)]
self.final_68["seed"] = self.seeds
# return the dataframe to the user
# return self.final_68
def get_comp_rankings(self):
"""
return computer rankings dataframe
"""
# Drop columns that are unnecessary
cols_to_drop = ["WL","Rank","Mean","Trimmed","Median","StDev","AP",
"USA"]
# create computer ranking dataframe
comp_rankings = self.team_data_df.drop(cols_to_drop,axis=1)
# If user provides list of specific computer polls to use, subset here
# The abbreviations will need to be used. If one desired poll isn't
# available, then it will return a KeyError. In this case, let user
# know, then break out after printing all available polls.
if self.comp_polls is not None :
try:
comp_rankings = comp_rankings[self.comp_polls]
except KeyError as e:
print('One or more of the polls you tried isn\'t available\n')
print(comp_rankings.columns)
raise e
# Convert computer rankings dataframe columns to numeric where
# appropriate
comp_rankings = comp_rankings.apply(\
lambda x: pd.to_numeric(x, errors='ignore'))
# compute arithmetic mean of computer rankings
comp_rankings['mean'] = comp_rankings.mean(axis=1,numeric_only=True)
return comp_rankings
def get_human_rankings(self):
"""
return human rankings from dataframe
"""
# create human ranking dataframe
try:
human_rankings = self.team_data_df[['Team','Conf','AP','USA']]
except KeyError as e:
print('Human rankings are unavailable. Change human_polls to False')
print('if desired')
raise e
# remove whitespace around strings in these polls
human_rankings['AP'] = human_rankings['AP'].str.strip()
human_rankings['USA'] = human_rankings['USA'].str.strip()
# convert rankings to numeric
human_rankings = human_rankings.apply(\
lambda x: pd.to_numeric(x, errors='ignore'))
human_rankings['mean'] = human_rankings.mean(
axis = 1, numeric_only = True, skipna = True)
return human_rankings
def get_comp_ratings(self):
"""
return raw rating data instead of rankings for specified columns
right now only three available, so will basically ignore list of inputs
for now
"""
# Download latest data. This includes some waiting to be respectful to
# servers, so display message to let user know it will take a second
# all files downloaded to csv_files/
# for now, to save time re-running the code, including logic to only
# download if file is not there.
print('Downloading Ratings Data')
if not os.path.isfile('csv_files/bpi.csv'):
download_bpi()
if not os.path.isfile('csv_files/dokent.csv'):
download_dokent()
if not os.path.isfile('csv_files/kenpom.csv'):
download_kenpom()
if not os.path.isfile('csv_files/massey.csv'):
download_massey()
print('Downloading Finished!')
kenpom_df = pd.read_csv('csv_files/kenpom.csv',index_col=False)
bpi_df = pd.read_csv('csv_files/bpi.csv',index_col=False)
dokent_df = pd.read_csv('csv_files/dokent.csv',index_col=False)
massey_df = pd.read_csv('csv_files/massey.csv',index_col=False)
# remove seed from kenpom
kenpom_df['Team'] = kenpom_df['Team'].map(remove_seed)
# one big issue is that the names don't match. Ken Massey uses a
# database of keys and values to massage the names into a consistent
# format. I've implemented this similarly, using distinct csv files
# for every translation. I'm using massey names as my standard, and
# converting the rest accordingly. The keys will be stored in
# csv_files for now, and conversion will happen here. This all may be
# moved to makefiles or something down the road
massey_to_bpi = pd.read_csv('csv_files/names_massey-bpi.csv')
massey_to_dokter = pd.read_csv('csv_files/names_massey-dokter.csv')
massey_to_kenpom = pd.read_csv('csv_files/names_massey-kenpom.csv')
for row in zip(massey_to_kenpom['Massey'].values,
massey_to_kenpom['Kenpom'].values):
massey_team = row[0]
kenpom_team = row[1]
kenpom_df.loc[kenpom_df['Team'].str.match("^%s$"%kenpom_team), \
'Team'] = massey_team
for row in zip(massey_to_bpi['Massey'].values,
massey_to_bpi['BPI'].values):
massey_team = row[0]
bpi_team = row[1]
bpi_df.loc[bpi_df['Team'].str.match("^%s$"%bpi_team), \
'Team'] = massey_team
for row in zip(massey_to_dokter['Massey'].values,
massey_to_dokter['Dokter'].values):
massey_team = row[0]
dokent_team = row[1]
dokent_df.loc[dokent_df['Team'].str.match("^%s$"%dokent_team), \
'Team'] = massey_team
# drop unnecessary columns
dokent_df.drop(columns=['Unnamed: 0','Rk'],inplace=True)
bpi_df.drop(columns=['Unnamed: 0','Rk','Conf','W-L'],inplace=True)
kenpom_df.drop(columns=['Unnamed: 0','Rk','W-L'],inplace=True)
massey_df.drop(columns=['Unnamed: 0','W-L'],inplace=True)
# print(self.team_data_df.columns)
# print(dokent_df.columns)
# print(kenpom_df.columns)
# print(bpi_df.columns)
comp_ratings = self.team_data_df.merge(
right=dokent_df,
on='Team',
how='inner',
suffixes=('','_dokter'),
validate='one_to_one'
)
comp_ratings = comp_ratings.merge(
right=kenpom_df,
on='Team',
how='inner',
suffixes=('','_kenpom'),
validate='one_to_one'
)
comp_ratings = comp_ratings.merge(
right=bpi_df,
on='Team',
how='inner',
suffixes=('','_bpi'),
validate='one_to_one'
)
comp_ratings = comp_ratings.merge(
right=massey_df,
on='Team',
how='inner',
suffixes=('','_massey'),
validate='one_to_one'
)
# now standardize the summary rankings for each of the above. This is
# acceptable I think, because the generating functions in each case
# are designed to generate normal data, though the mean and variance
# are all different. I standardize to mean 0 and variance 1, then take
# the mean.
comp_ratings['BPI'] = (comp_ratings['BPI'] - \
comp_ratings['BPI'].mean()) / comp_ratings['BPI'].std()
comp_ratings['AdjEM'] = (comp_ratings['AdjEM'] - \
comp_ratings['AdjEM'].mean()) / comp_ratings['AdjEM'].std()
comp_ratings['power'] = (comp_ratings['power'] - \
comp_ratings['power'].mean()) / comp_ratings['power'].std()
comp_ratings['Rat'] = (comp_ratings['Rat'] - \
comp_ratings['Rat'].mean()) / comp_ratings['Rat'].std()
comp_ratings['mean'] = \
comp_ratings[['BPI','AdjEM','power','Rat']].mean(axis=1)
return comp_ratings
def _replace_auto_bid(self,conf_winners,auto_bid_teams):
"""
Replaces auto bid assumptions with actual winners. Winners expected
to be in dict where keys are conference names that are consistent
with internal names, while auto_bid_teams is a dataframe where
the top ranked team in each conference is present. Returns a list
of updated auto bid teams.
"""
try:
for conf, team in conf_winners.items():
if team is not None:
auto_bid_teams.loc[auto_bid_teams['Conf']==conf, 'Team'] = team
except Exception as e:
print('it failed')
raise(e)
return auto_bid_teams['Team'].values
def fill_bracket(self) :
# Need to tell Excel which cells get which team. Because we're
# using 'snake' method, there are specific cells corresponding
# to each seed and rank. Easiest way to do this is to simply
# hard code the table in, for now
excel_placements = [
'C7', 'R7', 'R39', 'C39', 'C67', 'R67', 'R35', 'C35',
'C27', 'R27', 'R59', 'C59', 'C51', 'R51', 'R19', 'C19',
'C15', 'R15', 'R47', 'C47', 'C55', 'R55', 'R23', 'C23',
'C31', 'R31', 'R63', 'C63', 'C43', 'R43', 'R11', 'C11',
'C13', 'R13', 'R45', 'C45', 'C65', 'R65', 'R33', 'C33',
'C25', 'R25', 'Q71', 'Q73', 'D71', 'D73', # 11 seeds
'C49', 'R49', 'R17', 'C17', 'C21', 'R21', 'R53', 'C53',
'C61', 'R61', 'R29', 'C29', 'C37', 'R37', 'R69', 'C69',
'C41', 'R41', 'O71', 'O73', 'F71', 'F73'
]
# append that to our final_68 dataframe
self.final_68['excel'] = excel_placements
# input to the workbook. There is one set up called 'bracket.xlsx'
wb = Workbook()
ws = wb.active
for index, row in self.final_68.iterrows():
ws[row['excel']] = row['Team']
# get today's date, save bracket
today = str(datetime.date.today())
save_file = 'bracket' + today + '.xlsx'
wb.save(save_file)