
MERGE latest changes on master with calibration
michaelchughes committed Aug 7, 2018
2 parents 9caec31 + d89ea1f commit 8a324c7
Showing 24 changed files with 853 additions and 53 deletions.
29 changes: 28 additions & 1 deletion pc_toolbox/__init__.py
@@ -1,6 +1,33 @@
import os

import utils_io
import utils_data
import utils_snapshots

import model_slda
calc_nef_map_pi_DK = model_slda.calc_nef_map_pi_DK

# TODO discard this line
# calc_nef_map_pi_DK = model_slda.calc_nef_map_pi_DK

PC_REPO_DIR = os.path.sep.join(
    os.path.abspath(__file__).split(os.path.sep)[:-2])

## Create version attrib
__version__ = None
version_txt_path = os.path.join(PC_REPO_DIR, 'version.txt')
if os.path.exists(version_txt_path):
    with open(version_txt_path, 'r') as f:
        __version__ = f.readline().strip()

## Create requirements attrib
__requirements__ = None
reqs_txt_path = os.path.join(PC_REPO_DIR, 'requirements.txt')
if os.path.exists(reqs_txt_path):
    with open(reqs_txt_path, 'r') as f:
        __requirements__ = []
        for line in f.readlines():
            __requirements__.append(line.strip())




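For context, the attributes added to pc_toolbox/__init__.py above are read straight off disk at import time. A minimal sketch of how they surface to callers (not part of this commit; it assumes the package is importable and that version.txt and requirements.txt exist at the repository root):

import pc_toolbox

print(pc_toolbox.PC_REPO_DIR)       # absolute path to the repository root
print(pc_toolbox.__version__)       # first line of version.txt, or None if the file is missing
print(pc_toolbox.__requirements__)  # list of requirement strings, or None if the file is missing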
@@ -56,9 +56,14 @@
    load_list_of_strings_from_txt,
    load_list_of_unicode_from_txt,
)
import matplotlib.pyplot as plt

from calc_roc_auc_via_bootstrap import calc_binary_clf_metric_with_ci_via_bootstrap

from utils_calibration import (
    calc_binary_clf_calibration_per_bin,
    plot_binary_clf_calibration_curve_and_histograms)

def read_args_from_stdin_and_run():
    ''' Main executable function to train and evaluate classifier.
@@ -398,6 +403,20 @@ def read_args_from_stdin_and_run():
        elapsed_time = time.time() - start_time
        pprint('[run_classifier says:] target %s completed after %.2f sec' % (target_names[c], elapsed_time))


def calc_calibration_info(clf, x, y, bins=5):
    assert len(clf.classes_) == 2
    assert clf.classes_[0] == 0
    assert clf.classes_[1] == 1
    y_proba = clf.predict_proba(x)
    if y_proba.ndim > 1:
        assert y_proba.shape[1] == 2
        y_proba = y_proba[:, 1]
    info_per_bin = calc_binary_clf_calibration_per_bin(
        y, y_proba,
        bins=bins)
    return info_per_bin

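# Not part of this commit: a minimal usage sketch for calc_calibration_info,
# assuming a fitted scikit-learn binary classifier and held-out arrays x_va, y_va
# with labels in {0, 1}. The helper name _demo_calibration_info is hypothetical.
def _demo_calibration_info(x_tr, y_tr, x_va, y_va):
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression().fit(x_tr, y_tr)
    info_per_bin = calc_calibration_info(clf, x_va, y_va, bins=10)
    # fracTP_per_bin holds the observed fraction of positives per
    # predicted-probability bin (nan wherever a bin is empty).
    return info_per_bin['fracTP_per_bin']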
def calcfrac(bmask):
    return np.sum(bmask) / float(bmask.size)

@@ -525,26 +544,44 @@ def calc_f1_score(clf, x, y):
    assert yhat.ndim == 1
    return f1_score(y, yhat, pos_label=clf.classes_[1])

def make_clf_report(clf, x, y, header=''):
    r_str = header
    r_str += make_confusion_matrix_report(clf, x, y)
    r_str += u"acc %.4f\n" % calc_accuracy_score(clf, x, y)
    r_str += u" f1 %.4f\n" % calc_f1_score(clf, x, y)
    r_str += u"auc %.4f\n" % calc_auc_score(clf, x, y)
    r_str += make_calibration_report(clf, x, y)
    return r_str
def make_confusion_matrix_report(clf, x, y):
    assert len(clf.classes_) == 2
    assert clf.classes_[0] == 0
    assert clf.classes_[1] == 1
    y_pred = clf.predict(x)
    cm = sk_confusion_matrix(y, y_pred)
    cm = pd.DataFrame(data=cm, columns=[0, 1], index=[0, 1])
    cm.columns.name = 'Predicted label'
    cm.index.name = 'True label'
    return "\n%s\n" % unicode(cm)

def make_clf_report(clf, x, y, header=''):
    r_str = header
    r_str += make_confusion_matrix_report(clf, x, y)
    r_str += u"acc %.4f\n" % calc_accuracy_score(clf, x, y)
    r_str += u" f1 %.4f\n" % calc_f1_score(clf, x, y)
    r_str += u"auc %.4f\n" % calc_auc_score(clf, x, y)

def make_calibration_report(clf, x, y, bins=5):
    """ Make plain-text report on clf calibration performance
    """
    info_per_bin = calc_calibration_info(
        clf, x, y, bins=bins)
    bin_edges = info_per_bin['bin_edges']
    r_str = "\nCalibration"
    for bb in range(bin_edges.size - 1):
        r_str += "\nproba bin [%.2f, %.2f] count %5d fracTP %.3f" % (
            bin_edges[bb],
            bin_edges[bb+1],
            info_per_bin['count_per_bin'][bb],
            info_per_bin['fracTP_per_bin'][bb],
            )
    return r_str


def make_csv_row_dict(clf, x, y, y_col_name, split_name, classifier_name):
    keepers = np.isfinite(y)
    x = x[keepers]
@@ -953,17 +990,37 @@ def train_and_eval_clf_with_best_params_via_grid_search(
        csv_fpath = os.path.join(
            output_path,
            'clf_%d_callback_%s.csv' % (y_orig_col_id, split))

        x_cursplit, y_cursplit = make_nonnan_xy_for_target(
            datasets_by_split[split],
            y_col_id=y_col_id)
        row_dict = make_csv_row_dict(
            best_clf,
            datasets_by_split[split]['x'],
            datasets_by_split[split]['y'][:, y_col_id],
            x_cursplit,
            y_cursplit,
            y_col_name,
            split,
            classifier_name)
        csv_df = pd.DataFrame([row_dict], columns=row_dict.keys())
        csv_df.to_csv(
            csv_fpath,
            index=False)

        if hasattr(best_clf, 'predict_proba'):
            for nbins in [6, 10, 20]:
                fig_fpath = os.path.join(
                    output_path,
                    'clf_%d_calibration_%02dbin_%s.pdf' % (
                        y_orig_col_id, nbins, split))

                info_per_bin = calc_calibration_info(
                    best_clf, x_cursplit, y_cursplit, bins=nbins)
                plot_binary_clf_calibration_curve_and_histograms(
                    info_per_bin=info_per_bin)
                plt.savefig(
                    fig_fpath,
                    bbox_inches='tight',
                    pad_inches=0)
        if verbose:
            elapsed_time = time.time() - start_time
            pprint("eval %d/%d on %5s split done after %11.2f sec" % (
120 changes: 120 additions & 0 deletions pc_toolbox/binary_classifiers/utils_calibration.py
@@ -0,0 +1,120 @@
import numpy as np
from scipy.special import expit
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt

def plot_binary_clf_calibration_curve_and_histograms(
        info_per_bin=None,
        fig_kws=dict(
            figsize=(1.4*3, 1.4*4),
            tight_layout=True),
        ):
    fig_h = plt.figure(**fig_kws)
    ax_grid = gridspec.GridSpec(
        nrows=4, ncols=1,
        height_ratios=[1, 1, 4, 0.1],
        )
    ax_cal = fig_h.add_subplot(ax_grid[2,0])
    ax_TP = fig_h.add_subplot(ax_grid[0,0])
    ax_TN = fig_h.add_subplot(ax_grid[1,0])

    # Plot calibration curve
    # First, lay down idealized line from 0-1
    unit_grid = np.linspace(0, 1, 10)
    ax_cal.plot(
        unit_grid, unit_grid, 'k--', alpha=0.5)
    # Then, plot actual-vs-expected fractions on top
    ax_cal.plot(
        info_per_bin['xcenter_per_bin'],
        info_per_bin['fracTP_per_bin'],
        'ks-')
    ax_cal.set_ylabel('frac. true positive')
    ax_cal.set_xlabel('predicted proba.')

    # Plot TP histogram
    ax_TP.bar(
        info_per_bin['xcenter_per_bin'],
        info_per_bin['countTP_per_bin'],
        width=0.9*info_per_bin['xwidth_per_bin'],
        color='b')

    # Plot TN histogram
    ax_TN.bar(
        info_per_bin['xcenter_per_bin'],
        info_per_bin['countTN_per_bin'],
        width=0.9*info_per_bin['xwidth_per_bin'],
        color='r')
    for ax in [ax_cal, ax_TP, ax_TN]:
        ax.set_xlim([0, 1])
    ax_cal.set_ylim([0, 1])

def calc_binary_clf_calibration_per_bin(
        y_true, y_prob,
        bins=10):
    """ Compute per-bin calibration summaries for binary predicted probabilities.
    """
    if y_prob.min() < 0 or y_prob.max() > 1:
        raise ValueError("y_prob has values outside [0, 1]")

    bins = np.asarray(bins)
    if bins.ndim == 1 and bins.size > 1:
        bin_edges = bins
    else:
        bin_edges = np.linspace(0, 1, int(bins) + 1)
    if bin_edges[-1] == 1.0:
        bin_edges[-1] += 1e-8
    assert bin_edges.ndim == 1
    assert bin_edges.size > 2
    nbins = bin_edges.size - 1
    # Assign each predicted probability into one bin
    # from 0, 1, ... nbins
    binids = np.digitize(y_prob, bin_edges) - 1
    assert binids.max() <= nbins
    assert binids.min() >= 0

    count_per_bin = np.bincount(binids, minlength=nbins)
    countTP_per_bin = np.bincount(binids, minlength=nbins, weights=y_true == 1)
    countTN_per_bin = np.bincount(binids, minlength=nbins, weights=y_true == 0)

    # This divide will (and should) yield nan
    # if any bin has no content
    fracTP_per_bin = countTP_per_bin / np.asarray(count_per_bin, dtype=np.float64)

    info_per_bin = dict(
        count_per_bin=count_per_bin,
        countTP_per_bin=countTP_per_bin,
        countTN_per_bin=countTN_per_bin,
        fracTP_per_bin=fracTP_per_bin,
        xcenter_per_bin=0.5 * (bin_edges[:-1] + bin_edges[1:]),
        xwidth_per_bin=(bin_edges[1:] - bin_edges[:-1]),
        bin_edges=bin_edges,
        )
    return info_per_bin


if __name__ == '__main__':
    prng = np.random.RandomState(0)
    thr_true = prng.rand(100000)
    u_true = 0.65 * prng.randn(100000)
    y_true = np.asarray(expit(u_true) >= thr_true, dtype=np.float32)
    y_prob = expit(u_true)

    bins = 20

    info_per_bin = calc_binary_clf_calibration_per_bin(
        y_true=y_true,
        y_prob=y_prob,
        bins=bins)
    bin_edges = info_per_bin['bin_edges']
    for bb in range(bin_edges.size - 1):
        print "bin [%.2f, %.2f] count %5d fracTP %.3f" % (
            bin_edges[bb],
            bin_edges[bb+1],
            info_per_bin['count_per_bin'][bb],
            info_per_bin['fracTP_per_bin'][bb],
            )

    plot_binary_clf_calibration_curve_and_histograms(
        info_per_bin=info_per_bin)

    plt.show()
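The info_per_bin dictionary built above carries enough to collapse calibration into a single summary number. A sketch of an expected-calibration-error style summary (not part of this module; it uses each bin's center as the confidence proxy, since the mean predicted probability per bin is not stored):

import numpy as np

def calc_expected_calibration_error(info_per_bin):
    # Count-weighted average gap between the observed fraction of positives
    # and the bin center, skipping empty bins (whose fracTP entry is nan).
    count = np.asarray(info_per_bin['count_per_bin'], dtype=np.float64)
    gap = np.abs(info_per_bin['fracTP_per_bin'] - info_per_bin['xcenter_per_bin'])
    keep = count > 0
    return np.sum(count[keep] * gap[keep]) / np.sum(count)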
5 changes: 5 additions & 0 deletions pc_toolbox/model_slda/__init__.py
@@ -9,6 +9,11 @@
    calc_nef_map_pi_DK,
)

from est_local_params__vb_qpiDir_qzCat import (
    calc_elbo_for_many_docs,
)


import slda_utils__dataset_manager
import slda_utils__param_io_manager
save_topic_model_param_dict = slda_utils__param_io_manager.save_topic_model_param_dict
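The import added above re-exports calc_elbo_for_many_docs at the model_slda package level. A minimal sketch of the resulting access path (not part of this commit; the function's signature lives in est_local_params__vb_qpiDir_qzCat and is not shown here):

from pc_toolbox import model_slda

elbo_fn = model_slda.calc_elbo_for_many_docs  # reachable after this commit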
@@ -0,0 +1,6 @@
from calc_elbo_for_many_docs__vb_qpiDir_qzCat import (
    calc_elbo_for_many_docs)

from calc_N_d_K__vb_qpiDir_qzCat import (
    calc_N_d_K__vb_coord_ascent__many_tries,
    calc_N_d_K__vb_coord_ascent)
