
MERGE latest changes on master with calibration
michaelchughes committed Aug 7, 2018
2 parents 9caec31 + d89ea1f commit 8a324c7
Showing 24 changed files with 853 additions and 53 deletions.
29 changes: 28 additions & 1 deletion pc_toolbox/__init__.py
@@ -1,6 +1,33 @@
import os

import utils_io
import utils_data
import utils_snapshots

import model_slda
calc_nef_map_pi_DK = model_slda.calc_nef_map_pi_DK

# TODO discard this line
# calc_nef_map_pi_DK = model_slda.calc_nef_map_pi_DK

PC_REPO_DIR = os.path.sep.join(
    os.path.abspath(__file__).split(os.path.sep)[:-2])

## Create version attrib
__version__ = None
version_txt_path = os.path.join(PC_REPO_DIR, 'version.txt')
if os.path.exists(version_txt_path):
    with open(version_txt_path, 'r') as f:
        __version__ = f.readline().strip()

## Create requirements attrib
__requirements__ = None
reqs_txt_path = os.path.join(PC_REPO_DIR, 'requirements.txt')
if os.path.exists(reqs_txt_path):
    with open(reqs_txt_path, 'r') as f:
        __requirements__ = []
        for line in f.readlines():
            __requirements__.append(line.strip())




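For context, the attributes added to pc_toolbox/__init__.py above are read straight off disk at import time. A minimal sketch of how they surface to callers (not part of this commit; it assumes the package is importable and that version.txt and requirements.txt exist at the repository root):

import pc_toolbox

print(pc_toolbox.PC_REPO_DIR)       # absolute path to the repository root
print(pc_toolbox.__version__)       # first line of version.txt, or None if the file is missing
print(pc_toolbox.__requirements__)  # list of requirement strings, or None if the file is missing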
@@ -56,9 +56,14 @@
    load_list_of_strings_from_txt,
    load_list_of_unicode_from_txt,
)
import matplotlib.pyplot as plt

from calc_roc_auc_via_bootstrap import calc_binary_clf_metric_with_ci_via_bootstrap

from utils_calibration import (
    calc_binary_clf_calibration_per_bin,
    plot_binary_clf_calibration_curve_and_histograms)

def read_args_from_stdin_and_run():
    ''' Main executable function to train and evaluate classifier.
@@ -398,6 +403,20 @@ def read_args_from_stdin_and_run():
        elapsed_time = time.time() - start_time
        pprint('[run_classifier says:] target %s completed after %.2f sec' % (target_names[c], elapsed_time))


def calc_calibration_info(clf, x, y, bins=5):
    assert len(clf.classes_) == 2
    assert clf.classes_[0] == 0
    assert clf.classes_[1] == 1
    y_proba = clf.predict_proba(x)
    if y_proba.ndim > 1:
        assert y_proba.shape[1] == 2
        y_proba = y_proba[:, 1]
    info_per_bin = calc_binary_clf_calibration_per_bin(
        y, y_proba,
        bins=bins)
    return info_per_bin

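# Not part of this commit: a minimal usage sketch for calc_calibration_info,
# assuming a fitted scikit-learn binary classifier and held-out arrays x_va, y_va
# with labels in {0, 1}. The helper name _demo_calibration_info is hypothetical.
def _demo_calibration_info(x_tr, y_tr, x_va, y_va):
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression().fit(x_tr, y_tr)
    info_per_bin = calc_calibration_info(clf, x_va, y_va, bins=10)
    # fracTP_per_bin holds the observed fraction of positives per
    # predicted-probability bin (nan wherever a bin is empty).
    return info_per_bin['fracTP_per_bin']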
def calcfrac(bmask):
    return np.sum(bmask) / float(bmask.size)

@@ -525,26 +544,44 @@ def calc_f1_score(clf, x, y):
    assert yhat.ndim == 1
    return f1_score(y, yhat, pos_label=clf.classes_[1])

def make_clf_report(clf, x, y, header=''):
    r_str = header
    r_str += make_confusion_matrix_report(clf, x, y)
    r_str += u"acc %.4f\n" % calc_accuracy_score(clf, x, y)
    r_str += u" f1 %.4f\n" % calc_f1_score(clf, x, y)
    r_str += u"auc %.4f\n" % calc_auc_score(clf, x, y)
    r_str += make_calibration_report(clf, x, y)
    return r_str
def make_confusion_matrix_report(clf, x, y):
    assert len(clf.classes_) == 2
    assert clf.classes_[0] == 0
    assert clf.classes_[1] == 1
    y_pred = clf.predict(x)
    cm = sk_confusion_matrix(y, y_pred)
    cm = pd.DataFrame(data=cm, columns=[0, 1], index=[0, 1])
    cm.columns.name = 'Predicted label'
    cm.index.name = 'True label'
    return "\n%s\n" % unicode(cm)

def make_clf_report(clf, x, y, header=''):
    r_str = header
    r_str += make_confusion_matrix_report(clf, x, y)
    r_str += u"acc %.4f\n" % calc_accuracy_score(clf, x, y)
    r_str += u" f1 %.4f\n" % calc_f1_score(clf, x, y)
    r_str += u"auc %.4f\n" % calc_auc_score(clf, x, y)

def make_calibration_report(clf, x, y, bins=5):
    """ Make plain-text report on clf calibration performance
    """
    info_per_bin = calc_calibration_info(
        clf, x, y, bins=bins)
    bin_edges = info_per_bin['bin_edges']
    r_str = "\nCalibration"
    for bb in range(bin_edges.size - 1):
        r_str += "\nproba bin [%.2f, %.2f] count %5d fracTP %.3f" % (
            bin_edges[bb],
            bin_edges[bb+1],
            info_per_bin['count_per_bin'][bb],
            info_per_bin['fracTP_per_bin'][bb],
            )
    return r_str


def make_csv_row_dict(clf, x, y, y_col_name, split_name, classifier_name):
    keepers = np.isfinite(y)
    x = x[keepers]
@@ -953,17 +990,37 @@ def train_and_eval_clf_with_best_params_via_grid_search(
        csv_fpath = os.path.join(
            output_path,
            'clf_%d_callback_%s.csv' % (y_orig_col_id, split))

        x_cursplit, y_cursplit = make_nonnan_xy_for_target(
            datasets_by_split[split],
            y_col_id=y_col_id)
        row_dict = make_csv_row_dict(
            best_clf,
            datasets_by_split[split]['x'],
            datasets_by_split[split]['y'][:, y_col_id],
            x_cursplit,
            y_cursplit,
            y_col_name,
            split,
            classifier_name)
        csv_df = pd.DataFrame([row_dict], columns=row_dict.keys())
        csv_df.to_csv(
            csv_fpath,
            index=False)

        if hasattr(best_clf, 'predict_proba'):
            for nbins in [6, 10, 20]:
                fig_fpath = os.path.join(
                    output_path,
                    'clf_%d_calibration_%02dbin_%s.pdf' % (
                        y_orig_col_id, nbins, split))

                info_per_bin = calc_calibration_info(
                    best_clf, x_cursplit, y_cursplit, bins=nbins)
                plot_binary_clf_calibration_curve_and_histograms(
                    info_per_bin=info_per_bin)
                plt.savefig(
                    fig_fpath,
                    bbox_inches='tight',
                    pad_inches=0)
        if verbose:
            elapsed_time = time.time() - start_time
            pprint("eval %d/%d on %5s split done after %11.2f sec" % (
120 changes: 120 additions & 0 deletions pc_toolbox/binary_classifiers/utils_calibration.py
@@ -0,0 +1,120 @@
import numpy as np
from scipy.special import expit
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt

def plot_binary_clf_calibration_curve_and_histograms(
        info_per_bin=None,
        fig_kws=dict(
            figsize=(1.4*3, 1.4*4),
            tight_layout=True),
        ):
    fig_h = plt.figure(**fig_kws)
    ax_grid = gridspec.GridSpec(
        nrows=4, ncols=1,
        height_ratios=[1, 1, 4, 0.1],
        )
    ax_cal = fig_h.add_subplot(ax_grid[2,0])
    ax_TP = fig_h.add_subplot(ax_grid[0,0])
    ax_TN = fig_h.add_subplot(ax_grid[1,0])

    # Plot calibration curve
    # First, lay down idealized line from 0-1
    unit_grid = np.linspace(0, 1, 10)
    ax_cal.plot(
        unit_grid, unit_grid, 'k--', alpha=0.5)
    # Then, plot actual-vs-expected fractions on top
    ax_cal.plot(
        info_per_bin['xcenter_per_bin'],
        info_per_bin['fracTP_per_bin'],
        'ks-')
    ax_cal.set_ylabel('frac. true positive')
    ax_cal.set_xlabel('predicted proba.')

    # Plot TP histogram
    ax_TP.bar(
        info_per_bin['xcenter_per_bin'],
        info_per_bin['countTP_per_bin'],
        width=0.9*info_per_bin['xwidth_per_bin'],
        color='b')

    # Plot TN histogram
    ax_TN.bar(
        info_per_bin['xcenter_per_bin'],
        info_per_bin['countTN_per_bin'],
        width=0.9*info_per_bin['xwidth_per_bin'],
        color='r')
    for ax in [ax_cal, ax_TP, ax_TN]:
        ax.set_xlim([0, 1])
    ax_cal.set_ylim([0, 1])

def calc_binary_clf_calibration_per_bin(
        y_true, y_prob,
        bins=10):
    """ Compute per-bin calibration summaries for binary predicted probabilities.
    """
    if y_prob.min() < 0 or y_prob.max() > 1:
        raise ValueError("y_prob has values outside [0, 1]")

    bins = np.asarray(bins)
    if bins.ndim == 1 and bins.size > 1:
        bin_edges = bins
    else:
        bin_edges = np.linspace(0, 1, int(bins) + 1)
    if bin_edges[-1] == 1.0:
        bin_edges[-1] += 1e-8
    assert bin_edges.ndim == 1
    assert bin_edges.size > 2
    nbins = bin_edges.size - 1
    # Assign each predicted probability into one bin
    # from 0, 1, ... nbins
    binids = np.digitize(y_prob, bin_edges) - 1
    assert binids.max() <= nbins
    assert binids.min() >= 0

    count_per_bin = np.bincount(binids, minlength=nbins)
    countTP_per_bin = np.bincount(binids, minlength=nbins, weights=y_true == 1)
    countTN_per_bin = np.bincount(binids, minlength=nbins, weights=y_true == 0)

    # This divide will (and should) yield nan
    # if any bin has no content
    fracTP_per_bin = countTP_per_bin / np.asarray(count_per_bin, dtype=np.float64)

    info_per_bin = dict(
        count_per_bin=count_per_bin,
        countTP_per_bin=countTP_per_bin,
        countTN_per_bin=countTN_per_bin,
        fracTP_per_bin=fracTP_per_bin,
        xcenter_per_bin=0.5 * (bin_edges[:-1] + bin_edges[1:]),
        xwidth_per_bin=(bin_edges[1:] - bin_edges[:-1]),
        bin_edges=bin_edges,
        )
    return info_per_bin


if __name__ == '__main__':
    prng = np.random.RandomState(0)
    thr_true = prng.rand(100000)
    u_true = 0.65 * prng.randn(100000)
    y_true = np.asarray(expit(u_true) >= thr_true, dtype=np.float32)
    y_prob = expit(u_true)

    bins = 20

    info_per_bin = calc_binary_clf_calibration_per_bin(
        y_true=y_true,
        y_prob=y_prob,
        bins=bins)
    bin_edges = info_per_bin['bin_edges']
    for bb in range(bin_edges.size - 1):
        print "bin [%.2f, %.2f] count %5d fracTP %.3f" % (
            bin_edges[bb],
            bin_edges[bb+1],
            info_per_bin['count_per_bin'][bb],
            info_per_bin['fracTP_per_bin'][bb],
            )

    plot_binary_clf_calibration_curve_and_histograms(
        info_per_bin=info_per_bin)

    plt.show()
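The info_per_bin dictionary built above carries enough to collapse calibration into a single summary number. A sketch of an expected-calibration-error style summary (not part of this module; it uses each bin's center as the confidence proxy, since the mean predicted probability per bin is not stored):

import numpy as np

def calc_expected_calibration_error(info_per_bin):
    # Count-weighted average gap between the observed fraction of positives
    # and the bin center, skipping empty bins (whose fracTP entry is nan).
    count = np.asarray(info_per_bin['count_per_bin'], dtype=np.float64)
    gap = np.abs(info_per_bin['fracTP_per_bin'] - info_per_bin['xcenter_per_bin'])
    keep = count > 0
    return np.sum(count[keep] * gap[keep]) / np.sum(count)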
5 changes: 5 additions & 0 deletions pc_toolbox/model_slda/__init__.py
@@ -9,6 +9,11 @@
    calc_nef_map_pi_DK,
)

from est_local_params__vb_qpiDir_qzCat import (
    calc_elbo_for_many_docs,
)


import slda_utils__dataset_manager
import slda_utils__param_io_manager
save_topic_model_param_dict = slda_utils__param_io_manager.save_topic_model_param_dict
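The import added above re-exports calc_elbo_for_many_docs at the model_slda package level. A minimal sketch of the resulting access path (not part of this commit; the function's signature lives in est_local_params__vb_qpiDir_qzCat and is not shown here):

from pc_toolbox import model_slda

elbo_fn = model_slda.calc_elbo_for_many_docs  # reachable after this commit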
@@ -0,0 +1,6 @@
from calc_elbo_for_many_docs__vb_qpiDir_qzCat import (
    calc_elbo_for_many_docs)

from calc_N_d_K__vb_qpiDir_qzCat import (
    calc_N_d_K__vb_coord_ascent__many_tries,
    calc_N_d_K__vb_coord_ascent)
