diff --git a/pc_toolbox/__init__.py b/pc_toolbox/__init__.py index 4901758..faf2f98 100644 --- a/pc_toolbox/__init__.py +++ b/pc_toolbox/__init__.py @@ -1,6 +1,33 @@ +import os + import utils_io import utils_data import utils_snapshots import model_slda -calc_nef_map_pi_DK = model_slda.calc_nef_map_pi_DK + +# TODO discard this line +# calc_nef_map_pi_DK = model_slda.calc_nef_map_pi_DK + +PC_REPO_DIR = os.path.sep.join( + os.path.abspath(__file__).split(os.path.sep)[:-2]) + +## Create version attrib +__version__ = None +version_txt_path = os.path.join(PC_REPO_DIR, 'version.txt') +if os.path.exists(version_txt_path): + with open(version_txt_path, 'r') as f: + __version__ = f.readline().strip() + +## Create requirements attrib +__requirements__ = None +reqs_txt_path = os.path.join(PC_REPO_DIR, 'requirements.txt') +if os.path.exists(reqs_txt_path): + with open(reqs_txt_path, 'r') as f: + __requirements__ = [] + for line in f.readlines(): + __requirements__.append(line.strip()) + + + + diff --git a/pc_toolbox/binary_classifiers/train_and_eval_sklearn_binary_classifier.py b/pc_toolbox/binary_classifiers/train_and_eval_sklearn_binary_classifier.py index acf1c11..7510a6e 100644 --- a/pc_toolbox/binary_classifiers/train_and_eval_sklearn_binary_classifier.py +++ b/pc_toolbox/binary_classifiers/train_and_eval_sklearn_binary_classifier.py @@ -56,9 +56,14 @@ load_list_of_strings_from_txt, load_list_of_unicode_from_txt, ) +import matplotlib.pyplot as plt from calc_roc_auc_via_bootstrap import calc_binary_clf_metric_with_ci_via_bootstrap +from utils_calibration import ( + calc_binary_clf_calibration_per_bin, + plot_binary_clf_calibration_curve_and_histograms) + def read_args_from_stdin_and_run(): ''' Main executable function to train and evaluate classifier. 
@@ -398,6 +403,20 @@ def read_args_from_stdin_and_run(): elapsed_time = time.time() - start_time pprint('[run_classifier says:] target %s completed after %.2f sec' % (target_names[c], elapsed_time)) + +def calc_calibration_info(clf, x, y, bins=5): + assert len(clf.classes_) == 2 + assert clf.classes_[0] == 0 + assert clf.classes_[1] == 1 + y_proba = clf.predict_proba(x) + if y_proba.ndim > 1: + assert y_proba.shape[1] == 2 + y_proba = y_proba[:, 1] + info_per_bin = calc_binary_clf_calibration_per_bin( + y, y_proba, + bins=bins) + return info_per_bin + def calcfrac(bmask): return np.sum(bmask) / float(bmask.size) @@ -525,11 +544,19 @@ def calc_f1_score(clf, x, y): assert yhat.ndim == 1 return f1_score(y, yhat, pos_label=clf.classes_[1]) + def make_clf_report(clf, x, y, header=''): + r_str = header + r_str += make_confusion_matrix_report(clf, x, y) + r_str += u"acc %.4f\n" % calc_accuracy_score(clf, x, y) + r_str += u" f1 %.4f\n" % calc_f1_score(clf, x, y) + r_str += u"auc %.4f\n" % calc_auc_score(clf, x, y) + r_str += make_calibration_report(clf, x, y) + return r_str + def make_confusion_matrix_report(clf, x, y): assert len(clf.classes_) == 2 assert clf.classes_[0] == 0 - assert clf.classes_[1] == 1 - + assert clf.classes_[1] == 1 y_pred = clf.predict(x) cm = sk_confusion_matrix(y, y_pred) cm = pd.DataFrame(data=cm, columns=[0, 1], index=[0, 1]) @@ -537,14 +564,24 @@ def make_confusion_matrix_report(clf, x, y): cm.index.name = 'True label' return "\n%s\n" % unicode(cm) - def make_clf_report(clf, x, y, header=''): - r_str = header - r_str += make_confusion_matrix_report(clf, x, y) - r_str += u"acc %.4f\n" % calc_accuracy_score(clf, x, y) - r_str += u" f1 %.4f\n" % calc_f1_score(clf, x, y) - r_str += u"auc %.4f\n" % calc_auc_score(clf, x, y) + + def make_calibration_report(clf, x, y, bins=5): + """ Make plain-text report on clf calibration performance + """ + info_per_bin = calc_calibration_info( + clf, x, y, bins=bins) + bin_edges = info_per_bin['bin_edges'] + r_str 
= "\nCalibration" + for bb in range(bin_edges.size - 1): + r_str += "\nproba bin [%.2f, %.2f] count %5d fracTP %.3f" % ( + bin_edges[bb], + bin_edges[bb+1], + info_per_bin['count_per_bin'][bb], + info_per_bin['fracTP_per_bin'][bb], + ) return r_str + def make_csv_row_dict(clf, x, y, y_col_name, split_name, classifier_name): keepers = np.isfinite(y) x = x[keepers] @@ -953,10 +990,14 @@ def train_and_eval_clf_with_best_params_via_grid_search( csv_fpath = os.path.join( output_path, 'clf_%d_callback_%s.csv' % (y_orig_col_id, split)) + + x_cursplit, y_cursplit = make_nonnan_xy_for_target( + datasets_by_split[split], + y_col_id=y_col_id) row_dict = make_csv_row_dict( best_clf, - datasets_by_split[split]['x'], - datasets_by_split[split]['y'][:, y_col_id], + x_cursplit, + y_cursplit, y_col_name, split, classifier_name) @@ -964,6 +1005,22 @@ def train_and_eval_clf_with_best_params_via_grid_search( csv_df.to_csv( csv_fpath, index=False) + + if hasattr(best_clf, 'predict_proba'): + for nbins in [6, 10, 20]: + fig_fpath = os.path.join( + output_path, + 'clf_%d_calibration_%02dbin_%s.pdf' % ( + y_orig_col_id, nbins, split)) + + info_per_bin = calc_calibration_info( + best_clf, x_cursplit, y_cursplit, bins=nbins) + plot_binary_clf_calibration_curve_and_histograms( + info_per_bin=info_per_bin) + plt.savefig( + fig_fpath, + bbox_inches='tight', + pad_inches=0) if verbose: elapsed_time = time.time() - start_time pprint("eval %d/%d on %5s split done after %11.2f sec" % ( diff --git a/pc_toolbox/binary_classifiers/utils_calibration.py b/pc_toolbox/binary_classifiers/utils_calibration.py new file mode 100644 index 0000000..3d0cdaa --- /dev/null +++ b/pc_toolbox/binary_classifiers/utils_calibration.py @@ -0,0 +1,120 @@ +import numpy as np +from scipy.special import expit +import matplotlib.gridspec as gridspec +import matplotlib.pyplot as plt + +def plot_binary_clf_calibration_curve_and_histograms( + info_per_bin=None, + fig_kws=dict( + figsize=(1.4*3, 1.4*4), + tight_layout=True), + 
): + fig_h = plt.figure(**fig_kws) + ax_grid = gridspec.GridSpec( + nrows=4, ncols=1, + height_ratios=[1, 1, 4, 0.1], + ) + ax_cal = fig_h.add_subplot(ax_grid[2,0]) + ax_TP = fig_h.add_subplot(ax_grid[0,0]) + ax_TN = fig_h.add_subplot(ax_grid[1,0]) + + # Plot calibration curve + # First, lay down idealized line from 0-1 + unit_grid = np.linspace(0, 1, 10) + ax_cal.plot( + unit_grid, unit_grid, 'k--', alpha=0.5) + # Then, plot actual-vs-expected fractions on top + ax_cal.plot( + info_per_bin['xcenter_per_bin'], + info_per_bin['fracTP_per_bin'], + 'ks-') + ax_cal.set_ylabel('frac. true positive') + ax_cal.set_xlabel('predicted proba.') + + # Plot TP histogram + ax_TP.bar( + info_per_bin['xcenter_per_bin'], + info_per_bin['countTP_per_bin'], + width=0.9*info_per_bin['xwidth_per_bin'], + color='b') + + # Plot TN histogram + ax_TN.bar( + info_per_bin['xcenter_per_bin'], + info_per_bin['countTN_per_bin'], + width=0.9*info_per_bin['xwidth_per_bin'], + color='r') + for ax in [ax_cal, ax_TP, ax_TN]: + ax.set_xlim([0, 1]) + ax_cal.set_ylim([0, 1]) + +def calc_binary_clf_calibration_per_bin( + y_true, y_prob, + bins=10): + """ + """ + if y_prob.min() < 0 or y_prob.max() > 1: + raise ValueError("y_prob has values outside [0, 1]") + + bins = np.asarray(bins) + if bins.ndim == 1 and bins.size > 1: + bin_edges = bins + else: + bin_edges = np.linspace(0, 1, int(bins) + 1) + if bin_edges[-1] == 1.0: + bin_edges[-1] += 1e-8 + assert bin_edges.ndim == 1 + assert bin_edges.size > 2 + nbins = bin_edges.size - 1 + # Assign each predicted probability into one bin + # from 0, 1, ... 
nbins + binids = np.digitize(y_prob, bin_edges) - 1 + assert binids.max() <= nbins + assert binids.min() >= 0 + + count_per_bin = np.bincount(binids, minlength=nbins) + countTP_per_bin = np.bincount(binids, minlength=nbins, weights=y_true == 1) + countTN_per_bin = np.bincount(binids, minlength=nbins, weights=y_true == 0) + + # This divide will (and should) yield nan + # if any bin has no content + fracTP_per_bin = countTP_per_bin / np.asarray(count_per_bin, dtype=np.float64) + + info_per_bin = dict( + count_per_bin=count_per_bin, + countTP_per_bin=countTP_per_bin, + countTN_per_bin=countTN_per_bin, + fracTP_per_bin=fracTP_per_bin, + xcenter_per_bin=0.5 * (bin_edges[:-1] + bin_edges[1:]), + xwidth_per_bin=(bin_edges[1:] - bin_edges[:-1]), + bin_edges=bin_edges, + ) + return info_per_bin + + +if __name__ == '__main__': + prng = np.random.RandomState(0) + thr_true = prng.rand(100000) + u_true = 0.65 * prng.randn(100000) + y_true = np.asarray(expit(u_true) >= thr_true, dtype=np.float32) + y_prob = expit(u_true) + + bins = 20 + + info_per_bin = calc_binary_clf_calibration_per_bin( + y_true=y_true, + y_prob=y_prob, + bins=bins) + bin_edges = info_per_bin['bin_edges'] + for bb in range(bin_edges.size - 1): + print "bin [%.2f, %.2f] count %5d fracTP %.3f" % ( + bin_edges[bb], + bin_edges[bb+1], + info_per_bin['count_per_bin'][bb], + info_per_bin['fracTP_per_bin'][bb], + ) + + plot_binary_clf_calibration_curve_and_histograms( + info_per_bin=info_per_bin) + + plt.show() \ No newline at end of file diff --git a/pc_toolbox/model_slda/__init__.py b/pc_toolbox/model_slda/__init__.py index ce36957..4c29cb4 100644 --- a/pc_toolbox/model_slda/__init__.py +++ b/pc_toolbox/model_slda/__init__.py @@ -9,6 +9,11 @@ calc_nef_map_pi_DK, ) +from est_local_params__vb_qpiDir_qzCat import ( + calc_elbo_for_many_docs, + ) + + import slda_utils__dataset_manager import slda_utils__param_io_manager save_topic_model_param_dict = slda_utils__param_io_manager.save_topic_model_param_dict diff --git 
a/pc_toolbox/model_slda/est_local_params__vb_qpiDir_qzCat/__init__.py b/pc_toolbox/model_slda/est_local_params__vb_qpiDir_qzCat/__init__.py new file mode 100644 index 0000000..c38f3b3 --- /dev/null +++ b/pc_toolbox/model_slda/est_local_params__vb_qpiDir_qzCat/__init__.py @@ -0,0 +1,6 @@ +from calc_elbo_for_many_docs__vb_qpiDir_qzCat import ( + calc_elbo_for_many_docs) + +from calc_N_d_K__vb_qpiDir_qzCat import ( + calc_N_d_K__vb_coord_ascent__many_tries, + calc_N_d_K__vb_coord_ascent) \ No newline at end of file diff --git a/pc_toolbox/model_slda/est_local_params__vb_qpiDir_qzCat/calc_N_d_K__vb_qpiDir_qzCat.py b/pc_toolbox/model_slda/est_local_params__vb_qpiDir_qzCat/calc_N_d_K__vb_qpiDir_qzCat.py new file mode 100644 index 0000000..25325dc --- /dev/null +++ b/pc_toolbox/model_slda/est_local_params__vb_qpiDir_qzCat/calc_N_d_K__vb_qpiDir_qzCat.py @@ -0,0 +1,266 @@ +import argparse +import numpy as np +import os +from scipy.special import gammaln, digamma +from scipy.misc import logsumexp + +def calc_N_d_K__vb_coord_ascent__many_tries( + word_id_d_Ud=None, + word_ct_d_Ud=None, + topics_KV=None, + alpha_K=None, + init_pi_d_K=None, + init_name=None, + init_name_list=None, + coldstart_initname='prior_mean', + prng=np.random, + verbose=False, + do_trace_elbo=True, + **lstep_kwargs): + """ Estimate token-assignment counts for VB approximate posterior. 
+ + Returns + ------- + N_d_K : 1D array, size K + N_d_K[k] : count of usage of topic k in document d + """ + K = alpha_K.size + if init_name is not None: + init_name_list = init_name.split("+") + if init_name_list is None: + init_name_list = [coldstart_initname] + assert isinstance(init_name_list, list) + + # Precompute likelihoods + # lik_d_UdK : 2D array, Ud x K + lik_d_UdK = topics_KV[:, word_id_d_Ud].T.copy() + log_lik_d_UdK = np.log(1e-100 + lik_d_UdK) + + best_ELBO = -np.inf + best_N_d_K = None + best_info = None + for init_name in init_name_list: + if init_name.count("_x") > 0: + n_reps = int(init_name.split("_x")[1]) + else: + n_reps = 1 + + for rep in xrange(n_reps): + init_P_d_K = make_initial_P_d_K( + init_name, + prng=prng, + alpha_K=alpha_K, + init_P_d_K_list=[init_pi_d_K]) + if verbose: + pprint__N_d_K(init_P_d_K, "init") + + cur_N_d_K, cur_info = calc_N_d_K__vb_coord_ascent( + word_ct_d_Ud=word_ct_d_Ud, + lik_d_UdK=lik_d_UdK, + log_lik_d_UdK=log_lik_d_UdK, + alpha_K=alpha_K, + init_P_d_K=init_P_d_K, + verbose=verbose, + do_trace_elbo=do_trace_elbo, + **lstep_kwargs) + cur_ELBO = calc_elbo_for_single_doc__simplified_from_N_d_K( + word_ct_d_Ud=word_ct_d_Ud, + log_lik_d_UdK=log_lik_d_UdK, + alpha_K=alpha_K, + N_d_K=cur_N_d_K) + if verbose: + pprint__N_d_K(cur_N_d_K, "final", cur_ELBO) + + if cur_ELBO > best_ELBO + 1e-6: + best_ELBO = cur_ELBO + best_N_d_K = cur_N_d_K + best_info = cur_info + if verbose: + print "best: %s" % init_name + elif cur_ELBO > best_ELBO - 1e-6: + if verbose: + print "tied: %s" % init_name + if verbose: + print "" + best_info['ELBO'] = best_ELBO + return best_N_d_K, best_info + +def calc_N_d_K__vb_coord_ascent( + word_id_d_Ud=None, + word_ct_d_Ud=None, + lik_d_UdK=None, + log_lik_d_UdK=None, + topics_KV=None, + alpha_K=None, + init_theta_d_K=None, + init_N_d_K=None, + init_P_d_K=None, + lstep_converge_thr=0.0001, + lstep_max_iters=100, + do_trace_elbo=False, + verbose=False, + **unused_kwargs): + """ Estimate token-assignment 
counts for VB approximate posterior. + + Uses one run of coordinate descent. + + Returns + ------- + N_d_K : 1D array, size K + info_dict : dict + """ + if lik_d_UdK is None: + lik_d_UdK = topics_KV[:, word_id_d_Ud].T.copy() + if log_lik_d_UdK is None and do_trace_elbo: + log_lik_d_UdK = np.log(1e-100 + lik_d_UdK) + + P_d_K = np.zeros_like(alpha_K) + sumresp_U = np.zeros_like(word_ct_d_Ud) + if init_P_d_K is not None: + P_d_K[:] = init_P_d_K + N_d_K = np.zeros_like(alpha_K) + np.dot(lik_d_UdK, P_d_K, out=sumresp_U) + np.dot(word_ct_d_Ud / sumresp_U, lik_d_UdK, out=N_d_K) + N_d_K *= P_d_K + elif init_theta_d_K is not None: + N_d_K = np.maximum(init_theta_d_K - alpha_K, 1e-10) + elif init_N_d_K is not None: + N_d_K = init_N_d_K + + prev_N_d_K = np.zeros_like(N_d_K) + digamma_sumtheta_d = digamma(np.sum(alpha_K) + np.sum(word_ct_d_Ud)) + + if do_trace_elbo: + elbo_list = list() + converge_dist = np.inf + for local_iter in range(1, 1+lstep_max_iters): + if do_trace_elbo: + elbo = calc_elbo_for_single_doc__simplified_from_N_d_K( + word_ct_d_Ud=word_ct_d_Ud, + log_lik_d_UdK=log_lik_d_UdK, + alpha_K=alpha_K, + N_d_K=N_d_K) + elbo_list.append(elbo) + np.add(N_d_K, alpha_K, out=P_d_K) + digamma(P_d_K, out=P_d_K) + np.subtract(P_d_K, digamma_sumtheta_d, out=P_d_K) + np.exp(P_d_K, out=P_d_K) + np.dot(lik_d_UdK, P_d_K, out=sumresp_U) + # Update DocTopicCounts + np.dot(word_ct_d_Ud / sumresp_U, lik_d_UdK, out=N_d_K) + N_d_K *= P_d_K + + if verbose and local_iter % 10 == 0: + pprint__N_d_K(N_d_K) + + if local_iter % 5 == 0: + converge_dist = np.sum(np.abs(N_d_K - prev_N_d_K)) + if converge_dist < lstep_converge_thr: + break + prev_N_d_K[:] = N_d_K + + opt_info = dict( + n_iters=local_iter, + max_iters=lstep_max_iters, + did_converge=converge_dist < lstep_converge_thr, + converge_thr=lstep_converge_thr, + converge_dist=converge_dist, + ) + if do_trace_elbo: + opt_info['trace_lb_logpdf_x'] = np.asarray(elbo_list) + opt_info['trace_lb_logpdf_x_pertok'] = np.asarray(elbo_list) / 
np.sum(word_ct_d_Ud) + return N_d_K, opt_info + + +def calc_elbo_for_single_doc__simplified_from_N_d_K( + word_ct_d_Ud=None, + log_lik_d_UdK=None, + alpha_K=None, + N_d_K=None): + theta_d_K = N_d_K + alpha_K + E_log_pi_d_K = digamma(theta_d_K) - digamma(np.sum(theta_d_K)) + log_resp_d_UK = log_lik_d_UdK + E_log_pi_d_K[np.newaxis,:] + return ( + np.inner(word_ct_d_Ud, logsumexp(log_resp_d_UK, axis=1)) + + c_Dir_1D(alpha_K) - c_Dir_1D(theta_d_K) + + np.inner(alpha_K - theta_d_K, E_log_pi_d_K) + ) + + +def make_initial_P_d_K( + init_name, + prng=np.random, + alpha_K=None, + init_P_d_K_list=None): + K = alpha_K.size + + if init_name.count('warm'): + return init_P_d_K_list.pop() + elif init_name.count('uniform_sample'): + return prng.dirichlet(np.ones(K)) + elif init_name.count('prior_sample'): + return prng.dirichlet(alpha_K) + elif init_name.count("prior_mean"): + return alpha_K / np.sum(alpha_K) #np.zeros(K, dtype=alpha_K.dtype) + else: + raise ValueError("Unrecognized vb lstep_init_name: " + init_name) + +def pprint__N_d_K(N_d_K, label='', elbo=None): + if elbo: + print( + "%6s" % label + + " " + ' '.join(['%7.2f' % a for a in N_d_K]) + + " %.7e" % elbo) + else: + print "%6s" % label, ' '.join(['%7.2f' % a for a in N_d_K]) + +def c_Dir_1D(alpha_K): + return gammaln(np.sum(alpha_K)) - np.sum(gammaln(alpha_K)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--K', type=int, default=1) + parser.add_argument('--Ud', type=int, default=100) + parser.add_argument('--alpha', type=float, default=1.1) + parser.add_argument( + '--lstep_max_iters', + type=int, + default=100) + parser.add_argument('--verbose', type=int, default=0) + args = parser.parse_args() + + lstep_kwargs = dict( + lstep_max_iters=args.lstep_max_iters, + ) + if args.verbose: + lstep_kwargs['verbose'] = True + lstep_kwargs['very_verbose'] = True + + K = args.K + Ud = args.Ud + alpha_K = args.alpha * np.ones(K, dtype=np.float64) + + prng = np.random.RandomState(12342) 
+ topics_KV = prng.rand(K, Ud) + topics_KV /= np.sum(topics_KV, axis=1)[:,np.newaxis] + word_id_d_Ud = np.arange(Ud) + word_ct_d_Ud = prng.randint(low=1, high=3, size=Ud) + word_ct_d_Ud = np.asarray(word_ct_d_Ud, dtype=np.float64) + print "Applying K=%d topics to doc with Ud=%d uniq terms" % (K, Ud) + + + for (init_name, init_pi_d_K) in [ + ('prior_mean', None), + ('prior_sample', None), + ('warm', np.arange(K)), + ]: + N_d_K, info_dict = calc_N_d_K__vb_coord_ascent__many_tries( + word_id_d_Ud=word_id_d_Ud, + word_ct_d_Ud=word_ct_d_Ud, + topics_KV=topics_KV, + alpha_K=alpha_K, + init_name=init_name, + init_pi_d_K=init_pi_d_K, + verbose=True, + **lstep_kwargs) diff --git a/pc_toolbox/model_slda/est_local_params__vb_qpiDir_qzCat/calc_elbo_for_many_docs__vb_qpiDir_qzCat.py b/pc_toolbox/model_slda/est_local_params__vb_qpiDir_qzCat/calc_elbo_for_many_docs__vb_qpiDir_qzCat.py new file mode 100644 index 0000000..5fc2b5a --- /dev/null +++ b/pc_toolbox/model_slda/est_local_params__vb_qpiDir_qzCat/calc_elbo_for_many_docs__vb_qpiDir_qzCat.py @@ -0,0 +1,129 @@ +import argparse +import numpy as np +import os +from scipy.special import gammaln, digamma +from scipy.misc import logsumexp + +from calc_N_d_K__vb_qpiDir_qzCat import ( + calc_N_d_K__vb_coord_ascent__many_tries) + +def calc_elbo_for_many_docs( + dataset=None, + alpha=None, + alpha_K=None, + topics_KV=None, + verbose=False, + print_progress_every=-1, + init_name_list=['prior_mean'], + init_pi_DK=None, + prng=None, + seed=0, + return_info=False, + active_ct_thr=0.01, + do_trace_elbo=False, + **lstep_kwargs): + + assert dataset is not None + assert topics_KV is not None + + K = topics_KV.shape[0] + dtype = topics_KV.dtype + word_ct_U = np.asarray(dataset['word_ct_U'], dtype=dtype) + if alpha_K is None: + alpha_K = float(alpha) * np.ones(K, dtype=dtype) + else: + alpha_K = np.asarray(alpha_K, dtype=dtype) + + if return_info: + theta_DK = np.zeros((dataset['n_docs'], K)) + + if init_pi_DK is not None: + assert 
init_pi_DK.shape[0] == dataset['n_docs'] + assert init_pi_DK.shape[1] == K + assert 'warm' in init_name_list + else: + init_pi_d_K = None + + if prng is None: + prng = np.random.RandomState(seed) + + ttl_lb_logpdf_x = 0.0 + ttl_n_tokens = 0 + ttl_n_docs = 0 + + D = dataset['n_docs'] + if print_progress_every > 0: + converged_per_doc = np.zeros(D, dtype=np.int32) + dist_per_doc = np.zeros(D, dtype=np.float64) + iter_per_doc = np.zeros(D, dtype=np.int32) + n_active_per_doc = np.zeros(D, dtype=np.float64) + start_time_sec = time.time() + for d in range(D): + start = dataset['doc_indptr_Dp1'][d] + stop = dataset['doc_indptr_Dp1'][d+1] + Ud = stop - start + word_ct_d_Ud = word_ct_U[start:stop] + word_id_d_Ud = dataset['word_id_U'][start:stop] + + if init_pi_DK is not None: + init_pi_d_K = init_pi_DK[d] + + N_d_K, info_dict = \ + calc_N_d_K__vb_coord_ascent__many_tries( + word_id_d_Ud=word_id_d_Ud, + word_ct_d_Ud=word_ct_d_Ud, + topics_KV=topics_KV, + alpha_K=alpha_K, + init_name_list=init_name_list, + init_pi_d_K=init_pi_d_K, + prng=prng, + verbose=verbose, + do_trace_elbo=do_trace_elbo, + **lstep_kwargs) + + if return_info: + theta_DK[d] = N_d_K + alpha_K + + # Norm constant per document + h_x_d = gammaln(1.0 + np.sum(word_ct_d_Ud)) \ + - np.sum(gammaln(1.0 + word_ct_d_Ud)) + + # Aggregate + ttl_lb_logpdf_x += info_dict['ELBO'] + h_x_d + ttl_n_tokens += np.sum(word_ct_d_Ud) + ttl_n_docs += 1 + + if print_progress_every > 0: + dist_per_doc[d] = info_dict['converge_dist'] + converged_per_doc[d] = info_dict['did_converge'] + iter_per_doc[d] = info_dict['n_iters'] + n_active_per_doc[d] = np.sum(N_d_K >= active_ct_thr) + # Do the printing of the progress + if print_progress_every > 0 and ( + (d + 1) % print_progress_every == 0 + or (d + 1) == D + ): + msg = make_readable_summary_for_pi_DK_inference( + n_docs_completed=ttl_n_docs, + n_docs=D, + dist_per_doc=dist_per_doc, + iters_per_doc=iter_per_doc, + converged_per_doc=converged_per_doc, + n_active_per_doc=n_active_per_doc,
+ elapsed_time_sec=time.time() - start_time_sec) + msg += "\n neg_log_p(x) %.6e" % ( + -1.0 * ttl_lb_logpdf_x / ttl_n_tokens) + pprint(msg) + + ttl_lb_logpdf_x_per_tok = ttl_lb_logpdf_x / ttl_n_tokens + if return_info: + info_dict = dict( + theta_DK=theta_DK, + dist_per_doc=dist_per_doc, + iters_per_doc=iter_per_doc, + converged_per_doc=converged_per_doc, + n_active_per_doc=n_active_per_doc, + ) + return ttl_lb_logpdf_x, ttl_lb_logpdf_x_per_tok, info_dict + else: + return ttl_lb_logpdf_x, ttl_lb_logpdf_x_per_tok diff --git a/pc_toolbox/model_slda/slda_snapshot_perf_metrics.py b/pc_toolbox/model_slda/slda_snapshot_perf_metrics.py index ef0d69f..774464c 100644 --- a/pc_toolbox/model_slda/slda_snapshot_perf_metrics.py +++ b/pc_toolbox/model_slda/slda_snapshot_perf_metrics.py @@ -27,6 +27,10 @@ from pc_toolbox.topic_quality_metrics import ( calc_coherence_metrics as coh ) +from pc_toolbox.model_slda.est_local_params__vb_qpiDir_qzCat import ( + calc_elbo_for_many_docs) + + def calc_perf_metrics_for_snapshot_param_dict( param_dict=None, topics_KV=None, @@ -190,7 +194,7 @@ def calc_perf_metrics_for_snapshot_param_dict( # Remember, y_proba_DC is really estimated mean of y_DC y_est_DC = ans_dict.pop('y_proba_DC') for c in xrange(n_labels): - y_true_c_D = datasets_by_split['split']['y_DC'][:, c] + y_true_c_D = datasets_by_split[split_name]['y_DC'][:, c] y_est_c_D = y_est_DC[:, c] # Keep only finite values rowmask = np.logical_and( @@ -205,6 +209,20 @@ info_dict['y_%d_rmse' % c] = rmse etimes = stop_timer_segment(etimes, '%s_calc_y_metrics' % split_name) + ## Compute vb lower bound on logpdf x + etimes = start_timer_segment(etimes, '%s_calc_lb_logpdf_x' % split_name) + lb_logpdf_x, lb_logpdf_x_pertok = calc_elbo_for_many_docs( + dataset=datasets_by_split[split_name], + topics_KV=topics_KV, + alpha=alpha, + init_name_list=['warm'], + init_pi_DK=pi_DK, + verbose=False, + do_trace_elbo=False, + ) + etimes = stop_timer_segment(etimes,
'%s_calc_lb_logpdf_x' % split_name) + info_dict['elbo_logpdf_x_pertok'] = lb_logpdf_x_pertok + ## COHERENCE etimes = start_timer_segment(etimes, '%s_calc_coher_metrics' % split_name) K = topics_KV.shape[0] diff --git a/pc_toolbox/topic_quality_metrics/calc_coherence_metrics.py b/pc_toolbox/topic_quality_metrics/calc_coherence_metrics.py index 7f53b6b..158a125 100644 --- a/pc_toolbox/topic_quality_metrics/calc_coherence_metrics.py +++ b/pc_toolbox/topic_quality_metrics/calc_coherence_metrics.py @@ -87,8 +87,8 @@ def calc_npmi_and_pmi_coherence_for_top_ranked_terms_in_topic( npmi_coherence_score += np.sum(npmi_coherence_score_Mrem) n_top_pairs += Mrem return ( - npmi_coherence_score / n_top_pairs, - pmi_coherence_score / n_top_pairs, + npmi_coherence_score / (n_top_pairs + 1e-13), + pmi_coherence_score / (n_top_pairs + 1e-13), ) diff --git a/pc_toolbox/train_slda_model.py b/pc_toolbox/train_slda_model.py index 0460336..78533db 100644 --- a/pc_toolbox/train_slda_model.py +++ b/pc_toolbox/train_slda_model.py @@ -10,6 +10,7 @@ setup_random_seed, write_user_provided_kwargs_to_txt, write_env_vars_to_txt, + write_python_module_versions_to_txt, ) from pc_toolbox.algs_gradient_descent import ( @@ -317,11 +318,16 @@ def train_slda_model( arg_dict['seed'] = setup_random_seed( **arg_dict) + # Write useful environment info to .txt + # so we can reproduce later write_user_provided_kwargs_to_txt( arg_dict=arg_dict, output_path=arg_dict['output_path']) write_env_vars_to_txt( output_path=arg_dict['output_path']) + write_python_module_versions_to_txt( + context_dict=locals(), + output_path=arg_dict['output_path']) train_slda_model( **arg_dict) \ No newline at end of file diff --git a/pc_toolbox/utils_io/__init__.py b/pc_toolbox/utils_io/__init__.py index 66a0f00..bf685e1 100644 --- a/pc_toolbox/utils_io/__init__.py +++ b/pc_toolbox/utils_io/__init__.py @@ -1,6 +1,8 @@ from pprint_logging import pprint, config_pprint_logging from util_pprint_percentiles import make_percentile_str 
+from util_watermark import make_string_of_reachable_modules_with_versions + from util_timing import ( start_timer_segment, stop_timer_segment, @@ -28,6 +30,7 @@ setup_output_path, write_user_provided_kwargs_to_txt, write_env_vars_to_txt, + write_python_module_versions_to_txt, ) from util_io_csr import ( diff --git a/pc_toolbox/utils_io/util_setup.py b/pc_toolbox/utils_io/util_setup.py index 3bdc6c5..2a60bb9 100644 --- a/pc_toolbox/utils_io/util_setup.py +++ b/pc_toolbox/utils_io/util_setup.py @@ -8,6 +8,7 @@ from numexpr.cpuinfo import cpuinfo as numexpr_cpuinfo from pprint_logging import config_pprint_logging, pprint +import util_watermark def setup_detect_taskid_and_insert_into_output_path( output_path=None, @@ -209,3 +210,19 @@ def write_user_provided_kwargs_to_txt( with open(txt_fpath, 'w') as f: for key in sorted(arg_dict.keys()): f.write("--%s %s\n" % (key, str(arg_dict[key]))) + +def write_python_module_versions_to_txt( + context_dict=None, + output_path=None): + """ Write .txt file to provided output_path dir with module info. + + Post condition + -------------- + Writes plain text file called "modules_with_versions.txt" to disk. + Each line contains name and version number of a python module. 
+ """ + watermark_string = util_watermark.make_string_of_reachable_modules_with_versions( + context_dict=context_dict) + txt_fpath = os.path.join(output_path, 'modules_with_versions.txt') + with open(txt_fpath, 'w') as f: + f.write(watermark_string) diff --git a/pc_toolbox/utils_io/util_watermark.py b/pc_toolbox/utils_io/util_watermark.py new file mode 100644 index 0000000..f48d7b6 --- /dev/null +++ b/pc_toolbox/utils_io/util_watermark.py @@ -0,0 +1,59 @@ +import pip + +def make_string_of_reachable_modules_with_versions(context_dict=None): + if context_dict is None: + context_dict = globals() + reachable_modules = dict() + for key, val in context_dict.items(): + if key.startswith('_'): + continue + if str(type(val)).count('module'): + # This trick will import parent package + # e.g. scipy.stats becomes scipy + if val.__package__ is None: + mod_name = val.__name__ + mod = val + else: + try: + mod = __import__(val.__package__) + except ImportError: + continue + mod_name = mod.__name__ + reachable_modules[mod_name] = mod + if hasattr(mod, '__requirements__'): + for req_line in mod.__requirements__: + if req_line.count("=="): + mname = req_line.split("==")[0] + elif req_line.count(">="): + mname = req_line.split(">=")[0] + reachable_modules[mname] = None + + ver_info_list = [val for val in pip.operations.freeze.freeze()] + + explained_reachables = [] + ans_list = [] + for vstr in ver_info_list: + if vstr.count('=='): + name, version = vstr.split("==") + elif vstr.count('egg'): + parts = vstr.split('#egg=') + name = parts[1] + version = parts[0].replace('-e ', '') + if version.count('.git@'): + # Only display first 10 chars of git hash + version = version[:version.find('.git@') + 15] + else: + name = vstr + for mod_name in reachable_modules.keys(): + if vstr.count(mod_name): + ans_list.append("%-40s %s" % (name, version)) + explained_reachables.append(mod_name) + for rname, rmod in reachable_modules.items(): + if rname not in explained_reachables: + if hasattr(rmod, 
'__version__'): + version = rmod.__version__ + ans_list.append("%-40s %s" % (rname, version)) + # Sort and return a list + ans_list = sorted([s for s in ans_list]) + ans = "\n".join(ans_list) + "\n" + return ans \ No newline at end of file diff --git a/scripts/toy_bars_3x3/quicktest_topic_models/pcslda_ag_adam_fromgood.sh b/scripts/toy_bars_3x3/quicktest_topic_models/pcslda_ag_adam_fromgood.sh index 54eb320..6cbb26c 100755 --- a/scripts/toy_bars_3x3/quicktest_topic_models/pcslda_ag_adam_fromgood.sh +++ b/scripts/toy_bars_3x3/quicktest_topic_models/pcslda_ag_adam_fromgood.sh @@ -34,7 +34,7 @@ for step_size in 0.0333 #0.1000 0.3333 do export step_size=$step_size - +# =============================== PER-DOC INFER SETTINGS ## Per-doc inference settings at training export pi_max_iters=5 export pi_step_size=0.05 @@ -45,8 +45,7 @@ export perf_metrics_pi_max_iters=50 # =============================== INIT SETTINGS -# =============================== INIT SETTINGS -for init_name in good_loss_x_K4 good_loss_pc_K4 +for init_name in good_loss_pc_K4 good_loss_x_K4 good_loss_pc_K4 do export init_model_path=$dataset_path"/"$init_name"_param_dict.dump" diff --git a/scripts/toy_bars_3x3/quicktest_topic_models/pcslda_ag_adam_fromscratch.sh b/scripts/toy_bars_3x3/quicktest_topic_models/pcslda_ag_adam_fromscratch.sh index a310333..6797377 100755 --- a/scripts/toy_bars_3x3/quicktest_topic_models/pcslda_ag_adam_fromscratch.sh +++ b/scripts/toy_bars_3x3/quicktest_topic_models/pcslda_ag_adam_fromscratch.sh @@ -30,11 +30,11 @@ export step_direction='adam' export decay_staircase=0 export decay_interval=1 export decay_rate=0.997 -for step_size in 0.0333 #0.1000 0.3333 +for step_size in 0.0333 do export step_size=$step_size - +# =============================== PER-DOC INFER SETTINGS ## Per-doc inference settings at training export pi_max_iters=5 export pi_step_size=0.05 @@ -58,10 +58,11 @@ export lambda_w=0.001 export weight_x=1.0 ## Loop over weights to place on log p(y|x) -for weight_y in 
10.0 02.0 01.0 +for weight_y in 10.0 01.0 do export weight_y=$weight_y +## Loop over number of topics K for n_states in 004 do export n_states=$n_states diff --git a/scripts/toy_bars_3x3/quicktest_topic_models/pcslda_ag_lbfgs_fromscratch.sh b/scripts/toy_bars_3x3/quicktest_topic_models/pcslda_ag_lbfgs_fromscratch.sh index 5ae57ed..be5e208 100755 --- a/scripts/toy_bars_3x3/quicktest_topic_models/pcslda_ag_lbfgs_fromscratch.sh +++ b/scripts/toy_bars_3x3/quicktest_topic_models/pcslda_ag_lbfgs_fromscratch.sh @@ -17,8 +17,6 @@ export n_batches=1 export param_output_fmt="topic_model_snapshot" export n_steps_between_save=10 export n_steps_between_print=10 -export n_seconds_between_save=-1 -export n_seconds_between_print=-1 export n_steps_to_print_early=2 export n_steps_to_save_early=2 export laps_to_save_custom='0,1,2,4,6,8,10' @@ -29,9 +27,16 @@ export n_laps=3 ## Overall training: L-BFGS export alg_name="scipy_lbfgs_minimizer" -## Per-doc inference settings + +# =============================== PER-DOC INFER SETTINGS +## Per-doc inference settings at training export pi_max_iters=5 export pi_step_size=0.05 +export pi_max_iters_first_train_lap=3 + +## Per-doc inference settings at perf-metric (eval step) +export perf_metrics_pi_max_iters=50 + ## Per-doc inference settings during training export pi_max_iters_first_train_lap=3 diff --git a/scripts/toy_bars_3x3/quicktest_topic_models/pcslda_tf_adam_fromscratch.sh b/scripts/toy_bars_3x3/quicktest_topic_models/pcslda_tf_adam_fromscratch.sh index 0f7ff64..eba1a10 100755 --- a/scripts/toy_bars_3x3/quicktest_topic_models/pcslda_tf_adam_fromscratch.sh +++ b/scripts/toy_bars_3x3/quicktest_topic_models/pcslda_tf_adam_fromscratch.sh @@ -30,11 +30,11 @@ export step_direction='adam' export decay_staircase=0 export decay_interval=1 export decay_rate=0.997 -for step_size in 0.0333 #0.1000 0.3333 +for step_size in 0.0333 do export step_size=$step_size - +# =============================== PER-DOC INFER SETTINGS ## Per-doc inference settings 
at training export pi_max_iters=5 export pi_step_size=0.05 @@ -43,6 +43,7 @@ export pi_max_iters_first_train_lap=3 ## Per-doc inference settings at perf-metric (eval step) export perf_metrics_pi_max_iters=50 + # =============================== INIT SETTINGS export init_model_path=none for init_name in rand_smooth @@ -57,10 +58,11 @@ export lambda_w=0.001 export weight_x=1.0 ## Loop over weights to place on log p(y|x) -for weight_y in 10.0 02.0 01.0 +for weight_y in 10.0 01.0 do export weight_y=$weight_y +## Loop over number of topics K for n_states in 004 do export n_states=$n_states diff --git a/scripts/toy_bars_3x3/train_topic_models/pcslda_ag_adam_fromscratch.sh b/scripts/toy_bars_3x3/train_topic_models/pcslda_ag_adam_fromscratch.sh index f95c739..fc2b680 100755 --- a/scripts/toy_bars_3x3/train_topic_models/pcslda_ag_adam_fromscratch.sh +++ b/scripts/toy_bars_3x3/train_topic_models/pcslda_ag_adam_fromscratch.sh @@ -4,6 +4,7 @@ nickname=20180301 export lossandgrad_mod_name="slda_loss__autograd" + # =============================== DATA SETTINGS export dataset_name=toy_bars_3x3 export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" @@ -11,7 +12,10 @@ export n_vocabs=9 export n_outputs=2 export n_train_docs=500 -export n_batches=5 +for n_batches in 01 05 +do +export n_batches=$n_batches + # =============================== OUTPUT SETTINGS export param_output_fmt="topic_model_snapshot" @@ -21,6 +25,7 @@ export n_steps_to_print_early=2 export n_steps_to_save_early=2 export laps_to_save_custom='0,1,2,4,6,8,10' + # =============================== ALGO SETTINGS export n_laps=200 @@ -30,20 +35,20 @@ export step_direction='adam' export decay_staircase=0 export decay_interval=1 export decay_rate=0.997 -for step_size in 0.0333 0.1000 0.3333 +for step_size in 0.0333 0.3333 do export step_size=$step_size +# =============================== PER-DOC INFER SETTINGS ## Per-doc inference settings export pi_max_iters=100 export pi_step_size=0.05 +export 
pi_max_iters_first_train_lap=10 + +## Per-doc inference settings at perf-metric (eval step) +export perf_metrics_pi_max_iters=100 -# =============================== INIT SETTINGS -export init_model_path=none -for init_name in rand_smooth -do - export init_name=$init_name # =============================== MODEL HYPERS export alpha=1.100 @@ -57,6 +62,13 @@ for weight_y in 100.0 010.0 001.0 do export weight_y=$weight_y + +# =============================== INIT SETTINGS +export init_model_path=none +for init_name in rand_smooth +do + export init_name=$init_name + ## Loop over number of topics K for n_states in 004 do @@ -70,3 +82,4 @@ done done done done +done diff --git a/scripts/toy_bars_3x3/train_topic_models/pcslda_ag_lbfgs_fromgood.sh b/scripts/toy_bars_3x3/train_topic_models/pcslda_ag_lbfgs_fromgood.sh index 13973ba..3c89082 100755 --- a/scripts/toy_bars_3x3/train_topic_models/pcslda_ag_lbfgs_fromgood.sh +++ b/scripts/toy_bars_3x3/train_topic_models/pcslda_ag_lbfgs_fromgood.sh @@ -4,6 +4,7 @@ nickname=20180301 export lossandgrad_mod_name="slda_loss__autograd" + # =============================== DATA SETTINGS export dataset_name=toy_bars_3x3 export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" @@ -11,7 +12,10 @@ export n_vocabs=9 export n_outputs=2 export n_train_docs=500 -export n_batches=1 +for n_batches in 01 +do +export n_batches=$n_batches + # =============================== OUTPUT SETTINGS export param_output_fmt="topic_model_snapshot" @@ -21,23 +25,30 @@ export n_steps_to_print_early=2 export n_steps_to_save_early=2 export laps_to_save_custom='0,1,2,4,6,8,10' + # =============================== ALGO SETTINGS export n_laps=200 ## Overall training: L-BFGS export alg_name="scipy_lbfgs_minimizer" + + + + + + + + +# =============================== PER-DOC INFER SETTINGS ## Per-doc inference settings export pi_max_iters=100 export pi_step_size=0.05 +export pi_max_iters_first_train_lap=10 -# =============================== INIT SETTINGS -for init_name in 
good_loss_x_K4 good_loss_pc_K4 -do +## Per-doc inference settings at perf-metric (eval step) +export perf_metrics_pi_max_iters=100 - export init_model_path=$dataset_path"/"$init_name"_param_dict.dump" - export init_name=$init_name - export n_states=004 # =============================== MODEL HYPERS export alpha=1.100 @@ -51,9 +62,24 @@ for weight_y in 100.0 010.0 001.0 do export weight_y=$weight_y + +# =============================== INIT SETTINGS +for init_name in good_loss_x_K4 good_loss_pc_K4 +do + + export init_model_path=$dataset_path"/"$init_name"_param_dict.dump" + export init_name=$init_name + export n_states=004 + + + + export output_path="$XHOST_RESULTS_DIR/$dataset_name/$nickname-n_batches=$n_batches-lossandgrad_mod=$lossandgrad_mod_name-n_states=$n_states-alpha=$alpha-tau=$tau-lambda_w=$lambda_w-weight_x=$weight_x-weight_y=$weight_y-init_name=$init_name-alg_name=$alg_name/1/" bash $PC_REPO_DIR/scripts/launch_job_on_host_via_env.sh || { exit 1; } done done + + + diff --git a/scripts/toy_bars_3x3/train_topic_models/pcslda_ag_lbfgs_fromscratch.sh b/scripts/toy_bars_3x3/train_topic_models/pcslda_ag_lbfgs_fromscratch.sh index 23dd303..dc5196e 100755 --- a/scripts/toy_bars_3x3/train_topic_models/pcslda_ag_lbfgs_fromscratch.sh +++ b/scripts/toy_bars_3x3/train_topic_models/pcslda_ag_lbfgs_fromscratch.sh @@ -4,6 +4,7 @@ nickname=20180301 export lossandgrad_mod_name="slda_loss__autograd" + # =============================== DATA SETTINGS export dataset_name=toy_bars_3x3 export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" @@ -11,7 +12,10 @@ export n_vocabs=9 export n_outputs=2 export n_train_docs=500 -export n_batches=1 +for n_batches in 01 05 +do +export n_batches=$n_batches + # =============================== OUTPUT SETTINGS export param_output_fmt="topic_model_snapshot" @@ -21,21 +25,30 @@ export n_steps_to_print_early=2 export n_steps_to_save_early=2 export laps_to_save_custom='0,1,2,4,6,8,10' + # =============================== ALGO SETTINGS export 
n_laps=100 ## Overall training: L-BFGS export alg_name="scipy_lbfgs_minimizer" + + + + + + + + +# =============================== PER-DOC INFER SETTINGS ## Per-doc inference settings export pi_max_iters=100 export pi_step_size=0.05 +export pi_max_iters_first_train_lap=10 + +## Per-doc inference settings at perf-metric (eval step) +export perf_metrics_pi_max_iters=100 -# =============================== INIT SETTINGS -export init_model_path=none -for init_name in rand_smooth -do - export init_name=$init_name # =============================== MODEL HYPERS export alpha=1.100 @@ -49,6 +62,13 @@ for weight_y in 100.0 010.0 001.0 do export weight_y=$weight_y + +# =============================== INIT SETTINGS +export init_model_path=none +for init_name in rand_smooth +do + export init_name=$init_name + ## Loop over number of topics K for n_states in 004 do @@ -61,3 +81,4 @@ do done done done +done diff --git a/scripts/toy_bars_3x3/train_topic_models/pcslda_tf_adam_fromscratch.sh b/scripts/toy_bars_3x3/train_topic_models/pcslda_tf_adam_fromscratch.sh index ed0978b..2e85a62 100755 --- a/scripts/toy_bars_3x3/train_topic_models/pcslda_tf_adam_fromscratch.sh +++ b/scripts/toy_bars_3x3/train_topic_models/pcslda_tf_adam_fromscratch.sh @@ -4,6 +4,7 @@ nickname=20180301 export lossandgrad_mod_name="slda_loss__tensorflow" + # =============================== DATA SETTINGS export dataset_name=toy_bars_3x3 export dataset_path="$PC_REPO_DIR/datasets/$dataset_name/" @@ -11,9 +12,11 @@ export n_vocabs=9 export n_outputs=2 export n_train_docs=500 -for n_batches in 01 05; do +for n_batches in 01 05 +do export n_batches=$n_batches + # =============================== OUTPUT SETTINGS export param_output_fmt="topic_model_snapshot" export n_steps_between_save=10 @@ -22,6 +25,7 @@ export n_steps_to_print_early=2 export n_steps_to_save_early=2 export laps_to_save_custom='0,1,2,4,6,8,10' + # =============================== ALGO SETTINGS export n_laps=200 @@ -36,15 +40,15 @@ do export 
step_size=$step_size +# =============================== PER-DOC INFER SETTINGS ## Per-doc inference settings export pi_max_iters=100 export pi_step_size=0.05 +export pi_max_iters_first_train_lap=10 + +## Per-doc inference settings at perf-metric (eval step) +export perf_metrics_pi_max_iters=100 -# =============================== INIT SETTINGS -export init_model_path=none -for init_name in rand_smooth -do - export init_name=$init_name # =============================== MODEL HYPERS export alpha=1.100 @@ -58,6 +62,13 @@ for weight_y in 100.0 010.0 001.0 do export weight_y=$weight_y + +# =============================== INIT SETTINGS +export init_model_path=none +for init_name in rand_smooth +do + export init_name=$init_name + ## Loop over number of topics K for n_states in 004 do diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..8f69613 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[build_ext] +inplace=1 diff --git a/setup.py b/setup.py index a52b49f..de405bc 100644 --- a/setup.py +++ b/setup.py @@ -45,9 +45,14 @@ def add_directives_to_cython_ext(ext): 'cdivision':True} return ext +def read_version(txtpath): + with open(txtpath, 'r') as f: + version = f.readline().strip() + return version + setup( name='pc_toolbox', - version='0.1', + version=read_version('version.txt'), description='Prediction-constrained training for supervised topic models', long_description='Support code for Hughes et al AISTATS 2018', classifiers=[ diff --git a/version.txt b/version.txt new file mode 100644 index 0000000..09b9630 --- /dev/null +++ b/version.txt @@ -0,0 +1,2 @@ +0.1.20180712 +