From 9b8ce3d8a69c0bf97391b2c65ad971a60a8f0f77 Mon Sep 17 00:00:00 2001 From: Niema Moshiri Date: Thu, 22 Jun 2023 15:56:09 -0700 Subject: [PATCH] Don't perform all possible statistical tests --- MESS.py | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/MESS.py b/MESS.py index 79ed349..2f97972 100755 --- a/MESS.py +++ b/MESS.py @@ -7,7 +7,7 @@ from csv import reader, writer from datetime import datetime from math import log -from numpy import arange +from numpy import arange, histogram from os.path import isfile from scipy.stats import expon, gaussian_kde, linregress from seaborn import histplot, kdeplot @@ -19,7 +19,7 @@ matplotlib.use("Agg") # constants -VERSION = '1.0.6' +VERSION = '1.0.7' # no correction def qvalues_nocorrection(pvalues): @@ -177,7 +177,7 @@ def regress_mess(mess_scores, reg_min, reg_max, reg_xdelta): return rate, scale, loc # plot MESS distribution + regression -def plot_mess(mess_scores, scale, loc, xdelta, kde_color='black', kde_linestyle='--', kde_linewidth=0.75, reg_color='black', reg_linestyle='-', reg_linewidth=None, title=None, xlabel=None, xmin=0, xmax=None, ylabel=None, ymin=None, ymax=None, ylog=True, show_hist=True): +def plot_mess(mess_scores, scale, loc, xdelta, min_mess_test=None, kde_color='black', kde_linestyle='--', kde_linewidth=0.75, reg_color='black', reg_linestyle='-', reg_linewidth=None, title=None, xlabel=None, xmin=0, xmax=None, ylabel=None, ymin=None, ymax=None, ylog=True, show_hist=True): fig, ax = plt.subplots() if show_hist: histplot(mess_scores, stat='density', fill=False) @@ -199,6 +199,8 @@ def plot_mess(mess_scores, scale, loc, xdelta, kde_color='black', kde_linestyle= ymin = ax.get_ylim()[0] if ymax is None: ymax = ax.get_ylim()[1] + if min_mess_test is not None: + plt.plot([min_mess_test,min_mess_test], [ymin,ymax], color='red', linestyle=':') plt.xlim(xmin=xmin, xmax=xmax); plt.ylim(ymin=ymin, ymax=ymax) return fig, ax @@ -211,11 +213,23 @@ def compute_pvals(mess_scores, scale, loc): def write_mess_output(output_tsv_fn, mess, p_values, q_values, rate, loc, correction): with open(output_tsv_fn, 'w') as out_tsv_f: out_tsv = writer(out_tsv_f, delimiter='\t') - out_tsv.writerow(["Student 1", "Student 2", "MESS", "Proportion Identical", "p-value (rate=%s, loc=%s)" % (rate,loc), "q-value (correction: %s)" % CORRECTION[correction]['name'], "Red (same wrong answer)", "Yellow (different wrong answers)", "Green (only 1 student missed)"]) + out_tsv.writerow(["Student 1", "Student 2", "MESS", "Proportion Identical", "p-value (rate=%s, loc=%s)" % (rate,loc), "q-value (%d tests; MESS >= %s; correction: %s)" % (args.num_tests, min_mess_test, CORRECTION[correction]['name']), "Red (same wrong answer)", "Yellow (different wrong answers)", "Green (only 1 student missed)"]) for i in range(len(mess)): - m, ident, u, v, r, y, g = mess[i]; p = p_values[i]; q = q_values[i] + m, ident, u, v, r, y, g = mess[i]; p = p_values[i] + if i < len(q_values): + q = q_values[i] + else: + q = "N/A" out_tsv.writerow([u, v, m, ident, p, q, r, y, g]) +# find number of significance tests to perform +def find_num_tests(mess_scores): + hist, bin_edges = histogram(mess_scores, bins='auto') + for i in range(len(hist)): + if hist[i] == 0: + min_mess_test = bin_edges[i]; break + return min_mess_test, len([v for v in mess_scores if v > min_mess_test]) + # parse user args def parse_args(): parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -223,6 +237,7 @@ def parse_args(): parser.add_argument('-ot', '--output_tsv', required=True, type=str, help="Output MESS Spreadsheet (TSV)") parser.add_argument('-op', '--output_pdf', required=True, type=str, help="Output MESS Distribution (PDF)") parser.add_argument('--ignore_case', action='store_true', help="Ignore Case in Student Responses") + parser.add_argument('-nt', '--num_tests', required=False, type=int, default=None, help="Number of Significance Tests to Perform") parser.add_argument('-c', '--correction', required=False, type=str, default='benjamini_hochberg', help="Multiple Hypothesis Test Correction (options: %s)" % ', '.join(sorted(CORRECTION.keys()))) parser.add_argument('-rm', '--reg_min', required=False, type=float, default=None, help="Minimum MESS for Regression") parser.add_argument('-rM', '--reg_max', required=False, type=float, default=None, help="Maximum MESS for Regression") @@ -254,6 +269,8 @@ def parse_args(): if isfile(fn): error("Output file exists: %s" % fn) args.correction = args.correction.lower() + if args.num_tests is not None and args.num_tests < 1: + error("Number of hypothesis tests needs to be positive: %s" % args.num_tests) if args.correction not in CORRECTION: error("Invalid multiple hypothesis test correction: %s\nOptions: %s" % (args.correction, ', '.join(sorted(CORRECTION.keys())))) if args.reg_min is not None: @@ -305,8 +322,13 @@ def parse_args(): print_log("Finished computing theoretical p-values") # perform multiple hypothesis test correction - print_log("Performing multiple hypothesis test correction method: %s" % CORRECTION[args.correction]['name']) - q_values = CORRECTION[args.correction]['func'](p_values) + if args.num_tests is None: + min_mess_test, args.num_tests = find_num_tests(mess_scores) + else: + min_mess_test = mess_scores[args.num_tests-1] + print_log("Performing %d significance tests... Minimum MESS: %s" % (args.num_tests, min_mess_test)) + print_log("Multiple hypothesis test correction method: %s" % CORRECTION[args.correction]['name']) + q_values = CORRECTION[args.correction]['func'](p_values[:args.num_tests]) print_log("Finished computing q-values (corrected p-values)") # write output TSV @@ -316,6 +338,6 @@ def parse_args(): # plot MESS distribution + regression print_log("Plotting MESS distribution and regression...") - fig, ax = plot_mess(mess_scores, scale, loc, args.reg_xdelta, kde_color=args.kde_color, kde_linestyle=args.kde_linestyle, kde_linewidth=args.kde_linewidth, reg_color=args.reg_color, reg_linestyle=args.reg_linestyle, reg_linewidth=args.reg_linewidth, title=args.title, xlabel=args.xlabel, xmin=args.xmin, xmax=args.xmax, ylabel=args.ylabel, ymin=args.ymin, ymax=args.ymax, ylog=(not args.no_ylog), show_hist=args.show_hist) + fig, ax = plot_mess(mess_scores, scale, loc, args.reg_xdelta, min_mess_test=min_mess_test, kde_color=args.kde_color, kde_linestyle=args.kde_linestyle, kde_linewidth=args.kde_linewidth, reg_color=args.reg_color, reg_linestyle=args.reg_linestyle, reg_linewidth=args.reg_linewidth, title=args.title, xlabel=args.xlabel, xmin=args.xmin, xmax=args.xmax, ylabel=args.ylabel, ymin=args.ymin, ymax=args.ymax, ylog=(not args.no_ylog), show_hist=args.show_hist) fig.savefig(args.output_pdf, format='pdf', bbox_inches='tight'); plt.close(fig) print_log("MESS distribution and regression figure written to PDF: %s" % args.output_pdf)