From c704ab792084f3ef9be8acf05760105ff714eec8 Mon Sep 17 00:00:00 2001 From: Hans Niederhausen Date: Wed, 4 May 2022 14:54:46 -0700 Subject: [PATCH 1/4] add function to compute pval from gamma-fit to trials. --- skyllh/core/analysis_utils.py | 93 +++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/skyllh/core/analysis_utils.py b/skyllh/core/analysis_utils.py index 7440fa9907..2a3145aae4 100644 --- a/skyllh/core/analysis_utils.py +++ b/skyllh/core/analysis_utils.py @@ -96,6 +96,99 @@ def calculate_pval_from_trials( return (p, p_sigma) + +def calculate_pval_from_gammafit_to_trials(ts_vals, ts_threshold, + eta=3.0, n_max=500000): + """Calculates the probability (p-value) of test-statistic exceeding + the given test-statistic threshold. This calculation relies on fitting + a gamma distribution to a list of ts values. + + Parameters + ---------- + ts_vals : (n_trials,)-shaped 1D ndarray of float + The ndarray holding the test-statistic values of the trials. + ts_threshold : float + The critical test-statistic value. + eta : float, optional + Test-statistic value at which the gamma function is truncated + from below. Default = 3.0. + n_max : int, optional + The maximum number of trials that should be used during + fitting. Default = 500,000 + + Returns + ------- + p, p_err: tuple(float, float) + """ + if(ts_threshold < eta): + raise ValueError( + 'ts threshold value = %e, eta = %e. The calculation of the p-value' + 'from the fit is correct only for ts threshold larger than ' + 'the truncation threshold eta.', + ts_threshold, eta) + + if len(ts_vals) > n_max: + ts_vals = ts_vals[:n_max] + + Ntot = len(ts_vals) + ts_eta = ts_vals[ts_vals > eta] + N_prime = len(ts_eta) + alpha = N_prime/Ntot + + obj = lambda x: truncated_gamma_logpdf(x[0], x[1], eta=eta, + ts_above_eta=ts_eta, + N_above_eta=N_prime) + x0 = [0.75, 1.8] # Initial values of function parameters. + bounds = [[0.1, 10], [0.1, 10]] # Ranges for the minimization fitter. + r = minimize(obj, x0, bounds=bounds) + pars = r.x + + norm = alpha/gamma.sf(eta, a=pars[0], scale=pars[1]) + p = norm * gamma.sf(ts_threshold, a=pars[0], scale=pars[1]) + + # a correct calculation of the error in pvalue due to + # fitting uncertainty remains to be implemented + # return p_sigma = 0 for now for consistentcy with + # calculate_pval_from_trials() + p_sigma = 0.0 + return (p, p_sigma) + + +def calculate_pval_from_trials_mixed(ts_vals, ts_threshold, switch_at_ts=5.0, + eta=3.0, n_max=500000): + """Calculates the probability (p-value) of test-statistic exceeding + the given test-statistic threshold. This calculation relies on fitting + a gamma distribution to a list of ts values if ts_threshold is larger than + switch_at_ts. If ts_threshold is smaller then the pvalue will be taken + from the trials directly. + + Parameters + ---------- + ts_vals : (n_trials,)-shaped 1D ndarray of float + The ndarray holding the test-statistic values of the trials. + ts_threshold : float + The critical test-statistic value. + switch_at_ts : float, optional + Test-statistic value below which p-value is computed from trials + directly. For thresholds greater than switch_at_ts the pvalue is + calculated using a gamma fit. + eta : float, optional + Test-statistic value at which the gamma function is truncated + from below. Default = 3.0. + n_max : int, optional + The maximum number of trials that should be used during + fitting. Default = 500,000 + + Returns + ------- + p, p_err: tuple(float, float) + """ + if ts_threshold < switch_at_ts: + return calculate_pval_from_trials(ts_vals, ts_threshold) + else: + return calculate_pval_from_gammafit_to_trials(ts_vals, ts_threshold, eta=eta, n_max=n_max) + + def truncated_gamma_logpdf( a, scale, eta, ts_above_eta, N_above_eta): """Calculates the -log(likelihood) of a sample of random numbers From 1c5be34c4330588fd970fd20ecb64ba773bea218 Mon Sep 17 00:00:00 2001 From: tomaskontrimas Date: Mon, 9 May 2022 13:57:11 -0500 Subject: [PATCH 2/4] Update docstring --- skyllh/core/analysis_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/skyllh/core/analysis_utils.py b/skyllh/core/analysis_utils.py index 2a3145aae4..4a276419c2 100644 --- a/skyllh/core/analysis_utils.py +++ b/skyllh/core/analysis_utils.py @@ -90,6 +90,10 @@ def calculate_pval_from_trials( The ndarray holding the test-statistic values of the trials. ts_threshold : float The critical test-statistic value. + + Returns + ------- + p, p_sigma: tuple(float, float) """ p = ts_vals[ts_vals > ts_threshold].size / ts_vals.size p_sigma = np.sqrt(p * (1 - p) / ts_vals.size) @@ -118,7 +122,7 @@ def calculate_pval_from_gammafit_to_trials(ts_vals, ts_threshold, Returns ------- - p, p_err: tuple(float, float) + p, p_sigma: tuple(float, float) """ if(ts_threshold < eta): raise ValueError( @@ -181,7 +185,7 @@ def calculate_pval_from_trials_mixed(ts_vals, ts_threshold, switch_at_ts=5.0, Returns ------- - p, p_err: tuple(float, float) + p, p_sigma: tuple(float, float) """ if ts_threshold < switch_at_ts: return calculate_pval_from_trials(ts_vals, ts_threshold) From 3a44c176f0090127422dc0e0f9aa1fd590df7d3c Mon Sep 17 00:00:00 2001 From: tomaskontrimas Date: Mon, 9 May 2022 13:59:43 -0500 Subject: [PATCH 3/4] Add comp_operator arg for pval calculation and set `eta` to `switch_at_ts` for mixed pval calculation --- skyllh/core/analysis_utils.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/skyllh/core/analysis_utils.py b/skyllh/core/analysis_utils.py index 4a276419c2..4ec144351f 100644 --- a/skyllh/core/analysis_utils.py +++ b/skyllh/core/analysis_utils.py @@ -78,7 +78,7 @@ def pointlikesource_to_data_field_array( def calculate_pval_from_trials( - ts_vals, ts_threshold): + ts_vals, ts_threshold, comp_operator='greater'): """Calculates the percentage (p-value) of test-statistic trials that are above the given test-statistic critical value. In addition it calculates the standard deviation of the p-value assuming @@ -90,12 +90,24 @@ def calculate_pval_from_trials( The ndarray holding the test-statistic values of the trials. ts_threshold : float The critical test-statistic value. + comp_operator: string, optional + The comparison operator for p-value calculation. It can be set to one of + the following options: 'greater' or 'greater_equal'. Returns ------- p, p_sigma: tuple(float, float) """ - p = ts_vals[ts_vals > ts_threshold].size / ts_vals.size + if comp_operator == 'greater': + p = ts_vals[ts_vals > ts_threshold].size / ts_vals.size + elif comp_operator == 'greater_equal': + p = ts_vals[ts_vals >= ts_threshold].size / ts_vals.size + else: + raise ValueError( + f"The comp_operator={comp_operator} is not an" + "available option ('greater' or 'greater_equal')." + ) + p_sigma = np.sqrt(p * (1 - p) / ts_vals.size) return (p, p_sigma) @@ -124,7 +136,7 @@ def calculate_pval_from_gammafit_to_trials(ts_vals, ts_threshold, ------- p, p_sigma: tuple(float, float) """ - if(ts_threshold < eta): + if(ts_threshold < eta): raise ValueError( 'ts threshold value = %e, eta = %e. The calculation of the p-value' 'from the fit is correct only for ts threshold larger than ' @@ -159,7 +171,7 @@ def calculate_pval_from_gammafit_to_trials(ts_vals, ts_threshold, def calculate_pval_from_trials_mixed(ts_vals, ts_threshold, switch_at_ts=5.0, - eta=3.0, n_max=500000): + eta=None, n_max=500000, comp_operator='greater_equal'): """Calculates the probability (p-value) of test-statistic exceeding the given test-statistic threshold. This calculation relies on fitting a gamma distribution to a list of ts values if ts_threshold is larger than @@ -178,17 +190,25 @@ def calculate_pval_from_trials_mixed(ts_vals, ts_threshold, switch_at_ts=5.0, calculated using a gamma fit. eta : float, optional Test-statistic value at which the gamma function is truncated - from below. Default = 3.0. + from below. Default is None. n_max : int, optional The maximum number of trials that should be used during fitting. Default = 500,000 + comp_operator: string, optional + The comparison operator for p-value calculation. It can be set to one of + the following options: 'greater' or 'greater_equal'. Returns ------- p, p_sigma: tuple(float, float) """ + # Set `eta` to `switch_at_ts` as a default. + # It makes sure that both functions return the same pval at `switch_at_ts`. + if eta is None: + eta = switch_at_ts + if ts_threshold < switch_at_ts: - return calculate_pval_from_trials(ts_vals, ts_threshold) + return calculate_pval_from_trials(ts_vals, ts_threshold, comp_operator=comp_operator) else: return calculate_pval_from_gammafit_to_trials(ts_vals, ts_threshold, eta=eta, n_max=n_max) From ffc51b785ac05f715a5eef22658d0555114ef1d0 Mon Sep 17 00:00:00 2001 From: tomaskontrimas Date: Mon, 9 May 2022 14:42:09 -0500 Subject: [PATCH 4/4] Update `switch_at_ts` default to 3.0 --- skyllh/core/analysis_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skyllh/core/analysis_utils.py b/skyllh/core/analysis_utils.py index 4ec144351f..406b464234 100644 --- a/skyllh/core/analysis_utils.py +++ b/skyllh/core/analysis_utils.py @@ -170,7 +170,7 @@ def calculate_pval_from_gammafit_to_trials(ts_vals, ts_threshold, return (p, p_sigma) -def calculate_pval_from_trials_mixed(ts_vals, ts_threshold, switch_at_ts=5.0, +def calculate_pval_from_trials_mixed(ts_vals, ts_threshold, switch_at_ts=3.0, eta=None, n_max=500000, comp_operator='greater_equal'): """Calculates the probability (p-value) of test-statistic exceeding the given test-statistic threshold. This calculation relies on fitting