From 2c1c7e0c9266935492e1aa2f22eda6d721aaaf0f Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Tue, 28 Jan 2025 15:14:41 +0100 Subject: [PATCH 01/12] New functionality for bounding LR systems, based on a publication by I. Alberink et al. that is being submitted for publication. --- lir/bounding.py | 119 +++++++++++++++++++++++++++++++++++++++++ tests/test_bounding.py | 61 +++++++++++++++++++++ 2 files changed, 180 insertions(+) create mode 100644 lir/bounding.py create mode 100644 tests/test_bounding.py diff --git a/lir/bounding.py b/lir/bounding.py new file mode 100644 index 0000000..50e1285 --- /dev/null +++ b/lir/bounding.py @@ -0,0 +1,119 @@ +""" +Extrapolation bounds on LRs using the method by Alberink et al. (2025) + +See: +[-] Ivo Alberink, Jeannette Leegwater, Jonas Malmborg, Anders Nordgaard, Marjan Sjerps, Leen van der Ham + A transparent method to determine limit values for Likelihood Ratio systems + In: to be submitted for publication. +""" +import numpy as np +import matplotlib.pyplot as plt + +def plot_delta_functions(lrs, y, llr_threshold_range=None, step_size=.001, ax=plt): + + if llr_threshold_range is None: + llrs = np.log10(lrs) + llr_threshold_range = (np.min(llrs) - .5, np.max(llrs) + .5) + + llr_threshold = np.arange(*llr_threshold_range, step_size) + + lower_bound, upper_bound, delta_low, delta_high = (calculate_bounds(lrs, y, llr_threshold=llr_threshold)) + + # plot the delta-functions and the 0-line + lower_llr = np.round(np.log10(lower_bound),2) + upper_llr = np.round(np.log10(upper_bound),2) + ax.plot(llr_threshold, delta_low, '--', label="$\Delta_{lower}$ is 0 at " + str(lower_llr)) + ax.plot(llr_threshold, delta_high, '-', label="$\Delta_{upper}$ is 0 at " + str(upper_llr)) + ax.axhline(y=0, color='k', linestyle='dotted') + # Some more formatting + ax.legend(loc="upper left") + ax.xlabel("log10(LR)") + ax.ylabel("$\Delta$-value") + ax.xlim(llr_threshold_range) + ax.grid(True, linestyle=':') + ax.show() + + return lower_bound, upper_bound + +def calculate_bounds(lrs, y, llr_threshold=None, step_size=.001, substitute_extremes=(np.exp(-20), np.exp(20))): + """ + Returns the upper and lower bounds of the LRs. + + :param lrs: an array of LRs + :param y: an array of ground-truth labels (values 0 for Hd or 1 for Hp); + must be of the same length as `lrs` + :param llr_threshold: predefined values of LLRs as possible bounds + :param step_size: required accuracy on a base-10 logarithmic scale + :param substitute_extremes: (tuple of scalars) substitute for extreme LRs, i.e. 
+ LRs of 0 and inf are substituted by these values + """ + + # remove LRs of 0 and infinity + sanitized_lrs = lrs + sanitized_lrs[sanitized_lrs < substitute_extremes[0]] = substitute_extremes[0] + sanitized_lrs[sanitized_lrs > substitute_extremes[1]] = substitute_extremes[1] + + # determine the range of LRs to be considered + if llr_threshold is None: + llrs = np.log10(sanitized_lrs) + llr_threshold_range = (min(0, np.min(llrs)), max(0, np.max(llrs))+step_size) + llr_threshold = np.arange(*llr_threshold_range, step_size) + + # calculate the two delta functions + delta_low, delta_high = calculate_delta_functions(lrs, y, llr_threshold) + + # find the LLRs closest to LLR=0 where the functions become negative & convert them to LRs + # if no negatives are found, use the maximum H1-LR in case of upper bound & minimum H2-LR in case of lower bound + delta_high_negative = np.where(delta_high < 0) + if not any(delta_high_negative[0]): + upper_bound = np.max(lrs[y==1]) + else: + pst_upper_bound = delta_high_negative[0][0] - 1 + upper_bound = 10 ** llr_threshold[pst_upper_bound] + delta_low_negative = np.where(delta_low < 0) + if not any(delta_low_negative[0]): + lower_bound = np.min(lrs[y==0]) + else: + pst_lower_bound = delta_low_negative[0][-1] + 1 + lower_bound = 10 ** llr_threshold[pst_lower_bound] + + # Check for bounds on the wrong side of 1. This may occur for badly + # performing LR systems, e.g. if the delta function is always below zero. + lower_bound = min(lower_bound, 1) + upper_bound = max(upper_bound, 1) + + return lower_bound, upper_bound, delta_low, delta_high + +def calculate_delta_functions(lrs, y, llr_threshold): + """ + Calculates the delta functions for a set of LRs at given threshold values. + + :param lrs: an array of LRs + :param y: an array of ground-truth labels (values 0 for Hd or 1 for Hp); + must be of the same length as `lrs` + :param llr_threshold: an array of threshold LLRs + :returns: two arrays of delta-values, at all threshold LR values + """ + + # fix the value used for the beta distributions at 1/2 (Jeffreys prior) + beta_parameter = 1 / 2 + + # for all possible llr_threshold values, count how many of the lrs are larger or equal to them for both h1 and h2 + lrs_h1 = lrs[y==1] + lrs_h2 = lrs[y==0] + llr_h1_2d = np.tile(np.expand_dims(np.log10(lrs_h1), 1), (1, llr_threshold.shape[0])) + llr_h2_2d = np.tile(np.expand_dims(np.log10(lrs_h2), 1), (1, llr_threshold.shape[0])) + success_h1 = np.sum(llr_h1_2d >= llr_threshold, axis=0) + success_h2 = np.sum(llr_h2_2d >= llr_threshold, axis=0) + + # use the as inputs for calculations of the probabilities + prob_h1_above_grid = (success_h1 + beta_parameter) / (len(lrs_h1) + 2*beta_parameter) + prob_h2_above_grid = (success_h2 + beta_parameter) / (len(lrs_h2) + 2*beta_parameter) + prob_h1_below_grid = 1 - prob_h1_above_grid + prob_h2_below_grid = 1 - prob_h2_above_grid + + # calculate the delta-functions for all the llr_threshold values + delta_high = np.log10(prob_h1_above_grid) - np.log10(prob_h2_above_grid) - llr_threshold + delta_low = llr_threshold - np.log10(prob_h1_below_grid) + np.log10(prob_h2_below_grid) + + return delta_low, delta_high diff --git a/tests/test_bounding.py b/tests/test_bounding.py new file mode 100644 index 0000000..fc14000 --- /dev/null +++ b/tests/test_bounding.py @@ -0,0 +1,61 @@ +import numpy as np +import unittest + +# import lir.bounding as bounding +import bounding +from lir.data import AlcoholBreathAnalyser + + +class TestBounding(unittest.TestCase): + + def test_breath(self): + lrs, y = 
AlcoholBreathAnalyser(ill_calibrated=True).sample_lrs() + bounds = bounding.calculate_bounds(lrs, y) + np.testing.assert_almost_equal((0.1052741, 85.3731634), bounds[:2]) + + def test_extreme_smallset(self): + lrs = np.array([np.inf, 0]) + y = np.array([1, 0]) + bounds = bounding.calculate_bounds(lrs, y) + np.testing.assert_almost_equal((0.3335112, 2.9999248), bounds[:2]) + + def test_extreme(self): + lrs = np.array([np.inf, np.inf, np.inf, 0, 0, 0]) + y = np.array([1, 1, 1, 0, 0, 0]) + bounds = bounding.calculate_bounds(lrs, y) + np.testing.assert_almost_equal((0.1429257, 6.9840986), bounds[:2]) + + def test_system01(self): + lrs = np.array([.01, .1, 1, 10, 100]) + bounds_bad = bounding.calculate_bounds(lrs, np.array([1, 1, 1, 0, 0])) + bounds_good1 = bounding.calculate_bounds(lrs, np.array([0, 0, 1, 1, 1])) + bounds_good2 = bounding.calculate_bounds(lrs, np.array([0, 0, 0, 1, 1])) + + np.testing.assert_almost_equal((1, 1), bounds_bad[:2]) + np.testing.assert_almost_equal((0.1503142, 3.74973), bounds_good1[:2]) + np.testing.assert_almost_equal((0.2666859, 6.6527316), bounds_good2[:2]) + + def test_neutral_smallset(self): + lrs = np.array([1, 1]) + y = np.array([1, 0]) + bounds = bounding.calculate_bounds(lrs, y) + np.testing.assert_almost_equal((1, 1), bounds[:2]) + + def test_bias(self): + lrs = np.ones(10) * 10 + y = np.concatenate([np.ones(9), np.zeros(1)]) + bounds = bounding.calculate_bounds(lrs, y) + np.testing.assert_almost_equal((1, 1.2647363), bounds[:2]) + + lrs = np.concatenate([np.ones(10) * 10, np.ones(1)]) + y = np.concatenate([np.ones(10), np.zeros(1)]) + bounds = bounding.calculate_bounds(lrs, y) + np.testing.assert_almost_equal((1, 3.8106582), bounds[:2]) + + lrs = np.concatenate([np.ones(10) * 1000, np.ones(1) * 1.1]) + y = np.concatenate([np.ones(10), np.zeros(1)]) + bounds = bounding.calculate_bounds(lrs, y) + np.testing.assert_almost_equal((1, 3.8106582), bounds[:2]) + +if __name__ == '__main__': + unittest.main() From 05f84a8823994d38977da4a7a75023c9a06afd79 Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Tue, 28 Jan 2025 15:20:20 +0100 Subject: [PATCH 02/12] Fix path to bounding function --- tests/test_bounding.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_bounding.py b/tests/test_bounding.py index fc14000..778f9be 100644 --- a/tests/test_bounding.py +++ b/tests/test_bounding.py @@ -1,8 +1,7 @@ import numpy as np import unittest -# import lir.bounding as bounding -import bounding +import lir.bounding as bounding from lir.data import AlcoholBreathAnalyser From a32f49b7f12e2675e161118dd7daee0ae0272d3c Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Thu, 30 Jan 2025 15:31:00 +0100 Subject: [PATCH 03/12] Processed various review comments. --- lir/bounding.py | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/lir/bounding.py b/lir/bounding.py index 50e1285..a7526de 100644 --- a/lir/bounding.py +++ b/lir/bounding.py @@ -9,7 +9,18 @@ import numpy as np import matplotlib.pyplot as plt -def plot_delta_functions(lrs, y, llr_threshold_range=None, step_size=.001, ax=plt): +def plot_delta_functions(lrs: np.ndarray, y: np.ndarray, llr_threshold_range: tuple[float, float] = None, + step_size: float = .001, ax: plt.Axes = plt) -> None: + """ + Returns a figure of the delta functions along with the upper and lower bounds of the LRs. 
+ + :param lrs: an array of LRs + :param y: an array of ground-truth labels (values 0 for Hd or 1 for Hp); + must be of the same length as `lrs` + :param llr_threshold_range: lower limit and upper limit for the LLRs to include in the figure + :param step_size: required accuracy on a base-10 logarithmic scale + :param ax: matplotlib axes + """ if llr_threshold_range is None: llrs = np.log10(lrs) @@ -17,25 +28,21 @@ def plot_delta_functions(lrs, y, llr_threshold_range=None, step_size=.001, ax=pl llr_threshold = np.arange(*llr_threshold_range, step_size) - lower_bound, upper_bound, delta_low, delta_high = (calculate_bounds(lrs, y, llr_threshold=llr_threshold)) + lower_bound, upper_bound, delta_low, delta_high = calculate_bounds(lrs, y, llr_threshold=llr_threshold) # plot the delta-functions and the 0-line lower_llr = np.round(np.log10(lower_bound),2) upper_llr = np.round(np.log10(upper_bound),2) - ax.plot(llr_threshold, delta_low, '--', label="$\Delta_{lower}$ is 0 at " + str(lower_llr)) - ax.plot(llr_threshold, delta_high, '-', label="$\Delta_{upper}$ is 0 at " + str(upper_llr)) - ax.axhline(y=0, color='k', linestyle='dotted') + ax.plot(llr_threshold, delta_low, '--', label=r"$\Delta_{lower}$ is 0 at " + str(lower_llr)) + ax.plot(llr_threshold, delta_high, '-', label=r"$\Delta_{upper}$ is 0 at " + str(upper_llr)) + ax.axhline(color='k', linestyle='dotted') # Some more formatting ax.legend(loc="upper left") ax.xlabel("log10(LR)") ax.ylabel("$\Delta$-value") - ax.xlim(llr_threshold_range) - ax.grid(True, linestyle=':') - ax.show() - - return lower_bound, upper_bound -def calculate_bounds(lrs, y, llr_threshold=None, step_size=.001, substitute_extremes=(np.exp(-20), np.exp(20))): +def calculate_bounds(lrs: np.ndarray, y: np.ndarray, llr_threshold: np.ndarray = None, step_size: float = .001, + substitute_extremes: tuple[float, float] = (np.exp(-20), np.exp(20))) -> tuple[float, float, np.ndarray, np.ndarray]: """ Returns the upper and lower bounds of the LRs. @@ -64,17 +71,17 @@ def calculate_bounds(lrs, y, llr_threshold=None, step_size=.001, substitute_extr # find the LLRs closest to LLR=0 where the functions become negative & convert them to LRs # if no negatives are found, use the maximum H1-LR in case of upper bound & minimum H2-LR in case of lower bound - delta_high_negative = np.where(delta_high < 0) - if not any(delta_high_negative[0]): + delta_high_negative = np.where(delta_high < 0)[0] + if not any(delta_high_negative): upper_bound = np.max(lrs[y==1]) else: - pst_upper_bound = delta_high_negative[0][0] - 1 + pst_upper_bound = delta_high_negative[0] - 1 upper_bound = 10 ** llr_threshold[pst_upper_bound] - delta_low_negative = np.where(delta_low < 0) - if not any(delta_low_negative[0]): + delta_low_negative = np.where(delta_low < 0)[0] + if not any(delta_low_negative): lower_bound = np.min(lrs[y==0]) else: - pst_lower_bound = delta_low_negative[0][-1] + 1 + pst_lower_bound = delta_low_negative[-1] + 1 lower_bound = 10 ** llr_threshold[pst_lower_bound] # Check for bounds on the wrong side of 1. This may occur for badly @@ -84,7 +91,7 @@ def calculate_bounds(lrs, y, llr_threshold=None, step_size=.001, substitute_extr return lower_bound, upper_bound, delta_low, delta_high -def calculate_delta_functions(lrs, y, llr_threshold): +def calculate_delta_functions(lrs: np.ndarray, y: np.ndarray, llr_threshold: np.ndarray) -> tuple[np.ndarray, np.ndarray]: """ Calculates the delta functions for a set of LRs at given threshold values. 
From 7a4f05a9ee84d1d3455d9d199c08c6970261f45d Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Tue, 4 Feb 2025 12:46:26 +0100 Subject: [PATCH 04/12] Added test case for ELUBbounder. --- tests/test_elub.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_elub.py b/tests/test_elub.py index 380b4dc..28ba7fe 100644 --- a/tests/test_elub.py +++ b/tests/test_elub.py @@ -1,8 +1,12 @@ import numpy as np import unittest +from sklearn.linear_model import LogisticRegression import lir.bayeserror from lir.data import AlcoholBreathAnalyser +from lir.calibration import KDECalibrator, ELUBbounder +from lir.lr import CalibratedScorer +from lir.util import Xn_to_Xy class TestElub(unittest.TestCase): @@ -54,6 +58,18 @@ def test_bias(self): y = np.concatenate([np.ones(10), np.zeros(1)]) np.testing.assert_almost_equal((1, 1), lir.bayeserror.elub(lrs, y, add_misleading=1)) + def test_bounded_calibrated_scorer(self): + + np.random.seed(0) + + X0 = np.random.normal(loc=-1, scale=1, size=(1000, 1)) + X1 = np.random.normal(loc=+1, scale=1, size=(1000, 1)) + X, y = Xn_to_Xy(X0, X1) + + bounded_calibrated_scorer = CalibratedScorer(LogisticRegression(), ELUBbounder(KDECalibrator(bandwidth=(1, 1)))) + bounded_calibrated_scorer.fit(X, y) + bounds = (bounded_calibrated_scorer.calibrator._lower_lr_bound, bounded_calibrated_scorer.calibrator._upper_lr_bound) + np.testing.assert_almost_equal((0.0497761, 17.9893586), bounds) if __name__ == '__main__': unittest.main() From f478f970e8b6fa0fac252c25d900ba4f6f60f319 Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Tue, 4 Feb 2025 13:09:02 +0100 Subject: [PATCH 05/12] Redefined seed for better reproducibility. --- tests/test_elub.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_elub.py b/tests/test_elub.py index 28ba7fe..5ea8d97 100644 --- a/tests/test_elub.py +++ b/tests/test_elub.py @@ -60,16 +60,16 @@ def test_bias(self): def test_bounded_calibrated_scorer(self): - np.random.seed(0) + rng = np.random.default_rng(0) - X0 = np.random.normal(loc=-1, scale=1, size=(1000, 1)) - X1 = np.random.normal(loc=+1, scale=1, size=(1000, 1)) + X0 = rng.normal(loc=-1, scale=1, size=(1000, 1)) + X1 = rng.normal(loc=+1, scale=1, size=(1000, 1)) X, y = Xn_to_Xy(X0, X1) bounded_calibrated_scorer = CalibratedScorer(LogisticRegression(), ELUBbounder(KDECalibrator(bandwidth=(1, 1)))) bounded_calibrated_scorer.fit(X, y) bounds = (bounded_calibrated_scorer.calibrator._lower_lr_bound, bounded_calibrated_scorer.calibrator._upper_lr_bound) - np.testing.assert_almost_equal((0.0497761, 17.9893586), bounds) + np.testing.assert_almost_equal((0.063251, 17.6256669), bounds) if __name__ == '__main__': unittest.main() From 831c201df4aee7cb39ee65fb9d7b33f3e9bd0420 Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Tue, 4 Feb 2025 13:25:07 +0100 Subject: [PATCH 06/12] Changed calibrator from kde to logit. 
--- tests/test_elub.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_elub.py b/tests/test_elub.py index 5ea8d97..63fae94 100644 --- a/tests/test_elub.py +++ b/tests/test_elub.py @@ -4,7 +4,7 @@ import lir.bayeserror from lir.data import AlcoholBreathAnalyser -from lir.calibration import KDECalibrator, ELUBbounder +from lir.calibration import LogitCalibrator, ELUBbounder from lir.lr import CalibratedScorer from lir.util import Xn_to_Xy @@ -66,10 +66,10 @@ def test_bounded_calibrated_scorer(self): X1 = rng.normal(loc=+1, scale=1, size=(1000, 1)) X, y = Xn_to_Xy(X0, X1) - bounded_calibrated_scorer = CalibratedScorer(LogisticRegression(), ELUBbounder(KDECalibrator(bandwidth=(1, 1)))) + bounded_calibrated_scorer = CalibratedScorer(LogisticRegression(), ELUBbounder(LogitCalibrator())) bounded_calibrated_scorer.fit(X, y) bounds = (bounded_calibrated_scorer.calibrator._lower_lr_bound, bounded_calibrated_scorer.calibrator._upper_lr_bound) - np.testing.assert_almost_equal((0.063251, 17.6256669), bounds) + np.testing.assert_almost_equal((0.0293941, 104.9032605), bounds) if __name__ == '__main__': unittest.main() From a02f0603aa4fd4231c339d51ad10e6c2b63446f2 Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Tue, 4 Feb 2025 14:54:44 +0100 Subject: [PATCH 07/12] Updated values to results from github. --- tests/test_elub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_elub.py b/tests/test_elub.py index 63fae94..05c610e 100644 --- a/tests/test_elub.py +++ b/tests/test_elub.py @@ -69,7 +69,7 @@ def test_bounded_calibrated_scorer(self): bounded_calibrated_scorer = CalibratedScorer(LogisticRegression(), ELUBbounder(LogitCalibrator())) bounded_calibrated_scorer.fit(X, y) bounds = (bounded_calibrated_scorer.calibrator._lower_lr_bound, bounded_calibrated_scorer.calibrator._upper_lr_bound) - np.testing.assert_almost_equal((0.0293941, 104.9032605), bounds) + np.testing.assert_almost_equal((0.0293081, 104.5963329), bounds) if __name__ == '__main__': unittest.main() From 18dc3a1c253ceef3f6c85d332c64ac9fe6160f96 Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Tue, 4 Feb 2025 15:00:36 +0100 Subject: [PATCH 08/12] Make generic LRbounder class, with ELUBbounder as subclass. --- lir/calibration.py | 90 +++++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 40 deletions(-) diff --git a/lir/calibration.py b/lir/calibration.py index 698123a..e476026 100644 --- a/lir/calibration.py +++ b/lir/calibration.py @@ -11,6 +11,7 @@ from sklearn.mixture import GaussianMixture from sklearn.neighbors import KernelDensity from typing import Optional, Tuple, Union, Callable, Sized +from abc import ABC, abstractmethod from .bayeserror import elub from .loss_functions import negative_log_likelihood_balanced @@ -671,7 +672,53 @@ def transform(self, log_odds: np.ndarray): return odds -class ELUBbounder(BaseEstimator, TransformerMixin): +class LRbounder(ABC, BaseEstimator, TransformerMixin): + """ + Class that, given an LR system, outputs the same LRs as the system but bounded by lower and upper bounds. + """ + + def __init__(self, first_step_calibrator, also_fit_calibrator=True): + """ + a calibrator should be provided (optionally already fitted to data). This calibrator is called on scores, + the resulting LRs are then bounded. If also_fit_calibrator, the first step calibrator will be fit on the same + data used to derive the bounds + :param first_step_calibrator: the calibrator to use. 
Should already have been fitted if also_fit_calibrator is False + :param also_fit_calibrator: whether to also fit the first step calibrator when calling fit + """ + + self.first_step_calibrator = first_step_calibrator + self.also_fit_calibrator = also_fit_calibrator + self._lower_lr_bound = None + self._upper_lr_bound = None + if not also_fit_calibrator: + # check the model was fitted. + try: + first_step_calibrator.transform(np.array([0.5])) + except NotFittedError: + print('calibrator should have been fit when setting also_fit_calibrator = False!') + + @abstractmethod + def fit(self, X, y): + pass + + def transform(self, X): + """ + a transform entails calling the first step calibrator and applying the bounds found + """ + unadjusted_lrs = np.array(self.first_step_calibrator.transform(X)) + lower_adjusted_lrs = np.where(self._lower_lr_bound < unadjusted_lrs, unadjusted_lrs, self._lower_lr_bound) + adjusted_lrs = np.where(self._upper_lr_bound > lower_adjusted_lrs, lower_adjusted_lrs, self._upper_lr_bound) + return adjusted_lrs + + @property + def p0(self): + return self.first_step_calibrator.p0 + + @property + def p1(self): + return self.first_step_calibrator.p1 + +class ELUBbounder(LRbounder): """ Class that, given an LR system, outputs the same LRs as the system but bounded by the Empirical Upper and Lower Bounds as described in @@ -705,51 +752,14 @@ class ELUBbounder(BaseEstimator, TransformerMixin): # empirical_bounds=[min(a) max(a)] """ - def __init__(self, first_step_calibrator, also_fit_calibrator=True): - """ - a calibrator should be provided (optionally already fitted to data). This calibrator is called on scores, - the resulting LRs are then bounded. If also_fit_calibrator, the first step calibrator will be fit on the same - data used to derive the ELUB bounds - :param first_step_calibrator: the calibrator to use. Should already have been fitted if also_fit_calibrator is False - :param also_fit_calibrator: whether to also fit the first step calibrator when calling fit - """ - - self.first_step_calibrator = first_step_calibrator - self.also_fit_calibrator = also_fit_calibrator - self._lower_lr_bound = None - self._upper_lr_bound = None - if not also_fit_calibrator: - # check the model was fitted. 
- try: - first_step_calibrator.transform(np.array([0.5])) - except NotFittedError: - print('calibrator should have been fit when setting also_fit_calibrator = False!') - def fit(self, X, y): """ assuming that y=1 corresponds to Hp, y=0 to Hd """ if self.also_fit_calibrator: - self.first_step_calibrator.fit(X,y) - lrs = self.first_step_calibrator.transform(X) + self.first_step_calibrator.fit(X, y) + lrs = self.first_step_calibrator.transform(X) y = np.asarray(y).squeeze() self._lower_lr_bound, self._upper_lr_bound = elub(lrs, y, add_misleading=1) return self - - def transform(self, X): - """ - a transform entails calling the first step calibrator and applying the bounds found - """ - unadjusted_lrs = np.array(self.first_step_calibrator.transform(X)) - lower_adjusted_lrs = np.where(self._lower_lr_bound < unadjusted_lrs, unadjusted_lrs, self._lower_lr_bound) - adjusted_lrs = np.where(self._upper_lr_bound > lower_adjusted_lrs, lower_adjusted_lrs, self._upper_lr_bound) - return adjusted_lrs - - @property - def p0(self): - return self.first_step_calibrator.p0 - - @property - def p1(self): - return self.first_step_calibrator.p1 From cf2892908ef846d076b7cc826854a0cc53a89ffb Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Tue, 4 Feb 2025 15:05:30 +0100 Subject: [PATCH 09/12] Added IVbounder subclass, including test. Made function names more distinctive. --- lir/bounding.py | 26 +++++++++++++------------- lir/calibration.py | 22 ++++++++++++++++++++++ tests/test_bounding.py | 37 +++++++++++++++++++++++++++---------- 3 files changed, 62 insertions(+), 23 deletions(-) diff --git a/lir/bounding.py b/lir/bounding.py index a7526de..c3ab3b8 100644 --- a/lir/bounding.py +++ b/lir/bounding.py @@ -1,18 +1,18 @@ """ -Extrapolation bounds on LRs using the method by Alberink et al. (2025) +Extrapolation bounds on LRs using the Invariance Verification method by Alberink et al. (2025) See: -[-] Ivo Alberink, Jeannette Leegwater, Jonas Malmborg, Anders Nordgaard, Marjan Sjerps, Leen van der Ham - A transparent method to determine limit values for Likelihood Ratio systems - In: to be submitted for publication. +[-] A transparent method to determine limit values for Likelihood Ratio systems, by + Ivo Alberink, Jeannette Leegwater, Jonas Malmborg, Anders Nordgaard, Marjan Sjerps, Leen van der Ham + In: Submitted for publication in 2025. """ import numpy as np import matplotlib.pyplot as plt -def plot_delta_functions(lrs: np.ndarray, y: np.ndarray, llr_threshold_range: tuple[float, float] = None, - step_size: float = .001, ax: plt.Axes = plt) -> None: +def plot_invariance_delta_functions(lrs: np.ndarray, y: np.ndarray, llr_threshold_range: tuple[float, float] = None, + step_size: float = .001, ax: plt.Axes = plt): """ - Returns a figure of the delta functions along with the upper and lower bounds of the LRs. + Returns a figure of the Invariance Verification delta functions along with the upper and lower bounds of the LRs. 
:param lrs: an array of LRs :param y: an array of ground-truth labels (values 0 for Hd or 1 for Hp); @@ -28,7 +28,7 @@ def plot_delta_functions(lrs: np.ndarray, y: np.ndarray, llr_threshold_range: tu llr_threshold = np.arange(*llr_threshold_range, step_size) - lower_bound, upper_bound, delta_low, delta_high = calculate_bounds(lrs, y, llr_threshold=llr_threshold) + lower_bound, upper_bound, delta_low, delta_high = calculate_invariance_bounds(lrs, y, llr_threshold=llr_threshold) # plot the delta-functions and the 0-line lower_llr = np.round(np.log10(lower_bound),2) @@ -41,10 +41,10 @@ def plot_delta_functions(lrs: np.ndarray, y: np.ndarray, llr_threshold_range: tu ax.xlabel("log10(LR)") ax.ylabel("$\Delta$-value") -def calculate_bounds(lrs: np.ndarray, y: np.ndarray, llr_threshold: np.ndarray = None, step_size: float = .001, +def calculate_invariance_bounds(lrs: np.ndarray, y: np.ndarray, llr_threshold: np.ndarray = None, step_size: float = .001, substitute_extremes: tuple[float, float] = (np.exp(-20), np.exp(20))) -> tuple[float, float, np.ndarray, np.ndarray]: """ - Returns the upper and lower bounds of the LRs. + Returns the upper and lower Invariance Verification bounds of the LRs. :param lrs: an array of LRs :param y: an array of ground-truth labels (values 0 for Hd or 1 for Hp); @@ -67,7 +67,7 @@ def calculate_bounds(lrs: np.ndarray, y: np.ndarray, llr_threshold: np.ndarray = llr_threshold = np.arange(*llr_threshold_range, step_size) # calculate the two delta functions - delta_low, delta_high = calculate_delta_functions(lrs, y, llr_threshold) + delta_low, delta_high = calculate_invariance_delta_functions(lrs, y, llr_threshold) # find the LLRs closest to LLR=0 where the functions become negative & convert them to LRs # if no negatives are found, use the maximum H1-LR in case of upper bound & minimum H2-LR in case of lower bound @@ -91,9 +91,9 @@ def calculate_bounds(lrs: np.ndarray, y: np.ndarray, llr_threshold: np.ndarray = return lower_bound, upper_bound, delta_low, delta_high -def calculate_delta_functions(lrs: np.ndarray, y: np.ndarray, llr_threshold: np.ndarray) -> tuple[np.ndarray, np.ndarray]: +def calculate_invariance_delta_functions(lrs: np.ndarray, y: np.ndarray, llr_threshold: np.ndarray) -> tuple[np.ndarray, np.ndarray]: """ - Calculates the delta functions for a set of LRs at given threshold values. + Calculates the Invariance Verification delta functions for a set of LRs at given threshold values. 
:param lrs: an array of LRs :param y: an array of ground-truth labels (values 0 for Hd or 1 for Hp); diff --git a/lir/calibration.py b/lir/calibration.py index e476026..7625584 100644 --- a/lir/calibration.py +++ b/lir/calibration.py @@ -14,6 +14,7 @@ from abc import ABC, abstractmethod from .bayeserror import elub +from .bounding import calculate_invariance_bounds from .loss_functions import negative_log_likelihood_balanced from .regression import IsotonicRegressionInf from .util import Xy_to_Xn, to_odds, ln_to_log10, Bind, to_probability @@ -763,3 +764,24 @@ def fit(self, X, y): y = np.asarray(y).squeeze() self._lower_lr_bound, self._upper_lr_bound = elub(lrs, y, add_misleading=1) return self + +class IVbounder(LRbounder): + """ + Class that, given an LR system, outputs the same LRs as the system but bounded by the Invariance Verification + bounds as described in: + A transparent method to determine limit values for Likelihood Ratio systems, by + Ivo Alberink, Jeannette Leegwater, Jonas Malmborg, Anders Nordgaard, Marjan Sjerps, Leen van der Ham + In: Submitted for publication in 2025. + """ + + def fit(self, X, y): + """ + assuming that y=1 corresponds to Hp, y=0 to Hd + """ + if self.also_fit_calibrator: + self.first_step_calibrator.fit(X, y) + lrs = self.first_step_calibrator.transform(X) + + y = np.asarray(y).squeeze() + self._lower_lr_bound, self._upper_lr_bound = calculate_invariance_bounds(lrs, y)[:2] + return self diff --git a/tests/test_bounding.py b/tests/test_bounding.py index 778f9be..5a17483 100644 --- a/tests/test_bounding.py +++ b/tests/test_bounding.py @@ -1,34 +1,38 @@ import numpy as np import unittest +from sklearn.linear_model import LogisticRegression import lir.bounding as bounding from lir.data import AlcoholBreathAnalyser +from lir.calibration import LogitCalibrator, IVbounder +from lir.lr import CalibratedScorer +from lir.util import Xn_to_Xy class TestBounding(unittest.TestCase): def test_breath(self): lrs, y = AlcoholBreathAnalyser(ill_calibrated=True).sample_lrs() - bounds = bounding.calculate_bounds(lrs, y) + bounds = bounding.calculate_invariance_bounds(lrs, y) np.testing.assert_almost_equal((0.1052741, 85.3731634), bounds[:2]) def test_extreme_smallset(self): lrs = np.array([np.inf, 0]) y = np.array([1, 0]) - bounds = bounding.calculate_bounds(lrs, y) + bounds = bounding.calculate_invariance_bounds(lrs, y) np.testing.assert_almost_equal((0.3335112, 2.9999248), bounds[:2]) def test_extreme(self): lrs = np.array([np.inf, np.inf, np.inf, 0, 0, 0]) y = np.array([1, 1, 1, 0, 0, 0]) - bounds = bounding.calculate_bounds(lrs, y) + bounds = bounding.calculate_invariance_bounds(lrs, y) np.testing.assert_almost_equal((0.1429257, 6.9840986), bounds[:2]) def test_system01(self): lrs = np.array([.01, .1, 1, 10, 100]) - bounds_bad = bounding.calculate_bounds(lrs, np.array([1, 1, 1, 0, 0])) - bounds_good1 = bounding.calculate_bounds(lrs, np.array([0, 0, 1, 1, 1])) - bounds_good2 = bounding.calculate_bounds(lrs, np.array([0, 0, 0, 1, 1])) + bounds_bad = bounding.calculate_invariance_bounds(lrs, np.array([1, 1, 1, 0, 0])) + bounds_good1 = bounding.calculate_invariance_bounds(lrs, np.array([0, 0, 1, 1, 1])) + bounds_good2 = bounding.calculate_invariance_bounds(lrs, np.array([0, 0, 0, 1, 1])) np.testing.assert_almost_equal((1, 1), bounds_bad[:2]) np.testing.assert_almost_equal((0.1503142, 3.74973), bounds_good1[:2]) @@ -37,24 +41,37 @@ def test_system01(self): def test_neutral_smallset(self): lrs = np.array([1, 1]) y = np.array([1, 0]) - bounds = bounding.calculate_bounds(lrs, 
y) + bounds = bounding.calculate_invariance_bounds(lrs, y) np.testing.assert_almost_equal((1, 1), bounds[:2]) def test_bias(self): lrs = np.ones(10) * 10 y = np.concatenate([np.ones(9), np.zeros(1)]) - bounds = bounding.calculate_bounds(lrs, y) + bounds = bounding.calculate_invariance_bounds(lrs, y) np.testing.assert_almost_equal((1, 1.2647363), bounds[:2]) lrs = np.concatenate([np.ones(10) * 10, np.ones(1)]) y = np.concatenate([np.ones(10), np.zeros(1)]) - bounds = bounding.calculate_bounds(lrs, y) + bounds = bounding.calculate_invariance_bounds(lrs, y) np.testing.assert_almost_equal((1, 3.8106582), bounds[:2]) lrs = np.concatenate([np.ones(10) * 1000, np.ones(1) * 1.1]) y = np.concatenate([np.ones(10), np.zeros(1)]) - bounds = bounding.calculate_bounds(lrs, y) + bounds = bounding.calculate_invariance_bounds(lrs, y) np.testing.assert_almost_equal((1, 3.8106582), bounds[:2]) + def test_bounded_calibrated_scorer(self): + + rng = np.random.default_rng(0) + + X0 = rng.normal(loc=-1, scale=1, size=(1000, 1)) + X1 = rng.normal(loc=+1, scale=1, size=(1000, 1)) + X, y = Xn_to_Xy(X0, X1) + + bounded_calibrated_scorer = CalibratedScorer(LogisticRegression(), IVbounder(LogitCalibrator())) + bounded_calibrated_scorer.fit(X, y) + bounds = (bounded_calibrated_scorer.calibrator._lower_lr_bound, bounded_calibrated_scorer.calibrator._upper_lr_bound) + np.testing.assert_almost_equal((0.0241724, 153.2216824), bounds) + if __name__ == '__main__': unittest.main() From d3b540b733ff6936f14e9c1081e12e4d4f1120a9 Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Tue, 4 Feb 2025 15:08:08 +0100 Subject: [PATCH 10/12] Updated new test outputs to match results from github. --- tests/test_bounding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_bounding.py b/tests/test_bounding.py index 5a17483..1dd162f 100644 --- a/tests/test_bounding.py +++ b/tests/test_bounding.py @@ -71,7 +71,7 @@ def test_bounded_calibrated_scorer(self): bounded_calibrated_scorer = CalibratedScorer(LogisticRegression(), IVbounder(LogitCalibrator())) bounded_calibrated_scorer.fit(X, y) bounds = (bounded_calibrated_scorer.calibrator._lower_lr_bound, bounded_calibrated_scorer.calibrator._upper_lr_bound) - np.testing.assert_almost_equal((0.0241724, 153.2216824), bounds) + np.testing.assert_almost_equal((0.0241763, 152.8940706), bounds) if __name__ == '__main__': unittest.main() From 2198947191c4cf2b9366c895b0a6d9124f025c3e Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Tue, 11 Feb 2025 09:58:46 +0100 Subject: [PATCH 11/12] Made split between super class and sub class more consistent. 
--- lir/calibration.py | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/lir/calibration.py b/lir/calibration.py index 7625584..f702037 100644 --- a/lir/calibration.py +++ b/lir/calibration.py @@ -699,9 +699,21 @@ def __init__(self, first_step_calibrator, also_fit_calibrator=True): print('calibrator should have been fit when setting also_fit_calibrator = False!') @abstractmethod - def fit(self, X, y): + def calculate_bounds(self, lrs, y): pass + def fit(self, X, y): + """ + assuming that y=1 corresponds to Hp, y=0 to Hd + """ + if self.also_fit_calibrator: + self.first_step_calibrator.fit(X, y) + lrs = self.first_step_calibrator.transform(X) + + y = np.asarray(y).squeeze() + self._lower_lr_bound, self._upper_lr_bound = self.calculate_bounds(lrs, y) + return self + def transform(self, X): """ a transform entails calling the first step calibrator and applying the bounds found @@ -753,17 +765,10 @@ class ELUBbounder(LRbounder): # empirical_bounds=[min(a) max(a)] """ - def fit(self, X, y): - """ - assuming that y=1 corresponds to Hp, y=0 to Hd - """ - if self.also_fit_calibrator: - self.first_step_calibrator.fit(X, y) - lrs = self.first_step_calibrator.transform(X) + def calculate_bounds(self, lrs, y): - y = np.asarray(y).squeeze() - self._lower_lr_bound, self._upper_lr_bound = elub(lrs, y, add_misleading=1) - return self + lower_lr_bound, upper_lr_bound = elub(lrs, y, add_misleading=1) + return lower_lr_bound, upper_lr_bound class IVbounder(LRbounder): """ @@ -774,14 +779,7 @@ class IVbounder(LRbounder): In: Submitted for publication in 2025. """ - def fit(self, X, y): - """ - assuming that y=1 corresponds to Hp, y=0 to Hd - """ - if self.also_fit_calibrator: - self.first_step_calibrator.fit(X, y) - lrs = self.first_step_calibrator.transform(X) + def calculate_bounds(self, lrs, y): - y = np.asarray(y).squeeze() - self._lower_lr_bound, self._upper_lr_bound = calculate_invariance_bounds(lrs, y)[:2] - return self + lower_lr_bound, upper_lr_bound = calculate_invariance_bounds(lrs, y)[:2] + return lower_lr_bound, upper_lr_bound From 0f48d72cbfb9aa82c687c11a55a3bd58843117e4 Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Wed, 12 Feb 2025 09:54:39 +0100 Subject: [PATCH 12/12] Improve error handling when method is not implemented in subclass --- lir/calibration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lir/calibration.py b/lir/calibration.py index f702037..c05c1f4 100644 --- a/lir/calibration.py +++ b/lir/calibration.py @@ -700,7 +700,7 @@ def __init__(self, first_step_calibrator, also_fit_calibrator=True): @abstractmethod def calculate_bounds(self, lrs, y): - pass + raise NotImplementedError def fit(self, X, y): """
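A minimal usage sketch of the bounding API added in this series, restricted to the calls exercised by the tests above (CalibratedScorer, IVbounder, LogitCalibrator, calculate_invariance_bounds, Xn_to_Xy); the synthetic data and the printed attributes are illustrative assumptions, not part of the patches:

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    from lir.bounding import calculate_invariance_bounds
    from lir.calibration import IVbounder, LogitCalibrator
    from lir.lr import CalibratedScorer
    from lir.util import Xn_to_Xy

    # two classes of synthetic scores: y=0 for Hd, y=1 for Hp
    rng = np.random.default_rng(0)
    X0 = rng.normal(loc=-1, scale=1, size=(1000, 1))
    X1 = rng.normal(loc=+1, scale=1, size=(1000, 1))
    X, y = Xn_to_Xy(X0, X1)

    # calibrate LRs and clip them to the Invariance Verification bounds
    scorer = CalibratedScorer(LogisticRegression(), IVbounder(LogitCalibrator()))
    scorer.fit(X, y)
    print(scorer.calibrator._lower_lr_bound, scorer.calibrator._upper_lr_bound)

    # the bounds can also be computed directly from an array of LRs and labels
    lrs = np.array([0.01, 0.1, 1, 10, 100])
    labels = np.array([0, 0, 0, 1, 1])
    lower, upper, delta_low, delta_high = calculate_invariance_bounds(lrs, labels)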