New functionality for bounding LR systems #108

Merged: 12 commits, Feb 12, 2025
126 changes: 126 additions & 0 deletions lir/bounding.py
@@ -0,0 +1,126 @@
"""
Extrapolation bounds on LRs using the Invariance Verification method by Alberink et al. (2025)

See:
- A transparent method to determine limit values for Likelihood Ratio systems, by
Ivo Alberink, Jeannette Leegwater, Jonas Malmborg, Anders Nordgaard, Marjan Sjerps, Leen van der Ham
In: Submitted for publication in 2025.
"""
import numpy as np
import matplotlib.pyplot as plt

def plot_invariance_delta_functions(lrs: np.ndarray, y: np.ndarray, llr_threshold_range: tuple[float, float] = None,
step_size: float = .001, ax: plt.Axes = plt):
"""
    Plots the Invariance Verification delta functions on the given axes, along with the upper and lower bounds of the LRs.

:param lrs: an array of LRs
:param y: an array of ground-truth labels (values 0 for Hd or 1 for Hp);
must be of the same length as `lrs`
:param llr_threshold_range: lower limit and upper limit for the LLRs to include in the figure
:param step_size: required accuracy on a base-10 logarithmic scale
:param ax: matplotlib axes
"""

if llr_threshold_range is None:
llrs = np.log10(lrs)
llr_threshold_range = (np.min(llrs) - .5, np.max(llrs) + .5)

llr_threshold = np.arange(*llr_threshold_range, step_size)

lower_bound, upper_bound, delta_low, delta_high = calculate_invariance_bounds(lrs, y, llr_threshold=llr_threshold)

# plot the delta-functions and the 0-line
    lower_llr = np.round(np.log10(lower_bound), 2)
    upper_llr = np.round(np.log10(upper_bound), 2)
ax.plot(llr_threshold, delta_low, '--', label=r"$\Delta_{lower}$ is 0 at " + str(lower_llr))
ax.plot(llr_threshold, delta_high, '-', label=r"$\Delta_{upper}$ is 0 at " + str(upper_llr))
ax.axhline(color='k', linestyle='dotted')
    # some more formatting; `plt` itself exposes `xlabel`/`ylabel`, while Axes objects use `set_xlabel`/`set_ylabel`
    ax.legend(loc="upper left")
    if ax is plt:
        ax.xlabel("log10(LR)")
        ax.ylabel(r"$\Delta$-value")
    else:
        ax.set_xlabel("log10(LR)")
        ax.set_ylabel(r"$\Delta$-value")

def calculate_invariance_bounds(lrs: np.ndarray, y: np.ndarray, llr_threshold: np.ndarray = None, step_size: float = .001,
substitute_extremes: tuple[float, float] = (np.exp(-20), np.exp(20))) -> tuple[float, float, np.ndarray, np.ndarray]:
"""
Returns the upper and lower Invariance Verification bounds of the LRs.

:param lrs: an array of LRs
:param y: an array of ground-truth labels (values 0 for Hd or 1 for Hp);
must be of the same length as `lrs`
:param llr_threshold: predefined values of LLRs as possible bounds
:param step_size: required accuracy on a base-10 logarithmic scale
    :param substitute_extremes: (tuple of scalars) substitutes for extreme LRs, i.e.
        LRs of 0 and inf are replaced by these values before taking logarithms
    :returns: the lower bound, the upper bound, and the two arrays of delta-values
    """

    # substitute extreme LRs (0 and inf) by finite values; work on a copy so the caller's array is not modified
    sanitized_lrs = np.copy(lrs)
    sanitized_lrs[sanitized_lrs < substitute_extremes[0]] = substitute_extremes[0]
    sanitized_lrs[sanitized_lrs > substitute_extremes[1]] = substitute_extremes[1]
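    # e.g. with the defaults, an LR of 0 is replaced by exp(-20) ≈ 2.1e-9 and an LR of inf by exp(20) ≈ 4.9e8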

# determine the range of LRs to be considered
if llr_threshold is None:
llrs = np.log10(sanitized_lrs)
        llr_threshold_range = (min(0, np.min(llrs)), max(0, np.max(llrs)) + step_size)
llr_threshold = np.arange(*llr_threshold_range, step_size)

    # calculate the two delta functions
    delta_low, delta_high = calculate_invariance_delta_functions(sanitized_lrs, y, llr_threshold)

    # find the LLRs closest to LLR=0 where the delta functions become negative & convert them to LRs;
    # if no negatives are found, use the maximum H1-LR for the upper bound & the minimum H2-LR for the lower bound
    delta_high_negative = np.where(delta_high < 0)[0]
    if delta_high_negative.size == 0:
        upper_bound = np.max(sanitized_lrs[y == 1])
    elif delta_high_negative[0] == 0:
        # delta_high is already negative at the first grid point; the wrong-side check below clamps this to 1
        upper_bound = 1
    else:
        upper_bound = 10 ** llr_threshold[delta_high_negative[0] - 1]
    delta_low_negative = np.where(delta_low < 0)[0]
    if delta_low_negative.size == 0:
        lower_bound = np.min(sanitized_lrs[y == 0])
    elif delta_low_negative[-1] == llr_threshold.size - 1:
        # delta_low is still negative at the last grid point; the wrong-side check below clamps this to 1
        lower_bound = 1
    else:
        lower_bound = 10 ** llr_threshold[delta_low_negative[-1] + 1]
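    # worked example: with llr_threshold = [-1, -.5, 0, .5, 1] and delta_high first negative at index 4,
    # the upper bound is 10 ** llr_threshold[3] = 10 ** 0.5 ≈ 3.16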

# Check for bounds on the wrong side of 1. This may occur for badly
# performing LR systems, e.g. if the delta function is always below zero.
lower_bound = min(lower_bound, 1)
upper_bound = max(upper_bound, 1)

return lower_bound, upper_bound, delta_low, delta_high

def calculate_invariance_delta_functions(lrs: np.ndarray, y: np.ndarray, llr_threshold: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
"""
Calculates the Invariance Verification delta functions for a set of LRs at given threshold values.

:param lrs: an array of LRs
:param y: an array of ground-truth labels (values 0 for Hd or 1 for Hp);
must be of the same length as `lrs`
:param llr_threshold: an array of threshold LLRs
    :returns: two arrays of delta-values (delta_low, delta_high), one value per threshold LLR
"""

# fix the value used for the beta distributions at 1/2 (Jeffreys prior)
beta_parameter = 1 / 2

    # for each llr_threshold value, count how many of the LRs are larger than or equal to it, for both H1 and H2
    lrs_h1 = lrs[y == 1]
    lrs_h2 = lrs[y == 0]
llr_h1_2d = np.tile(np.expand_dims(np.log10(lrs_h1), 1), (1, llr_threshold.shape[0]))
llr_h2_2d = np.tile(np.expand_dims(np.log10(lrs_h2), 1), (1, llr_threshold.shape[0]))
success_h1 = np.sum(llr_h1_2d >= llr_threshold, axis=0)
success_h2 = np.sum(llr_h2_2d >= llr_threshold, axis=0)

    # use these counts as inputs for the (Jeffreys-smoothed) probability estimates
prob_h1_above_grid = (success_h1 + beta_parameter) / (len(lrs_h1) + 2*beta_parameter)
prob_h2_above_grid = (success_h2 + beta_parameter) / (len(lrs_h2) + 2*beta_parameter)
prob_h1_below_grid = 1 - prob_h1_above_grid
prob_h2_below_grid = 1 - prob_h2_above_grid
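    # in symbols, with the smoothed estimates above:
    #   delta_high(t) = log10 P(LLR >= t | H1) - log10 P(LLR >= t | H2) - t
    #   delta_low(t)  = t - (log10 P(LLR < t | H1) - log10 P(LLR < t | H2))
    # informally: delta_high(t) >= 0 means the data still support reporting an LR as large as 10**t,
    # and delta_low(t) >= 0 means they still support an LR as small as 10**t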

# calculate the delta-functions for all the llr_threshold values
delta_high = np.log10(prob_h1_above_grid) - np.log10(prob_h2_above_grid) - llr_threshold
delta_low = llr_threshold - np.log10(prob_h1_below_grid) + np.log10(prob_h2_below_grid)

return delta_low, delta_high
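
A minimal usage sketch of this module (toy data for illustration; uses only the functions defined above):

import numpy as np
from lir.bounding import calculate_invariance_bounds

lrs = np.array([0.01, 0.1, 0.5, 2.0, 10.0, 100.0])  # toy LRs
y = np.array([0, 0, 0, 1, 1, 1])                    # 0 = Hd, 1 = Hp
lower, upper, delta_low, delta_high = calculate_invariance_bounds(lrs, y)
print(f"IV bounds: [{lower:.4f}, {upper:.4f}]")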
104 changes: 68 additions & 36 deletions lir/calibration.py
@@ -11,8 +11,10 @@
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KernelDensity
from typing import Optional, Tuple, Union, Callable, Sized
from abc import ABC, abstractmethod

from .bayeserror import elub
from .bounding import calculate_invariance_bounds
from .loss_functions import negative_log_likelihood_balanced
from .regression import IsotonicRegressionInf
from .util import Xy_to_Xn, to_odds, ln_to_log10, Bind, to_probability
@@ -671,7 +673,53 @@ def transform(self, log_odds: np.ndarray):
return odds


class LRbounder(ABC, BaseEstimator, TransformerMixin):
"""
Class that, given an LR system, outputs the same LRs as the system but bounded by lower and upper bounds.
"""

def __init__(self, first_step_calibrator, also_fit_calibrator=True):
"""
        A calibrator must be provided (optionally already fitted to data). This calibrator is called on scores,
        and the resulting LRs are then bounded. If `also_fit_calibrator` is True, the first-step calibrator is
        fitted on the same data used to derive the bounds.

        :param first_step_calibrator: the calibrator to use; must already be fitted if also_fit_calibrator is False
        :param also_fit_calibrator: whether to also fit the first-step calibrator when calling fit
"""

self.first_step_calibrator = first_step_calibrator
self.also_fit_calibrator = also_fit_calibrator
self._lower_lr_bound = None
self._upper_lr_bound = None
if not also_fit_calibrator:
# check the model was fitted.
try:
first_step_calibrator.transform(np.array([0.5]))
except NotFittedError:
                print('calibrator should already be fitted when setting also_fit_calibrator = False!')

@abstractmethod
def fit(self, X, y):
pass

def transform(self, X):
"""
        A transform entails calling the first-step calibrator and clipping the resulting LRs to the fitted bounds.
"""
unadjusted_lrs = np.array(self.first_step_calibrator.transform(X))
lower_adjusted_lrs = np.where(self._lower_lr_bound < unadjusted_lrs, unadjusted_lrs, self._lower_lr_bound)
adjusted_lrs = np.where(self._upper_lr_bound > lower_adjusted_lrs, lower_adjusted_lrs, self._upper_lr_bound)
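        # the two np.where calls above amount to a clip:
        # adjusted_lrs = np.clip(unadjusted_lrs, self._lower_lr_bound, self._upper_lr_bound)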
return adjusted_lrs

@property
def p0(self):
return self.first_step_calibrator.p0

@property
def p1(self):
return self.first_step_calibrator.p1

class ELUBbounder(LRbounder):
"""
Class that, given an LR system, outputs the same LRs as the system but bounded by the Empirical Upper and Lower
Bounds as described in
@@ -705,51 +753,35 @@ class ELUBbounder(BaseEstimator, TransformerMixin):
# empirical_bounds=[min(a) max(a)]
"""


    def fit(self, X, y):
        """
        assuming that y=1 corresponds to Hp, y=0 to Hd
        """
        if self.also_fit_calibrator:
            self.first_step_calibrator.fit(X, y)
        lrs = self.first_step_calibrator.transform(X)

        y = np.asarray(y).squeeze()
        self._lower_lr_bound, self._upper_lr_bound = elub(lrs, y, add_misleading=1)
        return self

class IVbounder(LRbounder):
    """
    Class that, given an LR system, outputs the same LRs as the system but bounded by the Invariance Verification
    bounds as described in:
    A transparent method to determine limit values for Likelihood Ratio systems, by
    Ivo Alberink, Jeannette Leegwater, Jonas Malmborg, Anders Nordgaard, Marjan Sjerps, Leen van der Ham
    In: Submitted for publication in 2025.
    """

    def fit(self, X, y):
        """
        assuming that y=1 corresponds to Hp, y=0 to Hd
        """
        if self.also_fit_calibrator:
            self.first_step_calibrator.fit(X, y)
        lrs = self.first_step_calibrator.transform(X)

        y = np.asarray(y).squeeze()
        self._lower_lr_bound, self._upper_lr_bound = calculate_invariance_bounds(lrs, y)[:2]
        return self
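
A minimal end-to-end sketch of IVbounder inside a CalibratedScorer (mirrors test_bounded_calibrated_scorer below; the toy data and seed are illustrative):

import numpy as np
from sklearn.linear_model import LogisticRegression
from lir.calibration import LogitCalibrator, IVbounder
from lir.lr import CalibratedScorer
from lir.util import Xn_to_Xy

rng = np.random.default_rng(0)
X, y = Xn_to_Xy(rng.normal(-1, 1, (1000, 1)), rng.normal(+1, 1, (1000, 1)))

iv_bounder = IVbounder(LogitCalibrator())
scorer = CalibratedScorer(LogisticRegression(), iv_bounder)
scorer.fit(X, y)
print(iv_bounder._lower_lr_bound, iv_bounder._upper_lr_bound)  # the fitted IV bounds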
77 changes: 77 additions & 0 deletions tests/test_bounding.py
@@ -0,0 +1,77 @@
import numpy as np
import unittest
from sklearn.linear_model import LogisticRegression

import lir.bounding as bounding
from lir.data import AlcoholBreathAnalyser
from lir.calibration import LogitCalibrator, IVbounder
from lir.lr import CalibratedScorer
from lir.util import Xn_to_Xy


class TestBounding(unittest.TestCase):

def test_breath(self):
lrs, y = AlcoholBreathAnalyser(ill_calibrated=True).sample_lrs()
bounds = bounding.calculate_invariance_bounds(lrs, y)
np.testing.assert_almost_equal((0.1052741, 85.3731634), bounds[:2])

def test_extreme_smallset(self):
lrs = np.array([np.inf, 0])
y = np.array([1, 0])
bounds = bounding.calculate_invariance_bounds(lrs, y)
np.testing.assert_almost_equal((0.3335112, 2.9999248), bounds[:2])

def test_extreme(self):
lrs = np.array([np.inf, np.inf, np.inf, 0, 0, 0])
y = np.array([1, 1, 1, 0, 0, 0])
bounds = bounding.calculate_invariance_bounds(lrs, y)
np.testing.assert_almost_equal((0.1429257, 6.9840986), bounds[:2])

def test_system01(self):
lrs = np.array([.01, .1, 1, 10, 100])
bounds_bad = bounding.calculate_invariance_bounds(lrs, np.array([1, 1, 1, 0, 0]))
bounds_good1 = bounding.calculate_invariance_bounds(lrs, np.array([0, 0, 1, 1, 1]))
bounds_good2 = bounding.calculate_invariance_bounds(lrs, np.array([0, 0, 0, 1, 1]))

np.testing.assert_almost_equal((1, 1), bounds_bad[:2])
np.testing.assert_almost_equal((0.1503142, 3.74973), bounds_good1[:2])
np.testing.assert_almost_equal((0.2666859, 6.6527316), bounds_good2[:2])

def test_neutral_smallset(self):
lrs = np.array([1, 1])
y = np.array([1, 0])
bounds = bounding.calculate_invariance_bounds(lrs, y)
np.testing.assert_almost_equal((1, 1), bounds[:2])

def test_bias(self):
lrs = np.ones(10) * 10
y = np.concatenate([np.ones(9), np.zeros(1)])
bounds = bounding.calculate_invariance_bounds(lrs, y)
np.testing.assert_almost_equal((1, 1.2647363), bounds[:2])

lrs = np.concatenate([np.ones(10) * 10, np.ones(1)])
y = np.concatenate([np.ones(10), np.zeros(1)])
bounds = bounding.calculate_invariance_bounds(lrs, y)
np.testing.assert_almost_equal((1, 3.8106582), bounds[:2])

lrs = np.concatenate([np.ones(10) * 1000, np.ones(1) * 1.1])
y = np.concatenate([np.ones(10), np.zeros(1)])
bounds = bounding.calculate_invariance_bounds(lrs, y)
np.testing.assert_almost_equal((1, 3.8106582), bounds[:2])

def test_bounded_calibrated_scorer(self):

rng = np.random.default_rng(0)

X0 = rng.normal(loc=-1, scale=1, size=(1000, 1))
X1 = rng.normal(loc=+1, scale=1, size=(1000, 1))
X, y = Xn_to_Xy(X0, X1)

bounded_calibrated_scorer = CalibratedScorer(LogisticRegression(), IVbounder(LogitCalibrator()))
bounded_calibrated_scorer.fit(X, y)
bounds = (bounded_calibrated_scorer.calibrator._lower_lr_bound, bounded_calibrated_scorer.calibrator._upper_lr_bound)
np.testing.assert_almost_equal((0.0241763, 152.8940706), bounds)

if __name__ == '__main__':
unittest.main()
16 changes: 16 additions & 0 deletions tests/test_elub.py
@@ -1,8 +1,12 @@
import numpy as np
import unittest
from sklearn.linear_model import LogisticRegression

import lir.bayeserror
from lir.data import AlcoholBreathAnalyser
from lir.calibration import LogitCalibrator, ELUBbounder
from lir.lr import CalibratedScorer
from lir.util import Xn_to_Xy


class TestElub(unittest.TestCase):
@@ -54,6 +58,18 @@ def test_bias(self):
y = np.concatenate([np.ones(10), np.zeros(1)])
np.testing.assert_almost_equal((1, 1), lir.bayeserror.elub(lrs, y, add_misleading=1))

def test_bounded_calibrated_scorer(self):

rng = np.random.default_rng(0)

X0 = rng.normal(loc=-1, scale=1, size=(1000, 1))
X1 = rng.normal(loc=+1, scale=1, size=(1000, 1))
X, y = Xn_to_Xy(X0, X1)

bounded_calibrated_scorer = CalibratedScorer(LogisticRegression(), ELUBbounder(LogitCalibrator()))
bounded_calibrated_scorer.fit(X, y)
bounds = (bounded_calibrated_scorer.calibrator._lower_lr_bound, bounded_calibrated_scorer.calibrator._upper_lr_bound)
np.testing.assert_almost_equal((0.0293081, 104.5963329), bounds)

if __name__ == '__main__':
unittest.main()