From 0510a3a376ba3da99b0dfa4364347c6f9524ea75 Mon Sep 17 00:00:00 2001 From: iasonkrom Date: Thu, 8 Jun 2023 12:47:36 +0300 Subject: [PATCH 01/19] add only very basic structure --- src/hist/basehist.py | 20 ++++++++++++++++++++ src/hist/stats.py | 27 +++++++++++++++++++++++++++ tests/test_stats.py | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+) create mode 100644 src/hist/stats.py create mode 100644 tests/test_stats.py diff --git a/src/hist/basehist.py b/src/hist/basehist.py index bfaaa8f9..c53f59b8 100644 --- a/src/hist/basehist.py +++ b/src/hist/basehist.py @@ -547,3 +547,23 @@ def sum(self, flow: bool = False) -> float | bh.accumulators.Accumulator: raise AssertionError(f"Unsupported storage type {storage}") return super().sum(flow=flow) + + def chisquare_1samp(self) -> str | NotImplementedError: + from hist import stats + + return stats.chisquare_1samp(self) + + def chisquare_2samp(self) -> str | NotImplementedError: + from hist import stats + + return stats.chisquare_2samp(self) + + def ks_1samp(self) -> str | NotImplementedError: + from hist import stats + + return stats.ks_1samp(self) + + def ks_2samp(self) -> str | NotImplementedError: + from hist import stats + + return stats.ks_2samp(self) diff --git a/src/hist/stats.py b/src/hist/stats.py new file mode 100644 index 00000000..0535f110 --- /dev/null +++ b/src/hist/stats.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +import hist + + +def chisquare_1samp(self: hist.BaseHist) -> str | NotImplementedError: + if self.ndim != 1: + raise NotImplementedError("chisquare_1samp is only supported for 1D histograms") + return "Performing chi square one sample test" + + +def chisquare_2samp(self: hist.BaseHist) -> str | NotImplementedError: + if self.ndim != 1: + raise NotImplementedError("chisquare_2samp is only supported for 1D histograms") + return "Performing chi square two sample test" + + +def ks_1samp(self: hist.BaseHist) -> str | NotImplementedError: + if self.ndim != 1: + raise NotImplementedError("ks_1samp is only supported for 1D histograms") + return "Performing ks one sample test" + + +def ks_2samp(self: hist.BaseHist) -> str | NotImplementedError: + if self.ndim != 1: + raise NotImplementedError("ks_2samp is only supported for 1D histograms") + return "Performing ks two sample test" diff --git a/tests/test_stats.py b/tests/test_stats.py new file mode 100644 index 00000000..b3792026 --- /dev/null +++ b/tests/test_stats.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +import numpy as np +import pytest + +import hist +from hist import Hist + + +def test_stattest(): + h = Hist(hist.axis.Regular(20, -5, 5)) + h.fill(np.random.normal(size=1000)) + + assert h.chisquare_1samp() == "Performing chi square one sample test" + assert h.chisquare_2samp() == "Performing chi square two sample test" + assert h.ks_1samp() == "Performing ks one sample test" + assert h.ks_2samp() == "Performing ks two sample test" + + h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5)) + h.fill(np.random.normal(size=1000), np.random.normal(size=1000)) + + with pytest.raises( + NotImplementedError, match="chisquare_1samp is only supported for 1D histograms" + ): + h.chisquare_1samp() + with pytest.raises( + NotImplementedError, match="chisquare_2samp is only supported for 1D histograms" + ): + h.chisquare_2samp() + with pytest.raises( + NotImplementedError, match="ks_1samp is only supported for 1D histograms" + ): + h.ks_1samp() + with pytest.raises( + NotImplementedError, match="ks_2samp is only supported for 1D histograms" + ): + h.ks_2samp() From 60443c46448ec2ba17cf29ddc658d0406a770b01 Mon Sep 17 00:00:00 2001 From: iasonkrom Date: Fri, 9 Jun 2023 15:43:37 +0300 Subject: [PATCH 02/19] first attemptt at chisquare tests --- src/hist/basehist.py | 26 +++++--- src/hist/stats.py | 141 +++++++++++++++++++++++++++++++++++++++---- tests/test_stats.py | 105 ++++++++++++++++++++++++++------ 3 files changed, 233 insertions(+), 39 deletions(-) diff --git a/src/hist/basehist.py b/src/hist/basehist.py index c53f59b8..0f104364 100644 --- a/src/hist/basehist.py +++ b/src/hist/basehist.py @@ -548,22 +548,32 @@ def sum(self, flow: bool = False) -> float | bh.accumulators.Accumulator: return super().sum(flow=flow) - def chisquare_1samp(self) -> str | NotImplementedError: + def chisquare_1samp( + self: hist.BaseHist, + distribution: str | Callable[..., Any], + args: Any = (), + kwds: Any = None, + ) -> Any: from hist import stats - return stats.chisquare_1samp(self) + return stats.chisquare_1samp(self, distribution, args, kwds) - def chisquare_2samp(self) -> str | NotImplementedError: + def chisquare_2samp(self, other: hist.BaseHist) -> Any: from hist import stats - return stats.chisquare_2samp(self) + return stats.chisquare_2samp(self, other) - def ks_1samp(self) -> str | NotImplementedError: + def ks_1samp( + self: hist.BaseHist, + distribution: str | Callable[..., Any], + args: Any = (), + kwds: Any = None, + ) -> Any: from hist import stats - return stats.ks_1samp(self) + return stats.ks_1samp(self, distribution, args, kwds) - def ks_2samp(self) -> str | NotImplementedError: + def ks_2samp(self, other: hist.BaseHist) -> Any: from hist import stats - return stats.ks_2samp(self) + return stats.ks_2samp(self, other) diff --git a/src/hist/stats.py b/src/hist/stats.py index 0535f110..846cd75f 100644 --- a/src/hist/stats.py +++ b/src/hist/stats.py @@ -1,27 +1,146 @@ from __future__ import annotations +from typing import Any, Callable + +import numpy as np +from scipy import stats as spstats + import hist -def chisquare_1samp(self: hist.BaseHist) -> str | NotImplementedError: +def chisquare_1samp( + self: hist.BaseHist, + distribution: str | Callable[..., Any], + args: Any = (), + kwds: Any = None, +) -> Any: + kwds = {} if kwds is None else kwds + if self.ndim != 1: - raise NotImplementedError("chisquare_1samp is only supported for 1D histograms") - return "Performing chi square one sample test" + raise NotImplementedError( + f"Only 1D-histogram supports chisquare_1samp, try projecting {self.__class__.__name__} to 1D" + ) + if isinstance(distribution, str): + if not hasattr(spstats, distribution): + raise ValueError( + f"Unknown distribution {distribution}, try one of the defined scipy distributions" + ) + cdf = getattr(spstats, distribution).cdf + elif callable(distribution): + cdf = distribution + else: + raise TypeError( + f"Unknown distribution type {distribution}, try one of the defined scipy distributions or a callable CDF" + ) + + variances = self.variances() + if variances is None: + raise RuntimeError( + "Cannot compute from a variance-less histogram, try a Weight storage" + ) + observed = self.values() + totalentries = self.sum() + expected = np.diff(cdf(self.axes[0].edges, *args, **kwds)) * totalentries + where = variances != 0 + squares = (expected[where] - observed[where]) ** 2 / variances[where] + ndof = len(observed) - 1 + chisq = np.sum(squares) + pvalue = spstats.chi2.sf(chisq, ndof) -def chisquare_2samp(self: hist.BaseHist) -> str | NotImplementedError: + return chisq, ndof, pvalue + + +def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any: if self.ndim != 1: - raise NotImplementedError("chisquare_2samp is only supported for 1D histograms") - return "Performing chi square two sample test" + raise NotImplementedError( + f"Only 1D-histogram supports chisquare_2samp, try projecting {self.__class__.__name__} to 1D" + ) + if isinstance(other, hist.hist.Hist) and other.ndim != 1: + raise NotImplementedError( + f"Only 1D-histogram supports chisquare_2samp, try projecting other={other.__class__.__name__} to 1D" + ) + if not isinstance(other, hist.hist.Hist): + raise TypeError( + f"Unknown type {other.__class__.__name__}, other must be a hist.Hist object" + ) + + variances1 = self.variances() + variances2 = other.variances() + if variances1 is None or variances2 is None: + raise RuntimeError( + "Cannot compute from variance-less histograms, try a Weight storage" + ) + counts1 = self.values() + counts2 = other.values() + squares = ( + counts1 * np.sqrt(counts2.sum() / counts1.sum()) + - counts2 * np.sqrt(counts1.sum() / counts2.sum()) + ) ** 2 + variances = variances1 + variances2 + where = variances != 0 + chisq = np.sum(squares[where] / variances[where]) + k = where.sum() + ndof = k - 1 if self.sum() == other.sum() else k + pvalue = spstats.chi2.sf(chisq, ndof) + + return chisq, ndof, pvalue + + +def ks_1samp( + self: hist.BaseHist, + distribution: str | Callable[..., Any], + args: Any = (), + kwds: Any = None, +) -> Any: + kwds = {} if kwds is None else kwds -def ks_1samp(self: hist.BaseHist) -> str | NotImplementedError: if self.ndim != 1: - raise NotImplementedError("ks_1samp is only supported for 1D histograms") - return "Performing ks one sample test" + raise NotImplementedError( + f"Only 1D-histogram supports ks_1samp, try projecting {self.__class__.__name__} to 1D" + ) + if isinstance(distribution, str): + if not hasattr(spstats, distribution): + raise ValueError( + f"Unknown distribution {distribution}, try one of the defined scipy distributions" + ) + cdf = getattr(spstats, distribution).cdf + elif callable(distribution): + cdf = distribution + else: + raise TypeError( + f"Unknown distribution type {distribution}, try one of the defined scipy distributions or a callable CDF" + ) + + variances = self.variances() + if variances is None: + raise RuntimeError( + "Cannot compute from a variance-less histogram, try a Weight storage" + ) + return cdf(self.axes[0].edges, *args, **kwds) # placeholder to pass pre-commit -def ks_2samp(self: hist.BaseHist) -> str | NotImplementedError: + +def ks_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any: if self.ndim != 1: - raise NotImplementedError("ks_2samp is only supported for 1D histograms") + raise NotImplementedError( + f"Only 1D-histogram supports ks_2samp, try projecting {self.__class__.__name__} to 1D" + ) + if isinstance(other, hist.hist.Hist) and other.ndim != 1: + raise NotImplementedError( + f"Only 1D-histogram supports ks_2samp, try projecting other={other.__class__.__name__} to 1D" + ) + if not isinstance(other, hist.hist.Hist): + raise TypeError( + f"Unknown type {other.__class__.__name__}, other must be a hist.Hist object" + ) + + variances1 = self.variances() + variances2 = other.variances() + if variances1 is None or variances2 is None: + raise RuntimeError( + "Cannot compute from variance-less histograms, try a Weight storage" + ) + return "Performing ks two sample test" diff --git a/tests/test_stats.py b/tests/test_stats.py index b3792026..7561d643 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -2,36 +2,101 @@ import numpy as np import pytest +from scipy import stats as spstats import hist from hist import Hist def test_stattest(): + from numba_stats import norm + + np.random.seed(42) + h = Hist(hist.axis.Regular(20, -5, 5)) h.fill(np.random.normal(size=1000)) - assert h.chisquare_1samp() == "Performing chi square one sample test" - assert h.chisquare_2samp() == "Performing chi square two sample test" - assert h.ks_1samp() == "Performing ks one sample test" - assert h.ks_2samp() == "Performing ks two sample test" + chisq, ndof, pvalue = h.chisquare_1samp("norm") + assert chisq == pytest.approx(9.47425856230132) + assert ndof == 19 + assert pvalue == pytest.approx(0.9647461095072625) + + chisq, ndof, pvalue = h.chisquare_1samp(norm.cdf, args=(0, 1)) + assert chisq == pytest.approx(9.47425856230132) + assert ndof == 19 + assert pvalue == pytest.approx(0.9647461095072625) + + np.random.seed(42) + + h1 = Hist(hist.axis.Regular(20, -5, 5, name="norm")) + h2 = Hist(hist.axis.Regular(20, -5, 5, name="norm")) + h1.fill(np.random.normal(size=1000)) + h2.fill(np.random.normal(size=500)) + + chisq, ndof, pvalue = h1.chisquare_2samp(h2) + assert chisq == pytest.approx(12.901853991544478) + assert ndof == 15 + assert pvalue == pytest.approx(0.609878574488961) + + np.random.seed(42) + + h1 = Hist(hist.axis.Regular(20, -5, 5, name="norm")) + h2 = Hist(hist.axis.Regular(20, -5, 5, name="norm")) + h1.fill(np.random.normal(size=1000)) + h2.fill(np.random.normal(size=1000)) + + chisq, ndof, pvalue = h1.chisquare_2samp(h2) + assert chisq == pytest.approx(13.577308400334086) + assert ndof == 14 + assert pvalue == pytest.approx(0.4816525905214355) + + assert np.all( + h.ks_1samp("norm") == spstats.norm.cdf(h.axes[0].edges) + ) # placeholder to pass test for now + + np.random.seed(42) + + h1 = Hist(hist.axis.Regular(20, -5, 5, name="norm")) + h2 = Hist(hist.axis.Regular(20, -5, 5, name="norm")) + h1.fill(np.random.normal(size=1000)) + h2.fill(np.random.normal(size=1000)) + + assert h1.ks_2samp(h2) == "Performing ks two sample test" h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5)) h.fill(np.random.normal(size=1000), np.random.normal(size=1000)) + with pytest.raises(NotImplementedError): + h.chisquare_1samp("norm") + with pytest.raises(NotImplementedError): + h.chisquare_2samp(h2) + with pytest.raises(NotImplementedError): + h.ks_1samp("norm") + with pytest.raises(NotImplementedError): + h.ks_2samp(h2) + + h = Hist(hist.axis.Regular(20, -5, 5)) + h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000)) + with pytest.raises(RuntimeError): + h.chisquare_1samp("norm") + with pytest.raises(RuntimeError): + h.chisquare_2samp(h2) + with pytest.raises(RuntimeError): + h.ks_1samp("norm") + with pytest.raises(RuntimeError): + h.ks_2samp(h2) + + h = Hist(hist.axis.Regular(20, -5, 5)) + h.fill(np.random.normal(size=1000)) + with pytest.raises(ValueError): + h.chisquare_1samp("not_a_distribution") + with pytest.raises(ValueError): + h.ks_1samp("not_a_distribution") - with pytest.raises( - NotImplementedError, match="chisquare_1samp is only supported for 1D histograms" - ): - h.chisquare_1samp() - with pytest.raises( - NotImplementedError, match="chisquare_2samp is only supported for 1D histograms" - ): - h.chisquare_2samp() - with pytest.raises( - NotImplementedError, match="ks_1samp is only supported for 1D histograms" - ): - h.ks_1samp() - with pytest.raises( - NotImplementedError, match="ks_2samp is only supported for 1D histograms" - ): - h.ks_2samp() + with pytest.raises(TypeError): + h.chisquare_1samp(1) + with pytest.raises(TypeError): + h.chisquare_2samp(1) + with pytest.raises(TypeError): + h.ks_1samp(1) + with pytest.raises(TypeError): + h.ks_2samp(1) From 70e5a718b526c337eed9f668f8d9c5896f285b25 Mon Sep 17 00:00:00 2001 From: iasonkrom Date: Fri, 9 Jun 2023 17:01:11 +0300 Subject: [PATCH 03/19] split tests into 4 functions --- src/hist/stats.py | 4 +-- tests/test_stats.py | 87 +++++++++++++++++++++++++++++++-------------- 2 files changed, 63 insertions(+), 28 deletions(-) diff --git a/src/hist/stats.py b/src/hist/stats.py index 846cd75f..3cb8f5d2 100644 --- a/src/hist/stats.py +++ b/src/hist/stats.py @@ -43,9 +43,9 @@ def chisquare_1samp( totalentries = self.sum() expected = np.diff(cdf(self.axes[0].edges, *args, **kwds)) * totalentries where = variances != 0 - squares = (expected[where] - observed[where]) ** 2 / variances[where] + squares = (expected - observed) ** 2 ndof = len(observed) - 1 - chisq = np.sum(squares) + chisq = np.sum(squares[where] / variances[where]) pvalue = spstats.chi2.sf(chisq, ndof) return chisq, ndof, pvalue diff --git a/tests/test_stats.py b/tests/test_stats.py index 7561d643..7463efc0 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -8,7 +8,7 @@ from hist import Hist -def test_stattest(): +def test_chisquare_1samp(): from numba_stats import norm np.random.seed(42) @@ -26,6 +26,26 @@ def test_stattest(): assert ndof == 19 assert pvalue == pytest.approx(0.9647461095072625) + h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5)) + h.fill(np.random.normal(size=1000), np.random.normal(size=1000)) + with pytest.raises(NotImplementedError): + h.chisquare_1samp("norm") + + h = Hist(hist.axis.Regular(20, -5, 5)) + h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000)) + with pytest.raises(RuntimeError): + h.chisquare_1samp("norm") + + h = Hist(hist.axis.Regular(20, -5, 5)) + h.fill(np.random.normal(size=1000)) + with pytest.raises(ValueError): + h.chisquare_1samp("not_a_distribution") + + with pytest.raises(TypeError): + h.chisquare_1samp(1) + + +def test_chisquare_2samp(): np.random.seed(42) h1 = Hist(hist.axis.Regular(20, -5, 5, name="norm")) @@ -50,10 +70,50 @@ def test_stattest(): assert ndof == 14 assert pvalue == pytest.approx(0.4816525905214355) + h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5)) + h.fill(np.random.normal(size=1000), np.random.normal(size=1000)) + with pytest.raises(NotImplementedError): + h.chisquare_2samp(h2) + + h = Hist(hist.axis.Regular(20, -5, 5)) + h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000)) + with pytest.raises(RuntimeError): + h.chisquare_2samp(h2) + + with pytest.raises(TypeError): + h.chisquare_2samp(1) + + +def test_ks_1samp(): + np.random.seed(42) + + h = Hist(hist.axis.Regular(20, -5, 5)) + h.fill(np.random.normal(size=1000)) + assert np.all( h.ks_1samp("norm") == spstats.norm.cdf(h.axes[0].edges) ) # placeholder to pass test for now + h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5)) + h.fill(np.random.normal(size=1000), np.random.normal(size=1000)) + with pytest.raises(NotImplementedError): + h.ks_1samp("norm") + + h = Hist(hist.axis.Regular(20, -5, 5)) + h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000)) + with pytest.raises(RuntimeError): + h.ks_1samp("norm") + + h = Hist(hist.axis.Regular(20, -5, 5)) + h.fill(np.random.normal(size=1000)) + with pytest.raises(ValueError): + h.ks_1samp("not_a_distribution") + + with pytest.raises(TypeError): + h.ks_1samp(1) + + +def test_ks_2samp(): np.random.seed(42) h1 = Hist(hist.axis.Regular(20, -5, 5, name="norm")) @@ -65,38 +125,13 @@ def test_stattest(): h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5)) h.fill(np.random.normal(size=1000), np.random.normal(size=1000)) - with pytest.raises(NotImplementedError): - h.chisquare_1samp("norm") - with pytest.raises(NotImplementedError): - h.chisquare_2samp(h2) - with pytest.raises(NotImplementedError): - h.ks_1samp("norm") with pytest.raises(NotImplementedError): h.ks_2samp(h2) h = Hist(hist.axis.Regular(20, -5, 5)) h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000)) - with pytest.raises(RuntimeError): - h.chisquare_1samp("norm") - with pytest.raises(RuntimeError): - h.chisquare_2samp(h2) - with pytest.raises(RuntimeError): - h.ks_1samp("norm") with pytest.raises(RuntimeError): h.ks_2samp(h2) - h = Hist(hist.axis.Regular(20, -5, 5)) - h.fill(np.random.normal(size=1000)) - with pytest.raises(ValueError): - h.chisquare_1samp("not_a_distribution") - with pytest.raises(ValueError): - h.ks_1samp("not_a_distribution") - - with pytest.raises(TypeError): - h.chisquare_1samp(1) - with pytest.raises(TypeError): - h.chisquare_2samp(1) - with pytest.raises(TypeError): - h.ks_1samp(1) with pytest.raises(TypeError): h.ks_2samp(1) From 4f8ea6987a4d97f9b085481dd58f879cd4e8606f Mon Sep 17 00:00:00 2001 From: iasonkrom Date: Sun, 11 Jun 2023 01:34:39 +0300 Subject: [PATCH 04/19] small updates in chisquare tests --- src/hist/stats.py | 58 ++++++++++++++++++++++++++------------------- tests/test_stats.py | 4 ++-- 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/src/hist/stats.py b/src/hist/stats.py index 3cb8f5d2..2b150201 100644 --- a/src/hist/stats.py +++ b/src/hist/stats.py @@ -4,10 +4,27 @@ import numpy as np from scipy import stats as spstats +from scipy.stats import rv_continuous, rv_discrete import hist +def _get_cdf_if_valid(obj: Any) -> Any: + if isinstance(obj, (rv_continuous, rv_discrete)): + return obj.cdf + if isinstance(obj, str) and hasattr(spstats, obj): + dist = getattr(spstats, obj) + if isinstance(dist, (rv_continuous, rv_discrete)): + return dist.cdf + if hasattr(obj, "cdf") and callable(obj.cdf): + return obj.cdf + if callable(obj): + return obj + raise TypeError( + f"Unknown distribution type {obj}, try one of the scipy distributions, an object with a cdf method, or a callable cdf implementation" + ) + + def chisquare_1samp( self: hist.BaseHist, distribution: str | Callable[..., Any], @@ -20,18 +37,7 @@ def chisquare_1samp( raise NotImplementedError( f"Only 1D-histogram supports chisquare_1samp, try projecting {self.__class__.__name__} to 1D" ) - if isinstance(distribution, str): - if not hasattr(spstats, distribution): - raise ValueError( - f"Unknown distribution {distribution}, try one of the defined scipy distributions" - ) - cdf = getattr(spstats, distribution).cdf - elif callable(distribution): - cdf = distribution - else: - raise TypeError( - f"Unknown distribution type {distribution}, try one of the defined scipy distributions or a callable CDF" - ) + cdf = _get_cdf_if_valid(distribution) variances = self.variances() if variances is None: @@ -42,9 +48,12 @@ def chisquare_1samp( observed = self.values() totalentries = self.sum() expected = np.diff(cdf(self.axes[0].edges, *args, **kwds)) * totalentries + variances = ( + expected # TODO: check if variances or expected should go in the denominator + ) where = variances != 0 squares = (expected - observed) ** 2 - ndof = len(observed) - 1 + ndof = where.sum() - 1 chisq = np.sum(squares[where] / variances[where]) pvalue = spstats.chi2.sf(chisq, ndof) @@ -64,6 +73,11 @@ def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any: raise TypeError( f"Unknown type {other.__class__.__name__}, other must be a hist.Hist object" ) + # TODO: add support for compatible rebinning + if not np.allclose(self.axes[0].edges, other.axes[0].edges): + raise NotImplementedError( + "Cannot compute chi2 from histograms with different binning, try rebinning" + ) variances1 = self.variances() variances2 = other.variances() @@ -100,18 +114,7 @@ def ks_1samp( raise NotImplementedError( f"Only 1D-histogram supports ks_1samp, try projecting {self.__class__.__name__} to 1D" ) - if isinstance(distribution, str): - if not hasattr(spstats, distribution): - raise ValueError( - f"Unknown distribution {distribution}, try one of the defined scipy distributions" - ) - cdf = getattr(spstats, distribution).cdf - elif callable(distribution): - cdf = distribution - else: - raise TypeError( - f"Unknown distribution type {distribution}, try one of the defined scipy distributions or a callable CDF" - ) + cdf = _get_cdf_if_valid(distribution) variances = self.variances() if variances is None: @@ -135,6 +138,11 @@ def ks_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any: raise TypeError( f"Unknown type {other.__class__.__name__}, other must be a hist.Hist object" ) + # TODO: add support for compatible rebinning + if not np.allclose(self.axes[0].edges, other.axes[0].edges): + raise NotImplementedError( + "Cannot compute chi2 from histograms with different binning, try rebinning" + ) variances1 = self.variances() variances2 = other.variances() diff --git a/tests/test_stats.py b/tests/test_stats.py index 7463efc0..fa438800 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -38,7 +38,7 @@ def test_chisquare_1samp(): h = Hist(hist.axis.Regular(20, -5, 5)) h.fill(np.random.normal(size=1000)) - with pytest.raises(ValueError): + with pytest.raises(TypeError): h.chisquare_1samp("not_a_distribution") with pytest.raises(TypeError): @@ -106,7 +106,7 @@ def test_ks_1samp(): h = Hist(hist.axis.Regular(20, -5, 5)) h.fill(np.random.normal(size=1000)) - with pytest.raises(ValueError): + with pytest.raises(TypeError): h.ks_1samp("not_a_distribution") with pytest.raises(TypeError): From 07eee774990568cd52ad4f128ca4a1a75dac4c9f Mon Sep 17 00:00:00 2001 From: iasonkrom Date: Sun, 11 Jun 2023 16:05:18 +0300 Subject: [PATCH 05/19] some fixes for now --- src/hist/stats.py | 10 +++++++++- tests/test_stats.py | 16 ++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/hist/stats.py b/src/hist/stats.py index 2b150201..8c4ecc35 100644 --- a/src/hist/stats.py +++ b/src/hist/stats.py @@ -74,6 +74,10 @@ def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any: f"Unknown type {other.__class__.__name__}, other must be a hist.Hist object" ) # TODO: add support for compatible rebinning + if self.size != other.size: + raise NotImplementedError( + "Cannot compute chi2 from histograms with different binning, try rebinning" + ) if not np.allclose(self.axes[0].edges, other.axes[0].edges): raise NotImplementedError( "Cannot compute chi2 from histograms with different binning, try rebinning" @@ -96,7 +100,7 @@ def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any: where = variances != 0 chisq = np.sum(squares[where] / variances[where]) k = where.sum() - ndof = k - 1 if self.sum() == other.sum() else k + ndof = k if self.sum() == other.sum() else k - 1 pvalue = spstats.chi2.sf(chisq, ndof) return chisq, ndof, pvalue @@ -139,6 +143,10 @@ def ks_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any: f"Unknown type {other.__class__.__name__}, other must be a hist.Hist object" ) # TODO: add support for compatible rebinning + if self.size != other.size: + raise NotImplementedError( + "Cannot compute chi2 from histograms with different binning, try rebinning" + ) if not np.allclose(self.axes[0].edges, other.axes[0].edges): raise NotImplementedError( "Cannot compute chi2 from histograms with different binning, try rebinning" diff --git a/tests/test_stats.py b/tests/test_stats.py index fa438800..16f35b83 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -17,14 +17,14 @@ def test_chisquare_1samp(): h.fill(np.random.normal(size=1000)) chisq, ndof, pvalue = h.chisquare_1samp("norm") - assert chisq == pytest.approx(9.47425856230132) + assert chisq == pytest.approx(11.640108080468355) assert ndof == 19 - assert pvalue == pytest.approx(0.9647461095072625) + assert pvalue == pytest.approx(0.9004272294139111) chisq, ndof, pvalue = h.chisquare_1samp(norm.cdf, args=(0, 1)) - assert chisq == pytest.approx(9.47425856230132) + assert chisq == pytest.approx(11.64010808046843) assert ndof == 19 - assert pvalue == pytest.approx(0.9647461095072625) + assert pvalue == pytest.approx(0.9004272294139111) h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5)) h.fill(np.random.normal(size=1000), np.random.normal(size=1000)) @@ -55,8 +55,8 @@ def test_chisquare_2samp(): chisq, ndof, pvalue = h1.chisquare_2samp(h2) assert chisq == pytest.approx(12.901853991544478) - assert ndof == 15 - assert pvalue == pytest.approx(0.609878574488961) + assert ndof == 14 + assert pvalue == pytest.approx(0.5342659447391381) np.random.seed(42) @@ -67,8 +67,8 @@ def test_chisquare_2samp(): chisq, ndof, pvalue = h1.chisquare_2samp(h2) assert chisq == pytest.approx(13.577308400334086) - assert ndof == 14 - assert pvalue == pytest.approx(0.4816525905214355) + assert ndof == 15 + assert pvalue == pytest.approx(0.5577978656881688) h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5)) h.fill(np.random.normal(size=1000), np.random.normal(size=1000)) From 7efd00ecaaed032a7b10642cafe05cf33b113fca Mon Sep 17 00:00:00 2001 From: iasonkrom Date: Tue, 13 Jun 2023 03:13:09 +0300 Subject: [PATCH 06/19] first attempt at ks_1samp --- src/hist/stats.py | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/src/hist/stats.py b/src/hist/stats.py index 8c4ecc35..3f6073e6 100644 --- a/src/hist/stats.py +++ b/src/hist/stats.py @@ -46,7 +46,7 @@ def chisquare_1samp( ) observed = self.values() - totalentries = self.sum() + totalentries = self.sum(flow=True) expected = np.diff(cdf(self.axes[0].edges, *args, **kwds)) * totalentries variances = ( expected # TODO: check if variances or expected should go in the denominator @@ -92,15 +92,18 @@ def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any: counts1 = self.values() counts2 = other.values() + totalentries1 = self.values(flow=True).sum() + totalentries2 = other.values(flow=True).sum() squares = ( - counts1 * np.sqrt(counts2.sum() / counts1.sum()) - - counts2 * np.sqrt(counts1.sum() / counts2.sum()) + counts1 * np.sqrt(totalentries2 / totalentries1) + - counts2 * np.sqrt(totalentries1 / totalentries2) ) ** 2 variances = variances1 + variances2 where = variances != 0 chisq = np.sum(squares[where] / variances[where]) k = where.sum() - ndof = k if self.sum() == other.sum() else k - 1 + # TODO: check if ndof = k if totalentries1 == totalentries2 else k - 1 is correct + ndof = k - 1 pvalue = spstats.chi2.sf(chisq, ndof) return chisq, ndof, pvalue @@ -120,13 +123,32 @@ def ks_1samp( ) cdf = _get_cdf_if_valid(distribution) - variances = self.variances() - if variances is None: - raise RuntimeError( - "Cannot compute from a variance-less histogram, try a Weight storage" - ) - - return cdf(self.axes[0].edges, *args, **kwds) # placeholder to pass pre-commit + cdflocs = self.axes[0].edges[:-1] + cdfvals = cdf(cdflocs, *args, **kwds) + observed = self.values(flow=True) + totalentries = self.sum(flow=True) + ecdf = np.cumsum(observed) / totalentries + ecdfplus = ecdf[1:-1] + ecdfminus = ecdf[0:-2] + dplus_index = (ecdfplus - cdfvals).argmax() + dminus_index = (cdfvals - ecdfminus).argmax() + Dplus = (ecdfplus - cdfvals)[dplus_index] + Dminus = (cdfvals - ecdfminus)[dminus_index] + dplus_location = cdflocs[dplus_index] + dminus_location = cdflocs[dminus_index] + + if Dplus > Dminus: + D = Dplus + d_location = dplus_location + d_sign = 1 + else: + D = Dminus + d_location = dminus_location + d_sign = -1 + + pvalue = spstats.kstwo.sf(D, self.sum(flow=True)) + + return D, Dplus, Dminus, dplus_location, dminus_location, d_location, d_sign, pvalue def ks_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any: From 0e0819ec4f8b4849464a0ae4d68fbcb58ad9e7a8 Mon Sep 17 00:00:00 2001 From: iasonkrom Date: Sun, 18 Jun 2023 22:01:24 +0300 Subject: [PATCH 07/19] first attempt at ks_2samp --- src/hist/basehist.py | 14 +++- src/hist/stats.py | 181 ++++++++++++++++++++++++++++++++++++------- tests/test_stats.py | 54 +------------ 3 files changed, 170 insertions(+), 79 deletions(-) diff --git a/src/hist/basehist.py b/src/hist/basehist.py index 2ff6b71e..c69fca25 100644 --- a/src/hist/basehist.py +++ b/src/hist/basehist.py @@ -584,12 +584,20 @@ def ks_1samp( distribution: str | Callable[..., Any], args: Any = (), kwds: Any = None, + alternative: str = "two-sided", + mode: str = "auto", ) -> Any: from hist import stats - return stats.ks_1samp(self, distribution, args, kwds) + return stats.ks_1samp(self, distribution, args, kwds, alternative, mode) - def ks_2samp(self, other: hist.BaseHist) -> Any: + def ks_2samp( + self: hist.BaseHist, + other: hist.BaseHist, + equal_bins: bool = True, + alternative: str = "two-sided", + mode: str = "auto", + ) -> Any: from hist import stats - return stats.ks_2samp(self, other) + return stats.ks_2samp(self, other, equal_bins, alternative, mode) diff --git a/src/hist/stats.py b/src/hist/stats.py index 3f6073e6..1c559d81 100644 --- a/src/hist/stats.py +++ b/src/hist/stats.py @@ -1,10 +1,12 @@ from __future__ import annotations +import warnings from typing import Any, Callable import numpy as np from scipy import stats as spstats from scipy.stats import rv_continuous, rv_discrete +from scipy.stats._stats_py import _attempt_exact_2kssamp import hist @@ -46,15 +48,13 @@ def chisquare_1samp( ) observed = self.values() - totalentries = self.sum(flow=True) + totalentries = self.values(flow=True).sum() expected = np.diff(cdf(self.axes[0].edges, *args, **kwds)) * totalentries - variances = ( - expected # TODO: check if variances or expected should go in the denominator - ) - where = variances != 0 + # TODO: check if variances or expected should go in the denominator + where = expected != 0 squares = (expected - observed) ** 2 - ndof = where.sum() - 1 - chisq = np.sum(squares[where] / variances[where]) + ndof = np.sum(where) - 1 + chisq = np.sum(squares[where] / expected[where]) pvalue = spstats.chi2.sf(chisq, ndof) return chisq, ndof, pvalue @@ -101,7 +101,7 @@ def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any: variances = variances1 + variances2 where = variances != 0 chisq = np.sum(squares[where] / variances[where]) - k = where.sum() + k = np.sum(where) # TODO: check if ndof = k if totalentries1 == totalentries2 else k - 1 is correct ndof = k - 1 pvalue = spstats.chi2.sf(chisq, ndof) @@ -114,6 +114,8 @@ def ks_1samp( distribution: str | Callable[..., Any], args: Any = (), kwds: Any = None, + alternative: str = "two-sided", + mode: str = "auto", ) -> Any: kwds = {} if kwds is None else kwds @@ -121,12 +123,21 @@ def ks_1samp( raise NotImplementedError( f"Only 1D-histogram supports ks_1samp, try projecting {self.__class__.__name__} to 1D" ) + + if mode not in ["auto", "exact", "asymp"]: + raise ValueError(f"Invalid value for mode: {mode}") + alternative = {"t": "two-sided", "g": "greater", "l": "less"}.get( + alternative.lower()[0], alternative + ) + if alternative not in ["two-sided", "less", "greater"]: + raise ValueError(f"Invalid value for alternative: {alternative}") + cdf = _get_cdf_if_valid(distribution) cdflocs = self.axes[0].edges[:-1] cdfvals = cdf(cdflocs, *args, **kwds) observed = self.values(flow=True) - totalentries = self.sum(flow=True) + totalentries = observed.sum() ecdf = np.cumsum(observed) / totalentries ecdfplus = ecdf[1:-1] ecdfminus = ecdf[0:-2] @@ -146,12 +157,34 @@ def ks_1samp( d_location = dminus_location d_sign = -1 - pvalue = spstats.kstwo.sf(D, self.sum(flow=True)) + if alternative == "greater": + pvalue = spstats.ksone.sf(Dplus, totalentries) + return Dplus, dplus_location, 1, pvalue + if alternative == "less": + pvalue = spstats.ksone.sf(Dminus, totalentries) + return Dminus, dminus_location, -1, pvalue + + if mode == "auto": # Always select exact + mode = "exact" + if mode == "exact": + pvalue = spstats.kstwo.sf(D, totalentries) + elif mode == "asymp": + pvalue = spstats.kstwobign.sf(D * np.sqrt(totalentries)) + elif mode == "approx": + pvalue = 2 * spstats.ksone.sf(D, totalentries) + + pvalue = np.clip(pvalue, 0, 1) return D, Dplus, Dminus, dplus_location, dminus_location, d_location, d_sign, pvalue -def ks_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any: +def ks_2samp( + self: hist.BaseHist, + other: hist.BaseHist, + equal_bins: bool = True, + alternative: str = "two-sided", + mode: str = "auto", +) -> Any: if self.ndim != 1: raise NotImplementedError( f"Only 1D-histogram supports ks_2samp, try projecting {self.__class__.__name__} to 1D" @@ -164,21 +197,117 @@ def ks_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any: raise TypeError( f"Unknown type {other.__class__.__name__}, other must be a hist.Hist object" ) - # TODO: add support for compatible rebinning - if self.size != other.size: - raise NotImplementedError( - "Cannot compute chi2 from histograms with different binning, try rebinning" - ) - if not np.allclose(self.axes[0].edges, other.axes[0].edges): - raise NotImplementedError( - "Cannot compute chi2 from histograms with different binning, try rebinning" - ) - variances1 = self.variances() - variances2 = other.variances() - if variances1 is None or variances2 is None: - raise RuntimeError( - "Cannot compute from variance-less histograms, try a Weight storage" + if equal_bins: + if self.size != other.size: + raise NotImplementedError( + "Cannot compute KS from histograms with different binning, try rebinning" + ) + if not np.allclose(self.axes[0].edges, other.axes[0].edges): + raise NotImplementedError( + "Cannot compute KS from histograms with different binning, try rebinning" + ) + + if mode not in ["auto", "exact", "asymp"]: + raise ValueError(f"Invalid value for mode: {mode}") + alternative = {"t": "two-sided", "g": "greater", "l": "less"}.get( + alternative.lower()[0], alternative + ) + if alternative not in ["two-sided", "less", "greater"]: + raise ValueError(f"Invalid value for alternative: {alternative}") + + data1 = self.values(flow=True) + data2 = other.values(flow=True) + n1 = data1.sum() + n2 = data2.sum() + edges1 = self.axes[0].edges[:-1] + edges2 = other.axes[0].edges[:-1] + cdf1 = np.cumsum(data1) / n1 + cdf2 = np.cumsum(data2) / n2 + + if equal_bins: + cdflocs = edges1 + + cdf1 = cdf1[1:-1] + cdf2 = cdf2[1:-1] + cddiffs = cdf1 - cdf2 + + elif not equal_bins: + edges_all = np.sort(np.concatenate([edges1, edges2])) + cdflocs = edges_all + + # Use np.searchsorted to get the CDF values at all bin edges + cdf1_all = cdf1[np.searchsorted(edges1, edges_all, side="right")] + cdf2_all = cdf2[np.searchsorted(edges2, edges_all, side="right")] + cddiffs = cdf1_all - cdf2_all + + # Identify the location of the statistic + argminS = np.argmin(cddiffs) + argmaxS = np.argmax(cddiffs) + loc_minS = cdflocs[argminS] + loc_maxS = cdflocs[argmaxS] + + # Ensure sign of minS is not negative. + minS = np.clip(-cddiffs[argminS], 0, 1) + maxS = cddiffs[argmaxS] + + if alternative == "less" or (alternative == "two-sided" and minS > maxS): + d = minS + d_location = loc_minS + d_sign = -1 + else: + d = maxS + d_location = loc_maxS + d_sign = 1 + + g = np.gcd(int(n1), int(n2)) + n1g = n1 // g + n2g = n2 // g + pvalue = -np.inf + + if mode == "auto": # Always select exact + mode = "exact" + if mode == "exact" and n1g >= np.iinfo(np.int32).max / n2g: + # If lcm(n1, n2) is too big, switch from exact to asymp + mode = "asymp" + warnings.warn( + f"Exact ks_2samp calculation not possible with samples sizes " + f"{n1} and {n2}. Switching to 'asymp'.", + RuntimeWarning, + stacklevel=3, ) - return "Performing ks two sample test" + if mode == "exact": + success, d, pvalue = _attempt_exact_2kssamp(n1, n2, g, d, alternative) + if not success: + mode = "asymp" + warnings.warn( + f"ks_2samp: Exact calculation unsuccessful. " + f"Switching to method={mode}.", + RuntimeWarning, + stacklevel=3, + ) + + if mode == "asymp": + # The product n1*n2 is large. Use Smirnov's asymptoptic formula. + # Ensure float to avoid overflow in multiplication + # sorted because the one-sided formula is not symmetric in n1, n2 + m, n = sorted([float(n1), float(n2)], reverse=True) + en = m * n / (m + n) + if alternative == "two-sided": + pvalue = spstats.kstwo.sf(d, np.round(en)) + else: + z = np.sqrt(en) * d + # Use Hodges' suggested approximation Eqn 5.3 + # Requires m to be the larger of (n1, n2) + expt = -2 * z**2 - 2 * z * (m + 2 * n) / np.sqrt(m * n * (m + n)) / 3.0 + pvalue = np.exp(expt) + + pvalue = np.clip(pvalue, 0, 1) + D = d + Dplus = maxS + Dminus = minS + dplus_location = loc_maxS + dminus_location = loc_minS + + return D, Dplus, Dminus, dplus_location, dminus_location, d_location, d_sign, pvalue diff --git a/tests/test_stats.py b/tests/test_stats.py index 16f35b83..0458f6fe 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -2,7 +2,6 @@ import numpy as np import pytest -from scipy import stats as spstats import hist from hist import Hist @@ -67,8 +66,8 @@ def test_chisquare_2samp(): chisq, ndof, pvalue = h1.chisquare_2samp(h2) assert chisq == pytest.approx(13.577308400334086) - assert ndof == 15 - assert pvalue == pytest.approx(0.5577978656881688) + assert ndof == 14 + assert pvalue == pytest.approx(0.4816525905214355) h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5)) h.fill(np.random.normal(size=1000), np.random.normal(size=1000)) @@ -85,53 +84,8 @@ def test_chisquare_2samp(): def test_ks_1samp(): - np.random.seed(42) - - h = Hist(hist.axis.Regular(20, -5, 5)) - h.fill(np.random.normal(size=1000)) - - assert np.all( - h.ks_1samp("norm") == spstats.norm.cdf(h.axes[0].edges) - ) # placeholder to pass test for now - - h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5)) - h.fill(np.random.normal(size=1000), np.random.normal(size=1000)) - with pytest.raises(NotImplementedError): - h.ks_1samp("norm") - - h = Hist(hist.axis.Regular(20, -5, 5)) - h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000)) - with pytest.raises(RuntimeError): - h.ks_1samp("norm") - - h = Hist(hist.axis.Regular(20, -5, 5)) - h.fill(np.random.normal(size=1000)) - with pytest.raises(TypeError): - h.ks_1samp("not_a_distribution") - - with pytest.raises(TypeError): - h.ks_1samp(1) + pass def test_ks_2samp(): - np.random.seed(42) - - h1 = Hist(hist.axis.Regular(20, -5, 5, name="norm")) - h2 = Hist(hist.axis.Regular(20, -5, 5, name="norm")) - h1.fill(np.random.normal(size=1000)) - h2.fill(np.random.normal(size=1000)) - - assert h1.ks_2samp(h2) == "Performing ks two sample test" - - h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5)) - h.fill(np.random.normal(size=1000), np.random.normal(size=1000)) - with pytest.raises(NotImplementedError): - h.ks_2samp(h2) - - h = Hist(hist.axis.Regular(20, -5, 5)) - h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000)) - with pytest.raises(RuntimeError): - h.ks_2samp(h2) - - with pytest.raises(TypeError): - h.ks_2samp(1) + pass From 75a262bea817467b72f10b84a8896ff1f1c3eaa3 Mon Sep 17 00:00:00 2001 From: iasonkrom Date: Sun, 18 Jun 2023 22:30:55 +0300 Subject: [PATCH 08/19] use dtype=int when calculating total entries --- src/hist/stats.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/hist/stats.py b/src/hist/stats.py index 1c559d81..159a2715 100644 --- a/src/hist/stats.py +++ b/src/hist/stats.py @@ -48,7 +48,7 @@ def chisquare_1samp( ) observed = self.values() - totalentries = self.values(flow=True).sum() + totalentries = self.values(flow=True).sum(dtype=int) expected = np.diff(cdf(self.axes[0].edges, *args, **kwds)) * totalentries # TODO: check if variances or expected should go in the denominator where = expected != 0 @@ -92,8 +92,8 @@ def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any: counts1 = self.values() counts2 = other.values() - totalentries1 = self.values(flow=True).sum() - totalentries2 = other.values(flow=True).sum() + totalentries1 = self.values(flow=True).sum(dtype=int) + totalentries2 = other.values(flow=True).sum(dtype=int) squares = ( counts1 * np.sqrt(totalentries2 / totalentries1) - counts2 * np.sqrt(totalentries1 / totalentries2) @@ -137,7 +137,7 @@ def ks_1samp( cdflocs = self.axes[0].edges[:-1] cdfvals = cdf(cdflocs, *args, **kwds) observed = self.values(flow=True) - totalentries = observed.sum() + totalentries = observed.sum(dtype=int) ecdf = np.cumsum(observed) / totalentries ecdfplus = ecdf[1:-1] ecdfminus = ecdf[0:-2] @@ -200,12 +200,12 @@ def ks_2samp( if equal_bins: if self.size != other.size: - raise NotImplementedError( - "Cannot compute KS from histograms with different binning, try rebinning" + raise ValueError( + "Cannot compute KS from histograms with different binning, use equal_bins=False" ) if not np.allclose(self.axes[0].edges, other.axes[0].edges): - raise NotImplementedError( - "Cannot compute KS from histograms with different binning, try rebinning" + raise ValueError( + "Cannot compute KS from histograms with different binning, use equal_bins=False" ) if mode not in ["auto", "exact", "asymp"]: @@ -218,8 +218,8 @@ def ks_2samp( data1 = self.values(flow=True) data2 = other.values(flow=True) - n1 = data1.sum() - n2 = data2.sum() + n1 = data1.sum(dtype=int) + n2 = data2.sum(dtype=int) edges1 = self.axes[0].edges[:-1] edges2 = other.axes[0].edges[:-1] cdf1 = np.cumsum(data1) / n1 From d21759faf903c79f48d2e94b68c487eae2a88b7f Mon Sep 17 00:00:00 2001 From: iasonkrom Date: Mon, 19 Jun 2023 02:13:51 +0300 Subject: [PATCH 09/19] add flow option in chisquare tests --- src/hist/basehist.py | 13 +++++++------ src/hist/stats.py | 27 +++++++++++++++++---------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/hist/basehist.py b/src/hist/basehist.py index 00324761..02d92c87 100644 --- a/src/hist/basehist.py +++ b/src/hist/basehist.py @@ -632,22 +632,23 @@ def sum(self, flow: bool = False) -> float | bh.accumulators.Accumulator: return super().sum(flow=flow) def chisquare_1samp( - self: hist.BaseHist, + self, distribution: str | Callable[..., Any], + flow: bool = False, args: Any = (), kwds: Any = None, ) -> Any: from hist import stats - return stats.chisquare_1samp(self, distribution, args, kwds) + return stats.chisquare_1samp(self, distribution, flow, args, kwds) - def chisquare_2samp(self, other: hist.BaseHist) -> Any: + def chisquare_2samp(self, other: hist.BaseHist, flow: bool = False) -> Any: from hist import stats - return stats.chisquare_2samp(self, other) + return stats.chisquare_2samp(self, other, flow) def ks_1samp( - self: hist.BaseHist, + self, distribution: str | Callable[..., Any], args: Any = (), kwds: Any = None, @@ -659,7 +660,7 @@ def ks_1samp( return stats.ks_1samp(self, distribution, args, kwds, alternative, mode) def ks_2samp( - self: hist.BaseHist, + self, other: hist.BaseHist, equal_bins: bool = True, alternative: str = "two-sided", diff --git a/src/hist/stats.py b/src/hist/stats.py index 159a2715..cda84a23 100644 --- a/src/hist/stats.py +++ b/src/hist/stats.py @@ -30,6 +30,7 @@ def _get_cdf_if_valid(obj: Any) -> Any: def chisquare_1samp( self: hist.BaseHist, distribution: str | Callable[..., Any], + flow: bool = False, args: Any = (), kwds: Any = None, ) -> Any: @@ -41,26 +42,32 @@ def chisquare_1samp( ) cdf = _get_cdf_if_valid(distribution) - variances = self.variances() + variances = self.variances(flow=flow) if variances is None: raise RuntimeError( "Cannot compute from a variance-less histogram, try a Weight storage" ) - observed = self.values() + observed = self.values(flow=flow) totalentries = self.values(flow=True).sum(dtype=int) - expected = np.diff(cdf(self.axes[0].edges, *args, **kwds)) * totalentries + cdfvals = cdf(self.axes[0].edges, *args, **kwds) + if flow: + cdfvals = np.concatenate([[0], cdfvals, [1]]) + expected = np.diff(cdfvals) * totalentries # TODO: check if variances or expected should go in the denominator - where = expected != 0 + # variances fails if bins have low statistics + where = variances != 0 squares = (expected - observed) ** 2 ndof = np.sum(where) - 1 - chisq = np.sum(squares[where] / expected[where]) + chisq = np.sum(squares[where] / variances[where]) pvalue = spstats.chi2.sf(chisq, ndof) return chisq, ndof, pvalue -def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any: +def chisquare_2samp( + self: hist.BaseHist, other: hist.BaseHist, flow: bool = False +) -> Any: if self.ndim != 1: raise NotImplementedError( f"Only 1D-histogram supports chisquare_2samp, try projecting {self.__class__.__name__} to 1D" @@ -83,15 +90,15 @@ def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any: "Cannot compute chi2 from histograms with different binning, try rebinning" ) - variances1 = self.variances() - variances2 = other.variances() + variances1 = self.variances(flow=flow) + variances2 = other.variances(flow=flow) if variances1 is None or variances2 is None: raise RuntimeError( "Cannot compute from variance-less histograms, try a Weight storage" ) - counts1 = self.values() - counts2 = other.values() + counts1 = self.values(flow=flow) + counts2 = other.values(flow=flow) totalentries1 = self.values(flow=True).sum(dtype=int) totalentries2 = other.values(flow=True).sum(dtype=int) squares = ( From d0ed901baae95c50cd6487997c21fe7b0f7c816d Mon Sep 17 00:00:00 2001 From: iasonkrom Date: Mon, 19 Jun 2023 02:39:39 +0300 Subject: [PATCH 10/19] bin centers for ecdf is more correct --- src/hist/stats.py | 2 +- tests/test_stats.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/hist/stats.py b/src/hist/stats.py index cda84a23..5928c445 100644 --- a/src/hist/stats.py +++ b/src/hist/stats.py @@ -141,7 +141,7 @@ def ks_1samp( cdf = _get_cdf_if_valid(distribution) - cdflocs = self.axes[0].edges[:-1] + cdflocs = self.axes[0].centers cdfvals = cdf(cdflocs, *args, **kwds) observed = self.values(flow=True) totalentries = observed.sum(dtype=int) diff --git a/tests/test_stats.py b/tests/test_stats.py index 0458f6fe..ed5d23bc 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -16,14 +16,14 @@ def test_chisquare_1samp(): h.fill(np.random.normal(size=1000)) chisq, ndof, pvalue = h.chisquare_1samp("norm") - assert chisq == pytest.approx(11.640108080468355) - assert ndof == 19 - assert pvalue == pytest.approx(0.9004272294139111) + assert chisq == pytest.approx(9.47425856230132) + assert ndof == 14 + assert pvalue == pytest.approx(0.7995235818496339) chisq, ndof, pvalue = h.chisquare_1samp(norm.cdf, args=(0, 1)) - assert chisq == pytest.approx(11.64010808046843) - assert ndof == 19 - assert pvalue == pytest.approx(0.9004272294139111) + assert chisq == pytest.approx(9.47425856230132) + assert ndof == 14 + assert pvalue == pytest.approx(0.7995235818496339) h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5)) h.fill(np.random.normal(size=1000), np.random.normal(size=1000)) From 4d517f7ca3099a878b84294fd0fad2b0a74b095e Mon Sep 17 00:00:00 2001 From: iasonkrom Date: Mon, 19 Jun 2023 16:28:15 +0300 Subject: [PATCH 11/19] add flow option in ks_2samp --- src/hist/basehist.py | 3 ++- src/hist/stats.py | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/hist/basehist.py b/src/hist/basehist.py index 02d92c87..e44faa71 100644 --- a/src/hist/basehist.py +++ b/src/hist/basehist.py @@ -663,9 +663,10 @@ def ks_2samp( self, other: hist.BaseHist, equal_bins: bool = True, + flow: bool = False, alternative: str = "two-sided", mode: str = "auto", ) -> Any: from hist import stats - return stats.ks_2samp(self, other, equal_bins, alternative, mode) + return stats.ks_2samp(self, other, equal_bins, flow, alternative, mode) diff --git a/src/hist/stats.py b/src/hist/stats.py index 5928c445..83d38003 100644 --- a/src/hist/stats.py +++ b/src/hist/stats.py @@ -189,6 +189,7 @@ def ks_2samp( self: hist.BaseHist, other: hist.BaseHist, equal_bins: bool = True, + flow: bool = False, alternative: str = "two-sided", mode: str = "auto", ) -> Any: @@ -227,22 +228,31 @@ def ks_2samp( data2 = other.values(flow=True) n1 = data1.sum(dtype=int) n2 = data2.sum(dtype=int) - edges1 = self.axes[0].edges[:-1] - edges2 = other.axes[0].edges[:-1] + edges1 = self.axes[0].centers + edges2 = other.axes[0].centers cdf1 = np.cumsum(data1) / n1 cdf2 = np.cumsum(data2) / n2 + if flow: + edges1 = np.concatenate([[-np.inf], edges1, [np.inf]]) + edges2 = np.concatenate([[-np.inf], edges2, [np.inf]]) if equal_bins: cdflocs = edges1 - cdf1 = cdf1[1:-1] - cdf2 = cdf2[1:-1] + if not flow: + cdf1 = cdf1[1:-1] + cdf2 = cdf2[1:-1] + cddiffs = cdf1 - cdf2 elif not equal_bins: edges_all = np.sort(np.concatenate([edges1, edges2])) cdflocs = edges_all + if flow: + cdf1 = np.concatenate([[0], cdf1]) + cdf2 = np.concatenate([[0], cdf2]) + # Use np.searchsorted to get the CDF values at all bin edges cdf1_all = cdf1[np.searchsorted(edges1, edges_all, side="right")] cdf2_all = cdf2[np.searchsorted(edges2, edges_all, side="right")] From 7d603899d34d5dec89fcd0e992be705d066e18ec Mon Sep 17 00:00:00 2001 From: fabriceMUKARAGE Date: Thu, 22 Jun 2023 21:42:08 +0200 Subject: [PATCH 12/19] wip: chisquare test suggestions --- pyproject.toml | 3 +++ tests/test_stats.py | 27 +++++++++++++++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 82067cee..fe9a72a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,6 +72,7 @@ test = [ "pytest-mpl >=0.12", "dask[dataframe] >=2022; python_version>='3.8'", "dask_histogram >=2023.1; python_version>='3.8'", + "numba_stats", ] dev = [ "pytest >=6", @@ -83,6 +84,7 @@ dev = [ "ipykernel", "dask[dataframe] >=2022; python_version>='3.8'", "dask_histogram >=2023.1; python_version>='3.8'", + "numba_stats", ] docs = [ "pytest >=6", @@ -176,6 +178,7 @@ messages_control.disable = [ "disallowed-name", "cyclic-import", "redefined-builtin", + "too-many-public-methods", ] [tool.ruff] diff --git a/tests/test_stats.py b/tests/test_stats.py index ed5d23bc..8340c96a 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -5,36 +5,47 @@ import hist from hist import Hist +from numba_stats import norm +import functools +cdf =functools.partial(norm.cdf, loc=0, scale=1) -def test_chisquare_1samp(): - from numba_stats import norm +pytest.importorskip("scipy") +@pytest.mark.parametrize("arg", ["norm", cdf]) +def test_chisquare_1samp(arg): np.random.seed(42) h = Hist(hist.axis.Regular(20, -5, 5)) h.fill(np.random.normal(size=1000)) - chisq, ndof, pvalue = h.chisquare_1samp("norm") - assert chisq == pytest.approx(9.47425856230132) - assert ndof == 14 - assert pvalue == pytest.approx(0.7995235818496339) - - chisq, ndof, pvalue = h.chisquare_1samp(norm.cdf, args=(0, 1)) + chisq, ndof, pvalue = h.chisquare_1samp(arg) assert chisq == pytest.approx(9.47425856230132) assert ndof == 14 assert pvalue == pytest.approx(0.7995235818496339) + +def test_chisquare_1samp_2d(): + np.random.seed(42) h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5)) h.fill(np.random.normal(size=1000), np.random.normal(size=1000)) with pytest.raises(NotImplementedError): h.chisquare_1samp("norm") +def test_chisquare_1samp_weight(): + np.random.seed(42) + h = Hist(hist.axis.Regular(20, -5, 5), storage=hist.storage.Weight()) + h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000)) + h.chisquare_1samp("norm") + +def test_chisquare_1samp_weight_raises(): + np.random.seed(42) h = Hist(hist.axis.Regular(20, -5, 5)) h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000)) with pytest.raises(RuntimeError): h.chisquare_1samp("norm") +def test_chisquare_1samp_invalid_args(): h = Hist(hist.axis.Regular(20, -5, 5)) h.fill(np.random.normal(size=1000)) with pytest.raises(TypeError): From 223ef96ea9c834ce29bb3868fbe3362ba7553ee9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 22 Jun 2023 21:40:30 +0000 Subject: [PATCH 13/19] style: pre-commit fixes --- tests/test_stats.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/test_stats.py b/tests/test_stats.py index 8340c96a..07845300 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -1,17 +1,19 @@ from __future__ import annotations +import functools + import numpy as np import pytest +from numba_stats import norm import hist from hist import Hist -from numba_stats import norm -import functools -cdf =functools.partial(norm.cdf, loc=0, scale=1) +cdf = functools.partial(norm.cdf, loc=0, scale=1) pytest.importorskip("scipy") + @pytest.mark.parametrize("arg", ["norm", cdf]) def test_chisquare_1samp(arg): np.random.seed(42) @@ -23,7 +25,8 @@ def test_chisquare_1samp(arg): assert chisq == pytest.approx(9.47425856230132) assert ndof == 14 assert pvalue == pytest.approx(0.7995235818496339) - + + def test_chisquare_1samp_2d(): np.random.seed(42) @@ -32,11 +35,13 @@ def test_chisquare_1samp_2d(): with pytest.raises(NotImplementedError): h.chisquare_1samp("norm") + def test_chisquare_1samp_weight(): np.random.seed(42) h = Hist(hist.axis.Regular(20, -5, 5), storage=hist.storage.Weight()) h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000)) - h.chisquare_1samp("norm") + h.chisquare_1samp("norm") + def test_chisquare_1samp_weight_raises(): np.random.seed(42) @@ -45,6 +50,7 @@ def test_chisquare_1samp_weight_raises(): with pytest.raises(RuntimeError): h.chisquare_1samp("norm") + def test_chisquare_1samp_invalid_args(): h = Hist(hist.axis.Regular(20, -5, 5)) h.fill(np.random.normal(size=1000)) From 0c0d92a274bed22e5040f6a3ce3875db61496687 Mon Sep 17 00:00:00 2001 From: Fabrice MUKARAGE <79330798+fabriceMUKARAGE@users.noreply.github.com> Date: Tue, 27 Jun 2023 21:41:20 +0200 Subject: [PATCH 14/19] fix: Hist requires mplhep to plot Added pytest.importorskip("mplhep") --- tests/test_plot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_plot.py b/tests/test_plot.py index cededea7..d0259ff3 100644 --- a/tests/test_plot.py +++ b/tests/test_plot.py @@ -7,6 +7,7 @@ plt = pytest.importorskip("matplotlib.pyplot") pytest.importorskip("scipy") +pytest.importorskip("mplhep") def test_general_plot1d(): From abcda1b8edf7bb7d61c39c51faee720955a6075a Mon Sep 17 00:00:00 2001 From: Fabrice MUKARAGE <79330798+fabriceMUKARAGE@users.noreply.github.com> Date: Tue, 27 Jun 2023 21:50:31 +0200 Subject: [PATCH 15/19] fix: mplhep required to plot Added pytest.importorskip("mplhep") --- tests/test_stacks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_stacks.py b/tests/test_stacks.py index e3ed7be7..60a35c43 100644 --- a/tests/test_stacks.py +++ b/tests/test_stacks.py @@ -7,6 +7,7 @@ from pytest import approx from hist import Hist, NamedHist, Stack, axis +pytest.importorskip("mplhep") np.random.seed(42) From e630e3f6588619571dc07b6a83e70d681baa9576 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 27 Jun 2023 19:50:50 +0000 Subject: [PATCH 16/19] style: pre-commit fixes --- tests/test_stacks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_stacks.py b/tests/test_stacks.py index 60a35c43..d8607131 100644 --- a/tests/test_stacks.py +++ b/tests/test_stacks.py @@ -7,6 +7,7 @@ from pytest import approx from hist import Hist, NamedHist, Stack, axis + pytest.importorskip("mplhep") np.random.seed(42) From 85e995537595f13ffb654b2b3dd8c06beb5420e1 Mon Sep 17 00:00:00 2001 From: iasonkrom Date: Mon, 3 Jul 2023 14:43:01 +0100 Subject: [PATCH 17/19] avoid hidden scipy object --- src/hist/stats.py | 258 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 257 insertions(+), 1 deletion(-) diff --git a/src/hist/stats.py b/src/hist/stats.py index 83d38003..f21d258a 100644 --- a/src/hist/stats.py +++ b/src/hist/stats.py @@ -4,9 +4,9 @@ from typing import Any, Callable import numpy as np +from scipy import special from scipy import stats as spstats from scipy.stats import rv_continuous, rv_discrete -from scipy.stats._stats_py import _attempt_exact_2kssamp import hist @@ -27,6 +27,262 @@ def _get_cdf_if_valid(obj: Any) -> Any: ) +def _compute_outer_prob_inside_method(m: int, n: int, g: int, h: int) -> Any: + """ + Count the proportion of paths that do not stay strictly inside two + diagonal lines. + + Parameters + ---------- + m : integer + m > 0 + n : integer + n > 0 + g : integer + g is greatest common divisor of m and n + h : integer + 0 <= h <= lcm(m,n) + + Returns + ------- + p : float + The proportion of paths that do not stay inside the two lines. + + The classical algorithm counts the integer lattice paths from (0, 0) + to (m, n) which satisfy |x/m - y/n| < h / lcm(m, n). + The paths make steps of size +1 in either positive x or positive y + directions. + We are, however, interested in 1 - proportion to computes p-values, + so we change the recursion to compute 1 - p directly while staying + within the "inside method" a described by Hodges. + + We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk. + Hodges, J.L. Jr., + "The Significance Probability of the Smirnov Two-Sample Test," + Arkiv fiur Matematik, 3, No. 43 (1958), 469-86. + + For the recursion for 1-p see + Viehmann, T.: "Numerically more stable computation of the p-values + for the two-sample Kolmogorov-Smirnov test," arXiv: 2102.08037 + + """ + # Probability is symmetrical in m, n. Computation below uses m >= n. + if m < n: + m, n = n, m + mg = m // g + ng = n // g + + # Count the integer lattice paths from (0, 0) to (m, n) which satisfy + # |nx/g - my/g| < h. + # Compute matrix A such that: + # A(x, 0) = A(0, y) = 1 + # A(x, y) = A(x, y-1) + A(x-1, y), for x,y>=1, except that + # A(x, y) = 0 if |x/m - y/n|>= h + # Probability is A(m, n)/binom(m+n, n) + # Optimizations exist for m==n, m==n*p. + # Only need to preserve a single column of A, and only a + # sliding window of it. + # minj keeps track of the slide. + minj, maxj = 0, min(int(np.ceil(h / mg)), n + 1) + curlen = maxj - minj + # Make a vector long enough to hold maximum window needed. + lenA = min(2 * maxj + 2, n + 1) + # This is an integer calculation, but the entries are essentially + # binomial coefficients, hence grow quickly. + # Scaling after each column is computed avoids dividing by a + # large binomial coefficient at the end, but is not sufficient to avoid + # the large dyanamic range which appears during the calculation. + # Instead we rescale based on the magnitude of the right most term in + # the column and keep track of an exponent separately and apply + # it at the end of the calculation. Similarly when multiplying by + # the binomial coefficient + dtype = np.float64 + A = np.ones(lenA, dtype=dtype) + # Initialize the first column + A[minj:maxj] = 0.0 + for i in range(1, m + 1): + # Generate the next column. + # First calculate the sliding window + lastminj, lastlen = minj, curlen + minj = max(int(np.floor((ng * i - h) / mg)) + 1, 0) + minj = min(minj, n) + maxj = min(int(np.ceil((ng * i + h) / mg)), n + 1) + if maxj <= minj: + return 1.0 + # Now fill in the values. We cannot use cumsum, unfortunately. + val = 0.0 if minj == 0 else 1.0 + for jj in range(maxj - minj): + j = jj + minj + val = (A[jj + minj - lastminj] * i + val * j) / (i + j) + A[jj] = val + curlen = maxj - minj + if lastlen > curlen: + # Set some carried-over elements to 1 + A[maxj - minj : maxj - minj + (lastlen - curlen)] = 1 + + return A[maxj - minj - 1] + + +def _compute_prob_outside_square(n: int, h: int) -> Any: + """ + Compute the proportion of paths that pass outside the two diagonal lines. + + Parameters + ---------- + n : integer + n > 0 + h : integer + 0 <= h <= n + + Returns + ------- + p : float + The proportion of paths that pass outside the lines x-y = +/-h. + + """ + # Compute Pr(D_{n,n} >= h/n) + # Prob = 2 * ( binom(2n, n-h) - binom(2n, n-2a) + binom(2n, n-3a) - ... ) + # / binom(2n, n) + # This formulation exhibits subtractive cancellation. + # Instead divide each term by binom(2n, n), then factor common terms + # and use a Horner-like algorithm + # P = 2 * A0 * (1 - A1*(1 - A2*(1 - A3*(1 - A4*(...))))) + + P = 0.0 + k = int(np.floor(n / h)) + while k >= 0: + p1 = 1.0 + # Each of the Ai terms has numerator and denominator with + # h simple terms. + for j in range(h): + p1 = (n - k * h - j) * p1 / (n + k * h + j + 1) + P = p1 * (1.0 - P) + k -= 1 + return 2 * P + + +def _count_paths_outside_method(m: int, n: int, g: int, h: int) -> Any: + """Count the number of paths that pass outside the specified diagonal. + + Parameters + ---------- + m : integer + m > 0 + n : integer + n > 0 + g : integer + g is greatest common divisor of m and n + h : integer + 0 <= h <= lcm(m,n) + + Returns + ------- + p : float + The number of paths that go low. + The calculation may overflow - check for a finite answer. + + Notes + ----- + Count the integer lattice paths from (0, 0) to (m, n), which at some + point (x, y) along the path, satisfy: + m*y <= n*x - h*g + The paths make steps of size +1 in either positive x or positive y + directions. + + We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk. + Hodges, J.L. Jr., + "The Significance Probability of the Smirnov Two-Sample Test," + Arkiv fiur Matematik, 3, No. 43 (1958), 469-86. + + """ + # Compute #paths which stay lower than x/m-y/n = h/lcm(m,n) + # B(x, y) = #{paths from (0,0) to (x,y) without + # previously crossing the boundary} + # = binom(x, y) - #{paths which already reached the boundary} + # Multiply by the number of path extensions going from (x, y) to (m, n) + # Sum. + + # Probability is symmetrical in m, n. Computation below assumes m >= n. + if m < n: + m, n = n, m + mg = m // g + ng = n // g + + # Not every x needs to be considered. + # xj holds the list of x values to be checked. + # Wherever n*x/m + ng*h crosses an integer + lxj = n + (mg - h) // mg + xj = [(h + mg * j + ng - 1) // ng for j in range(lxj)] + # B is an array just holding a few values of B(x,y), the ones needed. + # B[j] == B(x_j, j) + if lxj == 0: + return special.binom(m + n, n) + B = np.zeros(lxj) + B[0] = 1 + # Compute the B(x, y) terms + for j in range(1, lxj): + Bj = special.binom(xj[j] + j, j) + for i in range(j): + bin = special.binom(xj[j] - xj[i] + j - i, j - i) + Bj -= bin * B[i] + B[j] = Bj + # Compute the number of path extensions... + num_paths = 0 + for j in range(lxj): + bin = special.binom((m - xj[j]) + (n - j), n - j) + term = B[j] * bin + num_paths += term + return num_paths + + +def _attempt_exact_2kssamp(n1: int, n2: int, g: int, d: float, alternative: str) -> Any: + """Attempts to compute the exact 2sample probability. + + n1, n2 are the sample sizes + g is the gcd(n1, n2) + d is the computed max difference in ECDFs + + Returns (success, d, probability) + """ + lcm = (n1 // g) * n2 + h = int(np.round(d * lcm)) + d = h * 1.0 / lcm + if h == 0: + return True, d, 1.0 + saw_fp_error, prob = False, np.nan + try: + with np.errstate(invalid="raise", over="raise"): + if alternative == "two-sided": + if n1 == n2: + prob = _compute_prob_outside_square(n1, h) + else: + prob = _compute_outer_prob_inside_method(n1, n2, g, h) + else: + if n1 == n2: + # prob = binom(2n, n-h) / binom(2n, n) + # Evaluating in that form incurs roundoff errors + # from special.binom. Instead calculate directly + jrange = np.arange(h) + prob = float(np.prod((n1 - jrange) / (n1 + jrange + 1.0))) + else: + with np.errstate(over="raise"): + num_paths = _count_paths_outside_method(n1, n2, g, h) + bin = special.binom(n1 + n2, n1) + if num_paths > bin or np.isinf(bin): + saw_fp_error = True + else: + prob = num_paths / bin + + except (FloatingPointError, OverflowError): + saw_fp_error = True + + if saw_fp_error: + return False, d, np.nan + if not (0 <= prob <= 1): + return False, d, prob + return True, d, prob + + def chisquare_1samp( self: hist.BaseHist, distribution: str | Callable[..., Any], From d171bd964d09c65c9f40a5a5dd67d80665b87f9d Mon Sep 17 00:00:00 2001 From: iasonkrom Date: Mon, 3 Jul 2023 16:50:20 +0100 Subject: [PATCH 18/19] similar docstrings --- src/hist/stats.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/hist/stats.py b/src/hist/stats.py index f21d258a..7ec3967b 100644 --- a/src/hist/stats.py +++ b/src/hist/stats.py @@ -181,8 +181,6 @@ def _count_paths_outside_method(m: int, n: int, g: int, h: int) -> Any: The number of paths that go low. The calculation may overflow - check for a finite answer. - Notes - ----- Count the integer lattice paths from (0, 0) to (m, n), which at some point (x, y) along the path, satisfy: m*y <= n*x - h*g @@ -238,11 +236,27 @@ def _count_paths_outside_method(m: int, n: int, g: int, h: int) -> Any: def _attempt_exact_2kssamp(n1: int, n2: int, g: int, d: float, alternative: str) -> Any: """Attempts to compute the exact 2sample probability. - n1, n2 are the sample sizes - g is the gcd(n1, n2) - d is the computed max difference in ECDFs + Parameters + ---------- + n1 : integer + sample size of sample 1 + n2 : integer + sample size of sample 2 + g : integer + greatest common divisor of n1 and n2 + d : float + max difference in ECDFs + alternative : string + alternative hypothesis, either 'two-sided', 'less' or 'greater' - Returns (success, d, probability) + Returns + ------- + success : bool + True if the calculation was successful + d : float + max difference in ECDFs + prob : float + The probability of the test statistic under the null hypothesis. """ lcm = (n1 // g) * n2 h = int(np.round(d * lcm)) From ccc3dae59df63ed9973cf7416a83a893123efab6 Mon Sep 17 00:00:00 2001 From: iasonkrom Date: Tue, 4 Jul 2023 14:22:41 +0100 Subject: [PATCH 19/19] pylint compliance --- src/hist/stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hist/stats.py b/src/hist/stats.py index 7ec3967b..6eab17ac 100644 --- a/src/hist/stats.py +++ b/src/hist/stats.py @@ -292,7 +292,7 @@ def _attempt_exact_2kssamp(n1: int, n2: int, g: int, d: float, alternative: str) if saw_fp_error: return False, d, np.nan - if not (0 <= prob <= 1): + if not 0 <= prob <= 1: return False, d, prob return True, d, prob