From 0510a3a376ba3da99b0dfa4364347c6f9524ea75 Mon Sep 17 00:00:00 2001
From: iasonkrom <iason.krom@gmail.com>
Date: Thu, 8 Jun 2023 12:47:36 +0300
Subject: [PATCH 01/19] add only very basic structure

---
 src/hist/basehist.py | 20 ++++++++++++++++++++
 src/hist/stats.py    | 27 +++++++++++++++++++++++++++
 tests/test_stats.py  | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+)
 create mode 100644 src/hist/stats.py
 create mode 100644 tests/test_stats.py

diff --git a/src/hist/basehist.py b/src/hist/basehist.py
index bfaaa8f9..c53f59b8 100644
--- a/src/hist/basehist.py
+++ b/src/hist/basehist.py
@@ -547,3 +547,23 @@ def sum(self, flow: bool = False) -> float | bh.accumulators.Accumulator:
             raise AssertionError(f"Unsupported storage type {storage}")
 
         return super().sum(flow=flow)
+
+    def chisquare_1samp(self) -> str | NotImplementedError:
+        from hist import stats
+
+        return stats.chisquare_1samp(self)
+
+    def chisquare_2samp(self) -> str | NotImplementedError:
+        from hist import stats
+
+        return stats.chisquare_2samp(self)
+
+    def ks_1samp(self) -> str | NotImplementedError:
+        from hist import stats
+
+        return stats.ks_1samp(self)
+
+    def ks_2samp(self) -> str | NotImplementedError:
+        from hist import stats
+
+        return stats.ks_2samp(self)
diff --git a/src/hist/stats.py b/src/hist/stats.py
new file mode 100644
index 00000000..0535f110
--- /dev/null
+++ b/src/hist/stats.py
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+import hist
+
+
+def chisquare_1samp(self: hist.BaseHist) -> str | NotImplementedError:
+    if self.ndim != 1:
+        raise NotImplementedError("chisquare_1samp is only supported for 1D histograms")
+    return "Performing chi square one sample test"
+
+
+def chisquare_2samp(self: hist.BaseHist) -> str | NotImplementedError:
+    if self.ndim != 1:
+        raise NotImplementedError("chisquare_2samp is only supported for 1D histograms")
+    return "Performing chi square two sample test"
+
+
+def ks_1samp(self: hist.BaseHist) -> str | NotImplementedError:
+    if self.ndim != 1:
+        raise NotImplementedError("ks_1samp is only supported for 1D histograms")
+    return "Performing ks one sample test"
+
+
+def ks_2samp(self: hist.BaseHist) -> str | NotImplementedError:
+    if self.ndim != 1:
+        raise NotImplementedError("ks_2samp is only supported for 1D histograms")
+    return "Performing ks two sample test"
diff --git a/tests/test_stats.py b/tests/test_stats.py
new file mode 100644
index 00000000..b3792026
--- /dev/null
+++ b/tests/test_stats.py
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+import hist
+from hist import Hist
+
+
+def test_stattest():
+    h = Hist(hist.axis.Regular(20, -5, 5))
+    h.fill(np.random.normal(size=1000))
+
+    assert h.chisquare_1samp() == "Performing chi square one sample test"
+    assert h.chisquare_2samp() == "Performing chi square two sample test"
+    assert h.ks_1samp() == "Performing ks one sample test"
+    assert h.ks_2samp() == "Performing ks two sample test"
+
+    h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5))
+    h.fill(np.random.normal(size=1000), np.random.normal(size=1000))
+
+    with pytest.raises(
+        NotImplementedError, match="chisquare_1samp is only supported for 1D histograms"
+    ):
+        h.chisquare_1samp()
+    with pytest.raises(
+        NotImplementedError, match="chisquare_2samp is only supported for 1D histograms"
+    ):
+        h.chisquare_2samp()
+    with pytest.raises(
+        NotImplementedError, match="ks_1samp is only supported for 1D histograms"
+    ):
+        h.ks_1samp()
+    with pytest.raises(
+        NotImplementedError, match="ks_2samp is only supported for 1D histograms"
+    ):
+        h.ks_2samp()

From 60443c46448ec2ba17cf29ddc658d0406a770b01 Mon Sep 17 00:00:00 2001
From: iasonkrom <iason.krom@gmail.com>
Date: Fri, 9 Jun 2023 15:43:37 +0300
Subject: [PATCH 02/19] first attemptt at chisquare tests

---
 src/hist/basehist.py |  26 +++++---
 src/hist/stats.py    | 141 +++++++++++++++++++++++++++++++++++++++----
 tests/test_stats.py  | 105 ++++++++++++++++++++++++++------
 3 files changed, 233 insertions(+), 39 deletions(-)

diff --git a/src/hist/basehist.py b/src/hist/basehist.py
index c53f59b8..0f104364 100644
--- a/src/hist/basehist.py
+++ b/src/hist/basehist.py
@@ -548,22 +548,32 @@ def sum(self, flow: bool = False) -> float | bh.accumulators.Accumulator:
 
         return super().sum(flow=flow)
 
-    def chisquare_1samp(self) -> str | NotImplementedError:
+    def chisquare_1samp(
+        self: hist.BaseHist,
+        distribution: str | Callable[..., Any],
+        args: Any = (),
+        kwds: Any = None,
+    ) -> Any:
         from hist import stats
 
-        return stats.chisquare_1samp(self)
+        return stats.chisquare_1samp(self, distribution, args, kwds)
 
-    def chisquare_2samp(self) -> str | NotImplementedError:
+    def chisquare_2samp(self, other: hist.BaseHist) -> Any:
         from hist import stats
 
-        return stats.chisquare_2samp(self)
+        return stats.chisquare_2samp(self, other)
 
-    def ks_1samp(self) -> str | NotImplementedError:
+    def ks_1samp(
+        self: hist.BaseHist,
+        distribution: str | Callable[..., Any],
+        args: Any = (),
+        kwds: Any = None,
+    ) -> Any:
         from hist import stats
 
-        return stats.ks_1samp(self)
+        return stats.ks_1samp(self, distribution, args, kwds)
 
-    def ks_2samp(self) -> str | NotImplementedError:
+    def ks_2samp(self, other: hist.BaseHist) -> Any:
         from hist import stats
 
-        return stats.ks_2samp(self)
+        return stats.ks_2samp(self, other)
diff --git a/src/hist/stats.py b/src/hist/stats.py
index 0535f110..846cd75f 100644
--- a/src/hist/stats.py
+++ b/src/hist/stats.py
@@ -1,27 +1,146 @@
 from __future__ import annotations
 
+from typing import Any, Callable
+
+import numpy as np
+from scipy import stats as spstats
+
 import hist
 
 
-def chisquare_1samp(self: hist.BaseHist) -> str | NotImplementedError:
+def chisquare_1samp(
+    self: hist.BaseHist,
+    distribution: str | Callable[..., Any],
+    args: Any = (),
+    kwds: Any = None,
+) -> Any:
+    kwds = {} if kwds is None else kwds
+
     if self.ndim != 1:
-        raise NotImplementedError("chisquare_1samp is only supported for 1D histograms")
-    return "Performing chi square one sample test"
+        raise NotImplementedError(
+            f"Only 1D-histogram supports chisquare_1samp, try projecting {self.__class__.__name__} to 1D"
+        )
+    if isinstance(distribution, str):
+        if not hasattr(spstats, distribution):
+            raise ValueError(
+                f"Unknown distribution {distribution}, try one of the defined scipy distributions"
+            )
+        cdf = getattr(spstats, distribution).cdf
+    elif callable(distribution):
+        cdf = distribution
+    else:
+        raise TypeError(
+            f"Unknown distribution type {distribution}, try one of the defined scipy distributions or a callable CDF"
+        )
+
+    variances = self.variances()
+    if variances is None:
+        raise RuntimeError(
+            "Cannot compute from a variance-less histogram, try a Weight storage"
+        )
 
+    observed = self.values()
+    totalentries = self.sum()
+    expected = np.diff(cdf(self.axes[0].edges, *args, **kwds)) * totalentries
+    where = variances != 0
+    squares = (expected[where] - observed[where]) ** 2 / variances[where]
+    ndof = len(observed) - 1
+    chisq = np.sum(squares)
+    pvalue = spstats.chi2.sf(chisq, ndof)
 
-def chisquare_2samp(self: hist.BaseHist) -> str | NotImplementedError:
+    return chisq, ndof, pvalue
+
+
+def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any:
     if self.ndim != 1:
-        raise NotImplementedError("chisquare_2samp is only supported for 1D histograms")
-    return "Performing chi square two sample test"
+        raise NotImplementedError(
+            f"Only 1D-histogram supports chisquare_2samp, try projecting {self.__class__.__name__} to 1D"
+        )
+    if isinstance(other, hist.hist.Hist) and other.ndim != 1:
+        raise NotImplementedError(
+            f"Only 1D-histogram supports chisquare_2samp, try projecting other={other.__class__.__name__} to 1D"
+        )
+    if not isinstance(other, hist.hist.Hist):
+        raise TypeError(
+            f"Unknown type {other.__class__.__name__}, other must be a hist.Hist object"
+        )
+
+    variances1 = self.variances()
+    variances2 = other.variances()
+    if variances1 is None or variances2 is None:
+        raise RuntimeError(
+            "Cannot compute from variance-less histograms, try a Weight storage"
+        )
 
+    counts1 = self.values()
+    counts2 = other.values()
+    squares = (
+        counts1 * np.sqrt(counts2.sum() / counts1.sum())
+        - counts2 * np.sqrt(counts1.sum() / counts2.sum())
+    ) ** 2
+    variances = variances1 + variances2
+    where = variances != 0
+    chisq = np.sum(squares[where] / variances[where])
+    k = where.sum()
+    ndof = k - 1 if self.sum() == other.sum() else k
+    pvalue = spstats.chi2.sf(chisq, ndof)
+
+    return chisq, ndof, pvalue
+
+
+def ks_1samp(
+    self: hist.BaseHist,
+    distribution: str | Callable[..., Any],
+    args: Any = (),
+    kwds: Any = None,
+) -> Any:
+    kwds = {} if kwds is None else kwds
 
-def ks_1samp(self: hist.BaseHist) -> str | NotImplementedError:
     if self.ndim != 1:
-        raise NotImplementedError("ks_1samp is only supported for 1D histograms")
-    return "Performing ks one sample test"
+        raise NotImplementedError(
+            f"Only 1D-histogram supports ks_1samp, try projecting {self.__class__.__name__} to 1D"
+        )
+    if isinstance(distribution, str):
+        if not hasattr(spstats, distribution):
+            raise ValueError(
+                f"Unknown distribution {distribution}, try one of the defined scipy distributions"
+            )
+        cdf = getattr(spstats, distribution).cdf
+    elif callable(distribution):
+        cdf = distribution
+    else:
+        raise TypeError(
+            f"Unknown distribution type {distribution}, try one of the defined scipy distributions or a callable CDF"
+        )
+
+    variances = self.variances()
+    if variances is None:
+        raise RuntimeError(
+            "Cannot compute from a variance-less histogram, try a Weight storage"
+        )
 
+    return cdf(self.axes[0].edges, *args, **kwds)  # placeholder to pass pre-commit
 
-def ks_2samp(self: hist.BaseHist) -> str | NotImplementedError:
+
+def ks_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any:
     if self.ndim != 1:
-        raise NotImplementedError("ks_2samp is only supported for 1D histograms")
+        raise NotImplementedError(
+            f"Only 1D-histogram supports ks_2samp, try projecting {self.__class__.__name__} to 1D"
+        )
+    if isinstance(other, hist.hist.Hist) and other.ndim != 1:
+        raise NotImplementedError(
+            f"Only 1D-histogram supports ks_2samp, try projecting other={other.__class__.__name__} to 1D"
+        )
+    if not isinstance(other, hist.hist.Hist):
+        raise TypeError(
+            f"Unknown type {other.__class__.__name__}, other must be a hist.Hist object"
+        )
+
+    variances1 = self.variances()
+    variances2 = other.variances()
+    if variances1 is None or variances2 is None:
+        raise RuntimeError(
+            "Cannot compute from variance-less histograms, try a Weight storage"
+        )
+
     return "Performing ks two sample test"
diff --git a/tests/test_stats.py b/tests/test_stats.py
index b3792026..7561d643 100644
--- a/tests/test_stats.py
+++ b/tests/test_stats.py
@@ -2,36 +2,101 @@
 
 import numpy as np
 import pytest
+from scipy import stats as spstats
 
 import hist
 from hist import Hist
 
 
 def test_stattest():
+    from numba_stats import norm
+
+    np.random.seed(42)
+
     h = Hist(hist.axis.Regular(20, -5, 5))
     h.fill(np.random.normal(size=1000))
 
-    assert h.chisquare_1samp() == "Performing chi square one sample test"
-    assert h.chisquare_2samp() == "Performing chi square two sample test"
-    assert h.ks_1samp() == "Performing ks one sample test"
-    assert h.ks_2samp() == "Performing ks two sample test"
+    chisq, ndof, pvalue = h.chisquare_1samp("norm")
+    assert chisq == pytest.approx(9.47425856230132)
+    assert ndof == 19
+    assert pvalue == pytest.approx(0.9647461095072625)
+
+    chisq, ndof, pvalue = h.chisquare_1samp(norm.cdf, args=(0, 1))
+    assert chisq == pytest.approx(9.47425856230132)
+    assert ndof == 19
+    assert pvalue == pytest.approx(0.9647461095072625)
+
+    np.random.seed(42)
+
+    h1 = Hist(hist.axis.Regular(20, -5, 5, name="norm"))
+    h2 = Hist(hist.axis.Regular(20, -5, 5, name="norm"))
+    h1.fill(np.random.normal(size=1000))
+    h2.fill(np.random.normal(size=500))
+
+    chisq, ndof, pvalue = h1.chisquare_2samp(h2)
+    assert chisq == pytest.approx(12.901853991544478)
+    assert ndof == 15
+    assert pvalue == pytest.approx(0.609878574488961)
+
+    np.random.seed(42)
+
+    h1 = Hist(hist.axis.Regular(20, -5, 5, name="norm"))
+    h2 = Hist(hist.axis.Regular(20, -5, 5, name="norm"))
+    h1.fill(np.random.normal(size=1000))
+    h2.fill(np.random.normal(size=1000))
+
+    chisq, ndof, pvalue = h1.chisquare_2samp(h2)
+    assert chisq == pytest.approx(13.577308400334086)
+    assert ndof == 14
+    assert pvalue == pytest.approx(0.4816525905214355)
+
+    assert np.all(
+        h.ks_1samp("norm") == spstats.norm.cdf(h.axes[0].edges)
+    )  # placeholder to pass test for now
+
+    np.random.seed(42)
+
+    h1 = Hist(hist.axis.Regular(20, -5, 5, name="norm"))
+    h2 = Hist(hist.axis.Regular(20, -5, 5, name="norm"))
+    h1.fill(np.random.normal(size=1000))
+    h2.fill(np.random.normal(size=1000))
+
+    assert h1.ks_2samp(h2) == "Performing ks two sample test"
 
     h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5))
     h.fill(np.random.normal(size=1000), np.random.normal(size=1000))
+    with pytest.raises(NotImplementedError):
+        h.chisquare_1samp("norm")
+    with pytest.raises(NotImplementedError):
+        h.chisquare_2samp(h2)
+    with pytest.raises(NotImplementedError):
+        h.ks_1samp("norm")
+    with pytest.raises(NotImplementedError):
+        h.ks_2samp(h2)
+
+    h = Hist(hist.axis.Regular(20, -5, 5))
+    h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000))
+    with pytest.raises(RuntimeError):
+        h.chisquare_1samp("norm")
+    with pytest.raises(RuntimeError):
+        h.chisquare_2samp(h2)
+    with pytest.raises(RuntimeError):
+        h.ks_1samp("norm")
+    with pytest.raises(RuntimeError):
+        h.ks_2samp(h2)
+
+    h = Hist(hist.axis.Regular(20, -5, 5))
+    h.fill(np.random.normal(size=1000))
+    with pytest.raises(ValueError):
+        h.chisquare_1samp("not_a_distribution")
+    with pytest.raises(ValueError):
+        h.ks_1samp("not_a_distribution")
 
-    with pytest.raises(
-        NotImplementedError, match="chisquare_1samp is only supported for 1D histograms"
-    ):
-        h.chisquare_1samp()
-    with pytest.raises(
-        NotImplementedError, match="chisquare_2samp is only supported for 1D histograms"
-    ):
-        h.chisquare_2samp()
-    with pytest.raises(
-        NotImplementedError, match="ks_1samp is only supported for 1D histograms"
-    ):
-        h.ks_1samp()
-    with pytest.raises(
-        NotImplementedError, match="ks_2samp is only supported for 1D histograms"
-    ):
-        h.ks_2samp()
+    with pytest.raises(TypeError):
+        h.chisquare_1samp(1)
+    with pytest.raises(TypeError):
+        h.chisquare_2samp(1)
+    with pytest.raises(TypeError):
+        h.ks_1samp(1)
+    with pytest.raises(TypeError):
+        h.ks_2samp(1)

From 70e5a718b526c337eed9f668f8d9c5896f285b25 Mon Sep 17 00:00:00 2001
From: iasonkrom <iason.krom@gmail.com>
Date: Fri, 9 Jun 2023 17:01:11 +0300
Subject: [PATCH 03/19] split tests into 4 functions

---
 src/hist/stats.py   |  4 +--
 tests/test_stats.py | 87 +++++++++++++++++++++++++++++++--------------
 2 files changed, 63 insertions(+), 28 deletions(-)

diff --git a/src/hist/stats.py b/src/hist/stats.py
index 846cd75f..3cb8f5d2 100644
--- a/src/hist/stats.py
+++ b/src/hist/stats.py
@@ -43,9 +43,9 @@ def chisquare_1samp(
     totalentries = self.sum()
     expected = np.diff(cdf(self.axes[0].edges, *args, **kwds)) * totalentries
     where = variances != 0
-    squares = (expected[where] - observed[where]) ** 2 / variances[where]
+    squares = (expected - observed) ** 2
     ndof = len(observed) - 1
-    chisq = np.sum(squares)
+    chisq = np.sum(squares[where] / variances[where])
     pvalue = spstats.chi2.sf(chisq, ndof)
 
     return chisq, ndof, pvalue
diff --git a/tests/test_stats.py b/tests/test_stats.py
index 7561d643..7463efc0 100644
--- a/tests/test_stats.py
+++ b/tests/test_stats.py
@@ -8,7 +8,7 @@
 from hist import Hist
 
 
-def test_stattest():
+def test_chisquare_1samp():
     from numba_stats import norm
 
     np.random.seed(42)
@@ -26,6 +26,26 @@ def test_stattest():
     assert ndof == 19
     assert pvalue == pytest.approx(0.9647461095072625)
 
+    h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5))
+    h.fill(np.random.normal(size=1000), np.random.normal(size=1000))
+    with pytest.raises(NotImplementedError):
+        h.chisquare_1samp("norm")
+
+    h = Hist(hist.axis.Regular(20, -5, 5))
+    h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000))
+    with pytest.raises(RuntimeError):
+        h.chisquare_1samp("norm")
+
+    h = Hist(hist.axis.Regular(20, -5, 5))
+    h.fill(np.random.normal(size=1000))
+    with pytest.raises(ValueError):
+        h.chisquare_1samp("not_a_distribution")
+
+    with pytest.raises(TypeError):
+        h.chisquare_1samp(1)
+
+
+def test_chisquare_2samp():
     np.random.seed(42)
 
     h1 = Hist(hist.axis.Regular(20, -5, 5, name="norm"))
@@ -50,10 +70,50 @@ def test_stattest():
     assert ndof == 14
     assert pvalue == pytest.approx(0.4816525905214355)
 
+    h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5))
+    h.fill(np.random.normal(size=1000), np.random.normal(size=1000))
+    with pytest.raises(NotImplementedError):
+        h.chisquare_2samp(h2)
+
+    h = Hist(hist.axis.Regular(20, -5, 5))
+    h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000))
+    with pytest.raises(RuntimeError):
+        h.chisquare_2samp(h2)
+
+    with pytest.raises(TypeError):
+        h.chisquare_2samp(1)
+
+
+def test_ks_1samp():
+    np.random.seed(42)
+
+    h = Hist(hist.axis.Regular(20, -5, 5))
+    h.fill(np.random.normal(size=1000))
+
     assert np.all(
         h.ks_1samp("norm") == spstats.norm.cdf(h.axes[0].edges)
     )  # placeholder to pass test for now
 
+    h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5))
+    h.fill(np.random.normal(size=1000), np.random.normal(size=1000))
+    with pytest.raises(NotImplementedError):
+        h.ks_1samp("norm")
+
+    h = Hist(hist.axis.Regular(20, -5, 5))
+    h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000))
+    with pytest.raises(RuntimeError):
+        h.ks_1samp("norm")
+
+    h = Hist(hist.axis.Regular(20, -5, 5))
+    h.fill(np.random.normal(size=1000))
+    with pytest.raises(ValueError):
+        h.ks_1samp("not_a_distribution")
+
+    with pytest.raises(TypeError):
+        h.ks_1samp(1)
+
+
+def test_ks_2samp():
     np.random.seed(42)
 
     h1 = Hist(hist.axis.Regular(20, -5, 5, name="norm"))
@@ -65,38 +125,13 @@ def test_stattest():
 
     h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5))
     h.fill(np.random.normal(size=1000), np.random.normal(size=1000))
-    with pytest.raises(NotImplementedError):
-        h.chisquare_1samp("norm")
-    with pytest.raises(NotImplementedError):
-        h.chisquare_2samp(h2)
-    with pytest.raises(NotImplementedError):
-        h.ks_1samp("norm")
     with pytest.raises(NotImplementedError):
         h.ks_2samp(h2)
 
     h = Hist(hist.axis.Regular(20, -5, 5))
     h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000))
-    with pytest.raises(RuntimeError):
-        h.chisquare_1samp("norm")
-    with pytest.raises(RuntimeError):
-        h.chisquare_2samp(h2)
-    with pytest.raises(RuntimeError):
-        h.ks_1samp("norm")
     with pytest.raises(RuntimeError):
         h.ks_2samp(h2)
 
-    h = Hist(hist.axis.Regular(20, -5, 5))
-    h.fill(np.random.normal(size=1000))
-    with pytest.raises(ValueError):
-        h.chisquare_1samp("not_a_distribution")
-    with pytest.raises(ValueError):
-        h.ks_1samp("not_a_distribution")
-
-    with pytest.raises(TypeError):
-        h.chisquare_1samp(1)
-    with pytest.raises(TypeError):
-        h.chisquare_2samp(1)
-    with pytest.raises(TypeError):
-        h.ks_1samp(1)
     with pytest.raises(TypeError):
         h.ks_2samp(1)

From 4f8ea6987a4d97f9b085481dd58f879cd4e8606f Mon Sep 17 00:00:00 2001
From: iasonkrom <iason.krom@gmail.com>
Date: Sun, 11 Jun 2023 01:34:39 +0300
Subject: [PATCH 04/19] small updates in chisquare tests

---
 src/hist/stats.py   | 58 ++++++++++++++++++++++++++-------------------
 tests/test_stats.py |  4 ++--
 2 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/src/hist/stats.py b/src/hist/stats.py
index 3cb8f5d2..2b150201 100644
--- a/src/hist/stats.py
+++ b/src/hist/stats.py
@@ -4,10 +4,27 @@
 
 import numpy as np
 from scipy import stats as spstats
+from scipy.stats import rv_continuous, rv_discrete
 
 import hist
 
 
+def _get_cdf_if_valid(obj: Any) -> Any:
+    if isinstance(obj, (rv_continuous, rv_discrete)):
+        return obj.cdf
+    if isinstance(obj, str) and hasattr(spstats, obj):
+        dist = getattr(spstats, obj)
+        if isinstance(dist, (rv_continuous, rv_discrete)):
+            return dist.cdf
+    if hasattr(obj, "cdf") and callable(obj.cdf):
+        return obj.cdf
+    if callable(obj):
+        return obj
+    raise TypeError(
+        f"Unknown distribution type {obj}, try one of the scipy distributions, an object with a cdf method, or a callable cdf implementation"
+    )
+
+
 def chisquare_1samp(
     self: hist.BaseHist,
     distribution: str | Callable[..., Any],
@@ -20,18 +37,7 @@ def chisquare_1samp(
         raise NotImplementedError(
             f"Only 1D-histogram supports chisquare_1samp, try projecting {self.__class__.__name__} to 1D"
         )
-    if isinstance(distribution, str):
-        if not hasattr(spstats, distribution):
-            raise ValueError(
-                f"Unknown distribution {distribution}, try one of the defined scipy distributions"
-            )
-        cdf = getattr(spstats, distribution).cdf
-    elif callable(distribution):
-        cdf = distribution
-    else:
-        raise TypeError(
-            f"Unknown distribution type {distribution}, try one of the defined scipy distributions or a callable CDF"
-        )
+    cdf = _get_cdf_if_valid(distribution)
 
     variances = self.variances()
     if variances is None:
@@ -42,9 +48,12 @@ def chisquare_1samp(
     observed = self.values()
     totalentries = self.sum()
     expected = np.diff(cdf(self.axes[0].edges, *args, **kwds)) * totalentries
+    variances = (
+        expected  # TODO: check if variances or expected should go in the denominator
+    )
     where = variances != 0
     squares = (expected - observed) ** 2
-    ndof = len(observed) - 1
+    ndof = where.sum() - 1
     chisq = np.sum(squares[where] / variances[where])
     pvalue = spstats.chi2.sf(chisq, ndof)
 
@@ -64,6 +73,11 @@ def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any:
         raise TypeError(
             f"Unknown type {other.__class__.__name__}, other must be a hist.Hist object"
         )
+    # TODO: add support for compatible rebinning
+    if not np.allclose(self.axes[0].edges, other.axes[0].edges):
+        raise NotImplementedError(
+            "Cannot compute chi2 from histograms with different binning, try rebinning"
+        )
 
     variances1 = self.variances()
     variances2 = other.variances()
@@ -100,18 +114,7 @@ def ks_1samp(
         raise NotImplementedError(
             f"Only 1D-histogram supports ks_1samp, try projecting {self.__class__.__name__} to 1D"
         )
-    if isinstance(distribution, str):
-        if not hasattr(spstats, distribution):
-            raise ValueError(
-                f"Unknown distribution {distribution}, try one of the defined scipy distributions"
-            )
-        cdf = getattr(spstats, distribution).cdf
-    elif callable(distribution):
-        cdf = distribution
-    else:
-        raise TypeError(
-            f"Unknown distribution type {distribution}, try one of the defined scipy distributions or a callable CDF"
-        )
+    cdf = _get_cdf_if_valid(distribution)
 
     variances = self.variances()
     if variances is None:
@@ -135,6 +138,11 @@ def ks_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any:
         raise TypeError(
             f"Unknown type {other.__class__.__name__}, other must be a hist.Hist object"
         )
+    # TODO: add support for compatible rebinning
+    if not np.allclose(self.axes[0].edges, other.axes[0].edges):
+        raise NotImplementedError(
+            "Cannot compute chi2 from histograms with different binning, try rebinning"
+        )
 
     variances1 = self.variances()
     variances2 = other.variances()
diff --git a/tests/test_stats.py b/tests/test_stats.py
index 7463efc0..fa438800 100644
--- a/tests/test_stats.py
+++ b/tests/test_stats.py
@@ -38,7 +38,7 @@ def test_chisquare_1samp():
 
     h = Hist(hist.axis.Regular(20, -5, 5))
     h.fill(np.random.normal(size=1000))
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         h.chisquare_1samp("not_a_distribution")
 
     with pytest.raises(TypeError):
@@ -106,7 +106,7 @@ def test_ks_1samp():
 
     h = Hist(hist.axis.Regular(20, -5, 5))
     h.fill(np.random.normal(size=1000))
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         h.ks_1samp("not_a_distribution")
 
     with pytest.raises(TypeError):

From 07eee774990568cd52ad4f128ca4a1a75dac4c9f Mon Sep 17 00:00:00 2001
From: iasonkrom <iason.krom@gmail.com>
Date: Sun, 11 Jun 2023 16:05:18 +0300
Subject: [PATCH 05/19] some fixes for now

---
 src/hist/stats.py   | 10 +++++++++-
 tests/test_stats.py | 16 ++++++++--------
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/src/hist/stats.py b/src/hist/stats.py
index 2b150201..8c4ecc35 100644
--- a/src/hist/stats.py
+++ b/src/hist/stats.py
@@ -74,6 +74,10 @@ def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any:
             f"Unknown type {other.__class__.__name__}, other must be a hist.Hist object"
         )
     # TODO: add support for compatible rebinning
+    if self.size != other.size:
+        raise NotImplementedError(
+            "Cannot compute chi2 from histograms with different binning, try rebinning"
+        )
     if not np.allclose(self.axes[0].edges, other.axes[0].edges):
         raise NotImplementedError(
             "Cannot compute chi2 from histograms with different binning, try rebinning"
@@ -96,7 +100,7 @@ def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any:
     where = variances != 0
     chisq = np.sum(squares[where] / variances[where])
     k = where.sum()
-    ndof = k - 1 if self.sum() == other.sum() else k
+    ndof = k if self.sum() == other.sum() else k - 1
     pvalue = spstats.chi2.sf(chisq, ndof)
 
     return chisq, ndof, pvalue
@@ -139,6 +143,10 @@ def ks_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any:
             f"Unknown type {other.__class__.__name__}, other must be a hist.Hist object"
         )
     # TODO: add support for compatible rebinning
+    if self.size != other.size:
+        raise NotImplementedError(
+            "Cannot compute chi2 from histograms with different binning, try rebinning"
+        )
     if not np.allclose(self.axes[0].edges, other.axes[0].edges):
         raise NotImplementedError(
             "Cannot compute chi2 from histograms with different binning, try rebinning"
diff --git a/tests/test_stats.py b/tests/test_stats.py
index fa438800..16f35b83 100644
--- a/tests/test_stats.py
+++ b/tests/test_stats.py
@@ -17,14 +17,14 @@ def test_chisquare_1samp():
     h.fill(np.random.normal(size=1000))
 
     chisq, ndof, pvalue = h.chisquare_1samp("norm")
-    assert chisq == pytest.approx(9.47425856230132)
+    assert chisq == pytest.approx(11.640108080468355)
     assert ndof == 19
-    assert pvalue == pytest.approx(0.9647461095072625)
+    assert pvalue == pytest.approx(0.9004272294139111)
 
     chisq, ndof, pvalue = h.chisquare_1samp(norm.cdf, args=(0, 1))
-    assert chisq == pytest.approx(9.47425856230132)
+    assert chisq == pytest.approx(11.64010808046843)
     assert ndof == 19
-    assert pvalue == pytest.approx(0.9647461095072625)
+    assert pvalue == pytest.approx(0.9004272294139111)
 
     h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5))
     h.fill(np.random.normal(size=1000), np.random.normal(size=1000))
@@ -55,8 +55,8 @@ def test_chisquare_2samp():
 
     chisq, ndof, pvalue = h1.chisquare_2samp(h2)
     assert chisq == pytest.approx(12.901853991544478)
-    assert ndof == 15
-    assert pvalue == pytest.approx(0.609878574488961)
+    assert ndof == 14
+    assert pvalue == pytest.approx(0.5342659447391381)
 
     np.random.seed(42)
 
@@ -67,8 +67,8 @@ def test_chisquare_2samp():
 
     chisq, ndof, pvalue = h1.chisquare_2samp(h2)
     assert chisq == pytest.approx(13.577308400334086)
-    assert ndof == 14
-    assert pvalue == pytest.approx(0.4816525905214355)
+    assert ndof == 15
+    assert pvalue == pytest.approx(0.5577978656881688)
 
     h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5))
     h.fill(np.random.normal(size=1000), np.random.normal(size=1000))

From 7efd00ecaaed032a7b10642cafe05cf33b113fca Mon Sep 17 00:00:00 2001
From: iasonkrom <iason.krom@gmail.com>
Date: Tue, 13 Jun 2023 03:13:09 +0300
Subject: [PATCH 06/19] first attempt at ks_1samp

---
 src/hist/stats.py | 44 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/src/hist/stats.py b/src/hist/stats.py
index 8c4ecc35..3f6073e6 100644
--- a/src/hist/stats.py
+++ b/src/hist/stats.py
@@ -46,7 +46,7 @@ def chisquare_1samp(
         )
 
     observed = self.values()
-    totalentries = self.sum()
+    totalentries = self.sum(flow=True)
     expected = np.diff(cdf(self.axes[0].edges, *args, **kwds)) * totalentries
     variances = (
         expected  # TODO: check if variances or expected should go in the denominator
@@ -92,15 +92,18 @@ def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any:
 
     counts1 = self.values()
     counts2 = other.values()
+    totalentries1 = self.values(flow=True).sum()
+    totalentries2 = other.values(flow=True).sum()
     squares = (
-        counts1 * np.sqrt(counts2.sum() / counts1.sum())
-        - counts2 * np.sqrt(counts1.sum() / counts2.sum())
+        counts1 * np.sqrt(totalentries2 / totalentries1)
+        - counts2 * np.sqrt(totalentries1 / totalentries2)
     ) ** 2
     variances = variances1 + variances2
     where = variances != 0
     chisq = np.sum(squares[where] / variances[where])
     k = where.sum()
-    ndof = k if self.sum() == other.sum() else k - 1
+    # TODO: check if ndof = k if totalentries1 == totalentries2 else k - 1 is correct
+    ndof = k - 1
     pvalue = spstats.chi2.sf(chisq, ndof)
 
     return chisq, ndof, pvalue
@@ -120,13 +123,32 @@ def ks_1samp(
         )
     cdf = _get_cdf_if_valid(distribution)
 
-    variances = self.variances()
-    if variances is None:
-        raise RuntimeError(
-            "Cannot compute from a variance-less histogram, try a Weight storage"
-        )
-
-    return cdf(self.axes[0].edges, *args, **kwds)  # placeholder to pass pre-commit
+    cdflocs = self.axes[0].edges[:-1]
+    cdfvals = cdf(cdflocs, *args, **kwds)
+    observed = self.values(flow=True)
+    totalentries = self.sum(flow=True)
+    ecdf = np.cumsum(observed) / totalentries
+    ecdfplus = ecdf[1:-1]
+    ecdfminus = ecdf[0:-2]
+    dplus_index = (ecdfplus - cdfvals).argmax()
+    dminus_index = (cdfvals - ecdfminus).argmax()
+    Dplus = (ecdfplus - cdfvals)[dplus_index]
+    Dminus = (cdfvals - ecdfminus)[dminus_index]
+    dplus_location = cdflocs[dplus_index]
+    dminus_location = cdflocs[dminus_index]
+
+    if Dplus > Dminus:
+        D = Dplus
+        d_location = dplus_location
+        d_sign = 1
+    else:
+        D = Dminus
+        d_location = dminus_location
+        d_sign = -1
+
+    pvalue = spstats.kstwo.sf(D, self.sum(flow=True))
+
+    return D, Dplus, Dminus, dplus_location, dminus_location, d_location, d_sign, pvalue
 
 
 def ks_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any:

From 0e0819ec4f8b4849464a0ae4d68fbcb58ad9e7a8 Mon Sep 17 00:00:00 2001
From: iasonkrom <iason.krom@gmail.com>
Date: Sun, 18 Jun 2023 22:01:24 +0300
Subject: [PATCH 07/19] first attempt at ks_2samp

---
 src/hist/basehist.py |  14 +++-
 src/hist/stats.py    | 181 ++++++++++++++++++++++++++++++++++++-------
 tests/test_stats.py  |  54 +------------
 3 files changed, 170 insertions(+), 79 deletions(-)

diff --git a/src/hist/basehist.py b/src/hist/basehist.py
index 2ff6b71e..c69fca25 100644
--- a/src/hist/basehist.py
+++ b/src/hist/basehist.py
@@ -584,12 +584,20 @@ def ks_1samp(
         distribution: str | Callable[..., Any],
         args: Any = (),
         kwds: Any = None,
+        alternative: str = "two-sided",
+        mode: str = "auto",
     ) -> Any:
         from hist import stats
 
-        return stats.ks_1samp(self, distribution, args, kwds)
+        return stats.ks_1samp(self, distribution, args, kwds, alternative, mode)
 
-    def ks_2samp(self, other: hist.BaseHist) -> Any:
+    def ks_2samp(
+        self: hist.BaseHist,
+        other: hist.BaseHist,
+        equal_bins: bool = True,
+        alternative: str = "two-sided",
+        mode: str = "auto",
+    ) -> Any:
         from hist import stats
 
-        return stats.ks_2samp(self, other)
+        return stats.ks_2samp(self, other, equal_bins, alternative, mode)
diff --git a/src/hist/stats.py b/src/hist/stats.py
index 3f6073e6..1c559d81 100644
--- a/src/hist/stats.py
+++ b/src/hist/stats.py
@@ -1,10 +1,12 @@
 from __future__ import annotations
 
+import warnings
 from typing import Any, Callable
 
 import numpy as np
 from scipy import stats as spstats
 from scipy.stats import rv_continuous, rv_discrete
+from scipy.stats._stats_py import _attempt_exact_2kssamp
 
 import hist
 
@@ -46,15 +48,13 @@ def chisquare_1samp(
         )
 
     observed = self.values()
-    totalentries = self.sum(flow=True)
+    totalentries = self.values(flow=True).sum()
     expected = np.diff(cdf(self.axes[0].edges, *args, **kwds)) * totalentries
-    variances = (
-        expected  # TODO: check if variances or expected should go in the denominator
-    )
-    where = variances != 0
+    # TODO: check if variances or expected should go in the denominator
+    where = expected != 0
     squares = (expected - observed) ** 2
-    ndof = where.sum() - 1
-    chisq = np.sum(squares[where] / variances[where])
+    ndof = np.sum(where) - 1
+    chisq = np.sum(squares[where] / expected[where])
     pvalue = spstats.chi2.sf(chisq, ndof)
 
     return chisq, ndof, pvalue
@@ -101,7 +101,7 @@ def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any:
     variances = variances1 + variances2
     where = variances != 0
     chisq = np.sum(squares[where] / variances[where])
-    k = where.sum()
+    k = np.sum(where)
     # TODO: check if ndof = k if totalentries1 == totalentries2 else k - 1 is correct
     ndof = k - 1
     pvalue = spstats.chi2.sf(chisq, ndof)
@@ -114,6 +114,8 @@ def ks_1samp(
     distribution: str | Callable[..., Any],
     args: Any = (),
     kwds: Any = None,
+    alternative: str = "two-sided",
+    mode: str = "auto",
 ) -> Any:
     kwds = {} if kwds is None else kwds
 
@@ -121,12 +123,21 @@ def ks_1samp(
         raise NotImplementedError(
             f"Only 1D-histogram supports ks_1samp, try projecting {self.__class__.__name__} to 1D"
         )
+
+    if mode not in ["auto", "exact", "asymp"]:
+        raise ValueError(f"Invalid value for mode: {mode}")
+    alternative = {"t": "two-sided", "g": "greater", "l": "less"}.get(
+        alternative.lower()[0], alternative
+    )
+    if alternative not in ["two-sided", "less", "greater"]:
+        raise ValueError(f"Invalid value for alternative: {alternative}")
+
     cdf = _get_cdf_if_valid(distribution)
 
     cdflocs = self.axes[0].edges[:-1]
     cdfvals = cdf(cdflocs, *args, **kwds)
     observed = self.values(flow=True)
-    totalentries = self.sum(flow=True)
+    totalentries = observed.sum()
     ecdf = np.cumsum(observed) / totalentries
     ecdfplus = ecdf[1:-1]
     ecdfminus = ecdf[0:-2]
@@ -146,12 +157,34 @@ def ks_1samp(
         d_location = dminus_location
         d_sign = -1
 
-    pvalue = spstats.kstwo.sf(D, self.sum(flow=True))
+    if alternative == "greater":
+        pvalue = spstats.ksone.sf(Dplus, totalentries)
+        return Dplus, dplus_location, 1, pvalue
+    if alternative == "less":
+        pvalue = spstats.ksone.sf(Dminus, totalentries)
+        return Dminus, dminus_location, -1, pvalue
+
+    if mode == "auto":  # Always select exact
+        mode = "exact"
+    if mode == "exact":
+        pvalue = spstats.kstwo.sf(D, totalentries)
+    elif mode == "asymp":
+        pvalue = spstats.kstwobign.sf(D * np.sqrt(totalentries))
+    elif mode == "approx":
+        pvalue = 2 * spstats.ksone.sf(D, totalentries)
+
+    pvalue = np.clip(pvalue, 0, 1)
 
     return D, Dplus, Dminus, dplus_location, dminus_location, d_location, d_sign, pvalue
 
 
-def ks_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any:
+def ks_2samp(
+    self: hist.BaseHist,
+    other: hist.BaseHist,
+    equal_bins: bool = True,
+    alternative: str = "two-sided",
+    mode: str = "auto",
+) -> Any:
     if self.ndim != 1:
         raise NotImplementedError(
             f"Only 1D-histogram supports ks_2samp, try projecting {self.__class__.__name__} to 1D"
@@ -164,21 +197,117 @@ def ks_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any:
         raise TypeError(
             f"Unknown type {other.__class__.__name__}, other must be a hist.Hist object"
         )
-    # TODO: add support for compatible rebinning
-    if self.size != other.size:
-        raise NotImplementedError(
-            "Cannot compute chi2 from histograms with different binning, try rebinning"
-        )
-    if not np.allclose(self.axes[0].edges, other.axes[0].edges):
-        raise NotImplementedError(
-            "Cannot compute chi2 from histograms with different binning, try rebinning"
-        )
 
-    variances1 = self.variances()
-    variances2 = other.variances()
-    if variances1 is None or variances2 is None:
-        raise RuntimeError(
-            "Cannot compute from variance-less histograms, try a Weight storage"
+    if equal_bins:
+        if self.size != other.size:
+            raise NotImplementedError(
+                "Cannot compute KS from histograms with different binning, try rebinning"
+            )
+        if not np.allclose(self.axes[0].edges, other.axes[0].edges):
+            raise NotImplementedError(
+                "Cannot compute KS from histograms with different binning, try rebinning"
+            )
+
+    if mode not in ["auto", "exact", "asymp"]:
+        raise ValueError(f"Invalid value for mode: {mode}")
+    alternative = {"t": "two-sided", "g": "greater", "l": "less"}.get(
+        alternative.lower()[0], alternative
+    )
+    if alternative not in ["two-sided", "less", "greater"]:
+        raise ValueError(f"Invalid value for alternative: {alternative}")
+
+    data1 = self.values(flow=True)
+    data2 = other.values(flow=True)
+    n1 = data1.sum()
+    n2 = data2.sum()
+    edges1 = self.axes[0].edges[:-1]
+    edges2 = other.axes[0].edges[:-1]
+    cdf1 = np.cumsum(data1) / n1
+    cdf2 = np.cumsum(data2) / n2
+
+    if equal_bins:
+        cdflocs = edges1
+
+        cdf1 = cdf1[1:-1]
+        cdf2 = cdf2[1:-1]
+        cddiffs = cdf1 - cdf2
+
+    elif not equal_bins:
+        edges_all = np.sort(np.concatenate([edges1, edges2]))
+        cdflocs = edges_all
+
+        # Use np.searchsorted to get the CDF values at all bin edges
+        cdf1_all = cdf1[np.searchsorted(edges1, edges_all, side="right")]
+        cdf2_all = cdf2[np.searchsorted(edges2, edges_all, side="right")]
+        cddiffs = cdf1_all - cdf2_all
+
+    # Identify the location of the statistic
+    argminS = np.argmin(cddiffs)
+    argmaxS = np.argmax(cddiffs)
+    loc_minS = cdflocs[argminS]
+    loc_maxS = cdflocs[argmaxS]
+
+    # Ensure sign of minS is not negative.
+    minS = np.clip(-cddiffs[argminS], 0, 1)
+    maxS = cddiffs[argmaxS]
+
+    if alternative == "less" or (alternative == "two-sided" and minS > maxS):
+        d = minS
+        d_location = loc_minS
+        d_sign = -1
+    else:
+        d = maxS
+        d_location = loc_maxS
+        d_sign = 1
+
+    g = np.gcd(int(n1), int(n2))
+    n1g = n1 // g
+    n2g = n2 // g
+    pvalue = -np.inf
+
+    if mode == "auto":  # Always select exact
+        mode = "exact"
+    if mode == "exact" and n1g >= np.iinfo(np.int32).max / n2g:
+        # If lcm(n1, n2) is too big, switch from exact to asymp
+        mode = "asymp"
+        warnings.warn(
+            f"Exact ks_2samp calculation not possible with samples sizes "
+            f"{n1} and {n2}. Switching to 'asymp'.",
+            RuntimeWarning,
+            stacklevel=3,
         )
 
-    return "Performing ks two sample test"
+    if mode == "exact":
+        success, d, pvalue = _attempt_exact_2kssamp(n1, n2, g, d, alternative)
+        if not success:
+            mode = "asymp"
+            warnings.warn(
+                f"ks_2samp: Exact calculation unsuccessful. "
+                f"Switching to method={mode}.",
+                RuntimeWarning,
+                stacklevel=3,
+            )
+
+    if mode == "asymp":
+        # The product n1*n2 is large.  Use Smirnov's asymptoptic formula.
+        # Ensure float to avoid overflow in multiplication
+        # sorted because the one-sided formula is not symmetric in n1, n2
+        m, n = sorted([float(n1), float(n2)], reverse=True)
+        en = m * n / (m + n)
+        if alternative == "two-sided":
+            pvalue = spstats.kstwo.sf(d, np.round(en))
+        else:
+            z = np.sqrt(en) * d
+            # Use Hodges' suggested approximation Eqn 5.3
+            # Requires m to be the larger of (n1, n2)
+            expt = -2 * z**2 - 2 * z * (m + 2 * n) / np.sqrt(m * n * (m + n)) / 3.0
+            pvalue = np.exp(expt)
+
+    pvalue = np.clip(pvalue, 0, 1)
+    D = d
+    Dplus = maxS
+    Dminus = minS
+    dplus_location = loc_maxS
+    dminus_location = loc_minS
+
+    return D, Dplus, Dminus, dplus_location, dminus_location, d_location, d_sign, pvalue
diff --git a/tests/test_stats.py b/tests/test_stats.py
index 16f35b83..0458f6fe 100644
--- a/tests/test_stats.py
+++ b/tests/test_stats.py
@@ -2,7 +2,6 @@
 
 import numpy as np
 import pytest
-from scipy import stats as spstats
 
 import hist
 from hist import Hist
@@ -67,8 +66,8 @@ def test_chisquare_2samp():
 
     chisq, ndof, pvalue = h1.chisquare_2samp(h2)
     assert chisq == pytest.approx(13.577308400334086)
-    assert ndof == 15
-    assert pvalue == pytest.approx(0.5577978656881688)
+    assert ndof == 14
+    assert pvalue == pytest.approx(0.4816525905214355)
 
     h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5))
     h.fill(np.random.normal(size=1000), np.random.normal(size=1000))
@@ -85,53 +84,8 @@ def test_chisquare_2samp():
 
 
 def test_ks_1samp():
-    np.random.seed(42)
-
-    h = Hist(hist.axis.Regular(20, -5, 5))
-    h.fill(np.random.normal(size=1000))
-
-    assert np.all(
-        h.ks_1samp("norm") == spstats.norm.cdf(h.axes[0].edges)
-    )  # placeholder to pass test for now
-
-    h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5))
-    h.fill(np.random.normal(size=1000), np.random.normal(size=1000))
-    with pytest.raises(NotImplementedError):
-        h.ks_1samp("norm")
-
-    h = Hist(hist.axis.Regular(20, -5, 5))
-    h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000))
-    with pytest.raises(RuntimeError):
-        h.ks_1samp("norm")
-
-    h = Hist(hist.axis.Regular(20, -5, 5))
-    h.fill(np.random.normal(size=1000))
-    with pytest.raises(TypeError):
-        h.ks_1samp("not_a_distribution")
-
-    with pytest.raises(TypeError):
-        h.ks_1samp(1)
+    pass
 
 
 def test_ks_2samp():
-    np.random.seed(42)
-
-    h1 = Hist(hist.axis.Regular(20, -5, 5, name="norm"))
-    h2 = Hist(hist.axis.Regular(20, -5, 5, name="norm"))
-    h1.fill(np.random.normal(size=1000))
-    h2.fill(np.random.normal(size=1000))
-
-    assert h1.ks_2samp(h2) == "Performing ks two sample test"
-
-    h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5))
-    h.fill(np.random.normal(size=1000), np.random.normal(size=1000))
-    with pytest.raises(NotImplementedError):
-        h.ks_2samp(h2)
-
-    h = Hist(hist.axis.Regular(20, -5, 5))
-    h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000))
-    with pytest.raises(RuntimeError):
-        h.ks_2samp(h2)
-
-    with pytest.raises(TypeError):
-        h.ks_2samp(1)
+    pass

From 75a262bea817467b72f10b84a8896ff1f1c3eaa3 Mon Sep 17 00:00:00 2001
From: iasonkrom <iason.krom@gmail.com>
Date: Sun, 18 Jun 2023 22:30:55 +0300
Subject: [PATCH 08/19] use dtype=int when calculating total entries

---
 src/hist/stats.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/hist/stats.py b/src/hist/stats.py
index 1c559d81..159a2715 100644
--- a/src/hist/stats.py
+++ b/src/hist/stats.py
@@ -48,7 +48,7 @@ def chisquare_1samp(
         )
 
     observed = self.values()
-    totalentries = self.values(flow=True).sum()
+    totalentries = self.values(flow=True).sum(dtype=int)
     expected = np.diff(cdf(self.axes[0].edges, *args, **kwds)) * totalentries
     # TODO: check if variances or expected should go in the denominator
     where = expected != 0
@@ -92,8 +92,8 @@ def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any:
 
     counts1 = self.values()
     counts2 = other.values()
-    totalentries1 = self.values(flow=True).sum()
-    totalentries2 = other.values(flow=True).sum()
+    totalentries1 = self.values(flow=True).sum(dtype=int)
+    totalentries2 = other.values(flow=True).sum(dtype=int)
     squares = (
         counts1 * np.sqrt(totalentries2 / totalentries1)
         - counts2 * np.sqrt(totalentries1 / totalentries2)
@@ -137,7 +137,7 @@ def ks_1samp(
     cdflocs = self.axes[0].edges[:-1]
     cdfvals = cdf(cdflocs, *args, **kwds)
     observed = self.values(flow=True)
-    totalentries = observed.sum()
+    totalentries = observed.sum(dtype=int)
     ecdf = np.cumsum(observed) / totalentries
     ecdfplus = ecdf[1:-1]
     ecdfminus = ecdf[0:-2]
@@ -200,12 +200,12 @@ def ks_2samp(
 
     if equal_bins:
         if self.size != other.size:
-            raise NotImplementedError(
-                "Cannot compute KS from histograms with different binning, try rebinning"
+            raise ValueError(
+                "Cannot compute KS from histograms with different binning, use equal_bins=False"
             )
         if not np.allclose(self.axes[0].edges, other.axes[0].edges):
-            raise NotImplementedError(
-                "Cannot compute KS from histograms with different binning, try rebinning"
+            raise ValueError(
+                "Cannot compute KS from histograms with different binning, use equal_bins=False"
             )
 
     if mode not in ["auto", "exact", "asymp"]:
@@ -218,8 +218,8 @@ def ks_2samp(
 
     data1 = self.values(flow=True)
     data2 = other.values(flow=True)
-    n1 = data1.sum()
-    n2 = data2.sum()
+    n1 = data1.sum(dtype=int)
+    n2 = data2.sum(dtype=int)
     edges1 = self.axes[0].edges[:-1]
     edges2 = other.axes[0].edges[:-1]
     cdf1 = np.cumsum(data1) / n1

From d21759faf903c79f48d2e94b68c487eae2a88b7f Mon Sep 17 00:00:00 2001
From: iasonkrom <iason.krom@gmail.com>
Date: Mon, 19 Jun 2023 02:13:51 +0300
Subject: [PATCH 09/19] add flow option in chisquare tests

---
 src/hist/basehist.py | 13 +++++++------
 src/hist/stats.py    | 27 +++++++++++++++++----------
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/src/hist/basehist.py b/src/hist/basehist.py
index 00324761..02d92c87 100644
--- a/src/hist/basehist.py
+++ b/src/hist/basehist.py
@@ -632,22 +632,23 @@ def sum(self, flow: bool = False) -> float | bh.accumulators.Accumulator:
         return super().sum(flow=flow)
 
     def chisquare_1samp(
-        self: hist.BaseHist,
+        self,
         distribution: str | Callable[..., Any],
+        flow: bool = False,
         args: Any = (),
         kwds: Any = None,
     ) -> Any:
         from hist import stats
 
-        return stats.chisquare_1samp(self, distribution, args, kwds)
+        return stats.chisquare_1samp(self, distribution, flow, args, kwds)
 
-    def chisquare_2samp(self, other: hist.BaseHist) -> Any:
+    def chisquare_2samp(self, other: hist.BaseHist, flow: bool = False) -> Any:
         from hist import stats
 
-        return stats.chisquare_2samp(self, other)
+        return stats.chisquare_2samp(self, other, flow)
 
     def ks_1samp(
-        self: hist.BaseHist,
+        self,
         distribution: str | Callable[..., Any],
         args: Any = (),
         kwds: Any = None,
@@ -659,7 +660,7 @@ def ks_1samp(
         return stats.ks_1samp(self, distribution, args, kwds, alternative, mode)
 
     def ks_2samp(
-        self: hist.BaseHist,
+        self,
         other: hist.BaseHist,
         equal_bins: bool = True,
         alternative: str = "two-sided",
diff --git a/src/hist/stats.py b/src/hist/stats.py
index 159a2715..cda84a23 100644
--- a/src/hist/stats.py
+++ b/src/hist/stats.py
@@ -30,6 +30,7 @@ def _get_cdf_if_valid(obj: Any) -> Any:
 def chisquare_1samp(
     self: hist.BaseHist,
     distribution: str | Callable[..., Any],
+    flow: bool = False,
     args: Any = (),
     kwds: Any = None,
 ) -> Any:
@@ -41,26 +42,32 @@ def chisquare_1samp(
         )
     cdf = _get_cdf_if_valid(distribution)
 
-    variances = self.variances()
+    variances = self.variances(flow=flow)
     if variances is None:
         raise RuntimeError(
             "Cannot compute from a variance-less histogram, try a Weight storage"
         )
 
-    observed = self.values()
+    observed = self.values(flow=flow)
     totalentries = self.values(flow=True).sum(dtype=int)
-    expected = np.diff(cdf(self.axes[0].edges, *args, **kwds)) * totalentries
+    cdfvals = cdf(self.axes[0].edges, *args, **kwds)
+    if flow:
+        cdfvals = np.concatenate([[0], cdfvals, [1]])
+    expected = np.diff(cdfvals) * totalentries
     # TODO: check if variances or expected should go in the denominator
-    where = expected != 0
+    # variances fails if bins have low statistics
+    where = variances != 0
     squares = (expected - observed) ** 2
     ndof = np.sum(where) - 1
-    chisq = np.sum(squares[where] / expected[where])
+    chisq = np.sum(squares[where] / variances[where])
     pvalue = spstats.chi2.sf(chisq, ndof)
 
     return chisq, ndof, pvalue
 
 
-def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any:
+def chisquare_2samp(
+    self: hist.BaseHist, other: hist.BaseHist, flow: bool = False
+) -> Any:
     if self.ndim != 1:
         raise NotImplementedError(
             f"Only 1D-histogram supports chisquare_2samp, try projecting {self.__class__.__name__} to 1D"
@@ -83,15 +90,15 @@ def chisquare_2samp(self: hist.BaseHist, other: hist.BaseHist) -> Any:
             "Cannot compute chi2 from histograms with different binning, try rebinning"
         )
 
-    variances1 = self.variances()
-    variances2 = other.variances()
+    variances1 = self.variances(flow=flow)
+    variances2 = other.variances(flow=flow)
     if variances1 is None or variances2 is None:
         raise RuntimeError(
             "Cannot compute from variance-less histograms, try a Weight storage"
         )
 
-    counts1 = self.values()
-    counts2 = other.values()
+    counts1 = self.values(flow=flow)
+    counts2 = other.values(flow=flow)
     totalentries1 = self.values(flow=True).sum(dtype=int)
     totalentries2 = other.values(flow=True).sum(dtype=int)
     squares = (

From d0ed901baae95c50cd6487997c21fe7b0f7c816d Mon Sep 17 00:00:00 2001
From: iasonkrom <iason.krom@gmail.com>
Date: Mon, 19 Jun 2023 02:39:39 +0300
Subject: [PATCH 10/19] bin centers for ecdf is more correct

---
 src/hist/stats.py   |  2 +-
 tests/test_stats.py | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/hist/stats.py b/src/hist/stats.py
index cda84a23..5928c445 100644
--- a/src/hist/stats.py
+++ b/src/hist/stats.py
@@ -141,7 +141,7 @@ def ks_1samp(
 
     cdf = _get_cdf_if_valid(distribution)
 
-    cdflocs = self.axes[0].edges[:-1]
+    cdflocs = self.axes[0].centers
     cdfvals = cdf(cdflocs, *args, **kwds)
     observed = self.values(flow=True)
     totalentries = observed.sum(dtype=int)
diff --git a/tests/test_stats.py b/tests/test_stats.py
index 0458f6fe..ed5d23bc 100644
--- a/tests/test_stats.py
+++ b/tests/test_stats.py
@@ -16,14 +16,14 @@ def test_chisquare_1samp():
     h.fill(np.random.normal(size=1000))
 
     chisq, ndof, pvalue = h.chisquare_1samp("norm")
-    assert chisq == pytest.approx(11.640108080468355)
-    assert ndof == 19
-    assert pvalue == pytest.approx(0.9004272294139111)
+    assert chisq == pytest.approx(9.47425856230132)
+    assert ndof == 14
+    assert pvalue == pytest.approx(0.7995235818496339)
 
     chisq, ndof, pvalue = h.chisquare_1samp(norm.cdf, args=(0, 1))
-    assert chisq == pytest.approx(11.64010808046843)
-    assert ndof == 19
-    assert pvalue == pytest.approx(0.9004272294139111)
+    assert chisq == pytest.approx(9.47425856230132)
+    assert ndof == 14
+    assert pvalue == pytest.approx(0.7995235818496339)
 
     h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5))
     h.fill(np.random.normal(size=1000), np.random.normal(size=1000))

From 4d517f7ca3099a878b84294fd0fad2b0a74b095e Mon Sep 17 00:00:00 2001
From: iasonkrom <iason.krom@gmail.com>
Date: Mon, 19 Jun 2023 16:28:15 +0300
Subject: [PATCH 11/19] add flow option in ks_2samp

---
 src/hist/basehist.py |  3 ++-
 src/hist/stats.py    | 18 ++++++++++++++----
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/hist/basehist.py b/src/hist/basehist.py
index 02d92c87..e44faa71 100644
--- a/src/hist/basehist.py
+++ b/src/hist/basehist.py
@@ -663,9 +663,10 @@ def ks_2samp(
         self,
         other: hist.BaseHist,
         equal_bins: bool = True,
+        flow: bool = False,
         alternative: str = "two-sided",
         mode: str = "auto",
     ) -> Any:
         from hist import stats
 
-        return stats.ks_2samp(self, other, equal_bins, alternative, mode)
+        return stats.ks_2samp(self, other, equal_bins, flow, alternative, mode)
diff --git a/src/hist/stats.py b/src/hist/stats.py
index 5928c445..83d38003 100644
--- a/src/hist/stats.py
+++ b/src/hist/stats.py
@@ -189,6 +189,7 @@ def ks_2samp(
     self: hist.BaseHist,
     other: hist.BaseHist,
     equal_bins: bool = True,
+    flow: bool = False,
     alternative: str = "two-sided",
     mode: str = "auto",
 ) -> Any:
@@ -227,22 +228,31 @@ def ks_2samp(
     data2 = other.values(flow=True)
     n1 = data1.sum(dtype=int)
     n2 = data2.sum(dtype=int)
-    edges1 = self.axes[0].edges[:-1]
-    edges2 = other.axes[0].edges[:-1]
+    edges1 = self.axes[0].centers
+    edges2 = other.axes[0].centers
     cdf1 = np.cumsum(data1) / n1
     cdf2 = np.cumsum(data2) / n2
+    if flow:
+        edges1 = np.concatenate([[-np.inf], edges1, [np.inf]])
+        edges2 = np.concatenate([[-np.inf], edges2, [np.inf]])
 
     if equal_bins:
         cdflocs = edges1
 
-        cdf1 = cdf1[1:-1]
-        cdf2 = cdf2[1:-1]
+        if not flow:
+            cdf1 = cdf1[1:-1]
+            cdf2 = cdf2[1:-1]
+
         cddiffs = cdf1 - cdf2
 
     elif not equal_bins:
         edges_all = np.sort(np.concatenate([edges1, edges2]))
         cdflocs = edges_all
 
+        if flow:
+            cdf1 = np.concatenate([[0], cdf1])
+            cdf2 = np.concatenate([[0], cdf2])
+
         # Use np.searchsorted to get the CDF values at all bin edges
         cdf1_all = cdf1[np.searchsorted(edges1, edges_all, side="right")]
         cdf2_all = cdf2[np.searchsorted(edges2, edges_all, side="right")]

From 7d603899d34d5dec89fcd0e992be705d066e18ec Mon Sep 17 00:00:00 2001
From: fabriceMUKARAGE <fabrice.mukarage@ashesi.edu.gh>
Date: Thu, 22 Jun 2023 21:42:08 +0200
Subject: [PATCH 12/19] wip: chisquare test suggestions

---
 pyproject.toml      |  3 +++
 tests/test_stats.py | 27 +++++++++++++++++++--------
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 82067cee..fe9a72a1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,6 +72,7 @@ test = [
     "pytest-mpl >=0.12",
     "dask[dataframe] >=2022; python_version>='3.8'",
     "dask_histogram >=2023.1; python_version>='3.8'",
+    "numba_stats",
 ]
 dev = [
     "pytest >=6",
@@ -83,6 +84,7 @@ dev = [
     "ipykernel",
     "dask[dataframe] >=2022; python_version>='3.8'",
     "dask_histogram >=2023.1; python_version>='3.8'",
+    "numba_stats",
 ]
 docs = [
     "pytest >=6",
@@ -176,6 +178,7 @@ messages_control.disable = [
   "disallowed-name",
   "cyclic-import",
   "redefined-builtin",
+  "too-many-public-methods",
 ]
 
 [tool.ruff]
diff --git a/tests/test_stats.py b/tests/test_stats.py
index ed5d23bc..8340c96a 100644
--- a/tests/test_stats.py
+++ b/tests/test_stats.py
@@ -5,36 +5,47 @@
 
 import hist
 from hist import Hist
+from numba_stats import norm
+import functools
 
+cdf =functools.partial(norm.cdf, loc=0, scale=1)
 
-def test_chisquare_1samp():
-    from numba_stats import norm
+pytest.importorskip("scipy")
 
+@pytest.mark.parametrize("arg", ["norm", cdf])
+def test_chisquare_1samp(arg):
     np.random.seed(42)
 
     h = Hist(hist.axis.Regular(20, -5, 5))
     h.fill(np.random.normal(size=1000))
 
-    chisq, ndof, pvalue = h.chisquare_1samp("norm")
-    assert chisq == pytest.approx(9.47425856230132)
-    assert ndof == 14
-    assert pvalue == pytest.approx(0.7995235818496339)
-
-    chisq, ndof, pvalue = h.chisquare_1samp(norm.cdf, args=(0, 1))
+    chisq, ndof, pvalue = h.chisquare_1samp(arg)
     assert chisq == pytest.approx(9.47425856230132)
     assert ndof == 14
     assert pvalue == pytest.approx(0.7995235818496339)
+    
+def test_chisquare_1samp_2d():
+    np.random.seed(42)
 
     h = Hist(hist.axis.Regular(20, -5, 5), hist.axis.Regular(20, -5, 5))
     h.fill(np.random.normal(size=1000), np.random.normal(size=1000))
     with pytest.raises(NotImplementedError):
         h.chisquare_1samp("norm")
 
+def test_chisquare_1samp_weight():
+    np.random.seed(42)
+    h = Hist(hist.axis.Regular(20, -5, 5), storage=hist.storage.Weight())
+    h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000))
+    h.chisquare_1samp("norm")   
+
+def test_chisquare_1samp_weight_raises():
+    np.random.seed(42)
     h = Hist(hist.axis.Regular(20, -5, 5))
     h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000))
     with pytest.raises(RuntimeError):
         h.chisquare_1samp("norm")
 
+def test_chisquare_1samp_invalid_args():
     h = Hist(hist.axis.Regular(20, -5, 5))
     h.fill(np.random.normal(size=1000))
     with pytest.raises(TypeError):

From 223ef96ea9c834ce29bb3868fbe3362ba7553ee9 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 22 Jun 2023 21:40:30 +0000
Subject: [PATCH 13/19] style: pre-commit fixes

---
 tests/test_stats.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/tests/test_stats.py b/tests/test_stats.py
index 8340c96a..07845300 100644
--- a/tests/test_stats.py
+++ b/tests/test_stats.py
@@ -1,17 +1,19 @@
 from __future__ import annotations
 
+import functools
+
 import numpy as np
 import pytest
+from numba_stats import norm
 
 import hist
 from hist import Hist
-from numba_stats import norm
-import functools
 
-cdf =functools.partial(norm.cdf, loc=0, scale=1)
+cdf = functools.partial(norm.cdf, loc=0, scale=1)
 
 pytest.importorskip("scipy")
 
+
 @pytest.mark.parametrize("arg", ["norm", cdf])
 def test_chisquare_1samp(arg):
     np.random.seed(42)
@@ -23,7 +25,8 @@ def test_chisquare_1samp(arg):
     assert chisq == pytest.approx(9.47425856230132)
     assert ndof == 14
     assert pvalue == pytest.approx(0.7995235818496339)
-    
+
+
 def test_chisquare_1samp_2d():
     np.random.seed(42)
 
@@ -32,11 +35,13 @@ def test_chisquare_1samp_2d():
     with pytest.raises(NotImplementedError):
         h.chisquare_1samp("norm")
 
+
 def test_chisquare_1samp_weight():
     np.random.seed(42)
     h = Hist(hist.axis.Regular(20, -5, 5), storage=hist.storage.Weight())
     h.fill(np.random.normal(size=1000), weight=np.random.randint(0, 10, size=1000))
-    h.chisquare_1samp("norm")   
+    h.chisquare_1samp("norm")
+
 
 def test_chisquare_1samp_weight_raises():
     np.random.seed(42)
@@ -45,6 +50,7 @@ def test_chisquare_1samp_weight_raises():
     with pytest.raises(RuntimeError):
         h.chisquare_1samp("norm")
 
+
 def test_chisquare_1samp_invalid_args():
     h = Hist(hist.axis.Regular(20, -5, 5))
     h.fill(np.random.normal(size=1000))

From 0c0d92a274bed22e5040f6a3ce3875db61496687 Mon Sep 17 00:00:00 2001
From: Fabrice MUKARAGE <79330798+fabriceMUKARAGE@users.noreply.github.com>
Date: Tue, 27 Jun 2023 21:41:20 +0200
Subject: [PATCH 14/19] fix: Hist requires mplhep to plot

Added pytest.importorskip("mplhep")
---
 tests/test_plot.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_plot.py b/tests/test_plot.py
index cededea7..d0259ff3 100644
--- a/tests/test_plot.py
+++ b/tests/test_plot.py
@@ -7,6 +7,7 @@
 
 plt = pytest.importorskip("matplotlib.pyplot")
 pytest.importorskip("scipy")
+pytest.importorskip("mplhep")
 
 
 def test_general_plot1d():

From abcda1b8edf7bb7d61c39c51faee720955a6075a Mon Sep 17 00:00:00 2001
From: Fabrice MUKARAGE <79330798+fabriceMUKARAGE@users.noreply.github.com>
Date: Tue, 27 Jun 2023 21:50:31 +0200
Subject: [PATCH 15/19] fix: mplhep required to plot

Added pytest.importorskip("mplhep")
---
 tests/test_stacks.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_stacks.py b/tests/test_stacks.py
index e3ed7be7..60a35c43 100644
--- a/tests/test_stacks.py
+++ b/tests/test_stacks.py
@@ -7,6 +7,7 @@
 from pytest import approx
 
 from hist import Hist, NamedHist, Stack, axis
+pytest.importorskip("mplhep")
 
 np.random.seed(42)
 

From e630e3f6588619571dc07b6a83e70d681baa9576 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 27 Jun 2023 19:50:50 +0000
Subject: [PATCH 16/19] style: pre-commit fixes

---
 tests/test_stacks.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_stacks.py b/tests/test_stacks.py
index 60a35c43..d8607131 100644
--- a/tests/test_stacks.py
+++ b/tests/test_stacks.py
@@ -7,6 +7,7 @@
 from pytest import approx
 
 from hist import Hist, NamedHist, Stack, axis
+
 pytest.importorskip("mplhep")
 
 np.random.seed(42)

From 85e995537595f13ffb654b2b3dd8c06beb5420e1 Mon Sep 17 00:00:00 2001
From: iasonkrom <iason.krom@gmail.com>
Date: Mon, 3 Jul 2023 14:43:01 +0100
Subject: [PATCH 17/19] avoid hidden scipy object

---
 src/hist/stats.py | 258 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 257 insertions(+), 1 deletion(-)

diff --git a/src/hist/stats.py b/src/hist/stats.py
index 83d38003..f21d258a 100644
--- a/src/hist/stats.py
+++ b/src/hist/stats.py
@@ -4,9 +4,9 @@
 from typing import Any, Callable
 
 import numpy as np
+from scipy import special
 from scipy import stats as spstats
 from scipy.stats import rv_continuous, rv_discrete
-from scipy.stats._stats_py import _attempt_exact_2kssamp
 
 import hist
 
@@ -27,6 +27,262 @@ def _get_cdf_if_valid(obj: Any) -> Any:
     )
 
 
+def _compute_outer_prob_inside_method(m: int, n: int, g: int, h: int) -> Any:
+    """
+    Count the proportion of paths that do not stay strictly inside two
+    diagonal lines.
+
+    Parameters
+    ----------
+    m : integer
+        m > 0
+    n : integer
+        n > 0
+    g : integer
+        g is greatest common divisor of m and n
+    h : integer
+        0 <= h <= lcm(m,n)
+
+    Returns
+    -------
+    p : float
+        The proportion of paths that do not stay inside the two lines.
+
+    The classical algorithm counts the integer lattice paths from (0, 0)
+    to (m, n) which satisfy |x/m - y/n| < h / lcm(m, n).
+    The paths make steps of size +1 in either positive x or positive y
+    directions.
+    We are, however, interested in 1 - proportion to computes p-values,
+    so we change the recursion to compute 1 - p directly while staying
+    within the "inside method" a described by Hodges.
+
+    We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk.
+    Hodges, J.L. Jr.,
+    "The Significance Probability of the Smirnov Two-Sample Test,"
+    Arkiv fiur Matematik, 3, No. 43 (1958), 469-86.
+
+    For the recursion for 1-p see
+    Viehmann, T.: "Numerically more stable computation of the p-values
+    for the two-sample Kolmogorov-Smirnov test," arXiv: 2102.08037
+
+    """
+    # Probability is symmetrical in m, n.  Computation below uses m >= n.
+    if m < n:
+        m, n = n, m
+    mg = m // g
+    ng = n // g
+
+    # Count the integer lattice paths from (0, 0) to (m, n) which satisfy
+    # |nx/g - my/g| < h.
+    # Compute matrix A such that:
+    #  A(x, 0) = A(0, y) = 1
+    #  A(x, y) = A(x, y-1) + A(x-1, y), for x,y>=1, except that
+    #  A(x, y) = 0 if |x/m - y/n|>= h
+    # Probability is A(m, n)/binom(m+n, n)
+    # Optimizations exist for m==n, m==n*p.
+    # Only need to preserve a single column of A, and only a
+    # sliding window of it.
+    # minj keeps track of the slide.
+    minj, maxj = 0, min(int(np.ceil(h / mg)), n + 1)
+    curlen = maxj - minj
+    # Make a vector long enough to hold maximum window needed.
+    lenA = min(2 * maxj + 2, n + 1)
+    # This is an integer calculation, but the entries are essentially
+    # binomial coefficients, hence grow quickly.
+    # Scaling after each column is computed avoids dividing by a
+    # large binomial coefficient at the end, but is not sufficient to avoid
+    # the large dyanamic range which appears during the calculation.
+    # Instead we rescale based on the magnitude of the right most term in
+    # the column and keep track of an exponent separately and apply
+    # it at the end of the calculation.  Similarly when multiplying by
+    # the binomial coefficient
+    dtype = np.float64
+    A = np.ones(lenA, dtype=dtype)
+    # Initialize the first column
+    A[minj:maxj] = 0.0
+    for i in range(1, m + 1):
+        # Generate the next column.
+        # First calculate the sliding window
+        lastminj, lastlen = minj, curlen
+        minj = max(int(np.floor((ng * i - h) / mg)) + 1, 0)
+        minj = min(minj, n)
+        maxj = min(int(np.ceil((ng * i + h) / mg)), n + 1)
+        if maxj <= minj:
+            return 1.0
+        # Now fill in the values. We cannot use cumsum, unfortunately.
+        val = 0.0 if minj == 0 else 1.0
+        for jj in range(maxj - minj):
+            j = jj + minj
+            val = (A[jj + minj - lastminj] * i + val * j) / (i + j)
+            A[jj] = val
+        curlen = maxj - minj
+        if lastlen > curlen:
+            # Set some carried-over elements to 1
+            A[maxj - minj : maxj - minj + (lastlen - curlen)] = 1
+
+    return A[maxj - minj - 1]
+
+
+def _compute_prob_outside_square(n: int, h: int) -> Any:
+    """
+    Compute the proportion of paths that pass outside the two diagonal lines.
+
+    Parameters
+    ----------
+    n : integer
+        n > 0
+    h : integer
+        0 <= h <= n
+
+    Returns
+    -------
+    p : float
+        The proportion of paths that pass outside the lines x-y = +/-h.
+
+    """
+    # Compute Pr(D_{n,n} >= h/n)
+    # Prob = 2 * ( binom(2n, n-h) - binom(2n, n-2a) + binom(2n, n-3a) - ... )
+    # / binom(2n, n)
+    # This formulation exhibits subtractive cancellation.
+    # Instead divide each term by binom(2n, n), then factor common terms
+    # and use a Horner-like algorithm
+    # P = 2 * A0 * (1 - A1*(1 - A2*(1 - A3*(1 - A4*(...)))))
+
+    P = 0.0
+    k = int(np.floor(n / h))
+    while k >= 0:
+        p1 = 1.0
+        # Each of the Ai terms has numerator and denominator with
+        # h simple terms.
+        for j in range(h):
+            p1 = (n - k * h - j) * p1 / (n + k * h + j + 1)
+        P = p1 * (1.0 - P)
+        k -= 1
+    return 2 * P
+
+
+def _count_paths_outside_method(m: int, n: int, g: int, h: int) -> Any:
+    """Count the number of paths that pass outside the specified diagonal.
+
+    Parameters
+    ----------
+    m : integer
+        m > 0
+    n : integer
+        n > 0
+    g : integer
+        g is greatest common divisor of m and n
+    h : integer
+        0 <= h <= lcm(m,n)
+
+    Returns
+    -------
+    p : float
+        The number of paths that go low.
+        The calculation may overflow - check for a finite answer.
+
+    Notes
+    -----
+    Count the integer lattice paths from (0, 0) to (m, n), which at some
+    point (x, y) along the path, satisfy:
+      m*y <= n*x - h*g
+    The paths make steps of size +1 in either positive x or positive y
+    directions.
+
+    We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk.
+    Hodges, J.L. Jr.,
+    "The Significance Probability of the Smirnov Two-Sample Test,"
+    Arkiv fiur Matematik, 3, No. 43 (1958), 469-86.
+
+    """
+    # Compute #paths which stay lower than x/m-y/n = h/lcm(m,n)
+    # B(x, y) = #{paths from (0,0) to (x,y) without
+    #             previously crossing the boundary}
+    #         = binom(x, y) - #{paths which already reached the boundary}
+    # Multiply by the number of path extensions going from (x, y) to (m, n)
+    # Sum.
+
+    # Probability is symmetrical in m, n.  Computation below assumes m >= n.
+    if m < n:
+        m, n = n, m
+    mg = m // g
+    ng = n // g
+
+    # Not every x needs to be considered.
+    # xj holds the list of x values to be checked.
+    # Wherever n*x/m + ng*h crosses an integer
+    lxj = n + (mg - h) // mg
+    xj = [(h + mg * j + ng - 1) // ng for j in range(lxj)]
+    # B is an array just holding a few values of B(x,y), the ones needed.
+    # B[j] == B(x_j, j)
+    if lxj == 0:
+        return special.binom(m + n, n)
+    B = np.zeros(lxj)
+    B[0] = 1
+    # Compute the B(x, y) terms
+    for j in range(1, lxj):
+        Bj = special.binom(xj[j] + j, j)
+        for i in range(j):
+            bin = special.binom(xj[j] - xj[i] + j - i, j - i)
+            Bj -= bin * B[i]
+        B[j] = Bj
+    # Compute the number of path extensions...
+    num_paths = 0
+    for j in range(lxj):
+        bin = special.binom((m - xj[j]) + (n - j), n - j)
+        term = B[j] * bin
+        num_paths += term
+    return num_paths
+
+
+def _attempt_exact_2kssamp(n1: int, n2: int, g: int, d: float, alternative: str) -> Any:
+    """Attempts to compute the exact 2sample probability.
+
+    n1, n2 are the sample sizes
+    g is the gcd(n1, n2)
+    d is the computed max difference in ECDFs
+
+    Returns (success, d, probability)
+    """
+    lcm = (n1 // g) * n2
+    h = int(np.round(d * lcm))
+    d = h * 1.0 / lcm
+    if h == 0:
+        return True, d, 1.0
+    saw_fp_error, prob = False, np.nan
+    try:
+        with np.errstate(invalid="raise", over="raise"):
+            if alternative == "two-sided":
+                if n1 == n2:
+                    prob = _compute_prob_outside_square(n1, h)
+                else:
+                    prob = _compute_outer_prob_inside_method(n1, n2, g, h)
+            else:
+                if n1 == n2:
+                    # prob = binom(2n, n-h) / binom(2n, n)
+                    # Evaluating in that form incurs roundoff errors
+                    # from special.binom. Instead calculate directly
+                    jrange = np.arange(h)
+                    prob = float(np.prod((n1 - jrange) / (n1 + jrange + 1.0)))
+                else:
+                    with np.errstate(over="raise"):
+                        num_paths = _count_paths_outside_method(n1, n2, g, h)
+                    bin = special.binom(n1 + n2, n1)
+                    if num_paths > bin or np.isinf(bin):
+                        saw_fp_error = True
+                    else:
+                        prob = num_paths / bin
+
+    except (FloatingPointError, OverflowError):
+        saw_fp_error = True
+
+    if saw_fp_error:
+        return False, d, np.nan
+    if not (0 <= prob <= 1):
+        return False, d, prob
+    return True, d, prob
+
+
 def chisquare_1samp(
     self: hist.BaseHist,
     distribution: str | Callable[..., Any],

From d171bd964d09c65c9f40a5a5dd67d80665b87f9d Mon Sep 17 00:00:00 2001
From: iasonkrom <iason.krom@gmail.com>
Date: Mon, 3 Jul 2023 16:50:20 +0100
Subject: [PATCH 18/19] similar docstrings

---
 src/hist/stats.py | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/src/hist/stats.py b/src/hist/stats.py
index f21d258a..7ec3967b 100644
--- a/src/hist/stats.py
+++ b/src/hist/stats.py
@@ -181,8 +181,6 @@ def _count_paths_outside_method(m: int, n: int, g: int, h: int) -> Any:
         The number of paths that go low.
         The calculation may overflow - check for a finite answer.
 
-    Notes
-    -----
     Count the integer lattice paths from (0, 0) to (m, n), which at some
     point (x, y) along the path, satisfy:
       m*y <= n*x - h*g
@@ -238,11 +236,27 @@ def _count_paths_outside_method(m: int, n: int, g: int, h: int) -> Any:
 def _attempt_exact_2kssamp(n1: int, n2: int, g: int, d: float, alternative: str) -> Any:
     """Attempts to compute the exact 2sample probability.
 
-    n1, n2 are the sample sizes
-    g is the gcd(n1, n2)
-    d is the computed max difference in ECDFs
+    Parameters
+    ----------
+    n1 : integer
+        sample size of sample 1
+    n2 : integer
+        sample size of sample 2
+    g : integer
+        greatest common divisor of n1 and n2
+    d : float
+        max difference in ECDFs
+    alternative : string
+        alternative hypothesis, either 'two-sided', 'less' or 'greater'
 
-    Returns (success, d, probability)
+    Returns
+    -------
+    success : bool
+        True if the calculation was successful
+    d : float
+        max difference in ECDFs
+    prob : float
+        The probability of the test statistic under the null hypothesis.
     """
     lcm = (n1 // g) * n2
     h = int(np.round(d * lcm))

From ccc3dae59df63ed9973cf7416a83a893123efab6 Mon Sep 17 00:00:00 2001
From: iasonkrom <iason.krom@gmail.com>
Date: Tue, 4 Jul 2023 14:22:41 +0100
Subject: [PATCH 19/19] pylint compliance

---
 src/hist/stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hist/stats.py b/src/hist/stats.py
index 7ec3967b..6eab17ac 100644
--- a/src/hist/stats.py
+++ b/src/hist/stats.py
@@ -292,7 +292,7 @@ def _attempt_exact_2kssamp(n1: int, n2: int, g: int, d: float, alternative: str)
 
     if saw_fp_error:
         return False, d, np.nan
-    if not (0 <= prob <= 1):
+    if not 0 <= prob <= 1:
         return False, d, prob
     return True, d, prob