Added named_arrays.histogram() function which can compute the n-dim…

…ensional histogram for scalars, vectors, and functions. (#100)
sun-data · Nov 18, 2024 · 44a6ac1 · 44a6ac1
1 parent d12489a
commit 44a6ac1
Show file tree

Hide file tree

Showing 11 changed files with 604 additions and 4 deletions.
diff --git a/named_arrays/_functions/function_named_array_functions.py b/named_arrays/_functions/function_named_array_functions.py
@@ -1,4 +1,4 @@
-from typing import Callable, Literal
+from typing import Callable, Literal, Sequence
 import numpy as np
 import matplotlib
 import astropy.units as u
@@ -102,6 +102,32 @@ def unit_normalized(
     )
 
 
+@_implements(na.histogram)
+def histogram(
+    a: na.AbstractFunctionArray,
+    bins: dict[str, int] | na.AbstractScalarArray,
+    axis: None | str | Sequence[str] = None,
+    min: None | na.AbstractScalarArray = None,
+    max: None | na.AbstractScalarArray = None,
+    density: bool = False,
+    weights: None = None,
+) -> na.FunctionArray[na.AbstractScalarArray, na.ScalarArray]:
+    if weights is not None:  # pragma: nocover
+        raise ValueError(
+            "`weights` must be `None` for `AbstractFunctionArray`"
+            f"inputs, got {type(weights)}."
+        )
+    return na.histogram(
+        a=a.inputs,
+        bins=bins,
+        axis=axis,
+        min=min,
+        max=max,
+        density=density,
+        weights=a.outputs,
+    )
+
+
 @_implements(na.plt.pcolormesh)
 def pcolormesh(
     *XY: na.AbstractArray,

diff --git a/named_arrays/_functions/tests/test_functions.py b/named_arrays/_functions/tests/test_functions.py
@@ -1,4 +1,4 @@
-from typing import Sequence, Callable
+from typing import Sequence, Callable, Literal
 import pytest
 import numpy as np
 import astropy.units as u
@@ -650,6 +650,40 @@ class TestInterp(
         ):
             pass
 
+        @pytest.mark.parametrize(
+            argnames="bins",
+            argvalues=[
+                "dict",
+            ],
+        )
+        class TestHistogram(
+            named_arrays.tests.test_core.AbstractTestAbstractArray.TestNamedArrayFunctions.TestHistogram,
+        ):
+            def test_histogram(
+                self,
+                array: na.AbstractFunctionArray,
+                bins: Literal["dict"],
+                axis: None | str | Sequence[str],
+                min: None | na.AbstractScalarArray | na.AbstractVectorArray,
+                max: None | na.AbstractScalarArray | na.AbstractVectorArray,
+                weights: None | na.AbstractScalarArray,
+            ):
+                if bins == "dict":
+                    if isinstance(array.inputs, na.AbstractVectorArray):
+                        bins = {f"axis_{c}": 11 for c in array.inputs.cartesian_nd.components}
+                    else:
+                        bins = dict(axis_x=11)
+                if isinstance(array.outputs, na.AbstractVectorArray):
+                    return
+                super().test_histogram(
+                    array=array,
+                    bins=bins,
+                    axis=axis,
+                    min=min,
+                    max=max,
+                    weights=weights,
+                )
+
         @pytest.mark.xfail
         class TestPltPlotLikeFunctions(
             named_arrays.tests.test_core.AbstractTestAbstractArray.TestNamedArrayFunctions.TestPltPlotLikeFunctions

diff --git a/named_arrays/_matrices/tests/test_matrices.py b/named_arrays/_matrices/tests/test_matrices.py
@@ -113,6 +113,12 @@ class TestNamedArrayFunctions(
         named_arrays._vectors.tests.test_vectors.AbstractTestAbstractVectorArray.TestNamedArrayFunctions
     ):
 
+        @pytest.mark.skip
+        class TestHistogram(
+            named_arrays.tests.test_core.AbstractTestAbstractArray.TestNamedArrayFunctions.TestHistogram,
+        ):
+            pass
+
         @pytest.mark.skip
         class TestPltPlotLikeFunctions(
             named_arrays._vectors.tests.test_vectors.AbstractTestAbstractVectorArray.TestNamedArrayFunctions.TestPltPlotLikeFunctions

diff --git a/named_arrays/_named_array_functions.py b/named_arrays/_named_array_functions.py
@@ -23,7 +23,9 @@
     'concatenate',
     'add_axes',
     "interp",
+    "histogram",
     "histogram2d",
+    "histogramdd",
     'jacobian',
     'despike',
 ]
@@ -718,6 +720,80 @@ def interp(
     )
 
 
+def histogram(
+    a: na.AbstractArray,
+    bins: dict[str, int] | na.AbstractArray,
+    axis: None | str | Sequence[str] = None,
+    min: None | float | na.AbstractArray = None,
+    max: None | float | na.AbstractArray = None,
+    density: bool = False,
+    weights: None | na.AbstractArray = None,
+) -> na.FunctionArray[na.AbstractArray, na.ScalarArray]:
+    """
+    A thin wrapper around :func:`numpy.histogram` which adds an `axis` argument.
+
+    Parameters
+    ----------
+    a
+        The input data over which to compute the histogram.
+    bins
+        The bin specification of the histogram:
+         * If `bins` is a dictionary, the keys are interpreted as the axis names
+           and the values are the number of bins along each axis.
+           This dictionary must have only one key per coordinate.
+         * If `bins` is an array, it represents the bin edges.
+    axis
+        The logical axes along which to histogram the data points.
+        If :obj:`None` (the default), the histogram will be computed along
+        all the axes of `a`.
+    min
+        The lower boundary of the histogram.
+        If :obj:`None` (the default), the minimum of `a` is used.
+    max
+        The upper boundary of the histogram.
+        If :obj:`None` (the default), the maximum of `a` is used.
+    density
+        If :obj:`False` (the default), returns the number of samples in each bin.
+        If :obj:`True`, returns the probability density in each bin.
+    weights
+        An optional array weighting each sample.
+
+    Examples
+    --------
+
+    Construct a 2D histogram with constant bin width.
+
+    .. jupyter-execute::
+
+        import numpy as np
+        import matplotlib.pyplot as plt
+        import named_arrays as na
+
+        # Define the bin edges
+        bins = dict(x=6)
+
+        # Define random points to collect into a histogram
+        a = na.random.normal(0, 2, shape_random=dict(h=101))
+
+        # Compute the histogram
+        hist = na.histogram(a, bins=bins)
+
+        # Plot the resulting histogram
+        fig, ax = plt.subplots()
+        na.plt.stairs(hist.inputs, hist.outputs);
+    """
+    return _named_array_function(
+        func=histogram,
+        a=a,
+        axis=axis,
+        bins=bins,
+        min=min,
+        max=max,
+        density=density,
+        weights=weights,
+    )
+
+
 def histogram2d(
     x: na.AbstractScalarArray,
     y: na.AbstractScalarArray,
@@ -822,6 +898,61 @@ def histogram2d(
     )
 
 
+def histogramdd(
+    *sample: na.AbstractScalar,
+    bins: dict[str, int] | na.AbstractScalar| Sequence[na.AbstractScalar],
+    axis: None | str | Sequence[str] = None,
+    min: None | na.AbstractScalar | Sequence[na.AbstractScalar] = None,
+    max: None | na.AbstractScalar | Sequence[na.AbstractScalar] = None,
+    density: bool = False,
+    weights: None | na.AbstractScalar = None,
+) -> tuple[na.AbstractScalar, tuple[na.AbstractScalar, ...]]:
+    """
+    A thin wrapper around :func:`numpy.histogramdd` which adds an `axis` argument.
+
+    Parameters
+    ----------
+    sample
+        The data to be histrogrammed.
+        Note the difference in signature compared to :func:`numpy.histogramdd`,
+        each component must be a separate argument,
+        instead of a single argument containing a sequence of arrays.
+        This is done so that multiple dispatch works better for this function.
+    bins
+        The bin specification of the histogram:
+         * If `bins` is a dictionary, the keys are interpreted as the axis names
+           and the values are the number of bins along each axis.
+           This dictionary must have the same number of elements as `sample`.
+         * If `bins` is an array or a sequence of arrays, it describes the
+           monotonically-increasing bin edges along each dimension
+    axis
+        The logical axes along which to histogram the data points.
+        If :obj:`None` (the default), the histogram will be computed along
+        all the axes of `sample`.
+    min
+        The lower boundary of the histogram along each dimension.
+        If :obj:`None` (the default), the minimum of each element of `sample` is used.
+    max
+        The upper boundary of the histogram along each dimension.
+        If :obj:`None` (the default), the maximum of each elemennt of `sample` is used.
+    density
+        If :obj:`False` (the default), returns the number of samples in each bin.
+        If :obj:`True`, returns the probability density in each bin.
+    weights
+        An optional array weighting each sample.
+    """
+    return _named_array_function(
+        histogramdd,
+        *sample,
+        axis=axis,
+        bins=bins,
+        min=min,
+        max=max,
+        density=density,
+        weights=weights,
+    )
+
+
 def jacobian(
         function: Callable[[InputT], OutputT],
         x: InputT,

diff --git a/named_arrays/_scalars/scalar_named_array_functions.py b/named_arrays/_scalars/scalar_named_array_functions.py
@@ -1,4 +1,5 @@
 from typing import Callable, Sequence, Any, Literal
+import collections
 import dataclasses
 import numpy as np
 import numpy.typing as npt
@@ -249,6 +250,37 @@ def interp(
     return result
 
 
+@_implements(na.histogram)
+def histogram(
+    a: na.AbstractScalarArray,
+    bins: dict[str, int] | na.AbstractScalarArray,
+    axis: None | str | Sequence[str] = None,
+    min: None | na.AbstractScalarArray = None,
+    max: None | na.AbstractScalarArray = None,
+    density: bool = False,
+    weights: None | na.AbstractScalarArray = None,
+) -> na.FunctionArray[na.AbstractScalarArray, na.ScalarArray]:
+
+    if isinstance(bins, na.AbstractArray):
+        bins = (bins, )
+
+    hist, edges = na.histogramdd(
+        a,
+        bins=bins,
+        axis=axis,
+        min=min,
+        max=max,
+        density=density,
+        weights=weights,
+    )
+    edges = edges[0]
+
+    return na.FunctionArray(
+        inputs=edges,
+        outputs=hist,
+    )
+
+
 @_implements(na.histogram2d)
 def histogram2d(
     x: na.AbstractScalarArray,
@@ -389,6 +421,103 @@ def histogram2d(
     )
 
 
+@_implements(na.histogramdd)
+def histogramdd(
+    *sample: na.AbstractScalarArray,
+    bins: dict[str, int] | na.AbstractScalarArray | Sequence[na.AbstractScalarArray],
+    axis: None | str | Sequence[str] = None,
+    min: None | na.AbstractScalarArray | Sequence[na.AbstractScalarArray] = None,
+    max: None | na.AbstractScalarArray | Sequence[na.AbstractScalarArray] = None,
+    density: bool = False,
+    weights: None | na.AbstractScalarArray = None,
+) -> tuple[na.AbstractScalarArray, tuple[na.AbstractScalarArray, ...]]:
+
+    try:
+        sample = [scalars._normalize(s) for s in sample]
+        weights = scalars._normalize(weights) if weights is not None else weights
+    except scalars.ScalarTypeError:  # pragma: nocover
+        return NotImplemented
+
+    shape = na.shape_broadcasted(*sample, weights)
+
+    sample = [s.broadcast_to(shape) for s in sample]
+    weights = na.broadcast_to(weights, shape) if weights is not None else weights
+
+    if axis is None:
+        axis = tuple(shape)
+    elif isinstance(axis, str):
+        axis = (axis,)
+
+    if isinstance(bins, dict):
+
+        if min is None:
+            min = [s.min(axis) for s in sample]
+        elif not isinstance(min, collections.abc.Sequence):
+            min = [min] * len(sample)
+
+        if max is None:
+            max = [s.max(axis) for s in sample]
+        elif not isinstance(max, collections.abc.Sequence):
+            max = [max] * len(sample)
+
+        try:
+            max = [scalars._normalize(m) for m in max]
+            min = [scalars._normalize(m) for m in min]
+        except scalars.ScalarTypeError:  # pragma: nocover
+            return NotImplemented
+
+        if len(bins) != len(sample):  # pragma: nocover
+            raise ValueError(
+                f"if {bins=} is a dictionary, it must have the same number of"
+                f"elements as {len(sample)=}."
+            )
+
+        bins = [
+            na.linspace(start, stop, axis=ax, num=bins[ax])
+            for start, stop, ax in zip(min, max, bins)
+        ]
+
+    try:
+        bins = [scalars._normalize(b) for b in bins]
+    except scalars.ScalarTypeError:  # pragma: nocover
+        return NotImplemented
+
+    shape_orthogonal = {a: shape[a] for a in shape if a not in axis}
+
+    bins_broadcasted = [
+        b.broadcast_to(na.broadcast_shapes(shape_orthogonal, b.shape))
+        for b in bins
+    ]
+
+    shape_bins = na.shape_broadcasted(*bins)
+    shape_hist = {
+        ax: shape_bins[ax] - 1
+        for ax in shape_bins
+        if ax not in shape_orthogonal
+    }
+    shape_hist = na.broadcast_shapes(shape_orthogonal, shape_hist)
+
+    hist = na.ScalarArray.empty(shape_hist)
+
+    unit_weights = na.unit(weights)
+    hist = hist if unit_weights is None else hist << unit_weights
+
+    for i in na.ndindex(shape_orthogonal):
+        hist_i, _ = np.histogramdd(
+            sample=[s[i].ndarray_aligned(axis).reshape(-1) for s in sample],
+            bins=[b[i].ndarray for b in bins_broadcasted],
+            density=density,
+            weights=weights[i].ndarray_aligned(axis).reshape(-1) if weights is not None else weights,
+        )
+
+        hist[i] = na.ScalarArray(
+            ndarray=hist_i,
+            axes=sum((b[i].axes for b in bins_broadcasted), ()),
+        )
+
+    return hist, tuple(bins)
+
+
 def random(
         func: Callable,
         *args: float | u.Quantity | na.AbstractScalarArray,