Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sklearnex.BasicStatistics API for CSR inputs on GPU and a test for it #2253

Merged
merged 18 commits into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion onedal/basic_statistics/basic_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,13 @@ def fit(self, data, sample_weight=None, queue=None):
sample_weight = _check_array(sample_weight, ensure_2d=False)

is_single_dim = data.ndim == 1
data_table, weights_table = to_table(data, sample_weight, queue=queue)

data_table = to_table(data, queue=queue)
weights_table = (
icfaust marked this conversation as resolved.
Show resolved Hide resolved
to_table(sample_weight, queue=queue)
if sample_weight is not None
else to_table(None)
)

dtype = data_table.dtype
raw_result = self._compute_raw(data_table, weights_table, policy, dtype, is_csr)
Expand Down
25 changes: 21 additions & 4 deletions onedal/basic_statistics/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,35 @@

import numpy as np


# Compute unbiased variation (coefficient of variation) for the columns of array-like X
def variation(X):
    """Return per-column std(ddof=1) / mean; columns with zero mean yield NaN."""
    col_means = np.mean(X, axis=0)
    col_stds = np.std(X, axis=0, ddof=1)
    if not np.all(col_means):
        # At least one column mean is zero: divide element-wise, emitting NaN
        # for the zero-mean columns instead of a divide-by-zero warning.
        return np.array(
            [s / m if m != 0 else np.nan for s, m in zip(col_stds, col_means)]
        )
    return col_stds / col_means


# Mapping: result option name -> (NumPy reference implementation over columns,
# (float32 tolerance, float64 tolerance) for assert_allclose).
options_and_tests = {
    "sum": (lambda X: np.sum(X, axis=0), (5e-4, 1e-7)),
    "min": (lambda X: np.min(X, axis=0), (1e-7, 1e-7)),
    "max": (lambda X: np.max(X, axis=0), (1e-7, 1e-7)),
    "mean": (lambda X: np.mean(X, axis=0), (5e-7, 1e-7)),
    # sklearnex computes unbiased variance and standard deviation, that is why ddof=1
    "variance": (lambda X: np.var(X, axis=0, ddof=1), (2e-4, 1e-7)),
    "variation": (lambda X: variation(X), (1e-3, 1e-6)),
    "sum_squares": (lambda X: np.sum(np.square(X), axis=0), (2e-4, 1e-7)),
    "sum_squares_centered": (
        lambda X: np.sum(np.square(X - np.mean(X, axis=0)), axis=0),
        (1e-3, 1e-7),
    ),
    "standard_deviation": (lambda X: np.std(X, axis=0, ddof=1), (2e-3, 1e-7)),
    "second_order_raw_moment": (lambda X: np.mean(np.square(X), axis=0), (1e-6, 1e-7)),
}
41 changes: 36 additions & 5 deletions sklearnex/basic_statistics/basic_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@
import warnings

import numpy as np
from scipy.sparse import issparse
from sklearn.base import BaseEstimator
from sklearn.utils import check_array
from sklearn.utils.validation import _check_sample_weight

from daal4py.sklearn._n_jobs_support import control_n_jobs
from daal4py.sklearn._utils import sklearn_check_version
from daal4py.sklearn._utils import daal_check_version, sklearn_check_version
from onedal.basic_statistics import BasicStatistics as onedal_BasicStatistics
from onedal.utils import _is_csr

from .._device_offload import dispatch
from .._utils import IntelEstimator, PatchingConditionsChain
Expand Down Expand Up @@ -166,21 +168,50 @@ def __getattr__(self, attr):
f"'{self.__class__.__name__}' object has no attribute '{attr}'"
)

def _onedal_cpu_supported(self, method_name, *data):
    """CPU offloading: always supported, so the chain carries no conditions."""
    patching_status = PatchingConditionsChain(
        f"sklearnex.basic_statistics.{self.__class__.__name__}.{method_name}"
    )
    return patching_status
def _onedal_gpu_supported(self, method_name, *data):
    """GPU offloading: dense data always; CSR needs oneDAL >= 2025.2 and no weights."""
    patching_status = PatchingConditionsChain(
        f"sklearnex.basic_statistics.{self.__class__.__name__}.{method_name}"
    )
    X, sample_weight = data

    # CSR input requires oneDAL >= 2025.2; dense input is always supported.
    is_data_supported = (
        _is_csr(X) and daal_check_version((2025, "P", 200))
    ) or not issparse(X)

    # Weighted statistics are not implemented for sparse (CSR) data.
    is_sample_weight_supported = sample_weight is None or not issparse(X)

    patching_status.and_conditions(
        [
            (
                is_sample_weight_supported,
                "Sample weights are not supported for CSR data format",
            ),
            (
                is_data_supported,
                "Supported data formats: Dense, CSR (oneDAL version >= 2025.2.0).",
            ),
        ]
    )
    return patching_status

def _onedal_fit(self, X, sample_weight=None, queue=None):
if sklearn_check_version("1.2"):
self._validate_params()

if sklearn_check_version("1.0"):
X = validate_data(self, X, dtype=[np.float64, np.float32], ensure_2d=False)
X = validate_data(
self,
X,
dtype=[np.float64, np.float32],
ensure_2d=False,
accept_sparse="csr",
)
else:
X = check_array(X, dtype=[np.float64, np.float32])

Expand Down
171 changes: 155 additions & 16 deletions sklearnex/basic_statistics/tests/test_basic_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,30 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose
from scipy import sparse as sp

from daal4py.sklearn._utils import daal_check_version
from onedal.basic_statistics.tests.utils import options_and_tests
from onedal.tests.utils._dataframes_support import (
_convert_to_dataframe,
get_dataframes_and_queues,
get_queues,
)
from sklearnex import config_context
from sklearnex.basic_statistics import BasicStatistics
from sklearnex.tests.utils import gen_sparse_dataset


# Compute the basic statistics on sparse data on CPU or GPU depending on the queue
def compute_sparse_result(X_sparse, options, queue):
    """Fit BasicStatistics on sparse input, offloading to GPU when the queue targets one."""
    use_gpu = queue is not None and queue.sycl_device.is_gpu
    if use_gpu:
        with config_context(target_offload="gpu"):
            return BasicStatistics(result_options=options).fit(X_sparse)
    return BasicStatistics(result_options=options).fit(X_sparse)


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
Expand All @@ -41,19 +57,19 @@
expected_min = np.array([0, 0])
expected_max = np.array([1, 1])

assert_allclose(expected_mean, result.mean)
assert_allclose(expected_max, result.max)
assert_allclose(expected_min, result.min)
assert_allclose(expected_mean, result.mean_)
assert_allclose(expected_max, result.max_)
assert_allclose(expected_min, result.min_)

result = BasicStatistics().fit(X_df, sample_weight=weights_df)

expected_weighted_mean = np.array([0.25, 0.25])
expected_weighted_min = np.array([0, 0])
expected_weighted_max = np.array([0.5, 0.5])

assert_allclose(expected_weighted_mean, result.mean)
assert_allclose(expected_weighted_min, result.min)
assert_allclose(expected_weighted_max, result.max)
assert_allclose(expected_weighted_mean, result.mean_)
assert_allclose(expected_weighted_min, result.min_)
assert_allclose(expected_weighted_max, result.max_)


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
Expand All @@ -78,16 +94,16 @@
expected_weighted_mean = np.array([0.25, 0.25])
expected_weighted_min = np.array([0, 0])
expected_weighted_max = np.array([0.5, 0.5])
assert_allclose(expected_weighted_mean, result.mean)
assert_allclose(expected_weighted_max, result.max)
assert_allclose(expected_weighted_min, result.min)
assert_allclose(expected_weighted_mean, result.mean_)
assert_allclose(expected_weighted_max, result.max_)
assert_allclose(expected_weighted_min, result.min_)
else:
expected_mean = np.array([0.5, 0.5])
expected_min = np.array([0, 0])
expected_max = np.array([1, 1])
assert_allclose(expected_mean, result.mean)
assert_allclose(expected_max, result.max)
assert_allclose(expected_min, result.min)
assert_allclose(expected_mean, result.mean_)
assert_allclose(expected_max, result.max_)
assert_allclose(expected_min, result.min_)


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
Expand Down Expand Up @@ -117,7 +133,7 @@
else:
result = basicstat.fit(X_df)

res = getattr(result, result_option)
res = getattr(result, result_option + "_")
if weighted:
weighted_data = np.diag(weights) @ X
gtr = function(weighted_data)
Expand All @@ -128,6 +144,49 @@
assert_allclose(gtr, res, atol=tol)


@pytest.mark.parametrize("queue", get_queues())
@pytest.mark.parametrize("result_option", options_and_tests.keys())
@pytest.mark.parametrize("row_count", [500, 2000])
@pytest.mark.parametrize("column_count", [10, 100])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_single_option_on_random_sparse_data(
    queue, result_option, row_count, column_count, dtype
):
    """Check one BasicStatistics result option on random CSR data against NumPy."""
    if not daal_check_version((2025, "P", 200)) and result_option in [
        "max",
        "sum_squares",
    ]:
        pytest.skip(
            "There is a bug in 'max' and 'sum_squares' computations in oneDAL version < 2025.2"
        )

    function, tols = options_and_tests[result_option]
    fp32tol, fp64tol = tols
    seed = 77

    gen = np.random.default_rng(seed)

    X_sparse = gen_sparse_dataset(
        row_count,
        column_count,
        density=0.01,
        format="csr",
        dtype=dtype,
        random_state=gen,
    )

    # Dense copy serves as ground truth for the NumPy reference implementation.
    X_dense = X_sparse.toarray()

    result = compute_sparse_result(X_sparse, result_option, queue)

    res = getattr(result, result_option + "_")

    gtr = function(X_dense)

    tol = fp32tol if res.dtype == np.float32 else fp64tol
    assert_allclose(gtr, res, atol=tol)


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
@pytest.mark.parametrize("row_count", [100, 1000])
@pytest.mark.parametrize("column_count", [10, 100])
Expand All @@ -152,7 +211,7 @@
else:
result = basicstat.fit(X_df)

res_mean, res_max, res_sum = result.mean, result.max, result.sum
res_mean, res_max, res_sum = result.mean_, result.max_, result.sum_
if weighted:
weighted_data = np.diag(weights) @ X
gtr_mean, gtr_max, gtr_sum = (
Expand All @@ -173,6 +232,48 @@
assert_allclose(gtr_sum, res_sum, atol=tol)


@pytest.mark.parametrize("queue", get_queues())
@pytest.mark.parametrize("row_count", [100, 1000])
@pytest.mark.parametrize("column_count", [10, 100])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_multiple_options_on_random_sparse_data(queue, row_count, column_count, dtype):
    """Check a subset of BasicStatistics result options requested together on CSR data."""
    seed = 77

    gen = np.random.default_rng(seed)

    X_sparse = gen_sparse_dataset(
        row_count,
        column_count,
        density=0.05,
        format="csr",
        dtype=dtype,
        random_state=gen,
    )

    # Dense copy serves as ground truth for the NumPy reference implementation.
    X_dense = X_sparse.toarray()

    options = [
        "sum",
        "min",
        "mean",
        "standard_deviation",
        "variance",
        "second_order_raw_moment",
    ]

    result = compute_sparse_result(X_sparse, options, queue)

    # Iterate the requested options directly instead of filtering the full table.
    for result_option in options:
        function, tols = options_and_tests[result_option]
        fp32tol, fp64tol = tols
        res = getattr(result, result_option + "_")
        gtr = function(X_dense)
        tol = fp32tol if res.dtype == np.float32 else fp64tol
        assert_allclose(gtr, res, atol=tol)


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
@pytest.mark.parametrize("row_count", [100, 1000])
@pytest.mark.parametrize("column_count", [10, 100])
Expand Down Expand Up @@ -203,7 +304,7 @@
for result_option in options_and_tests:
function, tols = options_and_tests[result_option]
fp32tol, fp64tol = tols
res = getattr(result, result_option)
res = getattr(result, result_option + "_")
if weighted:
gtr = function(weighted_data)
else:
Expand All @@ -212,6 +313,44 @@
assert_allclose(gtr, res, atol=tol)


@pytest.mark.parametrize("queue", get_queues())
@pytest.mark.parametrize("row_count", [100, 1000])
@pytest.mark.parametrize("column_count", [10, 100])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_all_option_on_random_sparse_data(queue, row_count, column_count, dtype):
    """Check every BasicStatistics result option ('all') on random CSR data."""
    seed = 77

    gen = np.random.default_rng(seed)

    X_sparse = gen_sparse_dataset(
        row_count,
        column_count,
        density=0.05,
        format="csr",
        dtype=dtype,
        random_state=gen,
    )
    # Dense copy serves as ground truth for the NumPy reference implementation.
    X_dense = X_sparse.toarray()

    result = compute_sparse_result(X_sparse, "all", queue)

    for result_option in options_and_tests:
        if not daal_check_version((2025, "P", 200)) and result_option in [
            "max",
            "sum_squares",
        ]:
            # TODO: There is a bug in 'max' and 'sum_squares' computations in oneDAL version < 2025.2
            continue
        function, tols = options_and_tests[result_option]
        fp32tol, fp64tol = tols
        res = getattr(result, result_option + "_")

        gtr = function(X_dense)

        tol = fp32tol if res.dtype == np.float32 else fp64tol
        assert_allclose(gtr, res, atol=tol)


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
@pytest.mark.parametrize("result_option", options_and_tests.keys())
@pytest.mark.parametrize("data_size", [100, 1000])
Expand All @@ -238,7 +377,7 @@
else:
result = basicstat.fit(X_df)

res = getattr(result, result_option)
res = getattr(result, result_option + "_")
if weighted:
weighted_data = weights * X
gtr = function(weighted_data)
Expand Down
Loading
Loading