Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sklearnex.BasicStatistics API for CSR inputs on GPU and a test for it #2253

Merged
merged 18 commits into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion onedal/basic_statistics/basic_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,13 @@ def fit(self, data, sample_weight=None, queue=None):
sample_weight = _check_array(sample_weight, ensure_2d=False)

is_single_dim = data.ndim == 1
data_table, weights_table = to_table(data, sample_weight, queue=queue)

data_table = to_table(data, queue=queue)
weights_table = (
icfaust marked this conversation as resolved.
Show resolved Hide resolved
to_table(sample_weight, queue=queue)
if sample_weight is not None
else to_table(None)
)

dtype = data_table.dtype
raw_result = self._compute_raw(data_table, weights_table, policy, dtype, is_csr)
Expand Down
25 changes: 21 additions & 4 deletions onedal/basic_statistics/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,35 @@

import numpy as np


# Compute unbiased variation (coefficient of variation) for the columns of array-like X
def variation(X):
    """Return per-column std(ddof=1) / mean; columns with zero mean yield NaN."""
    col_means = np.mean(X, axis=0)
    col_stds = np.std(X, axis=0, ddof=1)
    if not np.all(col_means):
        # At least one column mean is zero: divide element-wise, emitting NaN
        # for the zero-mean columns instead of a divide-by-zero warning.
        return np.array(
            [s / m if m != 0 else np.nan for s, m in zip(col_stds, col_means)]
        )
    return col_stds / col_means


# Mapping: result option name -> (NumPy reference implementation over columns,
# (float32 tolerance, float64 tolerance) for assert_allclose).
options_and_tests = {
    "sum": (lambda X: np.sum(X, axis=0), (5e-4, 1e-7)),
    "min": (lambda X: np.min(X, axis=0), (1e-7, 1e-7)),
    "max": (lambda X: np.max(X, axis=0), (1e-7, 1e-7)),
    "mean": (lambda X: np.mean(X, axis=0), (5e-7, 1e-7)),
    # sklearnex computes unbiased variance and standard deviation, that is why ddof=1
    "variance": (lambda X: np.var(X, axis=0, ddof=1), (2e-4, 1e-7)),
    "variation": (lambda X: variation(X), (1e-3, 1e-6)),
    "sum_squares": (lambda X: np.sum(np.square(X), axis=0), (2e-4, 1e-7)),
    "sum_squares_centered": (
        lambda X: np.sum(np.square(X - np.mean(X, axis=0)), axis=0),
        (1e-3, 1e-7),
    ),
    "standard_deviation": (lambda X: np.std(X, axis=0, ddof=1), (2e-3, 1e-7)),
    "second_order_raw_moment": (lambda X: np.mean(np.square(X), axis=0), (1e-6, 1e-7)),
}
41 changes: 36 additions & 5 deletions sklearnex/basic_statistics/basic_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@
import warnings

import numpy as np
from scipy.sparse import issparse
from sklearn.base import BaseEstimator
from sklearn.utils import check_array
from sklearn.utils.validation import _check_sample_weight

from daal4py.sklearn._n_jobs_support import control_n_jobs
from daal4py.sklearn._utils import sklearn_check_version
from daal4py.sklearn._utils import daal_check_version, sklearn_check_version
from onedal.basic_statistics import BasicStatistics as onedal_BasicStatistics
from onedal.utils import _is_csr

from .._device_offload import dispatch
from .._utils import IntelEstimator, PatchingConditionsChain
Expand Down Expand Up @@ -166,21 +168,50 @@ def __getattr__(self, attr):
f"'{self.__class__.__name__}' object has no attribute '{attr}'"
)

def _onedal_cpu_supported(self, method_name, *data):
    """CPU offloading: always supported, so the chain carries no conditions."""
    patching_status = PatchingConditionsChain(
        f"sklearnex.basic_statistics.{self.__class__.__name__}.{method_name}"
    )
    return patching_status
def _onedal_gpu_supported(self, method_name, *data):
    """GPU offloading: dense data always; CSR needs oneDAL >= 2025.2 and no weights."""
    patching_status = PatchingConditionsChain(
        f"sklearnex.basic_statistics.{self.__class__.__name__}.{method_name}"
    )
    X, sample_weight = data

    # CSR input requires oneDAL >= 2025.2; dense input is always supported.
    is_data_supported = (
        _is_csr(X) and daal_check_version((2025, "P", 200))
    ) or not issparse(X)

    # Weighted statistics are not implemented for sparse (CSR) data.
    is_sample_weight_supported = sample_weight is None or not issparse(X)

    patching_status.and_conditions(
        [
            (
                is_sample_weight_supported,
                "Sample weights are not supported for CSR data format",
            ),
            (
                is_data_supported,
                "Supported data formats: Dense, CSR (oneDAL version >= 2025.2.0).",
            ),
        ]
    )
    return patching_status

def _onedal_fit(self, X, sample_weight=None, queue=None):
if sklearn_check_version("1.2"):
self._validate_params()

if sklearn_check_version("1.0"):
X = validate_data(self, X, dtype=[np.float64, np.float32], ensure_2d=False)
X = validate_data(
self,
X,
dtype=[np.float64, np.float32],
ensure_2d=False,
accept_sparse="csr",
)
else:
X = check_array(X, dtype=[np.float64, np.float32])

Expand Down
171 changes: 155 additions & 16 deletions sklearnex/basic_statistics/tests/test_basic_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,30 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose
from scipy import sparse as sp

from daal4py.sklearn._utils import daal_check_version
from onedal.basic_statistics.tests.utils import options_and_tests
from onedal.tests.utils._dataframes_support import (
_convert_to_dataframe,
get_dataframes_and_queues,
get_queues,
)
from sklearnex import config_context
from sklearnex.basic_statistics import BasicStatistics
from sklearnex.tests.utils import gen_sparse_dataset


# Compute the basic statistics on sparse data on CPU or GPU depending on the queue
def compute_sparse_result(X_sparse, options, queue):
    """Fit BasicStatistics on sparse input, offloading to GPU when the queue targets one."""
    use_gpu = queue is not None and queue.sycl_device.is_gpu
    if use_gpu:
        with config_context(target_offload="gpu"):
            return BasicStatistics(result_options=options).fit(X_sparse)
    return BasicStatistics(result_options=options).fit(X_sparse)


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
Expand All @@ -41,19 +57,19 @@
expected_min = np.array([0, 0])
expected_max = np.array([1, 1])

assert_allclose(expected_mean, result.mean)
assert_allclose(expected_max, result.max)
assert_allclose(expected_min, result.min)
assert_allclose(expected_mean, result.mean_)
assert_allclose(expected_max, result.max_)
assert_allclose(expected_min, result.min_)

result = BasicStatistics().fit(X_df, sample_weight=weights_df)

expected_weighted_mean = np.array([0.25, 0.25])
expected_weighted_min = np.array([0, 0])
expected_weighted_max = np.array([0.5, 0.5])

assert_allclose(expected_weighted_mean, result.mean)
assert_allclose(expected_weighted_min, result.min)
assert_allclose(expected_weighted_max, result.max)
assert_allclose(expected_weighted_mean, result.mean_)
assert_allclose(expected_weighted_min, result.min_)
assert_allclose(expected_weighted_max, result.max_)


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
Expand All @@ -78,16 +94,16 @@
expected_weighted_mean = np.array([0.25, 0.25])
expected_weighted_min = np.array([0, 0])
expected_weighted_max = np.array([0.5, 0.5])
assert_allclose(expected_weighted_mean, result.mean)
assert_allclose(expected_weighted_max, result.max)
assert_allclose(expected_weighted_min, result.min)
assert_allclose(expected_weighted_mean, result.mean_)
assert_allclose(expected_weighted_max, result.max_)
assert_allclose(expected_weighted_min, result.min_)
else:
expected_mean = np.array([0.5, 0.5])
expected_min = np.array([0, 0])
expected_max = np.array([1, 1])
assert_allclose(expected_mean, result.mean)
assert_allclose(expected_max, result.max)
assert_allclose(expected_min, result.min)
assert_allclose(expected_mean, result.mean_)
assert_allclose(expected_max, result.max_)
assert_allclose(expected_min, result.min_)


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
Expand Down Expand Up @@ -117,7 +133,7 @@
else:
result = basicstat.fit(X_df)

res = getattr(result, result_option)
res = getattr(result, result_option + "_")
if weighted:
weighted_data = np.diag(weights) @ X
gtr = function(weighted_data)
Expand All @@ -128,6 +144,49 @@
assert_allclose(gtr, res, atol=tol)


@pytest.mark.parametrize("queue", get_queues())
@pytest.mark.parametrize("result_option", options_and_tests.keys())
@pytest.mark.parametrize("row_count", [500, 2000])
@pytest.mark.parametrize("column_count", [10, 100])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_single_option_on_random_sparse_data(
    queue, result_option, row_count, column_count, dtype
):
    """Check one BasicStatistics result option on random CSR data against NumPy."""
    if not daal_check_version((2025, "P", 200)) and result_option in [
        "max",
        "sum_squares",
    ]:
        pytest.skip(
            "There is a bug in 'max' and 'sum_squares' computations in oneDAL version < 2025.2"
        )

    function, tols = options_and_tests[result_option]
    fp32tol, fp64tol = tols
    seed = 77

    gen = np.random.default_rng(seed)

    X_sparse = gen_sparse_dataset(
        row_count,
        column_count,
        density=0.01,
        format="csr",
        dtype=dtype,
        random_state=gen,
    )

    # Dense copy serves as ground truth for the NumPy reference implementation.
    X_dense = X_sparse.toarray()

    result = compute_sparse_result(X_sparse, result_option, queue)

    res = getattr(result, result_option + "_")

    gtr = function(X_dense)

    tol = fp32tol if res.dtype == np.float32 else fp64tol
    assert_allclose(gtr, res, atol=tol)


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
@pytest.mark.parametrize("row_count", [100, 1000])
@pytest.mark.parametrize("column_count", [10, 100])
Expand All @@ -152,7 +211,7 @@
else:
result = basicstat.fit(X_df)

res_mean, res_max, res_sum = result.mean, result.max, result.sum
res_mean, res_max, res_sum = result.mean_, result.max_, result.sum_
if weighted:
weighted_data = np.diag(weights) @ X
gtr_mean, gtr_max, gtr_sum = (
Expand All @@ -173,6 +232,48 @@
assert_allclose(gtr_sum, res_sum, atol=tol)


@pytest.mark.parametrize("queue", get_queues())
@pytest.mark.parametrize("row_count", [100, 1000])
@pytest.mark.parametrize("column_count", [10, 100])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_multiple_options_on_random_sparse_data(queue, row_count, column_count, dtype):
    """Check a subset of BasicStatistics result options requested together on CSR data."""
    seed = 77

    gen = np.random.default_rng(seed)

    X_sparse = gen_sparse_dataset(
        row_count,
        column_count,
        density=0.05,
        format="csr",
        dtype=dtype,
        random_state=gen,
    )

    # Dense copy serves as ground truth for the NumPy reference implementation.
    X_dense = X_sparse.toarray()

    options = [
        "sum",
        "min",
        "mean",
        "standard_deviation",
        "variance",
        "second_order_raw_moment",
    ]

    result = compute_sparse_result(X_sparse, options, queue)

    # Iterate the requested options directly instead of filtering the full table.
    for result_option in options:
        function, tols = options_and_tests[result_option]
        fp32tol, fp64tol = tols
        res = getattr(result, result_option + "_")
        gtr = function(X_dense)
        tol = fp32tol if res.dtype == np.float32 else fp64tol
        assert_allclose(gtr, res, atol=tol)


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
@pytest.mark.parametrize("row_count", [100, 1000])
@pytest.mark.parametrize("column_count", [10, 100])
Expand Down Expand Up @@ -203,7 +304,7 @@
for result_option in options_and_tests:
function, tols = options_and_tests[result_option]
fp32tol, fp64tol = tols
res = getattr(result, result_option)
res = getattr(result, result_option + "_")
if weighted:
gtr = function(weighted_data)
else:
Expand All @@ -212,6 +313,44 @@
assert_allclose(gtr, res, atol=tol)


@pytest.mark.parametrize("queue", get_queues())
@pytest.mark.parametrize("row_count", [100, 1000])
@pytest.mark.parametrize("column_count", [10, 100])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_all_option_on_random_sparse_data(queue, row_count, column_count, dtype):
    """Check every BasicStatistics result option ('all') on random CSR data."""
    seed = 77

    gen = np.random.default_rng(seed)

    X_sparse = gen_sparse_dataset(
        row_count,
        column_count,
        density=0.05,
        format="csr",
        dtype=dtype,
        random_state=gen,
    )
    # Dense copy serves as ground truth for the NumPy reference implementation.
    X_dense = X_sparse.toarray()

    result = compute_sparse_result(X_sparse, "all", queue)

    for result_option in options_and_tests:
        if not daal_check_version((2025, "P", 200)) and result_option in [
            "max",
            "sum_squares",
        ]:
            # TODO: There is a bug in 'max' and 'sum_squares' computations in oneDAL version < 2025.2
            continue
        function, tols = options_and_tests[result_option]
        fp32tol, fp64tol = tols
        res = getattr(result, result_option + "_")

        gtr = function(X_dense)

        tol = fp32tol if res.dtype == np.float32 else fp64tol
        assert_allclose(gtr, res, atol=tol)


@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
@pytest.mark.parametrize("result_option", options_and_tests.keys())
@pytest.mark.parametrize("data_size", [100, 1000])
Expand All @@ -238,7 +377,7 @@
else:
result = basicstat.fit(X_df)

res = getattr(result, result_option)
res = getattr(result, result_option + "_")
if weighted:
weighted_data = weights * X
gtr = function(weighted_data)
Expand Down
Loading
Loading