From 95247e6e1fe7ab67a81d5348283810024136cb92 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Nov 2020 20:14:23 +0100 Subject: [PATCH 01/32] ENH make RandomUnderSampler accept dask array --- imblearn/dask/__init__.py | 0 imblearn/dask/_support.py | 13 ++++ imblearn/dask/tests/__init__.py | 0 imblearn/dask/tests/test_utils.py | 47 +++++++++++++ imblearn/dask/utils.py | 66 +++++++++++++++++++ .../_random_under_sampler.py | 57 +++++++++++----- imblearn/utils/_validation.py | 27 ++++++-- imblearn/utils/estimator_checks.py | 27 ++++++++ imblearn/utils/wrapper.py | 37 +++++++++++ 9 files changed, 252 insertions(+), 22 deletions(-) create mode 100644 imblearn/dask/__init__.py create mode 100644 imblearn/dask/_support.py create mode 100644 imblearn/dask/tests/__init__.py create mode 100644 imblearn/dask/tests/test_utils.py create mode 100644 imblearn/dask/utils.py create mode 100644 imblearn/utils/wrapper.py diff --git a/imblearn/dask/__init__.py b/imblearn/dask/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/imblearn/dask/_support.py b/imblearn/dask/_support.py new file mode 100644 index 000000000..b0b4153d2 --- /dev/null +++ b/imblearn/dask/_support.py @@ -0,0 +1,13 @@ +_REGISTERED_DASK_CONTAINER = [] + +try: + from dask import array, dataframe + _REGISTERED_DASK_CONTAINER += [ + array.Array, dataframe.Series, dataframe.DataFrame, + ] +except ImportError: + pass + + +def is_dask_container(container): + return isinstance(container, tuple(_REGISTERED_DASK_CONTAINER)) diff --git a/imblearn/dask/tests/__init__.py b/imblearn/dask/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/imblearn/dask/tests/test_utils.py b/imblearn/dask/tests/test_utils.py new file mode 100644 index 000000000..edf0665b6 --- /dev/null +++ b/imblearn/dask/tests/test_utils.py @@ -0,0 +1,47 @@ +import numpy as np +import pytest +from dask import array +from dask_ml.datasets import make_classification + +from imblearn.dask.utils import is_multilabel +from imblearn.dask.utils import type_of_target + + +def test_type_of_target_error(): + y = np.arange(10) + + err_msg = "Expected a Dask array, series or dataframe." + with pytest.raises(ValueError, match=err_msg): + type_of_target(y) + + +@pytest.mark.parametrize( + "y, expected_result", + [ + (array.from_array(np.array([0, 1, 0, 1])), False), + (array.from_array(np.array([[1, 0], [0, 0]])), True), + (array.from_array(np.array([[1], [0], [0]])), False), + (array.from_array(np.array([[1, 0, 0]])), True), + ] +) +def test_is_multilabel(y, expected_result): + assert is_multilabel(y) is expected_result + + +@pytest.mark.parametrize( + "y, expected_type_of_target", + [ + (array.from_array(np.array([[1, 0], [0, 0]])), "multilabel-indicator"), + (array.from_array(np.array([[1, 0, 0]])), "multilabel-indicator"), + (array.from_array(np.array([[[1, 2]]])), "unknown"), + (array.from_array(np.array([[]])), "unknown"), + (array.from_array(np.array([.1, .2, 3])), "continuous"), + (array.from_array(np.array([[.1, .2, 3]])), "continuous-multioutput"), + (array.from_array(np.array([[1., .2]])), "continuous-multioutput"), + (array.from_array(np.array([1, 2])), "binary"), + (array.from_array(np.array(["a", "b"])), "binary"), + ] +) +def test_type_of_target(y, expected_type_of_target): + target_type = type_of_target(y) + assert target_type == expected_type_of_target diff --git a/imblearn/dask/utils.py b/imblearn/dask/utils.py new file mode 100644 index 000000000..00f6128ba --- /dev/null +++ b/imblearn/dask/utils.py @@ -0,0 +1,66 @@ +import warnings + +from dask import dataframe +from dask import array +from sklearn.exceptions import DataConversionWarning +from sklearn.utils.multiclass import _is_integral_float + + +def is_multilabel(y): + if not (y.ndim == 2 and y.shape[1] > 1): + return False + + labels = array.unique(y).compute() + + return len(labels) < 3 and ( + y.dtype.kind in 'biu' or _is_integral_float(labels) + ) + + +def type_of_target(y): + if is_multilabel(y): + return 'multilabel-indicator' + + if y.ndim > 2: + return 'unknown' + + if y.ndim == 2 and y.shape[1] == 0: + return 'unknown' # [[]] + + if y.ndim == 2 and y.shape[1] > 1: + # [[1, 2], [1, 2]] + suffix = "-multioutput" + else: + # [1, 2, 3] or [[1], [2], [3]] + suffix = "" + + # check float and contains non-integer float values + if y.dtype.kind == 'f' and array.any(y != y.astype(int)): + # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] + # NOTE: we don't check for infinite values + return 'continuous' + suffix + + labels = array.unique(y).compute() + if (len((labels)) > 2) or (y.ndim >= 2 and len(y[0]) > 1): + # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] + return 'multiclass' + suffix + # [1, 2] or [["a"], ["b"]] + return 'binary' + + +def column_or_1d(y, *, warn=False): + shape = y.shape + if len(shape) == 1: + return y.ravel() + if len(shape) == 2 and shape[1] == 1: + if warn: + warnings.warn( + "A column-vector y was passed when a 1d array was expected. " + "Please change the shape of y to (n_samples, ), for example " + "using ravel().", DataConversionWarning, stacklevel=2 + ) + return y.ravel() + + raise ValueError( + f"y should be a 1d array. Got an array of shape {shape} instead." + ) diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index e34d4e73d..8d9745622 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -10,6 +10,7 @@ from sklearn.utils import _safe_indexing from ..base import BaseUnderSampler +from ...dask._support import is_dask_container from ...utils import check_target_type from ...utils import Substitution from ...utils._docstring import _random_state_docstring @@ -80,44 +81,66 @@ def __init__( self.replacement = replacement def _check_X_y(self, X, y): - y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X, y = self._validate_data( - X, y, reset=True, accept_sparse=["csr", "csc"], dtype=None, - force_all_finite=False, + y, binarize_y, self._uniques = check_target_type( + y, + indicate_one_vs_all=True, + return_unique=True, ) + if not any([is_dask_container(arr) for arr in (X, y)]): + X, y = self._validate_data( + X, + y, + reset=True, + accept_sparse=["csr", "csc"], + dtype=None, + force_all_finite=False, + ) return X, y, binarize_y + @staticmethod + def _find_target_class_indices(y, target_class): + target_class_indices = np.flatnonzero(y == target_class) + if is_dask_container(y): + return target_class_indices.compute() + return target_class_indices + def _fit_resample(self, X, y): random_state = check_random_state(self.random_state) - idx_under = np.empty((0,), dtype=int) + idx_under = [] - for target_class in np.unique(y): + for target_class in self._uniques: + target_class_indices = self._find_target_class_indices( + y, target_class + ) if target_class in self.sampling_strategy_.keys(): n_samples = self.sampling_strategy_[target_class] index_target_class = random_state.choice( - range(np.count_nonzero(y == target_class)), + target_class_indices.size, size=n_samples, replace=self.replacement, ) else: index_target_class = slice(None) - idx_under = np.concatenate( - ( - idx_under, - np.flatnonzero(y == target_class)[index_target_class], - ), - axis=0, - ) + selected_indices = target_class_indices[index_target_class] + idx_under.append(selected_indices) - self.sample_indices_ = idx_under + self.sample_indices_ = np.hstack(idx_under) + self.sample_indices_.sort() - return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under) + return ( + _safe_indexing(X, self.sample_indices_), + _safe_indexing(y, self.sample_indices_) + ) def _more_tags(self): return { - "X_types": ["2darray", "string"], + "X_types": [ + "2darray", + "string", + "dask-array", + ], "sample_indices": True, "allow_nan": True, } diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index fdc67619e..bc20267dd 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -14,10 +14,13 @@ from sklearn.base import clone from sklearn.neighbors._base import KNeighborsMixin from sklearn.neighbors import NearestNeighbors -from sklearn.utils import column_or_1d -from sklearn.utils.multiclass import type_of_target +from ..dask._support import is_dask_container from ..exceptions import raise_isinstance_error +from .wrapper import _is_multiclass_encoded +from .wrapper import column_or_1d +from .wrapper import type_of_target +from .wrapper import unique SAMPLING_KIND = ( "over-sampling", @@ -99,10 +102,12 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): def _count_class_sample(y): unique, counts = np.unique(y, return_counts=True) + if is_dask_container(unique): + unique, counts = unique.compute(), counts.compute() return dict(zip(unique, counts)) -def check_target_type(y, indicate_one_vs_all=False): +def check_target_type(y, indicate_one_vs_all=False, return_unique=False): """Check the target types to be conform to the current samplers. The current samplers should be compatible with ``'binary'``, @@ -116,18 +121,24 @@ def check_target_type(y, indicate_one_vs_all=False): indicate_one_vs_all : bool, default=False Either to indicate if the targets are encoded in a one-vs-all fashion. + return_unique : bool, default=False + Either to return or not the unique values in y. + Returns ------- y : ndarray The returned target. + y_unique : ndarray + The unique values in `y`. + is_one_vs_all : bool, optional Indicate if the target was originally encoded in a one-vs-all fashion. Only returned if ``indicate_multilabel=True``. """ type_y = type_of_target(y) if type_y == "multilabel-indicator": - if np.any(y.sum(axis=1) > 1): + if not _is_multiclass_encoded(y): raise ValueError( "Imbalanced-learn currently supports binary, multiclass and " "binarized encoded multiclasss targets. Multilabel and " @@ -137,7 +148,13 @@ def check_target_type(y, indicate_one_vs_all=False): else: y = column_or_1d(y) - return (y, type_y == "multilabel-indicator") if indicate_one_vs_all else y + output = [y] + if indicate_one_vs_all: + output += [type_y == "multilabel-indicator"] + if return_unique: + output += [unique(y)] + + return output def _sampling_strategy_all(y, sampling_type): diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 729ceebea..1494fd695 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -51,6 +51,7 @@ def _set_checking_parameters(estimator): def _yield_sampler_checks(sampler): + tags = sampler._get_tags() yield check_target_type yield check_samplers_one_label yield check_samplers_fit @@ -58,6 +59,8 @@ def _yield_sampler_checks(sampler): yield check_samplers_sampling_strategy_fit_resample yield check_samplers_sparse yield check_samplers_pandas + if "dask-array" in tags["X_types"]: + yield check_samplers_dask_array yield check_samplers_list yield check_samplers_multiclass_ova yield check_samplers_preserve_dtype @@ -290,6 +293,30 @@ def check_samplers_pandas(name, sampler): assert_allclose(y_res_s.to_numpy(), y_res) +def check_samplers_dask_array(name, sampler): + dask = pytest.importorskip("dask") + # Check that the samplers handle pandas dataframe and pandas series + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0, + ) + X_dask = dask.array.from_array(X, chunks=100) + y_dask = dask.array.from_array(y, chunks=100) + + X_res_dask, y_res_dask = sampler.fit_resample(X_dask, y_dask) + X_res, y_res = sampler.fit_resample(X, y) + + # check that we return the same type for dataframes or series types + assert isinstance(X_res_dask, dask.array.Array) + assert isinstance(y_res_dask, dask.array.Array) + + assert_allclose(X_res_dask, X_res) + assert_allclose(y_res_dask, y_res) + + def check_samplers_list(name, sampler): # Check that the can samplers handle simple lists X, y = make_classification( diff --git a/imblearn/utils/wrapper.py b/imblearn/utils/wrapper.py new file mode 100644 index 000000000..646dc1a64 --- /dev/null +++ b/imblearn/utils/wrapper.py @@ -0,0 +1,37 @@ +import numpy as np + +from sklearn.utils.multiclass import type_of_target as sklearn_type_of_target +from sklearn.utils.validation import column_or_1d as sklearn_column_or_1d + +from ..dask._support import is_dask_container + + +def type_of_target(y): + if is_dask_container(y): + from ..dask.utils import type_of_target as dask_type_of_target + + return dask_type_of_target(y) + return sklearn_type_of_target(y) + + +def _is_multiclass_encoded(y): + if is_dask_container(y): + from dask import array + + return array.all(y.sum(axis=1) == 1).compute() + return np.all(y.sum(axis=1) == 1) + + +def column_or_1d(y, *, warn=False): + if is_dask_container(y): + from ..dask.utils import column_or_1d as dask_column_or_1d + + return dask_column_or_1d(y, warn=warn) + return sklearn_column_or_1d(y, warn=warn) + + +def unique(*args, **kwargs): + output = np.unique(args, kwargs) + if is_dask_container(output): + return (arr.compute() for arr in output) + return output From ea30287c9638df632cb6aa20a5971312966ef3fc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Nov 2020 20:20:02 +0100 Subject: [PATCH 02/32] add dask to the install --- azure-pipelines.yml | 1 + build_tools/azure/install.sh | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1699a0d88..cb5b5c5b1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -35,6 +35,7 @@ jobs: PYTHON_VERSION: '3.8' COVERAGE: 'true' PANDAS_VERSION: '*' + DASK_VERSION: '*' TEST_DOCSTRINGS: 'true' JOBLIB_VERSION: '*' CHECK_WARNINGS: 'true' diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 79c5d5814..80c6ada01 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -40,6 +40,10 @@ if [[ "$DISTRIB" == "conda" ]]; then TO_INSTALL="$TO_INSTALL pandas=$PANDAS_VERSION" fi + if [[ -n "$DASK_VERSION" ]]; then + TO_INSTALL="$TO_INSTALL dask=$DASK_VERSION" + fi + if [[ -n "$KERAS_VERSION" ]]; then TO_INSTALL="$TO_INSTALL keras=$KERAS_VERSION tensorflow=1" KERAS_BACKEND=tensorflow From 0766964224dd5802584c0a5a3a6e909f50d8e62c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Nov 2020 20:30:51 +0100 Subject: [PATCH 03/32] PEP8 --- imblearn/dask/tests/test_utils.py | 1 - imblearn/dask/utils.py | 1 - 2 files changed, 2 deletions(-) diff --git a/imblearn/dask/tests/test_utils.py b/imblearn/dask/tests/test_utils.py index edf0665b6..524e87e1a 100644 --- a/imblearn/dask/tests/test_utils.py +++ b/imblearn/dask/tests/test_utils.py @@ -1,7 +1,6 @@ import numpy as np import pytest from dask import array -from dask_ml.datasets import make_classification from imblearn.dask.utils import is_multilabel from imblearn.dask.utils import type_of_target diff --git a/imblearn/dask/utils.py b/imblearn/dask/utils.py index 00f6128ba..5771120f4 100644 --- a/imblearn/dask/utils.py +++ b/imblearn/dask/utils.py @@ -1,6 +1,5 @@ import warnings -from dask import dataframe from dask import array from sklearn.exceptions import DataConversionWarning from sklearn.utils.multiclass import _is_integral_float From d9edb9ad591c803a99adcbc586fcb2892b78c3dd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Nov 2020 20:46:12 +0100 Subject: [PATCH 04/32] iter --- imblearn/dask/tests/test_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/imblearn/dask/tests/test_utils.py b/imblearn/dask/tests/test_utils.py index 524e87e1a..7787a0d20 100644 --- a/imblearn/dask/tests/test_utils.py +++ b/imblearn/dask/tests/test_utils.py @@ -1,5 +1,7 @@ import numpy as np import pytest + +dask = pytest.importorskip("dask") from dask import array from imblearn.dask.utils import is_multilabel @@ -18,7 +20,7 @@ def test_type_of_target_error(): "y, expected_result", [ (array.from_array(np.array([0, 1, 0, 1])), False), - (array.from_array(np.array([[1, 0], [0, 0]])), True), + (array(np.array([[1, 0], [0, 0]])), True), (array.from_array(np.array([[1], [0], [0]])), False), (array.from_array(np.array([[1, 0, 0]])), True), ] From 4960724378cc59ade14f27f34683f0f240162651 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Nov 2020 20:56:34 +0100 Subject: [PATCH 05/32] PEP8 --- imblearn/dask/tests/test_utils.py | 2 +- setup.cfg | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/imblearn/dask/tests/test_utils.py b/imblearn/dask/tests/test_utils.py index 7787a0d20..eaa4bcbc9 100644 --- a/imblearn/dask/tests/test_utils.py +++ b/imblearn/dask/tests/test_utils.py @@ -20,7 +20,7 @@ def test_type_of_target_error(): "y, expected_result", [ (array.from_array(np.array([0, 1, 0, 1])), False), - (array(np.array([[1, 0], [0, 0]])), True), + (array.from_array(np.array([[1, 0], [0, 0]])), True), (array.from_array(np.array([[1], [0], [0]])), False), (array.from_array(np.array([[1, 0, 0]])), True), ] diff --git a/setup.cfg b/setup.cfg index 1062c584c..ae0665223 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,3 +32,6 @@ addopts = filterwarnings = ignore:the matrix subclass:PendingDeprecationWarning +[flake8] +# Default flake8 3.5 ignored flags +ignore=E121,E123,E126,E226,E24,E704,W503,W504,E402 From 21524290a6b1bb01949a648ea2d7288b2bfdd294 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Nov 2020 21:26:38 +0100 Subject: [PATCH 06/32] iter --- conftest.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/conftest.py b/conftest.py index d3ff91025..7ca37b601 100644 --- a/conftest.py +++ b/conftest.py @@ -31,3 +31,8 @@ def pytest_runtest_setup(item): import tensorflow except ImportError: pytest.skip('The tensorflow package is not installed.') + elif "dask" in fname: + try: + import dask + except ImportError: + pytest.skip('The dask package is not installed.') From e5ce7a6821ed7ee79afc61352e8c8de14da64168 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Nov 2020 21:28:49 +0100 Subject: [PATCH 07/32] PEP8 --- conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conftest.py b/conftest.py index 7ca37b601..607b4936e 100644 --- a/conftest.py +++ b/conftest.py @@ -22,17 +22,17 @@ def pytest_runtest_setup(item): if (fname.endswith(os.path.join('keras', '_generator.py')) or fname.endswith('miscellaneous.rst')): try: - import keras + import keras # noqa except ImportError: pytest.skip('The keras package is not installed.') elif (fname.endswith(os.path.join('tensorflow', '_generator.py')) or fname.endswith('miscellaneous.rst')): try: - import tensorflow + import tensorflow # noqa except ImportError: pytest.skip('The tensorflow package is not installed.') elif "dask" in fname: try: - import dask + import dask # noqa except ImportError: pytest.skip('The dask package is not installed.') From b537a201714d11ce94fad1cb531f46aa2273c44b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Nov 2020 21:51:33 +0100 Subject: [PATCH 08/32] iter --- imblearn/utils/_validation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index bc20267dd..f3d6d695b 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -129,12 +129,12 @@ def check_target_type(y, indicate_one_vs_all=False, return_unique=False): y : ndarray The returned target. - y_unique : ndarray - The unique values in `y`. - is_one_vs_all : bool, optional Indicate if the target was originally encoded in a one-vs-all fashion. Only returned if ``indicate_multilabel=True``. + + y_unique : ndarray + The unique values in `y`. """ type_y = type_of_target(y) if type_y == "multilabel-indicator": @@ -154,7 +154,7 @@ def check_target_type(y, indicate_one_vs_all=False, return_unique=False): if return_unique: output += [unique(y)] - return output + return output[0] if len(output) == 1 else tuple(output) def _sampling_strategy_all(y, sampling_type): From f781be0a2905bbea32c3a261b434df59d82480af Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Nov 2020 22:03:18 +0100 Subject: [PATCH 09/32] iter --- imblearn/utils/testing.py | 2 +- setup.cfg | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/imblearn/utils/testing.py b/imblearn/utils/testing.py index b5dc79828..b779b6cc1 100644 --- a/imblearn/utils/testing.py +++ b/imblearn/utils/testing.py @@ -53,7 +53,7 @@ def is_abstract(c): return True all_classes = [] - modules_to_ignore = {"tests"} + modules_to_ignore = {"tests", "dask"} root = str(Path(__file__).parent.parent) # Ignore deprecation warnings triggered at import time and from walking # packages diff --git a/setup.cfg b/setup.cfg index ae0665223..0b7b5b1d2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -21,7 +21,7 @@ test = pytest [tool:pytest] doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS -addopts = +addopts = --ignore build_tools --ignore benchmarks --ignore doc @@ -29,7 +29,7 @@ addopts = --ignore maint_tools --doctest-modules -rs -filterwarnings = +filterwarnings = ignore:the matrix subclass:PendingDeprecationWarning [flake8] From fb3d6a4cffca376cf6e34812b952689e2e157bb4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Nov 2020 22:23:24 +0100 Subject: [PATCH 10/32] avoid import dask explicitely --- conftest.py | 4 +++- imblearn/dask/utils.py | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/conftest.py b/conftest.py index 607b4936e..8de685080 100644 --- a/conftest.py +++ b/conftest.py @@ -19,6 +19,7 @@ def pytest_runtest_setup(item): fname = item.fspath.strpath + print(item) if (fname.endswith(os.path.join('keras', '_generator.py')) or fname.endswith('miscellaneous.rst')): try: @@ -31,7 +32,8 @@ def pytest_runtest_setup(item): import tensorflow # noqa except ImportError: pytest.skip('The tensorflow package is not installed.') - elif "dask" in fname: + elif (fname.endswith(os.path.join("dask", "utils.py")) or + fname.endswith(os.path.join("dask", "_support.py"))): try: import dask # noqa except ImportError: diff --git a/imblearn/dask/utils.py b/imblearn/dask/utils.py index 5771120f4..80a65713f 100644 --- a/imblearn/dask/utils.py +++ b/imblearn/dask/utils.py @@ -1,6 +1,6 @@ import warnings -from dask import array +import numpy as np from sklearn.exceptions import DataConversionWarning from sklearn.utils.multiclass import _is_integral_float @@ -9,7 +9,7 @@ def is_multilabel(y): if not (y.ndim == 2 and y.shape[1] > 1): return False - labels = array.unique(y).compute() + labels = np.unique(y).compute() return len(labels) < 3 and ( y.dtype.kind in 'biu' or _is_integral_float(labels) @@ -34,12 +34,12 @@ def type_of_target(y): suffix = "" # check float and contains non-integer float values - if y.dtype.kind == 'f' and array.any(y != y.astype(int)): + if y.dtype.kind == 'f' and np.any(y != y.astype(int)): # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] # NOTE: we don't check for infinite values return 'continuous' + suffix - labels = array.unique(y).compute() + labels = np.unique(y).compute() if (len((labels)) > 2) or (y.ndim >= 2 and len(y[0]) > 1): # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] return 'multiclass' + suffix From b7d9f3b3e2b120a173a1477a0ce96ed9555491ed Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Nov 2020 22:58:34 +0100 Subject: [PATCH 11/32] TST remove redundant test --- conftest.py | 7 -- .../tests/test_random_under_sampler.py | 76 +++++-------------- 2 files changed, 21 insertions(+), 62 deletions(-) diff --git a/conftest.py b/conftest.py index 8de685080..72e6a23da 100644 --- a/conftest.py +++ b/conftest.py @@ -19,7 +19,6 @@ def pytest_runtest_setup(item): fname = item.fspath.strpath - print(item) if (fname.endswith(os.path.join('keras', '_generator.py')) or fname.endswith('miscellaneous.rst')): try: @@ -32,9 +31,3 @@ def pytest_runtest_setup(item): import tensorflow # noqa except ImportError: pytest.skip('The tensorflow package is not installed.') - elif (fname.endswith(os.path.join("dask", "utils.py")) or - fname.endswith(os.path.join("dask", "_support.py"))): - try: - import dask # noqa - except ImportError: - pytest.skip('The dask package is not installed.') diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py index 945d31fec..355273dc1 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py @@ -30,61 +30,27 @@ Y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) -@pytest.mark.parametrize("as_frame", [True, False], ids=['dataframe', 'array']) -def test_rus_fit_resample(as_frame): - if as_frame: - pd = pytest.importorskip("pandas") - X_ = pd.DataFrame(X) - else: - X_ = X - rus = RandomUnderSampler(random_state=RND_SEED, replacement=True) - X_resampled, y_resampled = rus.fit_resample(X_, Y) - - X_gt = np.array( - [ - [0.92923648, 0.76103773], - [0.47104475, 0.44386323], - [0.13347175, 0.12167502], - [0.09125309, -0.85409574], - [0.12372842, 0.6536186], - [0.04352327, -0.20515826], - ] - ) - y_gt = np.array([0, 0, 0, 1, 1, 1]) - - if as_frame: - assert hasattr(X_resampled, "loc") - X_resampled = X_resampled.to_numpy() - - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - - -def test_rus_fit_resample_half(): - sampling_strategy = {0: 3, 1: 6} - rus = RandomUnderSampler( - sampling_strategy=sampling_strategy, - random_state=RND_SEED, - replacement=True, - ) - X_resampled, y_resampled = rus.fit_resample(X, Y) - - X_gt = np.array( - [ - [0.92923648, 0.76103773], - [0.47104475, 0.44386323], - [0.92923648, 0.76103773], - [0.15490546, 0.3130677], - [0.15490546, 0.3130677], - [0.15490546, 0.3130677], - [0.20792588, 1.49407907], - [0.15490546, 0.3130677], - [0.12372842, 0.6536186], - ] - ) - y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1]) - assert_array_equal(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) +@pytest.mark.parametrize( + "sampling_strategy, expected_counts", + [ + ("auto", {0: 3, 1: 3}), + ({0: 3, 1: 6}, {0: 3, 1: 6}), + ] +) +def test_rus_fit_resample(sampling_strategy, expected_counts): + rus = RandomUnderSampler(sampling_strategy=sampling_strategy) + X_res, y_res = rus.fit_resample(X, Y) + + # check that there is not samples from class 0 resampled as class 1 and + # vice-versa + classes = [0, 1] + for c0, c1 in (classes, classes[::-1]): + X_c0 = X[Y == c0] + X_c1 = X_res[y_res == c1] + for s0 in X_c0: + assert not np.isclose(s0, X_c1).all(axis=1).any() + + assert Counter(y_res) == expected_counts def test_multiclass_fit_resample(): From d26da3cd7980bf98511ac4daf35257d8468a0f85 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Nov 2020 23:46:23 +0100 Subject: [PATCH 12/32] iter --- azure-pipelines.yml | 2 ++ build_tools/azure/install.sh | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cb5b5c5b1..3da97175e 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -44,6 +44,7 @@ jobs: PYTHON_VERSION: '3.7' INSTALL_MKL: 'true' PANDAS_VERSION: '*' + DASK_VERSION: '*' KERAS_VERSION: '*' COVERAGE: 'true' JOBLIB_VERSION: '*' @@ -52,6 +53,7 @@ jobs: DISTRIB: 'conda' PYTHON_VERSION: '3.8' PANDAS_VERSION: '*' + DASK_VERSION: '*' JOBLIB_VERSION: '*' INSTALL_MKL: 'true' TENSORFLOW_VERSION: '*' diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 80c6ada01..d4d7a3692 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -94,9 +94,10 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then make_conda "python=$PYTHON_VERSION" python -m pip install -U pip python -m pip install numpy scipy joblib cython + python -m pip install pandas==$PANDAS_VERSION + python -m pip install dask==$DASK_VERSION python -m pip install scikit-learn python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist - python -m pip install pandas fi if [[ "$COVERAGE" == "true" ]]; then From c065808361bc393bb0707b8e4e75700b1dbfed02 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Nov 2020 23:51:34 +0100 Subject: [PATCH 13/32] xxx --- build_tools/azure/install.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index d4d7a3692..ed7ac29a9 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -94,8 +94,8 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then make_conda "python=$PYTHON_VERSION" python -m pip install -U pip python -m pip install numpy scipy joblib cython - python -m pip install pandas==$PANDAS_VERSION - python -m pip install dask==$DASK_VERSION + python -m pip install pandas + python -m pip install dask python -m pip install scikit-learn python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist fi From f2d0ec0b1857eea9fce464c318404ad13ba684d3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 5 Nov 2020 23:57:05 +0100 Subject: [PATCH 14/32] install complete dask --- build_tools/azure/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index ed7ac29a9..dba7754a6 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -95,7 +95,7 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then python -m pip install -U pip python -m pip install numpy scipy joblib cython python -m pip install pandas - python -m pip install dask + python -m pip install "dask[complete]" python -m pip install scikit-learn python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist fi From 20ba9348a24000e1d2916674397a3173bffc2739 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Nov 2020 00:06:23 +0100 Subject: [PATCH 15/32] iter --- imblearn/dask/tests/test_utils.py | 8 ----- .../_random_under_sampler.py | 1 + imblearn/utils/estimator_checks.py | 35 ++++++++++++++++++- 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/imblearn/dask/tests/test_utils.py b/imblearn/dask/tests/test_utils.py index eaa4bcbc9..0a262a435 100644 --- a/imblearn/dask/tests/test_utils.py +++ b/imblearn/dask/tests/test_utils.py @@ -8,14 +8,6 @@ from imblearn.dask.utils import type_of_target -def test_type_of_target_error(): - y = np.arange(10) - - err_msg = "Expected a Dask array, series or dataframe." - with pytest.raises(ValueError, match=err_msg): - type_of_target(y) - - @pytest.mark.parametrize( "y, expected_result", [ diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index 8d9745622..9bc807ea2 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -140,6 +140,7 @@ def _more_tags(self): "2darray", "string", "dask-array", + "dask-dataframe" ], "sample_indices": True, "allow_nan": True, diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 1494fd695..b04cc388c 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -61,6 +61,8 @@ def _yield_sampler_checks(sampler): yield check_samplers_pandas if "dask-array" in tags["X_types"]: yield check_samplers_dask_array + if "dask-dataframe" in tags["X_types"]: + yield check_samplers_dask_dataframe yield check_samplers_list yield check_samplers_multiclass_ova yield check_samplers_preserve_dtype @@ -295,7 +297,7 @@ def check_samplers_pandas(name, sampler): def check_samplers_dask_array(name, sampler): dask = pytest.importorskip("dask") - # Check that the samplers handle pandas dataframe and pandas series + # Check that the samplers handle dask array X, y = make_classification( n_samples=1000, n_classes=3, @@ -317,6 +319,37 @@ def check_samplers_dask_array(name, sampler): assert_allclose(y_res_dask, y_res) +def check_samplers_dask_dataframe(name, sampler): + dask = pytest.importorskip("dask") + # Check that the samplers handle dask dataframe and dask series + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0, + ) + X_df = dask.dataframe.from_array( + X, columns=[str(i) for i in range(X.shape[1])] + ) + y_s = dask.dataframe.from_array(y) + + X_res_df, y_res_s = sampler.fit_resample(X_df, y_s) + X_res, y_res = sampler.fit_resample(X, y) + + # check that we return the same type for dataframes or series types + assert isinstance(X_res_df, dask.dataframe.DataFrame) + assert isinstance(y_res_s, dask.dataframe.Series) + + # assert X_df.columns.to_list() == X_res_df.columns.to_list() + # assert y_df.columns.to_list() == y_res_df.columns.to_list() + # assert y_s.name == y_res_s.name + + # assert_allclose(X_res_df.to_numpy(), X_res) + # assert_allclose(y_res_df.to_numpy().ravel(), y_res) + # assert_allclose(y_res_s.to_numpy(), y_res) + + def check_samplers_list(name, sampler): # Check that the can samplers handle simple lists X, y = make_classification( From 0941a5e7eedb8fffce5646df150334c1c1e877c5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Nov 2020 00:06:49 +0100 Subject: [PATCH 16/32] iter --- imblearn/ensemble/tests/test_weight_boosting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/ensemble/tests/test_weight_boosting.py b/imblearn/ensemble/tests/test_weight_boosting.py index 26facce90..517f61f40 100644 --- a/imblearn/ensemble/tests/test_weight_boosting.py +++ b/imblearn/ensemble/tests/test_weight_boosting.py @@ -77,7 +77,7 @@ def test_rusboost(imbalanced_dataset, algorithm): assert rusboost.decision_function(X_test).shape[1] == len(classes) score = rusboost.score(X_test, y_test) - assert score > 0.7, "Failed with algorithm {} and score {}".format( + assert score > 0.65, "Failed with algorithm {} and score {}".format( algorithm, score ) From 7aae9d924a5bc2759c80cc2dff200ea227aaf9db Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Nov 2020 00:07:32 +0100 Subject: [PATCH 17/32] iter --- .../_prototype_selection/_random_under_sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index 9bc807ea2..c9da1d28a 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -140,7 +140,7 @@ def _more_tags(self): "2darray", "string", "dask-array", - "dask-dataframe" + # "dask-dataframe" ], "sample_indices": True, "allow_nan": True, From 00c0a265f1930becf6a332195629dc9a2b917757 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Nov 2020 11:23:43 +0100 Subject: [PATCH 18/32] iter --- imblearn/base.py | 3 +- imblearn/dask/utils.py | 17 +++++++++-- .../_random_under_sampler.py | 8 +++++- imblearn/utils/_validation.py | 28 +++++++++++++++++-- imblearn/utils/estimator_checks.py | 11 ++++---- imblearn/utils/wrapper.py | 24 ++++++++++++---- 6 files changed, 74 insertions(+), 17 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 86bb53778..6a829e3e4 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -10,11 +10,11 @@ from sklearn.base import BaseEstimator from sklearn.preprocessing import label_binarize -from sklearn.utils.multiclass import check_classification_targets from .utils import check_sampling_strategy, check_target_type from .utils._validation import ArraysTransformer from .utils._validation import _deprecate_positional_args +from .utils.wrapper import check_classification_targets class SamplerMixin(BaseEstimator, metaclass=ABCMeta): @@ -82,6 +82,7 @@ def fit_resample(self, X, y): output = self._fit_resample(X, y) + # TODO: label binarize is not implemented with dask y_ = (label_binarize(output[1], np.unique(y)) if binarize_y else output[1]) diff --git a/imblearn/dask/utils.py b/imblearn/dask/utils.py index 80a65713f..814f9ce81 100644 --- a/imblearn/dask/utils.py +++ b/imblearn/dask/utils.py @@ -9,7 +9,10 @@ def is_multilabel(y): if not (y.ndim == 2 and y.shape[1] > 1): return False - labels = np.unique(y).compute() + if hasattr(y, "unique"): + labels = np.asarray(y.unique()) + else: + labels = np.unique(y).compute() return len(labels) < 3 and ( y.dtype.kind in 'biu' or _is_integral_float(labels) @@ -39,7 +42,10 @@ def type_of_target(y): # NOTE: we don't check for infinite values return 'continuous' + suffix - labels = np.unique(y).compute() + if hasattr(y, "unique"): + labels = np.asarray(y.unique()) + else: + labels = np.unique(y).compute() if (len((labels)) > 2) or (y.ndim >= 2 and len(y[0]) > 1): # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] return 'multiclass' + suffix @@ -63,3 +69,10 @@ def column_or_1d(y, *, warn=False): raise ValueError( f"y should be a 1d array. Got an array of shape {shape} instead." ) + + +def check_classification_targets(y): + y_type = type_of_target(y) + if y_type not in ['binary', 'multiclass', 'multiclass-multioutput', + 'multilabel-indicator', 'multilabel-sequences']: + raise ValueError("Unknown label type: %r" % y_type) diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index c9da1d28a..ab92071b6 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -81,6 +81,9 @@ def __init__( self.replacement = replacement def _check_X_y(self, X, y): + if is_dask_container(y) and hasattr(y, "to_dask_array"): + y = y.to_dask_array() + y.compute_chunk_sizes() y, binarize_y, self._uniques = check_target_type( y, indicate_one_vs_all=True, @@ -95,6 +98,9 @@ def _check_X_y(self, X, y): dtype=None, force_all_finite=False, ) + elif is_dask_container(X) and hasattr(X, "to_dask_array"): + X = X.to_dask_array() + X.compute_chunk_sizes() return X, y, binarize_y @staticmethod @@ -140,7 +146,7 @@ def _more_tags(self): "2darray", "string", "dask-array", - # "dask-dataframe" + "dask-dataframe" ], "sample_indices": True, "allow_nan": True, diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index f3d6d695b..f4875bef3 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -47,6 +47,9 @@ def transform(self, X, y): def _gets_props(self, array): props = {} props["type"] = array.__class__.__name__ + if props["type"].lower() in ("series", "dataframe"): + suffix = "dask-" if is_dask_container(array) else "pandas-" + props["type"] = suffix + props["type"] props["columns"] = getattr(array, "columns", None) props["name"] = getattr(array, "name", None) props["dtypes"] = getattr(array, "dtypes", None) @@ -56,13 +59,34 @@ def _transfrom_one(self, array, props): type_ = props["type"].lower() if type_ == "list": ret = array.tolist() - elif type_ == "dataframe": + elif type_ == "pandas-dataframe": import pandas as pd + ret = pd.DataFrame(array, columns=props["columns"]) ret = ret.astype(props["dtypes"]) - elif type_ == "series": + elif type_ == "pandas-series": import pandas as pd + ret = pd.Series(array, dtype=props["dtypes"], name=props["name"]) + elif type_ == "dask-dataframe": + from dask import dataframe + + if is_dask_container(array): + ret = dataframe.from_dask_array( + array, columns=props["columns"] + ) + else: + ret = dataframe.from_array(array, columns=props["columns"]) + ret = ret.astype(props["dtypes"]) + elif type_ == "dask-series": + from dask import dataframe + + if is_dask_container(array): + ret = dataframe.from_dask_array(array) + else: + ret = dataframe.from_array(array) + ret = ret.astype(props["dtypes"]) + ret = ret.rename(props["name"]) else: ret = array return ret diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index b04cc388c..b4dcbc904 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -333,6 +333,7 @@ def check_samplers_dask_dataframe(name, sampler): X, columns=[str(i) for i in range(X.shape[1])] ) y_s = dask.dataframe.from_array(y) + y_s = y_s.rename("target") X_res_df, y_res_s = sampler.fit_resample(X_df, y_s) X_res, y_res = sampler.fit_resample(X, y) @@ -341,13 +342,11 @@ def check_samplers_dask_dataframe(name, sampler): assert isinstance(X_res_df, dask.dataframe.DataFrame) assert isinstance(y_res_s, dask.dataframe.Series) - # assert X_df.columns.to_list() == X_res_df.columns.to_list() - # assert y_df.columns.to_list() == y_res_df.columns.to_list() - # assert y_s.name == y_res_s.name + assert X_df.columns.to_list() == X_res_df.columns.to_list() + assert y_s.name == y_res_s.name - # assert_allclose(X_res_df.to_numpy(), X_res) - # assert_allclose(y_res_df.to_numpy().ravel(), y_res) - # assert_allclose(y_res_s.to_numpy(), y_res) + assert_allclose(np.array(X_res_df), X_res) + assert_allclose(np.array(y_res_s), y_res) def check_samplers_list(name, sampler): diff --git a/imblearn/utils/wrapper.py b/imblearn/utils/wrapper.py index 646dc1a64..7dbfa3b1e 100644 --- a/imblearn/utils/wrapper.py +++ b/imblearn/utils/wrapper.py @@ -1,5 +1,7 @@ import numpy as np +from sklearn.utils.multiclass import check_classification_targets as \ + sklearn_check_classification_targets from sklearn.utils.multiclass import type_of_target as sklearn_type_of_target from sklearn.utils.validation import column_or_1d as sklearn_column_or_1d @@ -30,8 +32,20 @@ def column_or_1d(y, *, warn=False): return sklearn_column_or_1d(y, warn=warn) -def unique(*args, **kwargs): - output = np.unique(args, kwargs) - if is_dask_container(output): - return (arr.compute() for arr in output) - return output +def unique(arr, **kwargs): + if is_dask_container(arr): + if hasattr(arr, "unique"): + output = np.asarray(arr.unique(**kwargs)) + else: + output = np.unique(arr).compute() + return output + return np.unique(arr, **kwargs) + + +def check_classification_targets(y): + if is_dask_container(y): + from ..dask.utils import check_classification_targets as \ + dask_check_classification_targets + + return dask_check_classification_targets(y) + return sklearn_check_classification_targets(y) From 8bfa040a6aefc39979bd6d46f57f46f3f15f2473 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Nov 2020 11:30:42 +0100 Subject: [PATCH 19/32] requirements --- requirements.optional.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.optional.txt b/requirements.optional.txt index 826277d5e..f785df2ff 100644 --- a/requirements.optional.txt +++ b/requirements.optional.txt @@ -1,2 +1,3 @@ +dask[complete] keras tensorflow From d4aabf80da7b8e2720c8804a9e3a4dfc837cbe87 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 7 Nov 2020 23:06:34 +0100 Subject: [PATCH 20/32] iter --- imblearn/base.py | 36 +++- imblearn/dask/_support.py | 20 +-- .../_random_under_sampler.py | 44 +++-- imblearn/utils/_docstring.py | 7 + imblearn/utils/_validation.py | 167 +++++++++--------- imblearn/utils/estimator_checks.py | 18 +- imblearn/utils/wrapper.py | 12 +- 7 files changed, 168 insertions(+), 136 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 6a829e3e4..38f4259a4 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -11,9 +11,13 @@ from sklearn.base import BaseEstimator from sklearn.preprocessing import label_binarize +from .dask._support import is_dask_collection from .utils import check_sampling_strategy, check_target_type -from .utils._validation import ArraysTransformer -from .utils._validation import _deprecate_positional_args +from .utils._validation import ( + ArraysTransformer, + _deprecate_positional_args, + get_classes_counts, +) from .utils.wrapper import check_classification_targets @@ -45,9 +49,13 @@ def fit(self, X, y): self : object Return the instance itself. """ - X, y, _ = self._check_X_y(X, y) + dask_collection = any([is_dask_collection(arr) for arr in (X, y)]) + if (not dask_collection or + (dask_collection and self.validate_if_dask_collection)): + X, y, _ = self._check_X_y(X, y) + self._classes_counts = get_classes_counts(y) self.sampling_strategy_ = check_sampling_strategy( - self.sampling_strategy, y, self._sampling_type + self.sampling_strategy, self._classes_counts, self._sampling_type ) return self @@ -72,12 +80,19 @@ def fit_resample(self, X, y): y_resampled : array-like of shape (n_samples_new,) The corresponding label of `X_resampled`. """ - check_classification_targets(y) arrays_transformer = ArraysTransformer(X, y) - X, y, binarize_y = self._check_X_y(X, y) + dask_collection = any([is_dask_collection(arr) for arr in (X, y)]) + if (not dask_collection or + (dask_collection and self.validate_if_dask_collection)): + check_classification_targets(y) + X, y, binarize_y = self._check_X_y(X, y) + else: + X, y = arrays_transformer.to_dask_array(X, y) + binarize_y = False + self._classes_counts = get_classes_counts(y) self.sampling_strategy_ = check_sampling_strategy( - self.sampling_strategy, y, self._sampling_type + self.sampling_strategy, self._classes_counts, self._sampling_type ) output = self._fit_resample(X, y) @@ -125,8 +140,13 @@ class BaseSampler(SamplerMixin): instead. """ - def __init__(self, sampling_strategy="auto"): + def __init__( + self, + sampling_strategy="auto", + validate_if_dask_collection=False, + ): self.sampling_strategy = sampling_strategy + self.validate_if_dask_collection = validate_if_dask_collection def _check_X_y(self, X, y, accept_sparse=None): if accept_sparse is None: diff --git a/imblearn/dask/_support.py b/imblearn/dask/_support.py index b0b4153d2..b5239ccac 100644 --- a/imblearn/dask/_support.py +++ b/imblearn/dask/_support.py @@ -1,13 +1,9 @@ -_REGISTERED_DASK_CONTAINER = [] +def is_dask_collection(container): + try: + # to keep dask as an optional depency, keep the statement in a + # try/except statement + from dask import is_dask_collection -try: - from dask import array, dataframe - _REGISTERED_DASK_CONTAINER += [ - array.Array, dataframe.Series, dataframe.DataFrame, - ] -except ImportError: - pass - - -def is_dask_container(container): - return isinstance(container, tuple(_REGISTERED_DASK_CONTAINER)) + return is_dask_collection(container) + except ImportError: + return False diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index ab92071b6..28ef02d88 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -10,16 +10,20 @@ from sklearn.utils import _safe_indexing from ..base import BaseUnderSampler -from ...dask._support import is_dask_container +from ...dask._support import is_dask_collection from ...utils import check_target_type from ...utils import Substitution -from ...utils._docstring import _random_state_docstring +from ...utils._docstring import ( + _random_state_docstring, + _validate_if_dask_collection_docstring +) from ...utils._validation import _deprecate_positional_args @Substitution( sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, random_state=_random_state_docstring, + validate_if_dask_collection=_validate_if_dask_collection_docstring, ) class RandomUnderSampler(BaseUnderSampler): """Class to perform random under-sampling. @@ -38,6 +42,8 @@ class RandomUnderSampler(BaseUnderSampler): replacement : bool, default=False Whether the sample is with or without replacement. + {validate_if_dask_collection} + Attributes ---------- sample_indices_ : ndarray of shape (n_new_samples,) @@ -74,22 +80,23 @@ class RandomUnderSampler(BaseUnderSampler): @_deprecate_positional_args def __init__( - self, *, sampling_strategy="auto", random_state=None, replacement=False + self, + *, + sampling_strategy="auto", + random_state=None, + replacement=False, + validate_if_dask_collection=False, ): - super().__init__(sampling_strategy=sampling_strategy) + super().__init__( + sampling_strategy=sampling_strategy, + validate_if_dask_collection=validate_if_dask_collection, + ) self.random_state = random_state self.replacement = replacement def _check_X_y(self, X, y): - if is_dask_container(y) and hasattr(y, "to_dask_array"): - y = y.to_dask_array() - y.compute_chunk_sizes() - y, binarize_y, self._uniques = check_target_type( - y, - indicate_one_vs_all=True, - return_unique=True, - ) - if not any([is_dask_container(arr) for arr in (X, y)]): + y, binarize_y = check_target_type(y, indicate_one_vs_all=True) + if not any([is_dask_collection(arr) for arr in (X, y)]): X, y = self._validate_data( X, y, @@ -98,16 +105,15 @@ def _check_X_y(self, X, y): dtype=None, force_all_finite=False, ) - elif is_dask_container(X) and hasattr(X, "to_dask_array"): - X = X.to_dask_array() - X.compute_chunk_sizes() return X, y, binarize_y @staticmethod def _find_target_class_indices(y, target_class): target_class_indices = np.flatnonzero(y == target_class) - if is_dask_container(y): - return target_class_indices.compute() + if is_dask_collection(y): + from dask import compute + + return compute(target_class_indices)[0] return target_class_indices def _fit_resample(self, X, y): @@ -115,7 +121,7 @@ def _fit_resample(self, X, y): idx_under = [] - for target_class in self._uniques: + for target_class in self._classes_counts: target_class_indices = self._find_target_class_indices( y, target_class ) diff --git a/imblearn/utils/_docstring.py b/imblearn/utils/_docstring.py index d03be3740..87907d73e 100644 --- a/imblearn/utils/_docstring.py +++ b/imblearn/utils/_docstring.py @@ -41,3 +41,10 @@ def __call__(self, obj): `Glossary `_ for more details. """.rstrip() + +_validate_if_dask_collection_docstring = \ + """validate_if_dask_collection : bool, default=False + Whether or not `X` and `y` should be validated. This parameter applies + only when `X` and `y` are Dask collections where validation might be + potentially costly. + """.rstrip() \ No newline at end of file diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index f4875bef3..6b3936836 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -15,7 +15,7 @@ from sklearn.neighbors._base import KNeighborsMixin from sklearn.neighbors import NearestNeighbors -from ..dask._support import is_dask_container +from ..dask._support import is_dask_collection from ..exceptions import raise_isinstance_error from .wrapper import _is_multiclass_encoded from .wrapper import column_or_1d @@ -39,6 +39,16 @@ def __init__(self, X, y): self.x_props = self._gets_props(X) self.y_props = self._gets_props(y) + @staticmethod + def to_dask_array(X, y): + if hasattr(X, "to_dask_array"): + X = X.to_dask_array() + X.compute_chunk_sizes() + if hasattr(y, "to_dask_array"): + y = y.to_dask_array() + y.compute_chunk_sizes() + return X, y + def transform(self, X, y): X = self._transfrom_one(X, self.x_props) y = self._transfrom_one(y, self.y_props) @@ -48,7 +58,7 @@ def _gets_props(self, array): props = {} props["type"] = array.__class__.__name__ if props["type"].lower() in ("series", "dataframe"): - suffix = "dask-" if is_dask_container(array) else "pandas-" + suffix = "dask-" if is_dask_collection(array) else "pandas-" props["type"] = suffix + props["type"] props["columns"] = getattr(array, "columns", None) props["name"] = getattr(array, "name", None) @@ -71,7 +81,7 @@ def _transfrom_one(self, array, props): elif type_ == "dask-dataframe": from dask import dataframe - if is_dask_container(array): + if is_dask_collection(array): ret = dataframe.from_dask_array( array, columns=props["columns"] ) @@ -81,7 +91,7 @@ def _transfrom_one(self, array, props): elif type_ == "dask-series": from dask import dataframe - if is_dask_container(array): + if is_dask_collection(array): ret = dataframe.from_dask_array(array) else: ret = dataframe.from_array(array) @@ -124,14 +134,16 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): raise_isinstance_error(nn_name, [int, KNeighborsMixin], nn_object) -def _count_class_sample(y): +def get_classes_counts(y): unique, counts = np.unique(y, return_counts=True) - if is_dask_container(unique): - unique, counts = unique.compute(), counts.compute() + if is_dask_collection(unique): + from dask import compute + + unique, counts = compute(unique, counts) return dict(zip(unique, counts)) -def check_target_type(y, indicate_one_vs_all=False, return_unique=False): +def check_target_type(y, indicate_one_vs_all=False): """Check the target types to be conform to the current samplers. The current samplers should be compatible with ``'binary'``, @@ -145,9 +157,6 @@ def check_target_type(y, indicate_one_vs_all=False, return_unique=False): indicate_one_vs_all : bool, default=False Either to indicate if the targets are encoded in a one-vs-all fashion. - return_unique : bool, default=False - Either to return or not the unique values in y. - Returns ------- y : ndarray @@ -175,27 +184,24 @@ def check_target_type(y, indicate_one_vs_all=False, return_unique=False): output = [y] if indicate_one_vs_all: output += [type_y == "multilabel-indicator"] - if return_unique: - output += [unique(y)] return output[0] if len(output) == 1 else tuple(output) -def _sampling_strategy_all(y, sampling_type): +def _sampling_strategy_all(classes_counts, sampling_type): """Returns sampling target by targeting all classes.""" - target_stats = _count_class_sample(y) if sampling_type == "over-sampling": - n_sample_majority = max(target_stats.values()) + n_sample_majority = max(classes_counts.values()) sampling_strategy = { key: n_sample_majority - value - for (key, value) in target_stats.items() + for (key, value) in classes_counts.items() } elif ( sampling_type == "under-sampling" or sampling_type == "clean-sampling" ): - n_sample_minority = min(target_stats.values()) + n_sample_minority = min(classes_counts.values()) sampling_strategy = { - key: n_sample_minority for key in target_stats.keys() + key: n_sample_minority for key in classes_counts.keys() } else: raise NotImplementedError @@ -203,7 +209,7 @@ def _sampling_strategy_all(y, sampling_type): return sampling_strategy -def _sampling_strategy_majority(y, sampling_type): +def _sampling_strategy_majority(classes_counts, sampling_type): """Returns sampling target by targeting the majority class only.""" if sampling_type == "over-sampling": raise ValueError( @@ -213,12 +219,11 @@ def _sampling_strategy_majority(y, sampling_type): elif ( sampling_type == "under-sampling" or sampling_type == "clean-sampling" ): - target_stats = _count_class_sample(y) - class_majority = max(target_stats, key=target_stats.get) - n_sample_minority = min(target_stats.values()) + class_majority = max(classes_counts, key=classes_counts.get) + n_sample_minority = min(classes_counts.values()) sampling_strategy = { key: n_sample_minority - for key in target_stats.keys() + for key in classes_counts.keys() if key == class_majority } else: @@ -227,26 +232,25 @@ def _sampling_strategy_majority(y, sampling_type): return sampling_strategy -def _sampling_strategy_not_majority(y, sampling_type): +def _sampling_strategy_not_majority(classes_counts, sampling_type): """Returns sampling target by targeting all classes but not the majority.""" - target_stats = _count_class_sample(y) if sampling_type == "over-sampling": - n_sample_majority = max(target_stats.values()) - class_majority = max(target_stats, key=target_stats.get) + n_sample_majority = max(classes_counts.values()) + class_majority = max(classes_counts, key=classes_counts.get) sampling_strategy = { key: n_sample_majority - value - for (key, value) in target_stats.items() + for (key, value) in classes_counts.items() if key != class_majority } elif ( sampling_type == "under-sampling" or sampling_type == "clean-sampling" ): - n_sample_minority = min(target_stats.values()) - class_majority = max(target_stats, key=target_stats.get) + n_sample_minority = min(classes_counts.values()) + class_majority = max(classes_counts, key=classes_counts.get) sampling_strategy = { key: n_sample_minority - for key in target_stats.keys() + for key in classes_counts.keys() if key != class_majority } else: @@ -255,26 +259,25 @@ def _sampling_strategy_not_majority(y, sampling_type): return sampling_strategy -def _sampling_strategy_not_minority(y, sampling_type): +def _sampling_strategy_not_minority(classes_counts, sampling_type): """Returns sampling target by targeting all classes but not the minority.""" - target_stats = _count_class_sample(y) if sampling_type == "over-sampling": - n_sample_majority = max(target_stats.values()) - class_minority = min(target_stats, key=target_stats.get) + n_sample_majority = max(classes_counts.values()) + class_minority = min(classes_counts, key=classes_counts.get) sampling_strategy = { key: n_sample_majority - value - for (key, value) in target_stats.items() + for (key, value) in classes_counts.items() if key != class_minority } elif ( sampling_type == "under-sampling" or sampling_type == "clean-sampling" ): - n_sample_minority = min(target_stats.values()) - class_minority = min(target_stats, key=target_stats.get) + n_sample_minority = min(classes_counts.values()) + class_minority = min(classes_counts, key=classes_counts.get) sampling_strategy = { key: n_sample_minority - for key in target_stats.keys() + for key in classes_counts.keys() if key != class_minority } else: @@ -283,15 +286,14 @@ def _sampling_strategy_not_minority(y, sampling_type): return sampling_strategy -def _sampling_strategy_minority(y, sampling_type): +def _sampling_strategy_minority(classes_counts, sampling_type): """Returns sampling target by targeting the minority class only.""" - target_stats = _count_class_sample(y) if sampling_type == "over-sampling": - n_sample_majority = max(target_stats.values()) - class_minority = min(target_stats, key=target_stats.get) + n_sample_majority = max(classes_counts.values()) + class_minority = min(classes_counts, key=classes_counts.get) sampling_strategy = { key: n_sample_majority - value - for (key, value) in target_stats.items() + for (key, value) in classes_counts.items() if key == class_minority } elif ( @@ -307,24 +309,23 @@ def _sampling_strategy_minority(y, sampling_type): return sampling_strategy -def _sampling_strategy_auto(y, sampling_type): +def _sampling_strategy_auto(classes_counts, sampling_type): """Returns sampling target auto for over-sampling and not-minority for under-sampling.""" if sampling_type == "over-sampling": - return _sampling_strategy_not_majority(y, sampling_type) + return _sampling_strategy_not_majority(classes_counts, sampling_type) elif ( sampling_type == "under-sampling" or sampling_type == "clean-sampling" ): - return _sampling_strategy_not_minority(y, sampling_type) + return _sampling_strategy_not_minority(classes_counts, sampling_type) -def _sampling_strategy_dict(sampling_strategy, y, sampling_type): +def _sampling_strategy_dict(sampling_strategy, classes_counts, sampling_type): """Returns sampling target by converting the dictionary depending of the sampling.""" - target_stats = _count_class_sample(y) # check that all keys in sampling_strategy are also in y set_diff_sampling_strategy_target = set(sampling_strategy.keys()) - set( - target_stats.keys() + classes_counts.keys() ) if len(set_diff_sampling_strategy_target) > 0: raise ValueError( @@ -341,17 +342,17 @@ def _sampling_strategy_dict(sampling_strategy, y, sampling_type): ) sampling_strategy_ = {} if sampling_type == "over-sampling": - n_samples_majority = max(target_stats.values()) - class_majority = max(target_stats, key=target_stats.get) + n_samples_majority = max(classes_counts.values()) + class_majority = max(classes_counts, key=classes_counts.get) for class_sample, n_samples in sampling_strategy.items(): - if n_samples < target_stats[class_sample]: + if n_samples < classes_counts[class_sample]: raise ValueError( "With over-sampling methods, the number" " of samples in a class should be greater" " or equal to the original number of samples." " Originally, there is {} samples and {}" " samples are asked.".format( - target_stats[class_sample], n_samples + classes_counts[class_sample], n_samples ) ) if n_samples > n_samples_majority: @@ -367,18 +368,18 @@ def _sampling_strategy_dict(sampling_strategy, y, sampling_type): ) ) sampling_strategy_[class_sample] = ( - n_samples - target_stats[class_sample] + n_samples - classes_counts[class_sample] ) elif sampling_type == "under-sampling": for class_sample, n_samples in sampling_strategy.items(): - if n_samples > target_stats[class_sample]: + if n_samples > classes_counts[class_sample]: raise ValueError( "With under-sampling methods, the number of" " samples in a class should be less or equal" " to the original number of samples." " Originally, there is {} samples and {}" " samples are asked.".format( - target_stats[class_sample], n_samples + classes_counts[class_sample], n_samples ) ) sampling_strategy_[class_sample] = n_samples @@ -394,19 +395,18 @@ def _sampling_strategy_dict(sampling_strategy, y, sampling_type): return sampling_strategy_ -def _sampling_strategy_list(sampling_strategy, y, sampling_type): +def _sampling_strategy_list(sampling_strategy, classes_counts, sampling_type): """With cleaning methods, sampling_strategy can be a list to target the - class of interest.""" + class of interest.""" if sampling_type != "clean-sampling": raise ValueError( "'sampling_strategy' cannot be a list for samplers " "which are not cleaning methods." ) - target_stats = _count_class_sample(y) # check that all keys in sampling_strategy are also in y set_diff_sampling_strategy_target = set(sampling_strategy) - set( - target_stats.keys() + classes_counts.keys() ) if len(set_diff_sampling_strategy_target) > 0: raise ValueError( @@ -415,27 +415,26 @@ class of interest.""" ) return { - class_sample: min(target_stats.values()) + class_sample: min(classes_counts.values()) for class_sample in sampling_strategy } -def _sampling_strategy_float(sampling_strategy, y, sampling_type): +def _sampling_strategy_float(sampling_strategy, classes_counts, sampling_type): """Take a proportion of the majority (over-sampling) or minority (under-sampling) class in binary classification.""" - type_y = type_of_target(y) - if type_y != "binary": + + if len(classes_counts) != 2: raise ValueError( '"sampling_strategy" can be a float only when the type ' "of target is binary. For multi-class, use a dict." ) - target_stats = _count_class_sample(y) if sampling_type == "over-sampling": - n_sample_majority = max(target_stats.values()) - class_majority = max(target_stats, key=target_stats.get) + n_sample_majority = max(classes_counts.values()) + class_majority = max(classes_counts, key=classes_counts.get) sampling_strategy_ = { key: int(n_sample_majority * sampling_strategy - value) - for (key, value) in target_stats.items() + for (key, value) in classes_counts.items() if key != class_majority } if any([n_samples <= 0 for n_samples in sampling_strategy_.values()]): @@ -446,16 +445,16 @@ def _sampling_strategy_float(sampling_strategy, y, sampling_type): "ratio." ) elif sampling_type == "under-sampling": - n_sample_minority = min(target_stats.values()) - class_minority = min(target_stats, key=target_stats.get) + n_sample_minority = min(classes_counts.values()) + class_minority = min(classes_counts, key=classes_counts.get) sampling_strategy_ = { key: int(n_sample_minority / sampling_strategy) - for (key, value) in target_stats.items() + for (key, value) in classes_counts.items() if key != class_minority } if any( [ - n_samples > target_stats[target] + n_samples > classes_counts[target] for target, n_samples in sampling_strategy_.items() ] ): @@ -472,7 +471,9 @@ def _sampling_strategy_float(sampling_strategy, y, sampling_type): return sampling_strategy_ -def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): +def check_sampling_strategy( + sampling_strategy, classes_counts, sampling_type, **kwargs +): """Sampling target validation for samplers. Checks that ``sampling_strategy`` is of consistent type and return a @@ -567,10 +568,10 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): " instead.".format(SAMPLING_KIND, sampling_type) ) - if np.unique(y).size <= 1: + if len(classes_counts) <= 1: raise ValueError( "The target 'y' needs to have more than 1 class." - " Got {} class instead".format(np.unique(y).size) + " Got {} class instead".format(len(classes_counts)) ) if sampling_type in ("ensemble", "bypass"): @@ -587,7 +588,7 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): return OrderedDict( sorted( SAMPLING_TARGET_KIND[sampling_strategy]( - y, sampling_type + classes_counts, sampling_type ).items() ) ) @@ -595,7 +596,7 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): return OrderedDict( sorted( _sampling_strategy_dict( - sampling_strategy, y, sampling_type + sampling_strategy, classes_counts, sampling_type ).items() ) ) @@ -603,7 +604,7 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): return OrderedDict( sorted( _sampling_strategy_list( - sampling_strategy, y, sampling_type + sampling_strategy, classes_counts, sampling_type ).items() ) ) @@ -618,16 +619,16 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): return OrderedDict( sorted( _sampling_strategy_float( - sampling_strategy, y, sampling_type + sampling_strategy, classes_counts, sampling_type ).items() ) ) elif callable(sampling_strategy): - sampling_strategy_ = sampling_strategy(y, **kwargs) + sampling_strategy_ = sampling_strategy(classes_counts, **kwargs) return OrderedDict( sorted( _sampling_strategy_dict( - sampling_strategy_, y, sampling_type + sampling_strategy_, classes_counts, sampling_type ).items() ) ) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index b4dcbc904..6afb0f58a 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -297,6 +297,7 @@ def check_samplers_pandas(name, sampler): def check_samplers_dask_array(name, sampler): dask = pytest.importorskip("dask") + from dask import array # Check that the samplers handle dask array X, y = make_classification( n_samples=1000, @@ -305,15 +306,15 @@ def check_samplers_dask_array(name, sampler): weights=[0.2, 0.3, 0.5], random_state=0, ) - X_dask = dask.array.from_array(X, chunks=100) - y_dask = dask.array.from_array(y, chunks=100) + X_dask = array.from_array(X, chunks=100) + y_dask = array.from_array(y, chunks=100) X_res_dask, y_res_dask = sampler.fit_resample(X_dask, y_dask) X_res, y_res = sampler.fit_resample(X, y) # check that we return the same type for dataframes or series types - assert isinstance(X_res_dask, dask.array.Array) - assert isinstance(y_res_dask, dask.array.Array) + assert isinstance(X_res_dask, array.Array) + assert isinstance(y_res_dask, array.Array) assert_allclose(X_res_dask, X_res) assert_allclose(y_res_dask, y_res) @@ -321,6 +322,7 @@ def check_samplers_dask_array(name, sampler): def check_samplers_dask_dataframe(name, sampler): dask = pytest.importorskip("dask") + from dask import dataframe # Check that the samplers handle dask dataframe and dask series X, y = make_classification( n_samples=1000, @@ -329,18 +331,18 @@ def check_samplers_dask_dataframe(name, sampler): weights=[0.2, 0.3, 0.5], random_state=0, ) - X_df = dask.dataframe.from_array( + X_df = dataframe.from_array( X, columns=[str(i) for i in range(X.shape[1])] ) - y_s = dask.dataframe.from_array(y) + y_s = dataframe.from_array(y) y_s = y_s.rename("target") X_res_df, y_res_s = sampler.fit_resample(X_df, y_s) X_res, y_res = sampler.fit_resample(X, y) # check that we return the same type for dataframes or series types - assert isinstance(X_res_df, dask.dataframe.DataFrame) - assert isinstance(y_res_s, dask.dataframe.Series) + assert isinstance(X_res_df, dataframe.DataFrame) + assert isinstance(y_res_s, dataframe.Series) assert X_df.columns.to_list() == X_res_df.columns.to_list() assert y_s.name == y_res_s.name diff --git a/imblearn/utils/wrapper.py b/imblearn/utils/wrapper.py index 7dbfa3b1e..d0559d0af 100644 --- a/imblearn/utils/wrapper.py +++ b/imblearn/utils/wrapper.py @@ -5,11 +5,11 @@ from sklearn.utils.multiclass import type_of_target as sklearn_type_of_target from sklearn.utils.validation import column_or_1d as sklearn_column_or_1d -from ..dask._support import is_dask_container +from ..dask._support import is_dask_collection def type_of_target(y): - if is_dask_container(y): + if is_dask_collection(y): from ..dask.utils import type_of_target as dask_type_of_target return dask_type_of_target(y) @@ -17,7 +17,7 @@ def type_of_target(y): def _is_multiclass_encoded(y): - if is_dask_container(y): + if is_dask_collection(y): from dask import array return array.all(y.sum(axis=1) == 1).compute() @@ -25,7 +25,7 @@ def _is_multiclass_encoded(y): def column_or_1d(y, *, warn=False): - if is_dask_container(y): + if is_dask_collection(y): from ..dask.utils import column_or_1d as dask_column_or_1d return dask_column_or_1d(y, warn=warn) @@ -33,7 +33,7 @@ def column_or_1d(y, *, warn=False): def unique(arr, **kwargs): - if is_dask_container(arr): + if is_dask_collection(arr): if hasattr(arr, "unique"): output = np.asarray(arr.unique(**kwargs)) else: @@ -43,7 +43,7 @@ def unique(arr, **kwargs): def check_classification_targets(y): - if is_dask_container(y): + if is_dask_collection(y): from ..dask.utils import check_classification_targets as \ dask_check_classification_targets From 58acdf21c1c687fc3b9646e6dd7201d82d299562 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 7 Nov 2020 23:44:13 +0100 Subject: [PATCH 21/32] iter --- imblearn/base.py | 1 + imblearn/datasets/_imbalance.py | 9 ++++++--- imblearn/ensemble/_bagging.py | 8 ++++++-- imblearn/ensemble/_easy_ensemble.py | 10 ++++++++-- imblearn/ensemble/_forest.py | 10 +++++++--- 5 files changed, 28 insertions(+), 10 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 38f4259a4..9f3ea1303 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -53,6 +53,7 @@ def fit(self, X, y): if (not dask_collection or (dask_collection and self.validate_if_dask_collection)): X, y, _ = self._check_X_y(X, y) + self._classes_counts = get_classes_counts(y) self.sampling_strategy_ = check_sampling_strategy( self.sampling_strategy, self._classes_counts, self._sampling_type diff --git a/imblearn/datasets/_imbalance.py b/imblearn/datasets/_imbalance.py index b35d00ed2..77a2f64d3 100644 --- a/imblearn/datasets/_imbalance.py +++ b/imblearn/datasets/_imbalance.py @@ -9,7 +9,10 @@ from ..under_sampling import RandomUnderSampler from ..utils import check_sampling_strategy -from ..utils._validation import _deprecate_positional_args +from ..utils._validation import ( + _deprecate_positional_args, + get_classes_counts, +) @_deprecate_positional_args @@ -87,11 +90,11 @@ def make_imbalance( >>> print('Distribution after imbalancing: {}'.format(Counter(y_res))) Distribution after imbalancing: Counter({2: 30, 1: 20, 0: 10}) """ - target_stats = Counter(y) + target_stats = get_classes_counts(y) # restrict ratio to be a dict or a callable if isinstance(sampling_strategy, dict) or callable(sampling_strategy): sampling_strategy_ = check_sampling_strategy( - sampling_strategy, y, "under-sampling", **kwargs + sampling_strategy, target_stats, "under-sampling", **kwargs ) else: raise ValueError( diff --git a/imblearn/ensemble/_bagging.py b/imblearn/ensemble/_bagging.py index d7c509194..c7107661e 100644 --- a/imblearn/ensemble/_bagging.py +++ b/imblearn/ensemble/_bagging.py @@ -18,7 +18,10 @@ from ..utils import Substitution, check_target_type, check_sampling_strategy from ..utils._docstring import _n_jobs_docstring from ..utils._docstring import _random_state_docstring -from ..utils._validation import _deprecate_positional_args +from ..utils._validation import ( + _deprecate_positional_args, + get_classes_counts, +) @Substitution( @@ -216,11 +219,12 @@ def __init__( def _validate_y(self, y): y_encoded = super()._validate_y(y) + classes_counts = get_classes_counts(y) if isinstance(self.sampling_strategy, dict): self._sampling_strategy = { np.where(self.classes_ == key)[0][0]: value for key, value in check_sampling_strategy( - self.sampling_strategy, y, 'under-sampling', + self.sampling_strategy, classes_counts, 'under-sampling', ).items() } else: diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py index f140120aa..4db266134 100644 --- a/imblearn/ensemble/_easy_ensemble.py +++ b/imblearn/ensemble/_easy_ensemble.py @@ -17,7 +17,10 @@ from ..utils import Substitution, check_target_type, check_sampling_strategy from ..utils._docstring import _n_jobs_docstring from ..utils._docstring import _random_state_docstring -from ..utils._validation import _deprecate_positional_args +from ..utils._validation import ( + _deprecate_positional_args, + get_classes_counts, +) from ..pipeline import Pipeline MAX_INT = np.iinfo(np.int32).max @@ -156,11 +159,14 @@ def __init__( def _validate_y(self, y): y_encoded = super()._validate_y(y) + classes_counts = get_classes_counts(y) if isinstance(self.sampling_strategy, dict): self._sampling_strategy = { np.where(self.classes_ == key)[0][0]: value for key, value in check_sampling_strategy( - self.sampling_strategy, y, 'under-sampling', + self.sampling_strategy, + classes_counts, + "under-sampling", ).items() } else: diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py index 42ae9b255..5832628c8 100644 --- a/imblearn/ensemble/_forest.py +++ b/imblearn/ensemble/_forest.py @@ -33,8 +33,11 @@ from ..utils import Substitution from ..utils._docstring import _n_jobs_docstring from ..utils._docstring import _random_state_docstring -from ..utils._validation import check_sampling_strategy -from ..utils._validation import _deprecate_positional_args +from ..utils._validation import ( + check_sampling_strategy, + _deprecate_positional_args, + get_classes_counts, +) MAX_INT = np.iinfo(np.int32).max @@ -457,10 +460,11 @@ def fit(self, X, y, sample_weight=None): y_encoded = np.ascontiguousarray(y_encoded, dtype=DOUBLE) if isinstance(self.sampling_strategy, dict): + classes_counts = get_classes_counts(y) self._sampling_strategy = { np.where(self.classes_[0] == key)[0][0]: value for key, value in check_sampling_strategy( - self.sampling_strategy, y, 'under-sampling', + self.sampling_strategy, classes_counts, 'under-sampling', ).items() } else: From e54c772a5b8d10fdb5e4305fc5a568471f026d79 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Nov 2020 11:38:13 +0100 Subject: [PATCH 22/32] PEP8 --- imblearn/utils/_docstring.py | 2 +- imblearn/utils/_validation.py | 1 - imblearn/utils/estimator_checks.py | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/imblearn/utils/_docstring.py b/imblearn/utils/_docstring.py index 87907d73e..be94b1aac 100644 --- a/imblearn/utils/_docstring.py +++ b/imblearn/utils/_docstring.py @@ -47,4 +47,4 @@ def __call__(self, obj): Whether or not `X` and `y` should be validated. This parameter applies only when `X` and `y` are Dask collections where validation might be potentially costly. - """.rstrip() \ No newline at end of file + """.rstrip() diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 6b3936836..9fc81dd03 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -20,7 +20,6 @@ from .wrapper import _is_multiclass_encoded from .wrapper import column_or_1d from .wrapper import type_of_target -from .wrapper import unique SAMPLING_KIND = ( "over-sampling", diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 6afb0f58a..008a011ca 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -296,7 +296,7 @@ def check_samplers_pandas(name, sampler): def check_samplers_dask_array(name, sampler): - dask = pytest.importorskip("dask") + pytest.importorskip("dask") from dask import array # Check that the samplers handle dask array X, y = make_classification( @@ -321,7 +321,7 @@ def check_samplers_dask_array(name, sampler): def check_samplers_dask_dataframe(name, sampler): - dask = pytest.importorskip("dask") + pytest.importorskip("dask") from dask import dataframe # Check that the samplers handle dask dataframe and dask series X, y = make_classification( From f2a572f696aa076aaae99facf09c90ff76f8ba6d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Nov 2020 12:17:55 +0100 Subject: [PATCH 23/32] iter --- imblearn/utils/_validation.py | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 9fc81dd03..03dce4ab8 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -134,6 +134,19 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): def get_classes_counts(y): + """Compute the counts of each class present in `y`. + + Parameters + ---------- + y : ndarray of shape (n_samples,) + The target array. + + Returns + ------- + classes_counts : dict + A dictionary where the keys are the class labels and the values are the + counts for each class. + """ unique, counts = np.unique(y, return_counts=True) if is_dask_collection(unique): from dask import compute @@ -542,8 +555,14 @@ def check_sampling_strategy( correspond to the targeted classes. The values correspond to the desired number of samples for each class. - y : ndarray of shape (n_samples,) - The target array. + classes_counts : dict or ndarray of shape (n_samples,) + A dictionary where the keys are the class present in `y` and the values + are the counts. The function :func:`~imblearn.utils.get_classes_count` + provides such a dictionary, giving `y` as an input. + + .. deprecated:: 0.7 + Passing the array `y` is deprecated from 0.7 and will be removed + in 0.9. sampling_type : {{'over-sampling', 'under-sampling', 'clean-sampling'}} The type of sampling. Can be either ``'over-sampling'``, @@ -567,6 +586,15 @@ def check_sampling_strategy( " instead.".format(SAMPLING_KIND, sampling_type) ) + if hasattr(y, "__array__"): + warnings.warn( + f"Passing that array of target `y` is deprecated in 0.7 and will " + f"raise an error from 0.9. Instead, pass `y` to " + "imblearn.utils.get_classes_counts function to get the " + "dictionary.", FutureWarning + ) + classes_counts = get_classes_counts(classes_counts) + if len(classes_counts) <= 1: raise ValueError( "The target 'y' needs to have more than 1 class." From 36a0aa36cf2d64308b13f13cb31b3200f57a1ac7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Nov 2020 12:20:34 +0100 Subject: [PATCH 24/32] iter --- imblearn/utils/_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 03dce4ab8..c5b87e6ff 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -586,7 +586,7 @@ def check_sampling_strategy( " instead.".format(SAMPLING_KIND, sampling_type) ) - if hasattr(y, "__array__"): + if hasattr(classes_counts, "__array__"): warnings.warn( f"Passing that array of target `y` is deprecated in 0.7 and will " f"raise an error from 0.9. Instead, pass `y` to " From c7bdc74d45d10d342f04f8f84d7f308fd932ae17 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Nov 2020 12:38:10 +0100 Subject: [PATCH 25/32] check raise FutureWarning --- doc/api.rst | 2 +- imblearn/utils/__init__.py | 2 + imblearn/utils/tests/test_validation.py | 169 ++++++++++++++++++------ 3 files changed, 129 insertions(+), 44 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 07ac6413c..65bfd1b06 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -248,6 +248,6 @@ Imbalance-learn provides some fast-prototyping tools. :toctree: generated/ :template: function.rst - utils.estimator_checks.parametrize_with_checks utils.check_neighbors_object utils.check_sampling_strategy + utils.get_classes_counts diff --git a/imblearn/utils/__init__.py b/imblearn/utils/__init__.py index 4e74d2ee3..130d9f0c9 100644 --- a/imblearn/utils/__init__.py +++ b/imblearn/utils/__init__.py @@ -7,10 +7,12 @@ from ._validation import check_neighbors_object from ._validation import check_target_type from ._validation import check_sampling_strategy +from ._validation import get_classes_counts __all__ = [ "check_neighbors_object", "check_sampling_strategy", "check_target_type", + "get_classes_counts", "Substitution", ] diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index e4f9c01c8..b0ff57c83 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -17,11 +17,14 @@ from imblearn.utils import check_neighbors_object from imblearn.utils import check_sampling_strategy from imblearn.utils import check_target_type +from imblearn.utils import get_classes_counts from imblearn.utils._validation import ArraysTransformer from imblearn.utils._validation import _deprecate_positional_args multiclass_target = np.array([1] * 50 + [2] * 100 + [3] * 25) +multiclass_classes_counts = get_classes_counts(multiclass_target) binary_target = np.array([1] * 25 + [0] * 100) +binary_classes_counts = get_classes_counts(binary_target) def test_check_neighbors_object(): @@ -70,11 +73,11 @@ def test_check_target_type_ova(target, output_target, is_ova): assert binarize_target == is_ova -def test_check_sampling_strategy_warning(): +def test_check_sampling_strategy_error_dict_cleaning_methods(): msg = "dict for cleaning methods is not supported" with pytest.raises(ValueError, match=msg): check_sampling_strategy( - {1: 0, 2: 0, 3: 0}, multiclass_target, "clean-sampling" + {1: 0, 2: 0, 3: 0}, multiclass_classes_counts, "clean-sampling" ) @@ -83,19 +86,19 @@ def test_check_sampling_strategy_warning(): [ ( 0.5, - binary_target, + binary_classes_counts, "clean-sampling", "'clean-sampling' methods do let the user specify the sampling ratio", # noqa ), ( 0.1, - np.array([0] * 10 + [1] * 20), + get_classes_counts(np.array([0] * 10 + [1] * 20)), "over-sampling", "remove samples from the minority class while trying to generate new", # noqa ), ( 0.1, - np.array([0] * 10 + [1] * 20), + get_classes_counts(np.array([0] * 10 + [1] * 20)), "under-sampling", "generate new sample in the majority class while trying to remove", ), @@ -108,15 +111,21 @@ def test_check_sampling_strategy_float_error(ratio, y, type, err_msg): def test_check_sampling_strategy_error(): with pytest.raises(ValueError, match="'sampling_type' should be one of"): - check_sampling_strategy("auto", np.array([1, 2, 3]), "rnd") + check_sampling_strategy( + "auto", get_classes_counts(np.array([1, 2, 3])), "rnd" + ) error_regex = "The target 'y' needs to have more than 1 class." with pytest.raises(ValueError, match=error_regex): - check_sampling_strategy("auto", np.ones((10,)), "over-sampling") + check_sampling_strategy( + "auto", get_classes_counts(np.ones((10,))), "over-sampling" + ) error_regex = "When 'sampling_strategy' is a string, it needs to be one of" with pytest.raises(ValueError, match=error_regex): - check_sampling_strategy("rnd", np.array([1, 2, 3]), "over-sampling") + check_sampling_strategy( + "rnd", get_classes_counts(np.array([1, 2, 3])), "over-sampling" + ) @pytest.mark.parametrize( @@ -136,7 +145,9 @@ def test_check_sampling_strategy_error_wrong_string( ), ): check_sampling_strategy( - sampling_strategy, np.array([1, 2, 3]), sampling_type + sampling_strategy, + get_classes_counts(np.array([1, 2, 3])), + sampling_type, ) @@ -153,14 +164,18 @@ def test_sampling_strategy_class_target_unknown( ): y = np.array([1] * 50 + [2] * 100 + [3] * 25) with pytest.raises(ValueError, match="are not present in the data."): - check_sampling_strategy(sampling_strategy, y, sampling_method) + check_sampling_strategy( + sampling_strategy, get_classes_counts(y), sampling_method + ) def test_sampling_strategy_dict_error(): y = np.array([1] * 50 + [2] * 100 + [3] * 25) sampling_strategy = {1: -100, 2: 50, 3: 25} with pytest.raises(ValueError, match="in a class cannot be negative."): - check_sampling_strategy(sampling_strategy, y, "under-sampling") + check_sampling_strategy( + sampling_strategy, get_classes_counts(y), "under-sampling" + ) sampling_strategy = {1: 45, 2: 100, 3: 70} error_regex = ( "With over-sampling methods, the number of samples in a" @@ -169,7 +184,9 @@ def test_sampling_strategy_dict_error(): " samples are asked." ) with pytest.raises(ValueError, match=error_regex): - check_sampling_strategy(sampling_strategy, y, "over-sampling") + check_sampling_strategy( + sampling_strategy, get_classes_counts(y), "over-sampling" + ) error_regex = ( "With under-sampling methods, the number of samples in a" @@ -178,21 +195,27 @@ def test_sampling_strategy_dict_error(): " are asked." ) with pytest.raises(ValueError, match=error_regex): - check_sampling_strategy(sampling_strategy, y, "under-sampling") + check_sampling_strategy( + sampling_strategy, get_classes_counts(y), "under-sampling" + ) @pytest.mark.parametrize("sampling_strategy", [-10, 10]) def test_sampling_strategy_float_error_not_in_range(sampling_strategy): y = np.array([1] * 50 + [2] * 100) with pytest.raises(ValueError, match="it should be in the range"): - check_sampling_strategy(sampling_strategy, y, "under-sampling") + check_sampling_strategy( + sampling_strategy, get_classes_counts(y), "under-sampling" + ) def test_sampling_strategy_float_error_not_binary(): y = np.array([1] * 50 + [2] * 100 + [3] * 25) with pytest.raises(ValueError, match="the type of target is binary"): sampling_strategy = 0.5 - check_sampling_strategy(sampling_strategy, y, "under-sampling") + check_sampling_strategy( + sampling_strategy, get_classes_counts(y), "under-sampling" + ) @pytest.mark.parametrize( @@ -202,7 +225,9 @@ def test_sampling_strategy_list_error_not_clean_sampling(sampling_method): y = np.array([1] * 50 + [2] * 100 + [3] * 25) with pytest.raises(ValueError, match="cannot be a list for samplers"): sampling_strategy = [1, 2, 3] - check_sampling_strategy(sampling_strategy, y, sampling_method) + check_sampling_strategy( + sampling_strategy, get_classes_counts(y), sampling_method + ) def _sampling_strategy_func(y): @@ -215,42 +240,87 @@ def _sampling_strategy_func(y): @pytest.mark.parametrize( "sampling_strategy, sampling_type, expected_sampling_strategy, target", [ - ("auto", "under-sampling", {1: 25, 2: 25}, multiclass_target), - ("auto", "clean-sampling", {1: 25, 2: 25}, multiclass_target), - ("auto", "over-sampling", {1: 50, 3: 75}, multiclass_target), - ("all", "over-sampling", {1: 50, 2: 0, 3: 75}, multiclass_target), - ("all", "under-sampling", {1: 25, 2: 25, 3: 25}, multiclass_target), - ("all", "clean-sampling", {1: 25, 2: 25, 3: 25}, multiclass_target), - ("majority", "under-sampling", {2: 25}, multiclass_target), - ("majority", "clean-sampling", {2: 25}, multiclass_target), - ("minority", "over-sampling", {3: 75}, multiclass_target), - ("not minority", "over-sampling", {1: 50, 2: 0}, multiclass_target), - ("not minority", "under-sampling", {1: 25, 2: 25}, multiclass_target), - ("not minority", "clean-sampling", {1: 25, 2: 25}, multiclass_target), - ("not majority", "over-sampling", {1: 50, 3: 75}, multiclass_target), - ("not majority", "under-sampling", {1: 25, 3: 25}, multiclass_target), - ("not majority", "clean-sampling", {1: 25, 3: 25}, multiclass_target), + ("auto", "under-sampling", {1: 25, 2: 25}, multiclass_classes_counts), + ("auto", "clean-sampling", {1: 25, 2: 25}, multiclass_classes_counts), + ("auto", "over-sampling", {1: 50, 3: 75}, multiclass_classes_counts), + ( + "all", + "over-sampling", + {1: 50, 2: 0, 3: 75}, + multiclass_classes_counts, + ), + ( + "all", + "under-sampling", + {1: 25, 2: 25, 3: 25}, + multiclass_classes_counts, + ), + ( + "all", + "clean-sampling", + {1: 25, 2: 25, 3: 25}, + multiclass_classes_counts, + ), + ("majority", "under-sampling", {2: 25}, multiclass_classes_counts), + ("majority", "clean-sampling", {2: 25}, multiclass_classes_counts), + ("minority", "over-sampling", {3: 75}, multiclass_classes_counts), + ( + "not minority", + "over-sampling", + {1: 50, 2: 0}, + multiclass_classes_counts, + ), + ( + "not minority", + "under-sampling", + {1: 25, 2: 25}, + multiclass_classes_counts, + ), + ( + "not minority", + "clean-sampling", + {1: 25, 2: 25}, + multiclass_classes_counts, + ), + ( + "not majority", + "over-sampling", + {1: 50, 3: 75}, + multiclass_classes_counts, + ), + ( + "not majority", + "under-sampling", + {1: 25, 3: 25}, + multiclass_classes_counts, + ), + ( + "not majority", + "clean-sampling", + {1: 25, 3: 25}, + multiclass_classes_counts, + ), ( {1: 70, 2: 100, 3: 70}, "over-sampling", {1: 20, 2: 0, 3: 45}, - multiclass_target, + multiclass_classes_counts, ), ( {1: 30, 2: 45, 3: 25}, "under-sampling", {1: 30, 2: 45, 3: 25}, - multiclass_target, + multiclass_classes_counts, ), - ([1], "clean-sampling", {1: 25}, multiclass_target), + ([1], "clean-sampling", {1: 25}, multiclass_classes_counts), ( _sampling_strategy_func, "over-sampling", {1: 50, 2: 0, 3: 75}, - multiclass_target, + multiclass_classes_counts, ), - (0.5, "over-sampling", {1: 25}, binary_target), - (0.5, "under-sampling", {0: 50}, binary_target), + (0.5, "over-sampling", {1: 25}, binary_classes_counts), + (0.5, "under-sampling", {0: 50}, binary_classes_counts), ], ) def test_check_sampling_strategy( @@ -271,23 +341,27 @@ def test_sampling_strategy_dict_over_sampling(): r" the majority class \(class #2 -> 100\)" ) with warns(UserWarning, expected_msg): - check_sampling_strategy(sampling_strategy, y, "over-sampling") + check_sampling_strategy( + sampling_strategy, get_classes_counts(y), "over-sampling" + ) def test_sampling_strategy_callable_args(): y = np.array([1] * 50 + [2] * 100 + [3] * 25) multiplier = {1: 1.5, 2: 1, 3: 3} - def sampling_strategy_func(y, multiplier): + def sampling_strategy_func(classes_counts, multiplier): """samples such that each class will be affected by the multiplier.""" - target_stats = Counter(y) return { key: int(values * multiplier[key]) - for key, values in target_stats.items() + for key, values in classes_counts.items() } sampling_strategy_ = check_sampling_strategy( - sampling_strategy_func, y, "over-sampling", multiplier=multiplier + sampling_strategy_func, + get_classes_counts(y), + "over-sampling", + multiplier=multiplier, ) assert sampling_strategy_ == {1: 25, 2: 0, 3: 50} @@ -314,11 +388,20 @@ def test_sampling_strategy_check_order( # dictionary is sorted. Refer to issue #428. y = np.array([1] * 50 + [2] * 100 + [3] * 25) sampling_strategy_ = check_sampling_strategy( - sampling_strategy, y, sampling_type + sampling_strategy, get_classes_counts(y), sampling_type ) assert sampling_strategy_ == expected_result +# FIXME: remove in 0.9 +def test_sampling_strategy_deprecation_array_target(): + # Check that we raise a FutureWarning when an array of target is passed + with pytest.warns(FutureWarning): + sampling_strategy = "auto" + check_sampling_strategy( + sampling_strategy, binary_target, "under-sampling", + ) + def test_arrays_transformer_plain_list(): X = np.array([[0, 0], [1, 1]]) y = np.array([[0, 0], [1, 1]]) From f09522154067b497d02c02ad4889e02244d9f3d7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Nov 2020 12:54:02 +0100 Subject: [PATCH 26/32] iter --- imblearn/utils/_validation.py | 4 ++-- imblearn/utils/tests/test_validation.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index c5b87e6ff..8538d7718 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -588,8 +588,8 @@ def check_sampling_strategy( if hasattr(classes_counts, "__array__"): warnings.warn( - f"Passing that array of target `y` is deprecated in 0.7 and will " - f"raise an error from 0.9. Instead, pass `y` to " + "Passing an array of target `y` is deprecated in 0.7 and will " + "raise an error from 0.9. Instead, pass `y` to " "imblearn.utils.get_classes_counts function to get the " "dictionary.", FutureWarning ) diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index b0ff57c83..b5f06e5b6 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -402,6 +402,7 @@ def test_sampling_strategy_deprecation_array_target(): sampling_strategy, binary_target, "under-sampling", ) + def test_arrays_transformer_plain_list(): X = np.array([[0, 0], [1, 1]]) y = np.array([[0, 0], [1, 1]]) From 20b44c65c7a2cb6e9c47487d327ad7dd8b675c74 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Nov 2020 13:07:45 +0100 Subject: [PATCH 27/32] iter --- imblearn/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/imblearn/base.py b/imblearn/base.py index 9f3ea1303..e3cc26a0f 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -273,8 +273,9 @@ def fit_resample(self, X, y): X, y, accept_sparse=self.accept_sparse ) + self._classes_counts = get_classes_counts(y) self.sampling_strategy_ = check_sampling_strategy( - self.sampling_strategy, y, self._sampling_type + self.sampling_strategy, self._classes_counts, self._sampling_type ) output = self._fit_resample(X, y) From 4cd9116056790a08bc4b99ebe0b64c08574514e3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Nov 2020 19:20:53 +0100 Subject: [PATCH 28/32] iter --- imblearn/base.py | 27 ++++++++++++------ imblearn/dask/preprocessing.py | 10 +++++++ imblearn/utils/estimator_checks.py | 46 ++++++++++++++++++------------ imblearn/utils/wrapper.py | 9 ++++++ 4 files changed, 66 insertions(+), 26 deletions(-) create mode 100644 imblearn/dask/preprocessing.py diff --git a/imblearn/base.py b/imblearn/base.py index e3cc26a0f..6099abcba 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -18,7 +18,10 @@ _deprecate_positional_args, get_classes_counts, ) -from .utils.wrapper import check_classification_targets +from .utils.wrapper import ( + check_classification_targets, + label_binarize, +) class SamplerMixin(BaseEstimator, metaclass=ABCMeta): @@ -49,7 +52,11 @@ def fit(self, X, y): self : object Return the instance itself. """ + arrays_transformer = ArraysTransformer(X, y) dask_collection = any([is_dask_collection(arr) for arr in (X, y)]) + if dask_collection: + X, y = arrays_transformer.to_dask_array(X, y) + if (not dask_collection or (dask_collection and self.validate_if_dask_collection)): X, y, _ = self._check_X_y(X, y) @@ -83,12 +90,14 @@ def fit_resample(self, X, y): """ arrays_transformer = ArraysTransformer(X, y) dask_collection = any([is_dask_collection(arr) for arr in (X, y)]) + if dask_collection: + X, y = arrays_transformer.to_dask_array(X, y) + if (not dask_collection or (dask_collection and self.validate_if_dask_collection)): check_classification_targets(y) X, y, binarize_y = self._check_X_y(X, y) else: - X, y = arrays_transformer.to_dask_array(X, y) binarize_y = False self._classes_counts = get_classes_counts(y) @@ -98,9 +107,10 @@ def fit_resample(self, X, y): output = self._fit_resample(X, y) - # TODO: label binarize is not implemented with dask - y_ = (label_binarize(output[1], np.unique(y)) - if binarize_y else output[1]) + if binarize_y: + y_ = label_binarize(output[1], classes=np.unique(y)) + else: + y_ = output[1] X_, y_ = arrays_transformer.transform(output[0], y_) return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) @@ -281,9 +291,10 @@ def fit_resample(self, X, y): output = self._fit_resample(X, y) if self.validate: - - y_ = (label_binarize(output[1], np.unique(y)) - if binarize_y else output[1]) + if binarize_y: + y_ = label_binarize(output[1], classes=np.unique(y)) + else: + y_ = output[1] X_, y_ = arrays_transformer.transform(output[0], y_) return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) diff --git a/imblearn/dask/preprocessing.py b/imblearn/dask/preprocessing.py new file mode 100644 index 000000000..f6038b74b --- /dev/null +++ b/imblearn/dask/preprocessing.py @@ -0,0 +1,10 @@ +import numpy as np + + +def label_binarize(y, *, classes): + import pandas as pd + from dask import dataframe + + cat_dtype = pd.CategoricalDtype(categories=classes) + y = dataframe.from_array(y).astype(cat_dtype) + return dataframe.get_dummies(y).to_dask_array() diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 008a011ca..ffa083428 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -295,9 +295,10 @@ def check_samplers_pandas(name, sampler): assert_allclose(y_res_s.to_numpy(), y_res) -def check_samplers_dask_array(name, sampler): +def check_samplers_dask_array(name, sampler_orig): pytest.importorskip("dask") from dask import array + sampler = clone(sampler_orig) # Check that the samplers handle dask array X, y = make_classification( n_samples=1000, @@ -309,20 +310,25 @@ def check_samplers_dask_array(name, sampler): X_dask = array.from_array(X, chunks=100) y_dask = array.from_array(y, chunks=100) - X_res_dask, y_res_dask = sampler.fit_resample(X_dask, y_dask) - X_res, y_res = sampler.fit_resample(X, y) + for validate_if_dask_collection in (True, False): + sampler.set_params( + validate_if_dask_collection=validate_if_dask_collection + ) + X_res_dask, y_res_dask = sampler.fit_resample(X_dask, y_dask) + X_res, y_res = sampler.fit_resample(X, y) - # check that we return the same type for dataframes or series types - assert isinstance(X_res_dask, array.Array) - assert isinstance(y_res_dask, array.Array) + # check that we return the same type for dataframes or series types + assert isinstance(X_res_dask, array.Array) + assert isinstance(y_res_dask, array.Array) - assert_allclose(X_res_dask, X_res) - assert_allclose(y_res_dask, y_res) + assert_allclose(X_res_dask, X_res) + assert_allclose(y_res_dask, y_res) -def check_samplers_dask_dataframe(name, sampler): +def check_samplers_dask_dataframe(name, sampler_orig): pytest.importorskip("dask") from dask import dataframe + sampler = clone(sampler_orig) # Check that the samplers handle dask dataframe and dask series X, y = make_classification( n_samples=1000, @@ -337,18 +343,22 @@ def check_samplers_dask_dataframe(name, sampler): y_s = dataframe.from_array(y) y_s = y_s.rename("target") - X_res_df, y_res_s = sampler.fit_resample(X_df, y_s) - X_res, y_res = sampler.fit_resample(X, y) + for validate_if_dask_collection in (True, False): + sampler.set_params( + validate_if_dask_collection=validate_if_dask_collection + ) + X_res_df, y_res_s = sampler.fit_resample(X_df, y_s) + X_res, y_res = sampler.fit_resample(X, y) - # check that we return the same type for dataframes or series types - assert isinstance(X_res_df, dataframe.DataFrame) - assert isinstance(y_res_s, dataframe.Series) + # check that we return the same type for dataframes or series types + assert isinstance(X_res_df, dataframe.DataFrame) + assert isinstance(y_res_s, dataframe.Series) - assert X_df.columns.to_list() == X_res_df.columns.to_list() - assert y_s.name == y_res_s.name + assert X_df.columns.to_list() == X_res_df.columns.to_list() + assert y_s.name == y_res_s.name - assert_allclose(np.array(X_res_df), X_res) - assert_allclose(np.array(y_res_s), y_res) + assert_allclose(np.array(X_res_df), X_res) + assert_allclose(np.array(y_res_s), y_res) def check_samplers_list(name, sampler): diff --git a/imblearn/utils/wrapper.py b/imblearn/utils/wrapper.py index d0559d0af..cbc9e1b1d 100644 --- a/imblearn/utils/wrapper.py +++ b/imblearn/utils/wrapper.py @@ -1,5 +1,6 @@ import numpy as np +from sklearn.preprocessing import label_binarize as sklearn_label_binarize from sklearn.utils.multiclass import check_classification_targets as \ sklearn_check_classification_targets from sklearn.utils.multiclass import type_of_target as sklearn_type_of_target @@ -49,3 +50,11 @@ def check_classification_targets(y): return dask_check_classification_targets(y) return sklearn_check_classification_targets(y) + + +def label_binarize(y, *, classes): + if is_dask_collection(y): + from ..dask.preprocessing import label_binarize as dask_label_binarize + + return dask_label_binarize(y, classes=classes) + return sklearn_label_binarize(y, classes=classes) From a6e975b251079b6c9497b92cc70adee530adeb2f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Nov 2020 19:23:18 +0100 Subject: [PATCH 29/32] PEP8 --- imblearn/base.py | 1 - imblearn/dask/preprocessing.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 6099abcba..aebe2bf6d 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -9,7 +9,6 @@ import numpy as np from sklearn.base import BaseEstimator -from sklearn.preprocessing import label_binarize from .dask._support import is_dask_collection from .utils import check_sampling_strategy, check_target_type diff --git a/imblearn/dask/preprocessing.py b/imblearn/dask/preprocessing.py index f6038b74b..3a79fe576 100644 --- a/imblearn/dask/preprocessing.py +++ b/imblearn/dask/preprocessing.py @@ -1,6 +1,3 @@ -import numpy as np - - def label_binarize(y, *, classes): import pandas as pd from dask import dataframe From 32eda462602259bf5da468ca8b0434d887cbc82c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Nov 2020 19:52:10 +0100 Subject: [PATCH 30/32] iter --- imblearn/base.py | 8 ++++++-- imblearn/utils/estimator_checks.py | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index aebe2bf6d..4d69f5461 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -107,7 +107,9 @@ def fit_resample(self, X, y): output = self._fit_resample(X, y) if binarize_y: - y_ = label_binarize(output[1], classes=np.unique(y)) + y_ = label_binarize( + output[1], classes=list(self._classes_counts.keys()) + ) else: y_ = output[1] @@ -291,7 +293,9 @@ def fit_resample(self, X, y): if self.validate: if binarize_y: - y_ = label_binarize(output[1], classes=np.unique(y)) + y_ = label_binarize( + output[1], classes=list(self._classes_counts.keys()) + ) else: y_ = output[1] X_, y_ = arrays_transformer.transform(output[0], y_) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index ffa083428..1d7b8e328 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -327,6 +327,7 @@ def check_samplers_dask_array(name, sampler_orig): def check_samplers_dask_dataframe(name, sampler_orig): pytest.importorskip("dask") + pd = pytest.importorskip("pandas") from dask import dataframe sampler = clone(sampler_orig) # Check that the samplers handle dask dataframe and dask series @@ -342,20 +343,27 @@ def check_samplers_dask_dataframe(name, sampler_orig): ) y_s = dataframe.from_array(y) y_s = y_s.rename("target") + y_s_ohe = dataframe.get_dummies( + y_s.astype(pd.CategoricalDtype(categories=[0, 1, 2])) + ) for validate_if_dask_collection in (True, False): sampler.set_params( validate_if_dask_collection=validate_if_dask_collection ) X_res_df, y_res_s = sampler.fit_resample(X_df, y_s) + # FIXME: not supported with validate=False + X_res, y_res_s_ohe = sampler.fit_resample(X, y_s_ohe) X_res, y_res = sampler.fit_resample(X, y) # check that we return the same type for dataframes or series types assert isinstance(X_res_df, dataframe.DataFrame) assert isinstance(y_res_s, dataframe.Series) + assert isinstance(y_res_s_ohe, dataframe.DataFrame) assert X_df.columns.to_list() == X_res_df.columns.to_list() assert y_s.name == y_res_s.name + assert y_s_ohe.columns.to_list() == y_res_s_ohe.columns.to_list() assert_allclose(np.array(X_res_df), X_res) assert_allclose(np.array(y_res_s), y_res) From 6c592ff367c8ead35ad4e63278446f987bf7c70f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Nov 2020 20:18:19 +0100 Subject: [PATCH 31/32] iter --- imblearn/utils/estimator_checks.py | 71 +++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 7 deletions(-) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 1d7b8e328..a5c5bb3f4 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -65,6 +65,10 @@ def _yield_sampler_checks(sampler): yield check_samplers_dask_dataframe yield check_samplers_list yield check_samplers_multiclass_ova + if "dask-array" in tags["X_types"]: + yield check_samplers_multiclass_ova_dask_array + if "dask-dataframe" in tags["X_types"]: + yield check_samplers_multiclass_ova_dask_dataframe yield check_samplers_preserve_dtype yield check_samplers_sample_indices yield check_samplers_2d_target @@ -343,27 +347,20 @@ def check_samplers_dask_dataframe(name, sampler_orig): ) y_s = dataframe.from_array(y) y_s = y_s.rename("target") - y_s_ohe = dataframe.get_dummies( - y_s.astype(pd.CategoricalDtype(categories=[0, 1, 2])) - ) for validate_if_dask_collection in (True, False): sampler.set_params( validate_if_dask_collection=validate_if_dask_collection ) X_res_df, y_res_s = sampler.fit_resample(X_df, y_s) - # FIXME: not supported with validate=False - X_res, y_res_s_ohe = sampler.fit_resample(X, y_s_ohe) X_res, y_res = sampler.fit_resample(X, y) # check that we return the same type for dataframes or series types assert isinstance(X_res_df, dataframe.DataFrame) assert isinstance(y_res_s, dataframe.Series) - assert isinstance(y_res_s_ohe, dataframe.DataFrame) assert X_df.columns.to_list() == X_res_df.columns.to_list() assert y_s.name == y_res_s.name - assert y_s_ohe.columns.to_list() == y_res_s_ohe.columns.to_list() assert_allclose(np.array(X_res_df), X_res) assert_allclose(np.array(y_res_s), y_res) @@ -408,6 +405,66 @@ def check_samplers_multiclass_ova(name, sampler): assert_allclose(y_res, y_res_ova.argmax(axis=1)) +def check_samplers_multiclass_ova_dask_array(name, sampler_orig): + pytest.importorskip("dask") + from dask import array + sampler = clone(sampler_orig) + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0, + ) + y_ova = label_binarize(y, np.unique(y)) + + X = array.from_array(X) + y = array.from_array(y) + y_ova = array.from_array(y_ova) + + sampler.set_params(validate_if_dask_collection=True) + X_res, y_res = sampler.fit_resample(X, y) + X_res_ova, y_res_ova = sampler.fit_resample(X, y_ova) + + assert_allclose(X_res, X_res_ova) + assert type_of_target(y_res_ova) == type_of_target(y_ova) + assert_allclose(y_res, y_res_ova.argmax(axis=1)) + + assert isinstance(X_res_ova, array.Array) + assert isinstance(y_res, array.Array) + assert isinstance(y_res_ova, array.Array) + + +def check_samplers_multiclass_ova_dask_dataframe(name, sampler_orig): + pytest.importorskip("dask") + from dask import dataframe + sampler = clone(sampler_orig) + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0, + ) + y_ova = label_binarize(y, np.unique(y)) + + X = dataframe.from_array(X) + y = dataframe.from_array(y) + y_ova = dataframe.from_array(y_ova) + + sampler.set_params(validate_if_dask_collection=True) + X_res, y_res = sampler.fit_resample(X, y) + X_res_ova, y_res_ova = sampler.fit_resample(X, y_ova) + + assert_allclose(X_res, X_res_ova) + assert type_of_target(y_res_ova) == type_of_target(y_ova) + assert_allclose(y_res, y_res_ova.to_dask_array().argmax(axis=1)) + + assert isinstance(X_res_ova, dataframe.DataFrame) + assert isinstance(y_res, dataframe.Series) + assert isinstance(y_res_ova, dataframe.DataFrame) + + def check_samplers_2d_target(name, sampler): X, y = make_classification( n_samples=100, From 456c3ebbf5623a681c07211cdf032943532921c8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Nov 2020 20:22:28 +0100 Subject: [PATCH 32/32] PEP8 --- imblearn/utils/estimator_checks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index a5c5bb3f4..ceb828272 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -331,7 +331,6 @@ def check_samplers_dask_array(name, sampler_orig): def check_samplers_dask_dataframe(name, sampler_orig): pytest.importorskip("dask") - pd = pytest.importorskip("pandas") from dask import dataframe sampler = clone(sampler_orig) # Check that the samplers handle dask dataframe and dask series