From 95247e6e1fe7ab67a81d5348283810024136cb92 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 5 Nov 2020 20:14:23 +0100
Subject: [PATCH 01/32] ENH make RandomUnderSampler accept dask array

---
 imblearn/dask/__init__.py                     |  0
 imblearn/dask/_support.py                     | 13 ++++
 imblearn/dask/tests/__init__.py               |  0
 imblearn/dask/tests/test_utils.py             | 47 +++++++++++++
 imblearn/dask/utils.py                        | 66 +++++++++++++++++++
 .../_random_under_sampler.py                  | 57 +++++++++++-----
 imblearn/utils/_validation.py                 | 27 ++++++--
 imblearn/utils/estimator_checks.py            | 27 ++++++++
 imblearn/utils/wrapper.py                     | 37 +++++++++++
 9 files changed, 252 insertions(+), 22 deletions(-)
 create mode 100644 imblearn/dask/__init__.py
 create mode 100644 imblearn/dask/_support.py
 create mode 100644 imblearn/dask/tests/__init__.py
 create mode 100644 imblearn/dask/tests/test_utils.py
 create mode 100644 imblearn/dask/utils.py
 create mode 100644 imblearn/utils/wrapper.py

diff --git a/imblearn/dask/__init__.py b/imblearn/dask/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/imblearn/dask/_support.py b/imblearn/dask/_support.py
new file mode 100644
index 000000000..b0b4153d2
--- /dev/null
+++ b/imblearn/dask/_support.py
@@ -0,0 +1,13 @@
+_REGISTERED_DASK_CONTAINER = []
+
+try:
+    from dask import array, dataframe
+    _REGISTERED_DASK_CONTAINER += [
+        array.Array, dataframe.Series, dataframe.DataFrame,
+    ]
+except ImportError:
+    pass
+
+
+def is_dask_container(container):
+    return isinstance(container, tuple(_REGISTERED_DASK_CONTAINER))
diff --git a/imblearn/dask/tests/__init__.py b/imblearn/dask/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/imblearn/dask/tests/test_utils.py b/imblearn/dask/tests/test_utils.py
new file mode 100644
index 000000000..edf0665b6
--- /dev/null
+++ b/imblearn/dask/tests/test_utils.py
@@ -0,0 +1,47 @@
+import numpy as np
+import pytest
+from dask import array
+from dask_ml.datasets import make_classification
+
+from imblearn.dask.utils import is_multilabel
+from imblearn.dask.utils import type_of_target
+
+
+def test_type_of_target_error():
+    y = np.arange(10)
+
+    err_msg = "Expected a Dask array, series or dataframe."
+    with pytest.raises(ValueError, match=err_msg):
+        type_of_target(y)
+
+
+@pytest.mark.parametrize(
+    "y, expected_result",
+    [
+        (array.from_array(np.array([0, 1, 0, 1])), False),
+        (array.from_array(np.array([[1, 0], [0, 0]])), True),
+        (array.from_array(np.array([[1], [0], [0]])), False),
+        (array.from_array(np.array([[1, 0, 0]])), True),
+    ]
+)
+def test_is_multilabel(y, expected_result):
+    assert is_multilabel(y) is expected_result
+
+
+@pytest.mark.parametrize(
+    "y, expected_type_of_target",
+    [
+        (array.from_array(np.array([[1, 0], [0, 0]])), "multilabel-indicator"),
+        (array.from_array(np.array([[1, 0, 0]])), "multilabel-indicator"),
+        (array.from_array(np.array([[[1, 2]]])), "unknown"),
+        (array.from_array(np.array([[]])), "unknown"),
+        (array.from_array(np.array([.1, .2, 3])), "continuous"),
+        (array.from_array(np.array([[.1, .2, 3]])), "continuous-multioutput"),
+        (array.from_array(np.array([[1., .2]])), "continuous-multioutput"),
+        (array.from_array(np.array([1, 2])), "binary"),
+        (array.from_array(np.array(["a", "b"])), "binary"),
+    ]
+)
+def test_type_of_target(y, expected_type_of_target):
+    target_type = type_of_target(y)
+    assert target_type == expected_type_of_target
diff --git a/imblearn/dask/utils.py b/imblearn/dask/utils.py
new file mode 100644
index 000000000..00f6128ba
--- /dev/null
+++ b/imblearn/dask/utils.py
@@ -0,0 +1,66 @@
+import warnings
+
+from dask import dataframe
+from dask import array
+from sklearn.exceptions import DataConversionWarning
+from sklearn.utils.multiclass import _is_integral_float
+
+
+def is_multilabel(y):
+    if not (y.ndim == 2 and y.shape[1] > 1):
+        return False
+
+    labels = array.unique(y).compute()
+
+    return len(labels) < 3 and (
+        y.dtype.kind in 'biu' or _is_integral_float(labels)
+    )
+
+
+def type_of_target(y):
+    if is_multilabel(y):
+        return 'multilabel-indicator'
+
+    if y.ndim > 2:
+        return 'unknown'
+
+    if y.ndim == 2 and y.shape[1] == 0:
+        return 'unknown'  # [[]]
+
+    if y.ndim == 2 and y.shape[1] > 1:
+        # [[1, 2], [1, 2]]
+        suffix = "-multioutput"
+    else:
+        # [1, 2, 3] or [[1], [2], [3]]
+        suffix = ""
+
+    # check float and contains non-integer float values
+    if y.dtype.kind == 'f' and array.any(y != y.astype(int)):
+        # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
+        # NOTE: we don't check for infinite values
+        return 'continuous' + suffix
+
+    labels = array.unique(y).compute()
+    if (len((labels)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
+        # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
+        return 'multiclass' + suffix
+    # [1, 2] or [["a"], ["b"]]
+    return 'binary'
+
+
+def column_or_1d(y, *, warn=False):
+    shape = y.shape
+    if len(shape) == 1:
+        return y.ravel()
+    if len(shape) == 2 and shape[1] == 1:
+        if warn:
+            warnings.warn(
+                "A column-vector y was passed when a 1d array was  expected. "
+                "Please change the shape of y to (n_samples, ), for example "
+                "using ravel().", DataConversionWarning, stacklevel=2
+            )
+        return y.ravel()
+
+    raise ValueError(
+        f"y should be a 1d array. Got an array of shape {shape} instead."
+    )
diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
index e34d4e73d..8d9745622 100644
--- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
+++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
@@ -10,6 +10,7 @@
 from sklearn.utils import _safe_indexing
 
 from ..base import BaseUnderSampler
+from ...dask._support import is_dask_container
 from ...utils import check_target_type
 from ...utils import Substitution
 from ...utils._docstring import _random_state_docstring
@@ -80,44 +81,66 @@ def __init__(
         self.replacement = replacement
 
     def _check_X_y(self, X, y):
-        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = self._validate_data(
-            X, y, reset=True, accept_sparse=["csr", "csc"], dtype=None,
-            force_all_finite=False,
+        y, binarize_y, self._uniques = check_target_type(
+            y,
+            indicate_one_vs_all=True,
+            return_unique=True,
         )
+        if not any([is_dask_container(arr) for arr in (X, y)]):
+            X, y = self._validate_data(
+                X,
+                y,
+                reset=True,
+                accept_sparse=["csr", "csc"],
+                dtype=None,
+                force_all_finite=False,
+            )
         return X, y, binarize_y
 
+    @staticmethod
+    def _find_target_class_indices(y, target_class):
+        target_class_indices = np.flatnonzero(y == target_class)
+        if is_dask_container(y):
+            return target_class_indices.compute()
+        return target_class_indices
+
     def _fit_resample(self, X, y):
         random_state = check_random_state(self.random_state)
 
-        idx_under = np.empty((0,), dtype=int)
+        idx_under = []
 
-        for target_class in np.unique(y):
+        for target_class in self._uniques:
+            target_class_indices = self._find_target_class_indices(
+                y, target_class
+            )
             if target_class in self.sampling_strategy_.keys():
                 n_samples = self.sampling_strategy_[target_class]
                 index_target_class = random_state.choice(
-                    range(np.count_nonzero(y == target_class)),
+                    target_class_indices.size,
                     size=n_samples,
                     replace=self.replacement,
                 )
             else:
                 index_target_class = slice(None)
 
-            idx_under = np.concatenate(
-                (
-                    idx_under,
-                    np.flatnonzero(y == target_class)[index_target_class],
-                ),
-                axis=0,
-            )
+            selected_indices = target_class_indices[index_target_class]
+            idx_under.append(selected_indices)
 
-        self.sample_indices_ = idx_under
+        self.sample_indices_ = np.hstack(idx_under)
+        self.sample_indices_.sort()
 
-        return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under)
+        return (
+            _safe_indexing(X, self.sample_indices_),
+            _safe_indexing(y, self.sample_indices_)
+        )
 
     def _more_tags(self):
         return {
-            "X_types": ["2darray", "string"],
+            "X_types": [
+                "2darray",
+                "string",
+                "dask-array",
+            ],
             "sample_indices": True,
             "allow_nan": True,
         }
diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py
index fdc67619e..bc20267dd 100644
--- a/imblearn/utils/_validation.py
+++ b/imblearn/utils/_validation.py
@@ -14,10 +14,13 @@
 from sklearn.base import clone
 from sklearn.neighbors._base import KNeighborsMixin
 from sklearn.neighbors import NearestNeighbors
-from sklearn.utils import column_or_1d
-from sklearn.utils.multiclass import type_of_target
 
+from ..dask._support import is_dask_container
 from ..exceptions import raise_isinstance_error
+from .wrapper import _is_multiclass_encoded
+from .wrapper import column_or_1d
+from .wrapper import type_of_target
+from .wrapper import unique
 
 SAMPLING_KIND = (
     "over-sampling",
@@ -99,10 +102,12 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0):
 
 def _count_class_sample(y):
     unique, counts = np.unique(y, return_counts=True)
+    if is_dask_container(unique):
+        unique, counts = unique.compute(), counts.compute()
     return dict(zip(unique, counts))
 
 
-def check_target_type(y, indicate_one_vs_all=False):
+def check_target_type(y, indicate_one_vs_all=False, return_unique=False):
     """Check the target types to be conform to the current samplers.
 
     The current samplers should be compatible with ``'binary'``,
@@ -116,18 +121,24 @@ def check_target_type(y, indicate_one_vs_all=False):
     indicate_one_vs_all : bool, default=False
         Either to indicate if the targets are encoded in a one-vs-all fashion.
 
+    return_unique : bool, default=False
+        Either to return or not the unique values in y.
+
     Returns
     -------
     y : ndarray
         The returned target.
 
+    y_unique : ndarray
+        The unique values in `y`.
+
     is_one_vs_all : bool, optional
         Indicate if the target was originally encoded in a one-vs-all fashion.
         Only returned if ``indicate_multilabel=True``.
     """
     type_y = type_of_target(y)
     if type_y == "multilabel-indicator":
-        if np.any(y.sum(axis=1) > 1):
+        if not _is_multiclass_encoded(y):
             raise ValueError(
                 "Imbalanced-learn currently supports binary, multiclass and "
                 "binarized encoded multiclasss targets. Multilabel and "
@@ -137,7 +148,13 @@ def check_target_type(y, indicate_one_vs_all=False):
     else:
         y = column_or_1d(y)
 
-    return (y, type_y == "multilabel-indicator") if indicate_one_vs_all else y
+    output = [y]
+    if indicate_one_vs_all:
+        output += [type_y == "multilabel-indicator"]
+    if return_unique:
+        output += [unique(y)]
+
+    return output
 
 
 def _sampling_strategy_all(y, sampling_type):
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index 729ceebea..1494fd695 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -51,6 +51,7 @@ def _set_checking_parameters(estimator):
 
 
 def _yield_sampler_checks(sampler):
+    tags = sampler._get_tags()
     yield check_target_type
     yield check_samplers_one_label
     yield check_samplers_fit
@@ -58,6 +59,8 @@ def _yield_sampler_checks(sampler):
     yield check_samplers_sampling_strategy_fit_resample
     yield check_samplers_sparse
     yield check_samplers_pandas
+    if "dask-array" in tags["X_types"]:
+        yield check_samplers_dask_array
     yield check_samplers_list
     yield check_samplers_multiclass_ova
     yield check_samplers_preserve_dtype
@@ -290,6 +293,30 @@ def check_samplers_pandas(name, sampler):
     assert_allclose(y_res_s.to_numpy(), y_res)
 
 
+def check_samplers_dask_array(name, sampler):
+    dask = pytest.importorskip("dask")
+    # Check that the samplers handle pandas dataframe and pandas series
+    X, y = make_classification(
+        n_samples=1000,
+        n_classes=3,
+        n_informative=4,
+        weights=[0.2, 0.3, 0.5],
+        random_state=0,
+    )
+    X_dask = dask.array.from_array(X, chunks=100)
+    y_dask = dask.array.from_array(y, chunks=100)
+
+    X_res_dask, y_res_dask = sampler.fit_resample(X_dask, y_dask)
+    X_res, y_res = sampler.fit_resample(X, y)
+
+    # check that we return the same type for dataframes or series types
+    assert isinstance(X_res_dask, dask.array.Array)
+    assert isinstance(y_res_dask, dask.array.Array)
+
+    assert_allclose(X_res_dask, X_res)
+    assert_allclose(y_res_dask, y_res)
+
+
 def check_samplers_list(name, sampler):
     # Check that the can samplers handle simple lists
     X, y = make_classification(
diff --git a/imblearn/utils/wrapper.py b/imblearn/utils/wrapper.py
new file mode 100644
index 000000000..646dc1a64
--- /dev/null
+++ b/imblearn/utils/wrapper.py
@@ -0,0 +1,37 @@
+import numpy as np
+
+from sklearn.utils.multiclass import type_of_target as sklearn_type_of_target
+from sklearn.utils.validation import column_or_1d as sklearn_column_or_1d
+
+from ..dask._support import is_dask_container
+
+
+def type_of_target(y):
+    if is_dask_container(y):
+        from ..dask.utils import type_of_target as dask_type_of_target
+
+        return dask_type_of_target(y)
+    return sklearn_type_of_target(y)
+
+
+def _is_multiclass_encoded(y):
+    if is_dask_container(y):
+        from dask import array
+
+        return array.all(y.sum(axis=1) == 1).compute()
+    return np.all(y.sum(axis=1) == 1)
+
+
+def column_or_1d(y, *, warn=False):
+    if is_dask_container(y):
+        from ..dask.utils import column_or_1d as dask_column_or_1d
+
+        return dask_column_or_1d(y, warn=warn)
+    return sklearn_column_or_1d(y, warn=warn)
+
+
+def unique(*args, **kwargs):
+    output = np.unique(args, kwargs)
+    if is_dask_container(output):
+        return (arr.compute() for arr in output)
+    return output

From ea30287c9638df632cb6aa20a5971312966ef3fc Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 5 Nov 2020 20:20:02 +0100
Subject: [PATCH 02/32] add dask to the install

---
 azure-pipelines.yml          | 1 +
 build_tools/azure/install.sh | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 1699a0d88..cb5b5c5b1 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -35,6 +35,7 @@ jobs:
         PYTHON_VERSION: '3.8'
         COVERAGE: 'true'
         PANDAS_VERSION: '*'
+        DASK_VERSION: '*'
         TEST_DOCSTRINGS: 'true'
         JOBLIB_VERSION: '*'
         CHECK_WARNINGS: 'true'
diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh
index 79c5d5814..80c6ada01 100755
--- a/build_tools/azure/install.sh
+++ b/build_tools/azure/install.sh
@@ -40,6 +40,10 @@ if [[ "$DISTRIB" == "conda" ]]; then
         TO_INSTALL="$TO_INSTALL pandas=$PANDAS_VERSION"
     fi
 
+    if [[ -n "$DASK_VERSION" ]]; then
+        TO_INSTALL="$TO_INSTALL dask=$DASK_VERSION"
+    fi
+
     if [[ -n "$KERAS_VERSION" ]]; then
         TO_INSTALL="$TO_INSTALL keras=$KERAS_VERSION tensorflow=1"
         KERAS_BACKEND=tensorflow

From 0766964224dd5802584c0a5a3a6e909f50d8e62c Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 5 Nov 2020 20:30:51 +0100
Subject: [PATCH 03/32] PEP8

---
 imblearn/dask/tests/test_utils.py | 1 -
 imblearn/dask/utils.py            | 1 -
 2 files changed, 2 deletions(-)

diff --git a/imblearn/dask/tests/test_utils.py b/imblearn/dask/tests/test_utils.py
index edf0665b6..524e87e1a 100644
--- a/imblearn/dask/tests/test_utils.py
+++ b/imblearn/dask/tests/test_utils.py
@@ -1,7 +1,6 @@
 import numpy as np
 import pytest
 from dask import array
-from dask_ml.datasets import make_classification
 
 from imblearn.dask.utils import is_multilabel
 from imblearn.dask.utils import type_of_target
diff --git a/imblearn/dask/utils.py b/imblearn/dask/utils.py
index 00f6128ba..5771120f4 100644
--- a/imblearn/dask/utils.py
+++ b/imblearn/dask/utils.py
@@ -1,6 +1,5 @@
 import warnings
 
-from dask import dataframe
 from dask import array
 from sklearn.exceptions import DataConversionWarning
 from sklearn.utils.multiclass import _is_integral_float

From d9edb9ad591c803a99adcbc586fcb2892b78c3dd Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 5 Nov 2020 20:46:12 +0100
Subject: [PATCH 04/32] iter

---
 imblearn/dask/tests/test_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/imblearn/dask/tests/test_utils.py b/imblearn/dask/tests/test_utils.py
index 524e87e1a..7787a0d20 100644
--- a/imblearn/dask/tests/test_utils.py
+++ b/imblearn/dask/tests/test_utils.py
@@ -1,5 +1,7 @@
 import numpy as np
 import pytest
+
+dask = pytest.importorskip("dask")
 from dask import array
 
 from imblearn.dask.utils import is_multilabel
@@ -18,7 +20,7 @@ def test_type_of_target_error():
     "y, expected_result",
     [
         (array.from_array(np.array([0, 1, 0, 1])), False),
-        (array.from_array(np.array([[1, 0], [0, 0]])), True),
+        (array(np.array([[1, 0], [0, 0]])), True),
         (array.from_array(np.array([[1], [0], [0]])), False),
         (array.from_array(np.array([[1, 0, 0]])), True),
     ]

From 4960724378cc59ade14f27f34683f0f240162651 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 5 Nov 2020 20:56:34 +0100
Subject: [PATCH 05/32] PEP8

---
 imblearn/dask/tests/test_utils.py | 2 +-
 setup.cfg                         | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/imblearn/dask/tests/test_utils.py b/imblearn/dask/tests/test_utils.py
index 7787a0d20..eaa4bcbc9 100644
--- a/imblearn/dask/tests/test_utils.py
+++ b/imblearn/dask/tests/test_utils.py
@@ -20,7 +20,7 @@ def test_type_of_target_error():
     "y, expected_result",
     [
         (array.from_array(np.array([0, 1, 0, 1])), False),
-        (array(np.array([[1, 0], [0, 0]])), True),
+        (array.from_array(np.array([[1, 0], [0, 0]])), True),
         (array.from_array(np.array([[1], [0], [0]])), False),
         (array.from_array(np.array([[1, 0, 0]])), True),
     ]
diff --git a/setup.cfg b/setup.cfg
index 1062c584c..ae0665223 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -32,3 +32,6 @@ addopts =
 filterwarnings = 
 	ignore:the matrix subclass:PendingDeprecationWarning
 
+[flake8]
+# Default flake8 3.5 ignored flags
+ignore=E121,E123,E126,E226,E24,E704,W503,W504,E402

From 21524290a6b1bb01949a648ea2d7288b2bfdd294 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 5 Nov 2020 21:26:38 +0100
Subject: [PATCH 06/32] iter

---
 conftest.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/conftest.py b/conftest.py
index d3ff91025..7ca37b601 100644
--- a/conftest.py
+++ b/conftest.py
@@ -31,3 +31,8 @@ def pytest_runtest_setup(item):
             import tensorflow
         except ImportError:
             pytest.skip('The tensorflow package is not installed.')
+    elif "dask" in fname:
+        try:
+            import dask
+        except ImportError:
+            pytest.skip('The dask package is not installed.')

From e5ce7a6821ed7ee79afc61352e8c8de14da64168 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 5 Nov 2020 21:28:49 +0100
Subject: [PATCH 07/32] PEP8

---
 conftest.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/conftest.py b/conftest.py
index 7ca37b601..607b4936e 100644
--- a/conftest.py
+++ b/conftest.py
@@ -22,17 +22,17 @@ def pytest_runtest_setup(item):
     if (fname.endswith(os.path.join('keras', '_generator.py')) or
             fname.endswith('miscellaneous.rst')):
         try:
-            import keras
+            import keras  # noqa
         except ImportError:
             pytest.skip('The keras package is not installed.')
     elif (fname.endswith(os.path.join('tensorflow', '_generator.py')) or
           fname.endswith('miscellaneous.rst')):
         try:
-            import tensorflow
+            import tensorflow  # noqa
         except ImportError:
             pytest.skip('The tensorflow package is not installed.')
     elif "dask" in fname:
         try:
-            import dask
+            import dask  # noqa
         except ImportError:
             pytest.skip('The dask package is not installed.')

From b537a201714d11ce94fad1cb531f46aa2273c44b Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 5 Nov 2020 21:51:33 +0100
Subject: [PATCH 08/32] iter

---
 imblearn/utils/_validation.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py
index bc20267dd..f3d6d695b 100644
--- a/imblearn/utils/_validation.py
+++ b/imblearn/utils/_validation.py
@@ -129,12 +129,12 @@ def check_target_type(y, indicate_one_vs_all=False, return_unique=False):
     y : ndarray
         The returned target.
 
-    y_unique : ndarray
-        The unique values in `y`.
-
     is_one_vs_all : bool, optional
         Indicate if the target was originally encoded in a one-vs-all fashion.
         Only returned if ``indicate_multilabel=True``.
+
+    y_unique : ndarray
+        The unique values in `y`.
     """
     type_y = type_of_target(y)
     if type_y == "multilabel-indicator":
@@ -154,7 +154,7 @@ def check_target_type(y, indicate_one_vs_all=False, return_unique=False):
     if return_unique:
         output += [unique(y)]
 
-    return output
+    return output[0] if len(output) == 1 else tuple(output)
 
 
 def _sampling_strategy_all(y, sampling_type):

From f781be0a2905bbea32c3a261b434df59d82480af Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 5 Nov 2020 22:03:18 +0100
Subject: [PATCH 09/32] iter

---
 imblearn/utils/testing.py | 2 +-
 setup.cfg                 | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/imblearn/utils/testing.py b/imblearn/utils/testing.py
index b5dc79828..b779b6cc1 100644
--- a/imblearn/utils/testing.py
+++ b/imblearn/utils/testing.py
@@ -53,7 +53,7 @@ def is_abstract(c):
         return True
 
     all_classes = []
-    modules_to_ignore = {"tests"}
+    modules_to_ignore = {"tests", "dask"}
     root = str(Path(__file__).parent.parent)
     # Ignore deprecation warnings triggered at import time and from walking
     # packages
diff --git a/setup.cfg b/setup.cfg
index ae0665223..0b7b5b1d2 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -21,7 +21,7 @@ test = pytest
 
 [tool:pytest]
 doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS
-addopts = 
+addopts =
 	--ignore build_tools
 	--ignore benchmarks
 	--ignore doc
@@ -29,7 +29,7 @@ addopts =
 	--ignore maint_tools
 	--doctest-modules
 	-rs
-filterwarnings = 
+filterwarnings =
 	ignore:the matrix subclass:PendingDeprecationWarning
 
 [flake8]

From fb3d6a4cffca376cf6e34812b952689e2e157bb4 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 5 Nov 2020 22:23:24 +0100
Subject: [PATCH 10/32] avoid import dask explicitely

---
 conftest.py            | 4 +++-
 imblearn/dask/utils.py | 8 ++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/conftest.py b/conftest.py
index 607b4936e..8de685080 100644
--- a/conftest.py
+++ b/conftest.py
@@ -19,6 +19,7 @@
 
 def pytest_runtest_setup(item):
     fname = item.fspath.strpath
+    print(item)
     if (fname.endswith(os.path.join('keras', '_generator.py')) or
             fname.endswith('miscellaneous.rst')):
         try:
@@ -31,7 +32,8 @@ def pytest_runtest_setup(item):
             import tensorflow  # noqa
         except ImportError:
             pytest.skip('The tensorflow package is not installed.')
-    elif "dask" in fname:
+    elif (fname.endswith(os.path.join("dask", "utils.py")) or
+          fname.endswith(os.path.join("dask", "_support.py"))):
         try:
             import dask  # noqa
         except ImportError:
diff --git a/imblearn/dask/utils.py b/imblearn/dask/utils.py
index 5771120f4..80a65713f 100644
--- a/imblearn/dask/utils.py
+++ b/imblearn/dask/utils.py
@@ -1,6 +1,6 @@
 import warnings
 
-from dask import array
+import numpy as np
 from sklearn.exceptions import DataConversionWarning
 from sklearn.utils.multiclass import _is_integral_float
 
@@ -9,7 +9,7 @@ def is_multilabel(y):
     if not (y.ndim == 2 and y.shape[1] > 1):
         return False
 
-    labels = array.unique(y).compute()
+    labels = np.unique(y).compute()
 
     return len(labels) < 3 and (
         y.dtype.kind in 'biu' or _is_integral_float(labels)
@@ -34,12 +34,12 @@ def type_of_target(y):
         suffix = ""
 
     # check float and contains non-integer float values
-    if y.dtype.kind == 'f' and array.any(y != y.astype(int)):
+    if y.dtype.kind == 'f' and np.any(y != y.astype(int)):
         # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
         # NOTE: we don't check for infinite values
         return 'continuous' + suffix
 
-    labels = array.unique(y).compute()
+    labels = np.unique(y).compute()
     if (len((labels)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
         # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
         return 'multiclass' + suffix

From b7d9f3b3e2b120a173a1477a0ce96ed9555491ed Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 5 Nov 2020 22:58:34 +0100
Subject: [PATCH 11/32] TST remove redundant test

---
 conftest.py                                   |  7 --
 .../tests/test_random_under_sampler.py        | 76 +++++--------------
 2 files changed, 21 insertions(+), 62 deletions(-)

diff --git a/conftest.py b/conftest.py
index 8de685080..72e6a23da 100644
--- a/conftest.py
+++ b/conftest.py
@@ -19,7 +19,6 @@
 
 def pytest_runtest_setup(item):
     fname = item.fspath.strpath
-    print(item)
     if (fname.endswith(os.path.join('keras', '_generator.py')) or
             fname.endswith('miscellaneous.rst')):
         try:
@@ -32,9 +31,3 @@ def pytest_runtest_setup(item):
             import tensorflow  # noqa
         except ImportError:
             pytest.skip('The tensorflow package is not installed.')
-    elif (fname.endswith(os.path.join("dask", "utils.py")) or
-          fname.endswith(os.path.join("dask", "_support.py"))):
-        try:
-            import dask  # noqa
-        except ImportError:
-            pytest.skip('The dask package is not installed.')
diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py
index 945d31fec..355273dc1 100644
--- a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py
+++ b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py
@@ -30,61 +30,27 @@
 Y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1])
 
 
-@pytest.mark.parametrize("as_frame", [True, False], ids=['dataframe', 'array'])
-def test_rus_fit_resample(as_frame):
-    if as_frame:
-        pd = pytest.importorskip("pandas")
-        X_ = pd.DataFrame(X)
-    else:
-        X_ = X
-    rus = RandomUnderSampler(random_state=RND_SEED, replacement=True)
-    X_resampled, y_resampled = rus.fit_resample(X_, Y)
-
-    X_gt = np.array(
-        [
-            [0.92923648, 0.76103773],
-            [0.47104475, 0.44386323],
-            [0.13347175, 0.12167502],
-            [0.09125309, -0.85409574],
-            [0.12372842, 0.6536186],
-            [0.04352327, -0.20515826],
-        ]
-    )
-    y_gt = np.array([0, 0, 0, 1, 1, 1])
-
-    if as_frame:
-        assert hasattr(X_resampled, "loc")
-        X_resampled = X_resampled.to_numpy()
-
-    assert_array_equal(X_resampled, X_gt)
-    assert_array_equal(y_resampled, y_gt)
-
-
-def test_rus_fit_resample_half():
-    sampling_strategy = {0: 3, 1: 6}
-    rus = RandomUnderSampler(
-        sampling_strategy=sampling_strategy,
-        random_state=RND_SEED,
-        replacement=True,
-    )
-    X_resampled, y_resampled = rus.fit_resample(X, Y)
-
-    X_gt = np.array(
-        [
-            [0.92923648, 0.76103773],
-            [0.47104475, 0.44386323],
-            [0.92923648, 0.76103773],
-            [0.15490546, 0.3130677],
-            [0.15490546, 0.3130677],
-            [0.15490546, 0.3130677],
-            [0.20792588, 1.49407907],
-            [0.15490546, 0.3130677],
-            [0.12372842, 0.6536186],
-        ]
-    )
-    y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
-    assert_array_equal(X_resampled, X_gt)
-    assert_array_equal(y_resampled, y_gt)
+@pytest.mark.parametrize(
+    "sampling_strategy, expected_counts",
+    [
+        ("auto", {0: 3, 1: 3}),
+        ({0: 3, 1: 6}, {0: 3, 1: 6}),
+    ]
+)
+def test_rus_fit_resample(sampling_strategy, expected_counts):
+    rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
+    X_res, y_res = rus.fit_resample(X, Y)
+
+    # check that there is not samples from class 0 resampled as class 1 and
+    # vice-versa
+    classes = [0, 1]
+    for c0, c1 in (classes, classes[::-1]):
+        X_c0 = X[Y == c0]
+        X_c1 = X_res[y_res == c1]
+        for s0 in X_c0:
+            assert not np.isclose(s0, X_c1).all(axis=1).any()
+
+    assert Counter(y_res) == expected_counts
 
 
 def test_multiclass_fit_resample():

From d26da3cd7980bf98511ac4daf35257d8468a0f85 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 5 Nov 2020 23:46:23 +0100
Subject: [PATCH 12/32] iter

---
 azure-pipelines.yml          | 2 ++
 build_tools/azure/install.sh | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index cb5b5c5b1..3da97175e 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -44,6 +44,7 @@ jobs:
         PYTHON_VERSION: '3.7'
         INSTALL_MKL: 'true'
         PANDAS_VERSION: '*'
+        DASK_VERSION: '*'
         KERAS_VERSION: '*'
         COVERAGE: 'true'
         JOBLIB_VERSION: '*'
@@ -52,6 +53,7 @@ jobs:
         DISTRIB: 'conda'
         PYTHON_VERSION: '3.8'
         PANDAS_VERSION: '*'
+        DASK_VERSION: '*'
         JOBLIB_VERSION: '*'
         INSTALL_MKL: 'true'
         TENSORFLOW_VERSION: '*'
diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh
index 80c6ada01..d4d7a3692 100755
--- a/build_tools/azure/install.sh
+++ b/build_tools/azure/install.sh
@@ -94,9 +94,10 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then
     make_conda "python=$PYTHON_VERSION"
     python -m pip install -U pip
     python -m pip install numpy scipy joblib cython
+    python -m pip install pandas==$PANDAS_VERSION
+    python -m pip install dask==$DASK_VERSION
     python -m pip install scikit-learn
     python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist
-    python -m pip install pandas
 fi
 
 if [[ "$COVERAGE" == "true" ]]; then

From c065808361bc393bb0707b8e4e75700b1dbfed02 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 5 Nov 2020 23:51:34 +0100
Subject: [PATCH 13/32] xxx

---
 build_tools/azure/install.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh
index d4d7a3692..ed7ac29a9 100755
--- a/build_tools/azure/install.sh
+++ b/build_tools/azure/install.sh
@@ -94,8 +94,8 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then
     make_conda "python=$PYTHON_VERSION"
     python -m pip install -U pip
     python -m pip install numpy scipy joblib cython
-    python -m pip install pandas==$PANDAS_VERSION
-    python -m pip install dask==$DASK_VERSION
+    python -m pip install pandas
+    python -m pip install dask
     python -m pip install scikit-learn
     python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist
 fi

From f2d0ec0b1857eea9fce464c318404ad13ba684d3 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 5 Nov 2020 23:57:05 +0100
Subject: [PATCH 14/32] install complete dask

---
 build_tools/azure/install.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh
index ed7ac29a9..dba7754a6 100755
--- a/build_tools/azure/install.sh
+++ b/build_tools/azure/install.sh
@@ -95,7 +95,7 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then
     python -m pip install -U pip
     python -m pip install numpy scipy joblib cython
     python -m pip install pandas
-    python -m pip install dask
+    python -m pip install "dask[complete]"
     python -m pip install scikit-learn
     python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist
 fi

From 20ba9348a24000e1d2916674397a3173bffc2739 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 6 Nov 2020 00:06:23 +0100
Subject: [PATCH 15/32] iter

---
 imblearn/dask/tests/test_utils.py             |  8 -----
 .../_random_under_sampler.py                  |  1 +
 imblearn/utils/estimator_checks.py            | 35 ++++++++++++++++++-
 3 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/imblearn/dask/tests/test_utils.py b/imblearn/dask/tests/test_utils.py
index eaa4bcbc9..0a262a435 100644
--- a/imblearn/dask/tests/test_utils.py
+++ b/imblearn/dask/tests/test_utils.py
@@ -8,14 +8,6 @@
 from imblearn.dask.utils import type_of_target
 
 
-def test_type_of_target_error():
-    y = np.arange(10)
-
-    err_msg = "Expected a Dask array, series or dataframe."
-    with pytest.raises(ValueError, match=err_msg):
-        type_of_target(y)
-
-
 @pytest.mark.parametrize(
     "y, expected_result",
     [
diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
index 8d9745622..9bc807ea2 100644
--- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
+++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
@@ -140,6 +140,7 @@ def _more_tags(self):
                 "2darray",
                 "string",
                 "dask-array",
+                "dask-dataframe"
             ],
             "sample_indices": True,
             "allow_nan": True,
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index 1494fd695..b04cc388c 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -61,6 +61,8 @@ def _yield_sampler_checks(sampler):
     yield check_samplers_pandas
     if "dask-array" in tags["X_types"]:
         yield check_samplers_dask_array
+    if "dask-dataframe" in tags["X_types"]:
+        yield check_samplers_dask_dataframe
     yield check_samplers_list
     yield check_samplers_multiclass_ova
     yield check_samplers_preserve_dtype
@@ -295,7 +297,7 @@ def check_samplers_pandas(name, sampler):
 
 def check_samplers_dask_array(name, sampler):
     dask = pytest.importorskip("dask")
-    # Check that the samplers handle pandas dataframe and pandas series
+    # Check that the samplers handle dask array
     X, y = make_classification(
         n_samples=1000,
         n_classes=3,
@@ -317,6 +319,37 @@ def check_samplers_dask_array(name, sampler):
     assert_allclose(y_res_dask, y_res)
 
 
+def check_samplers_dask_dataframe(name, sampler):
+    dask = pytest.importorskip("dask")
+    # Check that the samplers handle dask dataframe and dask series
+    X, y = make_classification(
+        n_samples=1000,
+        n_classes=3,
+        n_informative=4,
+        weights=[0.2, 0.3, 0.5],
+        random_state=0,
+    )
+    X_df = dask.dataframe.from_array(
+        X, columns=[str(i) for i in range(X.shape[1])]
+    )
+    y_s = dask.dataframe.from_array(y)
+
+    X_res_df, y_res_s = sampler.fit_resample(X_df, y_s)
+    X_res, y_res = sampler.fit_resample(X, y)
+
+    # check that we return the same type for dataframes or series types
+    assert isinstance(X_res_df, dask.dataframe.DataFrame)
+    assert isinstance(y_res_s, dask.dataframe.Series)
+
+    # assert X_df.columns.to_list() == X_res_df.columns.to_list()
+    # assert y_df.columns.to_list() == y_res_df.columns.to_list()
+    # assert y_s.name == y_res_s.name
+
+    # assert_allclose(X_res_df.to_numpy(), X_res)
+    # assert_allclose(y_res_df.to_numpy().ravel(), y_res)
+    # assert_allclose(y_res_s.to_numpy(), y_res)
+
+
 def check_samplers_list(name, sampler):
     # Check that the can samplers handle simple lists
     X, y = make_classification(

From 0941a5e7eedb8fffce5646df150334c1c1e877c5 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 6 Nov 2020 00:06:49 +0100
Subject: [PATCH 16/32] iter

---
 imblearn/ensemble/tests/test_weight_boosting.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/imblearn/ensemble/tests/test_weight_boosting.py b/imblearn/ensemble/tests/test_weight_boosting.py
index 26facce90..517f61f40 100644
--- a/imblearn/ensemble/tests/test_weight_boosting.py
+++ b/imblearn/ensemble/tests/test_weight_boosting.py
@@ -77,7 +77,7 @@ def test_rusboost(imbalanced_dataset, algorithm):
     assert rusboost.decision_function(X_test).shape[1] == len(classes)
 
     score = rusboost.score(X_test, y_test)
-    assert score > 0.7, "Failed with algorithm {} and score {}".format(
+    assert score > 0.65, "Failed with algorithm {} and score {}".format(
         algorithm, score
     )
 

From 7aae9d924a5bc2759c80cc2dff200ea227aaf9db Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 6 Nov 2020 00:07:32 +0100
Subject: [PATCH 17/32] iter

---
 .../_prototype_selection/_random_under_sampler.py               | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
index 9bc807ea2..c9da1d28a 100644
--- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
+++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
@@ -140,7 +140,7 @@ def _more_tags(self):
                 "2darray",
                 "string",
                 "dask-array",
-                "dask-dataframe"
+                # "dask-dataframe"
             ],
             "sample_indices": True,
             "allow_nan": True,

From 00c0a265f1930becf6a332195629dc9a2b917757 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 6 Nov 2020 11:23:43 +0100
Subject: [PATCH 18/32] iter

---
 imblearn/base.py                              |  3 +-
 imblearn/dask/utils.py                        | 17 +++++++++--
 .../_random_under_sampler.py                  |  8 +++++-
 imblearn/utils/_validation.py                 | 28 +++++++++++++++++--
 imblearn/utils/estimator_checks.py            | 11 ++++----
 imblearn/utils/wrapper.py                     | 24 ++++++++++++----
 6 files changed, 74 insertions(+), 17 deletions(-)

diff --git a/imblearn/base.py b/imblearn/base.py
index 86bb53778..6a829e3e4 100644
--- a/imblearn/base.py
+++ b/imblearn/base.py
@@ -10,11 +10,11 @@
 
 from sklearn.base import BaseEstimator
 from sklearn.preprocessing import label_binarize
-from sklearn.utils.multiclass import check_classification_targets
 
 from .utils import check_sampling_strategy, check_target_type
 from .utils._validation import ArraysTransformer
 from .utils._validation import _deprecate_positional_args
+from .utils.wrapper import check_classification_targets
 
 
 class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
@@ -82,6 +82,7 @@ def fit_resample(self, X, y):
 
         output = self._fit_resample(X, y)
 
+        # TODO: label binarize is not implemented with dask
         y_ = (label_binarize(output[1], np.unique(y))
               if binarize_y else output[1])
 
diff --git a/imblearn/dask/utils.py b/imblearn/dask/utils.py
index 80a65713f..814f9ce81 100644
--- a/imblearn/dask/utils.py
+++ b/imblearn/dask/utils.py
@@ -9,7 +9,10 @@ def is_multilabel(y):
     if not (y.ndim == 2 and y.shape[1] > 1):
         return False
 
-    labels = np.unique(y).compute()
+    if hasattr(y, "unique"):
+        labels = np.asarray(y.unique())
+    else:
+        labels = np.unique(y).compute()
 
     return len(labels) < 3 and (
         y.dtype.kind in 'biu' or _is_integral_float(labels)
@@ -39,7 +42,10 @@ def type_of_target(y):
         # NOTE: we don't check for infinite values
         return 'continuous' + suffix
 
-    labels = np.unique(y).compute()
+    if hasattr(y, "unique"):
+        labels = np.asarray(y.unique())
+    else:
+        labels = np.unique(y).compute()
     if (len((labels)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
         # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
         return 'multiclass' + suffix
@@ -63,3 +69,10 @@ def column_or_1d(y, *, warn=False):
     raise ValueError(
         f"y should be a 1d array. Got an array of shape {shape} instead."
     )
+
+
+def check_classification_targets(y):
+    y_type = type_of_target(y)
+    if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
+                      'multilabel-indicator', 'multilabel-sequences']:
+        raise ValueError("Unknown label type: %r" % y_type)
diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
index c9da1d28a..ab92071b6 100644
--- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
+++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
@@ -81,6 +81,9 @@ def __init__(
         self.replacement = replacement
 
     def _check_X_y(self, X, y):
+        if is_dask_container(y) and hasattr(y, "to_dask_array"):
+            y = y.to_dask_array()
+            y.compute_chunk_sizes()
         y, binarize_y, self._uniques = check_target_type(
             y,
             indicate_one_vs_all=True,
@@ -95,6 +98,9 @@ def _check_X_y(self, X, y):
                 dtype=None,
                 force_all_finite=False,
             )
+        elif is_dask_container(X) and hasattr(X, "to_dask_array"):
+            X = X.to_dask_array()
+            X.compute_chunk_sizes()
         return X, y, binarize_y
 
     @staticmethod
@@ -140,7 +146,7 @@ def _more_tags(self):
                 "2darray",
                 "string",
                 "dask-array",
-                # "dask-dataframe"
+                "dask-dataframe"
             ],
             "sample_indices": True,
             "allow_nan": True,
diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py
index f3d6d695b..f4875bef3 100644
--- a/imblearn/utils/_validation.py
+++ b/imblearn/utils/_validation.py
@@ -47,6 +47,9 @@ def transform(self, X, y):
     def _gets_props(self, array):
         props = {}
         props["type"] = array.__class__.__name__
+        if props["type"].lower() in ("series", "dataframe"):
+            suffix = "dask-" if is_dask_container(array) else "pandas-"
+            props["type"] = suffix + props["type"]
         props["columns"] = getattr(array, "columns", None)
         props["name"] = getattr(array, "name", None)
         props["dtypes"] = getattr(array, "dtypes", None)
@@ -56,13 +59,34 @@ def _transfrom_one(self, array, props):
         type_ = props["type"].lower()
         if type_ == "list":
             ret = array.tolist()
-        elif type_ == "dataframe":
+        elif type_ == "pandas-dataframe":
             import pandas as pd
+
             ret = pd.DataFrame(array, columns=props["columns"])
             ret = ret.astype(props["dtypes"])
-        elif type_ == "series":
+        elif type_ == "pandas-series":
             import pandas as pd
+
             ret = pd.Series(array, dtype=props["dtypes"], name=props["name"])
+        elif type_ == "dask-dataframe":
+            from dask import dataframe
+
+            if is_dask_container(array):
+                ret = dataframe.from_dask_array(
+                    array, columns=props["columns"]
+                )
+            else:
+                ret = dataframe.from_array(array, columns=props["columns"])
+            ret = ret.astype(props["dtypes"])
+        elif type_ == "dask-series":
+            from dask import dataframe
+
+            if is_dask_container(array):
+                ret = dataframe.from_dask_array(array)
+            else:
+                ret = dataframe.from_array(array)
+            ret = ret.astype(props["dtypes"])
+            ret = ret.rename(props["name"])
         else:
             ret = array
         return ret
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index b04cc388c..b4dcbc904 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -333,6 +333,7 @@ def check_samplers_dask_dataframe(name, sampler):
         X, columns=[str(i) for i in range(X.shape[1])]
     )
     y_s = dask.dataframe.from_array(y)
+    y_s = y_s.rename("target")
 
     X_res_df, y_res_s = sampler.fit_resample(X_df, y_s)
     X_res, y_res = sampler.fit_resample(X, y)
@@ -341,13 +342,11 @@ def check_samplers_dask_dataframe(name, sampler):
     assert isinstance(X_res_df, dask.dataframe.DataFrame)
     assert isinstance(y_res_s, dask.dataframe.Series)
 
-    # assert X_df.columns.to_list() == X_res_df.columns.to_list()
-    # assert y_df.columns.to_list() == y_res_df.columns.to_list()
-    # assert y_s.name == y_res_s.name
+    assert X_df.columns.to_list() == X_res_df.columns.to_list()
+    assert y_s.name == y_res_s.name
 
-    # assert_allclose(X_res_df.to_numpy(), X_res)
-    # assert_allclose(y_res_df.to_numpy().ravel(), y_res)
-    # assert_allclose(y_res_s.to_numpy(), y_res)
+    assert_allclose(np.array(X_res_df), X_res)
+    assert_allclose(np.array(y_res_s), y_res)
 
 
 def check_samplers_list(name, sampler):
diff --git a/imblearn/utils/wrapper.py b/imblearn/utils/wrapper.py
index 646dc1a64..7dbfa3b1e 100644
--- a/imblearn/utils/wrapper.py
+++ b/imblearn/utils/wrapper.py
@@ -1,5 +1,7 @@
 import numpy as np
 
+from sklearn.utils.multiclass import check_classification_targets as \
+    sklearn_check_classification_targets
 from sklearn.utils.multiclass import type_of_target as sklearn_type_of_target
 from sklearn.utils.validation import column_or_1d as sklearn_column_or_1d
 
@@ -30,8 +32,20 @@ def column_or_1d(y, *, warn=False):
     return sklearn_column_or_1d(y, warn=warn)
 
 
-def unique(*args, **kwargs):
-    output = np.unique(args, kwargs)
-    if is_dask_container(output):
-        return (arr.compute() for arr in output)
-    return output
+def unique(arr, **kwargs):
+    if is_dask_container(arr):
+        if hasattr(arr, "unique"):
+            output = np.asarray(arr.unique(**kwargs))
+        else:
+            output = np.unique(arr).compute()
+        return output
+    return np.unique(arr, **kwargs)
+
+
+def check_classification_targets(y):
+    if is_dask_container(y):
+        from ..dask.utils import check_classification_targets as \
+            dask_check_classification_targets
+
+        return dask_check_classification_targets(y)
+    return sklearn_check_classification_targets(y)

From 8bfa040a6aefc39979bd6d46f57f46f3f15f2473 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 6 Nov 2020 11:30:42 +0100
Subject: [PATCH 19/32] requirements

---
 requirements.optional.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.optional.txt b/requirements.optional.txt
index 826277d5e..f785df2ff 100644
--- a/requirements.optional.txt
+++ b/requirements.optional.txt
@@ -1,2 +1,3 @@
+dask[complete]
 keras
 tensorflow

From d4aabf80da7b8e2720c8804a9e3a4dfc837cbe87 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sat, 7 Nov 2020 23:06:34 +0100
Subject: [PATCH 20/32] iter

---
 imblearn/base.py                              |  36 +++-
 imblearn/dask/_support.py                     |  20 +--
 .../_random_under_sampler.py                  |  44 +++--
 imblearn/utils/_docstring.py                  |   7 +
 imblearn/utils/_validation.py                 | 167 +++++++++---------
 imblearn/utils/estimator_checks.py            |  18 +-
 imblearn/utils/wrapper.py                     |  12 +-
 7 files changed, 168 insertions(+), 136 deletions(-)

diff --git a/imblearn/base.py b/imblearn/base.py
index 6a829e3e4..38f4259a4 100644
--- a/imblearn/base.py
+++ b/imblearn/base.py
@@ -11,9 +11,13 @@
 from sklearn.base import BaseEstimator
 from sklearn.preprocessing import label_binarize
 
+from .dask._support import is_dask_collection
 from .utils import check_sampling_strategy, check_target_type
-from .utils._validation import ArraysTransformer
-from .utils._validation import _deprecate_positional_args
+from .utils._validation import (
+    ArraysTransformer,
+    _deprecate_positional_args,
+    get_classes_counts,
+)
 from .utils.wrapper import check_classification_targets
 
 
@@ -45,9 +49,13 @@ def fit(self, X, y):
         self : object
             Return the instance itself.
         """
-        X, y, _ = self._check_X_y(X, y)
+        dask_collection = any([is_dask_collection(arr) for arr in (X, y)])
+        if (not dask_collection or
+                (dask_collection and self.validate_if_dask_collection)):
+            X, y, _ = self._check_X_y(X, y)
+        self._classes_counts = get_classes_counts(y)
         self.sampling_strategy_ = check_sampling_strategy(
-            self.sampling_strategy, y, self._sampling_type
+            self.sampling_strategy, self._classes_counts, self._sampling_type
         )
         return self
 
@@ -72,12 +80,19 @@ def fit_resample(self, X, y):
         y_resampled : array-like of shape (n_samples_new,)
             The corresponding label of `X_resampled`.
         """
-        check_classification_targets(y)
         arrays_transformer = ArraysTransformer(X, y)
-        X, y, binarize_y = self._check_X_y(X, y)
+        dask_collection = any([is_dask_collection(arr) for arr in (X, y)])
+        if (not dask_collection or
+                (dask_collection and self.validate_if_dask_collection)):
+            check_classification_targets(y)
+            X, y, binarize_y = self._check_X_y(X, y)
+        else:
+            X, y = arrays_transformer.to_dask_array(X, y)
+            binarize_y = False
 
+        self._classes_counts = get_classes_counts(y)
         self.sampling_strategy_ = check_sampling_strategy(
-            self.sampling_strategy, y, self._sampling_type
+            self.sampling_strategy, self._classes_counts, self._sampling_type
         )
 
         output = self._fit_resample(X, y)
@@ -125,8 +140,13 @@ class BaseSampler(SamplerMixin):
     instead.
     """
 
-    def __init__(self, sampling_strategy="auto"):
+    def __init__(
+        self,
+        sampling_strategy="auto",
+        validate_if_dask_collection=False,
+    ):
         self.sampling_strategy = sampling_strategy
+        self.validate_if_dask_collection = validate_if_dask_collection
 
     def _check_X_y(self, X, y, accept_sparse=None):
         if accept_sparse is None:
diff --git a/imblearn/dask/_support.py b/imblearn/dask/_support.py
index b0b4153d2..b5239ccac 100644
--- a/imblearn/dask/_support.py
+++ b/imblearn/dask/_support.py
@@ -1,13 +1,9 @@
-_REGISTERED_DASK_CONTAINER = []
+def is_dask_collection(container):
+    try:
+        # to keep dask as an optional depency, keep the statement in a
+        # try/except statement
+        from dask import is_dask_collection
 
-try:
-    from dask import array, dataframe
-    _REGISTERED_DASK_CONTAINER += [
-        array.Array, dataframe.Series, dataframe.DataFrame,
-    ]
-except ImportError:
-    pass
-
-
-def is_dask_container(container):
-    return isinstance(container, tuple(_REGISTERED_DASK_CONTAINER))
+        return is_dask_collection(container)
+    except ImportError:
+        return False
diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
index ab92071b6..28ef02d88 100644
--- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
+++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
@@ -10,16 +10,20 @@
 from sklearn.utils import _safe_indexing
 
 from ..base import BaseUnderSampler
-from ...dask._support import is_dask_container
+from ...dask._support import is_dask_collection
 from ...utils import check_target_type
 from ...utils import Substitution
-from ...utils._docstring import _random_state_docstring
+from ...utils._docstring import (
+    _random_state_docstring,
+    _validate_if_dask_collection_docstring
+)
 from ...utils._validation import _deprecate_positional_args
 
 
 @Substitution(
     sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
     random_state=_random_state_docstring,
+    validate_if_dask_collection=_validate_if_dask_collection_docstring,
 )
 class RandomUnderSampler(BaseUnderSampler):
     """Class to perform random under-sampling.
@@ -38,6 +42,8 @@ class RandomUnderSampler(BaseUnderSampler):
     replacement : bool, default=False
         Whether the sample is with or without replacement.
 
+    {validate_if_dask_collection}
+
     Attributes
     ----------
     sample_indices_ : ndarray of shape (n_new_samples,)
@@ -74,22 +80,23 @@ class RandomUnderSampler(BaseUnderSampler):
 
     @_deprecate_positional_args
     def __init__(
-        self, *, sampling_strategy="auto", random_state=None, replacement=False
+        self,
+        *,
+        sampling_strategy="auto",
+        random_state=None,
+        replacement=False,
+        validate_if_dask_collection=False,
     ):
-        super().__init__(sampling_strategy=sampling_strategy)
+        super().__init__(
+            sampling_strategy=sampling_strategy,
+            validate_if_dask_collection=validate_if_dask_collection,
+        )
         self.random_state = random_state
         self.replacement = replacement
 
     def _check_X_y(self, X, y):
-        if is_dask_container(y) and hasattr(y, "to_dask_array"):
-            y = y.to_dask_array()
-            y.compute_chunk_sizes()
-        y, binarize_y, self._uniques = check_target_type(
-            y,
-            indicate_one_vs_all=True,
-            return_unique=True,
-        )
-        if not any([is_dask_container(arr) for arr in (X, y)]):
+        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
+        if not any([is_dask_collection(arr) for arr in (X, y)]):
             X, y = self._validate_data(
                 X,
                 y,
@@ -98,16 +105,15 @@ def _check_X_y(self, X, y):
                 dtype=None,
                 force_all_finite=False,
             )
-        elif is_dask_container(X) and hasattr(X, "to_dask_array"):
-            X = X.to_dask_array()
-            X.compute_chunk_sizes()
         return X, y, binarize_y
 
     @staticmethod
     def _find_target_class_indices(y, target_class):
         target_class_indices = np.flatnonzero(y == target_class)
-        if is_dask_container(y):
-            return target_class_indices.compute()
+        if is_dask_collection(y):
+            from dask import compute
+
+            return compute(target_class_indices)[0]
         return target_class_indices
 
     def _fit_resample(self, X, y):
@@ -115,7 +121,7 @@ def _fit_resample(self, X, y):
 
         idx_under = []
 
-        for target_class in self._uniques:
+        for target_class in self._classes_counts:
             target_class_indices = self._find_target_class_indices(
                 y, target_class
             )
diff --git a/imblearn/utils/_docstring.py b/imblearn/utils/_docstring.py
index d03be3740..87907d73e 100644
--- a/imblearn/utils/_docstring.py
+++ b/imblearn/utils/_docstring.py
@@ -41,3 +41,10 @@ def __call__(self, obj):
         `Glossary <https://scikit-learn.org/stable/glossary.html#term-n-jobs>`_
         for more details.
     """.rstrip()
+
+_validate_if_dask_collection_docstring = \
+  """validate_if_dask_collection : bool, default=False
+        Whether or not `X` and `y` should be validated. This parameter applies
+        only when `X` and `y` are Dask collections where validation might be
+        potentially costly.
+    """.rstrip()
\ No newline at end of file
diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py
index f4875bef3..6b3936836 100644
--- a/imblearn/utils/_validation.py
+++ b/imblearn/utils/_validation.py
@@ -15,7 +15,7 @@
 from sklearn.neighbors._base import KNeighborsMixin
 from sklearn.neighbors import NearestNeighbors
 
-from ..dask._support import is_dask_container
+from ..dask._support import is_dask_collection
 from ..exceptions import raise_isinstance_error
 from .wrapper import _is_multiclass_encoded
 from .wrapper import column_or_1d
@@ -39,6 +39,16 @@ def __init__(self, X, y):
         self.x_props = self._gets_props(X)
         self.y_props = self._gets_props(y)
 
+    @staticmethod
+    def to_dask_array(X, y):
+        if hasattr(X, "to_dask_array"):
+            X = X.to_dask_array()
+            X.compute_chunk_sizes()
+        if hasattr(y, "to_dask_array"):
+            y = y.to_dask_array()
+            y.compute_chunk_sizes()
+        return X, y
+
     def transform(self, X, y):
         X = self._transfrom_one(X, self.x_props)
         y = self._transfrom_one(y, self.y_props)
@@ -48,7 +58,7 @@ def _gets_props(self, array):
         props = {}
         props["type"] = array.__class__.__name__
         if props["type"].lower() in ("series", "dataframe"):
-            suffix = "dask-" if is_dask_container(array) else "pandas-"
+            suffix = "dask-" if is_dask_collection(array) else "pandas-"
             props["type"] = suffix + props["type"]
         props["columns"] = getattr(array, "columns", None)
         props["name"] = getattr(array, "name", None)
@@ -71,7 +81,7 @@ def _transfrom_one(self, array, props):
         elif type_ == "dask-dataframe":
             from dask import dataframe
 
-            if is_dask_container(array):
+            if is_dask_collection(array):
                 ret = dataframe.from_dask_array(
                     array, columns=props["columns"]
                 )
@@ -81,7 +91,7 @@ def _transfrom_one(self, array, props):
         elif type_ == "dask-series":
             from dask import dataframe
 
-            if is_dask_container(array):
+            if is_dask_collection(array):
                 ret = dataframe.from_dask_array(array)
             else:
                 ret = dataframe.from_array(array)
@@ -124,14 +134,16 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0):
         raise_isinstance_error(nn_name, [int, KNeighborsMixin], nn_object)
 
 
-def _count_class_sample(y):
+def get_classes_counts(y):
     unique, counts = np.unique(y, return_counts=True)
-    if is_dask_container(unique):
-        unique, counts = unique.compute(), counts.compute()
+    if is_dask_collection(unique):
+        from dask import compute
+
+        unique, counts = compute(unique, counts)
     return dict(zip(unique, counts))
 
 
-def check_target_type(y, indicate_one_vs_all=False, return_unique=False):
+def check_target_type(y, indicate_one_vs_all=False):
     """Check the target types to be conform to the current samplers.
 
     The current samplers should be compatible with ``'binary'``,
@@ -145,9 +157,6 @@ def check_target_type(y, indicate_one_vs_all=False, return_unique=False):
     indicate_one_vs_all : bool, default=False
         Either to indicate if the targets are encoded in a one-vs-all fashion.
 
-    return_unique : bool, default=False
-        Either to return or not the unique values in y.
-
     Returns
     -------
     y : ndarray
@@ -175,27 +184,24 @@ def check_target_type(y, indicate_one_vs_all=False, return_unique=False):
     output = [y]
     if indicate_one_vs_all:
         output += [type_y == "multilabel-indicator"]
-    if return_unique:
-        output += [unique(y)]
 
     return output[0] if len(output) == 1 else tuple(output)
 
 
-def _sampling_strategy_all(y, sampling_type):
+def _sampling_strategy_all(classes_counts, sampling_type):
     """Returns sampling target by targeting all classes."""
-    target_stats = _count_class_sample(y)
     if sampling_type == "over-sampling":
-        n_sample_majority = max(target_stats.values())
+        n_sample_majority = max(classes_counts.values())
         sampling_strategy = {
             key: n_sample_majority - value
-            for (key, value) in target_stats.items()
+            for (key, value) in classes_counts.items()
         }
     elif (
         sampling_type == "under-sampling" or sampling_type == "clean-sampling"
     ):
-        n_sample_minority = min(target_stats.values())
+        n_sample_minority = min(classes_counts.values())
         sampling_strategy = {
-            key: n_sample_minority for key in target_stats.keys()
+            key: n_sample_minority for key in classes_counts.keys()
         }
     else:
         raise NotImplementedError
@@ -203,7 +209,7 @@ def _sampling_strategy_all(y, sampling_type):
     return sampling_strategy
 
 
-def _sampling_strategy_majority(y, sampling_type):
+def _sampling_strategy_majority(classes_counts, sampling_type):
     """Returns sampling target by targeting the majority class only."""
     if sampling_type == "over-sampling":
         raise ValueError(
@@ -213,12 +219,11 @@ def _sampling_strategy_majority(y, sampling_type):
     elif (
         sampling_type == "under-sampling" or sampling_type == "clean-sampling"
     ):
-        target_stats = _count_class_sample(y)
-        class_majority = max(target_stats, key=target_stats.get)
-        n_sample_minority = min(target_stats.values())
+        class_majority = max(classes_counts, key=classes_counts.get)
+        n_sample_minority = min(classes_counts.values())
         sampling_strategy = {
             key: n_sample_minority
-            for key in target_stats.keys()
+            for key in classes_counts.keys()
             if key == class_majority
         }
     else:
@@ -227,26 +232,25 @@ def _sampling_strategy_majority(y, sampling_type):
     return sampling_strategy
 
 
-def _sampling_strategy_not_majority(y, sampling_type):
+def _sampling_strategy_not_majority(classes_counts, sampling_type):
     """Returns sampling target by targeting all classes but not the
     majority."""
-    target_stats = _count_class_sample(y)
     if sampling_type == "over-sampling":
-        n_sample_majority = max(target_stats.values())
-        class_majority = max(target_stats, key=target_stats.get)
+        n_sample_majority = max(classes_counts.values())
+        class_majority = max(classes_counts, key=classes_counts.get)
         sampling_strategy = {
             key: n_sample_majority - value
-            for (key, value) in target_stats.items()
+            for (key, value) in classes_counts.items()
             if key != class_majority
         }
     elif (
         sampling_type == "under-sampling" or sampling_type == "clean-sampling"
     ):
-        n_sample_minority = min(target_stats.values())
-        class_majority = max(target_stats, key=target_stats.get)
+        n_sample_minority = min(classes_counts.values())
+        class_majority = max(classes_counts, key=classes_counts.get)
         sampling_strategy = {
             key: n_sample_minority
-            for key in target_stats.keys()
+            for key in classes_counts.keys()
             if key != class_majority
         }
     else:
@@ -255,26 +259,25 @@ def _sampling_strategy_not_majority(y, sampling_type):
     return sampling_strategy
 
 
-def _sampling_strategy_not_minority(y, sampling_type):
+def _sampling_strategy_not_minority(classes_counts, sampling_type):
     """Returns sampling target by targeting all classes but not the
     minority."""
-    target_stats = _count_class_sample(y)
     if sampling_type == "over-sampling":
-        n_sample_majority = max(target_stats.values())
-        class_minority = min(target_stats, key=target_stats.get)
+        n_sample_majority = max(classes_counts.values())
+        class_minority = min(classes_counts, key=classes_counts.get)
         sampling_strategy = {
             key: n_sample_majority - value
-            for (key, value) in target_stats.items()
+            for (key, value) in classes_counts.items()
             if key != class_minority
         }
     elif (
         sampling_type == "under-sampling" or sampling_type == "clean-sampling"
     ):
-        n_sample_minority = min(target_stats.values())
-        class_minority = min(target_stats, key=target_stats.get)
+        n_sample_minority = min(classes_counts.values())
+        class_minority = min(classes_counts, key=classes_counts.get)
         sampling_strategy = {
             key: n_sample_minority
-            for key in target_stats.keys()
+            for key in classes_counts.keys()
             if key != class_minority
         }
     else:
@@ -283,15 +286,14 @@ def _sampling_strategy_not_minority(y, sampling_type):
     return sampling_strategy
 
 
-def _sampling_strategy_minority(y, sampling_type):
+def _sampling_strategy_minority(classes_counts, sampling_type):
     """Returns sampling target by targeting the minority class only."""
-    target_stats = _count_class_sample(y)
     if sampling_type == "over-sampling":
-        n_sample_majority = max(target_stats.values())
-        class_minority = min(target_stats, key=target_stats.get)
+        n_sample_majority = max(classes_counts.values())
+        class_minority = min(classes_counts, key=classes_counts.get)
         sampling_strategy = {
             key: n_sample_majority - value
-            for (key, value) in target_stats.items()
+            for (key, value) in classes_counts.items()
             if key == class_minority
         }
     elif (
@@ -307,24 +309,23 @@ def _sampling_strategy_minority(y, sampling_type):
     return sampling_strategy
 
 
-def _sampling_strategy_auto(y, sampling_type):
+def _sampling_strategy_auto(classes_counts, sampling_type):
     """Returns sampling target auto for over-sampling and not-minority for
     under-sampling."""
     if sampling_type == "over-sampling":
-        return _sampling_strategy_not_majority(y, sampling_type)
+        return _sampling_strategy_not_majority(classes_counts, sampling_type)
     elif (
         sampling_type == "under-sampling" or sampling_type == "clean-sampling"
     ):
-        return _sampling_strategy_not_minority(y, sampling_type)
+        return _sampling_strategy_not_minority(classes_counts, sampling_type)
 
 
-def _sampling_strategy_dict(sampling_strategy, y, sampling_type):
+def _sampling_strategy_dict(sampling_strategy, classes_counts, sampling_type):
     """Returns sampling target by converting the dictionary depending of the
     sampling."""
-    target_stats = _count_class_sample(y)
     # check that all keys in sampling_strategy are also in y
     set_diff_sampling_strategy_target = set(sampling_strategy.keys()) - set(
-        target_stats.keys()
+        classes_counts.keys()
     )
     if len(set_diff_sampling_strategy_target) > 0:
         raise ValueError(
@@ -341,17 +342,17 @@ def _sampling_strategy_dict(sampling_strategy, y, sampling_type):
         )
     sampling_strategy_ = {}
     if sampling_type == "over-sampling":
-        n_samples_majority = max(target_stats.values())
-        class_majority = max(target_stats, key=target_stats.get)
+        n_samples_majority = max(classes_counts.values())
+        class_majority = max(classes_counts, key=classes_counts.get)
         for class_sample, n_samples in sampling_strategy.items():
-            if n_samples < target_stats[class_sample]:
+            if n_samples < classes_counts[class_sample]:
                 raise ValueError(
                     "With over-sampling methods, the number"
                     " of samples in a class should be greater"
                     " or equal to the original number of samples."
                     " Originally, there is {} samples and {}"
                     " samples are asked.".format(
-                        target_stats[class_sample], n_samples
+                        classes_counts[class_sample], n_samples
                     )
                 )
             if n_samples > n_samples_majority:
@@ -367,18 +368,18 @@ def _sampling_strategy_dict(sampling_strategy, y, sampling_type):
                     )
                 )
             sampling_strategy_[class_sample] = (
-                n_samples - target_stats[class_sample]
+                n_samples - classes_counts[class_sample]
             )
     elif sampling_type == "under-sampling":
         for class_sample, n_samples in sampling_strategy.items():
-            if n_samples > target_stats[class_sample]:
+            if n_samples > classes_counts[class_sample]:
                 raise ValueError(
                     "With under-sampling methods, the number of"
                     " samples in a class should be less or equal"
                     " to the original number of samples."
                     " Originally, there is {} samples and {}"
                     " samples are asked.".format(
-                        target_stats[class_sample], n_samples
+                        classes_counts[class_sample], n_samples
                     )
                 )
             sampling_strategy_[class_sample] = n_samples
@@ -394,19 +395,18 @@ def _sampling_strategy_dict(sampling_strategy, y, sampling_type):
     return sampling_strategy_
 
 
-def _sampling_strategy_list(sampling_strategy, y, sampling_type):
+def _sampling_strategy_list(sampling_strategy, classes_counts, sampling_type):
     """With cleaning methods, sampling_strategy can be a list to target the
- class of interest."""
+    class of interest."""
     if sampling_type != "clean-sampling":
         raise ValueError(
             "'sampling_strategy' cannot be a list for samplers "
             "which are not cleaning methods."
         )
 
-    target_stats = _count_class_sample(y)
     # check that all keys in sampling_strategy are also in y
     set_diff_sampling_strategy_target = set(sampling_strategy) - set(
-        target_stats.keys()
+        classes_counts.keys()
     )
     if len(set_diff_sampling_strategy_target) > 0:
         raise ValueError(
@@ -415,27 +415,26 @@ class of interest."""
         )
 
     return {
-        class_sample: min(target_stats.values())
+        class_sample: min(classes_counts.values())
         for class_sample in sampling_strategy
     }
 
 
-def _sampling_strategy_float(sampling_strategy, y, sampling_type):
+def _sampling_strategy_float(sampling_strategy, classes_counts, sampling_type):
     """Take a proportion of the majority (over-sampling) or minority
     (under-sampling) class in binary classification."""
-    type_y = type_of_target(y)
-    if type_y != "binary":
+
+    if len(classes_counts) != 2:
         raise ValueError(
             '"sampling_strategy" can be a float only when the type '
             "of target is binary. For multi-class, use a dict."
         )
-    target_stats = _count_class_sample(y)
     if sampling_type == "over-sampling":
-        n_sample_majority = max(target_stats.values())
-        class_majority = max(target_stats, key=target_stats.get)
+        n_sample_majority = max(classes_counts.values())
+        class_majority = max(classes_counts, key=classes_counts.get)
         sampling_strategy_ = {
             key: int(n_sample_majority * sampling_strategy - value)
-            for (key, value) in target_stats.items()
+            for (key, value) in classes_counts.items()
             if key != class_majority
         }
         if any([n_samples <= 0 for n_samples in sampling_strategy_.values()]):
@@ -446,16 +445,16 @@ def _sampling_strategy_float(sampling_strategy, y, sampling_type):
                 "ratio."
             )
     elif sampling_type == "under-sampling":
-        n_sample_minority = min(target_stats.values())
-        class_minority = min(target_stats, key=target_stats.get)
+        n_sample_minority = min(classes_counts.values())
+        class_minority = min(classes_counts, key=classes_counts.get)
         sampling_strategy_ = {
             key: int(n_sample_minority / sampling_strategy)
-            for (key, value) in target_stats.items()
+            for (key, value) in classes_counts.items()
             if key != class_minority
         }
         if any(
             [
-                n_samples > target_stats[target]
+                n_samples > classes_counts[target]
                 for target, n_samples in sampling_strategy_.items()
             ]
         ):
@@ -472,7 +471,9 @@ def _sampling_strategy_float(sampling_strategy, y, sampling_type):
     return sampling_strategy_
 
 
-def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs):
+def check_sampling_strategy(
+    sampling_strategy, classes_counts, sampling_type, **kwargs
+):
     """Sampling target validation for samplers.
 
     Checks that ``sampling_strategy`` is of consistent type and return a
@@ -567,10 +568,10 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs):
             " instead.".format(SAMPLING_KIND, sampling_type)
         )
 
-    if np.unique(y).size <= 1:
+    if len(classes_counts) <= 1:
         raise ValueError(
             "The target 'y' needs to have more than 1 class."
-            " Got {} class instead".format(np.unique(y).size)
+            " Got {} class instead".format(len(classes_counts))
         )
 
     if sampling_type in ("ensemble", "bypass"):
@@ -587,7 +588,7 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs):
         return OrderedDict(
             sorted(
                 SAMPLING_TARGET_KIND[sampling_strategy](
-                    y, sampling_type
+                    classes_counts, sampling_type
                 ).items()
             )
         )
@@ -595,7 +596,7 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs):
         return OrderedDict(
             sorted(
                 _sampling_strategy_dict(
-                    sampling_strategy, y, sampling_type
+                    sampling_strategy, classes_counts, sampling_type
                 ).items()
             )
         )
@@ -603,7 +604,7 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs):
         return OrderedDict(
             sorted(
                 _sampling_strategy_list(
-                    sampling_strategy, y, sampling_type
+                    sampling_strategy, classes_counts, sampling_type
                 ).items()
             )
         )
@@ -618,16 +619,16 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs):
         return OrderedDict(
             sorted(
                 _sampling_strategy_float(
-                    sampling_strategy, y, sampling_type
+                    sampling_strategy, classes_counts, sampling_type
                 ).items()
             )
         )
     elif callable(sampling_strategy):
-        sampling_strategy_ = sampling_strategy(y, **kwargs)
+        sampling_strategy_ = sampling_strategy(classes_counts, **kwargs)
         return OrderedDict(
             sorted(
                 _sampling_strategy_dict(
-                    sampling_strategy_, y, sampling_type
+                    sampling_strategy_, classes_counts, sampling_type
                 ).items()
             )
         )
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index b4dcbc904..6afb0f58a 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -297,6 +297,7 @@ def check_samplers_pandas(name, sampler):
 
 def check_samplers_dask_array(name, sampler):
     dask = pytest.importorskip("dask")
+    from dask import array
     # Check that the samplers handle dask array
     X, y = make_classification(
         n_samples=1000,
@@ -305,15 +306,15 @@ def check_samplers_dask_array(name, sampler):
         weights=[0.2, 0.3, 0.5],
         random_state=0,
     )
-    X_dask = dask.array.from_array(X, chunks=100)
-    y_dask = dask.array.from_array(y, chunks=100)
+    X_dask = array.from_array(X, chunks=100)
+    y_dask = array.from_array(y, chunks=100)
 
     X_res_dask, y_res_dask = sampler.fit_resample(X_dask, y_dask)
     X_res, y_res = sampler.fit_resample(X, y)
 
     # check that we return the same type for dataframes or series types
-    assert isinstance(X_res_dask, dask.array.Array)
-    assert isinstance(y_res_dask, dask.array.Array)
+    assert isinstance(X_res_dask, array.Array)
+    assert isinstance(y_res_dask, array.Array)
 
     assert_allclose(X_res_dask, X_res)
     assert_allclose(y_res_dask, y_res)
@@ -321,6 +322,7 @@ def check_samplers_dask_array(name, sampler):
 
 def check_samplers_dask_dataframe(name, sampler):
     dask = pytest.importorskip("dask")
+    from dask import dataframe
     # Check that the samplers handle dask dataframe and dask series
     X, y = make_classification(
         n_samples=1000,
@@ -329,18 +331,18 @@ def check_samplers_dask_dataframe(name, sampler):
         weights=[0.2, 0.3, 0.5],
         random_state=0,
     )
-    X_df = dask.dataframe.from_array(
+    X_df = dataframe.from_array(
         X, columns=[str(i) for i in range(X.shape[1])]
     )
-    y_s = dask.dataframe.from_array(y)
+    y_s = dataframe.from_array(y)
     y_s = y_s.rename("target")
 
     X_res_df, y_res_s = sampler.fit_resample(X_df, y_s)
     X_res, y_res = sampler.fit_resample(X, y)
 
     # check that we return the same type for dataframes or series types
-    assert isinstance(X_res_df, dask.dataframe.DataFrame)
-    assert isinstance(y_res_s, dask.dataframe.Series)
+    assert isinstance(X_res_df, dataframe.DataFrame)
+    assert isinstance(y_res_s, dataframe.Series)
 
     assert X_df.columns.to_list() == X_res_df.columns.to_list()
     assert y_s.name == y_res_s.name
diff --git a/imblearn/utils/wrapper.py b/imblearn/utils/wrapper.py
index 7dbfa3b1e..d0559d0af 100644
--- a/imblearn/utils/wrapper.py
+++ b/imblearn/utils/wrapper.py
@@ -5,11 +5,11 @@
 from sklearn.utils.multiclass import type_of_target as sklearn_type_of_target
 from sklearn.utils.validation import column_or_1d as sklearn_column_or_1d
 
-from ..dask._support import is_dask_container
+from ..dask._support import is_dask_collection
 
 
 def type_of_target(y):
-    if is_dask_container(y):
+    if is_dask_collection(y):
         from ..dask.utils import type_of_target as dask_type_of_target
 
         return dask_type_of_target(y)
@@ -17,7 +17,7 @@ def type_of_target(y):
 
 
 def _is_multiclass_encoded(y):
-    if is_dask_container(y):
+    if is_dask_collection(y):
         from dask import array
 
         return array.all(y.sum(axis=1) == 1).compute()
@@ -25,7 +25,7 @@ def _is_multiclass_encoded(y):
 
 
 def column_or_1d(y, *, warn=False):
-    if is_dask_container(y):
+    if is_dask_collection(y):
         from ..dask.utils import column_or_1d as dask_column_or_1d
 
         return dask_column_or_1d(y, warn=warn)
@@ -33,7 +33,7 @@ def column_or_1d(y, *, warn=False):
 
 
 def unique(arr, **kwargs):
-    if is_dask_container(arr):
+    if is_dask_collection(arr):
         if hasattr(arr, "unique"):
             output = np.asarray(arr.unique(**kwargs))
         else:
@@ -43,7 +43,7 @@ def unique(arr, **kwargs):
 
 
 def check_classification_targets(y):
-    if is_dask_container(y):
+    if is_dask_collection(y):
         from ..dask.utils import check_classification_targets as \
             dask_check_classification_targets
 

From 58acdf21c1c687fc3b9646e6dd7201d82d299562 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sat, 7 Nov 2020 23:44:13 +0100
Subject: [PATCH 21/32] iter

---
 imblearn/base.py                    |  1 +
 imblearn/datasets/_imbalance.py     |  9 ++++++---
 imblearn/ensemble/_bagging.py       |  8 ++++++--
 imblearn/ensemble/_easy_ensemble.py | 10 ++++++++--
 imblearn/ensemble/_forest.py        | 10 +++++++---
 5 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/imblearn/base.py b/imblearn/base.py
index 38f4259a4..9f3ea1303 100644
--- a/imblearn/base.py
+++ b/imblearn/base.py
@@ -53,6 +53,7 @@ def fit(self, X, y):
         if (not dask_collection or
                 (dask_collection and self.validate_if_dask_collection)):
             X, y, _ = self._check_X_y(X, y)
+
         self._classes_counts = get_classes_counts(y)
         self.sampling_strategy_ = check_sampling_strategy(
             self.sampling_strategy, self._classes_counts, self._sampling_type
diff --git a/imblearn/datasets/_imbalance.py b/imblearn/datasets/_imbalance.py
index b35d00ed2..77a2f64d3 100644
--- a/imblearn/datasets/_imbalance.py
+++ b/imblearn/datasets/_imbalance.py
@@ -9,7 +9,10 @@
 
 from ..under_sampling import RandomUnderSampler
 from ..utils import check_sampling_strategy
-from ..utils._validation import _deprecate_positional_args
+from ..utils._validation import (
+    _deprecate_positional_args,
+    get_classes_counts,
+)
 
 
 @_deprecate_positional_args
@@ -87,11 +90,11 @@ def make_imbalance(
     >>> print('Distribution after imbalancing: {}'.format(Counter(y_res)))
     Distribution after imbalancing: Counter({2: 30, 1: 20, 0: 10})
     """
-    target_stats = Counter(y)
+    target_stats = get_classes_counts(y)
     # restrict ratio to be a dict or a callable
     if isinstance(sampling_strategy, dict) or callable(sampling_strategy):
         sampling_strategy_ = check_sampling_strategy(
-            sampling_strategy, y, "under-sampling", **kwargs
+            sampling_strategy, target_stats, "under-sampling", **kwargs
         )
     else:
         raise ValueError(
diff --git a/imblearn/ensemble/_bagging.py b/imblearn/ensemble/_bagging.py
index d7c509194..c7107661e 100644
--- a/imblearn/ensemble/_bagging.py
+++ b/imblearn/ensemble/_bagging.py
@@ -18,7 +18,10 @@
 from ..utils import Substitution, check_target_type, check_sampling_strategy
 from ..utils._docstring import _n_jobs_docstring
 from ..utils._docstring import _random_state_docstring
-from ..utils._validation import _deprecate_positional_args
+from ..utils._validation import (
+    _deprecate_positional_args,
+    get_classes_counts,
+)
 
 
 @Substitution(
@@ -216,11 +219,12 @@ def __init__(
 
     def _validate_y(self, y):
         y_encoded = super()._validate_y(y)
+        classes_counts = get_classes_counts(y)
         if isinstance(self.sampling_strategy, dict):
             self._sampling_strategy = {
                 np.where(self.classes_ == key)[0][0]: value
                 for key, value in check_sampling_strategy(
-                    self.sampling_strategy, y, 'under-sampling',
+                    self.sampling_strategy, classes_counts, 'under-sampling',
                 ).items()
             }
         else:
diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py
index f140120aa..4db266134 100644
--- a/imblearn/ensemble/_easy_ensemble.py
+++ b/imblearn/ensemble/_easy_ensemble.py
@@ -17,7 +17,10 @@
 from ..utils import Substitution, check_target_type, check_sampling_strategy
 from ..utils._docstring import _n_jobs_docstring
 from ..utils._docstring import _random_state_docstring
-from ..utils._validation import _deprecate_positional_args
+from ..utils._validation import (
+    _deprecate_positional_args,
+    get_classes_counts,
+)
 from ..pipeline import Pipeline
 
 MAX_INT = np.iinfo(np.int32).max
@@ -156,11 +159,14 @@ def __init__(
 
     def _validate_y(self, y):
         y_encoded = super()._validate_y(y)
+        classes_counts = get_classes_counts(y)
         if isinstance(self.sampling_strategy, dict):
             self._sampling_strategy = {
                 np.where(self.classes_ == key)[0][0]: value
                 for key, value in check_sampling_strategy(
-                    self.sampling_strategy, y, 'under-sampling',
+                    self.sampling_strategy,
+                    classes_counts,
+                    "under-sampling",
                 ).items()
             }
         else:
diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py
index 42ae9b255..5832628c8 100644
--- a/imblearn/ensemble/_forest.py
+++ b/imblearn/ensemble/_forest.py
@@ -33,8 +33,11 @@
 from ..utils import Substitution
 from ..utils._docstring import _n_jobs_docstring
 from ..utils._docstring import _random_state_docstring
-from ..utils._validation import check_sampling_strategy
-from ..utils._validation import _deprecate_positional_args
+from ..utils._validation import (
+    check_sampling_strategy,
+    _deprecate_positional_args,
+    get_classes_counts,
+)
 
 MAX_INT = np.iinfo(np.int32).max
 
@@ -457,10 +460,11 @@ def fit(self, X, y, sample_weight=None):
             y_encoded = np.ascontiguousarray(y_encoded, dtype=DOUBLE)
 
         if isinstance(self.sampling_strategy, dict):
+            classes_counts = get_classes_counts(y)
             self._sampling_strategy = {
                 np.where(self.classes_[0] == key)[0][0]: value
                 for key, value in check_sampling_strategy(
-                    self.sampling_strategy, y, 'under-sampling',
+                    self.sampling_strategy, classes_counts, 'under-sampling',
                 ).items()
             }
         else:

From e54c772a5b8d10fdb5e4305fc5a568471f026d79 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sun, 8 Nov 2020 11:38:13 +0100
Subject: [PATCH 22/32] PEP8

---
 imblearn/utils/_docstring.py       | 2 +-
 imblearn/utils/_validation.py      | 1 -
 imblearn/utils/estimator_checks.py | 4 ++--
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/imblearn/utils/_docstring.py b/imblearn/utils/_docstring.py
index 87907d73e..be94b1aac 100644
--- a/imblearn/utils/_docstring.py
+++ b/imblearn/utils/_docstring.py
@@ -47,4 +47,4 @@ def __call__(self, obj):
         Whether or not `X` and `y` should be validated. This parameter applies
         only when `X` and `y` are Dask collections where validation might be
         potentially costly.
-    """.rstrip()
\ No newline at end of file
+    """.rstrip()
diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py
index 6b3936836..9fc81dd03 100644
--- a/imblearn/utils/_validation.py
+++ b/imblearn/utils/_validation.py
@@ -20,7 +20,6 @@
 from .wrapper import _is_multiclass_encoded
 from .wrapper import column_or_1d
 from .wrapper import type_of_target
-from .wrapper import unique
 
 SAMPLING_KIND = (
     "over-sampling",
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index 6afb0f58a..008a011ca 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -296,7 +296,7 @@ def check_samplers_pandas(name, sampler):
 
 
 def check_samplers_dask_array(name, sampler):
-    dask = pytest.importorskip("dask")
+    pytest.importorskip("dask")
     from dask import array
     # Check that the samplers handle dask array
     X, y = make_classification(
@@ -321,7 +321,7 @@ def check_samplers_dask_array(name, sampler):
 
 
 def check_samplers_dask_dataframe(name, sampler):
-    dask = pytest.importorskip("dask")
+    pytest.importorskip("dask")
     from dask import dataframe
     # Check that the samplers handle dask dataframe and dask series
     X, y = make_classification(

From f2a572f696aa076aaae99facf09c90ff76f8ba6d Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sun, 8 Nov 2020 12:17:55 +0100
Subject: [PATCH 23/32] iter

---
 imblearn/utils/_validation.py | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py
index 9fc81dd03..03dce4ab8 100644
--- a/imblearn/utils/_validation.py
+++ b/imblearn/utils/_validation.py
@@ -134,6 +134,19 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0):
 
 
 def get_classes_counts(y):
+    """Compute the counts of each class present in `y`.
+
+    Parameters
+    ----------
+    y : ndarray of shape (n_samples,)
+        The target array.
+
+    Returns
+    -------
+    classes_counts : dict
+        A dictionary where the keys are the class labels and the values are the
+        counts for each class.
+    """
     unique, counts = np.unique(y, return_counts=True)
     if is_dask_collection(unique):
         from dask import compute
@@ -542,8 +555,14 @@ def check_sampling_strategy(
           correspond to the targeted classes. The values correspond to the
           desired number of samples for each class.
 
-    y : ndarray of shape (n_samples,)
-        The target array.
+    classes_counts : dict or ndarray of shape (n_samples,)
+        A dictionary where the keys are the class present in `y` and the values
+        are the counts. The function :func:`~imblearn.utils.get_classes_count`
+        provides such a dictionary, giving `y` as an input.
+
+        .. deprecated:: 0.7
+           Passing the array `y` is deprecated from 0.7 and will be removed
+           in 0.9.
 
     sampling_type : {{'over-sampling', 'under-sampling', 'clean-sampling'}}
         The type of sampling. Can be either ``'over-sampling'``,
@@ -567,6 +586,15 @@ def check_sampling_strategy(
             " instead.".format(SAMPLING_KIND, sampling_type)
         )
 
+    if hasattr(y, "__array__"):
+        warnings.warn(
+            f"Passing that array of target `y` is deprecated in 0.7 and will "
+            f"raise an error from 0.9. Instead, pass `y` to "
+            "imblearn.utils.get_classes_counts function to get the "
+            "dictionary.", FutureWarning
+        )
+        classes_counts = get_classes_counts(classes_counts)
+
     if len(classes_counts) <= 1:
         raise ValueError(
             "The target 'y' needs to have more than 1 class."

From 36a0aa36cf2d64308b13f13cb31b3200f57a1ac7 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sun, 8 Nov 2020 12:20:34 +0100
Subject: [PATCH 24/32] iter

---
 imblearn/utils/_validation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py
index 03dce4ab8..c5b87e6ff 100644
--- a/imblearn/utils/_validation.py
+++ b/imblearn/utils/_validation.py
@@ -586,7 +586,7 @@ def check_sampling_strategy(
             " instead.".format(SAMPLING_KIND, sampling_type)
         )
 
-    if hasattr(y, "__array__"):
+    if hasattr(classes_counts, "__array__"):
         warnings.warn(
             f"Passing that array of target `y` is deprecated in 0.7 and will "
             f"raise an error from 0.9. Instead, pass `y` to "

From c7bdc74d45d10d342f04f8f84d7f308fd932ae17 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sun, 8 Nov 2020 12:38:10 +0100
Subject: [PATCH 25/32] check raise FutureWarning

---
 doc/api.rst                             |   2 +-
 imblearn/utils/__init__.py              |   2 +
 imblearn/utils/tests/test_validation.py | 169 ++++++++++++++++++------
 3 files changed, 129 insertions(+), 44 deletions(-)

diff --git a/doc/api.rst b/doc/api.rst
index 07ac6413c..65bfd1b06 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -248,6 +248,6 @@ Imbalance-learn provides some fast-prototyping tools.
    :toctree: generated/
    :template: function.rst
 
-   utils.estimator_checks.parametrize_with_checks
    utils.check_neighbors_object
    utils.check_sampling_strategy
+   utils.get_classes_counts
diff --git a/imblearn/utils/__init__.py b/imblearn/utils/__init__.py
index 4e74d2ee3..130d9f0c9 100644
--- a/imblearn/utils/__init__.py
+++ b/imblearn/utils/__init__.py
@@ -7,10 +7,12 @@
 from ._validation import check_neighbors_object
 from ._validation import check_target_type
 from ._validation import check_sampling_strategy
+from ._validation import get_classes_counts
 
 __all__ = [
     "check_neighbors_object",
     "check_sampling_strategy",
     "check_target_type",
+    "get_classes_counts",
     "Substitution",
 ]
diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py
index e4f9c01c8..b0ff57c83 100644
--- a/imblearn/utils/tests/test_validation.py
+++ b/imblearn/utils/tests/test_validation.py
@@ -17,11 +17,14 @@
 from imblearn.utils import check_neighbors_object
 from imblearn.utils import check_sampling_strategy
 from imblearn.utils import check_target_type
+from imblearn.utils import get_classes_counts
 from imblearn.utils._validation import ArraysTransformer
 from imblearn.utils._validation import _deprecate_positional_args
 
 multiclass_target = np.array([1] * 50 + [2] * 100 + [3] * 25)
+multiclass_classes_counts = get_classes_counts(multiclass_target)
 binary_target = np.array([1] * 25 + [0] * 100)
+binary_classes_counts = get_classes_counts(binary_target)
 
 
 def test_check_neighbors_object():
@@ -70,11 +73,11 @@ def test_check_target_type_ova(target, output_target, is_ova):
     assert binarize_target == is_ova
 
 
-def test_check_sampling_strategy_warning():
+def test_check_sampling_strategy_error_dict_cleaning_methods():
     msg = "dict for cleaning methods is not supported"
     with pytest.raises(ValueError, match=msg):
         check_sampling_strategy(
-            {1: 0, 2: 0, 3: 0}, multiclass_target, "clean-sampling"
+            {1: 0, 2: 0, 3: 0}, multiclass_classes_counts, "clean-sampling"
         )
 
 
@@ -83,19 +86,19 @@ def test_check_sampling_strategy_warning():
     [
         (
             0.5,
-            binary_target,
+            binary_classes_counts,
             "clean-sampling",
             "'clean-sampling' methods do let the user specify the sampling ratio",  # noqa
         ),
         (
             0.1,
-            np.array([0] * 10 + [1] * 20),
+            get_classes_counts(np.array([0] * 10 + [1] * 20)),
             "over-sampling",
             "remove samples from the minority class while trying to generate new",  # noqa
         ),
         (
             0.1,
-            np.array([0] * 10 + [1] * 20),
+            get_classes_counts(np.array([0] * 10 + [1] * 20)),
             "under-sampling",
             "generate new sample in the majority class while trying to remove",
         ),
@@ -108,15 +111,21 @@ def test_check_sampling_strategy_float_error(ratio, y, type, err_msg):
 
 def test_check_sampling_strategy_error():
     with pytest.raises(ValueError, match="'sampling_type' should be one of"):
-        check_sampling_strategy("auto", np.array([1, 2, 3]), "rnd")
+        check_sampling_strategy(
+            "auto", get_classes_counts(np.array([1, 2, 3])), "rnd"
+        )
 
     error_regex = "The target 'y' needs to have more than 1 class."
     with pytest.raises(ValueError, match=error_regex):
-        check_sampling_strategy("auto", np.ones((10,)), "over-sampling")
+        check_sampling_strategy(
+            "auto", get_classes_counts(np.ones((10,))), "over-sampling"
+        )
 
     error_regex = "When 'sampling_strategy' is a string, it needs to be one of"
     with pytest.raises(ValueError, match=error_regex):
-        check_sampling_strategy("rnd", np.array([1, 2, 3]), "over-sampling")
+        check_sampling_strategy(
+            "rnd", get_classes_counts(np.array([1, 2, 3])), "over-sampling"
+        )
 
 
 @pytest.mark.parametrize(
@@ -136,7 +145,9 @@ def test_check_sampling_strategy_error_wrong_string(
         ),
     ):
         check_sampling_strategy(
-            sampling_strategy, np.array([1, 2, 3]), sampling_type
+            sampling_strategy,
+            get_classes_counts(np.array([1, 2, 3])),
+            sampling_type,
         )
 
 
@@ -153,14 +164,18 @@ def test_sampling_strategy_class_target_unknown(
 ):
     y = np.array([1] * 50 + [2] * 100 + [3] * 25)
     with pytest.raises(ValueError, match="are not present in the data."):
-        check_sampling_strategy(sampling_strategy, y, sampling_method)
+        check_sampling_strategy(
+            sampling_strategy, get_classes_counts(y), sampling_method
+        )
 
 
 def test_sampling_strategy_dict_error():
     y = np.array([1] * 50 + [2] * 100 + [3] * 25)
     sampling_strategy = {1: -100, 2: 50, 3: 25}
     with pytest.raises(ValueError, match="in a class cannot be negative."):
-        check_sampling_strategy(sampling_strategy, y, "under-sampling")
+        check_sampling_strategy(
+            sampling_strategy, get_classes_counts(y), "under-sampling"
+        )
     sampling_strategy = {1: 45, 2: 100, 3: 70}
     error_regex = (
         "With over-sampling methods, the number of samples in a"
@@ -169,7 +184,9 @@ def test_sampling_strategy_dict_error():
         " samples are asked."
     )
     with pytest.raises(ValueError, match=error_regex):
-        check_sampling_strategy(sampling_strategy, y, "over-sampling")
+        check_sampling_strategy(
+            sampling_strategy, get_classes_counts(y), "over-sampling"
+        )
 
     error_regex = (
         "With under-sampling methods, the number of samples in a"
@@ -178,21 +195,27 @@ def test_sampling_strategy_dict_error():
         " are asked."
     )
     with pytest.raises(ValueError, match=error_regex):
-        check_sampling_strategy(sampling_strategy, y, "under-sampling")
+        check_sampling_strategy(
+            sampling_strategy, get_classes_counts(y), "under-sampling"
+        )
 
 
 @pytest.mark.parametrize("sampling_strategy", [-10, 10])
 def test_sampling_strategy_float_error_not_in_range(sampling_strategy):
     y = np.array([1] * 50 + [2] * 100)
     with pytest.raises(ValueError, match="it should be in the range"):
-        check_sampling_strategy(sampling_strategy, y, "under-sampling")
+        check_sampling_strategy(
+            sampling_strategy, get_classes_counts(y), "under-sampling"
+        )
 
 
 def test_sampling_strategy_float_error_not_binary():
     y = np.array([1] * 50 + [2] * 100 + [3] * 25)
     with pytest.raises(ValueError, match="the type of target is binary"):
         sampling_strategy = 0.5
-        check_sampling_strategy(sampling_strategy, y, "under-sampling")
+        check_sampling_strategy(
+            sampling_strategy, get_classes_counts(y), "under-sampling"
+        )
 
 
 @pytest.mark.parametrize(
@@ -202,7 +225,9 @@ def test_sampling_strategy_list_error_not_clean_sampling(sampling_method):
     y = np.array([1] * 50 + [2] * 100 + [3] * 25)
     with pytest.raises(ValueError, match="cannot be a list for samplers"):
         sampling_strategy = [1, 2, 3]
-        check_sampling_strategy(sampling_strategy, y, sampling_method)
+        check_sampling_strategy(
+            sampling_strategy, get_classes_counts(y), sampling_method
+        )
 
 
 def _sampling_strategy_func(y):
@@ -215,42 +240,87 @@ def _sampling_strategy_func(y):
 @pytest.mark.parametrize(
     "sampling_strategy, sampling_type, expected_sampling_strategy, target",
     [
-        ("auto", "under-sampling", {1: 25, 2: 25}, multiclass_target),
-        ("auto", "clean-sampling", {1: 25, 2: 25}, multiclass_target),
-        ("auto", "over-sampling", {1: 50, 3: 75}, multiclass_target),
-        ("all", "over-sampling", {1: 50, 2: 0, 3: 75}, multiclass_target),
-        ("all", "under-sampling", {1: 25, 2: 25, 3: 25}, multiclass_target),
-        ("all", "clean-sampling", {1: 25, 2: 25, 3: 25}, multiclass_target),
-        ("majority", "under-sampling", {2: 25}, multiclass_target),
-        ("majority", "clean-sampling", {2: 25}, multiclass_target),
-        ("minority", "over-sampling", {3: 75}, multiclass_target),
-        ("not minority", "over-sampling", {1: 50, 2: 0}, multiclass_target),
-        ("not minority", "under-sampling", {1: 25, 2: 25}, multiclass_target),
-        ("not minority", "clean-sampling", {1: 25, 2: 25}, multiclass_target),
-        ("not majority", "over-sampling", {1: 50, 3: 75}, multiclass_target),
-        ("not majority", "under-sampling", {1: 25, 3: 25}, multiclass_target),
-        ("not majority", "clean-sampling", {1: 25, 3: 25}, multiclass_target),
+        ("auto", "under-sampling", {1: 25, 2: 25}, multiclass_classes_counts),
+        ("auto", "clean-sampling", {1: 25, 2: 25}, multiclass_classes_counts),
+        ("auto", "over-sampling", {1: 50, 3: 75}, multiclass_classes_counts),
+        (
+            "all",
+            "over-sampling",
+            {1: 50, 2: 0, 3: 75},
+            multiclass_classes_counts,
+        ),
+        (
+            "all",
+            "under-sampling",
+            {1: 25, 2: 25, 3: 25},
+            multiclass_classes_counts,
+        ),
+        (
+            "all",
+            "clean-sampling",
+            {1: 25, 2: 25, 3: 25},
+            multiclass_classes_counts,
+        ),
+        ("majority", "under-sampling", {2: 25}, multiclass_classes_counts),
+        ("majority", "clean-sampling", {2: 25}, multiclass_classes_counts),
+        ("minority", "over-sampling", {3: 75}, multiclass_classes_counts),
+        (
+            "not minority",
+            "over-sampling",
+            {1: 50, 2: 0},
+            multiclass_classes_counts,
+        ),
+        (
+            "not minority",
+            "under-sampling",
+            {1: 25, 2: 25},
+            multiclass_classes_counts,
+        ),
+        (
+            "not minority",
+            "clean-sampling",
+            {1: 25, 2: 25},
+            multiclass_classes_counts,
+        ),
+        (
+            "not majority",
+            "over-sampling",
+            {1: 50, 3: 75},
+            multiclass_classes_counts,
+        ),
+        (
+            "not majority",
+            "under-sampling",
+            {1: 25, 3: 25},
+            multiclass_classes_counts,
+        ),
+        (
+            "not majority",
+            "clean-sampling",
+            {1: 25, 3: 25},
+            multiclass_classes_counts,
+        ),
         (
             {1: 70, 2: 100, 3: 70},
             "over-sampling",
             {1: 20, 2: 0, 3: 45},
-            multiclass_target,
+            multiclass_classes_counts,
         ),
         (
             {1: 30, 2: 45, 3: 25},
             "under-sampling",
             {1: 30, 2: 45, 3: 25},
-            multiclass_target,
+            multiclass_classes_counts,
         ),
-        ([1], "clean-sampling", {1: 25}, multiclass_target),
+        ([1], "clean-sampling", {1: 25}, multiclass_classes_counts),
         (
             _sampling_strategy_func,
             "over-sampling",
             {1: 50, 2: 0, 3: 75},
-            multiclass_target,
+            multiclass_classes_counts,
         ),
-        (0.5, "over-sampling", {1: 25}, binary_target),
-        (0.5, "under-sampling", {0: 50}, binary_target),
+        (0.5, "over-sampling", {1: 25}, binary_classes_counts),
+        (0.5, "under-sampling", {0: 50}, binary_classes_counts),
     ],
 )
 def test_check_sampling_strategy(
@@ -271,23 +341,27 @@ def test_sampling_strategy_dict_over_sampling():
         r" the majority class \(class #2 -> 100\)"
     )
     with warns(UserWarning, expected_msg):
-        check_sampling_strategy(sampling_strategy, y, "over-sampling")
+        check_sampling_strategy(
+            sampling_strategy, get_classes_counts(y), "over-sampling"
+        )
 
 
 def test_sampling_strategy_callable_args():
     y = np.array([1] * 50 + [2] * 100 + [3] * 25)
     multiplier = {1: 1.5, 2: 1, 3: 3}
 
-    def sampling_strategy_func(y, multiplier):
+    def sampling_strategy_func(classes_counts, multiplier):
         """samples such that each class will be affected by the multiplier."""
-        target_stats = Counter(y)
         return {
             key: int(values * multiplier[key])
-            for key, values in target_stats.items()
+            for key, values in classes_counts.items()
         }
 
     sampling_strategy_ = check_sampling_strategy(
-        sampling_strategy_func, y, "over-sampling", multiplier=multiplier
+        sampling_strategy_func,
+        get_classes_counts(y),
+        "over-sampling",
+        multiplier=multiplier,
     )
     assert sampling_strategy_ == {1: 25, 2: 0, 3: 50}
 
@@ -314,11 +388,20 @@ def test_sampling_strategy_check_order(
     # dictionary is sorted. Refer to issue #428.
     y = np.array([1] * 50 + [2] * 100 + [3] * 25)
     sampling_strategy_ = check_sampling_strategy(
-        sampling_strategy, y, sampling_type
+        sampling_strategy, get_classes_counts(y), sampling_type
     )
     assert sampling_strategy_ == expected_result
 
 
+# FIXME: remove in 0.9
+def test_sampling_strategy_deprecation_array_target():
+    # Check that we raise a FutureWarning when an array of target is passed
+    with pytest.warns(FutureWarning):
+        sampling_strategy = "auto"
+        check_sampling_strategy(
+            sampling_strategy, binary_target, "under-sampling",
+        )
+
 def test_arrays_transformer_plain_list():
     X = np.array([[0, 0], [1, 1]])
     y = np.array([[0, 0], [1, 1]])

From f09522154067b497d02c02ad4889e02244d9f3d7 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sun, 8 Nov 2020 12:54:02 +0100
Subject: [PATCH 26/32] iter

---
 imblearn/utils/_validation.py           | 4 ++--
 imblearn/utils/tests/test_validation.py | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py
index c5b87e6ff..8538d7718 100644
--- a/imblearn/utils/_validation.py
+++ b/imblearn/utils/_validation.py
@@ -588,8 +588,8 @@ def check_sampling_strategy(
 
     if hasattr(classes_counts, "__array__"):
         warnings.warn(
-            f"Passing that array of target `y` is deprecated in 0.7 and will "
-            f"raise an error from 0.9. Instead, pass `y` to "
+            "Passing an array of target `y` is deprecated in 0.7 and will "
+            "raise an error from 0.9. Instead, pass `y` to "
             "imblearn.utils.get_classes_counts function to get the "
             "dictionary.", FutureWarning
         )
diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py
index b0ff57c83..b5f06e5b6 100644
--- a/imblearn/utils/tests/test_validation.py
+++ b/imblearn/utils/tests/test_validation.py
@@ -402,6 +402,7 @@ def test_sampling_strategy_deprecation_array_target():
             sampling_strategy, binary_target, "under-sampling",
         )
 
+
 def test_arrays_transformer_plain_list():
     X = np.array([[0, 0], [1, 1]])
     y = np.array([[0, 0], [1, 1]])

From 20b44c65c7a2cb6e9c47487d327ad7dd8b675c74 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sun, 8 Nov 2020 13:07:45 +0100
Subject: [PATCH 27/32] iter

---
 imblearn/base.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/imblearn/base.py b/imblearn/base.py
index 9f3ea1303..e3cc26a0f 100644
--- a/imblearn/base.py
+++ b/imblearn/base.py
@@ -273,8 +273,9 @@ def fit_resample(self, X, y):
                 X, y, accept_sparse=self.accept_sparse
             )
 
+        self._classes_counts = get_classes_counts(y)
         self.sampling_strategy_ = check_sampling_strategy(
-            self.sampling_strategy, y, self._sampling_type
+            self.sampling_strategy, self._classes_counts, self._sampling_type
         )
 
         output = self._fit_resample(X, y)

From 4cd9116056790a08bc4b99ebe0b64c08574514e3 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sun, 8 Nov 2020 19:20:53 +0100
Subject: [PATCH 28/32] iter

---
 imblearn/base.py                   | 27 ++++++++++++------
 imblearn/dask/preprocessing.py     | 10 +++++++
 imblearn/utils/estimator_checks.py | 46 ++++++++++++++++++------------
 imblearn/utils/wrapper.py          |  9 ++++++
 4 files changed, 66 insertions(+), 26 deletions(-)
 create mode 100644 imblearn/dask/preprocessing.py

diff --git a/imblearn/base.py b/imblearn/base.py
index e3cc26a0f..6099abcba 100644
--- a/imblearn/base.py
+++ b/imblearn/base.py
@@ -18,7 +18,10 @@
     _deprecate_positional_args,
     get_classes_counts,
 )
-from .utils.wrapper import check_classification_targets
+from .utils.wrapper import (
+    check_classification_targets,
+    label_binarize,
+)
 
 
 class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
@@ -49,7 +52,11 @@ def fit(self, X, y):
         self : object
             Return the instance itself.
         """
+        arrays_transformer = ArraysTransformer(X, y)
         dask_collection = any([is_dask_collection(arr) for arr in (X, y)])
+        if dask_collection:
+            X, y = arrays_transformer.to_dask_array(X, y)
+
         if (not dask_collection or
                 (dask_collection and self.validate_if_dask_collection)):
             X, y, _ = self._check_X_y(X, y)
@@ -83,12 +90,14 @@ def fit_resample(self, X, y):
         """
         arrays_transformer = ArraysTransformer(X, y)
         dask_collection = any([is_dask_collection(arr) for arr in (X, y)])
+        if dask_collection:
+            X, y = arrays_transformer.to_dask_array(X, y)
+
         if (not dask_collection or
                 (dask_collection and self.validate_if_dask_collection)):
             check_classification_targets(y)
             X, y, binarize_y = self._check_X_y(X, y)
         else:
-            X, y = arrays_transformer.to_dask_array(X, y)
             binarize_y = False
 
         self._classes_counts = get_classes_counts(y)
@@ -98,9 +107,10 @@ def fit_resample(self, X, y):
 
         output = self._fit_resample(X, y)
 
-        # TODO: label binarize is not implemented with dask
-        y_ = (label_binarize(output[1], np.unique(y))
-              if binarize_y else output[1])
+        if binarize_y:
+            y_ = label_binarize(output[1], classes=np.unique(y))
+        else:
+            y_ = output[1]
 
         X_, y_ = arrays_transformer.transform(output[0], y_)
         return (X_, y_) if len(output) == 2 else (X_, y_, output[2])
@@ -281,9 +291,10 @@ def fit_resample(self, X, y):
         output = self._fit_resample(X, y)
 
         if self.validate:
-
-            y_ = (label_binarize(output[1], np.unique(y))
-                  if binarize_y else output[1])
+            if binarize_y:
+                y_ = label_binarize(output[1], classes=np.unique(y))
+            else:
+                y_ = output[1]
             X_, y_ = arrays_transformer.transform(output[0], y_)
             return (X_, y_) if len(output) == 2 else (X_, y_, output[2])
 
diff --git a/imblearn/dask/preprocessing.py b/imblearn/dask/preprocessing.py
new file mode 100644
index 000000000..f6038b74b
--- /dev/null
+++ b/imblearn/dask/preprocessing.py
@@ -0,0 +1,10 @@
+import numpy as np
+
+
+def label_binarize(y, *, classes):
+    import pandas as pd
+    from dask import dataframe
+
+    cat_dtype = pd.CategoricalDtype(categories=classes)
+    y = dataframe.from_array(y).astype(cat_dtype)
+    return dataframe.get_dummies(y).to_dask_array()
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index 008a011ca..ffa083428 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -295,9 +295,10 @@ def check_samplers_pandas(name, sampler):
     assert_allclose(y_res_s.to_numpy(), y_res)
 
 
-def check_samplers_dask_array(name, sampler):
+def check_samplers_dask_array(name, sampler_orig):
     pytest.importorskip("dask")
     from dask import array
+    sampler = clone(sampler_orig)
     # Check that the samplers handle dask array
     X, y = make_classification(
         n_samples=1000,
@@ -309,20 +310,25 @@ def check_samplers_dask_array(name, sampler):
     X_dask = array.from_array(X, chunks=100)
     y_dask = array.from_array(y, chunks=100)
 
-    X_res_dask, y_res_dask = sampler.fit_resample(X_dask, y_dask)
-    X_res, y_res = sampler.fit_resample(X, y)
+    for validate_if_dask_collection in (True, False):
+        sampler.set_params(
+            validate_if_dask_collection=validate_if_dask_collection
+        )
+        X_res_dask, y_res_dask = sampler.fit_resample(X_dask, y_dask)
+        X_res, y_res = sampler.fit_resample(X, y)
 
-    # check that we return the same type for dataframes or series types
-    assert isinstance(X_res_dask, array.Array)
-    assert isinstance(y_res_dask, array.Array)
+        # check that we return the same type for dataframes or series types
+        assert isinstance(X_res_dask, array.Array)
+        assert isinstance(y_res_dask, array.Array)
 
-    assert_allclose(X_res_dask, X_res)
-    assert_allclose(y_res_dask, y_res)
+        assert_allclose(X_res_dask, X_res)
+        assert_allclose(y_res_dask, y_res)
 
 
-def check_samplers_dask_dataframe(name, sampler):
+def check_samplers_dask_dataframe(name, sampler_orig):
     pytest.importorskip("dask")
     from dask import dataframe
+    sampler = clone(sampler_orig)
     # Check that the samplers handle dask dataframe and dask series
     X, y = make_classification(
         n_samples=1000,
@@ -337,18 +343,22 @@ def check_samplers_dask_dataframe(name, sampler):
     y_s = dataframe.from_array(y)
     y_s = y_s.rename("target")
 
-    X_res_df, y_res_s = sampler.fit_resample(X_df, y_s)
-    X_res, y_res = sampler.fit_resample(X, y)
+    for validate_if_dask_collection in (True, False):
+        sampler.set_params(
+            validate_if_dask_collection=validate_if_dask_collection
+        )
+        X_res_df, y_res_s = sampler.fit_resample(X_df, y_s)
+        X_res, y_res = sampler.fit_resample(X, y)
 
-    # check that we return the same type for dataframes or series types
-    assert isinstance(X_res_df, dataframe.DataFrame)
-    assert isinstance(y_res_s, dataframe.Series)
+        # check that we return the same type for dataframes or series types
+        assert isinstance(X_res_df, dataframe.DataFrame)
+        assert isinstance(y_res_s, dataframe.Series)
 
-    assert X_df.columns.to_list() == X_res_df.columns.to_list()
-    assert y_s.name == y_res_s.name
+        assert X_df.columns.to_list() == X_res_df.columns.to_list()
+        assert y_s.name == y_res_s.name
 
-    assert_allclose(np.array(X_res_df), X_res)
-    assert_allclose(np.array(y_res_s), y_res)
+        assert_allclose(np.array(X_res_df), X_res)
+        assert_allclose(np.array(y_res_s), y_res)
 
 
 def check_samplers_list(name, sampler):
diff --git a/imblearn/utils/wrapper.py b/imblearn/utils/wrapper.py
index d0559d0af..cbc9e1b1d 100644
--- a/imblearn/utils/wrapper.py
+++ b/imblearn/utils/wrapper.py
@@ -1,5 +1,6 @@
 import numpy as np
 
+from sklearn.preprocessing import label_binarize as sklearn_label_binarize
 from sklearn.utils.multiclass import check_classification_targets as \
     sklearn_check_classification_targets
 from sklearn.utils.multiclass import type_of_target as sklearn_type_of_target
@@ -49,3 +50,11 @@ def check_classification_targets(y):
 
         return dask_check_classification_targets(y)
     return sklearn_check_classification_targets(y)
+
+
+def label_binarize(y, *, classes):
+    if is_dask_collection(y):
+        from ..dask.preprocessing import label_binarize as dask_label_binarize
+
+        return dask_label_binarize(y, classes=classes)
+    return sklearn_label_binarize(y, classes=classes)

From a6e975b251079b6c9497b92cc70adee530adeb2f Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sun, 8 Nov 2020 19:23:18 +0100
Subject: [PATCH 29/32] PEP8

---
 imblearn/base.py               | 1 -
 imblearn/dask/preprocessing.py | 3 ---
 2 files changed, 4 deletions(-)

diff --git a/imblearn/base.py b/imblearn/base.py
index 6099abcba..aebe2bf6d 100644
--- a/imblearn/base.py
+++ b/imblearn/base.py
@@ -9,7 +9,6 @@
 import numpy as np
 
 from sklearn.base import BaseEstimator
-from sklearn.preprocessing import label_binarize
 
 from .dask._support import is_dask_collection
 from .utils import check_sampling_strategy, check_target_type
diff --git a/imblearn/dask/preprocessing.py b/imblearn/dask/preprocessing.py
index f6038b74b..3a79fe576 100644
--- a/imblearn/dask/preprocessing.py
+++ b/imblearn/dask/preprocessing.py
@@ -1,6 +1,3 @@
-import numpy as np
-
-
 def label_binarize(y, *, classes):
     import pandas as pd
     from dask import dataframe

From 32eda462602259bf5da468ca8b0434d887cbc82c Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sun, 8 Nov 2020 19:52:10 +0100
Subject: [PATCH 30/32] iter

---
 imblearn/base.py                   | 8 ++++++--
 imblearn/utils/estimator_checks.py | 8 ++++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/imblearn/base.py b/imblearn/base.py
index aebe2bf6d..4d69f5461 100644
--- a/imblearn/base.py
+++ b/imblearn/base.py
@@ -107,7 +107,9 @@ def fit_resample(self, X, y):
         output = self._fit_resample(X, y)
 
         if binarize_y:
-            y_ = label_binarize(output[1], classes=np.unique(y))
+            y_ = label_binarize(
+                output[1], classes=list(self._classes_counts.keys())
+            )
         else:
             y_ = output[1]
 
@@ -291,7 +293,9 @@ def fit_resample(self, X, y):
 
         if self.validate:
             if binarize_y:
-                y_ = label_binarize(output[1], classes=np.unique(y))
+                y_ = label_binarize(
+                    output[1], classes=list(self._classes_counts.keys())
+                )
             else:
                 y_ = output[1]
             X_, y_ = arrays_transformer.transform(output[0], y_)
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index ffa083428..1d7b8e328 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -327,6 +327,7 @@ def check_samplers_dask_array(name, sampler_orig):
 
 def check_samplers_dask_dataframe(name, sampler_orig):
     pytest.importorskip("dask")
+    pd = pytest.importorskip("pandas")
     from dask import dataframe
     sampler = clone(sampler_orig)
     # Check that the samplers handle dask dataframe and dask series
@@ -342,20 +343,27 @@ def check_samplers_dask_dataframe(name, sampler_orig):
     )
     y_s = dataframe.from_array(y)
     y_s = y_s.rename("target")
+    y_s_ohe = dataframe.get_dummies(
+        y_s.astype(pd.CategoricalDtype(categories=[0, 1, 2]))
+    )
 
     for validate_if_dask_collection in (True, False):
         sampler.set_params(
             validate_if_dask_collection=validate_if_dask_collection
         )
         X_res_df, y_res_s = sampler.fit_resample(X_df, y_s)
+        # FIXME: not supported with validate=False
+        X_res, y_res_s_ohe = sampler.fit_resample(X, y_s_ohe)
         X_res, y_res = sampler.fit_resample(X, y)
 
         # check that we return the same type for dataframes or series types
         assert isinstance(X_res_df, dataframe.DataFrame)
         assert isinstance(y_res_s, dataframe.Series)
+        assert isinstance(y_res_s_ohe, dataframe.DataFrame)
 
         assert X_df.columns.to_list() == X_res_df.columns.to_list()
         assert y_s.name == y_res_s.name
+        assert y_s_ohe.columns.to_list() == y_res_s_ohe.columns.to_list()
 
         assert_allclose(np.array(X_res_df), X_res)
         assert_allclose(np.array(y_res_s), y_res)

From 6c592ff367c8ead35ad4e63278446f987bf7c70f Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sun, 8 Nov 2020 20:18:19 +0100
Subject: [PATCH 31/32] iter

---
 imblearn/utils/estimator_checks.py | 71 +++++++++++++++++++++++++++---
 1 file changed, 64 insertions(+), 7 deletions(-)

diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index 1d7b8e328..a5c5bb3f4 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -65,6 +65,10 @@ def _yield_sampler_checks(sampler):
         yield check_samplers_dask_dataframe
     yield check_samplers_list
     yield check_samplers_multiclass_ova
+    if "dask-array" in tags["X_types"]:
+        yield check_samplers_multiclass_ova_dask_array
+    if "dask-dataframe" in tags["X_types"]:
+        yield check_samplers_multiclass_ova_dask_dataframe
     yield check_samplers_preserve_dtype
     yield check_samplers_sample_indices
     yield check_samplers_2d_target
@@ -343,27 +347,20 @@ def check_samplers_dask_dataframe(name, sampler_orig):
     )
     y_s = dataframe.from_array(y)
     y_s = y_s.rename("target")
-    y_s_ohe = dataframe.get_dummies(
-        y_s.astype(pd.CategoricalDtype(categories=[0, 1, 2]))
-    )
 
     for validate_if_dask_collection in (True, False):
         sampler.set_params(
             validate_if_dask_collection=validate_if_dask_collection
         )
         X_res_df, y_res_s = sampler.fit_resample(X_df, y_s)
-        # FIXME: not supported with validate=False
-        X_res, y_res_s_ohe = sampler.fit_resample(X, y_s_ohe)
         X_res, y_res = sampler.fit_resample(X, y)
 
         # check that we return the same type for dataframes or series types
         assert isinstance(X_res_df, dataframe.DataFrame)
         assert isinstance(y_res_s, dataframe.Series)
-        assert isinstance(y_res_s_ohe, dataframe.DataFrame)
 
         assert X_df.columns.to_list() == X_res_df.columns.to_list()
         assert y_s.name == y_res_s.name
-        assert y_s_ohe.columns.to_list() == y_res_s_ohe.columns.to_list()
 
         assert_allclose(np.array(X_res_df), X_res)
         assert_allclose(np.array(y_res_s), y_res)
@@ -408,6 +405,66 @@ def check_samplers_multiclass_ova(name, sampler):
     assert_allclose(y_res, y_res_ova.argmax(axis=1))
 
 
+def check_samplers_multiclass_ova_dask_array(name, sampler_orig):
+    pytest.importorskip("dask")
+    from dask import array
+    sampler = clone(sampler_orig)
+    X, y = make_classification(
+        n_samples=1000,
+        n_classes=3,
+        n_informative=4,
+        weights=[0.2, 0.3, 0.5],
+        random_state=0,
+    )
+    y_ova = label_binarize(y, np.unique(y))
+
+    X = array.from_array(X)
+    y = array.from_array(y)
+    y_ova = array.from_array(y_ova)
+
+    sampler.set_params(validate_if_dask_collection=True)
+    X_res, y_res = sampler.fit_resample(X, y)
+    X_res_ova, y_res_ova = sampler.fit_resample(X, y_ova)
+
+    assert_allclose(X_res, X_res_ova)
+    assert type_of_target(y_res_ova) == type_of_target(y_ova)
+    assert_allclose(y_res, y_res_ova.argmax(axis=1))
+
+    assert isinstance(X_res_ova, array.Array)
+    assert isinstance(y_res, array.Array)
+    assert isinstance(y_res_ova, array.Array)
+
+
+def check_samplers_multiclass_ova_dask_dataframe(name, sampler_orig):
+    pytest.importorskip("dask")
+    from dask import dataframe
+    sampler = clone(sampler_orig)
+    X, y = make_classification(
+        n_samples=1000,
+        n_classes=3,
+        n_informative=4,
+        weights=[0.2, 0.3, 0.5],
+        random_state=0,
+    )
+    y_ova = label_binarize(y, np.unique(y))
+
+    X = dataframe.from_array(X)
+    y = dataframe.from_array(y)
+    y_ova = dataframe.from_array(y_ova)
+
+    sampler.set_params(validate_if_dask_collection=True)
+    X_res, y_res = sampler.fit_resample(X, y)
+    X_res_ova, y_res_ova = sampler.fit_resample(X, y_ova)
+
+    assert_allclose(X_res, X_res_ova)
+    assert type_of_target(y_res_ova) == type_of_target(y_ova)
+    assert_allclose(y_res, y_res_ova.to_dask_array().argmax(axis=1))
+
+    assert isinstance(X_res_ova, dataframe.DataFrame)
+    assert isinstance(y_res, dataframe.Series)
+    assert isinstance(y_res_ova, dataframe.DataFrame)
+
+
 def check_samplers_2d_target(name, sampler):
     X, y = make_classification(
         n_samples=100,

From 456c3ebbf5623a681c07211cdf032943532921c8 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sun, 8 Nov 2020 20:22:28 +0100
Subject: [PATCH 32/32] PEP8

---
 imblearn/utils/estimator_checks.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index a5c5bb3f4..ceb828272 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -331,7 +331,6 @@ def check_samplers_dask_array(name, sampler_orig):
 
 def check_samplers_dask_dataframe(name, sampler_orig):
     pytest.importorskip("dask")
-    pd = pytest.importorskip("pandas")
     from dask import dataframe
     sampler = clone(sampler_orig)
     # Check that the samplers handle dask dataframe and dask series