Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH make Random*Sampler accept dask array and dataframe #777

Open
wants to merge 32 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ jobs:
PYTHON_VERSION: '3.8'
COVERAGE: 'true'
PANDAS_VERSION: '*'
DASK_VERSION: '*'
TEST_DOCSTRINGS: 'true'
JOBLIB_VERSION: '*'
CHECK_WARNINGS: 'true'
Expand All @@ -43,6 +44,7 @@ jobs:
PYTHON_VERSION: '3.7'
INSTALL_MKL: 'true'
PANDAS_VERSION: '*'
DASK_VERSION: '*'
KERAS_VERSION: '*'
COVERAGE: 'true'
JOBLIB_VERSION: '*'
Expand All @@ -51,6 +53,7 @@ jobs:
DISTRIB: 'conda'
PYTHON_VERSION: '3.8'
PANDAS_VERSION: '*'
DASK_VERSION: '*'
JOBLIB_VERSION: '*'
INSTALL_MKL: 'true'
TENSORFLOW_VERSION: '*'
Expand Down
7 changes: 6 additions & 1 deletion build_tools/azure/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ if [[ "$DISTRIB" == "conda" ]]; then
TO_INSTALL="$TO_INSTALL pandas=$PANDAS_VERSION"
fi

if [[ -n "$DASK_VERSION" ]]; then
TO_INSTALL="$TO_INSTALL dask=$DASK_VERSION"
fi

if [[ -n "$KERAS_VERSION" ]]; then
TO_INSTALL="$TO_INSTALL keras=$KERAS_VERSION tensorflow=1"
KERAS_BACKEND=tensorflow
Expand Down Expand Up @@ -90,9 +94,10 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then
make_conda "python=$PYTHON_VERSION"
python -m pip install -U pip
python -m pip install numpy scipy joblib cython
python -m pip install pandas
python -m pip install "dask[complete]"
python -m pip install scikit-learn
python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist
python -m pip install pandas
fi

if [[ "$COVERAGE" == "true" ]]; then
Expand Down
4 changes: 2 additions & 2 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@ def pytest_runtest_setup(item):
if (fname.endswith(os.path.join('keras', '_generator.py')) or
fname.endswith('miscellaneous.rst')):
try:
import keras
import keras # noqa
except ImportError:
pytest.skip('The keras package is not installed.')
elif (fname.endswith(os.path.join('tensorflow', '_generator.py')) or
fname.endswith('miscellaneous.rst')):
try:
import tensorflow
import tensorflow # noqa
except ImportError:
pytest.skip('The tensorflow package is not installed.')
2 changes: 1 addition & 1 deletion doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,6 @@ Imbalance-learn provides some fast-prototyping tools.
:toctree: generated/
:template: function.rst

utils.estimator_checks.parametrize_with_checks
utils.check_neighbors_object
utils.check_sampling_strategy
utils.get_classes_counts
69 changes: 53 additions & 16 deletions imblearn/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@
import numpy as np

from sklearn.base import BaseEstimator
from sklearn.preprocessing import label_binarize
from sklearn.utils.multiclass import check_classification_targets

from .dask._support import is_dask_collection
from .utils import check_sampling_strategy, check_target_type
from .utils._validation import ArraysTransformer
from .utils._validation import _deprecate_positional_args
from .utils._validation import (
ArraysTransformer,
_deprecate_positional_args,
get_classes_counts,
)
from .utils.wrapper import (
check_classification_targets,
label_binarize,
)


class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
Expand Down Expand Up @@ -45,9 +51,18 @@
self : object
Return the instance itself.
"""
X, y, _ = self._check_X_y(X, y)
arrays_transformer = ArraysTransformer(X, y)
dask_collection = any([is_dask_collection(arr) for arr in (X, y)])
if dask_collection:
X, y = arrays_transformer.to_dask_array(X, y)

Check warning on line 57 in imblearn/base.py

View check run for this annotation

Codecov / codecov/patch

imblearn/base.py#L57

Added line #L57 was not covered by tests

if (not dask_collection or
(dask_collection and self.validate_if_dask_collection)):
X, y, _ = self._check_X_y(X, y)

self._classes_counts = get_classes_counts(y)
self.sampling_strategy_ = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type
self.sampling_strategy, self._classes_counts, self._sampling_type
)
return self

Expand All @@ -72,18 +87,31 @@
y_resampled : array-like of shape (n_samples_new,)
The corresponding label of `X_resampled`.
"""
check_classification_targets(y)
arrays_transformer = ArraysTransformer(X, y)
X, y, binarize_y = self._check_X_y(X, y)
dask_collection = any([is_dask_collection(arr) for arr in (X, y)])
if dask_collection:
X, y = arrays_transformer.to_dask_array(X, y)

Check warning on line 93 in imblearn/base.py

View check run for this annotation

Codecov / codecov/patch

imblearn/base.py#L93

Added line #L93 was not covered by tests

if (not dask_collection or
(dask_collection and self.validate_if_dask_collection)):
check_classification_targets(y)
X, y, binarize_y = self._check_X_y(X, y)
else:
binarize_y = False

Check warning on line 100 in imblearn/base.py

View check run for this annotation

Codecov / codecov/patch

imblearn/base.py#L100

Added line #L100 was not covered by tests

self._classes_counts = get_classes_counts(y)
self.sampling_strategy_ = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type
self.sampling_strategy, self._classes_counts, self._sampling_type
)

output = self._fit_resample(X, y)

y_ = (label_binarize(output[1], np.unique(y))
if binarize_y else output[1])
if binarize_y:
y_ = label_binarize(
output[1], classes=list(self._classes_counts.keys())
)
else:
y_ = output[1]

X_, y_ = arrays_transformer.transform(output[0], y_)
return (X_, y_) if len(output) == 2 else (X_, y_, output[2])
Expand Down Expand Up @@ -124,8 +152,13 @@
instead.
"""

def __init__(self, sampling_strategy="auto"):
def __init__(
self,
sampling_strategy="auto",
validate_if_dask_collection=False,
):
self.sampling_strategy = sampling_strategy
self.validate_if_dask_collection = validate_if_dask_collection

def _check_X_y(self, X, y, accept_sparse=None):
if accept_sparse is None:
Expand Down Expand Up @@ -251,16 +284,20 @@
X, y, accept_sparse=self.accept_sparse
)

self._classes_counts = get_classes_counts(y)
self.sampling_strategy_ = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type
self.sampling_strategy, self._classes_counts, self._sampling_type
)

output = self._fit_resample(X, y)

if self.validate:

y_ = (label_binarize(output[1], np.unique(y))
if binarize_y else output[1])
if binarize_y:
y_ = label_binarize(
output[1], classes=list(self._classes_counts.keys())
)
else:
y_ = output[1]
X_, y_ = arrays_transformer.transform(output[0], y_)
return (X_, y_) if len(output) == 2 else (X_, y_, output[2])

Expand Down
Empty file added imblearn/dask/__init__.py
Empty file.
9 changes: 9 additions & 0 deletions imblearn/dask/_support.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
def is_dask_collection(container):
    """Return True when ``container`` is a dask collection.

    Dask is an optional dependency: when it cannot be imported, nothing
    can be a dask collection, so the check simply returns False.
    """
    try:
        # Import lazily so that dask stays an optional dependency.
        from dask import is_dask_collection as dask_check
    except ImportError:
        return False
    return dask_check(container)
7 changes: 7 additions & 0 deletions imblearn/dask/preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
def label_binarize(y, *, classes):
    """Binarize labels in a one-vs-all fashion using dask.

    ``y`` is turned into a dask Series with a categorical dtype covering
    ``classes``; its dummy (one-hot) encoding is then returned as a dask
    array.
    """
    from dask import dataframe
    import pandas as pd

    categorical = pd.CategoricalDtype(categories=classes)
    as_series = dataframe.from_array(y).astype(categorical)
    return dataframe.get_dummies(as_series).to_dask_array()
Empty file added imblearn/dask/tests/__init__.py
Empty file.
40 changes: 40 additions & 0 deletions imblearn/dask/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import numpy as np
import pytest

dask = pytest.importorskip("dask")
from dask import array

from imblearn.dask.utils import is_multilabel
from imblearn.dask.utils import type_of_target


@pytest.mark.parametrize(
    "y, expected",
    [
        (array.from_array(np.array([0, 1, 0, 1])), False),
        (array.from_array(np.array([[1, 0], [0, 0]])), True),
        (array.from_array(np.array([[1], [0], [0]])), False),
        (array.from_array(np.array([[1, 0, 0]])), True),
    ],
)
def test_is_multilabel(y, expected):
    # `is` also checks that a plain bool is returned, not a dask scalar.
    assert is_multilabel(y) is expected


@pytest.mark.parametrize(
    "y, expected",
    [
        (array.from_array(np.array([[1, 0], [0, 0]])), "multilabel-indicator"),
        (array.from_array(np.array([[1, 0, 0]])), "multilabel-indicator"),
        (array.from_array(np.array([[[1, 2]]])), "unknown"),
        (array.from_array(np.array([[]])), "unknown"),
        (array.from_array(np.array([.1, .2, 3])), "continuous"),
        (array.from_array(np.array([[.1, .2, 3]])), "continuous-multioutput"),
        (array.from_array(np.array([[1., .2]])), "continuous-multioutput"),
        (array.from_array(np.array([1, 2])), "binary"),
        (array.from_array(np.array(["a", "b"])), "binary"),
    ],
)
def test_type_of_target(y, expected):
    assert type_of_target(y) == expected
78 changes: 78 additions & 0 deletions imblearn/dask/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import warnings

import numpy as np
from sklearn.exceptions import DataConversionWarning
from sklearn.utils.multiclass import _is_integral_float


def is_multilabel(y):
    """Check whether ``y`` is in a multilabel-indicator format.

    Dask-aware counterpart of
    :func:`sklearn.utils.multiclass.is_multilabel`: unique labels are
    gathered either through a dask Series/DataFrame ``unique`` method or
    by materializing ``np.unique`` on a dask array.

    NOTE(review): computing the unique labels may load the full target;
    callers may want a way to skip this check on large collections.
    """
    # Multilabel targets are 2d with more than one column.
    if y.ndim != 2 or y.shape[1] <= 1:
        return False

    if hasattr(y, "unique"):
        # dask Series/DataFrame expose `unique` directly.
        labels = np.asarray(y.unique())
    else:
        # dask array: np.unique dispatches to dask, then materialize.
        labels = np.unique(y).compute()

    if len(labels) >= 3:
        return False
    return y.dtype.kind in 'biu' or _is_integral_float(labels)


def type_of_target(y):
    """Determine the type of data indicated by the target ``y``.

    Dask-aware counterpart of
    :func:`sklearn.utils.multiclass.type_of_target`, restricted to the
    target kinds the samplers need to recognize.
    """
    if is_multilabel(y):
        return 'multilabel-indicator'

    if y.ndim > 2:
        return 'unknown'
    if y.ndim == 2 and y.shape[1] == 0:
        # e.g. [[]]
        return 'unknown'

    # 2d targets with several columns are multioutput.
    multioutput = y.ndim == 2 and y.shape[1] > 1
    suffix = "-multioutput" if multioutput else ""

    # Float targets holding non-integer values are continuous.
    # NOTE: infinite values are not checked here.
    if y.dtype.kind == 'f' and np.any(y != y.astype(int)):
        return 'continuous' + suffix

    if hasattr(y, "unique"):
        # dask Series/DataFrame expose `unique` directly.
        labels = np.asarray(y.unique())
    else:
        # dask array: np.unique dispatches to dask, then materialize.
        labels = np.unique(y).compute()

    if len(labels) > 2 or (y.ndim >= 2 and len(y[0]) > 1):
        # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
        return 'multiclass' + suffix
    # [1, 2] or [["a"], ["b"]]
    return 'binary'


def column_or_1d(y, *, warn=False):
    """Ravel ``y`` into a 1d array, accepting column vectors.

    Parameters
    ----------
    y : array-like
        Target to be flattened.
    warn : bool, default=False
        Emit a ``DataConversionWarning`` when a 2d column vector is
        raveled instead of a 1d array.

    Returns
    -------
    Raveled 1d view of ``y``.

    Raises
    ------
    ValueError
        When ``y`` is neither 1d nor a single-column 2d array.
    """
    shape = y.shape
    ndim = len(shape)
    if ndim == 1:
        return y.ravel()
    if ndim == 2 and shape[1] == 1:
        if warn:
            warnings.warn(
                "A column-vector y was passed when a 1d array was expected. "
                "Please change the shape of y to (n_samples, ), for example "
                "using ravel().", DataConversionWarning, stacklevel=2
            )
        return y.ravel()
    raise ValueError(
        f"y should be a 1d array. Got an array of shape {shape} instead."
    )


def check_classification_targets(y):
    """Ensure that the target ``y`` is of a classification type.

    Raises
    ------
    ValueError
        When ``y`` is not recognized as a classification target.
    """
    y_type = type_of_target(y)
    allowed = {
        'binary',
        'multiclass',
        'multiclass-multioutput',
        'multilabel-indicator',
        'multilabel-sequences',
    }
    if y_type not in allowed:
        raise ValueError("Unknown label type: %r" % y_type)

Check warning on line 78 in imblearn/dask/utils.py

View check run for this annotation

Codecov / codecov/patch

imblearn/dask/utils.py#L78

Added line #L78 was not covered by tests
9 changes: 6 additions & 3 deletions imblearn/datasets/_imbalance.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@

from ..under_sampling import RandomUnderSampler
from ..utils import check_sampling_strategy
from ..utils._validation import _deprecate_positional_args
from ..utils._validation import (
_deprecate_positional_args,
get_classes_counts,
)


@_deprecate_positional_args
Expand Down Expand Up @@ -87,11 +90,11 @@ def make_imbalance(
>>> print('Distribution after imbalancing: {}'.format(Counter(y_res)))
Distribution after imbalancing: Counter({2: 30, 1: 20, 0: 10})
"""
target_stats = Counter(y)
target_stats = get_classes_counts(y)
# restrict ratio to be a dict or a callable
if isinstance(sampling_strategy, dict) or callable(sampling_strategy):
sampling_strategy_ = check_sampling_strategy(
sampling_strategy, y, "under-sampling", **kwargs
sampling_strategy, target_stats, "under-sampling", **kwargs
)
else:
raise ValueError(
Expand Down
8 changes: 6 additions & 2 deletions imblearn/ensemble/_bagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@
from ..utils import Substitution, check_target_type, check_sampling_strategy
from ..utils._docstring import _n_jobs_docstring
from ..utils._docstring import _random_state_docstring
from ..utils._validation import _deprecate_positional_args
from ..utils._validation import (
_deprecate_positional_args,
get_classes_counts,
)


@Substitution(
Expand Down Expand Up @@ -216,11 +219,12 @@ def __init__(

def _validate_y(self, y):
y_encoded = super()._validate_y(y)
classes_counts = get_classes_counts(y)
if isinstance(self.sampling_strategy, dict):
self._sampling_strategy = {
np.where(self.classes_ == key)[0][0]: value
for key, value in check_sampling_strategy(
self.sampling_strategy, y, 'under-sampling',
self.sampling_strategy, classes_counts, 'under-sampling',
).items()
}
else:
Expand Down
Loading