Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH make Random*Sampler accept dask array and dataframe #777

Open
wants to merge 32 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ jobs:
PYTHON_VERSION: '3.8'
COVERAGE: 'true'
PANDAS_VERSION: '*'
DASK_VERSION: '*'
TEST_DOCSTRINGS: 'true'
JOBLIB_VERSION: '*'
CHECK_WARNINGS: 'true'
Expand All @@ -43,6 +44,7 @@ jobs:
PYTHON_VERSION: '3.7'
INSTALL_MKL: 'true'
PANDAS_VERSION: '*'
DASK_VERSION: '*'
KERAS_VERSION: '*'
COVERAGE: 'true'
JOBLIB_VERSION: '*'
Expand All @@ -51,6 +53,7 @@ jobs:
DISTRIB: 'conda'
PYTHON_VERSION: '3.8'
PANDAS_VERSION: '*'
DASK_VERSION: '*'
JOBLIB_VERSION: '*'
INSTALL_MKL: 'true'
TENSORFLOW_VERSION: '*'
Expand Down
7 changes: 6 additions & 1 deletion build_tools/azure/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ if [[ "$DISTRIB" == "conda" ]]; then
TO_INSTALL="$TO_INSTALL pandas=$PANDAS_VERSION"
fi

if [[ -n "$DASK_VERSION" ]]; then
TO_INSTALL="$TO_INSTALL dask=$DASK_VERSION"
fi

if [[ -n "$KERAS_VERSION" ]]; then
TO_INSTALL="$TO_INSTALL keras=$KERAS_VERSION tensorflow=1"
KERAS_BACKEND=tensorflow
Expand Down Expand Up @@ -90,9 +94,10 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then
make_conda "python=$PYTHON_VERSION"
python -m pip install -U pip
python -m pip install numpy scipy joblib cython
python -m pip install pandas
python -m pip install "dask[complete]"
python -m pip install scikit-learn
python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist
python -m pip install pandas
fi

if [[ "$COVERAGE" == "true" ]]; then
Expand Down
4 changes: 2 additions & 2 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@ def pytest_runtest_setup(item):
if (fname.endswith(os.path.join('keras', '_generator.py')) or
fname.endswith('miscellaneous.rst')):
try:
import keras
import keras # noqa
except ImportError:
pytest.skip('The keras package is not installed.')
elif (fname.endswith(os.path.join('tensorflow', '_generator.py')) or
fname.endswith('miscellaneous.rst')):
try:
import tensorflow
import tensorflow # noqa
except ImportError:
pytest.skip('The tensorflow package is not installed.')
2 changes: 1 addition & 1 deletion doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,6 @@ Imbalance-learn provides some fast-prototyping tools.
:toctree: generated/
:template: function.rst

utils.estimator_checks.parametrize_with_checks
utils.check_neighbors_object
utils.check_sampling_strategy
utils.get_classes_counts
69 changes: 53 additions & 16 deletions imblearn/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@
import numpy as np

from sklearn.base import BaseEstimator
from sklearn.preprocessing import label_binarize
from sklearn.utils.multiclass import check_classification_targets

from .dask._support import is_dask_collection
from .utils import check_sampling_strategy, check_target_type
from .utils._validation import ArraysTransformer
from .utils._validation import _deprecate_positional_args
from .utils._validation import (
ArraysTransformer,
_deprecate_positional_args,
get_classes_counts,
)
from .utils.wrapper import (
check_classification_targets,
label_binarize,
)


class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
Expand Down Expand Up @@ -45,9 +51,18 @@
self : object
Return the instance itself.
"""
X, y, _ = self._check_X_y(X, y)
arrays_transformer = ArraysTransformer(X, y)
dask_collection = any([is_dask_collection(arr) for arr in (X, y)])
if dask_collection:
X, y = arrays_transformer.to_dask_array(X, y)

Check warning on line 57 in imblearn/base.py

View check run for this annotation

Codecov / codecov/patch

imblearn/base.py#L57

Added line #L57 was not covered by tests

if (not dask_collection or
(dask_collection and self.validate_if_dask_collection)):
X, y, _ = self._check_X_y(X, y)

self._classes_counts = get_classes_counts(y)
self.sampling_strategy_ = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type
self.sampling_strategy, self._classes_counts, self._sampling_type
)
return self

Expand All @@ -72,18 +87,31 @@
y_resampled : array-like of shape (n_samples_new,)
The corresponding label of `X_resampled`.
"""
check_classification_targets(y)
arrays_transformer = ArraysTransformer(X, y)
X, y, binarize_y = self._check_X_y(X, y)
dask_collection = any([is_dask_collection(arr) for arr in (X, y)])
if dask_collection:
X, y = arrays_transformer.to_dask_array(X, y)

Check warning on line 93 in imblearn/base.py

View check run for this annotation

Codecov / codecov/patch

imblearn/base.py#L93

Added line #L93 was not covered by tests

if (not dask_collection or
(dask_collection and self.validate_if_dask_collection)):
check_classification_targets(y)
X, y, binarize_y = self._check_X_y(X, y)
else:
binarize_y = False

Check warning on line 100 in imblearn/base.py

View check run for this annotation

Codecov / codecov/patch

imblearn/base.py#L100

Added line #L100 was not covered by tests

self._classes_counts = get_classes_counts(y)
self.sampling_strategy_ = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type
self.sampling_strategy, self._classes_counts, self._sampling_type
)

output = self._fit_resample(X, y)

y_ = (label_binarize(output[1], np.unique(y))
if binarize_y else output[1])
if binarize_y:
y_ = label_binarize(
output[1], classes=list(self._classes_counts.keys())
)
else:
y_ = output[1]

X_, y_ = arrays_transformer.transform(output[0], y_)
return (X_, y_) if len(output) == 2 else (X_, y_, output[2])
Expand Down Expand Up @@ -124,8 +152,13 @@
instead.
"""

def __init__(self, sampling_strategy="auto"):
def __init__(
self,
sampling_strategy="auto",
validate_if_dask_collection=False,
):
self.sampling_strategy = sampling_strategy
self.validate_if_dask_collection = validate_if_dask_collection

def _check_X_y(self, X, y, accept_sparse=None):
if accept_sparse is None:
Expand Down Expand Up @@ -251,16 +284,20 @@
X, y, accept_sparse=self.accept_sparse
)

self._classes_counts = get_classes_counts(y)
self.sampling_strategy_ = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type
self.sampling_strategy, self._classes_counts, self._sampling_type
)

output = self._fit_resample(X, y)

if self.validate:

y_ = (label_binarize(output[1], np.unique(y))
if binarize_y else output[1])
if binarize_y:
y_ = label_binarize(
output[1], classes=list(self._classes_counts.keys())
)
else:
y_ = output[1]
X_, y_ = arrays_transformer.transform(output[0], y_)
return (X_, y_) if len(output) == 2 else (X_, y_, output[2])

Expand Down
Empty file added imblearn/dask/__init__.py
Empty file.
9 changes: 9 additions & 0 deletions imblearn/dask/_support.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
def is_dask_collection(container):
    """Return True when ``container`` is a dask collection.

    Dask is an optional dependency: when it cannot be imported, nothing
    can be a dask collection, so the check simply returns False.
    """
    try:
        # Import lazily so that dask stays an optional dependency.
        from dask import is_dask_collection as dask_check
    except ImportError:
        return False
    return dask_check(container)
7 changes: 7 additions & 0 deletions imblearn/dask/preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
def label_binarize(y, *, classes):
    """Binarize labels in a one-vs-all fashion using dask.

    ``y`` is turned into a dask Series with a categorical dtype covering
    ``classes``; its dummy (one-hot) encoding is then returned as a dask
    array.
    """
    from dask import dataframe
    import pandas as pd

    categorical = pd.CategoricalDtype(categories=classes)
    as_series = dataframe.from_array(y).astype(categorical)
    return dataframe.get_dummies(as_series).to_dask_array()
Empty file added imblearn/dask/tests/__init__.py
Empty file.
40 changes: 40 additions & 0 deletions imblearn/dask/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import numpy as np
import pytest

dask = pytest.importorskip("dask")
from dask import array

from imblearn.dask.utils import is_multilabel
from imblearn.dask.utils import type_of_target


@pytest.mark.parametrize(
    "y, expected",
    [
        (array.from_array(np.array([0, 1, 0, 1])), False),
        (array.from_array(np.array([[1, 0], [0, 0]])), True),
        (array.from_array(np.array([[1], [0], [0]])), False),
        (array.from_array(np.array([[1, 0, 0]])), True),
    ],
)
def test_is_multilabel(y, expected):
    # `is` also checks that a plain bool is returned, not a dask scalar.
    assert is_multilabel(y) is expected


@pytest.mark.parametrize(
    "y, expected",
    [
        (array.from_array(np.array([[1, 0], [0, 0]])), "multilabel-indicator"),
        (array.from_array(np.array([[1, 0, 0]])), "multilabel-indicator"),
        (array.from_array(np.array([[[1, 2]]])), "unknown"),
        (array.from_array(np.array([[]])), "unknown"),
        (array.from_array(np.array([.1, .2, 3])), "continuous"),
        (array.from_array(np.array([[.1, .2, 3]])), "continuous-multioutput"),
        (array.from_array(np.array([[1., .2]])), "continuous-multioutput"),
        (array.from_array(np.array([1, 2])), "binary"),
        (array.from_array(np.array(["a", "b"])), "binary"),
    ],
)
def test_type_of_target(y, expected):
    assert type_of_target(y) == expected
78 changes: 78 additions & 0 deletions imblearn/dask/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import warnings

import numpy as np
from sklearn.exceptions import DataConversionWarning
from sklearn.utils.multiclass import _is_integral_float


def is_multilabel(y):
    """Check whether ``y`` is in a multilabel-indicator format.

    Dask-aware counterpart of
    :func:`sklearn.utils.multiclass.is_multilabel`: unique labels are
    gathered either through a dask Series/DataFrame ``unique`` method or
    by materializing ``np.unique`` on a dask array.

    NOTE(review): computing the unique labels may load the full target;
    callers may want a way to skip this check on large collections.
    """
    # Multilabel targets are 2d with more than one column.
    if y.ndim != 2 or y.shape[1] <= 1:
        return False

    if hasattr(y, "unique"):
        # dask Series/DataFrame expose `unique` directly.
        labels = np.asarray(y.unique())
    else:
        # dask array: np.unique dispatches to dask, then materialize.
        labels = np.unique(y).compute()

    if len(labels) >= 3:
        return False
    return y.dtype.kind in 'biu' or _is_integral_float(labels)


def type_of_target(y):
    """Determine the type of data indicated by the target ``y``.

    Dask-aware counterpart of
    :func:`sklearn.utils.multiclass.type_of_target`, restricted to the
    target kinds the samplers need to recognize.
    """
    if is_multilabel(y):
        return 'multilabel-indicator'

    if y.ndim > 2:
        return 'unknown'
    if y.ndim == 2 and y.shape[1] == 0:
        # e.g. [[]]
        return 'unknown'

    # 2d targets with several columns are multioutput.
    multioutput = y.ndim == 2 and y.shape[1] > 1
    suffix = "-multioutput" if multioutput else ""

    # Float targets holding non-integer values are continuous.
    # NOTE: infinite values are not checked here.
    if y.dtype.kind == 'f' and np.any(y != y.astype(int)):
        return 'continuous' + suffix

    if hasattr(y, "unique"):
        # dask Series/DataFrame expose `unique` directly.
        labels = np.asarray(y.unique())
    else:
        # dask array: np.unique dispatches to dask, then materialize.
        labels = np.unique(y).compute()

    if len(labels) > 2 or (y.ndim >= 2 and len(y[0]) > 1):
        # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
        return 'multiclass' + suffix
    # [1, 2] or [["a"], ["b"]]
    return 'binary'


def column_or_1d(y, *, warn=False):
    """Ravel ``y`` into a 1d array, accepting column vectors.

    Parameters
    ----------
    y : array-like
        Target to be flattened.
    warn : bool, default=False
        Emit a ``DataConversionWarning`` when a 2d column vector is
        raveled instead of a 1d array.

    Returns
    -------
    Raveled 1d view of ``y``.

    Raises
    ------
    ValueError
        When ``y`` is neither 1d nor a single-column 2d array.
    """
    shape = y.shape
    ndim = len(shape)
    if ndim == 1:
        return y.ravel()
    if ndim == 2 and shape[1] == 1:
        if warn:
            warnings.warn(
                "A column-vector y was passed when a 1d array was expected. "
                "Please change the shape of y to (n_samples, ), for example "
                "using ravel().", DataConversionWarning, stacklevel=2
            )
        return y.ravel()
    raise ValueError(
        f"y should be a 1d array. Got an array of shape {shape} instead."
    )


def check_classification_targets(y):
    """Ensure that the target ``y`` is of a classification type.

    Raises
    ------
    ValueError
        When ``y`` is not recognized as a classification target.
    """
    y_type = type_of_target(y)
    allowed = {
        'binary',
        'multiclass',
        'multiclass-multioutput',
        'multilabel-indicator',
        'multilabel-sequences',
    }
    if y_type not in allowed:
        raise ValueError("Unknown label type: %r" % y_type)

Check warning on line 78 in imblearn/dask/utils.py

View check run for this annotation

Codecov / codecov/patch

imblearn/dask/utils.py#L78

Added line #L78 was not covered by tests
9 changes: 6 additions & 3 deletions imblearn/datasets/_imbalance.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@

from ..under_sampling import RandomUnderSampler
from ..utils import check_sampling_strategy
from ..utils._validation import _deprecate_positional_args
from ..utils._validation import (
_deprecate_positional_args,
get_classes_counts,
)


@_deprecate_positional_args
Expand Down Expand Up @@ -87,11 +90,11 @@ def make_imbalance(
>>> print('Distribution after imbalancing: {}'.format(Counter(y_res)))
Distribution after imbalancing: Counter({2: 30, 1: 20, 0: 10})
"""
target_stats = Counter(y)
target_stats = get_classes_counts(y)
# restrict ratio to be a dict or a callable
if isinstance(sampling_strategy, dict) or callable(sampling_strategy):
sampling_strategy_ = check_sampling_strategy(
sampling_strategy, y, "under-sampling", **kwargs
sampling_strategy, target_stats, "under-sampling", **kwargs
)
else:
raise ValueError(
Expand Down
8 changes: 6 additions & 2 deletions imblearn/ensemble/_bagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@
from ..utils import Substitution, check_target_type, check_sampling_strategy
from ..utils._docstring import _n_jobs_docstring
from ..utils._docstring import _random_state_docstring
from ..utils._validation import _deprecate_positional_args
from ..utils._validation import (
_deprecate_positional_args,
get_classes_counts,
)


@Substitution(
Expand Down Expand Up @@ -216,11 +219,12 @@ def __init__(

def _validate_y(self, y):
y_encoded = super()._validate_y(y)
classes_counts = get_classes_counts(y)
if isinstance(self.sampling_strategy, dict):
self._sampling_strategy = {
np.where(self.classes_ == key)[0][0]: value
for key, value in check_sampling_strategy(
self.sampling_strategy, y, 'under-sampling',
self.sampling_strategy, classes_counts, 'under-sampling',
).items()
}
else:
Expand Down
Loading