Skip to content

Commit

Permalink
177 pipeline slicing (#194)
Browse files Browse the repository at this point in the history
* added array slicing to sklearn pipelines
* reformatted tests in order of features added
  • Loading branch information
shania-m authored Mar 14, 2022
1 parent ed92818 commit 4984393
Show file tree
Hide file tree
Showing 2 changed files with 163 additions and 75 deletions.
44 changes: 44 additions & 0 deletions rubicon_ml/sklearn/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,50 @@ def get_estimator_logger(self, step_name=None, estimator=None):

return logger

def __getitem__(self, ind):
    """Return a sub-pipeline or a single estimator from this pipeline.

    Based on the ``__getitem__`` method of ``sklearn.pipeline.Pipeline``,
    except that slicing returns a Rubicon pipeline built with this
    pipeline's project, the user-defined loggers that belong to the
    selected steps, and the experiment kwargs.

    Parameters
    ----------
    ind : slice, int or str
        A slice selects a subset of steps. Any other value is first used
        to index ``self.steps``; if that raises ``TypeError`` it is used
        as a step name to look up in ``self.named_steps``.

    Returns
    -------
    A sub-pipeline or a single estimator in the pipeline.
        Indexing with an integer (or a step name) returns an estimator;
        using a slice returns another pipeline instance which copies a
        slice of this pipeline.

    Raises
    ------
    ValueError
        If ``ind`` is a slice whose step is neither 1 nor ``None``.

    Notes
    -----
    The copy is shallow: modifying (or fitting) estimators in the
    sub-pipeline will affect the larger pipeline and vice-versa.
    However, replacing a value in ``steps`` will not affect a copy.
    (doc string source: Scikit-Learn)
    """
    if isinstance(ind, slice):
        if ind.step not in (1, None):
            raise ValueError("Pipeline slicing only supports a step of 1")
        # Slice once and reuse the result for both the logger lookup and
        # the new pipeline, instead of slicing ``self.steps`` twice.
        sliced_steps = self.steps[ind]
        return self.__class__(
            self.project,
            sliced_steps,
            self.__get_logger_slice__(sliced_steps),
            self.experiment_kwargs,
            memory=self.memory,
            verbose=self.verbose,
        )
    try:
        _, est = self.steps[ind]
    except TypeError:
        # Not an int, try to get the step by name
        return self.named_steps[ind]
    return est

def __get_logger_slice__(self, steps):
    """Given a slice of estimators, returns the associated slice of loggers.

    Parameters
    ----------
    steps : iterable of (name, estimator) pairs
        The steps whose loggers should be selected.

    Returns
    -------
    dict
        The entries of ``self.user_defined_loggers`` whose key is the name
        of one of the given steps. Steps without a user-defined logger are
        left out.
    """
    return {
        name: self.user_defined_loggers[name]
        for name, _ in steps
        if name in self.user_defined_loggers
    }


def make_pipeline(
project,
Expand Down
194 changes: 119 additions & 75 deletions tests/unit/sklearn/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,78 +80,6 @@ def test_score_logs_metric(project_client, fake_estimator_cls):
mock_log_metric.assert_called_once()


def test_multiple_fit_multiple_scores(project_client, fake_estimator_cls):
    """Fit twice, then score twice, checking logger calls and experiment counts."""
    project = project_client
    estimator = fake_estimator_cls()
    steps = [("est", estimator)]
    user_defined_logger = {"est": FilterEstimatorLogger(ignore_all=True)}
    pipeline = RubiconPipeline(project, steps, user_defined_logger)

    with patch.object(Pipeline, "fit", return_value=None):
        with patch.object(
            FilterEstimatorLogger, "log_parameters", return_value=None
        ) as mock_log_parameters:
            pipeline.fit(["fake data"])
            pipeline.fit("additional fake data")

    # ``call_count`` is the public Mock attribute; avoid the private
    # ``_mock_call_count`` backing field.
    assert mock_log_parameters.call_count == 2
    assert len(project.experiments()) == 2

    with patch.object(Pipeline, "score", return_value=None):
        with patch.object(EstimatorLogger, "log_metric", return_value=None) as mock_log_metric:
            pipeline.score(["fake data"])
            assert pipeline.experiment is None
            pipeline.score(["additional fake data"])

    assert mock_log_metric.call_count == 2
    # NOTE(review): two scores add only one experiment -- presumably the
    # first score reuses the experiment left over from the last fit;
    # confirm against RubiconPipeline.score.
    assert len(project.experiments()) == 3


def test_multiple_scores(project_client, fake_estimator_cls):
    """Score twice: once with an explicit experiment, once with a default one."""
    project = project_client
    estimator = fake_estimator_cls()
    steps = [("est", estimator)]
    pipeline = RubiconPipeline(project, steps)

    # Bind each mock to a name that matches what it patches, so the
    # assertions below check both the score calls and the metric logging.
    with patch.object(Pipeline, "score", return_value=None) as mock_score:
        with patch.object(EstimatorLogger, "log_metric", return_value=None) as mock_log_metric:
            # first score gets its own explicitly declared experiment
            experiment = project.log_experiment(name="fake experiment")
            pipeline.score(["fake data"], experiment=experiment)
            pipeline.score(["additional fake data"])

    experiments = project.experiments()
    assert len(experiments) == 2
    assert mock_score.call_count == 2
    assert mock_log_metric.call_count == 2
    assert experiments[0].name == "fake experiment"
    assert experiments[1].name == "RubiconPipeline experiment"


def test_multiple_fits(project_client, fake_estimator_cls):
    """Fit twice: once with an explicit experiment, once with a default one."""
    project = project_client
    estimator = fake_estimator_cls()
    steps = [("est", estimator)]
    user_defined_logger = {"est": FilterEstimatorLogger(ignore_all=True)}

    pipeline = RubiconPipeline(project, steps, user_defined_logger)

    with patch.object(Pipeline, "fit", return_value=None):
        with patch.object(
            FilterEstimatorLogger, "log_parameters", return_value=None
        ) as mock_log_parameters:
            # first fit gets its own explicitly declared experiment
            experiment = project.log_experiment(name="fake experiment")
            pipeline.fit(["fake data"], experiment=experiment)
            pipeline.fit("additional fake data")

    experiments = project.experiments()
    assert len(experiments) == 2
    # ``call_count`` is the public Mock attribute; avoid the private
    # ``_mock_call_count`` backing field.
    assert mock_log_parameters.call_count == 2

    assert experiments[0].name == "fake experiment"
    assert experiments[1].name == "RubiconPipeline experiment"


def test_make_pipeline(project_client, fake_estimator_cls):
project = project_client
clf = fake_estimator_cls()
Expand Down Expand Up @@ -214,19 +142,91 @@ def test_pipeline_memory_verbose(project_client, fake_estimator_cls):
cachedir = mkdtemp()
user_defined_logger = FilterEstimatorLogger()

pipeline = RubiconPipeline(project, steps, {"est", user_defined_logger}, memory=cachedir)
pipeline = RubiconPipeline(project, steps, {"est": user_defined_logger}, memory=cachedir)
assert pipeline.memory == cachedir
assert pipeline.verbose is False

pipeline = RubiconPipeline(project, steps, {"est", user_defined_logger}, verbose=True)
pipeline = RubiconPipeline(project, steps, {"est": user_defined_logger}, verbose=True)
assert pipeline.memory is None
assert pipeline.verbose is True

pipeline = RubiconPipeline(project, steps, {"est", user_defined_logger})
pipeline = RubiconPipeline(project, steps, {"est": user_defined_logger})
assert pipeline.memory is None
assert pipeline.verbose is False


def test_multiple_fit_multiple_scores(project_client, fake_estimator_cls):
    """Fit twice, then score twice, checking logger calls and experiment counts."""
    project = project_client
    estimator = fake_estimator_cls()
    steps = [("est", estimator)]
    user_defined_logger = {"est": FilterEstimatorLogger(ignore_all=True)}
    pipeline = RubiconPipeline(project, steps, user_defined_logger)

    with patch.object(Pipeline, "fit", return_value=None):
        with patch.object(
            FilterEstimatorLogger, "log_parameters", return_value=None
        ) as mock_log_parameters:
            pipeline.fit(["fake data"])
            pipeline.fit("additional fake data")

    # ``call_count`` is the public Mock attribute; avoid the private
    # ``_mock_call_count`` backing field.
    assert mock_log_parameters.call_count == 2
    assert len(project.experiments()) == 2

    with patch.object(Pipeline, "score", return_value=None):
        with patch.object(EstimatorLogger, "log_metric", return_value=None) as mock_log_metric:
            pipeline.score(["fake data"])
            assert pipeline.experiment is None
            pipeline.score(["additional fake data"])

    assert mock_log_metric.call_count == 2
    # NOTE(review): two scores add only one experiment -- presumably the
    # first score reuses the experiment left over from the last fit;
    # confirm against RubiconPipeline.score.
    assert len(project.experiments()) == 3


def test_multiple_scores(project_client, fake_estimator_cls):
    """Score twice: once with an explicit experiment, once with a default one."""
    project = project_client
    estimator = fake_estimator_cls()
    steps = [("est", estimator)]
    pipeline = RubiconPipeline(project, steps)

    # Bind each mock to a name that matches what it patches, so the
    # assertions below check both the score calls and the metric logging.
    with patch.object(Pipeline, "score", return_value=None) as mock_score:
        with patch.object(EstimatorLogger, "log_metric", return_value=None) as mock_log_metric:
            # first score gets its own explicitly declared experiment
            experiment = project.log_experiment(name="fake experiment")
            pipeline.score(["fake data"], experiment=experiment)
            pipeline.score(["additional fake data"])

    experiments = project.experiments()
    assert len(experiments) == 2
    assert mock_score.call_count == 2
    assert mock_log_metric.call_count == 2
    assert experiments[0].name == "fake experiment"
    assert experiments[1].name == "RubiconPipeline experiment"


def test_multiple_fits(project_client, fake_estimator_cls):
    """Fit twice: once with an explicit experiment, once with a default one."""
    project = project_client
    estimator = fake_estimator_cls()
    steps = [("est", estimator)]
    user_defined_logger = {"est": FilterEstimatorLogger(ignore_all=True)}

    pipeline = RubiconPipeline(project, steps, user_defined_logger)

    with patch.object(Pipeline, "fit", return_value=None):
        with patch.object(
            FilterEstimatorLogger, "log_parameters", return_value=None
        ) as mock_log_parameters:
            # first fit gets its own explicitly declared experiment
            experiment = project.log_experiment(name="fake experiment")
            pipeline.fit(["fake data"], experiment=experiment)
            pipeline.fit("additional fake data")

    experiments = project.experiments()
    assert len(experiments) == 2
    # ``call_count`` is the public Mock attribute; avoid the private
    # ``_mock_call_count`` backing field.
    assert mock_log_parameters.call_count == 2

    assert experiments[0].name == "fake experiment"
    assert experiments[1].name == "RubiconPipeline experiment"


def test_score_samples(project_client, fake_estimator_cls):
project = project_client
estimator = fake_estimator_cls()
Expand Down Expand Up @@ -256,3 +256,47 @@ def test_score_samples(project_client, fake_estimator_cls):
assert mock_log_metric._mock_call_count == 3
assert len(project.experiments()) == 3
assert project.experiments()[2].name == "fake experiment"


def test_pipeline_slices(project_client, fake_estimator_cls):
    """Slicing a pipeline keeps the matching steps, loggers and memory."""
    project = project_client

    steps = [
        ("est", fake_estimator_cls()),
        ("est1", fake_estimator_cls()),
        ("est2", fake_estimator_cls()),
    ]
    cachedir = mkdtemp()
    est_logger = FilterEstimatorLogger()
    # Instantiate the logger; the bare class was previously assigned here.
    est1_logger = FilterEstimatorLogger()
    user_defined_loggers = {"est": est_logger, "est1": est1_logger}

    pipeline = RubiconPipeline(project, steps, user_defined_loggers, memory=cachedir)
    assert pipeline[1:].steps == steps[1:]
    # "est" is outside the slice and "est2" has no user-defined logger,
    # so only the "est1" logger carries over.
    assert pipeline[1:].user_defined_loggers == {"est1": est1_logger}
    assert pipeline[1:].memory == cachedir
    assert pipeline[:-1].steps == steps[:-1]
    assert pipeline[:-1].user_defined_loggers == {"est": est_logger, "est1": est1_logger}

    with raises(ValueError) as e:
        pipeline[::-1]

    assert "Pipeline slicing only supports a step of 1" == str(e.value)


def test_sklearn_pipeline_invalid_step_count(project_client, fake_estimator_cls):
    """Slicing with a step other than 1 raises the expected ``ValueError``."""
    project = project_client

    steps = [
        ("est", fake_estimator_cls()),
        ("est1", fake_estimator_cls()),
        ("est2", fake_estimator_cls()),
    ]
    cachedir = mkdtemp()
    est_logger = FilterEstimatorLogger()
    # Instantiate the logger; the bare class was previously assigned here.
    est1_logger = FilterEstimatorLogger()
    user_defined_loggers = {"est": est_logger, "est1": est1_logger}

    pipeline = RubiconPipeline(project, steps, user_defined_loggers, memory=cachedir)
    with raises(ValueError) as e:
        pipeline[::-1]

    assert "Pipeline slicing only supports a step of 1" == str(e.value)

0 comments on commit 4984393

Please sign in to comment.