177 pipeline slicing (#194)

* added array slicing to sklearn pipelines
* reformatted tests in order of features added
capitalone · Mar 14, 2022 · 4984393 · 4984393
1 parent ed92818
commit 4984393
Show file tree

Hide file tree

Showing 2 changed files with 163 additions and 75 deletions.
diff --git a/rubicon_ml/sklearn/pipeline.py b/rubicon_ml/sklearn/pipeline.py
@@ -192,6 +192,50 @@ def get_estimator_logger(self, step_name=None, estimator=None):
 
         return logger
 
+    def __getitem__(self, ind):
+        """Return a sub-pipeline or a single estimator of this pipeline.
+
+        Based on ``__getitem__`` of the scikit-learn ``Pipeline``, except that
+        a slice is rebuilt through ``self.__class__`` with this pipeline's
+        project, experiment kwargs, ``memory``, ``verbose`` and the
+        user-defined loggers registered for the selected steps.
+
+        Parameters
+        ----------
+        ind : slice, int or str
+            A slice of ``self.steps`` (only a step of 1 or None is accepted),
+            a position in ``self.steps``, or a step name.
+
+        Returns
+        -------
+        A sub-pipeline or a single estimator in the pipeline.
+        Indexing with an integer (or a step name) returns an estimator; using
+        a slice returns another pipeline instance which copies a slice of
+        this pipeline. This copy is shallow: modifying (or fitting)
+        estimators in the sub-pipeline will affect the larger pipeline and
+        vice-versa. However, replacing a value in ``steps`` will not affect a
+        copy. (doc string source: Scikit-Learn)
+
+        Raises
+        ------
+        ValueError
+            If ``ind`` is a slice whose step is neither 1 nor None.
+        """
+        if not isinstance(ind, slice):
+            try:
+                _, estimator = self.steps[ind]
+            except TypeError:
+                # Not an int, try get step by name
+                return self.named_steps[ind]
+            return estimator
+
+        if ind.step not in (1, None):
+            raise ValueError("Pipeline slicing only supports a step of 1")
+
+        # Slice once and reuse the result for both the loggers and the new pipeline.
+        selected_steps = self.steps[ind]
+        selected_loggers = self.__get_logger_slice__(selected_steps)
+        return self.__class__(
+            self.project,
+            selected_steps,
+            selected_loggers,
+            self.experiment_kwargs,
+            memory=self.memory,
+            verbose=self.verbose,
+        )
+
+    def __get_logger_slice__(self, steps):
+        """Select the user-defined loggers that belong to the given steps.
+
+        Parameters
+        ----------
+        steps : iterable of (name, estimator) pairs
+            The subset of pipeline steps being kept.
+
+        Returns
+        -------
+        dict
+            The entries of ``self.user_defined_loggers`` whose key is the name
+            of one of ``steps``, in step order. Steps without a user-defined
+            logger are skipped.
+        """
+        return {
+            step_name: self.user_defined_loggers[step_name]
+            for step_name, _ in steps
+            if step_name in self.user_defined_loggers
+        }
+
 
 def make_pipeline(
     project,

diff --git a/tests/unit/sklearn/test_pipeline.py b/tests/unit/sklearn/test_pipeline.py
@@ -80,78 +80,6 @@ def test_score_logs_metric(project_client, fake_estimator_cls):
     mock_log_metric.assert_called_once()
 
 
-def test_multiple_fit_multiple_scores(project_client, fake_estimator_cls):
-    project = project_client
-    estimator = fake_estimator_cls()
-    steps = [("est", estimator)]
-    user_defined_logger = {"est": FilterEstimatorLogger(ignore_all=True)}
-    pipeline = RubiconPipeline(project, steps, user_defined_logger)
-
-    with patch.object(Pipeline, "fit", return_value=None):
-        with patch.object(
-            FilterEstimatorLogger, "log_parameters", return_value=None
-        ) as mock_log_parameters:
-            pipeline.fit(["fake data"])
-            pipeline.fit("additional fake data")
-
-    assert mock_log_parameters._mock_call_count == 2
-    assert len(project.experiments()) == 2
-
-    with patch.object(Pipeline, "score", return_value=None):
-        with patch.object(EstimatorLogger, "log_metric", return_value=None) as mock_log_metric:
-            pipeline.score(["fake data"])
-            assert pipeline.experiment is None
-            pipeline.score(["additional fake data"])
-
-    assert mock_log_metric._mock_call_count == 2
-    assert len(project.experiments()) == 3
-
-
-def test_multiple_scores(project_client, fake_estimator_cls):
-    project = project_client
-    estimator = fake_estimator_cls()
-    steps = [("est", estimator)]
-    pipeline = RubiconPipeline(project, steps)
-
-    with patch.object(Pipeline, "score", return_value=None) as mock_log_metric:
-        with patch.object(EstimatorLogger, "log_metric", return_value=None):
-            # first score gets its own explicitly declared experiment
-            experiment = project.log_experiment(name="fake experiment")
-            pipeline.score(["fake data"], experiment=experiment)
-            pipeline.score(["additional fake data"])
-
-    experiments = project.experiments()
-    assert len(experiments) == 2
-    assert mock_log_metric._mock_call_count == 2
-    assert experiments[0].name == "fake experiment"
-    assert experiments[1].name == "RubiconPipeline experiment"
-
-
-def test_multiple_fits(project_client, fake_estimator_cls):
-    project = project_client
-    estimator = fake_estimator_cls()
-    steps = [("est", estimator)]
-    user_defined_logger = {"est": FilterEstimatorLogger(ignore_all=True)}
-
-    pipeline = RubiconPipeline(project, steps, user_defined_logger)
-
-    with patch.object(Pipeline, "fit", return_value=None):
-        with patch.object(
-            FilterEstimatorLogger, "log_parameters", return_value=None
-        ) as mock_log_parameters:
-            # first fit gets its own explicitly declared experiment
-            experiment = project.log_experiment(name="fake experiment")
-            pipeline.fit(["fake data"], experiment=experiment)
-            pipeline.fit("additional fake data")
-
-    experiments = project.experiments()
-    assert len(experiments) == 2
-    assert mock_log_parameters._mock_call_count == 2
-
-    assert experiments[0].name == "fake experiment"
-    assert experiments[1].name == "RubiconPipeline experiment"
-
-
 def test_make_pipeline(project_client, fake_estimator_cls):
     project = project_client
     clf = fake_estimator_cls()
@@ -214,19 +142,91 @@ def test_pipeline_memory_verbose(project_client, fake_estimator_cls):
     cachedir = mkdtemp()
     user_defined_logger = FilterEstimatorLogger()
 
-    pipeline = RubiconPipeline(project, steps, {"est", user_defined_logger}, memory=cachedir)
+    pipeline = RubiconPipeline(project, steps, {"est": user_defined_logger}, memory=cachedir)
     assert pipeline.memory == cachedir
     assert pipeline.verbose is False
 
-    pipeline = RubiconPipeline(project, steps, {"est", user_defined_logger}, verbose=True)
+    pipeline = RubiconPipeline(project, steps, {"est": user_defined_logger}, verbose=True)
     assert pipeline.memory is None
     assert pipeline.verbose is True
 
-    pipeline = RubiconPipeline(project, steps, {"est", user_defined_logger})
+    pipeline = RubiconPipeline(project, steps, {"est": user_defined_logger})
     assert pipeline.memory is None
     assert pipeline.verbose is False
 
 
+def test_multiple_fit_multiple_scores(project_client, fake_estimator_cls):
+    """Fit twice, then score twice, checking logger calls and experiment counts."""
+    project = project_client
+    estimator = fake_estimator_cls()
+    steps = [("est", estimator)]
+    user_defined_logger = {"est": FilterEstimatorLogger(ignore_all=True)}
+    pipeline = RubiconPipeline(project, steps, user_defined_logger)
+
+    with patch.object(Pipeline, "fit", return_value=None):
+        with patch.object(
+            FilterEstimatorLogger, "log_parameters", return_value=None
+        ) as mock_log_parameters:
+            pipeline.fit(["fake data"])
+            pipeline.fit("additional fake data")
+
+    # Read the public `call_count` attribute instead of the mock's private internals.
+    assert mock_log_parameters.call_count == 2
+    assert len(project.experiments()) == 2
+
+    with patch.object(Pipeline, "score", return_value=None):
+        with patch.object(EstimatorLogger, "log_metric", return_value=None) as mock_log_metric:
+            pipeline.score(["fake data"])
+            # the pipeline must not hold on to an experiment once a score is done
+            assert pipeline.experiment is None
+            pipeline.score(["additional fake data"])
+
+    assert mock_log_metric.call_count == 2
+    # NOTE(review): two fits + two scores yield three experiments, presumably because
+    # the first score reuses the experiment of the preceding fit -- confirm.
+    assert len(project.experiments()) == 3
+
+
+def test_multiple_scores(project_client, fake_estimator_cls):
+    """Score twice: once into an explicit experiment, once without one."""
+    project = project_client
+    estimator = fake_estimator_cls()
+    steps = [("est", estimator)]
+    pipeline = RubiconPipeline(project, steps)
+
+    # The counted mock replaces `Pipeline.score` (not `log_metric`), so name it for that.
+    with patch.object(Pipeline, "score", return_value=None) as mock_score:
+        with patch.object(EstimatorLogger, "log_metric", return_value=None):
+            # first score gets its own explicitly declared experiment
+            experiment = project.log_experiment(name="fake experiment")
+            pipeline.score(["fake data"], experiment=experiment)
+            pipeline.score(["additional fake data"])
+
+    experiments = project.experiments()
+    assert len(experiments) == 2
+    # Read the public `call_count` attribute instead of the mock's private internals.
+    assert mock_score.call_count == 2
+    assert experiments[0].name == "fake experiment"
+    # the score without an explicit experiment is expected under this name
+    assert experiments[1].name == "RubiconPipeline experiment"
+
+
+def test_multiple_fits(project_client, fake_estimator_cls):
+    """Fit twice: once into an explicit experiment, once without one."""
+    project = project_client
+    estimator = fake_estimator_cls()
+    steps = [("est", estimator)]
+    user_defined_logger = {"est": FilterEstimatorLogger(ignore_all=True)}
+
+    pipeline = RubiconPipeline(project, steps, user_defined_logger)
+
+    with patch.object(Pipeline, "fit", return_value=None):
+        with patch.object(
+            FilterEstimatorLogger, "log_parameters", return_value=None
+        ) as mock_log_parameters:
+            # first fit gets its own explicitly declared experiment
+            experiment = project.log_experiment(name="fake experiment")
+            pipeline.fit(["fake data"], experiment=experiment)
+            pipeline.fit("additional fake data")
+
+    experiments = project.experiments()
+    assert len(experiments) == 2
+    # Read the public `call_count` attribute instead of the mock's private internals.
+    assert mock_log_parameters.call_count == 2
+
+    assert experiments[0].name == "fake experiment"
+    # the fit without an explicit experiment is expected under this name
+    assert experiments[1].name == "RubiconPipeline experiment"
+
+
 def test_score_samples(project_client, fake_estimator_cls):
     project = project_client
     estimator = fake_estimator_cls()
@@ -256,3 +256,47 @@ def test_score_samples(project_client, fake_estimator_cls):
     assert mock_log_metric._mock_call_count == 3
     assert len(project.experiments()) == 3
     assert project.experiments()[2].name == "fake experiment"
+
+
+def test_pipeline_slices(project_client, fake_estimator_cls):
+    """Slicing keeps only the selected steps and their loggers; indexing returns estimators."""
+    project = project_client
+
+    steps = [
+        ("est", fake_estimator_cls()),
+        ("est1", fake_estimator_cls()),
+        ("est2", fake_estimator_cls()),
+    ]
+    cachedir = mkdtemp()
+    est_logger = FilterEstimatorLogger()
+    # Register a logger instance, like "est" does; the bare class was passed before.
+    est1_logger = FilterEstimatorLogger()
+    user_defined_loggers = {"est": est_logger, "est1": est1_logger}
+
+    pipeline = RubiconPipeline(project, steps, user_defined_loggers, memory=cachedir)
+
+    tail = pipeline[1:]
+    assert tail.steps == steps[1:]
+    # "est2" has no user-defined logger, so only the "est1" entry carries over
+    assert tail.user_defined_loggers == {"est1": est1_logger}
+    # `memory` is forwarded to the sub-pipeline
+    assert tail.memory == cachedir
+
+    head = pipeline[:-1]
+    assert head.steps == steps[:-1]
+    assert head.user_defined_loggers == user_defined_loggers
+
+    # an integer index or a step name returns the estimator itself
+    assert pipeline[0] is steps[0][1]
+    assert pipeline["est1"] is steps[1][1]
+
+    with raises(ValueError) as e:
+        pipeline[::-1]
+
+    assert str(e.value) == "Pipeline slicing only supports a step of 1"
+
+
+def test_sklearn_pipeline_invalid_step_count(project_client, fake_estimator_cls):
+    """Slicing with a step other than 1 (or None) raises a ValueError."""
+    project = project_client
+
+    steps = [
+        ("est", fake_estimator_cls()),
+        ("est1", fake_estimator_cls()),
+        ("est2", fake_estimator_cls()),
+    ]
+    cachedir = mkdtemp()
+    est_logger = FilterEstimatorLogger()
+    # Register a logger instance, like "est" does; the bare class was passed before.
+    est1_logger = FilterEstimatorLogger()
+    user_defined_loggers = {"est": est_logger, "est1": est1_logger}
+
+    pipeline = RubiconPipeline(project, steps, user_defined_loggers, memory=cachedir)
+
+    # Cover a reversed slice, a strided slice, and a bounded strided slice.
+    invalid_slices = (slice(None, None, -1), slice(None, None, 2), slice(0, 3, 2))
+    for invalid_slice in invalid_slices:
+        with raises(ValueError) as e:
+            pipeline[invalid_slice]
+
+        assert str(e.value) == "Pipeline slicing only supports a step of 1"