From 1cd5e5bf9805e1c512c12758c019a74d02160cf1 Mon Sep 17 00:00:00 2001
From: Frank LaNasa <frank.lanasa@alteryx.com>
Date: Wed, 7 Jun 2023 13:21:19 -0400
Subject: [PATCH] update get_forecast_period to handle gap (#4200)

* fix get_forecast_period to properly handle gap

* update release notes

* fix test

* fix docstring test
---
 docs/source/release_notes.rst                            | 1 +
 evalml/pipelines/time_series_regression_pipeline.py      | 9 ++++-----
 .../pipeline_tests/test_time_series_baseline_pipeline.py | 9 ++++-----
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index c1f11d5640..f171ae482a 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -8,6 +8,7 @@ Release Notes
         * Added option to exclude time index in ``IDColumnsDataCheck`` :pr:`4194`
     * Fixes
         * Fixed small errors in ``ARIMARegressor`` implementation :pr:`4186`
+        * Fixed ``get_forecast_period`` to properly handle ``gap`` parameter :pr:`4200`
     * Changes
     * Documentation Changes
     * Testing Changes
diff --git a/evalml/pipelines/time_series_regression_pipeline.py b/evalml/pipelines/time_series_regression_pipeline.py
index 5490b2a4ad..93c4ec2a5a 100644
--- a/evalml/pipelines/time_series_regression_pipeline.py
+++ b/evalml/pipelines/time_series_regression_pipeline.py
@@ -114,7 +114,7 @@ def get_forecast_period(self, X):
             ValueError: If pipeline is not trained.
 
         Returns:
-            pd.Series: Datetime periods out to `forecast_horizon + gap`.
+            pd.Series: The `forecast_horizon` datetime periods that begin `gap` periods after the first period following the end of `X`.
 
         Example:
             >>> X = pd.DataFrame({'date': pd.date_range(start='1-1-2022', periods=10, freq='D'), 'feature': range(10, 20)})
@@ -128,7 +128,7 @@ def get_forecast_period(self, X):
             >>> pipeline.fit(X, y)
             pipeline = TimeSeriesRegressionPipeline(component_graph={'Linear Regressor': ['Linear Regressor', 'X', 'y']}, parameters={'Linear Regressor':{'fit_intercept': True, 'n_jobs': -1}, 'pipeline':{'gap': 1, 'max_delay': 1, 'forecast_horizon': 2, 'time_index': 'date'}}, random_seed=0)
             >>> dates = pipeline.get_forecast_period(X)
-            >>> expected = pd.Series(pd.date_range(start='2022-01-11', periods=(gap + forecast_horizon), freq='D'), name='date', index=[10, 11, 12])
+            >>> expected = pd.Series(pd.date_range(start='2022-01-11', periods=forecast_horizon, freq='D').shift(gap), name='date', index=[10, 11])
             >>> assert dates.equals(expected)
         """
         if not self._is_fitted:
@@ -142,10 +142,9 @@ def get_forecast_period(self, X):
             pd.date_range(
                 start=first_date,
                 periods=self.forecast_horizon
-                + self.gap
                 + 1,  # Add additional period to account for dropping first date row
                 freq=self.frequency,
-            ),
+            ).shift(self.gap),
         )
 
         # Generate numerical index
@@ -165,7 +164,7 @@ def get_forecast_predictions(self, X, y):
             y (pd.Series, np.ndarray): Targets used to train the pipeline of shape [n_samples_train].
 
         Returns:
-            Predictions out to `forecast_horizon + gap` periods.
+            Predictions from `gap` periods out to `forecast_horizon + gap` periods.
         """
         X, y = self._convert_to_woodwork(X, y)
         pred_dates = pd.DataFrame(self.get_forecast_period(X))
diff --git a/evalml/tests/pipeline_tests/test_time_series_baseline_pipeline.py b/evalml/tests/pipeline_tests/test_time_series_baseline_pipeline.py
index f0d0c60e25..20a5f4edc5 100644
--- a/evalml/tests/pipeline_tests/test_time_series_baseline_pipeline.py
+++ b/evalml/tests/pipeline_tests/test_time_series_baseline_pipeline.py
@@ -107,9 +107,9 @@ def test_time_series_get_forecast_period(forecast_horizon, gap, numeric_idx, ts_
     clf.fit(X, y)
     result = clf.get_forecast_period(X)
 
-    assert result.size == forecast_horizon + gap
-    assert all(result.index == range(len(X), len(X) + forecast_horizon + gap))
-    assert result.iloc[0] == X.iloc[-1]["date"] + np.timedelta64(1, clf.frequency)
+    assert result.size == forecast_horizon
+    assert all(result.index == range(len(X), len(X) + forecast_horizon))
+    assert result.iloc[0] == X.iloc[-1]["date"] + np.timedelta64(1 + gap, clf.frequency)
     assert np.issubdtype(result.dtype, np.datetime64)
     assert result.name == "date"
 
@@ -119,7 +119,7 @@ def test_time_series_get_forecast_predictions(forecast_horizon, gap, ts_data):
     X, _, y = ts_data(problem_type=ProblemTypes.TIME_SERIES_REGRESSION)
 
     X_train, y_train = X.iloc[:15], y.iloc[:15]
-    X_validation = X.iloc[15 : (15 + gap + forecast_horizon)]
+    X_validation = X.iloc[15 + gap : (15 + gap + forecast_horizon)]
 
     clf = TimeSeriesRegressionPipeline(
         component_graph={
@@ -166,5 +166,4 @@ def test_time_series_get_forecast_predictions(forecast_horizon, gap, ts_data):
     clf.fit(X_train, y_train)
     forecast_preds = clf.get_forecast_predictions(X=X_train, y=y_train)
     X_val_preds = clf.predict(X_validation, X_train=X_train, y_train=y_train)
-
     assert_series_equal(forecast_preds, X_val_preds)