From 8ecc59ba9999e50e8863e0441d18834acbba9fef Mon Sep 17 00:00:00 2001 From: christopherbunn Date: Tue, 9 Jan 2024 15:01:38 -0500 Subject: [PATCH 01/10] Initial commit --- evalml/pipelines/component_graph.py | 2 +- .../components/estimators/estimator.py | 1 + .../regressors/catboost_regressor.py | 2 + .../regressors/decision_tree_regressor.py | 2 + .../regressors/elasticnet_regressor.py | 2 + .../estimators/regressors/et_regressor.py | 2 + .../regressors/lightgbm_regressor.py | 5 ++- .../estimators/regressors/linear_regressor.py | 2 + .../estimators/regressors/rf_regressor.py | 2 + .../regressors/xgboost_regressor.py | 2 + .../drop_nan_rows_transformer.py | 20 +++++++-- .../preprocessing/time_series_featurizer.py | 13 +++++- .../multiseries_regression_pipeline.py | 45 ++++++++++++++++--- evalml/pipelines/utils.py | 26 ++++++++--- evalml/preprocessing/utils.py | 2 +- 15 files changed, 110 insertions(+), 18 deletions(-) diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py index 0f3f4e5810..a1f7bc207f 100644 --- a/evalml/pipelines/component_graph.py +++ b/evalml/pipelines/component_graph.py @@ -462,7 +462,7 @@ def _transform_features( ) if not _schema_is_equal(X_schema, self._input_types): raise PipelineError( - "Input X data types are different from the input types the pipeline was fitted on.", + f"Input X data types are different from the input types the pipeline was fitted on. 
Input is {self._input_types} while expecting {X_schema}.", code=PipelineErrorCodeEnum.PREDICT_INPUT_SCHEMA_UNEQUAL, details={ "input_features_types": X_schema.types, diff --git a/evalml/pipelines/components/estimators/estimator.py b/evalml/pipelines/components/estimators/estimator.py index 736772ecfb..460861a3b7 100644 --- a/evalml/pipelines/components/estimators/estimator.py +++ b/evalml/pipelines/components/estimators/estimator.py @@ -114,6 +114,7 @@ def predict(self, X: pd.DataFrame) -> pd.Series: try: X = infer_feature_types(X) X = _handle_column_names_for_scikit(X) + print("input feat:", X.columns, X) predictions = self._component_obj.predict(X) except AttributeError: raise MethodPropertyNotFoundError( diff --git a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py index 1a5945e3e5..a1d854f4f0 100644 --- a/evalml/pipelines/components/estimators/regressors/catboost_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/catboost_regressor.py @@ -47,10 +47,12 @@ class CatBoostRegressor(Estimator): supported_problem_types = [ ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] """[ ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ]""" def __init__( diff --git a/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py b/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py index 5bd7066892..df6af93e8c 100644 --- a/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/decision_tree_regressor.py @@ -55,10 +55,12 @@ class DecisionTreeRegressor(Estimator): supported_problem_types = [ ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] """[ 
ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ]""" def __init__( diff --git a/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py b/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py index 417fffc561..8cab2ad109 100644 --- a/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/elasticnet_regressor.py @@ -33,10 +33,12 @@ class ElasticNetRegressor(Estimator): supported_problem_types = [ ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] """[ ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ]""" def __init__( diff --git a/evalml/pipelines/components/estimators/regressors/et_regressor.py b/evalml/pipelines/components/estimators/regressors/et_regressor.py index c5991db8b1..81673715df 100644 --- a/evalml/pipelines/components/estimators/regressors/et_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/et_regressor.py @@ -56,10 +56,12 @@ class ExtraTreesRegressor(Estimator): supported_problem_types = [ ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] """[ ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ]""" def __init__( diff --git a/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py b/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py index 7070f09e18..d371267d98 100644 --- a/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py @@ -68,7 +68,10 @@ class LightGBMRegressor(Estimator): ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, ] - """[ProblemTypes.REGRESSION]""" + 
"""[ + ProblemTypes.REGRESSION, + ProblemTypes.TIME_SERIES_REGRESSION, + ]""" SEED_MIN = 0 SEED_MAX = SEED_BOUNDS.max_bound diff --git a/evalml/pipelines/components/estimators/regressors/linear_regressor.py b/evalml/pipelines/components/estimators/regressors/linear_regressor.py index f255761d17..15c95d4654 100644 --- a/evalml/pipelines/components/estimators/regressors/linear_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/linear_regressor.py @@ -28,10 +28,12 @@ class LinearRegressor(Estimator): supported_problem_types = [ ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] """[ ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ]""" def __init__(self, fit_intercept=True, n_jobs=-1, random_seed=0, **kwargs): diff --git a/evalml/pipelines/components/estimators/regressors/rf_regressor.py b/evalml/pipelines/components/estimators/regressors/rf_regressor.py index 3a2939ff22..5ab13317d9 100644 --- a/evalml/pipelines/components/estimators/regressors/rf_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/rf_regressor.py @@ -37,10 +37,12 @@ class RandomForestRegressor(Estimator): supported_problem_types = [ ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] """[ ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ]""" def __init__( diff --git a/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py b/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py index 3a63a9f58c..3f06e4310e 100644 --- a/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/xgboost_regressor.py @@ -40,10 +40,12 @@ class XGBoostRegressor(Estimator): supported_problem_types = [ ProblemTypes.REGRESSION, 
ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ] """[ ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ]""" # xgboost supports seeds from -2**31 to 2**31 - 1 inclusive. these limits ensure the random seed generated below diff --git a/evalml/pipelines/components/transformers/preprocessing/drop_nan_rows_transformer.py b/evalml/pipelines/components/transformers/preprocessing/drop_nan_rows_transformer.py index 3c52c1647d..868431b91d 100644 --- a/evalml/pipelines/components/transformers/preprocessing/drop_nan_rows_transformer.py +++ b/evalml/pipelines/components/transformers/preprocessing/drop_nan_rows_transformer.py @@ -1,4 +1,5 @@ """Transformer to drop rows specified by row indices.""" +import pandas as pd from woodwork import init_series from evalml.pipelines.components.transformers import Transformer @@ -43,12 +44,25 @@ def transform(self, X, y=None): y_t = infer_feature_types(y) if y is not None else None X_t_schema = X_t.ww.schema + y_t_logical = None + y_t_semantic = None if y_t is not None: - y_t_logical = y_t.ww.logical_type - y_t_semantic = y_t.ww.semantic_tags + if isinstance(y_t, pd.DataFrame): + y_t_logical = y_t.ww.logical_types + y_t_semantic = y_t.ww.semantic_tags + else: + y_t_logical = y_t.ww.logical_type + y_t_semantic = y_t.ww.semantic_tags X_t, y_t = drop_rows_with_nans(X_t, y_t) X_t.ww.init_with_full_schema(X_t_schema) if y_t is not None: - y_t = init_series(y_t, logical_type=y_t_logical, semantic_tags=y_t_semantic) + if isinstance(y_t, pd.DataFrame): + y_t.ww.init(logical_types=y_t_logical, semantic_tags=y_t_semantic) + else: + y_t = init_series( + y_t, + logical_type=y_t_logical, + semantic_tags=y_t_semantic, + ) return X_t, y_t diff --git a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py index f812471090..7e3f088cdb 100644 --- 
a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py @@ -129,7 +129,16 @@ def fit(self, X, y=None): # For the multiseries case, where we only want the start delay lag for the baseline if isinstance(y, pd.DataFrame): - self.statistically_significant_lags = [self.start_delay] + self.statistically_significant_lags = {} + for column in y.columns: + self.statistically_significant_lags[ + column + ] = self._find_significant_lags( + y[column], + conf_level=self.conf_level, + start_delay=self.start_delay, + max_delay=self.max_delay, + ) else: self.statistically_significant_lags = self._find_significant_lags( y, @@ -234,7 +243,7 @@ def _delay_df( col = data[col_name] if categorical_columns and col_name in categorical_columns: col = X_categorical[col_name] - for t in self.statistically_significant_lags: + for t in self.statistically_significant_lags[col_name]: lagged_features[self.df_colname_prefix.format(col_name, t)] = col.shift( t, ) diff --git a/evalml/pipelines/multiseries_regression_pipeline.py b/evalml/pipelines/multiseries_regression_pipeline.py index 948ce040c5..04f75bf5ee 100644 --- a/evalml/pipelines/multiseries_regression_pipeline.py +++ b/evalml/pipelines/multiseries_regression_pipeline.py @@ -83,6 +83,7 @@ def _fit(self, X, y): self.component_graph.fit(X_unstacked, y_unstacked) self.input_feature_names = self.component_graph.input_feature_names + self.series_id_target_names = y_unstacked.columns def predict_in_sample( self, @@ -114,6 +115,9 @@ def predict_in_sample( """ from evalml.pipelines.utils import stack_data, unstack_multiseries + print("PinSin_X", X) + print("PinSin_y", y) + X_unstacked, y_unstacked = unstack_multiseries( X, y, @@ -144,7 +148,7 @@ def predict_in_sample( ] y_overlapping_features = [ feature - for feature in y_train_unstacked.columns + for feature in self.series_id_target_names if feature in y_unstacked.columns ] 
y_unstacked = y_unstacked[y_overlapping_features] @@ -154,7 +158,8 @@ def predict_in_sample( y_train_unstacked = infer_feature_types(y_train_unstacked) X_unstacked = infer_feature_types(X_unstacked) y_unstacked = infer_feature_types(y_unstacked) - + print("PinSX", X_unstacked) + print("PinSy", y_unstacked) unstacked_predictions = super().predict_in_sample( X_unstacked, y_unstacked, @@ -163,16 +168,46 @@ def predict_in_sample( objective, calculating_residuals, ) + unstacked_predictions = unstacked_predictions[ + [ + series_id_target + for series_id_target in y_train_unstacked.columns + if series_id_target in unstacked_predictions.columns + ] + ] + unstacked_predictions.index = X_unstacked[self.time_index] stacked_predictions = stack_data( unstacked_predictions, - include_series_id=include_series_id, + include_series_id=True, series_id_name=self.series_id, ) - + stacked_predictions = stacked_predictions.reset_index() + sp_dtypes = { + self.time_index: X[self.time_index].dtype, + self.series_id: X[self.series_id].dtype, + self.input_target_name: y.dtype, + } + stacked_predictions = stacked_predictions.astype(sp_dtypes) + + # Order prediction based on input (date, series_id) + output_cols = ( + [self.series_id, self.input_target_name] + if include_series_id + else [self.input_target_name] + ) + stacked_predictions = pd.merge( + X, + stacked_predictions, + on=[self.time_index, self.series_id], + )[output_cols] # Index will start at the unstacked index, so we need to reset it to the original index stacked_predictions.index = X.index stacked_predictions = infer_feature_types(stacked_predictions) - return stacked_predictions + + if not include_series_id: + return stacked_predictions[self.input_target_name] + else: + return stacked_predictions def get_forecast_period(self, X): """Generates all possible forecasting time points based on latest data point in X. 
diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index 26ffb9463b..2918a6bbb7 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -132,6 +132,7 @@ def _get_datetime(X, y, problem_type, estimator_class, sampler_name=None): if add_datetime_featurizer and estimator_class.model_family not in [ ModelFamily.ARIMA, ModelFamily.PROPHET, + ModelFamily.VARMAX, ]: components.append(DateTimeFeaturizer) return components @@ -300,9 +301,18 @@ def _get_preprocessing_components( """ if is_multiseries(problem_type): if include_decomposer: - components_functions = [_get_decomposer] + components_functions = [ + _get_time_series_featurizer, + _get_decomposer, + _get_datetime, + _get_drop_nan_rows_transformer, + ] else: - return [] + components_functions = [ + _get_time_series_featurizer, + _get_datetime, + _get_drop_nan_rows_transformer, + ] elif is_time_series(problem_type): components_functions = [ @@ -1516,14 +1526,20 @@ def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values pd.DataFrame: The restacked features. """ original_columns = set() - series_ids = series_id_values or set() - if series_id_values is None: + if series_id_values is not None: + series_ids = series_id_values + else: + # Using list to maintain order (vs. 
a set) + series_ids = list() for col in X.columns: if col == time_index: continue separated_name = col.split(MULTISERIES_SEPARATOR_SYMBOL) original_columns.add(MULTISERIES_SEPARATOR_SYMBOL.join(separated_name[:-1])) series_ids.add(separated_name[-1]) + # Remove duplicates + seen = set() + series_ids = [val for val in series_ids if not (val in seen or seen.add(val))] if len(series_ids) == 0: raise ValueError( @@ -1542,7 +1558,7 @@ def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values restacked_X = pd.DataFrame( { time_index: time_index_col, - series_id_name: sorted(list(series_ids)) * len(X), + series_id_name: list(series_ids) * len(X), }, index=stacked_index, ) diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py index dc17e75ee8..7d1008d955 100644 --- a/evalml/preprocessing/utils.py +++ b/evalml/preprocessing/utils.py @@ -79,7 +79,7 @@ def split_multiseries_data(X, y, series_id, time_index, **kwargs): # Get unique series value from X if there is only the time_index column # Otherwise, this information is generated in `stack_X` from the column values - series_id_values = set(X[series_id]) if len(X_unstacked.columns) == 1 else None + series_id_values = X[series_id].unique() if len(X_unstacked.columns) == 1 else None X_train = stack_X( X_train_unstacked, From 3f0fb8a475508e293bc024a6201c998f93fd8805 Mon Sep 17 00:00:00 2001 From: christopherbunn Date: Fri, 26 Jan 2024 13:05:13 -0500 Subject: [PATCH 02/10] Updated tests --- evalml/pipelines/component_graph.py | 2 +- .../components/estimators/estimator.py | 1 - .../preprocessing/time_series_featurizer.py | 20 ++++++++- .../multiseries_regression_pipeline.py | 5 --- .../time_series_regression_pipeline.py | 8 +++- evalml/pipelines/utils.py | 21 ++------- .../automl_tests/test_default_algorithm.py | 6 +-- .../automl_tests/test_iterative_algorithm.py | 8 ++-- .../test_decision_tree_regressor.py | 1 + .../component_tests/test_en_regressor.py | 1 + 
.../component_tests/test_et_regressor.py | 1 + .../test_time_series_featurizer.py | 2 +- evalml/tests/conftest.py | 1 + .../test_multiseries_regression_pipeline.py | 33 +++++++++++--- .../pipeline_tests/test_pipeline_utils.py | 43 ++++++++----------- 15 files changed, 88 insertions(+), 65 deletions(-) diff --git a/evalml/pipelines/component_graph.py b/evalml/pipelines/component_graph.py index a1f7bc207f..0f3f4e5810 100644 --- a/evalml/pipelines/component_graph.py +++ b/evalml/pipelines/component_graph.py @@ -462,7 +462,7 @@ def _transform_features( ) if not _schema_is_equal(X_schema, self._input_types): raise PipelineError( - f"Input X data types are different from the input types the pipeline was fitted on. Input is {self._input_types} while expecting {X_schema}.", + "Input X data types are different from the input types the pipeline was fitted on.", code=PipelineErrorCodeEnum.PREDICT_INPUT_SCHEMA_UNEQUAL, details={ "input_features_types": X_schema.types, diff --git a/evalml/pipelines/components/estimators/estimator.py b/evalml/pipelines/components/estimators/estimator.py index 460861a3b7..736772ecfb 100644 --- a/evalml/pipelines/components/estimators/estimator.py +++ b/evalml/pipelines/components/estimators/estimator.py @@ -114,7 +114,6 @@ def predict(self, X: pd.DataFrame) -> pd.Series: try: X = infer_feature_types(X) X = _handle_column_names_for_scikit(X) - print("input feat:", X.columns, X) predictions = self._component_obj.predict(X) except AttributeError: raise MethodPropertyNotFoundError( diff --git a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py index 7e3f088cdb..f093d7b19a 100644 --- a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py @@ -243,7 +243,25 @@ def _delay_df( col = data[col_name] if categorical_columns and 
col_name in categorical_columns: col = X_categorical[col_name] - for t in self.statistically_significant_lags[col_name]: + # Lags are stored in a dict for multiseries problems + # Returns the lags corresponding to the series ID value + if isinstance(self.statistically_significant_lags, dict): + from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL + + col_series_id = ( + MULTISERIES_SEPARATOR_SYMBOL + + col_name.split(MULTISERIES_SEPARATOR_SYMBOL)[-1] + ) + for ( + series_id_target_name, + lag_list, + ) in self.statistically_significant_lags.items(): + if series_id_target_name.endswith(col_series_id): + lags = lag_list + break + else: + lags = self.statistically_significant_lags + for t in lags: lagged_features[self.df_colname_prefix.format(col_name, t)] = col.shift( t, ) diff --git a/evalml/pipelines/multiseries_regression_pipeline.py b/evalml/pipelines/multiseries_regression_pipeline.py index 04f75bf5ee..6e84476140 100644 --- a/evalml/pipelines/multiseries_regression_pipeline.py +++ b/evalml/pipelines/multiseries_regression_pipeline.py @@ -115,9 +115,6 @@ def predict_in_sample( """ from evalml.pipelines.utils import stack_data, unstack_multiseries - print("PinSin_X", X) - print("PinSin_y", y) - X_unstacked, y_unstacked = unstack_multiseries( X, y, @@ -158,8 +155,6 @@ def predict_in_sample( y_train_unstacked = infer_feature_types(y_train_unstacked) X_unstacked = infer_feature_types(X_unstacked) y_unstacked = infer_feature_types(y_unstacked) - print("PinSX", X_unstacked) - print("PinSy", y_unstacked) unstacked_predictions = super().predict_in_sample( X_unstacked, y_unstacked, diff --git a/evalml/pipelines/time_series_regression_pipeline.py b/evalml/pipelines/time_series_regression_pipeline.py index 60a5e61cc6..b8255a0487 100644 --- a/evalml/pipelines/time_series_regression_pipeline.py +++ b/evalml/pipelines/time_series_regression_pipeline.py @@ -260,6 +260,8 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y): 
).get_trend_prediction_intervals(y, coverage=coverage) if is_multiseries(self.problem_type): + from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL + # Coverage label is label for each prediction interval limit(e.g. "0.95_lower") coverage_labels = list(list(pred_intervals.values())[0].keys()) @@ -270,7 +272,11 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y): # `pred_intervals` are in {series_id: {coverage_label: bound_value}} form for series_id, series_intervals in pred_intervals.items(): - series_id_target_name = str(series_id) + series_id_target_name = ( + self.input_target_name + + MULTISERIES_SEPARATOR_SYMBOL + + str(series_id) + ) series_id_prediction_intervals = _get_series_intervals( series_intervals, residuals[series_id], diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index 2918a6bbb7..2649a82c3c 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -299,22 +299,7 @@ def _get_preprocessing_components( Returns: list[Transformer]: A list of applicable preprocessing components to use with the estimator. """ - if is_multiseries(problem_type): - if include_decomposer: - components_functions = [ - _get_time_series_featurizer, - _get_decomposer, - _get_datetime, - _get_drop_nan_rows_transformer, - ] - else: - components_functions = [ - _get_time_series_featurizer, - _get_datetime, - _get_drop_nan_rows_transformer, - ] - - elif is_time_series(problem_type): + if is_time_series(problem_type): components_functions = [ _get_label_encoder, _get_drop_all_null, @@ -1518,7 +1503,7 @@ def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values time_index (str): The name of the time index column. starting_index (int): The starting index to use for the stacked DataFrame. If None, the starting index will match that of the input data. Defaults to None. - series_id_values (set, list): The unique values of a series ID, used to generate the index. 
If None, values will + series_id_values (list): The unique values of a series ID, used to generate the index. If None, values will be generated from X column values. Required if X only has time index values and no exogenous values. Defaults to None. @@ -1536,7 +1521,7 @@ def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values continue separated_name = col.split(MULTISERIES_SEPARATOR_SYMBOL) original_columns.add(MULTISERIES_SEPARATOR_SYMBOL.join(separated_name[:-1])) - series_ids.add(separated_name[-1]) + series_ids.append(separated_name[-1]) # Remove duplicates seen = set() series_ids = [val for val in series_ids if not (val in seen or seen.add(val))] diff --git a/evalml/tests/automl_tests/test_default_algorithm.py b/evalml/tests/automl_tests/test_default_algorithm.py index 31b8a166f7..826b5253d1 100644 --- a/evalml/tests/automl_tests/test_default_algorithm.py +++ b/evalml/tests/automl_tests/test_default_algorithm.py @@ -670,7 +670,7 @@ def test_default_algorithm_multiseries_time_series( ) first_batch = algo.next_batch() - assert len(first_batch) == 2 + assert len(first_batch) == 8 pipeline = first_batch[0] assert pipeline.model_family == ModelFamily.VARMAX assert pipeline.parameters["pipeline"] == search_parameters["pipeline"] @@ -679,8 +679,8 @@ def test_default_algorithm_multiseries_time_series( long_explore = algo.next_batch() long_estimators = set([pipeline.estimator.name for pipeline in long_explore]) - assert len(long_explore) == 100 - assert len(long_estimators) == 1 + assert len(long_explore) == 300 + assert len(long_estimators) == 3 @pytest.mark.parametrize( diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py index 3030c09909..7c7aada78c 100644 --- a/evalml/tests/automl_tests/test_iterative_algorithm.py +++ b/evalml/tests/automl_tests/test_iterative_algorithm.py @@ -18,7 +18,6 @@ DateTimeFeaturizer, EmailFeaturizer, NaturalLanguageFeaturizer, - STLDecomposer, 
TimeSeriesFeaturizer, URLFeaturizer, ) @@ -98,7 +97,7 @@ def test_iterative_algorithm_init( assert algo.batch_number == 0 assert algo.default_max_batches == 1 estimators = get_estimators(problem_type) - decomposer = [STLDecomposer] if is_regression(problem_type) else [] + decomposer = [True, False] if is_regression(problem_type) else [True] assert len(algo.allowed_pipelines) == len( [ make_pipeline( @@ -106,11 +105,12 @@ def test_iterative_algorithm_init( y, estimator, problem_type, + include_decomposer=include_decomposer, parameters=search_parameters, ) for estimator in estimators - ] - + decomposer, + for include_decomposer in decomposer + ], ) diff --git a/evalml/tests/component_tests/test_decision_tree_regressor.py b/evalml/tests/component_tests/test_decision_tree_regressor.py index 882c88138f..ba14cb8710 100644 --- a/evalml/tests/component_tests/test_decision_tree_regressor.py +++ b/evalml/tests/component_tests/test_decision_tree_regressor.py @@ -14,6 +14,7 @@ def test_problem_types(): assert set(DecisionTreeRegressor.supported_problem_types) == { ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, } diff --git a/evalml/tests/component_tests/test_en_regressor.py b/evalml/tests/component_tests/test_en_regressor.py index 064ce573f3..e4eb418856 100644 --- a/evalml/tests/component_tests/test_en_regressor.py +++ b/evalml/tests/component_tests/test_en_regressor.py @@ -24,6 +24,7 @@ def test_problem_types(): assert set(ElasticNetRegressor.supported_problem_types) == { ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, } diff --git a/evalml/tests/component_tests/test_et_regressor.py b/evalml/tests/component_tests/test_et_regressor.py index c570dd7c46..aa6a4af46a 100644 --- a/evalml/tests/component_tests/test_et_regressor.py +++ b/evalml/tests/component_tests/test_et_regressor.py @@ -14,6 +14,7 @@ def test_problem_types(): assert 
set(ExtraTreesRegressor.supported_problem_types) == { ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, } diff --git a/evalml/tests/component_tests/test_time_series_featurizer.py b/evalml/tests/component_tests/test_time_series_featurizer.py index 703132b4a9..8f58906c7a 100644 --- a/evalml/tests/component_tests/test_time_series_featurizer.py +++ b/evalml/tests/component_tests/test_time_series_featurizer.py @@ -990,7 +990,7 @@ def test_featurizer_y_dataframe(multiseries_ts_data_unstacked): featurizer = TimeSeriesFeaturizer(time_index="date", gap=1, forecast_horizon=5) featurizer.fit(X, y) - assert featurizer.statistically_significant_lags == [6] + assert featurizer.statistically_significant_lags == {col: [6] for col in y.columns} expected_y_cols = [ f"target{MULTISERIES_SEPARATOR_SYMBOL}{i}_delay_6" for i in range(y.shape[1]) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 80036d0704..553ee9fe8c 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -987,6 +987,7 @@ def _X_y_based_on_pipeline_or_problem_type(pipeline_or_type): ProblemTypes.TIME_SERIES_BINARY: "binary", ProblemTypes.TIME_SERIES_MULTICLASS: "multiclass", ProblemTypes.TIME_SERIES_REGRESSION: "regression", + ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION: "regression", } pipeline_classes = { BinaryClassificationPipeline: "binary", diff --git a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py index e777c5391b..fb5be37beb 100644 --- a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py +++ b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_multiseries_regression_pipeline.py @@ -120,7 +120,7 @@ def test_multiseries_pipeline_predict_in_sample( range(55, 65), index=range(90, 100), name="target", - dtype="float64", + 
dtype="int64", ) if include_series_id: expected = pd.concat([X_holdout["series_id"], expected], axis=1) @@ -147,7 +147,6 @@ def test_multiseries_pipeline_predict_in_sample_series_out_of_order( # Reorder rows but keep ordered by date # Store ordered series ID values to compare to output later - X_holdout_series_id = X_holdout["series_id"] X_index = X_holdout.index X_holdout = X_holdout.sample(frac=1).sort_values(by="date") y_holdout = y_holdout.reindex(X_holdout.index) @@ -165,17 +164,39 @@ def test_multiseries_pipeline_predict_in_sample_series_out_of_order( y_train=y_train, include_series_id=include_series_id, ) + expected = pd.Series( range(55, 65), index=range(90, 100), name="target", - dtype="float64", + dtype="int64", + ) + expected = pd.concat( + [ + X_holdout["date"], + pd.Series( + [0, 1, 2, 3, 4] * 2, + name="series_id", + dtype=int, + index=range(90, 100), + ), + expected, + ], + axis=1, ) + expected = pd.merge( + infer_feature_types(X_holdout[["date", "series_id"]]), + expected, + on=["date", "series_id"], + ) + expected = expected.drop("date", axis=1) + expected.index = range(90, 100) + if include_series_id: - expected = pd.concat([X_holdout_series_id, expected], axis=1) expected = infer_feature_types(expected) pd.testing.assert_frame_equal(y_pred, expected) else: + expected = expected["target"] pd.testing.assert_series_equal(y_pred, expected) @@ -209,7 +230,7 @@ def test_multiseries_pipeline_predict( range(55, 65), index=range(90, 100), name="target", - dtype="float64", + dtype="int64", ) # Only the first predicted value is present in the delayed features else: @@ -217,7 +238,7 @@ def test_multiseries_pipeline_predict( [85, 86, 87, 88, 89, 0, 0, 0, 0, 0], index=range(90, 100), name="target", - dtype="float64", + dtype="int64", ) pd.testing.assert_series_equal(y_pred, expected) diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index 5a9d4b163e..44641ef431 100644 --- 
a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -146,7 +146,7 @@ def test_make_pipeline( datetime = ( [DateTimeFeaturizer] if estimator_class.model_family - not in [ModelFamily.ARIMA, ModelFamily.PROPHET] + not in [ModelFamily.ARIMA, ModelFamily.PROPHET, ModelFamily.VARMAX] and "dates" in column_names else [] ) @@ -170,25 +170,22 @@ def test_make_pipeline( ) if is_time_series(problem_type): - if is_multiseries(problem_type): - expected_components = dfs + decomposer + [estimator_class] - else: - expected_components = ( - dfs - + label_encoder - + email_featurizer - + url_featurizer - + drop_null - + natural_language_featurizer - + imputer - + delayed_features - + decomposer - + datetime - + ohe - + drop_nan_rows_transformer - + standard_scaler - + [estimator_class] - ) + expected_components = ( + dfs + + label_encoder + + email_featurizer + + url_featurizer + + drop_null + + natural_language_featurizer + + imputer + + delayed_features + + decomposer + + datetime + + ohe + + drop_nan_rows_transformer + + standard_scaler + + [estimator_class] + ) else: expected_components = ( dfs @@ -624,7 +621,7 @@ def test_get_estimators(): problem_type=ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION, ), ) - == 1 + == 5 ) assert len(get_estimators(problem_type=ProblemTypes.BINARY, model_families=[])) == 0 @@ -1474,13 +1471,11 @@ def test_stack_data_noop(): pd.testing.assert_series_equal(stack_data(series_y), series_y) -@pytest.mark.parametrize("series_id_values_type", [set, list]) @pytest.mark.parametrize("no_features", [True, False]) @pytest.mark.parametrize("starting_index", [None, 1, 132]) def test_stack_X( starting_index, no_features, - series_id_values_type, multiseries_ts_data_stacked, multiseries_ts_data_unstacked, ): @@ -1491,7 +1486,7 @@ def test_stack_X( X_expected.index = X_expected.index + starting_index if no_features: - series_id_values = series_id_values_type(str(i) for i in range(0, 5)) + 
series_id_values = list(str(i) for i in range(0, 5)) X = pd.DataFrame(X["date"]) X_expected = X_expected[["date", "series_id"]] From ed7e0c9d81e0a077c17e5968d70229f465e120e7 Mon Sep 17 00:00:00 2001 From: christopherbunn Date: Fri, 26 Jan 2024 16:12:12 -0500 Subject: [PATCH 03/10] Added additional drop nan test case --- .../test_drop_nan_rows_transformer.py | 48 +++++++++++++++---- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/evalml/tests/component_tests/test_drop_nan_rows_transformer.py b/evalml/tests/component_tests/test_drop_nan_rows_transformer.py index 20214a3c47..ad3d6a77f3 100644 --- a/evalml/tests/component_tests/test_drop_nan_rows_transformer.py +++ b/evalml/tests/component_tests/test_drop_nan_rows_transformer.py @@ -27,8 +27,9 @@ def test_drop_rows_transformer(): assert_frame_equal(fit_transformed_X, X_expected) +@pytest.mark.parametrize("y_is_df", [True, False]) @pytest.mark.parametrize("null_value", [pd.NA, np.NaN]) -def test_drop_rows_transformer_retain_ww_schema(null_value): +def test_drop_rows_transformer_retain_ww_schema(null_value, y_is_df): # Expecting float because of np.NaN values X = pd.DataFrame( {"a column": [null_value, 2, 3, 4], "another col": ["a", null_value, "c", "d"]}, @@ -46,20 +47,47 @@ def test_drop_rows_transformer_retain_ww_schema(null_value): ) X_expected_schema = X.ww.schema - y = pd.Series([3, 2, 1, null_value]) - y = init_series(y, logical_type="IntegerNullable", semantic_tags="y_custom_tag") + if y_is_df: + y = pd.DataFrame( + {"series_a": [3, 2, 1, null_value], "series_b": [1, null_value, 3, 4]}, + ) + y.ww.init() + y.ww.set_types( + logical_types={ + "series_a": "IntegerNullable", + "series_b": "IntegerNullable", + }, + semantic_tags={"series_a": "custom_tag_a", "series_b": "custom_tag_b"}, + ) - y_expected = pd.Series([1], index=[2]) - y_expected = init_series( - y_expected, - logical_type="IntegerNullable", - semantic_tags="y_custom_tag", - ) + y_expected = pd.DataFrame({"series_a": [1], "series_b": [3]}, 
index=[2]) + y_expected.ww.init() + y_expected.ww.set_types( + logical_types={ + "series_a": "IntegerNullable", + "series_b": "IntegerNullable", + }, + semantic_tags={"series_a": "custom_tag_a", "series_b": "custom_tag_b"}, + ) + else: + y = pd.Series([3, 2, 1, null_value]) + y = init_series(y, logical_type="IntegerNullable", semantic_tags="y_custom_tag") + + y_expected = pd.Series([1], index=[2]) + y_expected = init_series( + y_expected, + logical_type="IntegerNullable", + semantic_tags="y_custom_tag", + ) y_expected_schema = y.ww.schema drop_rows_transformer = DropNaNRowsTransformer() transformed_X, transformed_y = drop_rows_transformer.fit_transform(X, y) assert_frame_equal(transformed_X, X_expected) - assert_series_equal(transformed_y, y_expected) assert _schema_is_equal(transformed_X.ww.schema, X_expected_schema) + + if y_is_df: + assert_frame_equal(transformed_y, y_expected) + else: + assert_series_equal(transformed_y, y_expected) assert transformed_y.ww.schema == y_expected_schema From 382cec2032f21051245add877849e8902c306757 Mon Sep 17 00:00:00 2001 From: christopherbunn Date: Fri, 26 Jan 2024 16:17:09 -0500 Subject: [PATCH 04/10] Updated release notes --- docs/source/release_notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 4dda45a71f..994d705ad6 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -2,6 +2,7 @@ Release Notes ------------- **Future Releases** * Enhancements + * Added support for additional estimators for multiseries datasets :pr:`4385` * Fixes * Fixed bug in `_downcast_nullable_y` causing woodwork initialization issues :pr:`4369` * Fixed multiseries prediction interval labels :pr:`4377` From 443f7a57983c07c1db7351d9225c3dfed8372f59 Mon Sep 17 00:00:00 2001 From: christopherbunn Date: Mon, 29 Jan 2024 09:24:32 -0500 Subject: [PATCH 05/10] Reverted series ID name --- evalml/pipelines/time_series_regression_pipeline.py | 8 +------- 1 file 
changed, 1 insertion(+), 7 deletions(-) diff --git a/evalml/pipelines/time_series_regression_pipeline.py b/evalml/pipelines/time_series_regression_pipeline.py index b8255a0487..60a5e61cc6 100644 --- a/evalml/pipelines/time_series_regression_pipeline.py +++ b/evalml/pipelines/time_series_regression_pipeline.py @@ -260,8 +260,6 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y): ).get_trend_prediction_intervals(y, coverage=coverage) if is_multiseries(self.problem_type): - from evalml.pipelines.utils import MULTISERIES_SEPARATOR_SYMBOL - # Coverage label is label for each prediction interval limit(e.g. "0.95_lower") coverage_labels = list(list(pred_intervals.values())[0].keys()) @@ -272,11 +270,7 @@ def _get_series_intervals(intervals, residuals, trend_pred_intervals, y): # `pred_intervals` are in {series_id: {coverage_label: bound_value}} form for series_id, series_intervals in pred_intervals.items(): - series_id_target_name = ( - self.input_target_name - + MULTISERIES_SEPARATOR_SYMBOL - + str(series_id) - ) + series_id_target_name = str(series_id) series_id_prediction_intervals = _get_series_intervals( series_intervals, residuals[series_id], From fa3dec8498268c258a478938581ba54bee71da20 Mon Sep 17 00:00:00 2001 From: christopherbunn Date: Mon, 29 Jan 2024 14:15:34 -0500 Subject: [PATCH 06/10] Moved infer feature types --- evalml/pipelines/multiseries_regression_pipeline.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/evalml/pipelines/multiseries_regression_pipeline.py b/evalml/pipelines/multiseries_regression_pipeline.py index 6e84476140..6c0cf64b98 100644 --- a/evalml/pipelines/multiseries_regression_pipeline.py +++ b/evalml/pipelines/multiseries_regression_pipeline.py @@ -197,12 +197,11 @@ def predict_in_sample( )[output_cols] # Index will start at the unstacked index, so we need to reset it to the original index stacked_predictions.index = X.index - stacked_predictions = infer_feature_types(stacked_predictions) 
if not include_series_id: - return stacked_predictions[self.input_target_name] + return infer_feature_types(stacked_predictions[self.input_target_name]) else: - return stacked_predictions + return infer_feature_types(stacked_predictions) def get_forecast_period(self, X): """Generates all possible forecasting time points based on latest data point in X. From b7b6bf9e3535339e93deac7f59c58fd06b5966cb Mon Sep 17 00:00:00 2001 From: christopherbunn Date: Tue, 30 Jan 2024 15:52:27 -0500 Subject: [PATCH 07/10] Added clarifying comments and updated test --- .../transformers/preprocessing/time_series_featurizer.py | 2 +- evalml/pipelines/utils.py | 2 +- evalml/tests/automl_tests/test_iterative_algorithm.py | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py index f093d7b19a..61246a78d2 100644 --- a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py @@ -127,7 +127,7 @@ def fit(self, X, y=None): if self.time_index is None: raise ValueError("time_index cannot be None!") - # For the multiseries case, where we only want the start delay lag for the baseline + # For the multiseries case, each series ID has individualized lag values if isinstance(y, pd.DataFrame): self.statistically_significant_lags = {} for column in y.columns: diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index 2649a82c3c..f6b0154468 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -1522,7 +1522,7 @@ def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values separated_name = col.split(MULTISERIES_SEPARATOR_SYMBOL) original_columns.add(MULTISERIES_SEPARATOR_SYMBOL.join(separated_name[:-1])) series_ids.append(separated_name[-1]) - # Remove duplicates + 
# Remove duplicates while maintaining insertion order seen = set() series_ids = [val for val in series_ids if not (val in seen or seen.add(val))] diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py index 7c7aada78c..978d212c06 100644 --- a/evalml/tests/automl_tests/test_iterative_algorithm.py +++ b/evalml/tests/automl_tests/test_iterative_algorithm.py @@ -97,7 +97,6 @@ def test_iterative_algorithm_init( assert algo.batch_number == 0 assert algo.default_max_batches == 1 estimators = get_estimators(problem_type) - decomposer = [True, False] if is_regression(problem_type) else [True] assert len(algo.allowed_pipelines) == len( [ make_pipeline( @@ -109,7 +108,9 @@ def test_iterative_algorithm_init( parameters=search_parameters, ) for estimator in estimators - for include_decomposer in decomposer + for include_decomposer in ( + [True, False] if is_regression(problem_type) else [False] + ) ], ) From 6f765bbe153f351892c242c9b8a69b812f94dd61 Mon Sep 17 00:00:00 2001 From: christopherbunn Date: Wed, 31 Jan 2024 13:59:10 -0500 Subject: [PATCH 08/10] Consolidated code and added additional clarifying comments --- .../transformers/preprocessing/drop_nan_rows_transformer.py | 3 +-- evalml/pipelines/utils.py | 1 + evalml/preprocessing/utils.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/drop_nan_rows_transformer.py b/evalml/pipelines/components/transformers/preprocessing/drop_nan_rows_transformer.py index 868431b91d..b71d4eb9e8 100644 --- a/evalml/pipelines/components/transformers/preprocessing/drop_nan_rows_transformer.py +++ b/evalml/pipelines/components/transformers/preprocessing/drop_nan_rows_transformer.py @@ -49,10 +49,9 @@ def transform(self, X, y=None): if y_t is not None: if isinstance(y_t, pd.DataFrame): y_t_logical = y_t.ww.logical_types - y_t_semantic = y_t.ww.semantic_tags else: y_t_logical = y_t.ww.logical_type - 
y_t_semantic = y_t.ww.semantic_tags + y_t_semantic = y_t.ww.semantic_tags X_t, y_t = drop_rows_with_nans(X_t, y_t) X_t.ww.init_with_full_schema(X_t_schema) diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index f6b0154468..9b73e40c31 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -1523,6 +1523,7 @@ def stack_X(X, series_id_name, time_index, starting_index=None, series_id_values original_columns.add(MULTISERIES_SEPARATOR_SYMBOL.join(separated_name[:-1])) series_ids.append(separated_name[-1]) # Remove duplicates while maintaining insertion order + # Need order to match series ID labels correctly when restacking columns seen = set() series_ids = [val for val in series_ids if not (val in seen or seen.add(val))] diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py index 7d1008d955..013dc71fa9 100644 --- a/evalml/preprocessing/utils.py +++ b/evalml/preprocessing/utils.py @@ -77,7 +77,7 @@ def split_multiseries_data(X, y, series_id, time_index, **kwargs): X_unstacked, y_unstacked, problem_type="time series regression", **kwargs ) - # Get unique series value from X if there is only the time_index column + # Get unique series values (as a list to maintain order) from X if there is only the time_index column # Otherwise, this information is generated in `stack_X` from the column values series_id_values = X[series_id].unique() if len(X_unstacked.columns) == 1 else None From e49dfa5000f29229a9eb7ce4aca88eb7c0d7972c Mon Sep 17 00:00:00 2001 From: christopherbunn Date: Wed, 31 Jan 2024 15:01:57 -0500 Subject: [PATCH 09/10] Code cleanup --- .../preprocessing/time_series_featurizer.py | 71 ++++++++++--------- .../multiseries_regression_pipeline.py | 9 ++- .../automl_tests/test_iterative_algorithm.py | 1 + 3 files changed, 45 insertions(+), 36 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py 
b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py index 61246a78d2..4726f2710d 100644 --- a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py @@ -127,8 +127,16 @@ def fit(self, X, y=None): if self.time_index is None: raise ValueError("time_index cannot be None!") - # For the multiseries case, each series ID has individualized lag values - if isinstance(y, pd.DataFrame): + if y is None: + # Set lags to all possible lag values + self.statistically_significant_lags = np.arange( + self.start_delay, + self.start_delay + self.max_delay + 1, + ) + else: + # For the multiseries case, each series ID has individualized lag values + if isinstance(y, pd.Series): + y = y.to_frame() self.statistically_significant_lags = {} for column in y.columns: self.statistically_significant_lags[ @@ -139,13 +147,11 @@ def fit(self, X, y=None): start_delay=self.start_delay, max_delay=self.max_delay, ) - else: - self.statistically_significant_lags = self._find_significant_lags( - y, - conf_level=self.conf_level, - start_delay=self.start_delay, - max_delay=self.max_delay, - ) + if len(y.columns) == 1: + self.statistically_significant_lags = ( + self.statistically_significant_lags[column] + ) + return self return self @staticmethod @@ -169,31 +175,28 @@ def _encode_X_while_preserving_index(X_categorical): @staticmethod def _find_significant_lags(y, conf_level, start_delay, max_delay): all_lags = np.arange(start_delay, start_delay + max_delay + 1) - if y is not None: - # Compute the acf and find its peaks - acf_values, ci_intervals = acf( - y, - nlags=len(y) - 1, - fft=True, - alpha=conf_level, - ) - peaks, _ = find_peaks(acf_values) - # Significant lags are the union of: - # 1. the peaks (local maxima) that are significant - # 2. The significant lags among the first 10 lags. 
- # We then filter the list to be in the range [start_delay, start_delay + max_delay] - index = np.arange(len(acf_values)) - significant = np.logical_or(ci_intervals[:, 0] > 0, ci_intervals[:, 1] < 0) - first_significant_10 = index[:10][significant[:10]] - significant_lags = ( - set(index[significant]).intersection(peaks).union(first_significant_10) - ) - # If no lags are significant get the first lag - significant_lags = sorted(significant_lags.intersection(all_lags)) or [ - start_delay, - ] - else: - significant_lags = all_lags + # Compute the acf and find its peaks + acf_values, ci_intervals = acf( + y, + nlags=len(y) - 1, + fft=True, + alpha=conf_level, + ) + peaks, _ = find_peaks(acf_values) + # Significant lags are the union of: + # 1. the peaks (local maxima) that are significant + # 2. The significant lags among the first 10 lags. + # We then filter the list to be in the range [start_delay, start_delay + max_delay] + index = np.arange(len(acf_values)) + significant = np.logical_or(ci_intervals[:, 0] > 0, ci_intervals[:, 1] < 0) + first_significant_10 = index[:10][significant[:10]] + significant_lags = ( + set(index[significant]).intersection(peaks).union(first_significant_10) + ) + # If no lags are significant get the first lag + significant_lags = sorted(significant_lags.intersection(all_lags)) or [ + start_delay, + ] return significant_lags def _compute_rolling_transforms(self, X, y, original_features): diff --git a/evalml/pipelines/multiseries_regression_pipeline.py b/evalml/pipelines/multiseries_regression_pipeline.py index 6c0cf64b98..df9bce6632 100644 --- a/evalml/pipelines/multiseries_regression_pipeline.py +++ b/evalml/pipelines/multiseries_regression_pipeline.py @@ -166,17 +166,21 @@ def predict_in_sample( unstacked_predictions = unstacked_predictions[ [ series_id_target - for series_id_target in y_train_unstacked.columns + for series_id_target in self.series_id_target_names if series_id_target in unstacked_predictions.columns ] ] + + # Add 
`time_index` column to index for generating stacked datetime column in `stack_data()` unstacked_predictions.index = X_unstacked[self.time_index] stacked_predictions = stack_data( unstacked_predictions, include_series_id=True, series_id_name=self.series_id, ) - stacked_predictions = stacked_predictions.reset_index() + # Move datetime index into separate date column to use when merging later + stacked_predictions = stacked_predictions.reset_index(drop=False) + sp_dtypes = { self.time_index: X[self.time_index].dtype, self.series_id: X[self.series_id].dtype, @@ -195,6 +199,7 @@ def predict_in_sample( stacked_predictions, on=[self.time_index, self.series_id], )[output_cols] + # Index will start at the unstacked index, so we need to reset it to the original index stacked_predictions.index = X.index diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py index 978d212c06..4fc4f0f538 100644 --- a/evalml/tests/automl_tests/test_iterative_algorithm.py +++ b/evalml/tests/automl_tests/test_iterative_algorithm.py @@ -108,6 +108,7 @@ def test_iterative_algorithm_init( parameters=search_parameters, ) for estimator in estimators + # Generate both decomposer and non-decomposer pipelines when problem type is multiseries time series reg. 
for include_decomposer in ( [True, False] if is_regression(problem_type) else [False] ) From 701eef662bb49a182f64fb8e20caa06700d5440e Mon Sep 17 00:00:00 2001 From: christopherbunn Date: Wed, 31 Jan 2024 15:30:49 -0500 Subject: [PATCH 10/10] Added support for ndarrays for featurizer --- .../transformers/preprocessing/time_series_featurizer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py index 4726f2710d..bbd35cc1e7 100644 --- a/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py +++ b/evalml/pipelines/components/transformers/preprocessing/time_series_featurizer.py @@ -135,8 +135,9 @@ def fit(self, X, y=None): ) else: # For the multiseries case, each series ID has individualized lag values - if isinstance(y, pd.Series): - y = y.to_frame() + if isinstance(y, pd.Series) or isinstance(y, np.ndarray): + y = pd.DataFrame(y) + self.statistically_significant_lags = {} for column in y.columns: self.statistically_significant_lags[