From a47c0a5112c91531342e438f4498f6b69ae4192f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Dock=C3=A8s?= Date: Fri, 21 Jun 2024 16:48:04 +0200 Subject: [PATCH] Rename TableVectorizer parameters (#947) --- CHANGES.rst | 6 ++ benchmarks/bench_gap_divergence.py | 2 +- benchmarks/bench_tablevectorizer_tuning.py | 2 +- benchmarks/run_on_openml_datasets.py | 4 +- examples/01_encodings.py | 3 +- examples/03_datetime_encoder.py | 8 +-- examples/08_join_aggregation.py | 4 +- ...grid_searching_with_the_tablevectorizer.py | 6 +- skrub/_interpolation_joiner.py | 2 +- skrub/_table_vectorizer.py | 60 +++++++++---------- skrub/_tabular_learner.py | 24 ++++---- skrub/tests/test_interpolation_join.py | 2 +- skrub/tests/test_table_vectorizer.py | 30 +++++----- skrub/tests/test_tabular_learner.py | 22 +++---- 14 files changed, 83 insertions(+), 92 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index edca68de3..06c7a5c59 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -22,6 +22,12 @@ Major changes used twice (go through 2 different transformers). :pr:`902` by :user:`Jérôme Dockès `. +* Some parameters of :class:`TableVectorizer` have been renamed: + `high_cardinality_transformer` → `high_cardinality`, + `low_cardinality_transformer` → `low_cardinality`, + `datetime_transformer` → `datetime`, `numeric_transformer` → `numeric`. + :pr:`947` by :user:`Jérôme Dockès `. + * The :class:`GapEncoder` and :class:`MinHashEncoder` are now a single-column transformers: their ``fit``, ``fit_transform`` and ``transform`` methods accept a single column (a pandas or polars Series). Dataframes and numpy diff --git a/benchmarks/bench_gap_divergence.py b/benchmarks/bench_gap_divergence.py index f78ddfe32..cf74cb99a 100644 --- a/benchmarks/bench_gap_divergence.py +++ b/benchmarks/bench_gap_divergence.py @@ -206,7 +206,7 @@ def benchmark(max_iter_e_step, dataset_name): ( "encoding", TableVectorizer( - high_cardinality_transformer=ModifiedGapEncoder( + high_cardinality=ModifiedGapEncoder( min_iter=5, max_iter=5, max_iter_e_step=max_iter_e_step, diff --git a/benchmarks/bench_tablevectorizer_tuning.py b/benchmarks/bench_tablevectorizer_tuning.py index ceadadf3c..53abbeda8 100644 --- a/benchmarks/bench_tablevectorizer_tuning.py +++ b/benchmarks/bench_tablevectorizer_tuning.py @@ -59,7 +59,7 @@ def benchmark( ): tv = TableVectorizer( cardinality_threshold=tv_cardinality_threshold, - high_cardinality_transformer=MinHashEncoder(n_components=minhash_n_components), + high_cardinality=MinHashEncoder(n_components=minhash_n_components), ) dataset = dataset_map[dataset_name] diff --git a/benchmarks/run_on_openml_datasets.py b/benchmarks/run_on_openml_datasets.py index a7f5c80fa..97b21971b 100644 --- a/benchmarks/run_on_openml_datasets.py +++ b/benchmarks/run_on_openml_datasets.py @@ -47,14 +47,14 @@ classification_pipeline = Pipeline( [ - ("vectorizer", TableVectorizer(high_cardinality_transformer=MinHashEncoder())), + ("vectorizer", TableVectorizer(high_cardinality=MinHashEncoder())), ("classifier", HistGradientBoostingClassifier()), ] ) regression_pipeline = Pipeline( [ - ("vectorizer", TableVectorizer(high_cardinality_transformer=MinHashEncoder())), + ("vectorizer", TableVectorizer(high_cardinality=MinHashEncoder())), ("regressor", HistGradientBoostingRegressor()), ] ) diff --git a/examples/01_encodings.py b/examples/01_encodings.py index 243b38fa5..8fa4acf1e 100644 --- a/examples/01_encodings.py +++ b/examples/01_encodings.py @@ -267,8 +267,7 @@ from skrub import MinHashEncoder, ToCategorical vectorizer = TableVectorizer( 
- low_cardinality_transformer=ToCategorical(), - high_cardinality_transformer=MinHashEncoder(), + low_cardinality=ToCategorical(), high_cardinality=MinHashEncoder() ) pipeline = make_pipeline( vectorizer, HistGradientBoostingRegressor(categorical_features="from_dtype") diff --git a/examples/03_datetime_encoder.py b/examples/03_datetime_encoder.py index a7152324e..88e080228 100644 --- a/examples/03_datetime_encoder.py +++ b/examples/03_datetime_encoder.py @@ -120,9 +120,7 @@ # # Here, for example, we want it to extract the day of the week. -table_vec = TableVectorizer( - datetime_transformer=DatetimeEncoder(add_weekday=True), -).fit(X) +table_vec = TableVectorizer(datetime=DatetimeEncoder(add_weekday=True)).fit(X) pprint(table_vec.get_feature_names_out()) ############################################################################### @@ -257,9 +255,7 @@ ############################################################################### from sklearn.inspection import permutation_importance -table_vec = TableVectorizer( - datetime_transformer=DatetimeEncoder(add_weekday=True), -) +table_vec = TableVectorizer(datetime=DatetimeEncoder(add_weekday=True)) # In this case, we don't use a pipeline, because we want to compute the # importance of the features created by the DatetimeEncoder diff --git a/examples/08_join_aggregation.py b/examples/08_join_aggregation.py index cf532c03b..e26148928 100644 --- a/examples/08_join_aggregation.py +++ b/examples/08_join_aggregation.py @@ -85,9 +85,7 @@ # columns, and doesn't interact with numerical columns. from skrub import DatetimeEncoder, TableVectorizer -table_vectorizer = TableVectorizer( - datetime_transformer=DatetimeEncoder(add_weekday=True) -) +table_vectorizer = TableVectorizer(datetime=DatetimeEncoder(add_weekday=True)) X_date_encoded = table_vectorizer.fit_transform(X) X_date_encoded.head() diff --git a/examples/FIXME/07_grid_searching_with_the_tablevectorizer.py b/examples/FIXME/07_grid_searching_with_the_tablevectorizer.py index 0af8bc96a..49249b6ae 100644 --- a/examples/FIXME/07_grid_searching_with_the_tablevectorizer.py +++ b/examples/FIXME/07_grid_searching_with_the_tablevectorizer.py @@ -62,9 +62,7 @@ from skrub import MinHashEncoder -tv = TableVectorizer( - high_cardinality_transformer=MinHashEncoder(), -) +tv = TableVectorizer(high_cardinality=MinHashEncoder()) tv.fit(X) pprint(tv.transformers_) @@ -117,7 +115,7 @@ pipeline = make_pipeline( TableVectorizer( - high_cardinality_transformer=GapEncoder(), + high_cardinality=GapEncoder(), specific_transformers=[ ("mh_dep_name", MinHashEncoder(), ["department_name"]), ], diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py index 69702cf2b..9befffde2 100644 --- a/skrub/_interpolation_joiner.py +++ b/skrub/_interpolation_joiner.py @@ -14,7 +14,7 @@ from skrub._minhash_encoder import MinHashEncoder from skrub._table_vectorizer import TableVectorizer -DEFAULT_VECTORIZER = TableVectorizer(high_cardinality_transformer=MinHashEncoder()) +DEFAULT_VECTORIZER = TableVectorizer(high_cardinality=MinHashEncoder()) DEFAULT_REGRESSOR = HistGradientBoostingRegressor() DEFAULT_CLASSIFIER = HistGradientBoostingClassifier() diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 4e621f1d4..f7dffffce 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -123,7 +123,7 @@ class TableVectorizer(TransformerMixin, BaseEstimator): String and categorical columns with a count of unique values smaller than a given threshold (40 by default). 
Category encoding schemes such as one-hot encoding, ordinal encoding etc. are typically appropriate - for low-cardinality columns. + for columns with few unique values. - high_cardinality: String and categorical columns with many unique values, such as free-form text. Such columns have so many distinct values that it is @@ -140,8 +140,7 @@ class TableVectorizer(TransformerMixin, BaseEstimator): multivariate transformations are therefore not supported. The transformer for each kind of column can be configured with the - corresponding ``*_transformer`` parameter: ``numeric_transformer``, - ``datetime_transformer``, ... + corresponding parameter. A transformer can be a scikit-learn Transformer (an object providing the ``fit``, ``fit_transform`` and ``transform`` methods), a clone of which @@ -156,30 +155,33 @@ class TableVectorizer(TransformerMixin, BaseEstimator): .. note:: - The ``specific_transformers`` parameter is likely to be removed in a - future version of ``skrub``, when better utilities for building complex + The ``specific_transformers`` parameter will be removed in a future + version of ``skrub``, when better utilities for building complex pipelines are introduced. Parameters ---------- cardinality_threshold : int, default=40 String and categorical features with a number of unique values strictly - smaller than this threshold are considered ``low_cardinality``, the - rest are considered ``high_cardinality``. + smaller than this threshold are handled by the transformer ``low_cardinality``, the + rest are handled by the transformer ``high_cardinality``. - low_cardinality_transformer : transformer, "passthrough" or "drop", optional - The transformer for ``low_cardinality`` columns. The default is a + low_cardinality : transformer, "passthrough" or "drop", optional + The transformer for string or categorical columns with strictly fewer + than ``cardinality_threshold`` unique values. The default is a ``OneHotEncoder``. - high_cardinality_transformer : transformer, "passthrough" or "drop", optional - The transformer for ``high_cardinality`` columns. The default is a - ``GapEncoder`` with 30 components (30 output columns for each input). + high_cardinality : transformer, "passthrough" or "drop", optional + The transformer for string or categorical columns with at least + ``cardinality_threshold`` unique values. The default is a ``GapEncoder`` + with 30 components (30 output columns for each input). - numeric_transformer : transformer, "passthrough" or "drop", optional - The transformer for ``numeric`` columns. The default is passthrough. + numeric : transformer, "passthrough" or "drop", optional + The transformer for numeric columns (floats, ints, booleans). The + default is passthrough. - datetime_transformer : transformer, "passthrough" or "drop", optional - The transformer for ``datetime`` columns. The default is + datetime : transformer, "passthrough" or "drop", optional + The transformer for date and datetime columns. The default is ``DatetimeEncoder``, which extracts features such as year, month, etc. 
specific_transformers : list of (transformer, list of column names) pairs, optional @@ -407,26 +409,22 @@ def __init__( self, *, cardinality_threshold=40, - low_cardinality_transformer=LOW_CARDINALITY_TRANSFORMER, - high_cardinality_transformer=HIGH_CARDINALITY_TRANSFORMER, - numeric_transformer=NUMERIC_TRANSFORMER, - datetime_transformer=DATETIME_TRANSFORMER, + low_cardinality=LOW_CARDINALITY_TRANSFORMER, + high_cardinality=HIGH_CARDINALITY_TRANSFORMER, + numeric=NUMERIC_TRANSFORMER, + datetime=DATETIME_TRANSFORMER, specific_transformers=(), n_jobs=None, ): self.cardinality_threshold = cardinality_threshold - self.low_cardinality_transformer = _utils.clone_if_default( - low_cardinality_transformer, LOW_CARDINALITY_TRANSFORMER + self.low_cardinality = _utils.clone_if_default( + low_cardinality, LOW_CARDINALITY_TRANSFORMER ) - self.high_cardinality_transformer = _utils.clone_if_default( - high_cardinality_transformer, HIGH_CARDINALITY_TRANSFORMER - ) - self.numeric_transformer = _utils.clone_if_default( - numeric_transformer, NUMERIC_TRANSFORMER - ) - self.datetime_transformer = _utils.clone_if_default( - datetime_transformer, DATETIME_TRANSFORMER + self.high_cardinality = _utils.clone_if_default( + high_cardinality, HIGH_CARDINALITY_TRANSFORMER ) + self.numeric = _utils.clone_if_default(numeric, NUMERIC_TRANSFORMER) + self.datetime = _utils.clone_if_default(datetime, DATETIME_TRANSFORMER) self.specific_transformers = specific_transformers self.n_jobs = n_jobs @@ -562,7 +560,7 @@ def add_step(steps, transformer, cols, allow_reject=False): ]: self._named_encoders[name] = add_step( self._encoders, - getattr(self, f"{name}_transformer"), + getattr(self, name), cols & selector - _created_by(*self._encoders), ) diff --git a/skrub/_tabular_learner.py b/skrub/_tabular_learner.py index 69a60edfb..687c922f1 100644 --- a/skrub/_tabular_learner.py +++ b/skrub/_tabular_learner.py @@ -108,8 +108,8 @@ def tabular_learner(estimator, *, n_jobs=None): >>> tabular_learner('regressor') # doctest: +SKIP Pipeline(steps=[('tablevectorizer', - TableVectorizer(high_cardinality_transformer=MinHashEncoder(), - low_cardinality_transformer=ToCategorical())), + TableVectorizer(high_cardinality=MinHashEncoder(), + low_cardinality=ToCategorical())), ('histgradientboostingregressor', HistGradientBoostingRegressor(categorical_features='from_dtype'))]) @@ -118,8 +118,8 @@ def tabular_learner(estimator, *, n_jobs=None): >>> tabular_learner('classifier') # doctest: +SKIP Pipeline(steps=[('tablevectorizer', - TableVectorizer(high_cardinality_transformer=MinHashEncoder(), - low_cardinality_transformer=ToCategorical())), + TableVectorizer(high_cardinality=MinHashEncoder(), + low_cardinality=ToCategorical())), ('histgradientboostingclassifier', HistGradientBoostingClassifier(categorical_features='from_dtype'))]) @@ -192,13 +192,13 @@ def tabular_learner(estimator, *, n_jobs=None): >>> tabular_learner('classifier') # doctest: +SKIP Pipeline(steps=[('tablevectorizer', - TableVectorizer(high_cardinality_transformer=MinHashEncoder(), - low_cardinality_transformer=ToCategorical())), + TableVectorizer(high_cardinality=MinHashEncoder(), + low_cardinality=ToCategorical())), ('histgradientboostingclassifier', HistGradientBoostingClassifier(categorical_features='from_dtype'))]) - A :obj:`MinHashEncoder` is used as the - ``high_cardinality_transformer``. This encoder provides good + ``high_cardinality``. 
This encoder provides good performance when the supervised estimator is based on a decision tree or ensemble of trees, as is the case for the :obj:`~sklearn.ensemble.HistGradientBoostingClassifier`. Unlike the @@ -206,7 +206,7 @@ def tabular_learner(estimator, *, n_jobs=None): interpretable features. However, it is much faster and uses less memory. - - The ``low_cardinality_transformer`` does not one-hot encode features. + - The ``low_cardinality`` does not one-hot encode features. The :obj:`~sklearn.ensemble.HistGradientBoostingClassifier` has built-in support for categorical data which is more efficient than one-hot encoding. Therefore the selected encoder, :obj:`ToCategorical`, simply @@ -257,13 +257,13 @@ def tabular_learner(estimator, *, n_jobs=None): and getattr(estimator, "categorical_features", None) == "from_dtype" ): vectorizer.set_params( - low_cardinality_transformer=ToCategorical(), - high_cardinality_transformer=MinHashEncoder(), + low_cardinality=ToCategorical(), + high_cardinality=MinHashEncoder(), ) elif isinstance(estimator, _TREE_ENSEMBLE_CLASSES): vectorizer.set_params( - low_cardinality_transformer=OrdinalEncoder(), - high_cardinality_transformer=MinHashEncoder(), + low_cardinality=OrdinalEncoder(), + high_cardinality=MinHashEncoder(), ) steps = [vectorizer] if not hasattr(estimator, "_get_tags") or not estimator._get_tags().get( diff --git a/skrub/tests/test_interpolation_join.py b/skrub/tests/test_interpolation_join.py index 11e49c9dc..f706d7b83 100644 --- a/skrub/tests/test_interpolation_join.py +++ b/skrub/tests/test_interpolation_join.py @@ -200,7 +200,7 @@ def test_join_on_date(df_module): aux_key="date", regressor=KNeighborsRegressor(1), ) - .set_params(vectorizer__datetime_transformer__resolution=None) + .set_params(vectorizer__datetime__resolution=None) .fit_transform(sales) ) assert_array_equal(ns.to_list(ns.col(transformed, "temp")), [-10, 10]) diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index 0a6b85d69..207a075d7 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -248,10 +248,10 @@ def test_duplicate_column_names(): def passthrough_vectorizer(): return TableVectorizer( - high_cardinality_transformer="passthrough", - low_cardinality_transformer="passthrough", - numeric_transformer="passthrough", - datetime_transformer="passthrough", + high_cardinality="passthrough", + low_cardinality="passthrough", + numeric="passthrough", + datetime="passthrough", ) @@ -374,7 +374,7 @@ def test_fit_transform_equiv(X): def test_handle_unknown_category(): X = _get_clean_dataframe() - # Treat all columns as low cardinality + # Treat all columns as having few unique values table_vec = TableVectorizer(cardinality_threshold=7).fit(X) X_unknown = pd.DataFrame( { @@ -423,7 +423,7 @@ def test_handle_unknown_category(): [ TableVectorizer(), TableVectorizer( - low_cardinality_transformer=MinHashEncoder(), + low_cardinality=MinHashEncoder(), ), ], ) @@ -506,7 +506,7 @@ def test_changing_types(X_train, X_test, expected_X_out): """ table_vec = TableVectorizer( # only extract the total seconds - datetime_transformer=DatetimeEncoder(resolution=None) + datetime=DatetimeEncoder(resolution=None) ) table_vec.fit(X_train) X_out = table_vec.transform(X_test) @@ -536,7 +536,7 @@ def test_column_by_column(): pytest.xfail("pandas is_string_dtype incorrect in old pandas") X = _get_clean_dataframe() vectorizer = TableVectorizer( - high_cardinality_transformer=GapEncoder(n_components=2, random_state=0), + 
high_cardinality=GapEncoder(n_components=2, random_state=0), cardinality_threshold=4, ) X_trans = vectorizer.fit_transform(X) @@ -551,7 +551,7 @@ def test_column_by_column(): @skip_if_no_parallel @pytest.mark.parametrize( - "high_cardinality_transformer", + "high_cardinality", # The GapEncoder and the MinHashEncoder should be parallelized on all columns. # The OneHotEncoder should not be parallelized. [ @@ -560,10 +560,10 @@ def test_column_by_column(): MinHashEncoder(n_components=2), ], ) -def test_parallelism(high_cardinality_transformer): +def test_parallelism(high_cardinality): X = _get_clean_dataframe() params = dict( - high_cardinality_transformer=high_cardinality_transformer, + high_cardinality=high_cardinality, cardinality_threshold=4, ) vectorizer = TableVectorizer(**params) @@ -607,9 +607,9 @@ def test_pandas_sparse_array(): def test_wrong_transformer(): X = _get_clean_dataframe() with pytest.raises(ValueError): - TableVectorizer(high_cardinality_transformer="passthroughtypo").fit(X) + TableVectorizer(high_cardinality="passthroughtypo").fit(X) with pytest.raises(TypeError): - TableVectorizer(high_cardinality_transformer=None).fit(X) + TableVectorizer(high_cardinality=None).fit(X) invalid_tuples = [ @@ -678,7 +678,7 @@ def test_accept_pipeline(): # non-regression test for https://github.com/skrub-data/skrub/issues/886 # TableVectorizer used to force transformers to inherit from TransformerMixin df = pd.DataFrame(dict(a=[1.1, 2.2])) - tv = TableVectorizer(numeric_transformer=make_pipeline("passthrough")) + tv = TableVectorizer(numeric=make_pipeline("passthrough")) tv.fit(df) @@ -720,5 +720,5 @@ def test_supervised_encoder(df_module): # of the defaults encoders do) X = df_module.make_dataframe({"a": [f"c_{i}" for _ in range(5) for i in range(4)]}) y = np.random.default_rng(0).normal(size=sbd.shape(X)[0]) - tv = TableVectorizer(low_cardinality_transformer=TargetEncoder()) + tv = TableVectorizer(low_cardinality=TargetEncoder()) tv.fit_transform(X, y) diff --git a/skrub/tests/test_tabular_learner.py b/skrub/tests/test_tabular_learner.py index b036d8a80..72df72c52 100644 --- a/skrub/tests/test_tabular_learner.py +++ b/skrub/tests/test_tabular_learner.py @@ -20,11 +20,11 @@ def test_default_pipeline(learner_kind): p = tabular_learner(learner_kind) tv, learner = [e for _, e in p.steps] assert isinstance(tv, TableVectorizer) - assert isinstance(tv.high_cardinality_transformer, MinHashEncoder) + assert isinstance(tv.high_cardinality, MinHashEncoder) if parse_version(sklearn.__version__) < parse_version("1.4"): - assert isinstance(tv.low_cardinality_transformer, OrdinalEncoder) + assert isinstance(tv.low_cardinality, OrdinalEncoder) else: - assert isinstance(tv.low_cardinality_transformer, ToCategorical) + assert isinstance(tv.low_cardinality, ToCategorical) assert learner.categorical_features == "from_dtype" if learner_kind == "regressor": assert isinstance(learner, ensemble.HistGradientBoostingRegressor) @@ -48,8 +48,8 @@ def test_linear_learner(): p = tabular_learner(original_learner) tv, imputer, scaler, learner = [e for _, e in p.steps] assert learner is original_learner - assert isinstance(tv.high_cardinality_transformer, GapEncoder) - assert isinstance(tv.low_cardinality_transformer, OneHotEncoder) + assert isinstance(tv.high_cardinality, GapEncoder) + assert isinstance(tv.low_cardinality, OneHotEncoder) assert isinstance(imputer, SimpleImputer) assert isinstance(scaler, StandardScaler) @@ -63,20 +63,16 @@ def test_tree_learner(): else: tv, learner = [e for _, e in p.steps] assert 
learner is original_learner - assert isinstance(tv.high_cardinality_transformer, MinHashEncoder) - assert isinstance(tv.low_cardinality_transformer, OrdinalEncoder) + assert isinstance(tv.high_cardinality, MinHashEncoder) + assert isinstance(tv.low_cardinality, OrdinalEncoder) def test_from_dtype(): p = tabular_learner(ensemble.HistGradientBoostingRegressor(categorical_features=())) - assert isinstance( - p.named_steps["tablevectorizer"].low_cardinality_transformer, OrdinalEncoder - ) + assert isinstance(p.named_steps["tablevectorizer"].low_cardinality, OrdinalEncoder) if parse_version(sklearn.__version__) < parse_version("1.4"): return p = tabular_learner( ensemble.HistGradientBoostingRegressor(categorical_features="from_dtype") ) - assert isinstance( - p.named_steps["tablevectorizer"].low_cardinality_transformer, ToCategorical - ) + assert isinstance(p.named_steps["tablevectorizer"].low_cardinality, ToCategorical)
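
Reviewer note (not part of the patch): a minimal usage sketch of the renamed
constructor parameters, for trying the change locally. The toy dataframe and
the threshold below are made up for illustration; the defaults are as
documented in the docstring diff above (OneHotEncoder, GapEncoder,
passthrough, DatetimeEncoder).

    # Hypothetical data, for illustration only -- not from this patch.
    import pandas as pd
    from skrub import DatetimeEncoder, MinHashEncoder, TableVectorizer

    df = pd.DataFrame(
        {
            "city": ["Paris", "London", "Paris", "London"],  # 2 unique values
            "comment": ["great", "ok", "bad", "fine"],       # 4 unique values
            "when": pd.to_datetime(
                ["2024-01-01", "2024-02-01", "2024-03-01", "2024-04-01"]
            ),
            "amount": [10.5, 3.0, 7.25, 1.0],
        }
    )

    # Before this patch: high_cardinality_transformer=..., datetime_transformer=...
    # After this patch, the ``_transformer`` suffix is dropped:
    vectorizer = TableVectorizer(
        cardinality_threshold=3,  # "city" (2 < 3 unique) -> low_cardinality,
                                  # "comment" (4 >= 3)    -> high_cardinality
        high_cardinality=MinHashEncoder(n_components=4),
        datetime=DatetimeEncoder(add_weekday=True),
        numeric="passthrough",
    )
    out = vectorizer.fit_transform(df)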
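
Reviewer note (not part of the patch): the rename also shortens the nested
parameter paths used with ``set_params`` and grid search, as the updated
``test_interpolation_join.py`` shows (``vectorizer__datetime__resolution``).
A hedged sketch with made-up grid values:

    from sklearn.ensemble import HistGradientBoostingRegressor
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import make_pipeline
    from skrub import MinHashEncoder, TableVectorizer

    pipe = make_pipeline(
        TableVectorizer(high_cardinality=MinHashEncoder()),
        HistGradientBoostingRegressor(),
    )
    # Before: "tablevectorizer__high_cardinality_transformer__n_components"
    # After:  "tablevectorizer__high_cardinality__n_components"
    search = GridSearchCV(
        pipe,
        {"tablevectorizer__high_cardinality__n_components": [16, 32]},
    )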