From f58f4fb9820ba78da14152a923c130bbb80843ef Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Fri, 10 Jan 2025 16:10:24 -0800 Subject: [PATCH] change sparse argument of step_dummy() to be an enum --- NEWS.md | 2 +- R/dummy.R | 13 +++++++------ man/step_dummy.Rd | 9 +++++---- tests/testthat/_snaps/dummy.md | 4 ++-- tests/testthat/test-dummy.R | 10 +++++----- 5 files changed, 20 insertions(+), 18 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5f1c02ce1..ac03ae4f8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,7 +10,7 @@ * All steps and checks now require arguments `trained`, `skip`, `role`, and `id` at all times. -* `step_dummy()` gained `sparse` argument. When set to `TRUE`, `step_dummy()` will produce sparse vectors. (#1392) +* `step_dummy()` gained `sparse` argument. When set to `"yes"`, `step_dummy()` will produce sparse vectors. (#1392) # recipes 1.1.0 diff --git a/R/dummy.R b/R/dummy.R index 19e85ea8b..9ef8c0911 100644 --- a/R/dummy.R +++ b/R/dummy.R @@ -18,9 +18,10 @@ #' @param levels A list that contains the information needed to create dummy #' variables for each variable contained in `terms`. This is `NULL` until the #' step is trained by [prep()]. -#' @param sparse A logical. Should the columns produced be sparse vectors. -#' Sparsity is only supported for `"contr.treatment"` contrasts. Defaults to -#' `FALSE`. +#' @param sparse A single string. Should the columns produced be sparse vectors. +#' Can take the values `"yes"`, `"no"`, and `"auto"`. If `sparse = "auto"` +#' then workflows can determine the best option. Sparsity is only supported +#' for `"contr.treatment"` contrasts. Defaults to `"auto"`. #' @template step-return #' @family dummy variable and encoding steps #' @seealso [dummy_names()] @@ -125,7 +126,7 @@ step_dummy <- preserve = deprecated(), naming = dummy_names, levels = NULL, - sparse = FALSE, + sparse = "auto", keep_original_cols = FALSE, skip = FALSE, id = rand_id("dummy")) { @@ -181,7 +182,7 @@ prep.step_dummy <- function(x, training, info = NULL, ...) { check_type(training[, col_names], types = c("factor", "ordered")) check_bool(x$one_hot, arg = "one_hot") check_function(x$naming, arg = "naming", allow_empty = FALSE) - check_bool(x$sparse, arg = "sparse") + rlang::arg_match0(x$sparse, c("auto", "yes", "no"), arg_nm = "sparse") if (length(col_names) > 0) { ## I hate doing this but currently we are going to have @@ -301,7 +302,7 @@ bake.step_dummy <- function(object, new_data, ...) { ordered = is_ordered ) - if (object$sparse) { + if (object$sparse == "yes") { current_contrast <- getOption("contrasts")[is_ordered + 1] if (current_contrast != "contr.treatment") { cli::cli_abort( diff --git a/man/step_dummy.Rd b/man/step_dummy.Rd index 1a6b91560..308622adc 100644 --- a/man/step_dummy.Rd +++ b/man/step_dummy.Rd @@ -13,7 +13,7 @@ step_dummy( preserve = deprecated(), naming = dummy_names, levels = NULL, - sparse = FALSE, + sparse = "auto", keep_original_cols = FALSE, skip = FALSE, id = rand_id("dummy") @@ -47,9 +47,10 @@ columns. See Details below.} variables for each variable contained in \code{terms}. This is \code{NULL} until the step is trained by \code{\link[=prep]{prep()}}.} -\item{sparse}{A logical. Should the columns produced be sparse vectors. -Sparsity is only supported for \code{"contr.treatment"} contrasts. Defaults to -\code{FALSE}.} +\item{sparse}{A single string. Should the columns produced be sparse vectors. +Can take the values \code{"yes"}, \code{"no"}, and \code{"auto"}. If \code{sparse = "auto"} +then workflows can determine the best option. Sparsity is only supported +for \code{"contr.treatment"} contrasts. Defaults to \code{"auto"}.} \item{keep_original_cols}{A logical to keep the original variables in the output. Defaults to \code{FALSE}.} diff --git a/tests/testthat/_snaps/dummy.md b/tests/testthat/_snaps/dummy.md index 2b9da790c..cc99c2702 100644 --- a/tests/testthat/_snaps/dummy.md +++ b/tests/testthat/_snaps/dummy.md @@ -154,10 +154,10 @@ Caused by error in `bake()`: ! Only one factor level in `x`: "only-level". -# sparse = TRUE errors on unsupported contrasts +# sparse = 'yes' errors on unsupported contrasts Code - recipe(~., data = tibble(x = letters)) %>% step_dummy(x, sparse = TRUE) %>% + recipe(~., data = tibble(x = letters)) %>% step_dummy(x, sparse = "yes") %>% prep() Condition Error in `step_dummy()`: diff --git a/tests/testthat/test-dummy.R b/tests/testthat/test-dummy.R index 1d77ea4a8..b944ffb68 100644 --- a/tests/testthat/test-dummy.R +++ b/tests/testthat/test-dummy.R @@ -354,13 +354,13 @@ test_that("throws an informative error for single level", { ) }) -test_that("sparse = TRUE works", { +test_that("sparse = 'yes' works", { rec <- recipe(~ ., data = tibble(x = c(NA, letters))) suppressWarnings({ - dense <- rec %>% step_dummy(x, sparse = FALSE) %>% prep() %>% bake(NULL) + dense <- rec %>% step_dummy(x, sparse = "no") %>% prep() %>% bake(NULL) dense <- purrr::map(dense, as.integer) %>% tibble::new_tibble() - sparse <- rec %>% step_dummy(x, sparse = TRUE) %>% prep() %>% bake(NULL) + sparse <- rec %>% step_dummy(x, sparse = "yes") %>% prep() %>% bake(NULL) }) expect_identical(dense, sparse) @@ -369,7 +369,7 @@ test_that("sparse = TRUE works", { expect_true(all(vapply(sparse, sparsevctrs::is_sparse_vector, logical(1)))) }) -test_that("sparse = TRUE errors on unsupported contrasts", { +test_that("sparse = 'yes' errors on unsupported contrasts", { go_helmert <- getOption("contrasts") go_helmert["unordered"] <- "contr.helmert" withr::local_options(contrasts = go_helmert) @@ -377,7 +377,7 @@ test_that("sparse = TRUE errors on unsupported contrasts", { expect_snapshot( error = TRUE, recipe(~ ., data = tibble(x = letters)) %>% - step_dummy(x, sparse = TRUE) %>% + step_dummy(x, sparse = "yes") %>% prep() ) })