Merge branch 'main' into jrwinget-fix/step_cut-error-handling

tidymodels · Oct 31, 2024 · 95e49ad · 95e49ad
2 parents 507c413 + b645e20
commit 95e49ad
Show file tree

Hide file tree

Showing 244 changed files with 2,749 additions and 563 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: recipes
 Title: Preprocessing and Feature Engineering Steps for Modeling
-Version: 1.1.0.9000
+Version: 1.1.0.9001
 Authors@R: c(
     person("Max", "Kuhn", , "[email protected]", role = c("aut", "cre")),
     person("Hadley", "Wickham", , "[email protected]", role = "aut"),
@@ -34,6 +34,7 @@ Imports:
     Matrix,
     purrr (>= 1.0.0),
     rlang (>= 1.1.0),
+    sparsevctrs (>= 0.1.0.9002),
     stats,
     tibble,
     tidyr (>= 1.0.0),
@@ -50,6 +51,7 @@ Suggests:
     igraph,
     kernlab,
     knitr,
+    methods,
     modeldata (>= 0.1.1),
     parsnip (>= 1.2.0),
     RANN,
@@ -62,6 +64,8 @@ Suggests:
     testthat (>= 3.0.0),
     workflows,
     xml2
+Remotes:  
+    r-lib/sparsevctrs
 VignetteBuilder: 
     knitr
 RdMacros: 

diff --git a/NAMESPACE b/NAMESPACE
@@ -748,6 +748,7 @@ importFrom(lubridate,am)
 importFrom(lubridate,decimal_date)
 importFrom(lubridate,hour)
 importFrom(lubridate,is.Date)
+importFrom(lubridate,mday)
 importFrom(lubridate,minute)
 importFrom(lubridate,month)
 importFrom(lubridate,quarter)
@@ -764,7 +765,6 @@ importFrom(purrr,map_dbl)
 importFrom(purrr,map_lgl)
 importFrom(stats,as.formula)
 importFrom(stats,binomial)
-importFrom(stats,complete.cases)
 importFrom(stats,cov)
 importFrom(stats,cov.wt)
 importFrom(stats,lm)
@@ -793,4 +793,5 @@ importFrom(utils,install.packages)
 importFrom(utils,object.size)
 importFrom(vctrs,vec_cast)
 importFrom(vctrs,vec_cbind)
+importFrom(vctrs,vec_detect_complete)
 importFrom(vctrs,vec_slice)
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,13 @@
 # recipes (development version)
 
+* Example for `step_novel()` now better illustrates how it works. (@Edgar-Zamora, #1248)
+
+* `recipe()`, `prep()`, and `bake()` now work with sparse tibbles. (#1364, #1366)
+
+* `recipe()`, `prep()`, and `bake()` now work with sparse matrices. (#1364, #1368, #1369)
+
+* `prep.recipe(..., strings_as_factors = TRUE)` now only converts string variables that have role "predictor" or "outcome". (@dajmcdon, #1358, #1376)
+
 # recipes 1.1.0
 
 ## Improvements
@@ -34,6 +42,8 @@
 
 * `step_dummy()` now throws more informative warnings for `NA` values. (#450)
 
+* `step_date()` now accepts `"mday"` as a possible feature. (@Edgar-Zamora, #1211)
+
 ## Bug Fixes
 
 * `NA` levels in factors aren't dropped when passed to `recipe()`. (#1291)

diff --git a/R/BoxCox.R b/R/BoxCox.R
@@ -160,7 +160,7 @@ bc_trans <- function(x, lambda, eps = .001) {
   if (any(x <= 0)) {
     cli::cli_warn(
       "Applying Box-Cox transformation to non-positive data in column \\
-      {names(lambda)}"
+      {.field {names(lambda)}}."
     )
   }
 

diff --git a/R/case_weights.R b/R/case_weights.R
@@ -88,10 +88,10 @@ get_case_weights <- function(info, .data, call = rlang::caller_env()) {
     if (!is.numeric(res)) {
       cli::cli_abort(
         c(
-          "x" = "{.arg {wt_col}} has a {.code case_weights} role,\\
-                 but is not numeric.",
-          "i" = "{.arg {wt_col}} is {.obj_type_friendly {wt_col}}."
-          ),
+          x = "{.field {wt_col}} has a {.code case_weights} role and should be 
+              numeric, but is {.obj_type_friendly {wt_col}}.",
+          i = "Only numeric case weights are supported in recipes."
+        ),
         call = call
       )
     }
@@ -131,7 +131,7 @@ wt_calcs <- function(x, wts, statistic = "mean") {
     wts <- rep(1L, nrow(x))
   }
 
-  complete <- stats::complete.cases(x) & !is.na(wts)
+  complete <- vec_detect_complete(x) & !is.na(wts)
   wts <- wts[complete]
   x <- x[complete,,drop = FALSE]
   res <- stats::cov.wt(x, wt = wts, cor = statistic == "cor")

diff --git a/R/classdist.R b/R/classdist.R
@@ -58,7 +58,7 @@
 #'
 #' @examplesIf rlang::is_installed(c("modeldata"))
 #' data(penguins, package = "modeldata")
-#' penguins <- penguins[complete.cases(penguins), ]
+#' penguins <- penguins[vctrs::vec_detect_complete(penguins), ]
 #' penguins$island <- NULL
 #' penguins$sex <- NULL
 #'

diff --git a/R/classdist_shrunken.R b/R/classdist_shrunken.R
@@ -62,7 +62,7 @@
 #' of the National Academy of Sciences_, 99(10), 6567-6572.
 #' @examplesIf rlang::is_installed(c("modeldata"))
 #' data(penguins, package = "modeldata")
-#' penguins <- penguins[complete.cases(penguins), ]
+#' penguins <- penguins[vctrs::vec_detect_complete(penguins), ]
 #' penguins$island <- NULL
 #' penguins$sex <- NULL
 #'

diff --git a/R/colcheck.R b/R/colcheck.R
@@ -81,6 +81,9 @@ bake.check_cols <- function(object, new_data, ...) {
   new_cols <- names(new_data)
   missing <- setdiff(original_cols, new_cols)
   if (length(missing) > 0) {
+    # This branch is effectively unreachable now that ptype checking was added
+    # in https://github.com/tidymodels/recipes/pull/1330, but deprecating this
+    # check function felt too harsh, so the check is kept.
     cli::cli_abort(c(
       x = "{cli::qty(length(missing))}The following column{?s} {?is/are} \\
       missing from {.arg new_data}:",

diff --git a/R/corr.R b/R/corr.R
@@ -185,7 +185,7 @@ corr_filter <-
            method = "pearson") {
     x <- correlations(x, wts = wts, use = use, method = method)
 
-    if (any(!complete.cases(x))) {
+    if (any(!vec_detect_complete(x))) {
       all_na <- apply(x, 2, function(x) all(is.na(x)))
       if (sum(all_na) >= nrow(x) - 1) {
         cli::cli_warn(

diff --git a/R/count.R b/R/count.R
@@ -129,14 +129,6 @@ prep.step_count <- function(x, training, info = NULL, ...) {
   col_name <- recipes_eval_select(x$terms, training, info)
   check_type(training[, col_name], types = c("string", "factor", "ordered"))
 
-  if (length(col_name) > 1) {
-    cli::cli_abort(c(
-      x = "The selector should select at most a single variable.",
-      i = "The following {length(col_name)} were selected: \\
-          {.and {.var {col_name}}}."
-    ))
-  }
-
   step_count_new(
     terms = x$terms,
     role = x$role,

diff --git a/R/cut.R b/R/cut.R
@@ -113,6 +113,13 @@ prep.step_cut <- function(x, training, info = NULL, ...) {
   col_names <- recipes_eval_select(x$terms, training, info)
   check_type(training[, col_names], types = c("double", "integer"))
 
+  if (!is.numeric(x$breaks)) {
+    cli::cli_abort(
+      "{.arg breaks} must be a numeric vector, \\
+      not {.obj_type_friendly {x$breaks}}."
+    )
+  }
+
   all_breaks <- vector("list", length(col_names))
   names(all_breaks) <- col_names
   for (col_name in col_names) {
@@ -132,24 +139,26 @@ prep.step_cut <- function(x, training, info = NULL, ...) {
   )
 }
 
-create_full_breaks <- function(var, breaks) {
+create_full_breaks <- function(var, breaks, call = rlang::caller_env()) {
   if (!is.numeric(var)) {
     cli::cli_abort(
-      "{.arg var} must be a numeric vector, not {.obj_type_friendly {var}}."
+      "{.arg var} must be a numeric vector, not {.obj_type_friendly {var}}.",
+      call = call
     )
   }
 
   if (!is.numeric(breaks)) {
     cli::cli_abort(
-      "{.arg breaks} must be a numeric vector, \\
-      not {.obj_type_friendly {breaks}}."
+      "{.arg breaks} must be a numeric vector, not {.obj_type_friendly {breaks}}.",
+      call = call
     )
   }
 
   if (any(is.na(var))) {
     cli::cli_warn(
-      "{.arg var} contains missing values. These will be ignored in break \\
-      calculations."
+      "{.arg var} contains missing values. These will be ignored in break
+       calculations.",
+      call = call
     )
     var <- var[!is.na(var)]
   }
@@ -214,12 +223,6 @@ cut_var <- function(var, breaks, include_outside_range) {
 # the levels when bake.recipe itself is called. Moreover,
 # it is cleaner to show it in this way.
 adjust_levels_min_max <- function(x) {
-  if (!is.factor(x)) {
-    cli::cli_abort(
-      "{.arg x} must be a factor, not {.obj_type_friendly {x}}.",
-      .internal = TRUE
-    )
-  }
   levs <- levels(x)
   if (length(levs) == 1) {
     return(factor(rep("[min,max]", length(x))))

diff --git a/R/date.R b/R/date.R
@@ -9,7 +9,7 @@
 #'  for this step. The selected variables should have class `Date` or
 #'  `POSIXct`. See [selections()] for more details.
 #' @param features A character string that includes at least one
-#'  of the following values: `month`, `dow` (day of week),
+#'  of the following values: `dow` (day of week), `mday` (day of month),
 #'  `doy` (day of year), `week`, `month`,
 #'  `decimal` (decimal date, e.g. 2002.197), `quarter`,
 #'  `semester`, `year`.
@@ -98,6 +98,7 @@ step_date <-
       c(
         "year",
         "doy",
+        "mday",
         "week",
         "decimal",
         "semester",
@@ -110,8 +111,7 @@ step_date <-
         offenders <- features[!features %in% feat]
 
         cli::cli_abort(c(
-          x = "Possible values of {.arg features} should include:",
-          "*" = "{.or {.val {feat}}}.",
+          x = "Possible values of {.arg features} are {.or {.val {feat}}}.",
           i = "Invalid values were: {.val {offenders}}."
         ))
       }
@@ -202,6 +202,9 @@ get_date_features <-
     if ("doy" %in% feats) {
       res[, grepl("doy$", names(res))] <- vec_cast(yday(dt), integer())
     }
+    if ("mday" %in% feats) {
+      res[, grepl("mday$", names(res))] <- vec_cast(mday(dt), integer())
+    }
     if ("week" %in% feats) {
       res[, grepl("week$", names(res))] <- vec_cast(week(dt), integer())
     }

diff --git a/R/dummy.R b/R/dummy.R
@@ -271,13 +271,9 @@ bake.step_dummy <- function(object, new_data, ...) {
     # the original (see the note above)
     is_ordered <- attr(levels, "dataClasses") == "ordered"
 
-    if (is.null(levels_values)) {
-      cli::cli_abort("Factor level values not recorded in {.var col_name}.")
-    }
-
     if (length(levels_values) == 1) {
       cli::cli_abort(
-        "Only one factor level in {.var col_name}: {levels_values}."
+        "Only one factor level in {.var {col_name}}: {.val {levels_values}}."
       )
     }
 

diff --git a/R/dummy_multi_choice.R b/R/dummy_multi_choice.R
@@ -142,8 +142,6 @@ prep.step_dummy_multi_choice <- function(x, training, info = NULL, ...) {
   col_names <- recipes_eval_select(x$terms, training, info)
   check_type(training[, col_names], types = c("nominal", "logical"))
 
-  multi_dummy_check_type(training[, col_names])
-
   levels <- purrr::map(training[, col_names], levels)
   levels <- vctrs::list_unchop(levels, ptype = character(), name_spec = rlang::zap())
   levels <- levels[!is.na(levels)]
@@ -165,23 +163,6 @@ prep.step_dummy_multi_choice <- function(x, training, info = NULL, ...) {
   )
 }
 
-multi_dummy_check_type <- function(dat, call = rlang::caller_env()) {
-  is_good <- function(x) {
-    is.factor(x) | is.character(x) | all(is.na(x))
-  }
-
-  all_good <- vapply(dat, is_good, logical(1))
-  if (!all(all_good)) {
-    offenders <- names(dat)[!all_good]
-    cli::cli_abort(c(
-      "x" = "All columns selected for the step should be \\
-            factor, character, or NA. The following were not:",
-      "*" = "{.var {offenders}}."
-    ), call = call)
-  }
-  invisible(all_good)
-}
-
 #' @export
 bake.step_dummy_multi_choice <- function(object, new_data, ...) {
   col_names <- object$input

diff --git a/R/extract_parameter.R b/R/extract_parameter.R
@@ -7,20 +7,24 @@ extract_parameter_set_dials.recipe <- function(x, ...) {
       tuning_param %>% dplyr::select(-tunable),
       all_args,
       by = c("name", "source", "component", "component_id")
-    ) %>%
-    mutate(object = purrr::map(call_info, eval_call_info))
+    ) 
+
+  objects <- list()
+  for (i in seq_len(nrow(res))) {
+    objects[[i]] <- eval_call_info(res$call_info[[i]])
+  }
 
   dials::parameters_constr(
     res$name,
     res$id,
     res$source,
     res$component,
     res$component_id,
-    res$object
+    objects
   )
 }
 
-eval_call_info <- function(x) {
+eval_call_info <- function(x, call) {
   if (!is.null(x)) {
     # Look for other options
     allowed_opts <- c("range", "trans", "values")
@@ -32,7 +36,8 @@ eval_call_info <- function(x) {
     res <- try(rlang::eval_tidy(rlang::call2(x$fun, .ns = x$pkg, !!!opts)), silent = TRUE)
     if (inherits(res, "try-error")) {
       cli::cli_abort(
-        "Error when calling {.fn {x$fun}}: {as.character(res)}"
+        "Error when calling {.fn {x$fun}}: {as.character(res)}",
+        call = call
       )
     }
   } else {

diff --git a/R/impute_bag.R b/R/impute_bag.R
@@ -248,14 +248,14 @@ bake.step_impute_bag <- function(object, new_data, ...) {
   col_names <- names(object$models)
   check_new_data(col_names, object, new_data)
 
-  missing_rows <- !complete.cases(new_data)
+  missing_rows <- !vec_detect_complete(new_data)
   if (!any(missing_rows)) {
     return(new_data)
   }
 
   old_data <- new_data
   for (col_name in col_names) {
-    missing_rows <- !complete.cases(new_data[[col_name]])
+    missing_rows <- !vec_detect_complete(new_data[[col_name]])
     if (!any(missing_rows)) {
       next
     }

diff --git a/R/impute_knn.R b/R/impute_knn.R
@@ -118,7 +118,7 @@ step_impute_knn <-
     if (length(options) > 0) {
       if (any(!(opt_nms %in% c("eps", "nthread")))) {
         cli::cli_abort(
-          "Valid values for {.arg options} are {.val eps}, and {.val nthread}."
+          "Valid values for {.arg options} are {.val eps} and {.val nthread}."
         )
       }
       if (all(opt_nms != "nthread")) {
@@ -221,7 +221,7 @@ bake.step_impute_knn <- function(object, new_data, ...) {
   all_cols <- unique(unlist(object$columns, recursive = TRUE))
   check_new_data(all_cols, object, new_data)
 
-  missing_rows <- !complete.cases(new_data)
+  missing_rows <- !vec_detect_complete(new_data)
   if (!any(missing_rows)) {
     return(new_data)
   }
@@ -230,7 +230,7 @@ bake.step_impute_knn <- function(object, new_data, ...) {
 
   old_data <- new_data
   for (col_name in col_names) {
-    missing_rows <- !complete.cases(new_data[, col_name])
+    missing_rows <- !vec_detect_complete(new_data[, col_name])
     if (!any(missing_rows)) {
       next
     }