Skip to content

Commit

Permalink
Merge branch 'main' into jrwinget-fix/step_cut-error-handling
Browse files Browse the repository at this point in the history
  • Loading branch information
topepo committed Oct 31, 2024
2 parents 507c413 + b645e20 commit 95e49ad
Show file tree
Hide file tree
Showing 244 changed files with 2,749 additions and 563 deletions.
6 changes: 5 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: recipes
Title: Preprocessing and Feature Engineering Steps for Modeling
Version: 1.1.0.9000
Version: 1.1.0.9001
Authors@R: c(
person("Max", "Kuhn", , "[email protected]", role = c("aut", "cre")),
person("Hadley", "Wickham", , "[email protected]", role = "aut"),
Expand Down Expand Up @@ -34,6 +34,7 @@ Imports:
Matrix,
purrr (>= 1.0.0),
rlang (>= 1.1.0),
sparsevctrs (>= 0.1.0.9002),
stats,
tibble,
tidyr (>= 1.0.0),
Expand All @@ -50,6 +51,7 @@ Suggests:
igraph,
kernlab,
knitr,
methods,
modeldata (>= 0.1.1),
parsnip (>= 1.2.0),
RANN,
Expand All @@ -62,6 +64,8 @@ Suggests:
testthat (>= 3.0.0),
workflows,
xml2
Remotes:
r-lib/sparsevctrs
VignetteBuilder:
knitr
RdMacros:
Expand Down
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -748,6 +748,7 @@ importFrom(lubridate,am)
importFrom(lubridate,decimal_date)
importFrom(lubridate,hour)
importFrom(lubridate,is.Date)
importFrom(lubridate,mday)
importFrom(lubridate,minute)
importFrom(lubridate,month)
importFrom(lubridate,quarter)
Expand All @@ -764,7 +765,6 @@ importFrom(purrr,map_dbl)
importFrom(purrr,map_lgl)
importFrom(stats,as.formula)
importFrom(stats,binomial)
importFrom(stats,complete.cases)
importFrom(stats,cov)
importFrom(stats,cov.wt)
importFrom(stats,lm)
Expand Down Expand Up @@ -793,4 +793,5 @@ importFrom(utils,install.packages)
importFrom(utils,object.size)
importFrom(vctrs,vec_cast)
importFrom(vctrs,vec_cbind)
importFrom(vctrs,vec_detect_complete)
importFrom(vctrs,vec_slice)
10 changes: 10 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# recipes (development version)

* Example for `step_novel()` now better illustrates how it works. (@Edgar-Zamora, #1248)

* `recipe()`, `prep()`, and `bake()` now work with sparse tibbles. (#1364, #1366)

* `recipe()`, `prep()`, and `bake()` now work with sparse matrices. (#1364, #1368, #1369)

* `prep.recipe(..., strings_as_factors = TRUE)` now only converts string variables that have role "predictor" or "outcome". (@dajmcdon, #1358, #1376)

# recipes 1.1.0

## Improvements
Expand Down Expand Up @@ -34,6 +42,8 @@

* `step_dummy()` now throws more informative warnings for `NA` values. (#450)

* `step_date()` now accepts `"mday"` as a possible feature. (@Edgar-Zamora, #1211)

## Bug Fixes

* `NA` levels in factors aren't dropped when passed to `recipe()`. (#1291)
Expand Down
2 changes: 1 addition & 1 deletion R/BoxCox.R
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ bc_trans <- function(x, lambda, eps = .001) {
if (any(x <= 0)) {
cli::cli_warn(
"Applying Box-Cox transformation to non-positive data in column \\
{names(lambda)}"
{.field {names(lambda)}}."
)
}

Expand Down
10 changes: 5 additions & 5 deletions R/case_weights.R
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,10 @@ get_case_weights <- function(info, .data, call = rlang::caller_env()) {
if (!is.numeric(res)) {
cli::cli_abort(
c(
"x" = "{.arg {wt_col}} has a {.code case_weights} role,\\
but is not numeric.",
"i" = "{.arg {wt_col}} is {.obj_type_friendly {wt_col}}."
),
x = "{.field {wt_col}} has a {.code case_weights} role and should be
numeric, but is {.obj_type_friendly {wt_col}}.",
i = "Only numeric case weights are supported in recipes."
),
call = call
)
}
Expand Down Expand Up @@ -131,7 +131,7 @@ wt_calcs <- function(x, wts, statistic = "mean") {
wts <- rep(1L, nrow(x))
}

complete <- stats::complete.cases(x) & !is.na(wts)
complete <- vec_detect_complete(x) & !is.na(wts)
wts <- wts[complete]
x <- x[complete,,drop = FALSE]
res <- stats::cov.wt(x, wt = wts, cor = statistic == "cor")
Expand Down
2 changes: 1 addition & 1 deletion R/classdist.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
#'
#' @examplesIf rlang::is_installed(c("modeldata"))
#' data(penguins, package = "modeldata")
#' penguins <- penguins[complete.cases(penguins), ]
#' penguins <- penguins[vctrs::vec_detect_complete(penguins), ]
#' penguins$island <- NULL
#' penguins$sex <- NULL
#'
Expand Down
2 changes: 1 addition & 1 deletion R/classdist_shrunken.R
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
#' of the National Academy of Sciences_, 99(10), 6567-6572.
#' @examplesIf rlang::is_installed(c("modeldata"))
#' data(penguins, package = "modeldata")
#' penguins <- penguins[complete.cases(penguins), ]
#' penguins <- penguins[vctrs::vec_detect_complete(penguins), ]
#' penguins$island <- NULL
#' penguins$sex <- NULL
#'
Expand Down
3 changes: 3 additions & 0 deletions R/colcheck.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ bake.check_cols <- function(object, new_data, ...) {
new_cols <- names(new_data)
missing <- setdiff(original_cols, new_cols)
if (length(missing) > 0) {
# This is functionally not reachable after we added ptype checking in
# https://github.com/tidymodels/recipes/pull/1330
# but it feels too harsh to deprecate this check function.
cli::cli_abort(c(
x = "{cli::qty(length(missing))}The following column{?s} {?is/are} \\
missing from {.arg new_data}:",
Expand Down
2 changes: 1 addition & 1 deletion R/corr.R
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ corr_filter <-
method = "pearson") {
x <- correlations(x, wts = wts, use = use, method = method)

if (any(!complete.cases(x))) {
if (any(!vec_detect_complete(x))) {
all_na <- apply(x, 2, function(x) all(is.na(x)))
if (sum(all_na) >= nrow(x) - 1) {
cli::cli_warn(
Expand Down
8 changes: 0 additions & 8 deletions R/count.R
Original file line number Diff line number Diff line change
Expand Up @@ -129,14 +129,6 @@ prep.step_count <- function(x, training, info = NULL, ...) {
col_name <- recipes_eval_select(x$terms, training, info)
check_type(training[, col_name], types = c("string", "factor", "ordered"))

if (length(col_name) > 1) {
cli::cli_abort(c(
x = "The selector should select at most a single variable.",
i = "The following {length(col_name)} were selected: \\
{.and {.var {col_name}}}."
))
}

step_count_new(
terms = x$terms,
role = x$role,
Expand Down
27 changes: 15 additions & 12 deletions R/cut.R
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,13 @@ prep.step_cut <- function(x, training, info = NULL, ...) {
col_names <- recipes_eval_select(x$terms, training, info)
check_type(training[, col_names], types = c("double", "integer"))

if (!is.numeric(x$breaks)) {
cli::cli_abort(
"{.arg breaks} must be a numeric vector, \\
not {.obj_type_friendly {x$breaks}}."
)
}

all_breaks <- vector("list", length(col_names))
names(all_breaks) <- col_names
for (col_name in col_names) {
Expand All @@ -132,24 +139,26 @@ prep.step_cut <- function(x, training, info = NULL, ...) {
)
}

create_full_breaks <- function(var, breaks) {
create_full_breaks <- function(var, breaks, call = rlang::caller_env()) {
if (!is.numeric(var)) {
cli::cli_abort(
"{.arg var} must be a numeric vector, not {.obj_type_friendly {var}}."
"{.arg var} must be a numeric vector, not {.obj_type_friendly {var}}.",
call = call
)
}

if (!is.numeric(breaks)) {
cli::cli_abort(
"{.arg breaks} must be a numeric vector, \\
not {.obj_type_friendly {breaks}}."
"{.arg breaks} must be a numeric vector, not {.obj_type_friendly {breaks}}.",
call = call
)
}

if (any(is.na(var))) {
cli::cli_warn(
"{.arg var} contains missing values. These will be ignored in break \\
calculations."
"{.arg var} contains missing values. These will be ignored in break
calculations.",
call = call
)
var <- var[!is.na(var)]
}
Expand Down Expand Up @@ -214,12 +223,6 @@ cut_var <- function(var, breaks, include_outside_range) {
# the levels when bake.recipe itself is called. Moreover,
# it is cleaner to show it in this way.
adjust_levels_min_max <- function(x) {
if (!is.factor(x)) {
cli::cli_abort(
"{.arg x} must be a factor, not {.obj_type_friendly {x}}.",
.internal = TRUE
)
}
levs <- levels(x)
if (length(levs) == 1) {
return(factor(rep("[min,max]", length(x))))
Expand Down
9 changes: 6 additions & 3 deletions R/date.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#' for this step. The selected variables should have class `Date` or
#' `POSIXct`. See [selections()] for more details.
#' @param features A character string that includes at least one
#' of the following values: `month`, `dow` (day of week),
#' of the following values: `month`, `dow` (day of week), `mday` (day of month),
#' `doy` (day of year), `week`,
#' `decimal` (decimal date, e.g. 2002.197), `quarter`,
#' `semester`, `year`.
Expand Down Expand Up @@ -98,6 +98,7 @@ step_date <-
c(
"year",
"doy",
"mday",
"week",
"decimal",
"semester",
Expand All @@ -110,8 +111,7 @@ step_date <-
offenders <- features[!features %in% feat]

cli::cli_abort(c(
x = "Possible values of {.arg features} should include:",
"*" = "{.or {.val {feat}}}.",
x = "Possible values of {.arg features} are {.or {.val {feat}}}.",
i = "Invalid values were: {.val {offenders}}."
))
}
Expand Down Expand Up @@ -202,6 +202,9 @@ get_date_features <-
if ("doy" %in% feats) {
res[, grepl("doy$", names(res))] <- vec_cast(yday(dt), integer())
}
if ("mday" %in% feats) {
res[, grepl("mday$", names(res))] <- vec_cast(mday(dt), integer())
}
if ("week" %in% feats) {
res[, grepl("week$", names(res))] <- vec_cast(week(dt), integer())
}
Expand Down
6 changes: 1 addition & 5 deletions R/dummy.R
Original file line number Diff line number Diff line change
Expand Up @@ -271,13 +271,9 @@ bake.step_dummy <- function(object, new_data, ...) {
# the original (see the note above)
is_ordered <- attr(levels, "dataClasses") == "ordered"

if (is.null(levels_values)) {
cli::cli_abort("Factor level values not recorded in {.var col_name}.")
}

if (length(levels_values) == 1) {
cli::cli_abort(
"Only one factor level in {.var col_name}: {levels_values}."
"Only one factor level in {.var {col_name}}: {.val {levels_values}}."
)
}

Expand Down
19 changes: 0 additions & 19 deletions R/dummy_multi_choice.R
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,6 @@ prep.step_dummy_multi_choice <- function(x, training, info = NULL, ...) {
col_names <- recipes_eval_select(x$terms, training, info)
check_type(training[, col_names], types = c("nominal", "logical"))

multi_dummy_check_type(training[, col_names])

levels <- purrr::map(training[, col_names], levels)
levels <- vctrs::list_unchop(levels, ptype = character(), name_spec = rlang::zap())
levels <- levels[!is.na(levels)]
Expand All @@ -165,23 +163,6 @@ prep.step_dummy_multi_choice <- function(x, training, info = NULL, ...) {
)
}

# Validate that every column of `dat` is usable by the multi-choice dummy step.
#
# A column is accepted when it is a factor, a character vector, or consists
# entirely of `NA` values. If any column fails, an error is raised through
# `cli::cli_abort()` naming the offending columns.
#
# Arguments:
#   dat:  A data frame (or list of columns) to check.
#   call: Passed on to `cli::cli_abort()` as its `call` argument; defaults to
#         the caller's environment.
#
# Returns (invisibly) a named logical vector with one entry per column of
# `dat`; it is only returned when every entry is `TRUE`.
multi_dummy_check_type <- function(dat, call = rlang::caller_env()) {
  # Each test below yields a single TRUE/FALSE, so the scalar, short-circuiting
  # `||` is the appropriate operator (the all-NA scan is skipped when an
  # earlier test already succeeded).
  is_good <- function(x) {
    is.factor(x) || is.character(x) || all(is.na(x))
  }

  all_good <- vapply(dat, is_good, logical(1))
  if (!all(all_good)) {
    offenders <- names(dat)[!all_good]
    cli::cli_abort(c(
      "x" = "All columns selected for the step should be \\
            factor, character, or NA. The following were not:",
      "*" = "{.var {offenders}}."
    ), call = call)
  }
  invisible(all_good)
}

#' @export
bake.step_dummy_multi_choice <- function(object, new_data, ...) {
col_names <- object$input
Expand Down
15 changes: 10 additions & 5 deletions R/extract_parameter.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,24 @@ extract_parameter_set_dials.recipe <- function(x, ...) {
tuning_param %>% dplyr::select(-tunable),
all_args,
by = c("name", "source", "component", "component_id")
) %>%
mutate(object = purrr::map(call_info, eval_call_info))
)

objects <- list()
for (i in seq_len(nrow(res))) {
objects[[i]] <- eval_call_info(res$call_info[[i]])
}

dials::parameters_constr(
res$name,
res$id,
res$source,
res$component,
res$component_id,
res$object
objects
)
}

eval_call_info <- function(x) {
eval_call_info <- function(x, call) {
if (!is.null(x)) {
# Look for other options
allowed_opts <- c("range", "trans", "values")
Expand All @@ -32,7 +36,8 @@ eval_call_info <- function(x) {
res <- try(rlang::eval_tidy(rlang::call2(x$fun, .ns = x$pkg, !!!opts)), silent = TRUE)
if (inherits(res, "try-error")) {
cli::cli_abort(
"Error when calling {.fn {x$fun}}: {as.character(res)}"
"Error when calling {.fn {x$fun}}: {as.character(res)}",
call = call
)
}
} else {
Expand Down
4 changes: 2 additions & 2 deletions R/impute_bag.R
Original file line number Diff line number Diff line change
Expand Up @@ -248,14 +248,14 @@ bake.step_impute_bag <- function(object, new_data, ...) {
col_names <- names(object$models)
check_new_data(col_names, object, new_data)

missing_rows <- !complete.cases(new_data)
missing_rows <- !vec_detect_complete(new_data)
if (!any(missing_rows)) {
return(new_data)
}

old_data <- new_data
for (col_name in col_names) {
missing_rows <- !complete.cases(new_data[[col_name]])
missing_rows <- !vec_detect_complete(new_data[[col_name]])
if (!any(missing_rows)) {
next
}
Expand Down
6 changes: 3 additions & 3 deletions R/impute_knn.R
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ step_impute_knn <-
if (length(options) > 0) {
if (any(!(opt_nms %in% c("eps", "nthread")))) {
cli::cli_abort(
"Valid values for {.arg options} are {.val eps}, and {.val nthread}."
"Valid values for {.arg options} are {.val eps} and {.val nthread}."
)
}
if (all(opt_nms != "nthread")) {
Expand Down Expand Up @@ -221,7 +221,7 @@ bake.step_impute_knn <- function(object, new_data, ...) {
all_cols <- unique(unlist(object$columns, recursive = TRUE))
check_new_data(all_cols, object, new_data)

missing_rows <- !complete.cases(new_data)
missing_rows <- !vec_detect_complete(new_data)
if (!any(missing_rows)) {
return(new_data)
}
Expand All @@ -230,7 +230,7 @@ bake.step_impute_knn <- function(object, new_data, ...) {

old_data <- new_data
for (col_name in col_names) {
missing_rows <- !complete.cases(new_data[, col_name])
missing_rows <- !vec_detect_complete(new_data[, col_name])
if (!any(missing_rows)) {
next
}
Expand Down
Loading

0 comments on commit 95e49ad

Please sign in to comment.