From bdb7e40c2f24a7e81565800ecf5fc91fca0ee709 Mon Sep 17 00:00:00 2001 From: Jeremy Winget Date: Thu, 31 Oct 2024 07:35:13 -0500 Subject: [PATCH] Improve error messaging for `step_cut()` with missing data (#1359) * Improve error messaging for `step_cut()` with missing data Fixes #1351 Updated the `create_full_breaks()` function to provide a more informative error message when `step_cut()` is applied to a column containing missing values (`NA`). The function now checks for missing data and warns users appropriately. * Remove redundant error handling for all NA cases, as this is already managed by `recipes_error_context` * Add snapshot tests to verify informative error messages in `step_cut()` * redoc --------- Co-authored-by: topepo --- R/cut.R | 27 ++++++++++++++++++++++++++- man/prep.Rd | 7 ++++--- tests/testthat/test-cut.R | 25 ++++++++++++++++++++++++- 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/R/cut.R b/R/cut.R index 6a709c804..d78f857c8 100644 --- a/R/cut.R +++ b/R/cut.R @@ -139,13 +139,38 @@ prep.step_cut <- function(x, training, info = NULL, ...) { ) } -create_full_breaks <- function(var, breaks) { +create_full_breaks <- function(var, breaks, call = rlang::caller_env()) { + if (!is.numeric(var)) { + cli::cli_abort( + "{.arg var} must be a numeric vector, not {.obj_type_friendly {var}}.", + call = call + ) + } + + if (!is.numeric(breaks)) { + cli::cli_abort( + "{.arg breaks} must be a numeric vector, not {.obj_type_friendly {breaks}}.", + call = call + ) + } + + if (any(is.na(var))) { + cli::cli_warn( + "{.arg var} contains missing values. These will be ignored in break + calculations.", + call = call + ) + var <- var[!is.na(var)] + } + if (min(var) < min(breaks)) { breaks <- c(min(var), breaks) } + if (max(var) > max(breaks)) { breaks <- c(max(var), breaks) } + sort(breaks) } diff --git a/man/prep.Rd b/man/prep.Rd index 204a1c6e4..4385d4b40 100644 --- a/man/prep.Rd +++ b/man/prep.Rd @@ -47,9 +47,10 @@ since it does not take environments into account.} \item{log_changes}{A logical for printing a summary for each step regarding which (if any) columns were added or removed during training.} -\item{strings_as_factors}{A logical: should character columns be converted to -factors? This affects the preprocessed training set (when -\code{retain = TRUE}) as well as the results of \code{bake.recipe}.} +\item{strings_as_factors}{A logical: should character columns that have role +"predictor" or "outcome" be converted to factors? This affects the +preprocessed training set (when \code{retain = TRUE}) as well as the results of +\code{bake.recipe}.} } \value{ A recipe whose step objects have been updated with the required diff --git a/tests/testthat/test-cut.R b/tests/testthat/test-cut.R index 5d95c7a4b..f3e38a977 100644 --- a/tests/testthat/test-cut.R +++ b/tests/testthat/test-cut.R @@ -167,6 +167,28 @@ test_that("tidy method works", { ) }) +test_that("step_cut() provides informative error on missing values", { + # Single missing value + mtcars_with_na <- mtcars + mtcars_with_na[1, "mpg"] <- NA + + expect_warning( + recipe(~ ., data = mtcars_with_na) %>% + step_cut(mpg, breaks = 20) %>% + prep() + ) + + # Multiple missing values + mtcars_with_nas <- mtcars + mtcars_with_nas[c(1, 3, 5), "mpg"] <- NA + + expect_warning( + recipe(~ ., data = mtcars_with_nas) %>% + step_cut(mpg, breaks = 20) %>% + prep() + ) +}) + test_that("breaks argument are type checked", { expect_snapshot( error = TRUE, @@ -174,7 +196,7 @@ test_that("breaks argument are type checked", { step_cut(disp, hp, breaks = TRUE) %>% prep() ) - + expect_snapshot( error = TRUE, recipe(~., data = mtcars) %>% @@ -183,6 +205,7 @@ test_that("breaks argument are type checked", { ) }) + # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", {