From 29240b495eb13ba494b966722198c1cd70fb64b7 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Fri, 2 Feb 2024 10:56:15 -0800 Subject: [PATCH 01/11] rewrite fun_calls() to not use recursion --- R/misc.R | 19 ++----------------- R/recipe.R | 10 +++++----- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/R/misc.R b/R/misc.R index 697848f13..34e3e0d4b 100644 --- a/R/misc.R +++ b/R/misc.R @@ -136,25 +136,10 @@ dummy_extract_names <- function(var, lvl, ordinal = FALSE, sep = "_") { nms } - -## As suggested by HW, brought in from the `pryr` package -## https://github.com/hadley/pryr -fun_calls <- function(f) { - if (is.function(f)) { - fun_calls(body(f)) - } else if (is_quosure(f)) { - fun_calls(quo_get_expr(f)) - } else if (is.call(f)) { - fname <- as.character(f[[1]]) - # Calls inside .Internal are special and shouldn't be included - if (identical(fname, ".Internal")) { - return(fname) - } - unique(c(fname, unlist(lapply(f[-1], fun_calls), use.names = FALSE))) - } +fun_calls <- function(f, data) { + setdiff(all.names(f), colnames(data)) } - get_levels <- function(x) { if (!is.factor(x) & !is.character(x)) { return(list(values = NA, ordered = NA)) diff --git a/R/recipe.R b/R/recipe.R index 893f3cc7e..0d2bac6b1 100644 --- a/R/recipe.R +++ b/R/recipe.R @@ -194,7 +194,7 @@ recipe.data.frame <- #' @export recipe.formula <- function(formula, data, ...) { # check for minus: - f_funcs <- fun_calls(formula) + f_funcs <- fun_calls(formula, data) if (any(f_funcs == "-")) { cli::cli_abort(c( "x" = "{.code -} is not allowed in a recipe formula.", @@ -230,7 +230,7 @@ form2args <- function(formula, data, ..., call = rlang::caller_env()) { } ## check for in-line formulas - inline_check(formula) + inline_check(formula, data) if (!is_tibble(data)) { data <- as_tibble(data) @@ -271,9 +271,9 @@ form2args <- function(formula, data, ..., call = rlang::caller_env()) { list(x = data, vars = vars, roles = roles) } -inline_check <- function(x) { - funs <- fun_calls(x) - funs <- funs[!(funs %in% c("~", "+", "-"))] +inline_check <- function(x, data) { + funs <- fun_calls(x, data) + funs <- funs[!(funs %in% c("~", "+", "-", "."))] if (length(funs) > 0) { cli::cli_abort(c( From f2d327390d6008c0b0d95886f20c391bcff41928 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Tue, 12 Mar 2024 09:50:23 -0700 Subject: [PATCH 02/11] update snapshot --- tests/testthat/_snaps/basics.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/_snaps/basics.md b/tests/testthat/_snaps/basics.md index 9f7f9b2e1..f64de60f4 100644 --- a/tests/testthat/_snaps/basics.md +++ b/tests/testthat/_snaps/basics.md @@ -178,6 +178,6 @@ Code recipe(mpg ~ .) Condition - Error in `recipe()`: - ! Argument `data` is missing, with no default. + Error in `recipe.formula()`: + ! argument "data" is missing, with no default From 5fec638956dcd2a2b37020ca10817908d7f1cf35 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Thu, 23 May 2024 16:10:55 -0700 Subject: [PATCH 03/11] check missing data early in recipe.formula --- R/recipe.R | 8 +++++--- tests/testthat/_snaps/basics.md | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/R/recipe.R b/R/recipe.R index 18cab4049..291ada502 100644 --- a/R/recipe.R +++ b/R/recipe.R @@ -193,6 +193,11 @@ recipe.data.frame <- #' @rdname recipe #' @export recipe.formula <- function(formula, data, ...) { + + if (rlang::is_missing(data)) { + cli::cli_abort("Argument {.var data} is missing, with no default.") + } + # check for minus: f_funcs <- fun_calls(formula, data) if (any(f_funcs == "-")) { @@ -202,9 +207,6 @@ recipe.formula <- function(formula, data, ...) { )) } - if (rlang::is_missing(data)) { - cli::cli_abort("Argument {.var data} is missing, with no default.") - } # Check for other in-line functions args <- form2args(formula, data, ...) obj <- recipe.data.frame( diff --git a/tests/testthat/_snaps/basics.md b/tests/testthat/_snaps/basics.md index f64de60f4..9f7f9b2e1 100644 --- a/tests/testthat/_snaps/basics.md +++ b/tests/testthat/_snaps/basics.md @@ -178,6 +178,6 @@ Code recipe(mpg ~ .) Condition - Error in `recipe.formula()`: - ! argument "data" is missing, with no default + Error in `recipe()`: + ! Argument `data` is missing, with no default. From 968849dce06051fc72bfa888a46ecda221a0d63c Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Thu, 23 May 2024 17:05:27 -0700 Subject: [PATCH 04/11] refactor out model.frame from get_rhs_vars() --- R/misc.R | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/R/misc.R b/R/misc.R index 34e3e0d4b..632f60247 100644 --- a/R/misc.R +++ b/R/misc.R @@ -21,13 +21,21 @@ get_rhs_vars <- function(formula, data, no_lhs = FALSE) { ## or should it? what about Y ~ log(x)? ## Answer: when called from `form2args`, the function ## `inline_check` stops when in-line functions are used. - data_info <- attr(model.frame(formula, data[1, ]), "terms") - response_info <- attr(data_info, "response") - predictor_names <- names(attr(data_info, "dataClasses")) - if (length(response_info) > 0 && all(response_info > 0)) { - predictor_names <- predictor_names[-response_info] + + outcomes_names <- all.names(rlang::f_lhs(formula), functions = FALSE) + + formula_rhs <- rlang::f_rhs(formula) + if (identical(formula_rhs, quote(.))) { + predictors_names <- colnames(data) + } else { + predictors_names <- all.names(rlang::f_rhs(formula), functions = FALSE) } - predictor_names + + if (length(predictors_names) > 0 && length(outcomes_names) > 0) { + predictors_names <- setdiff(predictors_names, outcomes_names) + } + + predictors_names } #' Naming Tools From 20477fc813ca6a258b57b53502f52f26288bbd68 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Thu, 23 May 2024 17:07:13 -0700 Subject: [PATCH 05/11] add test for long formulas --- tests/testthat/test-basics.R | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/testthat/test-basics.R b/tests/testthat/test-basics.R index a493447b6..1add39542 100644 --- a/tests/testthat/test-basics.R +++ b/tests/testthat/test-basics.R @@ -346,3 +346,16 @@ test_that("`internal data is kept as tibbles when prepping", { test_that("recipe() errors if `data` is missing", { expect_snapshot(error = TRUE, recipe(mpg ~ .)) }) + + +test_that("recipe() can handle very long formulas (#1283)", { + df <- matrix(1:10000, ncol = 10000) + df <- as.data.frame(df) + names(df) <- c(paste0("x", 1:10000)) + + long_formula <- as.formula(paste("~ ", paste(names(df), collapse = " + "))) + + expect_no_error( + rec <- recipe(long_formula, df) + ) +}) From 4bb8d76c5f50aaaca6761ad8ccbc389b9433b883 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Thu, 23 May 2024 17:11:26 -0700 Subject: [PATCH 06/11] add news --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 42dd2e05d..d2ee2d394 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ * `NA` levels in factors aren't dropped when passed to `recipe()`. (#1291) +* `recipe()` no longer crashes when given long formula expression. (#1283) + # recipes 1.0.10 ## Bug Fixes From cd51ae9eb7004e58b515f60a626101ded29589a6 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Thu, 23 May 2024 17:17:24 -0700 Subject: [PATCH 07/11] update inline_check() --- R/recipe.R | 16 ++++++------ tests/testthat/_snaps/basics.md | 46 ++++++++++++++++++++++++--------- tests/testthat/test-basics.R | 12 +++++++++ 3 files changed, 54 insertions(+), 20 deletions(-) diff --git a/R/recipe.R b/R/recipe.R index 291ada502..19b1f7124 100644 --- a/R/recipe.R +++ b/R/recipe.R @@ -193,11 +193,11 @@ recipe.data.frame <- #' @rdname recipe #' @export recipe.formula <- function(formula, data, ...) { - + if (rlang::is_missing(data)) { cli::cli_abort("Argument {.var data} is missing, with no default.") } - + # check for minus: f_funcs <- fun_calls(formula, data) if (any(f_funcs == "-")) { @@ -232,7 +232,7 @@ form2args <- function(formula, data, ..., call = rlang::caller_env()) { } ## check for in-line formulas - inline_check(formula, data) + inline_check(formula, data, call) if (!is_tibble(data)) { data <- as_tibble(data) @@ -273,20 +273,20 @@ form2args <- function(formula, data, ..., call = rlang::caller_env()) { list(x = data, vars = vars, roles = roles) } -inline_check <- function(x, data) { +inline_check <- function(x, data, call) { funs <- fun_calls(x, data) funs <- funs[!(funs %in% c("~", "+", "-", "."))] if (length(funs) > 0) { cli::cli_abort(c( - x = "No in-line functions should be used here.", - i = "{cli::qty(length(funs))}The following function{?s} {?was/were} \\ - found: {.and {.code {funs}}}.", + x = "misspelled variable name or in-line functions detected.", + i = "{cli::qty(length(funs))}The following function/misspellings{?s} \\ + {?was/were} found: {.and {.code {funs}}}.", i = "Use steps to do transformations instead.", i = "If your modeling engine uses special terms in formulas, pass \\ that formula to workflows as a \\ {.help [model formula](parsnip::model_formula)}." - )) + ), call = call) } invisible(x) diff --git a/tests/testthat/_snaps/basics.md b/tests/testthat/_snaps/basics.md index 9f7f9b2e1..912c49ae6 100644 --- a/tests/testthat/_snaps/basics.md +++ b/tests/testthat/_snaps/basics.md @@ -3,9 +3,9 @@ Code recipe(HHV ~ log(nitrogen), data = biomass) Condition - Error in `inline_check()`: - x No in-line functions should be used here. - i The following function was found: `log`. + Error in `recipe()`: + x misspelled variable name or in-line functions detected. + i The following function/misspellings was found: `log`. i Use steps to do transformations instead. i If your modeling engine uses special terms in formulas, pass that formula to workflows as a model formula (`?parsnip::model_formula()`). @@ -14,9 +14,9 @@ Code recipe(HHV ~ (.)^2, data = biomass) Condition - Error in `inline_check()`: - x No in-line functions should be used here. - i The following functions were found: `^` and `(`. + Error in `recipe()`: + x misspelled variable name or in-line functions detected. + i The following function/misspellingss were found: `^` and `(`. i Use steps to do transformations instead. i If your modeling engine uses special terms in formulas, pass that formula to workflows as a model formula (`?parsnip::model_formula()`). @@ -25,9 +25,9 @@ Code recipe(HHV ~ nitrogen + sulfur + nitrogen:sulfur, data = biomass) Condition - Error in `inline_check()`: - x No in-line functions should be used here. - i The following function was found: `:`. + Error in `recipe()`: + x misspelled variable name or in-line functions detected. + i The following function/misspellings was found: `:`. i Use steps to do transformations instead. i If your modeling engine uses special terms in formulas, pass that formula to workflows as a model formula (`?parsnip::model_formula()`). @@ -36,9 +36,31 @@ Code recipe(HHV ~ nitrogen^2, data = biomass) Condition - Error in `inline_check()`: - x No in-line functions should be used here. - i The following function was found: `^`. + Error in `recipe()`: + x misspelled variable name or in-line functions detected. + i The following function/misspellings was found: `^`. + i Use steps to do transformations instead. + i If your modeling engine uses special terms in formulas, pass that formula to workflows as a model formula (`?parsnip::model_formula()`). + +# Recipe on missspelled variables in formulas + + Code + recipe(HHV ~ not_nitrogen, data = biomass) + Condition + Error in `recipe()`: + x misspelled variable name or in-line functions detected. + i The following function/misspellings was found: `not_nitrogen`. + i Use steps to do transformations instead. + i If your modeling engine uses special terms in formulas, pass that formula to workflows as a model formula (`?parsnip::model_formula()`). + +--- + + Code + recipe(not_HHV ~ nitrogen, data = biomass) + Condition + Error in `recipe()`: + x misspelled variable name or in-line functions detected. + i The following function/misspellings was found: `not_HHV`. i Use steps to do transformations instead. i If your modeling engine uses special terms in formulas, pass that formula to workflows as a model formula (`?parsnip::model_formula()`). diff --git a/tests/testthat/test-basics.R b/tests/testthat/test-basics.R index fc25699a1..38793b082 100644 --- a/tests/testthat/test-basics.R +++ b/tests/testthat/test-basics.R @@ -30,6 +30,18 @@ test_that("Recipe fails on in-line functions", { ) }) +test_that("Recipe on missspelled variables in formulas", { + expect_snapshot( + error = TRUE, + recipe(HHV ~ not_nitrogen, data = biomass) + ) + + expect_snapshot( + error = TRUE, + recipe(not_HHV ~ nitrogen, data = biomass) + ) +}) + test_that("return character or factor values", { raw_recipe <- recipe(HHV ~ ., data = biomass) centered <- raw_recipe %>% From 23f06122542f1d9ee5cde81920e246ccf296f9b2 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Fri, 24 May 2024 09:45:42 -0700 Subject: [PATCH 08/11] Apply suggestions from code review Co-authored-by: Simon P. Couch --- NEWS.md | 2 +- R/recipe.R | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index d2ee2d394..9b1d40d60 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,7 +6,7 @@ * `NA` levels in factors aren't dropped when passed to `recipe()`. (#1291) -* `recipe()` no longer crashes when given long formula expression. (#1283) +* `recipe()` no longer crashes when given long formula expression (#1283). # recipes 1.0.10 diff --git a/R/recipe.R b/R/recipe.R index 19b1f7124..e61f3b0b7 100644 --- a/R/recipe.R +++ b/R/recipe.R @@ -195,7 +195,7 @@ recipe.data.frame <- recipe.formula <- function(formula, data, ...) { if (rlang::is_missing(data)) { - cli::cli_abort("Argument {.var data} is missing, with no default.") + cli::cli_abort("{.arg data} is missing with no default.") } # check for minus: @@ -279,8 +279,8 @@ inline_check <- function(x, data, call) { if (length(funs) > 0) { cli::cli_abort(c( - x = "misspelled variable name or in-line functions detected.", - i = "{cli::qty(length(funs))}The following function/misspellings{?s} \\ + x = "Misspelled variable name or in-line functions detected.", + i = "{cli::qty(length(funs))}The following function{?s}/misspelling{?s} \\ {?was/were} found: {.and {.code {funs}}}.", i = "Use steps to do transformations instead.", i = "If your modeling engine uses special terms in formulas, pass \\ From c18a826280027c0324fc88094caff81448c22acd Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Fri, 24 May 2024 09:53:16 -0700 Subject: [PATCH 09/11] update snapshots --- tests/testthat/_snaps/basics.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/testthat/_snaps/basics.md b/tests/testthat/_snaps/basics.md index 912c49ae6..52036f89a 100644 --- a/tests/testthat/_snaps/basics.md +++ b/tests/testthat/_snaps/basics.md @@ -4,8 +4,8 @@ recipe(HHV ~ log(nitrogen), data = biomass) Condition Error in `recipe()`: - x misspelled variable name or in-line functions detected. - i The following function/misspellings was found: `log`. + x Misspelled variable name or in-line functions detected. + i The following function/misspelling was found: `log`. i Use steps to do transformations instead. i If your modeling engine uses special terms in formulas, pass that formula to workflows as a model formula (`?parsnip::model_formula()`). @@ -15,8 +15,8 @@ recipe(HHV ~ (.)^2, data = biomass) Condition Error in `recipe()`: - x misspelled variable name or in-line functions detected. - i The following function/misspellingss were found: `^` and `(`. + x Misspelled variable name or in-line functions detected. + i The following functions/misspellings were found: `^` and `(`. i Use steps to do transformations instead. i If your modeling engine uses special terms in formulas, pass that formula to workflows as a model formula (`?parsnip::model_formula()`). @@ -26,8 +26,8 @@ recipe(HHV ~ nitrogen + sulfur + nitrogen:sulfur, data = biomass) Condition Error in `recipe()`: - x misspelled variable name or in-line functions detected. - i The following function/misspellings was found: `:`. + x Misspelled variable name or in-line functions detected. + i The following function/misspelling was found: `:`. i Use steps to do transformations instead. i If your modeling engine uses special terms in formulas, pass that formula to workflows as a model formula (`?parsnip::model_formula()`). @@ -37,8 +37,8 @@ recipe(HHV ~ nitrogen^2, data = biomass) Condition Error in `recipe()`: - x misspelled variable name or in-line functions detected. - i The following function/misspellings was found: `^`. + x Misspelled variable name or in-line functions detected. + i The following function/misspelling was found: `^`. i Use steps to do transformations instead. i If your modeling engine uses special terms in formulas, pass that formula to workflows as a model formula (`?parsnip::model_formula()`). @@ -48,8 +48,8 @@ recipe(HHV ~ not_nitrogen, data = biomass) Condition Error in `recipe()`: - x misspelled variable name or in-line functions detected. - i The following function/misspellings was found: `not_nitrogen`. + x Misspelled variable name or in-line functions detected. + i The following function/misspelling was found: `not_nitrogen`. i Use steps to do transformations instead. i If your modeling engine uses special terms in formulas, pass that formula to workflows as a model formula (`?parsnip::model_formula()`). @@ -59,8 +59,8 @@ recipe(not_HHV ~ nitrogen, data = biomass) Condition Error in `recipe()`: - x misspelled variable name or in-line functions detected. - i The following function/misspellings was found: `not_HHV`. + x Misspelled variable name or in-line functions detected. + i The following function/misspelling was found: `not_HHV`. i Use steps to do transformations instead. i If your modeling engine uses special terms in formulas, pass that formula to workflows as a model formula (`?parsnip::model_formula()`). @@ -201,5 +201,5 @@ recipe(mpg ~ .) Condition Error in `recipe()`: - ! Argument `data` is missing, with no default. + ! `data` is missing with no default. From fd694aaa1bffda3b0d95bab393f143afb66b2e2b Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Fri, 24 May 2024 09:54:06 -0700 Subject: [PATCH 10/11] add news --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 9b1d40d60..0edff82bc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,8 @@ * `recipe()` no longer crashes when given long formula expression (#1283). +* `recipe()` will now show better error when columns are misspelled in formula (#1283). + # recipes 1.0.10 ## Bug Fixes From e3b1a5c5016aab8cac0b50f3df8e9b177c36dbb6 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Fri, 24 May 2024 10:06:04 -0700 Subject: [PATCH 11/11] handle weird formula cases --- R/misc.R | 21 +++++++++++++++------ tests/testthat/test-basics.R | 18 ++++++++++++++++++ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/R/misc.R b/R/misc.R index 2668746c7..09574f135 100644 --- a/R/misc.R +++ b/R/misc.R @@ -22,13 +22,22 @@ get_rhs_vars <- function(formula, data, no_lhs = FALSE) { ## Answer: when called from `form2args`, the function ## `inline_check` stops when in-line functions are used. - outcomes_names <- all.names(rlang::f_lhs(formula), functions = FALSE) + outcomes_names <- all.names( + rlang::f_lhs(formula), + functions = FALSE, + unique = TRUE + ) - formula_rhs <- rlang::f_rhs(formula) - if (identical(formula_rhs, quote(.))) { - predictors_names <- colnames(data) - } else { - predictors_names <- all.names(rlang::f_rhs(formula), functions = FALSE) + predictors_names <- all.names( + rlang::f_rhs(formula), + functions = FALSE, + unique = TRUE + ) + + if (any(predictors_names == ".")) { + predictors_names <- predictors_names[predictors_names != "."] + predictors_names <- c(predictors_names, colnames(data)) + predictors_names <- unique(predictors_names) } if (length(predictors_names) > 0 && length(outcomes_names) > 0) { diff --git a/tests/testthat/test-basics.R b/tests/testthat/test-basics.R index 38793b082..817ce3230 100644 --- a/tests/testthat/test-basics.R +++ b/tests/testthat/test-basics.R @@ -382,3 +382,21 @@ test_that("recipe() can handle very long formulas (#1283)", { rec <- recipe(long_formula, df) ) }) + +test_that("recipe() works with odd formula usage (#1283)", { + + expect_identical( + sort(recipe(mpg ~ ., data = mtcars)$var_info$variable), + sort(colnames(mtcars)) + ) + + expect_identical( + sort(recipe(mpg ~ . + disp, data = mtcars)$var_info$variable), + sort(colnames(mtcars)) + ) + + expect_identical( + sort(recipe(mpg ~ disp + disp, mtcars)$var_info$variable), + c("disp", "mpg") + ) +})