From b5e83825601f4a8a9edbeec0419e49c57d80c153 Mon Sep 17 00:00:00 2001 From: Brian Williamson Date: Wed, 13 Dec 2023 16:15:25 -0800 Subject: [PATCH] Update README, other minor changes --- .Rbuildignore | 34 +- CRAN-SUBMISSION | 5 +- DESCRIPTION | 136 ++++---- R/est_predictiveness_cv.R | 348 +++++++++---------- R/vimp_accuracy.R | 118 +++---- R/vimp_anova.R | 158 ++++----- R/vimp_ci.R | 128 +++---- README.md | 40 +-- cran-comments.md | 36 +- tests/testthat/test-merge_vim.R | 170 ++++----- tests/testthat/test-vim.R | 274 +++++++-------- vignettes/precomputed-regressions.Rmd | 474 +++++++++++++------------- 12 files changed, 955 insertions(+), 966 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 4c5f46c..8e9d034 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,17 +1,17 @@ -^.*\.Rproj$ -^\.Rproj\.user$ -^appveyor\.yml$ -^\.travis\.yml$ -^codecov\.yml$ -cran-comments.md -LICENSE.md -README.md -^CRAN-RELEASE$ -^_pkgdown\.yml$ -^docs$ -^pkgdown$ -docs/* -^\.github$ -^doc$ -^Meta$ -^CRAN-SUBMISSION$ +^.*\.Rproj$ +^\.Rproj\.user$ +^appveyor\.yml$ +^\.travis\.yml$ +^codecov\.yml$ +cran-comments.md +LICENSE.md +README.md +^CRAN-RELEASE$ +^_pkgdown\.yml$ +^docs$ +^pkgdown$ +docs/* +^\.github$ +^doc$ +^Meta$ +^CRAN-SUBMISSION$ diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION index 4b409d4..c3f7299 100644 --- a/CRAN-SUBMISSION +++ b/CRAN-SUBMISSION @@ -1,3 +1,2 @@ -Version: 2.3.1 -Date: 2022-12-09 16:45:46 UTC -SHA: 99c427465eddd318782f6b583edadf36e2277b99 +Version: 2.3.3 +Date: 2023-08-28 20:46:13 UTC diff --git a/DESCRIPTION b/DESCRIPTION index 64f4dfb..426f354 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,68 +1,68 @@ -Package: vimp -Type: Package -Title: Perform Inference on Algorithm-Agnostic Variable Importance -Version: 2.3.3 -Authors@R: - c(person(given = "Brian D.", - family = "Williamson", - role = c("aut", "cre"), - email = "brian.d.williamson@kp.org", - comment = c(ORCID = "0000-0002-7024-548X")), - person(given = "Jean", - family = "Feng", - role = "ctb"), - person(given = "Charlie", - family = "Wolock", - role = "ctb"), - person(given = "Noah", - family = "Simon", - role = "ths", - comment = c(ORCID = "0000-0002-8985-2474")), - person(given = "Marco", - family = "Carone", - role = "ths", - comment = c(ORCID = "0000-0003-2106-0953"))) -Description: Calculate point estimates of and valid confidence intervals for - nonparametric, algorithm-agnostic variable importance measures in high and low dimensions, - using flexible estimators of the underlying regression functions. For more information - about the methods, please see Williamson et al. (Biometrics, 2020), Williamson et al. (JASA, 2021), and Williamson and Feng (ICML, 2020). 
-Depends: - R (>= 3.1.0) -Imports: - SuperLearner, - stats, - dplyr, - magrittr, - ROCR, - tibble, - rlang, - MASS, - boot, - data.table -Suggests: - knitr, - rmarkdown, - gam, - xgboost, - glmnet, - ranger, - polspline, - quadprog, - covr, - testthat, - ggplot2, - cowplot, - cvAUC, - tidyselect, - WeightedROC, - purrr, - boot -License: MIT + file LICENSE -URL: https://bdwilliamson.github.io/vimp/, - https://github.com/bdwilliamson/vimp, - http://bdwilliamson.github.io/vimp/ -BugReports: https://github.com/bdwilliamson/vimp/issues -RoxygenNote: 7.2.3 -VignetteBuilder: knitr -LazyData: true -Encoding: UTF-8 +Package: vimp +Type: Package +Title: Perform Inference on Algorithm-Agnostic Variable Importance +Version: 2.3.3 +Authors@R: + c(person(given = "Brian D.", + family = "Williamson", + role = c("aut", "cre"), + email = "brian.d.williamson@kp.org", + comment = c(ORCID = "0000-0002-7024-548X")), + person(given = "Jean", + family = "Feng", + role = "ctb"), + person(given = "Charlie", + family = "Wolock", + role = "ctb"), + person(given = "Noah", + family = "Simon", + role = "ths", + comment = c(ORCID = "0000-0002-8985-2474")), + person(given = "Marco", + family = "Carone", + role = "ths", + comment = c(ORCID = "0000-0003-2106-0953"))) +Description: Calculate point estimates of and valid confidence intervals for + nonparametric, algorithm-agnostic variable importance measures in high and low dimensions, + using flexible estimators of the underlying regression functions. For more information + about the methods, please see Williamson et al. (Biometrics, 2020), Williamson et al. (JASA, 2021), and Williamson and Feng (ICML, 2020). +Depends: + R (>= 3.1.0) +Imports: + SuperLearner, + stats, + dplyr, + magrittr, + ROCR, + tibble, + rlang, + MASS, + boot, + data.table +Suggests: + knitr, + rmarkdown, + gam, + xgboost, + glmnet, + ranger, + polspline, + quadprog, + covr, + testthat, + ggplot2, + cowplot, + cvAUC, + tidyselect, + WeightedROC, + purrr, + boot +License: MIT + file LICENSE +URL: https://bdwilliamson.github.io/vimp/, + https://github.com/bdwilliamson/vimp, + http://bdwilliamson.github.io/vimp/ +BugReports: https://github.com/bdwilliamson/vimp/issues +RoxygenNote: 7.2.3 +VignetteBuilder: knitr +LazyData: true +Encoding: UTF-8 diff --git a/R/est_predictiveness_cv.R b/R/est_predictiveness_cv.R index e5b078e..c0d490f 100644 --- a/R/est_predictiveness_cv.R +++ b/R/est_predictiveness_cv.R @@ -1,174 +1,174 @@ -#' Estimate a nonparametric predictiveness functional using cross-fitting -#' -#' Compute nonparametric estimates of the chosen measure of predictiveness. -#' -#' @param fitted_values fitted values from a regression function using the -#' observed data; a list of length V, where each object is a set of -#' predictions on the validation data, or a vector of the same length as \code{y}. -#' @param y the observed outcome. -#' @param full_y the observed outcome (from the entire dataset, for -#' cross-fitted estimates). -#' @param folds the cross-validation folds for the observed data. -#' @param type which parameter are you estimating (defaults to \code{r_squared}, -#' for R-squared-based variable importance)? -#' @param C the indicator of coarsening (1 denotes observed, 0 denotes -#' unobserved). -#' @param Z either \code{NULL} (if no coarsening) or a matrix-like object -#' containing the fully observed data. -#' @param folds_Z either the cross-validation folds for the observed data -#' (no coarsening) or a vector of folds for the fully observed data Z. 
-#' @param ipc_weights weights for inverse probability of coarsening (e.g., -#' inverse weights from a two-phase sample) weighted estimation. Assumed to be -#' already inverted (i.e., ipc_weights = 1 / [estimated probability weights]). -#' @param ipc_fit_type if "external", then use \code{ipc_eif_preds}; if "SL", -#' fit a SuperLearner to determine the correction to the efficient -#' influence function. -#' @param ipc_eif_preds if \code{ipc_fit_type = "external"}, the fitted values -#' from a regression of the full-data EIF on the fully observed -#' covariates/outcome; otherwise, not used. -#' @param ipc_est_type IPC correction, either \code{"ipw"} (for classical -#' inverse probability weighting) or \code{"aipw"} (for augmented inverse -#' probability weighting; the default). -#' @param scale if doing an IPC correction, then the scale that the correction -#' should be computed on (e.g., "identity"; or "logit" to logit-transform, -#' apply the correction, and back-transform). -#' @param na.rm logical; should NA's be removed in computation? -#' (defaults to \code{FALSE}) -#' @param ... other arguments to SuperLearner, if \code{ipc_fit_type = "SL"}. -#' -#' @return The estimated measure of predictiveness. -#' -#' @details See the paper by Williamson, Gilbert, Simon, and Carone for more -#' details on the mathematics behind this function and the definition of the -#' parameter of interest. If sample-splitting is also requested -#' (recommended, since in this case inferences -#' will be valid even if the variable has zero true importance), then the -#' prediction functions are trained as if \eqn{2K}-fold cross-validation were run, -#' but are evaluated on only \eqn{K} sets (independent between the full and -#' reduced nuisance regression). -#' @export -est_predictiveness_cv <- function(fitted_values, y, full_y = NULL, - folds, - type = "r_squared", - C = rep(1, length(y)), Z = NULL, - folds_Z = folds, - ipc_weights = rep(1, length(C)), - ipc_fit_type = "external", - ipc_eif_preds = rep(1, length(C)), - ipc_est_type = "aipw", scale = "identity", - na.rm = FALSE, ...) { - # get the correct measure function; if not one of the supported ones, say so - types <- c("accuracy", "auc", "deviance", "r_squared", "anova", "mse", - "cross_entropy") - full_type <- types[pmatch(type, types)] - if (is.na(full_type)) stop( - paste0("We currently do not support the entered variable importance ", - "parameter.") - ) - measure_funcs <- c(measure_accuracy, measure_auc, measure_cross_entropy, - measure_mse, NA, measure_mse, measure_cross_entropy) - measure_func <- measure_funcs[pmatch(type, types)] - if (is.list(fitted_values)) { - fitted_values_vector <- vector("numeric", length = length(y)) - for (v in seq_len(length(unique(folds)))) { - fitted_values_vector[folds == v] <- fitted_values[[v]] - } - fitted_values <- fitted_values_vector - } - - # compute point estimate, EIF - if (!is.na(measure_func)) { - V <- length(unique(folds)) - max_nrow <- max(sapply(1:V, function(v) length(C[folds_Z == v]))) - ics <- vector("list", length = V) - ic <- vector("numeric", length = length(C)) - measures <- vector("list", V) - for (v in seq_len(V)) { - measures[[v]] <- measure_func[[1]]( - fitted_values = fitted_values[folds == v], y = y[folds == v], - full_y = full_y, - C = C[folds_Z == v], Z = Z[folds_Z == v, , drop = FALSE], - ipc_weights = ipc_weights[folds_Z == v], - ipc_fit_type = ipc_fit_type, - ipc_eif_preds = ipc_eif_preds[folds_Z == v], - ipc_est_type = ipc_est_type, scale = scale, na.rm = na.rm, ... 
- ) - ics[[v]] <- measures[[v]]$eif - ic[folds_Z == v] <- measures[[v]]$eif - } - point_ests <- sapply(1:V, function(v) measures[[v]]$point_est) - point_est <- mean(point_ests) - ipc_eif_preds <- sapply( - 1:V, function(v) measures[[v]]$ipc_eif_preds, simplify = FALSE - ) - } else { # if type is anova, no plug-in from predictiveness - point_est <- point_ests <- NA - ic <- rep(NA, length(y)) - } - # check whether or not we need to do ipc weighting -- if y is fully observed, - # i.e., is part of Z, then we don't - do_ipcw <- as.numeric(!all(ipc_weights == 1)) - if (is.null(full_y)) { - mn_y <- mean(y, na.rm = na.rm) - } else { - mn_y <- mean(full_y, na.rm = na.rm) - } - # if full_type is "r_squared" or "deviance", post-hoc computing from "mse" - # or "cross_entropy" - if (full_type == "r_squared") { - var <- measure_mse( - fitted_values = rep(mn_y, length(y)), y, - C = switch(do_ipcw + 1, rep(1, length(C)), C), Z = Z, - ipc_weights = ipc_weights, - ipc_fit_type = switch(do_ipcw + 1, ipc_fit_type, "SL"), ipc_eif_preds, - ipc_est_type = ipc_est_type, scale = "identity", na.rm = na.rm, ... - ) - ic <- (-1) * as.vector( - matrix(c(1 / var$point_est, - -point_est / (var$point_est ^ 2)), - nrow = 1) %*% t(cbind(ic, var$eif)) - ) - tmp_ics <- vector("list", length = V) - for (v in 1:V) { - tmp_ics[[v]] <- (-1) * as.vector( - matrix(c(1 / var$point_est, - -point_ests[v] / (var$point_est ^ 2)), - nrow = 1) %*% t(cbind(ics[[v]], - var$eif[folds_Z == v])) - ) - } - point_ests <- 1 - point_ests/var$point_est - point_est <- mean(point_ests) - ics <- tmp_ics - } - if (full_type == "deviance") { - denom <- measure_cross_entropy( - fitted_values = rep(mn_y, length(y)), y, - C = switch(do_ipcw + 1, rep(1, length(C)), C), Z = Z, - ipc_weights = ipc_weights, - ipc_fit_type = switch(do_ipcw + 1, ipc_fit_type, "SL"), - ipc_eif_preds, ipc_est_type = ipc_est_type, - scale = "identity", na.rm = na.rm, ... - ) - ic <- (-1) * as.vector( - matrix(c(1 / denom$point_est, - -point_est / (denom$point_est ^ 2)), - nrow = 1) %*% t(cbind(ic, denom$eif)) - ) - tmp_ics <- vector("list", length = V) - for (v in 1:V) { - tmp_ics[[v]] <- (-1) * as.vector( - matrix(c(1 / denom$point_est, - -point_ests[v] / (denom$point_est ^ 2)), - nrow = 1) %*% t(cbind(ics[[v]], - denom$eif[folds_Z == v])) - ) - } - point_ests <- 1 - point_ests / denom$point_est - point_est <- mean(point_ests) - ics <- tmp_ics - } - # return it - return(list(point_est = point_est, all_ests = point_ests, eif = ic, - all_eifs = ics, ipc_eif_preds = ipc_eif_preds)) -} +#' Estimate a nonparametric predictiveness functional using cross-fitting +#' +#' Compute nonparametric estimates of the chosen measure of predictiveness. +#' +#' @param fitted_values fitted values from a regression function using the +#' observed data; a list of length V, where each object is a set of +#' predictions on the validation data, or a vector of the same length as \code{y}. +#' @param y the observed outcome. +#' @param full_y the observed outcome (from the entire dataset, for +#' cross-fitted estimates). +#' @param folds the cross-validation folds for the observed data. +#' @param type which parameter are you estimating (defaults to \code{r_squared}, +#' for R-squared-based variable importance)? +#' @param C the indicator of coarsening (1 denotes observed, 0 denotes +#' unobserved). +#' @param Z either \code{NULL} (if no coarsening) or a matrix-like object +#' containing the fully observed data. 
+#' @param folds_Z either the cross-validation folds for the observed data +#' (no coarsening) or a vector of folds for the fully observed data Z. +#' @param ipc_weights weights for inverse probability of coarsening (e.g., +#' inverse weights from a two-phase sample) weighted estimation. Assumed to be +#' already inverted (i.e., ipc_weights = 1 / [estimated probability weights]). +#' @param ipc_fit_type if "external", then use \code{ipc_eif_preds}; if "SL", +#' fit a SuperLearner to determine the correction to the efficient +#' influence function. +#' @param ipc_eif_preds if \code{ipc_fit_type = "external"}, the fitted values +#' from a regression of the full-data EIF on the fully observed +#' covariates/outcome; otherwise, not used. +#' @param ipc_est_type IPC correction, either \code{"ipw"} (for classical +#' inverse probability weighting) or \code{"aipw"} (for augmented inverse +#' probability weighting; the default). +#' @param scale if doing an IPC correction, then the scale that the correction +#' should be computed on (e.g., "identity"; or "logit" to logit-transform, +#' apply the correction, and back-transform). +#' @param na.rm logical; should NA's be removed in computation? +#' (defaults to \code{FALSE}) +#' @param ... other arguments to SuperLearner, if \code{ipc_fit_type = "SL"}. +#' +#' @return The estimated measure of predictiveness. +#' +#' @details See the paper by Williamson, Gilbert, Simon, and Carone for more +#' details on the mathematics behind this function and the definition of the +#' parameter of interest. If sample-splitting is also requested +#' (recommended, since in this case inferences +#' will be valid even if the variable has zero true importance), then the +#' prediction functions are trained as if \eqn{2K}-fold cross-validation were run, +#' but are evaluated on only \eqn{K} sets (independent between the full and +#' reduced nuisance regression). +#' @export +est_predictiveness_cv <- function(fitted_values, y, full_y = NULL, + folds, + type = "r_squared", + C = rep(1, length(y)), Z = NULL, + folds_Z = folds, + ipc_weights = rep(1, length(C)), + ipc_fit_type = "external", + ipc_eif_preds = rep(1, length(C)), + ipc_est_type = "aipw", scale = "identity", + na.rm = FALSE, ...) 
{ + # get the correct measure function; if not one of the supported ones, say so + types <- c("accuracy", "auc", "deviance", "r_squared", "anova", "mse", + "cross_entropy") + full_type <- types[pmatch(type, types)] + if (is.na(full_type)) stop( + paste0("We currently do not support the entered variable importance ", + "parameter.") + ) + measure_funcs <- c(measure_accuracy, measure_auc, measure_cross_entropy, + measure_mse, NA, measure_mse, measure_cross_entropy) + measure_func <- measure_funcs[pmatch(type, types)] + if (is.list(fitted_values)) { + fitted_values_vector <- vector("numeric", length = length(y)) + for (v in seq_len(length(unique(folds)))) { + fitted_values_vector[folds == v] <- fitted_values[[v]] + } + fitted_values <- fitted_values_vector + } + + # compute point estimate, EIF + if (!is.na(measure_func)) { + V <- length(unique(folds)) + max_nrow <- max(sapply(1:V, function(v) length(C[folds_Z == v]))) + ics <- vector("list", length = V) + ic <- vector("numeric", length = length(C)) + measures <- vector("list", V) + for (v in seq_len(V)) { + measures[[v]] <- measure_func[[1]]( + fitted_values = fitted_values[folds == v], y = y[folds == v], + full_y = full_y, + C = C[folds_Z == v], Z = Z[folds_Z == v, , drop = FALSE], + ipc_weights = ipc_weights[folds_Z == v], + ipc_fit_type = ipc_fit_type, + ipc_eif_preds = ipc_eif_preds[folds_Z == v], + ipc_est_type = ipc_est_type, scale = scale, na.rm = na.rm, ... + ) + ics[[v]] <- measures[[v]]$eif + ic[folds_Z == v] <- measures[[v]]$eif + } + point_ests <- sapply(1:V, function(v) measures[[v]]$point_est) + point_est <- mean(point_ests) + ipc_eif_preds <- sapply( + 1:V, function(v) measures[[v]]$ipc_eif_preds, simplify = FALSE + ) + } else { # if type is anova, no plug-in from predictiveness + point_est <- point_ests <- NA + ic <- rep(NA, length(y)) + } + # check whether or not we need to do ipc weighting -- if y is fully observed, + # i.e., is part of Z, then we don't + do_ipcw <- as.numeric(!all(ipc_weights == 1)) + if (is.null(full_y)) { + mn_y <- mean(y, na.rm = na.rm) + } else { + mn_y <- mean(full_y, na.rm = na.rm) + } + # if full_type is "r_squared" or "deviance", post-hoc computing from "mse" + # or "cross_entropy" + if (full_type == "r_squared") { + var <- measure_mse( + fitted_values = rep(mn_y, length(y)), y, + C = switch(do_ipcw + 1, rep(1, length(C)), C), Z = Z, + ipc_weights = ipc_weights, + ipc_fit_type = switch(do_ipcw + 1, ipc_fit_type, "SL"), ipc_eif_preds, + ipc_est_type = ipc_est_type, scale = "identity", na.rm = na.rm, ... + ) + ic <- (-1) * as.vector( + matrix(c(1 / var$point_est, + -point_est / (var$point_est ^ 2)), + nrow = 1) %*% t(cbind(ic, var$eif)) + ) + tmp_ics <- vector("list", length = V) + for (v in 1:V) { + tmp_ics[[v]] <- (-1) * as.vector( + matrix(c(1 / var$point_est, + -point_ests[v] / (var$point_est ^ 2)), + nrow = 1) %*% t(cbind(ics[[v]], + var$eif[folds_Z == v])) + ) + } + point_ests <- 1 - point_ests/var$point_est + point_est <- mean(point_ests) + ics <- tmp_ics + } + if (full_type == "deviance") { + denom <- measure_cross_entropy( + fitted_values = rep(mn_y, length(y)), y, + C = switch(do_ipcw + 1, rep(1, length(C)), C), Z = Z, + ipc_weights = ipc_weights, + ipc_fit_type = switch(do_ipcw + 1, ipc_fit_type, "SL"), + ipc_eif_preds, ipc_est_type = ipc_est_type, + scale = "identity", na.rm = na.rm, ... 
+ ) + ic <- (-1) * as.vector( + matrix(c(1 / denom$point_est, + -point_est / (denom$point_est ^ 2)), + nrow = 1) %*% t(cbind(ic, denom$eif)) + ) + tmp_ics <- vector("list", length = V) + for (v in 1:V) { + tmp_ics[[v]] <- (-1) * as.vector( + matrix(c(1 / denom$point_est, + -point_ests[v] / (denom$point_est ^ 2)), + nrow = 1) %*% t(cbind(ics[[v]], + denom$eif[folds_Z == v])) + ) + } + point_ests <- 1 - point_ests / denom$point_est + point_est <- mean(point_ests) + ics <- tmp_ics + } + # return it + return(list(point_est = point_est, all_ests = point_ests, eif = ic, + all_eifs = ics, ipc_eif_preds = ipc_eif_preds)) +} diff --git a/R/vimp_accuracy.R b/R/vimp_accuracy.R index 9d7c4fe..e1789dd 100644 --- a/R/vimp_accuracy.R +++ b/R/vimp_accuracy.R @@ -1,59 +1,59 @@ -#' Nonparametric Intrinsic Variable Importance Estimates: Classification accuracy -#' -#' Compute estimates of and confidence intervals for nonparametric -#' difference in classification accuracy-based intrinsic variable importance. -#' This is a wrapper function for \code{cv_vim}, with \code{type = "accuracy"}. -#' -#' @inheritParams cv_vim -#' -#' @return An object of classes \code{vim} and \code{vim_accuracy}. -#' See Details for more information. -#' -#' @inherit cv_vim details -#' -#' @examples -#' # generate the data -#' # generate X -#' p <- 2 -#' n <- 100 -#' x <- data.frame(replicate(p, stats::runif(n, -1, 1))) -#' -#' # apply the function to the x's -#' f <- function(x) 0.5 + 0.3*x[1] + 0.2*x[2] -#' smooth <- apply(x, 1, function(z) f(z)) -#' -#' # generate Y ~ Normal (smooth, 1) -#' y <- matrix(rbinom(n, size = 1, prob = smooth)) -#' -#' # set up a library for SuperLearner; note simple library for speed -#' library("SuperLearner") -#' learners <- c("SL.glm", "SL.mean") -#' -#' # estimate (with a small number of folds, for illustration only) -#' est <- vimp_accuracy(y, x, indx = 2, -#' alpha = 0.05, run_regression = TRUE, -#' SL.library = learners, V = 2, cvControl = list(V = 2)) -#' -#' @seealso \code{\link[SuperLearner]{SuperLearner}} for specific usage of the \code{SuperLearner} function and package. -#' @export -vimp_accuracy <- function(Y = NULL, X = NULL, cross_fitted_f1 = NULL, - cross_fitted_f2 = NULL, f1 = NULL, f2 = NULL, indx = 1, - V = 10, run_regression = TRUE, - SL.library = c("SL.glmnet", "SL.xgboost", "SL.mean"), - alpha = 0.05, delta = 0, na.rm = FALSE, - final_point_estimate = "split", - cross_fitting_folds = NULL, sample_splitting_folds = NULL, - stratified = TRUE, C = rep(1, length(Y)), Z = NULL, - ipc_weights = rep(1, length(Y)), scale = "logit", - ipc_est_type = "aipw", scale_est = TRUE, - cross_fitted_se = TRUE, ...) { - cv_vim(type = "accuracy", Y = Y, X = X, cross_fitted_f1 = cross_fitted_f1, - cross_fitted_f2 = cross_fitted_f2, f1 = f1, f2 = f2, indx = indx, - V = V, run_regression = run_regression, SL.library = SL.library, - alpha = alpha, delta = delta, na.rm = na.rm, stratified = stratified, - final_point_estimate = final_point_estimate, - cross_fitting_folds = cross_fitting_folds, ipc_weights = ipc_weights, - sample_splitting_folds = sample_splitting_folds, C = C, Z = Z, - scale = scale, ipc_est_type = ipc_est_type, scale_est = scale_est, - cross_fitted_se = cross_fitted_se, ...) -} +#' Nonparametric Intrinsic Variable Importance Estimates: Classification accuracy +#' +#' Compute estimates of and confidence intervals for nonparametric +#' difference in classification accuracy-based intrinsic variable importance. +#' This is a wrapper function for \code{cv_vim}, with \code{type = "accuracy"}. 
+#' +#' @inheritParams cv_vim +#' +#' @return An object of classes \code{vim} and \code{vim_accuracy}. +#' See Details for more information. +#' +#' @inherit cv_vim details +#' +#' @examples +#' # generate the data +#' # generate X +#' p <- 2 +#' n <- 100 +#' x <- data.frame(replicate(p, stats::runif(n, -1, 1))) +#' +#' # apply the function to the x's +#' f <- function(x) 0.5 + 0.3*x[1] + 0.2*x[2] +#' smooth <- apply(x, 1, function(z) f(z)) +#' +#' # generate Y ~ Normal (smooth, 1) +#' y <- matrix(rbinom(n, size = 1, prob = smooth)) +#' +#' # set up a library for SuperLearner; note simple library for speed +#' library("SuperLearner") +#' learners <- c("SL.glm", "SL.mean") +#' +#' # estimate (with a small number of folds, for illustration only) +#' est <- vimp_accuracy(y, x, indx = 2, +#' alpha = 0.05, run_regression = TRUE, +#' SL.library = learners, V = 2, cvControl = list(V = 2)) +#' +#' @seealso \code{\link[SuperLearner]{SuperLearner}} for specific usage of the \code{SuperLearner} function and package. +#' @export +vimp_accuracy <- function(Y = NULL, X = NULL, cross_fitted_f1 = NULL, + cross_fitted_f2 = NULL, f1 = NULL, f2 = NULL, indx = 1, + V = 10, run_regression = TRUE, + SL.library = c("SL.glmnet", "SL.xgboost", "SL.mean"), + alpha = 0.05, delta = 0, na.rm = FALSE, + final_point_estimate = "split", + cross_fitting_folds = NULL, sample_splitting_folds = NULL, + stratified = TRUE, C = rep(1, length(Y)), Z = NULL, + ipc_weights = rep(1, length(Y)), scale = "logit", + ipc_est_type = "aipw", scale_est = TRUE, + cross_fitted_se = TRUE, ...) { + cv_vim(type = "accuracy", Y = Y, X = X, cross_fitted_f1 = cross_fitted_f1, + cross_fitted_f2 = cross_fitted_f2, f1 = f1, f2 = f2, indx = indx, + V = V, run_regression = run_regression, SL.library = SL.library, + alpha = alpha, delta = delta, na.rm = na.rm, stratified = stratified, + final_point_estimate = final_point_estimate, + cross_fitting_folds = cross_fitting_folds, ipc_weights = ipc_weights, + sample_splitting_folds = sample_splitting_folds, C = C, Z = Z, + scale = scale, ipc_est_type = ipc_est_type, scale_est = scale_est, + cross_fitted_se = cross_fitted_se, ...) +} diff --git a/R/vimp_anova.R b/R/vimp_anova.R index dc6ca7b..8e249e1 100644 --- a/R/vimp_anova.R +++ b/R/vimp_anova.R @@ -1,79 +1,79 @@ -#' Nonparametric Intrinsic Variable Importance Estimates: ANOVA -#' -#' Compute estimates of and confidence intervals for nonparametric ANOVA-based -#' intrinsic variable importance. This is a wrapper function for \code{cv_vim}, -#' with \code{type = "anova"}. This type -#' has limited functionality compared to other -#' types; in particular, null hypothesis tests -#' are not possible using \code{type = "anova"}. -#' If you want to do null hypothesis testing -#' on an equivalent population parameter, use -#' \code{vimp_rsquared} instead. -#' -#' @inheritParams cv_vim -#' -#' @return An object of classes \code{vim} and \code{vim_anova}. -#' See Details for more information. -#' -#' @details We define the population ANOVA -#' parameter for the group of features (or single feature) \eqn{s} by -#' \deqn{\psi_{0,s} := E_0\{f_0(X) - f_{0,s}(X)\}^2/var_0(Y),} -#' where \eqn{f_0} is the population conditional mean using all features, -#' \eqn{f_{0,s}} is the population conditional mean using the features with -#' index not in \eqn{s}, and \eqn{E_0} and \eqn{var_0} denote expectation and -#' variance under the true data-generating distribution, respectively. 
-#' -#' Cross-fitted ANOVA estimates are computed by first -#' splitting the data into \eqn{K} folds; then using each fold in turn as a -#' hold-out set, constructing estimators \eqn{f_{n,k}} and \eqn{f_{n,k,s}} of -#' \eqn{f_0} and \eqn{f_{0,s}}, respectively on the training data and estimator -#' \eqn{E_{n,k}} of \eqn{E_0} using the test data; and finally, computing -#' \deqn{\psi_{n,s} := K^{(-1)}\sum_{k=1}^K E_{n,k}\{f_{n,k}(X) - f_{n,k,s}(X)\}^2/var_n(Y),} -#' where \eqn{var_n} is the empirical variance. -#' See the paper by Williamson, Gilbert, Simon, and Carone for more -#' details on the mathematics behind this function. -#' -#' @examples -#' # generate the data -#' # generate X -#' p <- 2 -#' n <- 100 -#' x <- data.frame(replicate(p, stats::runif(n, -5, 5))) -#' -#' # apply the function to the x's -#' smooth <- (x[,1]/5)^2*(x[,1]+7)/5 + (x[,2]/3)^2 -#' -#' # generate Y ~ Normal (smooth, 1) -#' y <- smooth + stats::rnorm(n, 0, 1) -#' -#' # set up a library for SuperLearner; note simple library for speed -#' library("SuperLearner") -#' learners <- c("SL.glm", "SL.mean") -#' -#' # estimate (with a small number of folds, for illustration only) -#' est <- vimp_anova(y, x, indx = 2, -#' alpha = 0.05, run_regression = TRUE, -#' SL.library = learners, V = 2, cvControl = list(V = 2)) -#' -#' @seealso \code{\link[SuperLearner]{SuperLearner}} for specific usage of the -#' \code{SuperLearner} function and package. -#' @export -vimp_anova <- function(Y = NULL, X = NULL, cross_fitted_f1 = NULL, - cross_fitted_f2 = NULL, indx = 1, - V = 10, run_regression = TRUE, - SL.library = c("SL.glmnet", "SL.xgboost", "SL.mean"), - alpha = 0.05, delta = 0, na.rm = FALSE, - cross_fitting_folds = NULL, - stratified = FALSE, C = rep(1, length(Y)), Z = NULL, - ipc_weights = rep(1, length(Y)), scale = "logit", - ipc_est_type = "aipw", scale_est = TRUE, - cross_fitted_se = TRUE, ...) { - cv_vim(type = "anova", Y = Y, X = X, cross_fitted_f1 = cross_fitted_f1, - cross_fitted_f2 = cross_fitted_f2, f1 = NULL, f2 = NULL, indx = indx, - V = V, run_regression = run_regression, SL.library = SL.library, - alpha = alpha, delta = delta, na.rm = na.rm, stratified = stratified, - cross_fitting_folds = cross_fitting_folds, ipc_weights = ipc_weights, - sample_splitting = FALSE, sample_splitting_folds = NULL, - C = C, Z = Z, scale = scale, ipc_est_type = ipc_est_type, - scale_est = scale_est, cross_fitted_se = cross_fitted_se, ...) -} +#' Nonparametric Intrinsic Variable Importance Estimates: ANOVA +#' +#' Compute estimates of and confidence intervals for nonparametric ANOVA-based +#' intrinsic variable importance. This is a wrapper function for \code{cv_vim}, +#' with \code{type = "anova"}. This type +#' has limited functionality compared to other +#' types; in particular, null hypothesis tests +#' are not possible using \code{type = "anova"}. +#' If you want to do null hypothesis testing +#' on an equivalent population parameter, use +#' \code{vimp_rsquared} instead. +#' +#' @inheritParams cv_vim +#' +#' @return An object of classes \code{vim} and \code{vim_anova}. +#' See Details for more information. 
+#' +#' @details We define the population ANOVA +#' parameter for the group of features (or single feature) \eqn{s} by +#' \deqn{\psi_{0,s} := E_0\{f_0(X) - f_{0,s}(X)\}^2/var_0(Y),} +#' where \eqn{f_0} is the population conditional mean using all features, +#' \eqn{f_{0,s}} is the population conditional mean using the features with +#' index not in \eqn{s}, and \eqn{E_0} and \eqn{var_0} denote expectation and +#' variance under the true data-generating distribution, respectively. +#' +#' Cross-fitted ANOVA estimates are computed by first +#' splitting the data into \eqn{K} folds; then using each fold in turn as a +#' hold-out set, constructing estimators \eqn{f_{n,k}} and \eqn{f_{n,k,s}} of +#' \eqn{f_0} and \eqn{f_{0,s}}, respectively on the training data and estimator +#' \eqn{E_{n,k}} of \eqn{E_0} using the test data; and finally, computing +#' \deqn{\psi_{n,s} := K^{(-1)}\sum_{k=1}^K E_{n,k}\{f_{n,k}(X) - f_{n,k,s}(X)\}^2/var_n(Y),} +#' where \eqn{var_n} is the empirical variance. +#' See the paper by Williamson, Gilbert, Simon, and Carone for more +#' details on the mathematics behind this function. +#' +#' @examples +#' # generate the data +#' # generate X +#' p <- 2 +#' n <- 100 +#' x <- data.frame(replicate(p, stats::runif(n, -5, 5))) +#' +#' # apply the function to the x's +#' smooth <- (x[,1]/5)^2*(x[,1]+7)/5 + (x[,2]/3)^2 +#' +#' # generate Y ~ Normal (smooth, 1) +#' y <- smooth + stats::rnorm(n, 0, 1) +#' +#' # set up a library for SuperLearner; note simple library for speed +#' library("SuperLearner") +#' learners <- c("SL.glm", "SL.mean") +#' +#' # estimate (with a small number of folds, for illustration only) +#' est <- vimp_anova(y, x, indx = 2, +#' alpha = 0.05, run_regression = TRUE, +#' SL.library = learners, V = 2, cvControl = list(V = 2)) +#' +#' @seealso \code{\link[SuperLearner]{SuperLearner}} for specific usage of the +#' \code{SuperLearner} function and package. +#' @export +vimp_anova <- function(Y = NULL, X = NULL, cross_fitted_f1 = NULL, + cross_fitted_f2 = NULL, indx = 1, + V = 10, run_regression = TRUE, + SL.library = c("SL.glmnet", "SL.xgboost", "SL.mean"), + alpha = 0.05, delta = 0, na.rm = FALSE, + cross_fitting_folds = NULL, + stratified = FALSE, C = rep(1, length(Y)), Z = NULL, + ipc_weights = rep(1, length(Y)), scale = "logit", + ipc_est_type = "aipw", scale_est = TRUE, + cross_fitted_se = TRUE, ...) { + cv_vim(type = "anova", Y = Y, X = X, cross_fitted_f1 = cross_fitted_f1, + cross_fitted_f2 = cross_fitted_f2, f1 = NULL, f2 = NULL, indx = indx, + V = V, run_regression = run_regression, SL.library = SL.library, + alpha = alpha, delta = delta, na.rm = na.rm, stratified = stratified, + cross_fitting_folds = cross_fitting_folds, ipc_weights = ipc_weights, + sample_splitting = FALSE, sample_splitting_folds = NULL, + C = C, Z = Z, scale = scale, ipc_est_type = ipc_est_type, + scale_est = scale_est, cross_fitted_se = cross_fitted_se, ...) +} diff --git a/R/vimp_ci.R b/R/vimp_ci.R index 39ba98a..e744142 100644 --- a/R/vimp_ci.R +++ b/R/vimp_ci.R @@ -1,64 +1,64 @@ -#' Confidence intervals for variable importance -#' -#' Compute confidence intervals for the true variable importance parameter. -#' -#' @param est estimate of variable importance, e.g., from a call to \code{vimp_point_est}. -#' @param se estimate of the standard error of \code{est}, e.g., from a call to \code{vimp_se}. -#' @param scale scale to compute interval estimate on (defaults to "identity": compute Wald-type CI). -#' @param level confidence interval type (defaults to 0.95). 
-#' @param truncate truncate CIs to have lower limit at (or above) zero? -#' -#' @return The Wald-based confidence interval for the true importance of the given group of left-out covariates. -#' -#' @details See the paper by Williamson, Gilbert, Simon, and Carone for more -#' details on the mathematics behind this function and the definition of the parameter of interest. -#' @importFrom stats qlogis plogis -#' @export -vimp_ci <- function(est, se, scale = "identity", level = 0.95, truncate = TRUE) { - # set up the level - a <- (1 - level)/2 - a <- c(a, 1 - a) - - # get the quantiles - fac <- stats::qnorm(a) - - # create the ci - ci <- array(NA, dim = c(length(est), 2L), dimnames = list(names(est))) - # get scale - scales <- c("log", "logit", "identity") - full_scale <- scales[pmatch(scale, scales)] - if (full_scale == "log") { - tmp <- suppressWarnings(log(est)) - is_zero <- FALSE - if ((is.na(tmp) & est <= 0 & !is.na(est))| is.infinite(tmp)) { - tmp <- 0 - is_zero <- TRUE - } - grad <- 1 / est - ci[] <- exp(tmp + sqrt(se ^ 2 * grad ^ 2) %o% fac) - if (is_zero) { - ci[, 1] <- 0 - } - } else if (full_scale == "logit") { - tmp <- suppressWarnings(qlogis(est)) - is_zero <- FALSE - if ((is.na(tmp) & est <= 0 & !is.na(est)) | is.infinite(tmp)) { - tmp <- 0 - is_zero <- TRUE - } - grad <- 1 / (est - est ^ 2) - ci[] <- plogis(tmp + sqrt(se ^ 2 * grad ^ 2) %o% fac) - if (is_zero) { - ci[, 1] <- 0 - } - } else { - ci[] <- est + (se) %o% fac - if (any(ci[, 1] < 0) & !all(is.na(ci)) & truncate) { - ci[ci[, 1] < 0, 1] <- 0 - } - } - if (any(ci[, 2] < 0) & !all(is.na(ci)) & truncate) { - ci[ci[, 2] < 0, 2] <- 0 - } - return(ci) -} +#' Confidence intervals for variable importance +#' +#' Compute confidence intervals for the true variable importance parameter. +#' +#' @param est estimate of variable importance, e.g., from a call to \code{vimp_point_est}. +#' @param se estimate of the standard error of \code{est}, e.g., from a call to \code{vimp_se}. +#' @param scale scale to compute interval estimate on (defaults to "identity": compute Wald-type CI). +#' @param level confidence interval type (defaults to 0.95). +#' @param truncate truncate CIs to have lower limit at (or above) zero? +#' +#' @return The Wald-based confidence interval for the true importance of the given group of left-out covariates. +#' +#' @details See the paper by Williamson, Gilbert, Simon, and Carone for more +#' details on the mathematics behind this function and the definition of the parameter of interest. 
+#' @importFrom stats qlogis plogis +#' @export +vimp_ci <- function(est, se, scale = "identity", level = 0.95, truncate = TRUE) { + # set up the level + a <- (1 - level)/2 + a <- c(a, 1 - a) + + # get the quantiles + fac <- stats::qnorm(a) + + # create the ci + ci <- array(NA, dim = c(length(est), 2L), dimnames = list(names(est))) + # get scale + scales <- c("log", "logit", "identity") + full_scale <- scales[pmatch(scale, scales)] + if (full_scale == "log") { + tmp <- suppressWarnings(log(est)) + is_zero <- FALSE + if ((is.na(tmp) & est <= 0 & !is.na(est))| is.infinite(tmp)) { + tmp <- 0 + is_zero <- TRUE + } + grad <- 1 / est + ci[] <- exp(tmp + sqrt(se ^ 2 * grad ^ 2) %o% fac) + if (is_zero) { + ci[, 1] <- 0 + } + } else if (full_scale == "logit") { + tmp <- suppressWarnings(qlogis(est)) + is_zero <- FALSE + if ((is.na(tmp) & est <= 0 & !is.na(est)) | is.infinite(tmp)) { + tmp <- 0 + is_zero <- TRUE + } + grad <- 1 / (est - est ^ 2) + ci[] <- plogis(tmp + sqrt(se ^ 2 * grad ^ 2) %o% fac) + if (is_zero) { + ci[, 1] <- 0 + } + } else { + ci[] <- est + (se) %o% fac + if (any(ci[, 1] < 0) & !all(is.na(ci)) & truncate) { + ci[ci[, 1] < 0, 1] <- 0 + } + } + if (any(ci[, 2] < 0) & !all(is.na(ci)) & truncate) { + ci[ci[, 2] < 0, 2] <- 0 + } + return(ci) +} diff --git a/README.md b/README.md index 9f5c5d8..3f067ad 100755 --- a/README.md +++ b/README.md @@ -46,45 +46,35 @@ devtools::install_github(repo = "bdwilliamson/vimp") ## Example -This example shows how to use `vimp` in a simple setting with simulated data, using `SuperLearner` to estimate the conditional mean functions and specifying the importance measure of interest as the R-squared-based measure. For more examples and detailed explanation, please see the [vignette](https://bdwilliamson.github.io/vimp/introduction_to_vimp.html). +This example shows how to use `vimp` in a simple setting with simulated data, using `SuperLearner` to estimate the conditional mean functions and specifying the importance measure of interest as the R-squared-based measure. For more examples and detailed explanation, please see the [vignette](https://bdwilliamson.github.io/vimp/introduction-to-vimp.html). ```r -## load required functions and libraries +# load required functions and libraries library("SuperLearner") library("vimp") library("xgboost") library("glmnet") -## ------------------------------------------------------------- -## problem setup -## ------------------------------------------------------------- -## set up the data +# ------------------------------------------------------------- +# problem setup +# ------------------------------------------------------------- +# set up the data n <- 100 p <- 2 s <- 1 # desire importance for X_1 x <- as.data.frame(replicate(p, runif(n, -1, 1))) y <- (x[,1])^2*(x[,1]+7/5) + (25/9)*(x[,2])^2 + rnorm(n, 0, 1) -## ------------------------------------------------------------- -## preliminary step: estimate the conditional means -## ------------------------------------------------------------- -## set up the learner library, consisting of the mean, boosted trees, -## elastic net, and random forest +# ------------------------------------------------------------- +# get variable importance! 
+# ------------------------------------------------------------- +# set up the learner library, consisting of the mean, boosted trees, +# elastic net, and random forest learner.lib <- c("SL.mean", "SL.xgboost", "SL.glmnet", "SL.randomForest") - -## the full conditional mean -full_regression <- SuperLearner::SuperLearner(Y = y, X = x, family = gaussian(), SL.library = learner.lib) -full_fit <- full_regression$SL.predict - -## the reduced conditional mean -reduced_regression <- SuperLearner::SuperLearner(Y = full_fit, X = x[, -s, drop = FALSE], family = gaussian(), SL.library = learner.lib) -reduced_fit <- reduced_regression$SL.predict - -## ------------------------------------------------------------- -## get variable importance! -## ------------------------------------------------------------- -## get the variable importance estimate, SE, and CI -vimp <- vimp_rsquared(Y = y, f1 = full_fit, f2 = reduced_fit, indx = 1, run_regression = FALSE) +# get the variable importance estimate, SE, and CI +# I'm using only 2 cross-validation folds to make things run quickly; in practice, you should use more +set.seed(20231213) +vimp <- vimp_rsquared(Y = y, X = x, indx = 1, V = 2) ``` ## Citation diff --git a/cran-comments.md b/cran-comments.md index 21bf750..3ce1ff1 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,18 +1,18 @@ -## Resubmission -This is a resubmission. In this version, I have: - -* Updated internal functions - -## Test environments -* local windows-x86_64, R 4.3.1 -* ubuntu 16.04.6 LTS (on travis-ci), R 4.0.2 -* windows server 2012 (on appveyor), R 4.3.1 -* R-hub windows-x86_64-devel (r-devel) -* R-hub ubuntu-gcc-release (r-release) -* R-hub fedora-clang-devel (r-devel) - -## R CMD check results -There were no ERRORs or WARNINGs; there was one NOTE, that I (Brian D. Williamson) am the package maintainer. - -## Downstream dependencies -None. +## Resubmission +This is a resubmission. In this version, I have: + +* Updated internal functions + +## Test environments +* local windows-x86_64, R 4.3.1 +* ubuntu 16.04.6 LTS (on travis-ci), R 4.0.2 +* windows server 2012 (on appveyor), R 4.3.1 +* R-hub windows-x86_64-devel (r-devel) +* R-hub ubuntu-gcc-release (r-release) +* R-hub fedora-clang-devel (r-devel) + +## R CMD check results +There were no ERRORs or WARNINGs; there was one NOTE, that I (Brian D. Williamson) am the package maintainer. + +## Downstream dependencies +None. 
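
The README example above ends at the call to `vimp_rsquared()`. As a minimal sketch of inspecting the object it returns (not part of the patch itself; it reuses the `vimp` object from the README chunk, and only the accessors that the package's own tests later in this patch exercise):

```r
# assumes the README example above has been run, so `vimp` holds a "vim" object
vimp$est     # point estimate of the R-squared-based importance of X_1
vimp$se      # estimated standard error
vimp$ci      # Wald-type confidence interval (see vimp_ci())
vimp$p_value # p-value for the test of zero importance
print(vimp)  # formatted summary, including an "Estimate" column
```
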
diff --git a/tests/testthat/test-merge_vim.R b/tests/testthat/test-merge_vim.R
index 157aa78..fbb5fc9 100644
--- a/tests/testthat/test-merge_vim.R
+++ b/tests/testthat/test-merge_vim.R
@@ -1,85 +1,85 @@
-# load required functions and packages
-library("testthat")
-suppressWarnings(library("SuperLearner"))
-
-# generate the data -- note that this is a simple setting, for speed -----------
-set.seed(4747)
-p <- 2
-n <- 5e4
-x <- replicate(p, stats::rnorm(n, 0, 1))
-x_df <- as.data.frame(x)
-# apply the function to the x's
-y <- 1 + 0.5 * x[, 1] + 0.75 * x[, 2] + stats::rnorm(n, 0, 1)
-true_var <- mean((y - mean(y)) ^ 2)
-# note that true difference in R-squareds for variable j, under independence, is
-# beta_j^2 * var(x_j) / var(y)
-r2_one <- 0.5 ^ 2 * 1 / true_var
-r2_two <- 0.75 ^ 2 * 1 / true_var
-
-# folds for sample-splitting
-folds <- sample(rep(seq_len(2), length = length(y)))
-folds_lst <- lapply(as.list(seq_len(2)), function(i) which(folds == i))
-
-# fit nuisance regressions -----------------------------------------------------
-# set up a library for SuperLearner
-learners <- c("SL.glm")
-V <- 2
-
-# fit the data with all covariates
-set.seed(1234)
-# fit a CV.SL for sample-splitting
-full_fit <- suppressWarnings(
-  SuperLearner::CV.SuperLearner(Y = y, X = x_df,
-                                SL.library = learners,
-                                cvControl = list(V = 2, validRows = folds_lst),
-                                innerCvControl = list(list(V = V)))
-)
-full_fitted <- SuperLearner::predict.SuperLearner(full_fit)$pred
-
-# fit the data with only X1
-reduced_fit_1 <- suppressWarnings(
-  SuperLearner::CV.SuperLearner(Y = full_fitted, X = x_df[, -2, drop = FALSE],
-                                SL.library = learners,
-                                cvControl = list(V = 2, validRows = full_fit$folds),
-                                innerCvControl = list(list(V = V)))
-)
-reduced_fitted_1 <- SuperLearner::predict.SuperLearner(reduced_fit_1)$pred
-
-# fit data with only X2
-reduced_fit_2 <- suppressWarnings(
-  SuperLearner::CV.SuperLearner(Y = full_fitted, X = x_df[, -1, drop = FALSE],
-                                SL.library = learners,
-                                cvControl = list(V = 2, validRows = full_fit$folds),
-                                innerCvControl = list(list(V = V)))
-)
-reduced_fitted_2 <- SuperLearner::predict.SuperLearner(reduced_fit_2)$pred
-
-# test merging -----------------------------------------------------------------
-set.seed(4747)
-test_that("Merging variable importance estimates works", {
-  est_1 <- vim(Y = y, f1 = full_fitted, f2 = reduced_fitted_1,
-               run_regression = FALSE, indx = 2, type = "r_squared",
-               sample_splitting_folds = folds)
-  est_2 <- vim(Y = y, f1 = full_fitted, f2 = reduced_fitted_2,
-               run_regression = FALSE, indx = 1, type = "r_squared",
-               sample_splitting_folds = folds)
-
-  merged_ests <- merge_vim(est_1, est_2)
-  expect_equal(merged_ests$est[1], r2_two, tolerance = 0.05, scale = 1)
-  expect_equal(merged_ests$est[2], r2_one, tolerance = 0.05, scale = 1)
-  expect_output(print(merged_ests), "Estimate", fixed = TRUE)
-})
-
-test_that("Merging cross-validated variable importance estimates works", {
-  est_1 <- cv_vim(Y = y, X = x_df, run_regression = TRUE, indx = 2,
-                  V = V, cvControl = list(V = V), SL.library = learners,
-                  env = environment(), na.rm = TRUE)
-  est_2 <- cv_vim(Y = y, X = x_df, run_regression = TRUE, indx = 1,
-                  V = V, cvControl = list(V = V), SL.library = learners,
-                  env = environment(), na.rm = TRUE)
-
-  merged_ests <- merge_vim(est_1, est_2)
-  expect_equal(merged_ests$est[1], r2_two, tolerance = 0.1, scale = 1)
-  expect_equal(merged_ests$est[2], r2_one, tolerance = 0.1, scale = 1)
-  expect_output(print(merged_ests), "Estimate", fixed = TRUE)
-})
+# load required functions and packages
+library("testthat")
+suppressWarnings(library("SuperLearner"))
+
+# generate the data -- note that this is a simple setting, for speed -----------
+set.seed(4747)
+p <- 2
+n <- 5e4
+x <- replicate(p, stats::rnorm(n, 0, 1))
+x_df <- as.data.frame(x)
+# apply the function to the x's
+y <- 1 + 0.5 * x[, 1] + 0.75 * x[, 2] + stats::rnorm(n, 0, 1)
+true_var <- mean((y - mean(y)) ^ 2)
+# note that true difference in R-squareds for variable j, under independence, is
+# beta_j^2 * var(x_j) / var(y)
+r2_one <- 0.5 ^ 2 * 1 / true_var
+r2_two <- 0.75 ^ 2 * 1 / true_var
+
+# folds for sample-splitting
+folds <- sample(rep(seq_len(2), length = length(y)))
+folds_lst <- lapply(as.list(seq_len(2)), function(i) which(folds == i))
+
+# fit nuisance regressions -----------------------------------------------------
+# set up a library for SuperLearner
+learners <- c("SL.glm")
+V <- 2
+
+# fit the data with all covariates
+set.seed(1234)
+# fit a CV.SL for sample-splitting
+full_fit <- suppressWarnings(
+  SuperLearner::CV.SuperLearner(Y = y, X = x_df,
+                                SL.library = learners,
+                                cvControl = list(V = 2, validRows = folds_lst),
+                                innerCvControl = list(list(V = V)))
+)
+full_fitted <- SuperLearner::predict.SuperLearner(full_fit)$pred
+
+# fit the data with only X1
+reduced_fit_1 <- suppressWarnings(
+  SuperLearner::CV.SuperLearner(Y = full_fitted, X = x_df[, -2, drop = FALSE],
+                                SL.library = learners,
+                                cvControl = list(V = 2, validRows = full_fit$folds),
+                                innerCvControl = list(list(V = V)))
+)
+reduced_fitted_1 <- SuperLearner::predict.SuperLearner(reduced_fit_1)$pred
+
+# fit data with only X2
+reduced_fit_2 <- suppressWarnings(
+  SuperLearner::CV.SuperLearner(Y = full_fitted, X = x_df[, -1, drop = FALSE],
+                                SL.library = learners,
+                                cvControl = list(V = 2, validRows = full_fit$folds),
+                                innerCvControl = list(list(V = V)))
+)
+reduced_fitted_2 <- SuperLearner::predict.SuperLearner(reduced_fit_2)$pred
+
+# test merging -----------------------------------------------------------------
+set.seed(4747)
+test_that("Merging variable importance estimates works", {
+  est_1 <- vim(Y = y, f1 = full_fitted, f2 = reduced_fitted_1,
+               run_regression = FALSE, indx = 2, type = "r_squared",
+               sample_splitting_folds = folds)
+  est_2 <- vim(Y = y, f1 = full_fitted, f2 = reduced_fitted_2,
+               run_regression = FALSE, indx = 1, type = "r_squared",
+               sample_splitting_folds = folds)
+
+  merged_ests <- merge_vim(est_1, est_2)
+  expect_equal(merged_ests$est[1], r2_two, tolerance = 0.05, scale = 1)
+  expect_equal(merged_ests$est[2], r2_one, tolerance = 0.05, scale = 1)
+  expect_output(print(merged_ests), "Estimate", fixed = TRUE)
+})
+
+test_that("Merging cross-validated variable importance estimates works", {
+  est_1 <- cv_vim(Y = y, X = x_df, run_regression = TRUE, indx = 2,
+                  V = V, cvControl = list(V = V), SL.library = learners,
+                  env = environment(), na.rm = TRUE)
+  est_2 <- cv_vim(Y = y, X = x_df, run_regression = TRUE, indx = 1,
+                  V = V, cvControl = list(V = V), SL.library = learners,
+                  env = environment(), na.rm = TRUE)
+
+  merged_ests <- merge_vim(est_1, est_2)
+  expect_equal(merged_ests$est[1], r2_two, tolerance = 0.1, scale = 1)
+  expect_equal(merged_ests$est[2], r2_one, tolerance = 0.1, scale = 1)
+  expect_output(print(merged_ests), "Estimate", fixed = TRUE)
+})
diff --git a/tests/testthat/test-vim.R b/tests/testthat/test-vim.R
index 55527c7..1d03fa7 100644
--- a/tests/testthat/test-vim.R
+++ b/tests/testthat/test-vim.R
@@ -1,137 +1,137 @@
-# load required functions and packages
-library("testthat")
-suppressWarnings(library("SuperLearner")) - -# generate the data -- note that this is a simple setting, for speed ----------- -set.seed(4747) -p <- 3 -n <- 5e4 -x <- as.data.frame(replicate(p, stats::rnorm(n, 0, 1))) -# apply the function to the covariates -y <- 1 + 0.5 * x[, 1] + 0.75 * x[, 2] + stats::rnorm(n, 0, 1) -true_var <- 1 + .5 ^ 2 + .75 ^ 2 -# note that true difference in R-squareds for variable j, under independence, is -# beta_j^2 * var(x_j) / var(y) -r2_one <- 0.5 ^ 2 * 1 / true_var -r2_two <- 0.75 ^ 2 * 1 / true_var -folds <- sample(rep(seq_len(2), length = length(y))) -folds_lst <- lapply(as.list(seq_len(2)), function(i) which(folds == i)) - -# set up a library for SuperLearner -learners <- c("SL.glm", "SL.mean") -V <- 5 - -# test VIM --------------------------------------------------------------------- -set.seed(4747) -test_that("VIM without sample splitting works", { - est <- vim(Y = y, X = x, indx = 2, type = "r_squared", run_regression = TRUE, - SL.library = learners, alpha = 0.05, cvControl = list(V = V), - env = environment(), sample_splitting = FALSE) - # check that the estimate is approximately correct - expect_equal(est$est, r2_two, tolerance = 0.1, scale = 1) -}) -set.seed(4747) -test_that("ANOVA-VIM without sample splitting works", { - expect_message( - est <- vim(Y = y, X = x, indx = 2, type = "anova", run_regression = TRUE, - SL.library = learners, alpha = 0.05, cvControl = list(V = V), - env = environment(), sample_splitting = FALSE) - ) - # check that the estimate is approximately correct - expect_equal(est$est, r2_two, tolerance = 0.1, scale = 1) -}) - -set.seed(1234) -test_that("General variable importance estimates using internally-computed fitted values work", { - est <- vim(Y = y, X = x, indx = 2, type = "r_squared", run_regression = TRUE, - SL.library = learners, alpha = 0.05, cvControl = list(V = V), - env = environment(), sample_splitting_folds = folds) - # check that the estimate is approximately correct - expect_equal(est$est, r2_two, tolerance = 0.1, scale = 1) - # check that the SE, CI work - expect_length(est$ci, 2) - expect_length(est$se, 1) - # check that the p-value worked - expect_length(est$p_value, 1) - expect_true(est$test) - # check that printing, plotting, etc. 
work - expect_silent(format(est)[1]) - expect_output(print(est), "Estimate", fixed = TRUE) - # check that influence curve worked - expect_length(est$eif_full, length(y) / 2) -}) - -# fit nuisance regressions ----------------------------------------------------- -# fit the data with all covariates and sample-splitting (note V = 2 in CV.SL) -set.seed(4747) -full_fit <- suppressWarnings( - SuperLearner::CV.SuperLearner(Y = y, X = x, SL.library = learners, - cvControl = list(V = 2, validRows = folds_lst), - innerCvControl = list(list(V = V))) -) -full_fitted <- SuperLearner::predict.SuperLearner(full_fit, onlySL = TRUE)$pred -# fit the data with only X1 -reduced_fit <- suppressWarnings( - SuperLearner::CV.SuperLearner(Y = full_fitted, - X = x[, -2, drop = FALSE], - SL.library = learners, - cvControl = list(V = 2, validRows = full_fit$folds), - innerCvControl = list(list(V = V))) -) -reduced_fitted <- SuperLearner::predict.SuperLearner(reduced_fit, onlySL = TRUE)$pred - -# test VIM with pre-computed nuisance estimates -------------------------------- -test_that("General variable importance estimates using externally-computed fitted values work", { - # provide folds - est <- vim(Y = y, X = x, indx = 2, type = "r_squared", - run_regression = FALSE, - f1 = full_fitted, f2 = reduced_fitted, - sample_splitting_folds = folds) - # check that estimate worked - expect_equal(est$est, r2_two, tolerance = 0.15, scale = 1) - # check that p-value exists - expect_length(est$p_value, 1) - # check that CIs with other transformations work - ci_log <- vimp_ci(est$est, est$se, scale = "log", level = 0.95) - ci_logit <- vimp_ci(est$est, est$se, scale = "logit", level = 0.95) - expect_length(ci_log, 2) - expect_length(ci_logit, 2) -}) -test_that("VIM estimates using externally-computed fitted values with different folds for estimation and inference work", { - # provide folds - est_average <- vim(Y = y, X = x, indx = 2, type = "r_squared", - run_regression = FALSE, - f1 = full_fitted, f2 = reduced_fitted, - sample_splitting_folds = folds, - final_point_estimate = "average") - est_full <- vim(Y = y, X = x, indx = 2, type = "r_squared", - run_regression = FALSE, - f1 = full_fitted, f2 = reduced_fitted, - sample_splitting_folds = folds, - final_point_estimate = "full") - # check that estimate worked - expect_equal(est_average$est, r2_two, tolerance = 0.15, scale = 1) - expect_equal(est_full$est, r2_two, tolerance = 0.15, scale = 1) -}) - - -# measures of predictiveness --------------------------------------------------- -test_that("Measures of predictiveness work", { - full_rsquared <- est_predictiveness(full_fitted, y, - type = "r_squared") - expect_equal(full_rsquared$point_est, 0.44, tolerance = 0.1) - expect_length(full_rsquared$eif, length(y)) -}) - -# error messages --------------------------------------------------------------- -test_that("Error messages work", { - expect_error(vim(X = x)) - expect_error(vim(Y = y)) - expect_error(vim(Y = y, X = x, SL.library = NULL)) - expect_error(vim(Y = y, X = x, run_regression = FALSE)) - expect_error(vim(Y = y, f1 = mean(y))) - expect_error(vim(Y = y, f1 = rep(mean(y), length(y)), f2 = mean(y))) - expect_error(vim(Y = y, X = as.data.frame(x), SL.library = learners, - type = "nonsense_type")) - expect_error(vim(Y = y, X = X, SL.library = learners, indx = ncol(X) + 1)) -}) +# load required functions and packages +library("testthat") +suppressWarnings(library("SuperLearner")) + +# generate the data -- note that this is a simple setting, for speed ----------- +set.seed(4747) +p 
<- 3 +n <- 5e4 +x <- as.data.frame(replicate(p, stats::rnorm(n, 0, 1))) +# apply the function to the covariates +y <- 1 + 0.5 * x[, 1] + 0.75 * x[, 2] + stats::rnorm(n, 0, 1) +true_var <- 1 + .5 ^ 2 + .75 ^ 2 +# note that true difference in R-squareds for variable j, under independence, is +# beta_j^2 * var(x_j) / var(y) +r2_one <- 0.5 ^ 2 * 1 / true_var +r2_two <- 0.75 ^ 2 * 1 / true_var +folds <- sample(rep(seq_len(2), length = length(y))) +folds_lst <- lapply(as.list(seq_len(2)), function(i) which(folds == i)) + +# set up a library for SuperLearner +learners <- c("SL.glm", "SL.mean") +V <- 5 + +# test VIM --------------------------------------------------------------------- +set.seed(4747) +test_that("VIM without sample splitting works", { + est <- vim(Y = y, X = x, indx = 2, type = "r_squared", run_regression = TRUE, + SL.library = learners, alpha = 0.05, cvControl = list(V = V), + env = environment(), sample_splitting = FALSE) + # check that the estimate is approximately correct + expect_equal(est$est, r2_two, tolerance = 0.1, scale = 1) +}) +set.seed(4747) +test_that("ANOVA-VIM without sample splitting works", { + expect_message( + est <- vim(Y = y, X = x, indx = 2, type = "anova", run_regression = TRUE, + SL.library = learners, alpha = 0.05, cvControl = list(V = V), + env = environment(), sample_splitting = FALSE) + ) + # check that the estimate is approximately correct + expect_equal(est$est, r2_two, tolerance = 0.1, scale = 1) +}) + +set.seed(1234) +test_that("General variable importance estimates using internally-computed fitted values work", { + est <- vim(Y = y, X = x, indx = 2, type = "r_squared", run_regression = TRUE, + SL.library = learners, alpha = 0.05, cvControl = list(V = V), + env = environment(), sample_splitting_folds = folds) + # check that the estimate is approximately correct + expect_equal(est$est, r2_two, tolerance = 0.1, scale = 1) + # check that the SE, CI work + expect_length(est$ci, 2) + expect_length(est$se, 1) + # check that the p-value worked + expect_length(est$p_value, 1) + expect_true(est$test) + # check that printing, plotting, etc. 
work + expect_silent(format(est)[1]) + expect_output(print(est), "Estimate", fixed = TRUE) + # check that influence curve worked + expect_length(est$eif_full, length(y) / 2) +}) + +# fit nuisance regressions ----------------------------------------------------- +# fit the data with all covariates and sample-splitting (note V = 2 in CV.SL) +set.seed(4747) +full_fit <- suppressWarnings( + SuperLearner::CV.SuperLearner(Y = y, X = x, SL.library = learners, + cvControl = list(V = 2, validRows = folds_lst), + innerCvControl = list(list(V = V))) +) +full_fitted <- SuperLearner::predict.SuperLearner(full_fit, onlySL = TRUE)$pred +# fit the data with only X1 +reduced_fit <- suppressWarnings( + SuperLearner::CV.SuperLearner(Y = full_fitted, + X = x[, -2, drop = FALSE], + SL.library = learners, + cvControl = list(V = 2, validRows = full_fit$folds), + innerCvControl = list(list(V = V))) +) +reduced_fitted <- SuperLearner::predict.SuperLearner(reduced_fit, onlySL = TRUE)$pred + +# test VIM with pre-computed nuisance estimates -------------------------------- +test_that("General variable importance estimates using externally-computed fitted values work", { + # provide folds + est <- vim(Y = y, X = x, indx = 2, type = "r_squared", + run_regression = FALSE, + f1 = full_fitted, f2 = reduced_fitted, + sample_splitting_folds = folds) + # check that estimate worked + expect_equal(est$est, r2_two, tolerance = 0.15, scale = 1) + # check that p-value exists + expect_length(est$p_value, 1) + # check that CIs with other transformations work + ci_log <- vimp_ci(est$est, est$se, scale = "log", level = 0.95) + ci_logit <- vimp_ci(est$est, est$se, scale = "logit", level = 0.95) + expect_length(ci_log, 2) + expect_length(ci_logit, 2) +}) +test_that("VIM estimates using externally-computed fitted values with different folds for estimation and inference work", { + # provide folds + est_average <- vim(Y = y, X = x, indx = 2, type = "r_squared", + run_regression = FALSE, + f1 = full_fitted, f2 = reduced_fitted, + sample_splitting_folds = folds, + final_point_estimate = "average") + est_full <- vim(Y = y, X = x, indx = 2, type = "r_squared", + run_regression = FALSE, + f1 = full_fitted, f2 = reduced_fitted, + sample_splitting_folds = folds, + final_point_estimate = "full") + # check that estimate worked + expect_equal(est_average$est, r2_two, tolerance = 0.15, scale = 1) + expect_equal(est_full$est, r2_two, tolerance = 0.15, scale = 1) +}) + + +# measures of predictiveness --------------------------------------------------- +test_that("Measures of predictiveness work", { + full_rsquared <- est_predictiveness(full_fitted, y, + type = "r_squared") + expect_equal(full_rsquared$point_est, 0.44, tolerance = 0.1) + expect_length(full_rsquared$eif, length(y)) +}) + +# error messages --------------------------------------------------------------- +test_that("Error messages work", { + expect_error(vim(X = x)) + expect_error(vim(Y = y)) + expect_error(vim(Y = y, X = x, SL.library = NULL)) + expect_error(vim(Y = y, X = x, run_regression = FALSE)) + expect_error(vim(Y = y, f1 = mean(y))) + expect_error(vim(Y = y, f1 = rep(mean(y), length(y)), f2 = mean(y))) + expect_error(vim(Y = y, X = as.data.frame(x), SL.library = learners, + type = "nonsense_type")) + expect_error(vim(Y = y, X = X, SL.library = learners, indx = ncol(X) + 1)) +}) diff --git a/vignettes/precomputed-regressions.Rmd b/vignettes/precomputed-regressions.Rmd index b603461..987c452 100644 --- a/vignettes/precomputed-regressions.Rmd +++ 
----
-title: "Using precomputed regression function estimates in `vimp`"
-author: "Brian D. Williamson"
-date: "`r Sys.Date()`"
-output: rmarkdown::html_vignette
-vignette: >
-  %\VignetteIndexEntry{Using precomputed regression function estimates in `vimp`}
-  %\VignetteEngine{knitr::rmarkdown}
-  %\VignetteEncoding{UTF-8}
-csl: chicago-author-date.csl
-bibliography: vimp_bib.bib
----
-
-```{r, include = FALSE}
-knitr::opts_chunk$set(
-  collapse = TRUE,
-  comment = "#>"
-)
-```
-
-```{r setup, message = FALSE}
-library("vimp")
-library("SuperLearner")
-```
-
-## Introduction
-
-In the [main vignette](introduction-to-vimp.html), we analyzed the VRC01 data [@magaret2019], a subset of the data freely available from the Los Alamos National Laboratory's Compile, Neutralize, and Tally Neutralizing Antibody Panels database. Information about these data is available [here](https://doi.org/10.1371/journal.pcbi.1006952). In each of the analyses, I used `run_regression = TRUE`. In this vignette, I discuss how to use precomputed regression function estimates with `vimp`. The results of this analysis replicate the analysis in the [main vignette](introduction-to-vimp.html).
-
-```{r load-vrc01-data}
-# read in the data
-data("vrc01")
-# subset to the columns of interest for this analysis
-library("dplyr")
-library("tidyselect")
-# retain only the columns of interest for this analysis
-y <- vrc01$ic50.censored
-X <- vrc01 %>%
-  select(starts_with("geog"), starts_with("subtype"), starts_with("length"))
-set.seed(1234)
-vrc01_folds <- make_folds(y = y, V = 2)
-```
-
-## Using precomputed regression function estimates without cross-fitting
-
-### A first approach: linear regression
-
-As in the [main vignette](introduction-to-vimp.html), we first start by fitting only linear regression models. In this section, we use the function `vim()`; this function does not use cross-fitting to estimate variable importance, and greatly simplifies the code for precomputed regression models.
-
-```{r est-regressions-lm, warning = FALSE}
-library("rlang")
-vrc01_subset <- vrc01 %>%
-  select(ic50.censored, starts_with("geog"), starts_with("subtype"), starts_with("length")) %>%
-  rename(y = ic50.censored)
-# estimate prediction function on each subset, predict on held-out fold
-full_fit <- vector("numeric", length = nrow(vrc01))
-for (v in 1:2) {
-  train_v <- subset(vrc01_subset, vrc01_folds != v)
-  test_v <- subset(vrc01_subset, vrc01_folds == v)
-  full_mod <- glm(y ~ ., data = train_v)
-  full_fit[vrc01_folds == v] <- predict(full_mod, newdata = test_v)
-}
-
-# estimate the reduced conditional means for each of the individual variables
-# remove the outcome for the predictor matrix
-geog_indx <- max(which(grepl("geog", names(X))))
-for (i in seq_len(ncol(X) - geog_indx)) {
-  this_name <- names(X)[i + geog_indx]
-  red_fit <- vector("numeric", length = nrow(vrc01))
-  for (v in 1:2) {
-    train_v <- subset(vrc01_subset, vrc01_folds != v)
-    test_v <- subset(vrc01_subset, vrc01_folds == v)
-    red_fit[vrc01_folds == v] <- suppressWarnings(
-      predict(glm(y ~ ., data = train_v %>% select(-!!this_name)),
-              newdata = test_v)
-    )
-  }
-  this_vim <- vim(Y = y, f1 = full_fit, f2 = red_fit, indx = i + geog_indx,
-                  run_regression = FALSE, type = "r_squared",
-                  sample_splitting_folds = vrc01_folds, scale = "logit")
-  if (i == 1) {
-    lm_mat <- this_vim
-  } else {
-    lm_mat <- merge_vim(lm_mat, this_vim)
-  }
-}
-# print out the matrix
-lm_mat
-```
-
-## Estimating variable importance for a single variable using precomputed regression function estimates
-
-In this section, we will use cross-fitting and pre-computed estimates of the regression functions. This can be especially useful if you have already run a call to `CV.SuperLearner` -- that function returns estimates based on each observation being part of the hold-out set. However, while this approach can save you some computation time, it requires a hefty amount of mental overhead.
-
-We will use `CV.SuperLearner` to fit the individual regression functions, taking care to use the same cross-fitting folds in each regression. We will then create two groups of validation folds for sample-splitting. For this analysis, we will use `V = 2` folds for cross-fitted variable importance estimation (as we did in the [main vignette](introduction-to-vimp.html)). Note that this entails running `CV.SuperLearner` with $2V = 4$ folds.
-
-First, we estimate the regression function based on all variables:
-```{r estimate-full-regression-with-cf, message = FALSE, warning = FALSE}
-learners <- "SL.ranger"
-# estimate the full regression function
-V <- 2
-set.seed(4747)
-full_cv_fit <- suppressWarnings(
-  SuperLearner::CV.SuperLearner(
-    Y = y, X = X, SL.library = learners, cvControl = list(V = 2 * V),
-    innerCvControl = list(list(V = V)), family = binomial()
-)
-)
-# get a numeric vector of cross-fitting folds
-cross_fitting_folds <- get_cv_sl_folds(full_cv_fit$folds)
-# get sample splitting folds
-set.seed(1234)
-sample_splitting_folds <- make_folds(unique(cross_fitting_folds), V = 2)
-full_cv_preds <- full_cv_fit$SL.predict
-```
-
-Next, to estimate the importance of each variable, we need to estimate the reduced regression function for each variable:
-```{r estimate-reduced-regressions-with-cf, message = FALSE, warning = FALSE}
-vars <- names(X)[(geog_indx + 1):ncol(X)]
-set.seed(1234)
-for (i in seq_len(length(vars))) {
-  # use "eval" and "parse" to assign the objects of interest to avoid duplicating code
-  eval(parse(text = paste0("reduced_", vars[i], "_cv_fit <- suppressWarnings(SuperLearner::CV.SuperLearner(
-    Y = y, X = X[, -(geog_indx + i), drop = FALSE], SL.library = learners,
-    cvControl = SuperLearner::SuperLearner.CV.control(V = 2 * V, validRows = full_cv_fit$folds),
-    innerCvControl = list(list(V = V)), family = binomial()
-))")))
-  eval(parse(text = paste0("reduced_", vars[i], "_cv_preds <- reduced_", vars[i], "_cv_fit$SL.predict")))
-}
-```
-
-Then we can plug these values into `vimp_rsquared()` (or equivalently, `cv_vim()` with `type = "r_squared"`) as follows:
-```{r cf-vims}
-for (i in seq_len(length(vars))) {
-  # again, use "eval" and "parse" to assign the objects of interest to avoid duplicating code
-  eval(parse(text = paste0("cf_", vars[i], "_vim <- vimp_rsquared(Y = y,
-    cross_fitted_f1 = full_cv_preds, cross_fitted_f2 = reduced_", vars[i], "_cv_preds,
-    indx = (geog_indx + i), cross_fitting_folds = cross_fitting_folds,
-    sample_splitting_folds = sample_splitting_folds, run_regression = FALSE, alpha = 0.05,
-    V = V, na.rm = TRUE, scale = 'logit')")))
-}
-cf_ests <- merge_vim(cf_subtype.is.01_AE_vim,
-                     cf_subtype.is.02_AG_vim, cf_subtype.is.07_BC_vim,
-                     cf_subtype.is.A1_vim, cf_subtype.is.A1C_vim,
-                     cf_subtype.is.A1D_vim, cf_subtype.is.B_vim,
-                     cf_subtype.is.C_vim, cf_subtype.is.D_vim,
-                     cf_subtype.is.O_vim, cf_subtype.is.Other_vim,
-                     cf_length.env_vim, cf_length.gp120_vim, cf_length.loop.e_vim,
-                     cf_length.loop.e.outliers_vim, cf_length.v5_vim,
-                     cf_length.v5.outliers_vim)
-all_vars <- c(paste0("Subtype is ", c("01_AE", "02_AG", "07_BC", "A1", "A1C", "A1D",
-                                      "B", "C", "D", "O", "Other")),
-              paste0("Length of ", c("Env", "gp120", "V5", "V5 outliers", "Loop E",
-                                     "Loop E outliers")))
-```
-
-And we can view them all simultaneously by plotting:
-```{r plot-cf-vim, fig.width = 8.5, fig.height = 8}
-library("ggplot2")
-library("cowplot")
-theme_set(theme_cowplot())
-cf_est_plot_tib <- cf_ests$mat %>%
-  mutate(
-    var_fct = rev(factor(s, levels = cf_ests$mat$s,
-                         labels = all_vars[as.numeric(cf_ests$mat$s) - geog_indx],
-                         ordered = TRUE))
-  )
-
-# plot
-cf_est_plot_tib %>%
-  ggplot(aes(x = est, y = var_fct)) +
-  geom_point() +
-  geom_errorbarh(aes(xmin = cil, xmax = ciu)) +
-  xlab(expression(paste("Variable importance estimates: ", R^2, sep = ""))) +
-  ylab("") +
-  ggtitle("Estimated individual feature importance") +
-  labs(subtitle = "in the VRC01 data (considering only geographic confounders, subtype, and viral geometry)")
-```
-
-## Estimating variable importance for a group of variables using precomputed regression function estimates
-
-Finally, we can estimate and plot group importance:
-```{r cf-group-vim, fig.width = 8.5, fig.height = 8, warning = FALSE}
-set.seed(91011)
-reduced_subtype_cv_fit <- suppressWarnings(
-  SuperLearner::CV.SuperLearner(
-    Y = y, X = X[, -c(5:15), drop = FALSE], SL.library = learners,
-    cvControl = SuperLearner::SuperLearner.CV.control(V = 2 * V, validRows = full_cv_fit$folds),
-    innerCvControl = list(list(V = V)), family = binomial()
-)
-)
-reduced_subtype_cv_preds <- reduced_subtype_cv_fit$SL.predict
-reduced_geometry_cv_fit <- suppressWarnings(
-  SuperLearner::CV.SuperLearner(
-    Y = y, X = X[, -c(16:21), drop = FALSE], SL.library = learners,
-    cvControl = SuperLearner::SuperLearner.CV.control(V = 2 * V, validRows = full_cv_fit$folds),
-    innerCvControl = list(list(V = V)), family = binomial()
-)
-)
-reduced_geometry_cv_preds <- reduced_geometry_cv_fit$SL.predict
-cf_subtype_vim <- vimp_rsquared(
-  Y = y, cross_fitted_f1 = full_cv_preds, cross_fitted_f2 = reduced_subtype_cv_preds,
-  indx = 5:15, run_regression = FALSE, V = V,
-  cross_fitting_folds = cross_fitting_folds, sample_splitting_folds = sample_splitting_folds,
-  scale = "logit"
-)
-cf_geometry_vim <- vimp_rsquared(
-  Y = y, cross_fitted_f1 = full_cv_preds, cross_fitted_f2 = reduced_geometry_cv_preds,
-  indx = 16:21, run_regression = FALSE, V = V,
-  cross_fitting_folds = cross_fitting_folds, sample_splitting_folds = sample_splitting_folds,
-  scale = "logit"
-)
-cf_groups <- merge_vim(cf_subtype_vim, cf_geometry_vim)
-all_grp_nms <- c("Viral subtype", "Viral geometry")
-
-grp_plot_tib <- cf_groups$mat %>%
-  mutate(
-    grp_fct = factor(case_when(
-      s == "5,6,7,8,9,10,11,12,13,14,15" ~ "1",
-      s == "16,17,18,19,20,21" ~ "2"
-    ), levels = c("1", "2"), labels = all_grp_nms, ordered = TRUE)
-  )
-grp_plot_tib %>%
-  ggplot(aes(x = est, y = grp_fct)) +
-  geom_point() +
-  geom_errorbarh(aes(xmin = cil, xmax = ciu)) +
-  xlab(expression(paste("Variable importance estimates: ", R^2, sep = ""))) +
-  ylab("") +
-  ggtitle("Estimated feature group importance") +
-  labs(subtitle = "in the VRC01 data (considering only geographic confounders, subtype, and viral geometry)")
-```
-
-## Conclusion
-
-In this document, we learned a second method for computing variable importance estimates: rather than having `vimp` run all regression functions for you, you can compute your own regressions and pass these to `vimp`. The results are equivalent, but there is a tradeoff: what you save in computation time by only computing the full regression once must be balanced with the mental overhead of correctly computing the regressions. Additionally, this task is more difficult when using cross-fitted variable importance, which I recommend in nearly all cases when using flexible machine learning tools.
-
-## References
Williamson" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Using precomputed regression function estimates in `vimp`} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +csl: chicago-author-date.csl +bibliography: vimp_bib.bib +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup, message = FALSE} +library("vimp") +library("SuperLearner") +``` + +## Introduction + +In the [main vignette](introduction-to-vimp.html), we analyzed the VRC01 data [@magaret2019], a subset of the data freely available from the Los Alamos National Laboratory's Compile, Neutralize, and Tally Neutralizing Antibody Panels database. Information about these data is available [here](https://doi.org/10.1371/journal.pcbi.1006952). In each of the analyses, I used `run_regression = TRUE`. In this vignette, I discuss how to use precomputed regression function estimates with `vimp`. The results of this analysis replicate the analysis in the [main vignette](introduction-to-vimp.html). + +```{r load-vrc01-data} +# read in the data +data("vrc01") +# subset to the columns of interest for this analysis +library("dplyr") +library("tidyselect") +# retain only the columns of interest for this analysis +y <- vrc01$ic50.censored +X <- vrc01 %>% + select(starts_with("geog"), starts_with("subtype"), starts_with("length")) +set.seed(1234) +vrc01_folds <- make_folds(y = y, V = 2) +``` + +## Using precomputed regression function estimates without cross-fitting + +### A first approach: linear regression + +As in the [main vignette](introduction-to-vimp.html), we first start by fitting only linear regression models. In this section, we use the function `vim()`; this function does not use cross-fitting to estimate variable importance, and greatly simplifies the code for precomputed regression models. 
+## Estimating variable importance for a single variable using precomputed regression function estimates
+
+In this section, we will use cross-fitting along with precomputed estimates of the regression functions. This can be especially useful if you have already run a call to `CV.SuperLearner`, since that function returns predictions on each observation from the fold in which that observation was held out. However, while this approach can save you some computation time, it requires a hefty amount of mental overhead.
+
+We will use `CV.SuperLearner` to fit the individual regression functions, taking care to use the same cross-fitting folds in each regression. We will then create two groups of validation folds for sample-splitting. For this analysis, we will use `V = 2` folds for cross-fitted variable importance estimation (as we did in the [main vignette](introduction-to-vimp.html)). Note that this entails running `CV.SuperLearner` with $2V = 4$ folds.
+
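+To make the fold bookkeeping concrete, here is a toy sketch (with made-up fold labels, not the VRC01 data). Each cross-fitting fold (not each observation) is assigned to one of the two sample-splitting groups; `vimp` then estimates the full regression's predictiveness on one group of folds and the reduced regression's predictiveness on the other.
+
+```{r toy-fold-bookkeeping}
+# toy example: 8 observations assigned to 2V = 4 cross-fitting folds
+toy_cross_fitting_folds <- rep(1:4, each = 2)
+set.seed(5678)
+# assign each fold (not each observation) to one of two sample-splitting groups
+toy_sample_splitting_folds <- make_folds(unique(toy_cross_fitting_folds), V = 2)
+# the sample-splitting group to which each observation's fold belongs
+toy_sample_splitting_folds[toy_cross_fitting_folds]
+```
+
+With that bookkeeping in mind, we turn to the actual analysis.
+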
+First, we estimate the regression function based on all variables:
+```{r estimate-full-regression-with-cf, message = FALSE, warning = FALSE}
+learners <- "SL.ranger"
+# estimate the full regression function
+V <- 2
+set.seed(4747)
+full_cv_fit <- suppressWarnings(
+  SuperLearner::CV.SuperLearner(
+    Y = y, X = X, SL.library = learners, cvControl = list(V = 2 * V),
+    innerCvControl = list(list(V = V)), family = binomial()
+  )
+)
+# get a numeric vector of cross-fitting folds
+cross_fitting_folds <- get_cv_sl_folds(full_cv_fit$folds)
+# get sample-splitting folds
+set.seed(1234)
+sample_splitting_folds <- make_folds(unique(cross_fitting_folds), V = 2)
+full_cv_preds <- full_cv_fit$SL.predict
+```
+
+Next, we estimate the reduced regression function for each variable whose importance we wish to assess:
+```{r estimate-reduced-regressions-with-cf, message = FALSE, warning = FALSE}
+vars <- names(X)[(geog_indx + 1):ncol(X)]
+set.seed(1234)
+for (i in seq_len(length(vars))) {
+  # use "eval" and "parse" to assign the objects of interest, avoiding duplicated code
+  eval(parse(text = paste0("reduced_", vars[i], "_cv_fit <- suppressWarnings(SuperLearner::CV.SuperLearner(
+    Y = y, X = X[, -(geog_indx + i), drop = FALSE], SL.library = learners,
+    cvControl = SuperLearner::SuperLearner.CV.control(V = 2 * V, validRows = full_cv_fit$folds),
+    innerCvControl = list(list(V = V)), family = binomial()
+))")))
+  eval(parse(text = paste0("reduced_", vars[i], "_cv_preds <- reduced_", vars[i], "_cv_fit$SL.predict")))
+}
+```
+
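+As an aside, the same fits can be stored in a named list if you prefer to avoid `eval(parse())`; this sketch (not run here, to avoid refitting the regressions) is an equivalent pattern under the same setup as the chunk above.
+
+```{r reduced-regressions-list, eval = FALSE}
+# store the reduced fits in a named list, one element per variable
+reduced_cv_fits <- lapply(seq_along(vars), function(i) {
+  suppressWarnings(SuperLearner::CV.SuperLearner(
+    Y = y, X = X[, -(geog_indx + i), drop = FALSE], SL.library = learners,
+    cvControl = SuperLearner::SuperLearner.CV.control(V = 2 * V, validRows = full_cv_fit$folds),
+    innerCvControl = list(list(V = V)), family = binomial()
+  ))
+})
+names(reduced_cv_fits) <- vars
+# extract the cross-fitted predictions from each reduced fit
+reduced_cv_preds_list <- lapply(reduced_cv_fits, `[[`, "SL.predict")
+```
+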
+Then we can plug the cross-fitted predictions into `vimp_rsquared()` (or, equivalently, `cv_vim()` with `type = "r_squared"`) as follows:
+```{r cf-vims}
+for (i in seq_len(length(vars))) {
+  # again, use "eval" and "parse" to assign the objects of interest
+  eval(parse(text = paste0("cf_", vars[i], "_vim <- vimp_rsquared(Y = y,
+    cross_fitted_f1 = full_cv_preds, cross_fitted_f2 = reduced_", vars[i], "_cv_preds,
+    indx = (geog_indx + i), cross_fitting_folds = cross_fitting_folds,
+    sample_splitting_folds = sample_splitting_folds, run_regression = FALSE, alpha = 0.05,
+    V = V, na.rm = TRUE, scale = 'logit')")))
+}
+cf_ests <- merge_vim(cf_subtype.is.01_AE_vim,
+                     cf_subtype.is.02_AG_vim, cf_subtype.is.07_BC_vim,
+                     cf_subtype.is.A1_vim, cf_subtype.is.A1C_vim,
+                     cf_subtype.is.A1D_vim, cf_subtype.is.B_vim,
+                     cf_subtype.is.C_vim, cf_subtype.is.D_vim,
+                     cf_subtype.is.O_vim, cf_subtype.is.Other_vim,
+                     cf_length.env_vim, cf_length.gp120_vim, cf_length.loop.e_vim,
+                     cf_length.loop.e.outliers_vim, cf_length.v5_vim,
+                     cf_length.v5.outliers_vim)
+all_vars <- c(paste0("Subtype is ", c("01_AE", "02_AG", "07_BC", "A1", "A1C", "A1D",
+                                      "B", "C", "D", "O", "Other")),
+              paste0("Length of ", c("Env", "gp120", "V5", "V5 outliers", "Loop E",
+                                     "Loop E outliers")))
+```
+
+We can then view them all simultaneously by plotting:
+```{r plot-cf-vim, fig.width = 8.5, fig.height = 8}
+library("ggplot2")
+library("cowplot")
+theme_set(theme_cowplot())
+cf_est_plot_tib <- cf_ests$mat %>%
+  mutate(
+    var_fct = rev(factor(s, levels = cf_ests$mat$s,
+                         labels = all_vars[as.numeric(cf_ests$mat$s) - geog_indx],
+                         ordered = TRUE))
+  )
+
+# plot
+cf_est_plot_tib %>%
+  ggplot(aes(x = est, y = var_fct)) +
+  geom_point() +
+  geom_errorbarh(aes(xmin = cil, xmax = ciu)) +
+  xlab(expression(paste("Variable importance estimates: ", R^2, sep = ""))) +
+  ylab("") +
+  ggtitle("Estimated individual feature importance") +
+  labs(subtitle = "in the VRC01 data (considering only geographic confounders, subtype, and viral geometry)")
+```
+
+## Estimating variable importance for a group of variables using precomputed regression function estimates
+
+Finally, we can estimate and plot group importance:
+```{r cf-group-vim, fig.width = 8.5, fig.height = 8, warning = FALSE}
+set.seed(91011)
+reduced_subtype_cv_fit <- suppressWarnings(
+  SuperLearner::CV.SuperLearner(
+    Y = y, X = X[, -c(5:15), drop = FALSE], SL.library = learners,
+    cvControl = SuperLearner::SuperLearner.CV.control(V = 2 * V, validRows = full_cv_fit$folds),
+    innerCvControl = list(list(V = V)), family = binomial()
+  )
+)
+reduced_subtype_cv_preds <- reduced_subtype_cv_fit$SL.predict
+reduced_geometry_cv_fit <- suppressWarnings(
+  SuperLearner::CV.SuperLearner(
+    Y = y, X = X[, -c(16:21), drop = FALSE], SL.library = learners,
+    cvControl = SuperLearner::SuperLearner.CV.control(V = 2 * V, validRows = full_cv_fit$folds),
+    innerCvControl = list(list(V = V)), family = binomial()
+  )
+)
+reduced_geometry_cv_preds <- reduced_geometry_cv_fit$SL.predict
+cf_subtype_vim <- vimp_rsquared(
+  Y = y, cross_fitted_f1 = full_cv_preds, cross_fitted_f2 = reduced_subtype_cv_preds,
+  indx = 5:15, run_regression = FALSE, V = V,
+  cross_fitting_folds = cross_fitting_folds, sample_splitting_folds = sample_splitting_folds,
+  scale = "logit"
+)
+cf_geometry_vim <- vimp_rsquared(
+  Y = y, cross_fitted_f1 = full_cv_preds, cross_fitted_f2 = reduced_geometry_cv_preds,
+  indx = 16:21, run_regression = FALSE, V = V,
+  cross_fitting_folds = cross_fitting_folds, sample_splitting_folds = sample_splitting_folds,
+  scale = "logit"
+)
+cf_groups <- merge_vim(cf_subtype_vim, cf_geometry_vim)
+all_grp_nms <- c("Viral subtype", "Viral geometry")
+
+grp_plot_tib <- cf_groups$mat %>%
+  mutate(
+    grp_fct = factor(case_when(
+      s == "5,6,7,8,9,10,11,12,13,14,15" ~ "1",
+      s == "16,17,18,19,20,21" ~ "2"
+    ), levels = c("1", "2"), labels = all_grp_nms, ordered = TRUE)
+  )
+grp_plot_tib %>%
+  ggplot(aes(x = est, y = grp_fct)) +
+  geom_point() +
+  geom_errorbarh(aes(xmin = cil, xmax = ciu)) +
+  xlab(expression(paste("Variable importance estimates: ", R^2, sep = ""))) +
+  ylab("") +
+  ggtitle("Estimated feature group importance") +
+  labs(subtitle = "in the VRC01 data (considering only geographic confounders, subtype, and viral geometry)")
+```
+
+## Conclusion
+
+In this document, we learned a second method for computing variable importance estimates: rather than having `vimp` run all regression functions for you, you can compute your own regressions and pass these to `vimp`. The results are equivalent, but there is a tradeoff: what you save in computation time by fitting each regression only once must be balanced against the mental overhead of computing the regressions correctly yourself. Additionally, this task is more difficult when using cross-fitted variable importance, which I recommend in nearly all cases when using flexible machine learning tools.
+
+## References