diff --git a/R/InterXshift.R b/R/InterXshift.R index fef7012..b9524a3 100644 --- a/R/InterXshift.R +++ b/R/InterXshift.R @@ -124,6 +124,7 @@ InterXshift <- function(w, colnames(w) <- w_names } + # coerce W to matrix and, if no names in W, assign them generically a <- data.frame(a) a_names <- colnames(a) @@ -311,9 +312,6 @@ InterXshift <- function(w, } - - - for (i in 1:nrow(fold_negative_effects)) { exposure <- fold_negative_effects$Variable[i] @@ -477,9 +475,9 @@ InterXshift <- function(w, syngery_in_fold <- calc_final_joint_shift_param( joint_shift_fold_results = intxn_results_list, - rank, - fold_k, - deltas_updated, + rank = rank, + fold_k = fold_k, + deltas_updated = deltas_updated, exposures = exposures ) @@ -574,10 +572,10 @@ InterXshift <- function(w, antagonism_in_fold <- calc_final_joint_shift_param( joint_shift_fold_results = intxn_results_list, - rank, - fold_k, - deltas_updated, - exposures + rank = rank, + fold_k = fold_k, + deltas_updated = deltas_updated, + exposures = exposures ) antagonism_rank_fold_results[[ @@ -608,13 +606,13 @@ InterXshift <- function(w, results_list }, - .options = furrr::furrr_options(seed = seed, packages = "SuperNOVA") + .options = furrr::furrr_options(seed = seed, packages = "InterXshift") ) - top_positive_results <- purrr::map(fold_SuperNOVA_results, c("top_positive_effects")) - top_negative_results <- purrr::map(fold_SuperNOVA_results, c("top_negative_effects")) - top_synergy_results <- purrr::map(fold_SuperNOVA_results, c("top_synergy_effects")) - top_antagonism_results <- purrr::map(fold_SuperNOVA_results, c("top_antagonism_effects")) + top_positive_results <- purrr::map(fold_InterXshift_results, c("top_positive_effects")) + top_negative_results <- purrr::map(fold_InterXshift_results, c("top_negative_effects")) + top_synergy_results <- purrr::map(fold_InterXshift_results, c("top_synergy_effects")) + top_antagonism_results <- purrr::map(fold_InterXshift_results, c("top_antagonism_effects")) top_positive_results <- unlist(top_positive_results, recursive = FALSE) top_negative_results <- unlist(top_negative_results, recursive = FALSE) @@ -636,7 +634,7 @@ InterXshift <- function(w, n_folds = n_folds ) - pooled_intxn_shift_results <- calc_pooled_intxn_shifts( + pooled_synergy_shift_results <- calc_pooled_intxn_shifts( intxn_shift_results = top_synergy_results, estimator = estimator, a_names = a_names, @@ -646,13 +644,24 @@ InterXshift <- function(w, n_folds = n_folds ) + pooled_antagonism_shift_results <- calc_pooled_intxn_shifts( + intxn_shift_results = top_antagonism_results, + estimator = estimator, + a_names = a_names, + w_names = w_names, + z_names = z_names, + fluctuation = fluctuation, + n_folds = n_folds + ) + results_list <- list( - "Basis Fold Proportions" = basis_prop_in_fold, - "Effect Mod Results" = pooled_em_shift_results, - "Indiv Shift Results" = pooled_indiv_shift_results, - "Joint Shift Results" = pooled_intxn_shift_results, - "Mediation Shift Results" = pooled_med_shift_results + "Pos Shift Results" = pooled_pos_shift_results, + "Neg Shift Results" = pooled_neg_shift_results, + "K Fold Synergy Results" = pooled_synergy_shift_results$k_fold_results, + "Pooled Synergy Results" = pooled_synergy_shift_results$pooled_results, + "K Fold Antagonism Results" = pooled_antagonism_shift_results$k_fold_results, + "Pooled Antagonism Results" = pooled_antagonism_shift_results$pooled_results ) return(results_list) diff --git a/R/calc_pooled_intxn_shifts.R b/R/calc_pooled_intxn_shifts.R index 245d1b5..7c9d38f 100644 --- 
a/R/calc_pooled_intxn_shifts.R +++ b/R/calc_pooled_intxn_shifts.R
@@ -119,23 +119,21 @@ calc_pooled_intxn_shifts <- function(intxn_shift_results,
    intxn_results_list[[i]] <- tmle_fit
  }
-    # var_names <- extract_vars_from_basis(
-    #   var_set, 1,
-    #   a_names, w_names, z_names
-    # )
-
    intxn_pooled <- calc_final_joint_shift_param(
      joint_shift_fold_results = intxn_results_list,
      rank = var_set,
      fold_k = "Pooled TMLE",
-      deltas_updated = deltas
+      deltas_updated = deltas,
+      exposures = c("Var 1", "Var 2", "Joint", "Interaction")
    )
+    rownames(k_fold_results) <- NULL
+
    pooled_results_list[[var_set]] <- intxn_pooled
    k_fold_results_list[[var_set]] <- k_fold_results
  }
-  return(results_list)
+  return(list("k_fold_results" = k_fold_results_list, "pooled_results" = pooled_results_list))
}
diff --git a/R/final_result_utils.R b/R/final_result_utils.R index 20b998a..fab4bc7 100644
--- a/R/final_result_utils.R
+++ b/R/final_result_utils.R
@@ -81,7 +81,7 @@ calculatePooledEstimate <- function(results_df, n_folds, delta = NULL) {
#' @export
calc_final_ind_shift_param <- function(tmle_fit, exposure, fold_k) {
  condition <- exposure
-  psi_param <- tmle_fit$noshift_psi - tmle_fit$psi
+  psi_param <- tmle_fit$psi - tmle_fit$noshift_psi
  variance_est <- var(tmle_fit$eif - tmle_fit$noshift_eif) / length(tmle_fit$eif)
  se_est <- sqrt(variance_est)
@@ -274,6 +274,7 @@ calc_final_effect_mod_param <- function(tmle_fit_av,
#' @title Calculates the Joint Shift Parameter
#' @description Estimates the shift parameter for a joint shift
#' @param joint_shift_fold_results Results of the joint shift
+#' @param rank Ranking of the interaction found
#' @param exposures Exposures shifted
#' @param fold_k Fold the joint shift is identified
#' @param deltas_updated The new delta, could be updated if Hn has positivity
diff --git a/R/find_synergy_antagonism.R b/R/find_synergy_antagonism.R index ecb092c..af60442 100644
--- a/R/find_synergy_antagonism.R
+++ b/R/find_synergy_antagonism.R
@@ -68,17 +68,17 @@ find_synergy_antagonism <- function(data, deltas, a_names, w_names, outcome, out
    shifted_data <- as.data.frame(data)
    shifted_data[[var]] <- shifted_data[[var]] + deltas[[var]]
    predictions <- sl_fit$predict(sl3::make_sl3_Task(data = shifted_data, covariates = c(a_names, w_names), outcome = outcome))
-    effect <- mean(data[[outcome]] - predictions)
+    effect <- mean(predictions - data[[outcome]])
    individual_effects_df <- rbind(individual_effects_df, data.frame(Variable = var, Effect = effect))
  }
  # Rank individual effects
  individual_effects_df <- individual_effects_df[order(-individual_effects_df$Effect), ]
-  top_positive_effects <- head(individual_effects_df[individual_effects_df$Effect > 0, ], top_n)
-  top_negative_effects <- tail(individual_effects_df[individual_effects_df$Effect < 0, ], top_n)
+  top_positive_effects <- head(individual_effects_df, top_n)
+  top_negative_effects <- tail(individual_effects_df, top_n)
  top_positive_effects$Rank <- seq(top_n)
-  top_negative_effects$Rank <- seq(top_n)
+  top_negative_effects$Rank <- rev(seq(top_n))
  # Calculate interaction effects
  for (indices in combn(seq_along(a_names), 2, simplify = FALSE)) {
@@ -93,11 +93,11 @@
  # Rank interaction effects
  interaction_effects_df <- interaction_effects_df[order(-interaction_effects_df$Effect), ]
-  top_synergistic_interactions <- head(interaction_effects_df[interaction_effects_df$Effect > 0, ], top_n)
-  top_antagonistic_interactions <- tail(interaction_effects_df[interaction_effects_df$Effect < 0, ], top_n)
+  top_synergistic_interactions <- head(interaction_effects_df, top_n)
+  top_antagonistic_interactions <- tail(interaction_effects_df, top_n)
  top_synergistic_interactions$Rank <- seq(top_n)
-  top_antagonistic_interactions$Rank <- seq(top_n)
+  top_antagonistic_interactions$Rank <- rev(seq(top_n))
  # Return the results
  return(list(
diff --git a/README.Rmd b/README.Rmd index 68f717b..01e96fc 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -15,64 +15,51 @@ knitr::opts_chunk$set( ) ```
-# R/`SuperNOVA`
+# R/`InterXshift`
-[![R-CMD-check](https://github.com/blind-contours/SuperNOVA/workflows/R-CMD-check/badge.svg)](https://github.com/blind-contours/SuperNOVA/actions)
-[![Coverage Status](https://img.shields.io/codecov/c/github/blind-contours/SuperNOVA/master.svg)](https://codecov.io/github/blind-contours/SuperNOVA?branch=master)
-[![CRAN](https://www.r-pkg.org/badges/version/SupernOVA)](https://www.r-pkg.org/pkg/SuperNOVA)
-[![CRAN downloads](https://cranlogs.r-pkg.org/badges/SuperNOVA)](https://CRAN.R-project.org/package=SuperNOVA)
-[![CRAN total downloads](http://cranlogs.r-pkg.org/badges/grand-total/SuperNOVA)](https://CRAN.R-project.org/package=SuperNOVA)
+[![R-CMD-check](https://github.com/blind-contours/InterXShift/workflows/R-CMD-check/badge.svg)](https://github.com/blind-contours/InterXShift/actions)
+[![Coverage Status](https://img.shields.io/codecov/c/github/blind-contours/InterXShift/master.svg)](https://codecov.io/github/blind-contours/InterXShift?branch=master)
+[![CRAN](https://www.r-pkg.org/badges/version/InterXShift)](https://www.r-pkg.org/pkg/InterXShift)
+[![CRAN downloads](https://cranlogs.r-pkg.org/badges/InterXShift)](https://CRAN.R-project.org/package=InterXShift)
+[![CRAN total downloads](http://cranlogs.r-pkg.org/badges/grand-total/InterXShift)](https://CRAN.R-project.org/package=InterXShift)
[![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)
[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)
-> Efficient Estimation of the Causal Effects of Non-Parametric Interactions, Effect Modifications and Mediation using Stochastic Interventions
+> Interaction Identification and Estimation using Data-Adaptive Stochastic Interventions
__Authors:__ [David McCoy](https://davidmccoy.org)
---
-## What's `SuperNOVA`?
+## What's `InterXshift`?
-The `SuperNOVA` R package offers a comprehensive toolset for identifying predictive variable sets, be it subsets of exposures, exposure-covariates, or exposure-mediators, for a specified outcome. It further assists in creating efficient estimators for the counterfactual mean of the outcome under stochastic interventions on these variable sets. This means making exposure changes that depend on naturally observed values, as described in past literature [@diaz2012population; @haneuse2013estimation].
+The `InterXshift` R package offers an approach that identifies and estimates the impact of interactions in a mixed exposure on an outcome. We define interaction as the counterfactual mean of the outcome under stochastic interventions on two exposures jointly compared to the additive counterfactual mean of the two exposures intervened on independently. These interventions or exposure changes depend on naturally observed values, as described in past literature [@diaz2012population; @haneuse2013estimation].
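+To make this concrete, here is a minimal sketch of the estimand that definition implies, written for two generic exposures $A_1, A_2$ with shifts $\delta_1, \delta_2$ (this notation is illustrative, not the package's internal naming):
+
+$$\psi_{\text{int}} = E[Y(A_1 + \delta_1, A_2 + \delta_2)] - E[Y(A_1 + \delta_1, A_2)] - E[Y(A_1, A_2 + \delta_2)] + E[Y(A_1, A_2)]$$
+
+Positive values of $\psi_{\text{int}}$ indicate synergy (the joint shift moves the outcome more than the two individual shifts combined); negative values indicate antagonism.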
-`SuperNOVA` introduces several estimators, constructed based on the patterns found in the data. At present, semi-parametric estimators are available for various contexts: interaction, effect modification, marginal impacts, and mediation. The target parameters that this package calculates are grounded in previously published works: one regarding interaction and effect modification [@mccoy2023semiparametric] and another dedicated to mediation [McCoy2023mediation].
+`InterXshift` builds on work described in [@mccoy2023semiparametric]. However, instead of identifying interactions through a semi-parametric definition of an F-statistic and then estimating the interaction target parameter using CV-TMLE pooled across exposure sets, we provide a more streamlined approach. In this package, we first identify interactions through g-computation, evaluating the expected outcome under a joint shift compared to the sum of the individual shifts using a Super Learner (a minimal sketch of this ranking step is given at the end of this section). We then rank these estimates to find the exposure sets with the strongest synergy and antagonism, and use CV-TMLE to estimate and pool the target parameter within each rank.
-The `SuperNOVA` package builds upon the capabilities of the `txshift` package, which implements the TML estimator for a stochastic shift causal parameter [@diaz2012population]. A notable extension in SuperNOVA is its support for joint stochastic interventions on two exposures. This allows for the creation of a non-parametric interaction parameter, shedding light on the combined effect of concurrently shifting two variables relative to the cumulative effect of individual shifts. At this stage, the focus is on two-way shifts.
-Various parameters are calculated based on identified patterns:
+The package ensures robustness by employing a k-fold cross-validation framework. This framework helps in estimating a data-adaptive parameter, namely the stochastic shift target parameters for the exposure sets identified as having synergy or antagonism. The process begins by partitioning the data into parameter-generating and estimation samples. In the parameter-generating sample, we identify our ranks of antagonistic and synergistic exposure sets through a machine learning g-computation framework. In the estimation sample, we then estimate our interaction target parameter using the doubly robust estimator TMLE, whose asymptotic efficiency allows us to construct valid confidence intervals for our estimates (which the g-computation ranking step does not provide).
-* Interaction Parameter: When two exposures show signs of interaction, this parameter is calculated.
+By using InterXshift, users get access to a tool that offers both k-fold specific and aggregated results for the top synergistic and antagonistic relationships, ensuring that researchers can glean the most information from their data. For a more in-depth exploration, there's an accompanying vignette.
-* Individual Stochastic Interventions: When marginal effects are identified, this estimates the difference in outcomes upon making a specific shift compared to no intervention.
+To utilize the package, users need to provide vectors for exposures, covariates, and outcomes. They also specify the respective $\delta$ for each exposure (indicating the degree of shift) and whether this delta should be adaptive in response to positivity violations. The `top_n` parameter defines the number of top-ranked synergistic, antagonistic, positive, and negative impacts to estimate. A detailed guide is provided in the vignette.
With these inputs, `InterXshift` processes the data and delivers tables showcasing fold-specific results and aggregated outcomes, allowing users to glean insights effectively.
-* Effect Modification: In scenarios where effect modification is evident, the target parameter for it is the mean outcome under intervention within specific regions of the covariate space that the data suggests. This contrasts with the marginal effect by diving deeper into strata of covariates, for example, gauging the effects of exposure shifts among distinct genders.
-
-* Mediation: In scenarios where mediation is found, the target parameter are the natural direct and indirect effects as defined by stochastic intervention. That is, for the natural indirect effect, `SuperNOVA` estimates the impact of shifting an exposure through a mediator and the direct effect which is not through the mediator.
-
-Mediation in `SuperNOVA` is a new feature. Here, `SuperNOVA` brings to the table estimates initially conceived in another work [@Diaz2020a], while also supporting mediation estimates for continuous exposures.
-
-The package ensures robustness by employing a k-fold cross-validation framework. This framework helps in estimating a data-adaptive parameter, which is the stochastic shift target parameters for the variable sets identified as influential for the outcome. The process begins by partitioning the data into parameter-generating and estimation samples. The former sample assists in fitting a collection of basis function estimators to the data. The one with the lowest cross-validated mean squared error is selected. Key variable sets are then extracted using an ANOVA-like variance decomposition methodology. For the estimation sample, targeted learning is harnessed to gauge causal target parameters across different contexts: interaction, mediation, effect modification, and individual variable shifts.
-
-By using SuperNOVA, users get access to a tool that offers both k-fold specific and aggregated results for each target parameter, ensuring that researchers can glean the most information from their data. For a more in-depth exploration, there's an accompanying vignette.
-
-To utilize the package, users need to provide vectors for exposures, covariates, mediators, and outcomes. They also specify the respective $\delta$ for each exposure (indicating the degree of shift) and if this delta should be adaptive in response to positivity violations. A detailed guide is provided in the vignette. With these inputs, `SuperNOVA` processes the data and delivers tables showcasing fold-specific results and aggregated outcomes, allowing users to glean insights effectively.
+`InterXshift` also incorporates features from the `sl3` package [@coyle-sl3-rpkg], facilitating ensemble machine learning in the estimation process. If the user does not specify any stack parameters, `InterXshift` will automatically create an ensemble of machine learning algorithms that strike a balance between flexibility and computational efficiency.
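+For intuition, the following is a minimal sketch of the g-computation ranking step described above, simplified from the internal `find_synergy_antagonism()` logic. It assumes a fitted `sl3` Super Learner `sl_fit`, a data frame `data`, name vectors `a_names`/`w_names`, an outcome name `outcome`, and a named list `deltas`; it is illustrative, not the exported API:
+
+```{r gcomp-ranking-sketch, eval = FALSE}
+# expected outcome after shifting one or more exposures, via g-computation
+shift_mean <- function(vars) {
+  shifted <- as.data.frame(data)
+  for (v in vars) shifted[[v]] <- shifted[[v]] + deltas[[v]]
+  task <- sl3::make_sl3_Task(
+    data = shifted, covariates = c(a_names, w_names), outcome = outcome
+  )
+  mean(sl_fit$predict(task))
+}
+
+# interaction score for the pair (X1, X2): joint shift vs the sum of the
+# individual shifts; positive values suggest synergy, negative antagonism
+baseline <- mean(data[[outcome]])
+score <- (shift_mean(c("X1", "X2")) - baseline) -
+  ((shift_mean("X1") - baseline) + (shift_mean("X2") - baseline))
+```
+
+A score like this is computed for every exposure pair and ranked; only the `top_n` pairs are carried forward to CV-TMLE estimation.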
---
## Installation
-*Note:* Because the `SuperNOVA` package (currently) depends on `sl3` that
+*Note:* Because the `InterXshift` package (currently) depends on `sl3` that
allows ensemble machine learning to be used for nuisance parameter
-estimation and `sl3` is not on CRAN the `SuperNOVA` package is not
+estimation and `sl3` is not on CRAN the `InterXshift` package is not
available on CRAN and must be downloaded here.
-There are many depedencies for `SuperNOVA` so it's easier to break up
+There are many dependencies for `InterXshift` so it's easier to break up
installation of the various packages to ensure proper installation.
First install the basis estimators used in the data-adaptive
@@ -83,10 +70,10 @@
install.packages("earth")
install.packages("hal9001")
```
-`SuperNOVA` uses the `sl3` package to build ensemble machine learners for each nuisance parameter.
+`InterXshift` uses the `sl3` package to build ensemble machine learners for each nuisance parameter.
We have to install off the development branch, first download these two packages for `sl3`
-```{r super-learner-packages, eval = FALSE}
+```{r InterXshift-packages, eval = FALSE}
install.packages(c("ranger", "arm", "xgboost", "nnls"))
```
Now install `sl3` on devel:
remotes::install_github("tlverse/sl3@devel")
```
-Make sure `sl3` installs correctly then install `SuperNOVA`
+Make sure `sl3` installs correctly, then install `InterXshift`
-```{r SuperNOVA_install, eval = FALSE}
-remotes::install_github("blind-contours/SuperNOVA@main")
+```{r InterXshift_install, eval = FALSE}
+remotes::install_github("blind-contours/InterXshift@main")
```
-`SuperNOVA` has some other miscellaneous dependencies that are used in the examples as well as in the plotting functions.
+`InterXshift` has some other miscellaneous dependencies that are used in the examples as well as in the plotting functions.
```{r msc_package, eval = FALSE}
install.packages(c("kableExtra", "hrbrthemes", "viridis"))
```
@@ -112,59 +99,48 @@ install.packages(c("kableExtra", "hrbrthemes", "viridis"))
## Example
-To illustrate how `SuperNOVA` may be used to ascertain the effect of a mixed exposure, consider the following example:
+To illustrate how `InterXshift` may be used to ascertain the effect of a mixed exposure, consider the following example:
```{r example, warning=FALSE}
-library(SuperNOVA)
+library(InterXshift)
library(devtools)
library(kableExtra)
library(sl3)
set.seed(429153)
# simulate simple data
-n_obs <- 100000
+n_obs <- 10000
```
-The `simulate_data` is a function for generating synthetic data with a complex structure to study the causal effects of shifting values in the mixtures of exposures. The primary purpose of this simulation is to provide a controlled environment for testing and validating estimates given by the `SuperNOVA` package.
-
-The simulate_data function generates synthetic data for n_obs observations with a pre-specified covariance structure (sigma_mod) and a shift parameter (delta). It simulates four mixture components (M1, M2, M3, M4) and three covariates (W1, W2, W3) with specific relationships between them. The outcome variable Y is generated as a function of these mixtures and covariates.
-
-After generating the data, the function applies a shift (delta) to each mixture component separately and calculates the average treatment effect for each component. Additionally, it calculates the interaction effect of shifting two mixture components simultaneously (m14_intxn).
These ground truth effects can be used for validating and comparing the performance of various causal inference methods on this synthetic dataset.
+We will use synthetic data from NIEHS that was developed to test new mixture methods. This data has built-in strong positive and negative marginal effects and certain interactions. Found here: https://github.com/niehs-prime/2015-NIEHS-MIxtures-Workshop
-The function returns a list containing the generated data and the ground truth effects of the shifts applied to each mixture component, the interaction effect, and the modified effect results based on a specific level in the W3 covariate.
+```{r NIEHS example}
+data("NIEHS_data_1", package = "InterXshift")
+```
+```{r NIEHS Nodes}
+NIEHS_data_1$W <- rnorm(nrow(NIEHS_data_1), mean = 0, sd = 0.1)
+w <- NIEHS_data_1[, c("W", "Z")]
+a <- NIEHS_data_1[, c("X1", "X2", "X3", "X4", "X5", "X6", "X7")]
+y <- NIEHS_data_1$Y
-```{r simulate data, eval = TRUE}
-sim_out <- simulate_data(n_obs = n_obs)
-data <- sim_out$data
-head(data) %>%
-  kbl(caption = "Simulated Data") %>%
+deltas <- list(
+  "X1" = 1, "X2" = 1, "X3" = 1,
+  "X4" = 1, "X5" = 1, "X6" = 1, "X7" = 1
+)
+head(NIEHS_data_1) %>%
+  kbl(caption = "NIEHS Data") %>%
  kable_classic(full_width = F, html_font = "Cambria")
```
-
-And therefore, in `SuperNOVA` we would expect most of the fold CIs to cover this number and the pooled estimate to also cover this true effect. Let's run `SuperNOVA` to see if it correctly identifies the exposures that drive the outcome and any interaction/effect modification that exists in the DGP.
-
-Of note, there are three exposures M1, M2, M3 - M1 and M3 have individual effects and interactions that drive the outcome. There is also effect modification between M3 and W1.
+Based on the data key, we expect X1 to have the strongest positive effect and X5 the strongest negative effect, so we would expect these to take the top ranks for the marginal associations.
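+As a quick, informal way to eyeball these expected directions before running the full procedure, one could fit a crude linear probe. This is purely illustrative: a plain OLS fit is not part of the `InterXshift` workflow and will not capture the non-linear structure the package targets.
+
+```{r marginal-probe, eval = FALSE}
+# sign of each exposure's OLS coefficient as a rough marginal direction
+probe <- lm(Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + Z + W, data = NIEHS_data_1)
+round(coef(probe), 2)
+```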
For interactions, we run `InterXshift` and examine the top-ranked synergy and antagonism results:
-```{r run SuperNOVA, eval = TRUE, message=FALSE, warning=FALSE}
-data_sample <- data[sample(nrow(data), 4000), ]
-
-w <- data_sample[, c("W1", "W2", "W3")]
-a <- data_sample[, c("M1", "M2", "M3", "M4")]
-y <- data_sample$Y
-
-deltas <- list("M1" = 1, "M2" = 1, "M3" = 1, "M4" = 1)
+```{r run InterXshift, eval = TRUE, message=FALSE, warning=FALSE}
ptm <- proc.time()
-sim_results <- SuperNOVA(
+sim_results <- InterXshift(
  w = w,
  a = a,
  y = y,
@@ -172,71 +148,77 @@ sim_results <- SuperNOVA(
  n_folds = 3,
  num_cores = 6,
  outcome_type = "continuous",
-  quantile_thresh = 0,
-  seed = 294580
+  seed = 294580,
+  top_n = 2
)
proc.time() - ptm
-basis_in_folds <- sim_results$`Basis Fold Proportions`
-indiv_shift_results <- sim_results$`Indiv Shift Results`
-em_results <- sim_results$`Effect Mod Results`
-joint_shift_results <- sim_results$`Joint Shift Results`
+## marginal effects
+top_positive_effects <- sim_results$`Pos Shift Results`
+top_negative_effects <- sim_results$`Neg Shift Results`
+
+## interaction effects
+pooled_synergy_effects <- sim_results$`Pooled Synergy Results`
+pooled_antagonism_effects <- sim_results$`Pooled Antagonism Results`
+
+k_fold_synergy_effects <- sim_results$`K Fold Synergy Results`
+k_fold_antagonism_effects <- sim_results$`K Fold Antagonism Results`
+
```
-Let's first look at the variable relationships used in the folds:
-```{r basis in folds}
-basis_in_folds
+```{r pos shift results}
+top_positive_effects %>%
+  kbl(caption = "Rank 1 Positive Stochastic Intervention Results") %>%
+  kable_classic(full_width = F, html_font = "Cambria")
```
-The above list shows that marginal effects for exposures M1 and M4 were found, an interaction for M1 and M4, and effect modification for M3 and W3 - all of which are correct as the outcome is generated from these relationships. There is no effect for M2 which we correctly reject.
+Above we show the findings for the top-ranked positive marginal effect. Here we consistently find X1, which matches what is built into the DGP.
-Let's first look at the results for individual stochastic shifts by delta compared to no shift:
+Next we look at the top negative result:
-```{r individual shift results}
-indiv_shift_results$M1 %>%
-  kbl(caption = "Individual Stochastic Intervention Results for M1") %>%
+```{r top negative results}
+top_negative_effects$`Rank 1` %>%
+  kbl(caption = "Rank 1 Negative Stochastic Intervention Results") %>%
  kable_classic(full_width = F, html_font = "Cambria")
```
-The true effect for a shifted M1 vs observed M1 is:
-```{r m1 truth}
-sim_out$m1_effect
-```
+Here we consistently see X5 as having the strongest negative impact, which also agrees with the true DGP.
-And so we see that we have proper coverage.
-Next we can look at effect modifications:
+Next we look at the top synergy results, defined as the exposure pairs that, when shifted jointly, have the largest positive difference in expected outcome compared to the sum of the individual shifts of the same variables.
-```{r effect modification results}
-em_results$M3W3 %>%
-  kbl(caption = "Effect Modification Stochastic Intervention Results for M3 and W3") %>%
+```{r joint results}
+pooled_synergy_effects$`Rank 1` %>%
+  kbl(caption = "Rank 1 Synergy Stochastic Intervention Results") %>%
  kable_classic(full_width = F, html_font = "Cambria")
```
-Let's first look at the truth:
-```{r em truth}
-sim_out$effect_mod
-```
-When W3 is 1 the truth effect is 11, our estimates are 11 with CI coverage.
When W3 is 0 the truth is 1, our estimate is 1.9 with CIs that cover the truth as well.
-And finally results for the joint shift which is a joint shift compared to additive individual shifts.
+The table above shows the pooled results for the rank 1 synergy interaction. Note that the exposure set deemed to have the strongest synergy may differ between folds, so this pooling may be over different exposure sets. The first line shows the pooled estimate for a shift in the first variable, the second line for the second variable, the third line for the joint shift, and the fourth line for the difference between the joint shift and the sum of the first two lines, that is, the interaction effect (for example, if the two individual estimates are 2 and 3 and the joint estimate is 9, the interaction line is 9 - 5 = 4). In this case, then, we could be pooling over different variables because of inconsistency in which exposure set is ranked first across the folds. Next we look at the k-fold specific results.
-```{r joint results}
-joint_shift_results$M1M4 %>%
-  kbl(caption = "Interactions Stochastic Intervention Results for M1 and M4") %>%
+```{r k-fold synergy}
+k_fold_synergy_effects$`Rank 1` %>%
+  kbl(caption = "K-fold Synergy Stochastic Intervention Results") %>%
  kable_classic(full_width = F, html_font = "Cambria")
```
-Let's look at the truth again:
+Here we see that the pair X4 and X5 was consistently found to have the strongest synergistic interaction across the folds. Therefore, for our pooled parameter, var 1 represents the pooled effects of shifting X4, var 2 represents the pooled effects of shifting X5, joint is X4 and X5 shifted together, and the interaction represents the interaction effect for these two variables.
-```{r interaction truths}
-sim_out$m1_effect
-sim_out$m4_effect
-sim_out$m14_effect
-sim_out$m14_intxn
+
+Next we'll look at the k-fold antagonistic interactions:
+
+```{r k-fold antagonistic}
+k_fold_antagonism_effects$`Rank 1` %>%
+  kbl(caption = "K-fold Antagonistic Stochastic Intervention Results") %>%
+  kable_classic(full_width = F, html_font = "Cambria")
```
-So comparing the results to the above table in the pooled section we can see all our estimates for the marginal shifts, dual shift, and difference between dual and sum of marginals have CIs that cover the truth.
+
+
+Overall, this package provides an implementation for estimating a non-parametric definition of interaction. We define positive values as synergy, meaning the expected outcome under the joint shift is larger than the sum of the individual additive effects. Likewise, we define antagonism as negative values, where the expected outcome under the joint shift is lower than the additive effects.
---
## Issues
If you encounter any bugs or have any specific feature requests, please [file an
-issue](https://github.com/blind-contours/SuperNOVA/issues). Further details
+issue](https://github.com/blind-contours/InterXshift/issues). Further details
on filing issues are provided in our [contribution
guidelines](https://github.com/blind-contours/
-SuperNOVA/main/contributing.md).
+InterXshift/main/contributing.md).
---
## Contributions
Contributions are very welcome. Interested contributors should consult our [contribution
-guidelines](https://github.com/blind-contours/SuperNOVA/blob/master/CONTRIBUTING.md)
+guidelines](https://github.com/blind-contours/InterXshift/blob/master/CONTRIBUTING.md)
prior to submitting a pull request.
---
## Citation
-After using the `SuperNOVA` R package, please cite the following:
+After using the `InterXshift` R package, please cite the following:
---
diff --git a/README.md b/README.md index 76f9244..7530991 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,18 @@
-# R/`SuperNOVA`
+# R/`InterXshift`
-[![R-CMD-check](https://github.com/blind-contours/SuperNOVA/workflows/R-CMD-check/badge.svg)](https://github.com/blind-contours/SuperNOVA/actions)
+[![R-CMD-check](https://github.com/blind-contours/InterXShift/workflows/R-CMD-check/badge.svg)](https://github.com/blind-contours/InterXShift/actions)
[![Coverage
-Status](https://img.shields.io/codecov/c/github/blind-contours/SuperNOVA/master.svg)](https://codecov.io/github/blind-contours/SuperNOVA?branch=master)
-[![CRAN](https://www.r-pkg.org/badges/version/SupernOVA)](https://www.r-pkg.org/pkg/SuperNOVA)
+Status](https://img.shields.io/codecov/c/github/blind-contours/InterXShift/master.svg)](https://codecov.io/github/blind-contours/InterXShift?branch=master)
+[![CRAN](https://www.r-pkg.org/badges/version/InterXShift)](https://www.r-pkg.org/pkg/InterXShift)
[![CRAN
-downloads](https://cranlogs.r-pkg.org/badges/SuperNOVA)](https://CRAN.R-project.org/package=SuperNOVA)
+downloads](https://cranlogs.r-pkg.org/badges/InterXShift)](https://CRAN.R-project.org/package=InterXShift)
[![CRAN total
-downloads](http://cranlogs.r-pkg.org/badges/grand-total/SuperNOVA)](https://CRAN.R-project.org/package=SuperNOVA)
+downloads](http://cranlogs.r-pkg.org/badges/grand-total/InterXShift)](https://CRAN.R-project.org/package=InterXShift)
[![Project Status: Active – The project has reached a stable, usable
state and is being actively
developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)
@@ -22,95 +22,62 @@
license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://open
-> Efficient Estimation of the Causal Effects of Non-Parametric
-> Interactions, Effect Modifications and Mediation using Stochastic
-> Interventions **Authors:** [David McCoy](https://davidmccoy.org)
+> Interaction Identification and Estimation using Data-Adaptive
+> Stochastic Interventions **Authors:** [David
+> McCoy](https://davidmccoy.org)
------------------------------------------------------------------------
-## What’s `SuperNOVA`?
-
-The `SuperNOVA` R package offers a comprehensive toolset for identifying
-predictive variable sets, be it subsets of exposures,
-exposure-covariates, or exposure-mediators, for a specified outcome. It
-further assists in creating efficient estimators for the counterfactual
-mean of the outcome under stochastic interventions on these variable
-sets. This means making exposure changes that depend on naturally
-observed values, as described in past literature (Dı́az and van der Laan
-2012; Haneuse and Rotnitzky 2013).
-
-`SuperNOVA` introduces several estimators, constructed based on the
-patterns found in the data. At present, semi-parametric estimators are
-available for various contexts: interaction, effect modification,
-marginal impacts, and mediation. The target parameters that this package
-calculates are grounded in previously published works: one regarding
-interaction and effect modification (McCoy et al. 2023) and another
-dedicated to mediation \[McCoy2023mediation\].
-
-The `SuperNOVA` package builds upon the capabilities of the `txshift`
-package, which implements the TML estimator for a stochastic shift
-causal parameter (Dı́az and van der Laan 2012).
A notable extension in
-SuperNOVA is its support for joint stochastic interventions on two
-exposures. This allows for the creation of a non-parametric interaction
-parameter, shedding light on the combined effect of concurrently
-shifting two variables relative to the cumulative effect of individual
-shifts. At this stage, the focus is on two-way shifts.
-
-Various parameters are calculated based on identified patterns:
-
-- Interaction Parameter: When two exposures show signs of interaction,
-  this parameter is calculated.
-
-- Individual Stochastic Interventions: When marginal effects are
-  identified, this estimates the difference in outcomes upon making a
-  specific shift compared to no intervention.
-
-- Effect Modification: In scenarios where effect modification is
-  evident, the target parameter for it is the mean outcome under
-  intervention within specific regions of the covariate space that the
-  data suggests. This contrasts with the marginal effect by diving
-  deeper into strata of covariates, for example, gauging the effects of
-  exposure shifts among distinct genders.
-
-- Mediation: In scenarios where mediation is found, the target parameter
-  are the natural direct and indirect effects as defined by stochastic
-  intervention. That is, for the natural indirect effect, `SuperNOVA`
-  estimates the impact of shifting an exposure through a mediator and
-  the direct effect which is not through the mediator.
-
-Mediation in `SuperNOVA` is a new feature. Here, `SuperNOVA` brings to
-the table estimates initially conceived in another work (Díaz and Hejazi
-2020), while also supporting mediation estimates for continuous
-exposures.
+## What’s `InterXshift`?
+
+The `InterXshift` R package offers an approach that identifies and
+estimates the impact of interactions in a mixed exposure on an outcome.
+We define interaction as the counterfactual mean of the outcome under
+stochastic interventions on two exposures jointly compared to the
+additive counterfactual mean of the two exposures intervened on
+independently. These interventions or exposure changes depend on
+naturally observed values, as described in past literature (Dı́az and
+van der Laan 2012; Haneuse and Rotnitzky 2013).
+
+`InterXshift` builds on work described in (McCoy et al. 2023). However,
+instead of identifying interactions through a semi-parametric
+definition of an F-statistic and then estimating the interaction target
+parameter using CV-TMLE pooled across exposure sets, we provide a more
+streamlined approach. In this package, we first identify interactions
+through g-computation, evaluating the expected outcome under a joint
+shift compared to the sum of the individual shifts using a Super
+Learner. We then rank these estimates to find the exposure sets with
+the strongest synergy and antagonism, and use CV-TMLE to estimate and
+pool the target parameter within each rank.

The package ensures robustness by employing a k-fold cross-validation
framework. This framework helps in estimating a data-adaptive parameter,
-which is the stochastic shift target parameters for the variable sets
-identified as influential for the outcome. The process begins by
+namely the stochastic shift target parameters for the exposure sets
+identified as having synergy or antagonism. The process begins by
partitioning the data into parameter-generating and estimation samples.
-The former sample assists in fitting a collection of basis function
-estimators to the data. The one with the lowest cross-validated mean
-squared error is selected.
Key variable sets are then extracted using an
-ANOVA-like variance decomposition methodology. For the estimation
-sample, targeted learning is harnessed to gauge causal target parameters
-across different contexts: interaction, mediation, effect modification,
-and individual variable shifts.
-
-By using SuperNOVA, users get access to a tool that offers both k-fold
-specific and aggregated results for each target parameter, ensuring that
-researchers can glean the most information from their data. For a more
-in-depth exploration, there’s an accompanying vignette.
+In the parameter-generating sample, we identify our ranks of
+antagonistic and synergistic exposure sets through a machine learning
+g-computation framework. In the estimation sample, we then estimate our
+interaction target parameter using the doubly robust estimator TMLE,
+whose asymptotic efficiency allows us to construct valid confidence
+intervals for our estimates (which the g-computation ranking step does
+not provide).
+
+By using InterXshift, users get access to a tool that offers both k-fold
+specific and aggregated results for the top synergistic and antagonistic
+relationships, ensuring that researchers can glean the most information
+from their data. For a more in-depth exploration, there’s an
+accompanying vignette.

To utilize the package, users need to provide vectors for exposures,
-covariates, mediators, and outcomes. They also specify the respective
-$\delta$ for each exposure (indicating the degree of shift) and if this
-delta should be adaptive in response to positivity violations. A
-detailed guide is provided in the vignette. With these inputs,
-`SuperNOVA` processes the data and delivers tables showcasing
-fold-specific results and aggregated outcomes, allowing users to glean
-insights effectively.
+covariates, and outcomes. They also specify the respective $\delta$ for
+each exposure (indicating the degree of shift) and whether this delta
+should be adaptive in response to positivity violations. The `top_n`
+parameter defines the number of top-ranked synergistic, antagonistic,
+positive, and negative impacts to estimate. A detailed guide is provided
+in the vignette. With these inputs, `InterXshift` processes the data and
+delivers tables showcasing fold-specific results and aggregated
+outcomes, allowing users to glean insights effectively.

+`InterXshift` also incorporates features from the `sl3` package (Coyle,
Hejazi, Malenica, et al. 2022), facilitating ensemble machine learning
in the estimation process. If the user does not specify any stack
parameters, `InterXshift` will automatically create an ensemble of machine
@@ -121,12 +88,12 @@ computational efficiency.

## Installation

-*Note:* Because the `SuperNOVA` package (currently) depends on `sl3`
+*Note:* Because the `InterXshift` package (currently) depends on `sl3`
that allows ensemble machine learning to be used for nuisance parameter
-estimation and `sl3` is not on CRAN the `SuperNOVA` package is not
+estimation and `sl3` is not on CRAN the `InterXshift` package is not
available on CRAN and must be downloaded here.

-There are many depedencies for `SuperNOVA` so it’s easier to break up
+There are many dependencies for `InterXshift` so it’s easier to break up
installation of the various packages to ensure proper installation.
First install the basis estimators used in the data-adaptive variable
@@ -137,7 +104,7 @@
install.packages("earth")
install.packages("hal9001")
```
-`SuperNOVA` uses the `sl3` package to build ensemble machine learners
+`InterXshift` uses the `sl3` package to build ensemble machine learners
for each nuisance parameter. We have to install off the development
branch, first download these two packages for `sl3`
@@ -151,13 +118,13 @@ remotes::install_github("tlverse/sl3@devel")
```
-Make sure `sl3` installs correctly then install `SuperNOVA`
+Make sure `sl3` installs correctly, then install `InterXshift`
``` r
-remotes::install_github("blind-contours/SuperNOVA@main")
+remotes::install_github("blind-contours/InterXshift@main")
```
-`SuperNOVA` has some other miscellaneous dependencies that are used in
+`InterXshift` has some other miscellaneous dependencies that are used in
the examples as well as in the plotting functions.
``` r
install.packages(c("kableExtra", "hrbrthemes", "viridis"))
```
@@ -168,11 +135,11 @@ install.packages(c("kableExtra", "hrbrthemes", "viridis"))

## Example

-To illustrate how `SuperNOVA` may be used to ascertain the effect of a
+To illustrate how `InterXshift` may be used to ascertain the effect of a
mixed exposure, consider the following example:

``` r
-library(SuperNOVA)
+library(InterXshift)
library(devtools)
#> Loading required package: usethis
library(kableExtra)
@@ -180,256 +147,296 @@ library(sl3)
set.seed(429153)
# simulate simple data
-n_obs <- 100000
+n_obs <- 10000
```

-The `simulate_data` is a function for generating synthetic data with a
-complex structure to study the causal effects of shifting values in the
-mixtures of exposures. The primary purpose of this simulation is to
-provide a controlled environment for testing and validating estimates
-given by the `SuperNOVA` package.
-
-The simulate_data function generates synthetic data for n_obs
-observations with a pre-specified covariance structure (sigma_mod) and a
-shift parameter (delta). It simulates four mixture components (M1, M2,
-M3, M4) and three covariates (W1, W2, W3) with specific relationships
-between them. The outcome variable Y is generated as a function of these
-mixtures and covariates.
-
-After generating the data, the function applies a shift (delta) to each
-mixture component separately and calculates the average treatment effect
-for each component. Additionally, it calculates the interaction effect
-of shifting two mixture components simultaneously (m14_intxn). These
-ground truth effects can be used for validating and comparing the
-performance of various causal inference methods on this synthetic
-dataset.
-
-The function returns a list containing the generated data and the ground
-truth effects of the shifts applied to each mixture component, the
-interaction effect, and the modified effect results based on a specific
-level in the W3 covariate.
+We will use synthetic data from NIEHS that was developed to test new
+mixture methods. This data has built-in strong positive and negative
+marginal effects and certain interactions.
Found here:
+<https://github.com/niehs-prime/2015-NIEHS-MIxtures-Workshop>

``` r
-sim_out <- simulate_data(n_obs = n_obs)
-data <- sim_out$data
-head(data) %>%
-  kbl(caption = "Simulated Data") %>%
+data("NIEHS_data_1", package = "InterXshift")
```

``` r
+NIEHS_data_1$W <- rnorm(nrow(NIEHS_data_1), mean = 0, sd = 0.1)
+w <- NIEHS_data_1[, c("W", "Z")]
+a <- NIEHS_data_1[, c("X1", "X2", "X3", "X4", "X5", "X6", "X7")]
+y <- NIEHS_data_1$Y
+
+deltas <- list(
+  "X1" = 1, "X2" = 1, "X3" = 1,
+  "X4" = 1, "X5" = 1, "X6" = 1, "X7" = 1
+)
+head(NIEHS_data_1) %>%
+  kbl(caption = "NIEHS Data") %>%
  kable_classic(full_width = F, html_font = "Cambria")
```
-Simulated Data
-| M1 | M2 | M3 | M4 | W1 | W2 | W3 | Y |
-|---|---|---|---|---|---|---|---|
-| 23.01099 | 3.864569 | 4.461534 | 4.747169 | 7.873068 | 7.362137 | 1 | 72.76739 |
-| 23.70848 | 4.169044 | 4.990670 | 5.568766 | 6.677126 | 7.302253 | 1 | 87.81536 |
-| 22.44274 | 2.922274 | 4.885933 | 3.347086 | 7.598614 | 7.647076 | 0 | 42.33824 |
-| 22.99681 | 2.507461 | 4.227508 | 1.719390 | 7.321127 | 6.907626 | 1 | 38.72763 |
-| 22.88541 | 3.349974 | 4.369185 | 6.618387 | 6.203951 | 6.547683 | 1 | 90.86418 |
-| 22.88781 | 3.890133 | 5.370329 | 5.736624 | 7.677073 | 8.250861 | 0 | 68.59041 |
+NIEHS Data
+| obs | Y | X1 | X2 | X3 | X4 | X5 | X6 | X7 | Z | W |
+|---|---|---|---|---|---|---|---|---|---|---|
+| 1 | 7.534686 | 0.4157066 | 0.5308077 | 0.2223965 | 1.1592634 | 2.4577556 | 0.9438601 | 1.8714406 | 0 | 0.1335790 |
+| 2 | 19.611934 | 0.5293572 | 0.9339570 | 1.1210595 | 1.3350074 | 0.3096883 | 0.5190970 | 0.2418065 | 0 | 0.0585291 |
+| 3 | 12.664050 | 0.4849759 | 0.7210988 | 0.4629027 | 1.0334138 | 0.9492810 | 0.3664090 | 0.3502445 | 0 | 0.1342057 |
+| 4 | 15.600288 | 0.8275456 | 1.0457137 | 0.9699040 | 0.9045099 | 0.9107914 | 0.4299847 | 1.0007901 | 0 | 0.0734320 |
+| 5 | 18.606498 | 0.5190363 | 0.7802400 | 0.6142188 | 0.3729743 | 0.5038126 | 0.3575472 | 0.5906156 | 0 | -0.0148427 |
+| 6 | 18.525890 | 0.4009491 | 0.8639886 | 0.5501847 | 0.9011016 | 1.2907615 | 0.7990418 | 1.5097039 | 0 | 0.1749775 |
-And therefore, in `SuperNOVA` we would expect most of the fold CIs to -cover this number and the pooled estimate to also cover this true -effect. Let’s run `SuperNOVA` to see if it correctly identifies the -exposures that drive the outcome and any interaction/effect modification -that exists in the DGP. - -Of note, there are three exposures M1, M2, M3 - M1 and M3 have -individual effects and interactions that drive the outcome. There is -also effect modification between M3 and W1. +Based on the data key, we expect X1 to have the strongest positive +effect, X5 the strongest negative. So we would expect these to take the +top ranks for these marginal associations. For interactions ``` r -data_sample <- data[sample(nrow(data), 4000), ] - -w <- data_sample[, c("W1", "W2", "W3")] -a <- data_sample[, c("M1", "M2", "M3", "M4")] -y <- data_sample$Y - -deltas <- list("M1" = 1, "M2" = 1, "M3" = 1, "M4" = 1) ptm <- proc.time() -sim_results <- SuperNOVA( +sim_results <- InterXshift( w = w, a = a, y = y, @@ -437,128 +444,237 @@ sim_results <- SuperNOVA( n_folds = 3, num_cores = 6, outcome_type = "continuous", - quantile_thresh = 0, - seed = 294580 + seed = 294580, + top_n = 2 ) #> -#> Iter: 1 fn: 5616.2821 Pars: 0.83768 0.16232 -#> Iter: 2 fn: 5615.9460 Pars: 0.99996649 0.00003351 -#> Iter: 3 fn: 5615.9460 Pars: 0.99997849 0.00002151 -#> solnp--> Completed in 3 iterations +#> Iter: 1 fn: 207.8441 Pars: 0.24716 0.75284 +#> Iter: 2 fn: 207.8441 Pars: 0.24716 0.75284 +#> solnp--> Completed in 2 iterations #> -#> Iter: 1 fn: 3815.7486 Pars: 0.99998993 0.00001007 -#> Iter: 2 fn: 3815.7486 Pars: 0.999998834 0.000001166 +#> Iter: 1 fn: 377.5727 Pars: 0.15499 0.84501 +#> Iter: 2 fn: 377.5727 Pars: 0.15499 0.84501 #> solnp--> Completed in 2 iterations #> -#> Iter: 1 fn: 3737.1413 Pars: 0.999996084 0.000003915 -#> Iter: 2 fn: 3737.1412 Pars: 0.9999991151 0.0000008849 +#> Iter: 1 fn: 330.3357 Pars: 0.0000008108 0.9999991895 +#> Iter: 2 fn: 330.3357 Pars: 0.0000003246 0.9999996754 #> solnp--> Completed in 2 iterations #> -#> Iter: 1 fn: 3731.8788 Pars: 0.88038 0.11962 -#> Iter: 2 fn: 3731.8383 Pars: 0.99995226 0.00004774 -#> Iter: 3 fn: 3731.8383 Pars: 0.99997338 0.00002662 -#> solnp--> Completed in 3 iterations +#> Iter: 1 fn: 401.8556 Pars: 0.86134 0.13866 +#> Iter: 2 fn: 401.8556 Pars: 0.86138 0.13862 +#> solnp--> Completed in 2 iterations #> -#> Iter: 1 fn: 5615.8427 Pars: 0.37837 0.62163 -#> Iter: 2 fn: 5615.7080 Pars: 0.61125 0.38875 -#> Iter: 3 fn: 5615.7080 Pars: 0.61254 0.38746 -#> solnp--> Completed in 3 iterations +#> Iter: 1 fn: 332.3736 Pars: 0.0000004115 0.9999995885 +#> Iter: 2 fn: 332.3736 Pars: 0.0000002689 0.9999997311 +#> solnp--> Completed in 2 iterations #> -#> Iter: 1 fn: 5621.0789 Pars: 0.84929 0.15071 -#> Iter: 2 fn: 5620.8684 Pars: 0.00007712 0.99992288 -#> Iter: 3 fn: 5620.8684 Pars: 0.000002771 0.999997229 -#> solnp--> Completed in 3 iterations +#> Iter: 1 fn: 397.6163 Pars: 0.99999899 0.00000101 +#> Iter: 2 fn: 397.6163 Pars: 0.9999993431 0.0000006569 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 401.6642 Pars: 0.34162 0.65838 +#> Iter: 2 fn: 401.6642 Pars: 0.34162 0.65838 +#> solnp--> Completed in 2 iterations #> -#> Iter: 1 fn: 5612.7158 Pars: 0.82116 0.17884 -#> Iter: 2 fn: 5612.5096 Pars: 0.9999838 0.0000162 -#> Iter: 3 fn: 5612.5096 Pars: 0.99998959 0.00001041 +#> Iter: 1 fn: 394.7229 Pars: 0.999996612 0.000003388 +#> Iter: 2 fn: 394.7229 Pars: 0.9999992222 0.0000007778 +#> Iter: 3 fn: 394.7229 Pars: 0.9999996471 0.0000003529 #> solnp--> Completed in 3 iterations #> 
-#> Iter: 1 fn: 3842.7726 Pars: 0.999856 0.000144 -#> Iter: 2 fn: 3842.7726 Pars: 0.99995859 0.00004141 +#> Iter: 1 fn: 356.1339 Pars: 0.000005701 0.999994299 +#> Iter: 2 fn: 356.1339 Pars: 0.000002949 0.999997051 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 266.7341 Pars: 0.05148 0.94852 +#> Iter: 2 fn: 266.7341 Pars: 0.05148 0.94852 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 333.6510 Pars: 0.00000144 0.99999856 +#> Iter: 2 fn: 333.6510 Pars: 0.0000008037 0.9999991963 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 400.0847 Pars: 0.999997174 0.000002826 +#> Iter: 2 fn: 400.0847 Pars: 0.999998661 0.000001339 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 404.9955 Pars: 0.000001765 0.999998235 +#> Iter: 2 fn: 404.9955 Pars: 0.0000005755 0.9999994245 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 398.2493 Pars: 0.91734 0.08266 +#> Iter: 2 fn: 398.2493 Pars: 0.91734 0.08266 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 357.3580 Pars: 0.73183 0.26817 +#> Iter: 2 fn: 357.3580 Pars: 0.73184 0.26816 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 273.0408 Pars: 0.05909 0.94091 +#> Iter: 2 fn: 273.0408 Pars: 0.05909 0.94091 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 183.0370 Pars: 0.01430 0.98570 +#> Iter: 2 fn: 183.0370 Pars: 0.01430 0.98570 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 368.3078 Pars: 0.28787 0.71213 +#> Iter: 2 fn: 368.3078 Pars: 0.28787 0.71213 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 339.4673 Pars: 0.31384 0.68616 +#> Iter: 2 fn: 339.4673 Pars: 0.31385 0.68615 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 383.2973 Pars: 0.09198 0.90802 +#> Iter: 2 fn: 383.2973 Pars: 0.09198 0.90802 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 340.5352 Pars: 0.72757 0.27243 +#> Iter: 2 fn: 340.5352 Pars: 0.72757 0.27243 #> solnp--> Completed in 2 iterations #> -#> Iter: 1 fn: 3811.0844 Pars: 0.67596 0.32404 -#> Iter: 2 fn: 3811.0844 Pars: 0.67601 0.32399 +#> Iter: 1 fn: 391.8501 Pars: 0.20653 0.79347 +#> Iter: 2 fn: 391.8501 Pars: 0.20653 0.79347 #> solnp--> Completed in 2 iterations #> -#> Iter: 1 fn: 3841.7125 Pars: 0.997439 0.002561 -#> Iter: 2 fn: 3841.7125 Pars: 0.9993128 0.0006872 +#> Iter: 1 fn: 393.0514 Pars: 0.41725 0.58275 +#> Iter: 2 fn: 393.0514 Pars: 0.41724 0.58276 #> solnp--> Completed in 2 iterations #> -#> Iter: 1 fn: 5612.7076 Pars: 0.43156 0.56844 -#> Iter: 2 fn: 5612.3977 Pars: 0.999891 0.000109 -#> Iter: 3 fn: 5612.3977 Pars: 0.99992987 0.00007013 +#> Iter: 1 fn: 385.7589 Pars: 0.23439 0.76561 +#> Iter: 2 fn: 385.7589 Pars: 0.23439 0.76561 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 359.8025 Pars: 0.64588 0.35412 +#> Iter: 2 fn: 359.8025 Pars: 0.64589 0.35411 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 298.1049 Pars: 0.13593 0.86407 +#> Iter: 2 fn: 298.1049 Pars: 0.13593 0.86407 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 343.2012 Pars: 0.69630 0.30370 +#> Iter: 2 fn: 343.2012 Pars: 0.69631 0.30369 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 387.6236 Pars: 0.26685 0.73315 +#> Iter: 2 fn: 387.6236 Pars: 0.26685 0.73315 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 382.9423 Pars: 0.38852 0.61148 +#> Iter: 2 fn: 382.9423 Pars: 0.38852 0.61148 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 386.7048 Pars: 0.26779 0.73221 +#> Iter: 2 fn: 386.7048 Pars: 0.26780 0.73220 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 358.5122 
Pars: 0.81840 0.18160 +#> Iter: 2 fn: 358.5122 Pars: 0.81843 0.18157 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 299.7883 Pars: 0.18540 0.81460 +#> Iter: 2 fn: 299.7883 Pars: 0.18540 0.81460 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 175.6340 Pars: 0.0000000298 0.9999999697 +#> Iter: 2 fn: 175.6340 Pars: 0.000000002366 0.999999997634 +#> Iter: 3 fn: 175.6340 Pars: 0.000000001002 0.999999998998 #> solnp--> Completed in 3 iterations #> -#> Iter: 1 fn: 5621.1314 Pars: 0.51060 0.48940 -#> Iter: 2 fn: 5621.1314 Pars: 0.51078 0.48922 +#> Iter: 1 fn: 390.2291 Pars: 0.10353 0.89647 +#> Iter: 2 fn: 390.2291 Pars: 0.10353 0.89647 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 345.9065 Pars: 0.0000001012 0.9999998989 +#> Iter: 2 fn: 345.9065 Pars: 0.00000006067 0.99999993933 #> solnp--> Completed in 2 iterations #> -#> Iter: 1 fn: 5600.7646 Pars: 0.99997834 0.00002166 -#> Iter: 2 fn: 5600.7645 Pars: 0.999995592 0.000004408 +#> Iter: 1 fn: 400.4320 Pars: 0.04147 0.95853 +#> Iter: 2 fn: 400.4320 Pars: 0.04141 0.95859 #> solnp--> Completed in 2 iterations #> -#> Iter: 1 fn: 3831.8447 Pars: 0.45887 0.54113 -#> Iter: 2 fn: 3831.8447 Pars: 0.45886 0.54114 +#> Iter: 1 fn: 350.7453 Pars: 0.07386 0.92614 +#> Iter: 2 fn: 350.7453 Pars: 0.07386 0.92614 #> solnp--> Completed in 2 iterations #> -#> Iter: 1 fn: 3799.4818 Pars: 0.02808 0.97192 -#> Iter: 2 fn: 3797.3950 Pars: 0.9998439 0.0001561 -#> Iter: 3 fn: 3797.3948 Pars: 0.99998699 0.00001301 -#> Iter: 4 fn: 3797.3948 Pars: 0.999993236 0.000006764 +#> Iter: 1 fn: 399.6946 Pars: 0.72024 0.27976 +#> Iter: 2 fn: 399.6854 Pars: 0.994207 0.005793 +#> Iter: 3 fn: 399.6853 Pars: 0.9997299 0.0002701 +#> Iter: 4 fn: 399.6853 Pars: 0.999892 0.000108 #> solnp--> Completed in 4 iterations #> -#> Iter: 1 fn: 3830.8913 Pars: 0.84699 0.15301 -#> Iter: 2 fn: 3830.8719 Pars: 0.75597 0.24403 -#> Iter: 3 fn: 3830.8719 Pars: 0.75597 0.24403 +#> Iter: 1 fn: 401.3928 Pars: 0.87583 0.12417 +#> Iter: 2 fn: 401.3928 Pars: 0.87590 0.12410 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 402.8578 Pars: 0.000699 0.999301 +#> Iter: 2 fn: 398.8164 Pars: 0.999998775 0.000001225 +#> Iter: 3 fn: 398.8164 Pars: 0.9999997522 0.0000002478 #> solnp--> Completed in 3 iterations #> -#> Iter: 1 fn: 5600.6732 Pars: 0.13089 0.86911 -#> Iter: 2 fn: 5600.6602 Pars: 0.007388 0.992612 -#> Iter: 3 fn: 5600.6602 Pars: 0.007384 0.992616 +#> Iter: 1 fn: 351.1701 Pars: 0.33886 0.66114 +#> Iter: 2 fn: 351.1701 Pars: 0.33886 0.66114 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 263.4308 Pars: 0.10274 0.89726 +#> Iter: 2 fn: 263.4308 Pars: 0.10275 0.89725 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 349.1731 Pars: 0.09060 0.90940 +#> Iter: 2 fn: 349.1731 Pars: 0.09060 0.90940 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 398.4919 Pars: 0.12241 0.87759 +#> Iter: 2 fn: 398.4902 Pars: 0.21590 0.78410 +#> Iter: 3 fn: 398.4902 Pars: 0.21590 0.78410 #> solnp--> Completed in 3 iterations #> -#> Iter: 1 fn: 5600.7959 Pars: 0.9998956 0.0001044 -#> Iter: 2 fn: 5600.7958 Pars: 0.99998238 0.00001762 +#> Iter: 1 fn: 406.4360 Pars: 0.76833 0.23167 +#> Iter: 2 fn: 406.4360 Pars: 0.76833 0.23167 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 403.4659 Pars: 0.90911 0.09089 +#> Iter: 2 fn: 403.4659 Pars: 0.90972 0.09028 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 fn: 349.7103 Pars: 0.000002336 0.999997663 +#> Iter: 2 fn: 349.7103 Pars: 0.0000002178 0.9999997822 +#> solnp--> Completed in 2 iterations +#> +#> Iter: 1 
fn: 261.9160 Pars: 0.000002122 0.999997879 +#> Iter: 2 fn: 261.9160 Pars: 0.0000004885 0.9999995115 #> solnp--> Completed in 2 iterations proc.time() - ptm -#> user system elapsed -#> 51.213 4.756 824.470 +#> user system elapsed +#> 76.950 4.150 1362.112 -basis_in_folds <- sim_results$`Basis Fold Proportions` -indiv_shift_results <- sim_results$`Indiv Shift Results` -em_results <- sim_results$`Effect Mod Results` -joint_shift_results <- sim_results$`Joint Shift Results` -``` +## marginal effects +top_positive_effects <- sim_results$`Pos Shift Results` +top_negative_effects <- sim_results$`Neg Shift Results` -Let’s first look at the variable relationships used in the folds: +## interaction effects +pooled_synergy_effects <- sim_results$`Pooled Synergy Results` +pooled_antagonism_effects <- sim_results$`Pooled Antagonism Results` -``` r -basis_in_folds -#> -#> M1 M1M4 M3M4 M3W3 M4 W3 -#> 1.00 0.33 0.67 1.00 1.00 1.00 +k_fold_synergy_effects <- sim_results$`K Fold Synergy Results` +k_fold_antagonism_effects <- sim_results$`K Fold Antagonism Results` ``` -The above list shows that marginal effects for exposures M1 and M4 were -found, an interaction for M1 and M4, and effect modification for M3 and -W3 - all of which are correct as the outcome is generated from these -relationships. There is no effect for M2 which we correctly reject. - -Let’s first look at the results for individual stochastic shifts by -delta compared to no shift: - ``` r -indiv_shift_results$M1 %>% - kbl(caption = "Individual Stochastic Intervention Results for M1") %>% +top_positive_effects %>% + kbl(caption = "Rank 1 Positive Stochastic Intervention Results") %>% kable_classic(full_width = F, html_font = "Cambria") ``` - +
-Individual Stochastic Intervention Results for M1
+Rank 1 Positive Stochastic Intervention Results

@@ -602,25 +718,25 @@ Delta

| Condition | Psi | Variance | SE | Lower CI | Upper CI | P-value | Fold | Type | Variables | N | Delta |
|---|---|---|---|---|---|---|---|---|---|---|---|
-| M1 | 2.664497 | 0.7389329 | 0.8596120 | 0.9797 | 4.3493 | 0.0019375 | 1 | Indiv Shift | M1 | 1334 | 1 |
+| X1 | 13.272347 | 0.4610176 | 0.6789827 | 11.9416 | 14.6031 | 0.0000000 | 1 | Indiv Shift | X1 | 167 | 1 |
-| M1 | 1.752571 | 0.5247758 | 0.7244141 | 0.3327 | 3.1724 | 0.0155506 | 2 | Indiv Shift | M1 | 1333 | 1 |
+| X1 | -2.687271 | 1.6059094 | 1.2672448 | -5.1710 | -0.2035 | 0.0339587 | 2 | Indiv Shift | X1 | 167 | 1 |
-| M1 | 2.214215 | 0.5609105 | 0.7489396 | 0.7463 | 3.6821 | 0.0031119 | 3 | Indiv Shift | M1 | 1333 | 1 |
+| X1 | 15.980403 | 0.6346225 | 0.7966320 | 14.4190 | 17.5418 | 0.0000000 | 3 | Indiv Shift | X1 | 166 | 1 |
-| M1 | 1.496780 | 0.1667218 | 0.4083158 | 0.6965 | 2.2971 | 0.0002466 | Pooled TMLE | Indiv Shift | M1 | 4000 | 1 |
+| Rank 1 | 14.366549 | 0.2866767 | 0.5354220 | 13.3171 | 15.4160 | 0.0000000 | Pooled TMLE | Indiv Shift | Rank 1 | 500 | 1 |
-The true effect for a shifted M1 vs observed M1 is:
-
-``` r
-sim_out$m1_effect
-#> [1] 1.603984
-```
-
-And so we see that we have proper coverage.
-
-Next we can look at effect modifications:
-
-``` r
-em_results$M3W3 %>%
-  kbl(caption = "Effect Modification Stochastic Intervention Results for M3 and W3") %>%
-  kable_classic(full_width = F, html_font = "Cambria")
-```
-Effect Modification Stochastic Intervention Results for M3 and W3

@@ -791,13 +888,13 @@

-| Condition | Psi | Variance | SE | Lower_CI | Upper_CI | P_value | Fold | Type | Variables | N | Delta |
+| Condition | Psi | Variance | SE | Lower CI | Upper CI | P-value | Fold | Type | Variables | N | Delta |
|---|---|---|---|---|---|---|---|---|---|---|---|
-| Level 1 Shift Diff in W3 <= 0 | -3.335721 | 3.3306066 | 1.8249950 | -6.9126 | 0.2412 | 0.0675799 | 1 | Effect Mod | M3W3 | 1334 | 1 |
+| X7 | 3.418538 | 0.7348417 | 0.8572291 | 1.7384 | 5.0987 | 0.0000667 | 1 | Indiv Shift | X7 | 167 | 1 |
-| Level 0 Shift Diff in W3 <= 0 | 4.242965 | 8.7593547 | 2.9596207 | -1.5578 | 10.0437 | 0.1516814 | 1 | Effect Mod | M3W3 | 1334 | 1 |
+| X7 | 2.716254 | 0.7413940 | 0.8610424 | 1.0286 | 4.4039 | 0.0016071 | 2 | Indiv Shift | X7 | 167 | 1 |
-| Level 1 Shift Diff in W3 <= 0 | 3.485335 | 4.2329425 | 2.0574116 | -0.5471 | 7.5178 | 0.0902579 | 2 | Effect Mod | M3W3 | 1333 | 1 |
+| X7 | 2.879122 | 0.5118995 | 0.7154715 | 1.4768 | 4.2814 | 0.0000572 | 3 | Indiv Shift | X7 | 166 | 1 |
-| Level 0 Shift Diff in W3 <= 0 | 11.881071 | 3.4080300 | 1.8460850 | 8.2628 | 15.4993 | 0.0000000 | 2 | Effect Mod | M3W3 | 1333 | 1 |
+| Rank 2 | 3.138611 | 0.2468434 | 0.4968334 | 2.1648 | 4.1124 | 0.0000000 | Pooled TMLE | Indiv Shift | Rank 2 | 500 | 1 |
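+To make the estimand behind these tables concrete, below is a minimal
+plug-in sketch of the marginal shift parameter, E[Y | A + delta, W] -
+E[Y], on a toy dataset. It is only an illustration: it uses `lm()` as a
+stand-in for the package's Super Learner fit, omits the TMLE debiasing
+step entirely, and all names and the data-generating process in it are
+hypothetical.
+
+``` r
+# toy data: one exposure x1 with true slope 3 and a confounder w
+set.seed(325911)
+n <- 500
+w <- rnorm(n)
+x1 <- rnorm(n, mean = w)
+y <- 3 * x1 + w + rnorm(n)
+
+fit <- lm(y ~ x1 + w) # stand-in for the Super Learner fit
+
+delta <- 1
+shifted <- data.frame(x1 = x1 + delta, w = w)
+
+# plug-in estimate of the shift effect: mean predicted outcome under
+# a + delta minus the mean observed outcome; near 3 (= slope * delta)
+mean(predict(fit, newdata = shifted)) - mean(y)
+```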
+Above we show the findings for the top-ranked positive marginal
+effects. Here we consistently find X1 as the rank 1 positive effect,
+which is correct given the relationships built into the DGP.
+
+Next we look at the top negative result:
+
+``` r
+top_negative_effects$`Rank 1` %>%
+  kbl(caption = "Rank 1 Negative Stochastic Intervention Results") %>%
+  kable_classic(full_width = F, html_font = "Cambria")
+```
+Rank 1 Negative Stochastic Intervention Results
+
+| Condition | Psi | Variance | SE | Lower CI | Upper CI | P-value | Fold | Type | Variables | N | Delta |
+|---|---|---|---|---|---|---|---|---|---|---|---|
-| Level 1 Shift Diff in W3 <= 0 | 4.945589 | 41.1813216 | 6.4172675 | -7.6320 | 17.5232 | 0.4409031 | 3 | Effect Mod | M3W3 | 1333 | 1 |
+| X4 | -0.8445876 | 0.7361409 | 0.8579865 | -2.5262 | 0.8370 | 0.3249271 | 1 | Indiv Shift | X4 | 167 | 1 |
-| Level 0 Shift Diff in W3 <= 0 | 11.089110 | 6.3564564 | 2.5212014 | 6.1476 | 16.0306 | 0.0000109 | 3 | Effect Mod | M3W3 | 1333 | 1 |
+| X4 | -0.6088896 | 0.7229439 | 0.8502611 | -2.2754 | 1.0576 | 0.4739168 | 2 | Indiv Shift | X4 | 167 | 1 |
-| Level 1 Shift Diff in W3 <= 0 | 1.689869 | 0.9935718 | 0.9967807 | -0.2638 | 3.6435 | 0.0900134 | Pooled TMLE | Effect Mod | M3W3 | 4000 | 1 |
+| X4 | -0.4510666 | 0.5543036 | 0.7445157 | -1.9103 | 1.0082 | 0.5446128 | 3 | Indiv Shift | X4 | 166 | 1 |
-| Level 0 Shift Diff in W3 <= 0 | 10.114451 | 1.4270177 | 1.1945785 | 7.7731 | 12.4558 | 0.0000000 | Pooled TMLE | Effect Mod | M3W3 | 4000 | 1 |
+| Rank 1 | -0.7797371 | 0.2457405 | 0.4957222 | -1.7513 | 0.1919 | 0.1157346 | Pooled TMLE | Indiv Shift | Rank 1 | 500 | 1 |
-Let’s first look at the truth:
-
-``` r
-sim_out$effect_mod
-#> $`Level 0 Shift Diff in W3 <= 0`
-#> [1] 10.99749
-#>
-#> $`Level 1 Shift Diff in W3 <= 0`
-#> [1] 1
-```
-
-When W3 is 1 the truth effect is 11, our estimates are 11 with CI
-coverage. When W3 is 0 the truth is 1, our estimate is 1.9 with CIs that
-cover the truth as well.
+Here we consistently see X4 ranked as having the strongest negative
+impact, in line with the negative effects built into the true DGP.

-And finally results for the joint shift which is a joint shift compared
-to additive individual shifts.
+Next we look at the top synergy results. Synergy is defined here as the
+pair of exposures whose joint shift produces the largest positive
+difference in expected outcome compared to the sum of the individual
+shifts of the same variables.

 ``` r
-joint_shift_results$M1M4 %>%
-  kbl(caption = "Interactions Stochastic Intervention Results for M1 and M4") %>%
+pooled_synergy_effects$`Rank 1` %>%
+  kbl(caption = "Rank 1 Synergy Stochastic Intervention Results") %>%
   kable_classic(full_width = F, html_font = "Cambria")
 ```
-Interactions Stochastic Intervention Results for M1 and M4
+Rank 1 Synergy Stochastic Intervention Results

-| Condition | Psi | Variance | SE | Lower CI | Upper CI | P-value | Fold | Type | Variables | N | Delta M1 | Delta M4 |
+| Rank | Psi | Variance | SE | Lower CI | Upper CI | P-value | Fold | N | Delta Exposure 1 | Delta Exposure 2 | Type |
+|---|---|---|---|---|---|---|---|---|---|---|---|
-| M1 | 2.701885 | 0.7471848 | 0.8643985 | 1.0077 | 4.3961 | 0.0036597 | 1 | Interaction | M1&M4 | 1334 | 1 | 1 |
+| Rank 1 | -0.7839407 | 0.2439296 | 0.4938923 | -1.7520 | 0.1841 | 0.2646390 | Pooled TMLE | 500 | 1 | 1 | Var 1 |
+| Rank 1 | -3.5652153 | 0.2374525 | 0.4872909 | -4.5203 | -2.6101 | 0.0000003 | Pooled TMLE | 500 | 1 | 1 | Var 2 |
+| Rank 1 | -4.0194749 | 0.2484014 | 0.4983989 | -4.9963 | -3.0426 | 0.0000000 | Pooled TMLE | 500 | 1 | 1 | Joint |
+| Rank 1 | 0.3296811 | 0.2511571 | 0.5011557 | -0.6526 | 1.3119 | 0.6414291 | Pooled TMLE | 500 | 1 | 1 | Interaction |
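+Before interpreting this table, note the arithmetic it encodes: the
+Interaction row is simply the Joint row minus the sum of the Var 1 and
+Var 2 rows. A quick check, plugging in the pooled psi estimates from
+the table above:
+
+``` r
+var1  <- -0.7839407
+var2  <- -3.5652153
+joint <- -4.0194749
+
+# interaction = joint shift minus sum of the individual shifts
+joint - (var1 + var2)
+#> [1] 0.3296811
+```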
+The table above shows the pooled results for the rank 1 synergy
+exposure interaction. Of course, the exposure set deemed to have the
+strongest synergy may differ between the folds, and thus this pooling
+may be over different exposure sets. The first line shows the pooled
+estimate for a shift in the first variable, the second line for the
+second variable, the third line for the joint shift, and the fourth
+line for the difference between the joint shift and the sum of the
+first two lines, i.e. the interaction effect. Therefore, in this case,
+we could be pooling over different variables because of inconsistency
+in which exposure set is ranked 1 across the folds. Next we look at the
+k-fold specific results.
+
+``` r
+k_fold_synergy_effects$`Rank 1` %>%
+  kbl(caption = "K-fold Synergy Stochastic Intervention Results") %>%
+  kable_classic(full_width = F, html_font = "Cambria")
+```
+K-fold Synergy Stochastic Intervention Results
+
+| Rank | Psi | Variance | SE | Lower CI | Upper CI | P-value | Fold | N | Delta Exposure 1 | Delta Exposure 2 | Type |
+|---|---|---|---|---|---|---|---|---|---|---|---|
+| 1 | -0.8307433 | 0.7422102 | 0.8615162 | -2.5193 | 0.8578 | 0.3707738 | 1 | 167 | 1 | 1 | X4 |
+| 1 | -3.6842524 | 0.6854063 | 0.8278927 | -5.3069 | -2.0616 | 0.0000514 | 1 | 167 | 1 | 1 | X5 |
+| 1 | -3.8745306 | 0.6396612 | 0.7997882 | -5.4421 | -2.3070 | 0.0000147 | 1 | 167 | 1 | 1 | X4-X5 |
+| 1 | 0.6404651 | 0.8229767 | 0.9071806 | -1.1376 | 2.4185 | 0.5013085 | 1 | 167 | 1 | 1 | Interaction |
+| 1 | -0.6414818 | 0.7115233 | 0.8435184 | -2.2947 | 1.0118 | 0.4848941 | 2 | 167 | 1 | 1 | X4 |
+| 1 | -3.8106050 | 0.7023745 | 0.8380779 | -5.4532 | -2.1680 | 0.0000315 | 2 | 167 | 1 | 1 | X5 |
+| 1 | -3.9552472 | 1.0958593 | 1.0468330 | -6.0070 | -1.9035 | 0.0001107 | 2 | 167 | 1 | 1 | X4-X5 |
+| 1 | 0.4968396 | 1.0111131 | 1.0055412 | -1.4740 | 2.4677 | 0.6202693 | 2 | 167 | 1 | 1 | Interaction |
+| 1 | -0.5022413 | 0.5467407 | 0.7394192 | -1.9515 | 0.9470 | 0.5591713 | 3 | 166 | 1 | 1 | X4 |
+| 1 | -3.3334914 | 0.4549681 | 0.6745133 | -4.6555 | -2.0115 | 0.0000493 | 3 | 166 | 1 | 1 | X5 |
+| 1 | -3.8353454 | 0.4708589 | 0.6861916 | -5.1803 | -2.4904 | 0.0000037 | 3 | 166 | 1 | 1 | X4-X5 |
+| 1 | 0.0003873 | 0.5123543 | 0.7157893 | -1.4025 | 1.4033 | 0.9996347 | 3 | 166 | 1 | 1 | Interaction |
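+As a rough sanity check on the pooled row, the fold-specific
+interaction estimates above can be combined with a naive
+inverse-variance weighted average. Note this is *not* the pooled-TMLE
+update the package performs; it is only a back-of-the-envelope
+comparison using the numbers printed in the table:
+
+``` r
+psi <- c(0.6404651, 0.4968396, 0.0003873) # fold interaction estimates
+se  <- c(0.9071806, 1.0055412, 0.7157893) # fold standard errors
+
+wts <- 1 / se^2
+sum(wts * psi) / sum(wts)
+#> ~0.31, in the same ballpark as the pooled TMLE estimate of 0.33
+```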
+Here we see that the interaction between X4 and X5 was consistently
+identified as the top-ranked synergistic interaction across the folds.
+Therefore, in the pooled parameter above, Var 1 represents the pooled
+effect of shifting X4, Var 2 represents the pooled effect of shifting
+X5, Joint is X4 and X5 shifted together, and Interaction represents the
+interaction effect for these two variables.
+
+Next we’ll look at the k-fold antagonistic interactions:
+
+``` r
+k_fold_antagonism_effects$`Rank 1` %>%
+  kbl(caption = "K-fold Antagonistic Stochastic Intervention Results") %>%
+  kable_classic(full_width = F, html_font = "Cambria")
+```
+K-fold Antagonistic Stochastic Intervention Results
+
+| Rank | Psi | Variance | SE | Lower CI | Upper CI | P-value | Fold | N | Delta Exposure 1 | Delta Exposure 2 | Type |
+|---|---|---|---|---|---|---|---|---|---|---|---|
-| M4 | 10.236543 | 0.3365067 | 0.5800920 | 9.0996 | 11.3735 | 0.0000000 | 1 | Interaction | M1&M4 | 1334 | 1 | 1 |
-| M1&M4 | 11.832085 | 0.3484322 | 0.5902815 | 10.6752 | 12.9890 | 0.0000000 | 1 | Interaction | M1&M4 | 1334 | 1 | 1 |
-| Psi | -1.106343 | 0.7334625 | 0.8564242 | -2.7849 | 0.5722 | 0.2318963 | 1 | Interaction | M1&M4 | 1334 | 1 | 1 |
-| M1 | 2.701885 | 0.7471848 | 0.8643985 | 1.0077 | 4.3961 | 0.0036597 | Pooled TMLE | Interaction | M1&M4 | 1334 | 1 | 1 |
-| M4 | 10.236543 | 0.3365067 | 0.5800920 | 9.0996 | 11.3735 | 0.0000000 | Pooled TMLE | Interaction | M1&M4 | 1334 | 1 | 1 |
-| M1&M4 | 11.832085 | 0.3484322 | 0.5902815 | 10.6752 | 12.9890 | 0.0000000 | Pooled TMLE | Interaction | M1&M4 | 1334 | 1 | 1 |
-| Psi | -1.106343 | 0.7334625 | 0.8564242 | -2.7849 | 0.5722 | 0.2318963 | Pooled TMLE | Interaction | M1&M4 | 1334 | 1 | 1 |
+| 1 | -0.8417448 | 0.7276929 | 0.8530492 | -2.5137 | 0.8302 | 0.3621019 | 1 | 167 | 1 | 1 | X4 |
+| 1 | -3.7508836 | 0.6817120 | 0.8256585 | -5.3691 | -2.1326 | 0.0000366 | 1 | 167 | 1 | 1 | X5 |
+| 1 | -3.8262154 | 0.6713063 | 0.8193329 | -5.4321 | -2.2204 | 0.0000237 | 1 | 167 | 1 | 1 | X4-X5 |
+| 1 | 0.7664130 | 0.8668148 | 0.9310289 | -1.0584 | 2.5912 | 0.4270243 | 1 | 167 | 1 | 1 | Interaction |
+| 1 | -0.6171086 | 0.7073394 | 0.8410347 | -2.2655 | 1.0313 | 0.5010069 | 2 | 167 | 1 | 1 | X4 |
+| 1 | -3.7414210 | 0.7056593 | 0.8400353 | -5.3879 | -2.0950 | 0.0000446 | 2 | 167 | 1 | 1 | X5 |
+| 1 | -4.0338956 | 1.1183969 | 1.0575429 | -6.1066 | -1.9611 | 0.0000876 | 2 | 167 | 1 | 1 | X4-X5 |
+| 1 | 0.3246339 | 0.9701553 | 0.9849646 | -1.6059 | 2.2551 | 0.7435905 | 2 | 167 | 1 | 1 | Interaction |
+| 1 | -0.4952579 | 0.5544481 | 0.7446127 | -1.9547 | 0.9642 | 0.5660087 | 3 | 166 | 1 | 1 | X4 |
+| 1 | -3.3504760 | 0.4553080 | 0.6747651 | -4.6730 | -2.0280 | 0.0000453 | 3 | 166 | 1 | 1 | X5 |
+| 1 | -3.9422309 | 0.4672794 | 0.6835783 | -5.2820 | -2.6024 | 0.0000019 | 3 | 166 | 1 | 1 | X4-X5 |
+| 1 | -0.0964970 | 0.5204951 | 0.7214535 | -1.5105 | 1.3175 | 0.9095484 | 3 | 166 | 1 | 1 | Interaction |
-Let’s look at the truth again:
-
-``` r
-sim_out$m1_effect
-#> [1] 1.603984
-sim_out$m4_effect
-#> [1] 10.41265
-sim_out$m14_effect
-#> [1] 12.41664
-sim_out$m14_intxn
-#> [1] 0.4
-```
-
-So comparing the results to the above table in the pooled section we can
-see all our estimates for the marginal shifts, dual shift, and
-difference between dual and sum of marginals have CIs that cover the
-truth.
+Overall, this package provides an implementation for estimating a
+non-parametric definition of interaction. We define a positive
+interaction value as synergy, meaning the expected outcome under the
+joint shift is larger than the sum of the individual additive effects.
+Likewise, we define antagonism as a negative interaction value, the
+joint effect being lower than the additive effects.

------------------------------------------------------------------------

## Issues

If you encounter any bugs or have any specific feature requests, please
-[file an issue](https://github.com/blind-contours/SuperNOVA/issues).
+[file an issue](https://github.com/blind-contours/InterXshift/issues).
Further details on filing issues are provided in our
[contribution
-guidelines](https://github.com/blind-contours/%20SuperNOVA/main/contributing.md).
+guidelines](https://github.com/blind-contours/InterXshift/main/contributing.md).

------------------------------------------------------------------------

## Contributions

Contributions are very welcome. Interested contributors should consult
our [contribution
-guidelines](https://github.com/blind-contours/SuperNOVA/blob/master/CONTRIBUTING.md)
+guidelines](https://github.com/blind-contours/InterXshift/blob/master/CONTRIBUTING.md)
prior to submitting a pull request.

------------------------------------------------------------------------

## Citation

-After using the `SuperNOVA` R package, please cite the following:
+After using the `InterXshift` R package, please cite the following:

------------------------------------------------------------------------

@@ -1652,15 +2653,6 @@ Scalable Highly Adaptive Lasso.”
- -Díaz, Iván, and Nima S. Hejazi. 2020. “Causal -mediation analysis for stochastic interventions.” *Journal of the -Royal Statistical Society. Series B: Statistical Methodology* 82 (3): -661–83. . - -
-
Dı́az, Iván, and Nima S Hejazi. 2020. “Causal Mediation Analysis for
Stochastic Interventions.” *Journal of the Royal Statistical Society:
Series B (Statistical Methodology)* 82 (3): 661–83.

diff --git a/doc/SuperNOVA-vignette.R b/doc/SuperNOVA-vignette.R
deleted file mode 100644
index 802b667..0000000
--- a/doc/SuperNOVA-vignette.R
+++ /dev/null
@@ -1,77 +0,0 @@
(77 deleted lines of knitr-purled vignette code omitted; they duplicate the code chunks of the Rmd below)

diff --git a/doc/SuperNOVA-vignette.Rmd b/doc/SuperNOVA-vignette.Rmd
deleted file mode 100644
index 6046ee3..0000000
--- a/doc/SuperNOVA-vignette.Rmd
+++ /dev/null
@@ -1,280 +0,0 @@
----
-title: "Analysis of Variance using Super Learner with Data-Adaptive Stochastic Interventions"
-author: "David McCoy"
-date: "`r Sys.Date()`"
-output: rmarkdown::html_vignette
-bibliography: ../inst/references.bib
-vignette: >
-  %\VignetteIndexEntry{Analysis of Variance using Super Learner with Data-Adaptive Stochastic Interventions}
-  %\VignetteEngine{knitr::rmarkdown}
-  %\VignetteEncoding{UTF-8}
----
-
-```{r, include = FALSE}
-knitr::opts_chunk$set(
-  collapse = TRUE,
-  comment = "#>"
-)
-```
-
-## Motivation
-
-The motivation behind the package SuperNOVA is to address the
-limitations of traditional statistical methods in environmental
-epidemiology studies.
Analysts are often interested in understanding the joint impact of a mixed exposure, i.e. a vector of exposures, but the most important variables and variable sets remain unknown. Traditional methods make overly simplistic assumptions, such as linear and additive relationships, and the resulting statistical quantities may not be directly applicable to public policy decisions. - -To overcome these limitations, SuperNOVA uses data-adaptive machine learning methods to identify the variables and variable sets that have the most explanatory power on an outcome of interest. The package builds a discrete Super Learner, which is then analyzed using ANOVA style analysis to determine the variables that contribute most to the model fit through basis functions. The target parameter, which may be a single shift, effect modification, interaction or mediation, is then applied to the data-adaptively determined variable sets. - -In this way, SuperNOVA allows analysts to explore modified treatment policies and ask causal questions, such as "If exposure to collections of PFAS chemicals increases, what is the change in cholesterol, immune function, or cancer?" By providing more nuanced and data-driven insights, SuperNOVA aims to inform public policy decisions and improve our understanding of the impact of mixed exposures on health outcomes. - -For more detailed information, please read [@mccoy2023semiparametric @McCoy2023mediation] - here we focus on providing a general overview and enough information to understand processes and output. - -### Data-Adaptive Machine Learning - -The package SuperNOVA uses a data-adaptive approach to identify important variables and variable sets in mixed exposure studies. To avoid over-fitting and incorrect model assumptions, the package employs cross-validation procedures. In each fold, the full data is split into a training and estimation sample, the training data is used to: - -1. Identify the variable sets of interest using flexible machine learning algorithms. -2. Fit estimators for the relevant nuisance parameters and the final target parameter of interest using the identified variable sets. - -Then the estimation sample is used to: - -3. Obtain estimates of the nuisance parameters and the final target parameter of interest using the held-out validation data. -4. The process is repeated in each fold to provide an estimate specific to each validation data. To optimize the balance between bias and variance, SuperNOVA uses targeted minimum loss based estimation (TMLE). -5. Estimates across the folds are then averaged and a pooled fluctuation step is performed to estimate the efficient influence function and derive pooled variance estimates. - - -# Data and Parameter of Interest - -## Overview of our Data-Adaptive Target Parameter - -In contemporary epidemiological analyses, high-dimensional data sets have revealed the constraints of conventional statistical methods. In many cases, the interactions that exist -in multiple exposures, the baseline covariates that modify the impacts of exposures or the -mediators that may mediate the impact of exposures are all unknown a priori. Thus, we need to -identify these relationships and build efficient estimators for when interventions are made -on these variable sets in the respective relationships. 
-Hubbard and van der Laan [@Hubbard2016] championed data-adaptive target
-parameters, which aim to answer the question of how to both identify
-and estimate a target parameter in data, a task that requires the
-combination of data-splitting and efficient estimators. Because robust
-estimation here involves a number of steps, we first provide an
-overview of the method:
-
-```{r figure, echo=FALSE, out.width='100%', fig.align='center'}
-library(knitr)
-include_graphics("Biometrics_Flow_Chart.png")
-```
-
-## Notation and Framework
-
-Building upon the foundational concepts presented in [@diaz2012population].
-
-Let $O=(W, \mathbf{A}, Y)$ represent a random variable with distribution
-$P_0$, and $O_1,...,O_n$ represent $n$ i.i.d. observations of $O$. These
-are our observations.
-
-The true distribution $P_0$ of $O$ is:
-
-\[ P_0(O) = P_0(Y|\mathbf{A}, W)P_0(\mathbf{A}|W)P_0(W), \]
-
-Here, we are not considering mediation.
-
-We are interested in the counterfactual outcome if the exposure
-distribution was shifted by some amount $\delta$. For example, we may
-want to observe how childhood asthma changes if we reduce exposure to
-PM2.5 by a small amount and also if we reduce exposure to NO by a small
-amount.
-
-These counterfactual outcomes are denoted by $Y_{\mathbf{P}\delta}$.
-
-We denote a shift in the conditional density distribution of exposures
-as:
-
-\[ g_0(\mathbf{A}-\boldsymbol{\delta}(W)|W), \]
-
-Of course, we cannot change the entire population's exposure to a
-chemical by some amount and follow up to observe outcomes, then go back
-in time, adjust the exposures again, and follow up over time - this
-would be our full causal model, which we can't observe. Thus we need to
-see how close we can get to this parameter given our observed data.
-
-## Identification
-
-The statistical parameter for a stochastic shift intervention of 1 or
-many exposures is expressed as:
-
-\begin{align*}
-\Phi(P_n) &= \int_{\mathbf{A}} \int_{W} P(Y=y|\mathbf{A}=a, W=w) \\
-&\times P_\delta(g)(\mathbf{A}=a|W=w)P(W=w) \, da \, dw.
-\end{align*}
-
-The causal parameter:
-
-\[ \theta(P_0) = \mathbb{E}[Y(A-\delta)] \]
-
-Under the assumptions of Conditional Ignorability and Positivity - that
-is, no unmeasured confounding (we have adjusted for all necessary
-covariates) and we have enough exposure values across our covariates:
-
-\[ \Phi(P_n) = \theta(P_0) \]
-
-\[ \Phi(P_n) = \theta(P_0) = \int_{\mathbf{A}} \int_{W} \bar{Q}(\mathbf{A}, W)P_\delta(g)(\mathbf{A}|W)Q_W(W) \, da \, dw. \]
-
-Thus to construct our target parameter we need estimators for $\bar{Q}$
-and $\bar{g}$.
-
-## Interaction Target Parameter
-
-For $\mathbf{A} = A_1, A_2$, a joint shift in both exposures is noted
-as $E[Y_{g^*_{\boldsymbol{\delta}}}]$.
-
-\[ \mathbb{E}[Y_{g^*_{\boldsymbol{\delta}}}] - \left( \mathbb{E}[Y_{g^*_{\delta_1}}] + \mathbb{E}[Y_{g^*_{\delta_2}}] \right) \]
-
-In words, our target parameter for interaction compares the
-counterfactual outcome mean under a joint exposure shift to the sum of
-expected outcomes under individual shifts of the same exposures.
-
-## Interpretation: Analogy to Interaction Coefficient in a Linear Model
-
-In traditional epidemiological studies, interactions are often defined
-and estimated as a product term in a linear regression model. Given a
-simplified model:
-
-\[ Y = \beta_0 + \beta_1A_1 + \beta_2A_2 + \beta_{12}A_1A_2 + \epsilon \]
-
-where \( \beta_{12} \) is the interaction term of interest, we can
-understand this coefficient from a stochastic shift perspective.
-Given this structural model we can ask about average outcomes under
-unidimensional unit shifts in $A_1, A_2$ as well as unit shifts in
-both, and with some algebra we arrive at a representation of
-\( \beta_{12} \) in terms of expected outcomes under these shifts:
-
-\[ \beta_{12} = \mathbb{E}[Y_{g^*_{\boldsymbol{\delta}}}] - \left( \mathbb{E}[Y_{g^*_{\delta_1}}] + \mathbb{E}[Y_{g^*_{\delta_2}}] \right) + \mathbb{E}[Y] \]
-
-This not only links classical regression approaches with modern causal
-inference techniques for stochastic shift interventions but also
-simplifies interpretation in epidemiological studies. Therefore, the
-estimates provided by our method have a similar interpretation to that
-of an interaction $\beta$ in a linear model (up to a constant $E[Y]$),
-which epidemiologists are more accustomed to; however, our estimate is
-not contingent on assumptions of linearity.
-
-# Estimation and Interpretation
-
-In Diaz and Van der Laan's work on stochastic interventions
-[@diaz2012population], the efficient influence function (EIF) is given
-for an individual shift, and this EIF does not change for multiple
-shifts. We provide more details for the EIF in our paper
-[@mccoy2023semiparametric]. Briefly, we update our counterfactuals from
-plug-in machine learning using targeted learning such that the bias for
-our target parameter is 0; we then plug these new counterfactuals into
-the EIF, which we can then use to get variance estimates for confidence
-intervals and hypothesis testing.
-
-# Cross-Validation Results
-
-Because separate folds are used to find the variable relationships and
-to do estimation on these relationships, we offer two types of results.
-We provide fold-specific results: tables for each variable relationship
-and estimate found in each fold. This is done so users can see how
-consistent estimates are across the folds or if there are
-inconsistencies in identifying variable relationships.
-
-We also offer pooled results. In this case, estimates are averaged
-across all the folds, leveraging the full data and thus providing
-tighter confidence intervals. This is done by pooling our nuisance
-estimates across all the validation folds and doing a pooled TMLE
-update and calculating the EIF for the full data. For the sake of this
-vignette, users just need to know that both pooled and v-fold results
-are given in the SuperNOVA output.
-
-Specifically, users should be aware that it is possible to get
-inconsistent results. For example, if an interaction is only found in 1
-of 10 folds, this is an inconsistent result and reporting this as a
-robust result should be done with caution. Likewise for any other
-variable relationship that is found. Thus users should evaluate (1) how
-consistently the relationships are found across the folds and (2) how
-consistent the empirical results are for the variable relationships.
-
-# Exposure Set Identification using Ensemble Approaches
-
-In the realm of high-dimensional data, the challenge isn't solely the
-estimation but rather the identification of influential variable sets,
-especially when exposure dimensions are vast. To address this, we
-resort to an ensemble method: the discrete Super Learner
-\cite{SL_2008}. This algorithm employs a cross-validation mechanism,
-optimizing over a library of candidate algorithms to select the
-estimator with the best empirical fit.
-More specifically, the Super Learner in our context uses basis splines
-and their tensor products, permitting the model to capture intricate
-non-linear relationships as linear combinations of basis functions
-while remaining interpretable. Such formulations are represented in
-prominent packages like \texttt{earth} \cite{earth},
-\texttt{polySpline} \cite{polspline}, and \texttt{hal9001}
-\cite{hal9001}. We employ this estimator for
-$E[Y| \boldsymbol{A}, W]$ used in the data-adaptive discovery
-procedure using the parameter-generating data.
-
-# Non-Parametric Analysis of Variance (NP-ANOVA)
-
-Upon establishing the best fitting estimator, discerning the hierarchy
-of variable set importance becomes paramount. Traditional ANOVA
-techniques fall short given the inherent non-linearity and extensive
-dimensionality intrinsic to our models. In response, we introduce a
-non-parametric extension: NP-ANOVA.
-
-This method decomposes the variance of the response variable,
-attributing it to contributions from individual basis functions.
-Irrespective of the model's underpinnings (be it MARS, HAL, or others),
-the variance partitioning furnishes insights into the relative
-importance of each basis function.
-
-The F-statistic serves as our metric of choice, but instead of being
-applied to variables in a linear model, as in standard ANOVA analyses,
-we apply an ANOVA over basis functions used in the best fitting
-estimator. At its core, the F-statistic juxtaposes the variance
-elucidated by the model against the residual variance, normalized by
-their respective degrees of freedom. With the model construed as a
-blend of basis functions, we resort to the conventional sums of squares
-(RSS, TSS, ESS) to compute the F-statistic.
-
-Subsequent to this, variable sets earn their rankings via aggregated
-F-statistics. That is, each basis function, attached to a particular
-exposure or exposure set, has a particular F-statistic; we then
-aggregate these F-statistics to the exposure-set level. Employing
-quantile-based thresholding, we extract influential subsets, thus
-streamlining the set for subsequent parameter estimation, which is
-useful in very high-dimensional exposure settings but can also be
-bypassed as part of the data-adaptive parameter.
-
-For more details on the NP-ANOVA method for exposure set discovery, we
-direct readers to the supplementary material.
-
-Overall, our method for exposure set discovery includes the following:
-
- 1. Fit $$E[Y| \boldsymbol{A}, W]$$ using the aforementioned Super
-    Learner of basis function estimators.
- 2. Extract the model matrix from the best fitting estimator.
- 3. Either use all sets of exposures of length 2 used in the basis
-    functions (basis functions with two exposures indicate this basis
-    is capturing interaction) or apply our F-statistic method to
-    extract "important" basis functions with interaction.
-
-## Targeted Learning
-
-We use targeted minimum loss based estimation (TMLE) to debias our
-initial outcome estimates given a shift in exposure such that the
-resulting estimator is asymptotically unbiased and has the smallest
-variance. This procedure involves constructing a least favorable
-submodel which uses a ratio of conditional exposure densities under
-shift and no shift as a covariate to debias the initial estimates.
-More details of targeted learning for stochastic shifts are given in
-@diaz2012population.
-
-## Review
-
-Overall, because we need to both identify variable relationships and
-create efficient estimators given an intervention on these
-relationships, we need to use sample splitting. As such, we use one
-part of the data to find these relationships.
-
-## Data Adaptive Delta
-
-For each exposure, the user inputs a respective delta; for example, for
-exposures $M1, M2, M3$ the analyst puts in the `deltas` vector
-
-```{r deltas, message=FALSE, warning=FALSE}
-deltas <- c("M1" = 1, "M2" = 2.3, "M3" = 1.4)
-```
-
-which assigns a delta shift amount to each exposure. However, because
-the user doesn't know the underlying experimentation in the data, a
-delta that is too big may result in positivity violations, which leads
-to bias and high variance of the estimator. That is, the user may ask
-for a shift in exposure density that is very unlikely given the data.
-To solve this, the analyst can also choose to set
-`adaptive_delta = TRUE`, which then makes the shift amount a
-data-adaptive parameter as well. The delta will be reduced until the
-max of the ratio of densities for each exposure is less than or equal
-to `hn_trunc_thresh`.
-
-## Application:
-
-First, let's load the packages we'll use and set a seed for simulation:
-
-```{r setup, message=FALSE, warning=FALSE}
-library(data.table)
-library(dplyr)
-library(kableExtra)
-library(SuperNOVA)
-
-seed <- 325911
-```
-
-Below we show implementation of `SuperNOVA` using simulated data.
-Because estimates are given for each data-adaptively identified
-parameter for each fold, after the demonstration we also show our
-pooled estimate approach for parameters that are found across all the
-folds.
-
-## NIEHS Mixture Workshop Data
-
-The `SuperNOVA` package comes with 2 datasets from the
-2015-NIEHS-Mixtures-Workshop simulation data
-(https://github.com/niehs-prime/2015-NIEHS-MIxtures-Workshop). Let's
-load this data and run `SuperNOVA` to see if we identify (1) any
-interactions in the mixture variables or any marginally important
-mixture variables and/or (2) any effect modifying variables.
-
-```{r NIEHS example}
-data("NIEHS_data_1", package = "SuperNOVA")
-```
-
-```{r NIEH Nodes}
-NIEHS_data_1$W <- rnorm(nrow(NIEHS_data_1), mean = 0, sd = 0.1)
-w <- NIEHS_data_1[, c("W", "Z")]
-a <- NIEHS_data_1[, c("X1", "X2", "X3", "X4", "X5", "X6", "X7")]
-y <- NIEHS_data_1$Y
-```
-
-```{r run SuperNOVA NIEHS data, eval = TRUE}
-deltas <- list(
-  "X1" = 1, "X2" = 1, "X3" = 1,
-  "X4" = 1, "X5" = 1, "X6" = 1, "X7" = 1
-)
-
-ptm <- proc.time()
-
-NIEH_results <- SuperNOVA(
-  w = w,
-  a = a,
-  y = y,
-  deltas = deltas,
-  estimator = "tmle",
-  fluctuation = "standard",
-  n_folds = 2,
-  outcome_type = "continuous",
-  quantile_thresh = 0,
-  verbose = TRUE,
-  parallel = FALSE,
-  parallel_type = "sequential",
-  num_cores = 2,
-  seed = seed,
-  adaptive_delta = TRUE
-)
-
-proc.time() - ptm
-
-indiv_shift_results <- NIEH_results$`Indiv Shift Results`
-em_results <- NIEH_results$`Effect Mod Results`
-joint_shift_results <- NIEH_results$`Joint Shift Results`
-```
-
-Let's look at the results for the variable $X7$, which should have a
-positive effect given the data dictionary provided:
-
-```{r individual results}
-indiv_shift_results$X7 %>%
-  kableExtra::kbl(caption = "Individual Shift Results") %>%
-  kable_classic(full_width = F, html_font = "Cambria")
-```
We also set `adaptive_delta` = TRUE, -with an initial shift value of 1. As we can see - in both folds this exposure was found, the delta was reduced to 0.59 to ensure there are no positivity violations (incurring a shift that is unfeasible). The pooled estimate is an average across the folds and variance estimates for the pooled result are done via a pooled targeted estimation fluctuation step using nuisance parameters across the folds. Here, our interpretation of this finding is "If all individuals were exposed to a 0.59 increase in X7 the expected outcome increases by 2.37". This result is significant for both the pooled estimate and the fold specific estimates. - -Now let's look for effect modification: -```{r joint results} -em_results$X7Z %>% - kableExtra::kbl(caption = "Effect Modification Results") %>% - kableExtra::kable_classic(full_width = F, html_font = "Cambria") -``` - -This shows there are differential effects on the impact of shifting $X7$ between -strata of the baseline covariate Z. This result was found in 1 fold and so the fold specific and pooled results are the same. This is an example of a less consistent finding as it wasn't found in all the folds. The user should incorporate this information into their interpretation of findings. Findings are more consistent with higher CV fold values. - -Here we see that when Z = 0 a 0.47 shift in X7 leads to an average outcome of 17.43 and when Z = 1 the average outcome for the same shift is 33.7. - -```{r interaction results} -joint_shift_results$X2X7 %>% - kableExtra::kbl(caption = "Interaction Results") %>% - kableExtra::kable_classic(full_width = T, html_font = "Cambria") -``` - -Here we see that in one of the folds an interaction between X2 and X7 was found. The adaptive delta was set to 0.22 for X2 and 0.35 for X7. An increase of 0.22 in X2 leads to a 0.64 increase in Y and a 0.35 increase in X7 leads to a 1.23 increase in Y. An increase in both by these amounts changes Y by 1.87 compared to the additive sum 1.87 if we just add both these individual estimates together. Thus, there is no evidence of interaction given our definition in this case that is more than additive effects. - - - - - diff --git a/doc/SuperNOVA-vignette.html b/doc/SuperNOVA-vignette.html deleted file mode 100644 index a669260..0000000 --- a/doc/SuperNOVA-vignette.html +++ /dev/null @@ -1,1057 +0,0 @@ - - - - - - - - - - - - - - - - -Analysis of Variance using Super Learner with Data-Adaptive Stochastic Interventions - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

(1,057 deleted lines of knitr-rendered HTML omitted; the file duplicates the vignette Rmd content above)
diff --git a/man/calc_final_joint_shift_param.Rd b/man/calc_final_joint_shift_param.Rd
index b8ce4fc..5625a89 100644
--- a/man/calc_final_joint_shift_param.Rd
+++ b/man/calc_final_joint_shift_param.Rd
@@ -8,12 +8,15 @@ calc_final_joint_shift_param(
   joint_shift_fold_results,
   rank,
   fold_k,
-  deltas_updated
+  deltas_updated,
+  exposures
 )
 }
 \arguments{
 \item{joint_shift_fold_results}{Results of the joint shift}

+\item{rank}{ranking of the interaction found}
+
 \item{fold_k}{Fold the joint shift is identified}

 \item{deltas_updated}{The new delta, could be updated if Hn has positivity