From ae0f11aeaf2fada628ce35039baab7d9361eb9c6 Mon Sep 17 00:00:00 2001 From: vpnsctl Date: Thu, 12 Dec 2024 02:55:42 +0300 Subject: [PATCH 1/7] update cross_validation --- R/inlabru_rspde.R | 1889 ++++++++++++--------------------------------- R/util.R | 116 +-- 2 files changed, 558 insertions(+), 1447 deletions(-) diff --git a/R/inlabru_rspde.R b/R/inlabru_rspde.R index cdc5b4f1..089df926 100644 --- a/R/inlabru_rspde.R +++ b/R/inlabru_rspde.R @@ -117,24 +117,15 @@ ibm_jacobian.bru_mapper_inla_rspde <- function(mapper, input, ...) { ) } + #' @noRd -# Function to process bru's formula - -process_formula <- function(bru_result) { - form <- bru_result$bru_info$model$formula[3] - form <- as.character(form) - form <- strsplit(form, "f\\(") - form <- form[[1]] - form <- form[-1] - form_proc <- sub(",.*", "", strsplit(form, "f\\(")[1]) - if (length(form) > 1) { - for (i in 2:(length(form))) { - form_proc <- paste(form_proc, " + ", sub(",.*", "", strsplit(form, "f\\(")[i])) + +process_formula_lhoods <- function(bru_result, like_number) { + form <- bru_result$bru_info$lhoods[[like_number]]$formula[3] + form <- as.character(form) + form_proc <- paste("~", "linkfuninv(", form, ")") + return(stats::as.formula(form_proc)) } - } - form_proc <- paste("~", "linkfuninv(", form_proc, ")") - return(stats::as.formula(form_proc)) -} #' @noRd # Function to process the link function @@ -211,7 +202,9 @@ bru_rerun_with_data <- function(result, idx_data, true_CV, fit_verbose) { original_timings <- result[["bru_timings"]] lhoods_tmp <- info[["lhoods"]] - lhoods_tmp[[1]]$response_data$BRU_response[-idx_data] <- NA + for(i_like in seq_along(lhoods_tmp)){ + lhoods_tmp[[i_like]]$response_data$BRU_response[-idx_data[[i_like]]] <- NA + } result <- inlabru::iinla( model = info[["model"]], @@ -264,554 +257,13 @@ get_post_var <- function(density_df) { return(post_var) } -#' @noRd - -prepare_df_pred <- function(df_pred, result, idx_test) { - info <- result[["bru_info"]] - list_of_components <- names(info[["model"]][["effects"]]) - lhoods_tmp <- info[["lhoods"]] - - for (comp in list_of_components) { - name_input_group <- info[["model"]][["effects"]][[comp]][["group"]][["input"]][["input"]] - if (!is.null(name_input_group)) { - name_input_group <- as.character(name_input_group) - comp_group_tmp <- info[["model"]][["effects"]][[comp]][["env"]][[name_input_group]] - if (!is.null(comp_group_tmp)) { - if (!is.null(dim(comp_group_tmp))) { - comp_group_tmp <- comp_group_tmp[idx_test, , drop = FALSE] - } else { - comp_group_tmp <- comp_group_tmp[idx_test] - } - } else { - if (!is.null(dim(lhoods_tmp[[1]]$data[[name_input_group]]))) { - comp_group_tmp <- lhoods_tmp[[1]]$data[[name_input_group]][idx_test, , drop = FALSE] - } else { - comp_group_tmp <- lhoods_tmp[[1]]$data[[name_input_group]][idx_test] - } - } - df_pred[[name_input_group]] <- comp_group_tmp - } - name_input_repl <- info[["model"]][["effects"]][[comp]][["replicate"]][["input"]][["input"]] - if (!is.null(name_input_repl)) { - name_input_repl <- as.character(name_input_repl) - comp_repl_tmp <- info[["model"]][["effects"]][[comp]][["env"]][[name_input_repl]] - if (!is.null(comp_repl_tmp)) { - if (!is.null(dim(comp_repl_tmp))) { - comp_repl_tmp <- comp_repl_tmp[idx_test, , drop = FALSE] - } else { - comp_repl_tmp <- comp_repl_tmp[idx_test] - } - } else { - if (!is.null(dim(lhoods_tmp[[1]]$data[[name_input_repl]]))) { - comp_repl_tmp <- lhoods_tmp[[1]]$data[[name_input_repl]][idx_test, , drop = FALSE] - } else { - comp_repl_tmp <- lhoods_tmp[[1]]$data[[name_input_repl]][idx_test] - } - } - df_pred[[name_input_repl]] <- comp_repl_tmp - } - } - return(df_pred) -} - -# #' @noRd -# calculate_scores <- function(family, test_data, posterior_samples, hyper_samples, n_samples, parallelize_RP, n_cores_RP) { -# scores <- list() - -# if (family == "gaussian") { -# # Calculate MSE -# posterior_mean <- rowMeans(posterior_samples) -# mse <- mean((test_data - posterior_mean)^2) - -# # Calculate DSS -# precision_mean <- 1 / mean(hyper_samples[, "Precision for the Gaussian observations"]) -# posterior_variance_of_mean <- rowMeans(posterior_samples[, 1:n_samples]^2) - (rowMeans(posterior_samples[, 1:n_samples]))^2 -# post_var <- precision_mean + posterior_variance_of_mean -# y_mean <- rowMeans(posterior_samples[, (n_samples + 1):(2 * n_samples)]) -# dss <- mean((test_data - y_mean)^2 / post_var + log(post_var)) - -# # Calculate CRPS and SCRPS -# phi_sample_1 <- hyper_samples[, "Precision for the Gaussian observations"][1:n_samples] -# phi_sample_2 <- hyper_samples[, "Precision for the Gaussian observations"][(n_samples + 1):(2 * n_samples)] -# sd_sample_1 <- 1 / sqrt(phi_sample_1) -# sd_sample_2 <- 1 / sqrt(phi_sample_2) - -# if (parallelize_RP) { -# cl <- makeCluster(n_cores_RP) -# registerDoParallel(cl) -# Y1_sample <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# posterior_samples[i, 1:n_samples] + sd_sample_1[i] * rnorm(n_samples) -# } -# Y2_sample <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# posterior_samples[i, (n_samples + 1):(2 * n_samples)] + sd_sample_2[i] * rnorm(n_samples) -# } -# stopCluster(cl) -# Y1_sample <- split(Y1_sample, rep(1:length(test_data), each = n_samples)) -# Y2_sample <- split(Y2_sample, rep(1:length(test_data), each = n_samples)) - -# E1_tmp <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# mean(abs(Y1_sample[[i]] - test_data[i])) -# } -# E2_tmp <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) -# } -# } else { -# Y1_sample <- lapply(1:length(test_data), function(i) { -# posterior_samples[i, 1:n_samples] + sd_sample_1[i] * rnorm(n_samples) -# }) -# Y2_sample <- lapply(1:length(test_data), function(i) { -# posterior_samples[i, (n_samples + 1):(2 * n_samples)] + sd_sample_2[i] * rnorm(n_samples) -# }) - -# E1_tmp <- sapply(1:length(test_data), function(i) { -# mean(abs(Y1_sample[[i]] - test_data[i])) -# }) -# E2_tmp <- sapply(1:length(test_data), function(i) { -# mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) -# }) -# } - -# crps <- mean(-E1_tmp + 0.5 * E2_tmp) -# scrps <- mean(-E1_tmp / E2_tmp - 0.5 * log(E2_tmp)) - -# scores <- list(mse = mse, dss = dss, crps = crps, scrps = scrps) - -# } else if (family == "gamma") { -# # Calculate MSE -# posterior_mean <- rowMeans(posterior_samples) -# mse <- mean((test_data - posterior_mean)^2) - -# # Calculate DSS -# phi_sample_1 <- hyper_samples[, "Precision parameter for the Gamma observations"][1:n_samples] -# phi_sample_2 <- hyper_samples[, "Precision parameter for the Gamma observations"][(n_samples + 1):(2 * n_samples)] -# post_var <- rowMeans(posterior_samples[, 1:n_samples]^2) - (rowMeans(posterior_samples[, 1:n_samples]))^2 + -# rowMeans(posterior_samples[, 1:n_samples])^2 / mean(phi_sample_1) -# y_mean <- rowMeans(posterior_samples[, (n_samples + 1):(2 * n_samples)]) -# dss <- mean((test_data - y_mean)^2 / post_var + log(post_var)) - -# # Calculate CRPS and SCRPS -# if (parallelize_RP) { -# cl <- makeCluster(n_cores_RP) -# registerDoParallel(cl) -# Y1_sample <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# scale_temp <- posterior_samples[i, 1:n_samples] / phi_sample_1[i] -# rgamma(n_samples, shape = phi_sample_1[i], scale = scale_temp) -# } -# Y2_sample <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# scale_temp <- posterior_samples[i, (n_samples + 1):(2 * n_samples)] / phi_sample_2[i] -# rgamma(n_samples, shape = phi_sample_2[i], scale = scale_temp) -# } -# stopCluster(cl) -# Y1_sample <- split(Y1_sample, rep(1:length(test_data), each = n_samples)) -# Y2_sample <- split(Y2_sample, rep(1:length(test_data), each = n_samples)) - -# E1_tmp <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# mean(abs(Y1_sample[[i]] - test_data[i])) -# } -# E2_tmp <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) -# } -# } else { -# Y1_sample <- lapply(1:length(test_data), function(i) { -# scale_temp <- posterior_samples[i, 1:n_samples] / phi_sample_1[i] -# rgamma(n_samples, shape = phi_sample_1[i], scale = scale_temp) -# }) -# Y2_sample <- lapply(1:length(test_data), function(i) { -# scale_temp <- posterior_samples[i, (n_samples + 1):(2 * n_samples)] / phi_sample_2[i] -# rgamma(n_samples, shape = phi_sample_2[i], scale = scale_temp) -# }) - -# E1_tmp <- sapply(1:length(test_data), function(i) { -# mean(abs(Y1_sample[[i]] - test_data[i])) -# }) -# E2_tmp <- sapply(1:length(test_data), function(i) { -# mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) -# }) -# } - -# crps <- mean(-E1_tmp + 0.5 * E2_tmp) -# scrps <- mean(-E1_tmp / E2_tmp - 0.5 * log(E2_tmp)) - -# scores <- list(mse = mse, dss = dss, crps = crps, scrps = scrps) - -# } else if (family == "poisson") { -# # Calculate MSE -# posterior_mean <- rowMeans(posterior_samples) -# mse <- mean((test_data - posterior_mean)^2) - -# # Calculate DSS -# post_var <- rowMeans(posterior_samples[, 1:n_samples]^2) - (rowMeans(posterior_samples[, 1:n_samples]))^2 + -# rowMeans(posterior_samples[, 1:n_samples]) -# y_mean <- rowMeans(posterior_samples[, (n_samples + 1):(2 * n_samples)]) -# dss <- mean((test_data - y_mean)^2 / post_var + log(post_var)) - -# # Calculate CRPS and SCRPS -# if (parallelize_RP) { -# cl <- makeCluster(n_cores_RP) -# registerDoParallel(cl) -# Y1_sample <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# rpois(n_samples, posterior_samples[i, 1:n_samples]) -# } -# Y2_sample <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# rpois(n_samples, posterior_samples[i, (n_samples + 1):(2 * n_samples)]) -# } -# stopCluster(cl) -# Y1_sample <- split(Y1_sample, rep(1:length(test_data), each = n_samples)) -# Y2_sample <- split(Y2_sample, rep(1:length(test_data), each = n_samples)) - -# E1_tmp <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# mean(abs(Y1_sample[[i]] - test_data[i])) -# } -# E2_tmp <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) -# } -# } else { -# Y1_sample <- lapply(1:length(test_data), function(i) { -# rpois(n_samples, posterior_samples[i, 1:n_samples]) -# }) -# Y2_sample <- lapply(1:length(test_data), function(i) { -# rpois(n_samples, posterior_samples[i, (n_samples + 1):(2 * n_samples)]) -# }) - -# E1_tmp <- sapply(1:length(test_data), function(i) { -# mean(abs(Y1_sample[[i]] - test_data[i])) -# }) -# E2_tmp <- sapply(1:length(test_data), function(i) { -# mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) -# }) -# } - -# crps <- mean(-E1_tmp + 0.5 * E2_tmp) -# scrps <- mean(-E1_tmp / E2_tmp - 0.5 * log(E2_tmp)) - -# scores <- list(mse = mse, dss = dss, crps = crps, scrps = scrps) - -# } else if (family %in% c("stochvol", "stochvolln", "stochvolnig", "stochvolt")) { -# # Initialize variables based on family -# if (family == "stochvol") { -# # Extract phi parameters -# if ("Offset precision for stochvol" %in% colnames(hyper_samples)) { -# phi_sample_1 <- hyper_samples[, "Offset precision for stochvol"][1:n_samples] -# phi_sample_2 <- hyper_samples[, "Offset precision for stochvol"][(n_samples + 1):(2 * n_samples)] -# } else { -# phi_sample_1 <- NA -# phi_sample_2 <- NA -# } - -# # Calculate MSE -# posterior_mean <- rowMeans(posterior_samples) -# mse <- mean((test_data - posterior_mean)^2) - -# # Calculate DSS -# y_mean <- rowMeans(posterior_samples[, (n_samples + 1):(2 * n_samples)]) -# post_var <- rowMeans(posterior_samples[, 1:n_samples]^2) - (rowMeans(posterior_samples[, 1:n_samples]))^2 + -# 1 / phi_sample_1 -# dss <- mean((test_data - y_mean)^2 / post_var + log(post_var)) - -# # Calculate CRPS and SCRPS -# if (parallelize_RP) { -# cl <- makeCluster(n_cores_RP) -# registerDoParallel(cl) -# Y1_sample <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# if (is.infinite(phi_sample_1[i])) { -# sqrt(posterior_samples[i, 1:n_samples]) * rnorm(n_samples) -# } else { -# sqrt(posterior_samples[i, 1:n_samples] + 1 / phi_sample_1[i]) * rnorm(n_samples) -# } -# } -# Y2_sample <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# if (is.infinite(phi_sample_2[i])) { -# sqrt(posterior_samples[i, (n_samples + 1):(2 * n_samples)]) * rnorm(n_samples) -# } else { -# sqrt(posterior_samples[i, (n_samples + 1):(2 * n_samples)] + 1 / phi_sample_2[i]) * rnorm(n_samples) -# } -# } -# stopCluster(cl) -# Y1_sample <- split(Y1_sample, rep(1:length(test_data), each = n_samples)) -# Y2_sample <- split(Y2_sample, rep(1:length(test_data), each = n_samples)) - -# E1_tmp <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# mean(abs(Y1_sample[[i]] - test_data[i])) -# } -# E2_tmp <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) -# } -# } else { -# Y1_sample <- lapply(1:length(test_data), function(i) { -# if (is.infinite(phi_sample_1[i])) { -# sqrt(posterior_samples[i, 1:n_samples]) * rnorm(n_samples) -# } else { -# sqrt(posterior_samples[i, 1:n_samples] + 1 / phi_sample_1[i]) * rnorm(n_samples) -# } -# }) -# Y2_sample <- lapply(1:length(test_data), function(i) { -# if (is.infinite(phi_sample_2[i])) { -# sqrt(posterior_samples[i, (n_samples + 1):(2 * n_samples)]) * rnorm(n_samples) -# } else { -# sqrt(posterior_samples[i, (n_samples + 1):(2 * n_samples)] + 1 / phi_sample_2[i]) * rnorm(n_samples) -# } -# }) - -# E1_tmp <- sapply(1:length(test_data), function(i) { -# mean(abs(Y1_sample[[i]] - test_data[i])) -# }) -# E2_tmp <- sapply(1:length(test_data), function(i) { -# mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) -# }) -# } - -# crps <- mean(-E1_tmp + 0.5 * E2_tmp) -# scrps <- mean(-E1_tmp / E2_tmp - 0.5 * log(E2_tmp)) - -# scores <- list(mse = mse, dss = dss, crps = crps, scrps = scrps) - -# } else if (family == "stochvolln") { -# # Extract relevant parameters -# if ("Offset precision for stochvolln" %in% colnames(hyper_samples)) { -# phi_sample_1 <- hyper_samples[, "Offset precision for stochvolln"][1:n_samples] -# phi_sample_2 <- hyper_samples[, "Offset precision for stochvolln"][(n_samples + 1):(2 * n_samples)] -# } else { -# phi_sample_1 <- NA -# phi_sample_2 <- NA -# } - -# if ("Mean offset for stochvolln" %in% colnames(hyper_samples)) { -# mu_sample_1 <- hyper_samples[, "Mean offset for stochvolln"][1:n_samples] -# mu_sample_2 <- hyper_samples[, "Mean offset for stochvolln"][(n_samples + 1):(2 * n_samples)] -# } else { -# mu_sample_1 <- NA -# mu_sample_2 <- NA -# } - -# # Calculate MSE -# posterior_mean <- rowMeans(posterior_samples) -# mse <- mean((test_data - posterior_mean)^2) - -# # Calculate DSS -# y_mean <- rowMeans(posterior_samples[, (n_samples + 1):(2 * n_samples)]) -# post_var <- rowMeans(posterior_samples[, 1:n_samples]^2) - (rowMeans(posterior_samples[, 1:n_samples]))^2 + -# rowMeans(posterior_samples[, 1:n_samples])^2 / mean(phi_sample_1) -# dss <- mean((test_data - y_mean)^2 / post_var + log(post_var)) - -# # Calculate CRPS and SCRPS -# if (parallelize_RP) { -# cl <- makeCluster(n_cores_RP) -# registerDoParallel(cl) -# Y1_sample <- foreach(i = 1:length(test_data), .combine = 'c', .packages = c("foreach", "ngme2")) %dopar% { -# if (is.infinite(phi_sample_1[i])) { -# mu_sample_1[i] + sqrt(1 / phi_sample_1[i]) * rnorm(n_samples) -# } else { -# mu_sample_1[i] + sqrt(1 / phi_sample_1[i]) * rnorm(n_samples) -# } -# } -# Y2_sample <- foreach(i = 1:length(test_data), .combine = 'c', .packages = c("foreach", "ngme2")) %dopar% { -# if (is.infinite(phi_sample_2[i])) { -# mu_sample_2[i] + sqrt(1 / phi_sample_2[i]) * rnorm(n_samples) -# } else { -# mu_sample_2[i] + sqrt(1 / phi_sample_2[i]) * rnorm(n_samples) -# } -# } -# stopCluster(cl) -# Y1_sample <- split(Y1_sample, rep(1:length(test_data), each = n_samples)) -# Y2_sample <- split(Y2_sample, rep(1:length(test_data), each = n_samples)) - -# E1_tmp <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# mean(abs(Y1_sample[[i]] - test_data[i])) -# } -# E2_tmp <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) -# } -# } else { -# Y1_sample <- lapply(1:length(test_data), function(i) { -# if (is.infinite(phi_sample_1[i])) { -# mu_sample_1[i] + sqrt(1 / phi_sample_1[i]) * rnorm(n_samples) -# } else { -# mu_sample_1[i] + sqrt(1 / phi_sample_1[i]) * rnorm(n_samples) -# } -# }) -# Y2_sample <- lapply(1:length(test_data), function(i) { -# if (is.infinite(phi_sample_2[i])) { -# mu_sample_2[i] + sqrt(1 / phi_sample_2[i]) * rnorm(n_samples) -# } else { -# mu_sample_2[i] + sqrt(1 / phi_sample_2[i]) * rnorm(n_samples) -# } -# }) - -# E1_tmp <- sapply(1:length(test_data), function(i) { -# mean(abs(Y1_sample[[i]] - test_data[i])) -# }) -# E2_tmp <- sapply(1:length(test_data), function(i) { -# mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) -# }) -# } - -# crps <- mean(-E1_tmp + 0.5 * E2_tmp) -# scrps <- mean(-E1_tmp / E2_tmp - 0.5 * log(E2_tmp)) - -# scores <- list(mse = mse, dss = dss, crps = crps, scrps = scrps) - -# } else if (family == "stochvolnig") { -# # Implement handling for stochvolnig -# # Extract relevant parameters -# if ("shape parameter for stochvol-nig" %in% colnames(hyper_samples)) { -# shape_1 <- hyper_samples[, "shape parameter for stochvol-nig"][1:n_samples] -# shape_2 <- hyper_samples[, "shape parameter for stochvol-nig"][(n_samples + 1):(2 * n_samples)] -# } else { -# shape_1 <- NA -# shape_2 <- NA -# } - -# if ("skewness parameter for stochvol-nig" %in% colnames(hyper_samples)) { -# skewness_1 <- hyper_samples[, "skewness parameter for stochvol-nig"][1:n_samples] -# skewness_2 <- hyper_samples[, "skewness parameter for stochvol-nig"][(n_samples + 1):(2 * n_samples)] -# } else { -# skewness_1 <- NA -# skewness_2 <- NA -# } - -# # Calculate MSE -# posterior_mean <- rowMeans(posterior_samples) -# mse <- mean((test_data - posterior_mean)^2) - -# # Calculate DSS -# y_mean <- rowMeans(posterior_samples[, (n_samples + 1):(2 * n_samples)]) -# post_var <- rowMeans(posterior_samples[, 1:n_samples]^2) - (rowMeans(posterior_samples[, 1:n_samples]))^2 + -# rowMeans(posterior_samples[, 1:n_samples])^2 / mean(phi_sample_1) -# dss <- mean((test_data - y_mean)^2 / post_var + log(post_var)) - -# # Calculate CRPS and SCRPS -# if (parallelize_RP) { -# cl <- makeCluster(n_cores_RP) -# registerDoParallel(cl) -# Y1_sample <- foreach(i = 1:length(test_data), .combine = 'c', .packages = c("foreach", "ngme2")) %dopar% { -# sqrt(posterior_samples[i, 1:n_samples] + 1 / phi_sample_1[i]) * ngme2::rnig(n_samples, -# delta = -skewness_1[i]/sqrt(1 + (skewness_1[i]^2 / shape_1[i]^2)), -# mu = skewness_1[i], -# nu = shape_1[i]^2, -# sigma = 1/sqrt(1 + (skewness_1[i]^2 / shape_1[i]^2))) -# } -# Y2_sample <- foreach(i = 1:length(test_data), .combine = 'c', .packages = c("foreach", "ngme2")) %dopar% { -# sqrt(posterior_samples[i, (n_samples + 1):(2 * n_samples)] + 1 / phi_sample_2[i]) * ngme2::rnig(n_samples, -# delta = -skewness_2[i]/sqrt(1 + (skewness_2[i]^2 / shape_2[i]^2)), -# mu = skewness_2[i], -# nu = shape_2[i]^2, -# sigma = 1/sqrt(1 + (skewness_2[i]^2 / shape_2[i]^2))) -# } -# stopCluster(cl) -# Y1_sample <- split(Y1_sample, rep(1:length(test_data), each = n_samples)) -# Y2_sample <- split(Y2_sample, rep(1:length(test_data), each = n_samples)) - -# E1_tmp <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# mean(abs(Y1_sample[[i]] - test_data[i])) -# } -# E2_tmp <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) -# } -# } else { -# Y1_sample <- lapply(1:length(test_data), function(i) { -# sqrt(posterior_samples[i, 1:n_samples] + 1 / phi_sample_1[i]) * ngme2::rnig(n_samples, -# delta = -skewness_1[i]/sqrt(1 + (skewness_1[i]^2 / shape_1[i]^2)), -# mu = skewness_1[i], -# nu = shape_1[i]^2, -# sigma = 1/sqrt(1 + (skewness_1[i]^2 / shape_1[i]^2))) -# }) -# Y2_sample <- lapply(1:length(test_data), function(i) { -# sqrt(posterior_samples[i, (n_samples + 1):(2 * n_samples)] + 1 / phi_sample_2[i]) * ngme2::rnig(n_samples, -# delta = -skewness_2[i]/sqrt(1 + (skewness_2[i]^2 / shape_2[i]^2)), -# mu = skewness_2[i], -# nu = shape_2[i]^2, -# sigma = 1/sqrt(1 + (skewness_2[i]^2 / shape_2[i]^2))) -# }) - -# E1_tmp <- sapply(1:length(test_data), function(i) { -# mean(abs(Y1_sample[[i]] - test_data[i])) -# }) -# E2_tmp <- sapply(1:length(test_data), function(i) { -# mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) -# }) -# } - -# crps <- mean(-E1_tmp + 0.5 * E2_tmp) -# scrps <- mean(-E1_tmp / E2_tmp - 0.5 * log(E2_tmp)) - -# scores <- list(mse = mse, dss = dss, crps = crps, scrps = scrps) - -# } else if (family == "stochvolt") { -# # Extract relevant parameters -# if ("degrees of freedom for stochvol student-t" %in% colnames(hyper_samples)) { -# degree_1 <- hyper_samples[, "degrees of freedom for stochvol student-t"][1:n_samples] -# degree_2 <- hyper_samples[, "degrees of freedom for stochvol student-t"][(n_samples + 1):(2 * n_samples)] -# } else { -# degree_1 <- NA -# degree_2 <- NA -# } - -# # Calculate MSE -# posterior_mean <- rowMeans(posterior_samples) -# mse <- mean((test_data - posterior_mean)^2) - -# # Calculate DSS -# y_mean <- rowMeans(posterior_samples[, (n_samples + 1):(2 * n_samples)]) -# post_var <- rowMeans(posterior_samples[, 1:n_samples]^2) - (rowMeans(posterior_samples[, 1:n_samples]))^2 + -# rowMeans(posterior_samples[, 1:n_samples]) -# dss <- mean((test_data - y_mean)^2 / post_var + log(post_var)) - -# # Calculate CRPS and SCRPS -# if (parallelize_RP) { -# cl <- makeCluster(n_cores_RP) -# registerDoParallel(cl) -# Y1_sample <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# sqrt(posterior_samples[i, 1:n_samples]) * rt(n_samples, degree_1[i]) -# } -# Y2_sample <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# sqrt(posterior_samples[i, (n_samples + 1):(2 * n_samples)]) * rt(n_samples, degree_2[i]) -# } -# stopCluster(cl) -# Y1_sample <- split(Y1_sample, rep(1:length(test_data), each = n_samples)) -# Y2_sample <- split(Y2_sample, rep(1:length(test_data), each = n_samples)) - -# E1_tmp <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# mean(abs(Y1_sample[[i]] - test_data[i])) -# } -# E2_tmp <- foreach(i = 1:length(test_data), .combine = 'c', .packages = "foreach") %dopar% { -# mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) -# } -# } else { -# Y1_sample <- lapply(1:length(test_data), function(i) { -# sqrt(posterior_samples[i, 1:n_samples]) * rt(n_samples, degree_1[i]) -# }) -# Y2_sample <- lapply(1:length(test_data), function(i) { -# sqrt(posterior_samples[i, (n_samples + 1):(2 * n_samples)]) * rt(n_samples, degree_2[i]) -# }) - -# E1_tmp <- sapply(1:length(test_data), function(i) { -# mean(abs(Y1_sample[[i]] - test_data[i])) -# }) -# E2_tmp <- sapply(1:length(test_data), function(i) { -# mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) -# }) -# } - -# crps <- mean(-E1_tmp + 0.5 * E2_tmp) -# scrps <- mean(-E1_tmp / E2_tmp - 0.5 * log(E2_tmp)) - -# scores <- list(mse = mse, dss = dss, crps = crps, scrps = scrps) - -# } -# } else { -# stop(paste("Family", family, "is not supported in calculate_scores function.")) -# } - -# return(scores) -# } - - - #' @name cross_validation #' @title Perform cross-validation on a list of fitted models. #' @description Obtain several scores for a list of fitted models according #' to a folding scheme. -#' @param models A fitted model obtained from calling the `bru()` function or a list of models fitted with the `bru()` function. +#' @param models A fitted model obtained from calling the `bru()` function or a list of fitted models. +#' All models in the list must have the same number of likelihoods and must be fitted to +#' identical datasets. #' @param model_names A vector containing the names of the models to appear in the returned `data.frame`. If `NULL`, the names will be of the form `Model 1`, `Model 2`, and so on. By default, it will try to obtain the name from the models list. #' @param scores A vector containing the scores to be computed. The options are "mse", "crps", "scrps" and "dss". By default, all scores are computed. #' @param cv_type The type of the folding to be carried out. The options are `k-fold` for `k`-fold cross-validation, in which case the parameter `k` should be provided, @@ -824,9 +276,11 @@ prepare_df_pred <- function(df_pred, result, idx_test) { #' @param return_scores_folds If `TRUE`, the scores for each fold will also be returned. #' @param orientation_results character vector. The options are "negative" and "positive". If "negative", the smaller the scores the better. If "positive", the larger the scores the better. #' @param include_best Should a row indicating which model was the best for each score be included? -#' @param train_test_indexes A list containing two entries `train`, which is a list whose elements are vectors of indexes of the training data, and `test`, which is a list whose elements are vectors of indexes of the test data. -#' Typically this will be returned list obtained by setting the argument `return_train_test` to `TRUE`. -#' @param return_train_test Logical. Should the training and test indexes be returned? If 'TRUE' the train and test indexes will the 'train_test' element of the returned list. +#' @param train_test_indexes A list where each element corresponds to a fold. Each fold contains: +#' - `train`: A list of training index vectors, one for each likelihood. +#' - `test`: A list of test index vectors, one for each likelihood, with the same length as `train`. +#' This list is typically obtained by setting the argument `return_train_test` to `TRUE`. +#' @param return_train_test Logical. Should the training and test indexes be returned? If 'TRUE' the train and test indexes will the 'train_test' element of the returned list. #' @param return_post_samples If `TRUE` the posterior samples will be included in the returned list. #' @param return_true_test_values If `TRUE` the true test values will be included in the returned list. #' @param parallelize_RP Logical. Should the computation of CRPS and SCRPS (and for some cases, DSS) be parallelized? @@ -899,6 +353,16 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps } } + # The number of likelihoods. All models must have the same number of likelihoods. + + n_likelihoods <- length(models[[1]]$bru_info$lhoods) + + for (model_number in seq_along(models)) { + if (length(models[[model_number]]$bru_info$lhoods) != n_likelihoods) { + stop(paste("Model", model_number, "does not have the same number of likelihoods as the first model.")) + } + } + if (is.null(model_names) && is.list(models)) { model_names <- names(models) } @@ -935,70 +399,80 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps doParallel::registerDoParallel(cluster_tmp) } - # Getting the data if NULL - data <- models[[1]]$bru_info$lhoods[[1]]$data - - if (is.vector(data)) { - data <- as.data.frame(data) - } - # Creating lists of train and test datasets if (is.null(train_test_indexes)) { - train_test_indexes <- create_train_test_indices(data, + data_list <- lapply(seq_along(models$bru_info$lhoods), function(i) { + models$bru_info$lhoods[[i]]$data + }) + train_test_indexes <- create_train_test_indices(data_list, cv_type = cv_type, k = k, percentage = percentage, number_folds = number_folds ) - train_list <- train_test_indexes[["train"]] - test_list <- train_test_indexes[["test"]] } else { if (!is.list(train_test_indexes)) { - stop("train_test_indexes should be a list!") - } - if (is.null(train_test_indexes[["train"]])) { - stop("train_test_indexes must contain a 'train' element.") + stop("train_test_indexes should be a list!") } - if (is.null(train_test_indexes[["test"]])) { - stop("train_test_indexes must contain a 'test' element.") - } - if (!is.list(train_test_indexes[["train"]])) { - stop("train_test_indexes$train must be a list!") - } - if (!is.list(train_test_indexes[["test"]])) { - stop("train_test_indexes$test must be a list!") + + for (i in seq_along(train_test_indexes)) { + fold_entry <- train_test_indexes[[i]] + + if (!is.list(fold_entry)) { + stop(paste("train_test_indexes[[", i, "]] should be a list!", sep = "")) + } + + if (is.null(fold_entry[["train"]])) { + stop(paste("train_test_indexes[[", i, "]] must contain a 'train' element.", sep = "")) + } + + if (is.null(fold_entry[["test"]])) { + stop(paste("train_test_indexes[[", i, "]] must contain a 'test' element.", sep = "")) + } + + if (!is.list(fold_entry[["train"]])) { + stop(paste("train_test_indexes[[", i, "]][['train']] must be a list!", sep = "")) + } + + if (!is.list(fold_entry[["test"]])) { + stop(paste("train_test_indexes[[", i, "]][['test']] must be a list!", sep = "")) + } + + if (length(fold_entry[["train"]]) != n_likelihoods) { + stop(paste("train_test_indexes[[", i, "]][['train']] must have length equal to n_likelihoods:", n_likelihoods)) + } + + if (length(fold_entry[["test"]]) != n_likelihoods) { + stop(paste("train_test_indexes[[", i, "]][['test']] must have length equal to n_likelihoods:", n_likelihoods)) + } } - train_list <- train_test_indexes[["train"]] - test_list <- train_test_indexes[["test"]] } post_samples <- list() - hyper_samples <- list() true_test_values = list() for (model_number in 1:length(models)) { - post_samples[[model_names[[model_number]]]] <- vector(mode = "list", length = length(train_list)) - hyper_samples[[model_names[[model_number]]]] <- vector(mode = "list", length = 2) - hyper_samples[[model_names[[model_number]]]][[1]] <- vector(mode = "list", length = length(train_list)) - hyper_samples[[model_names[[model_number]]]][[2]] <- vector(mode = "list", length = length(train_list)) + n_likelihoods <- length(models[[model_number]]$bru_info$lhoods) + post_samples[[model_names[[model_number]]]] <- vector(mode = "list", length = length(train_test_indexes)) if(return_true_test_values){ - true_test_values[[model_names[[model_number]]]] <- vector(mode = "list", length = length(train_list)) + true_test_values[[model_names[[model_number]]]] <- vector(mode = "list", length = length(train_test_indexes)) + } + for(j in seq_along(train_test_indexes)){ + post_samples[[model_names[[model_number]]]][[j]] <- vector(mode = "list", length = n_likelihoods) + if(return_true_test_values){ + true_test_values[[model_names[[model_number]]]][[j]] <- vector(mode = "list", length = n_likelihoods) + } } } # Perform the cross-validation result_df <- data.frame(Model = model_names) - dss <- matrix(numeric(length(train_list) * length(models)), ncol = length(models)) - mse <- matrix(numeric(length(train_list) * length(models)), ncol = length(models)) - crps <- matrix(numeric(length(train_list) * length(models)), ncol = length(models)) - scrps <- matrix(numeric(length(train_list) * length(models)), ncol = length(models)) - - # Get the formulas for the models - - formula_list <- lapply(models, function(model) { - process_formula(model) - }) + dss <- lapply(1:n_likelihoods, matrix(numeric(n_folds * n_models), ncol = n_models)) + mse <- lapply(1:n_likelihoods, matrix(numeric(n_folds * n_models), ncol = n_models)) + mae <- lapply(1:n_likelihoods, matrix(numeric(n_folds * n_models), ncol = n_models)) + crps <- lapply(1:n_likelihoods, matrix(numeric(n_folds * n_models), ncol = n_models)) + scrps <- lapply(1:n_likelihoods, matrix(numeric(n_folds * n_models), ncol = n_models)) if (("crps" %in% scores) || ("scrps" %in% scores) || ("dss" %in% scores)) { new_n_samples <- 2 * n_samples @@ -1017,261 +491,63 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps } } - test_data <- models[[model_number]]$bru_info$lhoods[[1]]$response_data$BRU_response[test_list[[fold]]] - - link_name <- models[[model_number]]$.args$control.family[[1]]$link - - if (link_name == "default") { - if (models[[model_number]]$.args$family == "gaussian") { - linkfuninv <- function(x) { - x - } - } else if (models[[model_number]]$.args$family %in% c("gamma", "poisson", "stochvol", "stochvolln", "stochvolnig", "stochvolt")) { - linkfuninv <- function(x) { - exp(x) - } - } else if(models[[model_number]]$.args$family == "binomial"){ - linkfuninv <- function(x) { - exp(x)/(1 + exp(x)) - } - } else{ - stop(paste("The family", models[[model_number]]$.args$family, "is not supported yet, please, raise an issue in https://github.com/davidbolin/rSPDE/issues requesting the support.")) - } - } else { - linkfuninv <- process_link(link_name) - } - - formula_tmp <- formula_list[[model_number]] - env_tmp <- environment(formula_tmp) - assign("linkfuninv", linkfuninv, envir = env_tmp) + train_list <- train_test_indexes[[fold]][["train"]] + test_list <- train_test_indexes[[fold]][["test"]] - if (models[[model_number]]$.args$family %in% c("stochvol", "stochvolln", "stochvolnig", "stochvolt")) { - tmp_n_samples <- new_n_samples - new_n_samples <- 2 * n_samples - } + new_model <- bru_rerun_with_data(models[[model_number]], train_list, true_CV = true_CV, fit_verbose = fit_verbose) + for(i_lik in 1:n_likelihoods){ - if (print) { - cat("Generating samples...\n") - } - - post_predict <- group_predict( - models = models[[model_number]], model_names = model_names[[model_number]], - formula = formula_tmp, train_indices = train_list[[fold]], - test_indices = test_list[[fold]], n_samples = new_n_samples, - pseudo_predict = !true_CV, return_samples = TRUE, return_hyper_samples = TRUE, - n_hyper_samples = 1, compute_posterior_means = TRUE, print = FALSE, fit_verbose = fit_verbose - ) - - if (print) { - cat("Samples generated!\n") - } - - hyper_marginals <- post_predict[["hyper_marginals"]][[model_names[[model_number]]]][[1]] - hyper_summary <- post_predict[["hyper_summary"]][[model_names[[model_number]]]][[1]] - hyper_samples_1 <- post_predict[["hyper_samples"]][[model_names[[model_number]]]][[1]][[1]] - posterior_samples <- post_predict[["post_samples"]][[model_names[[model_number]]]][[1]] - posterior_mean <- post_predict[["post_means"]][[model_names[[model_number]]]][[1]] - - if (return_post_samples) { - post_samples[[model_names[[model_number]]]][[fold]] <- posterior_samples - hyper_samples[[model_names[[model_number]]]][[1]][[fold]] <- hyper_samples_1 - } - - if(return_true_test_values){ - true_test_values[[model_names[[model_number]]]][[fold]] <- test_data - } - - - if (!(models[[model_number]]$.args$family %in% c("stochvol", "stochvolln", "stochvolnig", "stochvolt"))) { - if ("mse" %in% scores) { - mse[fold, model_number] <- mean((test_data - posterior_mean)^2) - if (orientation_results == "positive") { - mse[fold, model_number] <- -mse[fold, model_number] - } - if (print) { - cat(paste("MSE:", mse[fold, model_number], "\n")) - } - } - } + model_family <- models[[model_number]]$.args$family[[i_lik]] + post_linear_predictors <- samples_posterior_linear_predictor(new_model, i_lik, test_list, n_samples, print) + post_samples[[model_names[[model_number]]]][[fold]][[i_lik]] <- get_posterior_samples( + post_linear_predictors = post_linear_predictors, new_model = new_model, + i_lik = i_lik, new_n_samples = new_n_samples, + full_model = models[[model_number]], + true_CV = true_CV, print = print) - if (models[[model_number]]$.args$family == "gaussian") { - if ("dss" %in% scores) { - density_df <- hyper_marginals$`Precision for the Gaussian observations` - Expect_post_var <- tryCatch(get_post_var(density_df), error = function(e) NA) - if (is.na(Expect_post_var)) { - Expect_post_var <- 1 / hyper_summary["Precision for the Gaussian observations", "mean"] + test_data <- models[[model_number]]$bru_info$lhoods[[i_lik]]$response_data$BRU_response[test_list[[i_lik]]] + + if(return_true_test_values){ + true_test_values[[model_names[[model_number]]]][[fold]] <- test_data } - posterior_variance_of_mean <- rowMeans(posterior_samples[, 1:n_samples]^2) - (rowMeans(posterior_samples[, 1:n_samples]))^2 - post_var <- Expect_post_var + posterior_variance_of_mean - dss[fold, model_number] <- mean((test_data - rowMeans(posterior_samples[, (n_samples + 1):(2 * n_samples)]))^2 / post_var + log(post_var)) - if (orientation_results == "positive") { - dss[fold, model_number] <- -dss[fold, model_number] - } - if (print) { - cat(paste("DSS:", dss[fold, model_number], "\n")) + if (!(model_family %in% c("stochvol", "stochvolln", "stochvolnig", "stochvolt"))) { + if ("mse" %in% scores) { + mse[[i_lik]][fold, model_number] <- mean((test_data - posterior_mean)^2, na.rm = TRUE) + if (orientation_results == "positive") { + mse[[i_lik]][fold, model_number] <- -mse[[i_lik]][fold, model_number] + } + if (print) { + cat(paste0("MSE - Likelihood ",i_lik,": ", mse[[i_lik]][fold, model_number], "\n")) + } + } + if ("mae" %in% scores) { + mae[[i_lik]][fold, model_number] <- mean(abs(test_data - posterior_mean), na.rm = TRUE) + if (orientation_results == "positive") { + mae[[i_lik]][fold, model_number] <- -mae[[i_lik]][fold, model_number] + } + if (print) { + cat(paste0("MAE - Likelihood ",i_lik,": ", mae[[i_lik]][fold, model_number], "\n")) + } + } } - } - - if (("crps" %in% scores) || ("scrps" %in% scores)) { - phi_sample_1 <- as.vector(hyper_samples_1[, "Precision for the Gaussian observations"][1:n_samples]) - sd_sample_1 <- 1 / sqrt(phi_sample_1) - - phi_sample_2 <- as.vector(hyper_samples_1[, "Precision for the Gaussian observations"][(n_samples + 1):(2 * n_samples)]) - sd_sample_2 <- 1 / sqrt(phi_sample_2) + + if ("dss" %in% scores) { + post_var <- rowMeans(post_samples[[model_names[[model_number]]]][[fold]][[i_lik]][, 1:n_samples]^2) - (rowMeans(post_samples[[model_names[[model_number]]]][[fold]][[i_lik]][, 1:n_samples]))^2 - if (parallelize_RP) { - Y1_sample <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - posterior_samples[i, 1:n_samples] + sd_sample_1 * rnorm(n_samples) - }) - Y2_sample <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - posterior_samples[i, (n_samples + 1):(2 * n_samples)] + sd_sample_2 * rnorm(n_samples) - }) - E1_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean(abs(Y1_sample[[i]] - test_data[i])) - }) - E2_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) - }) - } else { - Y1_sample <- lapply(1:length(test_data), function(i) { - posterior_samples[i, 1:n_samples] + sd_sample_1 * rnorm(n_samples) - }) - Y2_sample <- lapply(1:length(test_data), function(i) { - posterior_samples[i, (n_samples + 1):(2 * n_samples)] + sd_sample_2 * rnorm(n_samples) - }) - E1_tmp <- lapply(1:length(test_data), function(i) { - mean(abs(Y1_sample[[i]] - test_data[i])) - }) - E2_tmp <- lapply(1:length(test_data), function(i) { - mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) - }) - } - } - } else if (models[[model_number]]$.args$family == "gamma") { - if ("dss" %in% scores) { - post_mean_tmp <- rowMeans(posterior_samples[, 1:n_samples]) - Expected_post_var <- hyper_marginals["Precision parameter for the Gamma observations", "mean"] / (post_mean_tmp^2) - posterior_variance_of_mean <- rowMeans(posterior_samples[, 1:n_samples]^2) - post_mean_tmp^2 - - post_var <- Expected_post_var + posterior_variance_of_mean - dss[fold, model_number] <- mean((test_data - (rowMeans(posterior_samples[, (n_samples + 1):(2 * n_samples)])))^2 / post_var + log(post_var)) - if (orientation_results == "positive") { - dss[fold, model_number] <- -dss[fold, model_number] - } - if (print) { - cat(paste("DSS:", dss[fold, model_number], "\n")) + dss[[i_lik]][fold, model_number] <- mean((test_data - rowMeans(post_samples[[model_names[[model_number]]]][[fold]][[i_lik]][, (n_samples + 1):(2 * n_samples)]))^2 / post_var + log(post_var)) } - } - - if (("crps" %in% scores) || ("scrps" %in% scores)) { - phi_sample_1 <- as.vector(hyper_samples_1[, "Precision parameter for the Gamma observations"][1:n_samples]) - phi_sample_2 <- as.vector(hyper_samples_1[, "Precision parameter for the Gamma observations"][(n_samples + 1):(2 * n_samples)]) - - if (parallelize_RP) { - Y1_sample <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - scale_temp <- posterior_samples[i, 1:n_samples] / phi_sample_1 - stats::rgamma(n_samples, shape = phi_sample_1, scale = scale_temp) - }) - Y2_sample <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - scale_temp <- posterior_samples[i, (n_samples + 1):(2 * n_samples)] / phi_sample_2 - stats::rgamma(n_samples, shape = phi_sample_2, scale = scale_temp) - }) - E1_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean(abs(Y1_sample[[i]] - test_data[i])) - }) - E2_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) - }) - } else { - Y1_sample <- lapply(1:length(test_data), function(i) { - scale_temp <- posterior_samples[i, 1:n_samples] / phi_sample_1 - stats::rgamma(n_samples, shape = phi_sample_1, scale = scale_temp) - }) - Y2_sample <- lapply(1:length(test_data), function(i) { - scale_temp <- posterior_samples[i, (n_samples + 1):(2 * n_samples)] / phi_sample_2 - stats::rgamma(n_samples, shape = phi_sample_2, scale = scale_temp) - }) - E1_tmp <- lapply(1:length(test_data), function(i) { - mean(abs(Y1_sample[[i]] - test_data[i])) - }) - E2_tmp <- lapply(1:length(test_data), function(i) { - mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) - }) - } - } - } else if (models[[model_number]]$.args$family == "poisson") { - if ("dss" %in% scores) { - post_mean_tmp <- rowMeans(posterior_samples[, 1:n_samples]) - posterior_variance_of_mean <- rowMeans(posterior_samples[, 1:n_samples]^2) - post_mean_tmp^2 - post_var <- post_mean_tmp + posterior_variance_of_mean - - dss[fold, model_number] <- mean((test_data - rowMeans(posterior_samples[, (n_samples + 1):(2 * n_samples)]))^2 / post_var + log(post_var)) - if (orientation_results == "positive") { - dss[fold, model_number] <- -dss[fold, model_number] - } - if (print) { - cat(paste("DSS:", dss[fold, model_number], "\n")) - } - } - if (("crps" %in% scores) || ("scrps" %in% scores)) { - if (parallelize_RP) { - Y1_sample <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - stats::rpois(n_samples, posterior_samples[i, 1:n_samples]) - }) - Y2_sample <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - stats::rpois(n_samples, posterior_samples[i, (n_samples + 1):(2 * n_samples)]) - }) - E1_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean(abs(Y1_sample[[i]] - test_data[i])) - }) - E2_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) - }) - } else { - Y1_sample <- lapply(1:length(test_data), function(i) { - stats::rpois(n_samples, posterior_samples[i, 1:n_samples]) - }) - Y2_sample <- lapply(1:length(test_data), function(i) { - stats::rpois(n_samples, posterior_samples[i, (n_samples + 1):(2 * n_samples)]) - }) - E1_tmp <- lapply(1:length(test_data), function(i) { - mean(abs(Y1_sample[[i]] - test_data[i])) - }) - E2_tmp <- lapply(1:length(test_data), function(i) { - mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) - }) - } - } - } else if (models[[model_number]]$.args$family == "binomial") { - if ("dss" %in% scores) { - post_mean_tmp <- rowMeans(posterior_samples[, 1:n_samples]) - posterior_variance_of_mean <- rowMeans(posterior_samples[, 1:n_samples]^2) - post_mean_tmp^2 - post_var <- post_mean_tmp + posterior_variance_of_mean - - dss[fold, model_number] <- mean((test_data - rowMeans(posterior_samples[, (n_samples + 1):(2 * n_samples)]))^2 / post_var + log(post_var)) - if (orientation_results == "positive") { - dss[fold, model_number] <- -dss[fold, model_number] - } - if (print) { - cat(paste("DSS:", dss[fold, model_number], "\n")) - } - } if (("crps" %in% scores) || ("scrps" %in% scores)) { + Y1_sample <- post_samples[[model_names[[model_number]]]][[fold]][[i_lik]][, 1:n_samples] + Y2_sample <- post_samples[[model_names[[model_number]]]][[fold]][[i_lik]][, (n_samples + 1):(2 * n_samples)] if (parallelize_RP) { - Y1_sample <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - stats::rbinom(n = n_samples, size = 1, prob = posterior_samples[i, 1:n_samples]) - }) - Y2_sample <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - stats::rbinom(n = n_samples, size = 1, prob = posterior_samples[i, (n_samples + 1):(2 * n_samples)]) - }) E1_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { mean(abs(Y1_sample[[i]] - test_data[i])) }) @@ -1279,328 +555,14 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) }) } else { - Y1_sample <- lapply(1:length(test_data), function(i) { - stats::rbinom(n = n_samples, size = 1, prob = posterior_samples[i, 1:n_samples]) - }) - Y2_sample <- lapply(1:length(test_data), function(i) { - stats::rbinom(n = n_samples, size = 1, prob = posterior_samples[i, (n_samples + 1):(2 * n_samples)]) - }) E1_tmp <- lapply(1:length(test_data), function(i) { mean(abs(Y1_sample[[i]] - test_data[i])) }) E2_tmp <- lapply(1:length(test_data), function(i) { mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) }) - } - } - } else if (models[[model_number]]$.args$family == "stochvol") { - new_n_samples <- tmp_n_samples - - if ("Offset precision for stochvol" %in% colnames(hyper_samples_1)) { - phi_sample_1 <- as.vector(hyper_samples_1[, "Offset precision for stochvol"][1:n_samples]) - phi_sample_2 <- as.vector(hyper_samples_1[, "Offset precision for stochvol"][(n_samples + 1):(2 * n_samples)]) - } else { - phi_sample_1 <- Inf - phi_sample_2 <- Inf - } - - if (parallelize_RP) { - Y1_sample <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - sqrt(posterior_samples[i, 1:n_samples] + 1 / phi_sample_1) * rnorm(n_samples) - }) - Y2_sample <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - sqrt(posterior_samples[i, (n_samples + 1):(2 * n_samples)] + 1 / phi_sample_2) * rnorm(n_samples) - }) - E1_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean(abs(Y1_sample[[i]] - test_data[i])) - }) - E2_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) - }) - } else { - Y1_sample <- lapply(1:length(test_data), function(i) { - sqrt(posterior_samples[i, 1:n_samples] + 1 / phi_sample_1) * rnorm(n_samples) - }) - Y2_sample <- lapply(1:length(test_data), function(i) { - sqrt(posterior_samples[i, (n_samples + 1):(2 * n_samples)] + 1 / phi_sample_2) * rnorm(n_samples) - }) - E1_tmp <- lapply(1:length(test_data), function(i) { - mean(abs(Y1_sample[[i]] - test_data[i])) - }) - E2_tmp <- lapply(1:length(test_data), function(i) { - mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) - }) - } - - if ("mse" %in% scores) { - Y_mean <- lapply(Y1_sample, mean) - Y_mean <- unlist(Y_mean) - mse[fold, model_number] <- mean((test_data - Y_mean)^2) - if (orientation_results == "positive") { - mse[fold, model_number] <- -mse[fold, model_number] - } - if (print) { - cat(paste("MSE:", mse[fold, model_number], "\n")) - } - } - - if ("dss" %in% scores) { - Y_var <- lapply(Y2_sample, var) - Y_mean <- lapply(Y1_sample, mean) - Y_var <- unlist(Y_var) - Y_mean <- unlist(Y_mean) - - post_var <- Y_var - - dss[fold, model_number] <- mean((test_data - Y_mean)^2 / post_var + log(post_var)) - if (orientation_results == "positive") { - dss[fold, model_number] <- -dss[fold, model_number] - } - if (print) { - cat(paste("DSS:", dss[fold, model_number], "\n")) - } - } - } else if (models[[model_number]]$.args$family == "stochvolln") { - new_n_samples <- tmp_n_samples - - if ("Offset precision for stochvolln" %in% colnames(hyper_samples_1)) { - phi_sample_1 <- as.vector(hyper_samples_1[, "Offset precision for stochvolln"][1:n_samples]) - phi_sample_2 <- as.vector(hyper_samples_1[, "Offset precision for stochvolln"][(n_samples + 1):(2 * n_samples)]) - } else { - phi_sample_1 <- Inf - phi_sample_2 <- Inf - } - - mu_sample_1 <- as.vector(hyper_samples_1[, "Mean offset for stochvolln"][1:n_samples]) - mu_sample_2 <- as.vector(hyper_samples_1[, "Mean offset for stochvolln"][(n_samples + 1):(2 * n_samples)]) - - var_1 <- posterior_samples[i, 1:n_samples] + 1 / phi_sample_1 - var_2 <- posterior_samples[i, (n_samples + 1):(2 * n_samples)] + 1 / phi_sample_2 - - mean_1 <- mu_sample_1 - 0.5 * var_1 - mean_2 <- mu_sample_2 - 0.5 * var_2 - - if (parallelize_RP) { - Y1_sample <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean_1 + sqrt(var_1) * rnorm(n_samples) - }) - Y2_sample <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean_2 + sqrt(var_2) * rnorm(n_samples) - }) - E1_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean(abs(Y1_sample[[i]] - test_data[i])) - }) - E2_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) - }) - } else { - Y1_sample <- lapply(1:length(test_data), function(i) { - mean_1 + sqrt(var_1) * rnorm(n_samples) - }) - Y2_sample <- lapply(1:length(test_data), function(i) { - mean_2 + sqrt(var_2) * rnorm(n_samples) - }) - E1_tmp <- lapply(1:length(test_data), function(i) { - mean(abs(Y1_sample[[i]] - test_data[i])) - }) - E2_tmp <- lapply(1:length(test_data), function(i) { - mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) - }) - } - - if ("mse" %in% scores) { - Y_mean <- lapply(Y1_sample, mean) - Y_mean <- unlist(Y_mean) - mse[fold, model_number] <- mean((test_data - Y_mean)^2) - if (orientation_results == "positive") { - mse[fold, model_number] <- -mse[fold, model_number] - } - if (print) { - cat(paste("MSE:", mse[fold, model_number], "\n")) - } - } - - if ("dss" %in% scores) { - Y_var <- lapply(Y2_sample, var) - Y_mean <- lapply(Y1_sample, mean) - Y_var <- unlist(Y_var) - Y_mean <- unlist(Y_mean) - - post_var <- Y_var - - dss[fold, model_number] <- mean((test_data - Y_mean)^2 / post_var + log(post_var)) - if (orientation_results == "positive") { - dss[fold, model_number] <- -dss[fold, model_number] - } - if (print) { - cat(paste("DSS:", dss[fold, model_number], "\n")) - } - } - } else if (models[[model_number]]$.args$family == "stochvolnig") { - new_n_samples <- tmp_n_samples - - shape_1 <- as.vector(hyper_samples_1[, "shape parameter for stochvol-nig"][1:n_samples]) - shape_2 <- as.vector(hyper_samples_1[, "shape parameter for stochvol-nig"][(n_samples + 1):(2 * n_samples)]) - - skewness_1 <- as.vector(hyper_samples_1[, "skewness parameter for stochvol-nig"][1:n_samples]) - skewness_2 <- as.vector(hyper_samples_1[, "skewness parameter for stochvol-nig"][(n_samples + 1):(2 * n_samples)]) - - gamma_1 <- sqrt(1+skewness_1^2/shape_1^2) - gamma_2 <- sqrt(1+skewness_2^2/shape_2^2) - - if (parallelize_RP) { - Y1_sample <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - sqrt(posterior_samples[i, 1:n_samples]) * GeneralizedHyperbolic::rnig(n_samples, - mu = -skewness_1 / gamma_1, # mu_new corresponds to delta_old = -skewness_1 / gamma_1 - delta = shape_1 / gamma_1, # delta_new = sqrt(nu * sigma^2) = shape_1 / gamma_1 - alpha = sqrt(skewness_1^2 * gamma_1^4 + shape_1^2 * gamma_1^2), - # alpha = sqrt(mu^2 / sigma^4 + nu / sigma^2) - # = sqrt(skewness_1^2 * gamma_1^4 + shape_1^2 * gamma_1^2) - - beta = skewness_1 * gamma_1 # beta = mu_old / sigma^2 = skewness_1 * gamma_1 - ) - }) - Y2_sample <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - sqrt(posterior_samples[i, (n_samples + 1):(2 * n_samples)]) * GeneralizedHyperbolic::rnig(n_samples, - mu = -skewness_2 / gamma_2, # mu_new corresponds to delta_old = -skewness_2 / gamma_2 - delta = shape_2 / gamma_2, # delta_new = sqrt(nu * sigma^2) = shape_2 / gamma_2 - alpha = sqrt(skewness_2^2 * gamma_2^4 + shape_2^2 * gamma_2^2), - # alpha = sqrt(mu^2 / sigma^4 + nu / sigma^2) - # = sqrt(skewness_2^2 * gamma_2^4 + shape_2^2 * gamma_2^2) - - beta = skewness_2 * gamma_2 # beta = mu_old / sigma^2 = skewness_2 * gamma_2 - ) - }) - E1_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean(abs(Y1_sample[[i]] - test_data[i])) - }) - E2_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) - }) - } else { - Y1_sample <- lapply(1:length(test_data), function(i) { - sqrt(posterior_samples[i, 1:n_samples]) * GeneralizedHyperbolic::rnig(n_samples, - mu = -skewness_1 / gamma_1, # mu_new corresponds to delta_old = -skewness_1 / gamma_1 - delta = shape_1 / gamma_1, # delta_new = sqrt(nu * sigma^2) = shape_1 / gamma_1 - alpha = sqrt(skewness_1^2 * gamma_1^4 + shape_1^2 * gamma_1^2), - # alpha = sqrt(mu^2 / sigma^4 + nu / sigma^2) - # = sqrt(skewness_1^2 * gamma_1^4 + shape_1^2 * gamma_1^2) - - beta = skewness_1 * gamma_1 # beta = mu_old / sigma^2 = skewness_1 * gamma_1 - ) - }) - Y2_sample <- lapply(1:length(test_data), function(i) { - sqrt(posterior_samples[i, (n_samples + 1):(2 * n_samples)]) * GeneralizedHyperbolic::rnig(n_samples, - mu = -skewness_2 / gamma_2, # mu_new corresponds to delta_old = -skewness_2 / gamma_2 - delta = shape_2 / gamma_2, # delta_new = sqrt(nu * sigma^2) = shape_2 / gamma_2 - alpha = sqrt(skewness_2^2 * gamma_2^4 + shape_2^2 * gamma_2^2), - # alpha = sqrt(mu^2 / sigma^4 + nu / sigma^2) - # = sqrt(skewness_2^2 * gamma_2^4 + shape_2^2 * gamma_2^2) - - beta = skewness_2 * gamma_2 # beta = mu_old / sigma^2 = skewness_2 * gamma_2 - ) - }) - E1_tmp <- lapply(1:length(test_data), function(i) { - mean(abs(Y1_sample[[i]] - test_data[i])) - }) - E2_tmp <- lapply(1:length(test_data), function(i) { - mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) - }) - } - - if ("mse" %in% scores) { - Y_mean <- lapply(Y1_sample, mean) - Y_mean <- unlist(Y_mean) - mse[fold, model_number] <- mean((test_data - Y_mean)^2) - if (orientation_results == "positive") { - mse[fold, model_number] <- -mse[fold, model_number] - } - if (print) { - cat(paste("MSE:", mse[fold, model_number], "\n")) - } - } - - if ("dss" %in% scores) { - Y_var <- lapply(Y2_sample, var) - Y_mean <- lapply(Y1_sample, mean) - Y_var <- unlist(Y_var) - Y_mean <- unlist(Y_mean) - - post_var <- Y_var - - dss[fold, model_number] <- mean((test_data - Y_mean)^2 / post_var + log(post_var)) - if (orientation_results == "positive") { - dss[fold, model_number] <- -dss[fold, model_number] - } - if (print) { - cat(paste("DSS:", dss[fold, model_number], "\n")) - } - } - } else if (models[[model_number]]$.args$family == "stochvolt") { - new_n_samples <- tmp_n_samples - - degree_1 <- as.vector(hyper_samples_1[, "degrees of freedom for stochvol student-t"][1:n_samples]) - degree_2 <- as.vector(hyper_samples_1[, "degrees of freedom for stochvol student-t"][(n_samples + 1):(2 * n_samples)]) - - if (parallelize_RP) { - Y1_sample <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - sqrt(posterior_samples[i, 1:n_samples]) * rt(n_samples, degree_1) - }) - Y2_sample <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - sqrt(posterior_samples[i, (n_samples + 1):(2 * n_samples)]) * rt(n_samples, degree_2) - }) - E1_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean(abs(Y1_sample[[i]] - test_data[i])) - }) - E2_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) - }) - } else { - Y1_sample <- lapply(1:length(test_data), function(i) { - sqrt(posterior_samples[i, 1:n_samples]) * rt(n_samples, degree_1) - }) - Y2_sample <- lapply(1:length(test_data), function(i) { - sqrt(posterior_samples[i, (n_samples + 1):(2 * n_samples)]) * rt(n_samples, degree_2) - }) - E1_tmp <- lapply(1:length(test_data), function(i) { - mean(abs(Y1_sample[[i]] - test_data[i])) - }) - E2_tmp <- lapply(1:length(test_data), function(i) { - mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) - }) - } - - if ("mse" %in% scores) { - Y_mean <- lapply(Y1_sample, mean) - Y_mean <- unlist(Y_mean) - mse[fold, model_number] <- mean((test_data - Y_mean)^2) - if (orientation_results == "positive") { - mse[fold, model_number] <- -mse[fold, model_number] - } - if (print) { - cat(paste("MSE:", mse[fold, model_number], "\n")) - } - } - - if ("dss" %in% scores) { - Y_var <- lapply(Y2_sample, var) - Y_mean <- lapply(Y1_sample, mean) - Y_var <- unlist(Y_var) - Y_mean <- unlist(Y_mean) - - post_var <- Y_var - - dss[fold, model_number] <- mean((test_data - Y_mean)^2 / post_var + log(post_var)) - if (orientation_results == "positive") { - dss[fold, model_number] <- -dss[fold, model_number] - } - if (print) { - cat(paste("DSS:", dss[fold, model_number], "\n")) - } + } } - } else { - stop(paste("The family", models[[model_number]]$.args$family, "is not supported yet, please, raise an issue in https://github.com/davidbolin/rSPDE/issues requesting the support.")) - } if ("crps" %in% scores) { crps_temp <- lapply(1:length(test_data), function(i) { @@ -1608,13 +570,13 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps }) crps_temp <- unlist(crps_temp) - crps[fold, model_number] <- mean(crps_temp) + crps[[i_lik]][fold, model_number] <- mean(crps_temp) if (orientation_results == "negative") { - crps[fold, model_number] <- -crps[fold, model_number] + crps[[i_lik]][fold, model_number] <- -crps[[i_lik]][fold, model_number] } if (print) { - cat(paste("CRPS:", crps[fold, model_number], "\n")) + cat(paste0("CRPS - Likelihood ",i_lik,": ", crps[[i_lik]][fold, model_number], "\n")) } } @@ -1623,35 +585,134 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps return(-E1_tmp[[i]] / E2_tmp[[i]] - 0.5 * log(E2_tmp[[i]])) }) scrps_temp <- unlist(scrps_temp) - scrps[fold, model_number] <- mean(scrps_temp) + scrps[[i_lik]][fold, model_number] <- mean(scrps_temp) if (orientation_results == "negative") { - scrps[fold, model_number] <- -scrps[fold, model_number] + scrps[[i_lik]][fold, model_number] <- -scrps[[i_lik]][fold, model_number] } if (print) { - cat(paste("SCRPS:", scrps[fold, model_number], "\n")) + cat(paste("SCRPS: - Likelihood ",i_lik,": ", scrps[[i_lik]][fold, model_number], "\n")) } - } + } + } + } } - } - if ("dss" %in% scores) { - dss_mean <- colMeans(dss) - result_df <- data.frame(result_df, dss = dss_mean) - } - if ("mse" %in% scores) { - mse_mean <- colMeans(mse) - result_df <- data.frame(result_df, mse = mse_mean) - } - if ("crps" %in% scores) { - crps_mean <- colMeans(crps) - result_df <- data.frame(result_df, crps = crps_mean) - } - if ("scrps" %in% scores) { - scrps_mean <- colMeans(scrps) - result_df <- data.frame(result_df, scrps = scrps_mean) + if (n_likelihoods > 1) { + if ("mse" %in% scores) { + # Individual likelihood means + for (i in 1:n_likelihoods) { + mse_mean <- colMeans(mse[[i]]) + result_df <- data.frame(result_df, mse = mse_mean) + names(result_df)[names(result_df) == "mse"] <- paste0("mse_lik", i) + } + # Total MSE (weighted average across likelihoods) + mse_weighted <- matrix(0, nrow = nrow(mse[[1]]), ncol = ncol(mse[[1]])) + total_weights <- matrix(0, nrow = nrow(mse[[1]]), ncol = ncol(mse[[1]])) + for (i in 1:n_likelihoods) { + weights <- sapply(1:nrow(mse[[i]]), function(fold) length(train_test_indexes[[fold]][["test"]][[i]])) + mse_weighted <- mse_weighted + weights * mse[[i]] + total_weights <- total_weights + weights + } + mse_total <- colMeans(mse_weighted / total_weights) + result_df <- data.frame(result_df, mse_total = mse_total) + } + + if ("mae" %in% scores) { + for (i in 1:n_likelihoods) { + mae_mean <- colMeans(mae[[i]]) + result_df <- data.frame(result_df, mae = mae_mean) + names(result_df)[names(result_df) == "mae"] <- paste0("mae_lik", i) + } + mae_weighted <- matrix(0, nrow = nrow(mae[[1]]), ncol = ncol(mae[[1]])) + total_weights <- matrix(0, nrow = nrow(mae[[1]]), ncol = ncol(mae[[1]])) + for (i in 1:n_likelihoods) { + weights <- sapply(1:nrow(mae[[i]]), function(fold) length(train_test_indexes[[fold]][["test"]][[i]])) + mae_weighted <- mae_weighted + weights * mae[[i]] + total_weights <- total_weights + weights + } + mae_total <- colMeans(mae_weighted / total_weights) + result_df <- data.frame(result_df, mae_total = mae_total) + } + + if ("dss" %in% scores) { + for (i in 1:n_likelihoods) { + dss_mean <- colMeans(dss[[i]]) + result_df <- data.frame(result_df, dss = dss_mean) + names(result_df)[names(result_df) == "dss"] <- paste0("dss_lik", i) + } + dss_weighted <- matrix(0, nrow = nrow(dss[[1]]), ncol = ncol(dss[[1]])) + total_weights <- matrix(0, nrow = nrow(dss[[1]]), ncol = ncol(dss[[1]])) + for (i in 1:n_likelihoods) { + weights <- sapply(1:nrow(dss[[i]]), function(fold) length(train_test_indexes[[fold]][["test"]][[i]])) + dss_weighted <- dss_weighted + weights * dss[[i]] + total_weights <- total_weights + weights + } + dss_total <- colMeans(dss_weighted / total_weights) + result_df <- data.frame(result_df, dss_total = dss_total) + } + + if ("crps" %in% scores) { + for (i in 1:n_likelihoods) { + crps_mean <- colMeans(crps[[i]]) + result_df <- data.frame(result_df, crps = crps_mean) + names(result_df)[names(result_df) == "crps"] <- paste0("crps_lik", i) + } + crps_weighted <- matrix(0, nrow = nrow(crps[[1]]), ncol = ncol(crps[[1]])) + total_weights <- matrix(0, nrow = nrow(crps[[1]]), ncol = ncol(crps[[1]])) + for (i in 1:n_likelihoods) { + weights <- sapply(1:nrow(crps[[i]]), function(fold) length(train_test_indexes[[fold]][["test"]][[i]])) + crps_weighted <- crps_weighted + weights * crps[[i]] + total_weights <- total_weights + weights + } + crps_total <- colMeans(crps_weighted / total_weights) + result_df <- data.frame(result_df, crps_total = crps_total) + } + + if ("scrps" %in% scores) { + for (i in 1:n_likelihoods) { + scrps_mean <- colMeans(scrps[[i]]) + result_df <- data.frame(result_df, scrps = scrps_mean) + names(result_df)[names(result_df) == "scrps"] <- paste0("scrps_lik", i) + } + scrps_weighted <- matrix(0, nrow = nrow(scrps[[1]]), ncol = ncol(scrps[[1]])) + total_weights <- matrix(0, nrow = nrow(scrps[[1]]), ncol = ncol(scrps[[1]])) + for (i in 1:n_likelihoods) { + weights <- sapply(1:nrow(scrps[[i]]), function(fold) length(train_test_indexes[[fold]][["test"]][[i]])) + scrps_weighted <- scrps_weighted + weights * scrps[[i]] + total_weights <- total_weights + weights + } + scrps_total <- colMeans(scrps_weighted / total_weights) + result_df <- data.frame(result_df, scrps_total = scrps_total) + } + } else { + # Original code for n_likelihoods == 1 + if ("mse" %in% scores) { + mse_mean <- colMeans(mse[[1]]) + result_df <- data.frame(result_df, mse = mse_mean) + } + + if ("mae" %in% scores) { + mae_mean <- colMeans(mae[[1]]) + result_df <- data.frame(result_df, mae = mae_mean) + } + + if ("dss" %in% scores) { + dss_mean <- colMeans(dss[[1]]) + result_df <- data.frame(result_df, dss = dss_mean) + } + + if ("crps" %in% scores) { + crps_mean <- colMeans(crps[[1]]) + result_df <- data.frame(result_df, crps = crps_mean) + } + + if ("scrps" %in% scores) { + scrps_mean <- colMeans(scrps[[1]]) + result_df <- data.frame(result_df, scrps = scrps_mean) + } } if (save_settings) { @@ -1667,263 +728,285 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps } } + # Best model identification section if (include_best) { - n_fit_scores <- ncol(result_df) - 1 - final_row <- c("Best") - for (j in 2:ncol(result_df)) { - if (orientation_results == "negative") { - best_tmp <- which.min(result_df[, j]) - final_row <- c(final_row, model_names[best_tmp]) - } else { - best_tmp <- which.max(result_df[, j]) - final_row <- c(final_row, model_names[best_tmp]) - } - } - result_df <- rbind(result_df, final_row) - row.names(result_df)[nrow(result_df)] <- "" + n_fit_scores <- ncol(result_df) - 1 + final_row <- c("Best") + + for (j in 2:ncol(result_df)) { + colname <- names(result_df)[j] + # Skip if it's not a metric column + if (!any(sapply(c("mse", "mae", "dss", "crps", "scrps"), function(x) startsWith(colname, x)))) { + final_row <- c(final_row, "") + next + } + + if (orientation_results == "negative") { + best_tmp <- which.min(result_df[, j]) + } else { + best_tmp <- which.max(result_df[, j]) + } + final_row <- c(final_row, model_names[best_tmp]) + } + result_df <- rbind(result_df, final_row) + row.names(result_df)[nrow(result_df)] <- "" } - - + + # Cluster cleanup if (parallelize_RP) { - parallel::stopCluster(cluster_tmp) + parallel::stopCluster(cluster_tmp) } - + + # Set return flag if (return_post_samples) { - return_scores_folds <- TRUE + return_scores_folds <- TRUE } - + + # Prepare output if (!return_scores_folds) { - if (save_settings) { - out <- list( - scores_df = result_df, - settings = settings_list - ) - if (return_train_test) { - out[["train_test"]] <- list(train = train_list, test = test_list) - } - if(return_true_test_values){ - out[["true_test_values"]] <- true_test_values - } - } else if (return_train_test) { - out <- list(scores_df = result_df, train_test = list(train = train_list, test = test_list)) - if(return_true_test_values){ - out[["true_test_values"]] <- true_test_values - } - } else if(return_true_test_values){ - out <- list(scores_df = result_df, true_test_values = true_test_values) - } else { - out <- result_df - } + if (save_settings) { + out <- list( + scores_df = result_df, + settings = settings_list + ) + if (return_train_test) { + out[["train_test"]] <- train_test_indexes + } + if(return_true_test_values){ + out[["true_test_values"]] <- true_test_values + } + } else if (return_train_test) { + out <- list(scores_df = result_df, train_test = train_test_indexes) + if(return_true_test_values){ + out[["true_test_values"]] <- true_test_values + } + } else if(return_true_test_values){ + out <- list(scores_df = result_df, true_test_values = true_test_values) + } else { + out <- result_df + } } else { - colnames(dss) <- model_names - colnames(mse) <- model_names - colnames(crps) <- model_names - colnames(scrps) <- model_names - out <- list( - scores_df = result_df, - scores_folds = list(dss = dss, mse = mse, crps = crps, scrps = scrps) - ) - if (save_settings) { - out[["settings"]] <- settings_list - } - if (return_train_test) { - out[["train_test"]] <- list(train = train_list, test = test_list) - } - - if(return_true_test_values){ - out[["true_test_values"]] <- true_test_values - } - - if (return_post_samples) { - out[["post_samples"]] <- post_samples - out[["hyper_samples"]] <- hyper_samples - } + # Add model names to all matrices in score lists + add_model_names <- function(score_list) { + lapply(score_list, function(mat) { + colnames(mat) <- model_names + return(mat) + }) + } + + scores_folds <- list() + if ("dss" %in% scores) scores_folds$dss <- add_model_names(dss) + if ("mse" %in% scores) scores_folds$mse <- add_model_names(mse) + if ("mae" %in% scores) scores_folds$mae <- add_model_names(mae) + if ("crps" %in% scores) scores_folds$crps <- add_model_names(crps) + if ("scrps" %in% scores) scores_folds$scrps <- add_model_names(scrps) + + out <- list( + scores_df = result_df, + scores_folds = scores_folds + ) + + if (save_settings) { + out[["settings"]] <- settings_list + } + if (return_train_test) { + out[["train_test"]] <- train_test_indexes + } + if(return_true_test_values){ + out[["true_test_values"]] <- true_test_values + } + if (return_post_samples) { + out[["post_samples"]] <- post_samples + out[["hyper_samples"]] <- hyper_samples + } } - - - return(out) } -#' @name group_predict -#' @title Perform prediction on a testing set based on a training set -#' @description Compute prediction of a formula-based expression on a testing set based on a training set. -#' @param models A fitted model obtained from calling the `bru()` function or a list of models fitted with the `bru()` function. -#' @param model_names A vector containing the names of the models to appear in the returned `data.frame`. If `NULL`, the names will be of the form `Model 1`, `Model 2`, and so on. By default, it will try to obtain the name from the models list. -#' @param formula A formula where the right hand side defines an R expression to evaluate for each generated sample. If `NULL``, the latent and hyperparameter states are returned as named list elements. See the manual for the `predict` method in the `inlabru` package. -#' @param train_indices A list containing the indices of the observations for the model to be trained, or a numerical vector containing the indices. -#' @param test_indices A list containing the indices of the test data, where the prediction will be done, or a numerical vector containing the indices. -#' @param n_samples Number of samples to compute the posterior statistics to be used to compute the scores. -#' @param pseudo_predict If `TRUE`, the models will NOT be refitted on the training data, and the parameters obtained on the entire dataset will be used. If `FALSE`, the models will be refitted on the training data. -#' @param return_samples Should the posterior samples be returned? -#' @param return_hyper_samples Should samples for the hyperparameters be obtained? -#' @param n_hyper_samples Number of independent samples of the hyper parameters of size `n_samples`. -#' @param compute_posterior_means Should the posterior means be computed from the posterior samples? -#' @param print Should partial results be printed throughout the computation? -#' @param fit_verbose Should INLA's run during the prediction be verbose? -#' @return A data.frame with the fitted models and the corresponding scores. -#' @export - -group_predict <- function(models, model_names = NULL, formula = NULL, - train_indices, test_indices, n_samples = 1000, - pseudo_predict = TRUE, - return_samples = FALSE, return_hyper_samples = FALSE, - n_hyper_samples = 1, - compute_posterior_means = TRUE, - print = TRUE, fit_verbose = FALSE) { - if (length(train_indices) != length(test_indices)) { - if (!is.numeric(train_indices) || !is.numeric(test_indices)) { - stop("train_indices and test_indices must be lists of the same length or must be numerical vectors containing the indices!") - } - } - - if (is.numeric(train_indices) && is.numeric(test_indices)) { - train_indices <- list(train_indices) - test_indices <- list(test_indices) - } - - if (!is.numeric(n_samples)) { - stop("n_samples must be numeric!") - } - - if (n_samples %% 1 != 0) { - warning("Non-integer n_samples given, it will be rounded to an integer number.") - n_samples <- round(n_samples) - } - - if (n_samples <= 0) { - stop("n_samples must be positive!") - } - - if (!is.list(models)) { - stop("models should either be a result from a bru() call or a list of results from bru() calls!") - } - if (inherits(models, "bru")) { - models <- list(models) - } else { - for (i in 1:length(models)) { - if (!inherits(models[[i]], "bru")) { - stop("models must be either a result from a bru call or a list of results from bru() calls!") - } - } - } - - if (is.null(model_names) && is.list(models)) { - model_names <- names(models) - } - - if (!is.null(model_names)) { - if (!is.character(model_names)) { - stop("model_names must be a vector of strings!") - } - if (length(models) != length(model_names)) { - stop("model_names must contain one name for each model!") - } - } else { - model_names <- vector(mode = "character", length(models)) - for (i in 1:length(models)) { - model_names[i] <- paste("Model", i) - } - } - - # Getting the data if NULL - data <- models[[1]]$bru_info$lhoods[[1]]$data - if (is.vector(data)) { - data <- as.data.frame(data) - } +#' @noRd - post_samples <- list() - post_means <- list() - hyper_samples <- list() - hyper_marginals <- list() - hyper_summary <- list() - test_data <- vector(mode = "list", length = length(train_indices)) +samples_posterior_linear_predictor <- function(model, i_lik, test_list, n_samples, print){ + test_data <- model$bru_info$lhoods[[i_lik]]$response_data$BRU_response[test_list[[i_lik]]] - for (model_number in 1:length(models)) { - post_samples[[model_names[[model_number]]]] <- vector(mode = "list", length = length(train_indices)) - post_means[[model_names[[model_number]]]] <- vector(mode = "list", length = length(train_indices)) - hyper_samples[[model_names[[model_number]]]] <- vector(mode = "list", length = n_hyper_samples) - hyper_marginals[[model_names[[model_number]]]] <- vector(mode = "list", length = length(train_indices)) - hyper_summary[[model_names[[model_number]]]] <- vector(mode = "list", length = length(train_indices)) - for (j in 1:n_hyper_samples) { - hyper_samples[[model_names[[model_number]]]][[j]] <- vector(mode = "list", length = length(train_indices)) - } - } + link_name <- model$.args$control.family[[i_lik]]$link + model_family <- model$.args$family[[i_lik]] - for (fold in 1:length(train_indices)) { - for (model_number in 1:length(models)) { - if (print) { - cat(paste("Fold:", fold, "/", length(train_indices), "\n")) - if (!is.null(model_names)) { - cat(paste("Model:", model_names[[model_number]], "\n")) + if (link_name == "default") { + if (model_family == "gaussian") { + linkfuninv <- function(x) { + x + } + } else if (model_family %in% c("gamma", "poisson", "stochvol", "stochvolln", "stochvolnig", "stochvolt")) { + linkfuninv <- function(x) { + exp(x) + } + } else if(model_family == "binomial"){ + linkfuninv <- function(x) { + exp(x)/(1 + exp(x)) + } + } else{ + stop(paste("The family", model_family, "is not supported yet, please, raise an issue in https://github.com/davidbolin/rSPDE/issues requesting the support.")) + } } else { - cat(paste("Model:", model_number, "\n")) + linkfuninv <- process_link(link_name) } - } - # Generate posterior samples of the mean - if (is.null(models[[model_number]]$.args)) { - stop("There was a problem with INLA's fit. Please, check your model specifications carefully and re-fit the model.") - } - - - df_train <- select_indexes(data, train_indices[[fold]]) - df_pred <- select_indexes(data, test_indices[[fold]]) + formula_tmp <- process_formula_lhoods(models[[model_number]], i_lik) + + env_tmp <- environment(formula_tmp) + assign("linkfuninv", linkfuninv, envir = env_tmp) + if (model_family %in% c("stochvol", "stochvolln", "stochvolnig", "stochvolt")) { + tmp_n_samples <- new_n_samples + new_n_samples <- 2 * n_samples + } + if (print) { + cat("Generating samples...\n") + } - df_pred <- prepare_df_pred(df_pred, models[[model_number]], test_indices[[fold]]) - new_model <- bru_rerun_with_data(models[[model_number]], train_indices[[fold]], true_CV = !pseudo_predict, fit_verbose = fit_verbose) + data <- models[[model_number]]$bru_info$lhoods[[i_lik]]$data - if (print) { - cat("Generating samples...\n") - } + df_pred <- select_indexes(data, test_list[[i_lik]]) - post_samples[[model_names[[model_number]]]][[fold]] <- inlabru::generate(new_model, newdata = df_pred, formula = formula, n.samples = n_samples) + post_samples <- inlabru::generate(new_model, newdata = df_pred, formula = formula, n.samples = n_samples) - test_data[[fold]] <- models[[model_number]]$bru_info$lhoods[[1]]$response_data$BRU_response[test_indices[[fold]]] + return(post_samples) +} - if (print) { - cat("Samples generated!\n") - } - if (nrow(post_samples[[model_names[[model_number]]]][[fold]]) == 1) { - post_samples[[model_names[[model_number]]]][[fold]] <- matrix(rep(post_samples[[model_names[[model_number]]]][[fold]], length(test_indices[[fold]])), ncol = ncol(post_samples[[model_names[[model_number]]]][[fold]]), byrow = TRUE) - } +#' @noRd - if (compute_posterior_means) { - post_means[[model_names[[model_number]]]][[fold]] <- rowMeans(post_samples[[model_names[[model_number]]]][[fold]]) - } +get_posterior_samples <- function(post_linear_predictors, new_model, i_lik, new_n_samples, full_model, true_CV, print){ + model_family <- new_model$.args$family[[i_lik]] + + if(true_CV){ + model_sample <- new_model + } else{ + model_sample <- full_model + } - hyper_marginals[[model_names[[model_number]]]][[fold]] <- new_model$marginals.hyperpar + family_mappings <- map_models_to_strings(full_model) + if(family_mappings[[i_lik]] != ".none"){ + meas_err_par <- lapply(family_mappings[[i_lik]], function(param) { + INLA::inla.rmarginal( + new_n_samples, + model_sample$marginals.hyperpar[[param]] + ) + }) + } - hyper_summary[[model_names[[model_number]]]][[fold]] <- new_model$summary.hyperpar + if (models[[model_number]]$.args$family == "gaussian") { + sd_sample <- 1 / sqrt(as.vector(meas_err_par[[1]])) + Y_sample <- lapply(1:nrow(post_linear_predictors), function(i) { + post_linear_predictors[i, ] + sd_sample * rnorm(new_n_samples) + }) + } else if(models[[model_number]]$.args$family == "gamma"){ + phi_sample <- as.vector(meas_err_par[[1]]) + Y_sample <- lapply(1:nrow(post_linear_predictors), function(i) { + scale_temp <- post_linear_predictors[i,] / phi_sample + rgamma(new_n_samples, shape = phi_sample, scale = scale_temp) + }) + } else if(models[[model_number]]$.args$family == "poisson"){ + Y_sample <- lapply(1:nrow(post_linear_predictors), function(i) { + stats::rpois(n_samples, post_linear_predictors[i,]) + }) + } else if(models[[model_number]]$.args$family == "binomial"){ + Y_sample <- lapply(1:nrow(post_linear_predictors), function(i) { + stats::rbinom(n = new_n_samples, size = 1, prob = post_linear_predictors[i, ]) + }) + } else if(models[[model_number]]$.args$family == "stochvol"){ + phi_sample <- as.vector(meas_err_par[[1]]) + Y_sample <- lapply(1:nrow(post_linear_predictors), function(i) { + sqrt(post_linear_predictors[i, ] + 1 / phi_sample) * rnorm(new_n_samples) + }) + } else if(models[[model_number]]$.args$family == "stochvolln"){ + phi_sample <- as.vector(meas_err_par[[1]]) + mu_sample <- as.vector(meas_err_par[[2]]) + Y_sample <- lapply(1:nrow(post_linear_predictors), function(i) { + var <- post_linear_predictors[i, ] + 1 / phi_sample + mean <- mu_sample - 0.5 * var + mean + sqrt(var) * rnorm(new_n_samples) + }) + } else if(models[[model_number]]$.args$family == "stochvolnig"){ + shape <- as.vector(meas_err_par[[1]]) + skewness <- as.vector(meas_err_par[[2]]) + gamma <- sqrt(1+skewness^2/shape^2) + Y_sample <- lapply(1:length(test_data), function(i) { + sqrt(post_linear_predictors[i, ]) * GeneralizedHyperbolic::rnig(new_n_samples, + mu = -skewness / gamma, # mu_new corresponds to delta_old = -skewness_1 / gamma_1 + delta = shape / gamma, # delta_new = sqrt(nu * sigma^2) = shape_1 / gamma_1 + alpha = sqrt(skewness^2 * gamma^4 + shape^2 * gamma^2), + # alpha = sqrt(mu^2 / sigma^4 + nu / sigma^2) + # = sqrt(skewness_1^2 * gamma_1^4 + shape_1^2 * gamma_1^2) - if (return_hyper_samples) { - for (j in 1:n_hyper_samples) { - hyper_samples[[model_names[[model_number]]]][[j]][[fold]] <- INLA::inla.hyperpar.sample(n_samples, new_model, improve.marginals = TRUE) - } - } + beta = skewness * gamma # beta = mu_old / sigma^2 = skewness_1 * gamma_1 + ) + }) + } else if(models[[model_number]]$.args$family == "stochvolt"){ + degree <- as.vector(meas_err_par[[1]]) + Y_sample <- lapply(1:nrow(post_linear_predictors), function(i) { + sqrt(post_linear_predictors[i, ]) * rt(new_n_samples, degree) + }) } - } + if (print) { + cat("Samples generated!\n") + } + + return(Y_sample) +} +map_models_to_strings <- function(models) { + # Extract families from models + families <- models$.args$family + + # Define base mappings + mapping <- list( + "gaussian" = "Precision for the Gaussian observations", + "gamma" = "Precision parameter for the Gamma observations", + "poisson" = ".none", + "binomial" = ".none", + "stochvol" = "Offset precision for stochvol", + "stochvolln" = c("Offset precision for stochvolln","Mean offset for stochvolln"), + "stochvolnig" = c("shape parameter for stochvol-nig", "skewness parameter for stochvol-nig"), + "stochvolt" = "degrees of freedom for stochvol student-t" + ) + + # Count occurrences of each family + family_counts <- table(families) + + # Initialize result list + result <- vector("list", length(families)) + + # Track how many times we've seen each family + seen_counts <- list() + + # Process each family + for (i in seq_along(families)) { + family <- families[i] + + # Initialize or increment counter for this family + seen_counts[[family]] <- if (is.null(seen_counts[[family]])) 1 else seen_counts[[family]] + 1 + + # Get base string(s) for this family + base_string <- mapping[[family]] + + # Only append counter if it's not .none and appears multiple times + if (base_string[1] != ".none" && family_counts[family] > 1 && seen_counts[[family]] > 1) { + if (length(base_string) == 1) { + result[[i]] <- paste0(base_string, "[", seen_counts[[family]], "]") + } else { + result[[i]] <- paste0(base_string, "[", seen_counts[[family]], "]") + } + } else { + result[[i]] <- base_string + } + } + + return(result) +} - out <- list() - if (return_samples) { - out[["post_samples"]] <- post_samples - } - if (return_hyper_samples) { - out[["hyper_samples"]] <- hyper_samples - } - out[["hyper_marginals"]] <- hyper_marginals - out[["hyper_summary"]] <- hyper_summary - out[["test_data"]] <- test_data - if (compute_posterior_means) { - out[["post_means"]] <- post_means - } - return(out) -} diff --git a/R/util.R b/R/util.R index 825aae6e..933cc425 100644 --- a/R/util.R +++ b/R/util.R @@ -1869,72 +1869,100 @@ select_indexes <- function(data, idx) { } - -#' Create train and test splits to be used in the `cross_validation` function +#' Create train and test splits for cross-validation #' -#' Train and test splits +#' @description +#' Creates train and test splits for cross-validation by handling multiple data types +#' and supporting k-fold, leave-one-out (LOO), and leave-percentage-out (LPO) methods. +#' Handles missing values and maintains data structure across multiple datasets. #' -#' @param data A `list`, `data.frame`, `SpatialPointsDataFrame` or `metric_graph_data` objects. -#' @param cv_type The type of the folding to be carried out. The options are `k-fold` for `k`-fold cross-validation, in which case the parameter `k` should be provided, -#' `loo`, for leave-one-out and `lpo` for leave-percentage-out, in this case, the parameter `percentage` should be given, and also the `number_folds` -#' with the number of folds to be done. The default is `k-fold`. -#' @param k The number of folds to be used in `k`-fold cross-validation. Will only be used if `cv_type` is `k-fold`. -#' @param percentage The percentage (from 1 to 99) of the data to be used to train the model. Will only be used if `cv_type` is `lpo`. -#' @param number_folds Number of folds to be done if `cv_type` is `lpo`. -#' @return A list with two elements, `train` containing the training indices and `test` containing indices. -#' @export - -create_train_test_indices <- function(data, cv_type = c("k-fold", "loo", "lpo"), - k = 5, percentage = 20, number_folds = 10) { - if (inherits(data, "metric_graph_data")) { - idx <- seq_len(nrow(as.data.frame(data))) +#' @param data_list A list of datasets, one per likelihood. Each dataset can be a data.frame, +#' SpatialPointsDataFrame, or metric_graph_data object +#' @param cv_type Type of cross-validation: "k-fold", "loo", or "lpo". Default is "k-fold" +#' @param k Number of folds for k-fold CV. Default is 5 +#' @param percentage Training data percentage for LPO CV (1-99). Default is 20 +#' @param number_folds Number of folds for LPO CV. Default is 10 +#' +#' @return A list where each element contains: +#' \item{train}{Indices for training data mapped to original datasets} +#' \item{test}{Indices for test data mapped to original datasets} +#' +#' @details +#' The function handles NA values by removing rows with any missing values before +#' creating splits. For multiple datasets, indices are mapped back to their original +#' positions in each dataset. +#' @export + +create_train_test_indices <- function(data_list, cv_type = c("k-fold", "loo", "lpo"), + k = 5, percentage = 20, number_folds = 10) { + # First concatenate all data + if (inherits(data_list[[1]], "metric_graph_data")) { + data <- do.call(rbind, lapply(data_list, as.data.frame)) } else { - idx <- seq_len(nrow(data)) + data <- do.call(rbind, data_list) } - if (inherits(data, "SpatialPointsDataFrame")) { + + # Get indices for concatenated data as before + idx <- seq_len(nrow(data)) + + if (inherits(data_list[[1]], "SpatialPointsDataFrame")) { data_tmp <- data@data data_nonNA <- !is.na(data_tmp) - } else if (inherits(data, "metric_graph_data")) { + } else if (inherits(data_list[[1]], "metric_graph_data")) { data_nonNA <- !is.na(as.data.frame(data)) } else { data_nonNA <- !is.na(data) } + idx_nonNA <- sapply(1:length(idx), function(i) { all(data_nonNA[i, ]) }) idx <- idx[idx_nonNA] + + # Get cumulative sizes to map back to individual datasets + n_samples <- sapply(data_list, nrow) + cum_sizes <- cumsum(c(0, n_samples)) + + # Function to map concatenated indices to individual dataset indices + map_to_likelihood_indices <- function(indices) { + lapply(seq_along(data_list), function(i) { + likelihood_indices <- indices[indices > cum_sizes[i] & indices <= cum_sizes[i + 1]] + likelihood_indices - cum_sizes[i] + }) + } + if (cv_type == "k-fold") { - # split idx into k folds <- cut(sample(idx), breaks = k, label = FALSE) - test_list_idx <- lapply(1:k, function(i) { - which(folds == i, arr.ind = TRUE) - }) - test_list <- lapply(test_list_idx, function(idx_test) { - idx[idx_test] - }) - train_list <- lapply(1:k, function(i) { - idx[-test_list_idx[[i]]] + fold_list <- lapply(1:k, function(i) { + test_idx <- which(folds == i, arr.ind = TRUE) + train_idx <- idx[-test_idx] + test_idx <- idx[test_idx] + + list( + train = map_to_likelihood_indices(train_idx), + test = map_to_likelihood_indices(test_idx) + ) }) } else if (cv_type == "loo") { - train_list <- lapply(1:length(idx), function(i) { - idx[-i] + fold_list <- lapply(seq_along(idx), function(i) { + list( + train = map_to_likelihood_indices(idx[-i]), + test = map_to_likelihood_indices(idx[i]) + ) }) - # test_list <- lapply(1:length(idx), function(i){idx[i]}) - test_list <- as.list(idx) } else if (cv_type == "lpo") { - test_list_idx <- list() - n_Y <- length(idx) - for (i in number_folds:1) { - test_list_idx[[i]] <- sample(1:length(idx), size = (1 - percentage / 100) * n_Y) - } - train_list <- lapply(1:number_folds, function(i) { - idx[-test_list_idx[[i]]] - }) - test_list <- lapply(test_list_idx, function(idx_test) { - idx[idx_test] + fold_list <- lapply(1:number_folds, function(i) { + test_idx <- sample(idx, size = (1 - percentage / 100) * length(idx)) + train_idx <- idx[-match(test_idx, idx)] + + list( + train = map_to_likelihood_indices(train_idx), + test = map_to_likelihood_indices(test_idx) + ) }) } - return(list(train = train_list, test = test_list)) + + return(fold_list) } # Check for required packages From 2d94ba3cc145c72be39ff30f3e97fe17b501d290 Mon Sep 17 00:00:00 2001 From: vpnsctl Date: Thu, 12 Dec 2024 04:16:24 +0300 Subject: [PATCH 2/7] updates --- NAMESPACE | 1 - R/inlabru_rspde.R | 88 ++++++++++++------- man/bru_get_mapper.inla_rspde.Rd | 8 +- ...bru_get_mapper.inla_rspde_anisotropic2d.Rd | 2 +- man/bru_get_mapper.inla_rspde_matern1d.Rd | 8 +- man/bru_get_mapper.inla_rspde_spacetime.Rd | 2 +- man/create_train_test_indices.Rd | 30 ++++--- man/cross_validation.Rd | 12 ++- man/group_predict.Rd | 55 ------------ 9 files changed, 96 insertions(+), 110 deletions(-) delete mode 100644 man/group_predict.Rd diff --git a/NAMESPACE b/NAMESPACE index 58b1717c..54ae022b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -73,7 +73,6 @@ export(get.initial.values.rSPDE) export(gg_df) export(glance) export(graph_data_rspde) -export(group_predict) export(intrinsic.matern.operators) export(matern.covariance) export(matern.operators) diff --git a/R/inlabru_rspde.R b/R/inlabru_rspde.R index 089df926..ee732e5a 100644 --- a/R/inlabru_rspde.R +++ b/R/inlabru_rspde.R @@ -117,12 +117,32 @@ ibm_jacobian.bru_mapper_inla_rspde <- function(mapper, input, ...) { ) } +#' @noRd + +process_formula <- function(bru_result) { + form <- bru_result$bru_info$model$formula[3] + form <- as.character(form) + form <- strsplit(form, "f\\(") + form <- form[[1]] + form <- form[-1] + form_proc <- sub(",.*", "", strsplit(form, "f\\(")[1]) + if (length(form) > 1) { + for (i in 2:(length(form))) { + form_proc <- paste(form_proc, " + ", sub(",.*", "", strsplit(form, "f\\(")[i])) + } + } + form_proc <- paste("~", "linkfuninv(", form_proc, ")") + return(stats::as.formula(form_proc)) +} #' @noRd process_formula_lhoods <- function(bru_result, like_number) { form <- bru_result$bru_info$lhoods[[like_number]]$formula[3] form <- as.character(form) + if(form == "."){ + return(process_formula(bru_result)) + } form_proc <- paste("~", "linkfuninv(", form, ")") return(stats::as.formula(form_proc)) } @@ -402,8 +422,10 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps # Creating lists of train and test datasets if (is.null(train_test_indexes)) { - data_list <- lapply(seq_along(models$bru_info$lhoods), function(i) { - models$bru_info$lhoods[[i]]$data + # Observe that here we are assuming that all models use the same data, which we added in the description as an assumption. + + data_list <- lapply(seq_len(n_likelihoods), function(i) { + models[[1]]$bru_info$lhoods[[i]]$data }) train_test_indexes <- create_train_test_indices(data_list, cv_type = cv_type, @@ -468,11 +490,14 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps result_df <- data.frame(Model = model_names) - dss <- lapply(1:n_likelihoods, matrix(numeric(n_folds * n_models), ncol = n_models)) - mse <- lapply(1:n_likelihoods, matrix(numeric(n_folds * n_models), ncol = n_models)) - mae <- lapply(1:n_likelihoods, matrix(numeric(n_folds * n_models), ncol = n_models)) - crps <- lapply(1:n_likelihoods, matrix(numeric(n_folds * n_models), ncol = n_models)) - scrps <- lapply(1:n_likelihoods, matrix(numeric(n_folds * n_models), ncol = n_models)) + n_folds <- length(train_test_indexes) + n_models <- length(models) + + dss <- lapply(1:n_likelihoods, function(i) matrix(numeric(n_folds * n_models), ncol = n_models)) + mse <- lapply(1:n_likelihoods, function(i) matrix(numeric(n_folds * n_models), ncol = n_models)) + mae <- lapply(1:n_likelihoods, function(i) matrix(numeric(n_folds * n_models), ncol = n_models)) + crps <- lapply(1:n_likelihoods, function(i) matrix(numeric(n_folds * n_models), ncol = n_models)) + scrps <- lapply(1:n_likelihoods, function(i) matrix(numeric(n_folds * n_models), ncol = n_models)) if (("crps" %in% scores) || ("scrps" %in% scores) || ("dss" %in% scores)) { new_n_samples <- 2 * n_samples @@ -480,10 +505,10 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps new_n_samples <- n_samples } - for (fold in 1:length(train_list)) { + for (fold in 1:length(train_test_indexes)) { for (model_number in 1:length(models)) { if (print) { - cat(paste("Fold:", fold, "/", length(train_list), "\n")) + cat(paste("Fold:", fold, "/", length(train_test_indexes), "\n")) if (!is.null(model_names)) { cat(paste("Model:", model_names[[model_number]], "\n")) } else { @@ -500,7 +525,7 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps model_family <- models[[model_number]]$.args$family[[i_lik]] - post_linear_predictors <- samples_posterior_linear_predictor(new_model, i_lik, test_list, n_samples, print) + post_linear_predictors <- sample_posterior_linear_predictor(new_model, i_lik, test_list, n_samples, print) post_samples[[model_names[[model_number]]]][[fold]][[i_lik]] <- get_posterior_samples( post_linear_predictors = post_linear_predictors, new_model = new_model, @@ -508,6 +533,8 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps full_model = models[[model_number]], true_CV = true_CV, print = print) + post_samples[[model_names[[model_number]]]][[fold]][[i_lik]] <- do.call(rbind, post_samples[[model_names[[model_number]]]][[fold]][[i_lik]]) + test_data <- models[[model_number]]$bru_info$lhoods[[i_lik]]$response_data$BRU_response[test_list[[i_lik]]] if(return_true_test_values){ @@ -516,6 +543,7 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps if (!(model_family %in% c("stochvol", "stochvolln", "stochvolnig", "stochvolt"))) { + posterior_mean <- rowMeans(post_samples[[model_names[[model_number]]]][[fold]][[i_lik]]) if ("mse" %in% scores) { mse[[i_lik]][fold, model_number] <- mean((test_data - posterior_mean)^2, na.rm = TRUE) if (orientation_results == "positive") { @@ -549,17 +577,17 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps Y2_sample <- post_samples[[model_names[[model_number]]]][[fold]][[i_lik]][, (n_samples + 1):(2 * n_samples)] if (parallelize_RP) { E1_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean(abs(Y1_sample[[i]] - test_data[i])) + mean(abs(Y1_sample[i,] - test_data[i])) }) E2_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { - mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) + mean(abs(Y1_sample[i,] - Y2_sample[i,])) }) } else { E1_tmp <- lapply(1:length(test_data), function(i) { - mean(abs(Y1_sample[[i]] - test_data[i])) + mean(abs(Y1_sample[i,] - test_data[i])) }) E2_tmp <- lapply(1:length(test_data), function(i) { - mean(abs(Y1_sample[[i]] - Y2_sample[[i]])) + mean(abs(Y1_sample[i,] - Y2_sample[i,])) }) } } @@ -740,7 +768,7 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps final_row <- c(final_row, "") next } - + if (orientation_results == "negative") { best_tmp <- which.min(result_df[, j]) } else { @@ -751,17 +779,17 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps result_df <- rbind(result_df, final_row) row.names(result_df)[nrow(result_df)] <- "" } - + # Cluster cleanup if (parallelize_RP) { parallel::stopCluster(cluster_tmp) } - + # Set return flag if (return_post_samples) { return_scores_folds <- TRUE } - + # Prepare output if (!return_scores_folds) { if (save_settings) { @@ -826,7 +854,7 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps #' @noRd -samples_posterior_linear_predictor <- function(model, i_lik, test_list, n_samples, print){ +sample_posterior_linear_predictor <- function(model, i_lik, test_list, n_samples, print){ test_data <- model$bru_info$lhoods[[i_lik]]$response_data$BRU_response[test_list[[i_lik]]] link_name <- model$.args$control.family[[i_lik]]$link @@ -853,7 +881,7 @@ samples_posterior_linear_predictor <- function(model, i_lik, test_list, n_sample linkfuninv <- process_link(link_name) } - formula_tmp <- process_formula_lhoods(models[[model_number]], i_lik) + formula_tmp <- process_formula(model) env_tmp <- environment(formula_tmp) assign("linkfuninv", linkfuninv, envir = env_tmp) @@ -865,11 +893,11 @@ samples_posterior_linear_predictor <- function(model, i_lik, test_list, n_sample cat("Generating samples...\n") } - data <- models[[model_number]]$bru_info$lhoods[[i_lik]]$data + data <- model$bru_info$lhoods[[i_lik]]$data df_pred <- select_indexes(data, test_list[[i_lik]]) - post_samples <- inlabru::generate(new_model, newdata = df_pred, formula = formula, n.samples = n_samples) + post_samples <- inlabru::generate(model, newdata = df_pred, formula = formula_tmp, n.samples = n_samples) return(post_samples) } @@ -896,31 +924,31 @@ get_posterior_samples <- function(post_linear_predictors, new_model, i_lik, new_ }) } - if (models[[model_number]]$.args$family == "gaussian") { + if (model_family == "gaussian") { sd_sample <- 1 / sqrt(as.vector(meas_err_par[[1]])) Y_sample <- lapply(1:nrow(post_linear_predictors), function(i) { post_linear_predictors[i, ] + sd_sample * rnorm(new_n_samples) }) - } else if(models[[model_number]]$.args$family == "gamma"){ + } else if(model_family == "gamma"){ phi_sample <- as.vector(meas_err_par[[1]]) Y_sample <- lapply(1:nrow(post_linear_predictors), function(i) { scale_temp <- post_linear_predictors[i,] / phi_sample rgamma(new_n_samples, shape = phi_sample, scale = scale_temp) }) - } else if(models[[model_number]]$.args$family == "poisson"){ + } else if(model_family == "poisson"){ Y_sample <- lapply(1:nrow(post_linear_predictors), function(i) { stats::rpois(n_samples, post_linear_predictors[i,]) }) - } else if(models[[model_number]]$.args$family == "binomial"){ + } else if(model_family == "binomial"){ Y_sample <- lapply(1:nrow(post_linear_predictors), function(i) { stats::rbinom(n = new_n_samples, size = 1, prob = post_linear_predictors[i, ]) }) - } else if(models[[model_number]]$.args$family == "stochvol"){ + } else if(model_family == "stochvol"){ phi_sample <- as.vector(meas_err_par[[1]]) Y_sample <- lapply(1:nrow(post_linear_predictors), function(i) { sqrt(post_linear_predictors[i, ] + 1 / phi_sample) * rnorm(new_n_samples) }) - } else if(models[[model_number]]$.args$family == "stochvolln"){ + } else if(model_family == "stochvolln"){ phi_sample <- as.vector(meas_err_par[[1]]) mu_sample <- as.vector(meas_err_par[[2]]) Y_sample <- lapply(1:nrow(post_linear_predictors), function(i) { @@ -928,7 +956,7 @@ get_posterior_samples <- function(post_linear_predictors, new_model, i_lik, new_ mean <- mu_sample - 0.5 * var mean + sqrt(var) * rnorm(new_n_samples) }) - } else if(models[[model_number]]$.args$family == "stochvolnig"){ + } else if(model_family == "stochvolnig"){ shape <- as.vector(meas_err_par[[1]]) skewness <- as.vector(meas_err_par[[2]]) gamma <- sqrt(1+skewness^2/shape^2) @@ -943,7 +971,7 @@ get_posterior_samples <- function(post_linear_predictors, new_model, i_lik, new_ beta = skewness * gamma # beta = mu_old / sigma^2 = skewness_1 * gamma_1 ) }) - } else if(models[[model_number]]$.args$family == "stochvolt"){ + } else if(model_family == "stochvolt"){ degree <- as.vector(meas_err_par[[1]]) Y_sample <- lapply(1:nrow(post_linear_predictors), function(i) { sqrt(post_linear_predictors[i, ]) * rt(new_n_samples, degree) diff --git a/man/bru_get_mapper.inla_rspde.Rd b/man/bru_get_mapper.inla_rspde.Rd index 75728dc1..1ce70685 100644 --- a/man/bru_get_mapper.inla_rspde.Rd +++ b/man/bru_get_mapper.inla_rspde.Rd @@ -7,13 +7,13 @@ \alias{ibm_jacobian.bru_mapper_inla_rspde} \title{rSPDE inlabru mapper} \usage{ -bru_get_mapper.inla_rspde(model, ...) +\method{bru_get_mapper}{inla_rspde}(model, ...) -ibm_n.bru_mapper_inla_rspde(mapper, ...) +\method{ibm_n}{bru_mapper_inla_rspde}(mapper, ...) -ibm_values.bru_mapper_inla_rspde(mapper, ...) +\method{ibm_values}{bru_mapper_inla_rspde}(mapper, ...) -ibm_jacobian.bru_mapper_inla_rspde(mapper, input, ...) +\method{ibm_jacobian}{bru_mapper_inla_rspde}(mapper, input, ...) } \arguments{ \item{model}{An \code{inla_rspde} object for which to construct or extract a mapper} diff --git a/man/bru_get_mapper.inla_rspde_anisotropic2d.Rd b/man/bru_get_mapper.inla_rspde_anisotropic2d.Rd index 282769ed..f8c4d760 100644 --- a/man/bru_get_mapper.inla_rspde_anisotropic2d.Rd +++ b/man/bru_get_mapper.inla_rspde_anisotropic2d.Rd @@ -4,7 +4,7 @@ \alias{bru_get_mapper.inla_rspde_anisotropic2d} \title{rSPDE anisotropic inlabru mapper} \usage{ -bru_get_mapper.inla_rspde_anisotropic2d(model, ...) +\method{bru_get_mapper}{inla_rspde_anisotropic2d}(model, ...) } \arguments{ \item{model}{An \code{inla_rspde_anisotropic2d} object for which to construct or extract a mapper} diff --git a/man/bru_get_mapper.inla_rspde_matern1d.Rd b/man/bru_get_mapper.inla_rspde_matern1d.Rd index cc2b8187..33d67587 100644 --- a/man/bru_get_mapper.inla_rspde_matern1d.Rd +++ b/man/bru_get_mapper.inla_rspde_matern1d.Rd @@ -7,13 +7,13 @@ \alias{ibm_jacobian.bru_mapper_inla_rspde_matern1d} \title{rSPDE stationary inlabru mapper} \usage{ -bru_get_mapper.inla_rspde_matern1d(model, ...) +\method{bru_get_mapper}{inla_rspde_matern1d}(model, ...) -ibm_n.bru_mapper_inla_rspde_matern1d(mapper, ...) +\method{ibm_n}{bru_mapper_inla_rspde_matern1d}(mapper, ...) -ibm_values.bru_mapper_inla_rspde_matern1d(mapper, ...) +\method{ibm_values}{bru_mapper_inla_rspde_matern1d}(mapper, ...) -ibm_jacobian.bru_mapper_inla_rspde_matern1d(mapper, input, ...) +\method{ibm_jacobian}{bru_mapper_inla_rspde_matern1d}(mapper, input, ...) } \arguments{ \item{model}{An \code{inla_rspde_matern1d} object for which to construct or extract a mapper} diff --git a/man/bru_get_mapper.inla_rspde_spacetime.Rd b/man/bru_get_mapper.inla_rspde_spacetime.Rd index 298a131a..c71f532d 100644 --- a/man/bru_get_mapper.inla_rspde_spacetime.Rd +++ b/man/bru_get_mapper.inla_rspde_spacetime.Rd @@ -4,7 +4,7 @@ \alias{bru_get_mapper.inla_rspde_spacetime} \title{rSPDE space time inlabru mapper} \usage{ -bru_get_mapper.inla_rspde_spacetime(model, ...) +\method{bru_get_mapper}{inla_rspde_spacetime}(model, ...) } \arguments{ \item{model}{An \code{inla_rspde_spacetime} object for which to construct or extract a mapper} diff --git a/man/create_train_test_indices.Rd b/man/create_train_test_indices.Rd index 5cb2b446..daa12fe2 100644 --- a/man/create_train_test_indices.Rd +++ b/man/create_train_test_indices.Rd @@ -2,10 +2,10 @@ % Please edit documentation in R/util.R \name{create_train_test_indices} \alias{create_train_test_indices} -\title{Create train and test splits to be used in the \code{cross_validation} function} +\title{Create train and test splits for cross-validation} \usage{ create_train_test_indices( - data, + data_list, cv_type = c("k-fold", "loo", "lpo"), k = 5, percentage = 20, @@ -13,21 +13,29 @@ create_train_test_indices( ) } \arguments{ -\item{data}{A \code{list}, \code{data.frame}, \code{SpatialPointsDataFrame} or \code{metric_graph_data} objects.} +\item{data_list}{A list of datasets, one per likelihood. Each dataset can be a data.frame, +SpatialPointsDataFrame, or metric_graph_data object} -\item{cv_type}{The type of the folding to be carried out. The options are \code{k-fold} for \code{k}-fold cross-validation, in which case the parameter \code{k} should be provided, -\code{loo}, for leave-one-out and \code{lpo} for leave-percentage-out, in this case, the parameter \code{percentage} should be given, and also the \code{number_folds} -with the number of folds to be done. The default is \code{k-fold}.} +\item{cv_type}{Type of cross-validation: "k-fold", "loo", or "lpo". Default is "k-fold"} -\item{k}{The number of folds to be used in \code{k}-fold cross-validation. Will only be used if \code{cv_type} is \code{k-fold}.} +\item{k}{Number of folds for k-fold CV. Default is 5} -\item{percentage}{The percentage (from 1 to 99) of the data to be used to train the model. Will only be used if \code{cv_type} is \code{lpo}.} +\item{percentage}{Training data percentage for LPO CV (1-99). Default is 20} -\item{number_folds}{Number of folds to be done if \code{cv_type} is \code{lpo}.} +\item{number_folds}{Number of folds for LPO CV. Default is 10} } \value{ -A list with two elements, \code{train} containing the training indices and \code{test} containing indices. +A list where each element contains: +\item{train}{Indices for training data mapped to original datasets} +\item{test}{Indices for test data mapped to original datasets} } \description{ -Train and test splits +Creates train and test splits for cross-validation by handling multiple data types +and supporting k-fold, leave-one-out (LOO), and leave-percentage-out (LPO) methods. +Handles missing values and maintains data structure across multiple datasets. +} +\details{ +The function handles NA values by removing rows with any missing values before +creating splits. For multiple datasets, indices are mapped back to their original +positions in each dataset. } diff --git a/man/cross_validation.Rd b/man/cross_validation.Rd index 3b12728d..8f22067f 100644 --- a/man/cross_validation.Rd +++ b/man/cross_validation.Rd @@ -29,7 +29,9 @@ cross_validation( ) } \arguments{ -\item{models}{A fitted model obtained from calling the \code{bru()} function or a list of models fitted with the \code{bru()} function.} +\item{models}{A fitted model obtained from calling the \code{bru()} function or a list of fitted models. +All models in the list must have the same number of likelihoods and must be fitted to +identical datasets.} \item{model_names}{A vector containing the names of the models to appear in the returned \code{data.frame}. If \code{NULL}, the names will be of the form \verb{Model 1}, \verb{Model 2}, and so on. By default, it will try to obtain the name from the models list.} @@ -53,8 +55,12 @@ with the number of folds to be done. The default is \code{k-fold}.} \item{include_best}{Should a row indicating which model was the best for each score be included?} -\item{train_test_indexes}{A list containing two entries \code{train}, which is a list whose elements are vectors of indexes of the training data, and \code{test}, which is a list whose elements are vectors of indexes of the test data. -Typically this will be returned list obtained by setting the argument \code{return_train_test} to \code{TRUE}.} +\item{train_test_indexes}{A list where each element corresponds to a fold. Each fold contains: +\itemize{ +\item \code{train}: A list of training index vectors, one for each likelihood. +\item \code{test}: A list of test index vectors, one for each likelihood, with the same length as \code{train}. +This list is typically obtained by setting the argument \code{return_train_test} to \code{TRUE}. +}} \item{return_train_test}{Logical. Should the training and test indexes be returned? If 'TRUE' the train and test indexes will the 'train_test' element of the returned list.} diff --git a/man/group_predict.Rd b/man/group_predict.Rd deleted file mode 100644 index e50c8b34..00000000 --- a/man/group_predict.Rd +++ /dev/null @@ -1,55 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/inlabru_rspde.R -\name{group_predict} -\alias{group_predict} -\title{Perform prediction on a testing set based on a training set} -\usage{ -group_predict( - models, - model_names = NULL, - formula = NULL, - train_indices, - test_indices, - n_samples = 1000, - pseudo_predict = TRUE, - return_samples = FALSE, - return_hyper_samples = FALSE, - n_hyper_samples = 1, - compute_posterior_means = TRUE, - print = TRUE, - fit_verbose = FALSE -) -} -\arguments{ -\item{models}{A fitted model obtained from calling the \code{bru()} function or a list of models fitted with the \code{bru()} function.} - -\item{model_names}{A vector containing the names of the models to appear in the returned \code{data.frame}. If \code{NULL}, the names will be of the form \verb{Model 1}, \verb{Model 2}, and so on. By default, it will try to obtain the name from the models list.} - -\item{formula}{A formula where the right hand side defines an R expression to evaluate for each generated sample. If \verb{NULL``, the latent and hyperparameter states are returned as named list elements. See the manual for the }predict\verb{method in the}inlabru` package.} - -\item{train_indices}{A list containing the indices of the observations for the model to be trained, or a numerical vector containing the indices.} - -\item{test_indices}{A list containing the indices of the test data, where the prediction will be done, or a numerical vector containing the indices.} - -\item{n_samples}{Number of samples to compute the posterior statistics to be used to compute the scores.} - -\item{pseudo_predict}{If \code{TRUE}, the models will NOT be refitted on the training data, and the parameters obtained on the entire dataset will be used. If \code{FALSE}, the models will be refitted on the training data.} - -\item{return_samples}{Should the posterior samples be returned?} - -\item{return_hyper_samples}{Should samples for the hyperparameters be obtained?} - -\item{n_hyper_samples}{Number of independent samples of the hyper parameters of size \code{n_samples}.} - -\item{compute_posterior_means}{Should the posterior means be computed from the posterior samples?} - -\item{print}{Should partial results be printed throughout the computation?} - -\item{fit_verbose}{Should INLA's run during the prediction be verbose?} -} -\value{ -A data.frame with the fitted models and the corresponding scores. -} -\description{ -Compute prediction of a formula-based expression on a testing set based on a training set. -} From d9da825ed044d007aae1b6adbd89b93726e50b58 Mon Sep 17 00:00:00 2001 From: vpnsctl Date: Thu, 12 Dec 2024 05:57:16 +0300 Subject: [PATCH 3/7] adjusts --- R/inlabru_rspde.R | 21 +++++++++++---------- R/util.R | 8 ++++---- man/cross_validation.Rd | 2 +- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/R/inlabru_rspde.R b/R/inlabru_rspde.R index ee732e5a..5cb214ad 100644 --- a/R/inlabru_rspde.R +++ b/R/inlabru_rspde.R @@ -311,7 +311,7 @@ get_post_var <- function(density_df) { #' @param fit_verbose Should INLA's run during cross-validation be verbose? #' @return A data.frame with the fitted models and the corresponding scores. #' @export -cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps", "scrps", "dss"), +cross_validation <- function(models, model_names = NULL, scores = c("mae", "mse", "crps", "scrps", "dss"), cv_type = c("k-fold", "loo", "lpo"), k = 5, percentage = 20, number_folds = 10, n_samples = 1000, return_scores_folds = FALSE, @@ -330,7 +330,7 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps stop("orientation_results must be either 'positive' or 'negative'!") } - scores <- intersect(scores, c("mse", "crps", "scrps", "dss")) + scores <- intersect(scores, c("mae", "mse", "crps", "scrps", "dss")) cv_type <- cv_type[[1]] if (!(cv_type %in% c("k-fold", "loo", "lpo"))) { @@ -565,16 +565,19 @@ cross_validation <- function(models, model_names = NULL, scores = c("mse", "crps } if ("dss" %in% scores) { - post_var <- rowMeans(post_samples[[model_names[[model_number]]]][[fold]][[i_lik]][, 1:n_samples]^2) - (rowMeans(post_samples[[model_names[[model_number]]]][[fold]][[i_lik]][, 1:n_samples]))^2 + post_var <- rowMeans(post_samples[[model_names[[model_number]]]][[fold]][[i_lik]][, 1:n_samples, drop=FALSE]^2) - (rowMeans(post_samples[[model_names[[model_number]]]][[fold]][[i_lik]][, 1:n_samples, drop=FALSE]))^2 - dss[[i_lik]][fold, model_number] <- mean((test_data - rowMeans(post_samples[[model_names[[model_number]]]][[fold]][[i_lik]][, (n_samples + 1):(2 * n_samples)]))^2 / post_var + log(post_var)) + dss[[i_lik]][fold, model_number] <- mean((test_data - rowMeans(post_samples[[model_names[[model_number]]]][[fold]][[i_lik]][, (n_samples + 1):(2 * n_samples), drop=FALSE]))^2 / post_var + log(post_var)) + if (print) { + cat(paste("DSS - Likelihood ",i_lik,": ", dss[[i_lik]][fold, model_number], "\n")) + } } if (("crps" %in% scores) || ("scrps" %in% scores)) { - Y1_sample <- post_samples[[model_names[[model_number]]]][[fold]][[i_lik]][, 1:n_samples] - Y2_sample <- post_samples[[model_names[[model_number]]]][[fold]][[i_lik]][, (n_samples + 1):(2 * n_samples)] + Y1_sample <- post_samples[[model_names[[model_number]]]][[fold]][[i_lik]][, 1:n_samples, drop=FALSE] + Y2_sample <- post_samples[[model_names[[model_number]]]][[fold]][[i_lik]][, (n_samples + 1):(2 * n_samples), drop=FALSE] if (parallelize_RP) { E1_tmp <- foreach::`%dopar%`(foreach::foreach(i = 1:length(test_data)), { mean(abs(Y1_sample[i,] - test_data[i])) @@ -917,10 +920,8 @@ get_posterior_samples <- function(post_linear_predictors, new_model, i_lik, new_ family_mappings <- map_models_to_strings(full_model) if(family_mappings[[i_lik]] != ".none"){ meas_err_par <- lapply(family_mappings[[i_lik]], function(param) { - INLA::inla.rmarginal( - new_n_samples, - model_sample$marginals.hyperpar[[param]] - ) + hyper_sample <- INLA::inla.hyperpar.sample(new_n_samples, model_sample, improve.marginals = TRUE) + return(hyper_sample[,param]) }) } diff --git a/R/util.R b/R/util.R index 933cc425..3227cb29 100644 --- a/R/util.R +++ b/R/util.R @@ -1897,10 +1897,10 @@ create_train_test_indices <- function(data_list, cv_type = c("k-fold", "loo", "l k = 5, percentage = 20, number_folds = 10) { # First concatenate all data if (inherits(data_list[[1]], "metric_graph_data")) { - data <- do.call(rbind, lapply(data_list, as.data.frame)) - } else { - data <- do.call(rbind, data_list) - } + data_list <- lapply(data_list, as.data.frame) + } + + data <- do.call(rbind, data_list) # Get indices for concatenated data as before idx <- seq_len(nrow(data)) diff --git a/man/cross_validation.Rd b/man/cross_validation.Rd index 8f22067f..27ce0745 100644 --- a/man/cross_validation.Rd +++ b/man/cross_validation.Rd @@ -7,7 +7,7 @@ cross_validation( models, model_names = NULL, - scores = c("mse", "crps", "scrps", "dss"), + scores = c("mae", "mse", "crps", "scrps", "dss"), cv_type = c("k-fold", "loo", "lpo"), k = 5, percentage = 20, From 920ef05be0c06b0cfcbf788fe671a1a5b9aac385 Mon Sep 17 00:00:00 2001 From: vpnsctl Date: Thu, 12 Dec 2024 06:19:58 +0300 Subject: [PATCH 4/7] more adjusts --- R/util.R | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/R/util.R b/R/util.R index 3227cb29..335076c7 100644 --- a/R/util.R +++ b/R/util.R @@ -1904,21 +1904,7 @@ create_train_test_indices <- function(data_list, cv_type = c("k-fold", "loo", "l # Get indices for concatenated data as before idx <- seq_len(nrow(data)) - - if (inherits(data_list[[1]], "SpatialPointsDataFrame")) { - data_tmp <- data@data - data_nonNA <- !is.na(data_tmp) - } else if (inherits(data_list[[1]], "metric_graph_data")) { - data_nonNA <- !is.na(as.data.frame(data)) - } else { - data_nonNA <- !is.na(data) - } - - idx_nonNA <- sapply(1:length(idx), function(i) { - all(data_nonNA[i, ]) - }) - idx <- idx[idx_nonNA] - + # Get cumulative sizes to map back to individual datasets n_samples <- sapply(data_list, nrow) cum_sizes <- cumsum(c(0, n_samples)) From 9e630e25d1cb2c250104c3b54ce0f4752b20202b Mon Sep 17 00:00:00 2001 From: vpnsctl Date: Thu, 12 Dec 2024 13:18:29 +0300 Subject: [PATCH 5/7] Adjusts --- R/inlabru_rspde.R | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/R/inlabru_rspde.R b/R/inlabru_rspde.R index 5cb214ad..0a1ffae3 100644 --- a/R/inlabru_rspde.R +++ b/R/inlabru_rspde.R @@ -525,7 +525,7 @@ cross_validation <- function(models, model_names = NULL, scores = c("mae", "mse" model_family <- models[[model_number]]$.args$family[[i_lik]] - post_linear_predictors <- sample_posterior_linear_predictor(new_model, i_lik, test_list, n_samples, print) + post_linear_predictors <- sample_posterior_linear_predictor(new_model, i_lik, test_list, new_n_samples, print) post_samples[[model_names[[model_number]]]][[fold]][[i_lik]] <- get_posterior_samples( post_linear_predictors = post_linear_predictors, new_model = new_model, @@ -865,7 +865,7 @@ sample_posterior_linear_predictor <- function(model, i_lik, test_list, n_samples model_family <- model$.args$family[[i_lik]] if (link_name == "default") { - if (model_family == "gaussian") { + if (model_family %in% c("gaussian", "t")) { linkfuninv <- function(x) { x } @@ -918,7 +918,7 @@ get_posterior_samples <- function(post_linear_predictors, new_model, i_lik, new_ } family_mappings <- map_models_to_strings(full_model) - if(family_mappings[[i_lik]] != ".none"){ + if(family_mappings[[i_lik]][[1]] != ".none"){ meas_err_par <- lapply(family_mappings[[i_lik]], function(param) { hyper_sample <- INLA::inla.hyperpar.sample(new_n_samples, model_sample, improve.marginals = TRUE) return(hyper_sample[,param]) @@ -936,6 +936,13 @@ get_posterior_samples <- function(post_linear_predictors, new_model, i_lik, new_ scale_temp <- post_linear_predictors[i,] / phi_sample rgamma(new_n_samples, shape = phi_sample, scale = scale_temp) }) + } else if(model_family == "t"){ + sd_sample <- 1 / sqrt(as.vector(meas_err_par[[1]])) + deg_sample <- as.vector(meas_err_par[[2]]) + Y_sample <- lapply(1:nrow(post_linear_predictors), function(i) { + scale_temp <- post_linear_predictors[i,] + + sd_sample + rt(new_n_samples, df = deg_sample) + }) } else if(model_family == "poisson"){ Y_sample <- lapply(1:nrow(post_linear_predictors), function(i) { stats::rpois(n_samples, post_linear_predictors[i,]) @@ -998,7 +1005,8 @@ map_models_to_strings <- function(models) { "stochvol" = "Offset precision for stochvol", "stochvolln" = c("Offset precision for stochvolln","Mean offset for stochvolln"), "stochvolnig" = c("shape parameter for stochvol-nig", "skewness parameter for stochvol-nig"), - "stochvolt" = "degrees of freedom for stochvol student-t" + "stochvolt" = "degrees of freedom for stochvol student-t", + "t" = c("precision for the student-t observations", "degrees of freedom for student-t") ) # Count occurrences of each family From 67c489c7cfb1348f0099e6ab884d523555d97050 Mon Sep 17 00:00:00 2001 From: vpnsctl Date: Thu, 12 Dec 2024 15:00:07 +0300 Subject: [PATCH 6/7] adjusts --- R/inlabru_rspde.R | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/R/inlabru_rspde.R b/R/inlabru_rspde.R index 0a1ffae3..750dee28 100644 --- a/R/inlabru_rspde.R +++ b/R/inlabru_rspde.R @@ -221,14 +221,13 @@ bru_rerun_with_data <- function(result, idx_data, true_CV, fit_verbose) { original_timings <- result[["bru_timings"]] - lhoods_tmp <- info[["lhoods"]] - for(i_like in seq_along(lhoods_tmp)){ - lhoods_tmp[[i_like]]$response_data$BRU_response[-idx_data[[i_like]]] <- NA + for(i_like in seq_along(info[["lhoods"]])){ + info[["lhoods"]][[i_like]]$response_data$BRU_response[-idx_data[[i_like]]] <- NA } result <- inlabru::iinla( model = info[["model"]], - lhoods = lhoods_tmp, + lhoods = info[["lhoods"]], initial = result, options = info[["options"]] ) @@ -858,8 +857,6 @@ cross_validation <- function(models, model_names = NULL, scores = c("mae", "mse" #' @noRd sample_posterior_linear_predictor <- function(model, i_lik, test_list, n_samples, print){ - test_data <- model$bru_info$lhoods[[i_lik]]$response_data$BRU_response[test_list[[i_lik]]] - link_name <- model$.args$control.family[[i_lik]]$link model_family <- model$.args$family[[i_lik]] @@ -884,7 +881,7 @@ sample_posterior_linear_predictor <- function(model, i_lik, test_list, n_samples linkfuninv <- process_link(link_name) } - formula_tmp <- process_formula(model) + formula_tmp <- process_formula_lhoods(model, i_lik) env_tmp <- environment(formula_tmp) assign("linkfuninv", linkfuninv, envir = env_tmp) @@ -898,11 +895,13 @@ sample_posterior_linear_predictor <- function(model, i_lik, test_list, n_samples data <- model$bru_info$lhoods[[i_lik]]$data - df_pred <- select_indexes(data, test_list[[i_lik]]) + # df_pred <- select_indexes(data, test_list[[i_lik]]) + + # post_samples <- inlabru::generate(model, newdata = df_pred, formula = formula_tmp, n.samples = n_samples) - post_samples <- inlabru::generate(model, newdata = df_pred, formula = formula_tmp, n.samples = n_samples) + post_samples <- inlabru::generate(model, newdata = data, formula = formula_tmp, n.samples = n_samples) - return(post_samples) + return(post_samples[test_list[[i_lik]] , , drop=FALSE]) } @@ -999,7 +998,7 @@ map_models_to_strings <- function(models) { # Define base mappings mapping <- list( "gaussian" = "Precision for the Gaussian observations", - "gamma" = "Precision parameter for the Gamma observations", + "gamma" = "Precision-parameter for the Gamma observations", "poisson" = ".none", "binomial" = ".none", "stochvol" = "Offset precision for stochvol", From 6f44cf0a8365076e85582248c795b5e8cb023e20 Mon Sep 17 00:00:00 2001 From: vpnsctl Date: Thu, 12 Dec 2024 16:09:20 +0300 Subject: [PATCH 7/7] updates --- NEWS.md | 2 ++ R/inlabru_rspde.R | 29 ++++++++--------------------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/NEWS.md b/NEWS.md index 27a782c5..b65f1a2f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # rSPDE (development version) +* Improved the `cross_validation` function to allow for multiple likelihoods. + # rSPDE 2.4.0 * Created the `group_predict` function, to obtain predictions on a testing set based on observations on a training set. diff --git a/R/inlabru_rspde.R b/R/inlabru_rspde.R index 750dee28..2d0707fd 100644 --- a/R/inlabru_rspde.R +++ b/R/inlabru_rspde.R @@ -991,6 +991,7 @@ get_posterior_samples <- function(post_linear_predictors, new_model, i_lik, new_ return(Y_sample) } +#' @noRd map_models_to_strings <- function(models) { # Extract families from models families <- models$.args$family @@ -1008,41 +1009,27 @@ map_models_to_strings <- function(models) { "t" = c("precision for the student-t observations", "degrees of freedom for student-t") ) - # Count occurrences of each family - family_counts <- table(families) - # Initialize result list result <- vector("list", length(families)) - # Track how many times we've seen each family - seen_counts <- list() - # Process each family for (i in seq_along(families)) { family <- families[i] - - # Initialize or increment counter for this family - seen_counts[[family]] <- if (is.null(seen_counts[[family]])) 1 else seen_counts[[family]] + 1 - - # Get base string(s) for this family base_string <- mapping[[family]] - # Only append counter if it's not .none and appears multiple times - if (base_string[1] != ".none" && family_counts[family] > 1 && seen_counts[[family]] > 1) { + # If it's first occurrence or .none, use base string + if (i == 1 || base_string[1] == ".none") { + result[[i]] <- base_string + } else { + # For subsequent occurrences, append [i] if (length(base_string) == 1) { - result[[i]] <- paste0(base_string, "[", seen_counts[[family]], "]") + result[[i]] <- paste0(base_string, "[", i, "]") } else { - result[[i]] <- paste0(base_string, "[", seen_counts[[family]], "]") + result[[i]] <- paste0(base_string, "[", i, "]") } - } else { - result[[i]] <- base_string } } return(result) } - - - -