Merge branch 'devel-src' of github.com:davidbolin/rSPDE into devel-src

davidbolin · Dec 12, 2024 · 9f3dd0b · 9f3dd0b
2 parents 812c645 + bf18b7b
commit 9f3dd0b
Show file tree

Hide file tree

Showing 11 changed files with 615 additions and 1,535 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -73,7 +73,6 @@ export(get.initial.values.rSPDE)
 export(gg_df)
 export(glance)
 export(graph_data_rspde)
-export(group_predict)
 export(intrinsic.matern.operators)
 export(matern.covariance)
 export(matern.operators)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,7 @@
 # rSPDE (development version)
 
+* Improved the `cross_validation` function to allow for multiple likelihoods. 
+
 # rSPDE 2.4.0
 
 * Created the `group_predict` function, to obtain predictions on a testing set based on observations on a training set.

diff --git a/R/inlabru_rspde.R b/R/inlabru_rspde.R
diff --git a/R/util.R b/R/util.R
@@ -1869,72 +1869,86 @@ select_indexes <- function(data, idx) {
 }
 
 
-
-#' Create train and test splits to be used in the `cross_validation` function
+#' Create train and test splits for cross-validation
 #'
-#' Train and test splits
+#' @description
+#' Creates train and test splits for cross-validation by handling multiple data types
+#' and supporting k-fold, leave-one-out (LOO), and leave-percentage-out (LPO) methods.
+#' Handles missing values and maintains data structure across multiple datasets.
 #'
-#' @param data A `list`, `data.frame`, `SpatialPointsDataFrame` or `metric_graph_data` objects.
-#' @param cv_type The type of the folding to be carried out. The options are `k-fold` for `k`-fold cross-validation, in which case the parameter `k` should be provided,
-#' `loo`, for leave-one-out and `lpo` for leave-percentage-out, in this case, the parameter `percentage` should be given, and also the `number_folds`
-#' with the number of folds to be done. The default is `k-fold`.
-#' @param k The number of folds to be used in `k`-fold cross-validation. Will only be used if `cv_type` is `k-fold`.
-#' @param percentage The percentage (from 1 to 99) of the data to be used to train the model. Will only be used if `cv_type` is `lpo`.
-#' @param number_folds Number of folds to be done if `cv_type` is `lpo`.
-#' @return A list with two elements, `train` containing the training indices and `test` containing indices.
-#' @export
-
-create_train_test_indices <- function(data, cv_type = c("k-fold", "loo", "lpo"),
-                                      k = 5, percentage = 20, number_folds = 10) {
-  if (inherits(data, "metric_graph_data")) {
-    idx <- seq_len(nrow(as.data.frame(data)))
-  } else {
-    idx <- seq_len(nrow(data))
-  }
-  if (inherits(data, "SpatialPointsDataFrame")) {
-    data_tmp <- data@data
-    data_nonNA <- !is.na(data_tmp)
-  } else if (inherits(data, "metric_graph_data")) {
-    data_nonNA <- !is.na(as.data.frame(data))
-  } else {
-    data_nonNA <- !is.na(data)
+#' @param data_list A list of datasets, one per likelihood. Each dataset can be a data.frame, 
+#'        SpatialPointsDataFrame, or metric_graph_data object
+#' @param cv_type Type of cross-validation: "k-fold", "loo", or "lpo". Default is "k-fold"
+#' @param k Number of folds for k-fold CV. Default is 5
+#' @param percentage Training data percentage for LPO CV (1-99). Default is 20
+#' @param number_folds Number of folds for LPO CV. Default is 10
+#'
+#' @return A list where each element contains:
+#'   \item{train}{Indices for training data mapped to original datasets}
+#'   \item{test}{Indices for test data mapped to original datasets}
+#'
+#' @details
+#' The function handles NA values by removing rows with any missing values before
+#' creating splits. For multiple datasets, indices are mapped back to their original
+#' positions in each dataset.
+#' @export 
+
+create_train_test_indices <- function(data_list, cv_type = c("k-fold", "loo", "lpo"),
+                                    k = 5, percentage = 20, number_folds = 10) {
+  # First concatenate all data
+  if (inherits(data_list[[1]], "metric_graph_data")) {
+    data_list <- lapply(data_list, as.data.frame)
+  } 
+
+  data <- do.call(rbind, data_list)
+
+  # Get indices for concatenated data as before
+  idx <- seq_len(nrow(data))
+
+  # Get cumulative sizes to map back to individual datasets
+  n_samples <- sapply(data_list, nrow)
+  cum_sizes <- cumsum(c(0, n_samples))
+
+  # Function to map concatenated indices to individual dataset indices
+  map_to_likelihood_indices <- function(indices) {
+    lapply(seq_along(data_list), function(i) {
+      likelihood_indices <- indices[indices > cum_sizes[i] & indices <= cum_sizes[i + 1]]
+      likelihood_indices - cum_sizes[i]
+    })
   }
-  idx_nonNA <- sapply(1:length(idx), function(i) {
-    all(data_nonNA[i, ])
-  })
-  idx <- idx[idx_nonNA]
+
   if (cv_type == "k-fold") {
-    # split idx into k
     folds <- cut(sample(idx), breaks = k, label = FALSE)
-    test_list_idx <- lapply(1:k, function(i) {
-      which(folds == i, arr.ind = TRUE)
-    })
-    test_list <- lapply(test_list_idx, function(idx_test) {
-      idx[idx_test]
-    })
-    train_list <- lapply(1:k, function(i) {
-      idx[-test_list_idx[[i]]]
+    fold_list <- lapply(1:k, function(i) {
+      test_idx <- which(folds == i, arr.ind = TRUE)
+      train_idx <- idx[-test_idx]
+      test_idx <- idx[test_idx]
+
+      list(
+        train = map_to_likelihood_indices(train_idx),
+        test = map_to_likelihood_indices(test_idx)
+      )
     })
   } else if (cv_type == "loo") {
-    train_list <- lapply(1:length(idx), function(i) {
-      idx[-i]
+    fold_list <- lapply(seq_along(idx), function(i) {
+      list(
+        train = map_to_likelihood_indices(idx[-i]),
+        test = map_to_likelihood_indices(idx[i])
+      )
     })
-    # test_list <- lapply(1:length(idx), function(i){idx[i]})
-    test_list <- as.list(idx)
   } else if (cv_type == "lpo") {
-    test_list_idx <- list()
-    n_Y <- length(idx)
-    for (i in number_folds:1) {
-      test_list_idx[[i]] <- sample(1:length(idx), size = (1 - percentage / 100) * n_Y)
-    }
-    train_list <- lapply(1:number_folds, function(i) {
-      idx[-test_list_idx[[i]]]
-    })
-    test_list <- lapply(test_list_idx, function(idx_test) {
-      idx[idx_test]
+    fold_list <- lapply(1:number_folds, function(i) {
+      test_idx <- sample(idx, size = (1 - percentage / 100) * length(idx))
+      train_idx <- idx[-match(test_idx, idx)]
+
+      list(
+        train = map_to_likelihood_indices(train_idx),
+        test = map_to_likelihood_indices(test_idx)
+      )
     })
   }
-  return(list(train = train_list, test = test_list))
+
+  return(fold_list)
 }
 
 # Check for required packages

diff --git a/man/bru_get_mapper.inla_rspde.Rd b/man/bru_get_mapper.inla_rspde.Rd
diff --git a/man/bru_get_mapper.inla_rspde_anisotropic2d.Rd b/man/bru_get_mapper.inla_rspde_anisotropic2d.Rd
diff --git a/man/bru_get_mapper.inla_rspde_matern1d.Rd b/man/bru_get_mapper.inla_rspde_matern1d.Rd
diff --git a/man/bru_get_mapper.inla_rspde_spacetime.Rd b/man/bru_get_mapper.inla_rspde_spacetime.Rd
diff --git a/man/create_train_test_indices.Rd b/man/create_train_test_indices.Rd
diff --git a/man/cross_validation.Rd b/man/cross_validation.Rd
diff --git a/man/group_predict.Rd b/man/group_predict.Rd