Skip to content

Commit

Permalink
Merge branch 'devel-src' of github.com:davidbolin/rSPDE into devel-src
Browse files Browse the repository at this point in the history
  • Loading branch information
David Bolin committed Dec 12, 2024
2 parents 812c645 + bf18b7b commit 9f3dd0b
Show file tree
Hide file tree
Showing 11 changed files with 615 additions and 1,535 deletions.
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ export(get.initial.values.rSPDE)
export(gg_df)
export(glance)
export(graph_data_rspde)
export(group_predict)
export(intrinsic.matern.operators)
export(matern.covariance)
export(matern.operators)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# rSPDE (development version)

* Improved the `cross_validation` function to allow for multiple likelihoods.

# rSPDE 2.4.0

* Created the `group_predict` function, to obtain predictions on a testing set based on observations on a training set.
Expand Down
1,904 changes: 505 additions & 1,399 deletions R/inlabru_rspde.R

Large diffs are not rendered by default.

124 changes: 69 additions & 55 deletions R/util.R
Original file line number Diff line number Diff line change
Expand Up @@ -1869,72 +1869,86 @@ select_indexes <- function(data, idx) {
}



#' Create train and test splits to be used in the `cross_validation` function
#' Create train and test splits for cross-validation
#'
#' Train and test splits
#' @description
#' Creates train and test splits for cross-validation by handling multiple data types
#' and supporting k-fold, leave-one-out (LOO), and leave-percentage-out (LPO) methods.
#' Handles missing values and maintains data structure across multiple datasets.
#'
#' @param data A `list`, `data.frame`, `SpatialPointsDataFrame` or `metric_graph_data` objects.
#' @param cv_type The type of the folding to be carried out. The options are `k-fold` for `k`-fold cross-validation, in which case the parameter `k` should be provided,
#' `loo`, for leave-one-out and `lpo` for leave-percentage-out, in this case, the parameter `percentage` should be given, and also the `number_folds`
#' with the number of folds to be done. The default is `k-fold`.
#' @param k The number of folds to be used in `k`-fold cross-validation. Will only be used if `cv_type` is `k-fold`.
#' @param percentage The percentage (from 1 to 99) of the data to be used to train the model. Will only be used if `cv_type` is `lpo`.
#' @param number_folds Number of folds to be done if `cv_type` is `lpo`.
#' @return A list with two elements, `train` containing the training indices and `test` containing indices.
#' @export

create_train_test_indices <- function(data, cv_type = c("k-fold", "loo", "lpo"),
k = 5, percentage = 20, number_folds = 10) {
if (inherits(data, "metric_graph_data")) {
idx <- seq_len(nrow(as.data.frame(data)))
} else {
idx <- seq_len(nrow(data))
}
if (inherits(data, "SpatialPointsDataFrame")) {
data_tmp <- data@data
data_nonNA <- !is.na(data_tmp)
} else if (inherits(data, "metric_graph_data")) {
data_nonNA <- !is.na(as.data.frame(data))
} else {
data_nonNA <- !is.na(data)
#' @param data_list A list of datasets, one per likelihood. Each dataset can be a data.frame,
#' SpatialPointsDataFrame, or metric_graph_data object
#' @param cv_type Type of cross-validation: "k-fold", "loo", or "lpo". Default is "k-fold"
#' @param k Number of folds for k-fold CV. Default is 5
#' @param percentage Training data percentage for LPO CV (1-99). Default is 20
#' @param number_folds Number of folds for LPO CV. Default is 10
#'
#' @return A list where each element contains:
#' \item{train}{Indices for training data mapped to original datasets}
#' \item{test}{Indices for test data mapped to original datasets}
#'
#' @details
#' The function handles NA values by removing rows with any missing values before
#' creating splits. For multiple datasets, indices are mapped back to their original
#' positions in each dataset.
#' @export

create_train_test_indices <- function(data_list, cv_type = c("k-fold", "loo", "lpo"),
k = 5, percentage = 20, number_folds = 10) {
# First concatenate all data
if (inherits(data_list[[1]], "metric_graph_data")) {
data_list <- lapply(data_list, as.data.frame)
}

data <- do.call(rbind, data_list)

# Get indices for concatenated data as before
idx <- seq_len(nrow(data))

# Get cumulative sizes to map back to individual datasets
n_samples <- sapply(data_list, nrow)
cum_sizes <- cumsum(c(0, n_samples))

# Function to map concatenated indices to individual dataset indices
map_to_likelihood_indices <- function(indices) {
lapply(seq_along(data_list), function(i) {
likelihood_indices <- indices[indices > cum_sizes[i] & indices <= cum_sizes[i + 1]]
likelihood_indices - cum_sizes[i]
})
}
idx_nonNA <- sapply(1:length(idx), function(i) {
all(data_nonNA[i, ])
})
idx <- idx[idx_nonNA]

if (cv_type == "k-fold") {
# split idx into k
folds <- cut(sample(idx), breaks = k, label = FALSE)
test_list_idx <- lapply(1:k, function(i) {
which(folds == i, arr.ind = TRUE)
})
test_list <- lapply(test_list_idx, function(idx_test) {
idx[idx_test]
})
train_list <- lapply(1:k, function(i) {
idx[-test_list_idx[[i]]]
fold_list <- lapply(1:k, function(i) {
test_idx <- which(folds == i, arr.ind = TRUE)
train_idx <- idx[-test_idx]
test_idx <- idx[test_idx]

list(
train = map_to_likelihood_indices(train_idx),
test = map_to_likelihood_indices(test_idx)
)
})
} else if (cv_type == "loo") {
train_list <- lapply(1:length(idx), function(i) {
idx[-i]
fold_list <- lapply(seq_along(idx), function(i) {
list(
train = map_to_likelihood_indices(idx[-i]),
test = map_to_likelihood_indices(idx[i])
)
})
# test_list <- lapply(1:length(idx), function(i){idx[i]})
test_list <- as.list(idx)
} else if (cv_type == "lpo") {
test_list_idx <- list()
n_Y <- length(idx)
for (i in number_folds:1) {
test_list_idx[[i]] <- sample(1:length(idx), size = (1 - percentage / 100) * n_Y)
}
train_list <- lapply(1:number_folds, function(i) {
idx[-test_list_idx[[i]]]
})
test_list <- lapply(test_list_idx, function(idx_test) {
idx[idx_test]
fold_list <- lapply(1:number_folds, function(i) {
test_idx <- sample(idx, size = (1 - percentage / 100) * length(idx))
train_idx <- idx[-match(test_idx, idx)]

list(
train = map_to_likelihood_indices(train_idx),
test = map_to_likelihood_indices(test_idx)
)
})
}
return(list(train = train_list, test = test_list))

return(fold_list)
}

# Check for required packages
Expand Down
8 changes: 4 additions & 4 deletions man/bru_get_mapper.inla_rspde.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/bru_get_mapper.inla_rspde_anisotropic2d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions man/bru_get_mapper.inla_rspde_matern1d.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/bru_get_mapper.inla_rspde_spacetime.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 19 additions & 11 deletions man/create_train_test_indices.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 10 additions & 4 deletions man/cross_validation.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

55 changes: 0 additions & 55 deletions man/group_predict.Rd

This file was deleted.

0 comments on commit 9f3dd0b

Please sign in to comment.