refactor boot functions; fix #27; fix #39

rapidsurveys · Jan 6, 2025 · 6e49636 · 6e49636
1 parent 6ff4c9b
commit 6e49636
Show file tree

Hide file tree

Showing 19 changed files with 353 additions and 195 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -4,9 +4,7 @@ export(bootBW)
 export(bootClassic)
 export(bootPROBIT)
 export(boot_bw)
-export(boot_bw_sample_cluster)
 export(boot_bw_sample_clusters)
-export(boot_bw_sample_within_cluster)
 export(boot_bw_sample_within_clusters)
 export(boot_bw_weight)
 export(recode)

diff --git a/R/bootBW.r b/R/bootBW.r
@@ -7,52 +7,63 @@
 #' used in **SMART** surveys) or posterior weighting (e.g. as used in
 #' **RAM** and **S3M** surveys).
 #'
-#' @param x A data frame with primary sampling unit (PSU) in column named `psu`
-#' @param w A data frame with primary sampling unit (PSU) in column named `psu`
-#'   and survey weight (i.e. PSU population) in column named `pop`
-#' @param statistic A function operating on data in `x` (see example)
-#' @param params Parameters (named columns in `x`) passed to the function
-#'   specified in `statistic`
-#' @param outputColumns Names of columns in output data frame
-#' @param replicates Number of bootstrap replicates
+#' @param x A [data.frame()] with primary sampling unit (PSU) in variable named
+#'   `psu` and at least one other variable containing data for estimation.
+#' @param w A [data.frame()] with primary sampling unit (PSU) in variable named
+#'   `psu` and survey weights (i.e. PSU population) in variable named `pop`.
+#' @param statistic Am estimator function operating on variables in `x`
+#'   containing data for estimation. The functions [bootClassic()] and
+#'   [bootPROBIT()] are examples.
+#' @param params Parameters specified as names of columns in `x` that are to be
+#'   passed to the function specified in `statistic`.
+#' @param outputColumns Names to be used for columns in output [data.frame()].
+#'   Default to names specified in `params`.
+#' @param replicates Number of bootstrap replicates to be performed. Default is
+#'   400.
 #'
 #' @return A [data.frame()] with:
-#'   * ncol = length(outputColumns)
-#'   * nrow = replicates
-#'   * names = outputColumns
+#'   * number of columns equal to length of `outputColumns`;
+#'   * number of rows equal to number of `replicates`; and,`
+#'   * names equal to `outputColumns`.`
 #'
 #' @examples
-#' # Example function - estimate a proportion for a binary (0/1) variable):
-#'
-#' oneP <- function(x, params) {
-#'   v1 <- params[1]
-#'   v1Data <- x[[v1]]
-#'   oneP <- mean(v1Data, na.rm = TRUE)
-#'   return(oneP)
-#' }
-#'
 #' # Example call to bootBW function using RAM-OP test data:
 #'
-#' bootP <- bootBW(x = indicatorsHH,
-#'                 w = villageData,
-#'                 statistic = oneP,
-#'                 params = "anc1",
-#'                 outputColumns = "anc1",
-#'                 replicates = 9)
+#' bootBW(
+#'   x = indicatorsHH, w = villageData, statistic = bootClassic,
+#'   params = "anc1", outputColumns = "anc1", replicates = 9
+#' )
 #'
 #' # Example estimate with 95% CI:
-#'
-#' quantile(bootP, probs = c(0.500, 0.025, 0.975), na.rm = TRUE)
+#' #quantile(bootP, probs = c(0.500, 0.025, 0.975), na.rm = TRUE)
 #'
 #' @export
 #'
 
-bootBW <- function(x, w, statistic, params, outputColumns, replicates = 400) {
+bootBW <- function(x, w, statistic, 
+                   params, outputColumns = params, 
+                   replicates = 400) {
+  ## Check data ----
+  check_data(x)
+
+  ## Check weights ----
+  check_weights(w = w)
+
+  ## Check params ----
+  params <- check_params(x = x, params = params)
+
+  ## Check outputColumns ----
+  if (length(outputColumns) != length(params)) {
+    cli::cli_abort(
+      "{.arg outputColumns} must have the same length as {.arg params}"
+    )
+  }
+
   ## Scale weights and accumulate weights
   w$weight <- w$pop / sum(w$pop)
   w$cumWeight <- cumsum(w$weight)
 
-  ## Create data.frame with named columns for output
+  ## Create data.frame with named columns for output ----
   boot <- data.frame(matrix(ncol = length(outputColumns), nrow = replicates))
   names(boot) <- outputColumns
 
@@ -63,37 +74,39 @@ bootBW <- function(x, w, statistic, params, outputColumns, replicates = 400) {
   maxRows <- nClusters * max(table(x$psu))
   emptyDF <- rbind(as.data.frame(lapply(x, function(x) rep.int(NA, maxRows))))
 
-  ## Vector to hold clusters to be included in a survey replicate
+  ## Vector to hold clusters to be included in a survey replicate ----
   sampledClusters <- vector(mode = mode(x$psu), length = nClusters)
 
-  ## And now ... resample!
+  ## And now ... resample! ----
   for(i in seq_len(replicates)) {
-    ## Create a data.frame to hold a survey replicate
+    ## Create a data.frame to hold a survey replicate ----
     xBW <- emptyDF
-    ## Blocking Bootstrap from 'x' (blocking on x$psu = cluster identifier)
+    ## Blocking Bootstrap from 'x' (blocking on x$psu = cluster identifier) ----
     for(j in seq_len(nClusters)) {
-      ## "Roulette Wheel" algorithm (to select a weighted sample of clusters)
+      ## "Roulette Wheel" algorithm to select a weighted sample of clusters ----
       sampledClusters[j] <- w$psu[which.max(w$cumWeight >= runif(n = 1, min = 0, max = 1))]
     }
-    ## Pointer for inserting selected clusters into the survey replicate
+    ## Pointer for inserting selected clusters into the survey replicate ----
     rowIndex <- 1
-    ## Build a (blocking weighted) bootstrap replicate from the selected clusters
+    ## Build a blocking weighted bootstrap replicate from the selected ----
+    ## clusters
     for(k in seq_len(nClusters)) {
-      ## Extract data for cluster and resample within the cluster
+      ## Extract data for cluster and resample within the cluster ----
       y <- subset(x, psu == sampledClusters[k])
       clusterN <- nrow(y)
       y <- y[sample(1:clusterN, replace = TRUE), ]
-      ## Insert cluster replicate into survey replicate
+      ## Insert cluster replicate into survey replicate ----
       endRow <- rowIndex + clusterN
       xBW[rowIndex:(endRow - 1), ] <- y
-      ## Update pointer
+      ## Update pointer----
       rowIndex <- endRow
     }
-    ## Select data for analysis
+    ## Select data for analysis ----
     xBW <- xBW[1:(rowIndex - 1), ]
-    ## Apply statistic
+    ## Apply statistic ----
     boot[i, ] <- statistic(xBW, params)
   }
 
-  return(boot)
+  ## Return boot ----
+  boot
 }
diff --git a/R/bootClassic.R b/R/bootClassic.R
@@ -13,7 +13,7 @@
 #' @examples
 #' # Example call to bootClassic function
 #' sampled_clusters <- boot_bw_sample_clusters(
-#'   x = indicatorsHH, w = boot_bw_weight(villageData)
+#'   x = indicatorsHH, df_weighted = boot_bw_weight(villageData)
 #' )
 #' 
 #' boot <- boot_bw_sample_within_clusters(sampled_clusters)

diff --git a/R/bootProbit.R b/R/bootProbit.R
@@ -15,7 +15,7 @@
 #' @examples
 #' # Example call to bootBW function:
 #' sampled_clusters <- boot_bw_sample_clusters(
-#'   x = indicatorsCH1, w = boot_bw_weight(villageData)
+#'   x = indicatorsCH1, df_weighted = boot_bw_weight(villageData)
 #' )
 #' 
 #' boot <- boot_bw_sample_within_clusters(sampled_clusters)

diff --git a/R/boot_bw.R b/R/boot_bw.R
@@ -1,19 +1,35 @@
 #'
-#' Blocked weighted bootstrap - vectorised and parallel
+#' Blocked Weighted Bootstrap - vectorised and parallel
+#' 
+#' @description
+#' This set of functions is an alternative to the [bootBW()] function. This set
+#' attempts to make the blocked weighted bootstrap algorithm more efficient
+#' through vectorisation and use of parallelisation techniques. The function
+#' syntax has been kept consistent with [bootBW()] for ease of transition. A
+#' more in depth discussion of the efficiencies gained from this alternative
+#' function is discussed here.
 #' 
 #' @inheritParams bootBW
+#' @param df_weighted A [data.frame()] based on `w` with additional variables
+#'   for `weight` and `cumWeight`. This is usually produced using the function
+#'   [boot_bw_weight()].
+#' @param cores The number of computer cores to use or number of child processes
+#'   to be run simultaneously. Default to one less than the available number of
+#'   cores on current machine.
 #' @param index Logical. Should index values be returned or a list of
-#'   data.frames. Default to FALSE.
-#' @param cluster_df A [data.frame()] or a list of [data.frame()]s for selected
-#'   clusters
-#' @param p A random probability of selection
-#' @param cores The number of computer cores to use/number of child processes
-#'   will be run simultaneously.
+#'   [data.frame()]s. Default to FALSE.
+#' @param cluster_df A list of [data.frame()]s for selected clusters.
 #' 
-#' @returns A [data.frame()] with:
-#'   * ncol = length(outputColumns)
-#'   * nrow = replicates
-#'   * names = outputColumns
+#' @returns For [boot_bw()], a [data.frame()] with number of columns equal to 
+#'   length of `outputColumns`; number of rows equal to number of `replicates`;
+#'   and, names of variables equal to values of `outputColumns`. For
+#'   [boot_bw_weight()], A [data.frame()] based on `w` with two additional
+#'   variables for `weight` and `cumWeight`. For [boot_bw_sample_clusters()],
+#'   either a vector of integers corresponding to the primary sampling unit
+#'   (psu) identifier of the selected clusters (when `index = TRUE`) or a list
+#'   of [data.frame()]s corresponding to the data for the selected clusters
+#'   (when `index = FALSE`). For [boot_bw_sample_within_clusters()], a matrix
+#'   similar in structure to  `x` of resampled data from each selected cluster.
 #' 
 #' @examples
 #' boot_bw(
@@ -25,7 +41,56 @@
 #' @rdname boot_bw
 #' 
 
+boot_bw <- function(x, w, statistic,
+                    params, outputColumns = params, 
+                    replicates = 400,
+                    cores = parallelly::availableCores(omit = 1)) {
+  ## Get cumulative weights for clusters ----
+  w <- boot_bw_weight(w)
+
+  ## Check data ----
+  check_data(x)
+
+  ## Check params ----
+  params <- check_params(x = x, params = params)
+
+  ## Setup parallelism ----
+  cl <- parallel::makeCluster(cores)
+  doParallel::registerDoParallel(cl)
+
+  ## Resample ----
+  boot <- foreach::foreach(seq_len(replicates), .combine = rbind) %dopar% {
+    ## Sample clusters ----
+    sampled_clusters <- boot_bw_sample_clusters(x = x, df_weighted = w)
+
+    ## Sample within selected clusters ----
+    xBW <- boot_bw_sample_within_clusters(sampled_clusters)
+
+    ## Apply statistic ----
+    statistic(xBW, params)
+  }
+
+  ## Rename output data.frame ----
+  boot <- as.data.frame(boot)
+  row.names(boot) <- NULL
+  names(boot) <- outputColumns
+
+  ## Stop parallelism ----
+  parallel::stopCluster(cl)
+
+  ## Return boot ----
+  boot
+}
+
+#'
+#' @export
+#' @rdname boot_bw
+#' 
+
 boot_bw_weight <- function(w) {
+  ## Check weights ----
+  check_weights(w = w)
+
   ## Scale weights and accumulate weights
   w$weight <- w$pop / sum(w$pop)
   w$cumWeight <- cumsum(w$weight)
@@ -35,8 +100,7 @@ boot_bw_weight <- function(w) {
 }
 
 #'
-#' @export
-#' @rdname boot_bw
+#' @keywords internal
 #' 
 
 boot_bw_sample_cluster <- function(p, w) {
@@ -52,9 +116,9 @@ boot_bw_sample_cluster <- function(p, w) {
 #' @rdname boot_bw
 #' 
 
-boot_bw_sample_clusters <- function(x, w, index = FALSE) {
+boot_bw_sample_clusters <- function(x, df_weighted, index = FALSE) {
   ## Get number of clusters ----
-  nClusters <- nrow(w)
+  nClusters <- nrow(df_weighted)
 
   ## Get vector of random probabilities ----
   p <- runif(n = nClusters)
@@ -63,7 +127,7 @@ boot_bw_sample_clusters <- function(x, w, index = FALSE) {
   selected_clusters <- lapply(
     X = p,
     FUN = boot_bw_sample_cluster,
-    w = w
+    w = df_weighted
   ) |>
     unlist()
 
@@ -81,8 +145,7 @@ boot_bw_sample_clusters <- function(x, w, index = FALSE) {
 
 
 #'
-#' @export
-#' @rdname boot_bw
+#' @keywords internal
 #' 
 
 boot_bw_sample_within_cluster <- function(cluster_df) {
@@ -106,41 +169,3 @@ boot_bw_sample_within_clusters <- function(cluster_df) {
   ) |>
     do.call(rbind, args = _)
 }
-
-
-#'
-#' @export
-#' @rdname boot_bw
-#' 
-
-boot_bw <- function(x, w, statistic,
-                    params, outputColumns = params, 
-                    replicates = 399,
-                    cores = parallelly::availableCores(omit = 1)) {
-  ## Get cumulative weights for clusters ----
-  w <- boot_bw_weight(w)
-
-  ## Setup parallelism ----
-  cl <- parallel::makeCluster(cores)
-  doParallel::registerDoParallel(cl)
-
-  ## Resample ----
-  boot <- foreach::foreach(seq_len(replicates), .combine = rbind) %dopar% {
-    sampled_clusters <- boot_bw_sample_clusters(x = x, w = w)
-
-    xBW <- boot_bw_sample_within_clusters(sampled_clusters)
-
-    statistic(xBW, params)
-  }
-
-  ## Rename output data.frame ----
-  boot <- as.data.frame(boot)
-  row.names(boot) <- NULL
-  names(boot) <- outputColumns
-
-  ## Stop parallelism ----
-  parallel::stopCluster(cl)
-
-  ## Return boot ----
-  boot
-}
diff --git a/R/recode.R b/R/recode.R
@@ -51,12 +51,16 @@
 #'     then a numeric result is returned; if \code{var} is a factor then
 #'     (by default) so is the result.
 #' }
+#' 
 #' @param afr Return a factor. Default is TRUE if \code{var} is a factor and is
 #'   FALSE otherwise
 #' @param anr Coerce result to numeric (default is TRUE)
 #' @param levels Order of the levels in the returned factor; the default is to use
 #'    the sort order of the level names.
-#' @return Recoded variable
+#' 
+#' 
+#' @returns Recoded variable
+#' 
 #' @examples
 #' # Recode values from 1 to 9 to various specifications
 #' var <- sample(x = 1:9, size = 100, replace = TRUE)
@@ -108,6 +112,7 @@ recode <- function(var, recodes, afr, anr = TRUE, levels) {
       }
     }
   }
+
   if (afr) {
     result <- if (!missing(levels)) factor(result, levels = levels) else as.factor(result)
   } else if (anr && (!is.numeric(result))) {
@@ -119,5 +124,6 @@ recode <- function(var, recodes, afr, anr = TRUE, levels) {
         code = if (!any(is.na(result.valid))) result <- as.numeric(result)
       )
   }
-  return(result)
+
+  result
 }