Skip to content

Commit

Permalink
refactor boot functions; fix #27; fix #39
Browse files Browse the repository at this point in the history
  • Loading branch information
ernestguevarra committed Jan 6, 2025
1 parent 6ff4c9b commit 6e49636
Show file tree
Hide file tree
Showing 19 changed files with 353 additions and 195 deletions.
2 changes: 0 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@ export(bootBW)
export(bootClassic)
export(bootPROBIT)
export(boot_bw)
export(boot_bw_sample_cluster)
export(boot_bw_sample_clusters)
export(boot_bw_sample_within_cluster)
export(boot_bw_sample_within_clusters)
export(boot_bw_weight)
export(recode)
Expand Down
99 changes: 56 additions & 43 deletions R/bootBW.r
Original file line number Diff line number Diff line change
Expand Up @@ -7,52 +7,63 @@
#' used in **SMART** surveys) or posterior weighting (e.g. as used in
#' **RAM** and **S3M** surveys).
#'
#' @param x A data frame with primary sampling unit (PSU) in column named `psu`
#' @param w A data frame with primary sampling unit (PSU) in column named `psu`
#' and survey weight (i.e. PSU population) in column named `pop`
#' @param statistic A function operating on data in `x` (see example)
#' @param params Parameters (named columns in `x`) passed to the function
#' specified in `statistic`
#' @param outputColumns Names of columns in output data frame
#' @param replicates Number of bootstrap replicates
#' @param x A [data.frame()] with primary sampling unit (PSU) in variable named
#' `psu` and at least one other variable containing data for estimation.
#' @param w A [data.frame()] with primary sampling unit (PSU) in variable named
#' `psu` and survey weights (i.e. PSU population) in variable named `pop`.
#' @param statistic Am estimator function operating on variables in `x`
#' containing data for estimation. The functions [bootClassic()] and
#' [bootPROBIT()] are examples.
#' @param params Parameters specified as names of columns in `x` that are to be
#' passed to the function specified in `statistic`.
#' @param outputColumns Names to be used for columns in output [data.frame()].
#' Default to names specified in `params`.
#' @param replicates Number of bootstrap replicates to be performed. Default is
#' 400.
#'
#' @return A [data.frame()] with:
#' * ncol = length(outputColumns)
#' * nrow = replicates
#' * names = outputColumns
#' * number of columns equal to length of `outputColumns`;
#' * number of rows equal to number of `replicates`; and,`
#' * names equal to `outputColumns`.`
#'
#' @examples
#' # Example function - estimate a proportion for a binary (0/1) variable):
#'
#' oneP <- function(x, params) {
#' v1 <- params[1]
#' v1Data <- x[[v1]]
#' oneP <- mean(v1Data, na.rm = TRUE)
#' return(oneP)
#' }
#'
#' # Example call to bootBW function using RAM-OP test data:
#'
#' bootP <- bootBW(x = indicatorsHH,
#' w = villageData,
#' statistic = oneP,
#' params = "anc1",
#' outputColumns = "anc1",
#' replicates = 9)
#' bootBW(
#' x = indicatorsHH, w = villageData, statistic = bootClassic,
#' params = "anc1", outputColumns = "anc1", replicates = 9
#' )
#'
#' # Example estimate with 95% CI:
#'
#' quantile(bootP, probs = c(0.500, 0.025, 0.975), na.rm = TRUE)
#' #quantile(bootP, probs = c(0.500, 0.025, 0.975), na.rm = TRUE)
#'
#' @export
#'

bootBW <- function(x, w, statistic, params, outputColumns, replicates = 400) {
bootBW <- function(x, w, statistic,
params, outputColumns = params,
replicates = 400) {
## Check data ----
check_data(x)

## Check weights ----
check_weights(w = w)

## Check params ----
params <- check_params(x = x, params = params)

## Check outputColumns ----
if (length(outputColumns) != length(params)) {
cli::cli_abort(
"{.arg outputColumns} must have the same length as {.arg params}"
)
}

## Scale weights and accumulate weights
w$weight <- w$pop / sum(w$pop)
w$cumWeight <- cumsum(w$weight)

## Create data.frame with named columns for output
## Create data.frame with named columns for output ----
boot <- data.frame(matrix(ncol = length(outputColumns), nrow = replicates))
names(boot) <- outputColumns

Expand All @@ -63,37 +74,39 @@ bootBW <- function(x, w, statistic, params, outputColumns, replicates = 400) {
maxRows <- nClusters * max(table(x$psu))
emptyDF <- rbind(as.data.frame(lapply(x, function(x) rep.int(NA, maxRows))))

## Vector to hold clusters to be included in a survey replicate
## Vector to hold clusters to be included in a survey replicate ----
sampledClusters <- vector(mode = mode(x$psu), length = nClusters)

## And now ... resample!
## And now ... resample! ----
for(i in seq_len(replicates)) {
## Create a data.frame to hold a survey replicate
## Create a data.frame to hold a survey replicate ----
xBW <- emptyDF
## Blocking Bootstrap from 'x' (blocking on x$psu = cluster identifier)
## Blocking Bootstrap from 'x' (blocking on x$psu = cluster identifier) ----
for(j in seq_len(nClusters)) {
## "Roulette Wheel" algorithm (to select a weighted sample of clusters)
## "Roulette Wheel" algorithm to select a weighted sample of clusters ----
sampledClusters[j] <- w$psu[which.max(w$cumWeight >= runif(n = 1, min = 0, max = 1))]
}
## Pointer for inserting selected clusters into the survey replicate
## Pointer for inserting selected clusters into the survey replicate ----
rowIndex <- 1
## Build a (blocking weighted) bootstrap replicate from the selected clusters
## Build a blocking weighted bootstrap replicate from the selected ----
## clusters
for(k in seq_len(nClusters)) {
## Extract data for cluster and resample within the cluster
## Extract data for cluster and resample within the cluster ----
y <- subset(x, psu == sampledClusters[k])
clusterN <- nrow(y)
y <- y[sample(1:clusterN, replace = TRUE), ]
## Insert cluster replicate into survey replicate
## Insert cluster replicate into survey replicate ----
endRow <- rowIndex + clusterN
xBW[rowIndex:(endRow - 1), ] <- y
## Update pointer
## Update pointer----
rowIndex <- endRow
}
## Select data for analysis
## Select data for analysis ----
xBW <- xBW[1:(rowIndex - 1), ]
## Apply statistic
## Apply statistic ----
boot[i, ] <- statistic(xBW, params)
}

return(boot)
## Return boot ----
boot
}
2 changes: 1 addition & 1 deletion R/bootClassic.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#' @examples
#' # Example call to bootClassic function
#' sampled_clusters <- boot_bw_sample_clusters(
#' x = indicatorsHH, w = boot_bw_weight(villageData)
#' x = indicatorsHH, df_weighted = boot_bw_weight(villageData)
#' )
#'
#' boot <- boot_bw_sample_within_clusters(sampled_clusters)
Expand Down
2 changes: 1 addition & 1 deletion R/bootProbit.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#' @examples
#' # Example call to bootBW function:
#' sampled_clusters <- boot_bw_sample_clusters(
#' x = indicatorsCH1, w = boot_bw_weight(villageData)
#' x = indicatorsCH1, df_weighted = boot_bw_weight(villageData)
#' )
#'
#' boot <- boot_bw_sample_within_clusters(sampled_clusters)
Expand Down
137 changes: 81 additions & 56 deletions R/boot_bw.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,35 @@
#'
#' Blocked weighted bootstrap - vectorised and parallel
#' Blocked Weighted Bootstrap - vectorised and parallel
#'
#' @description
#' This set of functions is an alternative to the [bootBW()] function. This set
#' attempts to make the blocked weighted bootstrap algorithm more efficient
#' through vectorisation and use of parallelisation techniques. The function
#' syntax has been kept consistent with [bootBW()] for ease of transition. A
#' more in depth discussion of the efficiencies gained from this alternative
#' function is discussed here.
#'
#' @inheritParams bootBW
#' @param df_weighted A [data.frame()] based on `w` with additional variables
#' for `weight` and `cumWeight`. This is usually produced using the function
#' [boot_bw_weight()].
#' @param cores The number of computer cores to use or number of child processes
#' to be run simultaneously. Default to one less than the available number of
#' cores on current machine.
#' @param index Logical. Should index values be returned or a list of
#' data.frames. Default to FALSE.
#' @param cluster_df A [data.frame()] or a list of [data.frame()]s for selected
#' clusters
#' @param p A random probability of selection
#' @param cores The number of computer cores to use/number of child processes
#' will be run simultaneously.
#' [data.frame()]s. Default to FALSE.
#' @param cluster_df A list of [data.frame()]s for selected clusters.
#'
#' @returns A [data.frame()] with:
#' * ncol = length(outputColumns)
#' * nrow = replicates
#' * names = outputColumns
#' @returns For [boot_bw()], a [data.frame()] with number of columns equal to
#' length of `outputColumns`; number of rows equal to number of `replicates`;
#' and, names of variables equal to values of `outputColumns`. For
#' [boot_bw_weight()], A [data.frame()] based on `w` with two additional
#' variables for `weight` and `cumWeight`. For [boot_bw_sample_clusters()],
#' either a vector of integers corresponding to the primary sampling unit
#' (psu) identifier of the selected clusters (when `index = TRUE`) or a list
#' of [data.frame()]s corresponding to the data for the selected clusters
#' (when `index = FALSE`). For [boot_bw_sample_within_clusters()], a matrix
#' similar in structure to `x` of resampled data from each selected cluster.
#'
#' @examples
#' boot_bw(
Expand All @@ -25,7 +41,56 @@
#' @rdname boot_bw
#'

boot_bw <- function(x, w, statistic,
params, outputColumns = params,
replicates = 400,
cores = parallelly::availableCores(omit = 1)) {
## Get cumulative weights for clusters ----
w <- boot_bw_weight(w)

## Check data ----
check_data(x)

## Check params ----
params <- check_params(x = x, params = params)

## Setup parallelism ----
cl <- parallel::makeCluster(cores)
doParallel::registerDoParallel(cl)

## Resample ----
boot <- foreach::foreach(seq_len(replicates), .combine = rbind) %dopar% {
## Sample clusters ----
sampled_clusters <- boot_bw_sample_clusters(x = x, df_weighted = w)

## Sample within selected clusters ----
xBW <- boot_bw_sample_within_clusters(sampled_clusters)

## Apply statistic ----
statistic(xBW, params)
}

## Rename output data.frame ----
boot <- as.data.frame(boot)
row.names(boot) <- NULL
names(boot) <- outputColumns

## Stop parallelism ----
parallel::stopCluster(cl)

## Return boot ----
boot
}

#'
#' @export
#' @rdname boot_bw
#'

boot_bw_weight <- function(w) {
## Check weights ----
check_weights(w = w)

## Scale weights and accumulate weights
w$weight <- w$pop / sum(w$pop)
w$cumWeight <- cumsum(w$weight)
Expand All @@ -35,8 +100,7 @@ boot_bw_weight <- function(w) {
}

#'
#' @export
#' @rdname boot_bw
#' @keywords internal
#'

boot_bw_sample_cluster <- function(p, w) {
Expand All @@ -52,9 +116,9 @@ boot_bw_sample_cluster <- function(p, w) {
#' @rdname boot_bw
#'

boot_bw_sample_clusters <- function(x, w, index = FALSE) {
boot_bw_sample_clusters <- function(x, df_weighted, index = FALSE) {
## Get number of clusters ----
nClusters <- nrow(w)
nClusters <- nrow(df_weighted)

## Get vector of random probabilities ----
p <- runif(n = nClusters)
Expand All @@ -63,7 +127,7 @@ boot_bw_sample_clusters <- function(x, w, index = FALSE) {
selected_clusters <- lapply(
X = p,
FUN = boot_bw_sample_cluster,
w = w
w = df_weighted
) |>
unlist()

Expand All @@ -81,8 +145,7 @@ boot_bw_sample_clusters <- function(x, w, index = FALSE) {


#'
#' @export
#' @rdname boot_bw
#' @keywords internal
#'

boot_bw_sample_within_cluster <- function(cluster_df) {
Expand All @@ -106,41 +169,3 @@ boot_bw_sample_within_clusters <- function(cluster_df) {
) |>
do.call(rbind, args = _)
}


#'
#' @export
#' @rdname boot_bw
#'

boot_bw <- function(x, w, statistic,
params, outputColumns = params,
replicates = 399,
cores = parallelly::availableCores(omit = 1)) {
## Get cumulative weights for clusters ----
w <- boot_bw_weight(w)

## Setup parallelism ----
cl <- parallel::makeCluster(cores)
doParallel::registerDoParallel(cl)

## Resample ----
boot <- foreach::foreach(seq_len(replicates), .combine = rbind) %dopar% {
sampled_clusters <- boot_bw_sample_clusters(x = x, w = w)

xBW <- boot_bw_sample_within_clusters(sampled_clusters)

statistic(xBW, params)
}

## Rename output data.frame ----
boot <- as.data.frame(boot)
row.names(boot) <- NULL
names(boot) <- outputColumns

## Stop parallelism ----
parallel::stopCluster(cl)

## Return boot ----
boot
}
10 changes: 8 additions & 2 deletions R/recode.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,16 @@
#' then a numeric result is returned; if \code{var} is a factor then
#' (by default) so is the result.
#' }
#'
#' @param afr Return a factor. Default is TRUE if \code{var} is a factor and is
#' FALSE otherwise
#' @param anr Coerce result to numeric (default is TRUE)
#' @param levels Order of the levels in the returned factor; the default is to use
#' the sort order of the level names.
#' @return Recoded variable
#'
#'
#' @returns Recoded variable
#'
#' @examples
#' # Recode values from 1 to 9 to various specifications
#' var <- sample(x = 1:9, size = 100, replace = TRUE)
Expand Down Expand Up @@ -108,6 +112,7 @@ recode <- function(var, recodes, afr, anr = TRUE, levels) {
}
}
}

if (afr) {
result <- if (!missing(levels)) factor(result, levels = levels) else as.factor(result)
} else if (anr && (!is.numeric(result))) {
Expand All @@ -119,5 +124,6 @@ recode <- function(var, recodes, afr, anr = TRUE, levels) {
code = if (!any(is.na(result.valid))) result <- as.numeric(result)
)
}
return(result)

result
}
Loading

0 comments on commit 6e49636

Please sign in to comment.