diff --git a/CohortDiagnosticsBreastCancer/DiagnosticsExplorer.Rproj b/CohortDiagnosticsBreastCancer/DiagnosticsExplorer.Rproj new file mode 100644 index 00000000..8e3c2ebc --- /dev/null +++ b/CohortDiagnosticsBreastCancer/DiagnosticsExplorer.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/CohortDiagnosticsBreastCancer/R/DisplayFunctions.R b/CohortDiagnosticsBreastCancer/R/DisplayFunctions.R new file mode 100644 index 00000000..d3cf2fb7 --- /dev/null +++ b/CohortDiagnosticsBreastCancer/R/DisplayFunctions.R @@ -0,0 +1,73 @@ +camelCaseToSnakeCase <- function(string) { + string <- gsub("([A-Z])", "_\\1", string) + string <- tolower(string) + string <- gsub("([a-z])([0-9])", "\\1_\\2", string) + return(string) +} + + +camelCaseToTitleCase <- function(string) { + string <- gsub("([A-Z])", " \\1", string) + string <- gsub("([a-z])([0-9])", "\\1 \\2", string) + substr(string, 1, 1) <- toupper(substr(string, 1, 1)) + return(string) +} + + +truncateStringDef <- function(columns, maxChars) { + list( + targets = columns, + render = DT::JS(sprintf("function(data, type, row, meta) {\n + return type === 'display' && data != null && data.length > %s ?\n + '' + data.substr(0, %s) + '...' : data;\n + }", maxChars, maxChars)) + ) +} + +minCellCountDef <- function(columns) { + list( + targets = columns, + render = DT::JS("function(data, type) { + if (type !== 'display' || isNaN(parseFloat(data))) return data; + if (data >= 0) return data.toString().replace(/(\\d)(?=(\\d{3})+(?!\\d))/g, '$1,'); + return '<' + Math.abs(data).toString().replace(/(\\d)(?=(\\d{3})+(?!\\d))/g, '$1,'); + }") + ) +} + +minCellPercentDef <- function(columns) { + list( + targets = columns, + render = DT::JS("function(data, type) { + if (type !== 'display' || isNaN(parseFloat(data))) return data; + if (data >= 0) return (100 * data).toFixed(1).replace(/(\\d)(?=(\\d{3})+(?!\\d))/g, '$1,') + '%'; + return '<' + Math.abs(100 * data).toFixed(1).replace(/(\\d)(?=(\\d{3})+(?!\\d))/g, '$1,') + '%'; + }") + ) +} + +minCellRealDef <- function(columns, digits = 1) { + list( + targets = columns, + render = DT::JS(sprintf("function(data, type) { + if (type !== 'display' || isNaN(parseFloat(data))) return data; + if (data >= 0) return data.toFixed(%s).replace(/(\\d)(?=(\\d{3})+(?!\\d))/g, '$1,'); + return '<' + Math.abs(data).toFixed(%s).replace(/(\\d)(?=(\\d{3})+(?!\\d))/g, '$1,'); + }", digits, digits)) + ) +} + +styleAbsColorBar <- function(maxValue, colorPositive, colorNegative, angle = 90) { + DT::JS(sprintf("isNaN(parseFloat(value))? '' : 'linear-gradient(%fdeg, transparent ' + (%f - Math.abs(value))/%f * 100 + '%%, ' + (value > 0 ? '%s ' : '%s ') + (%f - Math.abs(value))/%f * 100 + '%%)'", + angle, maxValue, maxValue, colorPositive, colorNegative, maxValue, maxValue)) +} + +sumCounts <- function(counts) { + result <- sum(abs(counts)) + if (any(counts < 0)) { + return(-result) + } else { + return(result) + } + +} diff --git a/CohortDiagnosticsBreastCancer/R/FacetNested.R b/CohortDiagnosticsBreastCancer/R/FacetNested.R new file mode 100644 index 00000000..e02ca2d1 --- /dev/null +++ b/CohortDiagnosticsBreastCancer/R/FacetNested.R @@ -0,0 +1,602 @@ +# Code borrowed from https://github.com/teunbrand/ggh4x and gtable, just to merge the labels of grouped facets. +# May need to simplify a bit. +library(ggplot2) + +.grab_ggplot_internals <- function() { + objects <- c( + ".all_aesthetics", + "as_facets_list", + "as_gg_data_frame", + "axis_label_element_overrides", + "check_aesthetics", + "check_labeller", + "check_subclass", + "compact", + "continuous_range", + "convertInd", + "df.grid", + "draw_axis_labels", + "reshape_add_margins", + "new_data_frame", + "defaults", "id", + "empty", + "eval_facets", + "ggname", + "rename_aes", + "mapped_aesthetics", + "make_labels", + "grid_as_facets_list", + "is.zero", + "rbind_dfs", + "sanitise_dim", + "set_draw_key", + "snake_class", + "ulevels", + "unique_combs", + "var_list", + "validate_mapping", + "warn_for_guide_position", + "weave_tables_col", + "weave_tables_row", + "wrap_as_facets_list", + ".pt" + ) + objects <- setNames(objects, objects) + out <- lapply(objects, function(i) { + getFromNamespace(i, "ggplot2") + }) +} + +# Store the needed ggplot internals here +.int <- .grab_ggplot_internals() + +# From gtable: +neg_to_pos <- function (x, max) { + ifelse(x >= 0, x, max + 1 + x) +} + +new_data_frame <- function (x, n = NULL) +{ + if (is.null(n)) { + n <- if (length(x) == 0) + 0 + else length(x[[1]]) + } + class(x) <- "data.frame" + attr(x, "row.names") <- .set_row_names(n) + x +} + +gtable_add_grob <- function(x, grobs, t, l, b = t, r = l, z = Inf, clip = "on", + name = x$name) +{ + if (!gtable::is.gtable(x)) + stop("x must be a gtable", call. = FALSE) + # if (is.grob(grobs)) + # grobs <- list(grobs) + if (!is.list(grobs)) + stop("grobs must either be a single grob or a list of grobs", + call. = FALSE) + n_grobs <- length(grobs) + if (is.logical(clip)) { + clip <- ifelse(clip, "on", "off") + } + layout <- unclass(x$layout) + # if (!all(vapply(list(t, r, b, l, z, clip, name), len_same_or_1, + # logical(1), n_grobs))) { + # stop("Not all inputs have either length 1 or same length same as 'grobs'") + # } + z <- rep(z, length.out = n_grobs) + zval <- c(layout$z, z[!is.infinite(z)]) + if (length(zval) == 0) { + zmin <- 1 + zmax <- 0 + } + else { + zmin <- min(zval) + zmax <- max(zval) + } + z[z == -Inf] <- zmin - rev(seq_len(sum(z == -Inf))) + z[z == Inf] <- zmax + seq_len(sum(z == Inf)) + x_row <- length(x$heights) + x_col <- length(x$widths) + t <- rep(neg_to_pos(t, x_row), length.out = n_grobs) + b <- rep(neg_to_pos(b, x_row), length.out = n_grobs) + l <- rep(neg_to_pos(l, x_col), length.out = n_grobs) + r <- rep(neg_to_pos(r, x_col), length.out = n_grobs) + clip <- rep(clip, length.out = n_grobs) + name <- rep(name, length.out = n_grobs) + x$grobs <- c(x$grobs, grobs) + x$layout <- new_data_frame(list(t = c(layout$t, t), l = c(layout$l, + l), b = c(layout$b, b), r = c(layout$r, r), z = c(layout$z, + z), clip = c(layout$clip, clip), name = c(layout$name, + name))) + x +} + +# Main function ----------------------------------------------------------- + +#' @title Layout panels in a grid with nested strips +#' +#' @description \code{facet_nested()} forms a matrix of panels defined by row +#' and column faceting variables and nests grouped facets. +#' +#' @inheritParams ggplot2::facet_grid +#' @param nest_line a \code{logical} vector of length 1, indicating whether to +#' draw a nesting line to indicate the nesting of variables. Control the look +#' of the nesting line by setting the \code{ggh4x.facet.nestline} theme +#' element. +#' @param resect a \code{unit} vector of length 1, indicating how much the +#' nesting line should be shortened. +#' @param bleed a \code{logical} vector of length 1, indicating whether merging +#' of lower-level variables is allowed when the higher-level variables are +#' separate. See details. +#' +#' @details Unlike \code{facet_grid()}, this function only automatically expands +#' missing variables when they have no variables in that direction, to allow +#' for unnested variables. It still requires at least one layer to have all +#' faceting variables. +#' +#' Hierarchies are inferred from the order of variables supplied to +#' \code{rows} or \code{cols}. The first variable is interpreted to be the +#' outermost variable, while the last variable is interpreted to be the +#' innermost variable. They display order is always such that the outermost +#' variable is placed the furthest away from the panels. Strips are +#' automatically grouped when they span a nested variable. +#' +#' The \code{bleed} argument controls whether lower-level variables are allowed +#' to be merged when higher-level are different, i.e. they can bleed over +#' hierarchies. Suppose the \code{facet_grid()} behaviour would be the +#' following: +#' +#' \code{[_1_][_2_][_2_]} \cr \code{[_3_][_3_][_4_]} +#' +#' In such case, the default \code{bleed = FALSE} argument would result in the +#' following: +#' +#' \code{[_1_][___2____]} \cr \code{[_3_][_3_][_4_]} +#' +#' Whereas \code{bleed = TRUE} would allow the following: +#' +#' \code{[_1_][___2____]} \cr \code{[___3____][_4_]} +#' +#' @export +#' +#' @return A \emph{FacetNested} ggproto object. +#' @family facetting functions +#' +#' @seealso See \code{\link[ggplot2]{facet_grid}} for descriptions of the +#' original arguments. See \code{\link[grid]{unit}} for the construction of a +#' \code{unit} vector. +#' +#' @examples +#' df <- iris +#' df$nester <- ifelse(df$Species == "setosa", +#' "Short Leaves", +#' "Long Leaves") +#' +#' ggplot(df, aes(Sepal.Length, Petal.Length)) + +#' geom_point() + +#' facet_nested(~ nester + Species) +#' +#' # Controlling the nest line +#' ggplot(df, aes(Sepal.Length, Petal.Length)) + +#' geom_point() + +#' facet_nested(~ nester + Species, nest_line = TRUE) + +#' theme(ggh4x.facet.nestline = element_line(linetype = 3)) +facet_nested <- function( + rows = NULL, cols = NULL, scales = "fixed", space = "fixed", + shrink = TRUE, labeller = "label_value", as.table = TRUE, + switch = NULL, drop = TRUE, margins = FALSE, facets = NULL, + nest_line = FALSE, resect = unit(0, "mm"), bleed = FALSE +) { + if (!is.null(facets)) { + rows <- facets + } + if (is.logical(cols)) { + margins <- cols + cols <- NULL + } + scales <- match.arg(scales, c("fixed", "free_x", "free_y", "free")) + free <- list(x = any(scales %in% c("free_x", "free")), + y = any(scales %in% c("free_y", "free"))) + + space <- match.arg(space, c("fixed", "free_x", "free_y", "free")) + space_free <- list(x = any(space %in% c("free_x", "free")), + y = any(space %in% c("free_y", "free"))) + + if (!is.null(switch) && !switch %in% c("both", "x", "y")) { + stop("switch must be either 'both', 'x', or 'y'", call. = FALSE) + } + + facets_list <- .int$grid_as_facets_list(rows, cols) + n <- length(facets_list) + if (n > 2L) { + stop("A grid facet specification can't have more than two dimensions", + .call = FALSE) + } + if (n == 1L) { + rows <- quos() + cols <- facets_list[[1]] + } else { + rows <- facets_list[[1]] + cols <- facets_list[[2]] + } + labeller <- .int$check_labeller(labeller) + ggplot2::ggproto(NULL, FacetNested, shrink = shrink, + params = list( + rows = rows, + cols = cols, + margins = margins, + free = free, + space_free = space_free, + labeller = labeller, + as.table = as.table, + switch = switch, + drop = drop, + nest_line = nest_line, + resect = resect, + bleed = bleed + )) +} + +# ggproto ----------------------------------------------------------------- + +#' @usage NULL +#' @format NULL +#' @export +#' @rdname ggh4x_extensions +FacetNested <- ggplot2::ggproto( + "FacetNested", FacetGrid, + map_data = function(data, layout, params) { + # Handle empty data + if (.int$empty(data)) { + return(cbind(data, PANEL = integer(0))) + } + # Setup variables + rows <- params$rows + cols <- params$cols + + vars <- c(names(rows), names(cols)) + if (length(vars) == 0) { + data$PANEL <- layout$PANEL + return(data) + } + + margin_vars <- list(intersect(names(rows), names(data)), + intersect(names(cols), names(data))) + + # Add variables + data <- .int$reshape_add_margins(data, margin_vars, params$margins) + facet_vals <- .int$eval_facets(c(rows, cols), data, params$.possible_columns) + + # Only set as missing if it has no variable in that direction + missing_facets <- character(0) + if (!any(names(rows) %in% names(facet_vals))){ + missing_facets <- c(missing_facets, + setdiff(names(rows), names(facet_vals))) + } + if (!any(names(cols) %in% names(facet_vals))){ + missing_facets <- c(missing_facets, + setdiff(names(cols), names(facet_vals))) + } + + # Fill in missing values + if (length(missing_facets) > 0) { + to_add <- unique(layout[missing_facets]) + data_rep <- rep.int(1:nrow(data), nrow(to_add)) + facet_rep <- rep(1:nrow(to_add), each = nrow(data)) + data <- data[data_rep, , drop = FALSE] + rownames(data) <- NULL + facet_vals <- cbind(facet_vals[data_rep, , drop = FALSE], + to_add[facet_rep, , drop = FALSE]) + rownames(facet_vals) <- NULL + } + + # Match columns to facets + if (nrow(facet_vals) == 0) { + data$PANEL <- -1 + } else { + facet_vals[] <- lapply(facet_vals[], as.factor) + facet_vals[] <- lapply(facet_vals[], addNA, ifany = TRUE) + keys <- plyr::join.keys(facet_vals, layout, + by = vars[vars %in% names(facet_vals)]) + data$PANEL <- layout$PANEL[match(keys$x, keys$y)] + } + data + }, + compute_layout = function(data, params) { + rows <- params$rows + cols <- params$cols + dups <- intersect(names(rows), names(cols)) + + if (length(dups) > 0) { + stop("Facetting variables can only appear in row or cols, not both.\n", + "Problems: ", paste0(dups, collapse = "'"), call. = FALSE) + } + + base_rows <- combine_nested_vars(data, params$plot_env, + rows, drop = params$drop) + if (!params$as.table) { + rev_order <- function(x) factor(x, levels = rev(.int$ulevels(x))) + } + base_cols <- combine_nested_vars(data, params$plot_env, cols, + drop = params$drop) + base <- .int$df.grid(base_rows, base_cols) + + if (nrow(base) == 0) { + return(.int$new_data_frame(list(PANEL = 1L, ROW = 1L, COL = 1L, + SCALE_X = 1L, SCALE_Y = 1L))) + } + + base <- .int$reshape_add_margins( + base, list(names(rows), names(cols)), params$margins + ) + base <- unique(base) + + panel <- .int$id(base, drop = TRUE) + panel <- factor(panel, levels = seq_len(attr(panel, "n"))) + + rows <- if (!length(names(rows))) { + rep(1L, length(panel)) + } else { + .int$id(base[names(rows)], drop = TRUE) + } + cols <- if (!length(names(cols))) { + rep(1L, length(panel)) + } else { + .int$id(base[names(cols)], drop = TRUE) + } + + panels <- .int$new_data_frame( + c(list(PANEL = panel, ROW = rows, COL = cols), base) + ) + panels <- panels[order(panels$PANEL), , drop = FALSE] + rownames(panels) <- NULL + panels$SCALE_X <- if (params$free$x) { + panels$COL + } else { + 1L + } + panels$SCALE_Y <- if (params$free$y) { + panels$ROW + } else { + 1L + } + panels + }, + draw_panels = function(panels, layout, x_scales, y_scales, ranges, coord, + data, theme, params) { + panel_table <- FacetGrid$draw_panels(panels, layout, x_scales, y_scales, + ranges, coord, data, theme, params) + + # Setup strips + col_vars <- unique(layout[names(params$cols)]) + row_vars <- unique(layout[names(params$rows)]) + attr(col_vars, "type") <- "cols" + attr(col_vars, "facet") <- "grid" + attr(row_vars, "type") <- "rows" + attr(row_vars, "facet") <- "grid" + + # Build strips + switch_x <- !is.null(params$switch) && params$switch %in% c("both", "x") + switch_y <- !is.null(params$switch) && params$switch %in% c("both", "y") + + # Merging strips + merge_cols <- apply(col_vars, 2, function(x) any(rle(x)$lengths > 1)) + merge_rows <- apply(row_vars, 2, function(x) any(rle(x)$lengths > 1)) + + if (any(merge_cols)) { + if (switch_x) { + panel_table <- merge_strips(panel_table, + col_vars, switch_x, params, theme, "b") + } else { + panel_table <- merge_strips(panel_table, + col_vars, switch_x, params, theme, "t") + } + } + + if (any(merge_rows)) { + if (switch_y) { + panel_table <- merge_strips(panel_table, + row_vars, switch_y, params, theme, "l") + } else { + panel_table <- merge_strips(panel_table, + row_vars, switch_y, params, theme, "r") + } + } + panel_table + } +) + +# Helper functions ----------------------------------------------- + +combine_nested_vars <- function( + data, env = emptyenv(), vars = NULL, drop = TRUE +) { + if (length(vars) == 0) { + return(.int$new_data_frame()) + } + + possible_columns <- unique(unlist(lapply(data, names))) + + values <- .int$compact(lapply(data, .int$eval_facets, facets = vars, + possible_columns = possible_columns)) + has_all <- unlist(lapply(values, length)) == length(vars) + if (!any(has_all)) { + missing <- lapply(values, function(x) setdiff(names(vars), names(x))) + missing_txt <- vapply(missing, .int$var_list, character(1)) + name <- c("Plot", paste0("Layer ", seq_len(length(data) - 1))) + stop("At least one layer must contain all faceting variables: ", + .int$var_list(names(vars)), ".\n", paste0("* ", name, " is missing ", + missing_txt, collapse = "\n"), + call. = FALSE) + } + base <- unique(.int$rbind_dfs(values[has_all])) + if (!drop) { + base <- .int$unique_combs(base) + } + for (value in values[!has_all]) { + if (.int$empty(value)) + next + old <- base[setdiff(names(base), names(value))] + new <- unique(value[intersect(names(base), names(value))]) + if (drop) { + new <- .int$unique_combs(new) + } + old[setdiff(names(base), names(value))] <- rep("", nrow(old)) + base <- rbind(base, .int$df.grid(old, new)) + } + if (.int$empty(base)) { + stop("Facetting variables must have at least one value", + call. = FALSE) + } + base +} + +# New merge strips -------------------------------------------------------- + +merge_strips <- function( + panel_table, vars, switch, params, theme, where = "t" +) { + orient <- if (where %in% c("t", "b")) "x" else "y" + nlevels <- ncol(vars) + + these_strips <- grep(paste0("strip-", where), panel_table$layout$name) + strp_rows <- range(panel_table$layout$t[these_strips]) + strp_cols <- range(panel_table$layout$l[these_strips]) + strp_rows <- seq(strp_rows[1], strp_rows[2]) + strp_cols <- seq(strp_cols[1], strp_cols[2]) + strp <- panel_table[strp_rows, strp_cols] + + # Make empty template + template <- strp + template$grobs <- list() + template$layout <- template$layout[0,] + + # Inflate strips + for (i in seq_along(strp$grobs)) { + sub <- strp$grobs[[i]] + if (where == "b") { + sub$layout$t <- rev(sub$layout$t) + sub$layout$b <- rev(sub$layout$b) + } + n <- length(sub$grobs) + lay <- strp$layout[i,] + lay <- lay[rep(1, n),] + rownames(lay) <- NULL + sub <- lapply(seq_len(n), function(j) { + x <- sub + x$grobs <- x$grobs[j] + x$layout <- x$layout[j,] + x + }) + template <- gtable_add_grob( + template, + sub, t = lay$t, l = lay$l, b = lay$b, r = lay$r, + z = lay$z, clip = lay$clip, name = paste0(lay$name, "-", seq_len(n)) + ) + } + + if (!params$bleed) { + vars[] <- lapply(seq_len(ncol(vars)), function(i) { + do.call(paste0, vars[, seq(i), drop = FALSE]) + }) + } + merge <- apply(vars, 2, function(x) any(rle(x)$lengths > 1)) + + if (where == "r") { + vars <- rev(vars) + merge <- rev(merge) + } + + # Abstract away strips + strip_ids <- strsplit(template$layout$name, "-", fixed = TRUE) + strip_ids <- do.call(rbind, strip_ids) + strip_ids <- strip_ids[,3:ncol(strip_ids)] + mode(strip_ids) <- "integer" + + template$layout$delete <- rep(FALSE, nrow(strip_ids)) + template$layout$aquire <- seq_along(template$grobs) + + for (i in seq_len(nlevels)) { + if (!merge[i]) { + next() + } + ii <- strip_ids[, 2] == i + + # Figure out what to merge + j <- as.numeric(as.factor(vars[, i])) + + ends <- cumsum(rle(j)$lengths) + starts <- c(1, which(diff(j) != 0) + 1) + + # Figure out what strip to remove + seqs <- unlist(Map(seq, from = starts, to = ends)) + delete_this <- seqs[!(seqs %in% starts)] + delete_this <- which(strip_ids[, 1] %in% delete_this & ii) + template$layout$delete[delete_this] <- TRUE + + # Figure out what cells to expand + expand <- seqs[seqs %in% starts] + expand <- which(strip_ids[, 1] %in% expand & ii) + expand_where <- seqs[seqs %in% ends] + expand_where <- which(strip_ids[, 1] %in% expand_where & ii) + template$layout$aquire[expand] <- template$layout$aquire[expand_where] + } + + # Do expansion + if (orient == "x") { + template$layout$r <- template$layout$r[template$layout$aquire] + } else { + template$layout$b <- template$layout$b[template$layout$aquire] + } + + # Do deletion + template$grobs <- template$grobs[!template$layout$delete] + strip_ids <- strip_ids[!template$layout$delete,] + template$layout <- template$layout[!template$layout$delete,] + + # Add nesting indicator + if (params$nest_line) { + active <- unit(c(0, 1), "npc") + c(1, -1) * params$resect + passive <- if (switch) c(1, 1) else c(0, 0) + nindi <- element_render( + theme, "ggh4x.facet.nestline", + x = switch(orient, x = active, y = passive), + y = switch(orient, x = passive, y = active) + ) + i <- which(with(template$layout, t != b | l != r)) + offset <- switch( + orient, + x = vapply(template$grobs, function(grob){grob$layout$t}, numeric(1)), + y = vapply(template$grobs, function(grob){grob$layout$l}, numeric(1)) + ) + offset <- if (where %in% c("r", "b")) offset else nlevels - offset + template$grobs[i] <- lapply(template$grobs[i], function(grb) { + grb <- with(grb$layout, gtable_add_grob( + grb, nindi, t = t, l = l, r = r, b = b, + z = z, + name = "nester", + clip = "off" + )) + }) + template$layout$z <- template$layout$z + offset + } + + # Delete old strips + panel_table <- gtable::gtable_filter(panel_table, paste0("strip-", where), + fixed = TRUE, trim = FALSE, invert = TRUE) + + # Place back new strips + panel_table <- with(template$layout, gtable::gtable_add_grob( + panel_table, + template$grobs, + t = t - 1 + strp_rows[1], + l = l - 1 + strp_cols[1], + b = b - 1 + strp_rows[1], + r = r - 1 + strp_cols[1], + z = z, clip = clip, name = name + )) + panel_table +} \ No newline at end of file diff --git a/CohortDiagnosticsBreastCancer/R/Plots.R b/CohortDiagnosticsBreastCancer/R/Plots.R new file mode 100644 index 00000000..551b2d15 --- /dev/null +++ b/CohortDiagnosticsBreastCancer/R/Plots.R @@ -0,0 +1,553 @@ +plotTimeDistribution <- function(data, + cohortIds = NULL, + databaseIds = NULL, + xAxis = "database") { + + if (is.null(cohortIds) || xAxis != "database" || is.null(databaseIds)) { + warning("Not yet supported. Upcoming feature.") + return(NULL) + } + + # Perform error checks for input variables + errorMessage <- checkmate::makeAssertCollection() + checkmate::assertTibble(x = data, + any.missing = FALSE, + min.rows = 1, + min.cols = 5, + null.ok = FALSE, + add = errorMessage) + checkmate::assertDouble(x = cohortIds, + lower = 1, + upper = 2^53, + any.missing = FALSE, + null.ok = TRUE, + min.len = 1, + add = errorMessage) + checkmate::assertCharacter(x = databaseIds, + any.missing = FALSE, + null.ok = TRUE, + min.len = 1, + unique = TRUE, + add = errorMessage) + checkmate::assertChoice(x = xAxis, + choices = c("database", "cohortId"), + add = errorMessage) + checkmate::assertNames(x = colnames(data), + must.include = c("Min", "P25", "Median", "P75", "Max"), + add = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + + plotData <- data + if (!is.null(cohortIds)) { + plotData <- plotData %>% + dplyr::filter(.data$cohortId %in% !!cohortIds) + } + if (!is.null(databaseIds)) { + plotData <- plotData %>% + dplyr::filter(.data$Database %in% !!databaseIds) + } + + plotData$tooltip <- c(paste0(plotData$shortName, " : ", plotData$cohortName, + "\nDatabase = ", plotData$Database, + "\nMin = ", plotData$Min, + "\nMax = ", plotData$Max, + "\nP25 = ", plotData$P25, + "\nMedian = ", plotData$Median, + "\nP75 = ", plotData$P75, + "\nTime Measure = ", plotData$TimeMeasure, + "\nAverage = ", plotData$Average)) + + plot <- ggplot2::ggplot(data = plotData) + + ggplot2::aes(x = .data$Database, + ymin = .data$Min, + lower = .data$P25, + middle = .data$Median, + upper = .data$P75, + ymax = .data$Max, + group = .data$TimeMeasure, + average = .data$Average) + + ggplot2::geom_errorbar(mapping = ggplot2::aes(ymin = .data$Min, + ymax = .data$Max), size = 0.5) + + ggiraph::geom_boxplot_interactive(ggplot2::aes(tooltip = tooltip), + stat = "identity", + fill = rgb(0, 0, 0.8, alpha = 0.25), + size = 0.2) + + ggplot2::facet_grid(Database+shortName~TimeMeasure, scales = "free") + + ggplot2::coord_flip() + + ggplot2::theme(panel.grid.major.y = ggplot2::element_blank(), + panel.grid.minor.y = ggplot2::element_blank(), + axis.title.y = ggplot2::element_blank(), + axis.ticks.y = ggplot2::element_blank(), + axis.text.y = ggplot2::element_blank(), + strip.text.y.right = ggplot2::element_text(angle = 0)) + plot <- ggiraph::girafe(ggobj = plot, + options = list( + ggiraph::opts_sizing(width = .7), + ggiraph::opts_zoom(max = 5)), + width_svg = 12, + height_svg = 0.7 + 0.5 * length(databaseIds)) + return(plot) +} +# how to render using pure plot ly. Plotly does not prefer precomputed data. +# TO DO: color and plot positions are not consistent yet. +# plot <- plotly::plot_ly(data = plotData, +# type = "box", +# median = plotData$P25, +# #Mean = plotData$Average, +# upperfence = plotData$Max, +# lowerfence = plotData$Min, +# split = plotData$TimeMeasure) +# loop thru database or cohorts as needed +# then subplot +# plot <- plotly::subplot(plots,nrows = length(input$databases),margin = 0.05) + + +plotIncidenceRate <- function(data, + cohortIds = NULL, + databaseIds = NULL, + stratifyByAgeGroup = TRUE, + stratifyByGender = TRUE, + stratifyByCalendarYear = TRUE, + yscaleFixed = FALSE) { + if (nrow(data) == 0) { + ParallelLogger::logWarn("Record counts are too low to plot.") + } + errorMessage <- checkmate::makeAssertCollection() + checkmate::assertTibble(x = data, + any.missing = TRUE, + min.rows = 1, + min.cols = 5, + null.ok = FALSE, + add = errorMessage) + checkmate::assertDouble(x = cohortIds, + lower = 1, + upper = 2^53, + any.missing = FALSE, + null.ok = TRUE, + min.len = 1, + add = errorMessage) + checkmate::assertCharacter(x = databaseIds, + any.missing = FALSE, + null.ok = TRUE, + min.len = 1, + unique = TRUE, + add = errorMessage) + checkmate::assertLogical(x = stratifyByAgeGroup, + any.missing = FALSE, + min.len = 1, + max.len = 1, + null.ok = FALSE, + add = errorMessage) + checkmate::assertLogical(x = stratifyByGender, + any.missing = FALSE, + min.len = 1, + max.len = 1, + null.ok = FALSE, + add = errorMessage) + checkmate::assertLogical(x = stratifyByCalendarYear, + any.missing = FALSE, + min.len = 1, + max.len = 1, + null.ok = FALSE, + add = errorMessage) + checkmate::assertLogical(x = yscaleFixed, + any.missing = FALSE, + min.len = 1, + max.len = 1, + null.ok = FALSE, + add = errorMessage) + checkmate::assertDouble(x = data$incidenceRate, + lower = 0, + any.missing = FALSE, + null.ok = FALSE, + min.len = 1, + add = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + checkmate::assertDouble(x = data$incidenceRate, + lower = 0, + any.missing = FALSE, + null.ok = FALSE, + min.len = 1, + add = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + + plotData <- data %>% + dplyr::mutate(incidenceRate = round(.data$incidenceRate, digits = 3)) + if (!is.null(cohortIds)) { + plotData <- plotData %>% + dplyr::filter(.data$cohortId %in% !!cohortIds) + } + if (!is.null(databaseIds)) { + plotData <- plotData %>% + dplyr::filter(.data$databaseId %in% !!databaseIds) + } + plotData <- plotData %>% + dplyr::mutate(strataGender = !is.na(.data$gender), + strataAgeGroup = !is.na(.data$ageGroup), + strataCalendarYear = !is.na(.data$calendarYear)) %>% + dplyr::filter(.data$strataGender %in% !!stratifyByGender & + .data$strataAgeGroup %in% !!stratifyByAgeGroup & + .data$strataCalendarYear %in% !!stratifyByCalendarYear) %>% + dplyr::select(-dplyr::starts_with("strata")) + + aesthetics <- list(y = "incidenceRate") + if (stratifyByCalendarYear) { + aesthetics$x <- "calendarYear" + xLabel <- "Calender year" + showX <- TRUE + if (stratifyByGender) { + aesthetics$group <- "gender" + aesthetics$color <- "gender" + } + plotType <- "line" + } else { + xLabel <- "" + if (stratifyByGender) { + aesthetics$x <- "gender" + aesthetics$color <- "gender" + aesthetics$fill <- "gender" + showX <- TRUE + } else if (stratifyByAgeGroup) { + aesthetics$x <- "ageGroup" + showX <- TRUE + } + else{ + aesthetics$x <- "cohortId" + showX <- FALSE + } + plotType <- "bar" + } + + newSort <- plotData %>% + dplyr::select(.data$ageGroup) %>% + dplyr::distinct() %>% + dplyr::arrange(as.integer(sub(pattern = '-.+$','',x = .data$ageGroup))) + + plotData <- plotData %>% + dplyr::arrange(ageGroup = factor(.data$ageGroup, levels = newSort$ageGroup), .data$ageGroup) + + plotData$ageGroup <- factor(plotData$ageGroup, + levels = newSort$ageGroup) + plotData$tooltip <- c(paste0(plotData$shortName, ":", plotData$cohortName,"\n","Incidence Rate = ", scales::comma(plotData$incidenceRate, accuracy = 0.01), + "\nDatabase = ", plotData$databaseId, + "\nPerson years = ", scales::comma(plotData$personYears, accuracy = 0.1), + "\nCohort count = ", scales::comma(plotData$cohortCount))) + + if (stratifyByAgeGroup) { + plotData$tooltip <- c(paste0(plotData$tooltip, "\nAge Group = ", plotData$ageGroup)) + } + + if (stratifyByGender) { + plotData$tooltip <- c(paste0(plotData$tooltip, "\nGender = ", plotData$gender)) + } + + if (stratifyByCalendarYear) { + plotData$tooltip <- c(paste0(plotData$tooltip, "\nYear = ", plotData$calendarYear)) + } + + + plot <- ggplot2::ggplot(data = plotData, + do.call(ggplot2::aes_string, aesthetics)) + + ggplot2::xlab(xLabel) + + ggplot2::ylab("Incidence Rate (/1,000 person years)") + + ggplot2::theme(legend.position = "top", + legend.title = ggplot2::element_blank(), + axis.text.x = if (showX) ggplot2::element_text(angle = 90, vjust = 0.5) else ggplot2::element_blank() ) + + if (plotType == "line") { + plot <- plot + + ggiraph::geom_line_interactive(ggplot2::aes(), size = 1, alpha = 0.6) + + ggiraph::geom_point_interactive(ggplot2::aes(tooltip = tooltip), size = 2, alpha = 0.6) + } else { + plot <- plot + ggplot2::geom_bar(stat = "identity") + + ggiraph::geom_col_interactive( ggplot2::aes(tooltip = tooltip), size = 1) + } + + # databaseId field only present when called in Shiny app: + if (!is.null(data$databaseId) && length(data$databaseId) > 1) { + if (yscaleFixed) { + scales <- "fixed" + } else { + scales <- "free_y" + } + if (stratifyByGender | stratifyByCalendarYear) { + if (stratifyByAgeGroup) { + plot <- plot + facet_nested(databaseId + shortName ~ plotData$ageGroup, scales = scales) + } else { + plot <- plot + facet_nested(databaseId + shortName ~ ., scales = scales) + } + } else { + plot <- plot + facet_nested(databaseId + shortName ~., scales = scales) + } + spacing <- rep(c(1, rep(0.5, length(unique(plotData$shortName)) - 1)), length(unique(plotData$databaseId)))[-1] + plot <- plot + ggplot2::theme(panel.spacing.y = ggplot2::unit(spacing, "lines"), + strip.background = ggplot2::element_blank()) + } else { + if (stratifyByAgeGroup) { + plot <- plot + ggplot2::facet_grid(~ageGroup) + } + } + plot <- ggiraph::girafe(ggobj = plot, + options = list( + ggiraph::opts_sizing(width = .7), + ggiraph::opts_zoom(max = 5)), + width_svg = 15, + height_svg = 1.5 + 2*length(unique(data$databaseId))) + return(plot) +} + +plotCohortComparisonStandardizedDifference <- function(balance, + domain = "all", + targetLabel = "Mean Target", + comparatorLabel = "Mean Comparator") { + domains <- c("condition", "device", "drug", "measurement", "observation", "procedure") + balance$domain <- tolower(stringr::str_extract(balance$covariateName, "[a-z]+")) + balance$domain[!balance$domain %in% domains] <- "other" + + if (domain != "all") { + balance <- balance %>% + dplyr::filter(.data$domain == !!domain) + } + + # Can't make sense of plot with > 1000 dots anyway, so remove anything with small mean in both target and comparator: + if (nrow(balance) > 1000) { + balance <- balance %>% + dplyr::filter(.data$mean1 > 0.01 | .data$mean2 > 0.01) + } + + # ggiraph::geom_point_interactive(ggplot2::aes(tooltip = tooltip), size = 3, alpha = 0.6) + balance$tooltip <- c(paste("Covariate Name:", balance$covariateName, + "\nDomain: ", balance$domain, + "\nMean Target: ", scales::comma(balance$mean1, accuracy = 0.1), + "\nMean Comparator:", scales::comma(balance$mean2, accuracy = 0.1), + "\nStd diff.:", scales::comma(balance$stdDiff, accuracy = 0.1))) + + # Code used to generate palette: + # writeLines(paste(RColorBrewer::brewer.pal(n = length(domains), name = "Dark2"), collapse = "\", \"")) + + # Make sure colors are consistent, no matter which domains are included: + colors <- c("#1B9E77", "#D95F02", "#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#444444") + colors <- colors[c(domains, "other") %in% unique(balance$domain)] + + balance$domain <- factor(balance$domain, levels = c(domains, "other")) + + # targetLabel <- paste(strwrap(targetLabel, width = 50), collapse = "\n") + # comparatorLabel <- paste(strwrap(comparatorLabel, width = 50), collapse = "\n") + + plot <- ggplot2::ggplot(balance, ggplot2::aes(x = .data$mean1, y = .data$mean2, color = .data$domain)) + + ggiraph::geom_point_interactive(ggplot2::aes(tooltip = .data$tooltip), size = 3,shape = 16, alpha = 0.5) + + ggplot2::geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + ggplot2::geom_hline(yintercept = 0) + + ggplot2::geom_vline(xintercept = 0) + + ggplot2::scale_x_continuous("MEAN") + + ggplot2::scale_y_continuous("MEAN") + + ggplot2::scale_color_manual("Domain", values = colors) + + ggplot2::facet_grid(targetCohortShortName ~ databaseId + comparatorCohortShortName) + + plot <- ggiraph::girafe(ggobj = plot, + options = list( + ggiraph::opts_sizing(width = .7), + ggiraph::opts_zoom(max = 5)),width_svg = 12, + height_svg = 5) + return(plot) +} + + +plotCohortOverlapVennDiagram <- function(data, + targetCohortIds, + comparatorCohortIds, + databaseIds) { + + # Perform error checks for input variables + errorMessage <- checkmate::makeAssertCollection() + checkmate::assertTibble(x = data, + any.missing = FALSE, + min.rows = 1, + min.cols = 5, + null.ok = FALSE, + add = errorMessage) + checkmate::assertDouble(x = targetCohortIds, + lower = 1, + upper = 2^53, + any.missing = FALSE, + null.ok = FALSE) + checkmate::assertDouble(x = comparatorCohortIds, + lower = 1, + upper = 2^53, + any.missing = FALSE, + null.ok = FALSE) + checkmate::assertCharacter(x = databaseIds, + any.missing = FALSE, + min.len = 1, + null.ok = TRUE + ) + checkmate::reportAssertions(collection = errorMessage) + + plot <- VennDiagram::draw.pairwise.venn(area1 = abs(data$eitherSubjects) - abs(data$cOnlySubjects), + area2 = abs(data$eitherSubjects) - abs(data$tOnlySubjects), + cross.area = abs(data$bothSubjects), + category = c("Target", "Comparator"), + col = c(rgb(0.8, 0, 0), rgb(0, 0, 0.8)), + fill = c(rgb(0.8, 0, 0), rgb(0, 0, 0.8)), + alpha = 0.2, + fontfamily = rep("sans", 3), + cat.fontfamily = rep("sans", 2), + margin = 0.01, + ind = FALSE) + # Borrowed from https://stackoverflow.com/questions/37239128/how-to-put-comma-in-large-number-of-venndiagram + idx <- sapply(plot, function(i) grepl("text", i$name)) + for (i in 1:3) { + plot[idx][[i]]$label <- format(as.numeric(plot[idx][[i]]$label), + big.mark = ",", + scientific = FALSE) + } + grid::grid.draw(plot) + + return(plot) +} + +plotCohortOverlap <- function(data, + yAxis = "Percentages") { + + # Perform error checks for input variables + errorMessage <- checkmate::makeAssertCollection() + checkmate::assertTibble(x = data, + any.missing = FALSE, + min.rows = 1, + min.cols = 6, + null.ok = FALSE, + add = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + checkmate::assertNames(x = colnames(data), + must.include = c("databaseId", + "targetCohortId", + "comparatorCohortId", + "tOnlySubjects", + "cOnlySubjects", + "bothSubjects"), + add = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + + + + + + plotData <- data %>% + dplyr::mutate(absTOnlySubjects = abs(.data$tOnlySubjects), + absCOnlySubjects = abs(.data$cOnlySubjects), + absBothSubjects = abs(.data$bothSubjects), + absEitherSubjects = abs(.data$eitherSubjects), + signTOnlySubjects = dplyr::case_when(.data$tOnlySubjects < 0 ~ '<', TRUE ~ ''), + signCOnlySubjects = dplyr::case_when(.data$cOnlySubjects < 0 ~ '<', TRUE ~ ''), + signBothSubjects = dplyr::case_when(.data$bothSubjects < 0 ~ '<', TRUE ~ '')) %>% + dplyr::mutate(tOnlyString = paste0(.data$signTOnlySubjects, + scales::comma(.data$absTOnlySubjects), + " (", + .data$signTOnlySubjects, + scales::percent(.data$absTOnlySubjects/.data$absEitherSubjects, + accuracy = 1), + ")"), + cOnlyString = paste0(.data$signCOnlySubjects, + scales::comma(.data$absCOnlySubjects), + " (", + .data$signCOnlySubjects, + scales::percent(.data$absCOnlySubjects/.data$absEitherSubjects, + accuracy = 1), + ")"), + bothString = paste0(.data$signBothSubjects, + scales::comma(.data$absBothSubjects), + " (", + .data$signBothSubjects, + scales::percent(.data$absBothSubjects/.data$absEitherSubjects, + accuracy = 1), + ")")) %>% + dplyr::mutate(tooltip = paste0("Database: ", .data$databaseId, + "\n", .data$targetShortName, ": ", .data$targetCohortName, + "\n", .data$comparatorShortName, ": ", .data$comparatorCohortName, + "\n", .data$targetShortName, " only: ", .data$tOnlyString, + "\n", .data$comparatorShortName, " only: ", .data$cOnlyString, + "\nBoth: ", .data$bothString)) %>% + dplyr::select(.data$targetShortName, + .data$comparatorShortName, + .data$databaseId, + .data$absTOnlySubjects, + .data$absCOnlySubjects, + .data$absBothSubjects, + .data$tooltip) %>% + tidyr::pivot_longer(cols = c("absTOnlySubjects", + "absCOnlySubjects", + "absBothSubjects"), + names_to = "subjectsIn", + values_to = "value") %>% + dplyr::mutate(subjectsIn = camelCaseToTitleCase(stringr::str_replace_all(string = .data$subjectsIn, + pattern = "abs|Subjects", + replacement = ""))) + + plotData$subjectsIn <- factor(plotData$subjectsIn, levels = c(" T Only", " Both", " C Only")) + if (yAxis == "Percentages") { + position = "fill" + } else { + position = "stack" + } + + plot <- ggplot2::ggplot(data = plotData) + + ggplot2::aes(fill = .data$subjectsIn, + y = .data$value, + x = .data$comparatorShortName, + tooltip = .data$tooltip, + group = .data$subjectsIn) + + ggplot2::ylab(label = "") + + ggplot2::xlab(label = "") + + ggplot2::scale_fill_manual("Subjects in", values = c(rgb(0.8, 0.2, 0.2), rgb(0.3, 0.2, 0.4), rgb(0.4, 0.4, 0.9))) + + ggplot2::facet_grid(.data$databaseId ~ .data$targetShortName, drop = FALSE) + + ggiraph::geom_bar_interactive(position = position, alpha = 0.6, stat = "identity") + if (yAxis == "Percentages") { + plot <- plot + ggplot2::scale_y_continuous(labels = scales::percent) + } else { + plot <- plot + ggplot2::scale_y_continuous(labels = scales::comma) + } + width <- 1.5 + 1*length(unique(plotData$databaseId)) + height <- 1.5 + 1*length(unique(plotData$targetShortName)) + aspectRatio <- width / height + plot <- ggiraph::girafe(ggobj = plot, + options = list( + ggiraph::opts_sizing(width = .7), + ggiraph::opts_zoom(max = 5)), + width_svg = 6 * aspectRatio, + height_svg = 6) + + return(plot) +} +# Future function getCohortOverlapHistogram: +# 1. https://stackoverflow.com/questions/20184096/how-to-plot-multiple-stacked-histograms-together-in-r +# 2. https://stackoverflow.com/questions/43415709/how-to-use-facet-grid-with-geom-histogram +# 3. https://www.datacamp.com/community/tutorials/facets-ggplot-r?utm_source=adwords_ppc&utm_campaignid=1455363063&utm_adgroupid=65083631748&utm_device=c&utm_keyword=&utm_matchtype=b&utm_network=g&utm_adpostion=&utm_creative=332602034361&utm_targetid=dsa-429603003980&utm_loc_interest_ms=&utm_loc_physical_ms=1007768&gclid=CjwKCAjw19z6BRAYEiwAmo64LQMUJwf1i0V-Zgc5hYhpDOFQeZU05reAJmQvo2-mClFWWM4_sJiSmBoC-YkQAvD_BwE +# 4. https://stackoverflow.com/questions/24123499/frequency-histograms-with-facets-calculating-percent-by-groups-used-in-facet-i +# 5. https://stackoverflow.com/questions/62821480/add-a-trace-to-every-facet-of-a-plotly-figure + +# ComparatorOnlySubjs <- generateHistogramValues(len = seq(1:nrow(data)), val = data$cOnlySubjects) +# bothSubjs <- generateHistogramValues(seq(1:nrow(data)), data$bothSubjects) +# cohortOnlySubjs <- generateHistogramValues(seq(1:nrow(data)), data$tOnlySubjects) +# bucket <- list(ComparatorOnlySubjs = ComparatorOnlySubjs, bothSubjs = bothSubjs, cohortOnlySubjs = cohortOnlySubjs) +# +# +# p <- ggplot2::ggplot(reshape::melt(bucket), ggplot2::aes(value, fill = L1)) + +# ggplot2::xlab(label = "Comparators") + +# ggplot2::geom_histogram(position = "stack", binwidth = 1) + +# ggplot2::xlim(c(0,max(length(comparatorCohortIds()),10))) + +# ggplot2::facet_grid(rows = ggplot2::vars(data$targetCohortId), +# cols = ggplot2::vars(data$databaseId), scales = "free_y") +# plot <- plotly::ggplotly(p) +# GENERATE HISTOGRAM FUNCTION +# generateHistogramValues <- function(len,val) +# { +# fillVal <- c() +# +# inc <- 1 +# for (i in len) +# { +# fillVal <- c(fillVal,rep(i,val[[i]])) +# } +# return(fillVal); +# } + diff --git a/CohortDiagnosticsBreastCancer/R/Results.R b/CohortDiagnosticsBreastCancer/R/Results.R new file mode 100644 index 00000000..a93a425f --- /dev/null +++ b/CohortDiagnosticsBreastCancer/R/Results.R @@ -0,0 +1,645 @@ +createDatabaseDataSource <- function(connection, resultsDatabaseSchema, vocabularyDatabaseSchema = resultsDatabaseSchema) { + return(list(connection = connectionPool, + resultsDatabaseSchema = resultsDatabaseSchema, + vocabularyDatabaseSchema = vocabularyDatabaseSchema)) +} + +createFileDataSource <- function(premergedDataFile, envir = new.env()) { + load(premergedDataFile, envir = envir) + return(envir) +} + + +renderTranslateQuerySql <- function(connection, sql, ..., snakeCaseToCamelCase = FALSE) { + if (is(connection, "Pool")) { + # Connection pool is used by Shiny app, which always uses PostgreSQL: + sql <- SqlRender::render(sql, ...) + sql <- SqlRender::translate(sql, targetDialect = "postgresql") + + tryCatch({ + data <- DatabaseConnector::dbGetQuery(connection, sql) + }, error = function(err) { + writeLines(sql) + stop(err) + }) + if (snakeCaseToCamelCase) { + colnames(data) <- SqlRender::snakeCaseToCamelCase(colnames(data)) + } + return(data) + } else { + return(DatabaseConnector::renderTranslateQuerySql(connection = connection, + sql = sql, + ..., + snakeCaseToCamelCase = snakeCaseToCamelCase)) + } +} + +quoteLiterals <- function(x) { + if (is.null(x)) { + return("") + } else { + return(paste0("'", paste(x, collapse = "', '"), "'")) + } +} + +getCohortCountResult <- function(dataSource = .GlobalEnv, + cohortIds = NULL, + databaseIds) { + if (is(dataSource, "environment")) { + data <- get("cohortCount", envir = dataSource) %>% + dplyr::filter(.data$databaseId %in% !!databaseIds) + if (!is.null(cohortIds)) { + data <- data %>% + dplyr::filter(.data$cohortId %in% !!cohortIds) + } + } else { + sql <- "SELECT * + FROM @results_database_schema.cohort_count + WHERE database_id in (@database_id) + {@cohort_ids != ''} ? { AND cohort_id in (@cohort_ids)} + ;" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + cohort_ids = cohortIds, + database_id = quoteLiterals(databaseIds), + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + return(data) +} + +getTimeDistributionResult <- function(dataSource = .GlobalEnv, + cohortIds, + databaseIds) { + if (is(dataSource, "environment")) { + data <- get("timeDistribution", envir = dataSource) %>% + dplyr::filter(.data$cohortId %in% !!cohortIds & + .data$databaseId %in% !!databaseIds) + } else { + sql <- "SELECT * + FROM @results_database_schema.time_distribution + WHERE cohort_id in (@cohort_ids) + AND database_id in (@database_ids);" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + cohort_ids = cohortIds, + database_ids = quoteLiterals(databaseIds), + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + shortNames <- data %>% + dplyr::inner_join(cohort) %>% + dplyr::distinct(.data$cohortId, .data$cohortName) %>% + dplyr::arrange(.data$cohortName) %>% + dplyr::mutate(shortName = paste0('C', dplyr::row_number())) + + + data <- data %>% + dplyr::inner_join(shortNames, by = "cohortId") + + data <- data %>% + dplyr::rename(Database = "databaseId", + TimeMeasure = "timeMetric", + Average = "averageValue", + SD = "standardDeviation", + Min = "minValue", + P10 = "p10Value", + P25 = "p25Value", + Median = "medianValue", + P75 = "p75Value", + P90 = "p90Value", + Max = "maxValue") %>% + dplyr::relocate(.data$cohortId, .data$Database, .data$TimeMeasure) %>% + dplyr::arrange(.data$cohortId, .data$Database, .data$TimeMeasure) + return(data) +} + + +getIncidenceRateResult <- function(dataSource = .GlobalEnv, + cohortIds, + databaseIds, + stratifyByGender = c(TRUE,FALSE), + stratifyByAgeGroup = c(TRUE,FALSE), + stratifyByCalendarYear = c(TRUE,FALSE), + minPersonYears = 1000) { + # Perform error checks for input variables + errorMessage <- checkmate::makeAssertCollection() + errorMessage <- checkErrorCohortIdsDatabaseIds(cohortIds = cohortIds, + databaseIds = databaseIds, + errorMessage = errorMessage) + checkmate::assertLogical(x = stratifyByGender, + add = errorMessage, + min.len = 1, + max.len = 2, + unique = TRUE) + checkmate::assertLogical(x = stratifyByAgeGroup, + add = errorMessage, + min.len = 1, + max.len = 2, + unique = TRUE) + checkmate::assertLogical(x = stratifyByCalendarYear, + add = errorMessage, + min.len = 1, + max.len = 2, + unique = TRUE) + checkmate::reportAssertions(collection = errorMessage) + + if (is(dataSource, "environment")) { + data <- get("incidenceRate", envir = dataSource) %>% + dplyr::mutate(strataGender = !is.na(.data$gender), + strataAgeGroup = !is.na(.data$ageGroup), + strataCalendarYear = !is.na(.data$calendarYear)) %>% + dplyr::filter(.data$cohortId %in% !!cohortIds & + .data$databaseId %in% !!databaseIds & + .data$strataGender %in% !!stratifyByGender & + .data$strataAgeGroup %in% !!stratifyByAgeGroup & + .data$strataCalendarYear %in% !!stratifyByCalendarYear & + .data$personYears > !!minPersonYears) %>% + dplyr::select(-tidyselect::starts_with('strata')) + } else { + sql <- "SELECT * + FROM @results_database_schema.incidence_rate + WHERE cohort_id in (@cohort_ids) + AND database_id in (@database_ids) + {@gender == TRUE} ? {AND gender != ''} : { AND gender = ''} + {@age_group == TRUE} ? {AND age_group != ''} : { AND age_group = ''} + {@calendar_year == TRUE} ? {AND calendar_year != ''} : { AND calendar_year = ''} + AND person_years > @personYears;" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + cohort_ids = cohortIds, + database_ids = quoteLiterals(databaseIds), + gender = stratifyByGender, + age_group = stratifyByAgeGroup, + calendar_year = stratifyByCalendarYear, + personYears = minPersonYears, + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + data <- data %>% + dplyr::mutate(gender = dplyr::na_if(.data$gender, ""), + ageGroup = dplyr::na_if(.data$ageGroup, ""), + calendarYear = dplyr::na_if(.data$calendarYear, "")) + } + shortNames <- data %>% + dplyr::inner_join(cohort) %>% + dplyr::distinct(.data$cohortId, .data$cohortName) %>% + dplyr::arrange(.data$cohortName) %>% + dplyr::mutate(shortName = paste0('C', dplyr::row_number())) + + + data <- data %>% + dplyr::inner_join(shortNames, by = "cohortId") + + return(data %>% + dplyr::mutate(calendarYear = as.integer(.data$calendarYear)) %>% + dplyr::arrange(.data$cohortId, .data$databaseId)) +} + +getInclusionRuleStats <- function(dataSource = .GlobalEnv, + cohortIds = NULL, + databaseIds) { + if (is(dataSource, "environment")) { + data <- get("inclusionRuleStats", envir = dataSource) %>% + dplyr::filter(.data$databaseId %in% !!databaseIds) + if (!is.null(cohortIds)) { + data <- data %>% + dplyr::filter(.data$cohortId %in% !!cohortIds) + } + } else { + sql <- "SELECT * + FROM @resultsDatabaseSchema.inclusion_rule_stats + WHERE database_id in (@database_id) + {@cohort_ids != ''} ? { AND cohort_id in (@cohort_ids)} + ;" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + resultsDatabaseSchema = dataSource$resultsDatabaseSchema, + cohort_ids = cohortIds, + database_id = quoteLiterals(databaseIds), + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + data <- data %>% + dplyr::select(.data$ruleSequenceId, .data$ruleName, + .data$meetSubjects, .data$gainSubjects, + .data$remainSubjects, .data$totalSubjects, .data$databaseId) %>% + dplyr::arrange(.data$ruleSequenceId) + return(data) +} + + +getIndexEventBreakdown <- function(dataSource = .GlobalEnv, + cohortIds, + databaseIds) { + errorMessage <- checkmate::makeAssertCollection() + errorMessage <- checkErrorCohortIdsDatabaseIds(cohortIds = cohortIds, + databaseIds = databaseIds, + errorMessage = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + + if (is(dataSource, "environment")) { + data <- get("indexEventBreakdown", envir = dataSource) %>% + dplyr::filter(.data$databaseId %in% !!databaseIds) + if (!is.null(cohortIds)) { + data <- data %>% + dplyr::filter(.data$cohortId %in% !!cohortIds) + } + data <- data %>% + dplyr::inner_join(dplyr::select(get("concept", envir = dataSource), + .data$conceptId, + .data$conceptName), + by = c("conceptId")) + } else { + sql <- "SELECT index_event_breakdown.*, + standard_concept.concept_name AS concept_name + FROM @results_database_schema.index_event_breakdown + INNER JOIN @vocabulary_database_schema.concept standard_concept + ON index_event_breakdown.concept_id = standard_concept.concept_id + WHERE database_id in (@database_id) + AND cohort_id in (@cohort_ids);" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + vocabulary_database_schema = dataSource$vocabularyDatabaseSchema, + cohort_ids = cohortIds, + database_id = quoteLiterals(databaseIds), + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + return(data) +} + +getVisitContextResults <- function(dataSource = .GlobalEnv, + cohortIds, + databaseIds) { + errorMessage <- checkmate::makeAssertCollection() + errorMessage <- checkErrorCohortIdsDatabaseIds(cohortIds = cohortIds, + databaseIds = databaseIds, + errorMessage = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + + if (is(dataSource, "environment")) { + data <- get("visitContext", envir = dataSource) %>% + dplyr::filter(.data$databaseId %in% !!databaseIds) + if (!is.null(cohortIds)) { + data <- data %>% + dplyr::filter(.data$cohortId %in% !!cohortIds) + } + data <- data %>% + dplyr::inner_join(dplyr::select(get("concept", envir = dataSource), + visitConceptId = .data$conceptId, + visitConceptName = .data$conceptName), + by = c("visitConceptId")) + } else { + sql <- "SELECT visit_context.*, + standard_concept.concept_name AS visit_concept_name + FROM @results_database_schema.visit_context + INNER JOIN @vocabulary_database_schema.concept standard_concept + ON visit_context.visit_concept_id = standard_concept.concept_id + WHERE database_id in (@database_id) + AND cohort_id in (@cohort_ids);" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + vocabulary_database_schema = dataSource$vocabularyDatabaseSchema, + cohort_ids = cohortIds, + database_id = quoteLiterals(databaseIds), + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + return(data) +} + +getIncludedConceptResult <- function(dataSource = .GlobalEnv, + cohortId, + databaseIds) { + if (is(dataSource, "environment")) { + data <- get("includedSourceConcept", envir = dataSource) %>% + dplyr::filter(.data$cohortId == !!cohortId & + .data$databaseId %in% !!databaseIds) %>% + dplyr::inner_join(dplyr::select(get("conceptSets", envir = dataSource), + .data$cohortId, + .data$conceptSetId, + .data$conceptSetName), + by = c("cohortId", "conceptSetId")) %>% + dplyr::inner_join(dplyr::select(get("concept", envir = dataSource), + sourceConceptId = .data$conceptId, + sourceConceptName = .data$conceptName, + sourceVocabularyId = .data$vocabularyId, + sourceConceptCode = .data$conceptCode), + by = c("sourceConceptId")) %>% + dplyr::inner_join(dplyr::select(get("concept", envir = dataSource), + .data$conceptId, + .data$conceptName, + .data$vocabularyId), + by = c("conceptId")) + } else { + sql <- "SELECT included_source_concept.*, + concept_set_name, + source_concept.concept_name AS source_concept_name, + source_concept.vocabulary_id AS source_vocabulary_id, + source_concept.concept_code AS source_concept_code, + standard_concept.concept_name AS concept_name, + standard_concept.vocabulary_id AS vocabulary_id + FROM @results_database_schema.included_source_concept + INNER JOIN @results_database_schema.concept_sets + ON included_source_concept.cohort_id = concept_sets.cohort_id + AND included_source_concept.concept_set_id = concept_sets.concept_set_id + INNER JOIN @vocabulary_database_schema.concept source_concept + ON included_source_concept.source_concept_id = source_concept.concept_id + INNER JOIN @vocabulary_database_schema.concept standard_concept + ON included_source_concept.concept_id = standard_concept.concept_id + WHERE included_source_concept.cohort_id = @cohort_id + AND database_id in (@database_ids);" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + vocabulary_database_schema = dataSource$vocabularyDatabaseSchema, + cohort_id = cohortId, + database_ids = quoteLiterals(databaseIds), + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + + return(data) +} + +getOrphanConceptResult <- function(dataSource = .GlobalEnv, + cohortId, + databaseIds) { + if (is(dataSource, "environment")) { + data <- get("orphanConcept", envir = dataSource) %>% + dplyr::filter(.data$cohortId == !!cohortId & + .data$databaseId %in% !!databaseIds) %>% + dplyr::inner_join(dplyr::select(get("conceptSets", envir = dataSource), + .data$cohortId, + .data$conceptSetId, + .data$conceptSetName), + by = c("cohortId", "conceptSetId")) %>% + dplyr::inner_join(dplyr::select(get("concept", envir = dataSource), + .data$conceptId, + .data$conceptName, + .data$vocabularyId, + .data$conceptCode), + by = c("conceptId")) + } else { + sql <- "SELECT orphan_concept.*, + concept_set_name, + standard_concept.concept_name AS concept_name, + standard_concept.vocabulary_id AS vocabulary_id, + standard_concept.concept_code AS concept_code + FROM @results_database_schema.orphan_concept + INNER JOIN @results_database_schema.concept_sets + ON orphan_concept.cohort_id = concept_sets.cohort_id + AND orphan_concept.concept_set_id = concept_sets.concept_set_id + INNER JOIN @vocabulary_database_schema.concept standard_concept + ON orphan_concept.concept_id = standard_concept.concept_id + WHERE orphan_concept.cohort_id = @cohort_id + AND database_id in (@database_ids);" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + vocabulary_database_schema = dataSource$vocabularyDatabaseSchema, + cohort_id = cohortId, + database_ids = quoteLiterals(databaseIds), + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + + return(data) +} + + +getCohortOverlapResult <- function(dataSource = .GlobalEnv, + targetCohortIds, + comparatorCohortIds, + databaseIds) { + errorMessage <- checkmate::makeAssertCollection() + errorMessage <- checkErrorCohortIdsDatabaseIds(cohortIds = targetCohortIds, + databaseIds = databaseIds, + errorMessage = errorMessage) + errorMessage <- checkErrorCohortIdsDatabaseIds(cohortIds = comparatorCohortIds, + databaseIds = databaseIds, + errorMessage = errorMessage) + + if (is(dataSource, "environment")) { + data <- get("cohortOverlap", envir = dataSource) %>% + dplyr::filter(.data$targetCohortId %in% !!targetCohortIds & + .data$comparatorCohortId %in% !!comparatorCohortIds & + .data$databaseId %in% !!databaseIds) %>% + dplyr::inner_join(dplyr::select(get("cohort", envir = dataSource), + targetCohortId = .data$cohortId, + targetCohortName = .data$cohortName), + by = "targetCohortId") %>% + dplyr::inner_join(dplyr::select(get("cohort", envir = dataSource), + comparatorCohortId = .data$cohortId, + comparatorCohortName = .data$cohortName), + by = "comparatorCohortId") + } else { + sql <- "SELECT cohort_overlap.*, + target_cohort.cohort_name AS target_cohort_name, + comparator_cohort.cohort_name AS comparator_cohort_name + FROM @results_database_schema.cohort_overlap + INNER JOIN @results_database_schema.cohort target_cohort + ON cohort_overlap.target_cohort_id = target_cohort.cohort_id + INNER JOIN @results_database_schema.cohort comparator_cohort + ON cohort_overlap.comparator_cohort_id = comparator_cohort.cohort_id + WHERE target_cohort_id in (@targetCohortId) + AND comparator_cohort_id in (@comparatorCohortId) + AND database_id in (@databaseId);" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + targetCohortId = targetCohortIds, + comparatorCohortId = comparatorCohortIds, + databaseId = quoteLiterals(databaseIds), + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + + if (nrow(data) == 0) { + return(tidyr::tibble()) + } + targetShortNames <- data %>% + dplyr::distinct(.data$targetCohortId, .data$targetCohortName) %>% + dplyr::arrange(.data$targetCohortName) %>% + dplyr::select(-.data$targetCohortName) %>% + dplyr::mutate(targetShortName = paste0('C', dplyr::row_number())) + + comparatorShortNames <- data %>% + dplyr::distinct(.data$comparatorCohortId, .data$comparatorCohortName) %>% + dplyr::arrange(.data$comparatorCohortName) %>% + dplyr::select(-.data$comparatorCohortName) %>% + dplyr::mutate(comparatorShortName = paste0('C', dplyr::row_number())) + + data <- data %>% + dplyr::inner_join(targetShortNames, by = "targetCohortId") %>% + dplyr::inner_join(comparatorShortNames, by = "comparatorCohortId") + return(data) +} + +getCovariateValueResult <- function(dataSource = .GlobalEnv, + cohortIds, + analysisIds = NULL, + databaseIds, + timeIds = NULL, + isTemporal = FALSE) { + + # Perform error checks for input variables + errorMessage <- checkmate::makeAssertCollection() + checkmate::assertLogical(x = isTemporal, + any.missing = FALSE, + min.len = 1, + max.len = 1, + add = errorMessage) + errorMessage <- checkErrorCohortIdsDatabaseIds(cohortIds = cohortIds, + databaseIds = databaseIds, + errorMessage = errorMessage) + if (isTemporal) { + checkmate::assertIntegerish(x = timeIds, + lower = 0, + any.missing = FALSE, + unique = TRUE, + null.ok = TRUE, + add = errorMessage) + } + checkmate::reportAssertions(collection = errorMessage) + + + if (isTemporal) { + table <- "temporalCovariateValue" + refTable <- "temporalCovariateRef" + timeRefTable <- "temporalTimeRef" + } else { + table <- "covariateValue" + refTable <- "covariateRef" + timeRefTable <- "" + } + + if (is(dataSource, "environment")) { + data <- get(table, envir = dataSource) %>% + dplyr::filter(.data$cohortId %in% !!cohortIds, + .data$databaseId %in% !!databaseIds) %>% + dplyr::inner_join(get(refTable, envir = dataSource), by = "covariateId") + if (!is.null(analysisIds)) { + data <- data %>% + dplyr::filter(.data$analysisId %in% analysisIds) + } + if (isTemporal) { + data <- data %>% + dplyr::inner_join(get(timeRefTable, envir = dataSource), by = "timeId") + if (!is.null(timeIds)) { + data <- data %>% + dplyr::filter(.data$timeId %in% timeIds) + } + } + } else { + sql <- "SELECT covariate.*, + covariate_name, + {@time_ref_table != \"\"} ? { + start_day, + end_day, + } + concept_id, + analysis_id + FROM @results_database_schema.@table covariate + INNER JOIN @results_database_schema.@ref_table covariate_ref + ON covariate.covariate_id = covariate_ref.covariate_id + {@time_ref_table != \"\"} ? { + INNER JOIN @results_database_schema.@time_ref_table time_ref + ON covariate.time_id = time_ref.time_id + } + WHERE cohort_id in (@cohort_ids) + {@time_ref_table != \"\" & @time_ids != \"\"} ? { AND covariate.time_id IN (@time_ids)} + {@analysis_ids != \"\"} ? { AND analysis_id IN (@analysis_ids)} + AND database_id in (@databaseIds);" + if (is.null(timeIds)) { + timeIds <- "" + } + if (is.null(analysisIds)) { + analysisIds <- "" + } + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + table = SqlRender::camelCaseToSnakeCase(table), + ref_table = SqlRender::camelCaseToSnakeCase(refTable), + time_ref_table = SqlRender::camelCaseToSnakeCase(timeRefTable), + results_database_schema = dataSource$resultsDatabaseSchema, + cohort_ids = cohortIds, + analysis_ids = analysisIds, + databaseIds = quoteLiterals(databaseIds), + time_ids = timeIds, + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + if (isTemporal) { + data <- data %>% + dplyr::relocate(.data$cohortId, + .data$databaseId, + .data$timeId, + .data$startDay, + .data$endDay, + .data$analysisId, + .data$covariateId, + .data$covariateName) %>% + dplyr::arrange(.data$cohortId, .data$databaseId, .data$timeId, .data$covariateId, .data$covariateName) + } else { + data <- data %>% + dplyr::relocate(.data$cohortId, + .data$databaseId, + .data$analysisId, + .data$covariateId, + .data$covariateName) %>% + dplyr::arrange(.data$cohortId, .data$databaseId, .data$covariateId) + } + return(data) +} + +getConceptReference <- function(dataSource = .GlobalEnv, + conceptIds) { + # Perform error checks for input variables + errorMessage <- checkmate::makeAssertCollection() + checkmate::assertIntegerish(x = conceptIds, + min.len = 1, + null.ok = TRUE, + add = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + if (is(dataSource, "environment")) { + data <- get("cohort", envir = dataSource) %>% + dplyr::filter(!is.na(.data$invalidReason)) %>% + dplyr::filter(.data$conceptId %in% conceptIds) + } else { + sql <- "SELECT * + FROM @results_database_schema.concept + WHERE invalid_reason IS NULL + {@conceptIds == } ? {}:{AND concept_id IN (@conceptIds)};" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + conceptIds = conceptIds, + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + return(data %>% dplyr::arrange(.data$conceptId)) +} + +checkErrorCohortIdsDatabaseIds <- function(errorMessage, + cohortIds, + databaseIds) { + checkmate::assertDouble(x = cohortIds, + null.ok = FALSE, + lower = 1, + upper = 2^53, + any.missing = FALSE, + add = errorMessage) + checkmate::assertCharacter(x = databaseIds, + min.len = 1, + any.missing = FALSE, + unique = TRUE, + add = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + return(errorMessage) +} diff --git a/CohortDiagnosticsBreastCancer/R/Tables.R b/CohortDiagnosticsBreastCancer/R/Tables.R new file mode 100644 index 00000000..e0b6edb6 --- /dev/null +++ b/CohortDiagnosticsBreastCancer/R/Tables.R @@ -0,0 +1,141 @@ +library(magrittr) + +prepareTable1 <- function(covariates, + pathToCsv = "Table1Specs.csv") { + covariates <- covariates %>% + dplyr::mutate(covariateName = stringr::str_to_sentence(stringr::str_replace_all(string = .data$covariateName, + pattern = "^.*: ", + replacement = ""))) + space <- " " + specifications <- readr::read_csv(file = pathToCsv, + col_types = readr::cols(), + guess_max = min(1e7)) %>% + dplyr::mutate(dplyr::across(tidyr::everything(), ~tidyr::replace_na(data = .x, replace = ''))) + + resultsTable <- tidyr::tibble() + + if (nrow(specifications) == 0) { + return(resultsTable) + } + + for (i in 1:nrow(specifications)) { + specification <- specifications[i,] + if (specification %>% dplyr::pull(.data$covariateIds) == "") { + covariatesSubset <- covariates %>% + dplyr::filter(.data$analysisId %in% specification$analysisId) %>% + dplyr::arrange(.data$covariateId) + } else { + covariatesSubset <- covariates %>% + dplyr::filter(.data$analysisId %in% specification$analysisId, + .data$covariateId %in% (stringr::str_split(string = (specification %>% + dplyr::pull(.data$covariateIds)), + pattern = ";")[[1]] %>% + utils::type.convert())) %>% + dplyr::arrange(.data$covariateId) + } + if (nrow(covariatesSubset) > 0) { + resultsTable <- dplyr::bind_rows(resultsTable, + tidyr::tibble(characteristic = paste0('', + specification %>% dplyr::pull(.data$label), + ''), + value = NA, + header = 1, + position = i), + tidyr::tibble(characteristic = paste0(space, + space, + space, + space, + covariatesSubset$covariateName), + value = covariatesSubset$mean, + header = 0, + position = i)) %>% + dplyr::distinct() %>% + dplyr::mutate(sortOrder = dplyr::row_number()) + } + } + if (nrow(resultsTable) > 0) { + resultsTable <- resultsTable %>% + dplyr::arrange(.data$position, dplyr::desc(.data$header), .data$sortOrder) + } + return(resultsTable) +} + + +prepareTable1Comp <- function(balance, + pathToCsv = "Table1Specs.csv") { + balance <- balance %>% + dplyr::mutate(covariateName = stringr::str_to_sentence(stringr::str_replace_all(string = .data$covariateName, + pattern = "^.*: ", + replacement = ""))) + space <- " " + specifications <- readr::read_csv(file = pathToCsv, + col_types = readr::cols(), + guess_max = min(1e7)) %>% + dplyr::mutate(dplyr::across(tidyr::everything(), ~tidyr::replace_na(data = .x, replace = ''))) + + resultsTable <- tidyr::tibble() + + if (nrow(specifications) == 0) { + return(dplyr::tibble(Note = 'There are no covariate records for the cohorts being compared.')) + } + + for (i in 1:nrow(specifications)) { + specification <- specifications[i,] + if (specification %>% dplyr::pull(.data$covariateIds) == "") { + balanceSubset <- balance %>% + dplyr::filter(.data$analysisId %in% specification$analysisId) %>% + dplyr::arrange(.data$covariateId) + } else { + balanceSubset <- balance %>% + dplyr::filter(.data$analysisId %in% specification$analysisId, + .data$covariateId %in% (stringr::str_split(string = (specification %>% + dplyr::pull(.data$covariateIds)), + pattern = ";")[[1]] %>% + utils::type.convert())) %>% + dplyr::arrange(.data$covariateId) + } + + if (nrow(balanceSubset) > 0) { + resultsTable <- dplyr::bind_rows(resultsTable, + tidyr::tibble(characteristic = paste0('', + specification %>% dplyr::pull(.data$label), + ''), + MeanT = NA, + MeanC = NA, + StdDiff = NA, + header = 1, + position = i), + tidyr::tibble(characteristic = paste0(space, + space, + space, + space, + balanceSubset$covariateName), + MeanT = balanceSubset$mean1, + MeanC = balanceSubset$mean2, + StdDiff = balanceSubset$stdDiff, + header = 0, + position = i)) %>% + dplyr::distinct() %>% + dplyr::mutate(sortOrder = dplyr::row_number()) + } + } + if (nrow(resultsTable) > 0 ) { + resultsTable <- resultsTable %>% + dplyr::arrange(.data$position, dplyr::desc(.data$header), .data$sortOrder) %>% + dplyr::mutate(sortOrder = dplyr::row_number()) %>% + dplyr::select(-.data$header, -.data$position) + } + return(resultsTable) +} + + +compareCohortCharacteristics <- function(characteristics1, characteristics2) { + m <- dplyr::full_join(x = characteristics1 %>% dplyr::distinct(), + y = characteristics2 %>% dplyr::distinct(), + by = c("covariateId", "conceptId", "databaseId", "covariateName", "analysisId"), + suffix = c("1", "2")) %>% + dplyr::mutate(sd = sqrt(.data$sd1^2 + .data$sd2^2), + stdDiff = (.data$mean2 - .data$mean1)/.data$sd) %>% + dplyr::arrange(-abs(.data$stdDiff)) + return(m) +} diff --git a/CohortDiagnosticsBreastCancer/Table1Specs.csv b/CohortDiagnosticsBreastCancer/Table1Specs.csv new file mode 100644 index 00000000..6389f2be --- /dev/null +++ b/CohortDiagnosticsBreastCancer/Table1Specs.csv @@ -0,0 +1,9 @@ +label,analysisId,covariateIds +Age group,3, +Gender: female,1,8532001 +Race,4, +Ethnicity,5, +Medical history: General,210,4006969210;438409210;4212540210;255573210;201606210;4182210210;440383210;201820210;318800210;192671210;439727210;432867210;316866210;4104000210;433736210;80180210;255848210;140168210;4030518210;80809210;435783210;4279309210;81893210;81902210;197494210;4134440210 +Medical history: Cardiovascular disease,210,313217210;381591210;317576210;321588210;316139210;4185932210;321052210;440417210;444247210 +Medical history: Neoplasms,210,4044013210;432571210;40481902210;443392210;4112853210;4180790210;443388210;197508210;200962210 +Medication use,410,21601782410;21602796410;21604686410;21604389410;21603932410;21601387410;21602028410;21600960410;21601664410;21601744410;21601461410;21600046410;21603248410;21600712410;21603890410;21601853410;21604254410;21604489410;21604752410 diff --git a/CohortDiagnosticsBreastCancer/data/PreMerged.RData b/CohortDiagnosticsBreastCancer/data/PreMerged.RData new file mode 100644 index 00000000..31eaeb62 Binary files /dev/null and b/CohortDiagnosticsBreastCancer/data/PreMerged.RData differ diff --git a/CohortDiagnosticsBreastCancer/global.R b/CohortDiagnosticsBreastCancer/global.R new file mode 100644 index 00000000..bc66cfca --- /dev/null +++ b/CohortDiagnosticsBreastCancer/global.R @@ -0,0 +1,163 @@ +library(magrittr) + +source("R/Tables.R") +source("R/Plots.R") +source("R/Results.R") + +# shinySettings <- list(connectionDetails = DatabaseConnector::createConnectionDetails(dbms = "postgresql", +# server = "localhost/ohdsi", +# user = "postgres", +# password = Sys.getenv("pwPostgres")), +# resultsDatabaseSchema = "phenotype_library", +# vocabularyDatabaseSchema = "phenotype_library") +# shinySettings <- list(dataFolder = "s:/examplePackageOutput") + +# Settings when running on server: + +defaultLocalDataFolder <- "data" +defaultLocalDataFile <- "PreMerged.RData" + +connectionPool <- NULL +defaultServer <- Sys.getenv("phenotypeLibraryDbServer") +defaultDatabase <- Sys.getenv("phenotypeLibraryDbDatabase") +defaultPort <- Sys.getenv("phenotypeLibraryDbPort") +defaultUser <- Sys.getenv("phenotypeLibraryDbUser") +defaultPassword <- Sys.getenv("phenotypeLibraryDbPassword") +defaultResultsSchema <- Sys.getenv("phenotypeLibraryDbResultsSchema") +defaultVocabularySchema <- Sys.getenv("phenotypeLibraryDbVocabularySchema") + +defaultDatabaseMode <- FALSE # Use file system if FALSE + +defaultCohortBaseUrl <- "https://atlas.ohdsi.org/#/cohortdefinition/" +defaultConceptBaseUrl <- "https://athena.ohdsi.org/search-terms/terms/" + +cohortDiagnosticModeDefaultTitle <- "Cohort Diagnostics" +phenotypeLibraryModeDefaultTitle <- "Phenotype Library" + +if (!exists("shinySettings")) { + writeLines("Using default settings") + databaseMode <- defaultDatabaseMode & defaultServer != "" + if (databaseMode) { + connectionPool <- pool::dbPool( + drv = DatabaseConnector::DatabaseConnectorDriver(), + dbms = "postgresql", + server = paste(defaultServer, defaultDatabase, sep = "/"), + port = defaultPort, + user = defaultUser, + password = defaultPassword + ) + resultsDatabaseSchema <- defaultResultsSchema + vocabularyDatabaseSchema <- defaultVocabularySchema + } else { + dataFolder <- defaultLocalDataFolder + } + cohortBaseUrl <- defaultCohortBaseUrl + conceptBaseUrl <- defaultCohortBaseUrl +} else { + writeLines("Using settings provided by user") + databaseMode <- !is.null(shinySettings$connectionDetails) + if (databaseMode) { + connectionDetails <- shinySettings$connectionDetails + if (is(connectionDetails$server, "function")) { + connectionPool <- pool::dbPool(drv = DatabaseConnector::DatabaseConnectorDriver(), + dbms = "postgresql", + server = connectionDetails$server(), + port = connectionDetails$port(), + user = connectionDetails$user(), + password = connectionDetails$password(), + connectionString = connectionDetails$connectionString()) + } else { + # For backwards compatibility with older versions of DatabaseConnector: + connectionPool <- pool::dbPool(drv = DatabaseConnector::DatabaseConnectorDriver(), + dbms = "postgresql", + server = connectionDetails$server, + port = connectionDetails$port, + user = connectionDetails$user, + password = connectionDetails$password, + connectionString = connectionDetails$connectionString) + } + resultsDatabaseSchema <- shinySettings$resultsDatabaseSchema + vocabularyDatabaseSchema <- shinySettings$vocabularyDatabaseSchema + } else { + dataFolder <- shinySettings$dataFolder + } + cohortBaseUrl <- shinySettings$cohortBaseUrl + conceptBaseUrl <- shinySettings$cohortBaseUrl +} + +dataModelSpecifications <- read.csv("resultsDataModelSpecification.csv") +# Cleaning up any tables in memory: +suppressWarnings(rm(list = SqlRender::snakeCaseToCamelCase(dataModelSpecifications$tableName))) + +if (databaseMode) { + + onStop(function() { + if (DBI::dbIsValid(connectionPool)) { + writeLines("Closing database pool") + pool::poolClose(connectionPool) + } + }) + + resultsTablesOnServer <- tolower(DatabaseConnector::dbListTables(connectionPool, schema = resultsDatabaseSchema)) + + loadResultsTable <- function(tableName, required = FALSE) { + if (required || tableName %in% resultsTablesOnServer) { + tryCatch({ + table <- DatabaseConnector::dbReadTable(connectionPool, + paste(resultsDatabaseSchema, tableName, sep = ".")) + }, error = function(err) { + stop("Error reading from ", paste(resultsDatabaseSchema, tableName, sep = "."), ": ", err$message) + }) + colnames(table) <- SqlRender::snakeCaseToCamelCase(colnames(table)) + if (nrow(table) > 0) { + assign(SqlRender::snakeCaseToCamelCase(tableName), dplyr::as_tibble(table), envir = .GlobalEnv) + } + } + } + + loadResultsTable("database", required = TRUE) + loadResultsTable("cohort", required = TRUE) + loadResultsTable("phenotype_description") + loadResultsTable("temporal_time_ref") + loadResultsTable("concept_sets") + + # Create empty objects in memory for all other tables. This is used by the Shiny app to decide what tabs to show: + isEmpty <- function(tableName) { + sql <- sprintf("SELECT 1 FROM %s.%s LIMIT 1;", resultsDatabaseSchema, tableName) + oneRow <- DatabaseConnector::dbGetQuery(connectionPool, sql) + return(nrow(oneRow) == 0) + } + + for (table in dataModelSpecifications$tableName) { + if (table %in% resultsTablesOnServer && + !exists(SqlRender::snakeCaseToCamelCase(table)) && + !isEmpty(table)) { + assign(SqlRender::snakeCaseToCamelCase(table), dplyr::tibble()) + } + } + + dataSource <- createDatabaseDataSource(connection = connectionPool, + resultsDatabaseSchema = resultsDatabaseSchema, + vocabularyDatabaseSchema = vocabularyDatabaseSchema) +} else { + localDataPath <- file.path(dataFolder, defaultLocalDataFile) + if (!file.exists(localDataPath)) { + stop(sprintf("Local data file %s does not exist.", localDataPath)) + } + dataSource <- createFileDataSource(localDataPath, envir = .GlobalEnv) +} + +if (exists("temporalTimeRef")) { + temporalCovariateChoices <- temporalTimeRef %>% + dplyr::mutate(choices = paste0("Start ", .data$startDay, " to end ", .data$endDay)) %>% + dplyr::select(.data$timeId, .data$choices) %>% + dplyr::arrange(.data$timeId) %>% + dplyr::slice_head(n = 5) +} + +if (exists("covariateRef")) { + specifications <- readr::read_csv(file = "Table1Specs.csv", + col_types = readr::cols(), + guess_max = min(1e7)) + prettyAnalysisIds <- specifications$analysisId +} diff --git a/CohortDiagnosticsBreastCancer/html/cohortCharacterization.html b/CohortDiagnosticsBreastCancer/html/cohortCharacterization.html new file mode 100644 index 00000000..9f6ef077 --- /dev/null +++ b/CohortDiagnosticsBreastCancer/html/cohortCharacterization.html @@ -0,0 +1,16 @@ +

Description

+

A table showing cohort characteristics (covariates). These characteristics are captured on or before the cohort start date. There is a Pretty and a Raw version of this table.

+

The Pretty table shows the standard OHDSI characteristics table, which includes only covariates that were manually selected to provide a general overview of the comorbidities and medications of the cohort. These are all binary covariates, and the table shows the proportion (%) of the cohort entries having the covariate.

+

The Raw table shows all captured covariates. These include binary and continuous covariates (e.g. the Charlson comorbidity index). For each covariate the table lists the mean, which for binary covariates is equal to the proportion, and the standard deviation (SD).

+ +

Options

+

You can select multiple databases in the side bar to see cohort characteristics from different databases side-by-side in the same table.

+

Select the cohort to explore in the side bar.

+

Select either the Pretty or the Raw table at the top of the table.

+ +

What to look for

+ + diff --git a/CohortDiagnosticsBreastCancer/html/cohortCounts.html b/CohortDiagnosticsBreastCancer/html/cohortCounts.html new file mode 100644 index 00000000..684a6dd6 --- /dev/null +++ b/CohortDiagnosticsBreastCancer/html/cohortCounts.html @@ -0,0 +1,12 @@ +

Description

+

A table showing the number of cohort entries and unique persons per cohort per database. Because one person can have more than one cohort entry, the number of entries can be higher than the number of persons.

+ +

Options

+

You can select multiple databases in the side bar to see counts from different databases side-by-side.

+ +

What to look for

+ \ No newline at end of file diff --git a/CohortDiagnosticsBreastCancer/html/cohortOverlap.html b/CohortDiagnosticsBreastCancer/html/cohortOverlap.html new file mode 100644 index 00000000..419eaa62 --- /dev/null +++ b/CohortDiagnosticsBreastCancer/html/cohortOverlap.html @@ -0,0 +1,27 @@ +

Description

+

Stacked bar graph showing the overlap between two cohorts, and a table listing several overlap statistics.

+ +

The stacked bar shows the overlap in terms of subjects. It shows the number of subjects that belong to each cohort and to both. The diagram does not consider whether the subjects were in the different cohorts at the same time.

+

The table show the same information and more: +