diff --git a/CohortDiagnosticsBreastCancer/DiagnosticsExplorer.Rproj b/CohortDiagnosticsBreastCancer/DiagnosticsExplorer.Rproj new file mode 100644 index 00000000..8e3c2ebc --- /dev/null +++ b/CohortDiagnosticsBreastCancer/DiagnosticsExplorer.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/CohortDiagnosticsBreastCancer/R/DisplayFunctions.R b/CohortDiagnosticsBreastCancer/R/DisplayFunctions.R new file mode 100644 index 00000000..d3cf2fb7 --- /dev/null +++ b/CohortDiagnosticsBreastCancer/R/DisplayFunctions.R @@ -0,0 +1,73 @@ +camelCaseToSnakeCase <- function(string) { + string <- gsub("([A-Z])", "_\\1", string) + string <- tolower(string) + string <- gsub("([a-z])([0-9])", "\\1_\\2", string) + return(string) +} + + +camelCaseToTitleCase <- function(string) { + string <- gsub("([A-Z])", " \\1", string) + string <- gsub("([a-z])([0-9])", "\\1 \\2", string) + substr(string, 1, 1) <- toupper(substr(string, 1, 1)) + return(string) +} + + +truncateStringDef <- function(columns, maxChars) { + list( + targets = columns, + render = DT::JS(sprintf("function(data, type, row, meta) {\n + return type === 'display' && data != null && data.length > %s ?\n + '' + data.substr(0, %s) + '...' : data;\n + }", maxChars, maxChars)) + ) +} + +minCellCountDef <- function(columns) { + list( + targets = columns, + render = DT::JS("function(data, type) { + if (type !== 'display' || isNaN(parseFloat(data))) return data; + if (data >= 0) return data.toString().replace(/(\\d)(?=(\\d{3})+(?!\\d))/g, '$1,'); + return '<' + Math.abs(data).toString().replace(/(\\d)(?=(\\d{3})+(?!\\d))/g, '$1,'); + }") + ) +} + +minCellPercentDef <- function(columns) { + list( + targets = columns, + render = DT::JS("function(data, type) { + if (type !== 'display' || isNaN(parseFloat(data))) return data; + if (data >= 0) return (100 * data).toFixed(1).replace(/(\\d)(?=(\\d{3})+(?!\\d))/g, '$1,') + '%'; + return '<' + Math.abs(100 * data).toFixed(1).replace(/(\\d)(?=(\\d{3})+(?!\\d))/g, '$1,') + '%'; + }") + ) +} + +minCellRealDef <- function(columns, digits = 1) { + list( + targets = columns, + render = DT::JS(sprintf("function(data, type) { + if (type !== 'display' || isNaN(parseFloat(data))) return data; + if (data >= 0) return data.toFixed(%s).replace(/(\\d)(?=(\\d{3})+(?!\\d))/g, '$1,'); + return '<' + Math.abs(data).toFixed(%s).replace(/(\\d)(?=(\\d{3})+(?!\\d))/g, '$1,'); + }", digits, digits)) + ) +} + +styleAbsColorBar <- function(maxValue, colorPositive, colorNegative, angle = 90) { + DT::JS(sprintf("isNaN(parseFloat(value))? '' : 'linear-gradient(%fdeg, transparent ' + (%f - Math.abs(value))/%f * 100 + '%%, ' + (value > 0 ? '%s ' : '%s ') + (%f - Math.abs(value))/%f * 100 + '%%)'", + angle, maxValue, maxValue, colorPositive, colorNegative, maxValue, maxValue)) +} + +sumCounts <- function(counts) { + result <- sum(abs(counts)) + if (any(counts < 0)) { + return(-result) + } else { + return(result) + } + +} diff --git a/CohortDiagnosticsBreastCancer/R/FacetNested.R b/CohortDiagnosticsBreastCancer/R/FacetNested.R new file mode 100644 index 00000000..e02ca2d1 --- /dev/null +++ b/CohortDiagnosticsBreastCancer/R/FacetNested.R @@ -0,0 +1,602 @@ +# Code borrowed from https://github.com/teunbrand/ggh4x and gtable, just to merge the labels of grouped facets. +# May need to simplify a bit. +library(ggplot2) + +.grab_ggplot_internals <- function() { + objects <- c( + ".all_aesthetics", + "as_facets_list", + "as_gg_data_frame", + "axis_label_element_overrides", + "check_aesthetics", + "check_labeller", + "check_subclass", + "compact", + "continuous_range", + "convertInd", + "df.grid", + "draw_axis_labels", + "reshape_add_margins", + "new_data_frame", + "defaults", "id", + "empty", + "eval_facets", + "ggname", + "rename_aes", + "mapped_aesthetics", + "make_labels", + "grid_as_facets_list", + "is.zero", + "rbind_dfs", + "sanitise_dim", + "set_draw_key", + "snake_class", + "ulevels", + "unique_combs", + "var_list", + "validate_mapping", + "warn_for_guide_position", + "weave_tables_col", + "weave_tables_row", + "wrap_as_facets_list", + ".pt" + ) + objects <- setNames(objects, objects) + out <- lapply(objects, function(i) { + getFromNamespace(i, "ggplot2") + }) +} + +# Store the needed ggplot internals here +.int <- .grab_ggplot_internals() + +# From gtable: +neg_to_pos <- function (x, max) { + ifelse(x >= 0, x, max + 1 + x) +} + +new_data_frame <- function (x, n = NULL) +{ + if (is.null(n)) { + n <- if (length(x) == 0) + 0 + else length(x[[1]]) + } + class(x) <- "data.frame" + attr(x, "row.names") <- .set_row_names(n) + x +} + +gtable_add_grob <- function(x, grobs, t, l, b = t, r = l, z = Inf, clip = "on", + name = x$name) +{ + if (!gtable::is.gtable(x)) + stop("x must be a gtable", call. = FALSE) + # if (is.grob(grobs)) + # grobs <- list(grobs) + if (!is.list(grobs)) + stop("grobs must either be a single grob or a list of grobs", + call. = FALSE) + n_grobs <- length(grobs) + if (is.logical(clip)) { + clip <- ifelse(clip, "on", "off") + } + layout <- unclass(x$layout) + # if (!all(vapply(list(t, r, b, l, z, clip, name), len_same_or_1, + # logical(1), n_grobs))) { + # stop("Not all inputs have either length 1 or same length same as 'grobs'") + # } + z <- rep(z, length.out = n_grobs) + zval <- c(layout$z, z[!is.infinite(z)]) + if (length(zval) == 0) { + zmin <- 1 + zmax <- 0 + } + else { + zmin <- min(zval) + zmax <- max(zval) + } + z[z == -Inf] <- zmin - rev(seq_len(sum(z == -Inf))) + z[z == Inf] <- zmax + seq_len(sum(z == Inf)) + x_row <- length(x$heights) + x_col <- length(x$widths) + t <- rep(neg_to_pos(t, x_row), length.out = n_grobs) + b <- rep(neg_to_pos(b, x_row), length.out = n_grobs) + l <- rep(neg_to_pos(l, x_col), length.out = n_grobs) + r <- rep(neg_to_pos(r, x_col), length.out = n_grobs) + clip <- rep(clip, length.out = n_grobs) + name <- rep(name, length.out = n_grobs) + x$grobs <- c(x$grobs, grobs) + x$layout <- new_data_frame(list(t = c(layout$t, t), l = c(layout$l, + l), b = c(layout$b, b), r = c(layout$r, r), z = c(layout$z, + z), clip = c(layout$clip, clip), name = c(layout$name, + name))) + x +} + +# Main function ----------------------------------------------------------- + +#' @title Layout panels in a grid with nested strips +#' +#' @description \code{facet_nested()} forms a matrix of panels defined by row +#' and column faceting variables and nests grouped facets. +#' +#' @inheritParams ggplot2::facet_grid +#' @param nest_line a \code{logical} vector of length 1, indicating whether to +#' draw a nesting line to indicate the nesting of variables. Control the look +#' of the nesting line by setting the \code{ggh4x.facet.nestline} theme +#' element. +#' @param resect a \code{unit} vector of length 1, indicating how much the +#' nesting line should be shortened. +#' @param bleed a \code{logical} vector of length 1, indicating whether merging +#' of lower-level variables is allowed when the higher-level variables are +#' separate. See details. +#' +#' @details Unlike \code{facet_grid()}, this function only automatically expands +#' missing variables when they have no variables in that direction, to allow +#' for unnested variables. It still requires at least one layer to have all +#' faceting variables. +#' +#' Hierarchies are inferred from the order of variables supplied to +#' \code{rows} or \code{cols}. The first variable is interpreted to be the +#' outermost variable, while the last variable is interpreted to be the +#' innermost variable. They display order is always such that the outermost +#' variable is placed the furthest away from the panels. Strips are +#' automatically grouped when they span a nested variable. +#' +#' The \code{bleed} argument controls whether lower-level variables are allowed +#' to be merged when higher-level are different, i.e. they can bleed over +#' hierarchies. Suppose the \code{facet_grid()} behaviour would be the +#' following: +#' +#' \code{[_1_][_2_][_2_]} \cr \code{[_3_][_3_][_4_]} +#' +#' In such case, the default \code{bleed = FALSE} argument would result in the +#' following: +#' +#' \code{[_1_][___2____]} \cr \code{[_3_][_3_][_4_]} +#' +#' Whereas \code{bleed = TRUE} would allow the following: +#' +#' \code{[_1_][___2____]} \cr \code{[___3____][_4_]} +#' +#' @export +#' +#' @return A \emph{FacetNested} ggproto object. +#' @family facetting functions +#' +#' @seealso See \code{\link[ggplot2]{facet_grid}} for descriptions of the +#' original arguments. See \code{\link[grid]{unit}} for the construction of a +#' \code{unit} vector. +#' +#' @examples +#' df <- iris +#' df$nester <- ifelse(df$Species == "setosa", +#' "Short Leaves", +#' "Long Leaves") +#' +#' ggplot(df, aes(Sepal.Length, Petal.Length)) + +#' geom_point() + +#' facet_nested(~ nester + Species) +#' +#' # Controlling the nest line +#' ggplot(df, aes(Sepal.Length, Petal.Length)) + +#' geom_point() + +#' facet_nested(~ nester + Species, nest_line = TRUE) + +#' theme(ggh4x.facet.nestline = element_line(linetype = 3)) +facet_nested <- function( + rows = NULL, cols = NULL, scales = "fixed", space = "fixed", + shrink = TRUE, labeller = "label_value", as.table = TRUE, + switch = NULL, drop = TRUE, margins = FALSE, facets = NULL, + nest_line = FALSE, resect = unit(0, "mm"), bleed = FALSE +) { + if (!is.null(facets)) { + rows <- facets + } + if (is.logical(cols)) { + margins <- cols + cols <- NULL + } + scales <- match.arg(scales, c("fixed", "free_x", "free_y", "free")) + free <- list(x = any(scales %in% c("free_x", "free")), + y = any(scales %in% c("free_y", "free"))) + + space <- match.arg(space, c("fixed", "free_x", "free_y", "free")) + space_free <- list(x = any(space %in% c("free_x", "free")), + y = any(space %in% c("free_y", "free"))) + + if (!is.null(switch) && !switch %in% c("both", "x", "y")) { + stop("switch must be either 'both', 'x', or 'y'", call. = FALSE) + } + + facets_list <- .int$grid_as_facets_list(rows, cols) + n <- length(facets_list) + if (n > 2L) { + stop("A grid facet specification can't have more than two dimensions", + .call = FALSE) + } + if (n == 1L) { + rows <- quos() + cols <- facets_list[[1]] + } else { + rows <- facets_list[[1]] + cols <- facets_list[[2]] + } + labeller <- .int$check_labeller(labeller) + ggplot2::ggproto(NULL, FacetNested, shrink = shrink, + params = list( + rows = rows, + cols = cols, + margins = margins, + free = free, + space_free = space_free, + labeller = labeller, + as.table = as.table, + switch = switch, + drop = drop, + nest_line = nest_line, + resect = resect, + bleed = bleed + )) +} + +# ggproto ----------------------------------------------------------------- + +#' @usage NULL +#' @format NULL +#' @export +#' @rdname ggh4x_extensions +FacetNested <- ggplot2::ggproto( + "FacetNested", FacetGrid, + map_data = function(data, layout, params) { + # Handle empty data + if (.int$empty(data)) { + return(cbind(data, PANEL = integer(0))) + } + # Setup variables + rows <- params$rows + cols <- params$cols + + vars <- c(names(rows), names(cols)) + if (length(vars) == 0) { + data$PANEL <- layout$PANEL + return(data) + } + + margin_vars <- list(intersect(names(rows), names(data)), + intersect(names(cols), names(data))) + + # Add variables + data <- .int$reshape_add_margins(data, margin_vars, params$margins) + facet_vals <- .int$eval_facets(c(rows, cols), data, params$.possible_columns) + + # Only set as missing if it has no variable in that direction + missing_facets <- character(0) + if (!any(names(rows) %in% names(facet_vals))){ + missing_facets <- c(missing_facets, + setdiff(names(rows), names(facet_vals))) + } + if (!any(names(cols) %in% names(facet_vals))){ + missing_facets <- c(missing_facets, + setdiff(names(cols), names(facet_vals))) + } + + # Fill in missing values + if (length(missing_facets) > 0) { + to_add <- unique(layout[missing_facets]) + data_rep <- rep.int(1:nrow(data), nrow(to_add)) + facet_rep <- rep(1:nrow(to_add), each = nrow(data)) + data <- data[data_rep, , drop = FALSE] + rownames(data) <- NULL + facet_vals <- cbind(facet_vals[data_rep, , drop = FALSE], + to_add[facet_rep, , drop = FALSE]) + rownames(facet_vals) <- NULL + } + + # Match columns to facets + if (nrow(facet_vals) == 0) { + data$PANEL <- -1 + } else { + facet_vals[] <- lapply(facet_vals[], as.factor) + facet_vals[] <- lapply(facet_vals[], addNA, ifany = TRUE) + keys <- plyr::join.keys(facet_vals, layout, + by = vars[vars %in% names(facet_vals)]) + data$PANEL <- layout$PANEL[match(keys$x, keys$y)] + } + data + }, + compute_layout = function(data, params) { + rows <- params$rows + cols <- params$cols + dups <- intersect(names(rows), names(cols)) + + if (length(dups) > 0) { + stop("Facetting variables can only appear in row or cols, not both.\n", + "Problems: ", paste0(dups, collapse = "'"), call. = FALSE) + } + + base_rows <- combine_nested_vars(data, params$plot_env, + rows, drop = params$drop) + if (!params$as.table) { + rev_order <- function(x) factor(x, levels = rev(.int$ulevels(x))) + } + base_cols <- combine_nested_vars(data, params$plot_env, cols, + drop = params$drop) + base <- .int$df.grid(base_rows, base_cols) + + if (nrow(base) == 0) { + return(.int$new_data_frame(list(PANEL = 1L, ROW = 1L, COL = 1L, + SCALE_X = 1L, SCALE_Y = 1L))) + } + + base <- .int$reshape_add_margins( + base, list(names(rows), names(cols)), params$margins + ) + base <- unique(base) + + panel <- .int$id(base, drop = TRUE) + panel <- factor(panel, levels = seq_len(attr(panel, "n"))) + + rows <- if (!length(names(rows))) { + rep(1L, length(panel)) + } else { + .int$id(base[names(rows)], drop = TRUE) + } + cols <- if (!length(names(cols))) { + rep(1L, length(panel)) + } else { + .int$id(base[names(cols)], drop = TRUE) + } + + panels <- .int$new_data_frame( + c(list(PANEL = panel, ROW = rows, COL = cols), base) + ) + panels <- panels[order(panels$PANEL), , drop = FALSE] + rownames(panels) <- NULL + panels$SCALE_X <- if (params$free$x) { + panels$COL + } else { + 1L + } + panels$SCALE_Y <- if (params$free$y) { + panels$ROW + } else { + 1L + } + panels + }, + draw_panels = function(panels, layout, x_scales, y_scales, ranges, coord, + data, theme, params) { + panel_table <- FacetGrid$draw_panels(panels, layout, x_scales, y_scales, + ranges, coord, data, theme, params) + + # Setup strips + col_vars <- unique(layout[names(params$cols)]) + row_vars <- unique(layout[names(params$rows)]) + attr(col_vars, "type") <- "cols" + attr(col_vars, "facet") <- "grid" + attr(row_vars, "type") <- "rows" + attr(row_vars, "facet") <- "grid" + + # Build strips + switch_x <- !is.null(params$switch) && params$switch %in% c("both", "x") + switch_y <- !is.null(params$switch) && params$switch %in% c("both", "y") + + # Merging strips + merge_cols <- apply(col_vars, 2, function(x) any(rle(x)$lengths > 1)) + merge_rows <- apply(row_vars, 2, function(x) any(rle(x)$lengths > 1)) + + if (any(merge_cols)) { + if (switch_x) { + panel_table <- merge_strips(panel_table, + col_vars, switch_x, params, theme, "b") + } else { + panel_table <- merge_strips(panel_table, + col_vars, switch_x, params, theme, "t") + } + } + + if (any(merge_rows)) { + if (switch_y) { + panel_table <- merge_strips(panel_table, + row_vars, switch_y, params, theme, "l") + } else { + panel_table <- merge_strips(panel_table, + row_vars, switch_y, params, theme, "r") + } + } + panel_table + } +) + +# Helper functions ----------------------------------------------- + +combine_nested_vars <- function( + data, env = emptyenv(), vars = NULL, drop = TRUE +) { + if (length(vars) == 0) { + return(.int$new_data_frame()) + } + + possible_columns <- unique(unlist(lapply(data, names))) + + values <- .int$compact(lapply(data, .int$eval_facets, facets = vars, + possible_columns = possible_columns)) + has_all <- unlist(lapply(values, length)) == length(vars) + if (!any(has_all)) { + missing <- lapply(values, function(x) setdiff(names(vars), names(x))) + missing_txt <- vapply(missing, .int$var_list, character(1)) + name <- c("Plot", paste0("Layer ", seq_len(length(data) - 1))) + stop("At least one layer must contain all faceting variables: ", + .int$var_list(names(vars)), ".\n", paste0("* ", name, " is missing ", + missing_txt, collapse = "\n"), + call. = FALSE) + } + base <- unique(.int$rbind_dfs(values[has_all])) + if (!drop) { + base <- .int$unique_combs(base) + } + for (value in values[!has_all]) { + if (.int$empty(value)) + next + old <- base[setdiff(names(base), names(value))] + new <- unique(value[intersect(names(base), names(value))]) + if (drop) { + new <- .int$unique_combs(new) + } + old[setdiff(names(base), names(value))] <- rep("", nrow(old)) + base <- rbind(base, .int$df.grid(old, new)) + } + if (.int$empty(base)) { + stop("Facetting variables must have at least one value", + call. = FALSE) + } + base +} + +# New merge strips -------------------------------------------------------- + +merge_strips <- function( + panel_table, vars, switch, params, theme, where = "t" +) { + orient <- if (where %in% c("t", "b")) "x" else "y" + nlevels <- ncol(vars) + + these_strips <- grep(paste0("strip-", where), panel_table$layout$name) + strp_rows <- range(panel_table$layout$t[these_strips]) + strp_cols <- range(panel_table$layout$l[these_strips]) + strp_rows <- seq(strp_rows[1], strp_rows[2]) + strp_cols <- seq(strp_cols[1], strp_cols[2]) + strp <- panel_table[strp_rows, strp_cols] + + # Make empty template + template <- strp + template$grobs <- list() + template$layout <- template$layout[0,] + + # Inflate strips + for (i in seq_along(strp$grobs)) { + sub <- strp$grobs[[i]] + if (where == "b") { + sub$layout$t <- rev(sub$layout$t) + sub$layout$b <- rev(sub$layout$b) + } + n <- length(sub$grobs) + lay <- strp$layout[i,] + lay <- lay[rep(1, n),] + rownames(lay) <- NULL + sub <- lapply(seq_len(n), function(j) { + x <- sub + x$grobs <- x$grobs[j] + x$layout <- x$layout[j,] + x + }) + template <- gtable_add_grob( + template, + sub, t = lay$t, l = lay$l, b = lay$b, r = lay$r, + z = lay$z, clip = lay$clip, name = paste0(lay$name, "-", seq_len(n)) + ) + } + + if (!params$bleed) { + vars[] <- lapply(seq_len(ncol(vars)), function(i) { + do.call(paste0, vars[, seq(i), drop = FALSE]) + }) + } + merge <- apply(vars, 2, function(x) any(rle(x)$lengths > 1)) + + if (where == "r") { + vars <- rev(vars) + merge <- rev(merge) + } + + # Abstract away strips + strip_ids <- strsplit(template$layout$name, "-", fixed = TRUE) + strip_ids <- do.call(rbind, strip_ids) + strip_ids <- strip_ids[,3:ncol(strip_ids)] + mode(strip_ids) <- "integer" + + template$layout$delete <- rep(FALSE, nrow(strip_ids)) + template$layout$aquire <- seq_along(template$grobs) + + for (i in seq_len(nlevels)) { + if (!merge[i]) { + next() + } + ii <- strip_ids[, 2] == i + + # Figure out what to merge + j <- as.numeric(as.factor(vars[, i])) + + ends <- cumsum(rle(j)$lengths) + starts <- c(1, which(diff(j) != 0) + 1) + + # Figure out what strip to remove + seqs <- unlist(Map(seq, from = starts, to = ends)) + delete_this <- seqs[!(seqs %in% starts)] + delete_this <- which(strip_ids[, 1] %in% delete_this & ii) + template$layout$delete[delete_this] <- TRUE + + # Figure out what cells to expand + expand <- seqs[seqs %in% starts] + expand <- which(strip_ids[, 1] %in% expand & ii) + expand_where <- seqs[seqs %in% ends] + expand_where <- which(strip_ids[, 1] %in% expand_where & ii) + template$layout$aquire[expand] <- template$layout$aquire[expand_where] + } + + # Do expansion + if (orient == "x") { + template$layout$r <- template$layout$r[template$layout$aquire] + } else { + template$layout$b <- template$layout$b[template$layout$aquire] + } + + # Do deletion + template$grobs <- template$grobs[!template$layout$delete] + strip_ids <- strip_ids[!template$layout$delete,] + template$layout <- template$layout[!template$layout$delete,] + + # Add nesting indicator + if (params$nest_line) { + active <- unit(c(0, 1), "npc") + c(1, -1) * params$resect + passive <- if (switch) c(1, 1) else c(0, 0) + nindi <- element_render( + theme, "ggh4x.facet.nestline", + x = switch(orient, x = active, y = passive), + y = switch(orient, x = passive, y = active) + ) + i <- which(with(template$layout, t != b | l != r)) + offset <- switch( + orient, + x = vapply(template$grobs, function(grob){grob$layout$t}, numeric(1)), + y = vapply(template$grobs, function(grob){grob$layout$l}, numeric(1)) + ) + offset <- if (where %in% c("r", "b")) offset else nlevels - offset + template$grobs[i] <- lapply(template$grobs[i], function(grb) { + grb <- with(grb$layout, gtable_add_grob( + grb, nindi, t = t, l = l, r = r, b = b, + z = z, + name = "nester", + clip = "off" + )) + }) + template$layout$z <- template$layout$z + offset + } + + # Delete old strips + panel_table <- gtable::gtable_filter(panel_table, paste0("strip-", where), + fixed = TRUE, trim = FALSE, invert = TRUE) + + # Place back new strips + panel_table <- with(template$layout, gtable::gtable_add_grob( + panel_table, + template$grobs, + t = t - 1 + strp_rows[1], + l = l - 1 + strp_cols[1], + b = b - 1 + strp_rows[1], + r = r - 1 + strp_cols[1], + z = z, clip = clip, name = name + )) + panel_table +} \ No newline at end of file diff --git a/CohortDiagnosticsBreastCancer/R/Plots.R b/CohortDiagnosticsBreastCancer/R/Plots.R new file mode 100644 index 00000000..551b2d15 --- /dev/null +++ b/CohortDiagnosticsBreastCancer/R/Plots.R @@ -0,0 +1,553 @@ +plotTimeDistribution <- function(data, + cohortIds = NULL, + databaseIds = NULL, + xAxis = "database") { + + if (is.null(cohortIds) || xAxis != "database" || is.null(databaseIds)) { + warning("Not yet supported. Upcoming feature.") + return(NULL) + } + + # Perform error checks for input variables + errorMessage <- checkmate::makeAssertCollection() + checkmate::assertTibble(x = data, + any.missing = FALSE, + min.rows = 1, + min.cols = 5, + null.ok = FALSE, + add = errorMessage) + checkmate::assertDouble(x = cohortIds, + lower = 1, + upper = 2^53, + any.missing = FALSE, + null.ok = TRUE, + min.len = 1, + add = errorMessage) + checkmate::assertCharacter(x = databaseIds, + any.missing = FALSE, + null.ok = TRUE, + min.len = 1, + unique = TRUE, + add = errorMessage) + checkmate::assertChoice(x = xAxis, + choices = c("database", "cohortId"), + add = errorMessage) + checkmate::assertNames(x = colnames(data), + must.include = c("Min", "P25", "Median", "P75", "Max"), + add = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + + plotData <- data + if (!is.null(cohortIds)) { + plotData <- plotData %>% + dplyr::filter(.data$cohortId %in% !!cohortIds) + } + if (!is.null(databaseIds)) { + plotData <- plotData %>% + dplyr::filter(.data$Database %in% !!databaseIds) + } + + plotData$tooltip <- c(paste0(plotData$shortName, " : ", plotData$cohortName, + "\nDatabase = ", plotData$Database, + "\nMin = ", plotData$Min, + "\nMax = ", plotData$Max, + "\nP25 = ", plotData$P25, + "\nMedian = ", plotData$Median, + "\nP75 = ", plotData$P75, + "\nTime Measure = ", plotData$TimeMeasure, + "\nAverage = ", plotData$Average)) + + plot <- ggplot2::ggplot(data = plotData) + + ggplot2::aes(x = .data$Database, + ymin = .data$Min, + lower = .data$P25, + middle = .data$Median, + upper = .data$P75, + ymax = .data$Max, + group = .data$TimeMeasure, + average = .data$Average) + + ggplot2::geom_errorbar(mapping = ggplot2::aes(ymin = .data$Min, + ymax = .data$Max), size = 0.5) + + ggiraph::geom_boxplot_interactive(ggplot2::aes(tooltip = tooltip), + stat = "identity", + fill = rgb(0, 0, 0.8, alpha = 0.25), + size = 0.2) + + ggplot2::facet_grid(Database+shortName~TimeMeasure, scales = "free") + + ggplot2::coord_flip() + + ggplot2::theme(panel.grid.major.y = ggplot2::element_blank(), + panel.grid.minor.y = ggplot2::element_blank(), + axis.title.y = ggplot2::element_blank(), + axis.ticks.y = ggplot2::element_blank(), + axis.text.y = ggplot2::element_blank(), + strip.text.y.right = ggplot2::element_text(angle = 0)) + plot <- ggiraph::girafe(ggobj = plot, + options = list( + ggiraph::opts_sizing(width = .7), + ggiraph::opts_zoom(max = 5)), + width_svg = 12, + height_svg = 0.7 + 0.5 * length(databaseIds)) + return(plot) +} +# how to render using pure plot ly. Plotly does not prefer precomputed data. +# TO DO: color and plot positions are not consistent yet. +# plot <- plotly::plot_ly(data = plotData, +# type = "box", +# median = plotData$P25, +# #Mean = plotData$Average, +# upperfence = plotData$Max, +# lowerfence = plotData$Min, +# split = plotData$TimeMeasure) +# loop thru database or cohorts as needed +# then subplot +# plot <- plotly::subplot(plots,nrows = length(input$databases),margin = 0.05) + + +plotIncidenceRate <- function(data, + cohortIds = NULL, + databaseIds = NULL, + stratifyByAgeGroup = TRUE, + stratifyByGender = TRUE, + stratifyByCalendarYear = TRUE, + yscaleFixed = FALSE) { + if (nrow(data) == 0) { + ParallelLogger::logWarn("Record counts are too low to plot.") + } + errorMessage <- checkmate::makeAssertCollection() + checkmate::assertTibble(x = data, + any.missing = TRUE, + min.rows = 1, + min.cols = 5, + null.ok = FALSE, + add = errorMessage) + checkmate::assertDouble(x = cohortIds, + lower = 1, + upper = 2^53, + any.missing = FALSE, + null.ok = TRUE, + min.len = 1, + add = errorMessage) + checkmate::assertCharacter(x = databaseIds, + any.missing = FALSE, + null.ok = TRUE, + min.len = 1, + unique = TRUE, + add = errorMessage) + checkmate::assertLogical(x = stratifyByAgeGroup, + any.missing = FALSE, + min.len = 1, + max.len = 1, + null.ok = FALSE, + add = errorMessage) + checkmate::assertLogical(x = stratifyByGender, + any.missing = FALSE, + min.len = 1, + max.len = 1, + null.ok = FALSE, + add = errorMessage) + checkmate::assertLogical(x = stratifyByCalendarYear, + any.missing = FALSE, + min.len = 1, + max.len = 1, + null.ok = FALSE, + add = errorMessage) + checkmate::assertLogical(x = yscaleFixed, + any.missing = FALSE, + min.len = 1, + max.len = 1, + null.ok = FALSE, + add = errorMessage) + checkmate::assertDouble(x = data$incidenceRate, + lower = 0, + any.missing = FALSE, + null.ok = FALSE, + min.len = 1, + add = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + checkmate::assertDouble(x = data$incidenceRate, + lower = 0, + any.missing = FALSE, + null.ok = FALSE, + min.len = 1, + add = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + + plotData <- data %>% + dplyr::mutate(incidenceRate = round(.data$incidenceRate, digits = 3)) + if (!is.null(cohortIds)) { + plotData <- plotData %>% + dplyr::filter(.data$cohortId %in% !!cohortIds) + } + if (!is.null(databaseIds)) { + plotData <- plotData %>% + dplyr::filter(.data$databaseId %in% !!databaseIds) + } + plotData <- plotData %>% + dplyr::mutate(strataGender = !is.na(.data$gender), + strataAgeGroup = !is.na(.data$ageGroup), + strataCalendarYear = !is.na(.data$calendarYear)) %>% + dplyr::filter(.data$strataGender %in% !!stratifyByGender & + .data$strataAgeGroup %in% !!stratifyByAgeGroup & + .data$strataCalendarYear %in% !!stratifyByCalendarYear) %>% + dplyr::select(-dplyr::starts_with("strata")) + + aesthetics <- list(y = "incidenceRate") + if (stratifyByCalendarYear) { + aesthetics$x <- "calendarYear" + xLabel <- "Calender year" + showX <- TRUE + if (stratifyByGender) { + aesthetics$group <- "gender" + aesthetics$color <- "gender" + } + plotType <- "line" + } else { + xLabel <- "" + if (stratifyByGender) { + aesthetics$x <- "gender" + aesthetics$color <- "gender" + aesthetics$fill <- "gender" + showX <- TRUE + } else if (stratifyByAgeGroup) { + aesthetics$x <- "ageGroup" + showX <- TRUE + } + else{ + aesthetics$x <- "cohortId" + showX <- FALSE + } + plotType <- "bar" + } + + newSort <- plotData %>% + dplyr::select(.data$ageGroup) %>% + dplyr::distinct() %>% + dplyr::arrange(as.integer(sub(pattern = '-.+$','',x = .data$ageGroup))) + + plotData <- plotData %>% + dplyr::arrange(ageGroup = factor(.data$ageGroup, levels = newSort$ageGroup), .data$ageGroup) + + plotData$ageGroup <- factor(plotData$ageGroup, + levels = newSort$ageGroup) + plotData$tooltip <- c(paste0(plotData$shortName, ":", plotData$cohortName,"\n","Incidence Rate = ", scales::comma(plotData$incidenceRate, accuracy = 0.01), + "\nDatabase = ", plotData$databaseId, + "\nPerson years = ", scales::comma(plotData$personYears, accuracy = 0.1), + "\nCohort count = ", scales::comma(plotData$cohortCount))) + + if (stratifyByAgeGroup) { + plotData$tooltip <- c(paste0(plotData$tooltip, "\nAge Group = ", plotData$ageGroup)) + } + + if (stratifyByGender) { + plotData$tooltip <- c(paste0(plotData$tooltip, "\nGender = ", plotData$gender)) + } + + if (stratifyByCalendarYear) { + plotData$tooltip <- c(paste0(plotData$tooltip, "\nYear = ", plotData$calendarYear)) + } + + + plot <- ggplot2::ggplot(data = plotData, + do.call(ggplot2::aes_string, aesthetics)) + + ggplot2::xlab(xLabel) + + ggplot2::ylab("Incidence Rate (/1,000 person years)") + + ggplot2::theme(legend.position = "top", + legend.title = ggplot2::element_blank(), + axis.text.x = if (showX) ggplot2::element_text(angle = 90, vjust = 0.5) else ggplot2::element_blank() ) + + if (plotType == "line") { + plot <- plot + + ggiraph::geom_line_interactive(ggplot2::aes(), size = 1, alpha = 0.6) + + ggiraph::geom_point_interactive(ggplot2::aes(tooltip = tooltip), size = 2, alpha = 0.6) + } else { + plot <- plot + ggplot2::geom_bar(stat = "identity") + + ggiraph::geom_col_interactive( ggplot2::aes(tooltip = tooltip), size = 1) + } + + # databaseId field only present when called in Shiny app: + if (!is.null(data$databaseId) && length(data$databaseId) > 1) { + if (yscaleFixed) { + scales <- "fixed" + } else { + scales <- "free_y" + } + if (stratifyByGender | stratifyByCalendarYear) { + if (stratifyByAgeGroup) { + plot <- plot + facet_nested(databaseId + shortName ~ plotData$ageGroup, scales = scales) + } else { + plot <- plot + facet_nested(databaseId + shortName ~ ., scales = scales) + } + } else { + plot <- plot + facet_nested(databaseId + shortName ~., scales = scales) + } + spacing <- rep(c(1, rep(0.5, length(unique(plotData$shortName)) - 1)), length(unique(plotData$databaseId)))[-1] + plot <- plot + ggplot2::theme(panel.spacing.y = ggplot2::unit(spacing, "lines"), + strip.background = ggplot2::element_blank()) + } else { + if (stratifyByAgeGroup) { + plot <- plot + ggplot2::facet_grid(~ageGroup) + } + } + plot <- ggiraph::girafe(ggobj = plot, + options = list( + ggiraph::opts_sizing(width = .7), + ggiraph::opts_zoom(max = 5)), + width_svg = 15, + height_svg = 1.5 + 2*length(unique(data$databaseId))) + return(plot) +} + +plotCohortComparisonStandardizedDifference <- function(balance, + domain = "all", + targetLabel = "Mean Target", + comparatorLabel = "Mean Comparator") { + domains <- c("condition", "device", "drug", "measurement", "observation", "procedure") + balance$domain <- tolower(stringr::str_extract(balance$covariateName, "[a-z]+")) + balance$domain[!balance$domain %in% domains] <- "other" + + if (domain != "all") { + balance <- balance %>% + dplyr::filter(.data$domain == !!domain) + } + + # Can't make sense of plot with > 1000 dots anyway, so remove anything with small mean in both target and comparator: + if (nrow(balance) > 1000) { + balance <- balance %>% + dplyr::filter(.data$mean1 > 0.01 | .data$mean2 > 0.01) + } + + # ggiraph::geom_point_interactive(ggplot2::aes(tooltip = tooltip), size = 3, alpha = 0.6) + balance$tooltip <- c(paste("Covariate Name:", balance$covariateName, + "\nDomain: ", balance$domain, + "\nMean Target: ", scales::comma(balance$mean1, accuracy = 0.1), + "\nMean Comparator:", scales::comma(balance$mean2, accuracy = 0.1), + "\nStd diff.:", scales::comma(balance$stdDiff, accuracy = 0.1))) + + # Code used to generate palette: + # writeLines(paste(RColorBrewer::brewer.pal(n = length(domains), name = "Dark2"), collapse = "\", \"")) + + # Make sure colors are consistent, no matter which domains are included: + colors <- c("#1B9E77", "#D95F02", "#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#444444") + colors <- colors[c(domains, "other") %in% unique(balance$domain)] + + balance$domain <- factor(balance$domain, levels = c(domains, "other")) + + # targetLabel <- paste(strwrap(targetLabel, width = 50), collapse = "\n") + # comparatorLabel <- paste(strwrap(comparatorLabel, width = 50), collapse = "\n") + + plot <- ggplot2::ggplot(balance, ggplot2::aes(x = .data$mean1, y = .data$mean2, color = .data$domain)) + + ggiraph::geom_point_interactive(ggplot2::aes(tooltip = .data$tooltip), size = 3,shape = 16, alpha = 0.5) + + ggplot2::geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + ggplot2::geom_hline(yintercept = 0) + + ggplot2::geom_vline(xintercept = 0) + + ggplot2::scale_x_continuous("MEAN") + + ggplot2::scale_y_continuous("MEAN") + + ggplot2::scale_color_manual("Domain", values = colors) + + ggplot2::facet_grid(targetCohortShortName ~ databaseId + comparatorCohortShortName) + + plot <- ggiraph::girafe(ggobj = plot, + options = list( + ggiraph::opts_sizing(width = .7), + ggiraph::opts_zoom(max = 5)),width_svg = 12, + height_svg = 5) + return(plot) +} + + +plotCohortOverlapVennDiagram <- function(data, + targetCohortIds, + comparatorCohortIds, + databaseIds) { + + # Perform error checks for input variables + errorMessage <- checkmate::makeAssertCollection() + checkmate::assertTibble(x = data, + any.missing = FALSE, + min.rows = 1, + min.cols = 5, + null.ok = FALSE, + add = errorMessage) + checkmate::assertDouble(x = targetCohortIds, + lower = 1, + upper = 2^53, + any.missing = FALSE, + null.ok = FALSE) + checkmate::assertDouble(x = comparatorCohortIds, + lower = 1, + upper = 2^53, + any.missing = FALSE, + null.ok = FALSE) + checkmate::assertCharacter(x = databaseIds, + any.missing = FALSE, + min.len = 1, + null.ok = TRUE + ) + checkmate::reportAssertions(collection = errorMessage) + + plot <- VennDiagram::draw.pairwise.venn(area1 = abs(data$eitherSubjects) - abs(data$cOnlySubjects), + area2 = abs(data$eitherSubjects) - abs(data$tOnlySubjects), + cross.area = abs(data$bothSubjects), + category = c("Target", "Comparator"), + col = c(rgb(0.8, 0, 0), rgb(0, 0, 0.8)), + fill = c(rgb(0.8, 0, 0), rgb(0, 0, 0.8)), + alpha = 0.2, + fontfamily = rep("sans", 3), + cat.fontfamily = rep("sans", 2), + margin = 0.01, + ind = FALSE) + # Borrowed from https://stackoverflow.com/questions/37239128/how-to-put-comma-in-large-number-of-venndiagram + idx <- sapply(plot, function(i) grepl("text", i$name)) + for (i in 1:3) { + plot[idx][[i]]$label <- format(as.numeric(plot[idx][[i]]$label), + big.mark = ",", + scientific = FALSE) + } + grid::grid.draw(plot) + + return(plot) +} + +plotCohortOverlap <- function(data, + yAxis = "Percentages") { + + # Perform error checks for input variables + errorMessage <- checkmate::makeAssertCollection() + checkmate::assertTibble(x = data, + any.missing = FALSE, + min.rows = 1, + min.cols = 6, + null.ok = FALSE, + add = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + checkmate::assertNames(x = colnames(data), + must.include = c("databaseId", + "targetCohortId", + "comparatorCohortId", + "tOnlySubjects", + "cOnlySubjects", + "bothSubjects"), + add = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + + + + + + plotData <- data %>% + dplyr::mutate(absTOnlySubjects = abs(.data$tOnlySubjects), + absCOnlySubjects = abs(.data$cOnlySubjects), + absBothSubjects = abs(.data$bothSubjects), + absEitherSubjects = abs(.data$eitherSubjects), + signTOnlySubjects = dplyr::case_when(.data$tOnlySubjects < 0 ~ '<', TRUE ~ ''), + signCOnlySubjects = dplyr::case_when(.data$cOnlySubjects < 0 ~ '<', TRUE ~ ''), + signBothSubjects = dplyr::case_when(.data$bothSubjects < 0 ~ '<', TRUE ~ '')) %>% + dplyr::mutate(tOnlyString = paste0(.data$signTOnlySubjects, + scales::comma(.data$absTOnlySubjects), + " (", + .data$signTOnlySubjects, + scales::percent(.data$absTOnlySubjects/.data$absEitherSubjects, + accuracy = 1), + ")"), + cOnlyString = paste0(.data$signCOnlySubjects, + scales::comma(.data$absCOnlySubjects), + " (", + .data$signCOnlySubjects, + scales::percent(.data$absCOnlySubjects/.data$absEitherSubjects, + accuracy = 1), + ")"), + bothString = paste0(.data$signBothSubjects, + scales::comma(.data$absBothSubjects), + " (", + .data$signBothSubjects, + scales::percent(.data$absBothSubjects/.data$absEitherSubjects, + accuracy = 1), + ")")) %>% + dplyr::mutate(tooltip = paste0("Database: ", .data$databaseId, + "\n", .data$targetShortName, ": ", .data$targetCohortName, + "\n", .data$comparatorShortName, ": ", .data$comparatorCohortName, + "\n", .data$targetShortName, " only: ", .data$tOnlyString, + "\n", .data$comparatorShortName, " only: ", .data$cOnlyString, + "\nBoth: ", .data$bothString)) %>% + dplyr::select(.data$targetShortName, + .data$comparatorShortName, + .data$databaseId, + .data$absTOnlySubjects, + .data$absCOnlySubjects, + .data$absBothSubjects, + .data$tooltip) %>% + tidyr::pivot_longer(cols = c("absTOnlySubjects", + "absCOnlySubjects", + "absBothSubjects"), + names_to = "subjectsIn", + values_to = "value") %>% + dplyr::mutate(subjectsIn = camelCaseToTitleCase(stringr::str_replace_all(string = .data$subjectsIn, + pattern = "abs|Subjects", + replacement = ""))) + + plotData$subjectsIn <- factor(plotData$subjectsIn, levels = c(" T Only", " Both", " C Only")) + if (yAxis == "Percentages") { + position = "fill" + } else { + position = "stack" + } + + plot <- ggplot2::ggplot(data = plotData) + + ggplot2::aes(fill = .data$subjectsIn, + y = .data$value, + x = .data$comparatorShortName, + tooltip = .data$tooltip, + group = .data$subjectsIn) + + ggplot2::ylab(label = "") + + ggplot2::xlab(label = "") + + ggplot2::scale_fill_manual("Subjects in", values = c(rgb(0.8, 0.2, 0.2), rgb(0.3, 0.2, 0.4), rgb(0.4, 0.4, 0.9))) + + ggplot2::facet_grid(.data$databaseId ~ .data$targetShortName, drop = FALSE) + + ggiraph::geom_bar_interactive(position = position, alpha = 0.6, stat = "identity") + if (yAxis == "Percentages") { + plot <- plot + ggplot2::scale_y_continuous(labels = scales::percent) + } else { + plot <- plot + ggplot2::scale_y_continuous(labels = scales::comma) + } + width <- 1.5 + 1*length(unique(plotData$databaseId)) + height <- 1.5 + 1*length(unique(plotData$targetShortName)) + aspectRatio <- width / height + plot <- ggiraph::girafe(ggobj = plot, + options = list( + ggiraph::opts_sizing(width = .7), + ggiraph::opts_zoom(max = 5)), + width_svg = 6 * aspectRatio, + height_svg = 6) + + return(plot) +} +# Future function getCohortOverlapHistogram: +# 1. https://stackoverflow.com/questions/20184096/how-to-plot-multiple-stacked-histograms-together-in-r +# 2. https://stackoverflow.com/questions/43415709/how-to-use-facet-grid-with-geom-histogram +# 3. https://www.datacamp.com/community/tutorials/facets-ggplot-r?utm_source=adwords_ppc&utm_campaignid=1455363063&utm_adgroupid=65083631748&utm_device=c&utm_keyword=&utm_matchtype=b&utm_network=g&utm_adpostion=&utm_creative=332602034361&utm_targetid=dsa-429603003980&utm_loc_interest_ms=&utm_loc_physical_ms=1007768&gclid=CjwKCAjw19z6BRAYEiwAmo64LQMUJwf1i0V-Zgc5hYhpDOFQeZU05reAJmQvo2-mClFWWM4_sJiSmBoC-YkQAvD_BwE +# 4. https://stackoverflow.com/questions/24123499/frequency-histograms-with-facets-calculating-percent-by-groups-used-in-facet-i +# 5. https://stackoverflow.com/questions/62821480/add-a-trace-to-every-facet-of-a-plotly-figure + +# ComparatorOnlySubjs <- generateHistogramValues(len = seq(1:nrow(data)), val = data$cOnlySubjects) +# bothSubjs <- generateHistogramValues(seq(1:nrow(data)), data$bothSubjects) +# cohortOnlySubjs <- generateHistogramValues(seq(1:nrow(data)), data$tOnlySubjects) +# bucket <- list(ComparatorOnlySubjs = ComparatorOnlySubjs, bothSubjs = bothSubjs, cohortOnlySubjs = cohortOnlySubjs) +# +# +# p <- ggplot2::ggplot(reshape::melt(bucket), ggplot2::aes(value, fill = L1)) + +# ggplot2::xlab(label = "Comparators") + +# ggplot2::geom_histogram(position = "stack", binwidth = 1) + +# ggplot2::xlim(c(0,max(length(comparatorCohortIds()),10))) + +# ggplot2::facet_grid(rows = ggplot2::vars(data$targetCohortId), +# cols = ggplot2::vars(data$databaseId), scales = "free_y") +# plot <- plotly::ggplotly(p) +# GENERATE HISTOGRAM FUNCTION +# generateHistogramValues <- function(len,val) +# { +# fillVal <- c() +# +# inc <- 1 +# for (i in len) +# { +# fillVal <- c(fillVal,rep(i,val[[i]])) +# } +# return(fillVal); +# } + diff --git a/CohortDiagnosticsBreastCancer/R/Results.R b/CohortDiagnosticsBreastCancer/R/Results.R new file mode 100644 index 00000000..a93a425f --- /dev/null +++ b/CohortDiagnosticsBreastCancer/R/Results.R @@ -0,0 +1,645 @@ +createDatabaseDataSource <- function(connection, resultsDatabaseSchema, vocabularyDatabaseSchema = resultsDatabaseSchema) { + return(list(connection = connectionPool, + resultsDatabaseSchema = resultsDatabaseSchema, + vocabularyDatabaseSchema = vocabularyDatabaseSchema)) +} + +createFileDataSource <- function(premergedDataFile, envir = new.env()) { + load(premergedDataFile, envir = envir) + return(envir) +} + + +renderTranslateQuerySql <- function(connection, sql, ..., snakeCaseToCamelCase = FALSE) { + if (is(connection, "Pool")) { + # Connection pool is used by Shiny app, which always uses PostgreSQL: + sql <- SqlRender::render(sql, ...) + sql <- SqlRender::translate(sql, targetDialect = "postgresql") + + tryCatch({ + data <- DatabaseConnector::dbGetQuery(connection, sql) + }, error = function(err) { + writeLines(sql) + stop(err) + }) + if (snakeCaseToCamelCase) { + colnames(data) <- SqlRender::snakeCaseToCamelCase(colnames(data)) + } + return(data) + } else { + return(DatabaseConnector::renderTranslateQuerySql(connection = connection, + sql = sql, + ..., + snakeCaseToCamelCase = snakeCaseToCamelCase)) + } +} + +quoteLiterals <- function(x) { + if (is.null(x)) { + return("") + } else { + return(paste0("'", paste(x, collapse = "', '"), "'")) + } +} + +getCohortCountResult <- function(dataSource = .GlobalEnv, + cohortIds = NULL, + databaseIds) { + if (is(dataSource, "environment")) { + data <- get("cohortCount", envir = dataSource) %>% + dplyr::filter(.data$databaseId %in% !!databaseIds) + if (!is.null(cohortIds)) { + data <- data %>% + dplyr::filter(.data$cohortId %in% !!cohortIds) + } + } else { + sql <- "SELECT * + FROM @results_database_schema.cohort_count + WHERE database_id in (@database_id) + {@cohort_ids != ''} ? { AND cohort_id in (@cohort_ids)} + ;" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + cohort_ids = cohortIds, + database_id = quoteLiterals(databaseIds), + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + return(data) +} + +getTimeDistributionResult <- function(dataSource = .GlobalEnv, + cohortIds, + databaseIds) { + if (is(dataSource, "environment")) { + data <- get("timeDistribution", envir = dataSource) %>% + dplyr::filter(.data$cohortId %in% !!cohortIds & + .data$databaseId %in% !!databaseIds) + } else { + sql <- "SELECT * + FROM @results_database_schema.time_distribution + WHERE cohort_id in (@cohort_ids) + AND database_id in (@database_ids);" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + cohort_ids = cohortIds, + database_ids = quoteLiterals(databaseIds), + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + shortNames <- data %>% + dplyr::inner_join(cohort) %>% + dplyr::distinct(.data$cohortId, .data$cohortName) %>% + dplyr::arrange(.data$cohortName) %>% + dplyr::mutate(shortName = paste0('C', dplyr::row_number())) + + + data <- data %>% + dplyr::inner_join(shortNames, by = "cohortId") + + data <- data %>% + dplyr::rename(Database = "databaseId", + TimeMeasure = "timeMetric", + Average = "averageValue", + SD = "standardDeviation", + Min = "minValue", + P10 = "p10Value", + P25 = "p25Value", + Median = "medianValue", + P75 = "p75Value", + P90 = "p90Value", + Max = "maxValue") %>% + dplyr::relocate(.data$cohortId, .data$Database, .data$TimeMeasure) %>% + dplyr::arrange(.data$cohortId, .data$Database, .data$TimeMeasure) + return(data) +} + + +getIncidenceRateResult <- function(dataSource = .GlobalEnv, + cohortIds, + databaseIds, + stratifyByGender = c(TRUE,FALSE), + stratifyByAgeGroup = c(TRUE,FALSE), + stratifyByCalendarYear = c(TRUE,FALSE), + minPersonYears = 1000) { + # Perform error checks for input variables + errorMessage <- checkmate::makeAssertCollection() + errorMessage <- checkErrorCohortIdsDatabaseIds(cohortIds = cohortIds, + databaseIds = databaseIds, + errorMessage = errorMessage) + checkmate::assertLogical(x = stratifyByGender, + add = errorMessage, + min.len = 1, + max.len = 2, + unique = TRUE) + checkmate::assertLogical(x = stratifyByAgeGroup, + add = errorMessage, + min.len = 1, + max.len = 2, + unique = TRUE) + checkmate::assertLogical(x = stratifyByCalendarYear, + add = errorMessage, + min.len = 1, + max.len = 2, + unique = TRUE) + checkmate::reportAssertions(collection = errorMessage) + + if (is(dataSource, "environment")) { + data <- get("incidenceRate", envir = dataSource) %>% + dplyr::mutate(strataGender = !is.na(.data$gender), + strataAgeGroup = !is.na(.data$ageGroup), + strataCalendarYear = !is.na(.data$calendarYear)) %>% + dplyr::filter(.data$cohortId %in% !!cohortIds & + .data$databaseId %in% !!databaseIds & + .data$strataGender %in% !!stratifyByGender & + .data$strataAgeGroup %in% !!stratifyByAgeGroup & + .data$strataCalendarYear %in% !!stratifyByCalendarYear & + .data$personYears > !!minPersonYears) %>% + dplyr::select(-tidyselect::starts_with('strata')) + } else { + sql <- "SELECT * + FROM @results_database_schema.incidence_rate + WHERE cohort_id in (@cohort_ids) + AND database_id in (@database_ids) + {@gender == TRUE} ? {AND gender != ''} : { AND gender = ''} + {@age_group == TRUE} ? {AND age_group != ''} : { AND age_group = ''} + {@calendar_year == TRUE} ? {AND calendar_year != ''} : { AND calendar_year = ''} + AND person_years > @personYears;" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + cohort_ids = cohortIds, + database_ids = quoteLiterals(databaseIds), + gender = stratifyByGender, + age_group = stratifyByAgeGroup, + calendar_year = stratifyByCalendarYear, + personYears = minPersonYears, + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + data <- data %>% + dplyr::mutate(gender = dplyr::na_if(.data$gender, ""), + ageGroup = dplyr::na_if(.data$ageGroup, ""), + calendarYear = dplyr::na_if(.data$calendarYear, "")) + } + shortNames <- data %>% + dplyr::inner_join(cohort) %>% + dplyr::distinct(.data$cohortId, .data$cohortName) %>% + dplyr::arrange(.data$cohortName) %>% + dplyr::mutate(shortName = paste0('C', dplyr::row_number())) + + + data <- data %>% + dplyr::inner_join(shortNames, by = "cohortId") + + return(data %>% + dplyr::mutate(calendarYear = as.integer(.data$calendarYear)) %>% + dplyr::arrange(.data$cohortId, .data$databaseId)) +} + +getInclusionRuleStats <- function(dataSource = .GlobalEnv, + cohortIds = NULL, + databaseIds) { + if (is(dataSource, "environment")) { + data <- get("inclusionRuleStats", envir = dataSource) %>% + dplyr::filter(.data$databaseId %in% !!databaseIds) + if (!is.null(cohortIds)) { + data <- data %>% + dplyr::filter(.data$cohortId %in% !!cohortIds) + } + } else { + sql <- "SELECT * + FROM @resultsDatabaseSchema.inclusion_rule_stats + WHERE database_id in (@database_id) + {@cohort_ids != ''} ? { AND cohort_id in (@cohort_ids)} + ;" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + resultsDatabaseSchema = dataSource$resultsDatabaseSchema, + cohort_ids = cohortIds, + database_id = quoteLiterals(databaseIds), + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + data <- data %>% + dplyr::select(.data$ruleSequenceId, .data$ruleName, + .data$meetSubjects, .data$gainSubjects, + .data$remainSubjects, .data$totalSubjects, .data$databaseId) %>% + dplyr::arrange(.data$ruleSequenceId) + return(data) +} + + +getIndexEventBreakdown <- function(dataSource = .GlobalEnv, + cohortIds, + databaseIds) { + errorMessage <- checkmate::makeAssertCollection() + errorMessage <- checkErrorCohortIdsDatabaseIds(cohortIds = cohortIds, + databaseIds = databaseIds, + errorMessage = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + + if (is(dataSource, "environment")) { + data <- get("indexEventBreakdown", envir = dataSource) %>% + dplyr::filter(.data$databaseId %in% !!databaseIds) + if (!is.null(cohortIds)) { + data <- data %>% + dplyr::filter(.data$cohortId %in% !!cohortIds) + } + data <- data %>% + dplyr::inner_join(dplyr::select(get("concept", envir = dataSource), + .data$conceptId, + .data$conceptName), + by = c("conceptId")) + } else { + sql <- "SELECT index_event_breakdown.*, + standard_concept.concept_name AS concept_name + FROM @results_database_schema.index_event_breakdown + INNER JOIN @vocabulary_database_schema.concept standard_concept + ON index_event_breakdown.concept_id = standard_concept.concept_id + WHERE database_id in (@database_id) + AND cohort_id in (@cohort_ids);" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + vocabulary_database_schema = dataSource$vocabularyDatabaseSchema, + cohort_ids = cohortIds, + database_id = quoteLiterals(databaseIds), + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + return(data) +} + +getVisitContextResults <- function(dataSource = .GlobalEnv, + cohortIds, + databaseIds) { + errorMessage <- checkmate::makeAssertCollection() + errorMessage <- checkErrorCohortIdsDatabaseIds(cohortIds = cohortIds, + databaseIds = databaseIds, + errorMessage = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + + if (is(dataSource, "environment")) { + data <- get("visitContext", envir = dataSource) %>% + dplyr::filter(.data$databaseId %in% !!databaseIds) + if (!is.null(cohortIds)) { + data <- data %>% + dplyr::filter(.data$cohortId %in% !!cohortIds) + } + data <- data %>% + dplyr::inner_join(dplyr::select(get("concept", envir = dataSource), + visitConceptId = .data$conceptId, + visitConceptName = .data$conceptName), + by = c("visitConceptId")) + } else { + sql <- "SELECT visit_context.*, + standard_concept.concept_name AS visit_concept_name + FROM @results_database_schema.visit_context + INNER JOIN @vocabulary_database_schema.concept standard_concept + ON visit_context.visit_concept_id = standard_concept.concept_id + WHERE database_id in (@database_id) + AND cohort_id in (@cohort_ids);" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + vocabulary_database_schema = dataSource$vocabularyDatabaseSchema, + cohort_ids = cohortIds, + database_id = quoteLiterals(databaseIds), + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + return(data) +} + +getIncludedConceptResult <- function(dataSource = .GlobalEnv, + cohortId, + databaseIds) { + if (is(dataSource, "environment")) { + data <- get("includedSourceConcept", envir = dataSource) %>% + dplyr::filter(.data$cohortId == !!cohortId & + .data$databaseId %in% !!databaseIds) %>% + dplyr::inner_join(dplyr::select(get("conceptSets", envir = dataSource), + .data$cohortId, + .data$conceptSetId, + .data$conceptSetName), + by = c("cohortId", "conceptSetId")) %>% + dplyr::inner_join(dplyr::select(get("concept", envir = dataSource), + sourceConceptId = .data$conceptId, + sourceConceptName = .data$conceptName, + sourceVocabularyId = .data$vocabularyId, + sourceConceptCode = .data$conceptCode), + by = c("sourceConceptId")) %>% + dplyr::inner_join(dplyr::select(get("concept", envir = dataSource), + .data$conceptId, + .data$conceptName, + .data$vocabularyId), + by = c("conceptId")) + } else { + sql <- "SELECT included_source_concept.*, + concept_set_name, + source_concept.concept_name AS source_concept_name, + source_concept.vocabulary_id AS source_vocabulary_id, + source_concept.concept_code AS source_concept_code, + standard_concept.concept_name AS concept_name, + standard_concept.vocabulary_id AS vocabulary_id + FROM @results_database_schema.included_source_concept + INNER JOIN @results_database_schema.concept_sets + ON included_source_concept.cohort_id = concept_sets.cohort_id + AND included_source_concept.concept_set_id = concept_sets.concept_set_id + INNER JOIN @vocabulary_database_schema.concept source_concept + ON included_source_concept.source_concept_id = source_concept.concept_id + INNER JOIN @vocabulary_database_schema.concept standard_concept + ON included_source_concept.concept_id = standard_concept.concept_id + WHERE included_source_concept.cohort_id = @cohort_id + AND database_id in (@database_ids);" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + vocabulary_database_schema = dataSource$vocabularyDatabaseSchema, + cohort_id = cohortId, + database_ids = quoteLiterals(databaseIds), + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + + return(data) +} + +getOrphanConceptResult <- function(dataSource = .GlobalEnv, + cohortId, + databaseIds) { + if (is(dataSource, "environment")) { + data <- get("orphanConcept", envir = dataSource) %>% + dplyr::filter(.data$cohortId == !!cohortId & + .data$databaseId %in% !!databaseIds) %>% + dplyr::inner_join(dplyr::select(get("conceptSets", envir = dataSource), + .data$cohortId, + .data$conceptSetId, + .data$conceptSetName), + by = c("cohortId", "conceptSetId")) %>% + dplyr::inner_join(dplyr::select(get("concept", envir = dataSource), + .data$conceptId, + .data$conceptName, + .data$vocabularyId, + .data$conceptCode), + by = c("conceptId")) + } else { + sql <- "SELECT orphan_concept.*, + concept_set_name, + standard_concept.concept_name AS concept_name, + standard_concept.vocabulary_id AS vocabulary_id, + standard_concept.concept_code AS concept_code + FROM @results_database_schema.orphan_concept + INNER JOIN @results_database_schema.concept_sets + ON orphan_concept.cohort_id = concept_sets.cohort_id + AND orphan_concept.concept_set_id = concept_sets.concept_set_id + INNER JOIN @vocabulary_database_schema.concept standard_concept + ON orphan_concept.concept_id = standard_concept.concept_id + WHERE orphan_concept.cohort_id = @cohort_id + AND database_id in (@database_ids);" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + vocabulary_database_schema = dataSource$vocabularyDatabaseSchema, + cohort_id = cohortId, + database_ids = quoteLiterals(databaseIds), + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + + return(data) +} + + +getCohortOverlapResult <- function(dataSource = .GlobalEnv, + targetCohortIds, + comparatorCohortIds, + databaseIds) { + errorMessage <- checkmate::makeAssertCollection() + errorMessage <- checkErrorCohortIdsDatabaseIds(cohortIds = targetCohortIds, + databaseIds = databaseIds, + errorMessage = errorMessage) + errorMessage <- checkErrorCohortIdsDatabaseIds(cohortIds = comparatorCohortIds, + databaseIds = databaseIds, + errorMessage = errorMessage) + + if (is(dataSource, "environment")) { + data <- get("cohortOverlap", envir = dataSource) %>% + dplyr::filter(.data$targetCohortId %in% !!targetCohortIds & + .data$comparatorCohortId %in% !!comparatorCohortIds & + .data$databaseId %in% !!databaseIds) %>% + dplyr::inner_join(dplyr::select(get("cohort", envir = dataSource), + targetCohortId = .data$cohortId, + targetCohortName = .data$cohortName), + by = "targetCohortId") %>% + dplyr::inner_join(dplyr::select(get("cohort", envir = dataSource), + comparatorCohortId = .data$cohortId, + comparatorCohortName = .data$cohortName), + by = "comparatorCohortId") + } else { + sql <- "SELECT cohort_overlap.*, + target_cohort.cohort_name AS target_cohort_name, + comparator_cohort.cohort_name AS comparator_cohort_name + FROM @results_database_schema.cohort_overlap + INNER JOIN @results_database_schema.cohort target_cohort + ON cohort_overlap.target_cohort_id = target_cohort.cohort_id + INNER JOIN @results_database_schema.cohort comparator_cohort + ON cohort_overlap.comparator_cohort_id = comparator_cohort.cohort_id + WHERE target_cohort_id in (@targetCohortId) + AND comparator_cohort_id in (@comparatorCohortId) + AND database_id in (@databaseId);" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + targetCohortId = targetCohortIds, + comparatorCohortId = comparatorCohortIds, + databaseId = quoteLiterals(databaseIds), + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + + if (nrow(data) == 0) { + return(tidyr::tibble()) + } + targetShortNames <- data %>% + dplyr::distinct(.data$targetCohortId, .data$targetCohortName) %>% + dplyr::arrange(.data$targetCohortName) %>% + dplyr::select(-.data$targetCohortName) %>% + dplyr::mutate(targetShortName = paste0('C', dplyr::row_number())) + + comparatorShortNames <- data %>% + dplyr::distinct(.data$comparatorCohortId, .data$comparatorCohortName) %>% + dplyr::arrange(.data$comparatorCohortName) %>% + dplyr::select(-.data$comparatorCohortName) %>% + dplyr::mutate(comparatorShortName = paste0('C', dplyr::row_number())) + + data <- data %>% + dplyr::inner_join(targetShortNames, by = "targetCohortId") %>% + dplyr::inner_join(comparatorShortNames, by = "comparatorCohortId") + return(data) +} + +getCovariateValueResult <- function(dataSource = .GlobalEnv, + cohortIds, + analysisIds = NULL, + databaseIds, + timeIds = NULL, + isTemporal = FALSE) { + + # Perform error checks for input variables + errorMessage <- checkmate::makeAssertCollection() + checkmate::assertLogical(x = isTemporal, + any.missing = FALSE, + min.len = 1, + max.len = 1, + add = errorMessage) + errorMessage <- checkErrorCohortIdsDatabaseIds(cohortIds = cohortIds, + databaseIds = databaseIds, + errorMessage = errorMessage) + if (isTemporal) { + checkmate::assertIntegerish(x = timeIds, + lower = 0, + any.missing = FALSE, + unique = TRUE, + null.ok = TRUE, + add = errorMessage) + } + checkmate::reportAssertions(collection = errorMessage) + + + if (isTemporal) { + table <- "temporalCovariateValue" + refTable <- "temporalCovariateRef" + timeRefTable <- "temporalTimeRef" + } else { + table <- "covariateValue" + refTable <- "covariateRef" + timeRefTable <- "" + } + + if (is(dataSource, "environment")) { + data <- get(table, envir = dataSource) %>% + dplyr::filter(.data$cohortId %in% !!cohortIds, + .data$databaseId %in% !!databaseIds) %>% + dplyr::inner_join(get(refTable, envir = dataSource), by = "covariateId") + if (!is.null(analysisIds)) { + data <- data %>% + dplyr::filter(.data$analysisId %in% analysisIds) + } + if (isTemporal) { + data <- data %>% + dplyr::inner_join(get(timeRefTable, envir = dataSource), by = "timeId") + if (!is.null(timeIds)) { + data <- data %>% + dplyr::filter(.data$timeId %in% timeIds) + } + } + } else { + sql <- "SELECT covariate.*, + covariate_name, + {@time_ref_table != \"\"} ? { + start_day, + end_day, + } + concept_id, + analysis_id + FROM @results_database_schema.@table covariate + INNER JOIN @results_database_schema.@ref_table covariate_ref + ON covariate.covariate_id = covariate_ref.covariate_id + {@time_ref_table != \"\"} ? { + INNER JOIN @results_database_schema.@time_ref_table time_ref + ON covariate.time_id = time_ref.time_id + } + WHERE cohort_id in (@cohort_ids) + {@time_ref_table != \"\" & @time_ids != \"\"} ? { AND covariate.time_id IN (@time_ids)} + {@analysis_ids != \"\"} ? { AND analysis_id IN (@analysis_ids)} + AND database_id in (@databaseIds);" + if (is.null(timeIds)) { + timeIds <- "" + } + if (is.null(analysisIds)) { + analysisIds <- "" + } + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + table = SqlRender::camelCaseToSnakeCase(table), + ref_table = SqlRender::camelCaseToSnakeCase(refTable), + time_ref_table = SqlRender::camelCaseToSnakeCase(timeRefTable), + results_database_schema = dataSource$resultsDatabaseSchema, + cohort_ids = cohortIds, + analysis_ids = analysisIds, + databaseIds = quoteLiterals(databaseIds), + time_ids = timeIds, + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + if (isTemporal) { + data <- data %>% + dplyr::relocate(.data$cohortId, + .data$databaseId, + .data$timeId, + .data$startDay, + .data$endDay, + .data$analysisId, + .data$covariateId, + .data$covariateName) %>% + dplyr::arrange(.data$cohortId, .data$databaseId, .data$timeId, .data$covariateId, .data$covariateName) + } else { + data <- data %>% + dplyr::relocate(.data$cohortId, + .data$databaseId, + .data$analysisId, + .data$covariateId, + .data$covariateName) %>% + dplyr::arrange(.data$cohortId, .data$databaseId, .data$covariateId) + } + return(data) +} + +getConceptReference <- function(dataSource = .GlobalEnv, + conceptIds) { + # Perform error checks for input variables + errorMessage <- checkmate::makeAssertCollection() + checkmate::assertIntegerish(x = conceptIds, + min.len = 1, + null.ok = TRUE, + add = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + if (is(dataSource, "environment")) { + data <- get("cohort", envir = dataSource) %>% + dplyr::filter(!is.na(.data$invalidReason)) %>% + dplyr::filter(.data$conceptId %in% conceptIds) + } else { + sql <- "SELECT * + FROM @results_database_schema.concept + WHERE invalid_reason IS NULL + {@conceptIds == } ? {}:{AND concept_id IN (@conceptIds)};" + data <- renderTranslateQuerySql(connection = dataSource$connection, + sql = sql, + results_database_schema = dataSource$resultsDatabaseSchema, + conceptIds = conceptIds, + snakeCaseToCamelCase = TRUE) %>% + tidyr::tibble() + } + return(data %>% dplyr::arrange(.data$conceptId)) +} + +checkErrorCohortIdsDatabaseIds <- function(errorMessage, + cohortIds, + databaseIds) { + checkmate::assertDouble(x = cohortIds, + null.ok = FALSE, + lower = 1, + upper = 2^53, + any.missing = FALSE, + add = errorMessage) + checkmate::assertCharacter(x = databaseIds, + min.len = 1, + any.missing = FALSE, + unique = TRUE, + add = errorMessage) + checkmate::reportAssertions(collection = errorMessage) + return(errorMessage) +} diff --git a/CohortDiagnosticsBreastCancer/R/Tables.R b/CohortDiagnosticsBreastCancer/R/Tables.R new file mode 100644 index 00000000..e0b6edb6 --- /dev/null +++ b/CohortDiagnosticsBreastCancer/R/Tables.R @@ -0,0 +1,141 @@ +library(magrittr) + +prepareTable1 <- function(covariates, + pathToCsv = "Table1Specs.csv") { + covariates <- covariates %>% + dplyr::mutate(covariateName = stringr::str_to_sentence(stringr::str_replace_all(string = .data$covariateName, + pattern = "^.*: ", + replacement = ""))) + space <- " " + specifications <- readr::read_csv(file = pathToCsv, + col_types = readr::cols(), + guess_max = min(1e7)) %>% + dplyr::mutate(dplyr::across(tidyr::everything(), ~tidyr::replace_na(data = .x, replace = ''))) + + resultsTable <- tidyr::tibble() + + if (nrow(specifications) == 0) { + return(resultsTable) + } + + for (i in 1:nrow(specifications)) { + specification <- specifications[i,] + if (specification %>% dplyr::pull(.data$covariateIds) == "") { + covariatesSubset <- covariates %>% + dplyr::filter(.data$analysisId %in% specification$analysisId) %>% + dplyr::arrange(.data$covariateId) + } else { + covariatesSubset <- covariates %>% + dplyr::filter(.data$analysisId %in% specification$analysisId, + .data$covariateId %in% (stringr::str_split(string = (specification %>% + dplyr::pull(.data$covariateIds)), + pattern = ";")[[1]] %>% + utils::type.convert())) %>% + dplyr::arrange(.data$covariateId) + } + if (nrow(covariatesSubset) > 0) { + resultsTable <- dplyr::bind_rows(resultsTable, + tidyr::tibble(characteristic = paste0('', + specification %>% dplyr::pull(.data$label), + ''), + value = NA, + header = 1, + position = i), + tidyr::tibble(characteristic = paste0(space, + space, + space, + space, + covariatesSubset$covariateName), + value = covariatesSubset$mean, + header = 0, + position = i)) %>% + dplyr::distinct() %>% + dplyr::mutate(sortOrder = dplyr::row_number()) + } + } + if (nrow(resultsTable) > 0) { + resultsTable <- resultsTable %>% + dplyr::arrange(.data$position, dplyr::desc(.data$header), .data$sortOrder) + } + return(resultsTable) +} + + +prepareTable1Comp <- function(balance, + pathToCsv = "Table1Specs.csv") { + balance <- balance %>% + dplyr::mutate(covariateName = stringr::str_to_sentence(stringr::str_replace_all(string = .data$covariateName, + pattern = "^.*: ", + replacement = ""))) + space <- " " + specifications <- readr::read_csv(file = pathToCsv, + col_types = readr::cols(), + guess_max = min(1e7)) %>% + dplyr::mutate(dplyr::across(tidyr::everything(), ~tidyr::replace_na(data = .x, replace = ''))) + + resultsTable <- tidyr::tibble() + + if (nrow(specifications) == 0) { + return(dplyr::tibble(Note = 'There are no covariate records for the cohorts being compared.')) + } + + for (i in 1:nrow(specifications)) { + specification <- specifications[i,] + if (specification %>% dplyr::pull(.data$covariateIds) == "") { + balanceSubset <- balance %>% + dplyr::filter(.data$analysisId %in% specification$analysisId) %>% + dplyr::arrange(.data$covariateId) + } else { + balanceSubset <- balance %>% + dplyr::filter(.data$analysisId %in% specification$analysisId, + .data$covariateId %in% (stringr::str_split(string = (specification %>% + dplyr::pull(.data$covariateIds)), + pattern = ";")[[1]] %>% + utils::type.convert())) %>% + dplyr::arrange(.data$covariateId) + } + + if (nrow(balanceSubset) > 0) { + resultsTable <- dplyr::bind_rows(resultsTable, + tidyr::tibble(characteristic = paste0('', + specification %>% dplyr::pull(.data$label), + ''), + MeanT = NA, + MeanC = NA, + StdDiff = NA, + header = 1, + position = i), + tidyr::tibble(characteristic = paste0(space, + space, + space, + space, + balanceSubset$covariateName), + MeanT = balanceSubset$mean1, + MeanC = balanceSubset$mean2, + StdDiff = balanceSubset$stdDiff, + header = 0, + position = i)) %>% + dplyr::distinct() %>% + dplyr::mutate(sortOrder = dplyr::row_number()) + } + } + if (nrow(resultsTable) > 0 ) { + resultsTable <- resultsTable %>% + dplyr::arrange(.data$position, dplyr::desc(.data$header), .data$sortOrder) %>% + dplyr::mutate(sortOrder = dplyr::row_number()) %>% + dplyr::select(-.data$header, -.data$position) + } + return(resultsTable) +} + + +compareCohortCharacteristics <- function(characteristics1, characteristics2) { + m <- dplyr::full_join(x = characteristics1 %>% dplyr::distinct(), + y = characteristics2 %>% dplyr::distinct(), + by = c("covariateId", "conceptId", "databaseId", "covariateName", "analysisId"), + suffix = c("1", "2")) %>% + dplyr::mutate(sd = sqrt(.data$sd1^2 + .data$sd2^2), + stdDiff = (.data$mean2 - .data$mean1)/.data$sd) %>% + dplyr::arrange(-abs(.data$stdDiff)) + return(m) +} diff --git a/CohortDiagnosticsBreastCancer/Table1Specs.csv b/CohortDiagnosticsBreastCancer/Table1Specs.csv new file mode 100644 index 00000000..6389f2be --- /dev/null +++ b/CohortDiagnosticsBreastCancer/Table1Specs.csv @@ -0,0 +1,9 @@ +label,analysisId,covariateIds +Age group,3, +Gender: female,1,8532001 +Race,4, +Ethnicity,5, +Medical history: General,210,4006969210;438409210;4212540210;255573210;201606210;4182210210;440383210;201820210;318800210;192671210;439727210;432867210;316866210;4104000210;433736210;80180210;255848210;140168210;4030518210;80809210;435783210;4279309210;81893210;81902210;197494210;4134440210 +Medical history: Cardiovascular disease,210,313217210;381591210;317576210;321588210;316139210;4185932210;321052210;440417210;444247210 +Medical history: Neoplasms,210,4044013210;432571210;40481902210;443392210;4112853210;4180790210;443388210;197508210;200962210 +Medication use,410,21601782410;21602796410;21604686410;21604389410;21603932410;21601387410;21602028410;21600960410;21601664410;21601744410;21601461410;21600046410;21603248410;21600712410;21603890410;21601853410;21604254410;21604489410;21604752410 diff --git a/CohortDiagnosticsBreastCancer/data/PreMerged.RData b/CohortDiagnosticsBreastCancer/data/PreMerged.RData new file mode 100644 index 00000000..31eaeb62 Binary files /dev/null and b/CohortDiagnosticsBreastCancer/data/PreMerged.RData differ diff --git a/CohortDiagnosticsBreastCancer/global.R b/CohortDiagnosticsBreastCancer/global.R new file mode 100644 index 00000000..bc66cfca --- /dev/null +++ b/CohortDiagnosticsBreastCancer/global.R @@ -0,0 +1,163 @@ +library(magrittr) + +source("R/Tables.R") +source("R/Plots.R") +source("R/Results.R") + +# shinySettings <- list(connectionDetails = DatabaseConnector::createConnectionDetails(dbms = "postgresql", +# server = "localhost/ohdsi", +# user = "postgres", +# password = Sys.getenv("pwPostgres")), +# resultsDatabaseSchema = "phenotype_library", +# vocabularyDatabaseSchema = "phenotype_library") +# shinySettings <- list(dataFolder = "s:/examplePackageOutput") + +# Settings when running on server: + +defaultLocalDataFolder <- "data" +defaultLocalDataFile <- "PreMerged.RData" + +connectionPool <- NULL +defaultServer <- Sys.getenv("phenotypeLibraryDbServer") +defaultDatabase <- Sys.getenv("phenotypeLibraryDbDatabase") +defaultPort <- Sys.getenv("phenotypeLibraryDbPort") +defaultUser <- Sys.getenv("phenotypeLibraryDbUser") +defaultPassword <- Sys.getenv("phenotypeLibraryDbPassword") +defaultResultsSchema <- Sys.getenv("phenotypeLibraryDbResultsSchema") +defaultVocabularySchema <- Sys.getenv("phenotypeLibraryDbVocabularySchema") + +defaultDatabaseMode <- FALSE # Use file system if FALSE + +defaultCohortBaseUrl <- "https://atlas.ohdsi.org/#/cohortdefinition/" +defaultConceptBaseUrl <- "https://athena.ohdsi.org/search-terms/terms/" + +cohortDiagnosticModeDefaultTitle <- "Cohort Diagnostics" +phenotypeLibraryModeDefaultTitle <- "Phenotype Library" + +if (!exists("shinySettings")) { + writeLines("Using default settings") + databaseMode <- defaultDatabaseMode & defaultServer != "" + if (databaseMode) { + connectionPool <- pool::dbPool( + drv = DatabaseConnector::DatabaseConnectorDriver(), + dbms = "postgresql", + server = paste(defaultServer, defaultDatabase, sep = "/"), + port = defaultPort, + user = defaultUser, + password = defaultPassword + ) + resultsDatabaseSchema <- defaultResultsSchema + vocabularyDatabaseSchema <- defaultVocabularySchema + } else { + dataFolder <- defaultLocalDataFolder + } + cohortBaseUrl <- defaultCohortBaseUrl + conceptBaseUrl <- defaultCohortBaseUrl +} else { + writeLines("Using settings provided by user") + databaseMode <- !is.null(shinySettings$connectionDetails) + if (databaseMode) { + connectionDetails <- shinySettings$connectionDetails + if (is(connectionDetails$server, "function")) { + connectionPool <- pool::dbPool(drv = DatabaseConnector::DatabaseConnectorDriver(), + dbms = "postgresql", + server = connectionDetails$server(), + port = connectionDetails$port(), + user = connectionDetails$user(), + password = connectionDetails$password(), + connectionString = connectionDetails$connectionString()) + } else { + # For backwards compatibility with older versions of DatabaseConnector: + connectionPool <- pool::dbPool(drv = DatabaseConnector::DatabaseConnectorDriver(), + dbms = "postgresql", + server = connectionDetails$server, + port = connectionDetails$port, + user = connectionDetails$user, + password = connectionDetails$password, + connectionString = connectionDetails$connectionString) + } + resultsDatabaseSchema <- shinySettings$resultsDatabaseSchema + vocabularyDatabaseSchema <- shinySettings$vocabularyDatabaseSchema + } else { + dataFolder <- shinySettings$dataFolder + } + cohortBaseUrl <- shinySettings$cohortBaseUrl + conceptBaseUrl <- shinySettings$cohortBaseUrl +} + +dataModelSpecifications <- read.csv("resultsDataModelSpecification.csv") +# Cleaning up any tables in memory: +suppressWarnings(rm(list = SqlRender::snakeCaseToCamelCase(dataModelSpecifications$tableName))) + +if (databaseMode) { + + onStop(function() { + if (DBI::dbIsValid(connectionPool)) { + writeLines("Closing database pool") + pool::poolClose(connectionPool) + } + }) + + resultsTablesOnServer <- tolower(DatabaseConnector::dbListTables(connectionPool, schema = resultsDatabaseSchema)) + + loadResultsTable <- function(tableName, required = FALSE) { + if (required || tableName %in% resultsTablesOnServer) { + tryCatch({ + table <- DatabaseConnector::dbReadTable(connectionPool, + paste(resultsDatabaseSchema, tableName, sep = ".")) + }, error = function(err) { + stop("Error reading from ", paste(resultsDatabaseSchema, tableName, sep = "."), ": ", err$message) + }) + colnames(table) <- SqlRender::snakeCaseToCamelCase(colnames(table)) + if (nrow(table) > 0) { + assign(SqlRender::snakeCaseToCamelCase(tableName), dplyr::as_tibble(table), envir = .GlobalEnv) + } + } + } + + loadResultsTable("database", required = TRUE) + loadResultsTable("cohort", required = TRUE) + loadResultsTable("phenotype_description") + loadResultsTable("temporal_time_ref") + loadResultsTable("concept_sets") + + # Create empty objects in memory for all other tables. This is used by the Shiny app to decide what tabs to show: + isEmpty <- function(tableName) { + sql <- sprintf("SELECT 1 FROM %s.%s LIMIT 1;", resultsDatabaseSchema, tableName) + oneRow <- DatabaseConnector::dbGetQuery(connectionPool, sql) + return(nrow(oneRow) == 0) + } + + for (table in dataModelSpecifications$tableName) { + if (table %in% resultsTablesOnServer && + !exists(SqlRender::snakeCaseToCamelCase(table)) && + !isEmpty(table)) { + assign(SqlRender::snakeCaseToCamelCase(table), dplyr::tibble()) + } + } + + dataSource <- createDatabaseDataSource(connection = connectionPool, + resultsDatabaseSchema = resultsDatabaseSchema, + vocabularyDatabaseSchema = vocabularyDatabaseSchema) +} else { + localDataPath <- file.path(dataFolder, defaultLocalDataFile) + if (!file.exists(localDataPath)) { + stop(sprintf("Local data file %s does not exist.", localDataPath)) + } + dataSource <- createFileDataSource(localDataPath, envir = .GlobalEnv) +} + +if (exists("temporalTimeRef")) { + temporalCovariateChoices <- temporalTimeRef %>% + dplyr::mutate(choices = paste0("Start ", .data$startDay, " to end ", .data$endDay)) %>% + dplyr::select(.data$timeId, .data$choices) %>% + dplyr::arrange(.data$timeId) %>% + dplyr::slice_head(n = 5) +} + +if (exists("covariateRef")) { + specifications <- readr::read_csv(file = "Table1Specs.csv", + col_types = readr::cols(), + guess_max = min(1e7)) + prettyAnalysisIds <- specifications$analysisId +} diff --git a/CohortDiagnosticsBreastCancer/html/cohortCharacterization.html b/CohortDiagnosticsBreastCancer/html/cohortCharacterization.html new file mode 100644 index 00000000..9f6ef077 --- /dev/null +++ b/CohortDiagnosticsBreastCancer/html/cohortCharacterization.html @@ -0,0 +1,16 @@ +
A table showing cohort characteristics (covariates). These characteristics are captured on or before the cohort start date. There is a Pretty and a Raw version of this table.
+The Pretty table shows the standard OHDSI characteristics table, which includes only covariates that were manually selected to provide a general overview of the comorbidities and medications of the cohort. These are all binary covariates, and the table shows the proportion (%) of the cohort entries having the covariate.
+
The Raw table shows all captured covariates. These include binary and continuous covariates (e.g. the Charlson comorbidity index). For each covariate the table lists the mean, which for binary covariates is equal to the proportion, and the standard deviation (SD).
+ +You can select multiple databases in the side bar to see cohort characteristics from different databases side-by-side in the same table.
+Select the cohort to explore in the side bar.
+Select either the Pretty or the Raw table at the top of the table.
+ +A table showing the number of cohort entries and unique persons per cohort per database. Because one person can have more than one cohort entry, the number of entries can be higher than the number of persons.
+ +You can select multiple databases in the side bar to see counts from different databases side-by-side.
+ +Stacked bar graph showing the overlap between two cohorts, and a table listing several overlap statistics.
+ +The stacked bar shows the overlap in terms of subjects. It shows the number of subjects that belong to each cohort and to both. The diagram does not consider whether the subjects were in the different cohorts at the same time.
+The table show the same information and more: +
You can select one or more database in the side bar.
+You can select the (target) cohort(s) and comparator cohort(s) in the side bar.
+ +A table or plot showing cohort characteristics (covariates) for two cohorts side-by-side. These characteristics are captured on or before the cohort start date. There is a Pretty and a Raw version of the table.
+The Pretty table shows the standard OHDSI characteristics table, which includes only covariates that were manually selected to provide a general overview of the comorbidities and medications of the cohort. These are all binary covariates, and the table shows the proportion (%) of the cohort entries having the covariate, as well as the standardized difference of the mean (StdDiff).
+
The Raw table shows all captured covariates. These include binary and continuous covariates (e.g. the Charlson comorbidity index). For each covariate the table lists the mean, which for binary covariates is equal to the proportion, the standard deviation (SD), and the standardized difference of the mean (StdDiff).
+The plot shows all covariates, include binary and continuous covariates. The x-axis represents the mean value in the target cohort, the y-axis the mean value in the comparator cohort. Each dot represents a covariate, and the color indicates the absolute value of the standardized difference of the mean.
+ +You can select a database in the side bar.
+Select the cohort to explore in the side bar.
+Select either the Pretty, the Raw table, or the plot at the top of the screen.
+In the plot, you can move the mouse pointer over a dot to see information on that covariate.
+ +A graph showing the incidence rate, optionally stratified by age (in 10-year bins), gender, and calendar year.
+ +The incidence rate is computed as 1000 * the number of people first entering the cohort / the number of years people were eligible to enter the cohort for the first time. The eligible person time is defined as the time when +
You can select multiple databases in the side bar to see graphs from different databases in the same plot.
+Select the cohort to explore in the side bar.
+At the top left of the plot, you can choose whether to stratify the data by age, gender, or calendar year.
+At the top right of the plot, you can choose whether to use the same y-axis for all databases.
+If you move the mouse over the plot, you can see the precise value.
+ +A table showing the (source) concepts observed in the database that are included in a concept set of a cohort. The Subjects column contains the number of subjects in the entire database that have the specific concept. This count is not restricted to only those people in the cohort. Source concepts are identified in the _source_concept_id fields of the Common Data Model, (e.g. drug_source_concept_id) and are used to identify the specific source codes used in a database. Standard concepts are found using the _concept_id fields (e.g. drug_concept_id), and use the same coding system across all databases.
+ +You can select a database in the side bar to see the concepts and counts observed in that database.
+Select the cohort and the specific concept set within that cohort to explore in the side bar.
+You can switch between Source Concepts and Standard Concepts at the top of the table.
+ +A table showing the number of subject that match specific inclusion rules in the cohort definition. Note that this table will be empty if no inclusion rules have been specified.
+ +The table contains the following columns: +
You can select a database in the side bar to see the inclusion rule statistics observed in that database.
+Select the cohort to explore in the side bar.
+ +A table showing the concepts belonging to the concept sets in the entry event definition that are observed on the index date. In other words, the table lists the concepts that likely triggered the cohort entry. The counts indicate number of cohort entries where the concepts was observed on the index date. Note that multiple concepts can be present on the index date, so the sum of counts might be greater than the cohort entry count.
+ +You can select multiple databases in the side bar to see counts from different databases side-by-side.
+Select the cohort to explore in the side bar.
+ +A table showing the (source) concepts observed in the database that are not included in a concept set of a cohort, but maybe should be. The following logic is used to identify concepts that might be relevant:
+The Subjects column contains the number of subjects in the entire database that have the specific concept. This count is not restricted to only those people in the cohort. Source concepts are identified in the _source_concept_id fields of the Common Data Model, (e.g. drug_source_concept_id) and are used to identify the specific source codes used in a database. Standard concepts are found using the _concept_id fields (e.g. drug_concept_id), and use the same coding system across all databases.
+ +You can select a database in the side bar to see the concepts and counts observed in that database.
+Select the cohort and the specific concept set within that cohort to explore in the side bar.
+ +A table showing temporal cohort characteristics (covariates). These characteristics are captured at specific time intervals before or after cohort start date. There is a Pretty and a Raw version of this table.
+The Pretty table includes a pre-selected subset of covariates manually selected to provide a general overview of the comorbidities and medications of the cohort. These are all binary covariates, and the table shows the proportion (%) of the cohort entries having the covariate.
+
The Raw table shows all captured covariates. These include binary and continuous covariates (e.g. the Charlson comorbidity index). For each covariate the table lists the mean, which for binary covariates is equal to the proportion, and the standard deviation (SD).
+ +You can select multiple databases in the side bar to see temporal cohort characteristics from different databases side-by-side in the same table.
+Select the cohort to explore in the side bar.
+Select either the Pretty or the Raw table at the top of the table.
+ +Boxplot and a table showing the distribution of time (in days) before and after the cohort index date (cohort start date), and the time between cohort start and end date. The information is shown for all cohort entries, so not limiting to the first per person.
+ +The boxplot shows: +
The table show the same information and more: +
You can select multiple databases in the side bar to see time distributions from different databases in the same plot and table.
+Select the cohort to explore in the side bar.
+ +A table showing the relationship between the cohort start date and visits recorded in the database. For each database, the table shows:
+You can select multiple databases in the side bar to see counts from different databases side-by-side.
+Select the cohort to explore in the side bar.
+ +