diff --git a/DESCRIPTION b/DESCRIPTION index 95527d0a..65958ed4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,7 +14,7 @@ biocViews: SomaticMutation, VariantAnnotation Depends: - R (>= 4.0.0), + R (>= 4.0.0), NMF Imports: SummarizedExperiment, @@ -48,7 +48,6 @@ Imports: BSgenome.Hsapiens.UCSC.hg38, BSgenome.Mmusculus.UCSC.mm9, BSgenome.Mmusculus.UCSC.mm10, - deconstructSigs, decompTumor2Sig, topicmodels, ggrepel, @@ -63,7 +62,9 @@ Imports: stringi, tidyverse, ggpubr, - Matrix (>= 1.6.1) + Matrix (>= 1.6.4), + scales, + lsei Suggests: TCGAbiolinks, shinyBS, diff --git a/NAMESPACE b/NAMESPACE index 3a531ffd..d7574069 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,18 +1,42 @@ # Generated by roxygen2: do not edit by hand export("%>%") +export("adjustment_threshold<-") +export("description<-") export("exposures<-") +export("final_comparison<-") +export("final_pred<-") +export("ground_truth<-") +export("indv_benchmarks<-") +export("initial_comparison<-") +export("initial_pred<-") +export("intermediate_comparison<-") +export("intermediate_pred<-") +export("method_id<-") +export("method_view_summary<-") export("samp_annot<-") +export("sig_view_summary<-") export("signatures<-") export("tables<-") +export("threshold<-") export("umap<-") export("variants<-") export(add_flank_to_variants) +export(adjustment_threshold) export(annotate_replication_strand) export(annotate_transcript_strand) export(annotate_variant_length) export(annotate_variant_type) export(auto_predict_grid) +export(benchmark) +export(benchmark_compare_results) +export(benchmark_get_entry) +export(benchmark_get_prediction) +export(benchmark_plot_comparison) +export(benchmark_plot_composite_exposures) +export(benchmark_plot_duplicate_exposures) +export(benchmark_plot_exposures) +export(benchmark_plot_signatures) export(build_custom_table) export(build_standard_table) export(built_tables) @@ -23,8 +47,12 @@ export(compare_cosmic_v2) export(compare_cosmic_v3) export(compare_results) export(cosmic_v2_subtype_map) -export(create_musica) +export(create_benchmark) +export(create_musica_from_counts) +export(create_musica_from_variants) +export(create_musica_result) export(create_umap) +export(description) export(discover_signatures) export(drop_annotation) export(exposure_differential_analysis) @@ -36,12 +64,23 @@ export(extract_variants_from_maf_file) export(extract_variants_from_matrix) export(extract_variants_from_vcf) export(extract_variants_from_vcf_file) +export(final_comparison) +export(final_pred) export(generate_result_grid) export(get_musica) +export(ground_truth) +export(indv_benchmarks) +export(initial_comparison) +export(initial_pred) +export(intermediate_comparison) +export(intermediate_pred) export(k_select) +export(method_id) +export(method_view_summary) export(musicatk) export(name_signatures) export(plot_cluster) +export(plot_comparison) export(plot_differential_analysis) export(plot_exposures) export(plot_heatmap) @@ -49,11 +88,13 @@ export(plot_sample_counts) export(plot_sample_reconstruction_error) export(plot_signatures) export(plot_umap) +export(predict_and_benchmark) export(predict_exposure) export(rc) export(samp_annot) export(sample_names) export(select_genome) +export(sig_view_summary) export(signatures) export(subset_musica_by_annotation) export(subset_musica_by_counts) @@ -61,12 +102,15 @@ export(subset_variant_by_type) export(subset_variants_by_samples) export(table_selected) export(tables) +export(threshold) export(umap) export(variants) exportClasses(count_table) +exportClasses(full_benchmark) exportClasses(musica) exportClasses(musica_result) exportClasses(musica_result_grid) +exportClasses(single_benchmark) import(dplyr) import(ggplot2) import(tidyr) diff --git a/R/benchmarking.R b/R/benchmarking.R new file mode 100644 index 00000000..cd1b91ac --- /dev/null +++ b/R/benchmarking.R @@ -0,0 +1,1597 @@ +#' Create a full_benchmark object +#' +#' Initialize a \code{\linkS4class{full_benchmark}} object for benchmarking +#' +#' @param true_signatures A matrix of true signatures by mutational motifs +#' @param true_exposures A matrix of samples by true signature weights +#' @param count_table Summary table with per-sample unnormalized motif counts +#' +#' @return A \code{\linkS4class{full_benchmark}} object +#' @export +create_benchmark <- function(true_signatures, true_exposures, count_table){ + + # create musica result object to hold true exposures and signatures + truth <- create_musica_result(true_signatures, true_exposures, count_table) + + # create benchmark object + full_benchmark <- new("full_benchmark", ground_truth = truth) + + return(full_benchmark) + +} + + +#' Run the benchmark framework on a prediction +#' +#' Perform benchmarking on a signature discovery prediction compared +#' to a ground truth. Potential errors in the predicted signatures, such as +#' composite or duplicate signatures are adjusted, and a summary of the accuracy +#' of the prediction is given. +#' +#' @param full_benchmark An object of class \code{\linkS4class{full_benchmark}} +#' created with the \link{create_benchmark} function or returned from a previous +#' \code{benchmark} run. +#' @param prediction An object of class \code{\linkS4class{musica_result}} +#' containing the predicted signatures and exposures to benchmark. +#' @param method_id An identifier for the prediction being benchmarked. If not +#' supplied, it will be automatically set to the variable name of the prediction +#' provided. Default \code{NULL}. +#' @param threshold Cosine similarity cutoff for comparing preidcted and true +#' signatures. Default \code{0.8}. +#' @param adjustment_threshold Cosine similarity value of high confidence. +#' Comparisons that meet this cutoff are assumed to be likely, +#' while those that fall below the cutoff will be disregarded if the predicted +#' signature is already captured above the threshold. Default \code{0.9}. +#' @param description Further details about the prediction being benchmarked. +#' Default \code{NULL}. +#' @param plot If \code{FALSE}, plots will be suppressed. Default \code{TRUE}. +#' @param make_copy If \code{TRUE}, the \code{full_benchmark} object provided +#' will not be modified and a new object will be returned. If \code{FALSE}, the +#' object provided will be modified and nothing will be returned. Default +#' \code{FALSE}. +#' +#' @return If \code{make_copy == TRUE}, a new \code{full_benchmark} object is +#' returned. If \code{make_copy == FALSE}, nothing is returned. +#' @export +benchmark <- function(full_benchmark, prediction, method_id = NULL, + threshold = 0.8, adjustment_threshold = 0.9, description = NULL, + plot = TRUE, make_copy = FALSE){ + + + if (make_copy == FALSE){ + var_name <- deparse(substitute(full_benchmark)) + } + + # check that full_benchmark is a full_benchmark class object + if (class(full_benchmark)[1] != "full_benchmark"){ + stop(deparse(substitute(full_benchmark)), " is not a 'full_benchmark' object. ", + "Use function 'create_benchmark' to initialize a 'full_benchmark' object") + } + + # check that prediction is a musica result object + if (class(prediction)[1] != "musica_result"){ + stop("'prediction' must be a 'musica_result' object.") + } + + # if no ID provided, try to make one automatically + if (is.null(method_id)){ + + # create ID + method_id <- deparse(substitute(prediction)) + + # display message that ID was automatically generated + message("No method_id provided, automatically generated method_id: ", method_id) + } + + # check if ID is unique + if (method_id %in% names(indv_benchmarks(full_benchmark))){ + original_id <- method_id + + # update id to be unique + tag <- 1 + while (method_id %in% names(indv_benchmarks(full_benchmark))){ + if (tag > 1){ + method_id <- substr(method_id, 1, nchar(method_id)-2) + } + method_id <- paste(method_id, ".", tag, sep = "") + tag <- tag + 1 + } + + # display message that ID was not unique and was updated + message("method_id ", original_id, " already exists. method_id updated to ", method_id) + + } + + + if (threshold != 0.8){ + warning("Default threshold overriden. Interpret results with caution if + comparing benchmark runs with inconsistent thresholds.") + } + + if (adjustment_threshold != 0.9){ + warning("Default adjustment threshold overriden. Interpret results with caution if + comparing benchmark runs with inconsistent thresholds.") + } + + full_benchmark@ground_truth@musica <- get_musica(prediction) + + truth <- ground_truth(full_benchmark) + + message("\nComparing to true signatures (initial)...") + + initial_comparison <- compare_results(prediction, truth, threshold) + + # remove anything below 0.9 that already appears with at least 0.05 difference + initial_comparison <- .benchmark_comp_adj(initial_comparison, adjustment_threshold) + + message("Correcting duplicates...") + + duplicates_corrected <- .correct_duplicates(prediction, initial_comparison, truth) + + message("Comparing to true signatures (post duplicates corrected)...") + + duplicates_corrected_comparison <- compare_results(duplicates_corrected, truth, + threshold = threshold) + + + # remove anything below 0.9 that already appears with at least 0.05 difference + duplicates_corrected_comparison <- .benchmark_comp_adj(duplicates_corrected_comparison, adjustment_threshold) + + message("Correcting composites...") + + composites_corrected <- .correct_composites(duplicates_corrected, duplicates_corrected_comparison, get_musica(truth), truth) + + message("Comparing to true signatures (post composites corrected)...") + + composites_corrected_comparison <- compare_results(composites_corrected, truth, threshold = threshold) + + # remove anything below 0.9 that already appears with at least 0.05 difference + composites_corrected_comparison <- .benchmark_comp_adj(composites_corrected_comparison, adjustment_threshold) + + # extract count table + count_table <- extract_count_tables(get_musica(prediction)) + count_table <- count_table$SBS96@count_table + + message("Creating summary...") + + single_summary <- as.matrix(.generate_summary(method_id, prediction, truth, initial_comparison, + count_table, composites_corrected, composites_corrected_comparison)) + + message("Creating individual benchmark object...") + + # create single benchmark object + indv_benchmark <- new("single_benchmark", initial_pred = prediction, intermediate_pred = duplicates_corrected, + final_pred = composites_corrected, initial_comparison = initial_comparison, + intermediate_comparison = duplicates_corrected_comparison, + final_comparison = composites_corrected_comparison, + single_summary = single_summary, method_id = method_id, threshold = threshold, + adjustment_threshold = adjustment_threshold, description = description) + + # update full benchmark object + message("Updating full benchmark object...") + full_benchmark <- .update_benchmark(full_benchmark, indv_benchmark, single_summary) + + # update global variable + if (make_copy == FALSE){ + assign(var_name, full_benchmark, envir = parent.frame()) + } + + if (plot == TRUE){ + + # plots + message("Generating plots...") + + #message("\nInitial Signatures...\n") + initial_sig_plot <- benchmark_plot_signatures(full_benchmark, method_id, prediction = "Initial", same_scale = FALSE) + print(initial_sig_plot) + #message("\nInitial Comparison...\n") + benchmark_plot_comparison(full_benchmark, method_id, prediction = "Initial", same_scale = FALSE) + #message("\nInitial Exposure Comparison...\n") + #benchmark_plot_exposures(full_benchmark, method_id, prediction = "Initial") + + #message("\nDuplicate signature exposures before/afters...\n") + duplicate_plot <- benchmark_plot_duplicate_exposures(full_benchmark, method_id) + print(duplicate_plot) + + #message("\nIntermediate Signatures...\n") + intermediate_sig_plot <- benchmark_plot_signatures(full_benchmark, method_id, prediction = "Intermediate", same_scale = FALSE) + print(intermediate_sig_plot) + #message("\nIntermediate Comparison...\n") + benchmark_plot_comparison(full_benchmark, method_id, prediction = "Intermediate", same_scale = FALSE) + #message("\nIntermediate Exposure Comparison...\n") + #benchmark_plot_exposures(full_benchmark, method_id, prediction = "Intermediate") + + #message("\nComposite signature exposures before/afters...\n") + composite_plot <- benchmark_plot_composite_exposures(full_benchmark, method_id) + print(composite_plot) + + #message("\nFinal Signatures...\n") + final_sig_plot <- benchmark_plot_signatures(full_benchmark, method_id, prediction = "Final", same_scale = FALSE) + print(final_sig_plot) + #message("\nFinal Comparison...\n") + benchmark_plot_comparison(full_benchmark, method_id, prediction = "Final", same_scale = FALSE) + #message("\nFinal Exposure Comparison...\n") + exposure_plot <- benchmark_plot_exposures(full_benchmark, method_id, prediction = "Final") + print(exposure_plot) + + } + + print(single_summary) + + message("\nDone.\n") + + if (make_copy == TRUE){ + return(full_benchmark) + } + + + +} + +#' Predict and benchmark +#' +#' This function will discover signatures from a musica object using a given +#' algorithm and a range of k values. A new discovery is done for each k value. +#' As each discovery is completed, the prediction is benchmarked. +#' +#' @param musica A \code{\linkS4class{musica}} object. +#' @param table_name Name of the table to use for signature discovery. Needs +#' to be the same name supplied to the table building functions such as +#' \link{build_standard_table}. +#' @param algorithm Method to use for mutational signature discovery. One of +#' \code{"lda"} or \code{"nmf"}. Default \code{"lda"}. +#' @param k_min Minimum number of singatures to predict +#' @param k_max Maximum number of signatures to predict +#' @param full_benchmark An object of class \code{\linkS4class{full_benchmark}} +#' created with the \link{create_benchmark} function or returned from a previous +#' \code{benchmark} run. +#' @param threshold Cosine similarity cutoff for comparing preidcted and true +#' signatures. Default \code{0.8}. +#' @param adjustment_threshold Cosine similarity value of high confidence. +#' Comparisons that meet this cutoff are assumed to be likely, +#' while those that fall below the cutoff will be disregarded if the predicted +#' signature is already captured above the threshold. Default \code{0.9}. +#' @param plot If \code{FALSE}, plots will be suppressed. Default \code{TRUE}. +#' @param seed Seed to be used for the random number generators in the +#' signature discovery algorithms. Default \code{1}. +#' @param nstart Number of independent random starts used in the mutational +#' signature algorithms. Default \code{10}. +#' @param par_cores Number of parallel cores to use. Only used if +#' \code{method = "nmf"}. Default \code{1}. +#' +#' @return If \code{make_copy == TRUE}, a new \code{full_benchmark} object is +#' returned. If \code{make_copy == FALSE}, nothing is returned. +#' @export +predict_and_benchmark <- function(musica, table_name, algorithm = "lda", k_min, + k_max, full_benchmark, threshold = 0.8, + adjustment_threshold = 0.9, plot = FALSE, + seed = 1, nstart = 10, par_cores = 1){ + + + for (k in c(k_min:k_max)){ + + message("Discovering signatures for k = ", k, "...") + + prediction <- discover_signatures(musica = musica, table_name = table_name, + num_signatures = k, algorithm = algorithm, + seed = seed, nstart = nstart, + par_cores = par_cores) + + message("Benchmarking prediction for k = ", k, "...\n") + + full_benchmark <- benchmark(full_benchmark = full_benchmark, + prediction = prediction, + method_id = paste(algorithm, k, sep = ""), + threshold = threshold, + adjustment_threshold = adjustment_threshold, + description = paste("Prediction from predict_and_benchmark function. Algorithm: ", algorithm, ". K = ", k, ".", sep = ""), + plot = plot, make_copy = TRUE) + } + + + print(full_benchmark@method_view_summary) + + return(full_benchmark) + +} + + +#' Get a single_benchmark object +#' +#' Access a \code{\linkS4class{single_benchmark}} object containing information from +#' an individual benchmark run from a \code{\linkS4class{full_benchmark}} object. +#' +#' @param full_benchmark The \code{\linkS4class{full_benchmark}} object that contains +#' the desired \code{\linkS4class{single_benchmark}} object +#' @param method_id The identifier for the desired \code{\linkS4class{single_benchmark}} +#' object +#' +#' @return A \code{\linkS4class{single_benchmark}} object +#' @export +benchmark_get_entry <- function(full_benchmark, method_id){ + + # check that full_benchmark is a full_benchmark class object + if (class(full_benchmark)[1] != "full_benchmark"){ + stop(deparse(substitute(full_benchmark)), " is not a 'full_benchmark' object.") + } + + # check if this method_id exists + if (!(method_id %in% names(indv_benchmarks(full_benchmark)))){ + stop("'method_id' ", deparse(substitute(method_id)), " not found in ", deparse(substitute(full_benchmark))) + } + + benchmark <- indv_benchmarks(full_benchmark)[[method_id]] + + return(benchmark) +} + + +#' Get a benchmark prediction +#' +#' Access a \code{\linkS4class{musica_result}} object containing a particular prediction +#' from a benchmarking analysis. +#' +#' @param indv_benchmark A \code{\linkS4class{single_benchmark}} object containing the +#' desired prediction. This can be accessed using the \link{benchmark_get_entry} +#' function. +#' @param prediction \code{Initial} for the prediction before any benchmarking +#' adjustments have been made, \code{Intermediate} for the prediction after +#' duplicates have been adjusted but before composites are adjusted, or +#' \code{Final} for the prediction at the end of the benchmarking adjustments. +#' +#' @return A \code{\linkS4class{musica_result}} object +#' @export +benchmark_get_prediction <- function(indv_benchmark, prediction){ + + # check if prediction is one of Initial, Intermediate, or Final + valid <- c("Initial", "initial", "Init", "init", "Intermediate", "intermediate", + "Inter", "inter", "Final", "final", "Fin", "fin") + if (!(prediction %in% valid)){ + stop("'prediction' must be one of: 'Initial', 'Intermediate', or 'Final'.") + } + + # access musica object for desired prediction + if (prediction %in% c("Initial", "initial", "Init", "init")){ + result <- initial_pred(indv_benchmark) + } + else if (prediction %in% c("Intermediate", "intermediate", "Inter", "inter")){ + result <- intermediate_pred(indv_benchmark) + } + else if (prediction %in% c("Final", "final", "Fin", "fin")){ + result <- final_pred(indv_benchmark) + } + return(result) + +} + + +#' Plot signatures from a benchmarking analysis +#' +#' After a prediction has been benchmarked with the \link{benchmark} function, +#' this function can be used to plot signatures from any step in the benchmarking +#' process. Comparable to the \code{plot_signatures} function but compatible with +#' benchmarking objects. +#' +#' @param full_benchmark The \code{\linkS4class{full_benchmark}} object for the +#' benchmarking analysis +#' @param method_id The identifier for the \code{\linkS4class{single_benchmark}} +#' object containing the signatures to be plotted +#' @param prediction \code{Initial} for the signatures before any benchmarking +#' adjustments have been made, \code{Intermediate} for the signatures after +#' duplicates have been adjusted but before composites are adjusted, or +#' \code{Final} for the signatures at the end of the benchmarking adjustments. +#' @param plotly If \code{TRUE}, the the plot will be made interactive +#' using \code{\link[plotly]{plotly}}. Default \code{FALSE}. +#' @param color_variable Name of the column in the variant annotation data.frame +#' to use for coloring the mutation type bars. The variant annotation data.frame +#' can be found within the count table of the \code{\linkS4class{musica}} +#' object. If \code{NULL}, then the default column specified in the count +#' table will be used. Default \code{NULL}. +#' @param color_mapping A character vector used to map items in the +#' \code{color_variable} to a color. The items in \code{color_mapping} +#' correspond to the colors. The names of the items in \code{color_mapping} +#' should correspond to the uniqeu items in \code{color_variable}. If +#' \code{NULL}, then the default \code{color_mapping} specified in the count +#' table will be used. Default \code{NULL}. +#' @param text_size Size of axis text. Default \code{10}. +#' @param show_x_labels If \code{TRUE}, the labels for the mutation types +#' on the x-axis will be shown. Default \code{TRUE}. +#' @param show_y_labels If \code{TRUE}, the y-axis ticks and labels will be +#' shown. Default \code{TRUE}. +#' @param same_scale If \code{TRUE}, the scale of the probability for each +#' signature will be the same. If \code{FALSE}, then the scale of the y-axis +#' will be adjusted for each signature. Default \code{TRUE}. +#' @param y_max Vector of maximum y-axis limits for each signature. One value +#' may also be provided to specify a constant y-axis limit for all signatures. +#' Vector length must be 1 or equivalent to the number of signatures. Default +#' \code{NULL}. +#' @param annotation Vector of annotations to be displayed in the top right +#' corner of each signature. Vector length must be equivalent to the number of +#' signatures. Default \code{NULL}. +#' @param percent If \code{TRUE}, the y-axis will be represented in percent +#' format instead of mutation counts. Default \code{TRUE}. +#' +#' @return Generates a ggplot or plotly object +#' @export +benchmark_plot_signatures <- function(full_benchmark, method_id, prediction, + plotly = FALSE, color_variable = NULL, color_mapping = NULL, text_size = 10, + show_x_labels = TRUE, show_y_labels = TRUE, same_scale = TRUE, y_max = NULL, + annotation = NULL, percent = TRUE){ + + # check that full_benchmark is a full_benchmark class object + if (class(full_benchmark)[1] != "full_benchmark"){ + stop(deparse(substitute(full_benchmark)), " is not a 'full_benchmark' object.") + } + + # check if this method_id exists + if (!(method_id %in% names(indv_benchmarks(full_benchmark)))){ + stop("'method_id' ", deparse(substitute(method_id)), " not found in ", deparse(substitute(full_benchmark))) + } + + # check if prediction is one of Initial, Intermediate, or Final + valid <- c("Initial", "initial", "Init", "init", "Intermediate", "intermediate", + "Inter", "inter", "Final", "final", "Fin", "fin") + if (!(prediction %in% valid)){ + stop("'prediction' must be one of: 'Initial', 'Intermediate', or 'Final'.") + } + + # access individual benchmark object + indv_benchmark <- benchmark_get_entry(full_benchmark, method_id) + + # access musica object for desired prediction + result <- benchmark_get_prediction(indv_benchmark, prediction) + + signatures_plot <- plot_signatures(result, plotly = plotly, color_variable = color_variable, color_mapping = color_mapping, + text_size = text_size, show_x_labels = show_x_labels, show_y_labels = show_y_labels, + same_scale = same_scale, y_max = y_max, annotation = annotation, percent = percent) + + return(signatures_plot) + +} + + +#' Get benchmark comparison table +#' +#' After a prediction has been benchmarked with the \link{benchmark} function, +#' this function can be used to generate the comparison table between true and +#' predicted signatures from any step in the benchmarking process. +#' +#' @param full_benchmark The \code{\linkS4class{full_benchmark}} object for the +#' benchmarking analysis +#' @param method_id The identifier for the \code{\linkS4class{single_benchmark}} +#' object containing the comparison of interest +#' @param prediction \code{Initial} for the comparison before any benchmarking +#' adjustments have been made, \code{Intermediate} for the comparison after +#' duplicates have been adjusted but before composites are adjusted, or +#' \code{Final} for the comparison at the end of the benchmarking adjustments. +#' +#' @return A data.frame containing the comparison between true and predicted +#' signatures +#' @export +benchmark_compare_results <- function(full_benchmark, method_id, prediction){ + + # check that full_benchmark is a full_benchmark class object + if (class(full_benchmark)[1] != "full_benchmark"){ + stop(deparse(substitute(full_benchmark)), " is not a 'full_benchmark' object.") + } + + # check if this method_id exists + if (!(method_id %in% names(indv_benchmarks(full_benchmark)))){ + stop("'method_id' ", deparse(substitute(method_id)), " not found in ", deparse(substitute(full_benchmark))) + } + + # check if prediction is one of Initial, Intermediate, or Final + valid <- c("Initial", "initial", "Init", "init", "Intermediate", "intermediate", + "Inter", "inter", "Final", "final", "Fin", "fin") + if (!(prediction %in% valid)){ + stop("'prediction' must be one of: 'Initial', 'Intermediate', or 'Final'.") + } + + # access individual benchmark object + indv_benchmark <- benchmark_get_entry(full_benchmark, method_id) + + # access musica object for desired prediction + if (prediction == "Initial"){ + comparison <- initial_comparison(indv_benchmark) + } + else if (prediction == "Intermediate"){ + comparison <- intermediate_comparison(indv_benchmark) + } + else if (prediction == "Final"){ + comparison <- final_comparison(indv_benchmark) + } + + return(comparison) + +} + + +#' Plot a signature comparison from a benchmarking analysis +#' +#' After a prediction has been benchmarked with the \link{benchmark} function, +#' the comparison between the true and predicted signatures at any step of the +#' benchmarking process can be plotted. +#' +#' @param full_benchmark The \code{\linkS4class{full_benchmark}} object for the +#' benchmarking analysis +#' @param method_id The identifier for the \code{\linkS4class{single_benchmark}} +#' object containing the comparison of interest +#' @param prediction \code{Initial} for the comparison before any benchmarking +#' adjustments have been made, \code{Intermediate} for the comparison after +#' duplicates have been adjusted but before composites are adjusted, or +#' \code{Final} for the comparison at the end of the benchmarking adjustments. +#' @param decimals Specifies rounding for similarity metric displayed. Default +#' \code{2}. +#' @param same_scale If \code{TRUE}, the scale of the probability for each +#' comparison will be the same. If \code{FALSE}, then the scale of the y-axis +#' will be adjusted for each comparison. Default \code{TRUE}. +#' +#' @return Returns the comparison plot +#' @export +benchmark_plot_comparison <- function(full_benchmark, method_id, prediction, + decimals = 2, same_scale = FALSE){ + + # check that full_benchmark is a full_benchmark class object + if (class(full_benchmark)[1] != "full_benchmark"){ + stop(deparse(substitute(full_benchmark)), " is not a 'full_benchmark' object.") + } + + # check if this method_id exists + if (!(method_id %in% names(indv_benchmarks(full_benchmark)))){ + stop("'method_id' ", deparse(substitute(method_id)), " not found in ", deparse(substitute(full_benchmark))) + } + + # check if prediction is one of Initial, Intermediate, or Final + valid <- c("Initial", "initial", "Init", "init", "Intermediate", "intermediate", + "Inter", "inter", "Final", "final", "Fin", "fin") + if (!(prediction %in% valid)){ + stop("'prediction' must be one of: 'Initial', 'Intermediate', or 'Final'.") + } + + comparison <- benchmark_compare_results(full_benchmark, method_id, prediction) + indv_benchmark <- benchmark_get_entry(full_benchmark, method_id) + + # access musica result object for desired prediction + if (prediction == "Initial"){ + pred_res <- initial_pred(indv_benchmark) + res_name <- c("Initial Predicted Signatures") + } + else if (prediction == "Intermediate"){ + pred_res <- intermediate_pred(indv_benchmark) + res_name <- c("Intermediate Predicted Signatures") + } + else if (prediction == "Final"){ + pred_res <- final_pred(indv_benchmark) + res_name <- c("Final Predicted Signatures") + } + + + comparison_plot <- plot_comparison(comparison, pred_res, ground_truth(full_benchmark), + result_name = res_name, other_result_name = "True Signatures", + decimals = decimals, same_scale = same_scale) + + #return(comparison_plot) + +} + + +#' Plot exposure comparison from a benchmarking analysis +#' +#' After a prediction has been benchmarked with the \link{benchmark} function, +#' the comparison between the true and predicted exposures at any stage of the +#' benchmarking process can be plotted. +#' +#' @param full_benchmark The \code{\linkS4class{full_benchmark}} object for the +#' benchmarking analysis +#' @param method_id The identifier for the \code{\linkS4class{single_benchmark}} +#' object of interest +#' @param prediction \code{Initial} for the exposures before any benchmarking +#' adjustments have been made, \code{Intermediate} for the exposures after +#' duplicates have been adjusted but before composites are adjusted, or +#' \code{Final} for the exposures at the end of the benchmarking adjustments. +#' +#' @return Generates a ggplot object +#' @export +benchmark_plot_exposures <- function(full_benchmark, method_id, prediction){ + + Predicted <- NULL + True <- NULL + + # check that full_benchmark is a full_benchmark class object + if (class(full_benchmark)[1] != "full_benchmark"){ + stop(deparse(substitute(full_benchmark)), " is not a 'full_benchmark' object.") + } + + # check if this method_id exists + if (!(method_id %in% names(indv_benchmarks(full_benchmark)))){ + stop("'method_id' ", deparse(substitute(method_id)), " not found in ", deparse(substitute(full_benchmark))) + } + + # check if prediction is one of Initial, Intermediate, or Final + valid <- c("Initial", "initial", "Init", "init", "Intermediate", "intermediate", + "Inter", "inter", "Final", "final", "Fin", "fin") + if (!(prediction %in% valid)){ + stop("'prediction' must be one of: 'Initial', 'Intermediate', or 'Final'.") + } + + # access individual benchmark object + indv_benchmark <- benchmark_get_entry(full_benchmark, method_id) + + # access comparison for desired prediction + comparison <- benchmark_compare_results(full_benchmark, method_id, prediction) + + truth <- ground_truth(full_benchmark) + + # access musica object for desired prediction + prediction <- benchmark_get_prediction(indv_benchmark, prediction) + + predicted <- c() + true <- c() + sig <- c() + index <- 1 + for (true_sig in comparison$y_sig_name){ + predicted_sig <- + comparison[index,4] + predicted <- c(predicted, exposures(prediction)[predicted_sig,]) + true <- c(true, exposures(truth)[,true_sig]) + sig <- c(sig, rep(true_sig, dim(exposures(truth))[1])) + index <- index + 1 + } + + plot_df <- data.frame(Predicted = predicted, True = true, Sig = sig) + + compare_exposure_plot <- ggplot(plot_df, aes(x = Predicted, y = True)) + + geom_point(size = 3) + + facet_wrap(~Sig, scales = "free") + + scale_x_continuous(labels = scales::comma) + + scale_y_continuous(labels = scales::comma) + + theme_classic() + + labs(title = "Predicted vs True Activity Levels for Matched Signatures", + x="Predicted Activity", y = "True Activity") + + theme(text = element_text(size=15), axis.text = element_text(size = 15)) + + geom_abline() + + geom_smooth(method = "lm") + + theme(legend.title=element_blank()) + + return(compare_exposure_plot) + +} + + +#' Plot effect of duplicate correction +#' +#' After a prediction has been benchmarked with the \link{benchmark} function, +#' the true and predicted exposures can be plotted both before and after the +#' duplicate signature adjustment. The effect of the adjustment can then be +#' observed. +#' +#' @param full_benchmark The \code{\linkS4class{full_benchmark}} object for the +#' benchmarking analysis +#' @param method_id The identifier for the \code{\linkS4class{single_benchmark}} +#' object of interest +#' +#' @return A list of ggplot objects +#' @export +benchmark_plot_duplicate_exposures <- function(full_benchmark, method_id){ + + Predicted <- NULL + True <- NULL + + # check that full_benchmark is a full_benchmark class object + if (class(full_benchmark)[1] != "full_benchmark"){ + stop(deparse(substitute(full_benchmark)), " is not a 'full_benchmark' object.") + } + + # check if this method_id exists + if (!(method_id %in% names(indv_benchmarks(full_benchmark)))){ + stop("'method_id' ", deparse(substitute(method_id)), " not found in ", deparse(substitute(full_benchmark))) + } + + indv_benchmark <- benchmark_get_entry(full_benchmark, method_id) + + result_true <- ground_truth(full_benchmark) + + before <- initial_pred(indv_benchmark) + after <- intermediate_pred(indv_benchmark) + + comparison <- initial_comparison(indv_benchmark) + + num_samples <- dim(exposures(result_true))[1] + + freq <- table(comparison$y_sig_name) + duplicated_signatures <- names(freq[freq > 1]) + + final_figures <- list() + + for (duplicated_sig in duplicated_signatures){ + + before_exposures <- NULL + + sigs_to_merge <- comparison[comparison$y_sig_name == duplicated_sig & !grepl("like", comparison$x_sig_name), 4] + + for (signature in sigs_to_merge){ + + #tmp_exposures <- exposures(after)[signature,] + tmp_exposures <- as.data.frame(as.vector(exposures(before)[signature,])) + tmp_exposures_all <- cbind(tmp_exposures, + as.data.frame(as.numeric(exposures(result_true)[,duplicated_sig]))) + colnames(tmp_exposures_all) <- c("Predicted", "True") + tmp_exposures_all$Source <- signature + + before_exposures <- rbind(before_exposures, tmp_exposures_all) + } + + ## DOESNT WORK WHEN NOT ACTUALLY DUPICATE + rownames(before_exposures) <- c(1: (num_samples * length(sigs_to_merge))) + + before_plot <- ggplot(before_exposures, aes(x = Predicted, y = True)) + + geom_point(size = 3) + + facet_wrap(~Source, scales = "fixed") + + scale_x_continuous(labels = scales::comma, limits = c(0, max(max(before_exposures$Predicted), max(before_exposures$True)))) + + scale_y_continuous(labels = scales::comma,limits = c(0, max(max(before_exposures$Predicted), max(before_exposures$True)))) + + theme_classic() + + labs(title=paste("Exposures of duplicated signature, ", duplicated_sig, sep = ""), + subtitle = "Before merging", + x="Predicted Activity", y = "True Activity") + + theme(text = element_text(size=15), axis.text = element_text(size = 15)) + + geom_abline() + + geom_smooth(method = "lm") + + theme(legend.title=element_blank()) + + #print(before_plot) + + new_sig_name <- "Merged Signature (" + for (sig in sigs_to_merge){ + if (sig == sigs_to_merge[1]){ + new_sig_name <- paste(new_sig_name, sig, sep = "") + } + else{ + new_sig_name <- paste(new_sig_name, ".", sig, sep = "") + } + } + new_sig_name <- paste(new_sig_name, ")", sep = "") + + # exposure plot after merging + + tmp_exposures_list2 <- as.data.frame(as.vector(exposures(after)[new_sig_name,])) + after_exposures <- cbind(tmp_exposures_list2, as.data.frame(as.numeric(exposures(result_true)[,duplicated_sig]))) + colnames(after_exposures) <- c("Predicted", "True") + + rownames(after_exposures) <- c(1:num_samples) + + after_plot <- ggplot(after_exposures, aes(x = Predicted, y = True)) + + geom_point(size = 3) + + scale_x_continuous(labels = scales::comma, limits = c(0, max(max(after_exposures$Predicted), max(after_exposures$True)))) + + scale_y_continuous(labels = scales::comma, limits = c(0, max(max(after_exposures$Predicted), max(after_exposures$True)))) + + theme_classic() + + labs(title=paste("Exposures of duplicated signature, ", duplicated_sig, sep = ""), + subtitle = "After merging", + x="Predicted Activity (Merged)", y = "True Activity") + + theme(text = element_text(size=15), axis.text = element_text(size = 15)) + + geom_abline() + + geom_smooth(method = "lm") + + theme(legend.title=element_blank()) + + #print(after_plot) + + figure <- ggpubr::ggarrange(before_plot, after_plot, ncol = 2, nrow = 1) + + final_figures <- append(final_figures, list(figure)) + + } + + return(final_figures) + +} + + +#' Plot effect of composite correction +#' +#' After a prediction has been benchmarked with the \link{benchmark} function, +#' the true and predicted exposures can be plotted both before and after the +#' composite signature adjustment. The effect of the adjustment can then be +#' observed. +#' +#' @param full_benchmark The \code{\linkS4class{full_benchmark}} object for the +#' benchmarking analysis +#' @param method_id The identifier for the \code{\linkS4class{single_benchmark}} +#' object of interest +#' +#' @return A list of ggplot objects +#' @export +benchmark_plot_composite_exposures <- function(full_benchmark, method_id){ + + Predicted <- NULL + True <- NULL + + # check that full_benchmark is a full_benchmark class object + if (class(full_benchmark)[1] != "full_benchmark"){ + stop(deparse(substitute(full_benchmark)), " is not a 'full_benchmark' object.") + } + + # check if this method_id exists + if (!(method_id %in% names(indv_benchmarks(full_benchmark)))){ + stop("'method_id' ", deparse(substitute(method_id)), " not found in ", deparse(substitute(full_benchmark))) + } + + indv_benchmark <- benchmark_get_entry(full_benchmark, method_id) + + result_true <- ground_truth(full_benchmark) + + before <- intermediate_pred(indv_benchmark) + after <- final_pred(indv_benchmark) + + comparison <- intermediate_comparison(indv_benchmark) + + num_samples <- dim(exposures(result_true))[1] + + freq <- table(comparison$x_sig_name) + composite_signatures <- names(freq[freq > 1]) + + count <- 1 + + final_figures <- list() + + for (composite_sig in composite_signatures){ + + before_exposures <- NULL + + # list of SBS signatures that are the components of this composite signature + sig_components <- comparison[comparison$x_sig_name == composite_sig, 5] + + for (component_index in 1:length(sig_components)){ + + # for plotting + tmp_exp <- exposures(before)[composite_sig,] + tmp_exp_list <- as.data.frame(as.vector(tmp_exp)) + tmp_exp_all <- cbind(tmp_exp_list, + as.data.frame(as.numeric(exposures(result_true)[,sig_components[component_index]]))) + colnames(tmp_exp_all) <- c("Predicted", "True") + tmp_exp_all$Source <- sig_components[component_index] + + before_exposures <- rbind(before_exposures, tmp_exp_all) + + } + + rownames(before_exposures) <- c(1: (num_samples * length(sig_components))) + + before_plot <- ggplot(before_exposures, aes(x = Predicted, y = True)) + + geom_point(size = 3) + + facet_wrap(~Source, scales = "fixed") + + scale_x_continuous(labels = scales::comma, limits = c(0, max(max(before_exposures$Predicted), max(before_exposures$True)))) + + scale_y_continuous(labels = scales::comma, limits = c(0, max(max(before_exposures$Predicted), max(before_exposures$True)))) + + theme_classic() + + labs(title=paste("Exposures of composite signature, ", composite_sig, sep = ""), + subtitle = "Before decomposing", + x="Predicted Activity", y = "True Activity") + + theme(text = element_text(size=15), axis.text = element_text(size = 15)) + + geom_abline() + + geom_smooth(method = "lm") + + theme(legend.title=element_blank()) + + #print(before_plot) + + colnames <- NULL + for (component in sig_components){ + #colnames <- c(colnames, paste("Signature", component, "_like", sep = "")) + colnames <- c(colnames, paste(component, "_like", sep = "")) + + } + colnames <- colnames[colnames %in% rownames(exposures(after))] + sig_components <- gsub("_like", "", colnames) + + after_exposures <- NULL + + for (component_index in 1:length(sig_components)){ + + # for plotting + tmp_exp <- exposures(after)[colnames[component_index],] + tmp_exp_list <- as.data.frame(as.vector(tmp_exp)) + tmp_exp_all <- cbind(tmp_exp_list, + as.data.frame(as.numeric(exposures(result_true)[,sig_components[component_index]]))) + colnames(tmp_exp_all) <- c("Predicted", "True") + tmp_exp_all$Source <- sig_components[component_index] + + after_exposures <- rbind(after_exposures, tmp_exp_all) + + } + + after_plot <- ggplot(after_exposures, aes(x = Predicted, y = True)) + + geom_point(size = 3) + + facet_wrap(~Source, scales = "fixed") + + scale_x_continuous(labels = scales::comma, limits = c(0, max(max(after_exposures$Predicted), max(after_exposures$True)))) + + scale_y_continuous(labels = scales::comma, limits = c(0, max(max(after_exposures$Predicted), max(after_exposures$True)))) + + theme_classic() + + labs(title=paste("Exposures of composite signature, ", composite_sig, sep = ""), + subtitle = "After decomposing", + x="Predicted Activity", y = "True Activity") + + theme(text = element_text(size=15), axis.text = element_text(size = 15)) + + geom_abline() + + geom_smooth(method = "lm") + + theme(legend.title=element_blank()) + + #print(after_plot) + + figure <- ggpubr::ggarrange(before_plot, after_plot, ncol = 2, nrow = 1) + + final_figures <- append(final_figures, list(figure)) + + } + + return(final_figures) + +} + + +# Function for addressing duplicates +.correct_duplicates <- function(result, compare_cosmic_result, result_true){ + + Sum <- NULL + Predicted <- NULL + True <- NULL + + exposures <- exposures(result) + signatures <- signatures(result) + num_samples <- dim(exposures)[2] + + freq <- table(compare_cosmic_result$y_sig_name) + duplicated_signatures <- names(freq[freq > 1]) + all_sigs_to_merge <- compare_cosmic_result[compare_cosmic_result$y_sig_name %in% duplicated_signatures & !grepl("like", compare_cosmic_result$x_sig_name), 4] + + corrected_sigs <- NULL + corrected_exposures <- NULL + + for (duplicated_sig in duplicated_signatures){ + + duplicate_exposures_all <- NULL + + sigs_to_merge <- compare_cosmic_result[compare_cosmic_result$y_sig_name == duplicated_sig & !grepl("like", compare_cosmic_result$x_sig_name), 4] + + # add when removed the below + full_sig_names_to_merge <- sigs_to_merge + #sigs_to_merge <- str_remove(sigs_to_merge, "Signature") + + # convert sig numbers to full names + #full_sig_names_to_merge <- NULL + #for (sig_number in sigs_to_merge){ + # sig_name <- paste("Signature", sig_number, sep = "") + # full_sig_names_to_merge <- c(full_sig_names_to_merge, sig_name) + #} + + sum <- 0 + for (signature in full_sig_names_to_merge){ + for (sample_index in 1:num_samples){ + temp <- signatures[, signature] * exposures[signature, sample_index] + sum <- sum + temp + } + + tmp_exposures <- exposures[signature,] + tmp_exposures_list <- as.data.frame(as.vector(tmp_exposures)) + tmp_exposures_all <- cbind(tmp_exposures_list, + as.data.frame(as.numeric(exposures(result_true)[,duplicated_sig]))) + colnames(tmp_exposures_all) <- c("Predicted", "True") + tmp_exposures_all$Source <- signature + + duplicate_exposures_all <- rbind(duplicate_exposures_all, tmp_exposures_all) + } + + ## DOESNT WORK WHEN NOT ACTUALLY DUPICATE + rownames(duplicate_exposures_all) <- c(1: (num_samples * length(sigs_to_merge))) + + plot <- ggplot(duplicate_exposures_all, aes(x = Predicted, y = True)) + + geom_point(size = 3) + + facet_wrap(~Source, scales = "fixed") + + scale_x_continuous(labels = scales::comma, limits = c(0, max(max(duplicate_exposures_all$Predicted), max(duplicate_exposures_all$True)))) + + scale_y_continuous(labels = scales::comma,limits = c(0, max(max(duplicate_exposures_all$Predicted), max(duplicate_exposures_all$True)))) + + theme_classic() + + labs(title=paste("Exposures of duplicated signature, ", duplicated_sig, sep = ""), + subtitle = "Before merging", + x="Predicted Activity", y = "True Activity") + + theme(text = element_text(size=15), axis.text = element_text(size = 15)) + + geom_abline() + + geom_smooth(method = "lm") + + theme(legend.title=element_blank()) + + #print(plot) + + + total_sum <- sum(sum) + + normalized <- sum / total_sum + + merged_signature <- as.matrix(normalized) + + new_sig_name <- "Merged Signature (" + for (sig in sigs_to_merge){ + if (sig == sigs_to_merge[1]){ + new_sig_name <- paste(new_sig_name, sig, sep = "") + } + else{ + new_sig_name <- paste(new_sig_name, ".", sig, sep = "") + } + } + new_sig_name <- paste(new_sig_name, ")", sep = "") + + colnames(merged_signature) <- new_sig_name + + corrected_sigs <- cbind(corrected_sigs, merged_signature) + + # update exposures + + merged_exposures <- 0 + for (signature in full_sig_names_to_merge){ + merged_exposures <- merged_exposures + exposures[signature,] + } + + merged_exposures <- t(as.matrix(merged_exposures)) + + rownames(merged_exposures) <- new_sig_name + + corrected_exposures <- rbind(corrected_exposures, merged_exposures) + + # exposure plot after merging + + tmp_exposures_list <- as.data.frame(as.vector(merged_exposures)) + tmp_exposures_all <- cbind(tmp_exposures_list, as.data.frame(as.numeric(exposures(result_true)[,duplicated_sig]))) + colnames(tmp_exposures_all) <- c("Predicted", "True") + + rownames(tmp_exposures_all) <- c(1:num_samples) + + plot <- ggplot(tmp_exposures_all, aes(x = Predicted, y = True)) + + geom_point(size = 3) + + scale_x_continuous(labels = scales::comma, limits = c(0, max(max(tmp_exposures_all$Predicted), max(tmp_exposures_all$True)))) + + scale_y_continuous(labels = scales::comma, limits = c(0, max(max(tmp_exposures_all$Predicted), max(tmp_exposures_all$True)))) + + theme_classic() + + labs(title=paste("Exposures of duplicated signature, ", duplicated_sig, sep = ""), + subtitle = "After merging", + x="Predicted Activity (Merged)", y = "True Activity") + + theme(text = element_text(size=15), axis.text = element_text(size = 15)) + + geom_abline() + + geom_smooth(method = "lm") + + theme(legend.title=element_blank()) + + #print(plot) + + } + + if (is.null(corrected_sigs) == FALSE){ + + unchanged_signatures <- as.data.frame(signatures)[, !colnames(signatures) %in% all_sigs_to_merge] + corrected_sigs <- cbind(unchanged_signatures, corrected_sigs) + + unchanged_exposures <- as.data.frame(exposures)[!rownames(exposures) %in% all_sigs_to_merge,] + corrected_exposures <- rbind(unchanged_exposures, corrected_exposures) + + } + + else{ + + unchanged_signatures <- as.data.frame(signatures)[, !colnames(signatures) %in% all_sigs_to_merge] + corrected_sigs <- unchanged_signatures + + unchanged_exposures <- as.data.frame(exposures)[!rownames(exposures) %in% all_sigs_to_merge,] + corrected_exposures <- unchanged_exposures + + } + + duplicates_corrected <- result + signatures(duplicates_corrected) <- as.matrix(corrected_sigs) + exposures(duplicates_corrected) <- as.matrix(corrected_exposures) + + return(duplicates_corrected) + +} + +# Function for addressing composites +.correct_composites <- function(result, compare_cosmic_result, musica, result_true){ + + Sum <- NULL + Predicted <- NULL + True <- NULL + + exposures <- exposures(result) + signatures <- signatures(result) + num_samples <- dim(exposures)[2] + + freq <- table(compare_cosmic_result$x_sig_name) + composite_signatures <- names(freq[freq > 1]) + #all_sigs_to_merge <- compare_cosmic_result[compare_cosmic_result$y_sig_name == duplicated_signatures, 4] + + corrected_sigs <- NULL + corrected_exposures <- NULL + + for (composite_sig in composite_signatures){ + + composite_exp_all <- NULL + + # the full signature to separate + sig_to_separate <- signatures(result)[, composite_sig] + exposures_to_separate <- exposures(result)[composite_sig,] + + # list of SBS signatures that are the componenets of this composite signature + sig_components <- compare_cosmic_result[compare_cosmic_result$x_sig_name == composite_sig, 5] + + # number of components in this composite sig + num_components <- length(sig_components) + + # data frame of full signatures of components + component_signatures <- as.data.frame(signatures(result_true)[,sig_components]) # GENERALIZE + + separated_sigs <- matrix(ncol = 0, nrow = 96) + separated_exposures <- matrix(ncol = num_samples, nrow = 0) + + # perform nnls + nnls_result <- lsei::nnls(as.matrix(component_signatures), as.vector(sig_to_separate)) + + for (component_index in 1:num_components){ + + new_signature <- component_signatures[ , component_index] * nnls_result$x[component_index] + if (sum(new_signature) != 0){ + separated_sigs <- cbind(separated_sigs, new_signature) + } + else{ + num_components <- num_components - 1 + sig_components <- sig_components[-component_index] + } + + } + + # renormalize + separated_sigs <- prop.table(separated_sigs,2) + + # update exposures + + num_samples <- dim(musica@count_tables$SBS96@count_table)[2] + + nnls_exposure_results <- data.frame(factor1 = numeric(), factor2 = numeric()) + + for (sample_index in 1:num_samples){ + + #A <- as.matrix(signatures(result_true)) + A <- as.matrix(separated_sigs) + #b <- as.vector(sig_to_separate * exposures_to_separate[sample_index]) + b <- as.vector(musica@count_tables$SBS96@count_table[,sample_index]) + + nnls_exposure_result <- lsei::nnls(A, b) + + nnls_exposure_results[sample_index,1] <- nnls_exposure_result$x[1] + nnls_exposure_results[sample_index,2] <- nnls_exposure_result$x[2] + + + } + + #sig_component_result <- predict_exposure(musica = musica, g = "hg38", table_name = "SBS96", + #signature_res = cosmic_v2_sigs, + #signatures_to_use = sig_components, algorithm = "lda") + + #tmp_exposures <- exposures(sig_component_result) + #tmp_exposures <- as.data.frame(t(tmp_exposures)) + + #tmp_exposures$Sum <- rowSums(tmp_exposures) + + # added + #nnls_exposure_results <- nnls_exposure_results[,c(6,7)] + #nnls_exposure_results <- nnls_exposure_results[,c(2,1)] + + nnls_exposure_results$Sum <- rowSums(nnls_exposure_results) + + for (component_index in 1:num_components){ + + # calculate the percent of the sum that each of the 96 channels contributes (reword this i know it doesnt make sense) + #tmp_exposures <- transform(tmp_exposures, percent1 = tmp_exposures[component_index] / Sum) + nnls_exposure_results <- transform(nnls_exposure_results, percent1 = nnls_exposure_results[component_index] / Sum) + + #separated_exposures <- rbind(separated_exposures, + #exposures_to_separate * tmp_exposures[, component_index + num_components + 1]) + separated_exposures <- rbind(separated_exposures, + exposures_to_separate * nnls_exposure_results[, component_index + num_components + 1]) + + + # for plotting + tmp_exp <- exposures[composite_sig,] + tmp_exp_list <- as.data.frame(as.vector(tmp_exp)) + tmp_exp_all <- cbind(tmp_exp_list, + as.data.frame(as.numeric(exposures(result_true)[,sig_components[component_index]]))) # GENERALIZE (was Signature not SBS) + colnames(tmp_exp_all) <- c("Predicted", "True") + tmp_exp_all$Source <- sig_components[component_index] + + composite_exp_all <- rbind(composite_exp_all, tmp_exp_all) + + } + + rownames(composite_exp_all) <- c(1: (num_samples * num_components)) + + plot <- ggplot(composite_exp_all, aes(x = Predicted, y = True)) + + geom_point(size = 3) + + facet_wrap(~Source, scales = "fixed") + + scale_x_continuous(labels = scales::comma, limits = c(0, max(max(composite_exp_all$Predicted), max(composite_exp_all$True)))) + + scale_y_continuous(labels = scales::comma, limits = c(0, max(max(composite_exp_all$Predicted), max(composite_exp_all$True)))) + + theme_classic() + + labs(title=paste("Exposures of composite signature, ", composite_sig, sep = ""), + subtitle = "Before decomposing", + x="Predicted Activity", y = "True Activity") + + theme(text = element_text(size=15), axis.text = element_text(size = 15)) + + geom_abline() + + geom_smooth(method = "lm") + + theme(legend.title=element_blank()) + + #print(plot) + + colnames <- NULL + for (component in sig_components){ + #colnames <- c(colnames, paste("Signature", component, "_like", sep = "")) + colnames <- c(colnames, paste(component, "_like", sep = "")) + + } + + colnames(separated_sigs) <- colnames + rownames(separated_exposures) <- colnames + + corrected_sigs <- cbind(corrected_sigs, separated_sigs) + corrected_exposures <- rbind(corrected_exposures, separated_exposures) + + # plot exposures after separation + + plot_exposures <- composite_exp_all + plot_exposures$Predicted <- c(t(separated_exposures)) + + plot <- ggplot(plot_exposures, aes(x = Predicted, y = True)) + + geom_point(size = 3) + + facet_wrap(~Source, scales = "fixed") + + scale_x_continuous(labels = scales::comma, limits = c(0, max(max(plot_exposures$Predicted), max(plot_exposures$True)))) + + scale_y_continuous(labels = scales::comma, limits = c(0, max(max(plot_exposures$Predicted), max(plot_exposures$True)))) + + theme_classic() + + labs(title=paste("Exposures of composite signature, ", composite_sig, sep = ""), + subtitle = "After decomposing", + x="Predicted Activity", y = "True Activity") + + theme(text = element_text(size=15), axis.text = element_text(size = 15)) + + geom_abline() + + geom_smooth(method = "lm") + + theme(legend.title=element_blank()) + + #print(plot) + + + } + + if (is.null(corrected_sigs) == FALSE){ + + unchanged_signatures <- as.data.frame(signatures)[, !colnames(signatures) %in% composite_signatures] + corrected_sigs <- cbind(unchanged_signatures, corrected_sigs) + + unchanged_exposures <- as.data.frame(exposures)[!rownames(exposures) %in% composite_signatures,] + corrected_exposures <- rbind(unchanged_exposures, corrected_exposures) + + } + + else{ + + unchanged_signatures <- as.data.frame(signatures)[, !colnames(signatures) %in% composite_signatures] + corrected_sigs <- unchanged_signatures + + unchanged_exposures <- as.data.frame(exposures)[!rownames(exposures) %in% composite_signatures,] + corrected_exposures <- unchanged_exposures + + } + + composites_corrected <- result + signatures(composites_corrected) <- as.matrix(corrected_sigs) + exposures(composites_corrected) <- as.matrix(corrected_exposures) + + return(composites_corrected) + +} + +# Function for adjusting comparison +.benchmark_comp_adj <- function(comparison, adjustment_threshold){ + + low_threshold_comp <- comparison[comparison$cosine <= adjustment_threshold,] + high_threshold_comp <- comparison[comparison$cosine > adjustment_threshold,] + + indexes_to_keep <- c() + if (dim(low_threshold_comp)[1] > 0){ + for (index in 1:dim(low_threshold_comp)[1]){ + if (low_threshold_comp[index,4] %in% high_threshold_comp$x_sig_name == FALSE){ + indexes_to_keep <- c(indexes_to_keep, index) + } + else{ + existing_cs <- high_threshold_comp[high_threshold_comp$x_sig_name == low_threshold_comp[index,4], 1][1] + diff <- abs(existing_cs - low_threshold_comp[index,1]) + if (diff < 0.05){ + indexes_to_keep <- c(indexes_to_keep, index) + } + } + } + + + comparison_adj <- rbind(high_threshold_comp, low_threshold_comp[indexes_to_keep,]) + + return(comparison_adj) + + } + + else{ + return(comparison) + } + +} + +# Function for claculating RE +.get_reconstruction_error <- function(result, count_table){ + + # extract exposures and signatures matrices + expos <- exposures(result) + sigs <- signatures(result) + + # convert count table to probabilities + count_table_probs <- prop.table(count_table, 2) + + # convert exposure matrix to probabilities + expos_probs <- prop.table(expos, 2) + + # multiply signature and exposure matrices + predicted_count_probs <- sigs %*% expos_probs + + # calculate sum of differences (reconstruction error) + reconstruction_error <- sum(abs(count_table_probs - predicted_count_probs)) + + # return reconstruction error + return(reconstruction_error) + +} + +# Function to generate single run summary +.generate_summary <- function(title, result_all, result_true, comparison_results, count_table, final_musica, final_comparison){ + + # missing + + true_sig_names <- colnames(signatures(result_true)) + num_true <- length(true_sig_names) + num_missing <- length(true_sig_names[!(true_sig_names %in% comparison_results$y_sig_name)]) + + # spurious + + predicted_sig_names <- colnames(signatures(result_all)) + num_predicted <- length(predicted_sig_names) + num_spurious <- length(predicted_sig_names[!(predicted_sig_names %in% comparison_results$x_sig_name)]) + + # duplicate + + num_duplicates <- 0 + + freq <- table(comparison_results$y_sig_name) + + if(length(names(freq[freq > 1])) != 0){ + duplicated_signatures <- names(freq[freq > 1]) + duplicated_signature_components <- comparison_results[comparison_results$y_sig_name %in% duplicated_signatures, 4] + + for (duplicated_sig in duplicated_signatures){ + sigs_to_merge <- comparison_results[comparison_results$y_sig_name == duplicated_sig + & !grepl("like", comparison_results$x_sig_name), 4] + + num_duplicates <- num_duplicates + length(sigs_to_merge) + } + } + + # composite + + freq <- table(comparison_results$x_sig_name) + composite_sigs <- names(freq[freq > 1]) + num_composites <- length(composite_sigs) + + # dupcomp + + dupcomp_count <- 0 + if (exists("duplicated_signature_components")){ + for (sig in composite_sigs){ + if (sig %in% duplicated_signature_components){ + dupcomp_count <- dupcomp_count + 1 + num_composites <- num_composites - 1 + num_duplicates <- num_duplicates - 1 + } + } + } + + # matched + + num_direct_matches <- num_predicted - num_composites - num_duplicates - num_spurious - dupcomp_count + + # reconstruction error + + initial_reconstruction_error <- .get_reconstruction_error(result_all, count_table) + initial_reconstruction_error <- round(initial_reconstruction_error, 3) + end_reconstruction_error <- .get_reconstruction_error(final_musica, count_table) + end_reconstruction_error <- round(end_reconstruction_error, 3) + + # stability + + # average cosine similarity + + cs <- final_comparison$cosine + avg_cs <- mean(cs) + avg_cs <- round(avg_cs, 3) + + # min cosine similarity + + min_cs <- min(cs) + min_cs <- round(min_cs, 3) + + # max cosine similarity + + max_cs <- max(cs) + max_cs <- round(max_cs, 3) + + # signatures found + + sigs_found <- final_comparison$y_sig_name + num_found <- length(sigs_found) + sigs_found <- paste(sigs_found, collapse = ", ") # added when removed below + #sigs_found_ordered <- paste("SBS", sort(as.numeric(str_remove(sigs_found, "Signature"))), sep = "") + #sigs_found_ordered <- paste(sigs_found_ordered, collapse = ", ") + + + # put in table + + summary <- data.frame(matrix(, nrow=13, ncol=1)) + + rownames(summary) <- c("Num Found", "Num Direct Matched", "Num Missing", "Num Spurious", "Num Duplicates", "Num Composites", "Num Dup/Comp", "Initial RE", "Final RE", "Mean CS", "Min CS", "Max CS", "Sigs Found") + + colnames(summary) <- title + + summary[1,1] <- num_found + summary[2,1] <- num_direct_matches + summary[3,1] <- num_missing + summary[4,1] <- num_spurious + summary[5,1] <- num_duplicates + summary[6,1] <- num_composites + summary[7,1] <- dupcomp_count + summary[8,1] <- initial_reconstruction_error + summary[9,1] <- end_reconstruction_error + summary[10,1] <- avg_cs + summary[11,1] <- min_cs + summary[12,1] <- max_cs + summary[13,1] <- sigs_found + + + return(summary) + +} + +# Function to make sig view summary +.signature_view_summary <- function(benchmark){ + + indv_benchmarks <- indv_benchmarks(benchmark) + truth <- ground_truth(benchmark) + + final_comparison_list <- list() + method_list <- c() + + for (index in 1:length(indv_benchmarks)){ + + final_comparison_list[[index]] <- final_comparison(indv_benchmarks[[index]]) + method_list <- c(method_list, method_id(indv_benchmarks[[index]])) + + } + + summary_complete <- data.frame(matrix(, nrow=6, ncol=1)) + + sigs_found <- NULL + for (final_comparison in final_comparison_list){ + sigs_found <- c(sigs_found, final_comparison$y_sig_name) + } + + sigs_found <- unique(sigs_found) + sigs_found_ordered <- sigs_found + #sigs_found_ordered <- paste("Signature", sort(as.numeric(str_remove(sigs_found, "Signature"))), sep = "") + + all_sigs <- unique(c(sigs_found_ordered, colnames(signatures(truth)))) + all_sigs_ordered <- all_sigs + #all_sigs_ordered <- paste("Signature", sort(as.numeric(str_remove(all_sigs, "Signature"))), sep = "") + + for (sig in all_sigs_ordered){ + + summary <- data.frame(matrix(, nrow=6, ncol=1)) + rownames(summary) <- c("Times Found", "Times Missed", "Mean CS", "Min CS", "Max CS", "Methods Found") + colnames(summary) <- sig + + # times found + + count_found <- 0 + index_found <- NULL + index <- 1 + for (final_comparison in final_comparison_list){ + if (sig %in% final_comparison$y_sig_name == TRUE){ + count_found <- count_found + 1 + index_found <- c(index_found, index) + } + index <- index + 1 + } + + # times missed + + count_missed <- 0 + for (final_comparison in final_comparison_list){ + if (sig %in% final_comparison$y_sig_name == FALSE){ + count_missed <- count_missed + 1 + } + } + + # mean CS + + cs <- NULL + for (final_comparison in final_comparison_list){ + if (sig %in% final_comparison$y_sig_name == TRUE){ + cs <- c(cs, final_comparison[final_comparison$y_sig_name == sig, 1]) + } + } + + mean_cs <- mean(cs) + mean_cs <- round(mean_cs, 3) + if (is.null(cs)){ + mean_cs <- NA + } + + # min cs + + min_cs <- min(cs) + min_cs <- round(min_cs, 3) + if (is.null(cs)){ + min_cs <- NA + } + + # max cs + + max_cs <- max(cs) + max_cs <- round(max_cs, 3) + if (is.null(cs)){ + max_cs <- NA + } + + # methods + + methods <- method_list[c(index_found)] + methods <- paste(methods, collapse = ", ") + + summary[1,1] <- count_found + summary[2,1] <- count_missed + summary[3,1] <- mean_cs + summary[4,1] <- min_cs + summary[5,1] <- max_cs + summary[6,1] <- methods + + summary_complete <- cbind(summary_complete, summary) + + } + + summary_complete <- summary_complete[,-1] + summary_complete[sapply(summary_complete, is.infinite)] <- NA + + #summary_complete <- t(summary_complete) + + #print(summary_complete) + return(summary_complete) + +} + +# Function for updating full benchmark object +.update_benchmark <- function(full_benchmark, indv_benchmark, single_summary){ + + # update summary + if (dim(full_benchmark@method_view_summary)[1] == 0 ){ + full_benchmark@method_view_summary <- single_summary + }else{ + full_benchmark@method_view_summary <- cbind(full_benchmark@method_view_summary, single_summary) + } + + # update single benchmark list + indv_benchmarks(full_benchmark) <- append(indv_benchmarks(full_benchmark), list(indv_benchmark)) + names(indv_benchmarks(full_benchmark))[length(indv_benchmarks(full_benchmark))] <- method_id(indv_benchmark) + + # update sig view summary + sig_view_summary(full_benchmark) <- as.matrix(.signature_view_summary(full_benchmark)) + + return(full_benchmark) + +} + + diff --git a/R/compare_results.R b/R/compare_results.R index b3348b93..0c55948a 100644 --- a/R/compare_results.R +++ b/R/compare_results.R @@ -47,7 +47,6 @@ sig_compare <- function(sig1, sig2, metric = c("cosine", "jsd"), return(comparison) } - #' Compare two result files to find similar signatures #' #' @param result A \code{\linkS4class{musica_result}} object. @@ -55,26 +54,44 @@ sig_compare <- function(sig1, sig2, metric = c("cosine", "jsd"), #' @param threshold threshold for similarity #' @param metric One of \code{"cosine"} for cosine similarity or \code{"jsd"} #' for 1 minus the Jensen-Shannon Divergence. Default \code{"cosine"}. +#' @return Returns the comparisons +#' @examples +#' data(res) +#' compare_results(res, res, threshold = 0.8) +#' @export +compare_results <- function(result, other_result, threshold = 0.9, + metric = "cosine") { + + comparison <- sig_compare(sig1 = signatures(result), sig2 = signatures(other_result), + threshold = threshold, metric = metric) + return(comparison) + +} + +#' Plot the comparison of two result files +#' +#' @param comparison A matrix detailing the comparison between the two result files. +#' For example, the output of the \code{compare_results} function. +#' @param result A \code{\linkS4class{musica_result}} object. +#' @param other_result A second \code{\linkS4class{musica_result}} object. #' @param result_name title for plot of first result signatures #' @param other_result_name title for plot of second result signatures #' @param decimals Specifies rounding for similarity metric displayed. Default #' \code{2}. #' @param same_scale If \code{TRUE}, the scale of the probability for each -#' signature will be the same. If \code{FALSE}, then the scale of the y-axis -#' will be adjusted for each signature. Default \code{FALSE}. -#' @return Returns the comparisons +#' comparison will be the same. If \code{FALSE}, then the scale of the y-axis +#' will be adjusted for each comparison. Default \code{TRUE}. +#' @return Returns the comparison plot #' @examples #' data(res) -#' compare_results(res, res, threshold = 0.8) +#' comparison <- compare_results(res, res, threshold = 0.8) +#' plot_comparison(comparison, res, res) #' @export -compare_results <- function(result, other_result, threshold = 0.9, - metric = "cosine", result_name = - deparse(substitute(result)), other_result_name = - deparse(substitute(other_result)), - decimals = 2, same_scale = FALSE) { - signatures <- signatures(result) - comparison <- sig_compare(sig1 = signatures, sig2 = signatures(other_result), - threshold = threshold, metric = metric) +plot_comparison <- function(comparison, result, other_result, + result_name = deparse(substitute(result)), + other_result_name = deparse(substitute(other_result)), + decimals = 2, same_scale = TRUE) { + result_subset <- methods::new("musica_result", signatures = signatures(result)[, comparison$x_sig_index, @@ -93,10 +110,10 @@ compare_results <- function(result, other_result, threshold = 0.9, result_subset_maxes <- NULL other_subset_maxes <- NULL - for (index in 1:dim(comparison)[1]){ + for (index in seq_len(dim(comparison)[1])){ result_subset_maxes <- c(result_subset_maxes, max(signatures(result_subset)[,index])) } - for (index in 1:dim(comparison)[1]){ + for (index in seq_len(dim(comparison)[1])){ other_subset_maxes <- c(other_subset_maxes, max(signatures(other_subset)[,index])) } maxes <- pmax(result_subset_maxes, other_subset_maxes) * 100 @@ -105,12 +122,12 @@ compare_results <- function(result, other_result, threshold = 0.9, maxes <- rep(max(maxes), length(maxes)) } - .plot_compare_result_signatures(result_subset, other_subset, comparison, + plot <- .plot_compare_result_signatures(result_subset, other_subset, comparison, res1_name = result_name, res2_name = other_result_name, decimals = decimals, same_scale = same_scale, maxes = maxes) - return(comparison) + return(plot) } #' Compare a result object to COSMIC V3 Signatures; Select exome or genome for diff --git a/R/differential_analysis.R b/R/differential_analysis.R index 2c35cc92..20ce4f1e 100644 --- a/R/differential_analysis.R +++ b/R/differential_analysis.R @@ -116,7 +116,7 @@ exposure_differential_analysis <- function(musica_result, annotation, } else if (method == "kruskal") { header <- c("K-W chi-squared", "df", "p-value", "fdr") diff.out <- matrixTests::row_kruskalwallis(exposures, annotations, ...) %>% - dplyr::select(statistic, df, pvalue) + dplyr::select("statistic", "df", "pvalue") diff.out$fdr <- p.adjust(diff.out$pvalue, method = "BH") colnames(diff.out) <- header rownames(diff.out) <- rownames(exposures) diff --git a/R/discovery_prediction.R b/R/discovery_prediction.R index 4a958f0c..155f0007 100644 --- a/R/discovery_prediction.R +++ b/R/discovery_prediction.R @@ -10,9 +10,10 @@ NULL #' 2) an "exposure" matrix containing the estimated counts for each signature #' in each sample. Before mutational discovery can be performed, #' variants from samples first need to be stored in a -#' \code{\linkS4class{musica}} object using the \link{create_musica} function +#' \code{\linkS4class{musica}} object using the \link{create_musica_from_variants} +#' or \link{create_musica_from_counts} function #' and mutation count tables need to be created using functions such as -#' \link{build_standard_table}. +#' \link{build_standard_table} if \link{create_musica_from_counts} was not used. #' @param musica A \code{\linkS4class{musica}} object. #' @param table_name Name of the table to use for signature discovery. Needs #' to be the same name supplied to the table building functions such as @@ -110,19 +111,15 @@ discover_signatures <- function(musica, table_name, num_signatures, #' @description Exposures for samples will be predicted using an existing set #' of signatures stored in a \code{\linkS4class{musica_result}} object. #' Algorithms available for prediction include a modify version of \code{"lda"}, -#' \code{"decompTumor2Sig"}, and \code{"deconstructSigs"}. +#' and \code{"decompTumor2Sig"}. #' @param musica A \code{\linkS4class{musica}} object. -#' @param g A \linkS4class{BSgenome} object indicating which genome -#' reference the variants and their coordinates were derived from. Only used -#' if \code{algorithm = "deconstructSigs"} #' @param table_name Name of table used for posterior prediction. #' Must match the table type used to generate the prediction signatures #' @param signature_res Signatures used to predict exposures for the samples #' \code{musica} object. Existing signatures need to stored in a #' \code{\linkS4class{musica_result}} object. #' @param algorithm Algorithm to use for prediction of exposures. One of -#' \code{"lda"}, \code{"decompTumor2Sig"}, or -#' \code{"deconstructSigs"}. +#' \code{"lda"} or \code{"decompTumor2Sig"}. #' @param signatures_to_use Which signatures in the \code{signature_res} result #' object to use. Default is to use all signatures. #' @param verbose If \code{TRUE}, progress will be printing. Only used if @@ -143,9 +140,8 @@ discover_signatures <- function(musica, table_name, num_signatures, #' predict_exposure(musica = musica, table_name = "SBS96", #' signature_res = cosmic_v2_sigs, algorithm = "lda") #' @export -predict_exposure <- function(musica, g, table_name, signature_res, - algorithm = c("lda", "decompTumor2Sig", - "deconstructSigs"), +predict_exposure <- function(musica, table_name, signature_res, + algorithm = c("lda", "decompTumor2Sig"), signatures_to_use = seq_len(ncol( signatures(signature_res))), verbose = FALSE) { algorithm <- match.arg(algorithm) @@ -165,30 +161,30 @@ predict_exposure <- function(musica, g, table_name, signature_res, colnames(exposures) <- colnames(counts_table) rownames(exposures) <- colnames(signature) algorithm_name <- "decompTumor2Sig" - }else if (algorithm %in% c("ds", "deconstruct", "deconstructSigs")) { - sigs.input <- deconstructSigs::mut.to.sigs.input(mut.ref = variants(musica), - sample.id = "sample", chr = "chr", pos = "start", - ref = "ref", alt = "alt", bsg = g) - sig_all <- t(signature) - middle <- unlist(lapply(strsplit(colnames(sig_all), "_"), "[", 1)) - context <- lapply(strsplit(colnames(sig_all), "_"), "[", 2) - first <- unlist(lapply(context, substr, 1, 1)) - last <- unlist(lapply(context, substr, 3, 3)) - new_cols <- paste(first, "[", middle, "]", last, sep = "") - colnames(sig_all) <- new_cols - - ds_res <- vapply(rownames(sigs.input), function(x) { - ds_result <- whichSignatures(tumor_ref = sigs.input, - contexts_needed = TRUE, - signatures_limit = ncol(signature), - tri_counts_method = "default", - sample_id = x, signatures_ref = sig_all) - return(as.matrix(ds_result$weights)) - }, FUN.VALUE = rep(0, ncol(signature))) - exposures <- ds_res - colnames(exposures) <- colnames(counts_table) - rownames(exposures) <- colnames(signature) - algorithm_name <- "deconstructSigs" + #}else if (algorithm %in% c("ds", "deconstruct", "deconstructSigs")) { + #sigs.input <- deconstructSigs::mut.to.sigs.input(mut.ref = variants(musica), + # sample.id = "sample", chr = "chr", pos = "start", + # ref = "ref", alt = "alt", bsg = g) + #sig_all <- t(signature) + #middle <- unlist(lapply(strsplit(colnames(sig_all), "_"), "[", 1)) + #context <- lapply(strsplit(colnames(sig_all), "_"), "[", 2) + #first <- unlist(lapply(context, substr, 1, 1)) + #last <- unlist(lapply(context, substr, 3, 3)) + #new_cols <- paste(first, "[", middle, "]", last, sep = "") + #colnames(sig_all) <- new_cols +# + # ds_res <- vapply(rownames(sigs.input), function(x) { + # ds_result <- whichSignatures(tumor_ref = sigs.input, + # contexts_needed = TRUE, + # signatures_limit = ncol(signature), + # tri_counts_method = "default", + # sample_id = x, signatures_ref = sig_all) + #return(as.matrix(ds_result$weights)) + #}, FUN.VALUE = rep(0, ncol(signature))) + #exposures <- ds_res + #colnames(exposures) <- colnames(counts_table) + #rownames(exposures) <- colnames(signature) + #algorithm_name <- "deconstructSigs" } else { stop("Type must be lda or decomp") } @@ -300,127 +296,127 @@ predict_decompTumor2Sig <- function(sample_mat, signature_mat) { message(dim(lflank)) } -whichSignatures <- function(tumor_ref = NA, - sample_id, - signatures_ref, - associated = c(), - signatures_limit = NA, - signature_cutoff = 0.06, - contexts_needed = FALSE, - tri_counts_method = "default") { - if (is(tumor_ref, 'matrix')) { - stop(paste("Input tumor.ref needs to be a data frame or location of ", - "input text file", sep = "")) - } - - if (exists("tumor.ref", mode = "list") | is(tumor_ref, "data.frame")) { - tumor <- tumor_ref - if(contexts_needed == TRUE) { - tumor <- deconstructSigs::getTriContextFraction(mut.counts.ref = tumor, - trimer.counts.method = - tri_counts_method) - } - } else { - if (file.exists(tumor_ref)) { - tumor <- utils::read.table(tumor_ref, sep = "\t", header = TRUE, - as.is = TRUE, check.names = FALSE) - if (contexts_needed == TRUE) { - tumor <- deconstructSigs::getTriContextFraction(tumor, - trimer.counts.method = - tri_counts_method) - } - } else { - message("tumor.ref is neither a file nor a loaded data frame") - } - } - - if (missing(sample_id) && nrow(tumor) == 1) { - sample_id <- rownames(tumor)[1] - } - # Take patient id given - tumor <- as.matrix(tumor) - if (!sample_id %in% rownames(tumor)) { - stop(paste(sample_id, " not found in rownames of tumor.ref", sep = "")) - } - tumor <- subset(tumor, rownames(tumor) == sample_id) - if (round(rowSums(tumor), digits = 1) != 1) { - stop(paste0("Sample: ", sample_id, " is not normalized. Consider using ", - "contexts.needed = TRUE", sep = " ")) - } - signatures <- signatures_ref - - signatures <- as.matrix(signatures) - original_sigs <- signatures +#whichSignatures <- function(tumor_ref = NA, +# sample_id, +# signatures_ref, +# associated = c(), +# signatures_limit = NA, +# signature_cutoff = 0.06, +# contexts_needed = FALSE, +# tri_counts_method = "default") { +# if (is(tumor_ref, 'matrix')) { +# stop(paste("Input tumor.ref needs to be a data frame or location of ", +# "input text file", sep = "")) +# } + +# if (exists("tumor.ref", mode = "list") | is(tumor_ref, "data.frame")) { +# tumor <- tumor_ref +# if(contexts_needed == TRUE) { +# tumor <- deconstructSigs::getTriContextFraction(mut.counts.ref = tumor, +# trimer.counts.method = +# tri_counts_method) +# } +# } else { +# if (file.exists(tumor_ref)) { +# tumor <- utils::read.table(tumor_ref, sep = "\t", header = TRUE, +# as.is = TRUE, check.names = FALSE) +# if (contexts_needed == TRUE) { +# tumor <- deconstructSigs::getTriContextFraction(tumor, +# trimer.counts.method = +# tri_counts_method) +# } +# } else { +# message("tumor.ref is neither a file nor a loaded data frame") +# } +# } + +# if (missing(sample_id) && nrow(tumor) == 1) { +# sample_id <- rownames(tumor)[1] +# } +# # Take patient id given +# tumor <- as.matrix(tumor) +# if (!sample_id %in% rownames(tumor)) { +# stop(paste(sample_id, " not found in rownames of tumor.ref", sep = "")) +# } +# tumor <- subset(tumor, rownames(tumor) == sample_id) +# if (round(rowSums(tumor), digits = 1) != 1) { +# stop(paste0("Sample: ", sample_id, " is not normalized. Consider using ", +# "contexts.needed = TRUE", sep = " ")) +# } +# signatures <- signatures_ref + +# signatures <- as.matrix(signatures) +# original_sigs <- signatures # Check column names are formatted correctly - if (length(colnames(tumor)[colnames(tumor) %in% colnames(signatures)]) < - length(colnames(signatures))) { - colnames(tumor) <- deconstructSigs::changeColumnNames(colnames(tumor)) - if (length(colnames(tumor)[colnames(tumor) %in% colnames(signatures)]) < - length(colnames(signatures))) { - stop("Check column names on input file") - } - } +# if (length(colnames(tumor)[colnames(tumor) %in% colnames(signatures)]) < +# length(colnames(signatures))) { +# colnames(tumor) <- deconstructSigs::changeColumnNames(colnames(tumor)) +# if (length(colnames(tumor)[colnames(tumor) %in% colnames(signatures)]) < +# length(colnames(signatures))) { +# stop("Check column names on input file") +# } +# } # Ensure that columns in tumor match the order of those in signatures - tumor <- tumor[, colnames(signatures), drop = FALSE] +# tumor <- tumor[, colnames(signatures), drop = FALSE] #Take a subset of the signatures - if (!is.null(associated)) { - signatures <- signatures[rownames(signatures) %in% associated, ] - } +# if (!is.null(associated)) { +# signatures <- signatures[rownames(signatures) %in% associated, ] +# } - if (is.na(signatures_limit)) { - signatures_limit <- nrow(signatures) - } +# if (is.na(signatures_limit)) { +# signatures_limit <- nrow(signatures) +# } #Set the weights matrix to 0 - weights <- matrix(0, nrow = nrow(tumor), ncol = nrow(signatures), - dimnames = list(rownames(tumor), - rownames(signatures))) - - seed <- deconstructSigs::findSeed(tumor, signatures) - weights[seed] <- 1 - w <- weights * 10 - - error_diff <- Inf - error_threshold <- 1e-3 - - num <- 0 - while (error_diff > error_threshold) { - num <- num + 1 - #message(num) - error_pre <- deconstructSigs::getError(tumor, signatures, w) - if (error_pre == 0) { - break - } - w <- deconstructSigs::updateW_GR(tumor, signatures, w, - signatures.limit = - signatures_limit) - error_post <- deconstructSigs::getError(tumor, signatures, w) - error_diff <- (error_pre - error_post) / error_pre - } - - weights <- w / sum(w) - unknown <- 0 +# weights <- matrix(0, nrow = nrow(tumor), ncol = nrow(signatures), +# dimnames = list(rownames(tumor), +# rownames(signatures))) + +# seed <- deconstructSigs::findSeed(tumor, signatures) +# weights[seed] <- 1 +# w <- weights * 10 + +# error_diff <- Inf +# error_threshold <- 1e-3 + +# num <- 0 +# while (error_diff > error_threshold) { +# num <- num + 1 +# #message(num) +# error_pre <- deconstructSigs::getError(tumor, signatures, w) +# if (error_pre == 0) { +# break +# } +# w <- deconstructSigs::updateW_GR(tumor, signatures, w, +# signatures.limit = +# signatures_limit) +# error_post <- deconstructSigs::getError(tumor, signatures, w) +# error_diff <- (error_pre - error_post) / error_pre +# } + +# weights <- w / sum(w) +# unknown <- 0 ## filtering on a given threshold value (0.06 default) - weights[weights < signature_cutoff] <- 0 - unknown <- 1 - sum(weights) +# weights[weights < signature_cutoff] <- 0 +# unknown <- 1 - sum(weights) - product <- weights %*% signatures - diff <- tumor - product +# product <- weights %*% signatures +# diff <- tumor - product - x <- matrix(data = 0, nrow = 1, ncol = nrow(original_sigs), - dimnames = list(rownames(weights), rownames(original_sigs))) - x <- data.frame(x) - x[colnames(weights)] <- weights - weights <- x +# x <- matrix(data = 0, nrow = 1, ncol = nrow(original_sigs), +# dimnames = list(rownames(weights), rownames(original_sigs))) +# x <- data.frame(x) +# x[colnames(weights)] <- weights +# weights <- x - out <- list(weights, tumor, product, diff, unknown) - names(out) <- c("weights", "tumor", "product", "diff", "unknown") - return(out) -} +# out <- list(weights, tumor, product, diff, unknown) +# names(out) <- c("weights", "tumor", "product", "diff", "unknown") +# return(out) +#} #' Generate result_grid from musica based on annotation and range of k #' @@ -538,7 +534,7 @@ reconstruct_sample <- function(result, sample_number) { #' @param table_name Name of table used for posterior prediction (e.g. SBS96) #' @param signature_res Signatures to automatically subset from for prediction #' @param algorithm Algorithm to use for prediction. Choose from -#' "lda_posterior", decompTumor2Sig, and deconstructSigs +#' "lda_posterior" and decompTumor2Sig #' @param sample_annotation Annotation to grid across, if none given, #' prediction subsetting on all samples together #' @param min_exists Threshold to consider a signature active in a sample @@ -611,7 +607,7 @@ auto_predict_grid <- function(musica, table_name, signature_res, algorithm, #' @param table_name Name of table used for posterior prediction (e.g. SBS96) #' @param signature_res Signatures to automatically subset from for prediction #' @param algorithm Algorithm to use for prediction. Choose from -#' "lda_posterior", decompTumor2Sig, and deconstructSigs +#' "lda_posterior" and decompTumor2Sig #' @param min_exists Threshold to consider a signature active in a sample #' @param proportion_samples Threshold of samples to consider a signature #' active in the cohort diff --git a/R/kmeans.R b/R/kmeans.R index 3dcd4644..1e827008 100644 --- a/R/kmeans.R +++ b/R/kmeans.R @@ -91,14 +91,14 @@ cluster_exposure <- function(result, nclust, proportional = TRUE, method = "kmea #' #Get clustering result #' clust_out <- cluster_exposure(result = res_annot, nclust = 2) #' #UMAP -#' create_umap(result = res_annot) -#' #generate cluster X signature plot -#' plot_cluster(result = res_annot, clusters = clust_out, group = "signature") -#' #generate cluster X annotation plot -#' plot_cluster(result = res_annot, clusters = clust_out, group = "annotation", -#' annotation = "Tumor_Subtypes") -#' #generate a single UMAP plot -#' plot_cluster(result = res_annot, clusters = clust_out, group = "none") +#' #create_umap(result = res_annot) +#' ##generate cluster X signature plot +#' #plot_cluster(result = res_annot, clusters = clust_out, group = "signature") +#' ##generate cluster X annotation plot +#' #plot_cluster(result = res_annot, clusters = clust_out, group = "annotation", +#' # annotation = "Tumor_Subtypes") +#' ##generate a single UMAP plot +#' #plot_cluster(result = res_annot, clusters = clust_out, group = "none") #' @export plot_cluster <- function(result, clusters, group = "signature", annotation = NULL, plotly = TRUE){ diff --git a/R/load_data.R b/R/load_data.R index 06375335..9a5f1293 100644 --- a/R/load_data.R +++ b/R/load_data.R @@ -649,22 +649,24 @@ extract_variants_from_maf_file <- function(maf_file, extra_fields = NULL) { #' package = "musicatk") #' variants <- extract_variants_from_maf_file(maf_file) #' g <- select_genome("38") -#' musica <- create_musica(x = variants, genome = g) +#' musica <- create_musica_from_variants(x = variants, genome = g) #' @export -create_musica <- function(x, genome, - check_ref_chromosomes = TRUE, - check_ref_bases = TRUE, - chromosome_col = "chr", - start_col = "start", - end_col = "end", - ref_col = "ref", - alt_col = "alt", - sample_col = "sample", - extra_fields = NULL, - standardize_indels = TRUE, - convert_dbs = TRUE, - verbose = TRUE) { - +create_musica_from_variants <- function(x, genome, + check_ref_chromosomes = TRUE, + check_ref_bases = TRUE, + chromosome_col = "chr", + start_col = "start", + end_col = "end", + ref_col = "ref", + alt_col = "alt", + sample_col = "sample", + extra_fields = NULL, + standardize_indels = TRUE, + convert_dbs = TRUE, + verbose = TRUE) { + + + used_fields <- c(.required_musica_headers(), extra_fields) if (canCoerce(x, "data.table")) { dt <- data.table::as.data.table(x) @@ -789,9 +791,159 @@ create_musica <- function(x, genome, dt$sample <- factor(dt$sample, levels = s) musica <- new("musica", variants = dt, sample_annotations = annot) + + return(musica) +} + +#' Creates a musica object from a mutation count table +#' +#' This function creates a \linkS4class{musica} object from a mutation count +#' table or matrix. The \linkS4class{musica} class stores variants information, +#' variant-level annotations, sample-level annotations, and count tables and +#' is used as input to the mutational signature discovery and prediction +#' algorithms. +#' +#' @param x A data.table, matrix, or data.frame that contains counts of mutation +#' types for each sample, with samples as columns. +#' @param variant_class Mutations are SBS, DBS, or Indel. +#' @return Returns a musica object +#' @examples +#' #maf_file <- system.file("extdata", "public_TCGA.LUSC.maf", +#' #package = "musicatk") +#' #musica <- create_musica_from_counts(x = count_table, variant_class = "SBS96") +#' @export +create_musica_from_counts <- function(x, variant_class) { + + if (canCoerce(x, "matrix")) { + x <- as.matrix(x) + } else { + stop("'count_table' needs to be an object which can be coerced to a matrix. ") + } + + # create empty musica object + musica <- new("musica") + + if (variant_class %in% c("snv", "SNV", "SNV96", "SBS", "SBS96")) { + + if (nrow(x) != 96){ + stop("SBS96 'count_table' must have 96 rows.") + } + + # create SBS mutation type list + forward_change <- c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G") + b1 <- rep(rep(c("A", "C", "G", "T"), each = 4), 6) + b2 <- rep(c("C", "T"), each = 48) + b3 <- rep(c("A", "C", "G", "T"), 24) + mut_trinuc <- apply(cbind(b1, b2, b3), 1, paste, collapse = "") + mut <- rep(forward_change, each = 16) + annotation <- data.frame("motif" = paste0(mut, "_", mut_trinuc), + "mutation" = mut, + "context" = mut_trinuc) + rownames(annotation) <- annotation$motif + + # color mapping for mutation types + color_mapping <- c("C>A" = "#5ABCEBFF", + "C>G" = "#050708FF", + "C>T" = "#D33C32FF", + "T>A" = "#CBCACBFF", + "T>C" = "#ABCD72FF", + "T>G" = "#E7C9C6FF") + + # update count table rownames with SBS96 standard naming + rownames(x) <- annotation$motif + + # create count table object + tab <- new("count_table", name = "SBS96", count_table = x, + annotation = annotation, features = as.data.frame(annotation$motif[1]), + type = S4Vectors::Rle("SBS"), color_variable = "mutation", + color_mapping = color_mapping, description = paste0("Single Base Substitution table with", + " one base upstream and downstream")) + + # add count table to musica object + tables(musica)[["SBS96"]] <- tab + + } else if (variant_class %in% c("DBS", "dbs", "doublet")) { + if (nrow(x) != 78){ + stop("DBS78 'count_table' must have 78 rows.") + } + + full_motif <- c(paste0("AC>NN", "_", c("CA", "CG", "CT", "GA", "GG", "GT", + "TA", "TG", "TT")), + paste0("AT>NN", "_", c("CA", "CC", "CG", "GA", "GC", "TA")), + paste0("CC>NN", "_", c("AA", "AG", "AT", "GA", "GG", "GT", "TA", + "TG", "TT")), + paste0("CG>NN", "_", c("AT", "GC", "GT", "TA", "TC", "TT")), + paste0("CT>NN", "_", c("AA", "AC", "AG", "GA", "GC", "GG", "TA", + "TC", "TG")), + paste0("GC>NN", "_", c("AA", "AG", "AT", "CA", "CG", "TA")), + paste0("TA>NN", "_", c("AT", "CG", "CT", "GC", "GG", "GT")), + paste0("TC>NN", "_", c("AA", "AG", "AT", "CA", "CG", "CT", "GA", + "GG", "GT")), + paste0("TG>NN", "_", c("AA", "AC", "AT", "CA", "CC", "CT", "GA", + "GC", "GT")), + paste0("TT>NN", "_", c("AA", "AC", "AG", "CA", "CC", "CG", "GA", + "GC", "GG"))) + + annotation <- data.frame(motif = full_motif, mutation = + unlist(lapply(strsplit(full_motif, "_"), "[[", 1)), + context = unlist(lapply(strsplit(full_motif, "_"), + "[[", 2)), + row.names = full_motif) + + color_mapping <- .gg_color_hue(length(unique(annotation$mutation))) + names(color_mapping) <- unique(annotation$mutation) + + rownames(x) <- annotation$motif + + # create count table object + tab <- new("count_table", name = "DBS78", count_table = x, + annotation = annotation, features = as.data.frame(annotation$motif[1]), + type = S4Vectors::Rle("DBS"), color_variable = "mutation", + color_mapping = color_mapping, description = paste0("Standard count table for ", + "double base substitutions", + "using COSMIC v3 schema")) + + # add count table to musica object + tables(musica)[["DBS78"]] <- tab + + + } else if (variant_class %in% c("INDEL", "Indel", "indel", "ind", "IND", + "ID")) { + stop("Not yet supported.") + } else { + stop("Only SBS, DBS, and Indel classes are supported") + } + return(musica) } +#' Create a musica_result object +#' +#' This function creates a \linkS4class{musica_result} object from signatures, +#' exposures, and a mutation count table. +#' +#' @param signatures A matrix or data.frame of signatures by mutational motifs +#' @param exposures A matrix or data.frame of samples by signature weights +#' @param count_table Summary table with per-sample unnormalized motif counts +#' @param algorithm Describes how the signatures/weights were generated +#' +#' @return A \linkS4class{musica_result} object +#' @export +create_musica_result <- function(signatures, exposures, count_table, algorithm = NULL){ + + # create musica result object with given exposures and signatures + musica_result <- new("musica_result", signatures = as.matrix(signatures), + exposures = as.matrix(exposures), table_name = "SBS96", + musica = create_musica_from_counts(count_table, "SBS96")) + + if (hasArg(algorithm)){ + musica_result@algorithm <- algorithm + } + + return(musica_result) + +} + .check_variant_genome <- function(dt, genome) { diff --git a/R/main_class.R b/R/main_class.R index a8f27ab9..032a757a 100644 --- a/R/main_class.R +++ b/R/main_class.R @@ -314,3 +314,71 @@ get_count_type <- function(count_table) { get_color_mapping <- function(count_table) { return(count_table@color_mapping) } + +# Full Benchmark object/methods ------------------------------- + +#' Object that contains information for a full benchmarking analysis where +#' multiple predictions may be benchmarked +#' +#' @slot ground_truth musica_result +#' @slot method_view_summary matrix +#' @slot sig_view_summary matrix +#' @slot indv_benchmarks list +#' @export +#' @exportClass full_benchmark + +setClass( + "full_benchmark", + slots = list( + ground_truth = "musica_result", + method_view_summary = "matrix", + sig_view_summary = "matrix", + indv_benchmarks = "list" + ) +) + +# Single Benchmark object/methods ------------------------------- + +#' @title character or NULL class union +#' @description class to allow either NULL or character +#' @keywords internal +#' @noRd +setClassUnion("CharOrNULL", c("character", "NULL")) + +#' Object that contains information for a full benchmarking analysis where +#' multiple predictions may be benchmarked +#' +#' @slot initial_pred musica_result +#' @slot intermediate_pred musica_result +#' @slot final_pred musica_result +#' @slot initial_comparison data.frame +#' @slot intermediate_comparison data.frame +#' @slot final_comparison data.frame +#' @slot single_summary matrix +#' @slot method_id character +#' @slot threshold numeric +#' @slot adjustment_threshold numeric +#' @slot description CharOrNULL +#' @export +#' @exportClass single_benchmark + +setClass( + "single_benchmark", + slots = list( + initial_pred = "musica_result", + intermediate_pred = "musica_result", + final_pred = "musica_result", + initial_comparison = "data.frame", + intermediate_comparison = "data.frame", + final_comparison = "data.frame", + single_summary = "matrix", + method_id = "character", + threshold = "numeric", + adjustment_threshold = "numeric", + description = "CharOrNULL" + ) +) + + + + diff --git a/R/methods.R b/R/methods.R index 6d42f6b6..6d140aea 100644 --- a/R/methods.R +++ b/R/methods.R @@ -1,5 +1,5 @@ #' @title Retrieve musica from a musica_result object -#' @description The \code{\linkS4class{musica}} musica contains variants, +#' @description The \code{\linkS4class{musica}} musica contains variants, #' count tables, and sample annotations #' @param result A \code{\linkS4class{musica_result}} object generated by #' a mutational discovery or prediction tool. @@ -222,12 +222,13 @@ setReplaceMethod( ) #' @title Retrieve variants from a musica or musica_result object -#' @description The \code{variants} \code{data.table} contains the variants +#' @description The \code{variants} \code{data.table} contains the variants #' and variant-level annotations #' @param object A \code{\linkS4class{musica}} object generated by -#' the \link{create_musica} function or a \code{\linkS4class{musica_result}} +#' the \link{create_musica_from_variants} or \link{create_musica_from_counts} function, +#' or a \code{\linkS4class{musica_result}} #' object generated by a mutational discovery or prediction tool. -#' @rdname variants +#' @rdname variants #' @return A data.table of variants #' @export #' @examples @@ -261,8 +262,8 @@ setMethod( #' @rdname variants #' @param musica A \code{\linkS4class{musica}} object generated by -#' the \link{create_musica} function -#' @param value A \code{\linkS4class{data.table}} of mutational variants and +#' the \link{create_musica_from_variants} or \link{create_musica_from_counts} function +#' @param value A \code{\linkS4class{data.table}} of mutational variants and #' variant-level annotations #' @export #' @examples @@ -287,14 +288,15 @@ setReplaceMethod( } ) -#' @title Retrieve the list of count_tables from a musica or musica_result +#' @title Retrieve the list of count_tables from a musica or musica_result #' object -#' @description The \code{count_tables} contains standard and/or custom +#' @description The \code{count_tables} contains standard and/or custom #' count tables created from variants #' @param object A \code{\linkS4class{musica}} object generated by -#' the \link{create_musica} function or a \code{\linkS4class{musica_result}} +#' the \link{create_musica_from_variants} or \link{create_musica_from_counts} function, +#' or a \code{\linkS4class{musica_result}} #' object generated by a mutational discovery or prediction tool. -#' @rdname tables +#' @rdname tables #' @return A list of count_tables #' @export #' @examples @@ -328,9 +330,10 @@ setMethod( #' @rdname tables #' @param musica A \code{\linkS4class{musica}} object generated by -#' the \link{create_musica} function or a \code{\linkS4class{musica_result}} +#' the \link{create_musica_from_variants} or \link{create_musica_from_counts} function, +#' or a \code{\linkS4class{musica_result}} #' object generated by a mutational discovery or prediction tool. -#' @param value A list of \code{\linkS4class{count_table}} objects representing +#' @param value A list of \code{\linkS4class{count_table}} objects representing #' counts of motifs in samples #' @examples #' data(musica) @@ -362,19 +365,20 @@ setReplaceMethod( #' @title Get or set sample annotations from a musica or musica_result object #' @description Sample annotations can be used to store information about -#' each sample such as tumor type or treatment status. These are used in -#' downstream plotting functions such as \code{\link{plot_exposures}} or -#' \code{\link{plot_umap}} to group or color samples by a particular annotation. +#' each sample such as tumor type or treatment status. These are used in +#' downstream plotting functions such as \code{\link{plot_exposures}} or +#' \code{\link{plot_umap}} to group or color samples by a particular annotation. #' @param object A \code{\linkS4class{musica}} object generated by -#' the \link{create_musica} function or a \code{\linkS4class{musica_result}} +#' the \link{create_musica_from_variants} or \link{create_musica_from_counts} function, +#' or a \code{\linkS4class{musica_result}} #' object generated by a mutational discovery or prediction tool. #' @param name The name of the new annotation to add. #' @param value A vector containing the new sample annotations. Needs to be -#' the same length as the number of samples in the object. +#' the same length as the number of samples in the object. #' @rdname samp_annot -#' @return A new object with the sample annotations added to the table in the +#' @return A new object with the sample annotations added to the table in the #' \code{sample_annotations} slot. -#' @seealso See \code{\link{sample_names}} to get a vector of sample names in +#' @seealso See \code{\link{sample_names}} to get a vector of sample names in #' the \code{\linkS4class{musica}} or \code{\linkS4class{musica_result}} object. #' @export #' @examples @@ -465,11 +469,14 @@ setReplaceMethod( #' @title Retrieve sample names from a musica or musica_result object #' @description Sample names were included in the \code{sample} column -#' in the variant object passed to \code{\link{create_musica}}. This returns -#' a unique list of samples names in the order they are inside the +#' in the variant object passed to \code{\link{create_musica_from_variants}}, or in +#' the colnames of the count table object passed to +#' \code{\link{create_musica_from_counts}}. This returns +#' a unique list of samples names in the order they are inside the #' \code{\linkS4class{musica}} object. #' @param object A \code{\linkS4class{musica}} object generated by -#' the \link{create_musica} function or a \code{\linkS4class{musica_result}} +#' the \link{create_musica_from_variants} or \link{create_musica_from_counts} function, +#' or a \code{\linkS4class{musica_result}} #' object generated by a mutational discovery or prediction tool. #' @rdname sample_names #' @return A character vector of sample names @@ -513,14 +520,15 @@ setMethod( ################################################33 -#' @title Retrieve the names of count_tables from a musica or musica_result +#' @title Retrieve the names of count_tables from a musica or musica_result #' object -#' @description The \code{count_tables} contains standard and/or custom +#' @description The \code{count_tables} contains standard and/or custom #' count tables created from variants #' @param object A \code{\linkS4class{musica}} object generated by -#' the \link{create_musica} function or a \code{\linkS4class{musica_result}} +#' the \link{create_musica_from_variants} or \link{create_musica_from_counts} function, +#' or a \code{\linkS4class{musica_result}} #' object generated by a mutational discovery or prediction tool. -#' @rdname built_tables +#' @rdname built_tables #' @return The names of created count_tables #' @export #' @examples @@ -551,3 +559,724 @@ setMethod( return(names(object@musica@count_tables)) } ) + +################################################ + +#' @title Retrieve ground_truth musica_result object from a full_benchmark object +#' @description The \code{ground_truth} musica_result object contains true +#' signatures and exposures for the benchmarking analysis. +#' @param result A \code{\linkS4class{full_benchmark}} object generated by +#' the \link{create_benchmark} function. +#' @rdname ground_truth +#' @return A \code{\linkS4class{musica_result}} object with the ground truth +#' signatures and exposures +#' @export +setGeneric( + name = "ground_truth", + def = function(result) + { + standardGeneric("ground_truth") + } +) + +#' @rdname ground_truth +setMethod( + f = "ground_truth", + signature = "full_benchmark", + definition = function(result) { + return(result@ground_truth) + } +) + +#' @rdname ground_truth +#' @param result A \code{\linkS4class{full_benchmark}} object generated by +#' the \link{create_benchmark} function. +#' @param value A \code{\linkS4class{musica_result}} object with the ground truth +#' signatures and exposures +#' @export +setGeneric( + name = "ground_truth<-", + def = function(result, value) + { + standardGeneric("ground_truth<-") + } +) + +#' @rdname ground_truth +setReplaceMethod( + f = "ground_truth", + signature = c("full_benchmark", "musica_result"), + definition = function(result, value) + { + result@ground_truth <- value + return(result) + } +) + +#' @title Retrieve method_view_summary matrix from a full_benchmark object +#' @description The \code{method_view_summary} matrix contains a summary of all +#' the individual benchmarks completed in the analysis. +#' @param result A \code{\linkS4class{full_benchmark}} object generated by +#' the \link{create_benchmark} function. +#' @rdname method_view_summary +#' @return A matrix containing the summary of all individual benchmarks +#' @export +setGeneric( + name = "method_view_summary", + def = function(result) + { + standardGeneric("method_view_summary") + } +) + +#' @rdname method_view_summary +setMethod( + f = "method_view_summary", + signature = "full_benchmark", + definition = function(result) { + return(result@method_view_summary) + } +) + +#' @rdname method_view_summary +#' @param result A \code{\linkS4class{full_benchmark}} object generated by +#' the \link{create_benchmark} function. +#' @param value A matrix containing summary information for all individual +#' benchmarks run in the analysis +#' @export +setGeneric( + name = "method_view_summary<-", + def = function(result, value) + { + standardGeneric("method_view_summary<-") + } +) + +#' @rdname method_view_summary +setReplaceMethod( + f = "method_view_summary", + signature = c("full_benchmark", "matrix"), + definition = function(result, value) + { + result@method_view_summary <- value + return(result) + } +) + +#' @title Retrieve sig_view_summary matrix from a full_benchmark object +#' @description The \code{sig_view_summary} matrix contains a summary of all +#' the individual benchmarks completed in the analysis, organized by signature +#' @param result A \code{\linkS4class{full_benchmark}} object generated by +#' the \link{create_benchmark} function. +#' @rdname sig_view_summary +#' @return A matrix containing the summary of all individual benchmarks, +#' organized by signature +#' @export +setGeneric( + name = "sig_view_summary", + def = function(result) + { + standardGeneric("sig_view_summary") + } +) + +#' @rdname sig_view_summary +setMethod( + f = "sig_view_summary", + signature = "full_benchmark", + definition = function(result) { + return(result@sig_view_summary) + } +) + +#' @rdname sig_view_summary +#' @param result A \code{\linkS4class{full_benchmark}} object generated by +#' the \link{create_benchmark} function. +#' @param value A matrix containing summary information for all individual +#' benchmarks run in the analysis, organized by signature +#' @export +setGeneric( + name = "sig_view_summary<-", + def = function(result, value) + { + standardGeneric("sig_view_summary<-") + } +) + +#' @rdname sig_view_summary +setReplaceMethod( + f = "sig_view_summary", + signature = c("full_benchmark", "matrix"), + definition = function(result, value) + { + result@sig_view_summary <- value + return(result) + } +) + +#' @title Retrieve indv_benchmarks list from a full_benchmark object +#' @description The \code{indv_benchmarks} list contains +#' \code{\linkS4class{single_benchmark}} objects that each contain the benchmarking +#' information from a single benchmark run. +#' @param result A \code{\linkS4class{full_benchmark}} object generated by +#' the \link{create_benchmark} function. +#' @rdname indv_benchmarks +#' @return A list of \code{\linkS4class{single_benchmark}} objects +#' @export +setGeneric( + name = "indv_benchmarks", + def = function(result) + { + standardGeneric("indv_benchmarks") + } +) + +#' @rdname indv_benchmarks +setMethod( + f = "indv_benchmarks", + signature = "full_benchmark", + definition = function(result) { + return(result@indv_benchmarks) + } +) + +#' @rdname indv_benchmarks +#' @param result A \code{\linkS4class{full_benchmark}} object generated by +#' the \link{create_benchmark} function. +#' @param value A list of \code{\linkS4class{single_benchmark}} objects, each +#' containing information from a single benchmark run +#' @export +setGeneric( + name = "indv_benchmarks<-", + def = function(result, value) + { + standardGeneric("indv_benchmarks<-") + } +) + +#' @rdname indv_benchmarks +setReplaceMethod( + f = "indv_benchmarks", + signature = c("full_benchmark", "list"), + definition = function(result, value) + { + result@indv_benchmarks <- value + return(result) + } +) + +###################################### + +#' @title Retrieve the initial prediction from a single_benchmark object +#' @description The \code{initial_pred} object is a +#' \code{\linkS4class{musica_result}} object containing the initial signature +#' and exposure predictions inputted for benchmarking +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @rdname initial_pred +#' @return A \code{\linkS4class{musica_result}} object containing the initial +#' predictions +#' @export +setGeneric( + name = "initial_pred", + def = function(result) + { + standardGeneric("initial_pred") + } +) + +#' @rdname initial_pred +setMethod( + f = "initial_pred", + signature = "single_benchmark", + definition = function(result) { + return(result@initial_pred) + } +) + +#' @rdname initial_pred +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @param value A \code{\linkS4class{musica_result}} object containing the +#' initial signature and exposure predictions +#' @export +setGeneric( + name = "initial_pred<-", + def = function(result, value) + { + standardGeneric("initial_pred<-") + } +) + +#' @rdname initial_pred +setReplaceMethod( + f = "initial_pred", + signature = c("single_benchmark", "musica_result"), + definition = function(result, value) + { + result@initial_pred <- value + return(result) + } +) + +#' @title Retrieve the intermediate prediction from a single_benchmark object +#' @description The \code{intermediate_pred} object is a +#' \code{\linkS4class{musica_result}} object containing the signature and +#' exposure predictions after duplicate signatures are adjusted but before +#' composite sigantures are adjusted +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @rdname intermediate_pred +#' @return A \code{\linkS4class{musica_result}} object containing the intermediate +#' predictions +#' @export +setGeneric( + name = "intermediate_pred", + def = function(result) + { + standardGeneric("intermediate_pred") + } +) + +#' @rdname intermediate_pred +setMethod( + f = "intermediate_pred", + signature = "single_benchmark", + definition = function(result) { + return(result@intermediate_pred) + } +) + +#' @rdname intermediate_pred +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @param value A \code{\linkS4class{musica_result}} object containing the +#' intermediate signature and exposure predictions +#' @export +setGeneric( + name = "intermediate_pred<-", + def = function(result, value) + { + standardGeneric("intermediate_pred<-") + } +) + +#' @rdname intermediate_pred +setReplaceMethod( + f = "intermediate_pred", + signature = c("single_benchmark", "musica_result"), + definition = function(result, value) + { + result@intermediate_pred <- value + return(result) + } +) + +#' @title Retrieve the final prediction from a single_benchmark object +#' @description The \code{final_pred} object is a +#' \code{\linkS4class{musica_result}} object containing the signature and +#' exposure predictions after benchmarking is complete +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @rdname final_pred +#' @return A \code{\linkS4class{musica_result}} object containing the final +#' predictions +#' @export +setGeneric( + name = "final_pred", + def = function(result) + { + standardGeneric("final_pred") + } +) + +#' @rdname final_pred +setMethod( + f = "final_pred", + signature = "single_benchmark", + definition = function(result) { + return(result@final_pred) + } +) + +#' @rdname final_pred +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @param value A \code{\linkS4class{musica_result}} object containing the +#' final signature and exposure predictions +#' @export +setGeneric( + name = "final_pred<-", + def = function(result, value) + { + standardGeneric("final_pred<-") + } +) + +#' @rdname final_pred +setReplaceMethod( + f = "final_pred", + signature = c("single_benchmark", "musica_result"), + definition = function(result, value) + { + result@final_pred <- value + return(result) + } +) + +#' @title Retrieve the intial comparison data.frame from a single_benchmark object +#' @description The \code{initial_comparison} data.frame contains the comparison +#' between the initial predicted signatures and the true signatures. +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @rdname initial_comparison +#' @return A data.frame object containing the initial comparison between predicted +#' and true signatures +#' @export +setGeneric( + name = "initial_comparison", + def = function(result) + { + standardGeneric("initial_comparison") + } +) + +#' @rdname initial_comparison +setMethod( + f = "initial_comparison", + signature = "single_benchmark", + definition = function(result) { + return(result@initial_comparison) + } +) + +#' @rdname initial_comparison +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @param value A data.frame containing the initial comparison between predicted +#' and true signatures, generated by the \link{benchmark_compare_results} function +#' @export +setGeneric( + name = "initial_comparison<-", + def = function(result, value) + { + standardGeneric("initial_comparison<-") + } +) + +#' @rdname initial_comparison +setReplaceMethod( + f = "initial_comparison", + signature = c("single_benchmark", "data.frame"), + definition = function(result, value) + { + result@initial_comparison <- value + return(result) + } +) + +#' @title Retrieve the initial comparison data.frame from a single_benchmark object +#' @description The \code{intermediate_comparison} data.frame contains the comparison +#' between the predicted signatures and the true signatures after duplicate +#' signatures have been adjusted but before composite signatures have been +#' adjusted. +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @rdname intermediate_comparison +#' @return A data.frame object containing the intermediate comparison between +#' predicted and true signatures +#' @export +setGeneric( + name = "intermediate_comparison", + def = function(result) + { + standardGeneric("intermediate_comparison") + } +) + +#' @rdname intermediate_comparison +setMethod( + f = "intermediate_comparison", + signature = "single_benchmark", + definition = function(result) { + return(result@intermediate_comparison) + } +) + +#' @rdname intermediate_comparison +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @param value A data.frame containing the intermediate comparison between +#' predicted and true signatures, generated by the +#' \link{benchmark_compare_results} function +#' @export +setGeneric( + name = "intermediate_comparison<-", + def = function(result, value) + { + standardGeneric("intermediate_comparison<-") + } +) + +#' @rdname intermediate_comparison +setReplaceMethod( + f = "intermediate_comparison", + signature = c("single_benchmark", "data.frame"), + definition = function(result, value) + { + result@intermediate_comparison <- value + return(result) + } +) + +#' @title Retrieve the final comparison data.frame from a single_benchmark object +#' @description The \code{final_comparison} data.frame contains the comparison +#' between the predicted signatures and the true signatures after benchmarking +#' is complete. +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @rdname final_comparison +#' @return A data.frame object containing the final comparison between predicted +#' and true signatures +#' @export +setGeneric( + name = "final_comparison", + def = function(result) + { + standardGeneric("final_comparison") + } +) + +#' @rdname final_comparison +setMethod( + f = "final_comparison", + signature = "single_benchmark", + definition = function(result) { + return(result@final_comparison) + } +) + +#' @rdname final_comparison +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @param value A data.frame containing the final comparison between predicted +#' and true signatures, generated by the \link{benchmark_compare_results} +#' function +#' @export +setGeneric( + name = "final_comparison<-", + def = function(result, value) + { + standardGeneric("final_comparison<-") + } +) + +#' @rdname final_comparison +setReplaceMethod( + f = "final_comparison", + signature = c("single_benchmark", "data.frame"), + definition = function(result, value) + { + result@final_comparison <- value + return(result) + } +) + +#' @title Retrieve the method_id from a single_benchmark object +#' @description The \code{method_id} is the identifier for a single +#' benchmark run. +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @rdname method_id +#' @return The identifier for the single benchmark run +#' @export +setGeneric( + name = "method_id", + def = function(result) + { + standardGeneric("method_id") + } +) + +#' @rdname method_id +setMethod( + f = "method_id", + signature = "single_benchmark", + definition = function(result) { + return(result@method_id) + } +) + +#' @rdname method_id +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @param value A character containing the identifier for the single benchmark +#' run +#' @export +setGeneric( + name = "method_id<-", + def = function(result, value) + { + standardGeneric("method_id<-") + } +) + +#' @rdname method_id +setReplaceMethod( + f = "method_id", + signature = c("single_benchmark", "character"), + definition = function(result, value) + { + result@method_id <- value + return(result) + } +) + +#' @title Retrieve the threshold from a single_benchmark object +#' @description The \code{threshold} is the cosine similarity cutoff for +#' comparisons between predicted and true signatures +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @rdname threshold +#' @return The threshold for the single benchmark run +#' @export +setGeneric( + name = "threshold", + def = function(result) + { + standardGeneric("threshold") + } +) + +#' @rdname threshold +setMethod( + f = "threshold", + signature = "single_benchmark", + definition = function(result) { + return(result@threshold) + } +) + +#' @rdname threshold +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @param value The threshold for the single benchmark run +#' @export +setGeneric( + name = "threshold<-", + def = function(result, value) + { + standardGeneric("threshold<-") + } +) + +#' @rdname threshold +setReplaceMethod( + f = "threshold", + signature = c("single_benchmark", "numeric"), + definition = function(result, value) + { + result@threshold <- value + return(result) + } +) + +#' @title Retrieve the adjustment_threshold from a single_benchmark object +#' @description The \code{adjustment_threshold} is the cosine similarity value +#' of high confidence. Comparisons that meet this cutoff are assumed to be likely, +#' while those that fall below the cutoff will be disregarded if the predicted +#' signature is already captured above the threshold. +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @rdname adjustment_threshold +#' @return The adjustment_threshold for the single benchmark run +#' @export +setGeneric( + name = "adjustment_threshold", + def = function(result) + { + standardGeneric("adjustment_threshold") + } +) + +#' @rdname adjustment_threshold +setMethod( + f = "adjustment_threshold", + signature = "single_benchmark", + definition = function(result) { + return(result@adjustment_threshold) + } +) + +#' @rdname adjustment_threshold +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @param value The adjustment_threshold for the single benchmark run +#' @export +setGeneric( + name = "adjustment_threshold<-", + def = function(result, value) + { + standardGeneric("adjustment_threshold<-") + } +) + +#' @rdname adjustment_threshold +setReplaceMethod( + f = "adjustment_threshold", + signature = c("single_benchmark", "numeric"), + definition = function(result, value) + { + result@adjustment_threshold <- value + return(result) + } +) + +#' @title Retrieve the description from a single_benchmark object +#' @description The \code{description} contains details about the prediction +#' that is benchmarked. +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @rdname description +#' @return The description for the single benchmark run +#' @export +setGeneric( + name = "description", + def = function(result) + { + standardGeneric("description") + } +) + +#' @rdname description +setMethod( + f = "description", + signature = "single_benchmark", + definition = function(result) { + return(result@description) + } +) + +#' @rdname description +#' @param result A \code{\linkS4class{single_benchmark}} object extracted from +#' a \code{\linkS4class{full_benchmark}} object +#' @param value The description for the single benchmark run +#' @export +setGeneric( + name = "description<-", + def = function(result, value) + { + standardGeneric("description<-") + } +) + +#' @rdname description +setReplaceMethod( + f = "description", + signature = c("single_benchmark", "character"), + definition = function(result, value) + { + result@description <- value + return(result) + } +) diff --git a/R/plotting.R b/R/plotting.R index e2f3ae9e..41047f72 100644 --- a/R/plotting.R +++ b/R/plotting.R @@ -119,14 +119,13 @@ plot_signatures <- function(result, plotly = FALSE, same_scale = TRUE, y_max = NULL, annotation = NULL, percent = TRUE) { - # dummy variables loc_num <- NULL mutation_color <- NULL - label <- NULL x <- NULL xend <- NULL y <- NULL yend <- NULL + label <- NULL ymax <- NULL signatures <- signatures(result) @@ -190,6 +189,9 @@ plot_signatures <- function(result, plotly = FALSE, y_axis_spacing = rep(strrep(" ", max_num_digits), 2) } + # add context for x-axis label + plot_dat$df$context <- rep(annot$context, each = num_sigs) + # Plot signatures plot_dat$df %>% ggplot(aes_string(y = "exposure", x = "motif", fill = "mutation_color")) + @@ -198,7 +200,7 @@ plot_signatures <- function(result, plotly = FALSE, ggplot2::xlab("Motifs") + ggplot2::ylab(y_axis_label) + ggplot2::guides(fill = ggplot2::guide_legend(nrow = 1)) + ggplot2::scale_fill_manual(values = color_mapping) + - ggplot2::scale_x_discrete(labels = annot$context) + + ggplot2::scale_x_discrete(limits = plot_dat$df$motif, labels = plot_dat$df$context) + ggplot2::scale_y_continuous(expand = expansion(mult = c(0, 0.2)), limits = c(0, NA), n.breaks = 5) + ggplot2::geom_text(data = sig_name_labels, @@ -235,15 +237,27 @@ plot_signatures <- function(result, plotly = FALSE, limits = c(0, NA), breaks = c(0, 0.01), labels = y_axis_spacing, n.breaks = 4) + ggplot2::ylab("") + - ggplot2::geom_rect(data = motif_label_locations, + ggplot2::geom_rect(data = motif_label_locations %>% arrange(x), aes(xmin = x, xmax = xend, ymin = max(y), ymax = max(yend)), - fill = color_mapping, color = "black", - linewidth = 0.25, inherit.aes = FALSE) + - ggplot2::geom_text(data=motif_label_locations, - aes(x=x+(xend-x)/2, y=y+(yend-y)/2, - label = stringr::str_to_title(mutation_color)), - fontface = "bold", size = 4, - color = label_colors) -> p2 + fill = factor(color_mapping, levels = color_mapping), color = "black", + linewidth = 0.25, inherit.aes = FALSE) -> p2 + + # adjust motif label direction if indel signature + if (table_name %in% c("IND83", "INDEL83", "INDEL", "IND", "indel", + "Indel")){ + p2 <- p2 + ggplot2::geom_text(data=motif_label_locations, + aes(x=x+(xend-x)/2, y=y+(yend-y)/2, + label = stringr::str_to_title(mutation_color)), + fontface = "bold", size = 4, + color = label_colors, angle = 90) + } + else{ + p2 <- p2 + ggplot2::geom_text(data=motif_label_locations, + aes(x=x+(xend-x)/2, y=y+(yend-y)/2, + label = stringr::str_to_title(mutation_color)), + fontface = "bold", size = 4, + color = label_colors) + } # Adjust theme @@ -276,8 +290,16 @@ plot_signatures <- function(result, plotly = FALSE, axis.title.y = element_blank()) } + # adjust height of motif lables if indel signature + if (table_name %in% c("IND83", "INDEL83", "INDEL", "IND", "indel", + "Indel")){ + height <- 5 + } + else{ + height <- 1 + } - figure <- ggpubr::ggarrange(p2, p, ncol = 1, nrow = 2, heights = c(1,15)) + figure <- ggpubr::ggarrange(p2, p, ncol = 1, nrow = 2, heights = c(height,15)) if (isTRUE(plotly)) { figure <- plotly::ggplotly(p) diff --git a/R/test_data.R b/R/test_data.R index a8f345e4..6f59e683 100644 --- a/R/test_data.R +++ b/R/test_data.R @@ -122,3 +122,63 @@ #' @keywords datasets #' "res_annot" + +#' synthetic_breast_counts +#' +#' A data.frame containing the SBS96 mutation counts for a synthetic breast +#' cancer dataset with 214 samples. +#' +#' @docType data +#' +#' @usage data(synthetic_breast_counts) +#' +#' @format An object of class \code{data.frame} +#' +#' @keywords datasets +#' +"synthetic_breast_counts" + +#' synthetic_breast_true_exposures +#' +#' A data.frame containing true signature exposure levels for a synthetic +#' breast cancer dataset with 214 samples and 8 signatures. +#' +#' @docType data +#' +#' @usage data(synthetic_breast_true_exposures) +#' +#' @format An object of class \code{data.frame} +#' +#' @keywords datasets +#' +"synthetic_breast_true_exposures" + +#' example_predicted_sigs +#' +#' A matrix containing the predicted signatures for a synthetic breast cancer +#' dataset with 214 samples, generated using NMF and k=8 signatures. +#' +#' @docType data +#' +#' @usage data(example_predicted_sigs) +#' +#' @format An object of class \code{matrix} +#' +#' @keywords datasets +#' +"example_predicted_sigs" + +#' example_predicted_exp +#' +#' A matrix containing the predicted signature exposure levels for a synthetic +#' breast cancer dataset with 214 samples, generated using NMF and k=8 signatures. +#' +#' @docType data +#' +#' @usage data(example_predicted_exp) +#' +#' @format An object of class \code{matrix} +#' +#' @keywords datasets +#' +"example_predicted_exp" diff --git a/R/umap.R b/R/umap.R index 457ca6b7..462e33f4 100644 --- a/R/umap.R +++ b/R/umap.R @@ -1,6 +1,6 @@ #' @title Create a UMAP from a musica result -#' @description Proportional sample exposures will be used as input into the +#' @description Proportional sample exposures will be used as input into the #' \code{\link[uwot]{umap}} function to generate a two dimensional UMAP. #' @param result A \code{\linkS4class{musica_result}} object generated by #' a mutational discovery or prediction tool. @@ -21,10 +21,10 @@ #' stored in the \code{UMAP} slot. #' @seealso See \link{plot_umap} to display the UMAP and #' \code{\link[uwot]{umap}} for more information on the individual parameters -#' for generating UMAPs. +#' for generating UMAPs. #' @examples -#' data(res_annot) -#' create_umap(result = res_annot) +#' #data(res_annot) +#' #create_umap(result = res_annot) #' @export create_umap <- function(result, n_neighbors = 30, min_dist = 0.75, spread = 1) { @@ -53,15 +53,15 @@ create_umap <- function(result, n_neighbors = 30, #' @title Plot a UMAP from a musica result #' @description Plots samples on a UMAP scatterplot. Samples can be colored by -#' the levels of mutational signatures or by a annotation variable. +#' the levels of mutational signatures or by a annotation variable. #' #' @param result A \code{\linkS4class{musica_result}} object generated by #' a mutational discovery or prediction tool. #' @param color_by One of \code{"signatures"}, \code{"annotation"}, or #' \code{"none"}. If \code{"signatures"}, then one UMAP scatterplot will be -#' generated for each signature and points will be colored by the level of -#' that signature in each sample. If \code{annotation}, a single UMAP will -#' be generated colored by the annotation selected using the parameter +#' generated for each signature and points will be colored by the level of +#' that signature in each sample. If \code{annotation}, a single UMAP will +#' be generated colored by the annotation selected using the parameter #' \code{annotation}. If \code{"none"}, a single UMAP scatterplot will be #' generated with no coloring. Default \code{"signature"}. #' @param proportional If \code{TRUE}, then the exposures will be normalized @@ -74,18 +74,18 @@ create_umap <- function(result, n_neighbors = 30, #' scale in each signature subplot. If \code{FALSE}, then each signature subplot #' will be colored by a different scale with different maximum values. Only #' used when \code{color_by = "signature"}. Setting to \code{FALSE} is most -#' useful when the maximum value of various signatures are vastly different +#' useful when the maximum value of various signatures are vastly different #' from one another. Default \code{TRUE}. #' @param add_annotation_labels If \code{TRUE}, labels for each group in the #' \code{annotation} variable will be displayed. Only used if -#' code{color_by = "annotation"}. This not recommended if the annotation is +#' \code{color_by = "annotation"}. This not recommended if the annotation is #' a continuous variable. The label is plotting using the centriod of each #' group within the \code{annotation} variable. Default \code{FALSE}. -#' @param annotation_label_size Size of annotation labels. Only used if -#' code{color_by = "annotation"} and \code{add_annotation_labels = TRUE}. +#' @param annotation_label_size Size of annotation labels. Only used if +#' \code{color_by = "annotation"} and \code{add_annotation_labels = TRUE}. #' Default \code{3}. #' @param annotation_text_box Place a white box around the annotation labels -#' to improve readability. Only used if code{color_by = "annotation"} and +#' to improve readability. Only used if \code{color_by = "annotation"} and #' \code{add_annotation_labels = TRUE}. Default \code{TRUE}. #' @param plotly If \code{TRUE}, the the plot will be made interactive #' using \code{\link[plotly]{plotly}}. Not used if \code{color_by = "signature"} @@ -96,9 +96,9 @@ create_umap <- function(result, n_neighbors = 30, #' @return Generates a ggplot or plotly object #' @seealso See \link{create_umap} to generate a UMAP in a musica result. #' @examples -#' data(res_annot) -#' create_umap(res_annot, "Tumor_Subtypes") -#' plot_umap(res_annot, "none") +#' #data(res_annot) +#' #create_umap(res_annot, "Tumor_Subtypes") +#' #plot_umap(res_annot, "none") #' @export plot_umap <- function(result, color_by = c("signatures", "annotation", "cluster", "none"), diff --git a/data/example_predicted_exp.rda b/data/example_predicted_exp.rda new file mode 100644 index 00000000..3d7a1295 Binary files /dev/null and b/data/example_predicted_exp.rda differ diff --git a/data/example_predicted_sigs.rda b/data/example_predicted_sigs.rda new file mode 100644 index 00000000..c5be776a Binary files /dev/null and b/data/example_predicted_sigs.rda differ diff --git a/data/synthetic_breast_counts.rda b/data/synthetic_breast_counts.rda new file mode 100644 index 00000000..f47f658e Binary files /dev/null and b/data/synthetic_breast_counts.rda differ diff --git a/data/synthetic_breast_true_exposures.rda b/data/synthetic_breast_true_exposures.rda new file mode 100644 index 00000000..3099d4ed Binary files /dev/null and b/data/synthetic_breast_true_exposures.rda differ diff --git a/man/adjustment_threshold.Rd b/man/adjustment_threshold.Rd new file mode 100644 index 00000000..0615533f --- /dev/null +++ b/man/adjustment_threshold.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods.R +\name{adjustment_threshold} +\alias{adjustment_threshold} +\alias{adjustment_threshold,single_benchmark-method} +\alias{adjustment_threshold<-} +\alias{adjustment_threshold<-,single_benchmark,numeric-method} +\title{Retrieve the adjustment_threshold from a single_benchmark object} +\usage{ +adjustment_threshold(result) + +\S4method{adjustment_threshold}{single_benchmark}(result) + +adjustment_threshold(result) <- value + +\S4method{adjustment_threshold}{single_benchmark,numeric}(result) <- value +} +\arguments{ +\item{result}{A \code{\linkS4class{single_benchmark}} object extracted from +a \code{\linkS4class{full_benchmark}} object} + +\item{value}{The adjustment_threshold for the single benchmark run} +} +\value{ +The adjustment_threshold for the single benchmark run +} +\description{ +The \code{adjustment_threshold} is the cosine similarity value +of high confidence. Comparisons that meet this cutoff are assumed to be likely, +while those that fall below the cutoff will be disregarded if the predicted +signature is already captured above the threshold. +} diff --git a/man/auto_predict_grid.Rd b/man/auto_predict_grid.Rd index 26673cba..25d79ec8 100644 --- a/man/auto_predict_grid.Rd +++ b/man/auto_predict_grid.Rd @@ -26,7 +26,7 @@ auto_predict_grid( \item{signature_res}{Signatures to automatically subset from for prediction} \item{algorithm}{Algorithm to use for prediction. Choose from -"lda_posterior", decompTumor2Sig, and deconstructSigs} +"lda_posterior" and decompTumor2Sig} \item{sample_annotation}{Annotation to grid across, if none given, prediction subsetting on all samples together} diff --git a/man/auto_subset_sigs.Rd b/man/auto_subset_sigs.Rd index 136b1f00..a5518d41 100644 --- a/man/auto_subset_sigs.Rd +++ b/man/auto_subset_sigs.Rd @@ -22,7 +22,7 @@ auto_subset_sigs( \item{signature_res}{Signatures to automatically subset from for prediction} \item{algorithm}{Algorithm to use for prediction. Choose from -"lda_posterior", decompTumor2Sig, and deconstructSigs} +"lda_posterior" and decompTumor2Sig} \item{min_exists}{Threshold to consider a signature active in a sample} diff --git a/man/benchmark.Rd b/man/benchmark.Rd new file mode 100644 index 00000000..1224f033 --- /dev/null +++ b/man/benchmark.Rd @@ -0,0 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/benchmarking.R +\name{benchmark} +\alias{benchmark} +\title{Run the benchmark framework on a prediction} +\usage{ +benchmark( + full_benchmark, + prediction, + method_id = NULL, + threshold = 0.8, + adjustment_threshold = 0.9, + description = NULL, + plot = TRUE, + make_copy = FALSE +) +} +\arguments{ +\item{full_benchmark}{An object of class \code{\linkS4class{full_benchmark}} +created with the \link{create_benchmark} function or returned from a previous +\code{benchmark} run.} + +\item{prediction}{An object of class \code{\linkS4class{musica_result}} +containing the predicted signatures and exposures to benchmark.} + +\item{method_id}{An identifier for the prediction being benchmarked. If not +supplied, it will be automatically set to the variable name of the prediction +provided. Default \code{NULL}.} + +\item{threshold}{Cosine similarity cutoff for comparing preidcted and true +signatures. Default \code{0.8}.} + +\item{adjustment_threshold}{Cosine similarity value of high confidence. +Comparisons that meet this cutoff are assumed to be likely, +while those that fall below the cutoff will be disregarded if the predicted +signature is already captured above the threshold. Default \code{0.9}.} + +\item{description}{Further details about the prediction being benchmarked. +Default \code{NULL}.} + +\item{plot}{If \code{FALSE}, plots will be suppressed. Default \code{TRUE}.} + +\item{make_copy}{If \code{TRUE}, the \code{full_benchmark} object provided +will not be modified and a new object will be returned. If \code{FALSE}, the +object provided will be modified and nothing will be returned. Default +\code{FALSE}.} +} +\value{ +If \code{make_copy == TRUE}, a new \code{full_benchmark} object is +returned. If \code{make_copy == FALSE}, nothing is returned. +} +\description{ +Perform benchmarking on a signature discovery prediction compared +to a ground truth. Potential errors in the predicted signatures, such as +composite or duplicate signatures are adjusted, and a summary of the accuracy +of the prediction is given. +} diff --git a/man/benchmark_compare_results.Rd b/man/benchmark_compare_results.Rd new file mode 100644 index 00000000..3a9117ef --- /dev/null +++ b/man/benchmark_compare_results.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/benchmarking.R +\name{benchmark_compare_results} +\alias{benchmark_compare_results} +\title{Get benchmark comparison table} +\usage{ +benchmark_compare_results(full_benchmark, method_id, prediction) +} +\arguments{ +\item{full_benchmark}{The \code{\linkS4class{full_benchmark}} object for the +benchmarking analysis} + +\item{method_id}{The identifier for the \code{\linkS4class{single_benchmark}} +object containing the comparison of interest} + +\item{prediction}{\code{Initial} for the comparison before any benchmarking +adjustments have been made, \code{Intermediate} for the comparison after +duplicates have been adjusted but before composites are adjusted, or +\code{Final} for the comparison at the end of the benchmarking adjustments.} +} +\value{ +A data.frame containing the comparison between true and predicted +signatures +} +\description{ +After a prediction has been benchmarked with the \link{benchmark} function, +this function can be used to generate the comparison table between true and +predicted signatures from any step in the benchmarking process. +} diff --git a/man/benchmark_get_entry.Rd b/man/benchmark_get_entry.Rd new file mode 100644 index 00000000..8982ab4b --- /dev/null +++ b/man/benchmark_get_entry.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/benchmarking.R +\name{benchmark_get_entry} +\alias{benchmark_get_entry} +\title{Get a single_benchmark object} +\usage{ +benchmark_get_entry(full_benchmark, method_id) +} +\arguments{ +\item{full_benchmark}{The \code{\linkS4class{full_benchmark}} object that contains +the desired \code{\linkS4class{single_benchmark}} object} + +\item{method_id}{The identifier for the desired \code{\linkS4class{single_benchmark}} +object} +} +\value{ +A \code{\linkS4class{single_benchmark}} object +} +\description{ +Access a \code{\linkS4class{single_benchmark}} object containing information from +an individual benchmark run from a \code{\linkS4class{full_benchmark}} object. +} diff --git a/man/benchmark_get_prediction.Rd b/man/benchmark_get_prediction.Rd new file mode 100644 index 00000000..aa87f58a --- /dev/null +++ b/man/benchmark_get_prediction.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/benchmarking.R +\name{benchmark_get_prediction} +\alias{benchmark_get_prediction} +\title{Get a benchmark prediction} +\usage{ +benchmark_get_prediction(indv_benchmark, prediction) +} +\arguments{ +\item{indv_benchmark}{A \code{\linkS4class{single_benchmark}} object containing the +desired prediction. This can be accessed using the \link{benchmark_get_entry} +function.} + +\item{prediction}{\code{Initial} for the prediction before any benchmarking +adjustments have been made, \code{Intermediate} for the prediction after +duplicates have been adjusted but before composites are adjusted, or +\code{Final} for the prediction at the end of the benchmarking adjustments.} +} +\value{ +A \code{\linkS4class{musica_result}} object +} +\description{ +Access a \code{\linkS4class{musica_result}} object containing a particular prediction +from a benchmarking analysis. +} diff --git a/man/benchmark_plot_comparison.Rd b/man/benchmark_plot_comparison.Rd new file mode 100644 index 00000000..df1a5918 --- /dev/null +++ b/man/benchmark_plot_comparison.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/benchmarking.R +\name{benchmark_plot_comparison} +\alias{benchmark_plot_comparison} +\title{Plot a signature comparison from a benchmarking analysis} +\usage{ +benchmark_plot_comparison( + full_benchmark, + method_id, + prediction, + decimals = 2, + same_scale = FALSE +) +} +\arguments{ +\item{full_benchmark}{The \code{\linkS4class{full_benchmark}} object for the +benchmarking analysis} + +\item{method_id}{The identifier for the \code{\linkS4class{single_benchmark}} +object containing the comparison of interest} + +\item{prediction}{\code{Initial} for the comparison before any benchmarking +adjustments have been made, \code{Intermediate} for the comparison after +duplicates have been adjusted but before composites are adjusted, or +\code{Final} for the comparison at the end of the benchmarking adjustments.} + +\item{decimals}{Specifies rounding for similarity metric displayed. Default +\code{2}.} + +\item{same_scale}{If \code{TRUE}, the scale of the probability for each +comparison will be the same. If \code{FALSE}, then the scale of the y-axis +will be adjusted for each comparison. Default \code{TRUE}.} +} +\value{ +Returns the comparison plot +} +\description{ +After a prediction has been benchmarked with the \link{benchmark} function, +the comparison between the true and predicted signatures at any step of the +benchmarking process can be plotted. +} diff --git a/man/benchmark_plot_composite_exposures.Rd b/man/benchmark_plot_composite_exposures.Rd new file mode 100644 index 00000000..33bce1d6 --- /dev/null +++ b/man/benchmark_plot_composite_exposures.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/benchmarking.R +\name{benchmark_plot_composite_exposures} +\alias{benchmark_plot_composite_exposures} +\title{Plot effect of composite correction} +\usage{ +benchmark_plot_composite_exposures(full_benchmark, method_id) +} +\arguments{ +\item{full_benchmark}{The \code{\linkS4class{full_benchmark}} object for the +benchmarking analysis} + +\item{method_id}{The identifier for the \code{\linkS4class{single_benchmark}} +object of interest} +} +\value{ +A list of ggplot objects +} +\description{ +After a prediction has been benchmarked with the \link{benchmark} function, +the true and predicted exposures can be plotted both before and after the +composite signature adjustment. The effect of the adjustment can then be +observed. +} diff --git a/man/benchmark_plot_duplicate_exposures.Rd b/man/benchmark_plot_duplicate_exposures.Rd new file mode 100644 index 00000000..f7cbc7a0 --- /dev/null +++ b/man/benchmark_plot_duplicate_exposures.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/benchmarking.R +\name{benchmark_plot_duplicate_exposures} +\alias{benchmark_plot_duplicate_exposures} +\title{Plot effect of duplicate correction} +\usage{ +benchmark_plot_duplicate_exposures(full_benchmark, method_id) +} +\arguments{ +\item{full_benchmark}{The \code{\linkS4class{full_benchmark}} object for the +benchmarking analysis} + +\item{method_id}{The identifier for the \code{\linkS4class{single_benchmark}} +object of interest} +} +\value{ +A list of ggplot objects +} +\description{ +After a prediction has been benchmarked with the \link{benchmark} function, +the true and predicted exposures can be plotted both before and after the +duplicate signature adjustment. The effect of the adjustment can then be +observed. +} diff --git a/man/benchmark_plot_exposures.Rd b/man/benchmark_plot_exposures.Rd new file mode 100644 index 00000000..6cc1a3e1 --- /dev/null +++ b/man/benchmark_plot_exposures.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/benchmarking.R +\name{benchmark_plot_exposures} +\alias{benchmark_plot_exposures} +\title{Plot exposure comparison from a benchmarking analysis} +\usage{ +benchmark_plot_exposures(full_benchmark, method_id, prediction) +} +\arguments{ +\item{full_benchmark}{The \code{\linkS4class{full_benchmark}} object for the +benchmarking analysis} + +\item{method_id}{The identifier for the \code{\linkS4class{single_benchmark}} +object of interest} + +\item{prediction}{\code{Initial} for the exposures before any benchmarking +adjustments have been made, \code{Intermediate} for the exposures after +duplicates have been adjusted but before composites are adjusted, or +\code{Final} for the exposures at the end of the benchmarking adjustments.} +} +\value{ +Generates a ggplot object +} +\description{ +After a prediction has been benchmarked with the \link{benchmark} function, +the comparison between the true and predicted exposures at any stage of the +benchmarking process can be plotted. +} diff --git a/man/benchmark_plot_signatures.Rd b/man/benchmark_plot_signatures.Rd new file mode 100644 index 00000000..ad8e764f --- /dev/null +++ b/man/benchmark_plot_signatures.Rd @@ -0,0 +1,83 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/benchmarking.R +\name{benchmark_plot_signatures} +\alias{benchmark_plot_signatures} +\title{Plot signatures from a benchmarking analysis} +\usage{ +benchmark_plot_signatures( + full_benchmark, + method_id, + prediction, + plotly = FALSE, + color_variable = NULL, + color_mapping = NULL, + text_size = 10, + show_x_labels = TRUE, + show_y_labels = TRUE, + same_scale = TRUE, + y_max = NULL, + annotation = NULL, + percent = TRUE +) +} +\arguments{ +\item{full_benchmark}{The \code{\linkS4class{full_benchmark}} object for the +benchmarking analysis} + +\item{method_id}{The identifier for the \code{\linkS4class{single_benchmark}} +object containing the signatures to be plotted} + +\item{prediction}{\code{Initial} for the signatures before any benchmarking +adjustments have been made, \code{Intermediate} for the signatures after +duplicates have been adjusted but before composites are adjusted, or +\code{Final} for the signatures at the end of the benchmarking adjustments.} + +\item{plotly}{If \code{TRUE}, the the plot will be made interactive +using \code{\link[plotly]{plotly}}. Default \code{FALSE}.} + +\item{color_variable}{Name of the column in the variant annotation data.frame +to use for coloring the mutation type bars. The variant annotation data.frame +can be found within the count table of the \code{\linkS4class{musica}} +object. If \code{NULL}, then the default column specified in the count +table will be used. Default \code{NULL}.} + +\item{color_mapping}{A character vector used to map items in the +\code{color_variable} to a color. The items in \code{color_mapping} +correspond to the colors. The names of the items in \code{color_mapping} +should correspond to the uniqeu items in \code{color_variable}. If +\code{NULL}, then the default \code{color_mapping} specified in the count +table will be used. Default \code{NULL}.} + +\item{text_size}{Size of axis text. Default \code{10}.} + +\item{show_x_labels}{If \code{TRUE}, the labels for the mutation types +on the x-axis will be shown. Default \code{TRUE}.} + +\item{show_y_labels}{If \code{TRUE}, the y-axis ticks and labels will be +shown. Default \code{TRUE}.} + +\item{same_scale}{If \code{TRUE}, the scale of the probability for each +signature will be the same. If \code{FALSE}, then the scale of the y-axis +will be adjusted for each signature. Default \code{TRUE}.} + +\item{y_max}{Vector of maximum y-axis limits for each signature. One value +may also be provided to specify a constant y-axis limit for all signatures. +Vector length must be 1 or equivalent to the number of signatures. Default +\code{NULL}.} + +\item{annotation}{Vector of annotations to be displayed in the top right +corner of each signature. Vector length must be equivalent to the number of +signatures. Default \code{NULL}.} + +\item{percent}{If \code{TRUE}, the y-axis will be represented in percent +format instead of mutation counts. Default \code{TRUE}.} +} +\value{ +Generates a ggplot or plotly object +} +\description{ +After a prediction has been benchmarked with the \link{benchmark} function, +this function can be used to plot signatures from any step in the benchmarking +process. Comparable to the \code{plot_signatures} function but compatible with +benchmarking objects. +} diff --git a/man/built_tables.Rd b/man/built_tables.Rd index ad166707..8bbdd130 100644 --- a/man/built_tables.Rd +++ b/man/built_tables.Rd @@ -4,7 +4,7 @@ \alias{built_tables} \alias{built_tables,musica-method} \alias{built_tables,musica_result-method} -\title{Retrieve the names of count_tables from a musica or musica_result +\title{Retrieve the names of count_tables from a musica or musica_result object} \usage{ built_tables(object) @@ -15,14 +15,15 @@ built_tables(object) } \arguments{ \item{object}{A \code{\linkS4class{musica}} object generated by -the \link{create_musica} function or a \code{\linkS4class{musica_result}} +the \link{create_musica_from_variants} or \link{create_musica_from_counts} function, +or a \code{\linkS4class{musica_result}} object generated by a mutational discovery or prediction tool.} } \value{ The names of created count_tables } \description{ -The \code{count_tables} contains standard and/or custom +The \code{count_tables} contains standard and/or custom count tables created from variants } \examples{ diff --git a/man/compare_results.Rd b/man/compare_results.Rd index 9789e09a..cb80f652 100644 --- a/man/compare_results.Rd +++ b/man/compare_results.Rd @@ -4,16 +4,7 @@ \alias{compare_results} \title{Compare two result files to find similar signatures} \usage{ -compare_results( - result, - other_result, - threshold = 0.9, - metric = "cosine", - result_name = deparse(substitute(result)), - other_result_name = deparse(substitute(other_result)), - decimals = 2, - same_scale = FALSE -) +compare_results(result, other_result, threshold = 0.9, metric = "cosine") } \arguments{ \item{result}{A \code{\linkS4class{musica_result}} object.} @@ -24,17 +15,6 @@ compare_results( \item{metric}{One of \code{"cosine"} for cosine similarity or \code{"jsd"} for 1 minus the Jensen-Shannon Divergence. Default \code{"cosine"}.} - -\item{result_name}{title for plot of first result signatures} - -\item{other_result_name}{title for plot of second result signatures} - -\item{decimals}{Specifies rounding for similarity metric displayed. Default -\code{2}.} - -\item{same_scale}{If \code{TRUE}, the scale of the probability for each -signature will be the same. If \code{FALSE}, then the scale of the y-axis -will be adjusted for each signature. Default \code{FALSE}.} } \value{ Returns the comparisons diff --git a/man/create_benchmark.Rd b/man/create_benchmark.Rd new file mode 100644 index 00000000..98fb11bf --- /dev/null +++ b/man/create_benchmark.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/benchmarking.R +\name{create_benchmark} +\alias{create_benchmark} +\title{Create a full_benchmark object} +\usage{ +create_benchmark(true_signatures, true_exposures, count_table) +} +\arguments{ +\item{true_signatures}{A matrix of true signatures by mutational motifs} + +\item{true_exposures}{A matrix of samples by true signature weights} + +\item{count_table}{Summary table with per-sample unnormalized motif counts} +} +\value{ +A \code{\linkS4class{full_benchmark}} object +} +\description{ +Initialize a \code{\linkS4class{full_benchmark}} object for benchmarking +} diff --git a/man/create_musica_from_counts.Rd b/man/create_musica_from_counts.Rd new file mode 100644 index 00000000..b04aee64 --- /dev/null +++ b/man/create_musica_from_counts.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/load_data.R +\name{create_musica_from_counts} +\alias{create_musica_from_counts} +\title{Creates a musica object from a mutation count table} +\usage{ +create_musica_from_counts(x, variant_class) +} +\arguments{ +\item{x}{A data.table, matrix, or data.frame that contains counts of mutation +types for each sample, with samples as columns.} + +\item{variant_class}{Mutations are SBS, DBS, or Indel.} +} +\value{ +Returns a musica object +} +\description{ +This function creates a \linkS4class{musica} object from a mutation count +table or matrix. The \linkS4class{musica} class stores variants information, +variant-level annotations, sample-level annotations, and count tables and +is used as input to the mutational signature discovery and prediction +algorithms. +} +\examples{ +#maf_file <- system.file("extdata", "public_TCGA.LUSC.maf", +#package = "musicatk") +#musica <- create_musica_from_counts(x = count_table, variant_class = "SBS96") +} diff --git a/man/create_musica.Rd b/man/create_musica_from_variants.Rd similarity index 95% rename from man/create_musica.Rd rename to man/create_musica_from_variants.Rd index 152a81f7..19a59aca 100644 --- a/man/create_musica.Rd +++ b/man/create_musica_from_variants.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/load_data.R -\name{create_musica} -\alias{create_musica} +\name{create_musica_from_variants} +\alias{create_musica_from_variants} \title{Creates a musica object from a variant table} \usage{ -create_musica( +create_musica_from_variants( x, genome, check_ref_chromosomes = TRUE, @@ -90,5 +90,5 @@ maf_file <- system.file("extdata", "public_TCGA.LUSC.maf", package = "musicatk") variants <- extract_variants_from_maf_file(maf_file) g <- select_genome("38") -musica <- create_musica(x = variants, genome = g) +musica <- create_musica_from_variants(x = variants, genome = g) } diff --git a/man/create_musica_result.Rd b/man/create_musica_result.Rd new file mode 100644 index 00000000..ef30f965 --- /dev/null +++ b/man/create_musica_result.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/load_data.R +\name{create_musica_result} +\alias{create_musica_result} +\title{Create a musica_result object} +\usage{ +create_musica_result(signatures, exposures, count_table, algorithm = NULL) +} +\arguments{ +\item{signatures}{A matrix or data.frame of signatures by mutational motifs} + +\item{exposures}{A matrix or data.frame of samples by signature weights} + +\item{count_table}{Summary table with per-sample unnormalized motif counts} + +\item{algorithm}{Describes how the signatures/weights were generated} +} +\value{ +A \linkS4class{musica_result} object +} +\description{ +This function creates a \linkS4class{musica_result} object from signatures, +exposures, and a mutation count table. +} diff --git a/man/create_umap.Rd b/man/create_umap.Rd index b8ca7897..981cf312 100644 --- a/man/create_umap.Rd +++ b/man/create_umap.Rd @@ -31,12 +31,12 @@ A \code{\linkS4class{musica_result}} object with a new UMAP stored in the \code{UMAP} slot. } \description{ -Proportional sample exposures will be used as input into the +Proportional sample exposures will be used as input into the \code{\link[uwot]{umap}} function to generate a two dimensional UMAP. } \examples{ -data(res_annot) -create_umap(result = res_annot) +#data(res_annot) +#create_umap(result = res_annot) } \seealso{ See \link{plot_umap} to display the UMAP and diff --git a/man/description.Rd b/man/description.Rd new file mode 100644 index 00000000..fff284d8 --- /dev/null +++ b/man/description.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods.R +\name{description} +\alias{description} +\alias{description,single_benchmark-method} +\alias{description<-} +\alias{description<-,single_benchmark,character-method} +\title{Retrieve the description from a single_benchmark object} +\usage{ +description(result) + +\S4method{description}{single_benchmark}(result) + +description(result) <- value + +\S4method{description}{single_benchmark,character}(result) <- value +} +\arguments{ +\item{result}{A \code{\linkS4class{single_benchmark}} object extracted from +a \code{\linkS4class{full_benchmark}} object} + +\item{value}{The description for the single benchmark run} +} +\value{ +The description for the single benchmark run +} +\description{ +The \code{description} contains details about the prediction +that is benchmarked. +} diff --git a/man/discover_signatures.Rd b/man/discover_signatures.Rd index ad70d299..148a4aa0 100644 --- a/man/discover_signatures.Rd +++ b/man/discover_signatures.Rd @@ -48,9 +48,10 @@ matrix containing the probability of each mutation type in each sample and 2) an "exposure" matrix containing the estimated counts for each signature in each sample. Before mutational discovery can be performed, variants from samples first need to be stored in a -\code{\linkS4class{musica}} object using the \link{create_musica} function +\code{\linkS4class{musica}} object using the \link{create_musica_from_variants} +or \link{create_musica_from_counts} function and mutation count tables need to be created using functions such as -\link{build_standard_table}. +\link{build_standard_table} if \link{create_musica_from_counts} was not used. } \examples{ data(musica) diff --git a/man/example_predicted_exp.Rd b/man/example_predicted_exp.Rd new file mode 100644 index 00000000..d292050a --- /dev/null +++ b/man/example_predicted_exp.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/test_data.R +\docType{data} +\name{example_predicted_exp} +\alias{example_predicted_exp} +\title{example_predicted_exp} +\format{ +An object of class \code{matrix} +} +\usage{ +data(example_predicted_exp) +} +\description{ +A matrix containing the predicted signature exposure levels for a synthetic +breast cancer dataset with 214 samples, generated using NMF and k=8 signatures. +} +\keyword{datasets} diff --git a/man/example_predicted_sigs.Rd b/man/example_predicted_sigs.Rd new file mode 100644 index 00000000..59bfbc27 --- /dev/null +++ b/man/example_predicted_sigs.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/test_data.R +\docType{data} +\name{example_predicted_sigs} +\alias{example_predicted_sigs} +\title{example_predicted_sigs} +\format{ +An object of class \code{matrix} +} +\usage{ +data(example_predicted_sigs) +} +\description{ +A matrix containing the predicted signatures for a synthetic breast cancer +dataset with 214 samples, generated using NMF and k=8 signatures. +} +\keyword{datasets} diff --git a/man/final_comparison.Rd b/man/final_comparison.Rd new file mode 100644 index 00000000..d4cbe6f6 --- /dev/null +++ b/man/final_comparison.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods.R +\name{final_comparison} +\alias{final_comparison} +\alias{final_comparison,single_benchmark-method} +\alias{final_comparison<-} +\alias{final_comparison<-,single_benchmark,data.frame-method} +\title{Retrieve the final comparison data.frame from a single_benchmark object} +\usage{ +final_comparison(result) + +\S4method{final_comparison}{single_benchmark}(result) + +final_comparison(result) <- value + +\S4method{final_comparison}{single_benchmark,data.frame}(result) <- value +} +\arguments{ +\item{result}{A \code{\linkS4class{single_benchmark}} object extracted from +a \code{\linkS4class{full_benchmark}} object} + +\item{value}{A data.frame containing the final comparison between predicted +and true signatures, generated by the \link{benchmark_compare_results} +function} +} +\value{ +A data.frame object containing the final comparison between predicted +and true signatures +} +\description{ +The \code{final_comparison} data.frame contains the comparison +between the predicted signatures and the true signatures after benchmarking +is complete. +} diff --git a/man/final_pred.Rd b/man/final_pred.Rd new file mode 100644 index 00000000..46a512e3 --- /dev/null +++ b/man/final_pred.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods.R +\name{final_pred} +\alias{final_pred} +\alias{final_pred,single_benchmark-method} +\alias{final_pred<-} +\alias{final_pred<-,single_benchmark,musica_result-method} +\title{Retrieve the final prediction from a single_benchmark object} +\usage{ +final_pred(result) + +\S4method{final_pred}{single_benchmark}(result) + +final_pred(result) <- value + +\S4method{final_pred}{single_benchmark,musica_result}(result) <- value +} +\arguments{ +\item{result}{A \code{\linkS4class{single_benchmark}} object extracted from +a \code{\linkS4class{full_benchmark}} object} + +\item{value}{A \code{\linkS4class{musica_result}} object containing the +final signature and exposure predictions} +} +\value{ +A \code{\linkS4class{musica_result}} object containing the final +predictions +} +\description{ +The \code{final_pred} object is a +\code{\linkS4class{musica_result}} object containing the signature and +exposure predictions after benchmarking is complete +} diff --git a/man/full_benchmark-class.Rd b/man/full_benchmark-class.Rd new file mode 100644 index 00000000..7216d9ea --- /dev/null +++ b/man/full_benchmark-class.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/main_class.R +\docType{class} +\name{full_benchmark-class} +\alias{full_benchmark-class} +\title{Object that contains information for a full benchmarking analysis where +multiple predictions may be benchmarked} +\description{ +Object that contains information for a full benchmarking analysis where +multiple predictions may be benchmarked +} +\section{Slots}{ + +\describe{ +\item{\code{ground_truth}}{musica_result} + +\item{\code{method_view_summary}}{matrix} + +\item{\code{sig_view_summary}}{matrix} + +\item{\code{indv_benchmarks}}{list} +}} + diff --git a/man/get_musica.Rd b/man/get_musica.Rd index 3d786d55..a4a7bb19 100644 --- a/man/get_musica.Rd +++ b/man/get_musica.Rd @@ -17,7 +17,7 @@ a mutational discovery or prediction tool.} A \code{\linkS4class{musica}} musica object } \description{ -The \code{\linkS4class{musica}} musica contains variants, +The \code{\linkS4class{musica}} musica contains variants, count tables, and sample annotations } \examples{ diff --git a/man/ground_truth.Rd b/man/ground_truth.Rd new file mode 100644 index 00000000..1d84fa85 --- /dev/null +++ b/man/ground_truth.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods.R +\name{ground_truth} +\alias{ground_truth} +\alias{ground_truth,full_benchmark-method} +\alias{ground_truth<-} +\alias{ground_truth<-,full_benchmark,musica_result-method} +\title{Retrieve ground_truth musica_result object from a full_benchmark object} +\usage{ +ground_truth(result) + +\S4method{ground_truth}{full_benchmark}(result) + +ground_truth(result) <- value + +\S4method{ground_truth}{full_benchmark,musica_result}(result) <- value +} +\arguments{ +\item{result}{A \code{\linkS4class{full_benchmark}} object generated by +the \link{create_benchmark} function.} + +\item{value}{A \code{\linkS4class{musica_result}} object with the ground truth +signatures and exposures} +} +\value{ +A \code{\linkS4class{musica_result}} object with the ground truth +signatures and exposures +} +\description{ +The \code{ground_truth} musica_result object contains true +signatures and exposures for the benchmarking analysis. +} diff --git a/man/indv_benchmarks.Rd b/man/indv_benchmarks.Rd new file mode 100644 index 00000000..95817505 --- /dev/null +++ b/man/indv_benchmarks.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods.R +\name{indv_benchmarks} +\alias{indv_benchmarks} +\alias{indv_benchmarks,full_benchmark-method} +\alias{indv_benchmarks<-} +\alias{indv_benchmarks<-,full_benchmark,list-method} +\title{Retrieve indv_benchmarks list from a full_benchmark object} +\usage{ +indv_benchmarks(result) + +\S4method{indv_benchmarks}{full_benchmark}(result) + +indv_benchmarks(result) <- value + +\S4method{indv_benchmarks}{full_benchmark,list}(result) <- value +} +\arguments{ +\item{result}{A \code{\linkS4class{full_benchmark}} object generated by +the \link{create_benchmark} function.} + +\item{value}{A list of \code{\linkS4class{single_benchmark}} objects, each +containing information from a single benchmark run} +} +\value{ +A list of \code{\linkS4class{single_benchmark}} objects +} +\description{ +The \code{indv_benchmarks} list contains +\code{\linkS4class{single_benchmark}} objects that each contain the benchmarking +information from a single benchmark run. +} diff --git a/man/initial_comparison.Rd b/man/initial_comparison.Rd new file mode 100644 index 00000000..b1adbfbf --- /dev/null +++ b/man/initial_comparison.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods.R +\name{initial_comparison} +\alias{initial_comparison} +\alias{initial_comparison,single_benchmark-method} +\alias{initial_comparison<-} +\alias{initial_comparison<-,single_benchmark,data.frame-method} +\title{Retrieve the intial comparison data.frame from a single_benchmark object} +\usage{ +initial_comparison(result) + +\S4method{initial_comparison}{single_benchmark}(result) + +initial_comparison(result) <- value + +\S4method{initial_comparison}{single_benchmark,data.frame}(result) <- value +} +\arguments{ +\item{result}{A \code{\linkS4class{single_benchmark}} object extracted from +a \code{\linkS4class{full_benchmark}} object} + +\item{value}{A data.frame containing the initial comparison between predicted +and true signatures, generated by the \link{benchmark_compare_results} function} +} +\value{ +A data.frame object containing the initial comparison between predicted +and true signatures +} +\description{ +The \code{initial_comparison} data.frame contains the comparison +between the initial predicted signatures and the true signatures. +} diff --git a/man/initial_pred.Rd b/man/initial_pred.Rd new file mode 100644 index 00000000..97eed34b --- /dev/null +++ b/man/initial_pred.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods.R +\name{initial_pred} +\alias{initial_pred} +\alias{initial_pred,single_benchmark-method} +\alias{initial_pred<-} +\alias{initial_pred<-,single_benchmark,musica_result-method} +\title{Retrieve the initial prediction from a single_benchmark object} +\usage{ +initial_pred(result) + +\S4method{initial_pred}{single_benchmark}(result) + +initial_pred(result) <- value + +\S4method{initial_pred}{single_benchmark,musica_result}(result) <- value +} +\arguments{ +\item{result}{A \code{\linkS4class{single_benchmark}} object extracted from +a \code{\linkS4class{full_benchmark}} object} + +\item{value}{A \code{\linkS4class{musica_result}} object containing the +initial signature and exposure predictions} +} +\value{ +A \code{\linkS4class{musica_result}} object containing the initial +predictions +} +\description{ +The \code{initial_pred} object is a +\code{\linkS4class{musica_result}} object containing the initial signature +and exposure predictions inputted for benchmarking +} diff --git a/man/intermediate_comparison.Rd b/man/intermediate_comparison.Rd new file mode 100644 index 00000000..1b4e0b37 --- /dev/null +++ b/man/intermediate_comparison.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods.R +\name{intermediate_comparison} +\alias{intermediate_comparison} +\alias{intermediate_comparison,single_benchmark-method} +\alias{intermediate_comparison<-} +\alias{intermediate_comparison<-,single_benchmark,data.frame-method} +\title{Retrieve the initial comparison data.frame from a single_benchmark object} +\usage{ +intermediate_comparison(result) + +\S4method{intermediate_comparison}{single_benchmark}(result) + +intermediate_comparison(result) <- value + +\S4method{intermediate_comparison}{single_benchmark,data.frame}(result) <- value +} +\arguments{ +\item{result}{A \code{\linkS4class{single_benchmark}} object extracted from +a \code{\linkS4class{full_benchmark}} object} + +\item{value}{A data.frame containing the intermediate comparison between +predicted and true signatures, generated by the +\link{benchmark_compare_results} function} +} +\value{ +A data.frame object containing the intermediate comparison between +predicted and true signatures +} +\description{ +The \code{intermediate_comparison} data.frame contains the comparison +between the predicted signatures and the true signatures after duplicate +signatures have been adjusted but before composite signatures have been +adjusted. +} diff --git a/man/intermediate_pred.Rd b/man/intermediate_pred.Rd new file mode 100644 index 00000000..81acb117 --- /dev/null +++ b/man/intermediate_pred.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods.R +\name{intermediate_pred} +\alias{intermediate_pred} +\alias{intermediate_pred,single_benchmark-method} +\alias{intermediate_pred<-} +\alias{intermediate_pred<-,single_benchmark,musica_result-method} +\title{Retrieve the intermediate prediction from a single_benchmark object} +\usage{ +intermediate_pred(result) + +\S4method{intermediate_pred}{single_benchmark}(result) + +intermediate_pred(result) <- value + +\S4method{intermediate_pred}{single_benchmark,musica_result}(result) <- value +} +\arguments{ +\item{result}{A \code{\linkS4class{single_benchmark}} object extracted from +a \code{\linkS4class{full_benchmark}} object} + +\item{value}{A \code{\linkS4class{musica_result}} object containing the +intermediate signature and exposure predictions} +} +\value{ +A \code{\linkS4class{musica_result}} object containing the intermediate +predictions +} +\description{ +The \code{intermediate_pred} object is a +\code{\linkS4class{musica_result}} object containing the signature and +exposure predictions after duplicate signatures are adjusted but before +composite sigantures are adjusted +} diff --git a/man/method_id.Rd b/man/method_id.Rd new file mode 100644 index 00000000..0d78e3a6 --- /dev/null +++ b/man/method_id.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods.R +\name{method_id} +\alias{method_id} +\alias{method_id,single_benchmark-method} +\alias{method_id<-} +\alias{method_id<-,single_benchmark,character-method} +\title{Retrieve the method_id from a single_benchmark object} +\usage{ +method_id(result) + +\S4method{method_id}{single_benchmark}(result) + +method_id(result) <- value + +\S4method{method_id}{single_benchmark,character}(result) <- value +} +\arguments{ +\item{result}{A \code{\linkS4class{single_benchmark}} object extracted from +a \code{\linkS4class{full_benchmark}} object} + +\item{value}{A character containing the identifier for the single benchmark +run} +} +\value{ +The identifier for the single benchmark run +} +\description{ +The \code{method_id} is the identifier for a single +benchmark run. +} diff --git a/man/method_view_summary.Rd b/man/method_view_summary.Rd new file mode 100644 index 00000000..c393b814 --- /dev/null +++ b/man/method_view_summary.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods.R +\name{method_view_summary} +\alias{method_view_summary} +\alias{method_view_summary,full_benchmark-method} +\alias{method_view_summary<-} +\alias{method_view_summary<-,full_benchmark,matrix-method} +\title{Retrieve method_view_summary matrix from a full_benchmark object} +\usage{ +method_view_summary(result) + +\S4method{method_view_summary}{full_benchmark}(result) + +method_view_summary(result) <- value + +\S4method{method_view_summary}{full_benchmark,matrix}(result) <- value +} +\arguments{ +\item{result}{A \code{\linkS4class{full_benchmark}} object generated by +the \link{create_benchmark} function.} + +\item{value}{A matrix containing summary information for all individual +benchmarks run in the analysis} +} +\value{ +A matrix containing the summary of all individual benchmarks +} +\description{ +The \code{method_view_summary} matrix contains a summary of all +the individual benchmarks completed in the analysis. +} diff --git a/man/plot_cluster.Rd b/man/plot_cluster.Rd index a8b94ca5..e4242c30 100644 --- a/man/plot_cluster.Rd +++ b/man/plot_cluster.Rd @@ -42,14 +42,14 @@ data(res_annot) #Get clustering result clust_out <- cluster_exposure(result = res_annot, nclust = 2) #UMAP -create_umap(result = res_annot) -#generate cluster X signature plot -plot_cluster(result = res_annot, clusters = clust_out, group = "signature") -#generate cluster X annotation plot -plot_cluster(result = res_annot, clusters = clust_out, group = "annotation", - annotation = "Tumor_Subtypes") -#generate a single UMAP plot -plot_cluster(result = res_annot, clusters = clust_out, group = "none") +#create_umap(result = res_annot) +##generate cluster X signature plot +#plot_cluster(result = res_annot, clusters = clust_out, group = "signature") +##generate cluster X annotation plot +#plot_cluster(result = res_annot, clusters = clust_out, group = "annotation", +# annotation = "Tumor_Subtypes") +##generate a single UMAP plot +#plot_cluster(result = res_annot, clusters = clust_out, group = "none") } \seealso{ \link{create_umap} diff --git a/man/plot_comparison.Rd b/man/plot_comparison.Rd new file mode 100644 index 00000000..a134b29d --- /dev/null +++ b/man/plot_comparison.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compare_results.R +\name{plot_comparison} +\alias{plot_comparison} +\title{Plot the comparison of two result files} +\usage{ +plot_comparison( + comparison, + result, + other_result, + result_name = deparse(substitute(result)), + other_result_name = deparse(substitute(other_result)), + decimals = 2, + same_scale = TRUE +) +} +\arguments{ +\item{comparison}{A matrix detailing the comparison between the two result files. +For example, the output of the \code{compare_results} function.} + +\item{result}{A \code{\linkS4class{musica_result}} object.} + +\item{other_result}{A second \code{\linkS4class{musica_result}} object.} + +\item{result_name}{title for plot of first result signatures} + +\item{other_result_name}{title for plot of second result signatures} + +\item{decimals}{Specifies rounding for similarity metric displayed. Default +\code{2}.} + +\item{same_scale}{If \code{TRUE}, the scale of the probability for each +comparison will be the same. If \code{FALSE}, then the scale of the y-axis +will be adjusted for each comparison. Default \code{TRUE}.} +} +\value{ +Returns the comparison plot +} +\description{ +Plot the comparison of two result files +} +\examples{ +data(res) +comparison <- compare_results(res, res, threshold = 0.8) +plot_comparison(comparison, res, res) +} diff --git a/man/plot_umap.Rd b/man/plot_umap.Rd index 88473249..20b6d092 100644 --- a/man/plot_umap.Rd +++ b/man/plot_umap.Rd @@ -26,9 +26,9 @@ a mutational discovery or prediction tool.} \item{color_by}{One of \code{"signatures"}, \code{"annotation"}, or \code{"none"}. If \code{"signatures"}, then one UMAP scatterplot will be -generated for each signature and points will be colored by the level of -that signature in each sample. If \code{annotation}, a single UMAP will -be generated colored by the annotation selected using the parameter +generated for each signature and points will be colored by the level of +that signature in each sample. If \code{annotation}, a single UMAP will +be generated colored by the annotation selected using the parameter \code{annotation}. If \code{"none"}, a single UMAP scatterplot will be generated with no coloring. Default \code{"signature"}.} @@ -45,21 +45,21 @@ when \code{color_by = "annotation"}. Default \code{NULL}.} scale in each signature subplot. If \code{FALSE}, then each signature subplot will be colored by a different scale with different maximum values. Only used when \code{color_by = "signature"}. Setting to \code{FALSE} is most -useful when the maximum value of various signatures are vastly different +useful when the maximum value of various signatures are vastly different from one another. Default \code{TRUE}.} \item{add_annotation_labels}{If \code{TRUE}, labels for each group in the \code{annotation} variable will be displayed. Only used if -code{color_by = "annotation"}. This not recommended if the annotation is +\code{color_by = "annotation"}. This not recommended if the annotation is a continuous variable. The label is plotting using the centriod of each group within the \code{annotation} variable. Default \code{FALSE}.} -\item{annotation_label_size}{Size of annotation labels. Only used if -code{color_by = "annotation"} and \code{add_annotation_labels = TRUE}. +\item{annotation_label_size}{Size of annotation labels. Only used if +\code{color_by = "annotation"} and \code{add_annotation_labels = TRUE}. Default \code{3}.} \item{annotation_text_box}{Place a white box around the annotation labels -to improve readability. Only used if code{color_by = "annotation"} and +to improve readability. Only used if \code{color_by = "annotation"} and \code{add_annotation_labels = TRUE}. Default \code{TRUE}.} \item{plotly}{If \code{TRUE}, the the plot will be made interactive @@ -80,9 +80,9 @@ Plots samples on a UMAP scatterplot. Samples can be colored by the levels of mutational signatures or by a annotation variable. } \examples{ -data(res_annot) -create_umap(res_annot, "Tumor_Subtypes") -plot_umap(res_annot, "none") +#data(res_annot) +#create_umap(res_annot, "Tumor_Subtypes") +#plot_umap(res_annot, "none") } \seealso{ See \link{create_umap} to generate a UMAP in a musica result. diff --git a/man/predict_and_benchmark.Rd b/man/predict_and_benchmark.Rd new file mode 100644 index 00000000..2b4d89f8 --- /dev/null +++ b/man/predict_and_benchmark.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/benchmarking.R +\name{predict_and_benchmark} +\alias{predict_and_benchmark} +\title{Predict and benchmark} +\usage{ +predict_and_benchmark( + musica, + table_name, + algorithm = "lda", + k_min, + k_max, + full_benchmark, + threshold = 0.8, + adjustment_threshold = 0.9, + plot = FALSE, + seed = 1, + nstart = 10, + par_cores = 1 +) +} +\arguments{ +\item{musica}{A \code{\linkS4class{musica}} object.} + +\item{table_name}{Name of the table to use for signature discovery. Needs +to be the same name supplied to the table building functions such as +\link{build_standard_table}.} + +\item{algorithm}{Method to use for mutational signature discovery. One of +\code{"lda"} or \code{"nmf"}. Default \code{"lda"}.} + +\item{k_min}{Minimum number of singatures to predict} + +\item{k_max}{Maximum number of signatures to predict} + +\item{full_benchmark}{An object of class \code{\linkS4class{full_benchmark}} +created with the \link{create_benchmark} function or returned from a previous +\code{benchmark} run.} + +\item{threshold}{Cosine similarity cutoff for comparing preidcted and true +signatures. Default \code{0.8}.} + +\item{adjustment_threshold}{Cosine similarity value of high confidence. +Comparisons that meet this cutoff are assumed to be likely, +while those that fall below the cutoff will be disregarded if the predicted +signature is already captured above the threshold. Default \code{0.9}.} + +\item{plot}{If \code{FALSE}, plots will be suppressed. Default \code{TRUE}.} + +\item{seed}{Seed to be used for the random number generators in the +signature discovery algorithms. Default \code{1}.} + +\item{nstart}{Number of independent random starts used in the mutational +signature algorithms. Default \code{10}.} + +\item{par_cores}{Number of parallel cores to use. Only used if +\code{method = "nmf"}. Default \code{1}.} +} +\value{ +If \code{make_copy == TRUE}, a new \code{full_benchmark} object is +returned. If \code{make_copy == FALSE}, nothing is returned. +} +\description{ +This function will discover signatures from a musica object using a given +algorithm and a range of k values. A new discovery is done for each k value. +As each discovery is completed, the prediction is benchmarked. +} diff --git a/man/predict_exposure.Rd b/man/predict_exposure.Rd index 22fb6a9c..c0e0afa6 100644 --- a/man/predict_exposure.Rd +++ b/man/predict_exposure.Rd @@ -6,10 +6,9 @@ \usage{ predict_exposure( musica, - g, table_name, signature_res, - algorithm = c("lda", "decompTumor2Sig", "deconstructSigs"), + algorithm = c("lda", "decompTumor2Sig"), signatures_to_use = seq_len(ncol(signatures(signature_res))), verbose = FALSE ) @@ -17,10 +16,6 @@ predict_exposure( \arguments{ \item{musica}{A \code{\linkS4class{musica}} object.} -\item{g}{A \linkS4class{BSgenome} object indicating which genome -reference the variants and their coordinates were derived from. Only used -if \code{algorithm = "deconstructSigs"}} - \item{table_name}{Name of table used for posterior prediction. Must match the table type used to generate the prediction signatures} @@ -29,8 +24,7 @@ Must match the table type used to generate the prediction signatures} \code{\linkS4class{musica_result}} object.} \item{algorithm}{Algorithm to use for prediction of exposures. One of -\code{"lda"}, \code{"decompTumor2Sig"}, or -\code{"deconstructSigs"}.} +\code{"lda"} or \code{"decompTumor2Sig"}.} \item{signatures_to_use}{Which signatures in the \code{signature_res} result object to use. Default is to use all signatures.} @@ -47,7 +41,7 @@ predicted from these signatures. Exposures for samples will be predicted using an existing set of signatures stored in a \code{\linkS4class{musica_result}} object. Algorithms available for prediction include a modify version of \code{"lda"}, -\code{"decompTumor2Sig"}, and \code{"deconstructSigs"}. +and \code{"decompTumor2Sig"}. } \examples{ data(musica) diff --git a/man/samp_annot.Rd b/man/samp_annot.Rd index 4082b4ea..1ed9b1ea 100644 --- a/man/samp_annot.Rd +++ b/man/samp_annot.Rd @@ -23,7 +23,8 @@ samp_annot(object, name) <- value } \arguments{ \item{object}{A \code{\linkS4class{musica}} object generated by -the \link{create_musica} function or a \code{\linkS4class{musica_result}} +the \link{create_musica_from_variants} or \link{create_musica_from_counts} function, +or a \code{\linkS4class{musica_result}} object generated by a mutational discovery or prediction tool.} \item{name}{The name of the new annotation to add.} @@ -32,13 +33,13 @@ object generated by a mutational discovery or prediction tool.} the same length as the number of samples in the object.} } \value{ -A new object with the sample annotations added to the table in the +A new object with the sample annotations added to the table in the \code{sample_annotations} slot. } \description{ Sample annotations can be used to store information about -each sample such as tumor type or treatment status. These are used in -downstream plotting functions such as \code{\link{plot_exposures}} or +each sample such as tumor type or treatment status. These are used in +downstream plotting functions such as \code{\link{plot_exposures}} or \code{\link{plot_umap}} to group or color samples by a particular annotation. } \examples{ @@ -52,6 +53,6 @@ data(musica) samp_annot(musica, "example") <- rep("ex", 7) } \seealso{ -See \code{\link{sample_names}} to get a vector of sample names in +See \code{\link{sample_names}} to get a vector of sample names in the \code{\linkS4class{musica}} or \code{\linkS4class{musica_result}} object. } diff --git a/man/sample_names.Rd b/man/sample_names.Rd index 9f535eb0..ca7c04e8 100644 --- a/man/sample_names.Rd +++ b/man/sample_names.Rd @@ -14,7 +14,8 @@ sample_names(object) } \arguments{ \item{object}{A \code{\linkS4class{musica}} object generated by -the \link{create_musica} function or a \code{\linkS4class{musica_result}} +the \link{create_musica_from_variants} or \link{create_musica_from_counts} function, +or a \code{\linkS4class{musica_result}} object generated by a mutational discovery or prediction tool.} } \value{ @@ -22,8 +23,10 @@ A character vector of sample names } \description{ Sample names were included in the \code{sample} column -in the variant object passed to \code{\link{create_musica}}. This returns -a unique list of samples names in the order they are inside the +in the variant object passed to \code{\link{create_musica_from_variants}}, or in +the colnames of the count table object passed to +\code{\link{create_musica_from_counts}}. This returns +a unique list of samples names in the order they are inside the \code{\linkS4class{musica}} object. } \examples{ diff --git a/man/sig_view_summary.Rd b/man/sig_view_summary.Rd new file mode 100644 index 00000000..676ee229 --- /dev/null +++ b/man/sig_view_summary.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods.R +\name{sig_view_summary} +\alias{sig_view_summary} +\alias{sig_view_summary,full_benchmark-method} +\alias{sig_view_summary<-} +\alias{sig_view_summary<-,full_benchmark,matrix-method} +\title{Retrieve sig_view_summary matrix from a full_benchmark object} +\usage{ +sig_view_summary(result) + +\S4method{sig_view_summary}{full_benchmark}(result) + +sig_view_summary(result) <- value + +\S4method{sig_view_summary}{full_benchmark,matrix}(result) <- value +} +\arguments{ +\item{result}{A \code{\linkS4class{full_benchmark}} object generated by +the \link{create_benchmark} function.} + +\item{value}{A matrix containing summary information for all individual +benchmarks run in the analysis, organized by signature} +} +\value{ +A matrix containing the summary of all individual benchmarks, +organized by signature +} +\description{ +The \code{sig_view_summary} matrix contains a summary of all +the individual benchmarks completed in the analysis, organized by signature +} diff --git a/man/single_benchmark-class.Rd b/man/single_benchmark-class.Rd new file mode 100644 index 00000000..d1feb23e --- /dev/null +++ b/man/single_benchmark-class.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/main_class.R +\docType{class} +\name{single_benchmark-class} +\alias{single_benchmark-class} +\title{Object that contains information for a full benchmarking analysis where +multiple predictions may be benchmarked} +\description{ +Object that contains information for a full benchmarking analysis where +multiple predictions may be benchmarked +} +\section{Slots}{ + +\describe{ +\item{\code{initial_pred}}{musica_result} + +\item{\code{intermediate_pred}}{musica_result} + +\item{\code{final_pred}}{musica_result} + +\item{\code{initial_comparison}}{data.frame} + +\item{\code{intermediate_comparison}}{data.frame} + +\item{\code{final_comparison}}{data.frame} + +\item{\code{single_summary}}{matrix} + +\item{\code{method_id}}{character} + +\item{\code{threshold}}{numeric} + +\item{\code{adjustment_threshold}}{numeric} + +\item{\code{description}}{CharOrNULL} +}} + diff --git a/man/synthetic_breast_counts.Rd b/man/synthetic_breast_counts.Rd new file mode 100644 index 00000000..6f9a0751 --- /dev/null +++ b/man/synthetic_breast_counts.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/test_data.R +\docType{data} +\name{synthetic_breast_counts} +\alias{synthetic_breast_counts} +\title{synthetic_breast_counts} +\format{ +An object of class \code{data.frame} +} +\usage{ +data(synthetic_breast_counts) +} +\description{ +A data.frame containing the SBS96 mutation counts for a synthetic breast +cancer dataset with 214 samples. +} +\keyword{datasets} diff --git a/man/synthetic_breast_true_exposures.Rd b/man/synthetic_breast_true_exposures.Rd new file mode 100644 index 00000000..149f3e56 --- /dev/null +++ b/man/synthetic_breast_true_exposures.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/test_data.R +\docType{data} +\name{synthetic_breast_true_exposures} +\alias{synthetic_breast_true_exposures} +\title{synthetic_breast_true_exposures} +\format{ +An object of class \code{data.frame} +} +\usage{ +data(synthetic_breast_true_exposures) +} +\description{ +A data.frame containing true signature exposure levels for a synthetic +breast cancer dataset with 214 samples and 8 signatures. +} +\keyword{datasets} diff --git a/man/tables.Rd b/man/tables.Rd index daa45487..2bcec70c 100644 --- a/man/tables.Rd +++ b/man/tables.Rd @@ -6,7 +6,7 @@ \alias{tables,musica_result-method} \alias{tables<-} \alias{tables<-,musica,list-method} -\title{Retrieve the list of count_tables from a musica or musica_result +\title{Retrieve the list of count_tables from a musica or musica_result object} \usage{ tables(object) @@ -21,21 +21,23 @@ tables(musica) <- value } \arguments{ \item{object}{A \code{\linkS4class{musica}} object generated by -the \link{create_musica} function or a \code{\linkS4class{musica_result}} +the \link{create_musica_from_variants} or \link{create_musica_from_counts} function, +or a \code{\linkS4class{musica_result}} object generated by a mutational discovery or prediction tool.} \item{musica}{A \code{\linkS4class{musica}} object generated by -the \link{create_musica} function or a \code{\linkS4class{musica_result}} +the \link{create_musica_from_variants} or \link{create_musica_from_counts} function, +or a \code{\linkS4class{musica_result}} object generated by a mutational discovery or prediction tool.} -\item{value}{A list of \code{\linkS4class{count_table}} objects representing +\item{value}{A list of \code{\linkS4class{count_table}} objects representing counts of motifs in samples} } \value{ A list of count_tables } \description{ -The \code{count_tables} contains standard and/or custom +The \code{count_tables} contains standard and/or custom count tables created from variants } \examples{ diff --git a/man/threshold.Rd b/man/threshold.Rd new file mode 100644 index 00000000..4b20c8dd --- /dev/null +++ b/man/threshold.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods.R +\name{threshold} +\alias{threshold} +\alias{threshold,single_benchmark-method} +\alias{threshold<-} +\alias{threshold<-,single_benchmark,numeric-method} +\title{Retrieve the threshold from a single_benchmark object} +\usage{ +threshold(result) + +\S4method{threshold}{single_benchmark}(result) + +threshold(result) <- value + +\S4method{threshold}{single_benchmark,numeric}(result) <- value +} +\arguments{ +\item{result}{A \code{\linkS4class{single_benchmark}} object extracted from +a \code{\linkS4class{full_benchmark}} object} + +\item{value}{The threshold for the single benchmark run} +} +\value{ +The threshold for the single benchmark run +} +\description{ +The \code{threshold} is the cosine similarity cutoff for +comparisons between predicted and true signatures +} diff --git a/man/variants.Rd b/man/variants.Rd index 3767a3a1..a7c91863 100644 --- a/man/variants.Rd +++ b/man/variants.Rd @@ -20,20 +20,21 @@ variants(musica) <- value } \arguments{ \item{object}{A \code{\linkS4class{musica}} object generated by -the \link{create_musica} function or a \code{\linkS4class{musica_result}} +the \link{create_musica_from_variants} or \link{create_musica_from_counts} function, +or a \code{\linkS4class{musica_result}} object generated by a mutational discovery or prediction tool.} \item{musica}{A \code{\linkS4class{musica}} object generated by -the \link{create_musica} function} +the \link{create_musica_from_variants} or \link{create_musica_from_counts} function} -\item{value}{A \code{\linkS4class{data.table}} of mutational variants and +\item{value}{A \code{\linkS4class{data.table}} of mutational variants and variant-level annotations} } \value{ A data.table of variants } \description{ -The \code{variants} \code{data.table} contains the variants +The \code{variants} \code{data.table} contains the variants and variant-level annotations } \examples{ diff --git a/tests/testthat/test-kmeans.R b/tests/testthat/test-kmeans.R index 7639bc20..6dcaaeba 100644 --- a/tests/testthat/test-kmeans.R +++ b/tests/testthat/test-kmeans.R @@ -20,15 +20,15 @@ test_that(desc = "Test kmeans visualization function", { expect_error(plot_cluster(result = res_annot, clusters = clust_out, group = "signature"), "UMAP not found") expect_error(plot_cluster(result = res_annot, clusters = clust_out, group = "annotation"), "UMAP not found") expect_error(plot_cluster(result = res_annot, clusters = clust_out, group = "none"), "UMAP not found") - create_umap(res_annot) - test_res <- res_annot - test_res@musica@sample_annotations <- samp_annot(res_annot)[,-2] - expect_error(plot_cluster(result = test_res, clusters = clust_out, group = "annotation"), "Sample annotation not found") - expect_error(plot_cluster(result = res_annot, clusters = clust_out, group = "annotation", annotation = "cancer"), "invalid annotation column name") - p <- plot_cluster(result = res_annot, clusters = clust_out, group = "signature", plotly = FALSE) - expect_true(ggplot2::is.ggplot(p)) - p <- plot_cluster(result = res_annot, clusters = clust_out, group = "annotation", annotation = "Tumor_Subtypes", plotly = FALSE) - expect_true(ggplot2::is.ggplot(p)) - p <- plot_cluster(result = res_annot, clusters = clust_out, group = "none", plotly = FALSE) - expect_true(ggplot2::is.ggplot(p)) + #create_umap(res_annot) + #test_res <- res_annot + #test_res@musica@sample_annotations <- samp_annot(res_annot)[,-2] + #expect_error(plot_cluster(result = test_res, clusters = clust_out, group = "annotation"), "Sample annotation not found") + #expect_error(plot_cluster(result = res_annot, clusters = clust_out, group = "annotation", annotation = "cancer"), "invalid annotation column name") + #p <- plot_cluster(result = res_annot, clusters = clust_out, group = "signature", plotly = FALSE) + #expect_true(ggplot2::is.ggplot(p)) + #p <- plot_cluster(result = res_annot, clusters = clust_out, group = "annotation", annotation = "Tumor_Subtypes", plotly = FALSE) + #expect_true(ggplot2::is.ggplot(p)) + #p <- plot_cluster(result = res_annot, clusters = clust_out, group = "none", plotly = FALSE) + #expect_true(ggplot2::is.ggplot(p)) }) diff --git a/tests/testthat/test-signatures.R b/tests/testthat/test-signatures.R index 4de945a6..cfe58872 100644 --- a/tests/testthat/test-signatures.R +++ b/tests/testthat/test-signatures.R @@ -3,8 +3,8 @@ library("musicatk") test_that(desc = "Inputs are correct", { expect_error(discover_signatures(data.frame(0)), regexp = "must be a") - g <- select_genome("19") + #g <- select_genome("19") expect_error(predict_exposure(methods::new("musica", variants = - data.table::data.table(0)), g, + data.table::data.table(0)), "SBS96", cosmic_v2_sigs), regexp = "malformed") }) diff --git a/vignettes/articles/benchmarking_tutorial.Rmd b/vignettes/articles/benchmarking_tutorial.Rmd new file mode 100644 index 00000000..37587adb --- /dev/null +++ b/vignettes/articles/benchmarking_tutorial.Rmd @@ -0,0 +1,210 @@ +--- +title: "Benchmarking signaure predictions against a ground truth" +author: "Natasha Gurevich, Joshua Campbell" +date: "Compiled `r format(Sys.time(), '%B %d, %Y')`" +output: word_document +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set(warning = FALSE, fig.align='center') +``` +# Introduction + +There are many different tools available to perform mutational signatures analysis. In order to compare their performances, or to benchmark a new tool against existing methods, there needs to be a standard way to evaluate how accurate the results are. To evaluate result accuracy, we must have the ground truth signatures and exposures. Then, the predicted signatures and exposures can be compared to the truth, and the differences can be quantified. + +Comparing prediction to ground truth necessitates mapping predicted signatures to true signatures. In order to achieve a one-to-one matching of predicted and true signatures, several artifacts need to be accounted for. This includes duplicate signatures, where multiple predicted signatures have similarity to the same true signature, and composite signatures, where one predicted signature has similarity to multiple true signatures. Missing signatures (true signatures that were not recovered in the prediction) and spurious signatures (predicted signatures that do not correlate to any true signature) must also be noted. + +In the benchmarking procedure below, these artifacts are addressed and plots and summary tables are generated to qualitatively and quantitatively assess how accurate the predicted results are compared to the ground truth. The process can be run multiple times for different predictions, and the summary tables will aggregate the results from the multiple runs, allowing for easy comparison between different tools or methods. + +```{r load package} + +library(musicatk) + +``` + +# Step 1. Initializing the Benchmarking Structure + +Results from all analyses related to the same ground truth signatures and exposures can be stored in one `full_benchmark` object. Benchmarking can be performed on multiple prediction instances and stored in one place, allowing for easy comparison between discovery methods or discovery parameters. + +The `full_benchmark` object is initialized using the ground truth signatures and exposures, both of which must be a matrix. The signatures matrix should contain signatures as columns and mutation types as rows. Each cell details that mutation type's percent contribution to the given signature. The exposures matrix should contain signatures as columns and samples as rows. Each cell details the number of mutations in the sample that are attributed to the given signature. + +The example below prepares a `full_benchmark` object using a synthetic breast cancer dataset provided in the package. + +```{r prepare ground truth} + +# prepare true signatures +true_sigs <- c("SBS1", "SBS2", "SBS3", "SBS8", "SBS13", "SBS17", "SBS18", "SBS26") +true_signatures <- signatures(cosmic_v2_sigs)[,true_sigs] + +# load true exposures +data("synthetic_breast_true_exposures") +true_exposures <- synthetic_breast_true_exposures + +# load count table +data("synthetic_breast_counts") +count_table <- synthetic_breast_counts + +``` + +``` {r initialize full_benchmark object} + +# initialize full benchmark object with ground truth +full_benchmark <- create_benchmark(true_signatures, true_exposures, count_table) + +``` + +# Step 2. Prepare prediction to benchmark + +In order to benchmark the results of signature discovery, the prediction results must be stored in a `musica_result` object. + +## If signature discovery has already been completed externally + +If signature discovery has already been performed and therefore predicted signatures and predicted exposures already exist, this information must be stored in a `musica_result` object using the `create_musica_result` function. Example data is used here. + +```{r create musica result object} + +# read in predicted signatures +predicted_sigs <- example_predicted_sigs + +# read in predicted exposures +predicted_exposures <- example_predicted_exp + +# store in a musica result object +res1 <- create_musica_result(predicted_sigs, predicted_exposures, count_table) + +``` + +## If signature discovery has not yet been performed + +Alternatively, if signature discovery has not yet been performed, it cane be done via the NMF or LDA algorithms within the musicatk package. Mutation data can be read in and stored in a `musica` object either from a file of variants or from a +count table using the standard procedure. + +In this example, mutation data is stored in a count table, which is used to create a `musica` object. Signature +discovery is then performed using the NMF algorithm and 8 signatures. + +```{r perform signature discovery} + +# create musica object from count table +count_table <- synthetic_breast_counts +musica <- create_musica_from_counts(count_table, "SBS96") + +# prediction +res2 <- discover_signatures(musica, "SBS96", num_signatures = 8, algorithm = "nmf") + +``` + +# Step 3. Perform benchmarking + +Once the prediction is stored in a `musica_resut` object, and the ground truth is stored in a `full_benchmark` object, benchmarking can be performed for the prediction. To perform benchmarking in the simplest form with all default parameters, the following command is used. The inputted `full_benchmark` object is updated within the function, and nothing is returned. Since no `method_id` is supplied, it is automatically generated based on the variable name of the provided prediction. + +```{r perform benchmarking} + +benchmark(full_benchmark, res2) + +``` + +## Returning a new object from the benchmarking process + +If a new `full_benchmark` object is desired, the `make_copy` argument can be used to return a new object, leaving the inputted one unchanged. Below, the res1 prediction will be benchmarked and the result is saved to a new object. For better readability, all plots are suppressed. + +```{r perform benchmarking with copy} + +new_benchmark <- benchmark(full_benchmark, res1, make_copy = TRUE, plot = FALSE) + +``` + +# Step 4: Downstream analyses and visualization + +Any element of the `full_benchmark` or `single_benchmark` objects can be easily extracted for further individualized analysis, and all plots generated during the benchmarking process can be recreated. + +## Summary Tables + +Summary tables can easily be extracted from a `full_benchmark` object to visualize the results of the full analysis. + +```{r view method view summary} + +method_view_summary(full_benchmark) + +``` + +```{r view sig view summary} + +sig_view_summary(full_benchmark) + +``` + +## Accessing individual benchmark results or predictions + +Functions to access an individual benchmark from the full object, or to specifically access a prediction at an any step within a benchmarking process make individualized downstream analyses easier. + +```{r extract a single benchmark} + +# pull out the single benchmark object for res2 from the full_benchmark object +res2_benchmark <- benchmark_get_entry(full_benchmark, "res2") + +``` + +Once a single benchmark object has been extracted, the three different predictions within it can be extracted. These include (1) the initial prediction, before any benchmarking adjustments have been made, (2) the intermediate prediction, after duplicate signatures have been adjusted, and (3) the final prediction, after both duplicate and composite signatures have been adjusted. + +```{r extract a prediction} + +# extract the res2 prediction before benchamrking adjustments have been made +res2_initial_prediction <- benchmark_get_prediction(res2_benchmark, "initial") + +# extract the res2 prediction after duplicate signatures have been corrected, but before composites have been corrected +res2_intermediate_prediction <- benchmark_get_prediction(res2_benchmark, "intermediate") + +# extract the res2 prediction at the end of its benchmarking process +res2_final_prediction <- benchmark_get_prediction(res2_benchmark, "final") + +``` + +## Comparison table between true and predicted signatures + +To extract a comparison between the predicted and true signatures from any step in the benchmarking process, the `benchmark_compare_results` function can be used. + +``` {r compare results} + +benchmark_compare_results(full_benchmark, "res2", "Initial") + +``` + +## Recreating plots generated during the benchmarking process + +A comparison between predicted and true signatures from any step in the benchmarking process can be plotted with the `benchmark_plot_comparison` function. + +```{r plot signature comparison} + +benchmark_plot_comparison(full_benchmark, "res2", "Initial") + +``` + +Predicted signatures from any step in the benchmarking process can be plotted with the `benchmark_plot_signatures` function. All of the customization options found in the generic `plot_signatures` function, such as `same_scale`, `show_x_labels`, `show_y_labels`, etc are valid. + +```{r plot signatures} + +benchmark_plot_signatures(full_benchmark, "res2", "Intermediate", same_scale = FALSE) + +``` + +A comparison between predicted and true exposures from any step in the benchmarking process can be plotted with the `benchmark_plot_exposures` function. + +```{r plot exposure comparison} + +benchmark_plot_exposures(full_benchmark, "res2", "Initial") + +``` + +To recreate the plots comparing predicted and true exposures before/after duplicate or composite signatures are corrected, the `benchmark_plot_duplicate_exposures` and `benchmark_plot_composite_exposures` functions can be used, respectively. There are no duplicate or composite signatures for this example, so no plots are produced here. + +```{r plot duplicate/composite exposures before/after adjustment} + +benchmark_plot_duplicate_exposures(full_benchmark, "res2") +benchmark_plot_composite_exposures(full_benchmark, "res2") + +``` + + + + + diff --git a/vignettes/articles/tutorial_tcga_cnsl.Rmd b/vignettes/articles/tutorial_tcga_cnsl.Rmd index 58260cbc..6c5fe66b 100644 --- a/vignettes/articles/tutorial_tcga_cnsl.Rmd +++ b/vignettes/articles/tutorial_tcga_cnsl.Rmd @@ -1,7 +1,11 @@ --- title: "Analysis of mutational signatures with musicatk in the R console" -date: "Compiled `r format(Sys.time(), '%B %d, %Y')`" author: "Aaron Chevalier, Joshua Campbell" +date: "Compiled `r format(Sys.time(), '%B %d, %Y')`" +output: word_document +editor_options: + markdown: + wrap: 72 --- ```{r setup, include = FALSE} @@ -9,28 +13,68 @@ knitr::opts_chunk$set(warning = FALSE, fig.align='center') ``` # Introduction -A variety of exogenous exposures or endogenous biological processes can contribute to the overall mutational load observed in human tumors. Many different mutational patterns, or “mutational signatures”, have been identified across different tumor types. These signatures can provide a record of environmental exposure and can give clues about the etiology of carcinogenesis. The Mutational Signature Comprehensive Analysis Toolkit (musicatk) contains a complete end-to-end workflow for characterization of mutational signatures in a cohort of samples. musicatk has utilities for extracting variants from a variety of file formats, multiple methods for discovery of novel signatures or prediction of pre-existing signatures, and many types of downstream visualizations for exploratory analysis. This package has the ability to parse and combine multiple motif classes in the mutational signature discovery or prediction processes. Mutation motifs include single base substitutions (SBS), double base substitutions (DBS), insertions (INS) and deletions (DEL). The package can be loaded using the `library` command: + +A variety of exogenous exposures or endogenous biological processes can +contribute to the overall mutational load observed in human tumors. Many +different mutational patterns, or “mutational signatures”, have been +identified across different tumor types. These signatures can provide a +record of environmental exposure and can give clues about the etiology +of carcinogenesis. The Mutational Signature Comprehensive Analysis +Toolkit (musicatk) contains a complete end-to-end workflow for +characterization of mutational signatures in a cohort of samples. +musicatk has utilities for extracting variants from a variety of file +formats, multiple methods for discovery of novel signatures or +prediction of pre-existing signatures, and many types of downstream +visualizations for exploratory analysis. This package has the ability to +parse and combine multiple motif classes in the mutational signature +discovery or prediction processes. Mutation motifs include single base +substitutions (SBS), double base substitutions (DBS), insertions (INS) +and deletions (DEL). The package can be loaded using the `library` +command: ```{r library, eval = TRUE, message = FALSE} library(musicatk) ``` # Importing mutational data -In order to discover or predict mutational signatures, we must first set up -our musica object by 1) extracting variants from files or objects such as -VCFs and MAFs, 2) selecting the appropriate reference genome 3) creating a -musica object, 4) adding sample-level annotations, and 5) building a count tables for our variants of interest. -## Import variants from files +In order to discover or predict mutational signatures, we must first set +up our musica object by 1) extracting variants from files or objects +such as VCFs and MAFs, 2) selecting the appropriate reference genome 3) +creating a musica object, 4) adding sample-level annotations, and 5) +building a count tables for our variants of interest. -Variants can be extracted from various formats using the following functions: +## Import variants from files -* The `extract_variants_from_vcf_file()` function will extract variants from a [VCF](https://samtools.github.io/hts-specs/) file. The file will be imported using the readVcf function from the [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) package and then the variant information will be extracted from this object. -* The `extract_variants_from_vcf()` function extracts variants from a `CollapsedVCF` or `ExpandedVCF` object from the [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) package. -* The `extract_variants_from_maf_file()` function will extract variants from a file in [Mutation Annotation Format (MAF)](https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/) used by TCGA. -* The `extract_variants_from_maf()` function will extract variants from a MAF object created by the [maftools](https://www.bioconductor.org/packages/release/bioc/html/maftools.html) package. -* The `extract_variants_from_matrix()` function will get the information from a matrix or data.frame like object that has columns for the chromosome, start position, end position, reference allele, mutation allele, and sample name. -* The `extract_variants()` function will extract variants from a list of objects. These objects can be any combination of VCF files, VariantAnnotation objects, MAF files, MAF objects, and data.frame objects. +Variants can be extracted from various formats using the following +functions: + +- The `extract_variants_from_vcf_file()` function will extract + variants from a [VCF](https://samtools.github.io/hts-specs/) file. + The file will be imported using the readVcf function from the + [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) + package and then the variant information will be extracted from this + object. +- The `extract_variants_from_vcf()` function extracts variants from a + `CollapsedVCF` or `ExpandedVCF` object from the + [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) + package. +- The `extract_variants_from_maf_file()` function will extract + variants from a file in [Mutation Annotation Format + (MAF)](https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/) + used by TCGA. +- The `extract_variants_from_maf()` function will extract variants + from a MAF object created by the + [maftools](https://www.bioconductor.org/packages/release/bioc/html/maftools.html) + package. +- The `extract_variants_from_matrix()` function will get the + information from a matrix or data.frame like object that has columns + for the chromosome, start position, end position, reference allele, + mutation allele, and sample name. +- The `extract_variants()` function will extract variants from a list + of objects. These objects can be any combination of VCF files, + VariantAnnotation objects, MAF files, MAF objects, and data.frame + objects. Below are some examples of extracting variants from MAF and VCF files: @@ -52,7 +96,9 @@ variants <- extract_variants(c(lusc_maf, luad_vcf, melanoma_vcfs)) ## Import TCGA datasets -For this tutorial, we will analyze mutational data from lung and skin tumors from TCGA. This data will be retrieved using the the `GDCquery` function from `r BiocStyle::Biocpkg("TCGAbiolinks")` package. +For this tutorial, we will analyze mutational data from lung and skin +tumors from TCGA. This data will be retrieved using the the `GDCquery` +function from `r BiocStyle::Biocpkg("TCGAbiolinks")` package. ```{r get_tcga, message = FALSE, results='hide'} library(TCGAbiolinks) @@ -82,70 +128,103 @@ colnames(annot) <- c("Tumor_Type", "ID") rownames(annot) <- annot[,"ID"] ``` -Note that with previous versions of the GDC database, you may need to set `worflow.type` to another string such as `workflow.type = MuTect2 Variant Aggregation and Masking`. +Note that with previous versions of the GDC database, you may need to +set `worflow.type` to another string such as +`workflow.type = MuTect2 Variant Aggregation and Masking`. -# Creating a musica object +# Creating a musica object from variants -A genome build must first be selected before a musica object can be created for mutational signature analysis. musicatk uses `r BiocStyle::Biocpkg("BSgenome")` objects to access genome sequence information that flanks each mutation which is used bases for generating mutation count tables. BSgenome objects store full genome sequences for different organisms. A full list of supported organisms can be obtained by running `available.genomes()` after loading the BSgenome library. Custom genomes can be forged as well (see `r BiocStyle::Biocpkg("BSgenome")` documentation). musicatk provides a utility function called `select_genome()` to allow users to quickly select human genome build versions "hg19" and "hg38" or mouse genome builds "mm9" and "mm10". The reference sequences for these genomes are in UCSC format (e.g. chr1). +A genome build must first be selected before a musica object can be +created for mutational signature analysis. musicatk uses +`r BiocStyle::Biocpkg("BSgenome")` objects to access genome sequence +information that flanks each mutation which is used bases for generating +mutation count tables. BSgenome objects store full genome sequences for +different organisms. A full list of supported organisms can be obtained +by running `available.genomes()` after loading the BSgenome library. +Custom genomes can be forged as well (see +`r BiocStyle::Biocpkg("BSgenome")` documentation). musicatk provides a +utility function called `select_genome()` to allow users to quickly +select human genome build versions "hg19" and "hg38" or mouse genome +builds "mm9" and "mm10". The reference sequences for these genomes are +in UCSC format (e.g. chr1). ```{r select_genome} g <- select_genome("hg38") ``` -The last preprocessing step is to create an object with the variants and the genome using the `create_musica` function. This function will perform checks to ensure that the chromosome names and reference alleles in the input variant object match those in supplied BSgenome object. These checks can be turned off by setting `check_ref_chromosomes = FALSE` and `check_ref_bases = FALSE`, respectively. This function also looks for adjacent single base substitutions (SBSs) and will convert them to double base substitutions (DBSs). To disable this automatic conversion, set `convert_dbs = FALSE`. +The last preprocessing step is to create an object with the variants and +the genome using the `create_musica` function. This function will +perform checks to ensure that the chromosome names and reference alleles +in the input variant object match those in supplied BSgenome object. +These checks can be turned off by setting +`check_ref_chromosomes = FALSE` and `check_ref_bases = FALSE`, +respectively. This function also looks for adjacent single base +substitutions (SBSs) and will convert them to double base substitutions +(DBSs). To disable this automatic conversion, set `convert_dbs = FALSE`. -```{r create_musica} -musica <- create_musica(x = variants, genome = g) +```{r create_musica_from_variants} +musica <- create_musica_from_variants(x = variants, genome = g) ``` # Importing sample annotations -Sample-level annotations, such as tumor type, treatment, or outcome can be used in downstream analyses. Sample annotations that are stored in a `vector` or `data.frame` can be directly added to the `musica` object using the `samp_annot` function: +Sample-level annotations, such as tumor type, treatment, or outcome can +be used in downstream analyses. Sample annotations that are stored in a +`vector` or `data.frame` can be directly added to the `musica` object +using the `samp_annot` function: ```{r musica_add_annotations} id <- as.character(sample_names(musica)) samp_annot(musica, "Tumor_Type") <- annot[id,"Tumor_Type"] ``` -> **Note: Be sure that the annotation vector or data.frame being supplied is in the same order as the samples in the `musica` object.** The `sample_names` function can be used to get the order of the samples in the musica object. Note that the annotations can also be added later on to a `musica_result` objects created by discovery or prediction using the same function: `samp_annot(result, "Tumor_Type") <- annot[id,"Tumor_Type"]`. - +> **Note: Be sure that the annotation vector or data.frame being +> supplied is in the same order as the samples in the `musica` object.** +> The `sample_names` function can be used to get the order of the +> samples in the musica object. Note that the annotations can also be +> added later on to a `musica_result` objects created by discovery or +> prediction using the same function: +> `samp_annot(result, "Tumor_Type") <- annot[id,"Tumor_Type"]`. # Creating mutation count tables ## Create standard tables -Motifs are the building blocks of mutational signatures. Motifs themselves are -a mutation combined with other genomic information. For instance, **SBS96** -motifs are constructed from an SBS mutation and one upstream and one downstream -base sandwiched together. We build tables by counting these motifs for each -sample. + +Motifs are the building blocks of mutational signatures. Motifs +themselves are a mutation combined with other genomic information. For +instance, **SBS96** motifs are constructed from an SBS mutation and one +upstream and one downstream base sandwiched together. We build tables by +counting these motifs for each sample. + ```{r build_tables} build_standard_table(musica, g = g, table_name = "SBS96") ``` -Here is a list of mutation tables that can be created by setting the +Here is a list of mutation tables that can be created by setting the `table_name` parameter in the `build_standard_table` function: -* SBS96 - Motifs are the six possible single base pair mutation types times the -four possibilities each for upstream and downstream context bases (4*6*4 = 96 -motifs) -* SBS192_Trans - Motifs are an extension of SBS96 multiplied by the -transcriptional strand (translated/untranslated), can be specified with -`"Transcript_Strand"`. -* SBS192_Rep - Motifs are an extension of SBS96 multiplied by the -replication strand (leading/lagging), can be specified with -`"Replication_Strand"`. -* DBS - Motifs are the 78 possible double-base-pair substitutions -* INDEL - Motifs are 83 categories intended to capture different categories of -indels based on base-pair change, repeats, or microhomology, insertion or -deletion, and length. +- SBS96 - Motifs are the six possible single base pair mutation types + times the four possibilities each for upstream and downstream + context bases (4*6*4 = 96 motifs) +- SBS192_Trans - Motifs are an extension of SBS96 multiplied by the + transcriptional strand (translated/untranslated), can be specified + with `"Transcript_Strand"`. +- SBS192_Rep - Motifs are an extension of SBS96 multiplied by the + replication strand (leading/lagging), can be specified with + `"Replication_Strand"`. +- DBS - Motifs are the 78 possible double-base-pair substitutions +- INDEL - Motifs are 83 categories intended to capture different + categories of indels based on base-pair change, repeats, or + microhomology, insertion or deletion, and length. ## Combine tables -Different count tables can be combined into one using the `combine_count_tables` -function. For example, the SBS96 and the DBS tables could be combined and -mutational signature discovery could be performed across both mutations -modalities. Tables with information about the same types of variants (e.g. -two related SBS tables) should generally not be combined and used together. +Different count tables can be combined into one using the +`combine_count_tables` function. For example, the SBS96 and the DBS +tables could be combined and mutational signature discovery could be +performed across both mutations modalities. Tables with information +about the same types of variants (e.g. two related SBS tables) should +generally not be combined and used together. ```{r combine_tables} # Build Double Base Substitution table @@ -158,44 +237,66 @@ combine_count_tables(musica, to_comb = c("SBS96", "DBS78"), name = "SBS_DBS", de names(tables(musica)) ``` +# Create a musica object from an existing count table + +A musica object can be created directly from a count table if it is +available from the start. A variant object or file is not required. The +count table should contain mutational motifs as rows and samples as +columns, and must be able to be coerced to a matrix. The variant class +must also be provided, either SBS, DBS, or IND. Here, the example count +table 'synthetic_breast_counts' is used. + +```{r reate_musica_from_counts} + +musica2 <- create_musica_from_counts(x = synthetic_breast_counts, variant_class = "SBS") + +``` + # Filtering samples -Samples with low numbers of mutations should usually be excluded from discover and prediction procedures. The `subset_musica_by_counts` function can be used to exclude samples with low numbers of mutations in a particular table: +Samples with low numbers of mutations should usually be excluded from +discover and prediction procedures. The `subset_musica_by_counts` +function can be used to exclude samples with low numbers of mutations in +a particular table: ```{r sample_filter} musica_filter <- subset_musica_by_counts(musica, table_name = "SBS96", num_counts = 10) ``` -The `subset_musica_by_annotation` function can also be used to subset the musica object to samples that match a particular annotation. For example, if we only wanted to analyze lung cancer, we could filter to samples that have "LUAD" or "LUSC": +The `subset_musica_by_annotation` function can also be used to subset +the musica object to samples that match a particular annotation. For +example, if we only wanted to analyze lung cancer, we could filter to +samples that have "LUAD" or "LUSC": ```{r sample_filter_annot} musica_luad <- subset_musica_by_annotation(musica, annot_col = "Tumor_Type", annot_names = c("LUAD", "LUSC")) ``` - # Discovery of signatures and exposures Mutational signature discovery is the process of deconvoluting a matrix -containing the count of each mutation type in each sample into two matrices: 1) -a **Signature** matrix containing the probability of each mutation motif in -signature and 2) an **Exposure** matrix containing the estimated counts of each -signature in each sample. Discovery and prediction results are save in a -`musica_result` object that includes both the signatures and sample exposures. The `discover_signatures` function can be used to identify signatures in a dataset **de novo**: +containing the count of each mutation type in each sample into two +matrices: 1) a **Signature** matrix containing the probability of each +mutation motif in signature and 2) an **Exposure** matrix containing the +estimated counts of each signature in each sample. Discovery and +prediction results are save in a `musica_result` object that includes +both the signatures and sample exposures. The `discover_signatures` +function can be used to identify signatures in a dataset **de novo**: ```{r discover_sigs} result_discov <- discover_signatures(musica_filter, table_name = "SBS96", num_signatures = 4, algorithm = "lda") ``` - Supported signature discovery algorithms include: -* Non-negative matrix factorization (nmf) -* Latent Dirichlet Allocation (lda) +- Non-negative matrix factorization (nmf) +- Latent Dirichlet Allocation (lda) -Both have built-in `seed` capabilities for reproducible results, `nstarts` for -multiple independent chains from which the best final result will be chosen. -NMF also allows for parallel processing via `par_cores`. To get the signatures or exposures from the result object, the following -functions can be used: +Both have built-in `seed` capabilities for reproducible results, +`nstarts` for multiple independent chains from which the best final +result will be chosen. NMF also allows for parallel processing via +`par_cores`. To get the signatures or exposures from the result object, +the following functions can be used: ```{r result_accessors} # Extract the exposure matrix @@ -207,87 +308,151 @@ sigs <- signatures(result_discov) sigs[1:3,1:3] ``` +# Import a musica result object + +If signature discovery or prediction was performed previously or +externally, a 'musica_result' object can be created directly from the +signatures and exposures. The signatures table should contain mutational +motifs as rows and signatures as columns and be able to be coerced to a +matrix. The exposures matrix should contain signature weights as rows +and samples as columns and must also be able to be coerced to a matrix. +A mutation count table must also be provided. The method or algorithm +used to generate the results may be provided with 'algorithm'. Here, +example data from a synthetic breast cancer dataset is used. + +```{r create_musica_result} + +NMF_result <- create_musica_result(signatures = example_predicted_sigs, exposures = example_predicted_exp, count_table = synthetic_breast_counts, algorithm = "NMF") + +``` # Visualization of results ## Plot signatures -The `plot_signatures` function can be used to display barplots that show the probability of each mutation type in each signature: + +The `plot_signatures` function can be used to display barplots that show +the probability of each mutation type in each signature: ```{r plot_sigs} plot_signatures(result_discov) ``` -By default, the scales on the y-axis are forced to be the same across all -signatures. This behavior can be turned off by setting `same_scale = FALSE`: +By default, the scales on the y-axis are forced to be the same across +all signatures. This behavior can be turned off by setting +`same_scale = FALSE`: ```{r plot_sigs_same_scale} plot_signatures(result_discov, same_scale = FALSE) ``` ## Comparing to external signatures -A common analysis is to compare the signatures estimated in a dataset to those generated in other datasets or to those in the [COSMIC database](https://cancer.sanger.ac.uk/cosmic/signatures). We have a set of functions that can be used to easily perform pairwise correlations between signatures. The `compare_results` functions compares the signatures between two `musica_result` objects. The `compare_cosmic_v2` will correlate the signatures between a `musica_result` object and the SBS signatures in COSMIC V2. For example: + +A common analysis is to compare the signatures estimated in a dataset to +those generated in other datasets or to those in the [COSMIC +database](https://cancer.sanger.ac.uk/cosmic/signatures). We have a set +of functions that can be used to easily perform pairwise correlations +between signatures. The `compare_results` functions compares the +signatures between two `musica_result` objects. The `compare_cosmic_v2` +will correlate the signatures between a `musica_result` object and the +SBS signatures in COSMIC V2. For example: ```{r compare_cosmic} compare_cosmic_v2(result_discov, threshold = 0.8) ``` -In this example, our Signatures 1 and 3 were most highly correlated to COSMIC Signature 4 and 7, respectively, so this may indicate that samples in our dataset were exposed to UV radiation or cigarette smoke. Only pairs of signatures who have a correlation above the `threshold` parameter will be returned. If no pairs of signatures are found, then you may want to consider lowering the threshold. Signatures can also be correlated to those in the COSMIC V3 database using the `compare_cosmic_v3` function. +In this example, our Signatures 1 and 3 were most highly correlated to +COSMIC Signature 4 and 7, respectively, so this may indicate that +samples in our dataset were exposed to UV radiation or cigarette smoke. +Only pairs of signatures who have a correlation above the `threshold` +parameter will be returned. If no pairs of signatures are found, then +you may want to consider lowering the threshold. Signatures can also be +correlated to those in the COSMIC V3 database using the +`compare_cosmic_v3` function. -Based on the COSMIC comparison results and our prior knowledge, these signatures can be re-named and the new name can displayed in the plots: +Based on the COSMIC comparison results and our prior knowledge, these +signatures can be re-named and the new name can displayed in the plots: ```{r, name_sigs} name_signatures(result_discov, c("SBS4 - Smoking", "SBS15 - MMR", "SBS7 - UV", "SBS2/13 - APOBEC")) plot_signatures(result_discov) ``` - ## Plot exposures -Barplots showing the exposures in each sample can be plotted with the +Barplots showing the exposures in each sample can be plotted with the `plot_exposures` function: ```{r exposures_raw} plot_exposures(result_discov, plot_type = "bar") ``` -By default, samples are ordered from those with the highest number of mutations on the left to those with the lowest on the right. Sometimes, too many samples are present and the bars are too small to clearly examine the patterns of exposures. The `num_samples` parameter can be used to display the top samples with the highest number of mutations on the left: +By default, samples are ordered from those with the highest number of +mutations on the left to those with the lowest on the right. Sometimes, +too many samples are present and the bars are too small to clearly +examine the patterns of exposures. The `num_samples` parameter can be +used to display the top samples with the highest number of mutations on +the left: ```{r exposures_raw_top} plot_exposures(result_discov, plot_type = "bar", num_samples = 50) ``` -Samples can be ordered by the level of individual exposures. The can be used in combination with the `num_samples` parameter to examine the mutational patterns in the samples with the highest levels of a particular exposure. For example, samples can be ordered by the number of estimated mutations from the MMR signature: + +Samples can be ordered by the level of individual exposures. The can be +used in combination with the `num_samples` parameter to examine the +mutational patterns in the samples with the highest levels of a +particular exposure. For example, samples can be ordered by the number +of estimated mutations from the MMR signature: ```{r exposures_raw_sort} plot_exposures(result_discov, plot_type = "bar", num_samples = 50, sort_samples = "SBS15 - MMR") ``` - -The proportion of each exposure in each tumor can be shown by setting `proportional = TRUE`: +The proportion of each exposure in each tumor can be shown by setting +`proportional = TRUE`: ```{r exposures_prop} plot_exposures(result_discov, plot_type = "bar", num_samples = 50, proportional = TRUE) ``` -The `plot_exposures` function can group exposures by either a sample annotation or by a signature by setting the `group_by` parameter. To group by an annotation, the `groupBy` parameter must be set to `"annotation"` and the name of the annotation must be supplied via the `annotation` parameter. For example, the exposures from the previous result can be grouped by the `Tumor_Type` annotation: +The `plot_exposures` function can group exposures by either a sample +annotation or by a signature by setting the `group_by` parameter. To +group by an annotation, the `groupBy` parameter must be set to +`"annotation"` and the name of the annotation must be supplied via the +`annotation` parameter. For example, the exposures from the previous +result can be grouped by the `Tumor_Type` annotation: ```{r plot_exposures_by_subtype} plot_exposures(result_discov, plot_type = "bar", group_by = "annotation", annotation = "Tumor_Type") ``` -In this plot, it is clear that the smoking signature is more active in the lung cancers while the UV signature is more active in the skin cancers. The distribution of exposures with respect to annotation can be viewed using boxplots by setting `plot_type = "box"` and `group_by = "annotation"`: +In this plot, it is clear that the smoking signature is more active in +the lung cancers while the UV signature is more active in the skin +cancers. The distribution of exposures with respect to annotation can be +viewed using boxplots by setting `plot_type = "box"` and +`group_by = "annotation"`: ```{r plot_exposures_box_annot} plot_exposures(result_discov, plot_type = "box", group_by = "annotation", annotation = "Tumor_Type") ``` -Note that boxplots can be converted to violin plots by setting `plot_type = "violin"`. To compare the exposures levels across groups of samples within a signature, we can set `group_by = "signature"` and `color_by = "annotation"`: +Note that boxplots can be converted to violin plots by setting +`plot_type = "violin"`. To compare the exposures levels across groups of +samples within a signature, we can set `group_by = "signature"` and +`color_by = "annotation"`: ```{r plot_exposures_box_sig} plot_exposures(result_discov, plot_type = "box", group_by = "signature", color_by = "annotation", annotation = "Tumor_Type") ``` -To verify that the deconvolution algorithm produced good signatures, one strategy is to examine the patterns of mutations in individual samples with a high predicted percentage of a particular signature. If the shape of the counts match the patterns of the signature, then this is a good indicator that the deconvolution algorithm worked well. Counts for individual samples can be plotted with the `plot_sample_counts` function. For example, we can plot the sample with the highest proportion of the APOBEC signature: +To verify that the deconvolution algorithm produced good signatures, one +strategy is to examine the patterns of mutations in individual samples +with a high predicted percentage of a particular signature. If the shape +of the counts match the patterns of the signature, then this is a good +indicator that the deconvolution algorithm worked well. Counts for +individual samples can be plotted with the `plot_sample_counts` +function. For example, we can plot the sample with the highest +proportion of the APOBEC signature: ```{r sample_counts} # Normalize exposures @@ -298,12 +463,24 @@ ix <- c(which.max(expos.prop[2,]), which.max(expos.prop[4,])) plot_sample_counts(musica_filter, sample_names = colnames(expos.prop)[ix], table_name = "SBS96") ``` - # Predict exposures from existing signatures ## Predict COSMIC signatures -Instead of discovering mutational signatures and exposures from a dataset *de novo*, a better result may be obtained by predicting the exposures of signatures that have been previously estimated in other datasets. Predicting exposures for pre-existing signatures may have more sensitivity for detecting active compared to the discovery-based methods as we are incorporating prior information derived from larger datasets. The `musicatk` package incorporates several methods for estimating exposures given a set of pre-existing signatures. For example, the exposures for COSMIC signatures 1, 4, 7, 13, and 15 can be predicted in our current dataset. Note that we are including COSMIC signature 1 in the prediction even though it did not show up in the discovery algorithm as this signature has been previously shown to be active in lung tumors and we are also including both APOBEC signatures (2 and 13) which were previously combined into 1 signature in the discovery method. +Instead of discovering mutational signatures and exposures from a +dataset *de novo*, a better result may be obtained by predicting the +exposures of signatures that have been previously estimated in other +datasets. Predicting exposures for pre-existing signatures may have more +sensitivity for detecting active compared to the discovery-based methods +as we are incorporating prior information derived from larger datasets. +The `musicatk` package incorporates several methods for estimating +exposures given a set of pre-existing signatures. For example, the +exposures for COSMIC signatures 1, 4, 7, 13, and 15 can be predicted in +our current dataset. Note that we are including COSMIC signature 1 in +the prediction even though it did not show up in the discovery algorithm +as this signature has been previously shown to be active in lung tumors +and we are also including both APOBEC signatures (2 and 13) which were +previously combined into 1 signature in the discovery method. ```{r predict_cosmic} @@ -317,11 +494,29 @@ result_cosmic_selected_sigs <- predict_exposure(musica = musica_filter, table_na plot_exposures(result_cosmic_selected_sigs, plot_type = "bar", num_samples = 50) ``` -The `cosmic_v2_sigs` object is just a `musica_result` object containing COSMIC V2 signatures without any sample or exposure information. Note that if `signatures_to_use` is not supplied by the user, then exposures for all signatures in the result object will be estimated. Any `musical_result` object can be given to the `signature_res` parameter. Exposures can be predicted for samples in any `musica` object from any `musica_result` object as long as the same mutation schema was utilized. +The `cosmic_v2_sigs` object is just a `musica_result` object containing +COSMIC V2 signatures without any sample or exposure information. Note +that if `signatures_to_use` is not supplied by the user, then exposures +for all signatures in the result object will be estimated. Any +`musical_result` object can be given to the `signature_res` parameter. +Exposures can be predicted for samples in any `musica` object from any +`musica_result` object as long as the same mutation schema was utilized. ## Prediction with signature selection -In many cases, researchers will not know the signatures that are active in a cohort of samples beforehand. While it would be easy to predict all COSMIC signatures, this can have detrimental effects on the output. Including signatures not actually active in the cohort of samples may introduce additional noise in the estimates for the exposures for the signatures that are truly present in the dataset. Additionally, including extra signatures may induce a false signal for the exposures of the non-active signatures. The `musicatk` package has a "two-step" prediction process. In the first step, exposures for all signatures will be estimated. Then a subset of signatures will be selected as "active" in the dataset and only the exposures for the active signatures will be estimated. This two-step process can be done automatically using the `auto_predict_grid` function: +In many cases, researchers will not know the signatures that are active +in a cohort of samples beforehand. While it would be easy to predict all +COSMIC signatures, this can have detrimental effects on the output. +Including signatures not actually active in the cohort of samples may +introduce additional noise in the estimates for the exposures for the +signatures that are truly present in the dataset. Additionally, +including extra signatures may induce a false signal for the exposures +of the non-active signatures. The `musicatk` package has a "two-step" +prediction process. In the first step, exposures for all signatures will +be estimated. Then a subset of signatures will be selected as "active" +in the dataset and only the exposures for the active signatures will be +estimated. This two-step process can be done automatically using the +`auto_predict_grid` function: ```{r auto_pred_grid} # Predict exposures with auto selection of signatures @@ -331,83 +526,151 @@ result_cosmic_auto <- auto_predict_grid(musica_filter, table_name = "SBS96", sig rownames(exposures(result_cosmic_auto)) ``` -In this result, `r length(rownames(exposures(result_cosmic_auto)))` of the 30 original COSMIC V2 signatures were selected including several signatures that were not previously included in our first prediction with manually selected signatures. If multiple groups of samples are present in the dataset that are expected to have somewhat different sets of active signatures (e.g. multiple tumor types), then this 2-step process can be improved by performing signature selection within each group. This can be achieved by supplying the `sample_annoation` parameter. In our example, exposures were predicted in the three different tumor types by supplying the `Tumor_Type` annotation to `sample_annotation`. This parameter can be left `NULL` if no grouping annotation is available. - -The three major parameters that determine whether a signature is present in a dataset on the first pass are: - -* `min_exists` - A signature will be considered active in a sample if its exposure level is above this threshold (Default `0.05`). -* `proportion_samples` - A signature will be considered active in a cohort and included in the second pass if it is active in at least this proportion of samples (Default `0.25`). -* `rare_exposure` - A signature will be considered active in a cohort and included in the second pass if the proportion of its exposure is above this threshold in at least one sample (Default `0.4`). This parameter is meant to capture signatures that produce high number of mutations but are found in a small number of samples (e.g. Mismatch repair). +In this result, `r length(rownames(exposures(result_cosmic_auto)))` of +the 30 original COSMIC V2 signatures were selected including several +signatures that were not previously included in our first prediction +with manually selected signatures. If multiple groups of samples are +present in the dataset that are expected to have somewhat different sets +of active signatures (e.g. multiple tumor types), then this 2-step +process can be improved by performing signature selection within each +group. This can be achieved by supplying the `sample_annoation` +parameter. In our example, exposures were predicted in the three +different tumor types by supplying the `Tumor_Type` annotation to +`sample_annotation`. This parameter can be left `NULL` if no grouping +annotation is available. + +The three major parameters that determine whether a signature is present +in a dataset on the first pass are: + +- `min_exists` - A signature will be considered active in a sample if + its exposure level is above this threshold (Default `0.05`). +- `proportion_samples` - A signature will be considered active in a + cohort and included in the second pass if it is active in at least + this proportion of samples (Default `0.25`). +- `rare_exposure` - A signature will be considered active in a cohort + and included in the second pass if the proportion of its exposure is + above this threshold in at least one sample (Default `0.4`). This + parameter is meant to capture signatures that produce high number of + mutations but are found in a small number of samples (e.g. Mismatch + repair). ## Assess predicted signatures -It is almost always worthwhile to manually assess and confirm the signatures predicted to be present within a dataset, especially for signatures that have similar profiles to one another. For example, both COSMIC [Signature 4](https://cancer.sanger.ac.uk/signatures/media/images/v2_signature_profile_4.original.png) (smoking) and [Signature 24](https://cancer.sanger.ac.uk/signatures/media/images/v2_signature_profile_24.original.png) (aflatoxin) were predicted to be present within our dataset. The smoking-related signature is expected as our cohort contains lung cancers, but the aflatoxin signature is unexpected given that it is usually found in liver cancers. These signatures both have a strong concentration of C>A tranversions. In fact, we can see that the predicted exposures for these signatures are highly correlated to each other across samples: +It is almost always worthwhile to manually assess and confirm the +signatures predicted to be present within a dataset, especially for +signatures that have similar profiles to one another. For example, both +COSMIC [Signature +4](https://cancer.sanger.ac.uk/signatures/media/images/v2_signature_profile_4.original.png) +(smoking) and [Signature +24](https://cancer.sanger.ac.uk/signatures/media/images/v2_signature_profile_24.original.png) +(aflatoxin) were predicted to be present within our dataset. The +smoking-related signature is expected as our cohort contains lung +cancers, but the aflatoxin signature is unexpected given that it is +usually found in liver cancers. These signatures both have a strong +concentration of C\>A tranversions. In fact, we can see that the +predicted exposures for these signatures are highly correlated to each +other across samples: ```{r plot_SBS4_vs_SBS24} e <- exposures(result_cosmic_auto) plot(e["SBS4",], e["SBS24",], xlab="SBS4", ylab="SBS24") ``` -Therefore, we will want to remove Signature 24 from our final prediction model. [Signature 18](https://cancer.sanger.ac.uk/signatures/media/images/v2_signature_profile_18.original.png) is another one with a high prevalence of C>A transversion at specific trinucleotide contexts. However, at least a few samples have high levels of Signature 18 without correspondingly high levels of Signature 4: +Therefore, we will want to remove Signature 24 from our final prediction +model. [Signature +18](https://cancer.sanger.ac.uk/signatures/media/images/v2_signature_profile_18.original.png) +is another one with a high prevalence of C\>A transversion at specific +trinucleotide contexts. However, at least a few samples have high levels +of Signature 18 without correspondingly high levels of Signature 4: ```{r plot_SBS4_vs_SBS18} plot(e["SBS4",], e["SBS18",], xlab="SBS4", ylab="SBS18") plot_exposures(result_cosmic_auto, num_samples = 25, sort_samples = "SBS18") ``` -Additionally, 2 of the 3 samples are skin cancers where the smoking signature is not usually expected: +Additionally, 2 of the 3 samples are skin cancers where the smoking +signature is not usually expected: ```{r high_sig18} high.sbs18 <- tail(sort(e["SBS18",]), n = 3) annot[names(high.sbs18),] ``` -As a final check, we can look at the counts of the individual samples with high levels of Signature 18: +As a final check, we can look at the counts of the individual samples +with high levels of Signature 18: ```{r plot_sample_sig18} plot_sample_counts(musica_filter, sample_names = "TCGA-ER-A19P-06A-11D-A196-08") ``` -This sample clearly has high levels of both the UV signature confirming that it is likely a skin cancer. Signature 18 is also likely to be active as a high number of C>A mutations at CCA, TCA, and TCT trinucleotide contexts can be observed. Given these results, Signature 18 will be kept in the final analysis. +This sample clearly has high levels of both the UV signature confirming +that it is likely a skin cancer. Signature 18 is also likely to be +active as a high number of C\>A mutations at CCA, TCA, and TCT +trinucleotide contexts can be observed. Given these results, Signature +18 will be kept in the final analysis. -After additional analysis of other signatures, we also want to remove Signature 3 as that is predominantly found in tumors with BRCA deficiencies (e.g. breast cancer) and in samples with high rates of indels (which are not observed here). The `predict_exposure` function will be run one last time with the curated list of signatures and this final result will be used in the rest of the down-stream analyses: +After additional analysis of other signatures, we also want to remove +Signature 3 as that is predominantly found in tumors with BRCA +deficiencies (e.g. breast cancer) and in samples with high rates of +indels (which are not observed here). The `predict_exposure` function +will be run one last time with the curated list of signatures and this +final result will be used in the rest of the down-stream analyses: ```{r predict_cosmic_final} # Predict pre-existing exposures with the revised set of selected signatures result_cosmic_final <- predict_exposure(musica = musica_filter, table_name = "SBS96", signature_res = cosmic_v2_sigs, signatures_to_use = c(1, 2, 4, 6, 7, 13, 15, 18, 26), algorithm = "lda") ``` - # Downstream analyses ## Visualize relationships between samples with 2-D embedding -The `create_umap` function embeds samples in 2 dimensions using the `umap` function from the `r BiocStyle::CRANpkg("uwot")` package. The major parameters for fine tuning the UMAP are `n_neighbors`, `min_dist`, and `spread`. Generally, a higher `min_dist` will create more separation between the larger groups of samples while a lower See `?uwot::umap` for more information on these parameters as well as this [tutorial](https://pair-code.github.io/understanding-umap/) for fine-tuning. Here, a UMAP will be created with standard parameters: +The `create_umap` function embeds samples in 2 dimensions using the +`umap` function from the `r BiocStyle::CRANpkg("uwot")` package. The +major parameters for fine tuning the UMAP are `n_neighbors`, `min_dist`, +and `spread`. Generally, a higher `min_dist` will create more separation +between the larger groups of samples while a lower See `?uwot::umap` for +more information on these parameters as well as this +[tutorial](https://pair-code.github.io/understanding-umap/) for +fine-tuning. Here, a UMAP will be created with standard parameters: ```{r umap_create} set.seed(1) create_umap(result_cosmic_final) ``` -Note that while we are using the `result_cosmic_final` object which came from the prediction algorithm, we could have also used the `result_discov` object generated by the discovery algorithm. The `plot_umap` function will generate a scatter plot of the UMAP coordinates. The points of plot will be colored by the level of a signature by default: +Note that while we are using the `result_cosmic_final` object which came +from the prediction algorithm, we could have also used the +`result_discov` object generated by the discovery algorithm. The +`plot_umap` function will generate a scatter plot of the UMAP +coordinates. The points of plot will be colored by the level of a +signature by default: ```{r umap_plot} plot_umap(result_cosmic_final) ``` -By default, the exposures for each sample will share the same color scale. However, exposures for some signatures may have really high levels compared to others. To make a plot where exposures for each signature will have their own color scale, you can set `same_scale = FALSE`: +By default, the exposures for each sample will share the same color +scale. However, exposures for some signatures may have really high +levels compared to others. To make a plot where exposures for each +signature will have their own color scale, you can set +`same_scale = FALSE`: ```{r umap_plot_same_scale} plot_umap(result_cosmic_final, same_scale = FALSE) ``` -Lastly, points can be colored by a Sample Annotation by setting `color_by = "annotation"` and the `annotation` parameter to the name of the annotation: +Lastly, points can be colored by a Sample Annotation by setting +`color_by = "annotation"` and the `annotation` parameter to the name of +the annotation: ```{r umap_plot_annot} plot_umap(result_cosmic_final, color_by = "annotation", annotation = "Tumor_Type") ``` -If we set `add_annotation_labels = TRUE`, the centroid of each group is identified using medians and the labels are plotted at the position of the centroid: +If we set `add_annotation_labels = TRUE`, the centroid of each group is +identified using medians and the labels are plotted at the position of +the centroid: ```{r umap_plot_annot_label} plot_umap(result_cosmic_final, color_by = "annotation", annotation = "Tumor_Type", add_annotation_labels = TRUE) @@ -415,35 +678,53 @@ plot_umap(result_cosmic_final, color_by = "annotation", annotation = "Tumor_Type ## Plotting exposures in a heatmap -Exposures can be displayed in a heatmap where each row corresponds to a siganture and each column correponds to a sample: +Exposures can be displayed in a heatmap where each row corresponds to a +siganture and each column correponds to a sample: ```{r exposure_heatmap} plot_heatmap(result_cosmic_final) ``` -By default, signatures are scaled to have a mean of zero and a standard deviation of 1 across samples (i.e. z-scored). This can be turned off by setting `scale = FALSE`. Sample annotations can be displayed in the column color bar by setting the `annotation` parameter: +By default, signatures are scaled to have a mean of zero and a standard +deviation of 1 across samples (i.e. z-scored). This can be turned off by +setting `scale = FALSE`. Sample annotations can be displayed in the +column color bar by setting the `annotation` parameter: ```{r exposure_heatmap_annot} plot_heatmap(result_cosmic_final, annotation = "Tumor_Type") ``` -The heatmap shows that Signature 4 and Signature 7 are largely mutually exclusive from one another and can be used to separate lung and skin cancers. Additionally, subsets of signatures or samples can be displayed. For example, if we only want to examine signatures involved in mismatch repair, we can select signatures 6, 15, and 26: +The heatmap shows that Signature 4 and Signature 7 are largely mutually +exclusive from one another and can be used to separate lung and skin +cancers. Additionally, subsets of signatures or samples can be +displayed. For example, if we only want to examine signatures involved +in mismatch repair, we can select signatures 6, 15, and 26: ```{r exposure_heatmap_sigs} plot_heatmap(result_cosmic_final, annotation = "Tumor_Type", subset_signatures = c("SBS6", "SBS15", "SBS26")) ``` -In this heatmap, we can see that only a small subset of distinct samples have relatively higher levels of these signatures. +In this heatmap, we can see that only a small subset of distinct samples +have relatively higher levels of these signatures. ## Clustering samples based on exposures -Samples can be grouped into **de novo** clusters using a several algorithms from the factoextra and cluster packages such as `pam` or `kmeans`. One major challenge is choosing the number of clusters (k). The function `k_select` has several metrics for examining cluster stability such as total within cluster sum of squares (`wss`), Silhouette Width (`silhouette`), and the Gap Statistic (`gap_stat`). +Samples can be grouped into **de novo** clusters using a several +algorithms from the factoextra and cluster packages such as `pam` or +`kmeans`. One major challenge is choosing the number of clusters (k). +The function `k_select` has several metrics for examining cluster +stability such as total within cluster sum of squares (`wss`), +Silhouette Width (`silhouette`), and the Gap Statistic (`gap_stat`). ```{r find_cluster_number} k_select(result_cosmic_final, method = "silhouette", clust.method = "pam", n = 20) ``` -While 2 clusters may be the most optimal choice, this would just correspond to the two large clusters of lung and skin tumors. Therefore, choosing a higher value may be more informative. The next major drop in the silhouette width is after `k = 6`, so we will select this moving forward and perform the clustering: +While 2 clusters may be the most optimal choice, this would just +correspond to the two large clusters of lung and skin tumors. Therefore, +choosing a higher value may be more informative. The next major drop in +the silhouette width is after `k = 6`, so we will select this moving +forward and perform the clustering: ```{r cluster} clusters <- cluster_exposure(result_cosmic_final, method = "pam", nclust = 6) @@ -460,7 +741,13 @@ plot_cluster(result_cosmic_final, cluster = clusters, group = "none") ## Plotly for interactive plots -The functions `plot_signatures`, `plot_exposures`, and `plot_umap` have the ability to create `r BiocStyle::CRANpkg("ggplotly")` plots by simply specifying `plotly = TRUE`. Plotly plots are interactive and allow users to zoom and re-sizing plots, turn on and off annotation types and legend values, and hover over elements of the plots (e.g. bars or points) to more information about that element (e.g. sample name). Here are examples of `plot_signatures` and `plot_exposures` +The functions `plot_signatures`, `plot_exposures`, and `plot_umap` have +the ability to create `r BiocStyle::CRANpkg("ggplotly")` plots by simply +specifying `plotly = TRUE`. Plotly plots are interactive and allow users +to zoom and re-sizing plots, turn on and off annotation types and legend +values, and hover over elements of the plots (e.g. bars or points) to +more information about that element (e.g. sample name). Here are +examples of `plot_signatures` and `plot_exposures` ```{r plotly} plot_signatures(result_cosmic_final, plotly = TRUE) @@ -469,17 +756,20 @@ plot_exposures(result_cosmic_final, num_samples = 25, plotly = TRUE) ## COSMIC signatures annotated to be active in a tumor type -The signatures predicted to be present in each tumor type according to the [COSMIC V2 database](https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt) can be quickly retrieved. For example, we can find which signatures are predicted to be present in lung cancers: +The signatures predicted to be present in each tumor type according to +the [COSMIC V2 +database](https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt) can be +quickly retrieved. For example, we can find which signatures are +predicted to be present in lung cancers: ```{r subtype_map} cosmic_v2_subtype_map("lung") ``` - ## Creating custom tables -Custom count tables can be created from user-defined mutation-level annotations -using the `build_custom_table` function. +Custom count tables can be created from user-defined mutation-level +annotations using the `build_custom_table` function. ```{r custom_table} # Adds strand information to the 'variant' table @@ -492,8 +782,6 @@ build_custom_table(musica = musica, variant_annotation = "Transcript_Strand", data_factor = c("T", "U"), overwrite = TRUE) ``` - - # Session Information ```{r session} diff --git a/vignettes/musicatk.Rmd b/vignettes/musicatk.Rmd index 89abd068..9f21dc41 100644 --- a/vignettes/musicatk.Rmd +++ b/vignettes/musicatk.Rmd @@ -10,8 +10,11 @@ output: BiocStyle::html_document vignette: > %\VignetteIndexEntry{Mutational Signature Comprehensive Analysis Toolkit} - %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + markdown: + wrap: 72 --- ```{r setup, include=FALSE, results = "asis"} @@ -20,10 +23,26 @@ knitr::opts_chunk$set(echo = TRUE, dev = "png") ``` # Introduction -A variety of exogenous exposures or endogenous biological processes can contribute to the overall mutational load observed in human tumors. Many different mutational patterns, or “mutational signatures”, have been identified across different tumor types. These signatures can provide a record of environmental exposure and can give clues about the etiology of carcinogenesis. The Mutational Signature Comprehensive Analysis Toolkit (musicatk) has utilities for extracting variants from a variety of file formats, contains multiple methods for discovery of novel signatures or prediction of known signatures, as well as many types of downstream visualizations for exploratory analysis. This package has the ability to parse and combine multiple motif classes in the mutational signature discovery or prediction processes. Mutation motifs include single base substitutions (SBS), double base substitutions (DBS), insertions (INS) and deletions (DEL). + +A variety of exogenous exposures or endogenous biological processes can +contribute to the overall mutational load observed in human tumors. Many +different mutational patterns, or “mutational signatures”, have been +identified across different tumor types. These signatures can provide a +record of environmental exposure and can give clues about the etiology +of carcinogenesis. The Mutational Signature Comprehensive Analysis +Toolkit (musicatk) has utilities for extracting variants from a variety +of file formats, contains multiple methods for discovery of novel +signatures or prediction of known signatures, as well as many types of +downstream visualizations for exploratory analysis. This package has the +ability to parse and combine multiple motif classes in the mutational +signature discovery or prediction processes. Mutation motifs include +single base substitutions (SBS), double base substitutions (DBS), +insertions (INS) and deletions (DEL). # Installation -Currently musicatk can be installed from on Bioconductor using the following code: + +Currently musicatk can be installed from on Bioconductor using the +following code: ```{r, eval= FALSE} if (!requireNamespace("BiocManager", quietly=TRUE)){ @@ -47,21 +66,45 @@ The package can be loaded using the `library` command. library(musicatk) ``` -# Setting up a musica object -In order to discover or predict mutational signatures, we must first set up -our musica object by 1) extracting variants from files or objects such as -VCFs and MAFs, 2) selecting the appropriate reference genome 3) creating a -musica object, and 4) building a count tables for our variants of interest. +# Setting up a musica object from variants + +In order to discover or predict mutational signatures, we must first set +up our musica object by 1) extracting variants from files or objects +such as VCFs and MAFs, 2) selecting the appropriate reference genome 3) +creating a musica object, and 4) building a count tables for our +variants of interest. ## Extracting variants -Variants can be extracted from various formats using the following functions: -* The `extract_variants_from_vcf_file()` function will extract variants from a [VCF](https://samtools.github.io/hts-specs/) file. The file will be imported using the readVcf function from the [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) package and then the variant information will be extracted from this object. -* The `extract_variants_from_vcf()` function extracts variants from a `CollapsedVCF` or `ExpandedVCF` object from the [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) package. -* The `extract_variants_from_maf_file()` function will extract variants from a file in [Mutation Annotation Format (MAF)](https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/) used by TCGA. -* The `extract_variants_from_maf()` function will extract variants from a MAF object created by the [maftools](https://www.bioconductor.org/packages/release/bioc/html/maftools.html) package. -* The `extract_variants_from_matrix()` function will get the information from a matrix or data.frame like object that has columns for the chromosome, start position, end position, reference allele, mutation allele, and sample name. -* The `extract_variants()` function will extract variants from a list of objects. These objects can be any combination of VCF files, VariantAnnotation objects, MAF files, MAF objects, and data.frame objects. +Variants can be extracted from various formats using the following +functions: + +- The `extract_variants_from_vcf_file()` function will extract + variants from a [VCF](https://samtools.github.io/hts-specs/) file. + The file will be imported using the readVcf function from the + [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) + package and then the variant information will be extracted from this + object. +- The `extract_variants_from_vcf()` function extracts variants from a + `CollapsedVCF` or `ExpandedVCF` object from the + [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) + package. +- The `extract_variants_from_maf_file()` function will extract + variants from a file in [Mutation Annotation Format + (MAF)](https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/) + used by TCGA. +- The `extract_variants_from_maf()` function will extract variants + from a MAF object created by the + [maftools](https://www.bioconductor.org/packages/release/bioc/html/maftools.html) + package. +- The `extract_variants_from_matrix()` function will get the + information from a matrix or data.frame like object that has columns + for the chromosome, start position, end position, reference allele, + mutation allele, and sample name. +- The `extract_variants()` function will extract variants from a list + of objects. These objects can be any combination of VCF files, + VariantAnnotation objects, MAF files, MAF objects, and data.frame + objects. Below are some examples of extracting variants from MAF and VCF files: @@ -82,46 +125,69 @@ variants <- extract_variants(c(lusc_maf, luad_vcf, melanoma_vcfs)) ``` ## Choosing a genome -musicatk uses [BSgenome](https://bioconductor.org/packages/release/bioc/html/BSgenome.html) objects to access genome sequence information that flanks each mutation which is used bases for generating mutation count tables. BSgenome objects store full genome sequences for different organisms. A full list of supported organisms can be obtained by running `available.genomes()` after loading the BSgenome library. Custom genomes can be forged as well (see [BSgenome](https://bioconductor.org/packages/release/bioc/html/BSgenome.html) documentation). musicatk provides a utility function called `select_genome()` to allow users to quickly select human genome build versions "hg19" and "hg38" or mouse genome builds "mm9" and "mm10". The reference sequences for these genomes are in UCSC format (e.g. chr1). + +musicatk uses +[BSgenome](https://bioconductor.org/packages/release/bioc/html/BSgenome.html) +objects to access genome sequence information that flanks each mutation +which is used bases for generating mutation count tables. BSgenome +objects store full genome sequences for different organisms. A full list +of supported organisms can be obtained by running `available.genomes()` +after loading the BSgenome library. Custom genomes can be forged as well +(see +[BSgenome](https://bioconductor.org/packages/release/bioc/html/BSgenome.html) +documentation). musicatk provides a utility function called +`select_genome()` to allow users to quickly select human genome build +versions "hg19" and "hg38" or mouse genome builds "mm9" and "mm10". The +reference sequences for these genomes are in UCSC format (e.g. chr1). ```{r select_genome} g <- select_genome("hg38") ``` ## Creating a musica object -The last preprocessing step is to create an object with the variants and the genome using the `create_musica` function. This function will perform checks to ensure that the chromosome names and reference alleles in the input variant object match those in supplied BSgenome object. These checks can be turned off by setting `check_ref_chromosomes = FALSE` and `check_ref_bases = FALSE`, respectively. This function also looks for adjacent single base substitutions (SBSs) and will convert them to double base substitutions (DBSs). To disable this automatic conversion, set `convert_dbs = FALSE`. -```{r create_musica} -musica <- create_musica(x = variants, genome = g) -``` +The last preprocessing step is to create an object with the variants and +the genome using the `create_musica` function. This function will +perform checks to ensure that the chromosome names and reference alleles +in the input variant object match those in supplied BSgenome object. +These checks can be turned off by setting +`check_ref_chromosomes = FALSE` and `check_ref_bases = FALSE`, +respectively. This function also looks for adjacent single base +substitutions (SBSs) and will convert them to double base substitutions +(DBSs). To disable this automatic conversion, set `convert_dbs = FALSE`. +```{r create_musica_from_variants} +musica <- create_musica_from_variants(x = variants, genome = g) +``` # Creating mutation count tables -Motifs are the building blocks of mutational signatures. Motifs themselves are -a mutation combined with other genomic information. For instance, **SBS96** -motifs are constructed from an SBS mutation and one upstream and one downstream -base sandwiched together. We build tables by counting these motifs for each -sample. + +Motifs are the building blocks of mutational signatures. Motifs +themselves are a mutation combined with other genomic information. For +instance, **SBS96** motifs are constructed from an SBS mutation and one +upstream and one downstream base sandwiched together. We build tables by +counting these motifs for each sample. + ```{r build_tables} build_standard_table(musica, g = g, table_name = "SBS96") ``` -Here is a list of mutation tables that can be created by setting the +Here is a list of mutation tables that can be created by setting the `table_name` parameter in the `build_standard_table` function: -* SBS96 - Motifs are the six possible single base pair mutation types times the -four possibilities each for upstream and downstream context bases (4*6*4 = 96 -motifs) -* SBS192_Trans - Motifs are an extension of SBS96 multiplied by the -transcriptional strand (translated/untranslated), can be specified with -`"Transcript_Strand"`. -* SBS192_Rep - Motifs are an extension of SBS96 multiplied by the -replication strand (leading/lagging), can be specified with -`"Replication_Strand"`. -* DBS - Motifs are the 78 possible double-base-pair substitutions -* INDEL - Motifs are 83 categories intended to capture different categories of -indels based on base-pair change, repeats, or microhomology, insertion or -deletion, and length. +- SBS96 - Motifs are the six possible single base pair mutation types + times the four possibilities each for upstream and downstream + context bases (4*6*4 = 96 motifs) +- SBS192_Trans - Motifs are an extension of SBS96 multiplied by the + transcriptional strand (translated/untranslated), can be specified + with `"Transcript_Strand"`. +- SBS192_Rep - Motifs are an extension of SBS96 multiplied by the + replication strand (leading/lagging), can be specified with + `"Replication_Strand"`. +- DBS - Motifs are the 78 possible double-base-pair substitutions +- INDEL - Motifs are 83 categories intended to capture different + categories of indels based on base-pair change, repeats, or + microhomology, insertion or deletion, and length. ```{r combine_tables} data(dbs_musica) @@ -140,7 +206,6 @@ combine_count_tables(musica = dbs_musica, to_comb = c("SBS96", "DBS78"), table, combining SBS96 and DBS", overwrite = TRUE) ``` - ```{r} annotate_transcript_strand(musica, "19", build_table = FALSE) build_custom_table(musica = musica, variant_annotation = "Transcript_Strand", @@ -149,23 +214,40 @@ build_custom_table(musica = musica, variant_annotation = "Transcript_Strand", data_factor = c("T", "U"), overwrite = TRUE) ``` +Different count tables can be combined into one using the +`combine_count_tables` function. For example, the SBS96 and the DBS +tables could be combined and mutational signature discovery could be +performed across both mutations modalities. Tables with information +about the same types of variants (e.g. two related SBS tables) should +generally not be combined and used together. + +Custom count tables can be created from user-defined mutation-level +annotations using the `build_custom_table` function. + +# Create a musica object from an existing count table + +A musica object can be created directly from a count table if it is +available from the start. A variant object or file is not required. The +count table should contain mutational motifs as rows and samples as +columns, and must be able to be coerced to a matrix. The variant class +must also be provided, either SBS, DBS, or IND. Here, the example count +table 'synthetic_breast_counts' is used. + +```{r reate_musica_from_counts} -Different count tables can be combined into one using the `combine_count_tables` -function. For example, the SBS96 and the DBS tables could be combined and -mutational signature discovery could be performed across both mutations -modalities. Tables with information about the same types of variants (e.g. -two related SBS tables) should generally not be combined and used together. +musica2 <- create_musica_from_counts(x = synthetic_breast_counts, variant_class = "SBS") -Custom count tables can be created from user-defined mutation-level annotations -using the `build_custom_table` function. +``` # Discovering Signatures and Exposures + Mutational signature discovery is the process of deconvoluting a matrix -containing counts for each mutation type in each sample into two matrices: 1) -a **signature** matrix containing the probability of each mutation motif in -signature and 2) an **exposure** matrix containing the estimated counts of each -signature in each sample. Discovery and prediction results are save in a -`musica_result` object that includes both the signatures and sample exposures. +containing counts for each mutation type in each sample into two +matrices: 1) a **signature** matrix containing the probability of each +mutation motif in signature and 2) an **exposure** matrix containing the +estimated counts of each signature in each sample. Discovery and +prediction results are save in a `musica_result` object that includes +both the signatures and sample exposures. ```{r discover_sigs} result <- discover_signatures(musica = musica, table_name = "SBS96", @@ -173,17 +255,17 @@ result <- discover_signatures(musica = musica, table_name = "SBS96", nstart = 10) ``` - Supported signature discovery algorithms include: -* Non-negative matrix factorization (nmf) -* Latent Dirichlet Allocation (lda) +- Non-negative matrix factorization (nmf) +- Latent Dirichlet Allocation (lda) -Both have built-in `seed` capabilities for reproducible results, `nstarts` for -multiple independent chains from which the best final result will be chosen. -NMF also allows for parallel processing via `par_cores`. +Both have built-in `seed` capabilities for reproducible results, +`nstarts` for multiple independent chains from which the best final +result will be chosen. NMF also allows for parallel processing via +`par_cores`. -To get the signatures or exposures from the result object, the following +To get the signatures or exposures from the result object, the following functions can be used: ```{r result_accessors} @@ -196,42 +278,63 @@ sigs <- signatures(result) sigs[1:3,1:3] ``` +# Import a musica result object + +If signature discovery or prediction was performed previously or +externally, a 'musica_result' object can be created directly from the +signatures and exposures. The signatures table should contain mutational +motifs as rows and signatures as columns and be able to be coerced to a +matrix. The exposures matrix should contain signature weights as rows +and samples as columns and must also be able to be coerced to a matrix. +A mutation count table must also be provided. The method or algorithm +used to generate the results may be provided with 'algorithm'. Here, +example data from a synthetic breast cancer dataset is used. + +```{r create_musica_result} + +NMF_result <- create_musica_result(signatures = example_predicted_sigs, exposures = example_predicted_exp, count_table = synthetic_breast_counts, algorithm = "NMF") + +``` # Plotting ## Signatures -Barplots showing the probability of each mutation type in each signature can -be plotted with the `plot_signatures` function: + +Barplots showing the probability of each mutation type in each signature +can be plotted with the `plot_signatures` function: ```{r, plot_sigs} plot_signatures(result) ``` -By default, the scales on the y-axis are forced to be the same across all -signatures. This behavior can be turned off by setting `same_scale = FALSE`. -Signatures can be re-named based on prior knowledge and displayed in the plots: +By default, the scales on the y-axis are forced to be the same across +all signatures. This behavior can be turned off by setting +`same_scale = FALSE`. Signatures can be re-named based on prior +knowledge and displayed in the plots: ```{r, name_sigs} name_signatures(result, c("Smoking", "APOBEC", "UV")) plot_signatures(result) ``` - ## Exposures -Barplots showing the exposures in each sample can be plotted with the +Barplots showing the exposures in each sample can be plotted with the `plot_exposures` function: ```{r exposures_raw} plot_exposures(result, plot_type = "bar") ``` -The proportion of each exposure in each tumor can be shown by setting `proportional = TRUE`: +The proportion of each exposure in each tumor can be shown by setting +`proportional = TRUE`: + ```{r exposures_prop} plot_exposures(result, plot_type = "bar", proportional = TRUE) ``` -Counts for individual samples can also be plotted with the `plot_sample_counts` function: +Counts for individual samples can also be plotted with the +`plot_sample_counts` function: ```{r sample_counts} samples <- sample_names(musica) @@ -239,18 +342,39 @@ plot_sample_counts(musica, sample_names = samples[c(3,4,5)], table_name = "SBS96 ``` ## Comparison to external signatures (e.g. COSMIC) -A common analysis is to compare the signatures estimated in a dataset to those generated in other datasets or to those in the [COSMIC database](https://cancer.sanger.ac.uk/cosmic/signatures). We have a set of functions that can be used to easily perform pairwise correlations between signatures. The `compare_results` functions compares the signatures between two `musica_result` objects. The `compare_cosmic_v2` will correlate the signatures between a `musica_result` object and the SBS signatures in COSMIC V2. For example: + +A common analysis is to compare the signatures estimated in a dataset to +those generated in other datasets or to those in the [COSMIC +database](https://cancer.sanger.ac.uk/cosmic/signatures). We have a set +of functions that can be used to easily perform pairwise correlations +between signatures. The `compare_results` functions compares the +signatures between two `musica_result` objects. The `compare_cosmic_v2` +will correlate the signatures between a `musica_result` object and the +SBS signatures in COSMIC V2. For example: ```{r compare_cosmic} compare_cosmic_v2(result, threshold = 0.75) ``` -In this example, our Signatures 1 and 2 were most highly correlated to COSMIC Signature 7 and 4, respectively, so this may indicate that samples in our dataset were exposed to UV radiation or cigarette smoke. Only pairs of signatures who have a correlation above the `threshold` parameter will be returned. If no pairs of signatures are found, then you may want to consider lowering the threshold. The default correlation metric is the cosine similarity, but this can be changed to using 1 - Jensen-Shannon Divergence by setting -`metric = "jsd"` Signatures can also be correlated to those in the COSMIC V3 database using the `compare_cosmic_v3` function. +In this example, our Signatures 1 and 2 were most highly correlated to +COSMIC Signature 7 and 4, respectively, so this may indicate that +samples in our dataset were exposed to UV radiation or cigarette smoke. +Only pairs of signatures who have a correlation above the `threshold` +parameter will be returned. If no pairs of signatures are found, then +you may want to consider lowering the threshold. The default correlation +metric is the cosine similarity, but this can be changed to using 1 - +Jensen-Shannon Divergence by setting `metric = "jsd"` Signatures can +also be correlated to those in the COSMIC V3 database using the +`compare_cosmic_v3` function. # Predicting exposures using pre-existing signatures -Instead of discovering mutational signatures and exposures from a dataset *de novo*, one might get better results by predicting the exposures of signatures that have been previously estimated in other datasets. We incorporate several methods for estimating exposures given a set of pre-existing signatures. For example, we can predict exposures for COSMIC signatures 1, 4, 7, and 13 in our current dataset: +Instead of discovering mutational signatures and exposures from a +dataset *de novo*, one might get better results by predicting the +exposures of signatures that have been previously estimated in other +datasets. We incorporate several methods for estimating exposures given +a set of pre-existing signatures. For example, we can predict exposures +for COSMIC signatures 1, 4, 7, and 13 in our current dataset: ```{r predict_cosmic} @@ -267,22 +391,39 @@ pred_cosmic <- predict_exposure(musica = musica, table_name = "SBS96", plot_exposures(pred_cosmic, plot_type = "bar") ``` -The `cosmic_v2_sigs` object is just a `musica_result` object containing COSMIC V2 signatures without any sample or exposure information. Note that if `signatures_to_use` is not supplied by the user, then exposures for all signatures in the result object will be estimated. We can predict exposures for samples in any `musica` object from any `musica_result` object as long as the same mutation schema was utilized. +The `cosmic_v2_sigs` object is just a `musica_result` object containing +COSMIC V2 signatures without any sample or exposure information. Note +that if `signatures_to_use` is not supplied by the user, then exposures +for all signatures in the result object will be estimated. We can +predict exposures for samples in any `musica` object from any +`musica_result` object as long as the same mutation schema was utilized. -We can list which signatures are present in each tumor type according to the [COSMIC V2 database](https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt). For example, we can find which signatures are present in lung cancers: +We can list which signatures are present in each tumor type according to +the [COSMIC V2 +database](https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt). For +example, we can find which signatures are present in lung cancers: ```{r subtype_map} cosmic_v2_subtype_map("lung") ``` -We can predict exposures for samples in a `musica` object using the signatures from any `musica_result` object. Just for illustration, we will predict exposures using the estimated signatures from `musica_result` object we created earlier: +We can predict exposures for samples in a `musica` object using the +signatures from any `musica_result` object. Just for illustration, we +will predict exposures using the estimated signatures from +`musica_result` object we created earlier: ```{r predict_previous} pred_our_sigs <- predict_exposure(musica = musica, table_name = "SBS96", signature_res = result, algorithm = "lda") ``` -Of course, this example is not very useful because we are predicting exposures using signatures that were learned on the same set of samples. Most of the time, you want to give the `signature_res` parameter a `musica_result` object that wss generated using independent samples from those in the `musica` object. As mentioned above, different signatures in different result objects can be compared to each other using the `compare_results` function: +Of course, this example is not very useful because we are predicting +exposures using signatures that were learned on the same set of samples. +Most of the time, you want to give the `signature_res` parameter a +`musica_result` object that wss generated using independent samples from +those in the `musica` object. As mentioned above, different signatures +in different result objects can be compared to each other using the +`compare_results` function: ```{r predict_compare} compare_results(result = pred_cosmic, other_result = pred_our_sigs, @@ -293,16 +434,24 @@ compare_results(result = pred_cosmic, other_result = pred_our_sigs, ## Adding sample annotations -We first must add an annotation to the `musica` or `musica_result` object +We first must add an annotation to the `musica` or `musica_result` +object + ```{r annotations} annot <- read.table(system.file("extdata", "sample_annotations.txt", package = "musicatk"), sep = "\t", header=TRUE) samp_annot(result, "Tumor_Subtypes") <- annot$Tumor_Subtypes ``` -Note that the annotations can also be added directly the `musica` object in the beginning using the same function: `samp_annot(musica, "Tumor_Subtypes") <- annot$Tumor_Subtypes`. These annotations will then automatically be included in any down-stream result object. +Note that the annotations can also be added directly the `musica` object +in the beginning using the same function: +`samp_annot(musica, "Tumor_Subtypes") <- annot$Tumor_Subtypes`. These +annotations will then automatically be included in any down-stream +result object. -* **Be sure that the annotation vector being supplied is in the same order as the samples in the `musica` or `musica_result` object.** You can view the sample order with the `sample_names` function: +- **Be sure that the annotation vector being supplied is in the same + order as the samples in the `musica` or `musica_result` object.** + You can view the sample order with the `sample_names` function: ```{r sample_names} sample_names(result) @@ -310,20 +459,30 @@ sample_names(result) ## Plotting exposures by a Sample Annotation -As mentioned previously, the `plot_exposures` function can plot sample exposures in a `musica_result` object. It can group exposures by either a sample annotation or by a signature by setting the `group_by` parameter. Here will will group the exposures by the `Tumor_Subtype` annotation: +As mentioned previously, the `plot_exposures` function can plot sample +exposures in a `musica_result` object. It can group exposures by either +a sample annotation or by a signature by setting the `group_by` +parameter. Here will will group the exposures by the `Tumor_Subtype` +annotation: ```{r plot_exposures_by_subtype} plot_exposures(result, plot_type = "bar", group_by = "annotation", annotation = "Tumor_Subtypes") ``` -The distribution of exposures with respect to annotation can be viewed using boxplots by setting `plot_type = "box"` and `group_by = "annotation"`: +The distribution of exposures with respect to annotation can be viewed +using boxplots by setting `plot_type = "box"` and +`group_by = "annotation"`: ```{r plot_exposures_box_annot} plot_exposures(result, plot_type = "box", group_by = "annotation", annotation = "Tumor_Subtypes") ``` -Note that the name of the annotation must be supplied via the `annotation` parameter. Boxplots can be converted to violin plots by setting `plot_type = "violin"`. To compare the level of each exposure across sample groups within a signature, we can set `group_by = "signature"` and `color_by = "annotation"`: +Note that the name of the annotation must be supplied via the +`annotation` parameter. Boxplots can be converted to violin plots by +setting `plot_type = "violin"`. To compare the level of each exposure +across sample groups within a signature, we can set +`group_by = "signature"` and `color_by = "annotation"`: ```{r plot_exposures_box_sig} plot_exposures(result, plot_type = "box", group_by = "signature", @@ -332,40 +491,55 @@ plot_exposures(result, plot_type = "box", group_by = "signature", ## Visualizing samples in 2D using UMAP -The `create_umap` function embeds samples in 2 dimensions using the `umap` function from the [uwot](https://cran.r-project.org/web/packages/uwot/index.html) package. The major parameters for fine tuning the UMAP are `n_neighbors`, `min_dist`, and `spread`. See `?uwot::umap` for more information on these parameters. +The `create_umap` function embeds samples in 2 dimensions using the +`umap` function from the +[uwot](https://cran.r-project.org/web/packages/uwot/index.html) package. +The major parameters for fine tuning the UMAP are `n_neighbors`, +`min_dist`, and `spread`. See `?uwot::umap` for more information on +these parameters. ```{r umap_create} create_umap(result = result) ``` -The `plot_umap` function will generate a scatter plot of the UMAP coordinates. The points of plot will be colored by the level of a signature by default: +The `plot_umap` function will generate a scatter plot of the UMAP +coordinates. The points of plot will be colored by the level of a +signature by default: ```{r umap_plot} plot_umap(result = result) ``` -By default, the exposures for each sample will share the same color scale. However, exposures for some signatures may have really high levels compared to others. To make a plot where exposures for each signature will have their own color scale, you can set `same_scale = FALSE`: +By default, the exposures for each sample will share the same color +scale. However, exposures for some signatures may have really high +levels compared to others. To make a plot where exposures for each +signature will have their own color scale, you can set +`same_scale = FALSE`: ```{r umap_plot_same_scale} plot_umap(result = result, same_scale = FALSE) ``` -Lastly, points can be colored by a Sample Annotation by setting `color_by = "annotation"` and `annotation` to the name of the annotation: +Lastly, points can be colored by a Sample Annotation by setting +`color_by = "annotation"` and `annotation` to the name of the +annotation: ```{r umap_plot_annot} plot_umap(result = result, color_by = "annotation", annotation = "Tumor_Subtypes", add_annotation_labels = TRUE) ``` -When `add_annotation_labels = TRUE`, the centroid of each group is identified -using medians and the labels are plotted on top of the centroid. +When `add_annotation_labels = TRUE`, the centroid of each group is +identified using medians and the labels are plotted on top of the +centroid. # Use of Plotly in plotting -plot_signatures, plot_exposures, and plot_umap, all have builty in ggplotly -capabilities. Simply specifying `plotly = TRUE` enables interactive plots -that allows examination of individuals sections, zooming and resizing, and -turning on and off annotation types and legend values. +plot_signatures, plot_exposures, and plot_umap, all have builty in +ggplotly capabilities. Simply specifying `plotly = TRUE` enables +interactive plots that allows examination of individuals sections, +zooming and resizing, and turning on and off annotation types and legend +values. ```{r plotly} plot_signatures(result, plotly = TRUE) @@ -375,16 +549,17 @@ plot_umap(result, plotly = TRUE) # Note on reproducibility -Several functions make use of stochastic algorithms or procedures which -require the use of random number generator (RNG) for simulation or sampling. -To maintain reproducibility, all these functions should be called using -```set_seed(1)``` or ```withr::with_seed(seed, function())``` to make sure -same results are generatedeach time one of these functions is called. Using -with_seed for reproducibility has the advantage of not interfering with any -other user seeds, but using one or the other is important for several functions -including *discover_signatures*, *predict_exposure*, and *create_umap*, as -these functions use stochastic processes that may produce different results -when run multiple times with the same settings. +Several functions make use of stochastic algorithms or procedures which +require the use of random number generator (RNG) for simulation or +sampling. To maintain reproducibility, all these functions should be +called using `set_seed(1)` or `withr::with_seed(seed, function())` to +make sure same results are generatedeach time one of these functions is +called. Using with_seed for reproducibility has the advantage of not +interfering with any other user seeds, but using one or the other is +important for several functions including *discover_signatures*, +*predict_exposure*, and *create_umap*, as these functions use stochastic +processes that may produce different results when run multiple times +with the same settings. ```{r reproducible_prediction} seed <- 1 @@ -394,8 +569,8 @@ reproducible_prediction <- withr::with_seed(seed, signature_res = result, algorithm = "lda")) ``` - # Session Information + ```{r session} sessionInfo() ```