diff --git a/R/normalize.R b/R/normalize.R index 121bde0..1da48e5 100644 --- a/R/normalize.R +++ b/R/normalize.R @@ -17,18 +17,15 @@ #' ) #' ) %>% #' normalize_counts( -#' feature_id_colname = "Gene", -#' columns_to_include = c("Gene", "A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2", "C3"), -#' sample_id_colname = "Sample", #' group_column = "Group", #' label_column = "Label" #' ) #' head(moo@counts[["norm"]][["voom"]]) normalize_counts <- function(moo, count_type = "filt", - feature_id_colname = "Gene", - columns_to_include = c("Gene", "A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2", "C3"), - sample_id_colname = "Sample", + feature_id_colname = NULL, + samples_to_include = NULL, + sample_id_colname = NULL, group_column = "Group", label_column = "Label", input_in_log_counts = FALSE, @@ -50,109 +47,33 @@ normalize_counts <- function(moo, legend_position_for_histogram = "top", number_of_histogram_legend_columns = 6, number_of_image_rows = 2, - colors_for_plots = c(), + colors_for_plots = c( + "#5954d6", "#e1562c", "#b80058", "#00c6f8", "#d163e6", "#00a76c", + "#ff9287", "#008cf9", "#006e00", "#796880", "#FFA500", "#878500" + ), make_plots_interactive = FALSE, plot_correlation_matrix_heatmap = TRUE) { counts_matrix <- moo@counts[[count_type]] %>% as.data.frame() sample_metadata <- moo@sample_meta %>% as.data.frame() - ### PH: START Color Picking Function - ################################## - ##### Color Picking Function - ################################## - - ### Do not Functionalize This code yet. - ### I will make a General Color picker function to create a custom Color Palette to use for all figures - ### I think I would also like to use the metadata table to manually set colors - colorlist <- c( - "#5954d6", - "#e1562c", - "#b80058", - "#00c6f8", - "#d163e6", - "#00a76c", - "#ff9287", - "#008cf9", - "#006e00", - "#796880", - "#FFA500", - "#878500" - ) - names(colorlist) <- c( - "indigo", - "carrot", - "lipstick", - "turquoise", - "lavender", - "jade", - "coral", - "azure", - "green", - "rum", - "orange", - "olive" - ) - if (length(colors_for_plots) == 0) { - colors_for_plots <- c( - "indigo", - "carrot", - "lipstick", - "turquoise", - "lavender", - "jade", - "coral", - "azure", - "green", - "rum", - "orange", - "olive" - ) + if (is.null(sample_id_colname)) { + sample_id_colname <- colnames(sample_metadata)[1] } - colorval <- colorlist[colors_for_plots] - colorval <- unname(colorval) # remove names which affect ggplot - ### PH: END Color Picking Function + if (is.null(feature_id_colname)) { + feature_id_colname <- colnames(counts_matrix)[1] + } + if (is.null(samples_to_include)) { + samples_to_include <- sample_metadata %>% dplyr::pull(sample_id_colname) + } + df.filt <- counts_matrix %>% + dplyr::select(tidyselect::all_of(samples_to_include)) + ## --------------- ## ## Main Code Block ## ## --------------- ## - - - ############################## - #### Create Generic Row names - ############################## - ## create unique rownames to correctly add back Annocolumns at end of template - - samples_to_include <- columns_to_include[columns_to_include %in% sample_metadata[, sample_id_colname, drop = T]] - anno_col <- columns_to_include[columns_to_include %in% sample_metadata[, sample_id_colname, drop = T] == F] - - - samples_to_include <- samples_to_include[samples_to_include != feature_id_colname] - samples_to_include <- samples_to_include[samples_to_include != "Gene"] - samples_to_include <- samples_to_include[samples_to_include != "GeneName"] - - ## create unique rownames to correctly add back Annocolumns at end of template - counts_matrix[, feature_id_colname] <- paste0(counts_matrix[, feature_id_colname], "_", 1:nrow(counts_matrix)) - - - anno_col <- c(anno_col, feature_id_colname) %>% unique() - anno_tbl <- counts_matrix[, anno_col, drop = F] %>% as.data.frame() - - - df.filt <- counts_matrix[, samples_to_include] gene_names <- NULL - gene_names$GeneID <- counts_matrix[, 1] - - ############################## - #### Input Data Validation - ############################## - # TODO move this logic to S7 validator - sample_metadata <- sample_metadata[match(colnames(df.filt), sample_metadata[[sample_id_colname]]), ] # First match sample metadata to counts matrix - sample_metadata <- sample_metadata[rowSums(is.na(sample_metadata)) != ncol(sample_metadata), ] # Remove empty rows - sample_metadata <- sample_metadata[, colSums(is.na(sample_metadata)) == 0] # Remove empty columns - rownames(sample_metadata) <- sample_metadata[[sample_id_colname]] - - df.filt <- df.filt[, match(sample_metadata[[sample_id_colname]], colnames(df.filt))] # Match counts matrix columns to sample metadata - + gene_names$feature_id <- counts_matrix %>% dplyr::pull(feature_id_colname) ### PH: START Limma Normalization ############################## @@ -166,8 +87,8 @@ normalize_counts <- function(moo, x <- edgeR::DGEList(counts = df.filt, genes = gene_names) } v <- limma::voom(x, normalize = voom_normalization_method) - rownames(v$E) <- v$genes$GeneID - as.data.frame(v$E) %>% tibble::rownames_to_column(feature_id_colname) -> df.voom + rownames(v$E) <- v$genes$feature_id + df.voom <- as.data.frame(v$E) %>% tibble::rownames_to_column(feature_id_colname) message(paste0("Total number of features included: ", nrow(df.voom))) ### PH: END Limma Normalization @@ -177,7 +98,7 @@ normalize_counts <- function(moo, samples_to_rename, group_column, label_column, - color_values = colorval, + color_values = colors_for_plots, principal_component_on_x_axis = principal_component_on_x_axis, principal_component_on_y_axis = principal_component_on_y_axis, legend_position_for_pca = legend_position_for_pca, @@ -194,14 +115,14 @@ normalize_counts <- function(moo, feature_id_colname, group_column, label_column, - color_values = colorval, + color_values = colors_for_plots, x_axis_label = "Normalized Counts" ) - if (plot_correlation_matrix_heatmap == TRUE) { + if (isTRUE(plot_correlation_matrix_heatmap)) { corHM <- plot_heatmap( counts_matrix = df.filt, sample_metadata = sample_metadata, - anno_colors = colorval, + anno_colors = colors_for_plots, anno_column = group_column, label_column = label_column, sample_id_colname = sample_id_colname @@ -210,11 +131,6 @@ normalize_counts <- function(moo, message("Sample columns") message(colnames(df.voom)[!colnames(df.voom) %in% feature_id_colname]) - message("Feature Columns") - message(colnames(anno_tbl)) - - df.voom <- merge(anno_tbl, df.voom, by = feature_id_colname, all.y = T) - df.voom[, feature_id_colname] <- gsub("_[0-9]+$", "", df.voom[, feature_id_colname]) if (isFALSE("norm" %in% names(moo@counts))) { moo@counts[["norm"]] <- list() diff --git a/tests/testthat/test-normalize.R b/tests/testthat/test-normalize.R index c0cdea3..6cdedde 100644 --- a/tests/testthat/test-normalize.R +++ b/tests/testthat/test-normalize.R @@ -1,4 +1,4 @@ -test_that("normalize works", { +test_that("normalize works for NIDAP", { moo <- multiOmicDataSet( sample_meta_dat = as.data.frame(nidap_sample_metadata), anno_dat = data.frame(), @@ -8,13 +8,7 @@ test_that("normalize works", { "filt" = as.data.frame(nidap_filtered_counts) ) ) %>% - normalize_counts( - feature_id_colname = "Gene", - columns_to_include = c("Gene", "A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2", "C3"), - sample_id_colname = "Sample", - group_column = "Group", - label_column = "Label" - ) + normalize_counts(group_column = "Group", label_column = "Label") expect_true(equal_dfs( moo@counts[["norm"]][["voom"]] %>% dplyr::arrange(desc(Gene)), @@ -22,3 +16,117 @@ test_that("normalize works", { dplyr::arrange(desc(Gene)) )) }) + +test_that("noramlize works for RENEE", { + moo <- create_multiOmicDataSet_from_dataframes(readr::read_tsv( + system.file("extdata", "sample_metadata.tsv.gz", package = "MOSuite") + ), gene_counts) %>% + clean_raw_counts() %>% + filter_counts( + group_column = "condition", + label_column = "sample_id", + minimum_count_value_to_be_considered_nonzero = 1, + minimum_number_of_samples_with_nonzero_counts_in_total = 1, + minimum_number_of_samples_with_nonzero_counts_in_a_group = 1, + make_plots = FALSE + ) %>% + normalize_counts(group_column = "condition", label_column = "sample_id") + expect_equal( + head(moo@counts$norm$voom), + structure( + list( + gene_id = c( + "ENSG00000215458.8", + "ENSG00000160179.18", + "ENSG00000258017.1", + "ENSG00000282393.1", + "ENSG00000286104.1", + "ENSG00000274422.1" + ), + KO_S3 = c( + 11.0751960068561, + 9.6086338540783, + 9.6086338540783, + 8.81615260371772, + 9.6086338540783, + 8.81615260371772 + ), + KO_S4 = c( + 12.3480907442867, + 12.7703165561761, + 8.81615260371772, + 9.6086338540783, + 8.81615260371772, + 9.6086338540783 + ), + WT_S1 = c( + 8.81615260371772, + 12.3480907442867, + 8.81615260371772, + 8.81615260371772, + 8.81615260371772, + 8.81615260371772 + ), + WT_S2 = c( + 10.0048744792586, + 12.2369960496953, + 8.81615260371772, + 8.81615260371772, + 8.81615260371772, + 8.81615260371772 + ) + ), + row.names = c(NA, 6L), + class = "data.frame" + ) + ) + expect_equal( + tail(moo@counts$norm$voom), + structure( + list( + gene_id = c( + "ENSG00000157538.14", + "ENSG00000160193.11", + "ENSG00000182093.15", + "ENSG00000182362.14", + "ENSG00000173276.14", + "ENSG00000237232.7" + ), + KO_S3 = c( + 12.3480907442867, + 9.6086338540783, + 11.8597009422769, + 11.0751960068561, + 11.8597009422769, + 8.81615260371772 + ), + KO_S4 = c( + 12.7703165561761, + 9.6086338540783, + 9.6086338540783, + 8.81615260371772, + 12.7703165561761, + 9.6086338540783 + ), + WT_S1 = c( + 12.2426956580003, + 10.5853565029804, + 11.7865266999202, + 8.81615260371772, + 11.7865266999202, + 8.81615260371772 + ), + WT_S2 = c( + 12.4720029602325, + 10.9249210977479, + 11.4357186116131, + 8.81615260371772, + 12.2369960496953, + 8.81615260371772 + ) + ), + row.names = 286:291, + class = "data.frame" + ) + ) +}) diff --git a/vignettes/intro.Rmd b/vignettes/intro.Rmd index 928a3a3..03ef0ec 100644 --- a/vignettes/intro.Rmd +++ b/vignettes/intro.Rmd @@ -34,26 +34,19 @@ moo <- create_multiOmicDataSet_from_files( sample_meta_filepath = metadata_tsv, feature_counts_filepath = gene_counts_tsv ) -# todo: obviate the need to pass feature & sample id colnames every time + moo <- moo %>% - clean_raw_counts( - feature_id_colname = "gene_id", - sample_id_colname = "sample_id", - ) %>% + clean_raw_counts() %>% filter_counts( group_column = "condition", label_column = "sample_id", - samples_to_include = c("KO_S3", "KO_S4", "WT_S1", "WT_S2"), minimum_count_value_to_be_considered_nonzero = 1, minimum_number_of_samples_with_nonzero_counts_in_total = 1, minimum_number_of_samples_with_nonzero_counts_in_a_group = 1, ) %>% normalize_counts( - feature_id_colname = "Gene", - sample_id_colname = "sample_id", group_column = "condition", - label_column = "sample_id", - columns_to_include = c("Gene", "KO_S3", "KO_S4", "WT_S1", "WT_S2") + label_column = "sample_id" ) moo@counts$norm$voom %>% head()