refactor(normalize): same column picking behavior as filter_counts()

CCBR · Jan 28, 2025 · f151366 · f151366
1 parent 95c5a30
commit f151366
Show file tree

Hide file tree

Showing 3 changed files with 144 additions and 127 deletions.
diff --git a/R/normalize.R b/R/normalize.R
@@ -17,18 +17,15 @@
 #'   )
 #' ) %>%
 #'   normalize_counts(
-#'     feature_id_colname = "Gene",
-#'     columns_to_include = c("Gene", "A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2", "C3"),
-#'     sample_id_colname = "Sample",
 #'     group_column = "Group",
 #'     label_column = "Label"
 #'   )
 #' head(moo@counts[["norm"]][["voom"]])
 normalize_counts <- function(moo,
                              count_type = "filt",
-                             feature_id_colname = "Gene",
-                             columns_to_include = c("Gene", "A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2", "C3"),
-                             sample_id_colname = "Sample",
+                             feature_id_colname = NULL,
+                             samples_to_include = NULL,
+                             sample_id_colname = NULL,
                              group_column = "Group",
                              label_column = "Label",
                              input_in_log_counts = FALSE,
@@ -50,109 +47,33 @@ normalize_counts <- function(moo,
                              legend_position_for_histogram = "top",
                              number_of_histogram_legend_columns = 6,
                              number_of_image_rows = 2,
-                             colors_for_plots = c(),
+                             colors_for_plots = c(
+                               "#5954d6", "#e1562c", "#b80058", "#00c6f8", "#d163e6", "#00a76c",
+                               "#ff9287", "#008cf9", "#006e00", "#796880", "#FFA500", "#878500"
+                             ),
                              make_plots_interactive = FALSE,
                              plot_correlation_matrix_heatmap = TRUE) {
   counts_matrix <- moo@counts[[count_type]] %>% as.data.frame()
   sample_metadata <- moo@sample_meta %>% as.data.frame()
 
-  ### PH: START Color Picking Function
-  ##################################
-  ##### Color Picking Function
-  ##################################
-
-  ### Do not Functionalize This code yet.
-  ### I will make a General Color picker function to create a custom Color Palette to use for all figures
-  ### I think I would also like to use the metadata table to manually set colors
-  colorlist <- c(
-    "#5954d6",
-    "#e1562c",
-    "#b80058",
-    "#00c6f8",
-    "#d163e6",
-    "#00a76c",
-    "#ff9287",
-    "#008cf9",
-    "#006e00",
-    "#796880",
-    "#FFA500",
-    "#878500"
-  )
-  names(colorlist) <- c(
-    "indigo",
-    "carrot",
-    "lipstick",
-    "turquoise",
-    "lavender",
-    "jade",
-    "coral",
-    "azure",
-    "green",
-    "rum",
-    "orange",
-    "olive"
-  )
-  if (length(colors_for_plots) == 0) {
-    colors_for_plots <- c(
-      "indigo",
-      "carrot",
-      "lipstick",
-      "turquoise",
-      "lavender",
-      "jade",
-      "coral",
-      "azure",
-      "green",
-      "rum",
-      "orange",
-      "olive"
-    )
+  if (is.null(sample_id_colname)) {
+    sample_id_colname <- colnames(sample_metadata)[1]
   }
-  colorval <- colorlist[colors_for_plots]
-  colorval <- unname(colorval) # remove names which affect ggplot
-  ### PH: END Color Picking Function
+  if (is.null(feature_id_colname)) {
+    feature_id_colname <- colnames(counts_matrix)[1]
+  }
+  if (is.null(samples_to_include)) {
+    samples_to_include <- sample_metadata %>% dplyr::pull(sample_id_colname)
+  }
+  df.filt <- counts_matrix %>%
+    dplyr::select(tidyselect::all_of(samples_to_include))
+
 
   ## --------------- ##
   ## Main Code Block ##
   ## --------------- ##
-
-
-  ##############################
-  #### Create Generic Row names
-  ##############################
-  ## create unique rownames to correctly add back Annocolumns at end of template
-
-  samples_to_include <- columns_to_include[columns_to_include %in% sample_metadata[, sample_id_colname, drop = T]]
-  anno_col <- columns_to_include[columns_to_include %in% sample_metadata[, sample_id_colname, drop = T] == F]
-
-
-  samples_to_include <- samples_to_include[samples_to_include != feature_id_colname]
-  samples_to_include <- samples_to_include[samples_to_include != "Gene"]
-  samples_to_include <- samples_to_include[samples_to_include != "GeneName"]
-
-  ## create unique rownames to correctly add back Annocolumns at end of template
-  counts_matrix[, feature_id_colname] <- paste0(counts_matrix[, feature_id_colname], "_", 1:nrow(counts_matrix))
-
-
-  anno_col <- c(anno_col, feature_id_colname) %>% unique()
-  anno_tbl <- counts_matrix[, anno_col, drop = F] %>% as.data.frame()
-
-
-  df.filt <- counts_matrix[, samples_to_include]
   gene_names <- NULL
-  gene_names$GeneID <- counts_matrix[, 1]
-
-  ##############################
-  #### Input Data Validation
-  ##############################
-  # TODO move this logic to S7 validator
-  sample_metadata <- sample_metadata[match(colnames(df.filt), sample_metadata[[sample_id_colname]]), ] # First match sample metadata to counts matrix
-  sample_metadata <- sample_metadata[rowSums(is.na(sample_metadata)) != ncol(sample_metadata), ] # Remove empty rows
-  sample_metadata <- sample_metadata[, colSums(is.na(sample_metadata)) == 0] # Remove empty columns
-  rownames(sample_metadata) <- sample_metadata[[sample_id_colname]]
-
-  df.filt <- df.filt[, match(sample_metadata[[sample_id_colname]], colnames(df.filt))] # Match counts matrix columns to sample metadata
-
+  gene_names$feature_id <- counts_matrix %>% dplyr::pull(feature_id_colname)
 
   ### PH: START Limma Normalization
   ##############################
@@ -166,8 +87,8 @@ normalize_counts <- function(moo,
     x <- edgeR::DGEList(counts = df.filt, genes = gene_names)
   }
   v <- limma::voom(x, normalize = voom_normalization_method)
-  rownames(v$E) <- v$genes$GeneID
-  as.data.frame(v$E) %>% tibble::rownames_to_column(feature_id_colname) -> df.voom
+  rownames(v$E) <- v$genes$feature_id
+  df.voom <- as.data.frame(v$E) %>% tibble::rownames_to_column(feature_id_colname)
   message(paste0("Total number of features included: ", nrow(df.voom)))
   ### PH: END Limma Normalization
 
@@ -177,7 +98,7 @@ normalize_counts <- function(moo,
     samples_to_rename,
     group_column,
     label_column,
-    color_values = colorval,
+    color_values = colors_for_plots,
     principal_component_on_x_axis = principal_component_on_x_axis,
     principal_component_on_y_axis = principal_component_on_y_axis,
     legend_position_for_pca = legend_position_for_pca,
@@ -194,14 +115,14 @@ normalize_counts <- function(moo,
     feature_id_colname,
     group_column,
     label_column,
-    color_values = colorval,
+    color_values = colors_for_plots,
     x_axis_label = "Normalized Counts"
   )
-  if (plot_correlation_matrix_heatmap == TRUE) {
+  if (isTRUE(plot_correlation_matrix_heatmap)) {
     corHM <- plot_heatmap(
       counts_matrix = df.filt,
       sample_metadata = sample_metadata,
-      anno_colors = colorval,
+      anno_colors = colors_for_plots,
       anno_column = group_column,
       label_column = label_column,
       sample_id_colname = sample_id_colname
@@ -210,11 +131,6 @@ normalize_counts <- function(moo,
 
   message("Sample columns")
   message(colnames(df.voom)[!colnames(df.voom) %in% feature_id_colname])
-  message("Feature Columns")
-  message(colnames(anno_tbl))
-
-  df.voom <- merge(anno_tbl, df.voom, by = feature_id_colname, all.y = T)
-  df.voom[, feature_id_colname] <- gsub("_[0-9]+$", "", df.voom[, feature_id_colname])
 
   if (isFALSE("norm" %in% names(moo@counts))) {
     moo@counts[["norm"]] <- list()

diff --git a/tests/testthat/test-normalize.R b/tests/testthat/test-normalize.R
@@ -1,4 +1,4 @@
-test_that("normalize works", {
+test_that("normalize works for NIDAP", {
   moo <- multiOmicDataSet(
     sample_meta_dat = as.data.frame(nidap_sample_metadata),
     anno_dat = data.frame(),
@@ -8,17 +8,125 @@ test_that("normalize works", {
       "filt" = as.data.frame(nidap_filtered_counts)
     )
   ) %>%
-    normalize_counts(
-      feature_id_colname = "Gene",
-      columns_to_include = c("Gene", "A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2", "C3"),
-      sample_id_colname = "Sample",
-      group_column = "Group",
-      label_column = "Label"
-    )
+    normalize_counts(group_column = "Group", label_column = "Label")
   expect_true(equal_dfs(
     moo@counts[["norm"]][["voom"]] %>%
       dplyr::arrange(desc(Gene)),
     as.data.frame(nidap_norm_counts) %>%
       dplyr::arrange(desc(Gene))
   ))
 })
+
+test_that("noramlize works for RENEE", {
+  moo <- create_multiOmicDataSet_from_dataframes(readr::read_tsv(
+    system.file("extdata", "sample_metadata.tsv.gz", package = "MOSuite")
+  ), gene_counts) %>%
+    clean_raw_counts() %>%
+    filter_counts(
+      group_column = "condition",
+      label_column = "sample_id",
+      minimum_count_value_to_be_considered_nonzero = 1,
+      minimum_number_of_samples_with_nonzero_counts_in_total = 1,
+      minimum_number_of_samples_with_nonzero_counts_in_a_group = 1,
+      make_plots = FALSE
+    ) %>%
+    normalize_counts(group_column = "condition", label_column = "sample_id")
+  expect_equal(
+    head(moo@counts$norm$voom),
+    structure(
+      list(
+        gene_id = c(
+          "ENSG00000215458.8",
+          "ENSG00000160179.18",
+          "ENSG00000258017.1",
+          "ENSG00000282393.1",
+          "ENSG00000286104.1",
+          "ENSG00000274422.1"
+        ),
+        KO_S3 = c(
+          11.0751960068561,
+          9.6086338540783,
+          9.6086338540783,
+          8.81615260371772,
+          9.6086338540783,
+          8.81615260371772
+        ),
+        KO_S4 = c(
+          12.3480907442867,
+          12.7703165561761,
+          8.81615260371772,
+          9.6086338540783,
+          8.81615260371772,
+          9.6086338540783
+        ),
+        WT_S1 = c(
+          8.81615260371772,
+          12.3480907442867,
+          8.81615260371772,
+          8.81615260371772,
+          8.81615260371772,
+          8.81615260371772
+        ),
+        WT_S2 = c(
+          10.0048744792586,
+          12.2369960496953,
+          8.81615260371772,
+          8.81615260371772,
+          8.81615260371772,
+          8.81615260371772
+        )
+      ),
+      row.names = c(NA, 6L),
+      class = "data.frame"
+    )
+  )
+  expect_equal(
+    tail(moo@counts$norm$voom),
+    structure(
+      list(
+        gene_id = c(
+          "ENSG00000157538.14",
+          "ENSG00000160193.11",
+          "ENSG00000182093.15",
+          "ENSG00000182362.14",
+          "ENSG00000173276.14",
+          "ENSG00000237232.7"
+        ),
+        KO_S3 = c(
+          12.3480907442867,
+          9.6086338540783,
+          11.8597009422769,
+          11.0751960068561,
+          11.8597009422769,
+          8.81615260371772
+        ),
+        KO_S4 = c(
+          12.7703165561761,
+          9.6086338540783,
+          9.6086338540783,
+          8.81615260371772,
+          12.7703165561761,
+          9.6086338540783
+        ),
+        WT_S1 = c(
+          12.2426956580003,
+          10.5853565029804,
+          11.7865266999202,
+          8.81615260371772,
+          11.7865266999202,
+          8.81615260371772
+        ),
+        WT_S2 = c(
+          12.4720029602325,
+          10.9249210977479,
+          11.4357186116131,
+          8.81615260371772,
+          12.2369960496953,
+          8.81615260371772
+        )
+      ),
+      row.names = 286:291,
+      class = "data.frame"
+    )
+  )
+})
diff --git a/vignettes/intro.Rmd b/vignettes/intro.Rmd
@@ -34,26 +34,19 @@ moo <- create_multiOmicDataSet_from_files(
   sample_meta_filepath = metadata_tsv,
   feature_counts_filepath = gene_counts_tsv
 )
-# todo: obviate the need to pass feature & sample id colnames every time
+
 moo <- moo %>%
-  clean_raw_counts(
-    feature_id_colname = "gene_id",
-    sample_id_colname = "sample_id",
-  ) %>%
+  clean_raw_counts() %>%
   filter_counts(
     group_column = "condition",
     label_column = "sample_id",
-    samples_to_include = c("KO_S3", "KO_S4", "WT_S1", "WT_S2"),
     minimum_count_value_to_be_considered_nonzero = 1,
     minimum_number_of_samples_with_nonzero_counts_in_total = 1,
     minimum_number_of_samples_with_nonzero_counts_in_a_group = 1,
   ) %>%
   normalize_counts(
-    feature_id_colname = "Gene",
-    sample_id_colname = "sample_id",
     group_column = "condition",
-    label_column = "sample_id",
-    columns_to_include = c("Gene", "KO_S3", "KO_S4", "WT_S1", "WT_S2")
+    label_column = "sample_id"
   )
 
 moo@counts$norm$voom %>% head()