Skip to content

Commit

Permalink
refactor(normalize): same column picking behavior as filter_counts()
Browse files Browse the repository at this point in the history
  • Loading branch information
kelly-sovacool committed Jan 28, 2025
1 parent 95c5a30 commit f151366
Show file tree
Hide file tree
Showing 3 changed files with 144 additions and 127 deletions.
134 changes: 25 additions & 109 deletions R/normalize.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,15 @@
#' )
#' ) %>%
#' normalize_counts(
#' feature_id_colname = "Gene",
#' columns_to_include = c("Gene", "A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2", "C3"),
#' sample_id_colname = "Sample",
#' group_column = "Group",
#' label_column = "Label"
#' )
#' head(moo@counts[["norm"]][["voom"]])
normalize_counts <- function(moo,
count_type = "filt",
feature_id_colname = "Gene",
columns_to_include = c("Gene", "A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2", "C3"),
sample_id_colname = "Sample",
feature_id_colname = NULL,
samples_to_include = NULL,
sample_id_colname = NULL,
group_column = "Group",
label_column = "Label",
input_in_log_counts = FALSE,
Expand All @@ -50,109 +47,33 @@ normalize_counts <- function(moo,
legend_position_for_histogram = "top",
number_of_histogram_legend_columns = 6,
number_of_image_rows = 2,
colors_for_plots = c(),
colors_for_plots = c(
"#5954d6", "#e1562c", "#b80058", "#00c6f8", "#d163e6", "#00a76c",
"#ff9287", "#008cf9", "#006e00", "#796880", "#FFA500", "#878500"
),
make_plots_interactive = FALSE,
plot_correlation_matrix_heatmap = TRUE) {
counts_matrix <- moo@counts[[count_type]] %>% as.data.frame()
sample_metadata <- moo@sample_meta %>% as.data.frame()

### PH: START Color Picking Function
##################################
##### Color Picking Function
##################################

### Do not Functionalize This code yet.
### I will make a General Color picker function to create a custom Color Palette to use for all figures
### I think I would also like to use the metadata table to manually set colors
colorlist <- c(
"#5954d6",
"#e1562c",
"#b80058",
"#00c6f8",
"#d163e6",
"#00a76c",
"#ff9287",
"#008cf9",
"#006e00",
"#796880",
"#FFA500",
"#878500"
)
names(colorlist) <- c(
"indigo",
"carrot",
"lipstick",
"turquoise",
"lavender",
"jade",
"coral",
"azure",
"green",
"rum",
"orange",
"olive"
)
if (length(colors_for_plots) == 0) {
colors_for_plots <- c(
"indigo",
"carrot",
"lipstick",
"turquoise",
"lavender",
"jade",
"coral",
"azure",
"green",
"rum",
"orange",
"olive"
)
if (is.null(sample_id_colname)) {
sample_id_colname <- colnames(sample_metadata)[1]
}
colorval <- colorlist[colors_for_plots]
colorval <- unname(colorval) # remove names which affect ggplot
### PH: END Color Picking Function
if (is.null(feature_id_colname)) {
feature_id_colname <- colnames(counts_matrix)[1]
}
if (is.null(samples_to_include)) {
samples_to_include <- sample_metadata %>% dplyr::pull(sample_id_colname)
}
df.filt <- counts_matrix %>%
dplyr::select(tidyselect::all_of(samples_to_include))


## --------------- ##
## Main Code Block ##
## --------------- ##


##############################
#### Create Generic Row names
##############################
## create unique rownames to correctly add back Annocolumns at end of template

samples_to_include <- columns_to_include[columns_to_include %in% sample_metadata[, sample_id_colname, drop = T]]
anno_col <- columns_to_include[columns_to_include %in% sample_metadata[, sample_id_colname, drop = T] == F]


samples_to_include <- samples_to_include[samples_to_include != feature_id_colname]
samples_to_include <- samples_to_include[samples_to_include != "Gene"]
samples_to_include <- samples_to_include[samples_to_include != "GeneName"]

## create unique rownames to correctly add back Annocolumns at end of template
counts_matrix[, feature_id_colname] <- paste0(counts_matrix[, feature_id_colname], "_", 1:nrow(counts_matrix))


anno_col <- c(anno_col, feature_id_colname) %>% unique()
anno_tbl <- counts_matrix[, anno_col, drop = F] %>% as.data.frame()


df.filt <- counts_matrix[, samples_to_include]
gene_names <- NULL
gene_names$GeneID <- counts_matrix[, 1]

##############################
#### Input Data Validation
##############################
# TODO move this logic to S7 validator
sample_metadata <- sample_metadata[match(colnames(df.filt), sample_metadata[[sample_id_colname]]), ] # First match sample metadata to counts matrix
sample_metadata <- sample_metadata[rowSums(is.na(sample_metadata)) != ncol(sample_metadata), ] # Remove empty rows
sample_metadata <- sample_metadata[, colSums(is.na(sample_metadata)) == 0] # Remove empty columns
rownames(sample_metadata) <- sample_metadata[[sample_id_colname]]

df.filt <- df.filt[, match(sample_metadata[[sample_id_colname]], colnames(df.filt))] # Match counts matrix columns to sample metadata

gene_names$feature_id <- counts_matrix %>% dplyr::pull(feature_id_colname)

### PH: START Limma Normalization
##############################
Expand All @@ -166,8 +87,8 @@ normalize_counts <- function(moo,
x <- edgeR::DGEList(counts = df.filt, genes = gene_names)
}
v <- limma::voom(x, normalize = voom_normalization_method)
rownames(v$E) <- v$genes$GeneID
as.data.frame(v$E) %>% tibble::rownames_to_column(feature_id_colname) -> df.voom
rownames(v$E) <- v$genes$feature_id
df.voom <- as.data.frame(v$E) %>% tibble::rownames_to_column(feature_id_colname)
message(paste0("Total number of features included: ", nrow(df.voom)))
### PH: END Limma Normalization

Expand All @@ -177,7 +98,7 @@ normalize_counts <- function(moo,
samples_to_rename,
group_column,
label_column,
color_values = colorval,
color_values = colors_for_plots,
principal_component_on_x_axis = principal_component_on_x_axis,
principal_component_on_y_axis = principal_component_on_y_axis,
legend_position_for_pca = legend_position_for_pca,
Expand All @@ -194,14 +115,14 @@ normalize_counts <- function(moo,
feature_id_colname,
group_column,
label_column,
color_values = colorval,
color_values = colors_for_plots,
x_axis_label = "Normalized Counts"
)
if (plot_correlation_matrix_heatmap == TRUE) {
if (isTRUE(plot_correlation_matrix_heatmap)) {
corHM <- plot_heatmap(
counts_matrix = df.filt,
sample_metadata = sample_metadata,
anno_colors = colorval,
anno_colors = colors_for_plots,
anno_column = group_column,
label_column = label_column,
sample_id_colname = sample_id_colname
Expand All @@ -210,11 +131,6 @@ normalize_counts <- function(moo,

message("Sample columns")
message(colnames(df.voom)[!colnames(df.voom) %in% feature_id_colname])
message("Feature Columns")
message(colnames(anno_tbl))

df.voom <- merge(anno_tbl, df.voom, by = feature_id_colname, all.y = T)
df.voom[, feature_id_colname] <- gsub("_[0-9]+$", "", df.voom[, feature_id_colname])

if (isFALSE("norm" %in% names(moo@counts))) {
moo@counts[["norm"]] <- list()
Expand Down
124 changes: 116 additions & 8 deletions tests/testthat/test-normalize.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
test_that("normalize works", {
test_that("normalize works for NIDAP", {
moo <- multiOmicDataSet(
sample_meta_dat = as.data.frame(nidap_sample_metadata),
anno_dat = data.frame(),
Expand All @@ -8,17 +8,125 @@ test_that("normalize works", {
"filt" = as.data.frame(nidap_filtered_counts)
)
) %>%
normalize_counts(
feature_id_colname = "Gene",
columns_to_include = c("Gene", "A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2", "C3"),
sample_id_colname = "Sample",
group_column = "Group",
label_column = "Label"
)
normalize_counts(group_column = "Group", label_column = "Label")
expect_true(equal_dfs(
moo@counts[["norm"]][["voom"]] %>%
dplyr::arrange(desc(Gene)),
as.data.frame(nidap_norm_counts) %>%
dplyr::arrange(desc(Gene))
))
})

test_that("noramlize works for RENEE", {
moo <- create_multiOmicDataSet_from_dataframes(readr::read_tsv(
system.file("extdata", "sample_metadata.tsv.gz", package = "MOSuite")
), gene_counts) %>%
clean_raw_counts() %>%
filter_counts(
group_column = "condition",
label_column = "sample_id",
minimum_count_value_to_be_considered_nonzero = 1,
minimum_number_of_samples_with_nonzero_counts_in_total = 1,
minimum_number_of_samples_with_nonzero_counts_in_a_group = 1,
make_plots = FALSE
) %>%
normalize_counts(group_column = "condition", label_column = "sample_id")
expect_equal(
head(moo@counts$norm$voom),
structure(
list(
gene_id = c(
"ENSG00000215458.8",
"ENSG00000160179.18",
"ENSG00000258017.1",
"ENSG00000282393.1",
"ENSG00000286104.1",
"ENSG00000274422.1"
),
KO_S3 = c(
11.0751960068561,
9.6086338540783,
9.6086338540783,
8.81615260371772,
9.6086338540783,
8.81615260371772
),
KO_S4 = c(
12.3480907442867,
12.7703165561761,
8.81615260371772,
9.6086338540783,
8.81615260371772,
9.6086338540783
),
WT_S1 = c(
8.81615260371772,
12.3480907442867,
8.81615260371772,
8.81615260371772,
8.81615260371772,
8.81615260371772
),
WT_S2 = c(
10.0048744792586,
12.2369960496953,
8.81615260371772,
8.81615260371772,
8.81615260371772,
8.81615260371772
)
),
row.names = c(NA, 6L),
class = "data.frame"
)
)
expect_equal(
tail(moo@counts$norm$voom),
structure(
list(
gene_id = c(
"ENSG00000157538.14",
"ENSG00000160193.11",
"ENSG00000182093.15",
"ENSG00000182362.14",
"ENSG00000173276.14",
"ENSG00000237232.7"
),
KO_S3 = c(
12.3480907442867,
9.6086338540783,
11.8597009422769,
11.0751960068561,
11.8597009422769,
8.81615260371772
),
KO_S4 = c(
12.7703165561761,
9.6086338540783,
9.6086338540783,
8.81615260371772,
12.7703165561761,
9.6086338540783
),
WT_S1 = c(
12.2426956580003,
10.5853565029804,
11.7865266999202,
8.81615260371772,
11.7865266999202,
8.81615260371772
),
WT_S2 = c(
12.4720029602325,
10.9249210977479,
11.4357186116131,
8.81615260371772,
12.2369960496953,
8.81615260371772
)
),
row.names = 286:291,
class = "data.frame"
)
)
})
13 changes: 3 additions & 10 deletions vignettes/intro.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -34,26 +34,19 @@ moo <- create_multiOmicDataSet_from_files(
sample_meta_filepath = metadata_tsv,
feature_counts_filepath = gene_counts_tsv
)
# todo: obviate the need to pass feature & sample id colnames every time
moo <- moo %>%
clean_raw_counts(
feature_id_colname = "gene_id",
sample_id_colname = "sample_id",
) %>%
clean_raw_counts() %>%
filter_counts(
group_column = "condition",
label_column = "sample_id",
samples_to_include = c("KO_S3", "KO_S4", "WT_S1", "WT_S2"),
minimum_count_value_to_be_considered_nonzero = 1,
minimum_number_of_samples_with_nonzero_counts_in_total = 1,
minimum_number_of_samples_with_nonzero_counts_in_a_group = 1,
) %>%
normalize_counts(
feature_id_colname = "Gene",
sample_id_colname = "sample_id",
group_column = "condition",
label_column = "sample_id",
columns_to_include = c("Gene", "KO_S3", "KO_S4", "WT_S1", "WT_S2")
label_column = "sample_id"
)
moo@counts$norm$voom %>% head()
Expand Down

0 comments on commit f151366

Please sign in to comment.