From 193cbe7be6cce0a4c13e750938597166b4820e09 Mon Sep 17 00:00:00 2001 From: Phil Homan Date: Tue, 19 Nov 2024 11:21:03 -0500 Subject: [PATCH] feat: add clean raw counts code from NIDAP https://nidap.nih.gov/workspace/vector/view/ri.vector.main.workbook.db4b90af-0c04-4759-84ce-7f76bafe6b27\?branch\=Phil_MOWorkflow towards #73 --- R/clean.R | 386 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 386 insertions(+) create mode 100644 R/clean.R diff --git a/R/clean.R b/R/clean.R new file mode 100644 index 0000000..5dcfbe6 --- /dev/null +++ b/R/clean.R @@ -0,0 +1,386 @@ +# Clean Raw Counts [CCBR] (5453b016-53cf-44c8-b09b-7efda66543af): v82 +Clean_Raw_Counts_ForMOSuite <- function(raw_counts_ccbr_bulk_training, metadata_ccbr_bulk_training) { + library(stringr) + library(tidyr) + library(dplyr) + library(ggplot2) + + ## -------------------------------- ## + ## User-Defined Template Parameters ## + ## -------------------------------- ## + + # Basic Parameters: + raw_counts_matrix <- raw_counts_ccbr_bulk_training + sample_metadata <- metadata_ccbr_bulk_training + sample_names_column <- "Sample" + Data_type <- "Bulk RNAseq" + gene_id_column <- "GeneName" + samples_to_rename <- c("") + + # Advanced Parameters: + cleanup_column_names <- TRUE + split_gene_name <- TRUE + aggregate_rows_with_duplicate_gene_names <- TRUE + gene_name_column_to_use_for_collapsing_duplicates <- "" + + ################################## + ##### Remove version number from ENSEMBLE + ################################## + removeVersion <- function(ids) { + return(unlist(lapply(stringr::str_split(ids, "[.]"), "[[", 1))) + } + ### PH: START READ depth Plot + ################################## + ##### Sample Read Counts Plot + ################################## + + # Exclude the gene column from sample columns for counts + sample_cols <- raw_counts_matrix[, -1] + + # Sum of each sample column + column_sums <- colSums(sample_cols, na.rm = TRUE) + + # Create a data frame for plotting + sum_df <- data.frame( + sample_names = names(column_sums), + read_sums = column_sums + ) + + # Plotting + read_plot <- ggplot(sum_df, aes(x = sample_names, y = read_sums)) + + geom_bar(stat = "identity", fill = "blue") + + labs(title = "Total Reads per Sample", x = "Samples", y = "Read Count") + + theme_minimal() + + theme( + axis.text.x = element_text(angle = 45, hjust = 1, size = 14), + axis.text.y = element_text(size = 14), + axis.title = element_text(size = 16), + plot.title = element_text(size = 20) + ) + + ### PH: END READ depth Plot + + ### PH: START Sample Validation + ################################## + ##### Sample Name Validation + ################################## + + ## check if sample names are different between raw counts + ## and metadata tables + + check_sample_names <- function(counts, metadata) { + raw_count_names <- colnames(counts) + metadata_names <- metadata[, sample_names_column] + + different_names <- setdiff(metadata_names, raw_count_names) + if (length(different_names) > 0) { + stop( + "The following sample names are different in the metadata but not the raw counts: ", + paste(different_names, collapse = ",") + ) + } + } + + check_sample_names(raw_counts_matrix, sample_metadata) + + ################################## + ##### Sample Name Check for duplicated + ################################## + + ## duplicate col name + if (sum(duplicated(colnames(raw_counts_matrix))) != 0) { + print("Duplicate column names are not allowed, the following columns were duplicated.\n") + colnames(raw_counts_matrix)[duplicated(colnames(raw_counts_matrix))] + stop("Duplicated columns") + } + ### PH: END Sample Validation + + + ### PH: START Rename Samples + ################################## + ##### Manually rename samples + ################################## + + if (!is.null(samples_to_rename)) { + if (samples_to_rename != c("")) { + for (x in samples_to_rename) { + old <- strsplit(x, ": ?")[[1]][1] + new <- strsplit(x, ": ?")[[1]][2] + colnames(raw_counts_matrix)[colnames(raw_counts_matrix) %in% old] <- new + } + } + } + ### PH: END Rename Samples + + + ### PH: START Clean up Sample Name columns + ################################## + ##### Cleanup Columns + ################################## + ## Look for any non standard Characters or automatic formatting introduced when Table is read included + ## done after Rename step so that names do not automatically change before using names in Metadata. + + if (cleanup_column_names) { + cl_og <- colnames(raw_counts_matrix) + ## convert special charchers to _ + cl2 <- gsub("-| |\\:", "_", colnames(raw_counts_matrix)) + if (length(cl2[(cl2) != colnames(raw_counts_matrix)]) > 0) { + print("Columns had special characters relpaced with _ ") + # (colnames(raw_counts_matrix)[(colnames(raw_counts_matrix))!=cl2]) + # print(cl2[(cl2)!=colnames(raw_counts_matrix)]) + colnames(raw_counts_matrix) <- cl2 + } + + ## if names begin with number add X + cl2 <- sub("^(\\d)", "X\\1", colnames(raw_counts_matrix)) + if (length(cl2[(cl2) != colnames(raw_counts_matrix)]) > 0) { + print("Columns started with numbers and an X was added to colname :") + # (colnames(raw_counts_matrix)[(colnames(raw_counts_matrix))!=cl2]) + # print(cl2[(cl2)!=colnames(raw_counts_matrix)]) + colnames(raw_counts_matrix) <- cl2 + } + + print( + colnames(raw_counts_matrix)[!colnames(raw_counts_matrix) %in% gene_id_column] %>% as.data.frame(), + row.names = F + ) + # print("Final Colnames:") + } else { + ## invalid name format + if (any(make.names(colnames(raw_counts_matrix)) != colnames(raw_counts_matrix))) { + print("Error: The following counts matrix column names are not valid:\n") + print(colnames(raw_counts_matrix)[make.names(colnames(raw_counts_matrix)) != colnames(raw_counts_matrix)]) + print("Likely causes are columns starting with numbers or other special characters eg spaces.\n") + # stop("Bad column names.") + } + ## Names Contain dashes + if (sum(grepl("-", colnames(raw_counts_matrix))) != 0) { + print("The sample names cannot contain dashes.") + print(colnames(raw_counts_matrix)[grepl("-", colnames(raw_counts_matrix))]) + # stop("No dashes allowed in column names") + } + } + ### PH: END Clean up Sample Name columns + + + ### PH: START - Split Ensemble + Gene name + ## Identify and separate Gene Name Columns into multiple Gene Metadata columns + ################################## + ## Split Ensemble + Gene name + ################################## + ## First check if Feature ID column can be split by ",|_-:" + ## Then check if one column contains Ensemble (regex '^ENS[A-Z]+[0-9]+') + ## check if Ensemble ID has version info and remove version + ## If one column contains Ensemble ID Assume other column is Gene names + ## If Column does not contain Ensmeble ID name split columns Gene_ID_1 and Gene_ID_2 + + ## if split_Gene_name ==F then will rename gene_id_column column to either Gene(Bulk RNAseq) or FeatureID(Proteomics) + + print("") + + if (split_gene_name == T) { + Ensembl_ID <- str_split_fixed(raw_counts_matrix[, gene_id_column], "_|-|:|\\|", n = 2) %>% data.frame() + EnsCol <- apply(Ensembl_ID, c(1, 2), function(x) grepl("^ENS[A-Z]+[0-9]+", x)) + + + if ("" %in% Ensembl_ID[, 1] | "" %in% Ensembl_ID[, 2]) { + print(paste0("Not able to identify multiple id's in ", gene_id_column)) + # colnames(df)[colnames(df)%in%clm]=gene_col + if (Data_type == "Bulk RNAseq") { + colnames(raw_counts_matrix)[colnames(raw_counts_matrix) %in% gene_id_column] <- "Gene" + } else if (Data_type == "Proteomics") { + colnames(raw_counts_matrix)[colnames(raw_counts_matrix) %in% gene_id_column] <- "FeatureID" + } else { + print("incorrect Data Type") + incorrect_Data_Type + } + } else { + ## at least one column must have all ensemble ids found in EnsCol + if (nrow(EnsCol[EnsCol[, 1] == T, ]) == nrow(Ensembl_ID) | nrow(EnsCol[EnsCol[, 2] == T, ]) == nrow(Ensembl_ID)) { + if (Data_type == "Bulk RNAseq") { + colnames(Ensembl_ID)[colSums(EnsCol) != nrow(Ensembl_ID)] <- "Gene" + } else if (Data_type == "Proteomics") { + colnames(Ensembl_ID)[colSums(EnsCol) != nrow(Ensembl_ID)] <- "FeatureID" + } + ## check if Ensmble column has version information + if (grepl("^ENS[A-Z]+[0-9]+\\.[0-9]+$", Ensembl_ID[, colSums(EnsCol) == nrow(Ensembl_ID)]) %>% sum() == nrow(Ensembl_ID)) { + colnames(Ensembl_ID)[colSums(EnsCol) == nrow(Ensembl_ID)] <- "Ensembl_ID_version" + Ensembl_ID$Ensembl_ID <- removeVersion(Ensembl_ID$Ensembl_ID_version) + } else { + colnames(Ensembl_ID)[colSums(EnsCol) == nrow(Ensembl_ID)] <- "Ensembl_ID" + } + } else { + colnames(Ensembl_ID) <- c("Feature_id_1", "Feature_id_2") + print("Could not determine ID formats from split 'Feature ID' Column") + } + raw_counts_matrix <- cbind(Ensembl_ID, raw_counts_matrix[, !colnames(raw_counts_matrix) %in% gene_id_column]) + } + } else { + if (Data_type == "Bulk RNAseq") { + colnames(raw_counts_matrix)[colnames(raw_counts_matrix) %in% gene_id_column] <- "Gene" + } else if (Data_type == "Proteomics") { + colnames(raw_counts_matrix)[colnames(raw_counts_matrix) %in% gene_id_column] <- "FeatureID" + } else { + print("incorrect Data Type") + incorrect_Data_Type + } + } + ### PH: End - Split Ensemble + Gene name + + + ### PH: START - Aggregate duplicate gene names + ################################## + ## If duplicate gene aggregate information to single row + ################################## + + ## If user uses "Feature ID" column then switch to empty for appropriate behavior based on other parameters + if (gene_name_column_to_use_for_collapsing_duplicates == gene_id_column) { + gene_name_column_to_use_for_collapsing_duplicates <- "" + } + ## Use different Gene Names Column Name based on data Type + ## I think we should deprecate this and just stick with "FeatureID" + if (gene_name_column_to_use_for_collapsing_duplicates == "" & + ("Feature_id_1" %in% colnames(raw_counts_matrix)) == F) { + if (Data_type == "Bulk RNAseq") { + gene_name_column_to_use_for_collapsing_duplicates <- "Gene" + } else if (Data_type == "Proteomics") { + gene_name_column_to_use_for_collapsing_duplicates <- "FeatureID" + } + } + + ## Error Check if Column is Numeric + nums <- unlist(lapply(raw_counts_matrix, is.numeric)) + nums <- names(nums[nums]) + print("") + print("Columns that can be used to aggregate gene information") + print(raw_counts_matrix[, !names(raw_counts_matrix) %in% nums, drop = F] %>% colnames()) + + print("") + + ########## + ## This section will Print duplicate row names when Aggregation column is not Specified. + ## Purpose is to Identify Row Annotation columns and show user that rows may duplicated + ####### + ## Print what rows are duplicated in selected annotation column + ## if no column name given default to Gene(Bulk RNAseq) or FeatureID(Proteomics) + ## Options: 1 Use default RowName for Data_type + ## 2 Use default Row name when Split Gene name recognizes Annotation name type + ## 3 show duplicate rows for all row annotations columns when Split Gene name does not recognize Annotation name type + if (gene_name_column_to_use_for_collapsing_duplicates == "") { + if (split_gene_name == F) { + ## If no Column name given for Aggregation then display Feature ID duplicates + print(paste0("genes with duplicate IDs in ", gene_id_column, ":")) + + ## Print original Column name for user Reference then use new Column name to subset table + if (Data_type == "Bulk RNAseq") { + gene_id_column <- "Gene" + } else if (Data_type == "Proteomics") { + gene_id_column <- "FeatureID" + } + raw_counts_matrix[duplicated(raw_counts_matrix[, gene_id_column]), gene_id_column] %>% + unique() %>% + as.character() %>% + write(stdout()) + + ## if Gene Name column is split then select Column Names generated from "Split Ensemble + Gene name" + ## Raw If Feature_id_1 is generated it means that "Split Ensemble + Gene name" could not recognize Gene name format (EnsembleID or GeneName) + ## and so default is to identify duplicicates in Feature_id_1 column + } else if (split_gene_name == T & grepl("Feature_id_1", colnames(raw_counts_matrix)) == F) { + if (Data_type == "Bulk RNAseq") { + gene_id_column <- "Gene" + } else if (Data_type == "Proteomics") { + gene_id_column <- "FeatureID" + } + print(paste0("genes with duplicate IDs in ", gene_id_column, ":")) + + raw_counts_matrix[duplicated(raw_counts_matrix[, gene_name_column_to_use_for_collapsing_duplicates]), gene_name_column_to_use_for_collapsing_duplicates] %>% + unique() %>% + as.character() %>% + write(stdout()) + } else if (split_gene_name == T & grepl("Feature_id_1", colnames(raw_counts_matrix)) == T) { + print(paste0("genes with duplicate IDs in ", "Feature_id_1", ":")) + + raw_counts_matrix[duplicated(raw_counts_matrix[, "Feature_id_1"]), "Feature_id_1"] %>% + unique() %>% + as.character() %>% + write(stdout()) + + print(paste0("genes with duplicate IDs in ", "Feature_id_2", ":")) + + raw_counts_matrix[duplicated(raw_counts_matrix[, "Feature_id_2"]), "Feature_id_2"] %>% + unique() %>% + as.character() %>% + write(stdout()) + } + } + + + + ########## + ## This section Aggregates duplicate Row names based on selected Annotation Column name + ####### + + + if (aggregate_rows_with_duplicate_gene_names == TRUE) { + print("Aggregating the counts for the same ID in different chromosome locations.") + print("Column used to Aggregate duplicate IDs: ") + print(gene_name_column_to_use_for_collapsing_duplicates) + print("Number of rows before Collapse: ") + print(nrow(raw_counts_matrix)) + + if (sum(duplicated(raw_counts_matrix[, gene_name_column_to_use_for_collapsing_duplicates])) != 0) { + print("") + print("Duplicate IDs: ") + print(raw_counts_matrix[duplicated(raw_counts_matrix[, gene_name_column_to_use_for_collapsing_duplicates]), gene_name_column_to_use_for_collapsing_duplicates] %>% as.character() %>% unique()) + + dfagg <- raw_counts_matrix[, c(gene_name_column_to_use_for_collapsing_duplicates, nums)] %>% + group_by_at(gene_name_column_to_use_for_collapsing_duplicates) %>% + summarise_all(sum) + + if (ncol(raw_counts_matrix[, !names(raw_counts_matrix) %in% nums, drop = FALSE]) > 1) { + ## collapse non-numeric columns + dfagg2 <- raw_counts_matrix[, !names(raw_counts_matrix) %in% nums] %>% + group_by_at(gene_name_column_to_use_for_collapsing_duplicates) %>% + summarise_all(paste, collapse = ",") + + dfagg <- merge(dfagg2, dfagg, by = eval(gene_name_column_to_use_for_collapsing_duplicates), sort = F) %>% as.data.frame() + } + dfout <- dfagg + print("Number of rows after Collapse: ") + print(nrow(dfout)) + } else { + print(paste0("no duplicated IDs in ", gene_name_column_to_use_for_collapsing_duplicates)) + dfout <- raw_counts_matrix + } + } else { + if (gene_name_column_to_use_for_collapsing_duplicates != "") { + print("") + print(paste0("Duplicate IDs in ", gene_name_column_to_use_for_collapsing_duplicates, " Column:")) + print(raw_counts_matrix[duplicated(raw_counts_matrix[, gene_name_column_to_use_for_collapsing_duplicates]), gene_name_column_to_use_for_collapsing_duplicates] %>% as.character() %>% unique()) + } + + print("") + print(paste0("If you desire to Aggregate row feature information select appropriate Column to use for collapsing duplicates")) + + dfout <- raw_counts_matrix + } + + print(Data_type) + print(read_plot) + + return(dfout) +} + +### PH: START - Aggregate duplicate gene names + + + +################################################# +## Global imports and functions included below ## +################################################# + +# Functions defined here will be available to call in +# the code for any table. + +# install_bioconductor_package <- function(pkg) { +# install.packages(paste0("https://gypsum.palantircloud.com/assets/dyn/bioconductor-packages/", pkg, ".tar.gz"), repos=NULL)