From 2c5cdf9cb6dff8685dbbf538ba36e8e62e5bf574 Mon Sep 17 00:00:00 2001 From: AtaJadidAhari Date: Mon, 13 Jan 2025 13:38:23 +0100 Subject: [PATCH] add updateSeqlevelsStyle & tests --- R/countRNAseqData.R | 10 +++- R/helper-functions.R | 62 ++++++++++++++++++++++ tests/testthat/test_updateSeqlevelsStyle.R | 45 ++++++++++++++++ 3 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 tests/testthat/test_updateSeqlevelsStyle.R diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R index d228e49d..3e329616 100644 --- a/R/countRNAseqData.R +++ b/R/countRNAseqData.R @@ -508,7 +508,15 @@ countSplitReads <- function(sampleID, fds, NcpuPerSample=1, genome=NULL, if(is.character(genome)){ genome <- getBSgenome(genome) } - seqlevelsStyle(genome) <- seqlevelsStyle(chromosomes)[1] + tryCatch( + { + seqlevelsStyle(genome) <- seqlevelsStyle(chromosomes)[1] + } + error = function(e) { + warning("Could not update genome's seqlevelsStyle using GenomeInfoDb package. Updating manually now...") + genome <- updateSeqlevelsStyle(genome, metadata(genome)$genome, seqlevelsStyle(chromosomes)[1], metadata(genome)$provider, fds@workingDir) + } + ) chrLengths <- extractChromosomeLengths(bamfile) mismatchChrs <- which( seqlengths(genome)[chromosomes] != chrLengths[chromosomes]) diff --git a/R/helper-functions.R b/R/helper-functions.R index d646f2c7..0e47fb98 100644 --- a/R/helper-functions.R +++ b/R/helper-functions.R @@ -700,3 +700,65 @@ checkForAndCreateDir <- function(object, dir){ } return(TRUE) } + +updateSeqlevelsStyle <- function(bsgenome, genome_assembly, new_style, old_style, conversion_dict_path){ + if (genome_assembly == "hg19" || genome_assembly == "hs37d5") { + conversion_dict_path <- file.path(conversion_dict_path, "hg19_NCBI2UCSC.txt") + if (!file.exists(conversion_dict_path)) { + file_url <- "https://www.cmm.in.tum.de/public/paper/FRASER/hg19_NCBI2UCSC.txt" + message(paste0("Downloading seqlevelStyle conversion dict: "), file_url) + + response <- GET(file_url, write_disk(conversion_dict_path, overwrite = TRUE)) + + if (response$status_code == 200) { + assembly_report <- fread(conversion_dict_path) + } + else { + stop(paste0("Failed to download the file. Status code:", response$status_code, + ".\nPlease download the file manually from ", file_url, " to the following directory: ", conversion_dict_path)) + } + } + assembly_report <- fread(conversion_dict_path) + } + else if (genome_assembly == "hg38" || genome_assembly == "GRCh38") { + conversion_dict_path <- file.path(conversion_dict_path, "hg38_NCBI2UCSC.txt") + if (!file.exists(conversion_dict_path)) { + file_url <- "https://www.cmm.in.tum.de/public/paper/FRASER/hg38_NCBI2UCSC.txt" + message(paste0("Downloading seqlevelStyle conversion dict: "), file_url) + + response <- GET(file_url, write_disk(conversion_dict_path, overwrite = TRUE)) + + if (response$status_code == 200) { + assembly_report <- fread(conversion_dict_path) + } else { + stop(paste0("Failed to download the file. Status code:", response$status_code, + ".\nPlease download the file manually from ", file_url, " to the following directory: ", conversion_dict_path)) + } + } + assembly_report <- fread(conversion_dict_path) + } + + if (new_style == "UCSC") { + assembly_report <- assembly_report[NCBI_Genome_Name != ""] + mapping <- setNames(assembly_report$`UCSC.style.name`, assembly_report$`Sequence.Name`) + } + else { # new_style == "NCBI" + mapping <- setNames(assembly_report$`Sequence.Name`, assembly_report$`UCSC.style.name`) + } + + mapping <- mapping[names(mapping) != "na" & mapping != "na"] + new_seqnames <- unname(mapping[seqnames(bsgenome)]) + + seqnames(bsgenome) <- new_seqnames + + if (new_style == "UCSC" & old_style == "NCBI"){ + seqinfo(bsgenome)@genome <- assembly_report[!(is.na(NCBI_Genome_Name) | NCBI_Genome_Name == ""), NCBI_Genome_Name] + } + else if (new_style == "NCBI" & old_style=="UCSC"){ # new_style == "NCBI" + seqinfo(bsgenome)@genome <- assembly_report[!(is.na(UCSC_Genome_Name) | UCSC_Genome_Name == ""), UCSC_Genome_Name] + } + + seqlevelsStyle(bsgenome) <- new_style + + return (bsgenome) +} diff --git a/tests/testthat/test_updateSeqlevelsStyle.R b/tests/testthat/test_updateSeqlevelsStyle.R new file mode 100644 index 00000000..e5d5477b --- /dev/null +++ b/tests/testthat/test_updateSeqlevelsStyle.R @@ -0,0 +1,45 @@ +context("Test updateSeqlevelsStyle function") + +test_that("hg38, UCSC to NCBI", { + genome <- getBSgenome("hg38") + bsgenome <- genome + + seqlevelsStyle(bsgenome) <- "NCBI" + genome <- updateSeqlevelsStyle(genome, metadata(genome)$genome, "NCBI", metadata(genome)$provider, "./") + + expect_equal(seqnames(genome), seqnames(bsgenome)) + +}) + +test_that("hg38, NCBI to UCSC", { + genome <- getBSgenome("GRCh38") + bsgenome <- genome + + seqlevelsStyle(bsgenome) <- "UCSC" + genome <- updateSeqlevelsStyle(genome, metadata(genome)$genome, "UCSC", metadata(genome)$provider, "./") + + expect_equal(seqnames(genome), seqnames(bsgenome)) + +}) + +test_that("hg19, NCBI to UCSC", { + genome <- getBSgenome("hs37d5") + bsgenome <- genome + + seqlevelsStyle(bsgenome) <- "UCSC" + genome <- updateSeqlevelsStyle(genome, metadata(genome)$genome, "UCSC", metadata(genome)$provider, "./") + + expect_equal(seqnames(genome), seqnames(bsgenome)) + +}) + +test_that("hg19, UCSC to NCBI", { + genome <- getBSgenome("hg19") + bsgenome <- genome + + seqlevelsStyle(bsgenome) <- "NCBI" + genome <- updateSeqlevelsStyle(genome, metadata(genome)$genome, "NCBI", metadata(genome)$provider, "./") + + expect_equal(seqnames(genome), seqnames(bsgenome)) + +})