ericscript.pl DownloadDB.R and RetrieveRefId.R were updated

smsrts · May 2, 2020 · e235f5d · e235f5d
1 parent 01dcff8
commit e235f5d
Show file tree

Hide file tree

Showing 28 changed files with 4,088 additions and 0 deletions.
diff --git a/ericscript.pl b/ericscript.pl
diff --git a/lib/R/BuildExonUnionModel.R b/lib/R/BuildExonUnionModel.R
@@ -0,0 +1,93 @@
+vars.tmp <- commandArgs()
+vars <- vars.tmp[length(vars.tmp)]
+split.vars <- unlist(strsplit(vars, ","))
+ericscriptfolder <- split.vars [1]
+refid <- split.vars[2]
+dbfolder <- split.vars [3]
+tmpfolder <- split.vars [4]
+
+
+formatfasta <- function(myfasta, step = 50) {  
+  totalchar <- nchar(myfasta)
+  if (totalchar > step) {
+    steps <- seq(1, totalchar, by = step)
+    newfasta <- rep("", (length(steps) - 1))
+    for (j in 1: (length(steps) - 1)) {
+      aa <- substr(myfasta, steps[j], (steps[j] + (step - 1)))
+      newfasta[j] <- aa 
+    }
+    if ((totalchar - tail(steps, n = 1)) > 0) {
+      newfasta <-  c(newfasta, substr(myfasta, steps[j+1], totalchar))
+    }
+  } else
+  {
+    newfasta <- substr(myfasta, 1, totalchar)
+  }
+  return(newfasta)
+}
+
+convertToComplement <- function(x) {
+
+  bases <- c("A", "C", "G", "T")
+  #xx <- unlist(strsplit(toupper(x), NULL))
+  xx <- rev(unlist(strsplit(toupper(x), NULL)))
+  paste(unlist(lapply(xx, function(bbb) {
+    if (bbb=="A") compString <- "T"
+    if (bbb=="C") compString <- "G"
+    if (bbb=="G") compString <- "C"
+    if (bbb=="T") compString <- "A"
+    if (!bbb %in% bases) compString <- "N"
+    return(compString)
+  })), collapse="")
+
+}
+
+refid.folder <- file.path(dbfolder, "data", refid)
+if (file.exists(refid.folder) == F) {
+  dir.create(refid.folder)
+}
+x <- scan(file.path(tmpfolder, "subseq.fa"), what = "", quiet = T)
+x.bed <- read.delim(file.path(tmpfolder, "exonstartend.mrg.txt"), sep = "\t", header = F)
+refid.bed <- paste(as.character(x.bed[[1]]), paste((as.numeric(as.character(x.bed[[2]])) + 1), as.character(x.bed[[3]]), sep = "-"), sep = ":")
+tmp <- grep(">", x)
+genomicreg <- substr(x[tmp], 2, nchar(x[tmp]))
+sequences.tmp <- rep("", length(tmp))
+for (i in 1: (length(tmp) - 1)) { 
+  sequences.tmp[i] <- gsub(", ", "", toString(x[(tmp[i] + 1):(tmp[i+1] - 1)]))
+}
+sequences.tmp[length(tmp)] <- gsub(", ", "", toString(x[(tmp[length(tmp)] + 1): length(x)]))
+genenames.tmp <- as.character(x.bed[[4]])
+unique.genenames <- unique(genenames.tmp)
+strand.tmp <- read.delim(file.path(tmpfolder, "strand.txt"), sep = "\t", header = F)
+strand <- strand.tmp[[2]]
+sequences <- rep("", length(unique.genenames))
+for (i in 1: length(unique.genenames)) {
+    genenames1 <- paste(">", unique.genenames[i], sep = "")
+    ix.gene <- which(genenames.tmp == unique.genenames[i])
+    ix.refid <- which(genomicreg %in% refid.bed[ix.gene])
+    if (strand[i] == "-1") {
+      seqtmp0 <- gsub(", ", "", toString(sequences.tmp[ix.refid]))
+      sequences[i] <- convertToComplement(seqtmp0)
+    } else {
+      sequences[i] <- gsub(", ", "", toString(sequences.tmp[ix.refid]))
+    }
+    if (nchar(sequences[i])  == 0) {
+      sequences[i] <- "NNNNNNN"
+    }
+
+    if (i == 1) {
+      cat(genenames1, file = file.path(refid.folder, "EnsemblGene.Reference.fa"), append = F, sep = "\n")
+    } else {
+      cat(genenames1, file = file.path(refid.folder, "EnsemblGene.Reference.fa"), append = T, sep = "\n")
+    }
+    cat(formatfasta(sequences[i]), file = file.path(refid.folder, "EnsemblGene.Reference.fa"), append = T, sep = "\n")  
+}
+ix.emptyseq <- which(nchar(sequences) == 0) 
+GeneNames <- unique.genenames
+if (length(ix.emptyseq) > 0) {
+GeneNames <- GeneNames[-ix.emptyseq]
+sequences <- sequences[-ix.emptyseq]
+}
+save(GeneNames, file = file.path(refid.folder, "EnsemblGene.GeneNames.RData"))
+save(sequences, file = file.path(refid.folder, "EnsemblGene.Sequences.RData"))
+
diff --git a/lib/R/BuildFasta.R b/lib/R/BuildFasta.R
@@ -0,0 +1,101 @@
+vars.tmp <- commandArgs()
+vars <- vars.tmp[length(vars.tmp)]
+split.vars <- unlist(strsplit(vars, ","))
+samplename <- split.vars [1]
+outputfolder <- split.vars[2]
+ericscriptfolder <- split.vars[3]
+readlength <- max(as.numeric(split.vars[4]))
+refid <- as.character(split.vars[5])
+dbfolder <- as.character(split.vars[6])
+
+
+formatfasta <- function(myfasta, step = 50) {
+
+	totalchar <- nchar(myfasta)
+	if (totalchar > step) {
+		steps <- seq(1, totalchar, by = step)
+		newfasta <- rep("", (length(steps) - 1))
+		for (j in 1: (length(steps) - 1)) {
+			aa <- substr(myfasta, steps[j], (steps[j] + (step - 1)))
+			newfasta[j] <- aa 
+		}
+		if ((totalchar - tail(steps, n = 1)) > 0) {
+			newfasta <-  c(newfasta, substr(myfasta, steps[j+1], totalchar))
+		}
+	} else
+	{
+		newfasta <- substr(myfasta, 1, totalchar)
+	}
+	return(newfasta)
+}
+
+
+load(file.path(outputfolder,"out", paste(samplename,".chimeric.RData", sep = "")))
+load(file.path(dbfolder, "data", refid, "EnsemblGene.GeneNames.RData"))
+load(file.path(dbfolder, "data", refid, "EnsemblGene.Sequences.RData"))
+load(file.path(dbfolder, "data", refid, "EnsemblGene.Structures.RData"))
+load(file.path(outputfolder, "out", paste(samplename,".chimeric.RData", sep = "")))
+id1 <- MyGF$id1
+id2 <- MyGF$id2
+junctions <- rep(NA, length(id1))
+ids_fasta <- rep("", length(id1))
+sequences.fasta <- rep("", length(id1))
+fasta.file <- c()
+maxgap <- 300
+for (i in 1: length(id1)) {
+	ix.genetable1 <- which(EnsemblGene.Structures$EnsemblGene == id1[i])
+	ix.genetable2 <- which(EnsemblGene.Structures$EnsemblGene == id2[i])
+	ix.gene1 <- which(GeneNames == id1[i])
+	ix.gene2 <- which(GeneNames == id2[i])
+	min.pos1 <- min(MyGF$pos1[[i]]) - 2*readlength
+	max.pos1 <- max(MyGF$pos1[[i]]) + readlength - 1
+	if (min.pos1 < 1) {min.pos1 <- 1}
+	min.pos2 <- min(MyGF$pos2[[i]])
+	max.pos2 <- max(MyGF$pos2[[i]]) + 2*readlength
+	a <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonStart[ix.genetable1]), ",")))
+	b <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonEnd[ix.genetable1]), ",")))
+	strand1 <- as.character(EnsemblGene.Structures$Strand[ix.genetable1])
+	if (strand1 == "+") {
+		tmp.sum1 <- cumsum((b - a ))
+	} else {
+		tmp.sum1 <- cumsum(rev(b - a))		
+	}
+	exonenumber1 <- which(tmp.sum1 >= max.pos1)[1]
+	if (is.na(exonenumber1)) {exonenumber1 <- length(tmp.sum1)}
+	a2 <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonStart[ix.genetable2]), ",")))
+	b2 <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonEnd[ix.genetable2]), ",")))
+	strand2 <- as.character(EnsemblGene.Structures$Strand[ix.genetable2])
+	if (strand2 == "+") {
+		tmp.sum2 <- cumsum((b2 - a2))
+	} else {
+		tmp.sum2 <- cumsum(rev(b2 - a2))
+	}
+	exonenumber2 <- which(tmp.sum2 >= min.pos2)[1]
+	if (is.na(exonenumber2)) {exonenumber2 <- length(tmp.sum2)}
+	id.gf1 <- paste(id1[i], exonenumber1, sep = "_")
+	fasta.gf1.tmp0 <- sequences[ix.gene1]
+	start.end.exons <- c(0,tmp.sum1)
+	fasta.gf1 <- substr(fasta.gf1.tmp0, min.pos1, (max.pos1 + maxgap - 1))	
+	id.gf2 <- paste(id2[i], exonenumber2, sep = "_")
+	fasta.gf2.tmp0 <- sequences[ix.gene2]
+	if (max.pos2 > nchar(fasta.gf2.tmp0)) {max.pos2 <- nchar(fasta.gf2.tmp0)}
+	start.end.exons <- c(0,tmp.sum2)
+	fasta.gf2 <- substr(fasta.gf2.tmp0, (min.pos2 - maxgap), max.pos2)
+	id.fastaGF <- paste(">",id.gf1,"----",id.gf2," junction@",nchar(fasta.gf1),sep = "")
+	sequences.fasta[i] <- paste(fasta.gf1, fasta.gf2, sep = "")
+	fasta.gf12 <- formatfasta(sequences.fasta[i])
+	ids_fasta[i] <- paste(id.gf1,id.gf2, sep = "----")
+	junctions[i] <- nchar(fasta.gf1)
+	fastaGF <- c(id.fastaGF, fasta.gf12)
+	fasta.file <- c(fasta.file, fastaGF)
+}
+save(junctions, file = file.path(outputfolder, "out", paste(samplename,".junctions.RData", sep = "")))
+save(sequences.fasta, file = file.path(outputfolder, "out", paste(samplename,".sequences_fasta.RData", sep = "")))
+save(ids_fasta, file = file.path(outputfolder, "out",  paste(samplename, ".ids_fasta.RData", sep = "")))
+cat(fasta.file, file = file.path(outputfolder,"out", paste(samplename,".EricScript.junctions.fa",sep = "")), sep = "\n")
+
+
+
+
+
+
diff --git a/lib/R/BuildNeighbourhoodSequences.R b/lib/R/BuildNeighbourhoodSequences.R
@@ -0,0 +1,37 @@
+vars.tmp <- commandArgs()
+vars <- vars.tmp[length(vars.tmp)]
+split.vars <- unlist(strsplit(vars, ","))
+samplename <- split.vars [1]
+outputfolder <- split.vars[2]
+z <- read.delim(file.path(outputfolder,"out",paste(samplename,".intervals.pileup", sep = "")), sep = "\t", header = F)
+load(file.path(outputfolder,"out",paste(samplename,".ids_filtered.RData", sep = "")))
+load(file.path(outputfolder,"out",paste(samplename,".junctions.recalibrated.RData", sep = "")))
+load(file.path(outputfolder, "out",  paste(samplename, ".ids_fasta.RData", sep = "")))
+id.pileup <- as.character(z[,1])
+pos.pileup <-  as.numeric(as.character(z[,2]))
+sequence.pileup <- as.character(z[,3])
+unique.ids.pileup <- unique(id.pileup)
+width <- 100
+fasta.file <-  c()
+for (i in 1:length(id.filtered)) {
+	ix.id <- which(id.pileup == id.filtered[i])
+	ix.ref <- which(ids_fasta == id.filtered[i])
+	junction <- junctions.recalibrated[ix.ref]
+	ix.id.pileup <- which(id.pileup == id.filtered[i])
+	ix.junction1 <- which(pos.pileup[ix.id.pileup] == junction)
+	ix.junction2 <- which(pos.pileup[ix.id.pileup] == (junction + 1))
+	seq.vec <- rep("N", width)
+	pos.seq <- seq.int((junction-(width/2-1)), (junction + (width/2)))
+	ix.pos.tmp <- which(pos.seq %in% pos.pileup[ix.id.pileup])
+	seq.vec[ix.pos.tmp] <- sequence.pileup[ix.id.pileup]
+	if ((length(ix.junction1)!=0) & (length(ix.junction2)!=0)) {
+		query.sequence <- character(length = 1)
+		for (ii in 1:length(seq.vec)) {
+			query.sequence <- paste(query.sequence, seq.vec[ii], sep = "")
+		}
+		ids_fasta_query <-  paste(">", id.filtered[i],sep = "")
+		fasta.file <-  c(fasta.file, c(ids_fasta_query, query.sequence))
+	}
+}
+cat(fasta.file, sep = "\n", file = file.path(outputfolder, "out",paste(samplename,".checkselfhomology.fa", sep = "")))
+cat(file.path(outputfolder, "out", paste(samplename,".checkselfhomology.fa", sep = "")), file = file.path(outputfolder, "out", ".link"))