Skip to content

Commit

Permalink
ericscript.pl DownloadDB.R and RetrieveRefId.R were updated
Browse files Browse the repository at this point in the history
  • Loading branch information
smsrts committed May 2, 2020
1 parent 01dcff8 commit e235f5d
Show file tree
Hide file tree
Showing 28 changed files with 4,088 additions and 0 deletions.
557 changes: 557 additions & 0 deletions ericscript.pl

Large diffs are not rendered by default.

93 changes: 93 additions & 0 deletions lib/R/BuildExonUnionModel.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
vars.tmp <- commandArgs()
vars <- vars.tmp[length(vars.tmp)]
split.vars <- unlist(strsplit(vars, ","))
ericscriptfolder <- split.vars [1]
refid <- split.vars[2]
dbfolder <- split.vars [3]
tmpfolder <- split.vars [4]


formatfasta <- function(myfasta, step = 50) {
totalchar <- nchar(myfasta)
if (totalchar > step) {
steps <- seq(1, totalchar, by = step)
newfasta <- rep("", (length(steps) - 1))
for (j in 1: (length(steps) - 1)) {
aa <- substr(myfasta, steps[j], (steps[j] + (step - 1)))
newfasta[j] <- aa
}
if ((totalchar - tail(steps, n = 1)) > 0) {
newfasta <- c(newfasta, substr(myfasta, steps[j+1], totalchar))
}
} else
{
newfasta <- substr(myfasta, 1, totalchar)
}
return(newfasta)
}

convertToComplement <- function(x) {

bases <- c("A", "C", "G", "T")
#xx <- unlist(strsplit(toupper(x), NULL))
xx <- rev(unlist(strsplit(toupper(x), NULL)))
paste(unlist(lapply(xx, function(bbb) {
if (bbb=="A") compString <- "T"
if (bbb=="C") compString <- "G"
if (bbb=="G") compString <- "C"
if (bbb=="T") compString <- "A"
if (!bbb %in% bases) compString <- "N"
return(compString)
})), collapse="")

}

refid.folder <- file.path(dbfolder, "data", refid)
if (file.exists(refid.folder) == F) {
dir.create(refid.folder)
}
x <- scan(file.path(tmpfolder, "subseq.fa"), what = "", quiet = T)
x.bed <- read.delim(file.path(tmpfolder, "exonstartend.mrg.txt"), sep = "\t", header = F)
refid.bed <- paste(as.character(x.bed[[1]]), paste((as.numeric(as.character(x.bed[[2]])) + 1), as.character(x.bed[[3]]), sep = "-"), sep = ":")
tmp <- grep(">", x)
genomicreg <- substr(x[tmp], 2, nchar(x[tmp]))
sequences.tmp <- rep("", length(tmp))
for (i in 1: (length(tmp) - 1)) {
sequences.tmp[i] <- gsub(", ", "", toString(x[(tmp[i] + 1):(tmp[i+1] - 1)]))
}
sequences.tmp[length(tmp)] <- gsub(", ", "", toString(x[(tmp[length(tmp)] + 1): length(x)]))
genenames.tmp <- as.character(x.bed[[4]])
unique.genenames <- unique(genenames.tmp)
strand.tmp <- read.delim(file.path(tmpfolder, "strand.txt"), sep = "\t", header = F)
strand <- strand.tmp[[2]]
sequences <- rep("", length(unique.genenames))
for (i in 1: length(unique.genenames)) {
genenames1 <- paste(">", unique.genenames[i], sep = "")
ix.gene <- which(genenames.tmp == unique.genenames[i])
ix.refid <- which(genomicreg %in% refid.bed[ix.gene])
if (strand[i] == "-1") {
seqtmp0 <- gsub(", ", "", toString(sequences.tmp[ix.refid]))
sequences[i] <- convertToComplement(seqtmp0)
} else {
sequences[i] <- gsub(", ", "", toString(sequences.tmp[ix.refid]))
}
if (nchar(sequences[i]) == 0) {
sequences[i] <- "NNNNNNN"
}

if (i == 1) {
cat(genenames1, file = file.path(refid.folder, "EnsemblGene.Reference.fa"), append = F, sep = "\n")
} else {
cat(genenames1, file = file.path(refid.folder, "EnsemblGene.Reference.fa"), append = T, sep = "\n")
}
cat(formatfasta(sequences[i]), file = file.path(refid.folder, "EnsemblGene.Reference.fa"), append = T, sep = "\n")
}
ix.emptyseq <- which(nchar(sequences) == 0)
GeneNames <- unique.genenames
if (length(ix.emptyseq) > 0) {
GeneNames <- GeneNames[-ix.emptyseq]
sequences <- sequences[-ix.emptyseq]
}
save(GeneNames, file = file.path(refid.folder, "EnsemblGene.GeneNames.RData"))
save(sequences, file = file.path(refid.folder, "EnsemblGene.Sequences.RData"))

101 changes: 101 additions & 0 deletions lib/R/BuildFasta.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
vars.tmp <- commandArgs()
vars <- vars.tmp[length(vars.tmp)]
split.vars <- unlist(strsplit(vars, ","))
samplename <- split.vars [1]
outputfolder <- split.vars[2]
ericscriptfolder <- split.vars[3]
readlength <- max(as.numeric(split.vars[4]))
refid <- as.character(split.vars[5])
dbfolder <- as.character(split.vars[6])


formatfasta <- function(myfasta, step = 50) {

totalchar <- nchar(myfasta)
if (totalchar > step) {
steps <- seq(1, totalchar, by = step)
newfasta <- rep("", (length(steps) - 1))
for (j in 1: (length(steps) - 1)) {
aa <- substr(myfasta, steps[j], (steps[j] + (step - 1)))
newfasta[j] <- aa
}
if ((totalchar - tail(steps, n = 1)) > 0) {
newfasta <- c(newfasta, substr(myfasta, steps[j+1], totalchar))
}
} else
{
newfasta <- substr(myfasta, 1, totalchar)
}
return(newfasta)
}


load(file.path(outputfolder,"out", paste(samplename,".chimeric.RData", sep = "")))
load(file.path(dbfolder, "data", refid, "EnsemblGene.GeneNames.RData"))
load(file.path(dbfolder, "data", refid, "EnsemblGene.Sequences.RData"))
load(file.path(dbfolder, "data", refid, "EnsemblGene.Structures.RData"))
load(file.path(outputfolder, "out", paste(samplename,".chimeric.RData", sep = "")))
id1 <- MyGF$id1
id2 <- MyGF$id2
junctions <- rep(NA, length(id1))
ids_fasta <- rep("", length(id1))
sequences.fasta <- rep("", length(id1))
fasta.file <- c()
maxgap <- 300
for (i in 1: length(id1)) {
ix.genetable1 <- which(EnsemblGene.Structures$EnsemblGene == id1[i])
ix.genetable2 <- which(EnsemblGene.Structures$EnsemblGene == id2[i])
ix.gene1 <- which(GeneNames == id1[i])
ix.gene2 <- which(GeneNames == id2[i])
min.pos1 <- min(MyGF$pos1[[i]]) - 2*readlength
max.pos1 <- max(MyGF$pos1[[i]]) + readlength - 1
if (min.pos1 < 1) {min.pos1 <- 1}
min.pos2 <- min(MyGF$pos2[[i]])
max.pos2 <- max(MyGF$pos2[[i]]) + 2*readlength
a <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonStart[ix.genetable1]), ",")))
b <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonEnd[ix.genetable1]), ",")))
strand1 <- as.character(EnsemblGene.Structures$Strand[ix.genetable1])
if (strand1 == "+") {
tmp.sum1 <- cumsum((b - a ))
} else {
tmp.sum1 <- cumsum(rev(b - a))
}
exonenumber1 <- which(tmp.sum1 >= max.pos1)[1]
if (is.na(exonenumber1)) {exonenumber1 <- length(tmp.sum1)}
a2 <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonStart[ix.genetable2]), ",")))
b2 <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonEnd[ix.genetable2]), ",")))
strand2 <- as.character(EnsemblGene.Structures$Strand[ix.genetable2])
if (strand2 == "+") {
tmp.sum2 <- cumsum((b2 - a2))
} else {
tmp.sum2 <- cumsum(rev(b2 - a2))
}
exonenumber2 <- which(tmp.sum2 >= min.pos2)[1]
if (is.na(exonenumber2)) {exonenumber2 <- length(tmp.sum2)}
id.gf1 <- paste(id1[i], exonenumber1, sep = "_")
fasta.gf1.tmp0 <- sequences[ix.gene1]
start.end.exons <- c(0,tmp.sum1)
fasta.gf1 <- substr(fasta.gf1.tmp0, min.pos1, (max.pos1 + maxgap - 1))
id.gf2 <- paste(id2[i], exonenumber2, sep = "_")
fasta.gf2.tmp0 <- sequences[ix.gene2]
if (max.pos2 > nchar(fasta.gf2.tmp0)) {max.pos2 <- nchar(fasta.gf2.tmp0)}
start.end.exons <- c(0,tmp.sum2)
fasta.gf2 <- substr(fasta.gf2.tmp0, (min.pos2 - maxgap), max.pos2)
id.fastaGF <- paste(">",id.gf1,"----",id.gf2," junction@",nchar(fasta.gf1),sep = "")
sequences.fasta[i] <- paste(fasta.gf1, fasta.gf2, sep = "")
fasta.gf12 <- formatfasta(sequences.fasta[i])
ids_fasta[i] <- paste(id.gf1,id.gf2, sep = "----")
junctions[i] <- nchar(fasta.gf1)
fastaGF <- c(id.fastaGF, fasta.gf12)
fasta.file <- c(fasta.file, fastaGF)
}
save(junctions, file = file.path(outputfolder, "out", paste(samplename,".junctions.RData", sep = "")))
save(sequences.fasta, file = file.path(outputfolder, "out", paste(samplename,".sequences_fasta.RData", sep = "")))
save(ids_fasta, file = file.path(outputfolder, "out", paste(samplename, ".ids_fasta.RData", sep = "")))
cat(fasta.file, file = file.path(outputfolder,"out", paste(samplename,".EricScript.junctions.fa",sep = "")), sep = "\n")






37 changes: 37 additions & 0 deletions lib/R/BuildNeighbourhoodSequences.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
vars.tmp <- commandArgs()
vars <- vars.tmp[length(vars.tmp)]
split.vars <- unlist(strsplit(vars, ","))
samplename <- split.vars [1]
outputfolder <- split.vars[2]
z <- read.delim(file.path(outputfolder,"out",paste(samplename,".intervals.pileup", sep = "")), sep = "\t", header = F)
load(file.path(outputfolder,"out",paste(samplename,".ids_filtered.RData", sep = "")))
load(file.path(outputfolder,"out",paste(samplename,".junctions.recalibrated.RData", sep = "")))
load(file.path(outputfolder, "out", paste(samplename, ".ids_fasta.RData", sep = "")))
id.pileup <- as.character(z[,1])
pos.pileup <- as.numeric(as.character(z[,2]))
sequence.pileup <- as.character(z[,3])
unique.ids.pileup <- unique(id.pileup)
width <- 100
fasta.file <- c()
for (i in 1:length(id.filtered)) {
ix.id <- which(id.pileup == id.filtered[i])
ix.ref <- which(ids_fasta == id.filtered[i])
junction <- junctions.recalibrated[ix.ref]
ix.id.pileup <- which(id.pileup == id.filtered[i])
ix.junction1 <- which(pos.pileup[ix.id.pileup] == junction)
ix.junction2 <- which(pos.pileup[ix.id.pileup] == (junction + 1))
seq.vec <- rep("N", width)
pos.seq <- seq.int((junction-(width/2-1)), (junction + (width/2)))
ix.pos.tmp <- which(pos.seq %in% pos.pileup[ix.id.pileup])
seq.vec[ix.pos.tmp] <- sequence.pileup[ix.id.pileup]
if ((length(ix.junction1)!=0) & (length(ix.junction2)!=0)) {
query.sequence <- character(length = 1)
for (ii in 1:length(seq.vec)) {
query.sequence <- paste(query.sequence, seq.vec[ii], sep = "")
}
ids_fasta_query <- paste(">", id.filtered[i],sep = "")
fasta.file <- c(fasta.file, c(ids_fasta_query, query.sequence))
}
}
cat(fasta.file, sep = "\n", file = file.path(outputfolder, "out",paste(samplename,".checkselfhomology.fa", sep = "")))
cat(file.path(outputfolder, "out", paste(samplename,".checkselfhomology.fa", sep = "")), file = file.path(outputfolder, "out", ".link"))
Loading

0 comments on commit e235f5d

Please sign in to comment.