Skip to content

Commit

Permalink
Merge pull request #4 from HudsonAlpha/revert-3-spyder_error_zero_reads
Browse files Browse the repository at this point in the history
Revert "Spyder error zero reads"
  • Loading branch information
arnald-alonso authored Aug 7, 2016
2 parents dbbb94f + acb46f4 commit e8dcc93
Show file tree
Hide file tree
Showing 10 changed files with 53 additions and 810 deletions.
30 changes: 8 additions & 22 deletions R/stats_algs.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,35 +4,22 @@
# INPUT DATA
args <- commandArgs(trailingOnly = TRUE)
path <- args[1] # path to the project outputs/ folder
alg <- args[2] # algorithm: star or htseq-gene or htseq-exon or kallisto
alg <- args[2] # algorithm: star or htseq-gene or htseq-exon
# READS RPKM AND COUNT MATRICES
if (alg != 'kallisto'){
rpkms <- read.table(paste(path, alg, "_rpkms.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F)
snames1 <- as.character(read.table(paste(path, alg, "_rpkms.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
counts <- read.table(paste(path, alg, "_counts.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F)
snames2 <- as.character(read.table(paste(path, alg, "_counts.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
labs <- c("RPKM","COUNTS")
} else{
rpkms <- read.table(paste(path, alg, "_tpm.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F, na.strings = '-nan')
rpkms <- rpkms[,2:ncol(rpkms)]
snames1 <- as.character(read.table(paste(path, alg, "_tpm.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
snames1 <- snames1[2:length(snames1)]
counts <- read.table(paste(path, alg, "_est_counts.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F)
counts <- counts[,2:ncol(counts)]
snames2 <- as.character(read.table(paste(path, alg, "_est_counts.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
snames2 <- snames2[2:length(snames2)]
labs <- c("TPM","EST_COUNTS")
}
rpkms <- read.table(paste(path, alg, "_rpkms.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F)
snames1 <- as.character(read.table(paste(path, alg, "_rpkms.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
counts <- read.table(paste(path, alg, "_counts.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F)
snames2 <- as.character(read.table(paste(path, alg, "_counts.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
# ONLY USES FEATURES WITH AT LEAST TWO RAW COUNTS IN ONE LIBRARY
inc <- which(rowSums(counts[2:ncol(counts)], na.rm = T) > 1)
NT <- c()
for (i in 1:2){
N <- matrix(NA,nrow=0,ncol=0)
if (i == 1) {G <- rpkms[inc,];lab <- labs[1]; snames <- snames1[2:length(snames1)]}
if (i == 2) {G <- counts[inc,];lab <- labs[2]; snames <- snames2[2:length(snames2)]}
if (i == 1) {G <- rpkms[inc,];lab <- "RPKM"; snames <- snames1[2:length(snames1)]}
if (i == 2) {G <- counts[inc,];lab <- "COUNTS"; snames <- snames2[2:length(snames2)]}
if (ncol(G)>2){ # at least two samples available
lcounts <- log2(G[, 2:ncol(G)]+1)
s_ok <- (which((colSums(G[, 2:ncol(G)]) > 10000)&(colSums(is.na(lcounts)) == 0))) # remove samples with NA (not processed yet)
s_ok <- (which(colSums(is.na(lcounts)) == 0)) # remove samples with NA (not processed yet)
if (length(s_ok) > 1){
## QUANTILES
tp <- round(t(apply(lcounts[, s_ok], 2, quantile, probs = c(0.05,0.25,0.5,0.75,0.95), na.rm = T)), 2)
Expand Down Expand Up @@ -62,4 +49,3 @@ for (i in 1:2){
NT <- cbind(snames[s_ok], NT)
write.table(NT, file = paste(path, alg, "_pca", ".txt", sep=""), quote = F, row.names = F, sep = "\t")


91 changes: 18 additions & 73 deletions lib/html_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,24 +33,18 @@ def get_menu(config, ns):
menu.append('<h1>Alignment QC:</h1>')
if enabled.has_key("picard"):
menu.append('<h2><a #highlight="" href="./picard.html">- Picard</a></h2>')
if enabled.has_key("picard_IS"):
menu.append('<h2><a #highlight="" href="./picard-is.html">- Picard Insert Size</a></h2>')
if enabled.has_key("star"):
menu.append('<h2><a #highlight="" href="./star.html">- STAR</a></h2>')
if enabled.has_key("kallisto"):
menu.append('<h2><a #highlight="" href="./kallisto.html">- KALLISTO</a></h2>')
if enabled.has_key("htseq-gene"):
menu.append('<h2><a #highlight="" href="./htseq-gene.html">- HTseq-Gene</a></h2>')
if enabled.has_key("htseq-exon"):
menu.append('<h2><a #highlight="" href="./htseq-exon.html">- HTseq-Exon</a></h2>')
if ns > 1:
if enabled.has_key("star") or enabled.has_key("htseq-gene") or enabled.has_key("htseq-exon") or enabled.has_key("kallisto"):
if enabled.has_key("star") or enabled.has_key("htseq-gene") or enabled.has_key("htseq-exon"):
menu.append('<h1>Count statistics:</h1>')
menu.append('<h2><a #highlight="" href="./downloads.html">- DOWNLOADS</a></h2>')
if enabled.has_key("star"):
menu.append('<h2><a #highlight="" href="./star2.html">- STAR</a></h2>')
if enabled.has_key("kallisto"):
menu.append('<h2><a #highlight="" href="./kallisto2.html">- KALLISTO</a></h2>')
if enabled.has_key("htseq-gene"):
menu.append('<h2><a href="./htseq-gene2.html">- HTseq-Gene</a></h2>')
if enabled.has_key("htseq-exon"):
Expand All @@ -68,8 +62,8 @@ def get_menu(config, ns):
return menu

def print_samples(path,config):
analysis = ['trimgalore', 'fastqc', 'kallisto', 'star', 'star-fusion', 'picard', "htseq-gene", "htseq-exon", "picard_IS", "varscan", 'gatk']
sta= {"trimgalore":"TrimGalore", "fastqc":"FastQC","star":"STAR","star-fusion":"STAR-Fusion","picard":"PicardQC","kallisto":"Kallisto","htseq-gene":"HTseq-gene","htseq-exon":"HTseq-exon", "picard_IS":"Picard-InsertSize", "varscan":"VARSCAN", "gatk":"GATK"}
analysis = ['trimgalore', 'fastqc', 'kallisto', 'star', 'star-fusion', 'picard', "htseq-gene", "htseq-exon", "varscan", 'gatk']
sta= {"trimgalore":"TrimGalore", "fastqc":"FastQC","star":"STAR","star-fusion":"STAR-Fusion","picard":"PicardQC","kallisto":"Kallisto","htseq-gene":"HTseq-gene","htseq-exon":"HTseq-exon", "varscan":"VARSCAN", "gatk":"GATK"}
# SAMPLES LIST
samples = dict()
f = open(path + "/samples.list",'r')
Expand Down Expand Up @@ -154,16 +148,6 @@ def print_samples(path,config):
else:
res.append("FAIL")
results["star-fusion"][x] = " / ".join(res)
elif i=="picard_IS":
for x, y in sorted(samples.iteritems()):
res = []
if sok.has_key(x):
link = "../results_picard_IS/" + x + ".txt"
link = '<a href="LINK" target="_blank">OK</a>'.replace("LINK", link)
res.append(link)
else:
res.append("FAIL")
results["picard_IS"][x] = " / ".join(res)
elif i=="picard":
for x, y in sorted(samples.iteritems()):
res = []
Expand Down Expand Up @@ -346,7 +330,6 @@ def stats_picard(path,samples,config):
if i+n[k] in files:
f = open(path+"/results_picard"+"/"+i+n[k],'r')
nx = 0
kdiff0 = 0
for ii in f:
if ii.startswith("PF_BASES"):
head = ii.rstrip().split("\t")
Expand All @@ -358,26 +341,17 @@ def stats_picard(path,samples,config):
vals = ii.strip("\n").split("\t")
for kk in range(len(head)):
stats[k][head[kk]] = vals[kk]
if vals[kk] != "":
if float(vals[kk]) > 0:
kdiff0 += 1
nx = 0
if kdiff0 == 0:
ex = 1
break
f.close()
else:
ex = 1
break
if ex ==1:
tr = "<td bgcolor='#CC3300'>"+i+"</td>"
o = i
for ii in range(18):
tr += "<td bgcolor='#CC3300'>NA</td>"
o += "\tNA"
tr = "<tr>"+tr+"</tr>"
table.append(tr)
s = ["NA" for ii in range(10)]
else:
align = int(stats[0]["PF_ALIGNED_BASES"])
cod = int(stats[0]["CODING_BASES"])
Expand Down Expand Up @@ -415,56 +389,27 @@ def stats_picard(path,samples,config):
tr +="<td bgcolor='#99CC66'>0</td>"
o += "\t0"
tr = "<tr>"+tr+"</tr>"
table.append(tr)
o = o.split("\t")
st= 0
s = []
for ind in [10, 11, 12, 5, 7, 3]:
s.append(str(round(float(o[ind]) * float(o[2])/float(o[1]),3)))
if ind != 3:
st += float(o[ind]) * float(o[2])/float(o[1])
for ind in [15,16,17,18]:
if o[ind] != "0":
s.append(str(round(float(o[ind]) * 100,3)))
else:
s.append("0")
s[5] = str(round(100 - st,3))
table.append(tr)
o = o.split("\t")
st= 0
s = []
for ind in [10, 11, 12, 5, 7, 3]:
s.append(str(round(float(o[ind]) * float(o[2])/float(o[1]),3)))
if ind != 3:
st += float(o[ind]) * float(o[2])/float(o[1])
for ind in [15,16,17,18]:
if o[ind] != "0":
s.append(str(round(float(o[ind]) * 100,3)))
else:
s.append("0")
s[5] = str(round(100 - st,3))
print >> out, i + "\t" + "\t".join(s)
out.close()
return "<table>"+"\n".join(table)+"</table>"
else:
return ""


def stats_picard_2(path,samples,config):
n = os.listdir(path)
hh = "\t".join(['sample_id','MEDIAN_INSERT_SIZE','MEDIAN_ABSOLUTE_DEVIATION','MIN_INSERT_SIZE',
'MAX_INSERT_SIZE','MEAN_INSERT_SIZE','STANDARD_DEVIATION','READ_PAIRS', 'LINK_TXT', 'LINK_PDF'])
if config.has_key("picard_IS") and ("results_picard_IS" in n):
files = os.listdir(path+"/results_picard_IS")
out = open(path + "/outputs/stats_picard2.txt",'w')
print >> out, hh
data = list()
for i in sorted(samples.keys()):
if i + ".txt" in files:
f = open(path+"/results_picard_IS"+"/"+i+".txt",'r')
k = 0
while (1):
j = f.readline()
if j.startswith("MEDIAN_INSERT_SIZE"):
j = f.readline().strip("\n").split("\t")
break
k += 1
if k > 10 or len(j) == 0:
j = ['NA' for i in range(7)]
break
print >> out, "\t".join([i] + j + ['<a href="../results_picard_IS/' + i + '.txt" target="_blank">+</a>', '<a href="../results_picard_IS/' + i + '.pdf" target="_blank">+</a>'])
f.close()
else:
print >> out, "\t".join([i] + ['NA' for i in range(9)])
out.close()
return 1

def skeleton(path, path2html):
print "> Building HTML and OUTPUT folders skeletons..."
print " - Path: " + path
Expand Down Expand Up @@ -562,7 +507,7 @@ def check_config(path):
# Parses the configuration file
print "> Parsing configuration file..."
try:
z = ["trimgalore", "fastqc", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "kallisto", "picard_IS", "varscan", "gatk"]
z = ["trimgalore", "fastqc", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "kallisto", "varscan", "gatk"]
f = open(path + "/config.txt", 'r')
analysis = dict()
analysis["cluster"] = dict()
Expand Down
22 changes: 0 additions & 22 deletions lib/programs.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,28 +366,6 @@ def sam2sortbam(timestamp, path_base, folder, samples, nproc, wt, q):
return submit_job_super("sam2sortbam", path_base + folder, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)


def picard_IS(timestamp, path_base, folder, samples, nproc, wt, q):
output = "results_picard_IS"
secure_mkdir(path_base + folder, output)
print "## Picard-InsertSize"
print "> Writing jobs for Picard InsertSize..."
nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))
commands = list()
ksamp = sortbysize(samples)
proc_files = os.listdir(path_base + folder + "/results_sam2sortbam/")
for sample in ksamp:
in_file = path_base + folder + "/results_sam2sortbam/" + sample + ".sorted.bam"
if sample + ".sorted.bam" in proc_files:
for i in range(len(config.nannots)):
out_file = in_file.replace("results_sam2sortbam/", "results_picard_IS/").replace(".sorted.bam", "")
call = "java -jar " + config.path_picard + "/CollectInsertSizeMetrics.jar I="+in_file+" O="+out_file+".txt H="+out_file+".pdf"
commands.append(call + sample_checker.replace("#FOLDER", path_base + folder + "/results_picard_IS").replace("#SAMPLE", sample))
else:
print "Warning: [Picard] Sorted BAM file not found -> " + in_file
create_scripts(nchild, commands, path_base, folder, output)
return submit_job_super("picard_IS", path_base + folder, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)


def varscan(timestamp, path_base, folder, samples, nproc, wt, q, genome_build, args):
ref = config.path_fasta.replace("#LABEL",genome_build)
output = "results_varscan"
Expand Down
3 changes: 2 additions & 1 deletion lib/spider_stats.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import sys
import config

head_star_log = ["Sample", "Links","Started job","Started mapping","Finished","Mapping speed [Mr/h]","Input reads [n]","Input read length (mean)","Uniquely mapped [n]","Uniquely mapped [%]","Mapped length","Splices [n]","Splices annotated [n]","Splices GT/AG [n]","Splices: GC/AG [n]","Splices: AT/AC [n]","Splices: Non-canonical [n]","Mismatch rate per base [%]","Deletion rate per base [%]","Deletion average length","Insertion rate per base [%]","Insertion average length","Multimapping reads [n]","Multimapping reads [%]","Multimapping reads (+) [n]","Multimapping reads (+) [%]","Unmapped reads: too many mismatches [%]","Unmapped reads: too short [%]","Unmapped reads: other [%]"]
Expand Down Expand Up @@ -522,7 +523,7 @@ def stats_star(path, samples):
r = list()
for sample in samples:
if N[i].has_key(sample):
if (len(N[i][sample]) > j) and (MT[sample] > 0):
if len(N[i][sample]) > j:
s = str(round(float(N[i][sample][j]) * 1000000000 / (float(L[ng[j]]) * MT[sample]),4))
else:
s = "NA"
Expand Down
6 changes: 3 additions & 3 deletions lib/vcrparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,14 @@ def project_process(path_base, folder):
for i in f:
if not i.startswith("%"):
i = i.strip("\n").split("\t")
if i[0] in ["trimgalore", "fastqc", "kallisto", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "picard_IS", "varscan", "gatk"]:
if i[0] in ["trimgalore", "fastqc", "kallisto", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "varscan", "gatk"]:
i[1] = i[1].split("/")[0]
if i[1] != "0":
config[i[0]] = i[1]
if config.has_key("varscan") or config.has_key("gatk") or config.has_key("picard_IS") :
if config.has_key("varscan") or config.has_key("gatk"):
config["sam2sortbam"] = 1
if len(config) > 0:
for pg in ["trimgalore", "fastqc", "kallisto", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "sam2sortbam", "picard_IS", "varscan", "gatk"]:
for pg in ["trimgalore", "fastqc", "kallisto", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "sam2sortbam", "varscan", "gatk"]:
if config.has_key(pg):
print "Process: " + pg
if not pids.has_key(pg):
Expand Down
Loading

0 comments on commit e8dcc93

Please sign in to comment.