Merge pull request #4 from HudsonAlpha/revert-3-spyder_error_zero_reads

Revert "Spyder error zero reads"
HudsonAlpha · Aug 7, 2016 · e8dcc93 · e8dcc93
2 parents dbbb94f + acb46f4
commit e8dcc93
Show file tree

Hide file tree

Showing 10 changed files with 53 additions and 810 deletions.
diff --git a/R/stats_algs.R b/R/stats_algs.R
@@ -4,35 +4,22 @@
 # INPUT DATA
 args <- commandArgs(trailingOnly = TRUE)
 path <- args[1]  # path to the project outputs/ folder
-alg  <- args[2]  # algorithm: star or htseq-gene or htseq-exon or kallisto
+alg  <- args[2]  # algorithm: star or htseq-gene or htseq-exon
 # READS RPKM AND COUNT MATRICES
-if (alg != 'kallisto'){
-  rpkms  <- read.table(paste(path, alg, "_rpkms.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F)
-  snames1 <- as.character(read.table(paste(path, alg, "_rpkms.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
-  counts <- read.table(paste(path, alg, "_counts.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F)
-  snames2 <- as.character(read.table(paste(path, alg, "_counts.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
-  labs <- c("RPKM","COUNTS")
-} else{
-  rpkms  <- read.table(paste(path, alg, "_tpm.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F, na.strings = '-nan')
-  rpkms <- rpkms[,2:ncol(rpkms)]
-  snames1 <- as.character(read.table(paste(path, alg, "_tpm.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
-  snames1 <- snames1[2:length(snames1)]
-  counts <- read.table(paste(path, alg, "_est_counts.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F)
-  counts <- counts[,2:ncol(counts)]
-  snames2 <- as.character(read.table(paste(path, alg, "_est_counts.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
-  snames2 <- snames2[2:length(snames2)]
-  labs <- c("TPM","EST_COUNTS")
-}
+rpkms  <- read.table(paste(path, alg, "_rpkms.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F)
+snames1 <- as.character(read.table(paste(path, alg, "_rpkms.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
+counts <- read.table(paste(path, alg, "_counts.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F)
+snames2 <- as.character(read.table(paste(path, alg, "_counts.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
 # ONLY USES FEATURES WITH AT LEAST TWO RAW COUNTS IN ONE LIBRARY
 inc <- which(rowSums(counts[2:ncol(counts)], na.rm = T) > 1)
 NT <- c()
 for (i in 1:2){
   N <- matrix(NA,nrow=0,ncol=0)
-  if (i == 1) {G <- rpkms[inc,];lab <- labs[1]; snames <- snames1[2:length(snames1)]}
-  if (i == 2) {G <- counts[inc,];lab <- labs[2]; snames <- snames2[2:length(snames2)]}
+  if (i == 1) {G <- rpkms[inc,];lab <- "RPKM"; snames <- snames1[2:length(snames1)]}
+  if (i == 2) {G <- counts[inc,];lab <- "COUNTS"; snames <- snames2[2:length(snames2)]}
   if (ncol(G)>2){ # at least two samples available
     lcounts  <- log2(G[, 2:ncol(G)]+1)
-    s_ok <- (which((colSums(G[, 2:ncol(G)]) > 10000)&(colSums(is.na(lcounts)) == 0))) # remove samples with NA (not processed yet)
+    s_ok <- (which(colSums(is.na(lcounts)) == 0)) # remove samples with NA (not processed yet)
     if (length(s_ok) > 1){
       ## QUANTILES
       tp <- round(t(apply(lcounts[, s_ok], 2, quantile, probs = c(0.05,0.25,0.5,0.75,0.95), na.rm = T)), 2)
@@ -62,4 +49,3 @@ for (i in 1:2){
 NT <- cbind(snames[s_ok], NT)
 write.table(NT, file = paste(path, alg, "_pca", ".txt", sep=""), quote = F, row.names = F, sep = "\t")
 
-
diff --git a/lib/html_lib.py b/lib/html_lib.py
@@ -33,24 +33,18 @@ def get_menu(config, ns):
         menu.append('<h1>Alignment QC:</h1>')
     if enabled.has_key("picard"):
         menu.append('<h2><a #highlight="" href="./picard.html">- Picard</a></h2>')
-    if enabled.has_key("picard_IS"):
-        menu.append('<h2><a #highlight="" href="./picard-is.html">- Picard Insert Size</a></h2>')
     if enabled.has_key("star"):
         menu.append('<h2><a #highlight="" href="./star.html">- STAR</a></h2>')
-    if enabled.has_key("kallisto"):
-        menu.append('<h2><a #highlight="" href="./kallisto.html">- KALLISTO</a></h2>')
     if enabled.has_key("htseq-gene"):
         menu.append('<h2><a #highlight="" href="./htseq-gene.html">- HTseq-Gene</a></h2>')
     if enabled.has_key("htseq-exon"):
         menu.append('<h2><a #highlight="" href="./htseq-exon.html">- HTseq-Exon</a></h2>')
     if ns > 1:
-        if enabled.has_key("star") or enabled.has_key("htseq-gene") or enabled.has_key("htseq-exon") or enabled.has_key("kallisto"):
+        if enabled.has_key("star") or enabled.has_key("htseq-gene") or enabled.has_key("htseq-exon"):
             menu.append('<h1>Count statistics:</h1>')
             menu.append('<h2><a #highlight="" href="./downloads.html">- DOWNLOADS</a></h2>')
         if enabled.has_key("star"):
             menu.append('<h2><a #highlight="" href="./star2.html">- STAR</a></h2>')
-        if enabled.has_key("kallisto"):
-            menu.append('<h2><a #highlight="" href="./kallisto2.html">- KALLISTO</a></h2>')
         if enabled.has_key("htseq-gene"):
             menu.append('<h2><a  href="./htseq-gene2.html">- HTseq-Gene</a></h2>')
         if enabled.has_key("htseq-exon"):
@@ -68,8 +62,8 @@ def get_menu(config, ns):
     return menu
 
 def print_samples(path,config):
-    analysis = ['trimgalore', 'fastqc', 'kallisto', 'star', 'star-fusion', 'picard', "htseq-gene", "htseq-exon", "picard_IS", "varscan", 'gatk']
-    sta= {"trimgalore":"TrimGalore", "fastqc":"FastQC","star":"STAR","star-fusion":"STAR-Fusion","picard":"PicardQC","kallisto":"Kallisto","htseq-gene":"HTseq-gene","htseq-exon":"HTseq-exon", "picard_IS":"Picard-InsertSize", "varscan":"VARSCAN", "gatk":"GATK"}
+    analysis = ['trimgalore', 'fastqc', 'kallisto', 'star', 'star-fusion', 'picard', "htseq-gene", "htseq-exon", "varscan", 'gatk']
+    sta= {"trimgalore":"TrimGalore", "fastqc":"FastQC","star":"STAR","star-fusion":"STAR-Fusion","picard":"PicardQC","kallisto":"Kallisto","htseq-gene":"HTseq-gene","htseq-exon":"HTseq-exon", "varscan":"VARSCAN", "gatk":"GATK"}
     # SAMPLES LIST
     samples = dict()
     f   = open(path + "/samples.list",'r')
@@ -154,16 +148,6 @@ def print_samples(path,config):
                     else:
                         res.append("FAIL")
                     results["star-fusion"][x] = " / ".join(res)
-            elif i=="picard_IS":
-                for x, y in sorted(samples.iteritems()):
-                    res = []
-                    if sok.has_key(x):
-                        link = "../results_picard_IS/" + x + ".txt"
-                        link = '<a href="LINK" target="_blank">OK</a>'.replace("LINK", link)
-                        res.append(link)
-                    else:
-                        res.append("FAIL")
-                    results["picard_IS"][x] = " / ".join(res)
             elif i=="picard":
                 for x, y in sorted(samples.iteritems()):
                     res = []
@@ -346,7 +330,6 @@ def stats_picard(path,samples,config):
                 if i+n[k] in files:
                     f = open(path+"/results_picard"+"/"+i+n[k],'r')
                     nx = 0
-                    kdiff0 = 0
                     for ii in f:
                         if ii.startswith("PF_BASES"):
                             head = ii.rstrip().split("\t")
@@ -358,26 +341,17 @@ def stats_picard(path,samples,config):
                             vals = ii.strip("\n").split("\t")
                             for kk in range(len(head)):
                                 stats[k][head[kk]] = vals[kk]
-                                if vals[kk] != "":
-                                    if float(vals[kk]) > 0:
-                                        kdiff0 += 1
                             nx = 0
-                    if kdiff0 == 0:
-                        ex = 1
-                        break
                     f.close()
                 else:
                     ex = 1
-                    break
             if ex ==1:
                 tr = "<td bgcolor='#CC3300'>"+i+"</td>"
                 o = i
                 for ii in range(18):
                     tr += "<td bgcolor='#CC3300'>NA</td>"
                     o += "\tNA"
                 tr = "<tr>"+tr+"</tr>"
-                table.append(tr)
-                s = ["NA" for ii in range(10)]
             else:
                 align  = int(stats[0]["PF_ALIGNED_BASES"])
                 cod  = int(stats[0]["CODING_BASES"])
@@ -415,56 +389,27 @@ def stats_picard(path,samples,config):
                         tr +="<td bgcolor='#99CC66'>0</td>"
                         o += "\t0"
                 tr = "<tr>"+tr+"</tr>"
-                table.append(tr)
-                o = o.split("\t")
-                st= 0
-                s = []
-                for ind in [10, 11, 12, 5, 7, 3]:
-                    s.append(str(round(float(o[ind]) * float(o[2])/float(o[1]),3)))
-                    if ind != 3:
-                        st += float(o[ind]) * float(o[2])/float(o[1])
-                for ind in [15,16,17,18]:
-                    if o[ind] != "0":
-                        s.append(str(round(float(o[ind]) * 100,3)))
-                    else:
-                        s.append("0")
-                s[5] = str(round(100 - st,3))
+            table.append(tr)
+            o = o.split("\t")
+            st= 0
+            s = []
+            for ind in [10, 11, 12, 5, 7, 3]:
+                s.append(str(round(float(o[ind]) * float(o[2])/float(o[1]),3)))
+                if ind != 3:
+                    st += float(o[ind]) * float(o[2])/float(o[1])
+            for ind in [15,16,17,18]:
+                if o[ind] != "0":
+                    s.append(str(round(float(o[ind]) * 100,3)))
+                else:
+                    s.append("0")
+            s[5] = str(round(100 - st,3))
             print >> out, i + "\t" + "\t".join(s)
         out.close()
         return "<table>"+"\n".join(table)+"</table>"
     else:
         return ""
 
 
-def stats_picard_2(path,samples,config):
-    n = os.listdir(path)
-    hh = "\t".join(['sample_id','MEDIAN_INSERT_SIZE','MEDIAN_ABSOLUTE_DEVIATION','MIN_INSERT_SIZE',
-                    'MAX_INSERT_SIZE','MEAN_INSERT_SIZE','STANDARD_DEVIATION','READ_PAIRS', 'LINK_TXT', 'LINK_PDF'])
-    if config.has_key("picard_IS") and ("results_picard_IS" in n):
-        files  = os.listdir(path+"/results_picard_IS")
-        out = open(path + "/outputs/stats_picard2.txt",'w')
-        print >> out, hh
-        data = list()
-        for i in sorted(samples.keys()):
-            if i + ".txt" in files:
-                f = open(path+"/results_picard_IS"+"/"+i+".txt",'r')
-                k = 0
-                while (1):
-                    j = f.readline()
-                    if j.startswith("MEDIAN_INSERT_SIZE"):
-                        j = f.readline().strip("\n").split("\t")
-                        break
-                    k += 1
-                    if k > 10 or len(j) == 0:
-                        j = ['NA' for i in range(7)]
-                        break
-                print >> out, "\t".join([i] + j + ['<a href="../results_picard_IS/' + i + '.txt" target="_blank">+</a>', '<a href="../results_picard_IS/' + i + '.pdf" target="_blank">+</a>'])
-                f.close()
-            else:
-                print >> out, "\t".join([i] + ['NA' for i in range(9)])
-        out.close()
-    return 1
-
 def skeleton(path, path2html):
     print "> Building HTML and OUTPUT folders skeletons..."
     print "  - Path: " + path
@@ -562,7 +507,7 @@ def check_config(path):
     # Parses the configuration file
     print "> Parsing configuration file..."
     try:
-        z = ["trimgalore", "fastqc", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "kallisto", "picard_IS", "varscan", "gatk"]
+        z = ["trimgalore", "fastqc", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "kallisto", "varscan", "gatk"]
         f = open(path + "/config.txt", 'r')
         analysis = dict()
         analysis["cluster"] = dict()

diff --git a/lib/programs.py b/lib/programs.py
@@ -366,28 +366,6 @@ def sam2sortbam(timestamp, path_base, folder, samples, nproc, wt, q):
     return  submit_job_super("sam2sortbam", path_base + folder, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
 
 
-def picard_IS(timestamp, path_base, folder, samples, nproc, wt, q):
-    output = "results_picard_IS"
-    secure_mkdir(path_base + folder, output)
-    print "## Picard-InsertSize"
-    print "> Writing jobs for Picard InsertSize..."
-    nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))
-    commands = list()
-    ksamp = sortbysize(samples)
-    proc_files = os.listdir(path_base + folder + "/results_sam2sortbam/")
-    for sample in ksamp:
-        in_file = path_base + folder + "/results_sam2sortbam/" + sample + ".sorted.bam"
-        if sample + ".sorted.bam" in proc_files:
-            for i in range(len(config.nannots)):
-                out_file = in_file.replace("results_sam2sortbam/", "results_picard_IS/").replace(".sorted.bam", "")
-                call = "java -jar " + config.path_picard + "/CollectInsertSizeMetrics.jar I="+in_file+" O="+out_file+".txt H="+out_file+".pdf"
-                commands.append(call + sample_checker.replace("#FOLDER", path_base + folder + "/results_picard_IS").replace("#SAMPLE", sample))
-        else:
-            print "Warning: [Picard] Sorted BAM file not found -> " + in_file
-    create_scripts(nchild, commands, path_base, folder, output)
-    return  submit_job_super("picard_IS", path_base + folder, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
-
-
 def varscan(timestamp, path_base, folder, samples, nproc, wt, q, genome_build, args):
     ref = config.path_fasta.replace("#LABEL",genome_build)
     output = "results_varscan"

diff --git a/lib/spider_stats.py b/lib/spider_stats.py
@@ -1,4 +1,5 @@
 import os
+import sys
 import config
 
 head_star_log = ["Sample", "Links","Started job","Started mapping","Finished","Mapping speed [Mr/h]","Input reads [n]","Input read length (mean)","Uniquely mapped [n]","Uniquely mapped [%]","Mapped length","Splices [n]","Splices annotated [n]","Splices GT/AG [n]","Splices: GC/AG [n]","Splices: AT/AC [n]","Splices: Non-canonical [n]","Mismatch rate per base [%]","Deletion rate per base [%]","Deletion average length","Insertion rate per base [%]","Insertion average length","Multimapping reads [n]","Multimapping reads [%]","Multimapping reads (+) [n]","Multimapping reads (+) [%]","Unmapped reads: too many mismatches [%]","Unmapped reads: too short [%]","Unmapped reads: other [%]"]
@@ -522,7 +523,7 @@ def stats_star(path, samples):
             r = list()
             for sample in samples:
                 if N[i].has_key(sample):
-                    if (len(N[i][sample]) > j) and (MT[sample] > 0):
+                    if len(N[i][sample]) > j:
                         s = str(round(float(N[i][sample][j]) * 1000000000 / (float(L[ng[j]]) * MT[sample]),4))
                     else:
                         s = "NA"

diff --git a/lib/vcrparser.py b/lib/vcrparser.py
@@ -55,14 +55,14 @@ def project_process(path_base, folder):
     for i in f:
         if not i.startswith("%"):
             i = i.strip("\n").split("\t")
-            if i[0] in ["trimgalore", "fastqc", "kallisto", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "picard_IS", "varscan", "gatk"]:
+            if i[0] in ["trimgalore", "fastqc", "kallisto", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "varscan", "gatk"]:
                 i[1] = i[1].split("/")[0]
                 if i[1] != "0":
                     config[i[0]] = i[1]
-    if config.has_key("varscan") or config.has_key("gatk") or config.has_key("picard_IS") :
+    if config.has_key("varscan") or config.has_key("gatk"):
         config["sam2sortbam"] = 1
     if len(config) > 0:
-        for pg in ["trimgalore", "fastqc", "kallisto", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "sam2sortbam", "picard_IS", "varscan", "gatk"]:
+        for pg in ["trimgalore", "fastqc", "kallisto", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "sam2sortbam", "varscan", "gatk"]:
             if config.has_key(pg):
                 print "Process:  " + pg
                 if not pids.has_key(pg):