Merge branch 'implementNanopolish' into master branch. This implement…

…s the full nanopolish protocol with tests
EvdH0 · Nov 19, 2016 · 0144d74 · 0144d74
2 parents 83adf84 + 9a719db
commit 0144d74
Show file tree

Hide file tree

Showing 10 changed files with 615 additions and 22 deletions.
diff --git a/INSTALL.md b/INSTALL.md
@@ -23,6 +23,8 @@ poreFUME requires Python 2.7 or newer. Furthermore the following packages can be
 
  apt-get install build-essential
  apt-get install zlib1g-dev
+
+ apt-get install libncurses5-dev #needed for samtools
   
  pip install biopython
  pip install numpy

diff --git a/README.md b/README.md
@@ -10,14 +10,17 @@ Demultiplex, correct and annotate antibiotic resistance genes in nanopore data
 
 ```
 usage: poreFUME.py [-h] [--PacBioLegacyBarcode] [--verbose] [--overwriteDemux]
-                   [--overwriteNanocorrect] [--overwriteCARD] [--skipDemux]
-                   [--skipDemuxCollect] [--skipNanocorrect] [--skipCARD]
+                   [--overwriteNanocorrect] [--overwriteNanopolish]
+                   [--overwriteCARD] [--skipDemux] [--skipDemuxCollect]
+                   [--skipNanocorrect] [--skipNanopolish] [--skipCARD]
                    [--match [MATCH]] [--mismatch [MISMATCH]]
                    [--gapopen [GAPOPEN]] [--gapextend [GAPEXTEND]]
                    [--cores [CORES]] [--barcodeThreshold [BARCODETHRESHOLD]]
                    [--barcodeEdge [BARCODEEDGE]]
                    [--pathNanocorrect [PATHNANOCORRECT]]
-                   [--pathCARD [PATHCARD]] [--annotateAll]
+                   [--pathNanopolish [PATHNANOPOLISH]] [--pathBWA [PATHBWA]]
+                   [--pathRawreads [PATHRAWREADS]] [--pathCARD [PATHCARD]]
+                   [--annotateAll] [--minCoverage [MINCOVERAGE]]
                    fileONTreads fileBarcodes
 
 positional arguments:
@@ -38,15 +41,19 @@ optional arguments:
   --overwriteNanocorrect
                         overwrite the results in the output/nanocorrect/runid
                         directory if the exist
+  --overwriteNanopolish
+                        overwrite the results in the output/nanopolish/runid
+                        directory, if the exist
   --overwriteCARD       overwrite the results in the output/annotation/runid
                         directory if the exist
   --skipDemux           Skip the barcode demux step and proceed with
                         nanocorrect, cannot be used with overwrite. Assumes
                         the output/barcode/ and output/ directory are
                         populated accordingly
-  --skipDemuxCollect    will skip the demux it self and go to colletion based
+  --skipDemuxCollect    will skip the demux it self and go to collection based
                         on the pickle
   --skipNanocorrect     Skip the nanocorrect step.
+  --skipNanopolish      Skip the nanocorrect step.
   --skipCARD            Skip the CARD annotation
   --match [MATCH]       Score for match in alignment (default: 2.7)
   --mismatch [MISMATCH]
@@ -64,6 +71,18 @@ optional arguments:
   --pathNanocorrect [PATHNANOCORRECT]
                         Set the path to the nanocorrect files (default:
                         /Users/evand/Downloads/testnanocorrect/nanocorrect/)
+  --pathNanopolish [PATHNANOPOLISH]
+                        Set the path to the nanopolish files (default:
+                        /Users/evand/Downloads/nanopolish/nanopolish/)
+  --pathBWA [PATHBWA]   Set the path to BWA (default:
+                        /Users/evand/Downloads/nanopolish/bwa)
+  --pathRawreads [PATHRAWREADS]
+                        Set the path to the raw reads (.fast5 files),
+                        nanopolish needs this. As a hint, this should be the
+                        absolute path to which the last part of the header on
+                        the poretools produced fasta file referes to. poreFUME
+                        will make a symlink to the directory containing the
+                        .fast5 files.
   --pathCARD [PATHCARD]
                         Set the path to CARD fasta file (default:
                         inputData/n.fasta.protein.homolog.fasta)
@@ -72,12 +91,17 @@ optional arguments:
                         this option all the files, raw, after demux, after 1st
                         round of correction, after 2nd round of correction are
                         annotated. This obviously takes longer.
+  --minCoverage [MINCOVERAGE]
+                        sequences will only be nanopolish'ed if they have a
+                        coverage that is higher than this threshold. (default:
+                        30)
 ```
 
 
 ### Example
 
-```poreFUME.py inputData/2DnanoporeData.fasta inputData/barcodes.fasta --PacBioLegacyBarcode --barcodeThreshold 50 --annotateAll --verbose --cores 8```
+
+```python poreFUME.py inputData/2DnanoporeData.fasta inputData/barcodes.fasta --PacBioLegacyBarcode --cores 8 --pathCARD=inputData/n.fasta.protein.homolog.fasta --pathNanocorrect=/home/ubuntu/poreFUME/nanocorrect/ --pathRawreads=/home/ubuntu/poreFUME/test/data/testSet75  --pathNanopolish=/home/ubuntu/poreFUME/nanopolish/ --verbose```
 
 ## Output :
 
@@ -86,7 +110,8 @@ Folders ```output/barcode/mySample/```, ```output/nanocorrect/mySample/```, ```o
 The pipeline consists of 3 steps:
 1.  barcode demultiplexing
 2.  error correction using nanocorrect
-3.  annotation of the reads using the CARD datbase
+3.  polishing using nanopolish
+4.  annotation of the reads using the CARD datbase
 
 Each step can be skipped using the relevant ```--skipXXX``` parameter. For example when only barcodes need to extracted ```--skipNanocorrect``` and ```--skipCARD``` can be used.
 
@@ -102,7 +127,11 @@ _Note: when poreFUME already find data in ```output/barcode/mySample/``` it will
 When ```--skipNanocorrect``` is not set (_default_) poreFUME will invoke [nanocorrect](https://github.com/jts/nanocorrect) to error correct the demultiplexed reads. Since nanocorrect needs its own directory to run in when ran in parallel, it will create ```/output/nanocorrect/mySample/{barcodeID}/``` directories. Inside [DALIGNER](https://github.com/thegenemyers/DALIGNER) and [poa](https://sourceforge.net/projects/poamsa/) will be run as called by nanocorrect. ```--pathNanocorrect``` can be set to point to the nanopore package.  
 _Note: when poreFUME already find data in ```output/nanocorrect/mySample/``` it will terminate, this can be overruled by passing the ```--overwriteNanocorrect``` flag. This will remove all the existing data in ```output/nanocorrect/mySample/```_.
 
-### Step 3. annotation using CARD
+### Step 3. polishing of the nanocorrected data using nanopolish
+When ```--skipNanopolish``` is not set (_default_) poreFUME will invoke [nanopolish](https://github.com/jts/nanopolish) to polish the nanocorrected data. Nanopolish will be run in an own directory , it will create ```/output/nanopolish/mySample/{barcodeID}/``` directories. ```--pathRawreads``` is required, and should point to the directory containing the .fast5 files.   
+_Note: when poreFUME already find data in ```output/nanopolish/mySample/``` it will terminate, this can be overruled by passing the ```--overwriteNanopolish``` flag. This will remove all the existing data in ```output/nanopolish/mySample/```_.
+
+### Step 4. annotation using CARD
 The final step is annotation of the data using the [CARD database](https://card.mcmaster.ca/) when ```--skipCARD``` is not set (_default_). This part will look for ```output/mySample.afterNC2.fasta``` and annotate the file against the CARD database.  ```--pathCARD``` is used to point to the nucleotide CARD database. When ```--annotateAll``` is set, also ```inputFiles/mySample.fasta``` , ```output/mySample.afterBC.fasta``` and ```output/mySample.afterNC1.fasta``` will be annotated. The output is a CSV file in ```output/annotation/mySample/mySample.AFTERNC2.annotation.csv``` containing the readname and the relevant CARD information.   
 _Note: with ```--skipCARD``` the output in ```output/annotation/mySample/``` will be overwritten_.
 
@@ -111,11 +140,11 @@ With the ```--cores``` flag the following processes can be parallelized:
 * Smith-Waterman algorithm to detect barcodes
 * nanocorrect on multiple barcodes
 * BLAST to annotate with the CARD database. 
+* the ```nanopolish variants``` command is invoked using the -t (thread) parameter 
 
 ## Testing
 To test the working of poreFUME you can run ```nosetests -v``` which should output something like
 ```
-ubuntu@ip-172-3:~/poreFUME$ nosetests -v
 test this ... ok
 testCARDavialable (test.TestCARD) ... ok
 testInputavialable (test.TestCARD) ... ok
@@ -127,11 +156,14 @@ testDBsplit (test.TestDependencies) ... ok
 testF2DB (test.TestDependencies) ... ok
 testLAcat (test.TestDependencies) ... ok
 testPOA (test.TestDependencies) ... ok
+testParallel (test.TestDependencies) ... ok
+testSamtools (test.TestDependencies) ... ok
+testbwaVersion (test.TestDependencies) ... ok
 job ranger returns index of begin and end of job range. ... ok
 testOverlap (test.TestFunctions) ... ok
 
 ----------------------------------------------------------------------
-Ran 13 tests in 0.272s
+Ran 16 tests in 1.074s
 
 OK
 ```

diff --git a/env.sh b/env.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
 curdir=`pwd`
-export PATH=$PATH:$curdir/DAZZ_DB:$curdir/DALIGNER:$curdir/nanocorrect:$curdir/poaV2:$curdir/ncbi-blast-2.4.0+/bin
+export PATH=$PATH:$curdir/DAZZ_DB:$curdir/DALIGNER:$curdir/nanocorrect:$curdir/poaV2:$curdir/ncbi-blast-2.4.0+/bin:$curdir/bwa:$curdir/samtools
 echo $PATH
 
diff --git a/inputData/pb_39.fasta b/inputData/pb_39.fasta
@@ -1,7 +1,7 @@
 >F_1
-GGTAG GCGCTCTGTGTGCAGC
+GGTAGGCGCTCTGTGTGCAGC
 >R_1
-AGAGTACTACATATGA GATGG
+AGAGTACTACATATGAGATGG
 >F_11
 GGTAGAGAGCATCTCTGTACT
 >R_11

diff --git a/install.sh b/install.sh
@@ -35,5 +35,32 @@ cd DAZZ_DB; git checkout 8cb2f29c4011a2c2; make
 
 #install blast
 cd $curdir
-wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.4.0+-x64-linux.tar.gz
+wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.4.0/ncbi-blast-2.4.0+-x64-linux.tar.gz
 tar -xzf ncbi-blast-2.4.0+-x64-linux.tar.gz
+
+#Install samtools
+cd $curdir
+git clone --recursive https://github.com/samtools/htslib.git
+cd htslib; git checkout 6bed35a3eaefa3baa2c7e0166ceba442212f166b;make
+
+#probably you need: apt-get install libncurses5-dev
+cd $curdir
+git clone --recursive https://github.com/samtools/samtools.git
+cd samtools; git checkout 897c0027a04501e3ea33d94b5cdeb633d010da8d; make
+
+#Instal bwa
+cd $curdir
+git clone https://github.com/lh3/bwa.git
+cd bwa; git checkout 5961611c358e480110793bbf241523a3cfac049b; make
+
+#Install nanopolish, we used a forked version that has a 'build in' supporting fraction filter
+cd $curdir
+git clone --recursive https://github.com/EvdH0/nanopolish.git
+cd nanopolish; git checkout 04fd9aecbb4ab266350476b957f4abb8ed994d8d; make 
+
+#Install GNU parallel, we don't actually use this for nanopolish, but kept it in as a possiblity
+cd $curdir
+wget http://ftp.gnu.org/gnu/parallel/parallel-20160922.tar.bz2
+tar xvjf parallel-20160922.tar.bz2
+cd parallel-20160922/
+./configure && make && sudo make install
diff --git a/integrationTest.sh b/integrationTest.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+
+#This is a basal integration test. It assumes the user correctly installed the dependencies in INSTALL.md, ran install.sh and env.sh in the current shell. This script will download a set of 75 sequences. All the sequences belong to pacbio barcode 01. A ~40 MB set of raw fast5 files will also be download, since fast5 files are needed for nanopolish. 
+#poreFUME is fired up, check whether this matches your local settings, such as cores and directory paths.
+#Finally, the output (CARD annotation) is compared to the pre-computed CARD annotation (in test/data) and should be similar.
+
+
+curdir=`pwd`
+echo $curdir
+
+if [ ! -f test/data/testSet75.tar.gz ]; then
+	cd test
+	cd data
+	wget http://www.student.dtu.dk/~evand/poreFUME_data/testSet75.tar.gz
+	tar -zxvf testSet75.tar.gz
+fi
+
+if [ ! -f inputData/testSet75.fasta ]; then
+	cd $curdir
+	cd inputData
+	wget http://www.student.dtu.dk/~evand/poreFUME_data/testSet75.fasta
+fi
+
+cd $curdir
+python poreFUME.py inputData/testSet75.fasta inputData/pb_39.fasta --PacBioLegacyBarcode --cores 8 --pathCARD=inputData/n.fasta.protein.homolog.fasta --pathNanocorrect=$curdir/nanocorrect/ --pathRawreads=$curdir/test/data/testSet75 --overwriteNanocorrect --pathNanopolish=$curdir/nanopolish/ --overwriteNanopolish --overwriteDemux --overwriteCARD
+
+if ! diff -q output/annotation/testSet75/testSet75.afterNP.annotated.csv test/data/testSet75.afterNP.annotated.csv > /dev/null  2>&1; then
+    echo "Integration test failed, the output in output/annotation/testSet75/testSet75.afterNP.annotated.csv is not what is expected"
+else
+    echo "Integration test passed!"
+fi
+
diff --git a/piper.py b/piper.py
@@ -0,0 +1,50 @@
+#### This little script filters the CIGAR strings and only when the alignment has a mimumum amount of bases it will be pushed to stdout
+import sys
+import re
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE,SIG_DFL) 
+
+import optparse
+
+parser = optparse.OptionParser()
+
+parser.add_option('-m', '--minAlignment',
+    action="store", dest="minAlignment",
+    help="set the minimal alignment length", default="spam")
+
+options, args = parser.parse_args()
+
+
+if __name__ == "__main__":
+    for line in sys.stdin:
+		if line[0:1] == "@":
+			sys.stdout.write(line)
+			continue
+#		sys.stderr.write("DEBUG: got line: " + line)
+        #sys.stdout.write(line)
+
+
+
+
+		regex = r"(\d+)M"
+
+		test_str = line.split('\t')[5]
+
+		matches = re.finditer(regex, test_str)
+
+		alignMatch = 0
+		for matchNum, match in enumerate(matches):
+		    matchNum = matchNum + 1
+
+#		    print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
+
+		    for groupNum in range(0, len(match.groups())):
+		        groupNum = groupNum + 1
+
+#		        print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
+		        alignMatch += int(match.group(groupNum))
+
+		if alignMatch > int(options.minAlignment):
+			sys.stdout.write(line)
+#		print alignMatch
+