diff --git a/INSTALL.md b/INSTALL.md index bde7c77..bb5b723 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -23,6 +23,8 @@ poreFUME requires Python 2.7 or newer. Furthermore the following packages can be apt-get install build-essential apt-get install zlib1g-dev + + apt-get install libncurses5-dev #needed for samtools pip install biopython pip install numpy diff --git a/README.md b/README.md index ce6aa0d..15b2b62 100644 --- a/README.md +++ b/README.md @@ -10,14 +10,17 @@ Demultiplex, correct and annotate antibiotic resistance genes in nanopore data ``` usage: poreFUME.py [-h] [--PacBioLegacyBarcode] [--verbose] [--overwriteDemux] - [--overwriteNanocorrect] [--overwriteCARD] [--skipDemux] - [--skipDemuxCollect] [--skipNanocorrect] [--skipCARD] + [--overwriteNanocorrect] [--overwriteNanopolish] + [--overwriteCARD] [--skipDemux] [--skipDemuxCollect] + [--skipNanocorrect] [--skipNanopolish] [--skipCARD] [--match [MATCH]] [--mismatch [MISMATCH]] [--gapopen [GAPOPEN]] [--gapextend [GAPEXTEND]] [--cores [CORES]] [--barcodeThreshold [BARCODETHRESHOLD]] [--barcodeEdge [BARCODEEDGE]] [--pathNanocorrect [PATHNANOCORRECT]] - [--pathCARD [PATHCARD]] [--annotateAll] + [--pathNanopolish [PATHNANOPOLISH]] [--pathBWA [PATHBWA]] + [--pathRawreads [PATHRAWREADS]] [--pathCARD [PATHCARD]] + [--annotateAll] [--minCoverage [MINCOVERAGE]] fileONTreads fileBarcodes positional arguments: @@ -38,15 +41,19 @@ optional arguments: --overwriteNanocorrect overwrite the results in the output/nanocorrect/runid directory if the exist + --overwriteNanopolish + overwrite the results in the output/nanopolish/runid + directory, if the exist --overwriteCARD overwrite the results in the output/annotation/runid directory if the exist --skipDemux Skip the barcode demux step and proceed with nanocorrect, cannot be used with overwrite. Assumes the output/barcode/ and output/ directory are populated accordingly - --skipDemuxCollect will skip the demux it self and go to colletion based + --skipDemuxCollect will skip the demux it self and go to collection based on the pickle --skipNanocorrect Skip the nanocorrect step. + --skipNanopolish Skip the nanocorrect step. --skipCARD Skip the CARD annotation --match [MATCH] Score for match in alignment (default: 2.7) --mismatch [MISMATCH] @@ -64,6 +71,18 @@ optional arguments: --pathNanocorrect [PATHNANOCORRECT] Set the path to the nanocorrect files (default: /Users/evand/Downloads/testnanocorrect/nanocorrect/) + --pathNanopolish [PATHNANOPOLISH] + Set the path to the nanopolish files (default: + /Users/evand/Downloads/nanopolish/nanopolish/) + --pathBWA [PATHBWA] Set the path to BWA (default: + /Users/evand/Downloads/nanopolish/bwa) + --pathRawreads [PATHRAWREADS] + Set the path to the raw reads (.fast5 files), + nanopolish needs this. As a hint, this should be the + absolute path to which the last part of the header on + the poretools produced fasta file referes to. poreFUME + will make a symlink to the directory containing the + .fast5 files. --pathCARD [PATHCARD] Set the path to CARD fasta file (default: inputData/n.fasta.protein.homolog.fasta) @@ -72,12 +91,17 @@ optional arguments: this option all the files, raw, after demux, after 1st round of correction, after 2nd round of correction are annotated. This obviously takes longer. + --minCoverage [MINCOVERAGE] + sequences will only be nanopolish'ed if they have a + coverage that is higher than this threshold. (default: + 30) ``` ### Example -```poreFUME.py inputData/2DnanoporeData.fasta inputData/barcodes.fasta --PacBioLegacyBarcode --barcodeThreshold 50 --annotateAll --verbose --cores 8``` + +```python poreFUME.py inputData/2DnanoporeData.fasta inputData/barcodes.fasta --PacBioLegacyBarcode --cores 8 --pathCARD=inputData/n.fasta.protein.homolog.fasta --pathNanocorrect=/home/ubuntu/poreFUME/nanocorrect/ --pathRawreads=/home/ubuntu/poreFUME/test/data/testSet75 --pathNanopolish=/home/ubuntu/poreFUME/nanopolish/ --verbose``` ## Output : @@ -86,7 +110,8 @@ Folders ```output/barcode/mySample/```, ```output/nanocorrect/mySample/```, ```o The pipeline consists of 3 steps: 1. barcode demultiplexing 2. error correction using nanocorrect -3. annotation of the reads using the CARD datbase +3. polishing using nanopolish +4. annotation of the reads using the CARD datbase Each step can be skipped using the relevant ```--skipXXX``` parameter. For example when only barcodes need to extracted ```--skipNanocorrect``` and ```--skipCARD``` can be used. @@ -102,7 +127,11 @@ _Note: when poreFUME already find data in ```output/barcode/mySample/``` it will When ```--skipNanocorrect``` is not set (_default_) poreFUME will invoke [nanocorrect](https://github.com/jts/nanocorrect) to error correct the demultiplexed reads. Since nanocorrect needs its own directory to run in when ran in parallel, it will create ```/output/nanocorrect/mySample/{barcodeID}/``` directories. Inside [DALIGNER](https://github.com/thegenemyers/DALIGNER) and [poa](https://sourceforge.net/projects/poamsa/) will be run as called by nanocorrect. ```--pathNanocorrect``` can be set to point to the nanopore package. _Note: when poreFUME already find data in ```output/nanocorrect/mySample/``` it will terminate, this can be overruled by passing the ```--overwriteNanocorrect``` flag. This will remove all the existing data in ```output/nanocorrect/mySample/```_. -### Step 3. annotation using CARD +### Step 3. polishing of the nanocorrected data using nanopolish +When ```--skipNanopolish``` is not set (_default_) poreFUME will invoke [nanopolish](https://github.com/jts/nanopolish) to polish the nanocorrected data. Nanopolish will be run in an own directory , it will create ```/output/nanopolish/mySample/{barcodeID}/``` directories. ```--pathRawreads``` is required, and should point to the directory containing the .fast5 files. +_Note: when poreFUME already find data in ```output/nanopolish/mySample/``` it will terminate, this can be overruled by passing the ```--overwriteNanopolish``` flag. This will remove all the existing data in ```output/nanopolish/mySample/```_. + +### Step 4. annotation using CARD The final step is annotation of the data using the [CARD database](https://card.mcmaster.ca/) when ```--skipCARD``` is not set (_default_). This part will look for ```output/mySample.afterNC2.fasta``` and annotate the file against the CARD database. ```--pathCARD``` is used to point to the nucleotide CARD database. When ```--annotateAll``` is set, also ```inputFiles/mySample.fasta``` , ```output/mySample.afterBC.fasta``` and ```output/mySample.afterNC1.fasta``` will be annotated. The output is a CSV file in ```output/annotation/mySample/mySample.AFTERNC2.annotation.csv``` containing the readname and the relevant CARD information. _Note: with ```--skipCARD``` the output in ```output/annotation/mySample/``` will be overwritten_. @@ -111,11 +140,11 @@ With the ```--cores``` flag the following processes can be parallelized: * Smith-Waterman algorithm to detect barcodes * nanocorrect on multiple barcodes * BLAST to annotate with the CARD database. +* the ```nanopolish variants``` command is invoked using the -t (thread) parameter ## Testing To test the working of poreFUME you can run ```nosetests -v``` which should output something like ``` -ubuntu@ip-172-3:~/poreFUME$ nosetests -v test this ... ok testCARDavialable (test.TestCARD) ... ok testInputavialable (test.TestCARD) ... ok @@ -127,11 +156,14 @@ testDBsplit (test.TestDependencies) ... ok testF2DB (test.TestDependencies) ... ok testLAcat (test.TestDependencies) ... ok testPOA (test.TestDependencies) ... ok +testParallel (test.TestDependencies) ... ok +testSamtools (test.TestDependencies) ... ok +testbwaVersion (test.TestDependencies) ... ok job ranger returns index of begin and end of job range. ... ok testOverlap (test.TestFunctions) ... ok ---------------------------------------------------------------------- -Ran 13 tests in 0.272s +Ran 16 tests in 1.074s OK ``` diff --git a/env.sh b/env.sh index 6cf0fd0..db41456 100755 --- a/env.sh +++ b/env.sh @@ -1,6 +1,6 @@ #!/bin/bash curdir=`pwd` -export PATH=$PATH:$curdir/DAZZ_DB:$curdir/DALIGNER:$curdir/nanocorrect:$curdir/poaV2:$curdir/ncbi-blast-2.4.0+/bin +export PATH=$PATH:$curdir/DAZZ_DB:$curdir/DALIGNER:$curdir/nanocorrect:$curdir/poaV2:$curdir/ncbi-blast-2.4.0+/bin:$curdir/bwa:$curdir/samtools echo $PATH diff --git a/inputData/pb_39.fasta b/inputData/pb_39.fasta index d5309de..daeed45 100644 --- a/inputData/pb_39.fasta +++ b/inputData/pb_39.fasta @@ -1,7 +1,7 @@ >F_1 -GGTAG GCGCTCTGTGTGCAGC +GGTAGGCGCTCTGTGTGCAGC >R_1 -AGAGTACTACATATGA GATGG +AGAGTACTACATATGAGATGG >F_11 GGTAGAGAGCATCTCTGTACT >R_11 diff --git a/install.sh b/install.sh index e188126..86cfbc9 100755 --- a/install.sh +++ b/install.sh @@ -35,5 +35,32 @@ cd DAZZ_DB; git checkout 8cb2f29c4011a2c2; make #install blast cd $curdir -wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.4.0+-x64-linux.tar.gz +wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.4.0/ncbi-blast-2.4.0+-x64-linux.tar.gz tar -xzf ncbi-blast-2.4.0+-x64-linux.tar.gz + +#Install samtools +cd $curdir +git clone --recursive https://github.com/samtools/htslib.git +cd htslib; git checkout 6bed35a3eaefa3baa2c7e0166ceba442212f166b;make + +#probably you need: apt-get install libncurses5-dev +cd $curdir +git clone --recursive https://github.com/samtools/samtools.git +cd samtools; git checkout 897c0027a04501e3ea33d94b5cdeb633d010da8d; make + +#Instal bwa +cd $curdir +git clone https://github.com/lh3/bwa.git +cd bwa; git checkout 5961611c358e480110793bbf241523a3cfac049b; make + +#Install nanopolish, we used a forked version that has a 'build in' supporting fraction filter +cd $curdir +git clone --recursive https://github.com/EvdH0/nanopolish.git +cd nanopolish; git checkout 04fd9aecbb4ab266350476b957f4abb8ed994d8d; make + +#Install GNU parallel, we don't actually use this for nanopolish, but kept it in as a possiblity +cd $curdir +wget http://ftp.gnu.org/gnu/parallel/parallel-20160922.tar.bz2 +tar xvjf parallel-20160922.tar.bz2 +cd parallel-20160922/ +./configure && make && sudo make install diff --git a/integrationTest.sh b/integrationTest.sh new file mode 100755 index 0000000..f46dc26 --- /dev/null +++ b/integrationTest.sh @@ -0,0 +1,33 @@ +#!/bin/bash + + +#This is a basal integration test. It assumes the user correctly installed the dependencies in INSTALL.md, ran install.sh and env.sh in the current shell. This script will download a set of 75 sequences. All the sequences belong to pacbio barcode 01. A ~40 MB set of raw fast5 files will also be download, since fast5 files are needed for nanopolish. +#poreFUME is fired up, check whether this matches your local settings, such as cores and directory paths. +#Finally, the output (CARD annotation) is compared to the pre-computed CARD annotation (in test/data) and should be similar. + + +curdir=`pwd` +echo $curdir + +if [ ! -f test/data/testSet75.tar.gz ]; then + cd test + cd data + wget http://www.student.dtu.dk/~evand/poreFUME_data/testSet75.tar.gz + tar -zxvf testSet75.tar.gz +fi + +if [ ! -f inputData/testSet75.fasta ]; then + cd $curdir + cd inputData + wget http://www.student.dtu.dk/~evand/poreFUME_data/testSet75.fasta +fi + +cd $curdir +python poreFUME.py inputData/testSet75.fasta inputData/pb_39.fasta --PacBioLegacyBarcode --cores 8 --pathCARD=inputData/n.fasta.protein.homolog.fasta --pathNanocorrect=$curdir/nanocorrect/ --pathRawreads=$curdir/test/data/testSet75 --overwriteNanocorrect --pathNanopolish=$curdir/nanopolish/ --overwriteNanopolish --overwriteDemux --overwriteCARD + +if ! diff -q output/annotation/testSet75/testSet75.afterNP.annotated.csv test/data/testSet75.afterNP.annotated.csv > /dev/null 2>&1; then + echo "Integration test failed, the output in output/annotation/testSet75/testSet75.afterNP.annotated.csv is not what is expected" +else + echo "Integration test passed!" +fi + diff --git a/piper.py b/piper.py new file mode 100644 index 0000000..4a67fc5 --- /dev/null +++ b/piper.py @@ -0,0 +1,50 @@ +#### This little script filters the CIGAR strings and only when the alignment has a mimumum amount of bases it will be pushed to stdout +import sys +import re +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE,SIG_DFL) + +import optparse + +parser = optparse.OptionParser() + +parser.add_option('-m', '--minAlignment', + action="store", dest="minAlignment", + help="set the minimal alignment length", default="spam") + +options, args = parser.parse_args() + + +if __name__ == "__main__": + for line in sys.stdin: + if line[0:1] == "@": + sys.stdout.write(line) + continue +# sys.stderr.write("DEBUG: got line: " + line) + #sys.stdout.write(line) + + + + + regex = r"(\d+)M" + + test_str = line.split('\t')[5] + + matches = re.finditer(regex, test_str) + + alignMatch = 0 + for matchNum, match in enumerate(matches): + matchNum = matchNum + 1 + +# print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group())) + + for groupNum in range(0, len(match.groups())): + groupNum = groupNum + 1 + +# print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum))) + alignMatch += int(match.group(groupNum)) + + if alignMatch > int(options.minAlignment): + sys.stdout.write(line) +# print alignMatch + diff --git a/poreFUME.py b/poreFUME.py index 58a35b5..587a471 100644 --- a/poreFUME.py +++ b/poreFUME.py @@ -45,7 +45,8 @@ def main(): 1a. Call demux() 1b. Call demuxCollect() 2. Call nanocorrect() - 3. Call annotateCARD() + 3. Call nanopolish() + 4. Call annotateCARD() done All the steps can be turned on and off using --skipXXX this allows flexible analysis and intermidate exits. @@ -53,7 +54,8 @@ def main(): 1a Demux uses the Smith-Waterman implementation which is very slow on this dataset. Relevant parameters are --barcodeEdge , --barcodeThreshold and --match, --mismatch, --gapopen, --gapextend. The returned sequences will be reverse complemented such that the read always starts with the forward primer. The collected reads are stored in mysample.AFTERBC.fasta 2. nanocorrect uses nanocorrect by Loman,Simpson,Quick and is also slow. Relevant parameters are --pathNanocorrect. The collected reads are stored as mysample.AFTERNC1.fasta (first round of nanocorrect) and mysample.AFTERNC2.fasta (second round of nanocorrect without the coverage requirement) - 3. annotateCARD find high scoring segmens in the result list, which is not extremly fast. --annotateAll will invoke the annotation of the start and intermidiate files (ie. the afterBC and afterNC1 ) + 3. nanopolish uses nanopolish by Loman,Simpson,Quick. + 4. annotateCARD find high scoring segmens in the result list, which is not extremly fast. --annotateAll will invoke the annotation of the start and intermidiate files (ie. the afterBC and afterNC1 ) @@ -86,6 +88,8 @@ def main(): type=str) parser.add_argument("fileBarcodes", help="path to FASTA where the barcodes are stored, format should be ie F_34 for forward and R_34 for reverse barcode", type=str) + + parser.add_argument("--PacBioLegacyBarcode",help="the pacbio_barcodes_paired.fasta file has first digist as 4 instead of 04, turning this option on will fix this", action="store_true") @@ -99,6 +103,9 @@ def main(): parser.add_argument("--overwriteNanocorrect",help="overwrite the results in the output/nanocorrect/runid directory if the exist", action="store_true") + + parser.add_argument("--overwriteNanopolish",help="overwrite the results in the output/nanopolish/runid directory, if the exist", + action="store_true") @@ -112,6 +119,10 @@ def main(): parser.add_argument("--skipNanocorrect",help="Skip the nanocorrect step.", action="store_true") + + parser.add_argument("--skipNanopolish",help="Skip the nanocorrect step.", + action="store_true") + parser.add_argument("--skipCARD",help="Skip the CARD annotation", action="store_true") @@ -123,11 +134,19 @@ def main(): parser.add_argument("--cores",help="Amount of args.cores to use for multiprocessing (default: %(default)s)",nargs='?', default=1,type=int) parser.add_argument("--barcodeThreshold",help="Minimum score for a barcode pair to pass (default: %(default)s)",nargs='?', default=58,type=int) #58 was used on the 'lib A set', 54 on porecamp? parser.add_argument("--barcodeEdge",help="Maximum amount of bp from the edge of a read to look for a barcode. (default: %(default)s)",nargs='?', default=60,type=int) #60 was used on the 'lib A set' , 120 on the lib B since it had a different experimental ligation protocol + + parser.add_argument("--pathNanocorrect",help="Set the path to the nanocorrect files (default: %(default)s)",nargs='?', default="/Users/evand/Downloads/testnanocorrect/nanocorrect/",type=str) #make sure it has an extra copy of nanocorrect with a coverage of 0 in it - parser.add_argument("--pathCARD",help="Set the path to CARD fasta file (default: %(default)s)",nargs='?', default="inputData/n.fasta.protein.homolog.fasta",type=str) #make sure it has an extra copy of nanocorrect with a coverage of 0 in it + parser.add_argument("--pathNanopolish",help="Set the path to the nanopolish files (default: %(default)s)",nargs='?', default="/Users/evand/Downloads/nanopolish/nanopolish/",type=str) #location of nanopolish + parser.add_argument("--pathBWA",help="Set the path to BWA (default: %(default)s)",nargs='?', default="/Users/evand/Downloads/nanopolish/bwa",type=str) #location of nanopolish + + parser.add_argument("--pathRawreads",help="Set the path to the raw reads (.fast5 files), nanopolish needs this. As a hint, this should be the absolute path to which the last part of the header on the poretools produced fasta file referes to. poreFUME will make a symlink to the directory containing the .fast5 files.",nargs='?', default="inputData/NB6",type=str) #location of fast5 files as refered to in the datafiles created by poreTools. See nanopolish docs for more info + + parser.add_argument("--pathCARD",help="Set the path to CARD fasta file (default: %(default)s)",nargs='?', default="inputData/n.fasta.protein.homolog.fasta",type=str) parser.add_argument("--annotateAll",help="By default only the final (demuxed and two times corrected) dataset is annotated, however by turning on this option all the files, raw, after demux, after 1st round of correction, after 2nd round of correction are annotated. This obviously takes longer.", action="store_true") + parser.add_argument("--minCoverage",help="sequences will only be nanopolish'ed if they have a coverage that is higher than this threshold. (default: %(default)s)",nargs='?', default=30,type=int) #Jared suggested using at least 30x coverage @@ -215,7 +234,20 @@ def main(): logger.info('LAcat found in path') + if not cmdExists('bwa'): + raise RuntimeError('bwa is not avialable in PATH, did you run . env.sh? Check install.sh or install manually from https://github.com/lh3/bwa') + else: + logger.info('bwa found in path') + if not cmdExists('samtools'): + raise RuntimeError('samtools is not avialable in PATH, did you run . env.sh? Check install.sh or install manually from https://github.com/samtools/samtools') + else: + logger.info('samtools found in path') + + if not cmdExists('parallel'): + raise RuntimeError('GNU parallel is not avialable, did you install it correctly? Check install.sh or install manually from https://www.gnu.org/software/parallel/') + else: + logger.info('GNU parallel found') if not os.path.isfile(args.fileONTreads): raise IOError('fileONTreads does not exist',args.fileONTreads) @@ -260,7 +292,35 @@ def main(): #TODO: build a flag so this can be overwritten. Smart solution to store these files anyway logger.info(nanocorrectDir + ' existst but was empty, so proceed') os.makedirs(nanocorrectDir) #create it - + + + if not args.skipNanopolish: #Do a check for clean directory now instead of when demux is done + if not os.path.exists(args.pathNanopolish): + raise IOError('the pathNanopolish is not valid!') + + ##Direcotry handeling + nanopolishDir = getNanopolishDir(baseFileName) + if not os.path.exists(nanopolishDir): #it it does not exist + os.makedirs(nanopolishDir) #create it + logger.info(nanopolishDir + ' did not exist, created it') + else: #it exists + logger.info(nanopolishDir + ' already exists') + if args.overwriteNanopolish: #if we are sure to overwrite the existing nanopolish data? + shutil.rmtree(nanopolishDir) + os.makedirs(nanopolishDir) #create it + logger.info(nanopolishDir + ' emptied because of --overwriteNanopolish flag') + else: + try: + os.rmdir(nanopolishDir) #remove it + + except OSError: #Cannot be removed, because it is not empty + raise RuntimeError('The nanopolish directory is not empty! Is there a previous run present? --overwriteNanopolish can be used to proceed', nanopolishDir) + #TODO: build a flag so this can be overwritten. Smart solution to store these files anyway + logger.info(nanopolishDir + ' existst but was empty, so proceed') + os.makedirs(nanopolishDir) #create it + if not os.path.exists(args.pathRawreads): #When we run nanopolish we need to have the raw reads defined + raise IOError('the pathRawreads is not valid! This should point to your .fast5 files, see doc.') + if not args.skipCARD: #Do a check for clean directory now instead of when demux is done ##Direcotry handeling annotationDir = getAnnotationDir(baseFileName) @@ -284,6 +344,9 @@ def main(): os.makedirs(annotationDir) #create it + ##################################### + #Call all the relevant sub routines # + ##################################### if not args.skipDemux: @@ -300,7 +363,13 @@ def main(): nanocorrect(baseFileName,args) else: logger.info('Skip the nanocorrect step because of --skipNanocorrect') + + if not args.skipNanopolish: + nanopolish(baseFileName,args) + else: + logger.info('Skip the nanopolish step because of --skipNanopolish') + if not args.skipCARD: annotateCARD(baseFileName,args) else: @@ -329,6 +398,12 @@ def getBarcodeDir(baseFileName): def getNanocorrectDir(baseFileName): return os.path.join('output','nanocorrect',baseFileName) +def getNanocorrectDirABS(baseFileName): + return os.path.join(os.getcwd(),'output','nanocorrect',baseFileName) + +def getNanopolishDir(baseFileName): + return os.path.join('output','nanopolish',baseFileName) + def getAnnotationDir(baseFileName): return os.path.join('output','annotation',baseFileName) @@ -730,7 +805,7 @@ def deMux(baseFileName,args): if inputLength" + str(thisRead.id) + '\n' ) + readFile.write( str(thisRead.seq) + '\n') + readFile.close() + + # ../bwa/bwa index 37.poreCamp.2D.min500.OK.afterNC2.fasta + runCmd = [os.path.join('bwa'), + 'index', + ".".join([str(thisBarcode),baseFileName,thisRead.id,'fasta'])] + p0 = Popen(runCmd, + stdout=PIPE,stderr=tempFile,cwd= os.path.join(getNanopolishDir(baseFileName),str(thisBarcode))) + + logger.info(" ".join(runCmd)) + print p0.communicate() + p0.wait() + + + + # ../bwa/bwa mem -x ont2d -t 4 37.poreCamp.2D.min500.OK.afterNC2.fasta 37.poreCamp.2D.min500.OK.afterBC.fasta | samtools view -Sb - | samtools sort -f - reads.sorted.bam + runCmd = [os.path.join('bwa'), + 'mem', + '-x', + 'ont2d', + '-t', + str(args.cores) , + ".".join([str(thisBarcode),baseFileName,thisRead.id,'fasta']), + os.path.join(getNanocorrectDirABS(baseFileName),str(thisBarcode),".".join([str(thisBarcode),baseFileName,'afterBC.fasta']))] + p1 = Popen(runCmd, + stdout=PIPE,stderr=tempFile,cwd= os.path.join(getNanopolishDir(baseFileName),str(thisBarcode))) + logger.info(" ".join(runCmd)) + minAlign = int(len(thisRead.seq)*float(0.9)) + logger.info("Setting the minimum alignment to %s, of the total length %s",minAlign,len(thisRead.seq)) + runCmd = ['python', os.path.join(os.getcwd(),'piper.py'), + '--minAlignment', + str(minAlign)] + p15 = Popen(runCmd, + stdin=p1.stdout,stdout=PIPE,stderr=tempFile,cwd= os.path.join(getNanopolishDir(baseFileName),str(thisBarcode))) + logger.info(" ".join(runCmd)) + + runCmd = ['samtools', + 'view', + '-Sb', + '-'] + p2 = Popen(runCmd, + stdin=p15.stdout, stdout=PIPE,stderr=tempFile,cwd= os.path.join(getNanopolishDir(baseFileName),str(thisBarcode))) + logger.info(" ".join(runCmd)) + + + runCmd = ['samtools', + 'sort', + + '-o', + 'reads.sorted.bam'] + p3 = Popen(runCmd, + stdin=p2.stdout, stdout=PIPE,stderr=tempFile,cwd= os.path.join(getNanopolishDir(baseFileName),str(thisBarcode))) + logger.info(" ".join(runCmd)) + p3.wait() + + #samtools index reads.sorted.bam + runCmd = ['samtools', + 'index', + 'reads.sorted.bam'] + p0 = Popen(runCmd, + stdout=PIPE,stderr=tempFile,cwd= os.path.join(getNanopolishDir(baseFileName),str(thisBarcode))) + logger.info(" ".join(runCmd)) + print p0.communicate() + p0.wait() + + proc = Popen(["samtools depth reads.sorted.bam | awk '{sum+=$3} END { print sum/NR}'"], stdout=PIPE, shell=True,cwd= os.path.join(getNanopolishDir(baseFileName),str(thisBarcode))) ####!!!!! Running in shell !!!!! + logger.info(" ".join(runCmd)) + try: + thisCoverage = float(proc.communicate()[0]) + except: + logger.error("Coverage depth of %s was not calculated succesfully, no reads mapped?. Continue with next read!",thisRead.id) + thisCoverage = 0 + + + proc.wait() + if thisCoverage polished_genome.fa + + + ###Collect polished data + + logger.info("Export nanopolish files from the allPolished to the .afterNP.") + storeRecords = [] + for thisBarcode in nanoBarcodes: + try: + handleInput = open(os.path.join(getNanopolishDir(baseFileName),str(thisBarcode),'allPolished.fasta'), "r") #opens up output/nanopolish/mysample/43/allPolished.fasta + + + except IOError as errorCode: + if errorCode.errno ==errno.ENOENT: #Source file doesnt exist, can happen if nanopolish didnt yield any files + logger.warning('While collecting the nanopolish results the followimg error occured:'+ str(errorCode)) + continue + else: + raise errorCode + + #append the barcode information in the readname + for record in SeqIO.parse(handleInput, "fasta"): + record.id = 'BC_' + str(thisBarcode) + '_' + record.description + record.description = '' + storeRecords.append(record) + + handleInput.close() + + handleOutput = open(os.path.join('output', baseFileName + '.afterNP.fasta'), "w") #write to output/mysample.afterBC.fasta + SeqIO.write(storeRecords, handleOutput, "fasta") + handleOutput.close() + logger.info("nanopolish is done") def annotateCARD(baseFileName,args): """ - Step 3. Use a BLAST against the CARD database to obtain an annotation + Step 4. Use a BLAST against the CARD database to obtain an annotation """ logger.info("Start CARD annotation") @@ -1021,11 +1451,12 @@ def annotateCARD(baseFileName,args): 'afterBC' : os.path.join('output' , baseFileName + '.afterBC.fasta'), 'afterNC1' : os.path.join('output', baseFileName + '.afterNC1.fasta'), 'afterNC2' : os.path.join('output', baseFileName + '.afterNC2.fasta'), + 'afterNP' : os.path.join('output' ,baseFileName + '.afterNP.fasta') } else: #only annotate final set qpath = { - - 'afterNC2' : os.path.join('output' ,baseFileName + '.afterNC2.fasta'), + 'afterNP' : os.path.join('output' ,baseFileName + '.afterNP.fasta') + ,'afterNC2' : os.path.join('output' ,baseFileName + '.afterNC2.fasta'), } #for thisQueryFile in qpath: @@ -1091,3 +1522,5 @@ def annotateCARD(baseFileName,args): if __name__ == "__main__": main() + + diff --git a/test.py b/test.py index b43772a..5829042 100644 --- a/test.py +++ b/test.py @@ -3,7 +3,7 @@ import unittest import os - +from subprocess import Popen, PIPE from argparse import Namespace @@ -74,6 +74,18 @@ def testDBdust(self): def testLAcat(self): self.assertTrue(poreFUME.cmdExists('LAcat')) + + def testSamtools(self): + self.assertTrue(poreFUME.cmdExists('samtools')) + + def testParallel(self): + self.assertTrue(poreFUME.cmdExists('parallel')) + + def testbwaVersion(self): + process = Popen(['bwa' ], stdout=PIPE, stderr=PIPE ) + stdout, stderr = process.communicate() + process.wait() #wait till finished + self.assertTrue(stderr.split('\n')[2].split(': ')[1].rstrip() == '0.7.15-r1142-dirty') if __name__ == '__main__': diff --git a/test/data/testSet75.afterNP.annotated.csv b/test/data/testSet75.afterNP.annotated.csv new file mode 100644 index 0000000..feb0add --- /dev/null +++ b/test/data/testSet75.afterNP.annotated.csv @@ -0,0 +1,4 @@ +,CARD:qseqid,CARD:sseqid,CARD:pident,CARD:length,CARD:mismatch,CARD:gapopen,CARD:qstart,CARD:qend,CARD:sstart,CARD:send,CARD:evalue,CARD:bitscore,CARD:id1,CARD:ARO,CARD:GeneName,CARD:subjectGeneStart,CARD:subjectGeneEnd,CARD:subjectGeneLength,CARD:coverageOfSubjectGene +0,BC_1_31,gb|GQ343019|132-1023|ARO:3002999|CblA-1,98.665,899,2,7,69,964,1,892,0.0,1585,GQ343019,ARO:3002999,CblA-1,132,1023,891,100 +1,BC_1_6,gb|GQ343019|132-1023|ARO:3002999|CblA-1,98.662,897,3,9,52,944,1,892,0.0,1581,GQ343019,ARO:3002999,CblA-1,132,1023,891,100 +2,BC_1_0,gb|GQ343019|132-1023|ARO:3002999|CblA-1,96.916,908,6,14,89,994,1,888,0.0,1502,GQ343019,ARO:3002999,CblA-1,132,1023,891,100