-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSequence_Filtering.py
72 lines (63 loc) · 6.01 KB
/
Sequence_Filtering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import subprocess
import argparse
from argparse import RawTextHelpFormatter
__author__ = 'Andrew Thomas'
parser = argparse.ArgumentParser(description='This is a sequence filtering script for Illumina V4-V5 paired-end reads by Andrew Thomas.\nExample usage:\npython sequence_filtering.py -fq /work/data/ -wd /work/ -sc /home/16S_Analysis/ -s samples.txt\nThe sample-file mapping should be as follows:\nOct_12\tZLAM0113IQMSIPE01R01R1.fastq\tZLAM0113IQMSIPE01R01R2.fastq\n', formatter_class=RawTextHelpFormatter)
parser.add_argument('-fq','--fastq_dir', help="Absolute fastq input file directory",required=True)
parser.add_argument('-wd','--working_dir',help='Absolute working directory', required=True)
parser.add_argument('-sc','--script_dir', help="Absolute file directory for this repository",required=True)
parser.add_argument('-s','--sample_mapping',help='Mapping of sample names to file names', required=True)
args = parser.parse_args()
sample_files = {}
sample_mapping = open(args.sample_mapping, "r")
for line in sample_mapping.readlines():
line = line.rstrip()
line = line.split("\t")
sample_files[line[0]] = [line[1], line[2]]
sample_mapping.close()
# Edit directories
fastq_dir = args.fastq_dir
working_directory = args.working_dir
script_dir = args.script_dir
def write_report(sample):
total_seqs = int(subprocess.check_output("wc -l " + fastq_dir + sample_files[key][0], shell = True).rstrip().split()[0]) / 4
remaining_seqs = int(subprocess.check_output("wc -l " + key + "-Filtered.renamed.fna", shell = True).rstrip().split()[0]) / 2
joined_seqs = int(subprocess.check_output("wc -l " + key + "/" + key + "-join.fastq | cut -f1", shell = True).rstrip().split()[0]) / 4
unjoined_seqs = int(subprocess.check_output("wc -l " + key + "/" + key + "-un1 | cut -f1", shell = True).rstrip().split()[0]) / 4
fprimer_filtered = int(subprocess.check_output("cat " + key + "/" + key + "-Split-Libraries/split_library_log.txt | grep 'Num mismatches in primer exceeds' | cut -f2 -d':'", shell = True).lstrip())
size_filtered = int(subprocess.check_output("cat " + key + "/" + key + "-Split-Libraries/split_library_log.txt | grep 'Length outside bounds' | cut -f2", shell = True).lstrip())
ambiguous_bases = int(subprocess.check_output("cat " + key + "/" + key + "-Split-Libraries/split_library_log.txt | grep 'Num ambiguous bases exceeds' | cut -f2", shell = True).lstrip())
quality_score = int(subprocess.check_output("cat " + key + "/" + key + "-Split-Libraries/split_library_log.txt | grep 'Number of sequences where a low quality score' | cut -f2 -d':'", shell = True).lstrip())
homopolymer = int(subprocess.check_output("cat " + key + "/" + key + "-Split-Libraries/split_library_log.txt | grep 'Max homopolymer' | cut -f2", shell = True).lstrip())
rprimer_filtered = int(subprocess.check_output("cat " + key + "/" + key + "-Reverse-Primer-Split/rev_primer_truncation.log | grep 'Reverse primers not found' | cut -f2 -d':'", shell = True).lstrip())
percent_remain = float(remaining_seqs) / total_seqs * 100.0
subprocess.call("echo -e '" + key + "\t" + str(total_seqs) + "\t" + str(joined_seqs) + "\t" + str(unjoined_seqs) + "\t" + str(fprimer_filtered) + "\t" + str(rprimer_filtered) + "\t" + str(size_filtered) + "\t" + str(ambiguous_bases) + "\t" + str(quality_score) + "\t" + str(homopolymer) + "\t" + str(remaining_seqs) + "\t" + str(percent_remain) + "' >> Sequence.Filtering.Report", shell = True)
# Make directories
subprocess.call("mkdir Pre-Processing/", shell = True)
os.chdir(working_directory + "Pre-Processing")
subprocess.call("mkdir Sequence-Filtering/", shell = True)
os.chdir(working_directory + "Pre-Processing/Sequence-Filtering")
# Join paired reads + filter sequences
subprocess.call("echo -e 'Parameters for filtering sequences\nForward Primer mismatches: 2\nReverse Primer mismatches: 2\nAmbiguous Bases: 2\nMinimum length: 100\nMaximum length: 1000\nMax homopolymer: 10\nMin qual score: 20\nWindow size: 50\n' > Sequence.Filtering.Report", shell = True)
subprocess.call("echo -e 'Sample\tTotal_Reads\tJoined_Reads\tUnjoined_Reads\tForward_Mismatches\tReverse_Mismatches\tSize_Filtered\tAmbiguous_Bases\tQuality_Filtered\tHomopolymer_Filtered\tRemaining_Reads\tPercent_of_total' >> Sequence.Filtering.Report", shell = True)
for key in sample_files:
subprocess.call("mkdir " + key, shell = True)
os.chdir(key)
print "Joining paired-end reads for " + key
subprocess.call("fastq-join " + fastq_dir + sample_files[key][0] + " " + fastq_dir + sample_files[key][1] + " -o " + key + "-", shell = True)
subprocess.call("mv " + key + "-join " + key + "-join.fastq", shell = True)
p1 = subprocess.Popen(["/usr/bin/qiime", "convert_fastaqual_fastq.py", "-f", key + "-join.fastq", "-c", "fastq_to_fastaqual", "-o", "Fasta-Qual/"])
p1.wait()
print "Filtering sequences for " + key
p2 = subprocess.Popen(["/usr/bin/qiime", "split_libraries.py", "-f", "Fasta-Qual/*.fna", "-q", "Fasta-Qual/*.qual", "-l", "100", "-s", "20", "-a", "2", "-H", "10", "-w", "50", "-g", "-M", "1", "-b", "8", "-e", "1", "-o", key + "-Split-Libraries/", "-m", script_dir + "Utilities/Amplicon.map"])
p2.wait()
p3 = subprocess.Popen(["/usr/bin/qiime", "truncate_reverse_primer.py", "-f", key + "-Split-Libraries/seqs.fna", "-m", script_dir + "Utilities/Amplicon.map", "-z", "truncate_remove", "-M", "2", "-o", key + "-Reverse-Primer-Split/"])
p3.wait()
subprocess.call("mv " + key + "-Reverse-Primer-Split/seqs_rev_primer_truncated.fna " + key + "-Filtered.fna", shell = True)
subprocess.call("python " + script_dir + "Utilities/fasta_number.py " + key + "-Filtered.fna " + key + "_ > " + key + "-Filtered.renamed.fna", shell = True)
subprocess.call("rm " + key + "-Filtered.fna", shell = True)
subprocess.call("mv " + key + "-Filtered.renamed.fna " + working_directory + "Pre-Processing/Sequence-Filtering/" + key + "-Filtered.renamed.fna", shell = True)
os.chdir(str(working_directory + "Pre-Processing/Sequence-Filtering"))
write_report(key)
subprocess.call("sed -i 's/-e //g' Sequence.Filtering.Report", shell = True)