-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_reads_cutadapt.py
132 lines (102 loc) · 6.18 KB
/
clean_reads_cutadapt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/anaconda3/bin/python3.6
# Description: Use this script to remove contaminant adapter sequences using CutAdapt from your sequence data.
#
# Outputs: This script generates fastq files containing cleaned reads. The orientation of your reads must be indicated with "R1" or "R2" in the file names.
#
# Software requirements: CutAdapt (https://cutadapt.readthedocs.io/en/stable/guide.html#removing-adapters) must be installed. The following Python modules are also required - Biopython, wget.
#
# Usage: python clean_reads_cutadapt.py [full path name of directory containing raw/demultiplexed reads] [name of directory for CutAdapt outputs] [prefix of the adapters you want removed]
##################
# Import modules #
##################
import glob, heapq, os, shutil, subprocess as sub, sys, wget
from Bio import Seq, SeqIO
#########################################################################################
# Pass path name of directory in which raw/demultiplexed reads are stored to a variable #
#########################################################################################
rawReadsDir = sys.argv[1]
#####################################
# Specify pipeline output directory #
#####################################
resultsDir = "%s/processing_results" % os.path.dirname(rawReadsDir)
##################################################################################
# Pass path name of directory in which cleaned reads will be saved to a variable #
##################################################################################
cleanDir = "%s/%s" % (resultsDir, sys.argv[2])
###########################################################################
# Provide string containing identifier to isolate adapter(s) to be tested #
###########################################################################
adaptPrefix = sys.argv[3]
#######################################################
# Create output directory if it doesn't already exist #
#######################################################
if os.path.exists(cleanDir) != True:
os.mkdir(cleanDir)
else:
shutil.rmtree(cleanDir)
os.mkdir(cleanDir)
##########################################################
# Download FASTA file listing adapter sequences to check #
##########################################################
filename = "%s/sequencing_adapters.fa" % resultsDir
url = "https://raw.githubusercontent.com/NCBI-Hackathons/OnlineAdapterDatabase/master/datasources/sequencing_adapters.fa"
wget.download(url, filename)
#########################
# Get names of adapters #
#########################
getAdaptSeqNames = sub.Popen('grep ">%s" %s | sed -e "s/>//g"' % (adaptPrefix, filename), shell = True, stdout = sub.PIPE)
seqNames = getAdaptSeqNames.stdout.read().decode("utf-8").strip().split("\n")
#############################
# Extract adapter sequences #
#############################
seqsF = [] # create a list to store the forward sequences
for record in SeqIO.parse(filename, "fasta"): # create an iterable sequence object
if record.id in seqNames:
seqsF.append(str(record.seq)) # add the sequence to the list
if adaptPrefix.casefold() == "Truseq".casefold():
if record.id == "TruSeqUniversalAdapter":
rc = str(Seq.reverse_complement(record.seq)) # find the reverse complement of the Truseq universal adapter
#############################################################
# Write string joining adapter sequences to run in CutAdapt #
#############################################################
seqsF.insert(0, "")
adaptStr = (" -a ").join(seqsF)
adaptStr + " -A %s" % rc
##########################################################################
# Determine substring to remove from file names to retrieve sample names #
##########################################################################
files = glob.glob("%s/*_R1_*fastq*" % rawReadsDir) # list of paths to the sequence files that will be processed with CutAdapt
filenames = sorted([os.path.basename(x) for x in files]) # extract base names from the file paths
nameLens = [len(i) for i in filenames] # list of lengths of the file base names
substrList = [] # empty list to store substrings
for j in filenames: # determine all possible substrings of each base name up to the maximum length, which is set to the minimum value of the base name lengths
for k in range(0, min(nameLens)):
substrList.append(j[-k:])
substrDict = {} # create dictionary storing the number of occurrences of each possible substring
for l in substrList:
substrDict[l] = substrList.count(l) # key = substring, value = number of occurrences
maxSubstrOccur = heapq.nlargest(1, list(set(list(substrDict.values())))) # calculate the largest and second largest number of occurrences of a substring
substrReplaceDict = {} # dictionary to store final set of substrings and their replacements
for m in maxSubstrOccur: # get substrings that occurred the most number of times
maxSubstrDict = { key:value for key, value in substrDict.items() if value == m } # keep only those substrings in the dictionary that occur the most number of times
maxSubstrKeys = list(maxSubstrDict.keys()) # get the substrings of the subsetted dictionary
for n in range(0, len(maxSubstrKeys)):
substrReplaceDict[maxSubstrKeys[n]] = "" # append to final dictionary of substrings
############################
# Print substrings to file #
############################
substrR1 = "\n".join(list(substrReplaceDict))
outFile = open("%s/filename_substrings_to_remove.txt" % resultsDir, "w")
outFile.write("\n".join(substrR1))
outFile.close()
###################################################
# Clean reads of adapter sequences using CutAdapt #
###################################################
for f1 in files:
for o, p in substrReplaceDict.items():
sample = os.path.basename(f1).replace(o, p)
f2 = glob.glob("%s/%s*_R2_*fastq*" % (rawReadsDir, sample))[0]
cmd = "cutadapt%s --times 26 --minimum-length 20 -q 10 -o %s/%s_R1_cutadapt.fastq.gz -p %s/%s_R2_cutadapt.fastq.gz %s %s --cores=18" % (adaptStr, cleanDir, sample, cleanDir, sample, f1, f2)
p = sub.Popen(cmd, shell = True)
p.communicate()
print("Adapter sequences removed from sequence files of sample %s." % sample)