-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgene_finder.py
289 lines (249 loc) · 8.45 KB
/
gene_finder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
"""
Library for finding potential genes in a strand of DNA.
"""
import helpers
def get_complement(nucleotide):
    """
    A function that looks up the complementary nucleotide
    of the nucleotide passed in.

    Args:
        nucleotide: a string representing a nucleotide
            ("A", "T", "G" or "C").

    Returns:
        A string representing the complement of the given
        nucleotide, or None if the argument is not one of
        the four upper-case nucleotide letters.
    """
    base_pairs = (("A", "T"), ("T", "A"), ("G", "C"), ("C", "G"))
    for base, partner in base_pairs:
        if nucleotide == base:
            return partner
    # Anything that is not a recognised nucleotide has no complement.
    return None
def get_reverse_complement(strand):
    """
    A function that builds the reverse complement of a
    strand of DNA.

    Args:
        strand: a string representing a strand of DNA.

    Returns:
        A string made of the complement of every nucleotide
        in the given strand, in reverse order.
    """
    # Complement each nucleotide in reading order first, then flip
    # the resulting sequence.
    complements = [get_complement(base) for base in strand]
    return "".join(complements[::-1])
def rest_of_orf(strand):
    """
    A function that reads a strand of DNA codon by codon,
    starting at its first nucleotide, and cuts it off at the
    first in-frame stop codon.

    Args:
        strand: a string representing the strand of DNA
            to be analyzed (expected to begin with a start
            codon).

    Returns:
        A string representing the ORF: everything before the
        first in-frame stop codon ("TAA", "TAG" or "TGA"), or
        the entire strand if no in-frame stop codon exists.
    """
    stop_codons = ("TAA", "TAG", "TGA")
    # Visit only the positions where a complete codon begins.
    for codon_start in range(0, len(strand) - 2, 3):
        if strand[codon_start:codon_start + 3] in stop_codons:
            # The stop codon itself is not part of the ORF.
            return strand[:codon_start]
    # No stop codon in this frame: the whole strand is the ORF.
    return strand
def find_all_orfs_one_frame(strand):
    """
    A function that scans a strand of DNA for ORFs in a
    single reading frame (codons starting at index 0, 3,
    6, ...) and returns a list of all ORFs found.

    An ORF begins at an in-frame start codon ("ATG") and
    runs up to, but not including, the next in-frame stop
    codon ("TAA", "TAG" or "TGA"). If no in-frame stop codon
    follows a start codon, the ORF extends to the end of the
    strand. Start codons nested inside an ORF that was
    already found are not reported separately.

    Args:
        strand: a string representing the strand of DNA
            to be analyzed.

    Returns:
        A list of strings representing all ORFs found in the
        analyzed strand of DNA, in order of appearance. The
        list is empty if the frame has no start codon.
    """
    stop_codons = ("TAA", "TAG", "TGA")
    orf_list = []
    length = len(strand)
    start = 0
    while start + 3 <= length:
        if strand[start:start + 3] != "ATG":
            # Not a start codon; move on to the next codon in frame.
            start += 3
            continue
        # Walk forward one codon at a time looking for a stop codon.
        # Only complete, in-frame codons are inspected, so trailing
        # nucleotides that merely spell a stop codon out of frame
        # never truncate the ORF.
        end = start + 3
        while end + 3 <= length and strand[end:end + 3] not in stop_codons:
            end += 3
        if end + 3 > length:
            # Ran out of complete codons without meeting a stop codon:
            # the ORF is the rest of the strand and nothing can follow.
            orf_list.append(strand[start:])
            break
        orf_list.append(strand[start:end])
        # Resume scanning just past the stop codon.
        start = end + 3
    return orf_list
def find_all_orfs(strand):
    """
    A function that finds ORFs in all three reading frames
    of a strand of DNA: the strand as given, and the strand
    read starting from its second and third nucleotides.

    Each shifted frame is obtained by dropping leading
    nucleotides rather than rotating the strand, so no codon
    is ever assembled from nucleotides that wrap around from
    the end of the strand to its beginning.

    Args:
        strand: a string representing the strand of DNA
            to be analyzed.

    Returns:
        A list containing all ORFs found in the strand of
        DNA, ordered by frame offset (0, then 1, then 2).
    """
    all_orf_shifts = []
    for offset in range(3):
        # strand[offset:] starts reading at the given frame offset.
        all_orf_shifts.extend(find_all_orfs_one_frame(strand[offset:]))
    return all_orf_shifts
def find_all_orfs_both_strands(strand):
    """
    A function that collects the ORFs of a strand of DNA
    together with the ORFs of its reverse complement.

    Args:
        strand: a string representing the strand of DNA
            to be analyzed.

    Returns:
        A list containing the ORFs found on the reverse
        complement strand followed by the ORFs found on
        the given strand.
    """
    # The reverse complement is searched first, so its ORFs lead
    # the combined list.
    reverse_strand_orfs = find_all_orfs(get_reverse_complement(strand))
    forward_strand_orfs = find_all_orfs(strand)
    return [*reverse_strand_orfs, *forward_strand_orfs]
def find_longest_orf(strand):
    """
    A function that picks the longest ORF out of all ORFs
    found on a strand of DNA and its reverse complement.

    Args:
        strand: a string representing the strand of DNA
            to be analyzed.

    Returns:
        A string representing the longest ORF found, or
        None if no ORF exists on either strand. When several
        ORFs share the greatest length, the one listed last
        is returned.
    """
    candidates = find_all_orfs_both_strands(strand)
    if not candidates:
        return None
    # Searching from the back makes ties resolve to the last-listed
    # ORF, exactly as taking the final element of a stable
    # ascending sort by length would.
    return max(reversed(candidates), key=len)
def noncoding_orf_threshold(strand, num_trials):
    """
    A function that repeatedly shuffles a strand of DNA,
    finds the longest ORF of each shuffled strand, and
    reports the length of the shortest of those longest
    ORFs.

    Args:
        strand: a string representing the strand of DNA
            to be analyzed.
        num_trials: a positive integer representing the
            number of randomization trials to hold.

    Returns:
        min_orf_len: an integer representing the length
            of the shortest ORF among the longest ORFs of
            all trials, or 0 if no trial produced any ORF
            (including when num_trials is 0).
    """
    longest_lengths = []
    for _ in range(num_trials):
        # helpers.shuffle presumably returns a randomly reordered copy
        # of the strand - confirm against the helpers module.
        longest = find_longest_orf(helpers.shuffle(strand))
        # find_longest_orf returns None when a shuffled strand has no
        # ORF at all; such trials contribute nothing to the threshold.
        if isinstance(longest, str):
            longest_lengths.append(len(longest))
    # default=0 avoids a ValueError when no trial yielded an ORF.
    min_orf_len = min(longest_lengths, default=0)
    return min_orf_len
def encode_amino_acids(orf):
    """
    A function that translates an ORF into the amino acids
    it encodes, one codon at a time.

    Args:
        orf: a string representing the orf to be analyzed.
            Up to two trailing nucleotides that do not form
            a complete codon are ignored.

    Returns:
        A string formed by joining the results of
        helpers.amino_acid for every complete codon of the
        ORF, in order (presumably one symbol per amino
        acid - confirm against the helpers module).
    """
    # Only whole codons can be translated; discard the 0-2 leftover
    # nucleotides at the end.
    usable_length = len(orf) - len(orf) % 3
    codons = (
        orf[codon_start:codon_start + 3]
        for codon_start in range(0, usable_length, 3)
    )
    return "".join(helpers.amino_acid(codon) for codon in codons)
def find_genes(path):
    """
    A function that loads a FASTA file of nucleotides and
    returns the amino acid sequences encoded by its ORFs
    that are long enough to be considered significant.

    The significance threshold is computed with
    noncoding_orf_threshold over 1500 shuffles of the loaded
    strand, and is printed to standard output.

    Args:
        path: a string representing the file path where
            the FASTA file to be analyzed is located.

    Returns:
        all_acids: a list of the amino acid strings encoded
            by the ORFs of both strands, keeping only those
            whose length is at least one third (rounded
            down) of the threshold ORF length.
    """
    # helpers.load_fasta_file presumably returns the DNA sequence as a
    # single string - confirm against the helpers module.
    strand = helpers.load_fasta_file(path)
    min_length = int(noncoding_orf_threshold(strand, 1500))
    print(min_length)
    # Translate every ORF first, then filter on the translated length.
    encoded_orfs = [
        encode_amino_acids(orf)
        for orf in find_all_orfs_both_strands(strand)
    ]
    # The threshold is measured in nucleotides; three nucleotides
    # encode one amino acid.
    min_acid_count = min_length // 3
    all_acids = [
        acids for acids in encoded_orfs if len(acids) >= min_acid_count
    ]
    return all_acids