-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelpers.py
111 lines (93 loc) · 3.5 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""
Helper constants and functions for Gene Finder.
Author: Steve Matsumoto <@syclops>
"""
import random
# Use these for convenient notation in the amino_acid implementation. These are
# to help implementation and are not designed to be human-readable. For a more
# readable list of codons/amino acids, see
# https://en.wikipedia.org/wiki/DNA_codon_table instead.
# The two purine nucleotides. amino_acid() tests the third nucleotide of a
# codon against this set to choose between the two entries of a branched pair.
PURINES = {"A", "G"}

# Two-nucleotide prefixes that determine the amino acid on their own: the
# third nucleotide of the codon does not affect the result.
PARTIAL_CODON_TABLE = {
    "CT": "L",  # Leucine
    "GT": "V",  # Valine
    "TC": "S",  # Serine
    "CC": "P",  # Proline
    "AC": "T",  # Threonine
    "GC": "A",  # Alanine
    "CG": "R",  # Arginine
    "GG": "G",  # Glycine
}

# Two-nucleotide prefixes whose amino acid depends on the third nucleotide.
# Each value is a pair: index 0 applies when the third nucleotide is in
# PURINES, index 1 applies otherwise.
BRANCHED_CODON_TABLE = {
    "TT": ("L", "F"),  # Leucine, Phenylalanine
    "TA": ("*", "Y"),  # STOP, Tyrosine
    "CA": ("Q", "H"),  # Glutamine, Histidine
    "AA": ("K", "N"),  # Lysine, Asparagine
    "GA": ("E", "D"),  # Glutamic acid, Aspartic acid
    "AG": ("R", "S"),  # Arginine, Serine
}
def amino_acid(codon):
    """
    Translate a DNA codon into its one-letter amino acid symbol.

    The lookup is driven by the first two nucleotides of `codon`, consulting
    the third only where it changes the result. The full mapping is listed at
    https://en.wikipedia.org/wiki/DNA_codon_table

    Args:
        codon: A string of exactly three DNA nucleotides (A, T, C, or G).

    Returns:
        A single-character string: the IUPAC code of the amino acid encoded by
        `codon`, or "*" for a STOP codon.
    """
    prefix = codon[:2]

    # Case 1: the first two nucleotides alone fix the amino acid.
    fixed = PARTIAL_CODON_TABLE.get(prefix)
    if fixed is not None:
        return fixed

    # Case 2: the first two nucleotides narrow it down to a pair, and the
    # choice depends on whether the third nucleotide is a purine (A/G) or a
    # pyrimidine (T/C).
    pair = BRANCHED_CODON_TABLE.get(prefix)
    if pair is not None:
        return pair[0] if codon[-1] in PURINES else pair[1]

    # Case 3: "AT" prefix. ATG is Methionine/START; the rest are Isoleucine.
    if prefix == "AT":
        return "M" if codon == "ATG" else "I"

    # Case 4: for a valid codon the only prefix left is "TG".
    third = codon[-1]
    if third in PURINES:
        # TGA is STOP; the other purine ending (TGG) is Tryptophan.
        return "*" if third == "A" else "W"
    return "C"  # Cysteine
# ATTCACGCAACCGAATCACACGCCGATATGGCTAATTAA
def shuffle(strand):
    """
    Return a randomly reordered copy of a DNA strand.

    Args:
        strand: A string representing a strand of DNA as a sequence of
            nucleotides (A, T, C, G).

    Returns:
        A string containing exactly the same nucleotides as `strand`, with
        the same count of each, arranged in a random order.
    """
    # Sampling every position without replacement yields a permutation.
    count = len(strand)
    permuted = random.sample(strand, count)
    return "".join(permuted)
def load_fasta_file(path):
    """
    Read and return a sequence of DNA nucleotides from a FASTA file.

    The first line of the file is treated as the header and discarded. Each
    remaining line is stripped of leading and trailing whitespace, and the
    results are concatenated in file order.

    Args:
        path: A string representing the path to the FASTA file.

    Returns:
        A string holding the concatenated contents of every line after the
        header. An empty file, or one containing only a header, yields "".

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file contains non-ASCII bytes.
    """
    with open(path, "r", encoding="ascii") as fasta_file:
        # Consume the header line. The default argument keeps an empty file
        # from raising StopIteration.
        next(fasta_file, None)
        # Build the result with a single join instead of growing a string
        # with += once per line.
        return "".join(line.strip() for line in fasta_file)