-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenes.py
executable file
·96 lines (87 loc) · 2.63 KB
/
genes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python
import argparse
import string
import os
def parse_tsv_file(fh, index=False):
header = [x.strip() for x in fh.readline().split('\t')]
parsed = []
for line in fh:
values = [x.strip() for x in line.split('\t')]
z = zip(header, values)
if index:
parsed.append(dict(z))
else:
parsed.append(list(z))
return parsed
def normalize_filename(filename):
for s in string.punctuation:
filename = filename.replace(s, '_')
return filename
def fasta_filename(row):
fasta_filename = None
for key, value in row:
if key == 'Gene id':
return normalize_filename(value) + '.fasta'
class MissingIndexesError(Exception): pass
def rows_to_fasta(rows, sequence_index, output_dir):
'''
convert parse_tsv_file rows into fasta by looking up every Gene id
in sequence_index and grabbing the sequence from there
Then making a fasta entry similar to
>{{gene_id}}
{{sequence}}
'''
fasta_fmt = '>{}\n{}'
missing_indexes = []
# iterate over every row in genes rows
for row in rows:
filename = fasta_filename(row)
filepath = os.path.join(output_dir, filename)
with open(filepath, 'w') as fh:
# iterate over every column in the row
for key, value in row:
if key == 'Gene id':
if value not in sequence_index:
missing_indexes.append(value)
sequence = ''
else:
sequence = sequence_index[value]
fh.write(fasta_fmt.format(value, sequence)+'\n')
if missing_indexes:
raise MissingIndexesError('The following identifiers could not be found in any of the index files: {}'.format(missing_indexes))
def parse_args():
args = argparse.ArgumentParser()
args.add_argument(
'gene_file',
help='File with gene identifiers in it'
)
args.add_argument(
'index_files',
nargs='+',
help='File[s] that will generate the sequence index'
)
args.add_argument(
'--output_dir',
default='output',
help='Directory to place output files into'
)
return args.parse_args()
def parse_index_files(files):
index = {}
for _file in files:
with open(_file) as fh:
for row in parse_tsv_file(fh, index=True):
index[row['feature_id']] = row['nucleotide_sequence']
return index
def main():
args = parse_args()
if os.path.exists(args.output_dir):
print("Will not overwrite existing output directory {}".format(args.output_dir))
return
os.makedirs(args.output_dir)
with open(args.gene_file) as fh:
rows = parse_tsv_file(fh)
sequence_index = parse_index_files(args.index_files)
rows_to_fasta(rows, sequence_index, args.output_dir)
if __name__ == '__main__':
main()