#!/usr/bin/env python
## This code processes DNA
## sequence data and trains a
## sequence-embeddings model
## for vectorizing DNA
## sequences (whole genome
## or FASTA files)
# %% ENVIRONMENT
import os
from os.path import expanduser
HOME = expanduser("~")
# %% IMPORTS
import sys
import pathlib
import datetime
import fasttext
# %% SETTINGS
DATA_DIR = f"{HOME}/0.work/genomes"
DATA_FLS = ['hgTables_5UTR.fa', 'hgTables_3UTR.fa', 'hgTables_500up.fa', 'hgTables_300down.fa', 'hgTables_cds.fa'] ## train model on genomic features
# DATA_FLS = ['Mus_musculus.GRCm38.68.dna.toplevel.mod.clean.fa'] ## train model on whole genome
# %% FUNCTIONS
def fasta_reader(fas, fas_filter="longest"):
    '''
    read a FASTA file into a dict
    fas_filter:
        "longest": choose the longest sequence per key
        "first": use the first sequence if multiple sequences share a key
    '''
    ## imports
    from collections import defaultdict
    ## output
    fasdct = {}
    ## input
    fhin = open(fas, 'r')
    aread = fhin.read()
    faslst = aread.split('>')
    fhin.close()
    ## parse headers and
    ## seqs from FASTA
    tmpdct = defaultdict(list)  ## store FASTA as dict
    acount = 0                  ## count the number of entries
    for i in faslst[1:]:
        ent = i.split('\n')
        aname = ent[0].split()[0].strip()
        seq = ''.join(x.strip() for x in ent[1:])  ## sequence may span multiple lines
        alen = len(seq)
        tmpdct[aname].append((seq, alen))
        acount += 1
    print(f"Input Seqs:{acount}")
    ## select one sequence if multiple
    ## genes/transcripts share the same ID
    if fas_filter == "longest":
        fasdct = select_longest(tmpdct)
    elif fas_filter == "first":
        fasdct = {k: v[0][0] for k, v in tmpdct.items()}  ## v is a list of (seq, len) tuples
    else:
        print(f"Unknown 'fas_filter' argument:{fas_filter} - exiting")
        sys.exit()
    print(f"Read FASTA file:{fas} | Sequences:{len(fasdct)}")
    return fasdct
def select_longest(indct):
    '''
    given a defaultdict(list) of (seq, len) tuples, iterate and
    for each key select the longest seq
    '''
    resdct = {}
    for k, v in indct.items():
        if len(v) > 1:
            ## multiple seqs - select the longest
            len_sorted = sorted(v, key=lambda x: x[1], reverse=True)
            seq = len_sorted[0][0]  ## seq from the (seq, len) tuple
        else:
            seq = v[0][0]           ## seq from the single available element
        ## add to dict
        resdct[k] = seq
    print(f"Output Seqs:{len(resdct)}")
    return resdct
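## Illustrative example (not from the pipeline): with duplicate IDs,
## only the longest sequence per key is kept, e.g.
##   select_longest({"gene1": [("ATG", 3), ("ATGCGC", 6)]})
##   -> {"gene1": "ATGCGC"}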
def fasta_writer(fas_dct, fas_out):
    '''
    writes a FASTA dict to a FASTA file;
    the output filename is provided as an input
    '''
    ## outfile
    fhout = open(fas_out, 'w')
    ## iterate and write
    acount = 0
    for head, seq in fas_dct.items():
        fhout.write(">%s\n%s\n" % (head, seq))
        acount += 1
    fhout.close()
    print(f"Wrote FASTA file with {acount} sequences:{fas_out}")
    return fas_out
def feats_writer(faslst, feats_out):
    '''
    writes sequences (one per line) to a plain-text features file
    for training the embeddings model
    '''
    ## output
    fhout = open(feats_out, 'w')
    ## write to file
    acount = 0
    for aseq in faslst:
        fhout.write("%s\n" % (aseq))
        acount += 1
    fhout.close()
    print(f"\nWrote {acount} seqs to file:{feats_out}")
    return feats_out
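## For reference, the expected feats-file layout is one space-separated
## k-mer "sentence" per line (the format fastText's unsupervised trainer
## consumes), e.g.:
##   ATG TGC GCG CGC
##   GCT CTA TAG AGC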
def combine_fasta(datadir, datafls):
    '''
    reads a list of FASTA files; extracts seqs and
    writes them as newline-separated FASTA sequences (one per line)
    '''
    print(f"\nFn: Combine FASTA Sequences")
    ## output
    reslst = []
    if len(datafls) > 1:
        feats_out = os.path.join(datadir, "genomic_features-train.fa")
    else:
        afile = datafls[0]
        feats_out = os.path.join(datadir, "%s-train.fa" % afile.rpartition(".")[0])
    acount = 0
    for fas in datafls:
        path = os.path.join(datadir, fas)
        fasdct = fasta_reader(path)
        acount += 1
        ## add sequences to
        ## master list
        tmplst = list(fasdct.values())
        print(f"Cached file:{path} | seqs:{len(tmplst)}")
        reslst.extend(tmplst)
    ## write to file
    fas_dct = {idx: seq for idx, seq in enumerate(reslst)}
    _ = fasta_writer(fas_dct, feats_out)
    print(f"Processed:{acount} fasta files")
    print(f"Total genomic features (i.e. sequences):{len(reslst)}")
    return reslst, feats_out
def process_seqs(fas_in, method="ok", out_format="fasta"):
    '''
    Perform FASTA segmentation/processing for embeddings.
    Available methods:
        "ok": overlapping, sliding k-mers given a width and a stride
    Available file outputs (i.e. out_format):
        "fasta" = FASTA file (for pre-processing FASTA before embeddings)
        "feats" = text file with line-separated sequences (for training the embeddings model)
    '''
    print(f"\n#### Fn: Sequence Segmenter ########")
    ## outputs
    fas_dct = {}
    fas_out = "%s-segmented.fas" % (fas_in.rpartition(".")[0])
    feat_out = "%s-segmented.feats" % (fas_in.rpartition(".")[0])
    ## inputs
    fas_dct = fasta_reader(fas_in)
    ## iterate and segment
    if method == "ok":
        for head, seq in fas_dct.items():
            kmer_lst = overlap_k(seq, kwidth=3, stride=1)
            kmer_seq = " ".join(str(i) for i in kmer_lst)
            fas_dct[head] = kmer_seq
    else:
        print(f"Method not implemented:{method} - exiting")
        sys.exit()
    ## write output file
    if out_format == "fasta":
        outfile = fasta_writer(fas_dct, fas_out)
        print(f"Sequence segmentation complete")
    elif out_format == "feats":
        feat_lst = fas_dct.values()
        outfile = feats_writer(feat_lst, feat_out)
        print(f"Sequence segmentation complete")
    else:
        print(f"The `out_format` value:{out_format} not recognized - exiting")
        sys.exit()
    print(f"See outfile:{outfile}")
    return fas_dct, outfile
def overlap_k(seq, kwidth=3, stride=1):
    '''
    sliding window for overlapping k-mers of a given size and stride
    '''
    ## output
    kmer_lst = []
    ## number of k-mers
    owidth = int(((len(seq.strip()) - kwidth) / stride) + 1)
    # print(f"\nSeq Len:{len(seq)} | Output width:{owidth}")
    idx = 0
    for n in range(owidth):
        s = idx
        kmer = seq[s:s + kwidth]
        kmer_lst.append(kmer)
        idx += stride
    # print(f"Kmers snippet:{kmer_lst[:50]}")
    # print(f"Seq length: {len(seq)} | Total Kmers:{len(kmer_lst)}")
    return kmer_lst
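# %% EXAMPLE
## Illustrative sanity check for `overlap_k` (not part of the pipeline):
## a sequence of length L yields (L - kwidth)/stride + 1 k-mers, so a
## 6-nt sequence with kwidth=3, stride=1 gives 4 overlapping 3-mers.
# overlap_k("ATGCGC")  ## -> ['ATG', 'TGC', 'GCG', 'CGC']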
def train_word_vec_model(feature_file, model_type="cbow"):
    '''
    here we train word-vector models
    https://fasttext.cc/docs/en/unsupervised-tutorial.html
    https://www.analyticsvidhya.com/blog/2017/07/word-representations-text-classification-using-fasttext-nlp-facebook/
    Available model types:
        "skipgram" = skip-gram
        "cbow" = continuous bag of words
    '''
    ## outputs
    timestamp = datetime.datetime.now().strftime("%m_%d_%H_%M")
    model_file = "%s_%s.bin" % (feature_file.rpartition(".")[0], timestamp)
    model_mem = "%s_word_vec_%s.txt" % (feature_file.rpartition(".")[0], timestamp)
    ## sanity check and
    ## train word vectors
    minn_val = 3        ## min subword size
    maxn_val = 6        ## max subword size
    dimn_val = 128      ## embedding dimensions
    epochs = 5          ## epochs; default:5
    learn_rate = 0.05   ## default:0.05; suggested range [0.01, 1]
    path = pathlib.Path(feature_file)
    if not path.is_file():
        raise FileNotFoundError(f"{feature_file} not found")
    print(f"Training fastText word vectors on seq file:{feature_file}")
    print(f"featurefile:{feature_file}\nminn:{minn_val}\nmaxn:{maxn_val}\ndim:{dimn_val}\nepoch:{epochs}\nlearning_rate:{learn_rate}\n")
    if model_type == "skipgram":
        print(f"Training `SKIPGRAM` model")
        model = fasttext.train_unsupervised(feature_file, "skipgram", minn=minn_val, maxn=maxn_val, dim=dimn_val, epoch=epochs, lr=learn_rate)
    elif model_type == "cbow":
        print(f"Training `CBOW` model")
        model = fasttext.train_unsupervised(feature_file, "cbow", minn=minn_val, maxn=maxn_val, dim=dimn_val, epoch=epochs, lr=learn_rate)
    else:
        print(f"Check `model_type` parameter:{model_type} - exiting")
        sys.exit()
    ## write the model and its config
    model.save_model(model_file)
    with open(model_mem, 'w') as fhout:
        fhout.write(f"featurefile:{feature_file}\ntype:{model_type}\nminn:{minn_val}\nmaxn:{maxn_val}\ndim:{dimn_val}\nepoch:{epochs}\nlearning_rate:{learn_rate}\n")
    print(f"Trained word vector model:{model_file}")
    return model
# %% MAIN
## Prepare feature sequences
# _, comb_fasta = combine_fasta(DATA_DIR, DATA_FLS)
# _, train_fasta = process_seqs(comb_fasta, out_format="feats")
## Train Model
# train_fasta = "/home/atul/0.work/genomes/all_features_seq-segmented.test.feats"
# train_fasta = "/home/atul/0.work/genomes/Mus_musculus.GRCm38.68.dna.toplevel.forfasttext.fa"
# wordvec_model = train_word_vec_model(train_fasta)
# %% DEV
# model = fasttext.load_model("/home/atul/0.work/genomes/Mus_musculus.GRCm38.68.dna.toplevel.forfasttext_01_23_13_11.bin")
# %% DEV
# process_seqs(fas_in = "test.fa", method = "ok")
# %% TEST
# vect = model.get_word_vector('ATGCGC')
# model.get_nearest_neighbors('TTGCGC')
# vect.shape
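# %% EXAMPLE
## A minimal sketch (an assumption, not part of the original pipeline) of
## vectorizing a whole sequence with a trained model: segment it into
## overlapping k-mers, join with spaces, and take fastText's sentence
## vector (the mean of its word vectors). `model` is assumed to be
## loaded as in the DEV cell above.
# kmers = overlap_k("ATGCGCTTAGC", kwidth=3, stride=1)
# seq_vec = model.get_sentence_vector(" ".join(kmers))  ## shape: (dim,)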
# %% MAIN
def main():
    _, comb_fasta = combine_fasta(DATA_DIR, DATA_FLS)
    _, train_file = process_seqs(comb_fasta, out_format="feats")
    # train_file = "/home/atul/0.work/genomes/all_features_seq.fa"
    # train_file = "/home/atul/0.work/genomes/Mus_musculus.GRCm38.68.dna.toplevel.mod.clean-train-segmented.feats"
    wordvec_model = train_word_vec_model(train_file)
    return None
# %%
if __name__ == "__main__":
    main()
#### CHANGELOG
## v01 [01/21/2021]
## wrote functions to read/write FASTA files and feature files for training
## wrote functions for training fastText on DNA sequences
## v01 -> v02 [02/01/2021]
## added function to segment DNA sequences
## repurposed `process_genome_data` as `combine_fasta` for downstream segmentation