feature.py · 89 lines (80 loc) · 2.89 KB
import sys, os
import fastq

# number of leading base positions to collect statistics for
STAT_LEN_LIMIT = 10
# number of initial reads to skip (they are usually not stable)
READ_TO_SKIP = 1000
ALL_BASES = ("A", "T", "C", "G")
class FeatureExtractor:
    def __init__(self, filename, sample_limit=10000):
        self.sample_limit = sample_limit
        self.filename = filename
        self.base_counts = {}
        self.percents = {}
        self.read_count = 0
        self.stat_len = 0
        self.total_num = [0 for x in xrange(STAT_LEN_LIMIT)]
        for base in ALL_BASES:
            self.base_counts[base] = [0 for x in xrange(STAT_LEN_LIMIT)]
            self.percents[base] = [0.0 for x in xrange(STAT_LEN_LIMIT)]
    def stat_read(self, read):
        # accumulate per-position base counts over the first STAT_LEN_LIMIT bases
        seq = read[1]
        seqlen = len(seq)
        for i in xrange(min(seqlen, STAT_LEN_LIMIT)):
            self.total_num[i] += 1
            b = seq[i]
            if b in ALL_BASES:
                self.base_counts[b][i] += 1
    def extract(self):
        reader = fastq.Reader(self.filename)
        stat_reads_num = 0
        skipped_reads = []
        # sample up to sample_limit reads for the statistics
        while True:
            read = reader.nextRead()
            if read is None:
                break
            self.read_count += 1
            # skip the first reads (READ_TO_SKIP) because they are usually not stable
            if self.read_count < READ_TO_SKIP:
                skipped_reads.append(read)
                continue
            stat_reads_num += 1
            if stat_reads_num > self.sample_limit and self.sample_limit > 0:
                break
            self.stat_read(read)
        # if the fastq file is too small, also stat the skipped reads
        if stat_reads_num < READ_TO_SKIP:
            for read in skipped_reads:
                self.stat_read(read)
        self.calc_read_len()
        self.calc_percents()
    def calc_read_len(self):
        # stat_len is the first position with no data, or STAT_LEN_LIMIT
        # if all of the first STAT_LEN_LIMIT positions have data
        for pos in xrange(STAT_LEN_LIMIT):
            has_data = False
            for base in ALL_BASES:
                if self.base_counts[base][pos] > 0:
                    has_data = True
            if not has_data:
                self.stat_len = pos
                return
        self.stat_len = STAT_LEN_LIMIT
    def calc_percents(self):
        # calc the percentage of each base at each position
        for pos in xrange(self.stat_len):
            total = 0
            for base in ALL_BASES:
                total += self.base_counts[base][pos]
            for base in ALL_BASES:
                self.percents[base][pos] = float(self.base_counts[base][pos]) / float(total)
    def feature(self):
        # not enough positions with data to build a valid feature
        if self.stat_len < STAT_LEN_LIMIT:
            return None
        # pack the per-position base percentages as A, T, C, G for each position:
        # ATCG ATCG ATCG ...
        feature_vector = []
        for pos in xrange(self.stat_len):
            for base in ALL_BASES:
                feature_vector.append(self.percents[base][pos])
        return feature_vector
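
A minimal usage sketch, assuming a FASTQ file at the placeholder path "sample.fq" and that the local fastq module imported above is on the path:

if __name__ == "__main__":
    # "sample.fq" is a placeholder; point it at a real FASTQ file
    extractor = FeatureExtractor("sample.fq", sample_limit=10000)
    extractor.extract()
    vec = extractor.feature()
    if vec is None:
        print("reads are shorter than STAT_LEN_LIMIT, no feature extracted")
    else:
        # vec holds STAT_LEN_LIMIT * 4 percentages, ordered A, T, C, G per position
        print(vec)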