-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbam_helpers.py
executable file
·192 lines (165 loc) · 7.1 KB
/
bam_helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
"""
Utilities for working with bam files
"""
import pysam
import numpy as np
import re
from helpers import Window
from helpers import INFO
def extract_windows(chromosome, ref_filename, bam_filename, **args):
"""
Extract the windows that couple the seq_file and ref_files
for the given chromosome
:param chromosome: chromosome name (str)
:param ref_file: The reference file
:param test_file: The sequence file
:return:
"""
windowcapacity = args["windowsize"]
start_idx = args["start_idx"]
end_idx = args["end_idx"]
# the windows list
windows = []
with pysam.FastaFile(ref_filename) as fastafile:
print("{0} Reference file: {1}".format(INFO, fastafile.filename))
with pysam.AlignmentFile(bam_filename, "rb") as sam_file:
print("{0} Sam file: {1} ".format(INFO, sam_file.filename))
wcounter = 0
while start_idx < end_idx:
sam_output = window_sam_file(chromosome=chromosome,
sam_file=sam_file,
fastafile=fastafile,
start=start_idx, end=start_idx + windowcapacity,
**args)
windows.append(Window(idx=wcounter,
capacity=windowcapacity,
samdata=sam_output))
start_idx += windowcapacity
wcounter += 1
return windows
def window_sam_file(chromosome, sam_file, fastafile,
start, end, **kwargs):
refseq = '' # store for the refseq
samseq = [] # stote for the sample
pos = [] # reference pos
nseq = [] # all reads
nalign = [] # quality limited reads
head = 0 # whether read starts in window
head1 = 0 # start of read 1 in pair
head2 = 0 # start of read 2 in pair
read1 = 0 # equivalent nalign just for read 1s in pairs
read2 = 0 # as above for read 2s
errorAlert = False
truncate = kwargs['sam_read_config']['truncate']
ignore_orphans = kwargs['sam_read_config']['ignore_orphans']
max_depth = kwargs['sam_read_config']['max_depth']
qual = kwargs['sam_read_config'].get('quality_threshold', None)
add_indels = kwargs['sam_read_config']['add_indels']
try:
for pcol in sam_file.pileup(chromosome, start, end,
truncate=truncate,
ignore_orphans=ignore_orphans,
max_depth=max_depth):
if qual is not None:
pcol.set_min_base_quality(qual)
# fill in start when there are no reads present
while pcol.reference_pos != start:
samseq.append('_')
refseq += fastafile.fetch(chromosome, start, start + 1)
pos.append(start)
nseq.append(0)
nalign.append(0)
start += 1
# collect metrics for pileup column
pos.append(start)
nseq.append(pcol.nsegments) # all reads over a pileup column
nalign.append(pcol.get_num_aligned()) # above but quality limited
for p in pcol.pileups:
head += p.is_head # start of read present
read1 += p.alignment.is_read1 # first of mate pair (from nalign)
read2 += p.alignment.is_read2 # second of mate pair (from nalign)
if p.is_head == 1 and p.alignment.is_read1 == 1: # above but is start of read
head1 += 1
if p.is_head == 1 and p.alignment.is_read2 == 1:
head2 += 1
# get bases from reads at pileup column (quality limited)
try:
if pcol.get_num_aligned() == 0:
samseq.append('_')
refseq += fastafile.fetch(chromosome, pcol.reference_pos, pcol.reference_pos + 1)
else:
x = pcol.get_query_sequences(add_indels=add_indels)
x = [a.upper() for a in x]
samseq.append(set(x)) # store as unique set
refseq += fastafile.fetch(chromosome, pcol.reference_pos, pcol.reference_pos + 1)
except Exception as e: # may fail if large number of reads
try:
x = pcol.get_query_sequences(add_indels=add_indels)
x = [a.upper() for a in x]
samseq.append(set(x)) # store as unique set
refseq += fastafile.fetch(chromosome, pcol.reference_pos, pcol.reference_pos + 1)
except Exception as e:
errorAlert = True
start += 1
# fill in end if no reads at end of window
while start < end:
samseq.append('_')
refseq += fastafile.fetch(chromosome, start, start + 1)
pos.append(start)
nseq.append(0)
nalign.append(0)
start += 1
# Metrics for read depth
allmean = np.mean(nseq) # metrics, may not need all these
qmean = np.mean(nalign)
allmedian = np.median(nseq)
qmedian = np.median(nalign)
allsum = np.sum(nseq)
qsum = np.sum(nalign)
# GC content
gcr = (refseq.count('G') + refseq.count('g') + refseq.count('C') + refseq.count('c')) / len(refseq)
gapAlert = True if 'N' in refseq or 'n' in refseq else False
gcmax = 0
gcmaxlen = 0
gcmin = 0
gcminlen = 0
for bs in samseq:
minelgc = None
lenminelgc = None
maxelgc = None
lenmaxelgc = None
for el in bs:
el = el.split('-')
ellen = len(re.sub('[\+\-_Nn*]', '', el[0]))
elgc = len(re.sub('[\+\-_Nn*AaTt]', '', el[0]))
if minelgc == None or elgc < minelgc:
minelgc = elgc
lenminelgc = ellen
if maxelgc == None or elgc > maxelgc:
maxelgc = elgc
lenmaxelgc = ellen
gcmax += maxelgc
gcmaxlen += lenmaxelgc
gcmin += minelgc
gcminlen += lenminelgc
gcmax = gcmax / gcmaxlen if gcmaxlen > 0 else None
gcmin = gcmin / gcminlen if gcminlen > 0 else None
output = {'gcmax': gcmax,
'gcmin': gcmin,
'gcr': gcr,
'gapAlert': gapAlert,
'allmean': allmean,
'qmean': qmean,
'allmedian': allmedian,
'allsum': allsum,
'qsum': qsum,
'qmedian': qmedian,
'errorAlert': errorAlert,
'head': head,
'start': pos[0],
'end': pos[-1],
'minLen': gcminlen
}
return output
except MemoryError as e:
raise