-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathsubsamplebam.py
executable file
·182 lines (156 loc) · 5.12 KB
/
subsamplebam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/bin/env python # -*- coding: utf-8 -*-
# Handle SIGPIPE signals correctly
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE,SIG_DFL)
# Read given file with read names and hash them for quick lookup
# Read stdin and filter based on samtools view input and spit to stdout
import sys
import argparse
import random
import subprocess
import shlex
import os.path
import multiprocessing
import numpy as np
''' Python3 compatibility '''
from past.builtins import xrange
def parse_args(wrapper=False):
parser = argparse.ArgumentParser()
parser.add_argument(
'bamfile'
)
if not wrapper:
parser.add_argument(
'refseq',
default=None,
help='name of the reference sequenceto do subsampleing on '
)
''' reference lenght isn't necessary to use samtools view '''
# parser.add_argument(
# 'reflength',
# default=None,
# type=int,
# help='Length of the reference sequence'
# )
parser.add_argument(
'--subsample',
type=int,
default=1000,
help='What depth to try and subsample to'
)
parser.add_argument(
'-A', '--count-orphans',
action='store_true',
help='Allow orphan/unpaired reads.'
)
parser.add_argument(
'-r', '--more-random',
action='store_true',
help='Subsample with more random read-picking at expense of minimizing depth.',
default=False
)
return parser.parse_args()
class MinimumDepthNotAvailable(Exception): pass
def randomly_select_reads_from_samview(bamfile, regionstr, n):
'''
Given a samview output select n random read names
In place operation on selection
:param str bamfile: bamfile path
:param str regionstr: regionstring to operate on
:param int n: Random subselection depth, aka, how many random rows to select from samview
'''
newselection = None
sview = samview(bamfile, regionstr)
sview = list(sview)
try:
newselection = set(random.sample(sview, n))
except ValueError as e:
raise MinimumDepthNotAvailable('Depth for {0} is only {1}'.format(regionstr, len(sview)))
return newselection
def parallel_randomly_select_reads_from_samview(args):
try:
return randomly_select_reads_from_samview(*args)
except MinimumDepthNotAvailable as e:
sys.stderr.write(str(e) + '\n')
return set()
def subselect_from_bam(bamfile, subselectdepth, regionstr):
'''
Iterate over every base position in regionstr and subselect randomly from the reads
that fall under it.
'''
# Parse the regionstring
refname, startstop = regionstr.split(':')
start, stop = startstop.split('-')
uniquereads = set()
rstrings = []
for i in range(int(start), int(stop)+1):
rstring = '{0}:{1}-{1}'.format(refname, i, i)
rstrings.append((bamfile, rstring, subselectdepth))
pool = multiprocessing.Pool()
ureads = pool.map(parallel_randomly_select_reads_from_samview, rstrings)
#ureads = map(parallel_randomly_select_reads_from_samview, rstrings)
for uread in ureads:
uniquereads.update(uread)
return uniquereads
def make_subselected_bam(bamfile, uniquereads):
'''
Make outbam with uniquereads
'''
# Write header of bamfile
cmd = 'samtools view -H {0}'.format(bamfile)
sys.stderr.write(cmd + '\n')
p = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
shell=True
)
for line in p.stdout:
sys.stdout.write(line)
for line in uniquereads:
sys.stdout.write(line)
def reads_by_pos(reads, max_pos):
#def is_at_pos(pos): return read['pos'] == pos
for pos in xrange(max_pos):
yield [read for read in reads if read['pos'] == pos]
def samview(bamfile, regionstr):
'''
Just return iterator over samtools view bamfile regionstr
'''
cmd = 'samtools view {0} {1}'.format(bamfile, regionstr)
sys.stderr.write(cmd + '\n')
p = subprocess.Popen(
shlex.split(cmd),
stdout=subprocess.PIPE
)
# you have to read stdout in order for return to fill in
firstline = p.stdout.readline()
if p.poll() not in (0,None) and firstline == '':
raise ValueError('samtools did not return correctly')
yield firstline
for line in p.stdout:
yield line
def samtools_is_available():
try:
p = subprocess.Popen(
['samtools'],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT
)
sout, _ = p.communicate()
findme = None
if isinstance(sout, bytes):
findme = b'Version:'
else:
findme = 'Version:'
if sout.find(findme) >= 0:
return True
return False
except OSError as e:
return False
def main():
if not samtools_is_available():
sys.stderr.write('Samtools is not installed or executable\n')
sys.exit(1)
args = parse_args()
uniquereads = subselect_from_bam(args.bamfile, args.subsample, args.regionstr)
make_subselected_bam(args.bamfile, uniquereads)