-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathbedCovrageUseWig.py
executable file
·249 lines (238 loc) · 9.14 KB
/
bedCovrageUseWig.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#from __future__ import division, with_statement
'''
Copyright 2013, 陈同 ([email protected]).
===========================================================
'''
__author__ = 'chentong & ct586[9]'
__author_email__ = '[email protected]'
#=========================================================
'''
Compute the average read coverage of given bed file using wig.
Bed file:
track line [optional, will be skipped]
#annotation line [optional, will be skipped]
chr7 52823164 52823749 0610005C13Rik -
chr7 52826355 52826562 0610005C13Rik -
chr7 52829782 52829892 0610005C13Rik -
chr7 52829977 52830147 0610005C13Rik -
chr7 52830496 52830546 0610005C13Rik -
chr5 31351012 31351129 0610007C21Rik +
chr5 31351834 31351953 0610007C21Rik +
chr5 31354569 31354641 0610007C21Rik +
chr5 31354834 31354906 0610007C21Rik +
chr5 31355135 31355257 0610007C21Rik +
chr5 31356333 31356431 0610007C21Rik +
The forth column may have same name or different names. Neighbor lines
with same names will be merged together to compute sum,mean,median,
max,min
'''
import collections
from numpy import mean,median,max,min,sum
from numpy import array as np_array
from numpy import append as np_append
from array import array
import sys
import os
from time import localtime, strftime
timeformat = "%Y-%m-%d %H:%M:%S"
from optparse import OptionParser as OP
def cmdparameter(argv):
if len(argv) == 1:
cmd = 'python ' + argv[0] + ' -h'
os.system(cmd)
sys.exit(1)
desc = ""
usages = "%prog -i bed -w wig -o operator -s True"
parser = OP(usage=usages)
parser.add_option("-i", "--bed", dest="bed",
metavar="BED_REGION", help="Regions in bed file format, SORT \
BY chromsome.***")
parser.add_option("-w", "--wig", dest="wig",
metavar="WIG", help="Regions in wig file format.***")
parser.add_option("-o", "--op", dest="op",
metavar="OPERATOR", help="Several choice, sum,mean,median,max,min.\
Multiple ones can be given in ',' connected formatsi, lke \
<mean, max>.")
parser.add_option("-s", "--strand", dest="strand",
metavar="1/0", default=0, help="When 1 is given, compute \
strand specific coverage for bed regions. This assumes, the seond \
column in wig is positive strand while the third column in wig is \
nagative strand." )
parser.add_option("-n", "--name", dest="name", default=1,
metavar="All column/Name column", help="Default the \
column output before coverage data is the forth column of bed file. If\
position information or all columns are also required, please give <0>.")
parser.add_option("-v", "--verbose", dest="verbose",
default=0, help="Show process information")
parser.add_option("-d", "--debug", dest="debug",
default=False, help="Debug the program")
(options, args) = parser.parse_args(argv[1:])
assert options.bed != None, "Region file in bed format is needed -i"
assert options.wig != None, "Coverage file in wig format is needed -i"
return (options, args)
#--------------------------------------------------------------------
def readWig(wig, strand):
array_i = array
chr = ''
span = 0
step = 0
wigDict = collections.defaultdict(dict)
#wigDict = {} #unstrand wigDict = {pos:value}
#strand wigDicr = {pos:[pos value, neg value]}
pos_fixed = 0
for i in open(wig):
if i.startswith('track'):
continue
elif i.startswith('#'):
continue
elif i.startswith('browse'):
continue
elif i.startswith('variableStep'):
pos_fixed = 0
chromi = i.rfind('chrom=')
assert chromi != -1, "Wrong format no chr %s" % i
chr = i[chromi+6:].strip().split()[0]
spani = i.rfind("span=")
span = 1 if spani == -1 else \
int(i[spani+5:].strip().split()[0])
pos = 1
neg = 2
#-unckecked for less of data-------------
elif i.startswith("fixedStep"):
chromi = i.rfind('chrom=')
assert chromi != -1, "Wrong format no chr %s" % i
chr = i[chromi+6:].strip().split()[0]
starti = i.rfind('start=')
assert starti != -1, "Must have start %s" % i
pos_fixed = int(i[starti+6:].strip().split()[0])
assert pos_fixed > 0, \
"fixedStep must have start bigger than 0 %s" % i
stepi = i.rfind('step=')
assert stepi != -1, "Must have step %s" % i
step = int(i[stepi+5:].strip().split()[0])
start = pos_fixed - step - 1 #feasible add <step> each
spani = i.rfind('span=')
span = 1 if spani == -1 else \
int(i[spani+5:].strip().split()[0])
pos = 0
neg = 1
else:
lineL = i.strip().split()
if pos_fixed: #fixedStep each position is the last
#position plus step.
start += step
end = start + span
else:
start = int(lineL[0])-1
end = start + span
#-------------------------------------------
#if span < 2 and not strand:
# wigDict[chr][(start,end)] = float(lineL[pos])
#elif span < 2 and strand:
# wigDict[chr][(start,end)] = array('f', \
# [float(lineL[pos]), float(lineL[neg])])
#else:
for position in xrange(start, end):
if not strand:
wigDict[chr][position] = float(lineL[pos])
else:
wigDict[chr][position] = array('f', \
[float(lineL[pos]), float(lineL[neg])])
#--------------------------------------
#-----------------------------------------
#-------------------------------------------------
#--------end processing one line ----------------------
#----------------END reading whole file-------------------
return wigDict
#---------------------------------------------------
def outputWig(wigDict, strand):
keyL = wigDict.keys()
keyL.sort()
for key in keyL:
chrD = wigDict[key]
chrD_keys = chrD.keys()
chrD_keys.sort()
print key
for i in chrD_keys:
print "%d\t%f" % (i,chrD[i])
#----------------------------------------
def main():
options, args = cmdparameter(sys.argv)
#-----------------------------------
bed = options.bed
wig = options.wig
strand = options.strand
verbose = options.verbose
debug = options.debug
wigDict = readWig(wig, strand)
opL = options.op.split(',')
name_mode = int(options.name)
#-----------------------------------
opDict = {'mean':mean, 'median':median, \
'max':max, 'min':min, 'sum':sum}
if file == '-':
fh = sys.stdin
else:
fh = open(bed)
#--------------------------------
if name_mode:
print "#name\t%s" % '\t'.join(opL)
else:
print "#%s" % '\t'.join(opL)
label = ''
valueL = np_array([])
#print wigDict
for line in fh:
lineL = line.strip().split('\t')
chr = lineL[0]
start = int(lineL[1])
end = int(lineL[2])
innerD = wigDict[chr]
if label and label != lineL[3]:
print "%s\t%s" % (name, \
'\t'.join([str(opDict[op](valueL)) for op in opL]))
valueL = np_array([])
#---------------------------------------------------------
label = lineL[3]
if strand:
strand_in = lineL[5]
if name_mode:
name = label+'@'+ strand_in
else:
name = line.strip()
strand_num = 0 if strand_in=='+' else 1
valueL = np_append(valueL, [innerD.get(i,[0,0])[strand_num] \
for i in xrange(start,end)])
else:
if name_mode:
name = lineL[3]
else:
name = line.strip()
valueL = np_append(valueL, [innerD.get(i,0) \
for i in xrange(start,end)])
#----------------------------------------------
#print valueL
#for op in opL:
# tmpL.append(str(opDict[op](valueL)))
#-------------END reading file----------
#------for the last name-----------------
if label:
print "%s\t%s" % (name, \
'\t'.join([str(opDict[op](valueL)) for op in opL]))
#----close file handle for files-----
if file != '-':
fh.close()
#-----------end close fh-----------
if verbose:
print >>sys.stderr,\
"--Successful %s" % strftime(timeformat, localtime())
if __name__ == '__main__':
startTime = strftime(timeformat, localtime())
main()
endTime = strftime(timeformat, localtime())
fh = open('python.log', 'a')
print >>fh, "%s\n\tRun time : %s - %s " % \
(' '.join(sys.argv), startTime, endTime)
fh.close()