-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathbinCnt_summary.py
executable file
·295 lines (240 loc) · 13.2 KB
/
binCnt_summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division, with_statement
'''
Copyright 2015, 陈同 ([email protected]).
===========================================================
'''
__author__ = 'chentong & ct586[9]'
__author_email__ = '[email protected]'
#=========================================================
desc = '''
Program description:
This is designed to summarize output of meltRegionBinMatrix.py, multipleSampleCompare_summary.py. Mainly their **summary.xls** file will be parsed.
'''
import sys
import os
from json import dumps as json_dumps
from time import localtime, strftime
timeformat = "%Y-%m-%d %H:%M:%S"
from optparse import OptionParser as OP
#from multiprocessing.dummy import Pool as ThreadPool
import re
from tools import *
from json import load as json_load
#from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding('utf8')
debug = 0
def fprint(content):
"""
This is a Google style docs.
Args:
param1(str): this is the first param
param2(int, optional): this is a second param
Returns:
bool: This is a description of what is returned
Raises:
KeyError: raises an exception))
"""
print json_dumps(content,indent=1)
def cmdparameter(argv):
if len(argv) == 1:
global desc
print >>sys.stderr, desc
cmd = 'python ' + argv[0] + ' -h'
os.system(cmd)
sys.exit(1)
usages = "%prog -i file"
parser = OP(usage=usages)
parser.add_option("-c", "--compare-summary", dest="compare_summary",
default="binC0304YCnt.merge/SCX.binC0304YCnt.merge.all.DE.summary.xls",
help="File name for summary.xls output by multipleSampleCompare_summary.py. One special symbol <C0304Y> should be put at the bin-size position. For example, file <bin10000Cnt.merge/SCX.bin10000Cnt.merge.all.DE.summary.xls> should be <binC0304YCnt.merge/SCX.binC0304YCnt.merge.all.DE.summary.xls> (which is default). The program will replace <C0304Y> with given bin sizes.")
parser.add_option("-m", "--meltBinRegion", dest="meltBinRegion",
default="binC0304YCnt.merge/binC0304YCnt.merge.norm.chr_distrib.summary.xls",
help="File name for summary.xls output by meltRegionBinMatrix.py. One special symbol <C0304Y> should be put at the bin-size position. For example, file <bin10000Cnt.merge/bin10000Cnt.merge.norm.chr_distrib.summary.xls> should be <binC0304YCnt.merge/binC0304YCnt.merge.norm.chr_distrib.summary.xls> (which is default). The program will replace <C0304Y> with given bin sizes. Specifally giving <no> to ignore this part analysis.")
parser.add_option("-b", "--bin-size", dest="bin_size",
help="Bin size. Multiple sizes can be separated by < > or <,>.")
parser.add_option("-t", "--tag", dest="tag",
help="An unique string with only alohabets to label all labels in output Rmd.")
#parser.add_option("-o", "--output-prefix", dest="out_prefix",
# help="The prefix of output files.")
parser.add_option("-r", "--report-dir", dest="report_dir",
default='report', help="Directory for report files. Default 'report'.")
parser.add_option("-R", "--report-sub-dir", dest="report_sub_dir",
default='4_genome_bin_difference', help="Directory for saving report figures and tables. This dir will put under <report_dir>, so only dir name is needed. Default '4_genome_bin_difference'.")
#parser.add_option("-d", "--doc-only", dest="doc_only",
# default=False, action="store_true", help="Specify to only generate doc.")
parser.add_option("-v", "--verbose", dest="verbose",
action="store_true", help="Show process information")
parser.add_option("-D", "--debug", dest="debug",
default=False, action="store_true", help="Debug the program")
(options, args) = parser.parse_args(argv[1:])
assert options.bin_size != None, "A string needed for -b"
return (options, args)
#--------------------------------------------------------------------
def main():
options, args = cmdparameter(sys.argv)
#-----------------------------------
compare_summary = options.compare_summary
meltBinRegion = options.meltBinRegion
bin_sizeL = re.split(r'[, ]*', options.bin_size)
tag = options.tag
report_dir = options.report_dir
report_sub_dir = options.report_sub_dir
verbose = options.verbose
global debug
debug = options.debug
#-----------------------------------
dest_dir = report_dir+'/'+report_sub_dir+'/'
os.system('mkdir -p '+dest_dir)
print "# 样品比较 {#sample-compare-%s}\n" % tag
print """
把基因组划分为相同大小的区域 (bin) (bin_size = ({})), 计算每个BIN的reads覆盖, 随后比较样品差异。结果如下。如果选取了不同大小的bin,每个bin分别呈现。
""".format(', '.join(bin_sizeL))
for bin in bin_sizeL:
print '\n## Bin genome (bin_size={})\n'.format(bin)
if meltBinRegion and meltBinRegion != 'no':
print '\n### Chromosome reads distribution comparison for all samples (bin_size={})\n'.format(bin)
curation_label = "Bin_chromsome_size_{}_read_distrib".format(bin)
knitr_read_txt(report_dir, curation_label)
tmp_meltBinRegion = meltBinRegion.replace('C0304Y', bin)
pic_label_base = 'Bin-size-{}-chr-distrib-{}'.format(bin, tag)
print """
绘制不同染色体上reads的分布,研究整体水平上样品间的reads丰度的异同。横轴代表染色体区域 (5'-3'), 纵轴代表每个bin的覆盖丰度 (CPM: reads count per million).
"""
count = 0
for line in open(tmp_meltBinRegion):
count += 1
png, chr = line.split()
copypng(dest_dir, png)
png = getRelativeDir(png, report_sub_dir)
#png = pdf.replace('pdf', 'png')
pic_label = pic_label_base + str(count)
print '''
(ref:{pic_label}) Reads在{chr}染色体的分布 (bin_size={bin})。
```{{r {pic_label}, fig.cap="(ref:{pic_label})"}}
knitr::include_graphics("{png}")
```
'''.format(pic_label=pic_label, chr=chr, bin=bin, png=png)
if compare_summary:
print '\n### Differentiall detected regions (bin_size={})\n'.format(bin)
curation_label = "Bin_chromsome_size_{}_sample_compare".format(bin)
knitr_read_txt(report_dir, curation_label)
pic_label_base = 'Bin-size-{}-samp-cmp-{}'.format(bin, tag)
tmp_compare_summary = compare_summary.replace('C0304Y', bin)
sumL = json_load(open(tmp_compare_summary))
# The first element if sumL containg 4 dicts with key as
# DE_count, DE_profile, pca, pearson.
# The value of these 4 keys is also a dict containing one or
# two keys, 'file' and 'plot' which points to files.
globalD = sumL[0]
#pearson
xls = globalD['pearson']['file']
pdf = globalD['pearson']['plot']
copy(dest_dir, xls)
xls = getRelativeDir(xls, report_sub_dir)
copypdf(dest_dir, pdf)
pdf = getRelativeDir(pdf, report_sub_dir)
png = pdf.replace('pdf', 'png')
print """
根据基因组每个区间检测到的reads的丰度,从整体上对样品的相关性进行评估和样品分类,
确定相似性较高或较低的样品群 (Figure \@ref(fig:{pic_label_base}-pearson) and \@ref(fig:{pic_label_base}-pca))。
(ref:{pic_label_base}-pearson) 样品间Pearson相关性分析。图中每个格子代表纵轴和横轴两个样品的Pearson correlation coefficient (PCC). 格子的颜色表示了PCC值的大小。[pdf]({pdf}) [XLS]({xls})
```{{r {pic_label_base}-pearson, fig.cap="(ref:{pic_label_base}-pearson)"}}
knitr::include_graphics("{png}")
```
""".format(pic_label_base=pic_label_base, png=png, pdf=pdf, xls=xls)
#pca
pdf = globalD['pca']['plot']
copypdf(dest_dir, pdf)
pdf = getRelativeDir(pdf, report_sub_dir)
png = pdf.replace('pdf', 'png')
print """
(ref:{pic_label_base}-pca) 样品主成分聚类 (PCA)分析。图中每个点代表一个样品,聚拢在一起的点具有相似的生物属性;相距越远的点代表样品的相似度越差。横轴表示第一主成分,对样品的区分度大于纵轴代表的第二主成分。[pdf]({pdf}) [XLS]({xls})
```{{r {pic_label_base}-pca, fig.cap="(ref:{pic_label_base}-pca)"}}
knitr::include_graphics("{png}")
```
""".format(pic_label_base=pic_label_base, png=png, pdf=pdf, xls=xls)
#DE_parameters
fdr = globalD['DE_parameters']['fdr']
log2fc = globalD['DE_parameters']['log2fc']
#DE_count
xls = globalD['DE_count']['file']
pdf = globalD['DE_count']['plot']
copy(dest_dir, xls)
xls = getRelativeDir(xls, report_sub_dir)
copypdf(dest_dir, pdf)
pdf = getRelativeDir(pdf, report_sub_dir)
png = pdf.replace('pdf', 'png')
print """
样品间差异区域的鉴定和丰度展示 (FDR (false discovery rate) = {fdr}, log2FC (log2 fold change) = {log2fc})。
样品间差异区域数目统计,每个方格代表横轴样品相比于纵轴样品丰度显著高的区域的数目 (Figure \@ref(fig:{pic_label_base}-de-count))。
(ref:{pic_label_base}-de-count) 样品间差异区域数目统计,每个方格代表横轴样品相比于纵轴样品丰度显著高的区域的数目。[PDF]({pdf}) [XLS]({xls})
```{{r {pic_label_base}-de-count, fig.cap="(ref:{pic_label_base}-de-count)"}}
knitr::include_graphics("{png}")
```
""".format(pic_label_base=pic_label_base, png=png, pdf=pdf,
xls=xls, fdr=fdr, log2fc=log2fc)
#DE_profile
xls = globalD['DE_profile']['file']
pdf = globalD['DE_profile']['plot']
copy(dest_dir, xls)
xls = getRelativeDir(xls, report_sub_dir)
copypdf(dest_dir, pdf)
pdf = getRelativeDir(pdf, report_sub_dir)
png = pdf.replace('pdf', 'png')
print """
样品间差异区域丰度图谱。每一行为一个区域,每一列为一个样品,每个小方格代表对应区域在对应样品的丰度,其值的大小根据颜色展示 (Figure \@ref(fig:{pic_label_base}-de-profile))。
(ref:{pic_label_base}-de-profile) 样品间差异区域丰度图谱。每一行为一个区域,每一列为一个样品,每个小方格代表对应区域在对应样品的丰度,其值的大小根据颜色展示。[PDF]({pdf}) [XLS]({xls})
```{{r {pic_label_base}-de-profile, fig.cap="(ref:{pic_label_base}-de-profile)"}}
knitr::include_graphics("{png}")
```
""".format(pic_label_base=pic_label_base, png=png, pdf=pdf,
xls=xls, fdr=fdr, log2fc=log2fc)
# DE compare -- each sample pair
# The second element of sumL is a dict containing sample-compare
# information in sub-dict.
compD = sumL[1]
for comp_pair, comp_subD in compD.items():
condA, condB = comp_pair.split('\t')
all = comp_subD['all']
higherThan = comp_subD['higherThan']
higherThan_Cnt = comp_subD['higherThan_Cnt']
lowerThan = comp_subD['lowerThan']
lowerThan_Cnt = comp_subD['lowerThan_Cnt']
copy(dest_dir, all, higherThan, lowerThan)
all, higherThan, lowerThan = \
getRelativeDir([all, higherThan, lowerThan], report_sub_dir)
print """
#### Sample compare ({condA} vs {condB}) (bin_size={bin})
{condA}和{condB}检测到的区域Reads丰度统计和区域注释 ([点击下载]({all}))。如果对应区域落在基因启动子区域 (转录起始位点上游1 kb和下游500 bp区域), 则以基因的注释作为此区域注释。一个区域可能对应多个基因。
{condA}相比于{condB}显著高的区域有{higherThan_Cnt}个,注释见[表格]({higherThan}).
{condB}相比于{condA}显著高的区域有{lowerThan_Cnt}个,注释见[表格]({lowerThan}).
""".format(condA=condA, condB=condB, bin=bin, all=all, higherThan_Cnt=higherThan_Cnt, lowerThan_Cnt=lowerThan_Cnt, higherThan=higherThan, lowerThan=lowerThan)
###--------multi-process------------------
#pool = ThreadPool(5) # 5 represents thread_num
#result = pool.map(func, iterable_object)
#pool.close()
#pool.join()
###--------multi-process------------------
if verbose:
print >>sys.stderr,\
"--Successful %s" % strftime(timeformat, localtime())
if __name__ == '__main__':
startTime = strftime(timeformat, localtime())
main()
endTime = strftime(timeformat, localtime())
fh = open('python.log', 'a')
print >>fh, "%s\n\tRun time : %s - %s " % \
(' '.join(sys.argv), startTime, endTime)
fh.close()
###---------profile the program---------
#import profile
#profile_output = sys.argv[0]+".prof.txt")
#profile.run("main()", profile_output)
#import pstats
#p = pstats.Stats(profile_output)
#p.sort_stats("time").print_stats()
###---------profile the program---------