-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathDE_genes.summary.py
executable file
·330 lines (275 loc) · 15.6 KB
/
DE_genes.summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division, with_statement
'''
Copyright 2015, 陈同 ([email protected]).
===========================================================
'''
__author__ = 'chentong & ct586[9]'
__author_email__ = '[email protected]'
#=========================================================
desc = '''
Program description:
This is designed to summarize results output by DESeq2.sh and GO, KEGG enrichment file.
'''
import sys
import os
from json import dumps as json_dumps
from time import localtime, strftime
timeformat = "%Y-%m-%d %H:%M:%S"
from optparse import OptionParser as OP
import re
from tools import *
#from multiprocessing.dummy import Pool as ThreadPool
#from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding('utf8')
debug = 0
def fprint(content):
"""
This is a Google style docs.
Args:
param1(str): this is the first param
param2(int, optional): this is a second param
Returns:
bool: This is a description of what is returned
Raises:
KeyError: raises an exception))
"""
print json_dumps(content,indent=1)
def cmdparameter(argv):
if len(argv) == 1:
global desc
print >>sys.stderr, desc
cmd = 'python ' + argv[0] + ' -h'
os.system(cmd)
sys.exit(1)
usages = "%prog -f file"
parser = OP(usage=usages)
parser.add_option("-f", "--files", dest="filein",
metavar="FILEIN", help="The all.DE file generated using DESeq.sh. ")
parser.add_option("-e", "--enrichmentType", dest="enrich_type",
default="BP_GO, MF_GO, CC_GO, KEGG",
help="Default <BP_GO, MF_GO, CC_GO, KEGG>. And only these 4 types of enrichment are supported. These strings will be used to get filenames.")
parser.add_option("-g", "--go-prefix", dest="go_prefix",
help="String like <go/NK_trans.rc.xls.DESeq2.all.DE.entrez>.")
parser.add_option("-F", "--log2-FC", dest="log2FC",
help="log2_FC for screening DE genes")
parser.add_option("-q", "--FDR", dest="FDR",
help="FDR for screening DE genes")
parser.add_option("-t", "--report-tag", dest="report_tag",
default='de', help="A unique string, only alphabets allowed. Default 'de'.")
parser.add_option("-r", "--report-dir", dest="report_dir",
default='report', help="Directory for report files. Default 'report'.")
parser.add_option("-R", "--report-sub-dir", dest="report_sub_dir",
default='3_expression_profile', help="Directory for saving report figures and tables. This dir will put under <report_dir>, so only dir name is needed. Default '3_expression_profile'.")
# parser.add_option("-d", "--doc-only", dest="doc_only",
# default=False, action="store_true", help="Specify to only generate doc.")
# parser.add_option("-n", "--number", dest="number", type="int",
# default=40, help="Set the maximum allowed samples for barplot. Default 40.\
# If more than this number of samples are given, heatmap will be used instead. ")
parser.add_option("-v", "--verbose", dest="verbose",
action="store_true", help="Show process information")
parser.add_option("-D", "--debug", dest="debug",
default=False, action="store_true", help="Debug the program")
(options, args) = parser.parse_args(argv[1:])
assert options.filein != None, "A filename needed for -i"
return (options, args)
#--------------------------------------------------------------------
def generateDoc(report_dir, report_sub_dir, report_tag, file, go_prefix, enrich_typeL, curation_label, log2FC, FDR):
dest_dir = report_dir+'/'+report_sub_dir+'/'
os.system('mkdir -p '+dest_dir)
print "\n## 基因表达图谱和差异基因 {#expr-profile-de-genes}\n"
print "\n### 整体表达聚类"
knitr_read_txt(report_dir, "DE_summary_cluster_pca")
prefix = file.replace('.all.DE', '')
'''
## OLD
#typeL = ['A._vs_.C_up', 'A._vs_.C_dw']
typeL = ['M1._higherThan_.Y1', 'M1._lowerThan_.Y1']
'''
typeL = set([line.split()[1] for line in open(file)])
'''
typeD = {'A._vs_.C':['A._higherThan_.C', 'A._lowerThan_.C']}
'''
typeD = {}
for cmp_type in typeL:
firstSep = cmp_type.find('._')
secndSep = cmp_type.find('_.')
sampA = cmp_type[:firstSep]
sampB = cmp_type[secndSep+2:]
type = cmp_type[firstSep:secndSep+2]
#name = cmp_type.replace('._higherThan_.', '____')
#name = name.repalce('._lowerThan_.', '____')
#nameL = name.split('___')
cmp = '._vs_.'.join([sampA, sampB])
#cmp, type = cmp_type.rsplit('_', 1)
type = type.join([sampA, sampB])
if cmp not in typeD:
typeD[cmp] = [type]
else:
typeD[cmp].append(type)
typeL = typeD.keys()
#--------------------------------------------
anno_all = prefix + '.normalized.anno.xls'
pca = prefix + ".normalized.rlog.pca.pdf"
pearson = prefix + ".normalized.rlog.pearson.pdf"
pca_png = prefix + ".normalized.rlog.pca.png"
pearson_png = prefix + ".normalized.rlog.pearson.png"
copy(dest_dir, anno_all)
copypdf(dest_dir, pca, pearson)
rAnno, rPca, rPearson = getRelativeDir([anno_all, pca_png, pearson_png],
report_sub_dir)
print '''
样品表达谱的相关性分析和主成分分析,展示的是样品之间基因表达的整体相似性和差异以及样品之间的聚类关系。
所有样品基因表达及注释文件下载:[基因表达和注释矩阵]({rAnno})
(ref:sample-pca-pearson) Pearson correlation and PCA analysis for all samples. 左图首先计算样品两两之间基因表达的Pearson相关系数,然后层级聚类分析样品之间的相似性,最后绘制Heatmap展示。每个方块的颜色代表横轴和纵轴对应样品的表达谱的相似度,值越高表明两个样品的基因表达谱越相近。右图是转换原始表达数据为两个正交的主成分,然后绘制样品在这两个主成分组成的空间内的分布关系,空间关系越近的样品相似度越高。
```{{r sample-pca-pearson, out.width="50%", fig.cap="(ref:sample-pca-pearson)"}}
knitr::include_graphics(c("{rPearson}", "{rPca}"))
```
'''.format(rAnno=rAnno, rPearson=rPearson, rPca=rPca)
print "\n### 差异基因功能注释和富集分析\n"
knitr_read_txt(report_dir, "DE_summary_gene_anno_enrichment")
print "\n#### 差异基因总结及表达谱\n"
DE_count = file + ".count.xls"
copy(dest_dir, DE_count)
DE_count_pdf = DE_count + '.heatmapS.pdf'
copypdf(dest_dir, DE_count_pdf)
DE_count_png = DE_count_pdf.replace('pdf', 'png')
DE_count, DE_count_pdf, DE_count_png = \
getRelativeDir([DE_count, DE_count_pdf, DE_count_png], report_sub_dir)
label_tmp_146 = 'de-count-heat-146'
print """
样品间差异基因数目统计,每个方格代表横轴样品相比于纵轴样品丰度显著高的基因的数目 (Figure \@ref(fig:{label}))。
(ref:{label}) 样品间差异基因数目统计,每个方格代表横轴样品相比于纵轴样品丰
度显著高的基因的数目。白色方格表示对应样品未做比较。[PDF]({pdf}) [XLS]({xls})
```{{r {label}, fig.cap="(ref:{label})"}}
knitr::include_graphics("{png}")
```
""".format(label=label_tmp_146+report_tag, pdf=DE_count_pdf, png=DE_count_png, xls=DE_count)
DE_profile = file + ".norm.kmeans.xls"
copy(dest_dir, DE_profile)
DE_profile_pdf = file + '.norm.kmeans.sort.heatmapS.pdf'
copypdf(dest_dir, DE_profile_pdf)
DE_profile_png = DE_profile_pdf.replace('pdf', 'png')
DE_profile, DE_profile_pdf, DE_profile_png = \
getRelativeDir([DE_profile, DE_profile_pdf, DE_profile_png], report_sub_dir)
label_tmp_146 = 'de-profile-heat-146'+report_tag
print """
样品间差异基因表达图谱,每一行代表一个基因,每一列代表一组样品,每个小方格代表基因在对应样品的相对表达量,其值的大小用颜色表示 (Figure \@ref(fig:{label}))。
(ref:{label}) 样品间差异基因表达图谱,每一行代表一个基因,每一列代表一组样品,每个小方格代表基因在对应样品的相对表达量,其值的大小用颜色表示。[PDF]({pdf}) [XLS]({xls})
```{{r {label}, fig.cap="(ref:{label})"}}
knitr::include_graphics("{png}")
```
""".format(label=label_tmp_146, pdf=DE_profile_pdf, png=DE_profile_png, xls=DE_profile)
count = 0
'''
typeD = {'A._vs_.C':['A._higherThan_.C', 'A._lowerThan_.C']}
'''
for type in typeL:
count += 1
volcano = prefix + '.'+type+'.Volcano.png'
venn = file + '.'+type+'.vennDiagram.pdf'
venn_png = venn.replace('pdf', 'png')
copypng(dest_dir, volcano)
copypdf(dest_dir, venn)
samp1, samp2 = type.split("._vs_.")
anno_up_de = prefix+'.'+'._higherThan_.'.join([samp1, samp2])+'.xls.anno.xls'
anno_dw_de = prefix+'.'+'._lowerThan_.'.join([samp1, samp2])+'.xls.anno.xls'
#anno_dw_de = prefix+'.'+type+'.results.DE_dw.anno.xls'
copy(dest_dir, anno_up_de, anno_dw_de)
ranno_up_de, ranno_dw_de, rvolcano, rvenn = \
getRelativeDir([anno_up_de, anno_dw_de, volcano, venn_png], report_sub_dir)
print "\n#### DE genes between {}\n".format(type.replace("._vs_.", " and "))
print """
(ref:volcano-venn-{count}) Volcano plot and venn diagram showing DE genes (FDR<{FDR}, |log2FC|>={log2FC}). 左图火山图展示的是基因差异倍数和显著性FDR的关系,红点表示鉴定出的差异表达基因。log2FC大于0的部分为在样品**{samp1}**中高表达的基因;log2FC小于0的部分为在样品**{samp2}**中高表达的基因。右图Venn图展示的是上调和下调的基因数目。{samp1}._vs_.{samp2}.up表示{samp1}相比于{samp2}高表达的基因。
```{{r volcano-venn-{count}, out.width="50%", fig.cap="(ref:volcano-venn-{count})"}}
knitr::include_graphics(c("{volcano}", "{venn}"))
```
""".format(count=str(count)+report_tag, volcano=rvolcano, venn=rvenn, samp1=samp1, samp2=samp2, FDR=FDR, log2FC=log2FC)
print """
* 样品{samp1}上调的基因及其注释: [Full table]({anno_up_de})
* 样品{samp2}上调的基因及其注释: [Full table]({anno_dw_de})
""".format(samp1=samp1, samp2=samp2, anno_up_de=ranno_up_de, anno_dw_de=ranno_dw_de)
print """
(ref:anno-up-de-{count}) 样品**{samp1}**上调的基因及其注释(top 30, first 7 columns)。
```{{r anno-up-de-{count} }}
anno_up_de <- read.table("{anno_up_de}",header=T,row.names=1, nrows=31, quote="",check.names=F, sep="\\t", comment.char="")
knitr::kable(head(anno_up_de, 30)[, 1:7], booktabs=T, caption="(ref:anno-up-de-{count})")
```
(ref:anno-dw-de-{count}) 样品**{samp2}**上调的基因及其注释(top 30, first 7 columns)。
```{{r anno-dw-de-{count} }}
anno_dw_de <- read.table("{anno_dw_de}",header=T,row.names=1, nrows=31, quote="",check.names=F, sep="\\t", comment.char="")
knitr::kable(head(anno_dw_de, 30)[, 1:7], booktabs=T, caption="(ref:anno-dw-de-{count})")
```
""".format(count=str(count)+report_tag, samp1=samp1, samp2=samp2, anno_up_de=ranno_up_de, anno_dw_de=ranno_dw_de)
print '''
Gene ontology 和 KEGG富集结果展示。GO富集分3个门类,生物进程 (BP)、分子功能 (MF)、细胞组分 (CC)。具体见下方对应的图和图例解释。蓝色字体为超链接,点击可下载。若有某一门类缺失,表示无富集或无对应结果。
'''
'''
typeD = {'A._vs_.C':['A._higherThan_.C', 'A._lowerThan_.C']}
'''
go_prefix_up = go_prefix+'.'+'._higherThan_.'.join([samp1,samp2])+'.'
go_prefix_dw = go_prefix+'.'+'._lowerThan_.'.join([samp1,samp2])+'.'
enrich_cnt = 0
for enrichType in enrich_typeL:
enrich_cnt += 1
go_prefix_up_enrich_xls = go_prefix_up+enrichType+'.xls'
go_prefix_up_enrich_pdf = go_prefix_up+enrichType+'.scatterplot.dv.pdf'
go_prefix_up_enrich_png = go_prefix_up+enrichType+'.scatterplot.dv.png'
go_prefix_dw_enrich_xls = go_prefix_dw+enrichType+'.xls'
go_prefix_dw_enrich_pdf = go_prefix_dw+enrichType+'.scatterplot.dv.pdf'
go_prefix_dw_enrich_png = go_prefix_dw+enrichType+'.scatterplot.dv.png'
copy(dest_dir, go_prefix_up_enrich_xls, go_prefix_dw_enrich_xls)
copypdf(dest_dir, go_prefix_up_enrich_pdf, go_prefix_dw_enrich_pdf)
rgo_prefix_up_enrich_xls, rgo_prefix_up_enrich_png, rgo_prefix_up_enrich_pdf, \
rgo_prefix_dw_enrich_xls, rgo_prefix_dw_enrich_png, rgo_prefix_dw_enrich_pdf = \
getRelativeDir([go_prefix_up_enrich_xls, go_prefix_up_enrich_png, go_prefix_up_enrich_pdf, \
go_prefix_dw_enrich_xls, go_prefix_dw_enrich_png, go_prefix_dw_enrich_pdf], report_sub_dir)
print '''
(ref:enrich-{count}-{enrich_cnt}) Top 30 gene ontology or KEGG enrichment terms for genes up-regulated in {samp1} (left) and {samp2} (right). Nodes color green to red represents enrichment signifiance. Node sizes represents number of DE genes in given terms. 点的颜色代表在对应通路的显著性富集程度,颜色越红代表富集越显著。点的大小代表对应通路中差异基因的数目,点越大,对应通路差异基因越多。缺少的图表示对应样品中没有富集的条目。 Full lists of GO or KEGG enrichment terms can be downloaded for **{samp1}** ([Enrich Table]({rgo_prefix_up_enrich_xls})) ([ PDF pic]({rgo_prefix_up_enrich_pdf})) and **{samp2}** ([Enrich Table]({rgo_prefix_dw_enrich_xls})) ([ PDF pic]({rgo_prefix_dw_enrich_pdf})).
```{{r enrich-{count}-{enrich_cnt}, out.width="50%", fig.cap="(ref:enrich-{count}-{enrich_cnt})"}}
knitr::include_graphics(c("{rgo_prefix_up_enrich_png}", "{rgo_prefix_dw_enrich_png}"))
```
'''.format(count=str(count)+report_tag, enrich_cnt=enrich_cnt, samp1=samp1, samp2=samp2, rgo_prefix_up_enrich_xls=rgo_prefix_up_enrich_xls, rgo_prefix_up_enrich_pdf=rgo_prefix_up_enrich_pdf, rgo_prefix_dw_enrich_xls=rgo_prefix_dw_enrich_xls, rgo_prefix_dw_enrich_pdf=rgo_prefix_dw_enrich_pdf, rgo_prefix_up_enrich_png=rgo_prefix_up_enrich_png, rgo_prefix_dw_enrich_png=rgo_prefix_dw_enrich_png)
#-------------------------------
def main():
options, args = cmdparameter(sys.argv)
#-----------------------------------
file = options.filein
enrich_type = options.enrich_type
enrich_typeL = re.split(r'[, ]*', enrich_type.strip())
log2_FC = options.log2FC
FDR = options.FDR
report_tag = options.report_tag
report_dir = options.report_dir
report_sub_dir = options.report_sub_dir
#doc_only = options.doc_only
go_prefix = options.go_prefix
curation_label = os.path.split(sys.argv[0])[-1].replace('.', '_')
#if doc_only:
# generateDoc(report_dir, report_sub_dir, file, go_prefix, enrich_typeL, curation_label)
# return 0
verbose = options.verbose
global debug
debug = options.debug
#-----------------------------------
aDict = {}
generateDoc(report_dir, report_sub_dir, report_tag, file, go_prefix, enrich_typeL, curation_label, log2_FC, FDR)
if __name__ == '__main__':
startTime = strftime(timeformat, localtime())
main()
endTime = strftime(timeformat, localtime())
fh = open('python.log', 'a')
print >>fh, "%s\n\tRun time : %s - %s " % \
(' '.join(sys.argv), startTime, endTime)
fh.close()
###---------profile the program---------
#import profile
#profile_output = sys.argv[0]+".prof.txt")
#profile.run("main()", profile_output)
#import pstats
#p = pstats.Stats(profile_output)
#p.sort_stats("time").print_stats()
###---------profile the program---------