-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathaffy_full.sh
executable file
·424 lines (402 loc) · 12.4 KB
/
affy_full.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
#!/bin/bash
#set -x
set -e
#set -u
#########################################
#Data and format in <sample.cor>
#Control must befor treated sample.
#---here ends content(this line and '#' not included)---------------
#Cel file Sample
#1.cel control-1
#2.cel control-2
#3.cel control-3
#4.cel MUT2-1
#5.cel MUT2-2
#6.cel MUT2-3
#---here ends content(this line and '#' not included)---------------
#Notes:The second column must not have same sample names
#########################################
#Notes:
#1.Chips based on different platform, like different GRL
#number can not read together.
#If this happens, error message 错误于read.affybatch(filenames = l$filenames, phenoData = l$phenoData, :
# Cel file Name_UN-1.CEL does not seem to have the correct dimensions
#2.
#########################################
usage()
{
cat <<EOF
${txtcyn}
Usage (the least parameter):
$0 -f pheno.txt -s prefix${txtrst}
${bldblu}Function${txtrst}:
This script is used to analyze affy microarray.
${txtbld}OPTIONS${txtrst}:
-f Data file (with header line, the first column is the
cel filename, the second column is the treat methods and
different repeats (every string in this column should be
unique), ${bldred}tab${txtrst} seperated)${bldred}[NECESSARY]
[Control locates before treat]${txtrst}
name sample (this header line is needed)
GSM723207.CEL shCntrl1
GSM723208.CEL shCntrl2
GSM723209.CEL shCntrl3
GSM723210.CEL shDot1L1
GSM723211.CEL shDot1L2
GSM723212.CEL shDot1L3
-s The prefix of output file.
-M The algorithm to use.
${bldred}Default [rma], accept mas5${txtrst}
-i If error happends when loading needed packages, plaease give
TRUE to -i to install all needed ones.
${bldred}Default [FALSE]${txtrst}
-G The file used to annotate affy probe ID.
Currently accepted is a tow column file with the first column
containing probe_sets and the second column containing gene
symbols (header line is optional).
${bldred}Not necessary but better to supply${txtrst}
-m The number of rows in layout. When drawing the raw
image signal, this value is used.
-n The number of columns in layout. When drawing the raw
image signal, this value is used.
-a Execute PMA selection.[Default FALSE, accept TRUE]
-r Run the script[default] or only produce the script[FALSE].
-R Supplying the path of R you want to use instead of default R.
[Default an empty string, accept a path like
~/soft/R-3.0.0/bin]
-g Run DE test or not.[Default FALSE, accept TRUE only if there
are two conditions in data. When TRUE,
please supply other needed parameters.]
-t The repeat time of treated sample.[integer, default 3]
{if -t and -c not setted, no PMA selection.
This number relates with the number of rows in the last
integer rows.
}
-c The repeat time of control sample.[integer, default 3]
{if -t and -c not setted, no PMA selection.
This number relates with the number of rows in the first
integer rows.
}
-p The accepted maximum p-value.[default 0.05]
-d The accepted maximum fdr.[defaultault 0.3]
-o The accepted minimum fold change.
[log2 based, default 1 means 2 times fold change.]
-q Need quality check, such as raw signals, pairwise comparision,
degradition, raw boxplot. Only needed when first run.
[default:FALSE, skip this step. TRUE accepted if you run for
the first time.]
EOF
}
file=
prefix=
install='FALSE'
gpl=
pma='FALSE'
runDE='FALSE'
rawr= #rows of raw signal
rawc= #cols of raw signal
run='TRUE'
controlR=3
treatR=3
pvalue=0.05
fdr=0.3
foldc=1
qcheck=FALSE
algorithm='rma'
Rscript_path=
while getopts "hf:s:i:M:m:n:a:r:R:g:G:t:c:p:d:o:q:" OPTION
do
case $OPTION in
h)
usage
exit 1
;;
f)
file=$OPTARG
;;
s)
prefix=$OPTARG
;;
M)
algorithm=$OPTARG
;;
i)
install=$OPTARG
;;
m)
rawr=$OPTARG
;;
n)
rawc=$OPTARG
;;
a)
pma=$OPTARG
;;
r)
run=$OPTARG
;;
R)
Rscript_path=$OPTARG
;;
G)
gpl=$OPTARG
;;
g)
runDE=$OPTARG
;;
t)
treatR=$OPTARG
;;
c)
controlR=$OPTARG
;;
p)
pvalue=$OPTARG
;;
d)
fdr=$OPTARG
;;
o)
foldc=$OPTARG
;;
q)
qcheck=$OPTARG
;;
?)
usage
exit 1
;;
esac
done
if [ -z $file ] || [ -z $prefix ] ; then
usage
echo "No -f or -s supplied"
exit 1
fi
if [ $qcheck == 'TRUE' ] && ( [ -z $rawr ] || [ -z $rawc ] ); then
usage
echo "No rawr or rawc for quality check."
exit 1
fi
midname="affy."${algorithm}
cat <<EOF >$prefix.${midname}.r
if ($install){
source("http://bioconductor.org/biocLite.R")
#biocLite(c("affy", "genefilter", "gcrma", "affyPLM", 'xps', 'oligo'))
biocLite(c("affy", "genefilter"))
install.packages("amap", repo=" http://cran.us.r-project.org")
}
library("affy")
print("##reading $file")
pd <- read.AnnotatedDataFrame("$file", sep='\t', header=TRUE)
print('##reading cel files')
abc <- ReadAffy(filenames=rownames(pData(pd)), phenoData=pd,
sampleNames=pData(pd)[,1], verbose=TRUE)
if($qcheck){
print('##Begin quality ckeck')
#check the quality of microarray, the more uniform the better.
#or the less white points the better. Linux and windows may have
#different picture ways. Here windows preferred.
#postscript will get a file very large, not recommended here.
#postscript(file="$prefix.qc.raw.signal.eps", onefile=FALSE,
# horizontal=FALSE, paper="special", width=10, height=8, pointsize=10)
print('##Check raw signal')
jpeg("$prefix.affy.qc.raw.signal.jpg", 2049, 2049, res=72, quality=95) #width, height
#the matrix 6 x 3, must change according to your sample
par(mfrow=c($rawr,$rawc))
image(abc)
dev.off() #needed to save the picture
print('##Pairwise comparision')
#Pairwise comparision. The more linear converge, the better.
jpeg("$prefix.affy.MA.plot.jpg", 2048,2048,res=72,quality=95)
MAplot(abc, pairs=TRUE, plot.method="smoothScatter") #slow
dev.off() #needed to save the picture
#The boxplot, to see the expression variation and decide the normalization
print('##Raw signal--boxplot')
pdf("$prefix.affy.boxplot.raw.signal.pdf")
#las is a parameter of par. 0: always parallel to the axis [_default_],
#1: always horizontal, 2: always perpendicular to the axis,
#3: always vertical.
#But when label is long, it will be cutted of.
boxplot(abc, las=2, main="Boxplot of raw signal")
dev.off()
print('##Begin RNA degradition plot')
#QC-RNA degradation plots, mRNA degrades from 5' to 3'.
deg <- AffyRNAdeg(abc)
summaryAffyRNAdeg(deg) #this is just a show of 'deg'
jpeg("$prefix.affy.rna.degradation.raw.signal.jpg", 800, 600,res=72,quality=95)
plotAffyRNAdeg(deg, transform="shift.only")
dev.off()
}
print("##Begin preprocessing data using << ${algorithm} >>")
#data preprocessing, background correction,normalization and summarization
#calculating expression
eset <- ${algorithm}(abc)
eset_expr <- exprs(eset)
if ("${algorithm}" == "mas5"){
eset_expr <- log2(eset_expr)
}
#
print("##Normalized boxplot")
#The boxplot, to see the expression variation after normalization
pdf("$prefix.${midname}.boxplot.normalization.pdf")
boxplot(as.data.frame(eset_expr), las=2, main="Boxplot of raw signal")
dev.off()
print("hclust of all samples")
#hcluster of RMA normalization
library("amap")
pdf("$prefix.${midname}.whole.gene.hcluster.pdf")
#t:returns the transpose of x.
plot(hcluster(t(eset_expr), method="pearson"), hang=-1)
dev.off()
print("#output expression value to $prefix.${midname}.expr")
#output
write.table(eset_expr, file="$prefix.${midname}.expr", sep="\t",
row.names=TRUE, col.names=TRUE, quote=FALSE)
if ($pma) {
print("##Begin PMA detection")
#pMA selection. if a probe is 'A' in all repeated sample no
#matter control or treated, delete this probe.
detect <- exprs(mas5calls(abc, alpha1=0.01, alpha2=0.05))
probe_A <- c()
detectRC <- dim(detect)
print('##annotate the probes which are A in all repeats')
for(i in 1:detectRC[1]){
delete <- 0
treatDel <- 0
half = $controlR
for(j in 1:half){
if(detect[i,j]=='A'){
treatDel = treatDel + 1
}
}
if(treatDel==3){
delete <- 1
}else{
treatDel <- 0
for(j in (half+1):(half+$treatR)){
if(detect[i,j]=='A'){
treatDel = treatDel + 1
}
}
if(treatDel==3){
delete <- 1
}
}
if(delete==1){
probe_A <- c(probe_A, 0)
}else{
probe_A <- c(probe_A, 1)
}
}
print('Output pma annotated result')
detect <- cbind(detect, probe_A) #only for testing
write.table(detect, file="$prefix.${midname}.pma", sep="\t",
row.names=TRUE, col.names=TRUE, quote=FALSE)
#combine the probe_A with espression data
esetN <- cbind(eset_expr, probe_A)
#delete untrustable data
filterdProbe <- subset(esetN, esetN[,'probe_A']==0)
filterF <- filterdProbe[,1:detectRC[2]]
esetNoA <- subset(esetN, esetN[,'probe_A']==1)
#delete the last column
esetF <- esetNoA[,1:detectRC[2]] #the final result for t-test
print('output expression daate after filter and filtered probes')
write.table(esetF, file="$prefix.${midname}.expr.aPMA", sep="\t",
row.names=TRUE, col.names=TRUE, quote=FALSE)
write.table(filterF, file="$prefix.${midname}.expr.fullA", sep="\t",
row.names=TRUE, col.names=TRUE, quote=FALSE)
}else{
esetF <- eset_expr
}
if ($runDE) {
print('pick differentiated genes')
library('genefilter')
print('t-test')
Ttest <- rowttests(esetF, as.factor(c(rep(1,$controlR),rep(2,$treatR))))
print('method "BH" gives the false discovery rate ?p.adjust. We declare a collection of 100 genes with a maximum FDR of 0.10 to be differentially expressed (DE), then we expect a maximum of 10 genes to be false positives.')
p.adjust <- p.adjust(Ttest\$p.value, method="BH")
TtestAdj <- cbind(Ttest, p.adjust)
esetFF <- cbind(esetF, TtestAdj)
print("Output all gene expression with t-test results")
write.table(esetFF,
file="${prefix}${midname}_${foldc}_${pvalue}_${fdr}.expr.ttest",
sep="\t", row.names=TRUE, col.names=TRUE, quote=FALSE)
#diffExpr <- subset(TtestAdjM, abs(TtestAdjM[,'dm'])>=$foldc &&
# TtestAdjM[,'p.adjust']<=$fdr && TtestAdjM[,'p.value']<=$pvalue)
diffExpr <- subset(TtestAdj, abs(TtestAdj\$dm)>=$foldc)
#diffExpr <- subset(esetFF,
#abs(rowSums(esetFF[,2:$controlR])/rowSums(esetFF[,$controlR+1:$controlR+$treatR]))>=$foldc)
diffExpr <- subset(diffExpr, diffExpr\$p.value<=$pvalue)
diffExpr <- subset(diffExpr, diffExpr\$p.adjust<=$fdr)
write.table(diffExpr, file="${prefix}${midname}_${foldc}_${pvalue}_${fdr}.deexpr",
sep="\t", row.names=TRUE, col.names=TRUE, quote=FALSE)
}
EOF
if [ "$run" = 'TRUE' ];then
if [ -z ${Rscript_path} ]; then
Rscript $prefix.${midname}.r
else
${Rscript_path}/Rscript $prefix.${midname}.r
fi
/bin/rm -f $prefix.${midname}.r
sed -i '1 s/^/Probe\t/' ${prefix}.${midname}.expr
#if [ -s ${gpl} ];then
if ! [ -z ${gpl} ] && [ -s ${gpl} ];then
#grep -v '^#' ${gpl} | cut -f 1,11,13 | \
# awk 'BEGIN{OFS="\t";FS="\t"}ARGIND==1{if(FNR>1)
# a[$1]=$1"@"$2"@"$3;}ARGIND==2{if(FNR>1) $1=a[$1]; print
# $0; }' - ${prefix}.expr | sed 's# /// #;#g' >${prefix}.expr.gene
#grep -v '^#' ${gpl} | paste - ${prefix}.expr >${prefix}.expr.anno
awk 'BEGIN{OFS="\t";FS="\t"}ARGIND==1{
a[$1]=$1"@"$2;}ARGIND==2{if(FNR>1) $1=a[$1]; print
$0; }' ${gpl} ${prefix}.${midname}.expr >${prefix}.${midname}.expr.gene
fi
fi
#cat <<EOF >$prefix.Makefile
#$prefix.r.CT:
# @echo "The R script for analysis."
#
#$prefix.MA.plot.jpg.CT:
# @echo "The pairwise comparision plot to indicate the
# repeatability."
#$prefix.qc.raw.signal.jpg.CT:
# @echo "The raw point picture to detect the quality of the
# microarray. If no light points found, good. If run in linux,
# light lines can be irgnored for system reason."
#
#$prefix.boxplot.raw.signal.pdf.CT:
# @echo "The raw boxplot to see the difference among each sample
# with raw data."
#
#$prefix.boxplot.normalization.pdf.CT:
# @echo "The raw boxplot to see the difference among each sample
# after normalization."
#
#$prefix.whole.gene.hcluster.pdf.CT:
# @echo "The cluster of all samples. The tree graph can used
# to tell if your result is right."
#
#$prefix.pma.CT:
# @echo "The PMA detect result with 0 and 1 annotation in the last
# column. 1 means kept. "
#
#$prefix.expr.CT:
# @echo "The normalized expression value of all probes."
#
#$prefix.expr.fullA:
# @echo "The expression value of wrong probes."
#
#$prefix.cor:
# @echo "The input file of affy_full.sh."
#
#$prefix.t.test:
# @echo "The filtered expression value with t-test results."
#
#${prefix}_${foldc}_${pvalue}_${fdr}.expr.ttest.CT:
# @echo "The selected different expressed genes with expressed
# values ttest results."
#
#${prefix}_${foldc}_${pvalue}_${fdr}.deexpr:
# @echo "The selected different expressed genes and ttest results."
#EOF