forked from bfriedrichgrube/OC_CPTAC_iTRAQ_SWATH
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path7a_CPTAC_classification.R
192 lines (136 loc) · 5.85 KB
/
7a_CPTAC_classification.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
## Mclust classification algorithm (For Figure 3, S2)
## Step 1: Prepare R session
#load packages
library(MSnbase)
library(mclust)
# set working directory
setwd('')
## Step 2: Load required data
#load protein data
load('CPTAC_SWATH_overlap.Rdata')
load('CPTAC_DDA_overlap.Rdata')
#read in clinical and annotation table
clinical<- read.delim('CPTAC_clinical.csv', sep=',', dec='.', header=T, row.names=1)
dim(clinical)
[1] 103 23
## Step 3: Calculate z-scores for clustering
SWATH_z <- SWATH_overlap
SWATH_z <- sweep(SWATH_z, 1, apply(SWATH_z, 1, mean, na.rm=T), "-")
SWATH_z <- sweep(SWATH_z, 1, apply(SWATH_z, 1, sd, na.rm=T), "/")
dim(SWATH_z)
[1] 1599 103
DDA_z <- DDA_overlap
DDA_z <- sweep(DDA_z, 1, apply(DDA_z, 1, mean, na.rm=T), "-")
DDA_z <- sweep(DDA_z, 1, apply(DDA_z, 1, sd, na.rm=T), "/")
dim(DDA_z)
[1] 1599 103
DDA_z0 <- DDA_missing0
DDA_z0 <- sweep(DDA_z0, 1, apply(DDA_z0, 1, mean, na.rm=T), "-")
DDA_z0 <- sweep(DDA_z0, 1, apply(DDA_z0, 1, sd, na.rm=T), "/")
dim(DDA_z0)
[1] 4366 103
## Step 4: Classification
set.seed(0)
#SWATH common proteins
CPTAC_SWATH_cluster_final <- Mclust(t(SWATH_z))
summary(CPTAC_SWATH_cluster_final)
----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------
Mclust VII (spherical, varying volume) model with 3 components:
log.likelihood n df BIC ICL
-212801.5 103 4802 -447858.9 -447858.9
Clustering table:
1 2 3
17 55 31
SWATH_class <- CPTAC_SWATH_cluster_final$classification
#iTRAQ DDA common proteins
CPTAC_DDA_cluster_final <- Mclust(t(DDA_z))
summary(CPTAC_DDA_cluster_final)
----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------
Mclust VII (spherical, varying volume) model with 3 components:
log.likelihood n df BIC ICL
-220097.7 103 4802 -462451.5 -462451.5
Clustering table:
1 2 3
52 20 31
DDA_class <- CPTAC_DDA_cluster_final$classification
# iTRAQ DDA all without missing values
CPTAC_DDA_cluster_all <- Mclust(t(DDA_z0))
summary(CPTAC_DDA_cluster_all)
----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------
Mclust VII (spherical, varying volume) model with 3 components:
log.likelihood n df BIC ICL
-604107.5 103 13103 -1268944 -1268944
Clustering table:
1 2 3
56 16 31
DDA_class_all <- CPTAC_DDA_cluster_all$classification
#combine results into a table and calculate similarity
#update and upload table
tmp <- read.delim('/home/betty/Documents/Biomarker_Cancer/Ovarian_JohnHopkins/CPTAC_Clustering/clustering_results_all_170214.csv', sep=',', header=T, as.is=T)
tmp <- tmp[,c(1:2,13:15)]
classification_result <- cbind(tmp, DDA_class, DDA_class_all)
tmp1 <- classification_result
rownames(tmp1)<- tmp1[,2]
tmp1 <- tmp1[sort(rownames(tmp1)),]
classification_result <- cbind(tmp1, SWATH_class)
rownames(classification_result) <- classification_result[,1]
classification_result <- classification_result[sort(rownames(classification_result)),]
head(classification_result)
save(DDA_z, CPTAC_DDA_cluster_final, file='DDA_classification_workflow.Rdata')
save(SWATH_z, CPTAC_SWATH_cluster_final, file='SWATH_classification_workflow.Rdata')
save(classification_result, file='classification_result.Rdata')
## Step 5: Calculate Adjusted Rand Index (Figure 3B)
#ARI of results from common proteins
adjustedRandIndex(classification_result$DDA_class, classification_result$SWATH_class)
[1] 0.2146062
#ARI of results from SWATH common proteins vs. iTRAQ DDA original
adjustedRandIndex(classification_result$CPTAC_subgroup, classification_result$SWATH_class)
[1] 0.1400079
#ARI of results from iTRAQ DDA all vs. original
adjustedRandIndex(classification_result$CPTAC_subgroup, DDA_class_all)
[1] 0.3641888
#ARI of results from iTRAQ DDA common proteins vs. original
adjustedRandIndex(classification_result$CPTAC_subgroup, classification_result$DDA_class)
[1] 0.329218
#ARI of results from iTRAQ DDA all vs. common proteins
adjustedRandIndex(classification_result$DDA_class, DDA_class_all)
[1] 0.5831374
#ARI of results from iTRAQ DDA original vs. TCGA subgroup
adjustedRandIndex(classification_result$CPTAC_subgroup, classification_result$mRNA_subgroup)
[1] 0.201455
## Step 5: Prepare MSnSet Data for following step
#Prepare SWATH MSnSet
#add binary information of classification
bi_1 <- classification_result$SWATH_class==1
bi_2 <- classification_result$SWATH_class==2
bi_3 <- classification_result$SWATH_class==3
clinical_SWATH <- clinical[sort(rownames(clinical)),1:17]
clinical_SWATH <- cbind(clinical_SWATH, classification_result[,7], bi_1, bi_2, bi_3)
colnames(clinical_SWATH)[15]<-'HRD_status'
colnames(clinical_SWATH)[18:21]<- c('SWATH_class', '1.binary', '2.binary', '3.binary')
all(rownames(clinical_SWATH)==colnames(SWATH_overlap))
[1] TRUE
clinical_SWATH_anno <- new('AnnotatedDataFrame', data=clinical_SWATH)
CPTAC_SWATH_Set <- new('MSnSet', exprs=SWATH_overlap, phenoData=clinical_SWATH_anno)
#Prepare DDA MSnSet
#add binary information of classification
bi_1 <- classification_result$DDA_class==1
bi_2 <- classification_result$DDA_class==2
bi_3 <- classification_result$DDA_class==3
clinical_DDA <- clinical[,1:17]
clinical_DDA <- cbind(clinical_DDA, classification_result[,6], bi_1, bi_2, bi_3)
colnames(clinical_DDA)[15]<-'HRD_status'
colnames(clinical_DDA)[18:21]<- c('DDA_class', '1.binary', '2.binary', '3.binary')
colnames(DDA_overlap)<- rownames(clinical_DDA)
all(rownames(clinical_DDA)==colnames(DDA_overlap))
[1] TRUE
clinical_DDA_anno <- new('AnnotatedDataFrame', data=clinical_DDA)
CPTAC_DDA_Set <- new('MSnSet', exprs=DDA_overlap, phenoData=clinical_DDA_anno)
save(CPTAC_SWATH_Set, file='CPTAC_SWATH_Set.Rdata')
save(CPTAC_DDA_Set, file='CPTAC_DDA_Set.Rdata')