-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMSE_Classifier.R
141 lines (126 loc) · 5.83 KB
/
MSE_Classifier.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
## What this script does:
## Classifies BRACA1 and BRACA2 samples using an MSE classifier and returns the first 10 pairs of genes that can separate both
## groups linearly
###-----------------------## LOAD THE LIBRARIES ##-------------------------------------------------------------------------------------------------------------------------
library(RCurl)
library(MASS)
###-----------------------## GET AND PREPROCESS THE DATA ##-------------------------------------------------------------------------------------------------------------------------
## Get the microarray data
url <- getURL("http://yeast.ime.usp.br/~ronaldo/ibi5031-2013/data_breast_cancer.csv")
data <- read.csv(text = url, header = T, row.names=1)
## Filter to include only the BRACA1/BRACA2 patients
braca <- c("P01","P02","P03","P04","P05","P06","P07","P08","P09","P10","P18","P19","P20","P21","P22")
data <- data[braca]
## Get the classes
labels_plot <- data[3227,]
labels <- data[3227,]
for (i in 1:ncol(labels)) {
if (labels[i] == "BRACA1") {
labels[i] = +1
}else {
labels[i] = -1}}
## Transform the classes in a matrix of vectors
vector_y <- as.data.frame(labels)
vector_y <- t(vector_y)
# Pre-process the data
data <- data[1:3226,]
data_mod <- matrix(as.numeric(as.matrix(data)), nrow = nrow(data))
colnames(data_mod) <- colnames(data)
rownames(data_mod) <- rownames(data)
colunas <- colnames(data_mod)
## Get all possible 2 by 2 gene combinations
gene_list <- rownames(data_mod)
gene_combinations <- combn(gene_list, 2)
## Create a table with the results of the MSE classifier
gene_combinations.df <- matrix(0, nrow = 11, ncol = 6)
gene_combinations.df <- as.data.frame(gene_combinations.df)
colnames(gene_combinations.df) <- c("Gene1", "Gene2", "bad_points", "bias", "weight1", "weight2")
###-----------------------## FUNCTIONS ##-------------------------------------------------------------------------------------------------------------------------
## Função para gerar a matrix X
matrix_vector_genes <- function(dados, gene1, gene2) {
sample_size <- ncol(dados)
gene1 <- gene_combinations[1,i]
gene2 <- gene_combinations[2,i]
X <- matrix(1, nrow = sample_size, ncol= 3)
X[,2] <- dados[gene1,]
X[,3] <- dados[gene2,]
return(X)
}
## Função para calcular o MSE
mse_calc <- function(X, Y) {
a <- ginv(t(X) %*% X) %*% t(X) %*% Y ## a is a vector containing bias, weight1 and weight2
return(a)
}
## Função para contar pontos mal classificados
count_missclassifications <- function(a, X, Y) {
sample_size <- nrow(X)
counts <- 0
for (i in 1:sample_size) {
sin_sample <- (a[1] + a[2] * X[i,2] + a[3] * X[i,3])
if ((sin_sample > 0 & Y[i] != 1) | ## se é maior que 0, e o label for diferente de 1, então conte como malclassificado
(sin_sample < 0 & Y[i] == 1)) { ## se é menor que 0, e o label for igual a 1, então conte como malclassificado também
counts <- counts + 1
}
}
return(counts)
}
###-----------------------## RUN THE ANALYSIS ##-------------------------------------------------------------------------------------------------------------------------
## For each gene pair combination
paired_genes <- 0
genes_0_counts <- 0
for (i in 1:ncol(gene_combinations)) {
paired_genes <- paired_genes + 1
if (genes_0_counts <= 10) {
i <- sample(1:5201925, 1)
## Generate the X matrix
X <- matrix_vector_genes(data_mod, gene_combinations[1,i], gene_combinations[2,i])
## Calculate the MSE
a <- mse_calc(X, vector_y)
## Calculate the number of misclassifications
bad_points <- count_missclassifications(a, X, vector_y)
if (bad_points == 0) {
genes_0_counts <- genes_0_counts + 1
## Insert the results in the table
gene_combinations.df[genes_0_counts,]$Gene1 <- gene_combinations[1,i]
gene_combinations.df[genes_0_counts,]$Gene2 <- gene_combinations[2,i]
gene_combinations.df[genes_0_counts,]$bad_points <- bad_points
gene_combinations.df[genes_0_counts,]$bias <- a[1]
gene_combinations.df[genes_0_counts,]$weight1 <- a[2]
gene_combinations.df[genes_0_counts,]$weight2 <- a[3]
}
print(paste("Number of pairs evaluated:", paired_genes))
print(paste("Pair position:",i))
print(paste("Genes with 0 points missclassified:",genes_0_counts))
}
else {
break
}
}
gene_combinations.df <- gene_combinations.df[1:10,]
## Write results to txt
write.table(gene_combinations.df, "Linearly.Separable.Gene.Pairs.txt", quote = F, row.names = F, col.names = T, sep = "\t")
###-----------------------## PLOT THE RESULTS ##-------------------------------------------------------------------------------------------------------------------------
rownames(labels_plot) <- "Group"
labels_plot <- t(labels_plot)
for (i in 1:nrow(gene_combinations.df)) {
gene1 <- paste(gene_combinations.df[i,1], "$", sep = "")
gene2 <- paste(gene_combinations.df[i,2], "$", sep = "")
x <- grep(gene1, rownames(data_mod))
y <- grep(gene2, rownames(data_mod))
pair_plot <- rbind(data_mod[y,], data_mod[x,])
pair_plot <- t(pair_plot)
pair_plot <- cbind(pair_plot, labels_plot)
pair_plot <- as.data.frame(pair_plot)
b <- gene_combinations.df[i,4]
w1 <- gene_combinations.df[i,5]
w2 <- gene_combinations.df[i,6]
pair_plot$V1 <- as.numeric(as.character(pair_plot$V1))
pair_plot$V2 <- as.numeric(as.character(pair_plot$V2))
minRange <- min(pair_plot$V1,pair_plot$V2)
maxRange <- max(pair_plot$V1,pair_plot$V2)
jpeg(paste(i, "_", gene_combinations.df[i,1], "_", gene_combinations.df[i,2], ".jpg", sep = ""))
plot(pair_plot$V1, pair_plot$V2, col = pair_plot$Group, pch = 19, xlab= gene_combinations.df[i,1], ylab= gene_combinations.df[i,2])
curve(expr = ((-b -w2 * x) / w1), from = minRange, to = maxRange, add = TRUE, col = "blue", lty = 2)
legend("bottomright", pch=c(19,19), col=c("black", "red"), c("BRACA1", "BRACA2"), bty="n", box.col="black", box.lwd = 0)
dev.off()
}