-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathscNBMF.R
358 lines (290 loc) · 11.9 KB
/
scNBMF.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
###############################################
#### Code for scNBMF
#### Paper: A fast and efficient count-based matrix factorization method for detecting cell types from single-cell RNAseq data
###############################################
library(tensorflow)
library(edgeR)   # DGEList / estimateDisp: gene-wise NB dispersion estimates
library(igraph)  # compare(): NMI / ARI scoring (was misspelled "igragh", which fails at load)
library(Rtsne)
tf$logging$set_verbosity(tf$logging$INFO)
np <- import("numpy")  # NOTE(review): np is never used below — kept in case other chunks rely on it
Datapreprocess <- function(data, obs_col, var_col, val_col, offset_col, verbose, row, col) {
#################################
# Preprocess the raw count matrix into "long" (one row per gene/cell pair) form.
#
# Notes:
# Output rows are ordered cell-major: all genes of cell 1, then all genes of
# cell 2, ... Indices are 0-based because they feed tf$gather downstream.
#
# Variables:
# data: raw count matrix or data.frame (shape: genes x cells)
# obs_col: column of the result holding the 0-based cell index
# var_col: column of the result holding the 0-based gene index
# val_col: column of the result holding the raw count value
# offset_col: column of the result holding the total count of the cell
# verbose: whether dimensional information is printed
# row: number of genes (nrow(data)); col: number of cells (ncol(data))
#
# Returns: a (row * col) x 4 numeric matrix.
#################################
if (verbose) {
cat("Datapreprocess...\n");
cat( sprintf("Data rows %d \n", nrow(data)) );
cat( sprintf("Data cols %d \n", ncol(data)) );
cat( sprintf("obs_col %d \n" , obs_col) );
cat( sprintf("var_col %d \n" , var_col) );
cat( sprintf("val_col %d \n" , val_col) );
cat( sprintf("offset_col %d \n" , offset_col) );
}
X <- matrix(0, nrow = row * col, ncol = 4);
colsum <- colSums(data);
# Vectorized construction (replaces the original O(row*col) scalar loop).
# Row t of X corresponds to cell i = (t-1) %/% row + 1 and gene j = (t-1) %% row + 1,
# exactly matching the original nested-loop ordering.
X[, obs_col] <- rep(seq_len(col) - 1L, each = row);
X[, var_col] <- rep(seq_len(row) - 1L, times = col);
X[, val_col] <- as.vector(as.matrix(data));   # column-major flatten == cell-major order
X[, offset_col] <- rep(colsum, each = row);
return(X)
}
get_weight <- function(W, lambda1) {
####################################
# Register an L1 penalty on W in the TF graph collection "collection".
#
# Variables:
# W: the tensor/variable to penalise
# lambda1: L1 coefficient; penalty = lambda1 * (|w1| + |w2| + ... + |wn|)
#
# The penalty term itself goes into the collection; the function returns
# W unchanged for the caller to use in its loss expression.
####################################
penalty_term <- tf$contrib$layers$l1_regularizer(lambda1)(W)
tf$add_to_collection("collection", penalty_term)
W
}
get_weight2 <- function(W, lambda1) {
####################################
# Register an L2 penalty on W in the TF graph collection "collection".
#
# Variables:
# W: the tensor/variable to penalise
# lambda1: L2 coefficient; penalty = lambda1 * ((w1)^2 + (w2)^2 + ... + (wn)^2)
#
# The penalty term itself goes into the collection; the function returns
# W unchanged for the caller to use in its loss expression.
####################################
penalty_term <- tf$contrib$layers$l2_regularizer(lambda1)(W)
tf$add_to_collection("collection", penalty_term)
W
}
next_batch <- function(data, batch_size, i, NN) {
###################################
# Return the next mini-batch of the (shuffled) long-format training data.
#
# Notes:
# The data is cycled: the start index is derived from the iteration count
# modulo NN, and we restart from row 1 whenever a full batch would run past
# the last row. (Fixed off-by-one: a batch ending exactly at row NN is valid
# and no longer triggers a reset.)
#
# Variables:
# data: preprocessed data (shape: genes*cells x 4)
# batch_size: number of rows per batch
# i: iteration counter (0-based)
# NN: total number of rows in data (genes*cells)
#
# Returns: a batch_size x 4 matrix (drop = FALSE keeps matrix shape even
# when batch_size is 1, since callers index the result as batch[, col]).
####################################
indx <- (batch_size * i) %% NN + 1
if ((indx + batch_size - 1) > NN) {
indx <- 1
}
return(data[indx:(indx + batch_size - 1), , drop = FALSE])
}
scNBMF_model <- function(H, G, C, k, variable_idx, sample_idx, T_, y_, psi, penalty_type, lambda_for_l1, eps = 1e-8){
###################################
# Build the scNBMF loss graph: a negative-binomial matrix factorization
# X ~ NB(mu = T * exp(W H)), with an L1 or L2 penalty on the gene loadings W.
#
# H: k x C variable, the reduced cell representation
# G: number of genes; C: number of cells; k: reduced dimension
# variable_idx: gene index of each batch entry (0-based)
# sample_idx: cell index of each batch entry (0-based)
# T_: per-cell total count (offset) of each batch entry
# y_: raw count of each batch entry
# psi: edgeR tagwise dispersions, one per gene
#      NOTE(review): the likelihood below treats psi as the NB size
#      parameter; edgeR's tagwise.dispersion is a dispersion — confirm the
#      intended parameterization.
# penalty_type: 1 means L1 penalty, anything else means L2
# lambda_for_l1: coefficient of the L1/L2 penalty
# eps: small constant guarding log(0)
#
# return:
# LL: the (penalized) log-likelihood tensor to be maximized
#####################################
# Gene loadings, randomly initialized.
W <- tf$Variable(tf$random_normal(shape(G, k)), name='weights')
#S <- tf$Variable(tf$zeros(shape(1L)))
# Gather the per-batch-entry gene rows / dispersions / cell columns.
W_ <- tf$gather(W, variable_idx)
psi_ <- tf$gather(psi, variable_idx)
H_ <- tf$gather(tf$matrix_transpose(H), sample_idx)
# Linear predictor eta = <w_gene, h_cell>; mean mu = T * exp(eta).
eta_ <- tf$reduce_sum(W_ * H_, 1L)
#mu_ <- tf$exp(eta_ + S + tf$log(T_))
mu_ <- tf$exp(eta_ + tf$log(T_))
# NB log-likelihood up to terms constant in W and H.
LL <- tf$reduce_sum(y_ * tf$log(mu_ + eps) - (y_ + psi_) * tf$log(mu_ + psi_ + eps))
if (penalty_type == 1){
Wpenalty <- get_weight(W ,lambda_for_l1)
}
else{
Wpenalty <- get_weight2(W ,lambda_for_l1)
}
# NOTE(review): get_weight/get_weight2 return W itself (the actual penalty
# term is only stored in the TF collection "collection"), so `LL + Wpenalty`
# broadcasts the scalar LL over the G x k matrix and reduce_mean yields
# LL + mean(W). This looks unintended — verify against the paper/authors
# before relying on the penalty's effect.
LL <- tf$reduce_mean(LL + Wpenalty)
return(LL)
}
fill_feed_dict <- function(data, batch_size, i, NN,
                           obs_col, var_col, val_col, offset_col,
                           sample_idx, variable_idx, y_, T_) {
  # Build the feed_dict mapping the four placeholders to the columns of the
  # next `batch_size`-row batch of the long-format training data.
  chunk <- next_batch(data, batch_size, i, NN)
  dict(
    sample_idx   = chunk[, obs_col],    # cell indices
    variable_idx = chunk[, var_col],    # gene indices
    y_           = chunk[, val_col],    # raw counts
    T_           = chunk[, offset_col]  # per-cell total counts (offsets)
  )
}
CalculateCluster <- function(H_result, label_true, result_file, t0, num_clusters, times = 100){
# Cluster the reduced matrix and score the clustering against the true labels.
#
# Notes: runs k-means `times` times and reports the median NMI and ARI, both
# to `result_file` and to the screen. (Fixed: `times` was accepted but the
# loop was hardcoded to 100 repetitions.)
#
# H_result: dimensional-reduction result (shape: cells x k)
# label_true: the true cell labels
# result_file: output file recording the NMI / ARI scores
# t0: start timestamp — currently unused, kept for interface compatibility
# num_clusters: number of clusters for k-means
# times: number of k-means repetitions
nmi <- numeric(times);  # preallocated instead of growing with c() in the loop
ari <- numeric(times);
for(j in seq_len(times)){
res_clust <- kmeans(H_result, num_clusters);
# compare() is igraph's partition-similarity measure
nmi[j] <- compare(unlist(res_clust$cluster), unlist(label_true), method = "nmi");
ari[j] <- compare(unlist(res_clust$cluster), unlist(label_true), method = "adjusted.rand");
}
nmi <- median(nmi);
ari <- median(ari);
cat("Kmeans\n",file = result_file);
cat( sprintf("NMI %f ARI %f\n" , nmi, ari) ,file = result_file , append = TRUE);
cat("Kmeans\n");
cat( sprintf("NMI %f ARI %f\n" , nmi, ari) );
}
scNBMF <- function(input_file, calcluster = FALSE, tagsname = '', batch_size = 10000,
num_iter = 18000, learning_rate = 0.001, inner_iter = 5, report_every = 100, ndim = 20, penalty_type = 1,
storename = "./H_result.csv", result_file = "./cluster_result.txt", lambda_for_l1 = 0.3,
tsneshow = TRUE, title = '', verbose = TRUE) {
#############################
# Main entry point of the scNBMF algorithm.
#
# Notes: to cluster after computing the reduced matrix, pass calcluster = TRUE
# and tagsname = "name_of_the_label.txt".
#
# input_file: the input count expression matrix (shape: genes x cells), comma-separated
# calcluster: whether clusters (NMI / ARI) should be computed
# tagsname: name of the label file (required when calcluster = TRUE)
# batch_size: size of the training batch
# num_iter: number of training iterations
# learning_rate: learning rate of the Adam optimizer
# inner_iter: optimizer steps per loss refresh
# report_every: print the cost every this many iterations
# ndim: (k) target number of reduced dimensions
# penalty_type: 1 means L1 penalty, anything else means L2
# storename: output file recording the dimensional-reduction matrix
# result_file: output file recording the NMI / ARI cluster scores
# lambda_for_l1: coefficient of the L1/L2 penalty
# tsneshow: whether to draw the t-SNE figure
# title: title of the t-SNE figure (the historical default title is used when '')
# verbose: show verbose information
# (gene-wise dispersions are estimated internally with edgeR, not read from a file)
############################
data_input <- read.table(input_file, sep = ',');
# Filter out genes with total count <= 10 across all cells.
data <- data_input[rowSums(data_input) > 10, ]
col <- as.integer(ncol(data));
row <- as.integer(nrow(data));
obs_col <- 1L
var_col <- 2L
val_col <- 3L
offset_col <- 4L
G <- row  # number of genes
C <- col  # number of cells
# Estimate gene-wise NB dispersions with edgeR.
edgeres <- DGEList(counts = data)
edgeres <- estimateDisp(edgeres, robust = TRUE)
psi <- edgeres$tagwise.dispersion
data <- Datapreprocess(data, obs_col, var_col, val_col, offset_col, verbose, row, col)
NN <- as.integer(nrow(data))
if (verbose){
cat (psi)
cat ( sprintf("\n"));
cat ( sprintf("The number of genes %d\n" , G) );
cat ( sprintf("The number of cells %d\n" , C) );  # fixed: previously printed "genes" for C
}
data <- data[sample(nrow(data), nrow(data), replace = FALSE), ]  # reshuffle rows once
t0 <- Sys.time();
# NOTE(review): this relies on reticulate's with() evaluating the block in the
# calling frame so that H_result is still visible afterwards — confirm with
# the installed reticulate version.
with(tf$Graph()$as_default(), {
k <- as.integer(ndim);
H <- tf$Variable(tf$random_normal(shape(k, C)), name='PCs');  # reduced representation (k x cells)
sample_idx <- tf$placeholder(tf$int32, shape(batch_size) );   # cell index in the long-format data
variable_idx <- tf$placeholder(tf$int32, shape(batch_size) ); # gene index in the long-format data
T_ <- tf$placeholder(tf$float32, shape(batch_size) );  # per-cell total count (offset)
y_ <- tf$placeholder(tf$float32, shape(batch_size) );  # raw count
LL <- scNBMF_model(H, G, C, k, variable_idx, sample_idx, T_, y_, psi, penalty_type, lambda_for_l1)
cost <- -LL / batch_size  # maximize LL == minimize the mean negative log-likelihood
optimizer <- tf$train$AdamOptimizer(learning_rate)$minimize(cost)
init <- tf$global_variables_initializer()
if (verbose){
cat('Training\n')
}
sess <- tf$Session()
sess$run(init)
for (i in 0:num_iter){
feed_dict <- fill_feed_dict(data, batch_size, i, NN,
obs_col, var_col, val_col, offset_col,
sample_idx,
variable_idx,
y_,
T_)
for (j in 0:inner_iter){
sess$run(optimizer, feed_dict=feed_dict)
}
c <- sess$run(cost, feed_dict=feed_dict)
if( i %% report_every == 0){
cat(sprintf('Cost: %f \n', c))
}
}
H_result <- sess$run(H)
})
H_result <- t(H_result)  # -> cells x k
save_H_result <- round(H_result, 6);
write.csv(save_H_result, file = storename);
if(verbose){
if(calcluster){
cat(sprintf("Need to calculate cluster...\n"))
}
else{
cat(sprintf("Don't need to calculate cluster....\n"))
}
cat(sprintf("Tagsname is %s\n" ,tagsname))
}
if(calcluster){
if(tagsname == ''){
cat(sprintf("Need tags name(label of the cell) to calculate the result of NMI and ARI"))
}
else{
label_true <- read.table(tagsname)
num_clusters <- nrow(unique(label_true))
CalculateCluster(H_result, label_true, result_file, t0, num_clusters = num_clusters, times = 100)
}
}
if(tsneshow){
res_tsne <- Rtsne(H_result)
# Colour by the true labels only when they were actually loaded; previously
# this referenced label_true unconditionally and crashed when calcluster = FALSE.
point_col <- if (calcluster && tagsname != '') label_true$V1 else 1
plot(res_tsne$Y, col = point_col, xlab = "tSNE1", ylab = "tSNE2", cex.axis = 1.5, cex.lab = 1.3)
# Use the caller-supplied title when given; keep the historical default otherwise.
title(if (title != '') title else "scNBMF tSNE, 2 Groups with Labels")
}
}