-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathPxR3D_nt.R
111 lines (96 loc) · 3.76 KB
/
PxR3D_nt.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
source("utils.R")
options(warn=-1)
suppressMessages(library(getopt, quietly=T))
#argument mask: 0 - no argument, 1 - required argument, 2 - optional argument
optionSpec = matrix(c(
'data.file', 'i', 1, "character",
'cv.folds', 'c', 2, "character",
'permute.time', 'p', 2, "character",
'model.file', 'o', 1, "character",
'plot.file', 'f', 2, "character",
'verbose', 'v', 2, "integer",
'help' , 'h', 0, "logical"
), byrow=TRUE, ncol=4)
opt = getopt(optionSpec)
## default parameters
data.file <- opt$data.file
cv.folds <- 10
model.file <- opt$model.file
permute.time <- 2000
verbose <- 0
seed <- 30
if (!is.null(opt$data.file)) {data.file <- opt$data.file}
if (!is.null(opt$cv.folds)) {cv.folds <- opt$cv.folds}
if (!is.null(opt$plot.file)) {plot.file <- opt$plot.file}
if (!is.null(opt$model.file)) {model.file <- opt$model.file}
if (!is.null(opt$permute.time)) {permute.time <- opt$permute.time}
if (!is.null(opt$verbose)) {verbose <- opt$verbose}
if ( !is.null(opt$help) | is.null(opt$data.file) | is.null(opt$model.file))
{
cat (
'Predict crosslinking nucleotides by PxR3D\n',
'Usage: Rscript ', get_Rscript_filename(),"[options] -i <feature mat file> -o <model file>\n",
' -i, feature mat file feature matrix file with crosslinking information\n',
' -o, model file(Rds format) final model file\n',
'[options]\n',
' -c, CV folds Integer(default: 10)\n',
' -p, permutation time Integer(default: 2000)\n',
' -f, plot file ROC plot file\n',
' -v, verbose Verbose mode\n',
' -h, help Print usage\n'
);
q(status=1);
}
### preprocess prediction and feature table ###
if(verbose){
cat("Preprocess data\n")
}
process.obj <- preProcessData(data.file, varcutoff = 0)
rawdata <- process.obj$rawData
rawdata.full <- extendrawdata(rawdata)
cleandata <- process.obj$processedData
interact.nt.idx <- process.obj$interact.nt.idx
## add up/dn stream code
cleandata.bak <- cleandata
cleandata <- cbind(cleandata,rawdata.full[interact.nt.idx, c(259,262, 266) ] )
cleandata$upstreamNt <- as.factor(cleandata$upstreamNt)
cleandata$dnstreamNt <- as.factor(cleandata$dnstreamNt)
cleandata$nt <- as.factor(cleandata$nt)
## encode nt identity to numerical
numericl.feature.mat <- cbind(model.matrix(~ nt -1 , cleandata),
model.matrix(~ upstreamNt-1 , cleandata),
model.matrix(~ dnstreamNt-1 , cleandata))
#cleandata.bak <- cleandata
cleandata <- cbind(cleandata[,1:match("base_aa_stack_aromatic",colnames(cleandata))],as.data.frame(numericl.feature.mat) )
## set mtry and ntree
mtry.seq <- seq(1,20)
ntree.seq <- c(10,100,200,500,1000,2000,5000)
set.seed(seed)
repeat.times <- 1
if(verbose){
cat("Model training\n")
}
## Predict crosslinking nt using random forest with SMOTE for imbalanced datasets
rf.obj <- rfPredict(cleandata, mtry.seq, ntree.seq, cv.folds, repeat.times, smote = T)
rf.par <- rf.obj$fit$bestTune
### permute features to get feature rank pvalue ###
set.seed(seed)
#rf.permute <- rf.permutation(cleandata, SMOTE = T, permute.time = permute.time, rf.par = rf.par)
### calculate GLM weight ###
set.seed(seed)
glmGrid <- expand.grid(alpha=seq(0,1,0.1), lambda = seq(0,1, length =50))
#glm.fit.obj <- calculateGLMweight(cbind(group=cleandata[,1], as.data.frame(scale(cleandata[, -1]))), cv.folds,repeat.times = repeat.times, metric = "AUC")
if(verbose){
cat("Saving model\n")
}
if(!is.null(model.file)){
saveRDS(rf.obj$fit$finalModel, model.file)
}
if(verbose){
cat("Plotting ROC curve\n")
}
### plot figures ###
if(!is.null(plot.file)){
set.seed(seed)
plotROCwithCI(rf.obj$fit$pred$obs, rf.obj$fit$pred$xl, plot.file = plot.file)
}