-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathIntaRNA_CSV_p-value.R
150 lines (121 loc) · 4.87 KB
/
IntaRNA_CSV_p-value.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env Rscript
####################################################################
# Computes p-values and false discovery rates (fdr following Benjamin+Hochberg)
# by fitting a GEV on the energy values computed by IntaRNA.
# Note, such p-value estimates are only useful for genome-wide predictions.
#
# arguments: <IntaRNA-output-CSV> [<out-CSV> = <intarna-csv-output>] [<col-name-E> = E]
#
# 1 <IntaRNA-output-CSV> = ";"-separated CSV output of IntaRNA
# 2 <out-CSV> = file name to write the extended CSV output to (2 new columns)
# 3 <col-name-E> = the column name that holds the energy values to be fitted
#
# example call:
#
# Rscript --vanilla IntaRNA_CSV_p-value.R predictions.csv
#
# This script is part of the IntaRNA source code package. See
# respective licence and documentation for further information.
#
# https://github.com/BackofenLab/IntaRNA
#
####################################################################
####################################################################
# get command line arguments
####################################################################
args = commandArgs(trailingOnly=TRUE)
# check and parse
if (length(args)<1) { stop("call with <intarna-csv-output> [<out-file-with-p-values> = <intarna-csv-output>] [<col-name-E> = E]", call.=FALSE) }
# get input file = IntaRNA csv output
intarnaOutputFile = args[1];
if (!file.exists(intarnaOutputFile )) { stop("intarna-csv-output file '", intarnaOutputFile, "' does not exist!", call.=FALSE) }
# get output file
outFile = intarnaOutputFile;
if (length(args)>=2) {
outFile = args[2]
}
# set column to get energies from
colNameE = "E"
# get column name from argument if present
if (length(args)>=3) {
colNameE = args[3]
}
# column delimiter used in CSV input / output
csvColSep = ";"
# number of digits of p-values
pValPrec = 7
####################################################################
# fits a generalized extreme value distribution to the given energy data
# adopted from 'gev' function of 'evir' library
# @param energy the IntaRNA energy values to fit (a vector)
# @return the fitting parameters xi, mu, and sigma
gevFitting <- function (energy)
####################################################################
{
n.all <- NA
energy <- as.numeric(-energy)
n <- length(energy)
sigma0 <- sqrt(6 * var(energy))/pi
mu0 <- mean(energy) - 0.57722 * sigma0
xi0 <- 0.1
theta <- c(xi0, sigma0, mu0)
negloglik <- function(theta, tmp) {
y <- 1 + (theta[1] * (tmp - theta[3]))/theta[2]
if ((theta[2] < 0) || (min(y) < 0))
out <- 1e+06
else {
term1 <- length(tmp) * logb(theta[2])
term2 <- sum((1 + 1/theta[1]) * logb(y))
term3 <- sum(y^(-1/theta[1]))
out <- term1 + term2 + term3
}
out
}
# compute fit
fit <- optim(theta, negloglik, hessian = TRUE, tmp = energy)
if (fit$convergence)
warning("gev fit optimization may not have succeeded")
return( list( xi=fit$par[1], sigma=fit$par[2], mu=fit$par[3] ) )
}
####################################################################
# computes p-values for the given energy values and GEV distribution
# adopted from 'pgev' function of 'evir' library
# @param energy IntaRNA energy values
# @param gev GEV parameters
# @return p-values for each energy value
gevPvalue <- function (energy, gev=list( xi = 1, mu = 0, sigma = 1) )
####################################################################
{
return ( 1 - exp( - (1 + (gev$xi * ((-energy) - gev$mu))/gev$sigma)^(-1 /gev$xi)))
}
####################################################################
# parse IntaRNA CSV
####################################################################
d = read.csv( intarnaOutputFile, sep=csvColSep )
# check if energy column present
if (!is.element(colNameE, colnames(d))) {
stop("'",colNameE,"' is not among the column names of '",intarnaOutputFile,"'", call.=FALSE);
}
# check if unique
if (sum(colnames(d) == colNameE)>1) {
stop("column name '",colNameE,"' occurs more than once in '",intarnaOutputFile,"'", call.=FALSE);
}
####################################################################
# fit p-values
####################################################################
# get energies to fit
E = d[,colnames(d) == colNameE]
# fit negated energies
gevfit <- gevFitting(E) # fitten
# get rounded pValue
pVal <- round( gevPvalue( E, gevfit ), digits=pValPrec )
# get rounded fdr
fdr <- round( p.adjust(pVal, method="BH"), digits=pValPrec )
####################################################################
# write output
####################################################################
o = cbind( d, pVal, fdr )
colnames(o)[ncol(o)-1] = "p-value"
colnames(o)[ncol(o)] = "fdr"
write.table( o, outFile, sep=csvColSep, row.names=FALSE, col.names = TRUE, quote=FALSE )
#################################################################EOF