#!/usr/bin/Rscript
library(lassoshooting)
# make data
rm(list = ls(all = TRUE)) # make sure previous work is clear
ls()
#x0 <- c(1,1,1,1,1) # column of 1's
#x1 <- c(1,2,3,4,5) # original x-values
# create the x- matrix of explanatory variables
#x <- as.matrix(cbind(x0,x1))
p = 70; n = 300 # number of predictors and number of observations
x <- matrix(rnorm(n*p), nrow=n, ncol=p)
# create the y-matrix of dependent variables
#y <- as.matrix(c(3,7,5,11,14))
y <- matrix(rnorm(n), nrow=n)
#y <- as.matrix(rnorm())
m <- nrow(y)
# implement feature scaling
#x.scaled <- x
#x.scaled[,2] <- (x[,2] - mean(x[,2]))/sd(x[,2])
# analytical results with matrix algebra
analytical <- solve(t(x)%*%x)%*%t(x)%*%y # w/o feature scaling
#solve(t(x.scaled)%*%x.scaled)%*%t(x.scaled)%*%y # w/ feature scaling
# results using canned lm function match results above
#summary(lm(y ~ x[, 2])) # w/o feature scaling
#summary(lm(y ~ x.scaled[, 2])) # w/feature scaling
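# sanity check (a sketch, not part of the original script): for the simulated
# multi-column x above, the closed-form estimate should agree with lm() fitted
# without an intercept, up to numerical error
fit <- lm(as.vector(y) ~ x - 1)
max(abs(coef(fit) - analytical))   # expected to be ~0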
# define the gradient of the least-squares cost, dJ/dtheta = (1/m) * t(x) %*% (h(x) - y),
# where h(x) = x %*% t(theta); in matrix form this is as follows:
grad <- function(x, y, theta) {
gradient <- (1/m)* (t(x) %*% ((x %*% t(theta)) - y))
return(t(gradient))
}
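# numerical sanity check for grad() (a sketch; 'J', 'theta0', 'eps' and 'e1' are
# names introduced here for illustration): compare the analytic gradient with a
# central finite difference of the cost J(theta) = (1/(2m)) * sum((x %*% t(theta) - y)^2)
J <- function(theta) (1/(2*m)) * sum((x %*% t(theta) - y)^2)
theta0 <- matrix(0, nrow=1, ncol=p)
eps <- 1e-6
e1 <- matrix(0, nrow=1, ncol=p); e1[1,1] <- eps          # perturb the first coordinate
fd <- (J(theta0 + e1) - J(theta0 - e1)) / (2*eps)        # finite-difference estimate
c(finite_diff = fd, analytic = grad(x, y, theta0)[1,1])  # the two should agree closely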
# define gradient descent update algorithm
grad.descent <- function(x, maxit){
  thetas <- matrix(0, ncol=dim(x)[2], nrow=maxit)  # store theta after every iteration
  theta  <- matrix(0, nrow=1, ncol=dim(x)[2])      # start from theta = 0
  alpha  <- .1                                     # set learning rate
  for (i in 1:maxit) {
    theta <- theta - alpha*grad(x, y, theta)       # one gradient step (y comes from the global environment)
    thetas[i, ] <- theta
  }
  return(thetas)
}
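# usage sketch (the 200-iteration count is illustrative): with alpha = 0.1 the
# last row of the gradient-descent path should be close to the closed-form fit
gd_check <- grad.descent(x, 200)
max(abs(gd_check[200, ] - analytical))   # expected to be small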
# soft-thresholding operator S(z, lambda) = sign(z) * max(|z| - lambda, 0),
# needed by the coordinate update below and not defined elsewhere in the script
softthresh <- function(z, lambda) sign(z) * pmax(abs(z) - lambda, 0)
coord.descent <- function(x, maxit, lambda=0.1){
  p <- dim(x)[2]
  theta <- matrix(0, ncol=p, nrow=maxit) # one row of parameters per sweep
  for(k in 1:maxit){
    if (k > 1) theta[k, ] <- theta[k-1, ]   # start each sweep from the previous values
    for(i in 1:p) {
      # update coordinate i holding the others fixed at their most recent values
      theta[k,i] <- t(x[,i]) %*% (y - x[,-i] %*% theta[k,-i]) / (t(x[,i]) %*% x[,i])
      theta[k,i] <- softthresh(theta[k,i], lambda)
    }
  }
  return(theta)
}
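# optional cross-check against the lassoshooting package loaded at the top
# (a sketch; this assumes lassoshooting() accepts X, y and lambda and returns a
# fit with a 'coefficients' element -- see ?lassoshooting for the exact
# interface and its lambda scaling)
# ls_fit <- lassoshooting(X = x, y = y, lambda = 0)
# max(abs(ls_fit$coefficients - analytical))   # with lambda = 0 this is just OLS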
maxiter = 20
out_gd = grad.descent(x, maxiter)
out_cd = coord.descent(x, maxiter, lambda=0)
library(ggplot2)
out1 <- data.frame(iter=1:maxiter, p2 = out_cd[,2])
out2 <- data.frame(iter=1:maxiter, p2 = out_gd[,2])
anal <- data.frame(iter=1:maxiter, p2 = analytical[2,])  # closed-form value, drawn as a flat reference line
png(filename="./gd_vs_cd.png", width=1200, height=800)
print(
  ggplot(out1, aes(iter, p2)) +
    geom_line(aes(color="Coordinate descent"), size=1.5) +
    geom_line(data=out2, aes(color="Gradient descent"), size=1) +
    geom_line(data=anal, aes(color="Analytical"), size=1) +
    labs(color="")
)
dev.off()
# -----------------------------------------------------------------------
# cost and convergence intuition
# -----------------------------------------------------------------------
# typically we would iterate the algorithm above until the change in the
# cost function (as a result of the updated parameter values) falls below
# some small value c, the chosen convergence criterion. If c is not met
# after a given number of iterations, you can increase the number of
# iterations or change the learning rate 'alpha' to speed up convergence.
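# a minimal sketch of that stopping rule (the function name, tolerance and
# default values below are illustrative, not part of the original script):
# iterate until the decrease in the least-squares cost falls below 'tol'
grad.descent.tol <- function(x, y, alpha = 0.1, tol = 1e-8, maxit = 10000) {
  m <- nrow(x)
  theta <- matrix(0, nrow = 1, ncol = ncol(x))
  J <- function(th) (1/(2*m)) * sum((x %*% t(th) - y)^2)   # least-squares cost
  prev <- J(theta)
  for (i in 1:maxit) {
    theta <- theta - alpha * t((1/m) * (t(x) %*% (x %*% t(theta) - y)))  # gradient step
    cur <- J(theta)
    if (abs(prev - cur) < tol) break   # convergence criterion met
    prev <- cur
  }
  list(theta = theta, iterations = i, cost = cur)
}
# e.g. fit_tol <- grad.descent.tol(x, y); fit_tol$iterations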
# the commented-out demo below refers to the original two-column example
# (x0, x1) near the top of the script
# get results from gradient descent
# beta <- grad.descent(x,1000)
# beta <- coord.descent(x, 1000)
#
# # define the 'hypothesis function'
# h <- function(xi,b0,b1) {
# b0 + b1 * xi
# }
#
# # define the cost function
# cost <- t(mat.or.vec(1,m))
# for(i in 1:m) {
# cost[i,1] <- (1 /(2*m)) * (h(x[i,2], beta[1000,1], beta[1000,2]) - y[i,])^2
# }
#
# totalCost <- colSums(cost)
# print(totalCost)
#
# # save this as Cost1000
# cost1000 <- totalCost
#
# # change iterations to 1001 and compute cost1001
# beta <- grad.descent(x,1001)
# cost <- t(mat.or.vec(1,m))
# for(i in 1:m) {
# cost[i,1] <- (1 /(2*m)) * (h(x[i,2],beta[1001,1],beta[1001,2])- y[i,])^2
# }
# cost1001 <- colSums(cost)
#
# # does this difference meet your convergence criteria?
# print(cost1000 - cost1001)
#