-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrandomForest_demo.R
149 lines (114 loc) · 4.23 KB
/
randomForest_demo.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
### randomForest tutorial ###
### Computational Biology: University of Vermont
### April 26, 2017
### Matthias Nevins
### Notes on randomForest ###
# "randomForest" is an ensemble learning model
# (also referred to as a "machine learning" tool).
#
# The package uses a random forest algorithm to sample data randomly
# and then constructs and analyzes multiple random decision trees (ensemble).
# The performance of each random decision tree is compared and results are used to
# determine the mode of a classification or the mean of a regression.
# The model can be trained on a subset of the data and then tested for accuracy.
## See notes on decision trees and random forest ##
############ randomForest #######################
# Begin by installing the randomForest package
#install.packages("randomForest")
library("randomForest")
help("randomForest")
## INPUTS ##
#help("randomForest")
# relevant input arguments
# x = data frame or matrix of predictor variable(s)
# y = response vector (if factor, classification)
# ntree = number of decision trees to grow
# mtry = number of variables randomly sampled as candidates at each split
## mtry should be a value lower than the total number of variables
## for classification trees use the square root of number of variables
# sampsize = number of rows that are randomly sampled for each tree
## sampsize should be lower than the total number of rows in your data set
# nodesize = minimum size of terminal nodes (larger the number smaller the tree)
### OUTPUTS ##
# the prediction by randomForest is the mode (classification) or mean (regression) of the random decision trees
# confusion matrix (classification)
#### EXAMPLE 1: Iris data ####
# Call to "iris" data set available in R
#data(iris)
#View(iris)
#str(iris) # numeric predictor variables, Species variable is categorical
#summary(iris)
# Store iris in a new data frame
Dframe <- iris

# Let's get started
set.seed(123)  # for reproducible random results

# Split the iris data into training and testing sets:
# train the model with ~70% of the data and test it
# with the remaining ~30% of the data.
# help(sample)  # commented out so sourcing the script doesn't open a help page
spl <- sample(2, nrow(Dframe), replace = TRUE, prob = c(0.7, 0.3))
print(spl)
str(spl)

# Define the training data
trainData <- Dframe[spl == 1, ]
head(trainData)

# Test data
testData <- Dframe[spl == 2, ]
head(testData)

# Generate a random forest with the training data.
# For classification, mtry should be about the square root of the number
# of predictor variables (see the notes above); iris has 4 predictors,
# so mtry = 2 — this is also randomForest's default for classification.
# (The original used mtry = 3, contradicting its own guideline.)
irisRF <- randomForest(Species ~ ., data = trainData, mtry = 2,
                       ntree = 200, proximity = TRUE)
# help(randomForest)

# Print the random forest model and see the important features
print(irisRF)

# Confusion matrix for the training data (out-of-bag predictions)
table(predict(irisRF), trainData$Species)

# Plot the error rate versus the number of trees
plot(irisRF)

# Look at the importance of the independent variables
importance(irisRF)

# Plot variable importance
varImpPlot(irisRF)

# Now use the fitted forest to predict the held-out test data
# help("predict.randomForest")
irisPred <- predict(irisRF, newdata = testData)
print(irisPred)
table(irisPred, testData$Species)

# Now, let's look at the margin, positive or negative:
# if positive it means a correct classification
# help("margin.randomForest")
plot(margin(irisRF, testData$Species))
#------------------------------------------------------
####### EXAMPLE 2: Using MASS package ##################

# Begin by importing the two required libraries
library(randomForest)
library(MASS)

# Set the seed for reproducible results
# help("set.seed")
set.seed(1234)

# Store the "birthwt" data set from the MASS package in a data frame
# help("birthwt")
dFrame <- birthwt

# Identify the predictor variables and the (categorical) target variable
# head(dFrame)
# str(dFrame)
# View(dFrame)

# See how many unique values are within each variable
# For "low":
length(unique(dFrame$low))
hist(dFrame$low)  # two unique values -> binary/categorical
length(unique(dFrame$bwt))
hist(dFrame$bwt)  # many unique values -> continuous variable

# Another way to view the number of unique values for every variable.
# sapply iterates over the data frame's columns directly; the original
# apply(dFrame, 2, ...) first coerces the data frame to a matrix,
# which silently converts every column to a common type.
sapply(dFrame, function(x) length(unique(x)))

# Now convert the categorical variables with as.factor() so that
# randomForest treats them as categorical (not numeric) data.
# (Factors are what make randomForest do classification-style splits.)
# cVars holds the names of the categorical variables to convert.
cVars <- c("low", "race", "smoke", "ptl", "ht", "ui", "ftv")
# Convert all the listed columns in one idiomatic step instead of a
# for loop with `=` assignment.
dFrame[cVars] <- lapply(dFrame[cVars], as.factor)
str(dFrame) # we see that the numerical values for the