-
Notifications
You must be signed in to change notification settings - Fork 31
/
Copy pathAssignment 4 Census.R
71 lines (64 loc) · 2.28 KB
/
Assignment 4 Census.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Load the census data and build the 60/40 train/test split.
# NOTE(review): the working directory is hard-coded to one machine's Downloads
# folder; change it (or run the script from the data folder) before use elsewhere.
setwd("/Users/chicheongweng/Downloads")
# stringsAsFactors = TRUE is passed explicitly: since R 4.0.0 read.csv() keeps
# text columns as character by default, while the classification models fitted
# on `over50k` in this script need factors. On older R this was already the
# default, so behaviour there is unchanged.
# Assumes over50k (and the other categorical fields) are stored as text in
# census.csv -- TODO confirm against the file.
census <- read.csv("census.csv", stringsAsFactors = TRUE)
library(caTools)
# Fixed seed so the split is reproducible.
set.seed(2000)
# sample.split() returns a logical vector; TRUE rows go to the training set.
spl <- sample.split(census$over50k, SplitRatio = 0.6)
train <- subset(census, spl == TRUE)
test <- subset(census, spl == FALSE)
# Logistic regression ----
# Fit over50k against every other column of the training split.
censusglm <- glm(over50k ~ ., family = "binomial", data = train)
summary(censusglm)
# Predicted probabilities for the test split.
predictTest <- predict(censusglm, newdata = test, type = "response")
# Actual outcome versus prediction at a 0.5 probability cut-off.
table(test$over50k, predictTest >= 0.5)
library(ROCR)
ROCRpred <- prediction(predictTest, test$over50k)
# ROC curve: true positive rate against false positive rate.
perf <- performance(ROCRpred, measure = "tpr", x.measure = "fpr")
plot(perf)
# Area under the ROC curve as a plain number.
as.numeric(slot(performance(ROCRpred, measure = "auc"), "y.values"))
# CART model ----
# Install rpart / rpart.plot only when they are missing, so re-running the
# script does not reinstall them on every run.
if (!requireNamespace("rpart", quietly = TRUE)) {
  install.packages("rpart")
}
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot")
}
library(rpart)
library(rpart.plot)
# Rebuild the 60/40 split with the same seed used at the top of the script.
set.seed(2000)
spl <- sample.split(census$over50k, SplitRatio = 0.6)
train <- subset(census, spl == TRUE)
test <- subset(census, spl == FALSE)
# Classification tree on all predictors, then plot it.
train_rpart <- rpart(over50k ~ ., data = train, method = "class")
prp(train_rpart)
library(ROCR)
# Without `type`, predict() on a classification tree returns a matrix of class
# probabilities; column 2 holds the probability of the second class level.
predict_test_rpart <- predict(train_rpart, newdata = test)
predict_test_rpart <- predict_test_rpart[, 2]
# ROC curve for the tree's predicted probabilities.
rp <- prediction(predict_test_rpart, test$over50k)
perf <- performance(rp, "tpr", "fpr")
plot(perf)
# Confusion matrix at a 0.5 cut-off, matching the logistic regression section.
# Tabulating against the raw probabilities would give one column per distinct
# probability value instead of a predicted class.
table(test$over50k, predict_test_rpart >= 0.5)
# Random forest model ----
# Rebuild the 60/40 split with the same seed used at the top of the script.
set.seed(2000)
spl <- sample.split(census$over50k, SplitRatio = 0.6)
train <- subset(census, spl == TRUE)
test <- subset(census, spl == FALSE)
library(randomForest)
# Fit the forest on a random sample of 2000 training rows.
set.seed(1)
small_rows <- sample(nrow(train), 2000)
trainSmall <- train[small_rows, ]
train_randomforest <- randomForest(over50k ~ ., data = trainSmall)
# Actual outcome versus the forest's prediction on the test split.
PREDICTb <- predict(train_randomforest, newdata = test)
table(test$over50k, PREDICTb)
# Count how often each variable is used in the forest, sort the counts in
# ascending order, and draw them as a dot chart labelled with variable names.
vu <- varUsed(train_randomforest, count = TRUE)
vusorted <- sort(vu, decreasing = FALSE, index.return = TRUE)
split_var_names <- names(train_randomforest$forest$xlevels[vusorted$ix])
dotchart(vusorted$x, split_var_names)
# Variable importance plot for the same forest.
varImpPlot(train_randomforest)
# Selecting cp by cross-validation ----
# Install caret / e1071 only when they are missing, so re-running the script
# does not reinstall them on every run.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
library(caret)
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071")
}
library(e1071)
set.seed(2)
# 10-fold cross-validation over cp = 0.002, 0.004, ..., 0.1.
numFolds <- trainControl(method = "cv", number = 10)
cpGrid <- expand.grid(.cp = seq(0.002, 0.1, 0.002))
# Perform the cross validation. The call is written as caret::train() so it is
# not confused with the data frame that is also named `train`.
caret::train(
  over50k ~ .,
  data = train,
  method = "rpart",
  trControl = numFolds,
  tuneGrid = cpGrid
)
# Create a new CART model with cp = 0.002, the smallest value in cpGrid.
# NOTE(review): presumably the value picked by the cross-validation above --
# verify against its printed output.
train_model <- rpart(over50k ~ ., data = train, method = "class", cp = 0.002)
# Predicted classes on the test split and the resulting confusion matrix.
predict_test <- predict(train_model, newdata = test, type = "class")
table(test$over50k, predict_test)