# spamemailregression
# Spam email example: load the kernlab `spam` data, split it into training and
# test sets, and explore the training set.
library(kernlab)
data(spam)

# Perform the subsampling: assign each row to train (1) or test (0) by a fair
# coin flip. The seed is fixed so the split is reproducible.
set.seed(3435)
# One draw per row of `spam`, instead of a hard-coded row count, so the
# indicator always has the same length as the data.
trainIndicator <- rbinom(nrow(spam), size = 1, prob = 0.5)
table(trainIndicator)

# Set up the train and test data sets.
trainSpam <- spam[trainIndicator == 1, ]
testSpam <- spam[trainIndicator == 0, ]
head(trainSpam)
table(trainSpam$type)

# Exploratory plots. Values are shifted by 1 before log10 so that a zero maps
# to 0 rather than -Inf.
par(mfrow = c(1, 1))
plot(log10(trainSpam$capitalAve + 1) ~ trainSpam$type)
plot(log10(trainSpam[, 1:4] + 1))

# Hierarchical clustering of the variables (the data are transposed so that
# columns are clustered): first on the raw scale using columns 1:57 ...
hCluster <- hclust(dist(t(trainSpam[, 1:57])))
plot(hCluster)
# ... then on the log10(x + 1) scale using columns 1:55.
hClusterupdate <- hclust(dist(t(log10(trainSpam[, 1:55] + 1))))
plot(hClusterupdate)
# Create logistic regression models that predict spam from just a single
# variable each, and compare them by cross-validated cost.

# Numeric response: the factor's integer codes shifted to start at 0
# (presumably nonspam = 0, spam = 1 -- confirm with levels(trainSpam$type)).
trainSpam$numType <- as.numeric(trainSpam$type) - 1

# Cost handed to cv.glm: the number of observations whose outcome `x` differs
# from the prediction `y` thresholded at 0.5.
costFunction <- function(x, y) {
  sum(x != (y > 0.5))
}

library(boot)
cvError <- rep(NA, 55)
for (i in seq_len(55)) {
  # Model numType on the i-th column of trainSpam only.
  singleFormula <- reformulate(names(trainSpam)[i], response = "numType")
  singleFit <- glm(singleFormula, family = "binomial", data = trainSpam)
  # 2-fold cross-validation; keep the second component of `delta`.
  cvResult <- cv.glm(trainSpam, singleFit, cost = costFunction, K = 2)
  cvError[i] <- cvResult$delta[2]
}

# Which variable has the lowest error from above?
names(trainSpam)[which.min(cvError)]
names(trainSpam[52])
# Fit the chosen logistic regression on the training set and evaluate it on
# the held-out test set.
predictionModel <- glm(
  numType ~ charDollar + remove + charExclamation,
  family = "binomial",
  data = trainSpam
)

# Predicted spam probabilities for the test rows. type = "response" is needed
# because predict() on a glm defaults to the link (log-odds) scale, where a
# 0.5 cutoff is not a probability cutoff.
predictionTest <- predict(
  predictionModel,
  newdata = testSpam,
  type = "response"
)

# Classify as "spam" those test rows with predicted probability > 0.5. The
# test-set predictions are used here: the model's fitted values belong to the
# training rows and do not line up with the rows of testSpam.
predictedSpam <- rep("nonspam", nrow(testSpam))
predictedSpam[predictionTest > 0.5] <- "spam"

# Classification table: predicted label (rows) against true label (columns).
table(predictedSpam, testSpam$type)

# Test-set error rate, computed from the predictions rather than typed in by
# hand, so it stays correct if the seed, the split, or the model changes.
mean(predictedSpam != as.character(testSpam$type))