# spamemailregression
#
# Spam email example: load the kernlab `spam` data and split it at random
# into a training half and a test half.

library(kernlab)
data(spam)

# Perform the subsampling. Split data by random coin flip: one Bernoulli(0.5)
# draw per row of `spam`. The row count is taken from the data instead of
# being hard-coded so the indicator always lines up with the data frame.
set.seed(3435)
trainIndicator <- rbinom(nrow(spam), size = 1, prob = 0.5)
table(trainIndicator)

# Set up train and test data sets: rows that drew 1 go to training, rows
# that drew 0 go to testing.
trainSpam <- spam[trainIndicator == 1, ]
testSpam <- spam[trainIndicator == 0, ]

# Quick look at the training data and its class balance.
head(trainSpam)
table(trainSpam$type)

# Exploratory look at the training set.

# Single-panel plotting layout.
par(mfrow = c(1,1))
# log10(capitalAve + 1) plotted against the message type; adding 1 keeps the
# log defined when a value is 0.
plot(log10(trainSpam$capitalAve +1) ~ trainSpam$type)

# Plot of the first four columns on the same log10(x + 1) scale.
plot(log10(trainSpam[,1:4]+1))

# Hierarchical clustering of columns 1:57. The data are transposed so that
# dist() measures distances between columns (variables) rather than rows.
hCluster = hclust(dist(t(trainSpam[, 1:57])))
plot(hCluster)

# Same clustering after the log10(x + 1) transform.
# NOTE(review): this uses columns 1:55 while the clustering above uses 1:57 --
# confirm the narrower range is intentional.
hClusterupdate <- hclust(dist(t(log10(trainSpam[,1:55] +1))))
plot(hClusterupdate)

# Single-variable model selection: fit one logistic regression per candidate
# predictor and compare the models by their cross-validated error.

# Numeric response built from the factor's integer codes shifted down by one
# (first level of `type` -> 0, second level -> 1).
trainSpam$numType <- as.numeric(trainSpam$type) - 1

# Cost handed to cv.glm: the number of observations whose value in `x`
# differs from the prediction `y` thresholded at 0.5.
costFunction <- function(x, y) {
  sum(x != (y > 0.5))
}

library(boot)

# One slot per candidate predictor (the first 55 columns of trainSpam).
cvError <- rep(NA, 55)
for (i in seq_len(55)) {
  # numType ~ <name of the i-th column>
  lmFormula <- reformulate(names(trainSpam)[i], response = "numType")
  glmFit <- glm(lmFormula, family = "binomial", data = trainSpam)
  # 2-fold cross-validation; keep the second element of `delta`.
  cvError[i] <- cv.glm(
    trainSpam, glmFit,
    cost = costFunction, K = 2
  )$delta[2]
}

# Which variable has the lowest error rate from above?
names(trainSpam)[which.min(cvError)]
# Name of column 52, printed for reference.
names(trainSpam)[52]

# Fit the final logistic regression on the training set with the chosen
# predictors.
predictionModel <- glm(
  numType ~ charDollar + remove + charExclamation,
  family = "binomial",
  data = trainSpam
)

# Get predictions on the test set. type = "response" returns fitted
# probabilities; without it predict() returns values on the link (log-odds)
# scale, for which a 0.5 cut-off is not a probability threshold.
predictionTest <- predict(
  predictionModel,
  newdata = testSpam,
  type = "response"
)

# Classify as "spam" for those with probability > 0.5; every other test row
# keeps the default "nonspam" label. The threshold is applied to the test-set
# predictions (not to the model's training fitted values) so that each label
# corresponds to the matching row of testSpam.
predictedSpam <- rep("nonspam", nrow(testSpam))
predictedSpam[predictionTest > 0.5] <- "spam"

# Classification table: predicted label (rows) vs. actual type (columns).
table(predictedSpam, testSpam$type)

# Test-set error rate: the share of test rows whose predicted label differs
# from the actual type. Computed from the predictions rather than typed in by
# hand, so it stays correct if the seed, split, or model changes.
mean(predictedSpam != as.character(testSpam$type))