From 5c7bbe744d13497757ecaca16d4d283d0e0efca2 Mon Sep 17 00:00:00 2001 From: Dan Halligan Date: Fri, 3 Jan 2025 09:51:40 +0000 Subject: [PATCH] Formatting updates --- 02-statistical-learning.Rmd | 28 +++-- 03-linear-regression.Rmd | 36 +++--- 04-classification.Rmd | 45 ++++--- 07-moving-beyond-linearity.Rmd | 22 ++-- 08-tree-based-methods.Rmd | 115 +++++++++-------- 09-support-vector-mechines.Rmd | 140 +++++++++++---------- 10-deep-learning.Rmd | 2 +- 11-survival-analysis-and-censored-data.Rmd | 4 +- 12-unsupervised-learning.Rmd | 22 ++-- 9 files changed, 226 insertions(+), 188 deletions(-) diff --git a/02-statistical-learning.Rmd b/02-statistical-learning.Rmd index 7857664..55ac52b 100644 --- a/02-statistical-learning.Rmd +++ b/02-statistical-learning.Rmd @@ -302,7 +302,7 @@ college$Elite <- factor(ifelse(college$Top10perc > 50, "Yes", "No")) summary(college$Elite) plot(college$Outstate ~ college$Elite, xlab = "Elite", ylab = "Outstate") -par(mfrow = c(2,2)) +par(mfrow = c(2, 2)) for (n in c(5, 10, 20, 50)) { hist(college$Enroll, breaks = n, main = paste("n =", n), xlab = "Enroll") } @@ -374,7 +374,7 @@ x[-(10:85), numeric] |> > the relationships among the predictors. Comment on your findings. ```{r} -pairs(x[, numeric], cex = 0.2) +pairs(x[, numeric], cex = 0.2) cor(x[, numeric]) |> kable() @@ -425,8 +425,10 @@ library(tidyverse) ``` ```{r} -ggplot(Boston, aes(nox, rm)) + geom_point() -ggplot(Boston, aes(ptratio, rm)) + geom_point() +ggplot(Boston, aes(nox, rm)) + + geom_point() +ggplot(Boston, aes(ptratio, rm)) + + geom_point() heatmap(cor(Boston, method = "spearman"), cexRow = 1.1, cexCol = 1.1) ``` @@ -440,12 +442,12 @@ Yes > predictor. ```{r} -Boston |> - pivot_longer(cols = 1:13) |> - filter(name %in% c("crim", "tax", "ptratio")) |> - ggplot(aes(value)) + - geom_histogram(bins = 20) + - facet_wrap(~name, scales="free", ncol= 1) +Boston |> + pivot_longer(cols = 1:13) |> + filter(name %in% c("crim", "tax", "ptratio")) |> + ggplot(aes(value)) + + geom_histogram(bins = 20) + + facet_wrap(~name, scales = "free", ncol = 1) ``` Yes, particularly crime and tax rates. @@ -496,9 +498,9 @@ Boston |> select(-c(crim, zn)) |> pivot_longer(!rm) |> mutate(">8 rooms" = rm > 8) |> - ggplot(aes(`>8 rooms`, value)) + - geom_boxplot() + - facet_wrap(~name, scales = "free") + ggplot(aes(`>8 rooms`, value)) + + geom_boxplot() + + facet_wrap(~name, scales = "free") ``` Census tracts with big average properties (more than eight rooms per dwelling) diff --git a/03-linear-regression.Rmd b/03-linear-regression.Rmd index 98a1b5a..5a95967 100644 --- a/03-linear-regression.Rmd +++ b/03-linear-regression.Rmd @@ -67,11 +67,11 @@ library(plotly) ```{r} model <- function(gpa, iq, level) { 50 + - gpa * 20 + - iq * 0.07 + - level * 35 + - gpa * iq * 0.01 + - gpa * level * -10 + gpa * 20 + + iq * 0.07 + + level * 35 + + gpa * iq * 0.01 + + gpa * level * -10 } x <- seq(1, 5, length = 10) y <- seq(1, 200, length = 20) @@ -82,15 +82,18 @@ plot_ly(x = x, y = y) |> add_surface( z = ~college, colorscale = list(c(0, 1), c("rgb(107,184,214)", "rgb(0,90,124)")), - colorbar = list(title = "College")) |> + colorbar = list(title = "College") + ) |> add_surface( z = ~high_school, colorscale = list(c(0, 1), c("rgb(255,112,184)", "rgb(128,0,64)")), - colorbar = list(title = "High school")) |> + colorbar = list(title = "High school") + ) |> layout(scene = list( xaxis = list(title = "GPA"), yaxis = list(title = "IQ"), - zaxis = list(title = "Salary"))) + zaxis = list(title = "Salary") + )) ``` Option iii correct. @@ -366,7 +369,7 @@ par(mfrow = c(2, 2)) plot(Auto$horsepower, Auto$mpg, cex = 0.2) plot(log(Auto$horsepower), Auto$mpg, cex = 0.2) plot(sqrt(Auto$horsepower), Auto$mpg, cex = 0.2) -plot(Auto$horsepower ^ 2, Auto$mpg, cex = 0.2) +plot(Auto$horsepower^2, Auto$mpg, cex = 0.2) x <- subset(Auto, select = -name) x$horsepower <- log(x$horsepower) @@ -553,7 +556,7 @@ We can show this numerically in R by computing $t$ using the above equation. ```{r} n <- length(x) -sqrt(n - 1) * sum(x * y) / sqrt(sum(x ^ 2) * sum(y ^ 2) - sum(x * y) ^ 2) +sqrt(n - 1) * sum(x * y) / sqrt(sum(x^2) * sum(y^2) - sum(x * y)^2) ``` > e. Using the results from (d), argue that the _t_-statistic for the @@ -846,9 +849,9 @@ contributions. > answers. ```{r} -x1 <- c(x1 , 0.1) -x2 <- c(x2 , 0.8) -y <- c(y ,6) +x1 <- c(x1, 0.1) +x2 <- c(x2, 0.8) +y <- c(y, 6) summary(lm(y ~ x1 + x2)) summary(lm(y ~ x1)) summary(lm(y ~ x2)) @@ -929,9 +932,10 @@ The results from (b) show reduced significance compared to the models fit in (a). ```{r} -plot(sapply(fits, function(x) coef(x)[2]), coef(mfit)[-1], - xlab = "Univariate regression", - ylab = "multiple regression") +plot(sapply(fits, function(x) coef(x)[2]), coef(mfit)[-1], + xlab = "Univariate regression", + ylab = "multiple regression" +) ``` The estimated coefficients differ (in particular the estimated coefficient for diff --git a/04-classification.Rmd b/04-classification.Rmd index 84775e5..ca88a0e 100644 --- a/04-classification.Rmd +++ b/04-classification.Rmd @@ -23,11 +23,10 @@ $$ Letting $x = e^{\beta_0 + \beta_1X}$ \begin{align} -\frac{P(X)}{1-p(X)} &= \frac{\frac{x}{1 + x}} - {1 - \frac{x}{1 + x}} \\ - &= \frac{\frac{x}{1 + x}} - {\frac{1}{1 + x}} \\ - &= x +\frac{P(X)}{1-p(X)} + &= \frac{\frac{x}{1 + x}} {1 - \frac{x}{1 + x}} \\ + &= \frac{\frac{x}{1 + x}} {\frac{1}{1 + x}} \\ + &= x \end{align} ### Question 2 @@ -65,7 +64,7 @@ therefore, we can consider maximizing $\log(p_K(X))$ $$ \log(p_k(x)) = \log(\pi_k) - \frac{1}{2\sigma^2}(x - \mu_k)^2 - - \log\left(\sum_{l=1}^k \pi_l \exp\left(-\frac{1}{2\sigma^2}(x - \mu_l)^2\right)\right) + \log\left(\sum_{l=1}^k \pi_l \exp\left(-\frac{1}{2\sigma^2}(x - \mu_l)^2\right)\right) $$ Remember that we are maximizing over $k$, and since the last term does not @@ -265,7 +264,7 @@ when $X_1 = 40$ and $X_2 = 3.5$, $p(X) = 0.38$ > chance of getting an A in the class? We would like to solve for $X_1$ where $p(X) = 0.5$. Taking the first equation -above, we need to solve $0 = −6 + 0.05X_1 + 3.5$, so $X_1 = 50$ hours. +above, we need to solve $0 = -6 + 0.05X_1 + 3.5$, so $X_1 = 50$ hours. ### Question 7 @@ -305,7 +304,7 @@ p(D|v) &= \frac{p(v|D) p(D)}{p(v|D)p(D) + p(v|N)p(N)} \\ \end{align} ```{r} -exp(-0.5) * 0.8 / (exp(-0.5) * 0.8 + exp(-2/9) * 0.2) +exp(-0.5) * 0.8 / (exp(-0.5) * 0.8 + exp(-2 / 9) * 0.2) ``` ### Question 8 @@ -412,7 +411,7 @@ $$ (\hat\alpha_{orange0} - \hat\alpha_{apple0}) + (\hat\alpha_{orange1} - \hat\alpha_{apple1})x $$ -> c. Suppose that in your model, $\hat\beta_0 = 2$ and $\hat\beta = −1$. What +> c. Suppose that in your model, $\hat\beta_0 = 2$ and $\hat\beta = -1$. What > are the coefficient estimates in your friend's model? Be as specific as > possible. @@ -423,7 +422,7 @@ We are unable to know the specific value of each parameter however. > d. Now suppose that you and your friend fit the same two models on a different > data set. This time, your friend gets the coefficient estimates -> $\hat\alpha_{orange0} = 1.2$, $\hat\alpha_{orange1} = −2$, +> $\hat\alpha_{orange0} = 1.2$, $\hat\alpha_{orange1} = -2$, > $\hat\alpha_{apple0} = 3$, $\hat\alpha_{apple1} = 0.6$. What are the > coefficient estimates in your model? @@ -571,7 +570,7 @@ fit <- glm(Direction ~ Lag3, data = Weekly[train, ], family = binomial) pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5 mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction) -fit <- glm(Direction ~Lag4, data = Weekly[train, ], family = binomial) +fit <- glm(Direction ~ Lag4, data = Weekly[train, ], family = binomial) pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5 mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction) @@ -583,7 +582,7 @@ fit <- glm(Direction ~ Lag1 * Lag2 * Lag3 * Lag4, data = Weekly[train, ], family pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5 mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction) -fit <- lda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4,data = Weekly[train, ]) +fit <- lda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4, data = Weekly[train, ]) pred <- predict(fit, Weekly[!train, ], type = "response")$class mean(pred == Weekly[!train, ]$Direction) @@ -658,7 +657,7 @@ variables are colinear. ```{r} set.seed(1) -train <- sample(seq_len(nrow(x)), nrow(x) * 2/3) +train <- sample(seq_len(nrow(x)), nrow(x) * 2 / 3) ``` > d. Perform LDA on the training data in order to predict `mpg01` using the @@ -787,8 +786,8 @@ Power3 <- function(x, a) { > `log = "y"`, or `log = "xy"` as arguments to the `plot()` function. ```{r} -plot(1:10, Power3(1:10, 2), - xlab = "x", +plot(1:10, Power3(1:10, 2), + xlab = "x", ylab = expression(paste("x"^"2")), log = "y" ) @@ -806,7 +805,7 @@ plot(1:10, Power3(1:10, 2), ```{r} PlotPower <- function(x, a, log = "y") { plot(x, Power3(x, a), - xlab = "x", + xlab = "x", ylab = substitute("x"^a, list(a = a)), log = log ) @@ -827,11 +826,11 @@ PlotPower(1:10, 3) ```{r} x <- cbind( - ISLR2::Boston[, -1], + ISLR2::Boston[, -1], data.frame("highcrim" = Boston$crim > median(Boston$crim)) ) set.seed(1) -train <- sample(seq_len(nrow(x)), nrow(x) * 2/3) +train <- sample(seq_len(nrow(x)), nrow(x) * 2 / 3) ``` We can find the most associated variables by performing wilcox tests. @@ -861,8 +860,8 @@ Let's look at univariate associations with `highcrim` (in the training data) x[train, ] |> pivot_longer(!highcrim) |> mutate(name = factor(name, levels = ord)) |> - ggplot(aes(highcrim, value)) + - geom_boxplot() + + ggplot(aes(highcrim, value)) + + geom_boxplot() + facet_wrap(~name, scale = "free") ``` @@ -902,9 +901,9 @@ res <- sapply(1:12, function(max) fit_models(1:max)) res <- as_tibble(t(res)) res$n_var <- 1:12 pivot_longer(res, cols = !n_var) |> - ggplot(aes(n_var, value, col = name)) + - geom_line() + - xlab("Number of predictors") + + ggplot(aes(n_var, value, col = name)) + + geom_line() + + xlab("Number of predictors") + ylab("Error rate") ``` diff --git a/07-moving-beyond-linearity.Rmd b/07-moving-beyond-linearity.Rmd index a7140dd..b5e0e29 100644 --- a/07-moving-beyond-linearity.Rmd +++ b/07-moving-beyond-linearity.Rmd @@ -175,9 +175,9 @@ grid() ```{r} x <- seq(-2, 6, length.out = 1000) -b1 <- function(x) I(0 <= x & x <= 2) - (x - 1) * I(1 <= x & x <= 2) -b2 <- function(x) (x - 3) * I(3 <= x & x <= 4) + I(4 < x & x <= 5) -f <- function(x) 1 + 1*b1(x) + 3*b2(x) +b1 <- function(x) I(0 <= x & x <= 2) - (x - 1) * I(1 <= x & x <= 2) +b2 <- function(x) (x - 3) * I(3 <= x & x <= 4) + I(4 < x & x <= 5) +f <- function(x) 1 + 1 * b1(x) + 3 * b2(x) plot(x, f(x), type = "l") grid() ``` @@ -364,7 +364,7 @@ err5 <- mean(replicate(10, { c(err, err1, err2, err3, err4, err5) anova(fit, fit1, fit2, fit3, fit4, fit5) -x <- seq(min(Auto$horsepower), max(Auto$horsepower), length.out=1000) +x <- seq(min(Auto$horsepower), max(Auto$horsepower), length.out = 1000) pred <- data.frame( x = x, "Linear" = predict(fit, data.frame(horsepower = x)), @@ -407,7 +407,7 @@ lines(x, predict(fit, data.frame(dis = x)), col = "red", lty = 2) ```{r} fits <- lapply(1:10, function(i) glm(nox ~ poly(dis, i), data = Boston)) -x <- seq(min(Boston$dis), max(Boston$dis), length.out=1000) +x <- seq(min(Boston$dis), max(Boston$dis), length.out = 1000) pred <- data.frame(lapply(fits, function(fit) predict(fit, data.frame(dis = x)))) colnames(pred) <- 1:10 pred$x <- x @@ -604,7 +604,7 @@ beta1 <- 20 > ``` ```{r} -a <- y - beta1*x1 +a <- y - beta1 * x1 beta2 <- lm(a ~ x2)$coef[2] ``` @@ -633,15 +633,15 @@ res <- matrix(NA, nrow = 1000, ncol = 3) colnames(res) <- c("beta0", "beta1", "beta2") beta1 <- 20 for (i in 1:1000) { - beta2 <- lm(y - beta1*x1 ~ x2)$coef[2] - beta1 <- lm(y - beta2*x2 ~ x1)$coef[2] - beta0 <- lm(y - beta2*x2 ~ x1)$coef[1] + beta2 <- lm(y - beta1 * x1 ~ x2)$coef[2] + beta1 <- lm(y - beta2 * x2 ~ x1)$coef[2] + beta0 <- lm(y - beta2 * x2 ~ x1)$coef[1] res[i, ] <- c(beta0, beta1, beta2) } res <- as.data.frame(res) res$Iteration <- 1:1000 res <- pivot_longer(res, !Iteration) -p <- ggplot(res, aes(x=Iteration, y=value, color=name)) + +p <- ggplot(res, aes(x = Iteration, y = value, color = name)) + geom_line() + geom_point() + scale_x_continuous(trans = "log10") @@ -682,7 +682,7 @@ n <- 1000 betas <- rnorm(p) * 5 x <- matrix(rnorm(n * p), ncol = p, nrow = n) -y <- (x %*% betas) + rnorm(n) # ignore beta0 for simplicity +y <- (x %*% betas) + rnorm(n) # ignore beta0 for simplicity # multiple regression fit <- lm(y ~ x - 1) diff --git a/08-tree-based-methods.Rmd b/08-tree-based-methods.Rmd index 3ea2813..87fe01c 100644 --- a/08-tree-based-methods.Rmd +++ b/08-tree-based-methods.Rmd @@ -25,13 +25,15 @@ tree <- ape::read.tree(text = "(((R1:1,R2:1)N1:2,R3:4)N2:2,(R4:2,(R5:1,R6:1)R3:2 tree$node.label <- c("Age < 40", "Weight < 100", "Weight < 70", "Age < 60", "Weight < 80") ggtree(tree, ladderize = FALSE) + scale_x_reverse() + coord_flip() + - geom_tiplab(vjust = 2, hjust = 0.5) + - geom_text2(aes(label=label, subset=!isTip), hjust = -0.1, vjust = -1) + geom_tiplab(vjust = 2, hjust = 0.5) + + geom_text2(aes(label = label, subset = !isTip), hjust = -0.1, vjust = -1) ``` ```{r} -plot(NULL, xlab="Age (years)", ylab="Weight (kg)", - xlim = c(0, 100), ylim = c(40, 160), xaxs = "i", yaxs = "i") +plot(NULL, + xlab = "Age (years)", ylab = "Weight (kg)", + xlim = c(0, 100), ylim = c(40, 160), xaxs = "i", yaxs = "i" +) abline(v = 40, col = "red", lty = 2) lines(c(0, 40), c(100, 100), col = "blue", lty = 2) lines(c(0, 40), c(70, 70), col = "blue", lty = 2) @@ -39,8 +41,8 @@ abline(v = 60, col = "red", lty = 2) lines(c(60, 100), c(80, 80), col = "blue", lty = 2) text( - c(20, 20, 20, 50, 80, 80), - c(55, 85, 130, 100, 60, 120), + c(20, 20, 20, 50, 80, 80), + c(55, 85, 130, 100, 60, 120), labels = c("R1", "R2", "R3", "R4", "R5", "R6") ) ``` @@ -92,15 +94,15 @@ $$E = 1 - \max_k(\hat{p}_{mk})$$ # Function definitions are for when there's two classes only p <- seq(0, 1, length.out = 100) data.frame( - x = p, - "Gini index" = p * (1 - p) * 2, - "Entropy" = -(p * log(p) + (1 - p) * log(1 - p)), - "Classification error" = 1 - pmax(p, 1 - p), - check.names = FALSE - ) |> + x = p, + "Gini index" = p * (1 - p) * 2, + "Entropy" = -(p * log(p) + (1 - p) * log(1 - p)), + "Classification error" = 1 - pmax(p, 1 - p), + check.names = FALSE +) |> pivot_longer(!x) |> - ggplot(aes(x = x, y = value, color = name)) + - geom_line(na.rm = TRUE) + ggplot(aes(x = x, y = value, color = name)) + + geom_line(na.rm = TRUE) ``` ### Question 4 @@ -116,8 +118,8 @@ tree <- ape::read.tree(text = "(((3:1.5,(10:1,0:1)A:1)B:1,15:2)C:1,5:2)D;") tree$node.label <- c("X1 < 1", "X2 < 1", "X1 < 0", "X2 < 0") ggtree(tree, ladderize = FALSE) + scale_x_reverse() + coord_flip() + - geom_tiplab(vjust = 2, hjust = 0.5) + - geom_text2(aes(label=label, subset=!isTip), hjust = -0.1, vjust = -1) + geom_tiplab(vjust = 2, hjust = 0.5) + + geom_text2(aes(label = label, subset = !isTip), hjust = -0.1, vjust = -1) ``` > b. Create a diagram similar to the left-hand panel of Figure 8.12, using the @@ -126,14 +128,14 @@ ggtree(tree, ladderize = FALSE) + scale_x_reverse() + coord_flip() + > mean for each region. ```{r} -plot(NULL, xlab="X1", ylab="X2", xlim = c(-1, 2), ylim = c(0, 3), xaxs = "i", yaxs = "i") +plot(NULL, xlab = "X1", ylab = "X2", xlim = c(-1, 2), ylim = c(0, 3), xaxs = "i", yaxs = "i") abline(h = 1, col = "red", lty = 2) lines(c(1, 1), c(0, 1), col = "blue", lty = 2) lines(c(-1, 2), c(2, 2), col = "red", lty = 2) lines(c(0, 0), c(1, 2), col = "blue", lty = 2) text( - c(0, 1.5, -0.5, 1, 0.5), - c(0.5, 0.5, 1.5, 1.5, 2.5), + c(0, 1.5, -0.5, 1, 0.5), + c(0.5, 0.5, 1.5, 1.5, 2.5), labels = c("-1.80", "0.63", "-1.06", "0.21", "2.49") ) ``` @@ -187,11 +189,11 @@ train <- sample(c(TRUE, FALSE), nrow(Boston), replace = TRUE) rf_err <- function(mtry) { randomForest( - Boston[train, -13], - y = Boston[train, 13], - xtest = Boston[!train, -13], - ytest = Boston[!train, 13], - mtry = mtry, + Boston[train, -13], + y = Boston[train, 13], + xtest = Boston[!train, -13], + ytest = Boston[!train, 13], + mtry = mtry, ntree = 500 )$test$mse } @@ -200,12 +202,12 @@ names(res) <- c(1, 2, 3, 5, 7, 10, 12) data.frame(res, check.names = FALSE) |> mutate(n = 1:500) |> pivot_longer(!n) |> - ggplot(aes(x = n, y = value, color = name)) + - geom_line(na.rm = TRUE) + - xlab("Number of trees") + - ylab("Error") + - scale_y_log10() + - scale_color_discrete(name = "No. variables at\neach split") + ggplot(aes(x = n, y = value, color = name)) + + geom_line(na.rm = TRUE) + + xlab("Number of trees") + + ylab("Error") + + scale_y_log10() + + scale_color_discrete(name = "No. variables at\neach split") ``` ### Question 8 @@ -267,8 +269,10 @@ carseats_mse(ptr) ```{r} # Here we can use random Forest with mtry = 10 = p (the number of predictor # variables) to perform bagging -bagged <- randomForest(Sales ~ ., data = Carseats[train, ], mtry = 10, - ntree = 200, importance = TRUE) +bagged <- randomForest(Sales ~ ., + data = Carseats[train, ], mtry = 10, + ntree = 200, importance = TRUE +) carseats_mse(bagged) importance(bagged) ``` @@ -282,8 +286,10 @@ regression tree above. > considered at each split, on the error rate obtained. ```{r} -rf <- randomForest(Sales ~ ., data = Carseats[train, ], mtry = 3, - ntree = 500, importance = TRUE) +rf <- randomForest(Sales ~ ., + data = Carseats[train, ], mtry = 3, + ntree = 500, importance = TRUE +) carseats_mse(rf) importance(rf) ``` @@ -296,12 +302,13 @@ regression tree above, although not quite as good as the bagging approach. ```{r} library(BART) -# For ease, we'll create a fake "predict" method that just returns +# For ease, we'll create a fake "predict" method that just returns # yhat.test.mean regardless of provided "newdata" predict.wbart <- function(model, ...) model$yhat.test.mean -bartfit <- gbart(Carseats[train, 2:11], Carseats[train, 1], - x.test = Carseats[!train, 2:11]) +bartfit <- gbart(Carseats[train, 2:11], Carseats[train, 1], + x.test = Carseats[!train, 2:11] +) carseats_mse(bartfit) ``` @@ -444,17 +451,21 @@ test <- setdiff(1:nrow(dat), train) ```{r} library(gbm) set.seed(42) -lambdas <- 10 ^ seq(-3, 0, by = 0.1) +lambdas <- 10^seq(-3, 0, by = 0.1) fits <- lapply(lambdas, function(lam) { - gbm(Salary ~ ., data = dat[train, ], distribution = "gaussian", - n.trees = 1000, shrinkage = lam) + gbm(Salary ~ ., + data = dat[train, ], distribution = "gaussian", + n.trees = 1000, shrinkage = lam + ) }) errs <- sapply(fits, function(fit) { p <- predict(fit, dat[train, ], n.trees = 1000) mean((p - dat[train, ]$Salary)^2) }) -plot(lambdas, errs, type = "b", xlab = "Shrinkage values", - ylab = "Training MSE", log = "xy") +plot(lambdas, errs, + type = "b", xlab = "Shrinkage values", + ylab = "Training MSE", log = "xy" +) ``` > d. Produce a plot with different shrinkage values on the $x$-axis and the @@ -465,8 +476,10 @@ errs <- sapply(fits, function(fit) { p <- predict(fit, dat[test, ], n.trees = 1000) mean((p - dat[test, ]$Salary)^2) }) -plot(lambdas, errs, type = "b", xlab = "Shrinkage values", - ylab = "Training MSE", log = "xy") +plot(lambdas, errs, + type = "b", xlab = "Shrinkage values", + ylab = "Training MSE", log = "xy" +) min(errs) abline(v = lambdas[which.min(errs)], lty = 2, col = "red") ``` @@ -558,7 +571,7 @@ For logistic regression we correctly predict 14% of those predicted to purchase. ```{r} library(class) # KNN -fit <- knn(Caravan[train, -86], Caravan[test, -86], Caravan$Purchase[train]) +fit <- knn(Caravan[train, -86], Caravan[test, -86], Caravan$Purchase[train]) table(fit, Caravan[test, "Purchase"] == "Yes") sum(fit == "Yes" & Caravan[test, "Purchase"] == "Yes") / sum(fit == "Yes") ``` @@ -587,8 +600,8 @@ test <- setdiff(1:nrow(College), train) lr <- gam(Outstate ~ ., data = College[train, ]) # GAM from chapter 7 -gam <- gam(Outstate ~ Private + s(Room.Board, 2) + s(PhD, 2) + - s(perc.alumni, 2) + s(Expend, 2) + s(Grad.Rate, 2), data = College[train, ]) +gam <- gam(Outstate ~ Private + s(Room.Board, 2) + s(PhD, 2) + + s(perc.alumni, 2) + s(Expend, 2) + s(Grad.Rate, 2), data = College[train, ]) # Boosting boosted <- gbm(Outstate ~ ., data = College[train, ], n.trees = 1000, shrinkage = 0.01) @@ -601,8 +614,9 @@ rf <- randomForest(Outstate ~ ., data = College[train, ], mtry = 4, ntree = 1000 # BART pred <- setdiff(colnames(College), "Outstate") -bart <- gbart(College[train, pred], College[train, "Outstate"], - x.test = College[test, pred]) +bart <- gbart(College[train, pred], College[train, "Outstate"], + x.test = College[test, pred] +) mse <- function(model, ...) { pred <- predict(model, College[test, ], ...) @@ -619,7 +633,8 @@ res <- c( ) res <- data.frame("MSE" = res) res$Model <- factor(row.names(res), levels = rev(row.names(res))) -ggplot(res, aes(Model, MSE)) + coord_flip() + +ggplot(res, aes(Model, MSE)) + + coord_flip() + geom_bar(stat = "identity", fill = "steelblue") ``` diff --git a/09-support-vector-mechines.Rmd b/09-support-vector-mechines.Rmd index 4cae116..efbd505 100644 --- a/09-support-vector-mechines.Rmd +++ b/09-support-vector-mechines.Rmd @@ -6,36 +6,36 @@ > This problem involves hyperplanes in two dimensions. > -> a. Sketch the hyperplane $1 + 3X_1 − X_2 = 0$. Indicate the set of points for -> which $1 + 3X_1 − X_2 > 0$, as well as the set of points for which -> $1 + 3X_1 − X_2 < 0$. +> a. Sketch the hyperplane $1 + 3X_1 - X_2 = 0$. Indicate the set of points for +> which $1 + 3X_1 - X_2 > 0$, as well as the set of points for which +> $1 + 3X_1 - X_2 < 0$. ```{r} library(ggplot2) xlim <- c(-10, 10) ylim <- c(-30, 30) points <- expand.grid( - X1 = seq(xlim[1], xlim[2], length.out = 50), + X1 = seq(xlim[1], xlim[2], length.out = 50), X2 = seq(ylim[1], ylim[2], length.out = 50) ) -p <- ggplot(points, aes(x = X1, y = X2)) + - geom_abline(intercept = 1, slope = 3) + # X2 = 1 + 3X1 +p <- ggplot(points, aes(x = X1, y = X2)) + + geom_abline(intercept = 1, slope = 3) + # X2 = 1 + 3X1 theme_bw() -p + geom_point(aes(color = 1 + 3*X1 - X2 > 0), size = 0.1) + - scale_color_discrete(name = "1 + 3X1 − X2 > 0") +p + geom_point(aes(color = 1 + 3 * X1 - X2 > 0), size = 0.1) + + scale_color_discrete(name = "1 + 3X1 - X2 > 0") ``` -> b. On the same plot, sketch the hyperplane $−2 + X_1 + 2X_2 = 0$. Indicate the -> set of points for which $−2 + X_1 + 2X_2 > 0$, as well as the set of points -> for which $−2 + X_1 + 2X_2 < 0$. +> b. On the same plot, sketch the hyperplane $-2 + X_1 + 2X_2 = 0$. Indicate the +> set of points for which $-2 + X_1 + 2X_2 > 0$, as well as the set of points +> for which $-2 + X_1 + 2X_2 < 0$. ```{r} -p + geom_abline(intercept = 1, slope = -1/2) + # X2 = 1 - X1/2 +p + geom_abline(intercept = 1, slope = -1 / 2) + # X2 = 1 - X1/2 geom_point( - aes(color = interaction(1 + 3*X1 - X2 > 0, -2 + X1 + 2*X2 > 0)), + aes(color = interaction(1 + 3 * X1 - X2 > 0, -2 + X1 + 2 * X2 > 0)), size = 0.5 - ) + - scale_color_discrete(name = "(1 + 3X1 − X2 > 0).(−2 + X1 + 2X2 > 0)") + ) + + scale_color_discrete(name = "(1 + 3X1 - X2 > 0).(-2 + X1 + 2X2 > 0)") ``` ### Question 2 @@ -44,30 +44,30 @@ p + geom_abline(intercept = 1, slope = -1/2) + # X2 = 1 - X1/2 > form $\beta_0 + \beta_1X_1 + \beta_2X_2 = 0$. We now investigate a non-linear > decision boundary. > -> a. Sketch the curve $$(1+X_1)^2 +(2−X_2)^2 = 4$$. +> a. Sketch the curve $$(1+X_1)^2 +(2-X_2)^2 = 4$$. ```{r} points <- expand.grid( - X1 = seq(-4, 2, length.out = 100), + X1 = seq(-4, 2, length.out = 100), X2 = seq(-1, 5, length.out = 100) ) -p <- ggplot(points, aes(x = X1, y = X2, z = (1 + X1)^2 + (2 - X2)^2 - 4)) + - geom_contour(breaks = 0, colour = "black") + +p <- ggplot(points, aes(x = X1, y = X2, z = (1 + X1)^2 + (2 - X2)^2 - 4)) + + geom_contour(breaks = 0, colour = "black") + theme_bw() p ``` > b. On your sketch, indicate the set of points for which -> $$(1 + X_1)^2 + (2 − X_2)^2 > 4,$$ as well as the set of points for which -> $$(1 + X_1)^2 + (2 − X_2)^2 \leq 4.$$ +> $$(1 + X_1)^2 + (2 - X_2)^2 > 4,$$ as well as the set of points for which +> $$(1 + X_1)^2 + (2 - X_2)^2 \leq 4.$$ ```{r} p + geom_point(aes(color = (1 + X1)^2 + (2 - X2)^2 - 4 > 0), size = 0.1) ``` > c. Suppose that a classifier assigns an observation to the blue class if $$(1 -> + X_1)^2 + (2 − X_2)^2 > 4,$$ and to the red class otherwise. To what class -> is the observation $(0, 0)$ classified? $(−1, 1)$? $(2, 2)$? $(3, 8)$? +> + X_1)^2 + (2 - X_2)^2 > 4,$$ and to the red class otherwise. To what class +> is the observation $(0, 0)$ classified? $(-1, 1)$? $(2, 2)$? $(3, 8)$? ```{r} points <- data.frame( @@ -81,9 +81,9 @@ ifelse((1 + points$X1)^2 + (2 - points$X2)^2 > 4, "blue", "red") > $X_1$ and $X_2$, it is linear in terms of $X_1$, $X_1^2$, $X_2$, and > $X_2^2$. -The decision boundary is $$(1 + X_1)^2 + (2 − X_2)^2 -4 = 0$$ which we can expand +The decision boundary is $$(1 + X_1)^2 + (2 - X_2)^2 -4 = 0$$ which we can expand to: -$$1 + 2X_1 + X_1^2 + 4 − 4X_2 + X_2^2 - 4 = 0$$ +$$1 + 2X_1 + X_1^2 + 4 - 4X_2 + X_2^2 - 4 = 0$$ which is linear in terms of $X_1$, $X_1^2$, $X_2$, $X_2^2$. ### Question 3 @@ -111,8 +111,8 @@ data <- data.frame( X2 = c(4, 2, 4, 4, 1, 3, 1), Y = c(rep("Red", 4), rep("Blue", 3)) ) -p <- ggplot(data, aes(x = X1, y = X2, color = Y)) + - geom_point(size = 2) + +p <- ggplot(data, aes(x = X1, y = X2, color = Y)) + + geom_point(size = 2) + scale_colour_identity() + coord_cartesian(xlim = c(0.5, 4.5), ylim = c(0.5, 4.5)) p @@ -194,7 +194,7 @@ p + geom_abline(intercept = 1, slope = 0.5, lty = 2, col = "red") > longer separable by a hyperplane. ```{r} -p + geom_point(data = data.frame(X1 = 1, X2 = 3, Y = "Blue"), shape = 15, size = 4) +p + geom_point(data = data.frame(X1 = 1, X2 = 3, Y = "Blue"), shape = 15, size = 4) ``` ## Applied @@ -215,11 +215,12 @@ data <- data.frame( x = runif(100), y = runif(100) ) -score <- (2*data$x-0.5)^2 + (data$y)^2 - 0.5 +score <- (2 * data$x - 0.5)^2 + (data$y)^2 - 0.5 data$class <- factor(ifelse(score > 0, "red", "blue")) -p <- ggplot(data, aes(x = x, y = y, color = class)) + - geom_point(size = 2) + scale_colour_identity() +p <- ggplot(data, aes(x = x, y = y, color = class)) + + geom_point(size = 2) + + scale_colour_identity() p train <- 1:50 @@ -276,7 +277,7 @@ train$y <- factor(as.numeric((train$x1^2 - train$x2^2 > 0))) > should display $X_1$ on the $x$-axis, and $X_2$ on the $y$-axis. ```{r} -p <- ggplot(train, aes(x = x1, y = x2, color = y)) + +p <- ggplot(train, aes(x = x1, y = x2, color = y)) + geom_point(size = 2) p ``` @@ -300,7 +301,7 @@ plot_model <- function(fit) { } else { train$p <- factor(as.numeric(predict(fit) > 0)) } - ggplot(train, aes(x = x1, y = x2, color = p)) + + ggplot(train, aes(x = x1, y = x2, color = p)) + geom_point(size = 2) } @@ -373,19 +374,20 @@ data$y <- (data$class == "red") * 5 + rnorm(200) # Add barley separable points (these are simulated "noise" values) newdata <- data.frame(x = rnorm(30)) -newdata$y <- 1.5*newdata$x + 3 + rnorm(30, 0, 1) -newdata$class = ifelse((1.5*newdata$x + 3) - newdata$y > 0, "blue", "red") +newdata$y <- 1.5 * newdata$x + 3 + rnorm(30, 0, 1) +newdata$class <- ifelse((1.5 * newdata$x + 3) - newdata$y > 0, "blue", "red") data <- rbind(data, newdata) # remove any that cause misclassification leaving data that is barley linearly # separable, but along an axis that is not y = 2.5 (which would be correct # for the "true" data. -data <- data[!(data$class == "red") == ((1.5*data$x + 3 - data$y) > 0), ] +data <- data[!(data$class == "red") == ((1.5 * data$x + 3 - data$y) > 0), ] data <- data[sample(seq_len(nrow(data)), 200), ] -p <- ggplot(data, aes(x = x, y = y, color = class)) + - geom_point(size = 2) + scale_colour_identity() + +p <- ggplot(data, aes(x = x, y = y, color = class)) + + geom_point(size = 2) + + scale_colour_identity() + geom_abline(intercept = 3, slope = 1.5, lty = 2) p ``` @@ -401,9 +403,9 @@ How many training errors are misclassified for each value of cost? costs <- 10^seq(-3, 5) sapply(costs, function(cost) { - fit <- svm(as.factor(class) ~ ., data = data, kernel = "linear", cost = cost) - pred <- predict(fit, data) - sum(pred != data$class) + fit <- svm(as.factor(class) ~ ., data = data, kernel = "linear", cost = cost) + pred <- predict(fit, data) + sum(pred != data$class) }) ``` @@ -413,7 +415,7 @@ Cross-validation errors out <- tune(svm, as.factor(class) ~ ., data = data, kernel = "linear", ranges = list(cost = costs)) summary(out) data.frame( - cost = out$performances$cost, + cost = out$performances$cost, misclass = out$performances$error * nrow(data) ) ``` @@ -434,18 +436,18 @@ test$y <- (test$class == "red") * 5 + rnorm(200) p + geom_point(data = test, pch = 21) (errs <- sapply(costs, function(cost) { - fit <- svm(as.factor(class) ~ ., data = data, kernel = "linear", cost = cost) - pred <- predict(fit, test) - sum(pred != test$class) + fit <- svm(as.factor(class) ~ ., data = data, kernel = "linear", cost = cost) + pred <- predict(fit, test) + sum(pred != test$class) })) (cost <- costs[which.min(errs)]) (fit <- svm(as.factor(class) ~ ., data = data, kernel = "linear", cost = cost)) test$prediction <- predict(fit, test) -p <- ggplot(test, aes(x = x, y = y, color = class, shape = prediction == class)) + - geom_point(size = 2) + - scale_colour_identity() +p <- ggplot(test, aes(x = x, y = y, color = class, shape = prediction == class)) + + geom_point(size = 2) + + scale_colour_identity() p ``` @@ -480,8 +482,10 @@ set.seed(42) costs <- 10^seq(-4, 3, by = 0.5) results <- list() f <- high_mpg ~ displacement + horsepower + weight -results$linear <- tune(svm, f, data = data, kernel = "linear", - ranges = list(cost = costs)) +results$linear <- tune(svm, f, + data = data, kernel = "linear", + ranges = list(cost = costs) +) summary(results$linear) ``` @@ -490,12 +494,16 @@ summary(results$linear) > on your results. ```{r} -results$polynomial <- tune(svm, f, data = data, kernel = "polynomial", - ranges = list(cost = costs, degree = 1:3)) +results$polynomial <- tune(svm, f, + data = data, kernel = "polynomial", + ranges = list(cost = costs, degree = 1:3) +) summary(results$polynomial) -results$radial <- tune(svm, f, data = data, kernel = "radial", - ranges = list(cost = costs, gamma = 10^(-2:1))) +results$radial <- tune(svm, f, + data = data, kernel = "radial", + ranges = list(cost = costs, gamma = 10^(-2:1)) +) summary(results$radial) sapply(results, function(x) x$best.performance) @@ -527,9 +535,9 @@ sapply(results, function(x) x$best.parameters) ```{r} table(predict(results$radial$best.model, data), data$high_mpg) -plot(results$radial$best.model, data, horsepower~displacement) -plot(results$radial$best.model, data, horsepower~weight) -plot(results$radial$best.model, data, displacement~weight) +plot(results$radial$best.model, data, horsepower ~ displacement) +plot(results$radial$best.model, data, horsepower ~ weight) +plot(results$radial$best.model, data, displacement ~ weight) ``` ### Question 8 @@ -572,8 +580,10 @@ errs(fit) > range 0.01 to 10. ```{r} -tuned <- tune(svm, Purchase ~ ., data = OJ[train, ], kernel = "linear", - ranges = list(cost = 10^seq(-2, 1, length.out = 10))) +tuned <- tune(svm, Purchase ~ ., + data = OJ[train, ], kernel = "linear", + ranges = list(cost = 10^seq(-2, 1, length.out = 10)) +) tuned$best.parameters summary(tuned) ``` @@ -588,8 +598,10 @@ errs(tuned$best.model) > kernel. Use the default value for `gamma`. ```{r} -tuned2 <- tune(svm, Purchase ~ ., data = OJ[train, ], kernel = "radial", - ranges = list(cost = 10^seq(-2, 1, length.out = 10))) +tuned2 <- tune(svm, Purchase ~ ., + data = OJ[train, ], kernel = "radial", + ranges = list(cost = 10^seq(-2, 1, length.out = 10)) +) tuned2$best.parameters errs(tuned2$best.model) ``` @@ -598,8 +610,10 @@ errs(tuned2$best.model) > polynomial kernel. Set `degree = 2`. ```{r} -tuned3 <- tune(svm, Purchase ~ ., data = OJ[train, ], kernel = "polynomial", - ranges = list(cost = 10^seq(-2, 1, length.out = 10)), degree = 2) +tuned3 <- tune(svm, Purchase ~ ., + data = OJ[train, ], kernel = "polynomial", + ranges = list(cost = 10^seq(-2, 1, length.out = 10)), degree = 2 +) tuned3$best.parameters errs(tuned3$best.model) ``` diff --git a/10-deep-learning.Rmd b/10-deep-learning.Rmd index 3108399..0a0c973 100644 --- a/10-deep-learning.Rmd +++ b/10-deep-learning.Rmd @@ -248,7 +248,7 @@ absolute error. > Consider the simple function $R(\beta) = sin(\beta) + \beta/10$. > -> a. Draw a graph of this function over the range $\beta \in [−6, 6]$. +> a. Draw a graph of this function over the range $\beta \in [-6, 6]$. ```{r} r <- function(x) sin(x) + x / 10 diff --git a/11-survival-analysis-and-censored-data.Rmd b/11-survival-analysis-and-censored-data.Rmd index 221c7a1..cce8e6c 100644 --- a/11-survival-analysis-and-censored-data.Rmd +++ b/11-survival-analysis-and-censored-data.Rmd @@ -310,7 +310,7 @@ These are equivalent to 11.5 and 11.6. > Recall that the survival function $S(t)$, the hazard function $h(t)$, and the > density function $f(t)$ are defined in (11.2), (11.9), and (11.11), -> respectively. Furthermore, define $F(t) = 1 − S(t)$. Show that the following +> respectively. Furthermore, define $F(t) = 1 - S(t)$. Show that the following > relationships hold: > > $$ @@ -337,7 +337,7 @@ Integrating and then exponentiating we get the second identity. > survival times follow an exponential distribution. > > a. Suppose that a survival time follows an $Exp(\lambda)$ distribution, so -> that its density function is $f(t) = \lambda\exp(−\lambda t)$. Using the +> that its density function is $f(t) = \lambda\exp(-\lambda t)$. Using the > relationships provided in Exercise 8, show that $S(t) = \exp(-\lambda t)$. The cdf of an exponential distribution is $1 - \exp(-\lambda x)$ and diff --git a/12-unsupervised-learning.Rmd b/12-unsupervised-learning.Rmd index 20ce5dc..198aa47 100644 --- a/12-unsupervised-learning.Rmd +++ b/12-unsupervised-learning.Rmd @@ -142,7 +142,8 @@ d <- data.frame( x1 = c(1, 1, 0, 5, 6, 4), x2 = c(4, 3, 4, 1, 2, 0) ) -ggplot(d, aes(x = x1, y = x2)) + geom_point() +ggplot(d, aes(x = x1, y = x2)) + + geom_point() ``` > b. Randomly assign a cluster label to each observation. You can use the @@ -157,7 +158,7 @@ d$cluster <- sample(c(1, 2), size = nrow(d), replace = TRUE) > c. Compute the centroid for each cluster. ```{r} -centroids <- sapply(c(1,2), function(i) colMeans(d[d$cluster == i, 1:2])) +centroids <- sapply(c(1, 2), function(i) colMeans(d[d$cluster == i, 1:2])) ``` > d. Assign each observation to the centroid to which it is closest, in terms of @@ -165,7 +166,7 @@ centroids <- sapply(c(1,2), function(i) colMeans(d[d$cluster == i, 1:2])) ```{r} dist <- sapply(1:2, function(i) { - sqrt((d$x1 - centroids[1, i])^2 + (d$x2 - centroids[2, i])^2) + sqrt((d$x1 - centroids[1, i])^2 + (d$x2 - centroids[2, i])^2) }) d$cluster <- apply(dist, 1, which.min) ``` @@ -173,9 +174,9 @@ d$cluster <- apply(dist, 1, which.min) > e. Repeat (c) and (d) until the answers obtained stop changing. ```{r} -centroids <- sapply(c(1,2), function(i) colMeans(d[d$cluster == i, 1:2])) +centroids <- sapply(c(1, 2), function(i) colMeans(d[d$cluster == i, 1:2])) dist <- sapply(1:2, function(i) { - sqrt((d$x1 - centroids[1, i])^2 + (d$x2 - centroids[2, i])^2) + sqrt((d$x1 - centroids[1, i])^2 + (d$x2 - centroids[2, i])^2) }) d$cluster <- apply(dist, 1, which.min) ``` @@ -186,7 +187,8 @@ In this case, we get stable labels after the first iteration. > labels obtained. ```{r} -ggplot(d, aes(x = x1, y = x2, color = factor(cluster))) + geom_point() +ggplot(d, aes(x = x1, y = x2, color = factor(cluster))) + + geom_point() ``` ### Question 4 @@ -263,7 +265,7 @@ kmeans(dat, 2)$cluster > It turns out that these two measures are almost equivalent: if each > observation has been centered to have mean zero and standard deviation one, > and if we let $r_{ij}$ denote the correlation between the $i$th and $j$th -> observations, then the quantity $1 − r_{ij}$ is proportional to the squared +> observations, then the quantity $1 - r_{ij}$ is proportional to the squared > Euclidean distance between the ith and jth observations. > > On the `USArrests` data, show that this proportionality holds. @@ -393,8 +395,10 @@ data[classes == "C", 5:30] <- data[classes == "C", 5:30] + 1 ```{r} pca <- prcomp(data) -ggplot(data.frame(Class = classes, PC1 = pca$x[, 1], PC2 = pca$x[, 2]), - aes(x = PC1, y = PC2, col = Class)) + +ggplot( + data.frame(Class = classes, PC1 = pca$x[, 1], PC2 = pca$x[, 2]), + aes(x = PC1, y = PC2, col = Class) +) + geom_point() ```