Merge pull request #19 from danhalligan/18-consistent-formatting
Improved formatting
danhalligan authored Jan 3, 2025
2 parents d1faf26 + 5c7bbe7 commit 5142a1a
Showing 9 changed files with 226 additions and 188 deletions.
28 changes: 15 additions & 13 deletions 02-statistical-learning.Rmd
@@ -302,7 +302,7 @@ college$Elite <- factor(ifelse(college$Top10perc > 50, "Yes", "No"))
summary(college$Elite)
plot(college$Outstate ~ college$Elite, xlab = "Elite", ylab = "Outstate")
par(mfrow = c(2,2))
par(mfrow = c(2, 2))
for (n in c(5, 10, 20, 50)) {
hist(college$Enroll, breaks = n, main = paste("n =", n), xlab = "Enroll")
}
@@ -374,7 +374,7 @@ x[-(10:85), numeric] |>
> the relationships among the predictors. Comment on your findings.
```{r}
pairs(x[, numeric], cex = 0.2)
pairs(x[, numeric], cex = 0.2)
cor(x[, numeric]) |>
kable()
@@ -425,8 +425,10 @@ library(tidyverse)
```

```{r}
ggplot(Boston, aes(nox, rm)) + geom_point()
ggplot(Boston, aes(ptratio, rm)) + geom_point()
ggplot(Boston, aes(nox, rm)) +
geom_point()
ggplot(Boston, aes(ptratio, rm)) +
geom_point()
heatmap(cor(Boston, method = "spearman"), cexRow = 1.1, cexCol = 1.1)
```

@@ -440,12 +442,12 @@ Yes
> predictor.
```{r}
Boston |>
pivot_longer(cols = 1:13) |>
filter(name %in% c("crim", "tax", "ptratio")) |>
ggplot(aes(value)) +
geom_histogram(bins = 20) +
facet_wrap(~name, scales="free", ncol= 1)
Boston |>
pivot_longer(cols = 1:13) |>
filter(name %in% c("crim", "tax", "ptratio")) |>
ggplot(aes(value)) +
geom_histogram(bins = 20) +
facet_wrap(~name, scales = "free", ncol = 1)
```

Yes, particularly crime and tax rates.
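
A quick supporting check (a small addition, not part of the original answer; it assumes `Boston` from the `ISLR2` package is available as in the surrounding chunks) is to look at the range of each of these predictors directly:

```{r}
# Range of the predictors discussed above
sapply(Boston[, c("crim", "tax", "ptratio")], range)
```
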
@@ -496,9 +498,9 @@ Boston |>
select(-c(crim, zn)) |>
pivot_longer(!rm) |>
mutate(">8 rooms" = rm > 8) |>
ggplot(aes(`>8 rooms`, value)) +
geom_boxplot() +
facet_wrap(~name, scales = "free")
ggplot(aes(`>8 rooms`, value)) +
geom_boxplot() +
facet_wrap(~name, scales = "free")
```

Census tracts with large properties (more than eight rooms per dwelling, on average)
36 changes: 20 additions & 16 deletions 03-linear-regression.Rmd
@@ -67,11 +67,11 @@ library(plotly)
```{r}
model <- function(gpa, iq, level) {
50 +
gpa * 20 +
iq * 0.07 +
level * 35 +
gpa * iq * 0.01 +
gpa * level * -10
gpa * 20 +
iq * 0.07 +
level * 35 +
gpa * iq * 0.01 +
gpa * level * -10
}
x <- seq(1, 5, length = 10)
y <- seq(1, 200, length = 20)
@@ -82,15 +82,18 @@ plot_ly(x = x, y = y) |>
add_surface(
z = ~college,
colorscale = list(c(0, 1), c("rgb(107,184,214)", "rgb(0,90,124)")),
colorbar = list(title = "College")) |>
colorbar = list(title = "College")
) |>
add_surface(
z = ~high_school,
colorscale = list(c(0, 1), c("rgb(255,112,184)", "rgb(128,0,64)")),
colorbar = list(title = "High school")) |>
colorbar = list(title = "High school")
) |>
layout(scene = list(
xaxis = list(title = "GPA"),
yaxis = list(title = "IQ"),
zaxis = list(title = "Salary")))
zaxis = list(title = "Salary")
))
```

Option iii is correct: in this model the college term contributes $35 - 10 \times \mathrm{GPA}$ to salary, which is negative once GPA exceeds 3.5, so for a high enough GPA high school graduates earn more on average.
@@ -366,7 +369,7 @@ par(mfrow = c(2, 2))
plot(Auto$horsepower, Auto$mpg, cex = 0.2)
plot(log(Auto$horsepower), Auto$mpg, cex = 0.2)
plot(sqrt(Auto$horsepower), Auto$mpg, cex = 0.2)
plot(Auto$horsepower ^ 2, Auto$mpg, cex = 0.2)
plot(Auto$horsepower^2, Auto$mpg, cex = 0.2)
x <- subset(Auto, select = -name)
x$horsepower <- log(x$horsepower)
@@ -553,7 +556,7 @@ We can show this numerically in R by computing $t$ using the above equation.

```{r}
n <- length(x)
sqrt(n - 1) * sum(x * y) / sqrt(sum(x ^ 2) * sum(y ^ 2) - sum(x * y) ^ 2)
sqrt(n - 1) * sum(x * y) / sqrt(sum(x^2) * sum(y^2) - sum(x * y)^2)
```

> e. Using the results from (d), argue that the _t_-statistic for the
@@ -846,9 +849,9 @@ contributions.
> answers.
```{r}
x1 <- c(x1 , 0.1)
x2 <- c(x2 , 0.8)
y <- c(y ,6)
x1 <- c(x1, 0.1)
x2 <- c(x2, 0.8)
y <- c(y, 6)
summary(lm(y ~ x1 + x2))
summary(lm(y ~ x1))
summary(lm(y ~ x2))
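# A supplementary check (an addition, not part of the original answer): is the
# appended observation an outlier, a high-leverage point, or both?
fit_both <- lm(y ~ x1 + x2)
par(mfrow = c(2, 2))
plot(fit_both)                # the "Residuals vs Leverage" panel flags the new point
tail(hatvalues(fit_both), 1)  # leverage of the newly added (last) observation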
@@ -929,9 +932,9 @@ The results from (b) show reduced significance compared to the models fit in
(a).

```{r}
plot(sapply(fits, function(x) coef(x)[2]), coef(mfit)[-1],
xlab = "Univariate regression",
ylab = "multiple regression")
plot(sapply(fits, function(x) coef(x)[2]), coef(mfit)[-1],
xlab = "Univariate regression",
ylab = "multiple regression"
)
```

The estimated coefficients differ (in particular the estimated coefficient for
45 changes: 22 additions & 23 deletions 04-classification.Rmd
@@ -23,11 +23,10 @@ $$
Letting $x = e^{\beta_0 + \beta_1X}$

\begin{align}
\frac{P(X)}{1-p(X)} &= \frac{\frac{x}{1 + x}}
{1 - \frac{x}{1 + x}} \\
&= \frac{\frac{x}{1 + x}}
{\frac{1}{1 + x}} \\
&= x
\frac{p(X)}{1-p(X)}
&= \frac{\frac{x}{1 + x}} {1 - \frac{x}{1 + x}} \\
&= \frac{\frac{x}{1 + x}} {\frac{1}{1 + x}} \\
&= x
\end{align}
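
A quick numerical sanity check of this identity (a small addition; the values of $\beta_0$, $\beta_1$ and $X$ below are arbitrary illustrations, not taken from the text):

```{r}
beta0 <- -2
beta1 <- 0.5
X <- 1.3
p <- exp(beta0 + beta1 * X) / (1 + exp(beta0 + beta1 * X))
p / (1 - p)             # the odds ...
exp(beta0 + beta1 * X)  # ... equal e^(beta0 + beta1 * X)
```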

### Question 2
@@ -65,7 +64,7 @@ therefore, we can consider maximizing $\log(p_K(X))$

$$
\log(p_k(x)) = \log(\pi_k) - \frac{1}{2\sigma^2}(x - \mu_k)^2 -
\log\left(\sum_{l=1}^k \pi_l \exp\left(-\frac{1}{2\sigma^2}(x - \mu_l)^2\right)\right)
\log\left(\sum_{l=1}^k \pi_l \exp\left(-\frac{1}{2\sigma^2}(x - \mu_l)^2\right)\right)
$$

Remember that we are maximizing over $k$, and since the last term does not
@@ -265,7 +264,7 @@ when $X_1 = 40$ and $X_2 = 3.5$, $p(X) = 0.38$
> chance of getting an A in the class?
We would like to solve for $X_1$ where $p(X) = 0.5$. Taking the first equation
above, we need to solve $0 = 6 + 0.05X_1 + 3.5$, so $X_1 = 50$ hours.
above, we need to solve $0 = -6 + 0.05X_1 + 3.5$, so $X_1 = 50$ hours.
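
A quick numerical check (a small addition; it assumes the coefficients implied by the equation above, $\hat\beta_0 = -6$, $\hat\beta_1 = 0.05$, $\hat\beta_2 = 1$):

```{r}
p <- function(hours, gpa) {
  z <- -6 + 0.05 * hours + 1 * gpa
  exp(z) / (1 + exp(z))
}
p(40, 3.5)        # approximately 0.38, as stated above
(6 - 3.5) / 0.05  # hours of study needed for p(X) = 0.5, i.e. 50
p(50, 3.5)        # 0.5
```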

### Question 7

@@ -305,7 +304,7 @@ p(D|v) &= \frac{p(v|D) p(D)}{p(v|D)p(D) + p(v|N)p(N)} \\
\end{align}

```{r}
exp(-0.5) * 0.8 / (exp(-0.5) * 0.8 + exp(-2/9) * 0.2)
exp(-0.5) * 0.8 / (exp(-0.5) * 0.8 + exp(-2 / 9) * 0.2)
```

### Question 8
@@ -412,7 +411,7 @@ $$
(\hat\alpha_{orange0} - \hat\alpha_{apple0}) + (\hat\alpha_{orange1} - \hat\alpha_{apple1})x
$$

> c. Suppose that in your model, $\hat\beta_0 = 2$ and $\hat\beta = 1$. What
> c. Suppose that in your model, $\hat\beta_0 = 2$ and $\hat\beta_1 = -1$. What
> are the coefficient estimates in your friend's model? Be as specific as
> possible.
@@ -423,7 +422,7 @@ However, we are unable to determine the specific value of each parameter.

> d. Now suppose that you and your friend fit the same two models on a different
> data set. This time, your friend gets the coefficient estimates
> $\hat\alpha_{orange0} = 1.2$, $\hat\alpha_{orange1} = 2$,
> $\hat\alpha_{orange0} = 1.2$, $\hat\alpha_{orange1} = -2$,
> $\hat\alpha_{apple0} = 3$, $\hat\alpha_{apple1} = 0.6$. What are the
> coefficient estimates in your model?
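
A quick check using the relationship between the two parameterisations given above (an addition; only the differences of the $\hat\alpha$ terms are identified):

$$
\hat\beta_0 = \hat\alpha_{orange0} - \hat\alpha_{apple0} = 1.2 - 3 = -1.8, \qquad
\hat\beta_1 = \hat\alpha_{orange1} - \hat\alpha_{apple1} = -2 - 0.6 = -2.6
$$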
@@ -571,7 +570,7 @@ fit <- glm(Direction ~ Lag3, data = Weekly[train, ], family = binomial)
pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5
mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction)
fit <- glm(Direction ~Lag4, data = Weekly[train, ], family = binomial)
fit <- glm(Direction ~ Lag4, data = Weekly[train, ], family = binomial)
pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5
mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction)
@@ -583,7 +582,7 @@ fit <- glm(Direction ~ Lag1 * Lag2 * Lag3 * Lag4, data = Weekly[train, ], family
pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5
mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction)
fit <- lda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4,data = Weekly[train, ])
fit <- lda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4, data = Weekly[train, ])
pred <- predict(fit, Weekly[!train, ], type = "response")$class
mean(pred == Weekly[!train, ]$Direction)
@@ -658,7 +657,7 @@ variables are collinear.
```{r}
set.seed(1)
train <- sample(seq_len(nrow(x)), nrow(x) * 2/3)
train <- sample(seq_len(nrow(x)), nrow(x) * 2 / 3)
```

> d. Perform LDA on the training data in order to predict `mpg01` using the
Expand Down Expand Up @@ -787,8 +786,8 @@ Power3 <- function(x, a) {
> `log = "y"`, or `log = "xy"` as arguments to the `plot()` function.
```{r}
plot(1:10, Power3(1:10, 2),
xlab = "x",
plot(1:10, Power3(1:10, 2),
xlab = "x",
ylab = expression(paste("x"^"2")),
log = "y"
)
@@ -806,7 +805,7 @@ plot(1:10, Power3(1:10, 2),
```{r}
PlotPower <- function(x, a, log = "y") {
plot(x, Power3(x, a),
xlab = "x",
xlab = "x",
ylab = substitute("x"^a, list(a = a)),
log = log
)
@@ -827,11 +826,11 @@ PlotPower(1:10, 3)
```{r}
x <- cbind(
ISLR2::Boston[, -1],
ISLR2::Boston[, -1],
data.frame("highcrim" = Boston$crim > median(Boston$crim))
)
set.seed(1)
train <- sample(seq_len(nrow(x)), nrow(x) * 2/3)
train <- sample(seq_len(nrow(x)), nrow(x) * 2 / 3)
```
We can find the most strongly associated variables by performing Wilcoxon rank-sum tests (`wilcox.test()`), as sketched below.
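
A minimal sketch of that approach (an illustrative addition, assuming `x` and `train` as defined above; `ord` is the ordering used in the plot below):

```{r}
# Rank predictors by Wilcoxon rank-sum p-value on the training data
xt <- x[train, ]
pvals <- sapply(setdiff(names(xt), "highcrim"), function(v) {
  wilcox.test(xt[[v]][xt$highcrim], xt[[v]][!xt$highcrim])$p.value
})
ord <- names(sort(pvals))  # most to least associated with highcrim
```
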
@@ -861,8 +860,8 @@ Let's look at univariate associations with `highcrim` (in the training data)
x[train, ] |>
pivot_longer(!highcrim) |>
mutate(name = factor(name, levels = ord)) |>
ggplot(aes(highcrim, value)) +
geom_boxplot() +
ggplot(aes(highcrim, value)) +
geom_boxplot() +
facet_wrap(~name, scale = "free")
```
@@ -902,9 +901,9 @@ res <- sapply(1:12, function(max) fit_models(1:max))
res <- as_tibble(t(res))
res$n_var <- 1:12
pivot_longer(res, cols = !n_var) |>
ggplot(aes(n_var, value, col = name)) +
geom_line() +
xlab("Number of predictors") +
ggplot(aes(n_var, value, col = name)) +
geom_line() +
xlab("Number of predictors") +
ylab("Error rate")
```
22 changes: 11 additions & 11 deletions 07-moving-beyond-linearity.Rmd
@@ -175,9 +175,9 @@ grid()
```{r}
x <- seq(-2, 6, length.out = 1000)
b1 <- function(x) I(0 <= x & x <= 2) - (x - 1) * I(1 <= x & x <= 2)
b2 <- function(x) (x - 3) * I(3 <= x & x <= 4) + I(4 < x & x <= 5)
f <- function(x) 1 + 1*b1(x) + 3*b2(x)
b1 <- function(x) I(0 <= x & x <= 2) - (x - 1) * I(1 <= x & x <= 2)
b2 <- function(x) (x - 3) * I(3 <= x & x <= 4) + I(4 < x & x <= 5)
f <- function(x) 1 + 1 * b1(x) + 3 * b2(x)
plot(x, f(x), type = "l")
grid()
```
@@ -364,7 +364,7 @@ err5 <- mean(replicate(10, {
c(err, err1, err2, err3, err4, err5)
anova(fit, fit1, fit2, fit3, fit4, fit5)
x <- seq(min(Auto$horsepower), max(Auto$horsepower), length.out=1000)
x <- seq(min(Auto$horsepower), max(Auto$horsepower), length.out = 1000)
pred <- data.frame(
x = x,
"Linear" = predict(fit, data.frame(horsepower = x)),
@@ -407,7 +407,7 @@ lines(x, predict(fit, data.frame(dis = x)), col = "red", lty = 2)
```{r}
fits <- lapply(1:10, function(i) glm(nox ~ poly(dis, i), data = Boston))
x <- seq(min(Boston$dis), max(Boston$dis), length.out=1000)
x <- seq(min(Boston$dis), max(Boston$dis), length.out = 1000)
pred <- data.frame(lapply(fits, function(fit) predict(fit, data.frame(dis = x))))
colnames(pred) <- 1:10
pred$x <- x
@@ -604,7 +604,7 @@ beta1 <- 20
> ```
```{r}
a <- y - beta1*x1
a <- y - beta1 * x1
beta2 <- lm(a ~ x2)$coef[2]
```
@@ -633,15 +633,15 @@ res <- matrix(NA, nrow = 1000, ncol = 3)
colnames(res) <- c("beta0", "beta1", "beta2")
beta1 <- 20
for (i in 1:1000) {
beta2 <- lm(y - beta1*x1 ~ x2)$coef[2]
beta1 <- lm(y - beta2*x2 ~ x1)$coef[2]
beta0 <- lm(y - beta2*x2 ~ x1)$coef[1]
beta2 <- lm(y - beta1 * x1 ~ x2)$coef[2]
beta1 <- lm(y - beta2 * x2 ~ x1)$coef[2]
beta0 <- lm(y - beta2 * x2 ~ x1)$coef[1]
res[i, ] <- c(beta0, beta1, beta2)
}
res <- as.data.frame(res)
res$Iteration <- 1:1000
res <- pivot_longer(res, !Iteration)
p <- ggplot(res, aes(x=Iteration, y=value, color=name)) +
p <- ggplot(res, aes(x = Iteration, y = value, color = name)) +
geom_line() +
geom_point() +
scale_x_continuous(trans = "log10")
Expand Down Expand Up @@ -682,7 +682,7 @@ n <- 1000
betas <- rnorm(p) * 5
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
y <- (x %*% betas) + rnorm(n) # ignore beta0 for simplicity
y <- (x %*% betas) + rnorm(n) # ignore beta0 for simplicity
# multiple regression
fit <- lm(y ~ x - 1)