---
title: "Model_jh4054"
author: "Joy Hsu, Xinyi Lin, Matt Parker, Apoorva Srinivasan, Jiawei Ye"
date: "12/7/2018"
output:
html_document:
toc: true
toc_float: true
toc_depth: 6
code_folding: show
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(
echo = TRUE,
message = FALSE,
warning = FALSE)
library(tidyverse)
library(HH) # for the VIF function (alternatively, car::vif)
library(leaps) # Stepwise and test-based criteria
library(modelr) # add predictions and cross-validation
library(glmnet) # for ridge
```
### Load Main Dataset
```{r}
cancer_reg = read_csv("./data/Cancer_Registry.csv") %>%
janitor::clean_names() %>%
dplyr::select(target_death_rate, everything()) %>%
separate(geography, into = c("county", "state"), sep = ", ") %>%
mutate(
avg_deaths_per_year = as.numeric(avg_deaths_per_year),
med_income = as.numeric(med_income),
pop_est2015 = as.numeric(pop_est2015),
county = str_replace(county, " ", "")) %>%
dplyr::select(state, county, everything()) %>%
arrange(target_death_rate, incidence_rate)
```
### 1. Remove Categorical Variables and Variables with High Missingness
* Remove state and county - categorical identifiers for each observation.
* Remove avg_ann_count and avg_deaths_per_year, since this information is already captured by the mortality and incidence rates, which are weighted by county population.
* Remove variables with a high percentage of missing values:
  * pct_employed16_over - 152 missing. This variable has some collinearity with pct_unemployed16_over (r = -0.65), so we are not completely excluding information related to employment.
  * pct_some_col18_24 - 2285 missing, over 2/3 of observations.
  * pct_private_coverage_alone - 609 missing. This variable has strong collinearity with pct_private_coverage (r = 0.93), so we are not excluding valuable information on insurance coverage.
```{r}
# Examine missing values
# 3 variables with high missing values: pct_employed16_over (152), pct_some_col18_24 (2285), pct_private_coverage_alone (609)
skimr::skim(cancer_reg)
# observe collinearity between pct_private_coverage and variable to exclude, pct_private_coverage_alone
cancer_reg %>%
dplyr::select(target_death_rate, pct_private_coverage, pct_private_coverage_alone) %>% cor(x = ., use = "pairwise.complete.obs")
# observe collinearity between pct_unemployed16_over and variable to exclude, pct_employed16_over
cancer_reg %>%
dplyr::select(target_death_rate, pct_unemployed16_over, pct_employed16_over) %>% cor(x = ., use = "pairwise.complete.obs")
# remove variables with high missing values, identifiers (county, state), and raw count variables
cancer_reg_tidy = cancer_reg %>%
dplyr::select(-pct_employed16_over, -pct_some_col18_24, -pct_private_coverage_alone, -county, -state, -avg_ann_count, -avg_deaths_per_year)
```
### 2. Remove Variables with High Collinearity
Next, we examine collinearity between the remaining variables and remove the following:
* binned_inc - we will use med_income to examine income instead
* poverty_percent - high collinearity with med_income (r = -0.78); we keep median income as the key income variable for the model
* med_income - collinear with multiple variables related to educational attainment and insurance coverage: pct_bach_deg25_over, pct_private_coverage, pct_emp_priv_coverage, pct_public_coverage, pct_public_coverage_alone
* percent_married - collinear with pct_married_households (r = 0.87)
* pct_hs25_over - collinear with pct_bach_deg25_over (r = -0.74)
* Of the insurance coverage variables, keep only pct_public_coverage_alone, which has the highest correlation with the main outcome; collinearity among all the insurance variables is high.
Other variables under consideration:
* pct_white and pct_black have high collinearity (r = -0.83), but from a demographic standpoint we may be interested in keeping both.
* create pct_nonwhite as a proxy
* discuss in the report why pct_black and pct_white are so strongly negatively correlated
We are left with 21 variables.
```{r}
# Examine collinearity between variables
cor_df = cancer_reg_tidy %>%
dplyr::select(-binned_inc) %>%
cor()
# remove variables with high missingness, identifiers, and collinear variables;
# create proxy variable pct_nonwhite
cancer_reg_tidy = cancer_reg %>%
  dplyr::select(
    -pct_employed16_over, -pct_some_col18_24, -pct_private_coverage_alone, -county, -state, -avg_ann_count, -avg_deaths_per_year) %>%
  dplyr::select(
    -median_age_female, -pct_emp_priv_coverage, -pct_public_coverage, -pct_private_coverage, -poverty_percent, -pct_hs25_over, -percent_married, -binned_inc) %>%
  mutate(pct_nonwhite = 100 - pct_white) %>% # pct_white is on a 0-100 scale
  arrange(target_death_rate, incidence_rate)
skimr::skim(cancer_reg_tidy)
```
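As a supplementary check (not part of the original variable selection), the strongly correlated pairs can be listed directly from the correlation matrix computed above:
```{r}
# supplementary sketch: list variable pairs with |r| > 0.7 from cor_df
high_cor = which(abs(cor_df) > 0.7 & upper.tri(cor_df), arr.ind = TRUE)
tibble(
  var1 = rownames(cor_df)[high_cor[, 1]],
  var2 = colnames(cor_df)[high_cor[, 2]],
  r = cor_df[high_cor]) %>%
  arrange(desc(abs(r)))
```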
### 3. Automatic Search Procedures
Stepwise regression uses the AIC criterion for variable selection: given a set of candidate models, we prefer the one with the minimum AIC value. AIC rewards goodness of fit (as assessed by the likelihood function) but includes a penalty that grows with the number of parameters.
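For reference, AIC is defined as

$$\mathrm{AIC} = 2p - 2\ln(\hat{L}),$$

where $p$ is the number of estimated parameters and $\hat{L}$ is the maximized likelihood; lower values are preferred.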
```{r}
# define null and full models for the forward, backward, and bidirectional search procedures
null = lm(target_death_rate ~ 1, data = cancer_reg_tidy)
full = lm(target_death_rate ~ ., data = cancer_reg_tidy)
```
**Forward Selection Model**
Adjusted R-squared: 0.4988
Step: AIC=18159.34
target_death_rate ~ pct_bach_deg25_over + incidence_rate + pct_public_coverage_alone +
pct_other_race + pct_white + pct_hs18_24 + median_age_male +
birth_rate + pct_married_households + pct_unemployed16_over +
pop_est2015
```{r, warning=TRUE, error=TRUE}
# stepwise regression, forward
step(null, scope = list(lower = null, upper = full), direction = "forward")
# forward fit
fit_forward = lm(target_death_rate ~ pct_bach_deg25_over + incidence_rate + pct_public_coverage_alone +
pct_other_race + pct_white + pct_hs18_24 + median_age_male +
birth_rate + pct_married_households + pct_unemployed16_over +
pop_est2015, data = cancer_reg_tidy)
summary(fit_forward)
car::vif(fit_forward)
```
**Backward Elimination Model**
Adjusted R-squared: 0.4988
Step: AIC=18159.34
target_death_rate ~ incidence_rate + pop_est2015 + median_age_male +
pct_hs18_24 + pct_bach_deg25_over + pct_unemployed16_over +
pct_public_coverage_alone + pct_white + pct_other_race +
pct_married_households + birth_rate
```{r, error=TRUE}
# stepwise regression, backward
step(full, data = cancer_reg_tidy, direction = "backward")
# backward fit
fit_back = lm(target_death_rate ~ incidence_rate + pop_est2015 + median_age_male +
pct_hs18_24 + pct_bach_deg25_over + pct_unemployed16_over +
pct_public_coverage_alone + pct_white + pct_other_race +
pct_married_households + birth_rate, data = cancer_reg_tidy)
summary(fit_back)
car::vif(fit_back)
```
**Stepwise Regression, Both Directions**
Same 11-predictor model as forward selection and backward elimination
Adjusted R-squared: 0.4988
Step: AIC=18159.34
target_death_rate ~ pct_bach_deg25_over + incidence_rate + pct_public_coverage_alone +
pct_other_race + pct_white + pct_hs18_24 + median_age_male +
birth_rate + pct_married_households + pct_unemployed16_over +
pop_est2015
```{r}
# stepwise regression, both directions
step(null, scope = list(upper=full), data = cancer_reg_tidy, direction = "both")
fit_both_dir = lm(target_death_rate ~ pct_bach_deg25_over + incidence_rate + pct_public_coverage_alone + pct_other_race + pct_white + pct_hs18_24 + median_age_male + birth_rate + pct_married_households + pct_unemployed16_over + pop_est2015, data = cancer_reg_tidy)
summary(fit_both_dir)
car::vif(fit_both_dir)
```
**Conclusion**
Forward selection, backward elimination, and bidirectional stepwise regression all produced the same "best" 11-predictor model under the AIC criterion, with adjusted R-squared 0.4988.
```{r}
fit_auto = lm(target_death_rate ~ pct_bach_deg25_over + incidence_rate + pct_public_coverage_alone + pct_other_race + pct_white + pct_hs18_24 + median_age_male + birth_rate + pct_married_households + pct_unemployed16_over + pop_est2015, data = cancer_reg_tidy)
summary(fit_auto)
```
### 4. Criterion Based Procedures
**Best subsets regression procedure with Mallows’ Cp and Adjusted R2 Criterion**
* For Mallows' Cp criterion, Cp should be less than or equal to the number of parameters (see the formula after this list). Optimizing for the lowest Cp, we select 9 predictors.
* For the adjusted R^2 criterion, we look for the highest adjusted R^2 while accounting for parsimony. Selecting ~7 predictors balances the adjusted R^2 criterion and the overarching goal of parsimony.
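For reference, Mallows' Cp for a subset model with $p$ parameters is

$$C_p = \frac{SSE_p}{MSE_{full}} - (n - 2p),$$

where $SSE_p$ is the subset model's error sum of squares and $MSE_{full}$ is the mean squared error of the full model; values near or below $p$ indicate little bias.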
```{r}
criterion = leaps::regsubsets(target_death_rate ~ ., data = cancer_reg_tidy, nvmax = 19)
crit_sum = summary(criterion)
summary(criterion)
# Plots for Cp Statistic and Adjusted R2
par(mar = c(4,4,1,1))
par(mfrow = c(1,2))
# Cp Statistic Plot
plot(x = 1:19, y = crit_sum$cp, xlab = "No of parameters", ylab = "Cp Statistic")
# reference line Cp = p (intercept 0, slope 1)
abline(0,1)
# Adjusted R2 Plot
plot(x = 1:19, y = crit_sum$adjr2, xlab = "No of parameters", ylab = "Adj R2")
```
**Summary Table for Cp Statistic and Adjusted R2**
```{r}
# fit with all parameters
fit_multi = lm(target_death_rate ~ ., data = cancer_reg_tidy)
# best subset of variables for given number of parameters
best = function(model, ...) {
subsets <- regsubsets(formula(model), nvmax = 18, model.frame(model), ...)
subsets <- with(summary(subsets), cbind(p = as.numeric(rownames(which)), which, rss, rsq, adjr2, cp, bic))
return(subsets)
}
# select the best model of each subset size (nbest = 1), with fit criteria
cp_r2_df = round(best(fit_multi, nbest = 1), digits = 3)
cp_r2_df
```
```{r}
# take the log of pct_bach_deg25_over; the plot below checks linearity with the outcome
cancer_reg_tidy %>%
ggplot(aes(x = log(pct_bach_deg25_over), y = target_death_rate)) +
geom_point()
```
### 5. Final 3 Models & Diagnostics (Check Model Assumptions)
All three candidate models satisfied the model assumptions (a supplementary numeric check follows the diagnostics chunk below):
* normality of residuals
* equal (constant) variance of residuals
```{r}
# 10-predictor stepwise automatic search model
# (log transform applied to pct_bach_deg25_over; pop_est2015 dropped)
# Adjusted R-squared: 0.4986
fit_auto = lm(target_death_rate ~ log(pct_bach_deg25_over) + incidence_rate + pct_public_coverage_alone +
pct_other_race + pct_white + pct_hs18_24 + median_age_male + birth_rate + pct_married_households + pct_unemployed16_over, data = cancer_reg_tidy)
summary(fit_auto)
car::vif(fit_auto)
par(mfrow = c(2,2))
plot(fit_auto)
##########
# 9 predictor cp model
# Adjusted R-squared: 0.4983
fit_cp = lm(target_death_rate ~ incidence_rate + median_age_male + pct_hs18_24 + pct_bach_deg25_over + pct_unemployed16_over + pct_public_coverage_alone + pct_other_race + pct_married_households + birth_rate, data = cancer_reg_tidy)
summary(fit_cp)
car::vif(fit_cp) # VIF values
par(mfrow = c(2,2))
plot(fit_cp)
##########
# 7 predictor adjusted R2 model
# Adjusted R-squared: 0.4927
fit_r2 = lm(target_death_rate ~ incidence_rate + median_age_male + pct_hs18_24 + pct_bach_deg25_over + pct_public_coverage_alone + pct_other_race + pct_married_households, data = cancer_reg_tidy)
summary(fit_r2)
car::vif(fit_r2) # VIF values
# Diagnostic plots
par(mfrow = c(2,2))
plot(fit_r2)
# residual vs. covariate plots for each predictor in fit_r2
covariates = c("incidence_rate", "median_age_male", "pct_hs18_24",
               "pct_bach_deg25_over", "pct_public_coverage_alone",
               "pct_other_race", "pct_married_households")
par(mfrow = c(2, 4))
for (v in covariates) {
  plot(cancer_reg_tidy[[v]], fit_r2$residuals, xlab = v, ylab = "residuals")
  abline(h = 0, lwd = 2, col = 2)
}
```
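As a supplementary numeric check (not part of the original diagnostics), formal tests can back up the residual plots; this sketch assumes the lmtest package is installed.
```{r}
# supplementary checks for the 7-predictor model (assumes lmtest is installed)
shapiro.test(resid(fit_r2))  # normality of residuals (valid for n <= 5000)
lmtest::bptest(fit_r2)       # Breusch-Pagan test for constant variance
```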
### 6. Remove One Outlier in Y
```{r}
cancer_reg_remove =
  cancer_reg_tidy %>%
  filter(target_death_rate < 300)
model_remove = lm(formula = target_death_rate ~ incidence_rate + median_age_male + pct_hs18_24 + pct_bach_deg25_over + pct_public_coverage_alone + pct_other_race + pct_married_households, data = cancer_reg_remove)
summary(model_remove)
# diagnostic plots for the refit model
par(mfrow = c(2, 2))
plot(model_remove)
```
### 7. Examine Leverage & Influential Points
#### 7.1 **Outliers in Y - Studentized Residuals**
We use internally studentized residuals (defined below) to quantify the magnitude of each residual in standard deviation units. As a rule of thumb, an observation with |r~i~| > 2.5 may be considered an outlier in Y.
The following points are extreme outliers in Y with |r~i~| > 5.0:
* 802 - Williamsburg County, Virginia - the NIH-reported incidence rate for all cancers in Williamsburg County, Virginia, averaged across 2011-2015, is 376.2 per 100,000 (95% CI: 314.6, 447.3), yet our dataset records an incidence rate of 1014 per 100,000. **Steps**: exclude the data point. Predicted mortality rate is 265.0524, actual mortality rate is 162.1, residual -102.9524.
* 3044 - Madison County, Mississippi
* 3046 - Woodson County, Kansas
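For reference, the internally studentized residual is

$$r_i = \frac{e_i}{\hat{\sigma}\sqrt{1 - h_{ii}}},$$

where $e_i$ is the raw residual, $\hat{\sigma}$ is the residual standard error, and $h_{ii}$ is the observation's leverage.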
```{r}
# add predictions & residuals for 7 predictor model to dataset
# df below used to examine state and county for outliers in y.
cancer_reg_pred_resid = cancer_reg %>%
dplyr::select(
state, county, target_death_rate, incidence_rate, median_age_male, pct_hs18_24, pct_bach_deg25_over, pct_private_coverage, pct_public_coverage_alone, pct_other_race, pct_married_households) %>%
modelr::add_predictions(fit_r2, var = "pred_model_7") %>%
modelr::add_residuals(fit_r2, var = "resid_model_7") %>%
arrange(target_death_rate, incidence_rate)
# rstandard function gives the INTERNALLY studentized residuals
stu_res = rstandard(fit_r2)
# 68 points with |studentized residual| > 2.5
stu_res[abs(stu_res) > 2.5]
# 3 points with |studentized residual| > 5
stu_res[abs(stu_res) > 5]
# test remove point 802: Williamsburg County, Virginia
# no substantial change in coefficients.
cancer_reg_remove =
cancer_reg_tidy %>%
filter(incidence_rate != 1014.2)
model_remove = lm(formula = target_death_rate ~ incidence_rate + median_age_male + pct_hs18_24 + pct_bach_deg25_over + pct_public_coverage_alone + pct_other_race + pct_married_households, data = cancer_reg_remove)
summary(model_remove)
```
#### 7.2 **Outliers in X - Leverage Values**
To determine outliers in X, we use leverage values (the diagonal elements h~ii~ of the hat matrix) to measure how far each observation sits from the center of the X space. Observations with high leverage may have an outsized influence on the fitted model's estimated coefficients. Observations with a leverage value 0.2 < h~ii~ < 0.5 are considered moderate-leverage observations. Examining the "hat" plot below, we do not observe any outliers in X.
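For reference, the hat matrix is

$$H = X(X^{T}X)^{-1}X^{T},$$

and the leverage of observation $i$ is its diagonal element $h_{ii}$, which always lies between 0 and 1.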
```{r}
# Obtain DFFITS, Cook's Distance, Hat diagonal elements.
influence_measures = influence.measures(fit_r2)
# plot of hat values and extract hat values above 0.2.
lev = hat(model.matrix(fit_r2))
plot(lev)
# command returns all observations with leverage values above 0.2
cancer_reg_tidy[lev > 0.2,]
```
#### 7.3 **Influential Points - Cook's Distance**
After identifying the outliers in Y (points 802, 3044, 3046), we use Cook's distance to determine whether they are influential. By the **Cook's distance** criterion, an observation can be influential if it has both a high residual and high h~ii~. As a rule of thumb, D~i~ > 0.5 is of concern. Examination of the Cook's distances reveals that none of the Y-outliers 802 (red), 3044 (blue), or 3046 (green) meets the criterion for an influential point. As such, we will not take further action to remove Y outliers from our dataset.
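For reference, Cook's distance combines the studentized residual and leverage defined above:

$$D_i = \frac{r_i^{2}}{p} \cdot \frac{h_{ii}}{1 - h_{ii}},$$

where $p$ is the number of model parameters, including the intercept.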
```{r}
# plot Cook's distance for fit_r2 7-predictor model
cook = cooks.distance(fit_r2)
plot(cook, ylab = "Cook's distance")
points(802,cook[802],col = 'red')
points(3044,cook[3044],col = 'blue')
points(3046,cook[3046],col = 'green')
```
### 8. Monte Carlo Cross-Validation & Violin Plots
```{r}
set.seed(20)
# Monte Carlo cross-validation: 100 random train/test splits with an 80/20 ratio
cv_cm = modelr::crossv_mc(data = cancer_reg_tidy, n = 100, test = 0.2, id = "id")
# fit each candidate model on every training set, then compute test-set RMSE
cv_cm = cv_cm %>%
mutate(lm_auto = map(train, ~lm(target_death_rate ~ pct_bach_deg25_over + incidence_rate + pct_public_coverage_alone + pct_other_race + pct_white + pct_hs18_24 + median_age_male + birth_rate + pct_married_households + pct_unemployed16_over, data = .x)),
lm_cp = map(train, ~lm(target_death_rate ~ incidence_rate + median_age_male + pct_hs18_24 + pct_bach_deg25_over + pct_unemployed16_over + pct_public_coverage_alone + pct_other_race + pct_married_households + birth_rate, data = .x)),
lm_r2 = map(train, ~lm(target_death_rate ~ incidence_rate + median_age_male + pct_hs18_24 + pct_bach_deg25_over + pct_public_coverage_alone + pct_other_race + pct_married_households, data = .x))) %>%
mutate(rmse_auto = map2_dbl(lm_auto, test, ~rmse(model = .x, data = .y)),
rmse_cp = map2_dbl(lm_cp, test, ~rmse(model = .x, data = .y)),
rmse_r2 = map2_dbl(lm_r2, test, ~rmse(model = .x, data = .y)))
```
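For reference, the RMSE computed by modelr::rmse on each test split is

$$\mathrm{RMSE} = \sqrt{\frac{1}{n_{test}}\sum_{i=1}^{n_{test}}\left(y_i - \hat{y}_i\right)^{2}}.$$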
```{r}
# RMSE values will vary each time we rerun the code (random splits)
cv_cm %>%
dplyr::select(starts_with("rmse")) %>%
gather(key = model, value = rmse) %>%
group_by(model) %>%
summarise(mean_rmse = mean(rmse),
var_rmse = var(rmse))
# Violin Plots
cv_cm %>%
dplyr::select(starts_with("rmse")) %>%
gather(key = model, value = rmse) %>%
mutate(
model = str_replace(model, "rmse_", "model "),
model = fct_reorder(model, rmse)) %>%
ggplot(aes(x = model, y = rmse)) +
geom_violin() +
labs(title = "Distribution of RMSE from Cross-Validation",
x = "Model",
y = "RMSE")
```
### 9. K-Fold Cross-Validation
```{r}
model_10 = lm(formula = target_death_rate ~ pct_bach_deg25_over + incidence_rate + pct_public_coverage_alone + pct_other_race + pct_white + pct_hs18_24 + median_age_male + birth_rate + pct_married_households + pct_unemployed16_over, data = cancer_reg_tidy)
model_9 = lm(formula = target_death_rate ~ incidence_rate + median_age_male + pct_hs18_24 + pct_bach_deg25_over + pct_unemployed16_over + pct_public_coverage_alone + pct_other_race + pct_married_households + birth_rate, data = cancer_reg_tidy)
model_7 = lm(formula = target_death_rate ~ incidence_rate + median_age_male + pct_bach_deg25_over + pct_hs18_24 + pct_public_coverage_alone + pct_other_race + pct_married_households, data = cancer_reg_tidy)
```
### 9.1. 10-Fold CV
```{r}
library(caret)
train_data_10 = caret::trainControl(method = "cv", number = 10)
# fit the 10-, 9-, and 7-predictor candidate models with 10-fold CV
model_caret10_10 =
caret::train(target_death_rate ~ pct_bach_deg25_over + incidence_rate + pct_public_coverage_alone + pct_other_race + pct_white + pct_hs18_24 + median_age_male + birth_rate + pct_married_households + pct_unemployed16_over,
data = cancer_reg_tidy,
trControl = train_data_10,
method = 'lm',
na.action = na.pass)
model_caret10_9 =
caret::train(target_death_rate ~ incidence_rate + median_age_male + pct_hs18_24 + pct_bach_deg25_over + pct_unemployed16_over + pct_public_coverage_alone + pct_other_race + pct_married_households + birth_rate,
data = cancer_reg_tidy,
trControl = train_data_10,
method = 'lm',
na.action = na.pass)
model_caret10_7 =
caret::train(target_death_rate ~ incidence_rate + median_age_male + pct_bach_deg25_over + pct_hs18_24 + pct_public_coverage_alone + pct_other_race + pct_married_households,
data = cancer_reg_tidy,
trControl = train_data_10,
method = 'lm',
na.action = na.pass)
model_caret10_10
# RMSE Rsquared MAE
# 19.7024 0.4938789 14.68475
model_caret10_9
# RMSE Rsquared MAE
# 19.68147 0.4967687 14.66503
model_caret10_7
# RMSE Rsquared MAE
# 19.79374 0.4900993 14.82677
```
### 9.2. 5-Fold CV
```{r}
train_data_5 = caret::trainControl(method = "cv", number = 5)
# fit the same three candidate models with 5-fold CV
model_caret5_10 =
caret::train(target_death_rate ~ pct_bach_deg25_over + incidence_rate + pct_public_coverage_alone + pct_other_race + pct_white + pct_hs18_24 + median_age_male + birth_rate + pct_married_households + pct_unemployed16_over,
data = cancer_reg_tidy,
trControl = train_data_5,
method = 'lm',
na.action = na.pass)
model_caret5_9 =
caret::train(target_death_rate ~ incidence_rate + median_age_male + pct_hs18_24 + pct_bach_deg25_over + pct_unemployed16_over + pct_public_coverage_alone + pct_other_race + pct_married_households + birth_rate,
data = cancer_reg_tidy,
trControl = train_data_5,
method = 'lm',
na.action = na.pass)
model_caret5_7 =
caret::train(target_death_rate ~ incidence_rate + median_age_male + pct_bach_deg25_over + pct_hs18_24 + pct_public_coverage_alone + pct_other_race + pct_married_households,
data = cancer_reg_tidy,
trControl = train_data_5,
method = 'lm',
na.action = na.pass)
model_caret5_10
# RMSE Rsquared MAE
# 19.70115 0.4967758 14.65786
model_caret5_9
# RMSE Rsquared MAE
# 19.69373 0.4956894 14.66347
model_caret5_7
# RMSE Rsquared MAE
# 19.82486 0.4890669 14.84762
```
Since the three models perform similarly in cross-validation, we choose the 7-predictor model based on the principle of parsimony.
### 10. Ridge Regression
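Ridge regression shrinks the least-squares coefficients by adding an L2 penalty; for reference, the ridge estimate solves

$$\hat{\beta}^{ridge} = \arg\min_{\beta}\ \sum_{i=1}^{n}\Big(y_i - \beta_0 - \sum_{j=1}^{k} x_{ij}\beta_j\Big)^{2} + \lambda\sum_{j=1}^{k}\beta_j^{2},$$

where the tuning parameter $\lambda \ge 0$ controls the amount of shrinkage and is selected below by cross-validation.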
```{r}
grid = 10^seq(5, -2, length = 100)
# lm.ridge comes from the MASS package
ridge_cancer =
  MASS::lm.ridge(target_death_rate ~ incidence_rate + median_age_male + pct_bach_deg25_over + pct_hs18_24 +
    pct_public_coverage_alone + pct_other_race + pct_married_households, data = cancer_reg_tidy, lambda = grid)
dim(coef(ridge_cancer))
# coefficients for lambda = 10^5
coef(ridge_cancer)[1,]
# coefficients for lambda = 10^-2
coef(ridge_cancer)[100,]
```
```{r}
response =
cancer_reg %>%
dplyr::select(target_death_rate, incidence_rate, median_age_male, pct_bach_deg25_over, pct_hs18_24,
pct_public_coverage_alone, pct_other_race, pct_married_households) %>%
# drop_na() %>%
dplyr::select(target_death_rate) %>%
as.matrix()
predictors =
cancer_reg %>%
dplyr::select(target_death_rate, incidence_rate, median_age_male, pct_bach_deg25_over, pct_hs18_24,
pct_public_coverage_alone, pct_other_race, pct_married_households) %>%
# drop_na() %>%
dplyr::select(-target_death_rate) %>%
as.matrix()
ridge_cancer_1 = glmnet(predictors, response, alpha = 0, lambda = grid)
dim(coef(ridge_cancer_1))
ridge_cancer_1$lambda[50]
coef(ridge_cancer_1)[,50]
```
### 10.1 Ridge Cross-Validation
Cross-validation was used to select the lambda that minimizes the mean cross-validated error (lambda.min).
```{r}
set.seed(1)
cancer_train = sample(1:nrow(predictors),nrow(predictors)/2)
cancer_test = (-cancer_train)
response_test = response[cancer_test]
# use the built-in CV function; performs 10-fold validation by default
# cv.glmnet() generates its own lambda sequence
set.seed(2)
cv_out = cv.glmnet(predictors[cancer_train,], response[cancer_train], alpha=0)
plot(cv_out)
# cv.glmnet() object contains the mean cross-validation error (cvm),
# lambda min that gives the minimum cvm, etc.
cv_out
best_lambda = cv_out$lambda.min
best_lambda
# Re-fit the model with the min lambda value, look at the coeff and MSE
# Ridge regression using all observations and 'best' lambda
ridge_pred = glmnet(predictors, response, alpha = 0, lambda = best_lambda)
# compare least squares and ridge coefficients
compare_ridge_ls = cbind(coef(model_7), coef(ridge_pred))
colnames(compare_ridge_ls) <- c("LS", "Ridge")
compare_ridge_ls
```
### 11. Final Model
Final model: `lm(formula = target_death_rate ~ incidence_rate + median_age_male + pct_hs18_24 + pct_bach_deg25_over + pct_public_coverage_alone + pct_other_race + pct_married_households, data = cancer_reg_tidy)`
```{r}
fit_final = lm(formula = target_death_rate ~ incidence_rate + median_age_male + pct_hs18_24 + pct_bach_deg25_over + pct_public_coverage_alone + pct_other_race + pct_married_households, data = cancer_reg_tidy)
# Final Model Coefficients
fit_final %>% broom::tidy(conf.int = TRUE) %>%
dplyr::select(term, estimate, conf.low, conf.high, std.error, p.value) %>%
rename(
"beta1 estimates" = estimate,
"95% CI lower" = conf.low,
"95% CI upper" = conf.high) %>%
knitr::kable(digits = 3)
```
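As a usage sketch (not part of the original analysis), the final model can generate a prediction with an interval for a hypothetical county; the covariate values below are illustrative only.
```{r}
# hypothetical county profile (illustrative values, not from the dataset)
new_county = tibble(
  incidence_rate = 450,
  median_age_male = 40,
  pct_hs18_24 = 35,
  pct_bach_deg25_over = 13,
  pct_public_coverage_alone = 20,
  pct_other_race = 2,
  pct_married_households = 50)
# point estimate plus 95% prediction interval
predict(fit_final, newdata = new_county, interval = "prediction")
```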
### 12. Supplementary Code
Supplementary code used to generate values for the report.
```{r}
poverty_fit = lm(poverty_percent ~ pct_public_coverage_alone, data = cancer_reg)
summary(poverty_fit)
education_fit = lm(pct_hs18_24 ~ pct_bach_deg25_over, data = cancer_reg)
summary(education_fit)
insurance_fit = lm(pct_private_coverage ~ pct_public_coverage_alone, data = cancer_reg)
summary(insurance_fit)
```