Apoorva Srinivasan 12/10/2018
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(patchwork)
library("leaps")
library(faraway)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:faraway':
##
## melanoma
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(broom)
cancer_reg = read_csv("./data/Cancer_Registry.csv") %>%
janitor::clean_names() %>%
dplyr::select(target_death_rate, everything()) %>%
separate(geography, into = c("county", "state"), sep = ",")
## Parsed with column specification:
## cols(
## .default = col_double(),
## avgDeathsPerYear = col_integer(),
## medIncome = col_integer(),
## popEst2015 = col_integer(),
## binnedInc = col_character(),
## Geography = col_character()
## )
## See spec(...) for full column specifications.
There are in total 35 variables and 3047observations in the dataset.
Our outcome of interest is target_death_rate
#missing data
#colSums(is.na(cancer_reg))
##pct_some_col18_24 has 2285 NAs, pct_employed_coverage_alone has 609 NA, pct_employed16_over has 152 NAs
missing_value = sapply(cancer_reg[1:34], function(x) sum(length(which(is.na(x)))))
# Percentage of missing value
percentage_missing = sapply(cancer_reg[1:34], function(x) sum(length(which(is.na(x)))) / nrow(cancer_reg))
percentage_missing %>% data.frame()
## .
## target_death_rate 0.00000000
## avg_ann_count 0.00000000
## avg_deaths_per_year 0.00000000
## incidence_rate 0.00000000
## med_income 0.00000000
## pop_est2015 0.00000000
## poverty_percent 0.00000000
## study_per_cap 0.00000000
## binned_inc 0.00000000
## median_age 0.00000000
## median_age_male 0.00000000
## median_age_female 0.00000000
## county 0.00000000
## state 0.00000000
## avg_household_size 0.00000000
## percent_married 0.00000000
## pct_no_hs18_24 0.00000000
## pct_hs18_24 0.00000000
## pct_some_col18_24 0.74991795
## pct_bach_deg18_24 0.00000000
## pct_hs25_over 0.00000000
## pct_bach_deg25_over 0.00000000
## pct_employed16_over 0.04988513
## pct_unemployed16_over 0.00000000
## pct_private_coverage 0.00000000
## pct_private_coverage_alone 0.19986872
## pct_emp_priv_coverage 0.00000000
## pct_public_coverage 0.00000000
## pct_public_coverage_alone 0.00000000
## pct_white 0.00000000
## pct_black 0.00000000
## pct_asian 0.00000000
## pct_other_race 0.00000000
## pct_married_households 0.00000000
##getting rid of variables with missing values more than 10%
cancer_reg = cancer_reg %>% select(-pct_some_col18_24, -pct_private_coverage_alone, -binned_inc, -median_age)
##removed binned_inc since we already have median income and median age since it is avg of median age female and male. so we'll build model with those those factors instead.
##percentage missing for pct_employed16_over is ~5%, checking to see if its correlated with the outcome
reg = lm(target_death_rate~pct_employed16_over, data = cancer_reg) %>%
summary()
##Since the p-value is small, we will retain pct_employed16_over
cancer_reg = cancer_reg %>% select(-county, -state) %>%
mutate(mortality = avg_deaths_per_year/pop_est2015, prevalence = avg_ann_count/pop_est2015) %>%
select(-pop_est2015, -avg_ann_count, -avg_deaths_per_year) %>%
mutate(study_per_cap =
as.factor(ifelse(study_per_cap == 0, "none",
ifelse(study_per_cap < quantile(study_per_cap, .25), "low",
ifelse(study_per_cap < quantile(study_per_cap, .5), "medium" ,
ifelse(study_per_cap < quantile(study_per_cap, .75), "high", "very high")))))) %>%
mutate(pct_non_white = pct_black+ pct_asian + pct_other_race) %>%
select(-pct_black, -pct_asian, -pct_other_race)
##since the number of white people are a lot higher, putting the other minorities under a single variable.
##Since count itself can be misleading, taking proportion will give us a better model. added mortality and prevalence instead of avg count and avg death
##removed state and county variables since we're building a predictive model, area doesn't really matter.
##changed study_per_cap to factor variable
hist(cancer_reg$target_death_rate) #outcome is normally distributed
hist(cancer_reg$pct_private_coverage)
hist(cancer_reg$pct_public_coverage)
hist(cancer_reg$pct_emp_priv_coverage)
hist(cancer_reg$pct_public_coverage_alone)
hist(cancer_reg$incidence_rate) ##right skewed
hist(cancer_reg$med_income) #somewhat right skewed-mostly ok
hist(cancer_reg$poverty_percent)
hist(cancer_reg$median_age_male)
hist(cancer_reg$median_age_female)
hist(cancer_reg$avg_household_size) ##left skewed
hist(cancer_reg$percent_married)
hist(cancer_reg$pct_no_hs18_24) #somehwat right
hist(cancer_reg$pct_hs18_24)
hist(cancer_reg$pct_bach_deg18_24)#right skewed
hist(cancer_reg$pct_hs25_over)
hist(cancer_reg$pct_bach_deg25_over)
hist(cancer_reg$pct_employed16_over)
hist(cancer_reg$pct_unemployed16_over)
hist(cancer_reg$pct_white) #left skewed
hist(cancer_reg$pct_non_white) #right skewed
hist(cancer_reg$pct_married_households)
hist(cancer_reg$birth_rate)
hist(cancer_reg$mortality)
hist(cancer_reg$prevalence) #right skewed
##they are all almost normally distributed
cont_var = dplyr::select(cancer_reg, target_death_rate, everything(), -c(study_per_cap))
knitr::kable(summary(cont_var), caption = "descriptive statistics for continuous variables")
target_death_rate | incidence_rate | med_income | poverty_percent | median_age_male | median_age_female | avg_household_size | percent_married | pct_no_hs18_24 | pct_hs18_24 | pct_bach_deg18_24 | pct_hs25_over | pct_bach_deg25_over | pct_employed16_over | pct_unemployed16_over | pct_private_coverage | pct_emp_priv_coverage | pct_public_coverage | pct_public_coverage_alone | pct_white | pct_married_households | birth_rate | mortality | prevalence | pct_non_white | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Min. : 59.7 | Min. : 201.3 | Min. : 22640 | Min. : 3.20 | Min. :22.40 | Min. :22.30 | Min. :0.0221 | Min. :23.10 | Min. : 0.00 | Min. : 0.0 | Min. : 0.000 | Min. : 7.50 | Min. : 2.50 | Min. :17.60 | Min. : 0.400 | Min. :22.30 | Min. :13.5 | Min. :11.20 | Min. : 2.60 | Min. : 10.20 | Min. :22.99 | Min. : 0.000 | Min. :0.000485 | Min. :0.0009281 | Min. : 0.000 | |
1st Qu.:161.2 | 1st Qu.: 420.3 | 1st Qu.: 38882 | 1st Qu.:12.15 | 1st Qu.:36.35 | 1st Qu.:39.10 | 1st Qu.:2.3700 | 1st Qu.:47.75 | 1st Qu.:12.80 | 1st Qu.:29.2 | 1st Qu.: 3.100 | 1st Qu.:30.40 | 1st Qu.: 9.40 | 1st Qu.:48.60 | 1st Qu.: 5.500 | 1st Qu.:57.20 | 1st Qu.:34.5 | 1st Qu.:30.90 | 1st Qu.:14.85 | 1st Qu.: 77.30 | 1st Qu.:47.76 | 1st Qu.: 4.521 | 1st Qu.:0.001888 | 1st Qu.:0.0048022 | 1st Qu.: 1.964 | |
Median :178.1 | Median : 453.5 | Median : 45207 | Median :15.90 | Median :39.60 | Median :42.40 | Median :2.5000 | Median :52.40 | Median :17.10 | Median :34.7 | Median : 5.400 | Median :35.30 | Median :12.30 | Median :54.50 | Median : 7.600 | Median :65.10 | Median :41.1 | Median :36.30 | Median :18.80 | Median : 90.06 | Median :51.67 | Median : 5.381 | Median :0.002290 | Median :0.0056236 | Median : 5.569 | |
Mean :178.7 | Mean : 448.3 | Mean : 47063 | Mean :16.88 | Mean :39.57 | Mean :42.15 | Mean :2.4797 | Mean :51.77 | Mean :18.22 | Mean :35.0 | Mean : 6.158 | Mean :34.80 | Mean :13.28 | Mean :54.15 | Mean : 7.852 | Mean :64.35 | Mean :41.2 | Mean :36.25 | Mean :19.24 | Mean : 83.65 | Mean :51.24 | Mean : 5.640 | Mean :0.002287 | Mean :0.0232443 | Mean :12.345 | |
3rd Qu.:195.2 | 3rd Qu.: 480.9 | 3rd Qu.: 52492 | 3rd Qu.:20.40 | 3rd Qu.:42.50 | 3rd Qu.:45.30 | 3rd Qu.:2.6300 | 3rd Qu.:56.40 | 3rd Qu.:22.70 | 3rd Qu.:40.7 | 3rd Qu.: 8.200 | 3rd Qu.:39.65 | 3rd Qu.:16.10 | 3rd Qu.:60.30 | 3rd Qu.: 9.700 | 3rd Qu.:72.10 | 3rd Qu.:47.7 | 3rd Qu.:41.55 | 3rd Qu.:23.10 | 3rd Qu.: 95.45 | 3rd Qu.:55.40 | 3rd Qu.: 6.494 | 3rd Qu.:0.002681 | 3rd Qu.:0.0064874 | 3rd Qu.:16.974 | |
Max. :362.8 | Max. :1206.9 | Max. :125635 | Max. :47.40 | Max. :64.70 | Max. :65.70 | Max. :3.9700 | Max. :72.50 | Max. :64.10 | Max. :72.5 | Max. :51.800 | Max. :54.80 | Max. :42.20 | Max. :80.10 | Max. :29.400 | Max. :92.30 | Max. :70.7 | Max. :65.10 | Max. :46.60 | Max. :100.00 | Max. :78.08 | Max. :21.326 | Max. :0.005136 | Max. :2.3675123 | Max. :86.066 | |
NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA's :152 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
cancer_reg %>%
group_by(study_per_cap) %>%
count() %>%
ungroup() %>%
mutate(prop = n / sum(n)) %>%
knitr::kable(digits = 2, caption = "Descriptive Statistics for clinical trial")
study_per_cap | n | prop |
---|---|---|
high | 354 | 0.12 |
none | 1931 | 0.63 |
very high | 762 | 0.25 |
(not considering interaction)
multi.fit = lm(target_death_rate ~ ., data = cancer_reg)
summary(multi.fit)
##
## Call:
## lm(formula = target_death_rate ~ ., data = cancer_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -82.182 -7.532 0.058 7.141 84.257
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.032e+02 1.058e+01 19.206 < 2e-16 ***
## incidence_rate 8.692e-02 5.297e-03 16.410 < 2e-16 ***
## med_income 2.619e-04 5.343e-05 4.902 1.00e-06 ***
## poverty_percent -3.129e-03 1.075e-01 -0.029 0.976775
## study_per_capnone -9.543e-01 8.107e-01 -1.177 0.239270
## study_per_capvery high -1.389e+00 8.532e-01 -1.628 0.103542
## median_age_male -6.012e-01 1.410e-01 -4.263 2.08e-05 ***
## median_age_female -2.264e+00 1.499e-01 -15.100 < 2e-16 ***
## avg_household_size -1.197e-02 6.501e-01 -0.018 0.985307
## percent_married 1.174e-02 1.177e-01 0.100 0.920606
## pct_no_hs18_24 -1.113e-03 3.771e-02 -0.030 0.976454
## pct_hs18_24 2.967e-01 3.324e-02 8.927 < 2e-16 ***
## pct_bach_deg18_24 3.625e-02 7.233e-02 0.501 0.616350
## pct_hs25_over 8.539e-02 6.429e-02 1.328 0.184219
## pct_bach_deg25_over -3.547e-01 1.052e-01 -3.373 0.000753 ***
## pct_employed16_over -6.641e-01 7.320e-02 -9.072 < 2e-16 ***
## pct_unemployed16_over 7.958e-01 1.121e-01 7.099 1.58e-12 ***
## pct_private_coverage -2.716e-01 8.736e-02 -3.109 0.001896 **
## pct_emp_priv_coverage 4.285e-01 7.042e-02 6.085 1.32e-09 ***
## pct_public_coverage -1.964e+00 1.544e-01 -12.720 < 2e-16 ***
## pct_public_coverage_alone 1.802e+00 1.904e-01 9.464 < 2e-16 ***
## pct_white -1.341e-01 3.874e-02 -3.460 0.000548 ***
## pct_married_households 5.822e-02 1.134e-01 0.513 0.607691
## birth_rate -5.153e-01 1.292e-01 -3.989 6.81e-05 ***
## mortality 4.949e+04 7.916e+02 62.524 < 2e-16 ***
## prevalence -1.699e+01 2.422e+00 -7.013 2.89e-12 ***
## pct_non_white -4.812e-02 3.746e-02 -1.284 0.199120
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.71 on 2868 degrees of freedom
## (152 observations deleted due to missingness)
## Multiple R-squared: 0.7877, Adjusted R-squared: 0.7857
## F-statistic: 409.2 on 26 and 2868 DF, p-value: < 2.2e-16
cancer_subset = cancer_reg %>%
select(target_death_rate, incidence_rate, med_income, median_age_male, median_age_female, pct_hs18_24, pct_bach_deg25_over, pct_employed16_over, pct_unemployed16_over, pct_public_coverage_alone, birth_rate, mortality, prevalence, pct_private_coverage, pct_emp_priv_coverage, pct_public_coverage )
reg1 = lm(target_death_rate~pct_private_coverage, data = cancer_reg) %>%
summary()
reg2 = lm(target_death_rate~pct_emp_priv_coverage, data = cancer_reg) %>%
summary()
reg3 = lm(target_death_rate~pct_public_coverage, data = cancer_reg) %>%
summary()
reg4 = lm(target_death_rate~pct_public_coverage_alone , data = cancer_reg) %>%
summary()
##I'd pick public_coverage alone since it has max r^2
plot(cancer_reg$pct_private_coverage, cancer_reg$target_death_rate)
abline(reg1,lwd = 2,col = 2)
## Warning in abline(reg1, lwd = 2, col = 2): only using the first two of 8
## regression coefficients
plot(cancer_reg$pct_emp_priv_coverage, cancer_reg$target_death_rate)
abline(reg2,lwd = 2,col = 2)
## Warning in abline(reg2, lwd = 2, col = 2): only using the first two of 8
## regression coefficients
plot(cancer_reg$pct_public_coverage, cancer_reg$target_death_rate)
abline(reg3,lwd = 2,col = 2)
## Warning in abline(reg3, lwd = 2, col = 2): only using the first two of 8
## regression coefficients
plot(cancer_reg$pct_public_coverage_alone, cancer_reg$target_death_rate)
abline(reg4,lwd = 2,col = 2)
## Warning in abline(reg4, lwd = 2, col = 2): only using the first two of 8
## regression coefficients