#' ---
#' title: "Linear Modeling with R"
#' author: "Clay Ford, UVA Library StatLab"
#' date: "Spring 2022"
#' ---
#'
# To submit a line of code, place your cursor in the line and hit Ctrl + Enter
# (Win/Linux) or Cmd + Enter (Mac)
# Load some packages we'll use today
library(tidyverse)
library(ggeffects)
library(car)
library(splines)
library(stargazer)
# linear modeling with simulated data -------------------------------------
# Let's begin by simulating some fake data
x <- 1:25
y <- 10 + 5*x
plot(x, y)
# 10 is the intercept, 5 is the slope
# y is completely determined by x
# Let's add some "noise" to our data by adding random draws from a Normal
# distribution with mean = 0 and a standard deviation = 10.
# set.seed(1) ensures we all get the same "random" data
set.seed(1)
noise <- rnorm(n = 25, mean = 0, sd = 10)
# Add the noise to 10 + 5*x and re-draw plot
y <- 10 + 5*x + noise
plot(x, y)
# This data is the combination of two parts:
# 1. 10 + 5*x
# 2. rnorm(n = 25, mean = 0, sd = 10)
# What if we were given this data and told to determine the process that
# generated it? In other words, fill in the blanks:
# 1. ___________
# 2. rnorm(n = 25, mean = 0, sd = ____)
# That's basically what linear modeling is.
# Traditional linear modeling assumes the following (among others):
# 1) the formula is a weighted sum of predictors (eg, 10 + 5*x)
# 2) the noise is a random draw from a Normal distribution with mean = 0
# 3) the standard deviation of the Normal distribution is constant
# Linear modeling tries to recover the weights in the first assumption (10 and
# 5) and the standard deviation in the 3rd assumption (10).
# Let's attempt to recover the data generating process. For this we use the lm()
# function. We have to specify the formula for the first assumption. The 2nd and
# 3rd assumptions are built into lm().
# "y ~ x" means we think Part 1 of the model is "y = intercept + slope*x". This
# tells lm() to take our data and find the best intercept and slope. Notice this
# is the correct model!
mod <- lm(y ~ x)
summary(mod)
# The model returns the following estimates:
# intercept = 11.135
# slope = 5.042
# sd = 9.7 (Residual standard error)
# We could use those estimates to generate data and see if they look similar to
# our original data.
y2 <- 11.135 + 5.042*x + rnorm(25, 0, 9.7)
plot(x, y)
points(x, y2, col = "red")
# We could also add the original and fitted lines
# abline() plots a line with intercept "a" and slope "b"
abline(a = 10, b = 5) # original
abline(mod, col = "red") # fitted
# We could also compare distributions using density curves (smooth version of
# histograms)
plot(density(y))
lines(density(y2), col = "red")
# In real life, we DO NOT KNOW the formula of weighted sums, or even if a
# formula of weighted sums is appropriate. We also don't know if the Normality
# assumption or constant variance assumption of the noise is plausible.
# Another example:
set.seed(15)
# random sample of "m" and "f"
gender <- sample(c("m", "f"), size = 200, replace = TRUE)
# uniform random numbers between 1 and 20
score <- runif(n = 200, min = 1, max = 20)
# the noise: random draws from a N(0,5) distribution
noise <- rnorm(n = 200, mean = 0, sd = 5)
# Now simulate y
# The third predictor is an "interaction";
# The "effect" of score depends on gender
y <- -1 + -3*(gender=="m") + 2*score + 3*(gender=="m")*score + noise
dat <- data.frame(y, gender, score)
# scatter plot of y versus x, colored by gender
ggplot(dat, aes(x = score, y = y, color = gender)) +
geom_point()
# distribution of y; notice it's not Normal. That's OK. The normality assumption
# is for the noise/error.
ggplot(dat, aes(x = y, y = stat(density))) +
geom_histogram(bins = 20) +
geom_density()
# Let's try to recover the "true" values in the formula and the SD of the noise.
# To fit an interaction, use a colon
mod2 <- lm(y ~ gender + score + gender:score, data = dat)
# or more succinctly
mod2 <- lm(y ~ gender * score, data = dat)
# We come pretty close to recovering the true values...
summary(mod2)
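# Recall the normality assumption is about the noise, not y. A quick informal
# check (a sketch, not a formal test): the residuals should look roughly
# Normal even though y itself does not.
hist(residuals(mod2), breaks = 20)
qqnorm(residuals(mod2)); qqline(residuals(mod2))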
# Let's simulate data from our model and see how it compares to the original
# data. For this we can use the simulate() function
sim_y <- simulate(mod2)
# plot both y and sim_y and compare
ggplot(dat, aes(x = y)) +
geom_density() +
geom_density(aes(sim_1), sim_y, color = "red")
# Or the base R way
plot(density(dat$y))
lines(density(sim_y$sim_1), col = "red")
# By specifying geom_smooth(method = "lm") in the ggplot2 call we can add the
# fitted lines.
ggplot(dat, aes(x = score, y = y, color = gender)) +
geom_point() +
geom_abline(intercept = -1, slope = 2) + # true line for "f"
geom_abline(intercept = -4, slope = 5) + # true line for "m"
geom_smooth(method = "lm") # model predicted lines
# Probably better to use the model to create the plot. These are usually
# referred to as "effect plots". Here we use the ggpredict() function from the
# ggeffects package.
eff <- ggpredict(mod2, terms = c("score", "gender"))
eff
plot(eff, add.data = TRUE)
# Let's fit a "wrong" model, no interaction.
# "y ~ gender + score" means "y = intercept + b1*gender + b2*score"
# The effect of score is the same for both males and females
mod3 <- lm(y ~ gender + score, data = dat)
summary(mod3)
# Compare simulated data to observed data; not good!
sim_y <- simulate(mod3)
plot(density(dat$y))
lines(density(sim_y$sim_1), col = "red")
# Plot the fitted model
eff <- ggpredict(mod3, terms = c("score", "gender"))
plot(eff, add.data = TRUE)
# Hence this is linear modeling:
# 1) propose and fit model(s)
# 2) determine if the model is good
# 3) use the model to explain relationships or make predictions
# YOUR TURN #1 ------------------------------------------------------------
# submit the following code to simulate some data
set.seed(2)
x1 <- sample(1:5, size = 1000, replace = TRUE,
prob = c(0.1,0.2,0.3,0.3,0.1))
x2 <- rnorm(n = 1000, mean = 12, sd = 2)
noise <- rnorm(n = 1000, mean = 0, sd = 4)
y <- 5 + 10*x1 + -4*x2 + noise
df <- data.frame(y, x1, x2)
# Use lm() in an attempt to recover the "true" values.
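# One possible solution (not the only one); the true values above were 5, 10,
# -4, and a noise SD of 4:
summary(lm(y ~ x1 + x2, data = df))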
# linear modeling with real data ------------------------------------------
# Real estate sales (from Applied Linear Statistical Models, 5th ed)
# We are interested in predicting home sales prices as a function of various
# characteristics.
sales <- read.csv("https://github.com/clayford/LinearModelingR/raw/master/real_estate_sales.csv")
# price - sale price in dollars (our response/dependent variable)
# finsqft - finished square feet
# bedrooms - number of bedrooms
# bathrooms - number of bathrooms
# ac - presence or absence of air conditioning
# garagesize - number of cars garage can hold
# pool - presence or absence of pool
# quality - quality of construction: low, medium, high
# lotsize - lot size in square feet
# highway - adjacent to highway? yes or no
# glance at data
str(sales)
# Make categorical variables Factors;
# mutate_if: if variable is character, make it a factor
sales <- sales %>%
mutate_if(is.character, factor)
summary(sales)
# dependent variable
hist(sales$price)
# fit a linear model using finsqft, bedrooms and lotsize
# the plus (+) sign means "include" in model
sales_mod <- lm(price ~ finsqft + bedrooms + lotsize, data = sales)
summary(sales_mod)
# The coef() function extracts the coefficients (or weights)
coef(sales_mod)
# Some naive interpretation
# - each additional finished square foot adds about $163 to price
# - each additional bedroom drops the price by $9,400 (?)
# - each additional lot size square foot adds about $1 to price
# Simulate data from model and compare to observed price
sim_price <- simulate(sales_mod, nsim = 50)
# base R plot
plot(density(sales$price))
for(i in 1:50)lines(density(sim_price[[i]]), col = "grey80")
# tidyverse method
ggplot(sales, aes(x = price)) +
geom_density() +
geom_density(aes(x = price, group = sim),
pivot_longer(sim_price, everything(),
names_to = "sim",
values_to = "price"),
color = "lightblue")
# Recall our main assumptions:
# 1) price is a weighted sum of the predictors:
#    price = Intercept + b1*finsqft + b2*bedrooms + b3*lotsize
# 2) noise is from N(0, SD)
# 3) the SD is constant
# R provides some basic diagnostic plots to assess 2 and 3
plot(sales_mod)
## How to interpret plots
# 1. Residuals vs Fitted: should have a horizontal line with uniform and
# symmetric scatter of points; if not, evidence that SD is not constant
# 2. Normal Q-Q: points should lie close to diagonal line; if not, evidence that
# noise is not drawn from N(0, SD)
# 3. Scale-Location: should have a horizontal line with uniform scatter of
# points (similar to #1, but easier to detect trends in dispersion)
# 4. Residuals vs Leverage: points outside the contour lines are influential
# observations
# Plots 1 and 3 cause concern. Plot 2 less so.
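# To inspect one diagnostic at a time, use the "which" argument of plot();
# for example, just the Residuals vs Fitted plot:
plot(sales_mod, which = 1)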
# How to address concerns?
# Non-constant SD can be evidence of a misspecified model or a very skewed
# response. Notice that our response is quite skewed:
hist(sales$price)
# We could try transforming price to a different scale. A common transformation
# is a log transformation. This doesn't look a whole lot better...
hist(log(sales$price))
# Let's try modeling log-transformed price
sales_mod2 <- lm(log(price) ~ finsqft + bedrooms + lotsize, data = sales)
summary(sales_mod2)
plot(sales_mod2)
# The diagnostic plots look better. But is this a "good model"? Is our proposed
# model of weighted sums good? Let's simulate data and compare to observed data.
sim_price <- simulate(sales_mod2, nsim = 50)
plot(density(log(sales$price)))
for(i in 1:50)lines(density(sim_price[[i]]), lty = 2, col = "grey80")
# Let's say we're happy with this model. How to interpret?
coef(sales_mod2)
# The response is log transformed, so interpretation of coefficients changes. We
# first need to exponentiate and then interpret as multiplicative instead of
# additive effect.
exp(coef(sales_mod2))
# Some naive interpretation
# - each additional finished square foot increases price by 0.04%
# - each additional bedroom increases price by 0.7%
# - each additional square foot of lot size increases price by a tiny percentage
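# Equivalently, since a coefficient b on the log scale corresponds to a
# 100*(exp(b) - 1) percent change in price, we can compute those percentages
# directly:
round(100 * (exp(coef(sales_mod2)) - 1), 3)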
# All of the p-values refer to hypothesis tests that the coefficients are 0.
# Many statisticians and researchers don't like these tests and prefer to look
# at confidence intervals.
exp(confint(sales_mod2)) %>% round(5)
# YOUR TURN #2 ------------------------------------------------------------
# Add bathrooms and garage size to the 2nd model we fit:
# (1) sales_mod <- lm(log(price) ~ finsqft + bedrooms + lotsize, data = sales)
# (2) check the diagnostic plots
# (3) What does the garagesize coefficient say?
# (4) Challenge: simulate data from the model and compare to the observed price
# Categorical predictors --------------------------------------------------
# Let's include quality in our model for price
# Recall quality is categorical predictor
summary(sales$quality)
sales_mod3 <- lm(log(price) ~ finsqft + bedrooms + lotsize +
bathrooms + garagesize + quality,
data = sales)
# Notice there are two coefficients for quality
summary(sales_mod3)
# In general if you have k levels of a categorical variable, you have k - 1
# coefficients in a model.
# By default, R sets up categorical variables as "treatment contrasts", which
# means the coefficients represent difference from some baseline or reference
# level.
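# To see the 0/1 coding R uses behind the scenes, inspect the factor's
# contrast matrix. The reference level is the row of all zeros.
contrasts(sales$quality)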
# The reference level for quality is "high". Recall we need to use exp()
coef(sales_mod3) %>% exp() %>% round(2)
# qualitylow = 0.61
# a low quality home price is expected to be about 1 - 0.61 = 0.39, or 39% lower
# than a high quality house, holding all other variables equal.
# qualitymedium = 0.71
# a medium quality home price is expected to be about 1 - 0.71 = 0.29, or 29%
# lower than a high quality house, holding all other variables equal.
# Use the relevel() function to set a new reference level
# Let's set the reference level to "low"
sales$quality <- relevel(sales$quality, "low")
sales_mod4 <- lm(log(price) ~ finsqft + bedrooms + lotsize +
bathrooms + garagesize + quality,
data = sales)
summary(sales_mod4)
# Now the coefficients for quality are relative to the "low" level
coef(sales_mod4) %>% exp() %>% round(3)
# High quality homes are expected to be about 64% higher in price than low
# quality homes. Medium homes are expected to be about 17% higher in price than
# low quality homes.
# YOUR TURN #3 ------------------------------------------------------------
# Add highway to the following model and fit it.
# (1) lm(log(price) ~ finsqft + bedrooms + lotsize + bathrooms + garagesize + quality,
# data = sales)
# (2) What is the interpretation of highway? How does it relate to the expected price?
# Interactions and Effect Plots -------------------------------------------
# Often it isn't reasonable to expect the effect of one variable to be the same
# regardless of other variables. For example, the effect of finsqft may depend
# on number of bedrooms. It may not make sense to think of the effect of adding
# more bedrooms while holding finsqft constant.
# Use the colon to specify interactions on an individual basis.
# Example: finsqft:bedroom
# Does the effect of finsqft depend on whether or not you're near a highway?
sales_mod5 <- lm(log(price) ~ finsqft + bedrooms + lotsize +
bathrooms + garagesize +
quality + highway + finsqft:bedrooms, data = sales)
summary(sales_mod5)
# The interaction looks "significant" but small. What does it mean?
# This is where effect plots can help. Here we use ggpredict() from ggeffects.
plot(ggpredict(sales_mod5, terms = c("finsqft", "bedrooms")))
# Too many colors! We can set our own values.
# add [1, 3, 5] next to bedrooms in the quotes
plot(ggpredict(sales_mod5, terms = c("finsqft", "bedrooms[1, 3, 5]")))
# In the previous plot, the lines were curved reflecting the non-linear log
# transformation of the response. Setting log.y = TRUE will transform the y-axis
# to the log scale and hence make the lines straight.
plot(ggpredict(sales_mod5, terms = c("finsqft", "bedrooms[1, 3, 5]")),
log.y = TRUE)
# Format the labels to show dollar amounts
plot(ggpredict(sales_mod5, terms = c("finsqft", "bedrooms[1, 3, 5]")),
log.y = TRUE,
labels = scales::dollar)
# The effect of finsqft decreases as the number of bedrooms increases.
# Switch the order of the terms to put bedrooms on the x-axis
plot(ggpredict(sales_mod5, terms = c("bedrooms", "finsqft")))
# again we can set the values of finsqft
p <- plot(ggpredict(sales_mod5,
terms = c("bedrooms", "finsqft[1500, 2000, 2500]")))
p
# At 2500 finished square feet, the number of bedrooms doesn't seem to matter much.
# The ggpredict() result can be saved and used as a data frame with ggplot
eff_out <- ggpredict(sales_mod5,
terms = c("bedrooms", "finsqft[1500, 2000, 2500]"))
names(eff_out)
eff_out <- eff_out %>%
rename(`Fin Sq Ft`=group)
ggplot(eff_out, aes(x = x, y = predicted, color = `Fin Sq Ft`)) +
geom_line() +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high, fill = `Fin Sq Ft`),
alpha = 1/5)
# When should you include interactions? What kind? How many? That requires some
# thought and expertise in the subject.
# YOUR TURN #4 ------------------------------------------------------------
# Add an interaction for lotsize and quality to the following model:
# (1) lm(log(price) ~ finsqft + bedrooms + lotsize + bathrooms + garagesize +
# quality + highway, data = sales)
# (2) Try creating an effect plot; use the code above as a template.
# Non-linear effects ------------------------------------------------------
# Often the simple assumption of a linear effect of a predictor is unrealistic.
# Fortunately there are ways to fit non-linear effects in a linear model.
# Let's demonstrate with some simulated data
# polynomial of degree 2
x <- seq(from = -10, to = 10, length.out = 100)
set.seed(3)
y <- 1.2 + 2*x + 0.9*x^2 + rnorm(100, sd = 10)
nl_dat <- data.frame(y, x)
plot(x, y)
# Fit a polynomial with the poly() function
mod4 <- lm(y ~ poly(x, 2), data = nl_dat)
summary(mod4)
# Notice the coefficients are quite large. That's because poly() generates
# orthogonal polynomials by default. That makes for numerical stability.
# Interpretation of coefficients isn't really worth the effort.
# We could fit the raw polynomials as follows and sort of "recover" the true
# values.
mod5 <- lm(y ~ poly(x, 2, raw = TRUE), data = nl_dat)
summary(mod5)
# Use plot() and ggpredict() to plot fitted model with data
plot(ggpredict(mod4), add.data = TRUE)
plot(ggpredict(mod5), add.data = TRUE)
# The modern approach to fitting non-linear effects is to use splines instead of
# polynomials.
# Instead of poly() we can use the ns() function from the splines package. ns
# stands for natural splines. The second argument is the degrees of freedom. It
# may help to think of that as the number of times the smooth line changes
# directions.
mod6 <- lm(y ~ ns(x, df = 2), data = nl_dat)
summary(mod6)
plot(ggpredict(mod6), add.data = TRUE)
# natural splines essentially allow us to fit a series of cubic polynomials
# connected at knots located in the range of our data.
# The ns() function transforms our data similar to a polynomial transformation.
# simple example data
z <- 1:10
# classic polynomial transformation
p2_out <- poly(z, 2, raw = TRUE)
p2_out
matplot(poly(z, 2, raw = TRUE))
cor(p2_out[,1], p2_out[,2])
# orthogonal, or uncorrelated, transformation
p2_out <- poly(z, 2)
p2_out
matplot(poly(z, 2))
cor(p2_out[,1], p2_out[,2])
# natural spline transformation
ns_out <- ns(z, 2)
ns_out
matplot(ns(z, 2))
cor(ns_out[,1], ns_out[,2])
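# The knot locations that ns() chose are stored as attributes of the result:
attr(ns_out, "knots")
attr(ns_out, "Boundary.knots")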
# Frank Harrell states in his book Regression Modeling Strategies that 3 to 5 DF is
# almost always sufficient. His basic advice is to allocate more DF to
# variables you think are more important.
# How can we assess whether we need a non-linear effect? Partial residual plots
# can help. Also called term plots and component-residual plots. We'll use the
# crPlots() function from the car package.
sales_mod6 <- lm(log(price) ~ finsqft + bedrooms + bathrooms +
garagesize + lotsize,
data = sales)
# Notice we specify the numeric variables just as we would in a model
crPlots(sales_mod6, terms = ~ finsqft + bedrooms +
bathrooms + garagesize + lotsize)
crPlots(sales_mod6, terms = ~ finsqft)
# Partial-residual plots show the relationship between a predictor and the
# response variable given that the other predictors are also in the model.
# The blue dashed line is the fitted slope. The purple line is a smooth trend
# line. A curving purple line indicates the predictor may warrant a non-linear
# effect.
# Let's fit a non-linear effect for finsqft using a natural spline with 3 DF.
sales_mod7 <- lm(log(price) ~ ns(finsqft, df = 3) + bedrooms + lotsize +
bathrooms + garagesize + quality + highway,
data = sales)
summary(sales_mod7)
# The coefficients are impossible to interpret. Effect plots are our only hope.
plot(ggpredict(sales_mod7, terms = "finsqft"))
# The partial-residual plot looks better
crPlots(sales_mod7, ~ns(finsqft, df = 3))
# YOUR TURN #5 ------------------------------------------------------------
# Fit a non-linear effect for bedrooms using a natural spline with 3 DF.
# (1) lm(log(price) ~ finsqft + bedrooms + lotsize + bathrooms + garagesize +
# quality + highway,
# data = sales)
# (2) generate an effect plot for the non-linear bedrooms effect
# (3) How does the crPlot for bedrooms look?
# Comparing models --------------------------------------------------------
# How do we determine if one model is better than another?
# One way is through hypothesis testing (partial F test).
# NULL: two models are equally "good"
# ALTERNATIVE: the more complex model is better
# Note: the smaller model must be a subset of the larger model!
# We can run this test with the base R anova() function
# Example
sales_mod1 <- lm(log(price) ~ finsqft + bedrooms + bathrooms, data = sales)
# same model with ac added
sales_mod2 <- lm(log(price) ~ finsqft + bedrooms + bathrooms + ac, data = sales)
# same model with ac and pool added
sales_mod3 <- lm(log(price) ~ finsqft + bedrooms + bathrooms +
ac + pool, data = sales)
# test whether each larger model improves on the previous one; sales_mod1 is
# nested in sales_mod2, which is nested in sales_mod3
anova(sales_mod1, sales_mod2, sales_mod3)
# It appears sales_mod2 is superior to sales_mod1. The low p-value says we
# reject the null; the bigger model appears to be better than the smaller model.
# However it does not appear that sales_mod3 is appreciably better than
# sales_mod2. The high p-value leads us to not reject the null of no difference
# between the models.
# AIC and BIC
# We can also use the AIC or BIC information criteria. These are not hypothesis
# tests. We simply see which has a lower value. These values estimate the
# out-of-sample accuracy if we were to use these models to make predictions on
# new data.
# Both indicate sales_mod2 is preferred to the other two models
AIC(sales_mod1, sales_mod2, sales_mod3)
BIC(sales_mod1, sales_mod2, sales_mod3)
# The nice thing about AIC/BIC is that we don't need to worry about whether the
# models contain a subset of predictors. They just need to have the same
# response variable.
# While lower is better, sometimes it's hard to know how much lower AIC/BIC
# needs to be. If a really complex model is only slightly lower than an easier
# to understand smaller model, the smaller model may be preferable.
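# One informal rule of thumb (a heuristic, not a test): compare each model's
# AIC to the minimum. Differences of about 2 or less are often considered
# negligible.
aic_tab <- AIC(sales_mod1, sales_mod2, sales_mod3)
aic_tab$AIC - min(aic_tab$AIC)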
# YOUR TURN #6 ------------------------------------------------------------
# Compare these two models using anova(), AIC() and BIC(). The second contains a
# complex interaction between finsqft and bedrooms.
home_mod1 <- lm(log(price) ~ ns(finsqft, 3) + ns(bedrooms, 3) + bathrooms +
ac + pool + quality, data = sales)
home_mod2 <- lm(log(price) ~ ns(finsqft, 3) * ns(bedrooms, 3) + bathrooms +
ac + pool + quality, data = sales)
# Using a linear model ----------------------------------------------------
# Once we have a linear model we may want to use it to make predictions for new
# data. We can do that with the predict() function. Note the new data needs to
# be in a data frame.
# Let's say we want to use this model
sales_mod7 <- lm(log(price) ~ ns(finsqft, df = 3) + bedrooms +
quality + lotsize + ns(bathrooms, df = 2) +
ns(garagesize, df = 2) + pool + ac +
ns(finsqft, df = 3):bedrooms +
ns(finsqft, df = 3):quality +
ns(finsqft, df = 3):ns(bathrooms, df = 2),
data = sales)
summary(sales_mod7)
plot(density(log(sales$price)))
sim_mod7 <- simulate(sales_mod7, nsim = 50)
for(i in 1:50)lines(density(sim_mod7[[i]]), col = "grey80")
# Predict mean home price with the following characteristics
newdata <- data.frame(finsqft = 2500,
bedrooms = 4,
quality = "medium",
lotsize = 20000,
bathrooms = 2,
garagesize = 2,
pool = "no",
ac = "yes")
predict(sales_mod7, newdata = newdata) %>% exp()
# with 95% confidence interval
predict(sales_mod7, newdata = newdata, interval = "confidence") %>% exp()
# Predict a SINGLE home price.
# set interval = "prediction"
# Notice how much wider the interval is
predict(sales_mod7, newdata = newdata, interval = "prediction") %>% exp()
# Predict mean home price for same measures but with lotsize ranging from 5,000
# to 85,000.
newdata <- data.frame(finsqft = 2500,
bedrooms = 4,
quality = "medium",
lotsize = seq(5000,85000,5000),
bathrooms = 2,
garagesize = 2,
pool = "no",
ac = "yes")
predict(sales_mod7, newdata = newdata, interval = "confidence") %>% exp()
# This is basically how effect plots are created.
pred_out <- predict(sales_mod7, newdata = newdata, interval = "confidence") %>%
exp() %>%
as.data.frame() %>%
mutate(lotsize = seq(5000, 85000, 5000))
# effect plot "by hand" with ggplot
ggplot(pred_out, aes(x = lotsize, y = fit)) +
geom_line() +
geom_ribbon(mapping = aes(ymin = lwr, ymax = upr), alpha = 1/5)
# same thing with ggpredict; use condition argument to set other predictors to
# specific values.
eff_out <- ggpredict(sales_mod7, terms = "lotsize",
condition = c(finsqft = 2500,
bedrooms = 4,
quality = "medium",
bathrooms = 2,
garagesize = 2,
pool = "no",
ac = "yes"))
plot(eff_out)
# YOUR TURN #7 ------------------------------------------------------------
# Modify the code below to produce an effect plot for quality with lotsize
# set to 20000.
eff_out <- ggpredict(sales_mod7, terms = "lotsize",
condition = c(finsqft = 2500,
bedrooms = 4,
quality = "medium",
bathrooms = 2,
garagesize = 2,
pool = "no",
ac = "yes"))
plot(eff_out)
# Communicating modeling results ------------------------------------------
# The stargazer package can produce regression tables for articles and
# presentations. It can output tables in html, text or LaTeX. Below we
# demonstrate with text.
# The basic usage is as follows
stargazer(sales_mod2, sales_mod3, type = "text", title = "Modeling Results")
# The style argument allows you to create tables according to styles preferred
# by various journals. See ?stargazer
# "ajps" = American Journal of Political Science
stargazer(sales_mod2, sales_mod3,
type = "text",
title = "Modeling Results",
style = "ajps")
# "asr" American Sociological Review
stargazer(sales_mod2, sales_mod3,
type = "text",
title = "Modeling Results",
style = "asr")
# We can set the variable labels
stargazer(sales_mod2, sales_mod3,
type = "text",
title = "Modeling Results",
style = "ajps",
covariate.labels = c("Finished Sq Ft", "Bedrooms",
"Lot Size", "Bathrooms",
"AC", "Pool", "Garage Size",
"Quality - Low", "Quality - Medium",
"Intercept"))
# We can omit certain model statistics
stargazer(sales_mod2, sales_mod3,
type = "text",
title = "Modeling Results",
style = "ajps",
covariate.labels = c("Finished Sq Ft", "Bedrooms",
"Lot Size", "Bathrooms",
"AC", "Pool", "Garage Size",
"Quality - Low", "Quality - Medium",
"Intercept"),
omit.stat = "f")