-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathR_history_05_21_ggplot2.R
469 lines (386 loc) · 20.4 KB
/
R_history_05_21_ggplot2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
## ggplot2 examples ----
# Data and walkthrough adapted from
# https://tutorials.iq.harvard.edu/R/Rgraphics/Rgraphics.html#exercise_i

# Only switch into the original author's tutorial folder when it exists on
# this machine. Otherwise the current working directory is kept, and it must
# then contain the dataSets/ folder that the read.csv() calls rely on.
tutorial_dir <- "C:\\Users\\borrillp\\Documents\\Rgraphics"
if (dir.exists(tutorial_dir)) {
  setwd(tutorial_dir)
}

## Let's look at housing prices.
housing <- read.csv("dataSets/landdata-states.csv")
head(housing[1:5])  # first rows, first five columns only

# Base R histogram of the home values.
hist(housing$Home.Value)

## ggplot2 version of the same histogram:
library(ggplot2)
ggplot(data = housing, aes(x = Home.Value)) +   # data + aesthetic mapping
  geom_histogram(col = "green", fill = "blue")  # fixed (non-mapped) colours
head(housing)
## Scatter plot for a subset of the data
# subset() is evaluated inline to keep only the rows for MA and TX. The base
# plot is stored once and re-used for the colour variations below.
ma_tx_points <- ggplot(
  subset(housing, State %in% c("MA", "TX")),
  aes(x = Date, y = Home.Value, color = State)  # one colour per state
) +
  geom_point()

# Default colours.
ma_tx_points

# Hand-picked colours: unnamed values are handed out in order, so the first
# state alphabetically (MA) becomes green.
ma_tx_points +
  scale_colour_manual(values = c("green", "black"))

# Naming the values pins each colour to a state: TX green, MA black.
# NB the state names used as names inside c() do not need quotes.
ma_tx_points +
  scale_colour_manual(values = c(TX = "green", MA = "black"))
## Log scales
# Keep only the rows whose Date equals 2001.25.
hp2001Q1 <- subset(housing, Date == 2001.25)
head(hp2001Q1)

# Plain scatter plot: land value on x, structure cost on y.
ggplot(hp2001Q1, aes(x = Land.Value, y = Structure.Cost)) +
  geom_point()

# Option 1: take the log of the x variable directly inside aes().
ggplot(hp2001Q1, aes(x = log(Land.Value), y = Structure.Cost)) +
  geom_point()

# Option 2: leave the data untouched and add a log10 x scale as a new layer.
ggplot(hp2001Q1, aes(x = Land.Value, y = Structure.Cost)) +
  geom_point() +
  scale_x_log10()
## Exercise 1
# Economist data set: human development index (HDI) and corruption
# perception index (CPI) from around the world.
dat <- read.csv("dataSets/EconomistData.csv")
head(dat)

# Scatter plot with CPI on the x axis and HDI on the y axis.
ggplot(dat, aes(x = CPI, y = HDI)) +
  geom_point()

# Colour every point blue (a fixed setting, so it stays outside aes()).
ggplot(dat, aes(x = CPI, y = HDI)) +
  geom_point(col = "blue")

# Map the colour of the points to Region. Region goes inside aes() because
# the colour now depends on a column of the data.
ggplot(dat, aes(x = CPI, y = HDI, col = Region)) +
  geom_point()

# Make the points bigger by setting size to 2.
ggplot(dat, aes(x = CPI, y = HDI, col = Region)) +
  geom_point(size = 2)

# Map the size of the points to HDI.Rank.
ggplot(dat, aes(x = CPI, y = HDI, size = HDI.Rank, col = Region)) +
  geom_point()
## Exercise 2
# CPI-vs-HDI scatter plot from the previous exercise, stored so that each
# smoother below can be layered on top of it.
hdi_scatter <- ggplot(dat, aes(x = CPI, y = HDI)) +
  geom_point()
hdi_scatter

# Overlay a smoothing line using geom_smooth() with its defaults.
hdi_scatter +
  geom_smooth()

# Same, but with a linear model for the predictions (see ?stat_smooth).
hdi_scatter +
  geom_smooth(method = "lm")

# Overlay a line with geom_line(). With stat = "identity" (its default) the
# line simply joins the raw data points, which is not what we want.
hdi_scatter +
  geom_line(stat = "identity")

# Switching the statistical transformation to "smooth" makes geom_line()
# draw a fitted curve instead of following the data points.
hdi_scatter +
  geom_line(stat = "smooth", method = "loess")

# BONUS: default loess smoother, but less smooth (see ?loess).
hdi_scatter +
  geom_smooth()           # default appearance, for comparison
hdi_scatter +
  geom_smooth(span = .4)  # a smaller span gives a less smooth curve
## Scales and themes
# Shared starting point for this section: home price index per state.
# Every plot below adds its own freshly built point layer to it.
hpi_by_state <- ggplot(housing, aes(x = State, y = Home.Price.Index))

# Basic graph, with the points coloured by Date.
hpi_by_state +
  geom_point(aes(color = Date))

# Many points lie directly on top of each other, so they are hard to see.
# First remedy: make the points transparent (and set their size).
hpi_by_state +
  geom_point(
    aes(color = Date),
    alpha = 0.5,  # transparency
    size = 1.5    # point size
  )

# Transparency helps a bit. Jittering additionally moves the points slightly
# apart: here only horizontally (height = 0).
hpi_by_state +
  geom_point(
    aes(color = Date),
    alpha = 0.5,
    size = 1.5,
    position = position_jitter(width = 0.25, height = 0)
  )

# theme() controls the non-data parts of the graph (surroundings/background).
# Here: move the legend to the top and shrink the axis text.
hpi_by_state +
  geom_point(
    aes(color = Date),
    alpha = 0.5,
    size = 1.5,
    position = position_jitter(width = 0.25, height = 0)
  ) +
  theme(
    legend.position = "top",
    axis.text = element_text(size = 6)
  )

# The colour scale can be edited as well. Date is continuous (not
# categorical), hence scale_color_continuous(): pick the breaks to show and
# how they are labelled. scale_x_discrete() supplies a new x axis title.
hpi_by_state +
  geom_point(
    aes(color = Date),
    alpha = 0.5,
    size = 1.5,
    position = position_jitter(width = 0.25, height = 0)
  ) +
  theme(
    legend.position = "top",
    axis.text = element_text(size = 6)
  ) +
  scale_color_continuous(
    name = "",                               # no legend title
    breaks = c(1976, 1994, 2013),            # breaks to display
    labels = c("'76", "'94", "'13")          # one label per break
  ) +
  scale_x_discrete(name = "State Abbreviation")
## Exercise 3
# Scatter plot of HDI against CPI, coloured by region. It is stored once and
# extended step by step below.
region_scatter <- ggplot(dat, aes(x = CPI, y = HDI, color = Region)) +
  geom_point()
region_scatter

# Modify the axis titles. Both axes are continuous, so the matching
# scale_x_continuous() / scale_y_continuous() functions are used.
region_scatter +
  scale_x_continuous(name = "Corruption perception index")

region_scatter +
  scale_x_continuous(name = "Corruption perception index") +
  scale_y_continuous(name = "Human development index")

# The legend can be re-titled too. The colour scale is discrete here, hence
# scale_color_discrete().
region_scatter +
  scale_x_continuous(name = "Corruption perception index") +
  scale_y_continuous(name = "Human development index") +
  scale_color_discrete(name = "Region of the world")

# Change the colours of the regions: choosing colours by hand requires
# scale_color_manual().
region_scatter +
  scale_x_continuous(name = "Corruption Perception Index") +
  scale_y_continuous(name = "Human Development Index") +
  scale_color_manual(
    name = "Region of the world",
    values = c("red", "pink", "blue", "green", "orange", "black")
  )

# Colours may also be given as html (hex) codes. http://colorbrewer2.org/ is
# a nice place to find colour schemes, including colour-blind friendly ones,
# which matters for publications.
region_scatter +
  scale_x_continuous(name = "Corruption Perception Index") +
  scale_y_continuous(name = "Human Development Index") +
  scale_color_manual(
    name = "Region of the world",
    values = c("#7fc97f", "#beaed4", "#fdc086",
               "#ffff99", "#386cb0", "#f0027f")
  )

# Specify which region gets which colour by naming the values. The names
# have to match the region names in the data exactly.
region_scatter +
  scale_x_continuous(name = "Corruption Perception Index") +
  scale_y_continuous(name = "Human Development Index") +
  scale_color_manual(
    name = "Region of the world",
    values = c(
      MENA = "red",
      SSA = "pink",
      "East EU Cemt Asia" = "blue",
      "Asia Pacific" = "green",
      Americas = "orange",
      "EU W. Europe" = "black"
    )
  )

# To avoid misspelling a region, print the names as characters and copy and
# paste them into the command above.
as.character(unique(dat$Region))

# facet_wrap() draws the graph separately for each of the six regions.
region_scatter +
  scale_x_continuous(name = "Corruption Perception Index") +
  scale_y_continuous(name = "Human Development Index") +
  scale_color_manual(
    name = "Region of the world",
    values = c(
      MENA = "red",
      SSA = "pink",
      "East EU Cemt Asia" = "blue",
      "Asia Pacific" = "green",
      Americas = "orange",
      "EU W. Europe" = "black"
    )
  ) +
  facet_wrap(~Region)
## Average Home.Value and Land.Value per date
head(housing)

# The mean per Date has to be calculated first.
# Base R version, using aggregate() with a formula:
housing.byyear.aggregate <- aggregate(
  cbind(Home.Value, Land.Value) ~ Date,
  data = housing,
  FUN = mean
)
head(housing.byyear.aggregate)

# dplyr version of the same calculation:
library(tidyr)
library(dplyr)
housing.byyear <- housing %>%
  group_by(Date) %>%                   # one group per Date
  summarise(
    Home.Value = mean(Home.Value),     # mean home value in the group
    Land.Value = mean(Land.Value)      # mean land value in the group
  )
head(housing.byyear)

# The table above is in wide format. ggplot needs long format: one column
# holding the value and another saying whether that value is a Home.Value or
# a Land.Value. tidyr's gather() does this conversion.
home.land.byyear <- gather(
  housing.byyear,
  key = "type",      # new column with the former column names
  value = "value",   # new column with the numbers
  Home.Value, Land.Value
)
head(home.land.byyear)
tail(home.land.byyear)
dim(housing.byyear)    # dimensions of the wide table
dim(home.land.byyear)  # dimensions of the long table

# With the long table the graph is one line per type.
ggplot(home.land.byyear, aes(x = Date, y = value, color = type)) +
  geom_line()

# Sorted by Date, the long table shows one Home.Value row and one Land.Value
# row for each Date.
head(home.land.byyear[order(home.land.byyear$Date), ])
## The challenging figure from The Economist
# Re-read the data so this part starts from the original table.
dat <- read.csv("dataSets/EconomistData.csv")

# Starting point: HDI against CPI, coloured by region (same as before).
ggplot(dat, aes(x = CPI, y = HDI)) +
  geom_point(aes(color = Region))

# Add a fitted line from a linear model with the formula y ~ x + log(x).
ggplot(dat, aes(x = CPI, y = HDI)) +
  geom_point(aes(color = Region)) +
  geom_smooth(
    mapping = aes(linetype = "r2"),  # gives the line its own legend entry
    method = "lm",                   # fit a linear model
    formula = y ~ x + log(x),        # model formula
    se = FALSE,                      # no confidence interval
    color = "red"                    # draw the line in red
  )

# Change the point shape: shape 1 is an open circle, size makes the points
# bigger and stroke thickens the outline of the circles.
ggplot(dat, aes(x = CPI, y = HDI)) +
  geom_point(aes(color = Region), shape = 1, size = 4, stroke = 1.25) +
  geom_smooth(
    mapping = aes(linetype = "r2"),
    method = "lm",
    formula = y ~ x + log(x),
    se = FALSE,
    color = "red"
  )
## Change the order of the regions
# Target order and labels for Region:
#   OECD, Americas, Asia & Oceania, Central & Eastern Europe,
#   Middle East & north Africa, Sub-Saharan Africa
# To get there, the levels of the column in the data set itself are changed.

# Current levels. The factor() wrapper matters: since R 4.0.0 read.csv()
# keeps strings as character vectors, and levels() of a character vector is
# NULL. factor() sorts the distinct values, so the default order shown here
# is alphabetical. If Region already is a factor, its levels are unchanged.
levels(factor(dat$Region))

# Re-create the factor with the levels listed in the order we want.
dat$Region <- factor(
  dat$Region,
  levels = c(
    "EU W. Europe",
    "Americas",
    "Asia Pacific",
    "East EU Cemt Asia",
    "MENA",
    "SSA"
  )
)
levels(dat$Region)  # the order has been updated

# Same plotting code as before. Because the levels of Region changed, the
# order of the legend entries changes with them.
ggplot(dat, aes(x = CPI, y = HDI)) +
  geom_point(aes(color = Region), shape = 1, size = 4, stroke = 1.25) +
  geom_smooth(
    mapping = aes(linetype = "r2"),
    method = "lm",
    formula = y ~ x + log(x),
    se = FALSE,
    color = "red"
  )
## Rename the regions
# factor() can rename levels as well, through its labels argument. The
# labels must be listed in the same order as the levels, otherwise the names
# get mixed up. "\n" inserts a line break into a label.
dat$Region <- factor(
  dat$Region,
  levels = c(
    "EU W. Europe",
    "Americas",
    "Asia Pacific",
    "East EU Cemt Asia",
    "MENA",
    "SSA"
  ),
  labels = c(
    "OECD",
    "Americas",
    "Asia &\nOceania",
    "Central &\nEastern Europe",
    "Middle East &\nnorth Africa",
    "Sub-Saharan\nAfrica"
  )
)
head(dat$Region)  # the names of Region have changed

# Same plot as above; the new region names show up because the data frame
# itself was edited. The legend is also moved to the top.
ggplot(dat, aes(x = CPI, y = HDI)) +
  geom_point(aes(color = Region), shape = 1, size = 4, stroke = 1.25) +
  geom_smooth(
    mapping = aes(linetype = "r2"),
    method = "lm",
    formula = y ~ x + log(x),
    se = FALSE,
    color = "red"
  ) +
  theme(legend.position = "top")
## Label selected points
# Labelling every single point produces a mess:
ggplot(dat, aes(x = CPI, y = HDI)) +
  geom_point(aes(color = Region), shape = 1, size = 4, stroke = 1.25) +
  geom_smooth(
    mapping = aes(linetype = "r2"),
    method = "lm",
    formula = y ~ x + log(x),
    se = FALSE,
    color = "red"
  ) +
  theme(legend.position = "top") +
  geom_text(
    aes(label = Country),  # label text comes from the Country column
    color = "gray20"       # draw the labels in dark grey
  )
# Countries to label on the plot (we can't show them all). The names are
# matched against the Country column with %in%, so a misspelled name is
# silently left unlabelled ("Spane" was a typo for "Spain").
pointsToLabel <- c(
  "Russia", "Venezuela", "Iraq", "Myanmar", "Sudan",
  "Afghanistan", "Congo", "Greece", "Argentina", "Brazil",
  "India", "Italy", "China", "South Africa", "Spain",
  "Botswana", "Cape Verde", "Bhutan", "Rwanda", "France",
  "United States", "Germany", "Britain", "Barbados", "Norway", "Japan",
  "New Zealand", "Singapore"
)
# Label only the selected countries: the text layer gets its own data,
# restricted to the rows whose Country is in pointsToLabel.
countries_to_label <- subset(dat, Country %in% pointsToLabel)
ggplot(dat, aes(x = CPI, y = HDI)) +
  geom_point(aes(color = Region), shape = 1, size = 4, stroke = 1.25) +
  geom_smooth(
    mapping = aes(linetype = "r2"),
    method = "lm",
    formula = y ~ x + log(x),
    se = FALSE,
    color = "red"
  ) +
  theme(legend.position = "top") +
  geom_text(
    aes(label = Country),
    color = "gray20",
    data = countries_to_label
  )
# The text labels still overlap a bit; the ggrepel package can fix this.
# Install it only when it is missing, so that re-running the script does not
# download and re-install the package every time.
if (!requireNamespace("ggrepel", quietly = TRUE)) {
  install.packages("ggrepel")
}
library(ggrepel)

# geom_text() is swapped for geom_text_repel(), which spaces the labels
# apart; force determines how far apart the labels should be.
ggplot(dat, aes(x = CPI, y = HDI)) +
  geom_point(aes(color = Region), shape = 1, size = 4, stroke = 1.25) +
  geom_smooth(
    mapping = aes(linetype = "r2"),
    method = "lm",
    formula = y ~ x + log(x),
    se = FALSE,
    color = "red"
  ) +
  theme(legend.position = "top") +
  geom_text_repel(
    aes(label = Country),
    color = "gray20",
    data = subset(dat, Country %in% pointsToLabel),
    force = 10
  )
## Split the graph into the 6 regions with facet_wrap()
# The labelled rows are selected up front; the plot itself is the same as
# before, minus the legend move, plus one facet per region.
labelled_rows <- subset(dat, Country %in% pointsToLabel)
ggplot(dat, aes(x = CPI, y = HDI)) +
  geom_point(aes(color = Region), shape = 1, size = 4, stroke = 1.25) +
  geom_smooth(
    mapping = aes(linetype = "r2"),
    method = "lm",
    formula = y ~ x + log(x),
    se = FALSE,
    color = "red"
  ) +
  geom_text_repel(
    aes(label = Country),
    color = "gray20",
    data = labelled_rows,
    force = 10
  ) +
  facet_wrap(~Region)  # one panel per region
## Change the theme (the overall appearance)
ggplot(dat, aes(x = CPI, y = HDI)) +
  geom_point(aes(color = Region), shape = 1, size = 4, stroke = 1.25) +
  geom_smooth(
    mapping = aes(linetype = "r2"),
    method = "lm",
    formula = y ~ x + log(x),
    se = FALSE,
    color = "red"
  ) +
  geom_text_repel(
    aes(label = Country),
    color = "gray20",
    data = subset(dat, Country %in% pointsToLabel),
    force = 10
  ) +
  facet_wrap(~Region) +
  # Overall minimal theme.
  theme_minimal() +
  # The legend has to be moved AFTER theme_minimal(); otherwise
  # theme_minimal() overrides it (by default it puts the legend at the right).
  theme(legend.position = "top") +
  # Individual theme elements are set with the matching element_*() helper:
  # the strip background is a rectangle, the strip text is text, and
  # element_blank() makes that part of the plot blank.
  theme(
    strip.background = element_rect(fill = "blue"),
    strip.text = element_text(color = "white", size = 10, face = "bold"),
    plot.background = element_blank()
  )