#' ---
#' title: "Data Wrangling in R: dplyr"
#' author: "Clay Ford"
#' date: "Spring 2016"
#' output: pdf_document
#' ---
load("../data/datasets_L08.Rda")
# This lecture introduces two packages: plyr and dplyr.
# plyr is a package that supplies functions for splitting data into groups,
# applying function to each group, and combining the results back together;
# known as the split-apply-combine strategy.
# original journal article: http://www.jstatsoft.org/v40/i01
# plyr tutorial: http://plyr.had.co.nz/09-user/
# We'll only talk briefly about plyr and move on to dplyr, a sort of evolution
# of plyr. dplyr is specifically for data frames and has a very powerful but
# easy to use syntax for manipulating data. It's also very fast when working
# with "large" data frames.
# plyr --------------------------------------------------------------------
# install.packages("plyr")
library(plyr)
# Example of split-apply-combine
# split
temps <- split(weather$Max.TemperatureF, weather$Events)
temps
# apply a function to each group
maxmeans <- sapply(temps, mean)
maxmeans
# combine results
data.frame(event=names(maxmeans), meanMaxTemp=maxmeans, row.names = NULL)
# Here's how you do the same with plyr
ddply(weather, "Events", summarize, meanMaxTemp=mean(Max.TemperatureF))
# Let's break that down:
# - dd in ddply means data frame in, data frame out
# - first argument: input data frame
# - second argument: grouping variable to split data frame by
# - third argument: function to apply to each group; summarize is a plyr function
# - fourth argument: argument(s) passed on to the function given as the third
#   argument
# Note we can do (mostly) the same with aggregate()
aggregate(Max.TemperatureF ~ Events, data=weather, mean)
# So why use plyr?
# ddply can also do things like this:
ddply(weather, "Events", summarize,
meanMaxTemp=mean(Max.TemperatureF),
medianMaxTemp=median(Max.TemperatureF),
sdMaxTemp=sd(Max.TemperatureF),
n=sum(!is.na(Max.TemperatureF)),
seMaxTemp=sdMaxTemp/sqrt(n))
# This is the same as we did before, but notice we calculated more than one
# summary. Also notice we used calculated summaries in the subsequent
# calculation of seMaxTemp. We cannot do that with aggregate().
# dplyr -------------------------------------------------------------------
# dplyr focuses only on data frames. It is faster than plyr and easier to use
# (in my opinion). In addition it comes with a fantastic Introductory vignette
# in the documentation.
# Let's detach plyr to prevent conflicts with dplyr. (Actually I think they may
# play well together now, but historically they haven't.) The unload=TRUE
# argument unloads the package from memory; Otherwise, R removes the package
# from the search path but doesn't unload it.
detach("package:plyr", unload=TRUE)
# now load dplyr
library(dplyr)
# We see that dplyr has functions with the same names as functions in the stats
# and base packages. The message "The following objects are masked..." means we
# have packages loaded with functions sharing the same name, and that when we
# use, say, the setdiff function, we'll be using the setdiff function in dplyr,
# not the setdiff function in the base package. To access the base setdiff
# function you need to specify base::setdiff().
# dplyr provides data manipulation verbs that work on a single data frame, a
# sort of grammar of data wrangling. The dplyr philosophy is to have small
# functions that each do one thing well. Some of the more commonly used verbs
# include:
# filter() - select a subset of the rows of a data frame
# slice() - select rows by position
# select() - select columns
# arrange() - reorder (sort) rows by columns
# rename() - rename variables (column headers)
# distinct()- return the unique values in a data frame
# mutate() - add new columns that are functions of existing columns
# transmute() - like mutate, but keeps only the new columns
# summarise() - summarize values
# top_n() - select the top n entries by value (rows are not reordered)
# sample_n() - randomly sample fixed number of rows of a data frame
# sample_frac() - randomly sample fixed fraction of rows of a data frame
# group_by() - how to break a dataset down into groups of rows
# ungroup() - removing the grouping created in the previous function
# For all these functions, the first argument is a data frame. The subsequent
# arguments describe what to do with it, and you can refer to columns in the
# data frame directly without using $. And they all return a new data frame.
# dplyr also provides the ability to chain functions using the magrittr
# forward-pipe operator: %>%. Use Ctrl + Shift + M to quickly enter.
# These functions and many others are documented in RStudio's data wrangling
# cheat sheet:
# http://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
# Let's give all these functions a spin!
# Going back to our weather data.
# mean of Max.TemperatureF by Event
weather %>%
group_by(Events) %>%
summarise(meanTemp = mean(Max.TemperatureF))
# mean of Mean.TemperatureF by Event, arranged in ascending order
weather %>%
group_by(Events) %>%
summarise(meanTemp = mean(Mean.TemperatureF)) %>%
arrange(meanTemp)
# mean of Mean.TemperatureF by Event, arranged in ascending order, for dates
# after May 31
weather %>%
filter(Date > "2013-05-31") %>%
group_by(Events) %>%
summarise(meanTemp = mean(Mean.TemperatureF)) %>%
arrange(meanTemp)
# Mean number of cosponsors per sponsor:
SenateBills %>%
group_by(sponsor) %>%
summarise(meanSponsors = mean(cosponsors))
# Notice output is truncated; whereas base R defaults to outputting everything,
# dplyr defaults to just a few rows.
# One way to see all rows: use as.data.frame()
SenateBills %>%
group_by(sponsor) %>%
summarise(meanSponsors = mean(cosponsors)) %>%
as.data.frame()
# Or use print with n argument to specify number of rows
SenateBills %>%
group_by(sponsor) %>%
summarise(meanSponsors = mean(cosponsors)) %>%
print(n=20)
# bills submitted by VA senators; just show bill number and sponsor
SenateBills %>%
filter(grepl("\\[VA\\]", sponsor)) %>%
select(bill, sponsor)
# Now why did that return all rows? No group_by() function! The group_by()
# function converts a data frame into a "tbl_df" class, which only prints a few
# rows when thrown to the console. More on this in a bit.
# Top 10 bills by number of cosponsors
SenateBills %>%
select(bill, sponsor, cosponsors) %>%
top_n(10, cosponsors)
# sample 20 rows from arrests data frame and just show ID, Sex, and Age
arrests %>%
sample_n(20) %>%
select(ID, Sex, Age)
# Everything we did above was just output to the console. To save our results,
# we need to use the assignment operator.
# add indicator to SenateBills that takes the value 1 if bill has any
# cosponsors, 0 otherwise, and update SenateBills.
SenateBills <- SenateBills %>%
mutate(cosponsorsI = as.numeric(cosponsors > 0))
table(SenateBills$cosponsorsI)
# We could have done this as well!
SenateBills %>%
mutate(cosponsorsI = as.numeric(cosponsors > 0)) -> SenateBills
# the %>% operator --------------------------------------------------------
# The %>% operator is the magrittr forward-pipe operator.
# help(`%>%`)
# We can use the %>% operator with base R functions
# http://blog.revolutionanalytics.com/2014/07/magrittr-simplifying-r-code-with-pipes.html
# The object on the left hand side is passed as the first argument to the
# function on the right hand side. For example, a common structure:
#
# my.data %>% my.function = my.function(my.data)
# my.data %>% my.function(arg=value) = my.function(my.data, arg=value)
# Let's compare the base R nested method with the magrittr method:
# nested functions
head(sort(allStocks$Volume, decreasing = TRUE))
# chained functions
allStocks$Volume %>% sort(decreasing=TRUE) %>% head()
# From assignment 3
trees <- read.csv("../data/139_treecores_rings.txt", na.strings = "-0.999")
# nested
head(tolower(trimws(as.character(trees$Condition.of.inner.core))))
# chained
trees$Condition.of.inner.core %>%
as.character() %>%
trimws() %>%
tolower() %>%
head()
rm(trees)
# nested functions
mean(is.na(arrests$Children[arrests$Sex=="Female"]))
# chained functions
arrests$Children %>%
`[`(arrests$Sex=="Female") %>%
is.na() %>%
mean()
# nested
paste0(round(prop.table(table(SenateBills$cosponsorsI)),2)*100,"%")
# chained
SenateBills$cosponsorsI %>%
table() %>%
prop.table() %>%
round(2) %>%
`*`(100) %>%
paste0("%")
# dplyr speed -------------------------------------------------------------
# This is a good time to demonstrate dplyr's speed. Let's generate a data
# frame with 30,000,000 rows.
DF <- data.frame(x = rep(c("A","B","C"), each = 1e7),
y = c(rnorm(1e7,100,4), rnorm(1e7,90,4), rnorm(1e7,80,4)))
dim(DF)
print(object.size(DF), units = "Mb")
# Now let's find the mean of y for each level of x using aggregate():
system.time(
ans1 <- aggregate(y ~ x, data=DF, mean)
)
ans1
# How about dplyr?
system.time(
ans2 <- DF %>%
group_by(x) %>%
summarise(y = mean(y))
)
ans2
# What about tapply()?
system.time(
ans3 <- with(DF, tapply(y, x, mean))
)
ans3
rm(ans1, ans2, ans3, DF)
# More on the dplyr verbs -------------------------------------------------
# tbl_df() - wraps a local data frame. The main advantage to using a tbl_df over
# a regular data frame is the printing: tbl objects only print a few rows and
# all the columns that fit on one screen, describing the rest of it as text.
# This is not technically one of the dplyr "verbs", nor is it even required, but
# it can help you from blowing away your console by accidentally printing your
# entire data frame.
class(weather)
# Create a data frame tbl.
weather <- tbl_df(weather)
class(weather)
weather
# dplyr has something similar to str() called glimpse(), though str() still
# works on data frame tbl.
glimpse(weather)
# if you want to print the entire data frame to the console, you can use
# as.data.frame(weather)
# filter() - select a subset of the rows of a data frame; works much like
# subset()
# days it snowed
filter(weather, snow==1)
# compare to base R brackets (notice tbl_df printing still in effect)
weather[weather$snow==1,]
# Of course base R brackets allow this:
weather[weather$snow==1,1:3]
# with dplyr...
weather %>%
filter(snow==1) %>%
select(1:3)
# another example...
# Max temp > 90 and max humidity > 90
filter(weather, Max.TemperatureF > 90 & Max.Humidity > 90)
# compare to
weather[weather$Max.TemperatureF > 90 & weather$Max.Humidity > 90, ]
# slice() - select rows by position
slice(weather,1:10)
# compare to:
weather[1:10,]
# arrange() - reorder (sort) rows by columns; much easier, in my opinion, than
# using order() with subsetting brackets
# sort popVa data frame by rescen42010; notice we can use tbl_df() on the fly
arrange(tbl_df(popVa), rescen42010)
# versus base R
popVa[order(popVa$rescen42010),] %>% tbl_df()
# with the desc() helper function
arrange(tbl_df(popVa), desc(rescen42010)) # uses desc() helper function
# versus base R
popVa[order(popVa$rescen42010, decreasing = TRUE),] %>% tbl_df()
# sort data frame by more than one variable
arrange(weather, Max.TemperatureF, Max.Dew.PointF)
# versus base R
weather[order(weather$Max.TemperatureF, weather$Max.Dew.PointF),]
# It's important to note that the weather data frame itself has not changed. The
# sort order does not change unless we assign the result!
# select() - select columns
select(weather, Max.TemperatureF, Min.TemperatureF, Temp.Range)
select(tbl_df(electionData), 7:10)
# compare to base R
weather[,c("Max.TemperatureF", "Min.TemperatureF", "Temp.Range")]
electionData[,7:10] %>% tbl_df()
# That's nice, but we can use ":" with the actual variable names.
select(tbl_df(allStocks), Open:Close)
select(tbl_df(allStocks), Open:Close, -Low)
# If variable has spaces, surround it with back ticks: `
select(tbl_df(electionData), `Obama Democratic`:`Stein Green`)
# Can also use - to drop variables
select(tbl_df(electionData), `Obama Democratic`:`Stein Green`, -`0 Independent`)
# in base R there really is no comparison; you have to type
# tbl_df(allStocks[,c("Open", "High", "Low", "Close")])
# tbl_df(allStocks[,c("Open", "High", "Close")])
# dplyr has a number of helper functions to use with select:
# - starts_with(x, ignore.case = TRUE): names starts with x
# - ends_with(x, ignore.case = TRUE): names ends in x
# - contains(x, ignore.case = TRUE): selects all variables whose name contains x
# - matches(x, ignore.case = TRUE): selects all variables whose name matches the
# regular expression x
# - num_range("x", 1:5, width = 2): selects all variables (numerically) from x01
# to x05.
# - one_of("x", "y", "z"): selects variables provided in a character vector.
# - everything(): selects all variables.
# Examples
select(weather, starts_with("Max"))
select(weather, ends_with("F"))
select(weather, contains("Dew"))
select(weather, matches("^[^.]+$"))
select(weather, -matches("^[^.]+$"))
select(popVa, num_range("respop", 72010:72012, width=5))
# rename() - rename variables (column headers); new name = old name
weather <- rename(weather, Snowed = snow)
# using rename() when variable name has spaces (use backticks)
electionData <- rename(electionData, MOV = `Margin of Victory Votes`)
# compare to base R
# names(weather)[30] <- "Snowed"
# or
# weather$Snowed <- weather$snow
# weather$snow <- NULL
# distinct()- return the unique values in a data frame; often used with select()
distinct(select(arrests, Children))
# or with %>%
arrests %>% select(Children) %>% distinct()
# in base R
unique(arrests$Children) # vector
data.frame(Children=unique(arrests$Children))
# mutate() - add new columns that are functions of existing columns;
# new columns can refer to other columns that you just created.
weather <- mutate(weather,
Dew.Point.Range = Max.Dew.PointF - Min.DewpointF,
Humidity.Range = Max.Humidity - Min.Humidity,
DH.Range.Ratio = Dew.Point.Range/Humidity.Range)
# base R
weather$Dew.Point.Range <- weather$Max.Dew.PointF - weather$Min.DewpointF
weather$Humidity.Range <- weather$Max.Humidity - weather$Min.Humidity
weather$DH.Range.Ratio <- weather$Dew.Point.Range/weather$Humidity.Range
# or using within()
weather <- within(weather, {
Dew.Point.Range <- Max.Dew.PointF - Min.DewpointF
Humidity.Range <- Max.Humidity - Min.Humidity
DH.Range.Ratio <- Dew.Point.Range/Humidity.Range
})
# or using transform();
# Heads up! transform() resets the class to "data.frame"
class(weather)
weather <- transform(weather,
                     Dew.Point.Range = Max.Dew.PointF - Min.DewpointF,
                     Humidity.Range = Max.Humidity - Min.Humidity)
# transform() evaluates its arguments against the original data frame, so a
# column created in the call cannot be used in the same call; the ratio is
# therefore computed in a separate step. It is a ratio (division), matching the
# mutate(), base R and within() versions of this calculation.
weather$DH.Range.Ratio <- weather$Dew.Point.Range / weather$Humidity.Range
class(weather) # no longer tbl_df
# reset
weather <- tbl_df(weather)
# another mutate example
weather <- mutate(weather, Temp.Centered = Max.TemperatureF - mean(Max.TemperatureF))
weather$Temp.Centered[1:5]
sum(weather$Temp.Centered) # should sum to 0, or thereabouts
# transmute() - like mutate, but keeps only the newly created variables
changes <- transmute(allStocks, HighLowDiff = High - Low,
OpenCloseDiff = Open - Close)
head(changes)
# in base R
changes <- data.frame(HighLowDiff = allStocks$High - allStocks$Low,
OpenCloseDiff = allStocks$Open - allStocks$Close)
head(changes)
# summarise() - summarize values and collapse a data frame to a single row
summarise(weather, meanMaxTemp = mean(Max.TemperatureF),
medianMaxTemp = median(Max.TemperatureF))
# in base R
data.frame(meanMaxTemp = mean(weather$Max.TemperatureF),
medianMaxTemp = median(weather$Max.TemperatureF))
# top_n() - Select top n rows (by value); rows tied for the last place are kept
top_n(weather, 5, Max.TemperatureF) # notice order by date is preserved
# in base R, more complicated (to preserve order): rank the negated values so
# that rank 1 is the highest temperature, then keep ranks 1 through 5. With
# ties.method = "min" tied values share the lowest rank of their group, so rows
# tied at the cutoff are all kept.
weather[rank(-weather$Max.TemperatureF, ties.method = "min") <= 5, ]
# sample_n() - randomly sample fixed number of rows of a data frame
set.seed(1)
sample_n(weather, 5)
# in base R
set.seed(1)
weather[sample(x = nrow(weather), size = 5),]
# sample_frac() - randomly sample fixed fraction of rows of a data frame
set.seed(2)
sample_frac(weather, 0.10)
# in base R
set.seed(2)
weather[sample(x = nrow(weather), size = 0.10*nrow(weather)),]
# dplyr provides numerous helper functions:
# n(): number of observations in the current group; This function can only be
# used from within summarise, mutate and filter. For example:
summarise(group_by(weather, Events),n=n())
# or chained together
weather %>% group_by(Events) %>% summarise(n=n())
# base R:
xtabs(~ Events, data=weather) %>% as.data.frame()
# n_distinct(x): count the number of unique values in x. This is a faster and
# more concise equivalent of length(unique(x))
n_distinct(arrests$Children)
# first(x), last(x) and nth(x, n): similar to x[1], x[length(x)], x[n]
first(popVa$city)
last(popVa$city)
nth(popVa$city, 10)
# Combining and Comparing Data Sets ---------------------------------------
# dplyr has a number of functions for combining and comparing data sets. Let's
# walk through the examples presented in RStudio's dplyr cheat sheet.
a <- data.frame(x1=c("A","B","C"), x2=1:3,
stringsAsFactors = FALSE)
b <- data.frame(x1=c("A","B","D"), x2=c(TRUE,FALSE,TRUE),
stringsAsFactors = FALSE)
a;b
# mutating joins - create new data frames
# join matching rows from b to a (ie, keep all records in a)
left_join(a, b, by="x1")
merge(a, b, by="x1", all.x = TRUE) # base R equivalent
# join matching rows from a to b (ie, keep all records in b)
right_join(a, b, by="x1")
merge(a, b, by="x1", all.y = TRUE) # base R equivalent
# join data, retain only rows in both sets
inner_join(a, b, by="x1")
merge(a, b, by="x1") # base R equivalent
# join data, retain all values, all rows (aka, outer join)
full_join(a, b, by="x1")
merge(a, b, by="x1", all=TRUE) # base R equivalent
# filtering joins - returns a filtered data frame
# all rows in a that have a match in b
semi_join(a, b, by="x1")
# all rows in a that do not have a match in b
anti_join(a, b, by="x1")
# set operations - comparing two data frames (notice these data frames have
# matching column names)
y <- data.frame(x1=c("A","B","C"), x2=1:3,
stringsAsFactors = FALSE)
z <- data.frame(x1=c("B","C","D"), x2=2:4,
stringsAsFactors = FALSE)
y;z
# rows that appear in both y and z
intersect(y, z)
# rows that appear in either or both y and z
union(y, z)
# rows that appear in y but not z
setdiff(y, z)
# Recall these are dplyr functions that are masking base R functions.
conflicts(where = search(), detail = TRUE)
# In dplyr documentation: "These functions override the set functions provided
# in base to make them generic so that efficient versions for data frames and
# other tables can be provided."
methods(intersect)
# binding - appending rows or columns
# append z to y as new rows
bind_rows(y, z) # returns a tbl_df
rbind(y, z)
# When you supply a column name with the `.id` argument, a new column is created
# to link each row to its original data frame
bind_rows(y, z, .id = "source")
# Also, columns don't need to match when row-binding
bind_rows(data.frame(x = 1:3), data.frame(y = 1:4))
# rbind gives an error:
# rbind(data.frame(x = 1:3), data.frame(y = 1:4))
# append z to y as new columns
bind_cols(y, z) # returns a tbl_df
cbind(y, z)
# More examples -----------------------------------------------------------
# Let's work through some more examples
# Find the minimum and maximum stock price for each stock
allStocks %>%
group_by(Stock) %>%
summarise(Min=min(Low), Max=max(High))
# How could we do that without dplyr? Split the rows by stock, then take the
# lowest Low and the highest High within each piece and bind the two results
# together as columns.
byStock <- split(allStocks, allStocks$Stock)
lowest <- sapply(byStock, function(d) min(d[["Low"]]))
highest <- sapply(byStock, function(d) max(d[["High"]]))
cbind(Min = lowest, Max = highest)
# I like the first method better myself.
rm(byStock, lowest, highest) # tidy up
# Find the largest change in Open and Close price for each stock
allStocks %>%
group_by(Stock) %>%
mutate(Change = Close - Open) %>%
summarise(LargestGain = max(Change), LargestLoss = min(Change))
# We can save the new data frame
lgl <- allStocks %>%
group_by(Stock) %>%
mutate(Change = Close - Open) %>%
summarise(LargestGain = max(Change), LargestLoss = min(Change))
lgl
class(lgl) # notice it has class "tbl_df"
# Again we can do assignment at the end of the chain as well
rm(lgl)
allStocks %>%
group_by(Stock) %>%
mutate(Change = Close - Open) %>%
summarise(LargestGain = max(Change), LargestLoss = min(Change)) -> lgl
# More examples...
# get mean pop'n in cities vs towns in popVa
popVa %>%
group_by(city.ind) %>%
summarize(meanPop=mean(respop72012))
# get % change in population from April 2010 to July 2012,
# rounded to one place, sort descending, show top 5
popVa %>%
select(city, rescen42010, respop72012) %>%
mutate(percentChange=round((respop72012-rescen42010)/rescen42010*100,1),
absoluteChange=respop72012-rescen42010) %>%
arrange(desc(percentChange)) %>%
head(5)
# same as before, but add an indicator for growing (or not) and save
popVaGRate <- popVa %>%
select(city, rescen42010, respop72012, city.ind) %>%
mutate(percentChange=round((respop72012-rescen42010)/rescen42010*100,1),
growing=ifelse(percentChange > 0, 1, 0)) %>%
arrange(desc(percentChange))
# top 10 fastest growing cities and towns
popVaGRate %>%
filter(growing == 1) %>%
select(city,percentChange, rescen42010, respop72012, city.ind) %>%
arrange(desc(percentChange)) %>%
head(n=10)
# Instead of head, use top_n()
popVaGRate %>%
filter(growing == 1) %>%
select(city,percentChange, rescen42010, respop72012, city.ind) %>%
top_n(10, percentChange)
# cosponsors of senate bills: the top 10 most cosponsored bills
SenateBills %>%
filter(cosponsors > 0) %>%
arrange(desc(cosponsors)) %>%
select(bill, sponsor, cosponsors) %>%
head(n=10)
# number of bills per sponsor (senator);
# for those with at least two bills;
# sorted descending
SenateBills %>%
group_by(sponsor) %>%
summarize(total=n()) %>%
arrange(desc(total)) %>%
filter(total>1)
# total arrested by occupation and sex
arrests %>%
group_by(Occup2, Sex) %>%
filter(Sex != 9) %>%
summarize(total = n()) %>%
arrange(desc(total))
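# With older versions of dplyr this isn't arranged in the order I requested.
# What's going on? There, arrange() sorted within groups, so we have to ungroup
# the data before we can arrange it. (Newer versions of dplyr ignore grouping in
# arrange() by default, but ungrouping when finished is still good practice.)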
arrests %>%
group_by(Occup2, Sex) %>%
filter(Sex != 9) %>%
summarize(total = n()) %>%
ungroup() %>% # ungroup the data
arrange(desc(total))
# Moving on...
# Add a variable to weather for cumulative precipitation using cumsum(), a base
# R function. cumsum() belongs to a class of functions called window functions.
# These functions take a vector of values and return another vector of values.
cumsum(c(1,2,5,3))
weather <- weather %>%
mutate(cumPrecip = cumsum(PrecipitationIn))
weather$cumPrecip[1:10]
# quick plot of cumulative precipitation over 2013
plot(cumPrecip ~ Date, data=weather, type="l")
abline(h = seq(10,40,10), lty=3, col="grey")
# calculate mean max temperature per month
weather %>%
group_by(Month) %>%
summarize(meanMaxTemp=round(mean(Max.TemperatureF)))
# save the previous summary and graph a dot chart
meanMax <- weather %>%
group_by(Month) %>%
summarize(meanMaxTemp=round(mean(Max.TemperatureF)))
dotchart(x = meanMax$meanMaxTemp,
labels = meanMax$Month, lcolor="black", pch=19,
main="Mean Max C'ville Temp by Month, 2013")
# Let's shorten the names in electionData and derive some new variables.
electionData <- electionData %>%
rename(TEV = `Total Elec Vote`, TPR = `Total.Popular.Vote`,
EVD = `Elec Vote D`, EVR = `Elec Vote R`) %>%
mutate(MOV2 = ifelse(is.na(EVD),MOV*-1,MOV), # pos/neg Margin of victory
State = tolower(State), # make state lower case for mapping purposes
Blue = ifelse(!is.na(EVD),1,0)) # Blue State/Red State indicator
# With our data cleaned up we can create some graphs:
# install.packages("ggplot2")
library(ggplot2)
# a sideways bar plot showing Margin of Votes, coded by Blue indicator:
library(scales) # for comma() function
ggplot(electionData, aes(y=MOV2, x=State, fill=factor(Blue))) +
geom_bar(stat="identity", position="identity") +
scale_fill_manual(values=c("red","blue"), labels=c("Romney","Obama"),
guide=guide_legend(title=NULL)) +
scale_y_continuous(breaks=pretty(range(electionData$MOV2)),
labels=comma(pretty(range(electionData$MOV2)))) +
ylab("Margin of Votes") +
coord_flip()
# US map with color mapped to MOV2 to see how closely contested a state was. A
# less statistical way to display the same information as the previous graph.
# install.packages("maps")
library(maps)
# Use the map_data() function from ggplot2 (it needs the maps package to be
# installed) to create a data frame of US map data. states contains lat/long
# data for states.
states <- map_data("state")
# Now merge the map data with the election data by state name using dplyr's
# inner_join()
choro <- inner_join(states, electionData, by = c("region" = "State"))
# Notice how we merge when we have different variable names:
# by = c("region" = "State")
# Base R equivalent
# choro <- merge(states, electionData, by.x = "region", by.y = "State")
# now plot the map using ggplot
ggplot(choro, aes(x=long, y=lat, group=group, fill=MOV2)) +
geom_polygon(color="black") +
scale_fill_gradient2("Margin of Vote", low="red", high="blue",
space = "Lab",
labels=comma, limits=c(-4e6,4e6)) +
coord_quickmap()
# end