-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathLecture12.R
71 lines (53 loc) · 1.62 KB
/
Lecture12.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Coding Session: Data Manipulation with multidplyr
# https://multidplyr.tidyverse.org/index.html
# install.packages("multidplyr")
# https://github.com/tidyverse/multidplyr/
library(multidplyr)
library(dplyr, warn.conflicts = FALSE)
library(nycflights13)
library(tictoc)
# Quick look at the `flights` data before parallelising anything.
# str() prints a compact summary of every column.
str(flights)
# View() opens a spreadsheet-style data viewer and needs an interactive
# session (e.g. RStudio). Guard it so the script can also be run with
# Rscript or sourced in a batch job without stopping here.
if (interactive()) {
  flights %>% head() %>% View()
}
# Help page for the function used next to create the cluster.
?multidplyr::new_cluster
# Create a cluster with 4 workers and show its summary.
cluster <- new_cluster(n = 4)
print(cluster)

# Group the flights by destination and hand the grouped data to the
# cluster; `flights1` is the partitioned version used in the timing below.
flights1 <- partition(group_by(flights, dest), cluster)
flights1
# Timing 1: mean departure delay per destination, computed in this session
# on the ordinary `flights` table.
tic()
flights %>%
  group_by(dest) %>%
  summarise(dep_delay = mean(dep_delay, na.rm = TRUE))
toc()

# Timing 2: the same summary on the partitioned data, followed by
# collect() to bring the per-destination results back into this session.
tic()
flights1 %>%
  summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  collect()
toc()
## WHEN TO USE ----
# For basic dplyr verbs, multidplyr is unlikely to give you significant speed ups
# unless you have 10s or 100s of millions of data points
# (and in that scenario you should first try dtplyr, which uses data.table).
#
# multidplyr might help, however, if you’re doing more complex things.
# Let’s see how that plays out when fitting a moderately complex model.
# Destinations that appear in at least 365 rows of `flights`.
daily_flights <- filter(count(flights, dest), n >= 365)

# Keep only the flights to those destinations, add the day of the year
# (`yday`, derived from year/month/day) and group by destination.
common_dest <- flights %>%
  semi_join(daily_flights, by = "dest") %>%
  mutate(yday = lubridate::yday(ISOdate(year, month, day))) %>%
  group_by(dest)

# Partition the grouped data across `cluster` for the parallel model fit.
by_dest <- partition(common_dest, cluster)
by_dest
## mgcv is loaded here for the sequential fit; the cluster workers need it
## loaded as well (done with cluster_library() before the parallel fit).
library(mgcv)
# Sequential baseline: fit one GAM per destination in this R session,
# modelling departure delay as smooth functions of day of year and
# departure time. `common_dest` is grouped by dest when it is built;
# the group_by() here restates that grouping explicitly.
tic()
models <- common_dest %>%
  group_by(dest) %>%
  do(mod = gam(dep_delay ~ s(yday) + s(dep_time), data = .))
toc()
# Load mgcv on the cluster so the workers can call gam() and s().
cluster_library(cluster, "mgcv")

# Parallel run: the same per-destination GAM, applied to the partitioned
# data. Note that this reassigns `models`, replacing the sequential result.
tic()
models <- by_dest %>%
  do(mod = gam(dep_delay ~ s(yday) + s(dep_time), data = .))
toc()