-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathmake.R
209 lines (171 loc) · 5.97 KB
/
make.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
## r
## read in functions
source("R/funs.R")
## packages
pkgs <- c("ggplot2", "dplyr", "rtweet")
## install if necessary
## Compare against the row names of installed.packages() (the package
## names). Matching against the whole matrix would also test every other
## cell (Depends, Imports, ...), which can report a package as installed
## when its name merely appears in another package's metadata.
missing_pkgs <- setdiff(pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
## load packages
library(ggplot2)
library(dplyr)
library(rtweet)
##---------------------------------------------------------------
## GET DATA FROM TRUMPTWITTERARCHIVE.COM
##---------------------------------------------------------------
## Download the raw archive from trumptwitterarchive.com.
## get_trumptwitterarchive() is not defined in this file; presumably it
## comes from R/funs.R, which is sourced at the top -- TODO confirm.
tta <- get_trumptwitterarchive()
## Extract the tweets from the raw download and tidy them into a data frame
## (trumptwitterarchive_data() is likewise defined outside this file).
tta_data <- trumptwitterarchive_data(tta)
## Plot a time series of the archive tweets, aggregated by week
rtweet::ts_plot(tta_data, "weeks")
##---------------------------------------------------------------
## LOOKUP STATUSES
##---------------------------------------------------------------
## create vector of status IDs recovered from trumptwitterarchive.com
## NOTE(review): this reads `status_id` from the raw `tta` object, whereas
## the merge step further down takes the archive IDs from
## `tta_data$id_str`. If `tta` has no `status_id` element, `statusids` is
## NULL here -- confirm which object/column is intended.
statusids <- tta$status_id
## look up the full tweet data for those status IDs via rtweet
tw_rt_lookup <- rtweet::lookup_statuses(statusids)
##---------------------------------------------------------------
## GET TIMELINE DATA
##---------------------------------------------------------------
## request the 3200 most recent tweets from Trump's timeline
tw_rt_tmline <- rtweet::get_timeline("realdonaldtrump", n = 3200)
## stack the looked-up tweets on top of the timeline tweets, and stack
## their users data in the same order
tw_rt <- rbind(tw_rt_lookup, tw_rt_tmline)
usrs <- rbind(users_data(tw_rt_lookup), users_data(tw_rt_tmline))
## remove duplicates: keep the first row seen for each status_id
## (kp must be computed before tw_rt is subset, since it indexes the
## combined, not-yet-deduplicated rows)
kp <- which(!duplicated(tw_rt$status_id))
tw_rt <- tw_rt[kp, ]
## apply the same row positions to the users data
## NOTE(review): assumes users_data() returns one row per tweet, in the
## same order as the tweets -- TODO confirm
usrs <- usrs[kp, ]
## re-attach the de-duplicated users data as the "users" attribute
attr(tw_rt, "users") <- usrs
## save with timestamp
## Both snapshot files are named from the same run timestamp (seconds since
## the epoch, rounded) so the RDS and CSV written by one run share a name.
## Previously the CSV path contained a fixed timestamp, so every run wrote
## to the same CSV file regardless of when it was executed.
timestamp <- round(as.numeric(Sys.time()), 0)
rds <- paste0("data/trumptweets-", timestamp, ".rds")
saveRDS(tw_rt, rds)
save_as_csv(tw_rt, paste0("data/trumptweets-", timestamp, ".csv"))
##---------------------------------------------------------------
## MERGE ALL UNIQUE DATA
##---------------------------------------------------------------
## flag the archive tweets whose IDs are not yet present in tw_rt
tta_is_new <- !tta_data$id_str %in% tw_rt$status_id
if (any(tta_is_new)) {
  ## archive rows that still have to be added
  uqtta <- tta_data[tta_is_new, ]
  ## all-NA frame with one row per new tweet and the same columns as tw_rt,
  ## so it can be row-bound onto tw_rt afterwards
  uq_tta <- structure(
    as.data.frame(
      matrix(NA, nrow = sum(tta_is_new), ncol = ncol(tw_rt))
    ),
    names = names(tw_rt),
    class = c("tbl", "tbl_df", "data.frame")
  )
  ## fill in the fields the archive provides; all other columns stay NA
  uq_tta[["status_id"]] <- uqtta$id_str
  uq_tta[["text"]] <- uqtta$text
  uq_tta[["created_at"]] <- uqtta$created_at
  uq_tta[["source"]] <- uqtta$source
  uq_tta[["retweet_count"]] <- uqtta$retweet_count
  uq_tta[["favorite_count"]] <- uqtta$favorite_count
  uq_tta[["reply_to_user_id"]] <- uqtta$in_reply_to_user_id_str
  uq_tta[["is_retweet"]] <- uqtta$is_retweet
  ## append the archive-only tweets to the rtweet data
  tw_rt <- rbind(tw_rt, uq_tta)
}
##---------------------------------------------------------------
## GET @realDonaldTrump's USER DATA
##---------------------------------------------------------------
## fetch Trump's user record to see how many statuses he has posted
rdt <- rtweet::lookup_users("realdonaldtrump")
## report the number of statuses posted
message(paste("Trump has tweeted", rdt$statuses_count, "times."))
## count the distinct status IDs collected so far and report that too
ttws <- sum(!duplicated(tw_rt$status_id))
message(paste("Number of Trump tweets collected so far:", ttws))
## difference between statuses posted and tweets collected
rdt$statuses_count - ttws
##---------------------------------------------------------------
## HACK(Y) METHOD FOR RECOVERING MORE TWEETS
##---------------------------------------------------------------
## h.rtweet is a package i wrote to access a backdoor API
## for obvious reasons, it's not stored on a public repository
## This section is optional and only runs when h.rtweet can be loaded.
## requireNamespace() is used instead of matching the name against the
## whole installed.packages() matrix, which would also match cells other
## than the package-name column.
if (requireNamespace("h.rtweet", quietly = TRUE)) {
  ## backdoor API pkg
  library(h.rtweet)
  ## get all tweets (request as many as the account's statuses_count)
  hrt <- h.search_tweets(
    "from:realdonaldtrump",
    n = rdt$statuses_count
  )
  ## convert created_at to POSIXct: the value is coerced to numeric and
  ## interpreted as seconds since 1970-01-01 UTC
  hrt$created_at <- as.POSIXct(
    as.numeric(hrt$created_at), origin = "1970-01-01",
    tz = "UTC"
  )
  ## flag the recovered tweets whose IDs are not yet present in tw_rt
  hrt_is_new <- !hrt$status_id %in% tw_rt$status_id
  if (any(hrt_is_new)) {
    ## all-NA frame with one row per new tweet and the same columns as
    ## tw_rt, so it can be row-bound onto tw_rt afterwards
    uq_tta <- matrix(
      NA,
      sum(hrt_is_new),
      ncol(tw_rt)
    )
    uq_tta <- structure(
      as.data.frame(uq_tta),
      names = names(tw_rt),
      class = c("tbl", "tbl_df", "data.frame")
    )
    ## subset the new rows and copy over the fields this source provides;
    ## all other columns stay NA
    uqtta <- hrt[hrt_is_new, ]
    uq_tta[["status_id"]] <- uqtta$status_id
    uq_tta[["text"]] <- uqtta$text
    uq_tta[["created_at"]] <- uqtta$created_at
    uq_tta[["screen_name"]] <- uqtta$screen_name
    ## mentions is split on spaces into a list-column of screen names
    uq_tta[["mentions_screen_name"]] <- strsplit(uqtta$mentions, " ")
    tw_rt <- rbind(tw_rt, uq_tta)
  }
}
##---------------------------------------------------------------
## SAVE DATA
##---------------------------------------------------------------
## save the merged tweets as an R data file
## NOTE(review): the file name contains a fixed date string, so every run
## writes to this same path
saveRDS(tw_rt, "data/trumptweets-08-10-2017.rds")
## Make a data frame (or list) CSV friendly.
## Every recursive (list-like) column of `x` is replaced by a character
## vector in which each cell's values are pasted together, separated by a
## single space. Non-recursive columns are returned unchanged.
flatten_data <- function(x) {
  is_nested <- vapply(x, is.recursive, logical(1))
  ## collapse each cell of one nested column into a single string
  collapse_cells <- function(column) {
    vapply(column, paste, character(1), collapse = " ")
  }
  x[is_nested] <- lapply(x[is_nested], collapse_cells)
  x
}
## flatten the recursive (list) columns to plain strings, then save as CSV
## (the CSV file name uses the same fixed date string as the .rds file
## saved above)
tw_rt_csv <- flatten_data(tw_rt)
readr::write_csv(tw_rt_csv, "data/trumptweets-08-10-2017.csv")
##---------------------------------------------------------------
## PLOT DAILY TIME SERIES
##---------------------------------------------------------------
## daily time series data (tweet counts aggregated by day)
d <- rtweet::ts_data(tw_rt, "days")
## build plot: line chart of the daily counts with yearly x-axis labels
p <- d %>%
  ggplot(aes(time, n)) +
  geom_line() +
  theme_minimal() +
  labs(
    x = NULL, y = NULL,
    title = "Daily frequency of @realDonaldTrump tweets",
    subtitle = paste0("Data (N = ",
      nrow(tw_rt),
      ") collected using rtweet (an R package)")
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    text = element_text(family = "Roboto"),
    axis.text = element_text(colour = "black")
  ) +
  scale_x_datetime(
    date_breaks = "years",
    date_labels = "%Y"
  ) +
  ## zoom the x-axis to a fixed date window without dropping data
  coord_cartesian(
    xlim = c(as.POSIXct("2009-07-03"), as.POSIXct("2017-07-09"))
  )
## save plot as PNG
png("../trumptweets.png",
  width = 7, height = 4.5, units = "in", res = 127.5)
## print() is required here: a ggplot object is only drawn when it is
## printed, and a bare `p` is not auto-printed when this script is run
## through source(), which would leave the PNG empty
print(p)
dev.off()