-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path99_analysis_bosnian_all.R
142 lines (111 loc) · 4.56 KB
/
99_analysis_bosnian_all.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# Analysis of the distribution of all
# Bosnian tweets.
# (c) Annerose Nisser, 2016-07-09
# Empty workspace and move to the project directory.
# NOTE(review): rm(list = ls()) and setwd() tie the script to one machine's
# layout. They are kept because the relative data path below is resolved
# against this working directory.
rm(list = ls())
setwd("~/Documents/15-16/Code/tweets_bosnia")

# Displaying Bosnian characters correctly:
Sys.setlocale("LC_CTYPE", "UTF-8")

# Inspiring links:
# http://juliasilge.com/blog/Ten-Thousand-Tweets/

# --------------------------- #
# Load required packages ----
# library() stops right here if jsonlite is not installed; require() would
# only return FALSE and let the script fail later at fromJSON().
library(jsonlite)
# require(lubridate) # for easily cleaning time stamps
# --------------------------- #

# Important: don't have the package rjson loaded!!
# rjson also has the command fromJSON, which could mask the command from
# jsonlite; the call below only works with the jsonlite version.
# Detach rjson only when it is actually attached: an unconditional detach()
# raises an error if the package is not on the search path.
if ("package:rjson" %in% search()) {
  detach("package:rjson", unload = TRUE)
}

tweets_path <- "data/tweets/bosnian_politicians_tweets.txt"
# tweets_path <- "~/Documents/15-16/Data/bosnian_politicians_tweets.txt"

# Namespace-qualified so that the jsonlite parser is used regardless of what
# else is attached. flatten = TRUE turns nested fields into dotted columns
# (e.g. user.name), which the rest of the script relies on.
tweets <- jsonlite::fromJSON(tweets_path,
                             simplifyDataFrame = TRUE, flatten = TRUE)
# tweets <- readLines(bosnian_all_tweets, warn = "F")
# tweets <- fromJSON("~/Documents/15-16/Data/bs_all_tweets.txt",
# simplifyDataFrame = T)
# ------------------------------ #
# Columns available in the tweet table.
names(tweets)

# Sanity check of the collection step, which is meant to skip tweets that are
# already stored in the file: the number of ids should equal the number of
# distinct ids.
length(tweets[["id_str"]])
sum(!duplicated(tweets[["id_str"]]))
# Result when this was written: the procedure works, there is only one
# duplicate (probably at the beginning of the file?).
# Show the affected rows: first the earlier copies, then the later ones.
tweets[which(duplicated(tweets[["id_str"]], fromLast = TRUE)), ]
tweets[which(duplicated(tweets[["id_str"]])), ]

# Exclude those duplicates, keeping the first occurrence of every id.
is_repeat <- duplicated(tweets[["id_str"]])
tweets <- tweets[!is_repeat, ]
# ------------------------------ #

# Frequency tables of a few descriptive fields.
table(tweets[["created_at"]])
table(tweets[["user.lang"]])
table(tweets[["lang"]])
table(tweets[["place.country_code"]])
# ------------------------------ #

# Plot the temporal distribution of the tweets:
# start by looking at the raw time stamps.
class(tweets[["created_at"]])
time <- tweets[["created_at"]]
time[seq(200, 500)]
# Extract the calendar date from raw time stamps (function is re-used later).
#
# Args:
#   time: character vector of time stamps that carry the abbreviated English
#         month name and the day in characters 5-10 and the year in
#         characters 27-30, e.g. "Wed Aug 27 13:08:45 +0000 2008".
# Returns:
#   A Date vector with one element per input element; stamps that cannot be
#   parsed become NA.
time_f <- function(time) {
  # Without this guard paste0() would recycle the "," literal and turn an
  # empty input into a single NA date.
  if (length(time) == 0L) {
    return(as.Date(character(0), format = "%b%d, %Y"))
  }

  # "%b" is matched against the month names of the current LC_TIME locale.
  # Pin it to "C" (English abbreviations) while parsing and put the previous
  # setting back afterwards, so parsing works in non-English sessions too.
  old_lc_time <- Sys.getlocale("LC_TIME")
  on.exit(Sys.setlocale("LC_TIME", old_lc_time), add = TRUE)
  Sys.setlocale("LC_TIME", "C")

  # Rebuild the stamp as "<month> <day>, <year>", e.g. "Aug 27, 2008".
  stamp <- paste0(substring(time, 5, 10), ",", substring(time, 26, 30))
  # see http://www.statmethods.net/input/dates.html
  as.Date(stamp, format = "%b%d, %Y")
}
time <- time_f(time)
# Subset time just to 2016. The explicit is.na() test drops stamps that could
# not be parsed; a bare logical comparison would carry them along as NA
# entries.
time <- time[!is.na(time) & time >= as.Date("2016-01-01")]
head(time)
# hist(time, breaks = 200)
# The temporal distribution looks quite ok.

# Tabulate the tweets per day once and reuse the result for the line, the
# axis ticks and the height of the annotation.
daily_counts <- table(time)
election_day <- as.Date("2016-10-02")

plot(as.Date(names(daily_counts)), daily_counts, type = "l",
     yaxt = "n", xlab = "day", ylab = "# daily tweets")
axis(2, at = pretty(daily_counts), labels = pretty(daily_counts))

# Mark the election date with a dashed red line and a rotated label placed
# four days to its left at the height of the mean daily count.
abline(v = election_day, col = "red", cex = 2, lty = 2)
text(election_day - 4, mean(daily_counts), "election day",
     srt = 90, col = "red")
# ------------------------------ #
# Examine those who tweeted
# How many tweets came from the news portal klix.ba?
names(tweets)

# Tweets per account name, most active account first.
t <- sort(table(tweets[["user.name"]]), decreasing = TRUE)

# Larger bottom margin to leave room for the rotated account names.
par(mar = c(7.1, 4.1, 4.1, 4.1)) # bottom, left, top and right
barplot(t, xaxt = "n", yaxt = "n", ylab = "# tweets")

# Both axes are labelled by hand with rotated text, positioned relative to
# the extremes of the plotting region (par("usr") is c(x1, x2, y1, y2)).
plot_region <- par("usr")
count_ticks <- pretty(t)

# Account names below the bars.
text(1.2 * seq_along(t), plot_region[3] - 300,
     labels = names(t), srt = 45, pos = 2, xpd = NA, cex = 0.7)

# Tick marks without labels, then the tick values as rotated text.
axis(2, at = count_ticks, labels = FALSE, xpd = NA)
text(plot_region[1] - 0.2, count_ticks,
     labels = count_ticks, srt = 45, pos = 2, xpd = NA, cex = 0.7)
# ------------------------------ #
# Plot the longitudinal distribution by user type:
# is the user a politician/political party
# or a news portal?

# Users that are news portals:
news_portals <- c("Vijesti Herceg Bosne", "Klix.ba")

sub <- tweets[, c("created_at", "user.name")]
names(sub) <- c("time", "user")

# Make new user variable to distinguish between news portal
# and politician/political party; rows without a user name stay NA.
sub$user2 <- ifelse(sub$user %in% news_portals, "news",
                    ifelse(!is.na(sub$user), "politician", NA))

# Just keep the date for the time variable (and not the exact time stamp),
# using the function defined above.
sub$time <- time_f(sub$time)

# Restrict to observations in 2016. The is.na() test removes rows whose stamp
# could not be parsed; a bare comparison would turn them into all-NA rows.
sub <- sub[!is.na(sub$time) & sub$time >= as.Date("2016-01-01"), ]

# Daily tweet counts per user category, reshaped to long format for ggplot.
t <- table(sub$time, sub$user2)
head(t)
t <- as.data.frame(t)
names(t) <- c("date", "cat", "Freq")
t$date <- as.Date(as.character(t$date))

# library() fails immediately if ggplot2 is missing; require() would only
# return FALSE and let the ggplot() call below fail instead.
library(ggplot2)

e <- as.Date("2016-10-02") # election day
ggplot(t, aes(date, Freq, group = cat)) +
  geom_line(aes(color = cat)) +
  # The intercept is a constant, so it is passed as a parameter rather than
  # through aes(), which would draw the same line once per data row.
  geom_vline(xintercept = as.numeric(e),
             colour = "#990000", linetype = "dashed") +
  annotate(geom = "text", label = "elections",
           x = e - 4, y = 100, angle = 90) +
  scale_y_continuous(name = "# tweets") +
  ggtitle("Number of tweets by user category over time")