-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathreview clean.R
109 lines (85 loc) · 3.33 KB
/
review clean.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# SESSION ----------------------------------------------
library(tidyverse)
library(stringr)
library(tidytext)
library(rtweet)
# NOTE(review): hard-coded, machine-specific working directory. Every relative
# path used below ("./Data/...") is resolved against it, so the script only
# runs unchanged on a machine that has this folder.
setwd("~/Documents/Github/hardwax_bot")
# Load the raw reviews; the steps below read the review text from the `value`
# column of this table.
reviews <- read_csv("./Data/reviews.csv")
# TEXT MINING ---------------------------------------------------
# get unique words
# Tokenise every review into single lowercase words (n-grams of length 1),
# discard empty tokens, then tally each word with the most frequent first.
word_counts <- count(
  filter(
    unnest_tokens(
      reviews, word, value,
      token = "ngrams", to_lower = TRUE, n = 1
    ),
    word != ""
  ),
  word,
  sort = TRUE
)
# get sentence openers
# Count how often each word opens a review.
# `\\w*` matches the run of word characters at the very start of the text, so
# a review that begins with a non-word character yields "" and is removed by
# the final filter. The column is built directly with tibble() instead of the
# deprecated as_data_frame(), which also makes the rename step unnecessary.
opener_counts <- tibble(
  word = str_to_lower(str_extract(reviews$value, "\\w*"))
) %>%
  count(word, sort = TRUE) %>%
  filter(word != "")
# get words preceding commas
# Split each review at its commas into at most 5 pieces (V1..V5). Because the
# split is capped at 5 pieces, only the text in front of the first four commas
# is collected; anything after the fourth comma stays together in V5.
comma_pieces <- str_split_fixed(reviews$value, ",", 5)
# Name the columns explicitly so as_tibble() does not have to fall back on its
# compatibility naming for unnamed matrices.
colnames(comma_pieces) <- paste0("V", seq_len(ncol(comma_pieces)))

# Keep reviews that have text both before and after the first comma.
comma_pieces <- as_tibble(comma_pieces) %>%
  filter(V1 != "", V2 != "")

# A piece counts as "preceding a comma" only when the piece after it is
# non-empty. The result is a one-column tibble (`value`) of those pieces.
comma_precede <- tibble(
  value = c(
    comma_pieces$V1,
    comma_pieces %>% filter(V3 != "") %>% pull(V2),
    comma_pieces %>% filter(V4 != "") %>% pull(V3),
    comma_pieces %>% filter(V5 != "") %>% pull(V4)
  )
)
# Drop the helper so no stray temporaries are left in the workspace.
rm(comma_pieces)
# Reduce every pre-comma segment to its last space-separated word, in lower
# case, so it can be matched against the word tables.
comma_precede <- comma_precede %>%
  mutate(value = tolower(word(value, -1)))

# Tally how often each word directly precedes a comma; the count column is
# called `comma_n` so it does not clash with `n` in the tables it is joined to.
comma_precede_counts <- comma_precede %>%
  count(value, sort = TRUE) %>%
  rename(comma_n = n)
# Attach the comma counts to the overall word table and derive `comma_prob`:
# the rounded percentage of a word's occurrences that precede a comma.
# Words with no comma count get a probability of 0.
word_counts <- word_counts %>%
  left_join(comma_precede_counts, by = c("word" = "value")) %>%
  mutate(
    comma_prob = round((comma_n / n) * 100),
    comma_prob = replace(comma_prob, is.na(comma_prob), 0)
  )

# Same calculation for the sentence-opener table.
opener_counts <- opener_counts %>%
  left_join(comma_precede_counts, by = c("word" = "value")) %>%
  mutate(
    comma_prob = round((comma_n / n) * 100),
    comma_prob = replace(comma_prob, is.na(comma_prob), 0)
  )
# create bigrams
# Tokenise every review into lowercase two-word n-grams and split each one
# into its first and second word.
bigrams <- separate(
  unnest_tokens(
    reviews, bigram, value,
    token = "ngrams", to_lower = TRUE, n = 2
  ),
  bigram,
  c("word1", "word2"),
  sep = " "
)

# Frequency of each distinct word pair, most common first.
bigram_counts <- count(bigrams, word1, word2, sort = TRUE)
# create trigrams
# Tokenise every review into lowercase three-word n-grams and split each one
# into its three component words.
trigrams <- separate(
  unnest_tokens(
    reviews, trigram, value,
    token = "ngrams", to_lower = TRUE, n = 3
  ),
  trigram,
  c("word1", "word2", "word3"),
  sep = " "
)

# Frequency of each distinct word triple, most common first.
trigram_counts <- count(trigrams, word1, word2, word3, sort = TRUE)
## EXPORT ----------------------------------------
# Write each count table to its CSV file, pairing tables and paths by
# position; files are written in the order listed.
walk2(
  list(word_counts, opener_counts, bigram_counts, trigram_counts),
  c(
    "./Data/words.csv",
    "./Data/openers.csv",
    "./Data/bigrams.csv",
    "./Data/trigrams.csv"
  ),
  write_csv
)
# BOT INIT --------------------------------------------------------
# ## name assigned to created app
# appname <- "dumb_hardwax"
# ## api key (example below is not a real key)
# key <- "l0V6xuUUntFGN4eEblPYTSw5U"
# ## api secret (example below is not a real key)
# secret <- "VmYihhRJuj88Lq4AZgOmmU2R3KqLsRME66dsTrwUr7ZKi7c4Z6"
# twitter_token <- create_token(
# app = appname,
# consumer_key = key,
# consumer_secret = secret)
#
# # combine with name for token
# file_name <- file.path(getwd(), "twitter_token.rds")
#
# # save token
# saveRDS(twitter_token, file = "twitter_token.rds")
#
# # create environment file
# cat(paste0("TWITTER_PAT=", file_name),
# file = file.path(getwd(), ".Renviron"),
# append = TRUE)