Skip to content

Latest commit

 

History

History
877 lines (739 loc) · 35.2 KB

Sevilla_Transcription.md

File metadata and controls

877 lines (739 loc) · 35.2 KB

Sevilla_Transcription

Angela Krak 11/15/2021

R Markdown

## Absolute path to the folder that holds the transcription .txt files.
## NOTE(review): this path is specific to one machine; the relative file names used
## further down resolve against it, so it has to be edited before running elsewhere.
mypath <- "/Users/angelakrak/Desktop/DataScience/Sevilla-Interview-Analysis/Transcriptions"
## Make that folder the working directory for the rest of the session.
setwd(mypath) 
## Try the approach on a single file first
df <- read.delim(
  file = "002_Q3.txt",
  header = FALSE,
  sep = "\t",
  dec = ".",
  encoding = "UTF-8"
)
head(df)
##                                                                                           V1
## 1                                                                                        Sí.
## 2 Galicia, Madrid, Extremadura, Valencia, País Vasco, para partes de Cataluña no, Salamanca.
## 3                                                                      Sí, algunas veces sí.
## 4                          Hay el resto, partes de España el acento andaluz no se lo millan.
## 5                                                                           No lo entienden.
## 6                                                                     Creen que es inferior.
## Tag every row with the name of the file it was read from
df <- add_column(df, fname = "002_Q3.txt")
head(df)
##                                                                                           V1
## 1                                                                                        Sí.
## 2 Galicia, Madrid, Extremadura, Valencia, País Vasco, para partes de Cataluña no, Salamanca.
## 3                                                                      Sí, algunas veces sí.
## 4                          Hay el resto, partes de España el acento andaluz no se lo millan.
## 5                                                                           No lo entienden.
## 6                                                                     Creen que es inferior.
##        fname
## 1 002_Q3.txt
## 2 002_Q3.txt
## 3 002_Q3.txt
## 4 002_Q3.txt
## 5 002_Q3.txt
## 6 002_Q3.txt
##Works well, let's try the whole set
## Making function for file names
## Read one transcription file into a data frame and tag each row with its source.
##
## file: path to a tab-delimited text file without a header row.
## Returns the data frame produced by read.delim() (the text lands in column V1)
## with an extra `fname` column that repeats `file` on every row.
get_df <- function(file) {
  df <- read.delim(
    file,
    header = FALSE,
    sep = "\t",
    dec = ".",
    encoding = "UTF-8",
    ## Free text: a stray double quote must not open a quoted field and
    ## swallow the lines that follow it.
    quote = "",
    ## Keep the text as character even when a whole file looks numeric or logical.
    colClasses = "character"
  )
  df["fname"] <- file
  df
}
## Generating a list of file names
## `pattern` is a regular expression, not a shell glob, so match a literal ".txt" suffix.
flist <- list.files(".", pattern = "\\.txt$", full.names = TRUE, recursive = TRUE)
flist
##   [1] "./002_Q1.txt" "./002_Q2.txt" "./002_Q3.txt" "./002_Q4.txt" "./002_Q5.txt"
##   [6] "./028_Q1.txt" "./028_Q2.txt" "./028_Q3.txt" "./028_Q4.txt" "./028_Q5.txt"
##  [11] "./045_Q1.txt" "./045_Q2.txt" "./045_Q3.txt" "./045_Q4.txt" "./045_Q5.txt"
##  [16] "./068_Q1.txt" "./068_Q2.txt" "./068_Q3.txt" "./068_Q4.txt" "./068_Q5.txt"
##  [21] "./111_Q1.txt" "./111_Q2.txt" "./111_Q3.txt" "./111_Q4.txt" "./111_Q5.txt"
##  [26] "./133_Q1.txt" "./133_Q2.txt" "./133_Q3.txt" "./133_Q4.txt" "./133_Q5.txt"
##  [31] "./189_Q1.txt" "./189_Q2.txt" "./189_Q3.txt" "./189_Q4.txt" "./189_Q5.txt"
##  [36] "./351_Q1.txt" "./351_Q2.txt" "./351_Q3.txt" "./351_Q4.txt" "./351_Q5.txt"
##  [41] "./470_Q1.txt" "./470_Q2.txt" "./470_Q3.txt" "./470_Q4.txt" "./470_Q5.txt"
##  [46] "./500_Q1.txt" "./500_Q2.txt" "./500_Q3.txt" "./500_Q4.txt" "./500_Q5.txt"
##  [51] "./574_Q1.txt" "./574_Q2.txt" "./574_Q3.txt" "./574_Q4.txt" "./574_Q5.txt"
##  [56] "./603_Q1.txt" "./603_Q2.txt" "./603_Q3.txt" "./603_Q4.txt" "./603_Q5.txt"
##  [61] "./643_Q1.txt" "./643_Q2.txt" "./643_Q3.txt" "./643_Q4.txt" "./643_Q5.txt"
##  [66] "./647_Q1.txt" "./647_Q2.txt" "./647_Q3.txt" "./647_Q4.txt" "./647_Q5.txt"
##  [71] "./706_Q1.txt" "./706_Q2.txt" "./706_Q3.txt" "./706_Q4.txt" "./706_Q5.txt"
##  [76] "./711_Q1.txt" "./711_Q2.txt" "./711_Q3.txt" "./711_Q4.txt" "./711_Q5.txt"
##  [81] "./809_Q1.txt" "./809_Q2.txt" "./809_Q3.txt" "./809_Q4.txt" "./809_Q5.txt"
##  [86] "./848_Q1.txt" "./848_Q2.txt" "./848_Q3.txt" "./848_Q4.txt" "./848_Q5.txt"
##  [91] "./865_Q1.txt" "./865_Q2.txt" "./865_Q3.txt" "./865_Q4.txt" "./865_Q5.txt"
##  [96] "./887_Q1.txt" "./887_Q2.txt" "./887_Q3.txt" "./887_Q4.txt" "./887_Q5.txt"
## [101] "./929_Q1.txt" "./929_Q2.txt" "./929_Q3.txt" "./929_Q4.txt" "./929_Q5.txt"
## [106] "./957_Q1.txt" "./957_Q2.txt" "./957_Q3.txt" "./957_Q4.txt" "./957_Q5.txt"
## Read every file in flist and stack the results into one data frame.
## The files are read into a list and bound with a single rbind() call instead of
## growing master_df inside a loop, which re-copies the accumulated rows on every pass.
master_df <- do.call(
  rbind,
  c(
    ## Zero-row seed: keeps the V1/fname layout even when flist is empty.
    list(data.frame(V1 = character(), fname = character())),
    lapply(flist, get_df)
  )
)
## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './002_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './002_Q4.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './028_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './028_Q2.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './028_Q4.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './045_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './045_Q5.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './068_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './068_Q2.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './068_Q4.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './111_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './111_Q2.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './111_Q5.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './133_Q2.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './133_Q3.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './133_Q4.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './189_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './189_Q5.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './470_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './500_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './500_Q2.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './500_Q4.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './500_Q5.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './574_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './574_Q2.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './574_Q5.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './603_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './603_Q2.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './603_Q4.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './643_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './647_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './706_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './706_Q3.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './706_Q4.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './706_Q5.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './711_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './711_Q2.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './711_Q5.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './809_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './809_Q2.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './809_Q4.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './809_Q5.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './848_Q2.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './848_Q5.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './865_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './865_Q4.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './887_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './887_Q2.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './887_Q3.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './929_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './929_Q2.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './957_Q1.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './957_Q4.txt'

## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on './957_Q5.txt'
head(master_df) 
##                                                                  V1
## 1                                                       De Sevilla.
## 2                                                               Sí.
## 3 Opino que cuanto más variedad lingüística haya, hay más calidad. 
## 4                      Yo creo que todo lo que venga siempre deja. 
## 5                                Influye más cultura, más belleza. 
## 6                                          Más engrandece el país. 
##          fname
## 1 ./002_Q1.txt
## 2 ./002_Q1.txt
## 3 ./002_Q2.txt
## 4 ./002_Q2.txt
## 5 ./002_Q2.txt
## 6 ./002_Q2.txt
tail(master_df)
##                                                                                                                                                        V1
## 603 Y luego después fuera de España también exagera porque yo he visto por ejemplo películas donde aparecen andaluces y la verdad es que no me gusta, eh?
## 604                                                                                                                             Porque lo exageran mucho.
## 605                                                                                                                                           Permanecen.
## 606                                                                      Yo quiero decir que siempre van a permanecer aunque la gente ahora es más culta.
## 607                                                                                                                                 Intenta hablar mejor.
## 608                                                                              Pero al mismo tiempo hay como sentirse orgulloso de tu manera de hablar.
##            fname
## 603 ./957_Q4.txt
## 604 ./957_Q4.txt
## 605 ./957_Q5.txt
## 606 ./957_Q5.txt
## 607 ./957_Q5.txt
## 608 ./957_Q5.txt
## Rename the text column to "line", then derive the question and participant
## codes from the file name (e.g. "./002_Q1.txt" -> question "Q1", participant "002").
## The patterns run on basename(fname) so that digits in a directory name
## (the file list is built recursively) cannot be mistaken for the participant code.
master_df <- master_df %>%
  rename(line = V1) %>%
  mutate(
    ## Get question info: "Q" followed by one digit
    question = str_extract(basename(fname), "Q\\d"),
    ## Get participant info: first run of digits in the file name
    participant = str_extract(basename(fname), "\\d+")
  )
head(master_df)
##                                                                line
## 1                                                       De Sevilla.
## 2                                                               Sí.
## 3 Opino que cuanto más variedad lingüística haya, hay más calidad. 
## 4                      Yo creo que todo lo que venga siempre deja. 
## 5                                Influye más cultura, más belleza. 
## 6                                          Más engrandece el país. 
##          fname question participant
## 1 ./002_Q1.txt       Q1         002
## 2 ./002_Q1.txt       Q1         002
## 3 ./002_Q2.txt       Q2         002
## 4 ./002_Q2.txt       Q2         002
## 5 ./002_Q2.txt       Q2         002
## 6 ./002_Q2.txt       Q2         002
tail(master_df)
##                                                                                                                                                      line
## 603 Y luego después fuera de España también exagera porque yo he visto por ejemplo películas donde aparecen andaluces y la verdad es que no me gusta, eh?
## 604                                                                                                                             Porque lo exageran mucho.
## 605                                                                                                                                           Permanecen.
## 606                                                                      Yo quiero decir que siempre van a permanecer aunque la gente ahora es más culta.
## 607                                                                                                                                 Intenta hablar mejor.
## 608                                                                              Pero al mismo tiempo hay como sentirse orgulloso de tu manera de hablar.
##            fname question participant
## 603 ./957_Q4.txt       Q4         957
## 604 ./957_Q4.txt       Q4         957
## 605 ./957_Q5.txt       Q5         957
## 606 ./957_Q5.txt       Q5         957
## 607 ./957_Q5.txt       Q5         957
## 608 ./957_Q5.txt       Q5         957
##STOP HERE FOR ANALYSIS OF INDIVIDUAL PHRASES, CONTINUE TO SPLIT WORDS

## Number each transcribed line so every token can be traced back to its line
master_df <- ungroup(mutate(master_df, line.number = row_number()))

## Split each line into one word token per row
master_df <- unnest_tokens(master_df, word, line)

## Put the columns in the order used for the frequency analyses
master_df <- select(master_df, word, line.number, fname, participant, question)
head(master_df)
##      word line.number        fname participant question
## 1      de           1 ./002_Q1.txt         002       Q1
## 2 sevilla           1 ./002_Q1.txt         002       Q1
## 3      sí           2 ./002_Q1.txt         002       Q1
## 4   opino           3 ./002_Q2.txt         002       Q2
## 5     que           3 ./002_Q2.txt         002       Q2
## 6  cuanto           3 ./002_Q2.txt         002       Q2
tail(master_df)
##           word line.number        fname participant question
## 6182 orgulloso         608 ./957_Q5.txt         957       Q5
## 6183        de         608 ./957_Q5.txt         957       Q5
## 6184        tu         608 ./957_Q5.txt         957       Q5
## 6185    manera         608 ./957_Q5.txt         957       Q5
## 6186        de         608 ./957_Q5.txt         957       Q5
## 6187    hablar         608 ./957_Q5.txt         957       Q5
##Frequency Analysis
## load 'stop word' library
library(tm)
## Overall word frequencies, stop words still included
## Only head() is printed because the full data set cannot be shared
head(count(master_df, word, sort = TRUE))
##   word   n
## 1  que 335
## 2   de 211
## 3   no 164
## 4   el 155
## 5    y 152
## 6   sí 147
##looks like the most frequent are all stop words!
## Set the repr.plot.* size options
options(repr.plot.width = 12, repr.plot.height = 8)
## Bar chart of every word that occurs more than 25 times, most frequent on top
master_df %>%
  count(word, sort = TRUE) %>%
  filter(n > 25) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word)) +
  geom_col() +
  ylab(NULL)

##separate data frames into questions
## I could probably have turned this into a function, but decided not to so that the filters can be adjusted for each question if necessary. For example, some questions had fewer tokens, so the frequency threshold could be lower. 
## Q1: keep the tokens from question 1, then preview the most frequent words
master_df_Q1 <- master_df %>%
  filter(question == "Q1") %>%
  select(word, line.number, fname, participant, question)
head(count(master_df_Q1, word, sort = TRUE))
##      word  n
## 1      de 28
## 2 sevilla 28
## 3      sí 18
## 4 siempre 12
## 5 capital 11
## 6      he 11
## Q1 bar chart: words that occur more than 5 times
master_df_Q1 %>%
  count(word, sort = TRUE) %>%
  filter(n > 5) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word)) +
  geom_col() +
  ylab(NULL)

## Q2: keep the tokens from question 2, then preview the most frequent words
master_df_Q2 <- master_df %>%
  filter(question == "Q2") %>%
  select(word, line.number, fname, participant, question)
head(count(master_df_Q2, word, sort = TRUE))
##   word  n
## 1  que 75
## 2   el 37
## 3   es 33
## 4    y 29
## 5   en 26
## 6   de 25
## Q2 bar chart: words that occur more than 6 times
master_df_Q2 %>%
  count(word, sort = TRUE) %>%
  filter(n > 6) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word)) +
  geom_col() +
  ylab(NULL)

## Q3: keep the tokens from question 3, then preview the most frequent words
master_df_Q3 <- master_df %>%
  filter(question == "Q3") %>%
  select(word, line.number, fname, participant, question)
head(count(master_df_Q3, word, sort = TRUE))
##   word  n
## 1  que 87
## 2   sí 73
## 3   de 48
## 4   el 47
## 5   en 44
## 6    y 42
## Q3 bar chart: words that occur more than 8 times
master_df_Q3 %>%
  count(word, sort = TRUE) %>%
  filter(n > 8) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word)) +
  geom_col() +
  ylab(NULL)

## Q4: keep the tokens from question 4, then preview the most frequent words
master_df_Q4 <- master_df %>%
  filter(question == "Q4") %>%
  select(word, line.number, fname, participant, question)
head(count(master_df_Q4, word, sort = TRUE))
##   word  n
## 1  que 73
## 2   no 57
## 3   de 52
## 4   en 34
## 5    y 32
## 6   es 31
## Q4 bar chart: words that occur more than 7 times
master_df_Q4 %>%
  count(word, sort = TRUE) %>%
  filter(n > 7) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word)) +
  geom_col() +
  ylab(NULL)

## Q5: keep the tokens from question 5, then preview the most frequent words
master_df_Q5 <- master_df %>%
  filter(question == "Q5") %>%
  select(word, line.number, fname, participant, question)
head(count(master_df_Q5, word, sort = TRUE))
##   word  n
## 1  que 98
## 2   de 58
## 3   no 47
## 4    y 45
## 5   el 41
## 6 creo 29
## Q5 bar chart: words that occur more than 7 times
master_df_Q5 %>%
  count(word, sort = TRUE) %>%
  filter(n > 7) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word)) +
  geom_col() +
  ylab(NULL)

##Look at Spanish stop words list
list(stopwords(kind = "es"))
## [[1]]
##   [1] "de"           "la"           "que"          "el"           "en"          
##   [6] "y"            "a"            "los"          "del"          "se"          
##  [11] "las"          "por"          "un"           "para"         "con"         
##  [16] "no"           "una"          "su"           "al"           "lo"          
##  [21] "como"         "más"          "pero"         "sus"          "le"          
##  [26] "ya"           "o"            "este"         "sí"           "porque"      
##  [31] "esta"         "entre"        "cuando"       "muy"          "sin"         
##  [36] "sobre"        "también"      "me"           "hasta"        "hay"         
##  [41] "donde"        "quien"        "desde"        "todo"         "nos"         
##  [46] "durante"      "todos"        "uno"          "les"          "ni"          
##  [51] "contra"       "otros"        "ese"          "eso"          "ante"        
##  [56] "ellos"        "e"            "esto"         "mí"           "antes"       
##  [61] "algunos"      "qué"          "unos"         "yo"           "otro"        
##  [66] "otras"        "otra"         "él"           "tanto"        "esa"         
##  [71] "estos"        "mucho"        "quienes"      "nada"         "muchos"      
##  [76] "cual"         "poco"         "ella"         "estar"        "estas"       
##  [81] "algunas"      "algo"         "nosotros"     "mi"           "mis"         
##  [86] "tú"           "te"           "ti"           "tu"           "tus"         
##  [91] "ellas"        "nosotras"     "vosotros"     "vosotras"     "os"          
##  [96] "mío"          "mía"          "míos"         "mías"         "tuyo"        
## [101] "tuya"         "tuyos"        "tuyas"        "suyo"         "suya"        
## [106] "suyos"        "suyas"        "nuestro"      "nuestra"      "nuestros"    
## [111] "nuestras"     "vuestro"      "vuestra"      "vuestros"     "vuestras"    
## [116] "esos"         "esas"         "estoy"        "estás"        "está"        
## [121] "estamos"      "estáis"       "están"        "esté"         "estés"       
## [126] "estemos"      "estéis"       "estén"        "estaré"       "estarás"     
## [131] "estará"       "estaremos"    "estaréis"     "estarán"      "estaría"     
## [136] "estarías"     "estaríamos"   "estaríais"    "estarían"     "estaba"      
## [141] "estabas"      "estábamos"    "estabais"     "estaban"      "estuve"      
## [146] "estuviste"    "estuvo"       "estuvimos"    "estuvisteis"  "estuvieron"  
## [151] "estuviera"    "estuvieras"   "estuviéramos" "estuvierais"  "estuvieran"  
## [156] "estuviese"    "estuvieses"   "estuviésemos" "estuvieseis"  "estuviesen"  
## [161] "estando"      "estado"       "estada"       "estados"      "estadas"     
## [166] "estad"        "he"           "has"          "ha"           "hemos"       
## [171] "habéis"       "han"          "haya"         "hayas"        "hayamos"     
## [176] "hayáis"       "hayan"        "habré"        "habrás"       "habrá"       
## [181] "habremos"     "habréis"      "habrán"       "habría"       "habrías"     
## [186] "habríamos"    "habríais"     "habrían"      "había"        "habías"      
## [191] "habíamos"     "habíais"      "habían"       "hube"         "hubiste"     
## [196] "hubo"         "hubimos"      "hubisteis"    "hubieron"     "hubiera"     
## [201] "hubieras"     "hubiéramos"   "hubierais"    "hubieran"     "hubiese"     
## [206] "hubieses"     "hubiésemos"   "hubieseis"    "hubiesen"     "habiendo"    
## [211] "habido"       "habida"       "habidos"      "habidas"      "soy"         
## [216] "eres"         "es"           "somos"        "sois"         "son"         
## [221] "sea"          "seas"         "seamos"       "seáis"        "sean"        
## [226] "seré"         "serás"        "será"         "seremos"      "seréis"      
## [231] "serán"        "sería"        "serías"       "seríamos"     "seríais"     
## [236] "serían"       "era"          "eras"         "éramos"       "erais"       
## [241] "eran"         "fui"          "fuiste"       "fue"          "fuimos"      
## [246] "fuisteis"     "fueron"       "fuera"        "fueras"       "fuéramos"    
## [251] "fuerais"      "fueran"       "fuese"        "fueses"       "fuésemos"    
## [256] "fueseis"      "fuesen"       "siendo"       "sido"         "tengo"       
## [261] "tienes"       "tiene"        "tenemos"      "tenéis"       "tienen"      
## [266] "tenga"        "tengas"       "tengamos"     "tengáis"      "tengan"      
## [271] "tendré"       "tendrás"      "tendrá"       "tendremos"    "tendréis"    
## [276] "tendrán"      "tendría"      "tendrías"     "tendríamos"   "tendríais"   
## [281] "tendrían"     "tenía"        "tenías"       "teníamos"     "teníais"     
## [286] "tenían"       "tuve"         "tuviste"      "tuvo"         "tuvimos"     
## [291] "tuvisteis"    "tuvieron"     "tuviera"      "tuvieras"     "tuviéramos"  
## [296] "tuvierais"    "tuvieran"     "tuviese"      "tuvieses"     "tuviésemos"  
## [301] "tuvieseis"    "tuviesen"     "teniendo"     "tenido"       "tenida"      
## [306] "tenidos"      "tenidas"      "tened"
## Build the stop word table: the existing stop_words rows plus tm's Spanish
## list, with the Spanish entries tagged lexicon = "custom"
custom_stop_words <- stop_words %>%
  bind_rows(tibble(word = tm::stopwords("spanish"), lexicon = "custom"))
## Additional words to drop, tagged "custom" as well
word <- c(
  "entonces", "pues", "si", "bueno", "así", "bien", "cada", "risa", "toda", "mucho",
  "mucha", "va", "van", "muchas", "además", "eh", "da", "", "casi", "todas"
)
lexicon <- rep("custom", times = length(word))
new_stop_words <- data.frame(word = word, lexicon = lexicon)
custom_stop_words <- custom_stop_words %>% bind_rows(new_stop_words)

## Remove stop words from master_df and save the result to master_df_nostop.
## Only the stop word rows tagged "custom" are used for the removal.
## `by = "word"` names the join key explicitly instead of leaving dplyr to pick
## the shared column and print a "Joining, by" message.
master_df_nostop <- master_df %>%
  anti_join(
    custom_stop_words %>% filter(lexicon == "custom"),
    by = "word"
  )
## Joining, by = "word"
## Most frequent words once the stop words are gone
head(count(master_df_nostop, word, sort = TRUE))
##      word  n
## 1  acento 66
## 2 andaluz 61
## 3    creo 58
## 4 siempre 53
## 5  españa 33
## 6   gente 29
## Bar chart of the stop-word-free data: words that occur more than 10 times
master_df_nostop %>%
  count(word, sort = TRUE) %>%
  filter(n > 10) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word)) +
  geom_col(colour = "black") +
  ylab(NULL) +
  theme(text = element_text(size = 12))

## Per-question subsets of the stop-word-free data
## Q1
master_df_Q1_nostop <- master_df_nostop %>%
  filter(question == "Q1") %>%
  select(word, line.number, fname, participant, question)
head(count(master_df_Q1_nostop, word, sort = TRUE))
##      word  n
## 1 sevilla 28
## 2 siempre 12
## 3 capital 11
## 4  vivido  8
## 5    aquí  6
## 6    años  5
## Q1 without stop words: words that occur more than 2 times
master_df_Q1_nostop %>%
  count(word, sort = TRUE) %>%
  filter(n > 2) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word)) +
  geom_col() +
  ylab(NULL)

## Per-question subsets of the stop-word-free data
## Q2
master_df_Q2_nostop <- master_df_nostop %>%
  filter(question == "Q2") %>%
  select(word, line.number, fname, participant, question)
head(count(master_df_Q2_nostop, word, sort = TRUE))
##        word  n
## 1  variedad 19
## 2      creo 12
## 3   catalán  9
## 4 dialectos  9
## 5   idiomas  9
## 6    lengua  8
## Q2 without stop words: words that occur more than 4 times, one fill colour per word
master_df_Q2_nostop %>%
  count(word, sort = TRUE) %>%
  filter(n > 4) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word, fill = word)) +
  geom_col(colour = "black") +
  ylab(NULL) +
  theme(text = element_text(size = 12))

## Per-question subsets of the stop-word-free data
## Q3
master_df_Q3_nostop <- master_df_nostop %>%
  filter(question == "Q3") %>%
  select(word, line.number, fname, participant, question)
head(count(master_df_Q3_nostop, word, sort = TRUE))
##      word  n
## 1  acento 20
## 2  españa 18
## 3  madrid 16
## 4 andaluz 13
## 5 siempre 13
## 6 galicia 12
## Q3 without stop words: words that occur more than 4 times, one fill colour per word
master_df_Q3_nostop %>%
  count(word, sort = TRUE) %>%
  filter(n > 4) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word, fill = word)) +
  geom_col(colour = "black") +
  ylab(NULL) +
  theme(text = element_text(size = 12))

## Per-question subsets of the stop-word-free data
## Q4
master_df_Q4_nostop <- master_df_nostop %>%
  filter(question == "Q4") %>%
  select(word, line.number, fname, participant, question)
head(count(master_df_Q4_nostop, word, sort = TRUE))
##             word  n
## 1        andaluz 28
## 2         acento 24
## 3        siempre 14
## 4      películas 13
## 5           creo 12
## 6 representación 12
## Q4 without stop words: words that occur more than 4 times, one fill colour per word
master_df_Q4_nostop %>%
  count(word, sort = TRUE) %>%
  filter(n > 4) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word, fill = word)) +
  geom_col(colour = "black") +
  ylab(NULL) +
  theme(text = element_text(size = 12))

## Per-question subsets of the stop-word-free data
## Q5
master_df_Q5_nostop <- master_df_nostop %>%
  filter(question == "Q5") %>%
  select(word, line.number, fname, participant, question)
head(count(master_df_Q5_nostop, word, sort = TRUE))
##         word  n
## 1       creo 29
## 2     acento 20
## 3    andaluz 15
## 4  cambiando 13
## 5      gente 10
## 6 permanecen  9
## Q5 without stop words: words that occur more than 4 times, one fill colour per word
master_df_Q5_nostop %>%
  count(word, sort = TRUE) %>%
  filter(n > 4) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word, fill = word)) +
  geom_col(colour = "black") +
  ylab(NULL) +
  theme(text = element_text(size = 12))

library(wordcloud)
## Loading required package: RColorBrewer
library(dplyr)
## Word cloud of the stop-word-free tokens that occur more than 8 times.
## A fixed seed makes wordcloud's random word placement and rotation repeatable,
## so the figure looks the same on every run.
## The count filter no longer compares `word` with the string "master_df_nostop":
## that is the data frame's own name, so the comparison was effectively a no-op.
set.seed(1234)
master_df_nostop %>%
  count(word, sort = TRUE) %>%
  dplyr::filter(n > 8) %>%
  with(wordcloud::wordcloud(words = word,
                            freq = n,
                            max.words = 300,
                            random.order = FALSE,
                            rot.per = 0.3,
                            colors = brewer.pal(8, "Dark2")))

sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Mojave 10.14.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] wordcloud_2.6      RColorBrewer_1.1-2 tm_0.7-8           NLP_0.2-1         
##  [5] forcats_0.5.1      stringr_1.4.0      dplyr_1.0.7        purrr_0.3.4       
##  [9] readr_2.0.2        tidyr_1.1.3        tibble_3.1.4       ggplot2_3.3.5     
## [13] tidyverse_1.3.1    tidytext_0.3.2    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.7        lubridate_1.7.10  lattice_0.20-45   assertthat_0.2.1 
##  [5] digest_0.6.27     utf8_1.2.2        slam_0.1-48       R6_2.5.1         
##  [9] cellranger_1.1.0  backports_1.2.1   reprex_2.0.1      evaluate_0.14    
## [13] highr_0.9         httr_1.4.2        pillar_1.6.2      rlang_0.4.11     
## [17] readxl_1.3.1      rstudioapi_0.13   Matrix_1.3-4      rmarkdown_2.10   
## [21] labeling_0.4.2    munsell_0.5.0     broom_0.7.9       compiler_4.1.1   
## [25] janeaustenr_0.1.5 modelr_0.1.8      xfun_0.25         pkgconfig_2.0.3  
## [29] htmltools_0.5.2   tidyselect_1.1.1  fansi_0.5.0       crayon_1.4.1     
## [33] tzdb_0.1.2        dbplyr_2.1.1      withr_2.4.2       SnowballC_0.7.0  
## [37] grid_4.1.1        jsonlite_1.7.2    gtable_0.3.0      lifecycle_1.0.0  
## [41] DBI_1.1.1         magrittr_2.0.1    scales_1.1.1      tokenizers_0.2.1 
## [45] cli_3.0.1         stringi_1.7.4     farver_2.1.0      fs_1.5.0         
## [49] xml2_1.3.2        ellipsis_0.3.2    generics_0.1.0    vctrs_0.3.8      
## [53] tools_4.1.1       glue_1.4.2        hms_1.1.0         parallel_4.1.1   
## [57] fastmap_1.1.0     yaml_2.2.1        colorspace_2.0-2  rvest_1.0.1      
## [61] knitr_1.33        haven_2.4.3