-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathLecture9.R
81 lines (62 loc) · 2.03 KB
/
Lecture9.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
## Advanced Scraping Function
# Pay attention to memory
library(dplyr); library(tidyr); library(tibble);
library(tidytext); library(rvest); library(purrr);
library(tictoc)
#' Count frequent, stop-word-free bigrams in the paragraphs of a web page
#'
#' Reads the page at `url`, takes the text of every `<p>` element, splits it
#' into bigrams, drops every bigram in which either word is listed in
#' `stop_words$word`, and counts the remaining bigrams.
#'
#' Errors raised while reading or parsing the page are not caught here; they
#' propagate to the caller.
#'
#' @param url Passed unchanged to `read_html()`.
#' @param min_count Smallest count a bigram needs in order to be kept.
#'   Defaults to 5, the threshold that was previously hard-coded.
#' @return A tibble with the columns `bigram` and `n`, sorted by decreasing
#'   `n` and restricted to rows with `n >= min_count`.
get_ngramms <- function(url, min_count = 5) {
  stopifnot(is.numeric(min_count), length(min_count) == 1L)

  paragraphs <- read_html(url) %>%
    html_elements("p") %>%
    html_text()

  paragraphs %>%
    enframe(name = "line", value = "text") %>%
    drop_na() %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    # NOTE(review): presumably paragraphs too short to form a bigram come
    # back as NA from unnest_tokens() -- confirm against tidytext docs.
    filter(!is.na(bigram)) %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(
      !word1 %in% stop_words$word,
      !word2 %in% stop_words$word
    ) %>%
    count(word1, word2, sort = TRUE) %>%
    unite(bigram, word1, word2, sep = " ") %>%
    filter(n >= min_count)
}
# Expected to succeed (needs network access): a real Wikipedia page.
get_ngramms("https://en.wikipedia.org/wiki/United_States")
# Throws an error: "abc" cannot be read as a page.
get_ngramms("abc")
# When scraping thousands of pages, one failing page must not stop the run.
# Simple Solutions
# Wrapper Option with try and silent = TRUE ----
url <- "xyz"
# With silent = TRUE the error message is not printed; on failure `res` is an
# object of class "try-error" instead of a tibble.
res <- try(get_ngramms(url), silent = TRUE)
#' Run `get_ngramms()` without letting a failure stop the caller
#'
#' @param url Passed unchanged to `get_ngramms()`.
#' @return The result of `get_ngramms(url)`. If that call raised an error, a
#'   warning naming `url` is emitted and an empty `tibble()` is returned
#'   instead.
get_ngramms_silent <- function(url) {
  res <- try(get_ngramms(url), silent = TRUE)

  # inherits() always yields a single TRUE/FALSE. Comparing with
  # `class(res) == "try-error"` gives a vector with more than one element
  # when `res` is a tibble, and `if` rejects such a condition in R >= 4.2.
  if (inherits(res, "try-error")) {
    warning(paste('Incorrect URL:', url)) # Warning will show in console
    return(tibble())
  }

  res
}
# Demo of the wrapper on an unreadable address: expected to produce a warning
# and an empty tibble rather than stopping the script.
result <- get_ngramms_silent(url = "xyzz")
# Saving Intermediate Results ----
#' Run `get_ngramms()` safely and save each successful result to disk
#'
#' On success the result is written with `save()` to
#' `paste0(save_path, <name>, '.Rdata')`, where `<name>` is `url` with every
#' "/" and ":" replaced by "_" and the English-Wikipedia prefix removed.
#' The directory part of that file name is created if it does not exist.
#'
#' @param url Passed unchanged to `get_ngramms()`.
#' @param save_path Prefix pasted verbatim in front of the file name, so a
#'   directory has to end with a path separator.
#' @return The result of `get_ngramms(url)`. If that call raised an error, a
#'   warning naming `url` is emitted, nothing is saved, and an empty
#'   `tibble()` is returned instead.
get_ngramms_silent <- function(url, save_path = 'output/') {
  res <- try(get_ngramms(url), silent = TRUE)

  # inherits() yields a single TRUE/FALSE regardless of how many classes
  # `res` carries, so no separate length check is needed.
  if (inherits(res, "try-error")) {
    warning(paste('Incorrect URL:', url)) # Warning will show in console
    return(tibble())
  }

  # Turn the URL into a usable file name.
  url2 <- gsub('[/:]', '_', url)
  # fixed = TRUE: match the prefix literally; the dots are not wildcards.
  url2 <- gsub('https___en.wikipedia.org_wiki_', '', url2, fixed = TRUE)

  out_file <- paste0(save_path, url2, '.Rdata')

  # save() is outside the try() above, so a missing target directory would
  # abort the whole run; create it on demand instead.
  out_dir <- dirname(out_file)
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }

  # The object is stored under the name `res`, which is the name load()
  # will restore it as.
  save(res, file = out_file)
  res
}
# Create the folder for intermediate results only when it is missing, so that
# re-running the script does not warn about an existing directory.
if (!dir.exists("output")) {
  dir.create("output")
}

# Unreadable address: expected to give a warning and an empty tibble.
result <- get_ngramms_silent("xyzz")

# Real page: the result is returned and also saved under output/.
url3 <- "https://en.wikipedia.org/wiki/United_States"
result <- get_ngramms_silent(url3)

# The first column of the CSV is used as the vector of page addresses.
# stringsAsFactors = FALSE keeps that column as character on R < 4.0 as well.
all_countries <- read.csv("country_links.csv", stringsAsFactors = FALSE) %>%
  pull(1)

# Time the full run; res_map holds one element per address in all_countries.
tic()
res_map <- purrr::map(.x = all_countries, .f = get_ngramms_silent)
toc()