-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01_customize.R
57 lines (40 loc) · 1.5 KB
/
01_customize.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# Load dependencies and input data.
library(dplyr)
# Project-local script; presumably defines generate_tdm_model() used below
# -- TODO confirm.
source("generate_tdm.R")
# Query table; the rest of this script reads its `TABLES` and `query` columns.
queries <- readRDS("cleaned_queries.rds")
# Split raw table-list strings into individual table names.
#
# Strips newlines, tabs, spaces, square brackets and double quotes from
# `tblnames`, then splits what is left on commas.
#
# tblnames: character vector of comma-separated table lists.
# Returns a flat character vector holding the pieces from every element of
# `tblnames`, in order.
clean_tbl_names <- function(tblnames) {
  # The tab alternative used to be "tab followed by a space", so a tab next to
  # any other character was left embedded in the table name. Each whitespace
  # character is now matched on its own.
  stripped <- gsub(
    pattern = "\n|\t| |\\[|\\]|\"",
    replacement = "",
    x = tblnames
  )
  unlist(strsplit(stripped, ","))
}
# Filter out queries that use extremely rare tables,
# which may be deprecated or erroneous.
# Parse each query's table list once and reuse it for counting and filtering.
query_tables <- lapply(queries$TABLES, clean_tbl_names)
tbl_list <- as.data.frame(table(unlist(query_tables)))
# A table must be used at least 5 times to be kept
cuttbls <- tbl_list$Var1[tbl_list$Freq < 5]
# Flag a query when it references any rare table. `any()` is used rather than
# `mean(...) > 0` because the mean of an empty vector is NaN: with no rare
# tables every flag became NA and the subset below returned all-NA rows.
cutrows <- vapply(
  query_tables,
  function(x) any(cuttbls %in% x),
  logical(1)
)
queries <- queries[!cutrows, ]
# Build the term-document matrix over every remaining query
total_tdm <- generate_tdm_model(queries$query, NULL, TRUE, TRUE, TRUE)

# Per-document totals and means (taken over the matrix columns)
docfreq <- data.frame(
  docs = Docs(total_tdm),
  n = col_sums(total_tdm),
  m = col_means(total_tdm)
)

# Per-term totals and means (taken over the matrix rows)
termfreq <- data.frame(
  term = Terms(total_tdm),
  n = row_sums(total_tdm),
  m = row_means(total_tdm),
  row.names = NULL
)
# A term needs to appear 5+ times;
# a query needs to have at least 10 terms
select_tdm <- total_tdm[termfreq$n >= 5, docfreq$n >= 10]
# Slight cleanup to remove terms that lost their only docs.
# The removal is guarded: when no term has a zero row sum, `which()` returns
# integer(0), and indexing with `-integer(0)` selects zero rows, which would
# silently empty the whole matrix.
empty_terms <- which(row_sums(select_tdm) == 0)
if (length(empty_terms) > 0) {
  select_tdm <- select_tdm[-empty_terms, ]
}
# Only these queries got included.
# Have to use as.numeric to turn their Doc "names" into row indices for
# filtering; this presumes the names are row positions in `queries` as
# assigned by generate_tdm_model() -- TODO confirm.
querytbl <- queries[as.numeric(select_tdm$dimnames$Docs), ]
saveRDS(querytbl, "select_querytbl.rds")
saveRDS(select_tdm, "select_tdm_model.rds")