-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFUN_XML_to_df.R
174 lines (132 loc) · 8.06 KB
/
FUN_XML_to_df.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#' Convert parsed XML citation files into an article table and word tables
#'
#' Parses every file in `input.datapath` with `xmlParse()` / `xmlToList()`,
#' tags each top-level record with the file it came from and its position in
#' that file, and then builds:
#'   * one row per record with identifiers, title, abstract text and simple
#'     length statistics, and
#'   * word-frequency tables over the concatenation of all abstracts.
#'
#' The element names read here (`MedlineCitation`, `Article`, `Abstract`,
#' `AbstractText`, `CopyrightInformation`, `PMID`, ...) suggest PubMed export
#' files -- NOTE(review): confirm against the caller.
#'
#' @param input.datapath Character vector of XML file paths. An empty vector
#'   yields empty result tables.
#' @return A named list with seven elements:
#'   * `XML.df`: data frame with columns `NO.`, `PMID`, `PubYear`, `Title`,
#'     `Abstract`, `CHAR`, `WORD`, `SENT`, `Search Word`, `FileNo`, `LitNo`.
#'     Records without an abstract get `0` in columns 5-9; records with an
#'     abstract get the literal string `"NA"` in `Search Word`.
#'   * `Abs.All_df.Word` / `Abs.All_df.Word.C`: all tokens and their counts.
#'   * `Abs.All_df.Word.Stem` / `Abs.All_df.Word.Stem.C`: tokens with their
#'     `wordStem()` stem, and stem counts.
#'   * `Abs.All_df.Word.Stem.RmSW` / `Abs.All_df.Word.Stem.RmSW.C`: the same
#'     after removing the words returned by `get_stopwords()`.
XML_to_df <- function(input.datapath) {
  # ---- Local helpers ----

  # Missing XML nodes come back as NULL; store them as an empty string.
  or_blank <- function(value) {
    if (length(value) == 0) "" else value
  }

  # Text of one abstract section. A section is either a plain string or a
  # list; for a list, children named "i"/"b" are relabelled "text" first.
  # NOTE(review): `[["text"]]` returns only the FIRST element named "text",
  # so later text fragments of a mixed-content section are not included.
  # Confirm whether that truncation is intended.
  section_text <- function(section) {
    if (is.character(section)) {
      return(section)
    }
    names(section)[names(section) %in% c("i", "b")] <- "text"
    str_c(as.character(section[["text"]]), collapse = " ")
  }

  # Order a count table (token column first, then `n`) by descending count
  # and freeze that order into the factor levels of the token column.
  rank_by_count <- function(counts) {
    counts <- counts[order(counts$n, decreasing = TRUE), ]
    counts[[1]] <- factor(counts[[1]], levels = counts[[1]])
    counts
  }

  ##### XML to df #####
  XML.df <- data.frame(matrix(nrow = 0, ncol = 11))
  colnames(XML.df) <- c(
    "NO.", "PMID", "PubYear", "Title", "Abstract", "CHAR", "WORD", "SENT",
    "Search Word", "FileNo", "LitNo"
  )

  # ---- Parse every file and pool the records ----
  parsed <- vector("list", length(input.datapath))
  for (k in seq_along(input.datapath)) {
    records <- xmlParse(input.datapath[k], encoding = "UTF-8") %>%
      xmlToList()
    for (w in seq_along(records)) {
      names(records)[[w]] <- paste0("PubmedArticle", k, "-", w)
      # Assign by name so existing children of the record are never
      # overwritten, whatever their number.
      records[[w]][["FileNo"]] <- as.numeric(k) # Serial number of Files
      records[[w]][["LitNo"]] <- as.numeric(w) # Serial number of Literatures
    }
    parsed[k] <- list(records)
  }
  # The leading empty list keeps the result a list, even for zero files.
  xml.all <- do.call(c, c(list(list()), parsed))

  # ---- One table row per record ----
  abstract_texts <- vector("list", length(xml.all))
  for (i in seq_along(xml.all)) {
    article <- xml.all[[i]]
    citation <- article[["MedlineCitation"]]
    article_node <- citation[["Article"]]
    Abstract <- article_node[["Abstract"]]

    if (length(Abstract) == 0) {
      XML.df[i, 1] <- i
      XML.df[i, 2] <- paste0("PMID: ", citation[["PMID"]][["text"]])
      XML.df[i, 3] <- or_blank(
        article_node[["Journal"]][["JournalIssue"]][["PubDate"]][["Year"]]
      )
      XML.df[i, 4] <- or_blank(article_node[["ArticleTitle"]])
      XML.df[i, 5:9] <- 0
      XML.df[i, 10] <- article[["FileNo"]]
      XML.df[i, 11] <- article[["LitNo"]]
    } else {
      # Best effort: a record whose abstract cannot be processed is reported
      # and skipped instead of aborting the whole import.
      tryCatch(
        {
          if (length(Abstract) == 1) {
            Abstract.1P <- str_c(Abstract[["AbstractText"]], collapse = " ")
          } else if (length(Abstract[["CopyrightInformation"]]) == 1) {
            # Every element but the last is joined; the last one is taken to
            # be the copyright note -- NOTE(review): positional assumption.
            Abstract.1P <- ""
            for (section in Abstract[-length(Abstract)]) {
              Abstract.1P <- gsub(
                "^\\s", "",
                paste0(Abstract.1P, " ", section_text(section))
              )
            }
          } else {
            Abstract.1P <- ""
            for (section in Abstract) {
              Abstract.1P <- paste0(Abstract.1P, " ", section_text(section))
            }
            Abstract.1P <- gsub("^\\s", "", Abstract.1P)
          }
          # "=" is stripped only for the sentence count below.
          Abstract.1P2 <- gsub("=", "", Abstract.1P)

          # Fill the statistic result to df
          XML.df[i, 1] <- i
          XML.df[i, 2] <- paste0("PMID: ", citation[["PMID"]][["text"]])
          XML.df[i, 3] <- or_blank(
            article_node[["Journal"]][["JournalIssue"]][["PubDate"]][["Year"]]
          )
          XML.df[i, 4] <- or_blank(article_node[["ArticleTitle"]])
          XML.df[i, 5] <- paste0(Abstract.1P, collapse = "")
          XML.df[i, 6] <- sum(
            nchar(Abstract.1P, type = "chars", allowNA = TRUE, keepNA = NA)
          )
          # Word count = number of pieces when splitting on single spaces.
          XML.df[i, 7] <- lengths(str_split(Abstract.1P, " "))
          XML.df[i, 8] <- nsentence(Abstract.1P2)
          XML.df[i, 9] <- "NA"
          XML.df[i, 10] <- article[["FileNo"]]
          XML.df[i, 11] <- article[["LitNo"]]
          abstract_texts[i] <- list(Abstract.1P)
        },
        error = function(e) {
          warning(
            "XML_to_df: could not process the abstract of record ", i,
            ": ", conditionMessage(e),
            call. = FALSE
          )
        }
      )
    }
  }

  # ---- Word tables over all abstracts (built once, after the loop) ----
  # Same string as appending " <abstract>" to "" for each processed record.
  Abstract.All <- paste0(c("", unlist(abstract_texts)), collapse = " ")
  Abstract.All_df <- tibble(
    line = seq_along(Abstract.All),
    text = Abstract.All
  )
  Abstract.All_df.Word <- Abstract.All_df %>%
    unnest_tokens(word, text) %>%
    as.data.frame()

  ##### Stemming (Porter's algorithm)#####
  ## Original
  Abstract.All_df.Word.C <- rank_by_count(
    Abstract.All_df.Word %>% count(word, sort = TRUE)
  )
  ## Stemming
  Abstract.All_df.Word.Stem <- Abstract.All_df.Word %>%
    mutate(stem = wordStem(word))
  Abstract.All_df.Word.Stem.C <- rank_by_count(
    Abstract.All_df.Word.Stem %>% count(stem, sort = TRUE)
  )
  ## Remove the stop word
  Abstract.All_df.Word.Stem.RmSW <- Abstract.All_df.Word.Stem %>%
    anti_join(get_stopwords(), by = "word")
  Abstract.All_df.Word.Stem.RmSW.C <- rank_by_count(
    Abstract.All_df.Word.Stem.RmSW %>% count(stem, sort = TRUE)
  )

  # Put all result to output list
  list(
    "XML.df" = XML.df,
    "Abs.All_df.Word" = Abstract.All_df.Word,
    "Abs.All_df.Word.C" = Abstract.All_df.Word.C,
    "Abs.All_df.Word.Stem" = Abstract.All_df.Word.Stem,
    "Abs.All_df.Word.Stem.C" = Abstract.All_df.Word.Stem.C,
    "Abs.All_df.Word.Stem.RmSW" = Abstract.All_df.Word.Stem.RmSW,
    "Abs.All_df.Word.Stem.RmSW.C" = Abstract.All_df.Word.Stem.RmSW.C
  )
}