-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmaster_clean.do
251 lines (212 loc) · 6.74 KB
/
master_clean.do
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
/*
master_clean.do
Called by pmidlist_assemble.do, in its `master' section, from which this file
also inherits the local macro `Q'.
*/
* Enable -pause- so the pause statements further down actually halt execution
pause on

* Section switches (1 = run, 0 = skip). Each cleaning section below runs when
* its own switch is 1 or when `all' is 1.
local all = 1
local mesh = 0
local date = 0
local journals = 0
local pub_type = 0
* `grants' is not read by any section in the visible part of this file
local grants = 0
local affl = 0
*===============================================================================
* Split the data in memory into row-range chunks, one tempfile per chunk
* (`sub1', `sub2', ...), so that each chunk is cleaned separately below.
*   Q == "notQA" : 13 chunks of 100,000 rows
*   any other Q  :  6 chunks of  30,000 rows
* The last chunk always runs through the final observation (_N).
if "`Q'" == "notQA" {
	local rows_per_chunk = 100000
	local maxI = 13
}
else {
	local rows_per_chunk = 30000
	local maxI = 6
}
forval i = 1/`maxI' {
	* Row bounds of chunk `i'; computed while the full data are still in memory
	local start = (`i' - 1) * `rows_per_chunk' + 1
	local end = `i' * `rows_per_chunk'
	if `i' == `maxI' {
		local end = _N
	}
	preserve
	dis "`start' : `end'"
	keep if inrange(_n, `start', `end')
	tempfile sub`i'
	save `sub`i'', replace
	restore
}
*===============================================================================
*===============================================================================
forval I = 1/`maxI' {
use `sub`I'', clear
*===============================================================================
* Clean MeSH Terms Field
if `mesh' == 1 | `all' == 1 {
*===============================================================================
	* Flag records whose MeSH field is the literal placeholder "NA"
	gen mesh_na = mesh == "NA"
	tab mesh_na

	* One mesh# variable per "</MeshHeading>"-delimited piece; the original
	* text is kept as mesh_raw
	split mesh, p("</MeshHeading>")
	ren mesh mesh_raw

	* nterms = number of "<MeshHeading>" tags in each record.
	* NOTE(review): noccur() is not an official egen function; presumably it
	* comes from the egenmore package -- confirm it is installed.
	egen nterms = noccur(mesh_raw), string("<MeshHeading>")
	quietly summarize nterms, meanonly
	local max_nterms = r(max)

	* Reduce each piece to its descriptor text and blank it unless the first
	* MajorTopicYN attribute in the piece is something other than "N".
	* Offsets from the start of "MajorTopicYN=": +14 is the Y/N character,
	* +17 is the first character after the closing ">".
	* NOTE(review): split may create pieces numbered above `max_nterms' (text
	* after the last "</MeshHeading>"); this loop does not touch them.
	forval x = 1/`max_nterms' {
		tempvar attr_pos close_pos yn_flag
		gen `attr_pos' = strpos(mesh`x', "MajorTopicYN=")
		gen `close_pos' = strpos(mesh`x', "</DescriptorName>")
		gen `yn_flag' = substr(mesh`x', `attr_pos' + 14, 1)
		replace mesh`x' = substr(mesh`x', `attr_pos' + 17, `close_pos' - (`attr_pos' + 17))
		replace mesh`x' = "" if `yn_flag' == "N"
		drop `attr_pos' `close_pos' `yn_flag'
		compress mesh`x', nocoalesce
	}

	* Shift surviving terms left so the non-empty ones occupy the lowest
	* numbered variables. The second replace empties the source slot after a
	* move, and also empties any later slot that repeats the destination's term.
	local last_dst = `max_nterms' - 1
	forval dst = 1/`last_dst' {
		local first_src = `dst' + 1
		forval src = `first_src'/`max_nterms' {
			replace mesh`dst' = mesh`src' if mesh`dst' == "" & mesh`src' != ""
			replace mesh`src' = "" if mesh`src' == mesh`dst'
		}
		compress mesh`dst'
	}
	* Double check pmid 12285838, which supposedly has 26 major mesh topics
*===============================================================================
} // end `mesh'
*===============================================================================
*===============================================================================
* Clean Date Field
if `date' == 1 | `all' == 1 {
*===============================================================================
	* Keep the raw text; `date' is rebuilt below as a Stata daily date
	ren date date_raw

	* Read the 4 characters after the first "<Year>" and the 2 characters
	* after the first "<Month>" / "<Day>".
	* - The if-qualifier leaves the component missing when the tag is absent,
	*   instead of parsing whatever happens to sit at a fixed offset.
	* - real() yields missing for non-numeric text, so one malformed record
	*   gives a missing date rather than leaving the whole variable as a
	*   string (which would make mdy() fail for every record).
	gen y = real(substr(date_raw, strpos(date_raw, "<Year>") + 6, 4)) ///
		if strpos(date_raw, "<Year>") > 0
	gen m = real(substr(date_raw, strpos(date_raw, "<Month>") + 7, 2)) ///
		if strpos(date_raw, "<Month>") > 0
	gen d = real(substr(date_raw, strpos(date_raw, "<Day>") + 5, 2)) ///
		if strpos(date_raw, "<Day>") > 0

	* mdy() returns missing when any component is missing or out of range
	gen date = mdy(m, d, y)
	format date %td
	drop d m y
*===============================================================================
} // end `date'
*===============================================================================
*===============================================================================
* Clean Journals Field
if `journals' == 1 | `all' == 1 {
*===============================================================================
	* Flag records whose journal field is the literal placeholder "NA"
	gen journal_na = journal == "NA"
	tab journal_na
	ren journal journal_raw

	* Full title: text between the first "<Title>" and the first "</Title>".
	* Left empty when no "<Title>" tag is present.
	tempvar title_at
	gen `title_at' = strpos(journal_raw, "<Title>")
	gen journal = substr(journal_raw, `title_at' + 7, ///
		strpos(journal_raw, "</Title>") - (`title_at' + 7)) if `title_at' != 0
	drop `title_at'

	* Abbreviated title: same extraction for the <ISOAbbreviation> element
	tempvar abbr_at
	gen `abbr_at' = strpos(journal_raw, "<ISOAbbreviation>")
	gen journal_abbr = substr(journal_raw, `abbr_at' + 17, ///
		strpos(journal_raw, "</ISOAbbreviation>") - (`abbr_at' + 17)) if `abbr_at' != 0
	drop `abbr_at'
*===============================================================================
} // end `journals'
*===============================================================================
*===============================================================================
* Clean Publication Type Field
if `pub_type' == 1 | `all' == 1 {
*===============================================================================
	* Flag records whose publication-type field is the literal placeholder "NA"
	gen pt_na = pt == "NA"
	tab pt_na
	ren pt pt_raw

	* Text of the first <PublicationType> element. The +30 offset skips the
	* 20-character opening `<PublicationType UI=' plus 10 further characters
	* (presumably the quoted UI value and the closing ">" -- TODO confirm).
	tempvar text_at
	gen `text_at' = strpos(pt_raw, "<PublicationType UI=") + 30
	gen pub_type = substr(pt_raw, `text_at', ///
		strpos(pt_raw, "</PublicationType>") - `text_at')
	drop `text_at'

	* Sample restriction: every record whose first listed type is not one of
	* these is dropped from the data here
	keep if pub_type == "Journal Article" | pub_type == "Clinical Study" ///
		| substr(pub_type, 1, 14) == "Clinical Trial"

	* Discard anything up to and including the first ">" left in the text;
	* values without a ">" are unchanged
	replace pub_type = substr(pub_type, strpos(pub_type, ">") + 1, .)
*===============================================================================
} // end `pub_type'
*===============================================================================
*===============================================================================
* Clean Affiliation Field
if `affl' == 1 | `all' == 1 {
*===============================================================================
	* Keep an untouched copy of the affiliation text before cleaning
	gen affl_raw = affil
	* The included do-file works on a variable named affl
	ren affil affl
	* Path is quoted so the include still resolves when $repo contains spaces.
	* -include- runs the file in this file's scope, so it sees the local
	* macros defined above.
	include "$repo/pmid_authaffl_clean.do"
*===============================================================================
} // end `affl'
*===============================================================================
save clean_p`I', replace
*pause
} // end looping through temp files
*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
* Re-assemble the cleaned chunks into one dataset.
* `maxI' is the number of chunks created when the data were split above
* (13 when Q is "notQA", 6 for any other value of Q). Looping to `maxI'
* keeps this step in line with the split for every value of Q; matching on
* specific Q strings here would silently leave only chunk 1 in memory for a
* Q that is neither "notQA" nor "oldqa".
use clean_p1, clear
forval I = 2/`maxI' {
	append using clean_p`I'
}
*br
pause
*===============================================================================
* Saving clean version
*===============================================================================
drop *_raw
order pmid date pub_type pt_na journal journal_abbr journal_na ///
	affl country state_name state_abbr city zip cbsacode alt_cbsacode _merge ///
	nterms mesh*

* Highest-numbered mesh# variable actually present. mesh* also matches
* mesh_na, whose suffix is not numeric and is therefore skipped. Deriving the
* count from the data avoids a "variable not found" error when fewer than 50
* term variables exist and avoids leaving higher-numbered ones untouched
* when more exist.
local n_mesh = 0
foreach v of varlist mesh* {
	local idx = real(substr("`v'", 5, .))
	if `idx' < . {
		local n_mesh = max(`n_mesh', `idx')
	}
}

* Shift non-empty terms into the lowest-numbered variables; the second
* replace empties the source slot after a move and also empties any later
* slot that repeats the destination's term.
forval i = 1/`n_mesh' {
	local j = `i' + 1
	forval k = `j'/`n_mesh' {
		replace mesh`i' = mesh`k' if mesh`i' == "" & mesh`k' != ""
		replace mesh`k' = "" if mesh`k' == mesh`i'
	}
}
compress *
pause

* Keep only the first `keep_mesh' term variables.
* NOTE(review): records with more than 14 retained terms lose the remainder
* here -- confirm this cap is intended.
local keep_mesh = 14
local first_drop = `keep_mesh' + 1
forval k = `first_drop'/`n_mesh' {
	drop mesh`k'
}

* Remove exact duplicate rows, then, within each pmid that still appears more
* than once, drop the second row in (pmid, affl) sort order.
* NOTE(review): only row 2 is dropped, so a pmid appearing three or more
* times still keeps several rows -- confirm whether `_n > 1' was intended.
duplicates drop
duplicates tag pmid, gen(dup)
sort pmid affl
bys pmid: drop if _n == 2 & dup
save "master_`Q'_clean.dta", replace