#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 28 15:33:33 2018
@authors: [email protected],
Last modified: 2018-10-15

** Site-search log file analyzer, Part 3 **

Purpose: Run unmatched search queries against the UMLS API.

This first UMLS API run uses normalized string matching, which is conservative
enough that almost all matches can be assumed correct. Some clean-up will be
needed later.

----------------
SCRIPT CONTENTS
----------------
1. Start-up
2. UMLS API 1 - Normalized string matching
3. Combine multiple files if needed
4. Append new API results to GoldStandard
5. Create logAfterUmlsApi1 by updating logAfterGoldStandard - append
   assignedByUmlsApi
6. Start new 'uniques' dataframe/file for fuzzy matching -
   listOfUniqueUnassignedAfterUmls
----------
RESOURCES
----------
Register at UMLS, get a UMLS-UTS API key, and make it available to
get_umls_api_key() below (environment variable UMLS_API_KEY, or a
.umls_api_key file in your home directory). UMLS is the primary source for
Semantic Type classifications.
- UMLS quick start: https://www.nlm.nih.gov/research/umls/quickstart.html
- https://www.nlm.nih.gov/research/umls/
- https://documentation.uts.nlm.nih.gov/rest/authentication.html
- https://documentation.uts.nlm.nih.gov/rest/home.html
- https://documentation.uts.nlm.nih.gov/rest/concept/
- UMLS description of what Normalized String option is,
https://uts.nlm.nih.gov/doc/devGuide/webservices/metaops/find/find2.html
"""
#%%
# ============================================
# 1. Start-up / What to put into place, where
# ============================================
'''
# Re-starting?
listOfUniqueUnassignedAfterGS = pd.read_excel('01_Import-transform_files/listOfUniqueUnassignedAfterGS.xlsx')
'''
import pandas as pd
import matplotlib.pyplot as plt
import requests
import json
import os
from lxml.html import fromstring
import hunspell  # needed for the spellcheck branch in section 2 (new version)
# https://anaconda.org/conda-forge/hunspell
# hunspell doc: http://hunspell.github.io/
import numpy as np
# import time
# Set working directory
os.chdir('/Users/user/Projects/webDS/_util')
localDir = '02_Run_APIs_files/'
# Get local API key (after you have set it)
def get_umls_api_key(filename=None):
    key = os.environ.get('UMLS_API_KEY', None)
    if key is not None:
        return key
    if filename is None:
        path = os.environ.get('HOME', None)
        if path is None:
            path = os.environ.get('USERPROFILE', None)
        if path is None:
            path = '.'
        filename = os.path.join(path, '.umls_api_key')
    with open(filename, 'r') as f:
        key = f.readline().strip()
    return key
myUTSAPIkey = get_umls_api_key()
# UMLS Terminology Services (UTS): Generate a one-day Ticket-Granting-Ticket (TGT)
tgt = requests.post('https://utslogin.nlm.nih.gov/cas/v1/api-key', data = {'apikey':myUTSAPIkey})
# For API key get a license from https://www.nlm.nih.gov/research/umls/
# tgt.text
response = fromstring(tgt.text)
todaysTgt = response.xpath('//form/@action')[0]
uiUri = "https://uts-ws.nlm.nih.gov/rest/search/current?"
semUri = "https://uts-ws.nlm.nih.gov/rest/content/current/CUI/"
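# Hedged sketch (the helper name is ours; the loops in section 2 inline this
# step): every UTS request needs a fresh single-use Service Ticket (ST)
# minted from the day's TGT.
def getServiceTicket():
    """POST to the TGT action URL to obtain a single-use Service Ticket."""
    return requests.post(todaysTgt, data={'service': 'http://umlsks.nlm.nih.gov'}).text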
#%%
# ===========================================
# 2. UMLS API 1 - Normalized string matching
# ===========================================
'''
In this run the API calls use the normalizedString searchType. Example: for
the input string "Yellow leaves", normalization produces two strings, "leaf
yellow" and "leave yellow". Each string is then matched exactly against the
normalized-string index to return a result.
Re-start:
# listOfUniqueUnassignedAfterGS = pd.read_excel('listOfUniqueUnassignedAfterGS.xlsx')
listToCheck6 = pd.read_excel(localDir + 'listToCheck6.xlsx')
listToCheck7 = pd.read_excel(localDir + 'listToCheck7.xlsx')
'''
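# Hedged illustration of a single normalizedString search (same endpoint and
# parameters as the loops below; 'yellow leaves' is just an example term):
# st = requests.post(todaysTgt, data={'service': 'http://umlsks.nlm.nih.gov'}).text
# r = requests.get(uiUri, params={'string': 'yellow leaves',
#                                 'searchType': 'normalizedString', 'ticket': st})
# r.json()['result']['results'][0]  # -> {'ui': 'C0...', 'name': '...'} or ui == 'NONE'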
# listToCheck1 = pd.read_excel(localDir + 'listToCheck1.xlsx')
# -------------------------------------------
# Batch rows if you want to do separate runs
# Arbitrary max of ~6,000 rows per run; see the batching sketch below
# -------------------------------------------
# uniqueSearchTerms = search['adjustedQueryCase'].unique()
# Reduce entry length, to focus on single concepts that the UTS API can match
# listOfUniqueUnassignedAfterGS = listOfUniqueUnassignedAfterGS.loc[(listOfUniqueUnassignedAfterGS['adjustedQueryCase'].str.len() <= 20) == True]
listToCheck1 = listOfUniqueUnassignedAfterGS
# listToCheck1 = unassignedAfterGS.iloc[0:20]
# listToCheck1 = listOfUniqueUnassignedAfterGS.iloc[0:10]
'''
listToCheck2 = listOfUniqueUnassignedAfterGS.iloc[6000:12000]
listToCheck3 = listOfUniqueUnassignedAfterGS.iloc[12000:18000]
'''
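# Hedged batching sketch (batchSize of 6,000 is an assumption matching the
# runs described below; not part of the original flow). Builds every slice
# programmatically instead of hand-editing iloc bounds:
# batchSize = 6000
# batches = [listOfUniqueUnassignedAfterGS.iloc[i:i + batchSize]
#            for i in range(0, len(listOfUniqueUnassignedAfterGS), batchSize)]
# listToCheck1 = batches[0]  # then batches[1], batches[2], ... per run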
# If multiple sessions required, saving to file might help
writer = pd.ExcelWriter(localDir + 'listToCheck1.xlsx')
listToCheck1.to_excel(writer,'listToCheck1')
# df2.to_excel(writer,'Sheet2')
writer.save()
'''
writer = pd.ExcelWriter(localDir + 'apiGetNormalized1.xlsx')
apiGetNormalizedString.to_excel(writer,'listToCheck1')
# df2.to_excel(writer,'Sheet2')
writer.save()
'''
'''
OPTION - Bring in from file
listToCheck3 = pd.read_excel(localDir + 'listToCheck3.xlsx')
'''
#%%
# ----------------------------------------------------------
# Run this block after changing listToCheck top and bottom
# ----------------------------------------------------------
# Create the empty destination df with the three columns the loops below fill
apiGetNormalizedString = pd.DataFrame(columns=['adjustedQueryCase',
                                               'preferredTerm',
                                               'SemanticTypeName'])
#%%
'''
Until you put this into a function, you must change the listToCheck# and
apiGetNormalizedString# numbers on every run!

Stay below 30 API requests per second. Each item costs 4 API requests
(2 .get and 2 .post):
- time.sleep commented out: 6,000 items / 35 min = 171 items/min
  = 2.9 items/sec = 11.4 requests/sec.
  (Equivalently: 6,000 items x 4 requests = 24,000 requests / 35 min
  = 686 requests/min = 11.4 requests/sec.)
- time.sleep(.07): ~38 minutes for 6,000 items; 158 items/min = 2.6 items/sec.
A hedged throttle sketch follows this block.
'''
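# Hedged sketch (helper name and interval are ours, not part of the original
# flow): call once per item inside the loops below to pace requests. At 4
# requests per item, anything above ~0.14 s/item stays under 30 requests/sec;
# the default 0.35 s/item matches the ~11.4 requests/sec pace measured above.
import time

_lastItemTime = [0.0]  # mutable holder so the helper can update it

def throttleItems(minInterval=0.35):
    """Sleep just long enough to keep successive items >= minInterval apart."""
    wait = minInterval - (time.time() - _lastItemTime[0])
    if wait > 0:
        time.sleep(wait)
    _lastItemTime[0] = time.time()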
# OLD VERSION PLUS SUCCESSFUL CAPTURE OF MULTIPLE SemanticType ENTRIES
for index, row in listToCheck1.iterrows():
    currLogTerm = row['adjustedQueryCase']
    # === Get 'preferred term' and its concept identifier (CUI/UI) =========
    # Get single-use Service Ticket (ST)
    stTicket = requests.post(todaysTgt, data={'service': 'http://umlsks.nlm.nih.gov'})
    # Example: GET https://uts-ws.nlm.nih.gov/rest/search/current?string=tylenol&sabs=MSH&ticket=ST-681163-bDfgQz5vKe2DJXvI4Snm-cas
    tQuery = {'string': currLogTerm, 'searchType': 'normalizedString', 'ticket': stTicket.text}  # removed 'sabs':'MSH',
    getPrefTerm = requests.get(uiUri, params=tQuery)
    getPrefTerm.encoding = 'utf-8'
    tItems = json.loads(getPrefTerm.text)
    tJson = tItems["result"]
    if tJson["results"][0]["ui"] != "NONE":  # Sub-loop to resolve "NONE"
        currUi = tJson["results"][0]["ui"]
        currPrefTerm = tJson["results"][0]["name"]
        # === Get 'semantic type' =========
        # Get single-use Service Ticket (ST)
        stTicket = requests.post(todaysTgt, data={'service': 'http://umlsks.nlm.nih.gov'})
        # Example: GET https://uts-ws.nlm.nih.gov/rest/content/current/CUI/C0699142?ticket=ST-512564-vUxzyI00ErMRm6tjefNP-cas
        semQuery = {'ticket': stTicket.text}
        getPrefTerm = requests.get(semUri + currUi, params=semQuery)
        getPrefTerm.encoding = 'utf-8'
        semItems = json.loads(getPrefTerm.text)
        semJson = semItems["result"]
        currSemTypes = []
        for name in semJson["semanticTypes"]:
            currSemTypes.append(name["name"])
        # === Post to dataframe =========
        apiGetNormalizedString = apiGetNormalizedString.append(
            pd.DataFrame({'adjustedQueryCase': currLogTerm,
                          'preferredTerm': currPrefTerm,
                          'SemanticTypeName': currSemTypes}), ignore_index=True)
        print('{} --> {}'.format(currLogTerm, currSemTypes))  # Write progress to console
        # time.sleep(.06)
    else:
        # Post "NONE" to the dataframe and continue the loop
        apiGetNormalizedString = apiGetNormalizedString.append(
            pd.DataFrame({'adjustedQueryCase': currLogTerm, 'preferredTerm': "NONE"},
                         index=[0]), ignore_index=True)
        print('{} --> NONE'.format(currLogTerm))  # Write progress to console
        # time.sleep(.06)

print("* Done *")

writer = pd.ExcelWriter(localDir + 'apiGetNormalizedString3.xlsx')
apiGetNormalizedString.to_excel(writer, 'apiGetNormalizedString')
# df2.to_excel(writer,'Sheet2')
writer.save()
#%%
# NEW VERSION
def getStuffFromAPI(currLogTerm):
    """Fetch the preferred term and its concept identifier (CUI/UI) from the API."""
    # Get single-use Service Ticket (ST)
    stTicket = requests.post(todaysTgt, data={'service': 'http://umlsks.nlm.nih.gov'})
    # Example: GET https://uts-ws.nlm.nih.gov/rest/search/current?string=tylenol&sabs=MSH&ticket=ST-681163-bDfgQz5vKe2DJXvI4Snm-cas
    tQuery = {'string': currLogTerm, 'searchType': 'normalizedString', 'ticket': stTicket.text}  # removed 'sabs':'MSH',
    getPrefTerm = requests.get(uiUri, params=tQuery)
    getPrefTerm.encoding = 'utf-8'
    tItems = json.loads(getPrefTerm.text)
    tJson = tItems["result"]
    return tJson

def getSemanticType(currUi):
    """Fetch the semantic type name(s) for a CUI from the API."""
    # Get single-use Service Ticket (ST)
    stTicket = requests.post(todaysTgt, data={'service': 'http://umlsks.nlm.nih.gov'})
    # Example: GET https://uts-ws.nlm.nih.gov/rest/content/current/CUI/C0699142?ticket=ST-512564-vUxzyI00ErMRm6tjefNP-cas
    semQuery = {'ticket': stTicket.text}
    getPrefTerm = requests.get(semUri + currUi, params=semQuery)
    getPrefTerm.encoding = 'utf-8'
    semItems = json.loads(getPrefTerm.text)
    semJson = semItems["result"]
    currSemTypes = []
    for name in semJson["semanticTypes"]:
        currSemTypes.append(name["name"])
    return currSemTypes

def postToDataFrame(currSemTypes, currPrefTerm, currLogTerm, apiGetNormalizedString):
    """Append one result row per semantic type to the destination df."""
    apiGetNormalizedString = apiGetNormalizedString.append(
        pd.DataFrame({'adjustedQueryCase': currLogTerm,
                      'preferredTerm': currPrefTerm,
                      'SemanticTypeName': currSemTypes}), ignore_index=True)
    return apiGetNormalizedString
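# Hedged sketch (name and backoff are illustrative, not part of the original
# flow): a retry wrapper for the helpers above, assuming transient network
# errors are the main failure mode.
import time

def callWithRetry(func, *args, retries=3, **kwargs):
    """Call func(*args, **kwargs), retrying requests-level errors with backoff."""
    for attempt in range(retries):
        try:
            return func(*args, **kwargs)
        except requests.exceptions.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(2 ** attempt)  # 1s, 2s, ... between attempts
# Hypothetical usage: tJson = callWithRetry(getStuffFromAPI, currLogTerm)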
# Initialize spellchecker (requires the hunspell import above; dictionary
# paths are the authors' local files)
bj = hunspell.HunSpell('/home/ubuntu/umls/sortedvocab.txt.dic', '/home/ubuntu/umls/sortedvocab.txt.aff')

for index, row in listToCheck1.iterrows():
    currLogTerm = row['adjustedQueryCase']
    tJson = getStuffFromAPI(currLogTerm)
    if tJson["results"][0]["ui"] != "NONE":
        currUi = tJson["results"][0]["ui"]
        currPrefTerm = tJson["results"][0]["name"]
        currSemTypes = getSemanticType(currUi)
        # === Post to dataframe =========
        apiGetNormalizedString = postToDataFrame(currSemTypes, currPrefTerm, currLogTerm, apiGetNormalizedString)
        print('{} --> {}'.format(currLogTerm, currSemTypes))  # Write progress to console
        # time.sleep(.06)
    else:
        # "NONE": try the top spelling suggestion, then query the API again
        original = currLogTerm
        suggestion = bj.suggest(currLogTerm)
        if len(suggestion) > 0 and (suggestion[0] != "[" + original):
            currLogTerm = suggestion[0]
            tJson = getStuffFromAPI(currLogTerm)
        if tJson["results"][0]["ui"] != "NONE":  # guard added: the suggestion may also miss
            currUi = tJson["results"][0]["ui"]
            currPrefTerm = tJson["results"][0]["name"]
            currSemTypes = getSemanticType(currUi)
            # === Post to dataframe =========
            apiGetNormalizedString = postToDataFrame(currSemTypes, currPrefTerm, currLogTerm, apiGetNormalizedString)
            print('{}: {} --> {}'.format(original, currLogTerm, currSemTypes))  # Write progress to console
        else:
            # Post "NONE" to the dataframe and continue the loop
            apiGetNormalizedString = apiGetNormalizedString.append(
                pd.DataFrame({'adjustedQueryCase': original, 'preferredTerm': "NONE"},
                             index=[0]), ignore_index=True)
            print('{} --> NONE'.format(original))  # Write progress to console
            # time.sleep(.06)

print("* Done *")

writer = pd.ExcelWriter(localDir + 'apiGetNormalizedString1.xlsx')
apiGetNormalizedString.to_excel(writer, 'apiGetNormalizedString')
# df2.to_excel(writer,'Sheet2')
writer.save()
# Free up memory: Remove listToCheck, listToCheck1, listToCheck2, listToCheck3,
# listToCheck4, nonForeign, searchLog, unassignedAfterGS
#%%
# ==================================================================
# 3. Combine multiple files if needed
# ==================================================================
'''
If you ran multiple runs and need to combine the parts.
Re-starting? If using the old version (the 'for' loop), use a digit suffix
such as apiGetNormalizedString1:
apiGetNormalizedString = pd.read_excel('02_Run_APIs_files/apiGetNormalizedString.xlsx')
apiGetNormalizedString = apiGetNormalizedString1
'''
"""
# Bring in stored info
newAssignments1 = pd.read_excel('02_Run_APIs_files/apiGetNormalizedString1.xlsx')
newAssignments2 = pd.read_excel('02_Run_APIs_files/apiGetNormalizedString2.xlsx')
newAssignments3 = pd.read_excel('02_Run_APIs_files/apiGetNormalizedString3.xlsx')
# Append
UmlsResult = newAssignments1.append([newAssignments2, newAssignments3]) # , sort=False
# afterUmlsApi = newAssignments1.append([newAssignments2, newAssignments3, newAssignments4, newAssignments5])
# Last run's data should be in a df already.
UmlsResult = newAssignments1.append([apiGetNormalizedString]) # , sort=False
Or if you only had one df:
UmlsResult = apiGetNormalizedString
"""
#%%
# ==========================================
# 4. Append new API results to GoldStandard
# ==========================================
'''
Improve future local matching and fuzzy matching.
# Open GoldStandard if needed
GoldStandard = pd.read_excel('01_Import-transform_files/GoldStandard_master.xlsx')
Re-start?
assignedByUmlsApi = pd.read_excel('02_Run_APIs_files/assignedByUmlsApi.xlsx')
'''
# Reduce to list of successful assignments
assignedByUmlsApi = UmlsResult.loc[(UmlsResult['preferredTerm'] != "NONE")]
assignedByUmlsApi = assignedByUmlsApi[~pd.isnull(assignedByUmlsApi['preferredTerm'])]
assignedByUmlsApi = assignedByUmlsApi.loc[(assignedByUmlsApi['preferredTerm'] != "Null Value")]
assignedByUmlsApi = assignedByUmlsApi[~pd.isnull(assignedByUmlsApi['adjustedQueryCase'])]
# Eyeball the df. If you need to remove rows...
# logAfterGoldStandard = logAfterGoldStandard.iloc[760:] # remove before index...
'''
# If you want to send to Excel
writer = pd.ExcelWriter(localDir + 'assignedByUmlsApi.xlsx')
assignedByUmlsApi.to_excel(writer,'assignedByUmlsApi')
# df2.to_excel(writer,'Sheet2')
writer.save()
assignedByUmlsApi = pd.read_excel('02_Run_APIs_files/assignedByUmlsApi.xlsx')
'''
GoldStandard = pd.read_excel('01_Import-transform_files/GoldStandard_master.xlsx')
# Append fully tagged UMLS API adds to GoldStandard
GoldStandard = GoldStandard.append(assignedByUmlsApi, sort=False)
# Sort, reset index
GoldStandard = GoldStandard.sort_values(by='adjustedQueryCase', ascending=True)
GoldStandard = GoldStandard.reset_index()
GoldStandard.drop(['index'], axis=1, inplace=True)
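# Hedged optional step (an assumption, not in the original flow): if API rows
# duplicate existing GoldStandard entries, a dedup pass keeps one row per
# query/term/type combination (all three columns, since one query can carry
# several semantic types):
# GoldStandard = GoldStandard.drop_duplicates(
#     subset=['adjustedQueryCase', 'preferredTerm', 'SemanticTypeName'], keep='first')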
'''
Eyeball top and bottom of cols, remove rows by Index, if needed
GoldStandard.drop(58027, inplace=True)
'''
# Write out the updated GoldStandard
writer = pd.ExcelWriter('01_Import-transform_files/GoldStandard_master.xlsx')
GoldStandard.to_excel(writer,'GoldStandard')
writer.save()
#%%
# ============================================================================
# 5. Create logAfterUmlsApi1 by updating logAfterGoldStandard - append
# assignedByUmlsApi
# ============================================================================
'''
# Bring back in if needed
logAfterGoldStandard = pd.read_excel('01_Import-transform_files/logAfterGoldStandard.xlsx')
'''
# Join new UMLS API adds to the current search log master
logAfterUmlsApi1 = pd.merge(logAfterGoldStandard, assignedByUmlsApi, how='left', on='adjustedQueryCase')
logAfterUmlsApi1.columns
'''
'Referrer', 'Query', 'Timestamp', 'adjustedQueryCase',
'preferredTerm_x', 'SemanticTypeName_x', 'SemanticTypeName_y',
'preferredTerm_y'
'''
# Future: Look for a better way to do the above - MERGE WITH CONDITIONAL OVERWRITE. Temporary fix:
logAfterUmlsApi1['preferredTerm2'] = logAfterUmlsApi1['preferredTerm_x'].where(logAfterUmlsApi1['preferredTerm_x'].notnull(), logAfterUmlsApi1['preferredTerm_y'])
logAfterUmlsApi1['SemanticTypeName2'] = logAfterUmlsApi1['SemanticTypeName_x'].where(logAfterUmlsApi1['SemanticTypeName_x'].notnull(), logAfterUmlsApi1['SemanticTypeName_y'])
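# Hedged alternative (same result for these two columns; not in the original
# flow): Series.combine_first keeps the _x value and fills gaps from _y:
# logAfterUmlsApi1['preferredTerm2'] = logAfterUmlsApi1['preferredTerm_x'].combine_first(logAfterUmlsApi1['preferredTerm_y'])
# logAfterUmlsApi1['SemanticTypeName2'] = logAfterUmlsApi1['SemanticTypeName_x'].combine_first(logAfterUmlsApi1['SemanticTypeName_y'])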
logAfterUmlsApi1.drop(['preferredTerm_x', 'preferredTerm_y',
'SemanticTypeName_x', 'SemanticTypeName_y'], axis=1, inplace=True)
logAfterUmlsApi1.rename(columns={'preferredTerm2': 'preferredTerm',
'SemanticTypeName2': 'SemanticTypeName'}, inplace=True)
# Re-sort full file
logAfterUmlsApi1 = logAfterUmlsApi1.sort_values(by='adjustedQueryCase', ascending=True)
logAfterUmlsApi1 = logAfterUmlsApi1.reset_index()
logAfterUmlsApi1.drop(['index'], axis=1, inplace=True)
# Save to file so you can open in future sessions, if needed
writer = pd.ExcelWriter(localDir + 'logAfterUmlsApi1.xlsx')
logAfterUmlsApi1.to_excel(writer,'logAfterUmlsApi1')
# df2.to_excel(writer,'Sheet2')
writer.save()
# ------------------------------------
# Visualize results - logAfterUmlsApi1
# ------------------------------------
# Pie for percentage of rows assigned; https://pythonspot.com/matplotlib-pie-chart/
totCount = len(logAfterUmlsApi1)
unassigned = logAfterUmlsApi1['preferredTerm'].isnull().sum()
assigned = totCount - unassigned
labels = ['Assigned', 'Unassigned']
sizes = [assigned, unassigned]
colors = ['lightskyblue', 'lightcoral']
explode = (0.1, 0) # explode 1st slice
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
autopct='%1.f%%', shadow=False, startangle=100)
plt.axis('equal')
plt.title("Status after 'UMLS API' processing - \n{} queries with {} unassigned".format(totCount, unassigned))
plt.show()
# Bar of SemanticTypeName categories, horizontal
# Source: http://robertmitchellv.com/blog-bar-chart-annotations-pandas-mpl.html
ax = logAfterUmlsApi1['SemanticTypeName'].value_counts()[:20].plot(kind='barh', figsize=(10,6),
                                                                   color="slateblue", fontsize=10)
ax.set_alpha(0.8)
ax.set_title("Categories assigned after 'UMLS API' processing with {} of {} unassigned".format(unassigned, totCount), fontsize=14)
ax.set_xlabel("Number of searches", fontsize=9)
# Set individual bar labels from the patch list
for i in ax.patches:
    # get_width pulls left or right; get_y pushes up or down
    ax.text(i.get_width() + .1, i.get_y() + .31,
            str(round(i.get_width(), 2)), fontsize=9, color='dimgrey')
# Invert for largest on top
ax.invert_yaxis()
plt.gcf().subplots_adjust(left=0.3)
# Free up memory: Remove listOfUniqueUnassignedAfterGS, listToCheck1, etc.,
# logAfterGoldStandard, logAfterUmlsApi1, newAssignments1, etc.
#%%
# ============================================================================
# 6. Start new 'uniques' dataframe that gets a new column from each later step -
#    listOfUniqueUnassignedAfterUmls
# ============================================================================
# Unique queries with no assignments
listOfUniqueUnassignedAfterUmls = logAfterUmlsApi1[pd.isnull(logAfterUmlsApi1['preferredTerm'])]
listOfUniqueUnassignedAfterUmls = listOfUniqueUnassignedAfterUmls.groupby('adjustedQueryCase').size()
listOfUniqueUnassignedAfterUmls = pd.DataFrame({'timesSearched':listOfUniqueUnassignedAfterUmls})
listOfUniqueUnassignedAfterUmls = listOfUniqueUnassignedAfterUmls.sort_values(by='timesSearched', ascending=False)
listOfUniqueUnassignedAfterUmls = listOfUniqueUnassignedAfterUmls.reset_index()
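# Hedged equivalent (not in the original flow): value_counts builds the same
# descending frequency table in one step:
# listOfUniqueUnassignedAfterUmls = (
#     logAfterUmlsApi1[pd.isnull(logAfterUmlsApi1['preferredTerm'])]['adjustedQueryCase']
#     .value_counts().rename_axis('adjustedQueryCase').reset_index(name='timesSearched'))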
# Send to file to preserve
writer = pd.ExcelWriter(localDir + 'listOfUniqueUnassignedAfterUmls.xlsx')
listOfUniqueUnassignedAfterUmls.to_excel(writer,'unassignedToCheck')
writer.save()