-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHPOFunc.py
275 lines (232 loc) · 10.7 KB
/
HPOFunc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
import pandas as pd
import re
from pyhpo.ontology import Ontology
import numpy as np
import itertools
import math
# Module-level pyhpo ontology handle shared by the lookup and scoring functions in this file.
# NOTE(review): presumably constructing Ontology() loads the HPO data at import time - confirm against the pyhpo docs.
ontology=Ontology()
# Free-text answers that mean "nothing to report"; check_and_set_nan compares against these case-insensitively.
NullList=["none", "none documented", "nil", "(borderline)", "no concerns"]
def check_and_set_nan(strg: str, NullList: "list[str] | None" = None) -> str:
    """
    Blank out cells that are just a variation of somebody saying "no".

    Some of the HPO responses are non-answers ("none", "nil", ...). Catching
    them here means they are not processed further as if they were free text.

    Parameters:
        strg: the raw cell value. May be a string, a missing value
            (NaN / None / pd.NA) or another scalar such as a number.
        NullList: the answers that count as "no". Matching is an exact match
            on the whole cell, ignoring case and surrounding whitespace.
            When omitted, the default list is
            ["none", "none documented", "nil", "(borderline)", "no concerns"].

    Returns:
        An empty string if the cell is missing or is one of the NullList
        entries; otherwise the cell content as a string (unchanged if it
        already was a string).
    """
    # The default lives inside the function so that no mutable object is
    # shared between calls through the signature.
    if NullList is None:
        NullList = ("none", "none documented", "nil", "(borderline)", "no concerns")

    if pd.isna(strg):
        return ""

    # Cells can arrive as numbers rather than text (e.g. a bare numeric code),
    # and those have no .lower(); normalise to text before comparing.
    text = strg if isinstance(strg, str) else str(strg)

    null_markers = {str(marker).strip().lower() for marker in NullList}
    if text.strip().lower() in null_markers:
        return ""
    return text
def drop_leading_hp(strg: str) -> str:
    """
    Remove a leading "HP" prefix from a string, in any mix of upper and lower case.

    Only the two letters are removed; anything after them (for example a
    colon) is kept. Strings that do not start with the prefix are returned
    unchanged.
    """
    # Compare case-insensitively so "hp", "Hp" and "hP" are handled as well as "HP".
    return strg[2:] if strg[:2].lower() == "hp" else strg
def HPOSorter(strg: str) -> tuple[list[str], list[str]]:
    """
    Split one cell of the HPOData dataframe into numeric codes and free-text words.

    The cell is first passed through check_and_set_nan with the module-level
    NullList. Every run of two or more digits is collected as a numeric value.
    Then all "HP"/"hp" markers (with or without a colon), any remaining colons
    and the digit runs themselves are removed, and what is left is split on
    commas and spaces. Only pieces that contain at least one letter are kept.

    Returns:
        (numeric_values, non_numeric_values), both lists of strings.
    """
    cell = check_and_set_nan(strg, NullList)
    if pd.isna(cell):
        return [], []

    # Grab the digit runs before they are stripped out of the text below.
    digit_runs = re.findall(r'\d{2,}', cell)

    # Free text should keep its own capitalisation, so the HP markers are
    # removed explicitly in every upper/lower case combination instead of
    # lower-casing the whole cell. The order of the patterns matters: the
    # colon forms go first, then the bare forms, then stray colons and digits.
    removal_patterns = (
        r'HP:', r'hp:', r'hP:', r'Hp:',
        r'HP', r'hp', r'hP', r'Hp',
        r':',
        r'\d{2,}',
    )
    for pattern in removal_patterns:
        cell = re.sub(pattern, '', cell)

    # Break the remainder into candidate words and keep those with a letter in them.
    words = []
    for piece in re.split(r',| ', cell):
        if re.search('[a-zA-Z]', piece):
            words.append(drop_leading_hp(piece.lstrip()))
    return digit_runs, words
def get_hpo_or_error(strg: str, Process_Type: str="Non_numeric")->str:
    """
    Look "strg" up in the module-level ontology and report failures in a way
    that depends on Process_Type.

    A RuntimeError raised by ontology.get_hpo_object is treated as "not found".

    Process_Type "None":
        Look the string up as-is; a failed lookup gives an empty string.
    Process_Type "Non_numeric":
        Tidy the string first (drop trailing whitespace, strip leading and
        trailing non-word characters, capitalise the first character and
        lower-case the rest), then look it up. A failed lookup gives
        "Error: <tidied string>".
    Process_Type "Numeric":
        Look the string up as-is, and if that fails, again with "HP:" put in
        front. If both fail the result is "Error: HP:<string>".
    Anything else:
        A message is printed and None is returned.

    The error strings are meant to be collected into a list of problems for
    manual checking.
    """
    if Process_Type == "None":
        try:
            return ontology.get_hpo_object(strg)
        except RuntimeError:
            return ""

    if Process_Type == "Non_numeric":
        # Trailing whitespace goes first, then any non-word characters at either end.
        tidy = re.sub(r'^\W+|\W+$', '', strg.rstrip())
        # First character upper-case, remainder lower-case (also fine for 0 or 1 characters).
        tidy = tidy[:1].upper() + tidy[1:].lower()
        try:
            return ontology.get_hpo_object(tidy)
        except RuntimeError:
            return f"Error: {tidy}"

    if Process_Type == "Numeric":
        # First attempt: the digits exactly as given.
        try:
            return ontology.get_hpo_object(strg)
        except RuntimeError:
            pass
        # Second attempt: the same digits with the HP: prefix.
        try:
            return ontology.get_hpo_object("HP:"+strg)
        except RuntimeError:
            return f"Error: HP:{strg}"

    print("Process_Type must be one of 'None', 'Non_numeric', or 'Numeric'")
    return None
#print(get_hpo_or_error("arachnodactyly"))
def list_to_csv(lst: list)->str:
    """
    Collapse a list into one string with the items separated by "; ".

    The final output is a single cell of HPO terms in a standardised format.
    A semicolon is used rather than a comma because one HPO code has a comma in it.
    Every item is converted with str() first.
    """
    pieces = [str(item) for item in lst]
    return '; '.join(pieces)
def HPOOutPutter(numeric_values: list[str], non_numeric_values: list[str])->tuple[list[str], list[str], str]:
    """
    Run the two lists produced by HPOSorter through get_hpo_or_error.

    Numeric values are looked up with Process_Type="Numeric" and free-text
    values with Process_Type="Non_numeric". Every lookup result is converted
    to a string. Results starting with "Error:" are taken out of the term
    lists and collected separately so they can be checked by hand; this shows
    which entries the automatic clean-up missed and flags the entries that
    are too odd to be worth a general rule.

    Returns:
        (numeric_terms, non_numeric_terms, Problems) where Problems is the
        collected error strings joined by list_to_csv, numeric errors first.
    """
    flagged = []
    accepted = {}
    for mode, raw_values in (("Numeric", numeric_values), ("Non_numeric", non_numeric_values)):
        # An empty list contributes no terms and no problems.
        if raw_values:
            looked_up = [str(get_hpo_or_error(item, Process_Type=mode)) for item in raw_values]
        else:
            looked_up = []

        kept = []
        for text in looked_up:
            if text.startswith('Error:'):
                flagged.append(text)
            else:
                kept.append(text)
        accepted[mode] = kept

    # The problem cases are part of the output so that they can be manually checked.
    return accepted["Numeric"], accepted["Non_numeric"], list_to_csv(flagged)
def HPOSquisher(terms1, terms2):
    """
    Merge the numeric and non_numeric term lists into one de-duplicated cell.

    People sometimes write the same thing in both numeric and free-text form
    and sometimes in only one of them, so the union of both lists gives every
    distinct mention. The result is sorted (which makes testing easier) and
    joined into a single string with list_to_csv. Two empty lists give an
    empty string.
    """
    distinct = set(terms1)
    distinct.update(terms2)
    ordered = sorted(distinct)
    return list_to_csv(ordered)
def process_column(entry: str) -> tuple[str, str]:
    """
    Process one cell entry from start to finish.

    The entry goes through HPOSorter, then HPOOutPutter, then HPOSquisher.

    Returns:
        (TermList, Problems): the recognised terms as a single string and
        the problems that were encountered as a single string.
    """
    sorted_values = HPOSorter(entry)
    from_numbers, from_text, Problems = HPOOutPutter(*sorted_values)
    return HPOSquisher(from_numbers, from_text), Problems
def _split_hpo_codes(responses):
    """
    Turn one "; "-separated cell of HPO terms into a list of bare codes.

    Each entry keeps only the part before " | " and has its spaces removed.
    Missing cells and empty entries (empty string, trailing semicolon) give
    no codes, so they are not counted as terms.
    """
    if pd.isna(responses):
        return []
    codes = [entry.split(" | ")[0].replace(" ", "") for entry in responses.split(";")]
    return [code for code in codes if code]


def HPOScorer(doctor_responses, parent_responses):
    """
    Score the HPO terms given by the doctor against those given by the parent.

    Parameters:
        doctor_responses: a string of terms separated by ";" (each optionally
            in the form "<code> | <name>"), or a missing value.
        parent_responses: same format, for the parent.

    Returns:
        (doc_quant, par_quant, doc_qual, par_qual, doc_codes, par_codes)
        - doc_quant / par_quant: number of terms each side gave.
        - doc_qual / par_qual: quality scores accumulated over every
          doctor/parent pair of terms.
        - doc_codes / par_codes: the codes that contributed a non-zero amount
          to the respective quality score (one entry per contributing pair).

    Quality is only computed when both sides have a response. For each pair,
    ontology.path(doc, par) is called and the elements at index 2 and 3 of
    its result are used.
    NOTE(review): presumably these are the step counts from each term to
    their common ancestor - confirm against the pyhpo documentation.
    """
    doctor_hpo = _split_hpo_codes(doctor_responses)
    parent_hpo = _split_hpo_codes(parent_responses)
    doc_quant = len(doctor_hpo)
    par_quant = len(parent_hpo)

    doc_qual = 0
    par_qual = 0
    doc_codes = []
    par_codes = []

    # If either side is missing its code list is empty, so the product below
    # yields nothing and the quality scores stay at zero.
    for doc, par in itertools.product(doctor_hpo, parent_hpo):
        try:
            path_result = ontology.path(doc, par)
        except Exception:
            # Best effort: a pair the ontology cannot relate is skipped.
            # Exception (not a bare except) so Ctrl-C and SystemExit still propagate.
            continue
        a, b = path_result[2], path_result[3]
        if a == 0:
            # Zero steps on the doctor side: the parent term earns b.
            par_qual += b
            if b != 0:
                par_codes.append(par)
        elif b == 0:
            # Zero steps on the parent side (and a is non-zero here): the doctor term earns a.
            doc_qual += a
            doc_codes.append(doc)
        # When both a and b are non-zero, neither side scores for this pair.

    return int(doc_quant), int(par_quant), int(doc_qual), int(par_qual), doc_codes, par_codes
def Turn_Lists_Of_HPOs_Into_Just_Codes(HPOString):
    """
    Reduce a string of HPO terms to a list of just the codes.

    The string is split on ", " and for each piece only the part before
    " | " is kept. Anything that is not a string is handed back unchanged.
    """
    if not isinstance(HPOString, str):
        return HPOString
    return [chunk.split(" | ")[0] for chunk in HPOString.split(", ")]