-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKOMP_QC_Report.py
445 lines (366 loc) · 23.2 KB
/
KOMP_QC_Report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
#!/usr/bin/env python
########
# Author: Shalmali Kulkarni
# This script pulls in Core PFS data via OData query and performs certain checks for validating mice, metadata (assay data attributes) and assay data. This is version v0 which for now just
# checks for presence of values for mandatory parameters, experimenter id to integer match on JAXLIMS and a valid test date.
# Next version might include controlled vocabularies for values and some case-by-case basis additions.
# It converts IMPC codes to column headers for easy understanding
# This script should generate 1 csv file, mouse, metadata and assay data which should include validation output from 17 experiments.
########## Import all required libraries #########################################################################################################################################
import mysql.connector as sql
import pandas as pd
import read_config as cfg
import requests
import re
from requests.auth import HTTPBasicAuth
import os
import json
import csv
from datetime import datetime, timedelta
import sys
###################################################################################################################################################################################
##################################################### List of experiments to validate from Core PFS ###############################################################################
# Experiments to validate
kompExperimentNames = [
"AUDITORY_BRAINSTEM_RESPONSE",
"BODY_COMPOSITION",
"BODY_WEIGHT",
"CLINICAL_BLOOD_CHEMISTRY",
"ELECTROCARDIOGRAM",
"ELECTRORETINOGRAPHY",
"EYE_MORPHOLOGY",
"FUNDUS_IMAGING",
"GLUCOSE_TOLERANCE_TEST",
"GRIP_STRENGTH",
"HEART_WEIGHT",
"HEMATOLOGY",
"HOLEBOARD",
"LIGHT_DARK_BOX",
"OPEN_FIELD",
"SHIRPA_DYSMORPHOLOGY",
"STARTLE_PPI"
]
#######################################################################################################################################################################################
################################################################## Get mandatory IMPC codes from JAXLIMS ##############################################################################
# Connect to the SQLite database
parameterdetails_conn = sql.connect(
host="rslims.jax.org",
user="dba",
password="rsdba",
database="komp"
)
## Query the database (sql)
parameterdetails_data_table = pd.read_sql("""
SELECT * FROM komp.dccparameterdetails;
""",
parameterdetails_conn)
# Close connection to the database
parameterdetails_conn.close()
impc_codes = parameterdetails_data_table['ImpcCode'].tolist()
###################################################################################################################################################################
###################################################### Mouse validation ###########################################################################################
# Get all mice
mycfg = cfg.parse_config(path="/projects/galaxy/tools/cba/config_sk.yml")
# Setup credentials for database
baseURL = mycfg['corepfs_database']['baseURL']
mouseEndpoint = mycfg['corepfs_database']['mouseEndpoint']
username = mycfg['corepfs_database']['username']
password = mycfg['corepfs_database']['password']
# Mouse validation
my_auth = HTTPBasicAuth(username, password)
query = baseURL + mouseEndpoint
result = requests.get(query, auth=my_auth, headers={"Prefer": "odata.maxpagesize=5000"})
mice_content = json.loads(result.content)
exp_ls = mice_content["value"]
fields_to_check = ['JAX_SAMPLE_EXTERNALID', 'JAX_MOUSESAMPLE_SEX', 'JAX_MOUSESAMPLE_ALLELE', 'JAX_MOUSESAMPLE_GENOTYPE', 'JAX_MOUSESAMPLE_DATEOFBIRTH']
mouse_table_rows = []
for komp_request in exp_ls:
for mouse_sample_lot in komp_request['REV_MOUSESAMPLELOT_KOMPREQUEST']:
mouse_sample_dict = mouse_sample_lot['SAMPLE']
if mouse_sample_dict['JAX_MOUSESAMPLE_ISFILLER'] is None:
for field in fields_to_check:
if not mouse_sample_dict.get(field):
mouse_barcode = mouse_sample_dict.get('Barcode', 'Unknown')
comment = f"The field '{field}' is missing."
mouse_table_rows.append([mouse_barcode, comment])
mouse_df = pd.DataFrame(mouse_table_rows, columns=["Mouse_ID", "Comments"])
#####################################################################################################################################################################################################
####################################### Metadata validation #########################################################################################################################################
# Parse the configuration file
mycfg = cfg.parse_config(path="/projects/galaxy/tools/cba/config_sk.yml")
baseURL = mycfg['corepfs_database']['baseURL']
username = mycfg['corepfs_database']['username']
password = mycfg['corepfs_database']['password']
experimentEndpointTemplate = mycfg['corepfs_database']['experimentEndpointTemplate']
# Initialize an empty list to store the table rows
metadata_table_rows = []
# Iterate over all experiment names
for experimentName in kompExperimentNames:
experimentEndpoint = experimentEndpointTemplate.format(exp=experimentName)
my_auth = HTTPBasicAuth(username, password)
query = baseURL + experimentEndpoint
result = requests.get(query, auth=my_auth, headers={"Prefer": "odata.maxpagesize=5000"})
# Check if the request was successful
if result.status_code == 200:
# Check if the response content is not empty
if result.content:
# Parse the string data into a dictionary
exp_content = json.loads(result.content)
# Get list of experiment entities
exp_ls = exp_content["value"]
# Find the entity with EntityTypeName "KOMP_XXX_XXX_EXPERIMENT"
for entity in exp_ls:
if entity["EntityTypeName"].startswith("KOMP_") and entity["EntityTypeName"].endswith("_EXPERIMENT"):
# Check if JAX_EXPERIMENT_STATUS is "Review Completed"
if entity.get("JAX_EXPERIMENT_STATUS") == "Review Completed":
# Iterate over the fields in the list
for field in impc_codes:
# Check if the field exists in the entity
if field in entity:
field_value = entity[field]
if not field_value:
experiment_barcode = entity.get('Barcode', 'Unknown')
comment = f"Metadata QC Fail: Missing value for {field}"
metadata_table_rows.append([experiment_barcode, comment])
break # Only check the first matching entity
else:
print(f"No content returned for {experimentName}.")
else:
print(f"Failed to retrieve data for {experimentName}. Status code: {result.status_code}")
metadata_df = pd.DataFrame(metadata_table_rows, columns=["Experiment_Barcode", "Comments"])
#######################################################################################################################################################################################################
#################################### Assay data validation ############################################################################################################################################
# Check experimenter ID - pull experimenter id and barcode from lims
experimentid_conn = sql.connect(
host="rslims.jax.org",
user="dba",
password="rsdba",
database="komp"
)
experimenterid_data_table = pd.read_sql("""
SELECT * FROM komp.experimenterid;
""",
experimentid_conn)
# Close connection to the database
experimentid_conn.close()
#print(experimenterid_data_table)
exp_id = experimenterid_data_table["FirstName"].tolist()
# Fields to ignore
ignore_fields = ['JAX_ASSAY_ASSAYFAILCOMMENTS', 'JAX_ASSAY_COMMENTS', 'JAX_ASSAY_ASSAYFAILREASON']
###### Construct a List for time series - hard coded
GTT_series = ['IMPC_IPG_002_001_T0', 'IMPC_IPG_002_001_T15', 'IMPC_IPG_002_001_T30', 'IMPC_IPG_002_001_T60', 'IMPC_IPG_002_001_T120']
OFD_series = ['JAX_OFD_005_001_1st5', 'JAX_OFD_005_001_2nd5', 'JAX_OFD_005_001_3rd5', 'JAX_OFD_005_001_4th5']
GRS_series = ['IMPC_GRS_001_001_T1', 'IMPC_GRS_001_001_T2', 'IMPC_GRS_001_001_T3', 'IMPC_GRS_002_001_T1', 'IMPC_GRS_002_001_T2', 'IMPC_GRS_002_001_T3']
HBD_series = ['JAX_HBD_002_001']
# Define the experiment types and their corresponding series ---- # Same for media series
experiment_series = {
"KOMP_GLUCOSE_TOLERANCE_TEST_EXPERIMENT": GTT_series,
"KOMP_OPEN_FIELD_EXPERIMENT": OFD_series,
"KOMP_HOLEBOARD_EXPERIMENT": HBD_series,
"KOMP_GRIP_STRENGTH_EXPERIMENT": GRS_series
}
media_series_impc_codes = ['IMPC_CSD_085_001_01', 'IMPC_CSD_085_001_02', 'IMPC_CSD_085_001_03',
'IMPC_CSD_085_001_04', 'IMPC_CSD_085_001_05', 'IMPC_CSD_085_001_06',
'IMPC_EYE_051_001_1', 'IMPC_EYE_051_001_2', 'IMPC_EYE_051_001_3',
'IMPC_EYE_051_001_4', 'IMPC_EYE_051_001_5',
'IMPC_ECG_025_001_f1', 'IMPC_ECG_025_001_f2']
# Parse the configuration file
mycfg = cfg.parse_config(path="/projects/galaxy/tools/cba/config_sk.yml")
baseURL = mycfg['corepfs_database']['baseURL']
username = mycfg['corepfs_database']['username']
password = mycfg['corepfs_database']['password']
experimentEndpointTemplate = mycfg['corepfs_database']['experimentEndpointTemplate']
# Initialize an empty list to store the table rows
table_rows = []
# Iterate over all experiment names
for experimentName in kompExperimentNames:
experimentEndpoint = experimentEndpointTemplate.format(exp=experimentName)
my_auth = HTTPBasicAuth(username, password)
query = baseURL + experimentEndpoint
result = requests.get(query, auth=my_auth, headers={"Prefer": "odata.maxpagesize=5000"})
# Check if the response is valid JSON
try:
exp_content = json.loads(result.content)
except json.JSONDecodeError:
print(f"Error decoding JSON for experiment: {experimentName}")
continue
# Get list of experiment entities
exp_ls = exp_content.get("value", [])
# Loop through each sample
for sample in exp_ls:
# Check the experiment status
if sample.get("JAX_EXPERIMENT_STATUS") != "Review Completed":
# Extract the experiment barcode from the start of the JSON dictionary
entity_type = sample.get("EntityTypeName", "").replace("EXPERIMENT", "ASSAY")
entity_id = sample.get("Id", "")
name = sample.get("Name", "")
baseURL_exp = "https://jacksonlabs.platformforscience.com/PROD/corelims?cmd=enter-validation&entityType="
url = f'{baseURL_exp}{entity_type.replace("_", "%20")}&entityId={entity_id}'
experiment_barcode = f'=HYPERLINK("{url}", "{name}")'
comments = "Experiment status is invalid"
table_rows.append([None, experiment_barcode, None, comments, "Experiment cannot be validated"])
continue # Skip this sample if the status is not "Review Completed"
# Extract the experiment barcode from the start of the JSON dictionary
entity_type = sample.get("EntityTypeName", "").replace("EXPERIMENT", "ASSAY")
entity_id = sample.get("Id", "")
name = sample.get("Name", "")
baseURL_exp = "https://jacksonlabs.platformforscience.com/PROD/corelims?cmd=enter-validation&entityType="
url = f'{baseURL_exp}{entity_type.replace("_", "%20")}&entityId={entity_id}'
experiment_barcode = f'=HYPERLINK("{url}", "{name}")'
for komp_sample in sample.get("EXPERIMENT_SAMPLES", []):
# Check if the field Active is 'true'
if not komp_sample.get("Active", False):
continue # Skip this sample if Active is 'false'
assay_data = komp_sample.get("ASSAY_DATA", {})
mouse_sample = komp_sample.get("ENTITY", {}).get("SAMPLE", {})
barcode = komp_sample.get("ENTITY", {}).get("Barcode", "")
# New validation logic for penbarcode location "LC5" and date of death before test date
if assay_data.get("JAX_ASSAY_PENBARCODE") == "LC5":
jax_mouse_sample_date_of_death_str = mouse_sample.get("JAX_MOUSESAMPLE_DATEOFDEATH")
if jax_mouse_sample_date_of_death_str:
jax_mouse_sample_date_of_death = datetime.strptime(jax_mouse_sample_date_of_death_str, "%Y-%m-%d")
# Find the test date from other assays in the same experiment
skip_validation = False
for other_komp_sample in sample.get("EXPERIMENT_SAMPLES", []):
other_assay_data = other_komp_sample.get("ASSAY_DATA", {})
other_penbarcode = other_assay_data.get("JAX_ASSAY_PENBARCODE")
other_test_date_str = (other_assay_data.get("JAX_ASSAY_TEST_DATE") or
other_assay_data.get("IMPC_CBC_046_001") or
other_assay_data.get("JAX_ASSAY_TESTDATE"))
# Ensure other_test_date_str is not None and handle accordingly
if other_test_date_str:
if isinstance(other_test_date_str, list):
other_test_date_str = [date_str.split()[0] if "IMPC_CBC_046_001" in other_assay_data else date_str for date_str in other_test_date_str]
elif isinstance(other_test_date_str, str):
if "IMPC_CBC_046_001" in other_assay_data:
other_test_date_str = other_test_date_str.split()[0]
other_test_date_str = [other_test_date_str] # Convert to list for uniform processing
for date_str in other_test_date_str:
# Parse the date based on the format
if "IMPC_CBC_046_001" in other_assay_data:
other_test_date = datetime.strptime(date_str, "%m/%d/%Y")
elif "JAX_ASSAY_TESTDATE" in other_assay_data:
other_test_date = datetime.strptime(date_str, "%m/%d/%Y")
else:
other_test_date = datetime.strptime(date_str, "%Y-%m-%d")
if other_test_date > jax_mouse_sample_date_of_death:
skip_validation = True
break
else:
# Find another assay_data with a valid date
for alt_komp_sample in sample.get("EXPERIMENT_SAMPLES", []):
alt_assay_data = alt_komp_sample.get("ASSAY_DATA", {})
alt_test_date_str = (alt_assay_data.get("JAX_ASSAY_TEST_DATE") or
alt_assay_data.get("IMPC_CBC_046_001") or
alt_assay_data.get("JAX_ASSAY_TESTDATE"))
if alt_test_date_str:
if isinstance(alt_test_date_str, list):
alt_test_date_str = [date_str.split()[0] if "IMPC_CBC_046_001" in alt_assay_data else date_str for date_str in alt_test_date_str]
elif isinstance(alt_test_date_str, str):
if "IMPC_CBC_046_001" in alt_assay_data:
alt_test_date_str = alt_test_date_str.split()[0]
alt_test_date_str = [alt_test_date_str] # Convert to list for uniform processing
for date_str in alt_test_date_str:
# Parse the date based on the format
if "IMPC_CBC_046_001" in alt_assay_data:
other_test_date = datetime.strptime(date_str, "%m/%d/%Y")
elif "JAX_ASSAY_TESTDATE" in alt_assay_data:
other_test_date = datetime.strptime(date_str, "%m/%d/%Y")
else:
other_test_date = datetime.strptime(date_str, "%Y-%m-%d")
if other_test_date > jax_mouse_sample_date_of_death:
skip_validation = True
break
if skip_validation:
break
# Skip validation only if both conditions are met
if skip_validation:
continue # Skip validation for this assay
# Check if "JAX_ASSAY_ASSAYFAILREASON" is set to '-'
if assay_data.get("JAX_ASSAY_ASSAYFAILREASON") != '-':
continue # Skip this sample if the condition is not met
# Remove the fields to ignore from assay_data
filtered_assay_data = {k: v for k, v in assay_data.items() if k not in ignore_fields}
# Check if the mouse is a filler
is_filler = mouse_sample.get("JAX_MOUSESAMPLE_ISFILLER")
if is_filler is not None:
comments = "Filler Mouse"
table_rows.append([barcode, experiment_barcode, assay_data.get("Barcode", ""), comments, "Procedure QC Failed"])
continue # Skip this sample if the mouse is a filler
# Check if each field in filtered_assay_data has a value and its corresponding QC field
for field in filtered_assay_data.keys():
field_qc = field + "_QC"
if field_qc in filtered_assay_data and filtered_assay_data[field_qc] != "-":
continue # Skip this field if the QC field is not set to "-"
if field != "Barcode" and not filtered_assay_data[field]:
comments = f"'{field}' does not have a value"
table_rows.append([barcode, experiment_barcode, assay_data.get("Barcode", ""), comments, "Procedure QC Failed"])
# Get the experiment type
experiment_type = entity.get("EXPERIMENT", {}).get("EntityTypeName", "") if isinstance(entity.get("EXPERIMENT", {}), dict) else ""
# Check if any value from the corresponding series list is present in the filtered_assay_data
for experiment, series in experiment_series.items():
if experiment_type == experiment:
for field in series:
if field not in filtered_assay_data:
comments = f"'{field}' does not have a value"
table_rows.append([barcode, experiment_barcode, assay_data.get("Barcode", ""), comments, "Procedure QC Failed"])
break
# Check for experimenter id present in the filtered_assay_data
if not any(value in exp_id for value in filtered_assay_data.values()):
comments = "No/incorrect experimenter ID associated with this experiment"
table_rows.append([barcode, experiment_barcode, assay_data.get("Barcode", ""), comments, "Procedure QC Failed"])
# For media series, check if the file path exists, if it does, check if there is a file on the server
for field in media_series_impc_codes:
if field in filtered_assay_data.keys():
# Check if it has a value (essentially a file path)
file_path = filtered_assay_data[field]
if file_path != '0' and file_path is not None:
file_exists = os.path.exists(file_path)
if not file_exists:
comments = f"Image not found '{file_path}' for '{field}'"
table_rows.append([barcode, experiment_barcode, assay_data.get("Barcode", ""), comments, "Procedure QC Failed"])
# Check the date of birth and assay test date
dob_str = mouse_sample.get("JAX_MOUSESAMPLE_DATEOFBIRTH")
test_date_str = assay_data.get("JAX_ASSAY_TEST_DATE")
if dob_str and test_date_str:
dob = datetime.strptime(dob_str, "%Y-%m-%d")
test_date = datetime.strptime(test_date_str, "%Y-%m-%d")
weeks_diff = (test_date - dob).days / 7
if not (dob + timedelta(weeks=3.5) <= test_date <= dob + timedelta(weeks=18)):
comments = f"Test date is {weeks_diff:.2f} weeks after DOB"
table_rows.append([barcode, experiment_barcode, assay_data.get("Barcode", ""), comments, "Procedure QC Failed"])
# Create a DataFrame from the table rows
assay_data_df = pd.DataFrame(table_rows, columns=["Mouse_ID", "Experiment_Barcode", "Assay_Data_Barcode", "Comments", "Assay_Fail_Reason"])
# Convert IMPC codes to column headers
impc_columnheader = pd.read_excel('/projects/galaxy/tools/cba/impc_column_header.xlsx')
# Create a dictionary from the lookup DataFrame for quick lookup
lookup_dict = dict(zip(impc_columnheader['Code'], impc_columnheader['Column_header']))
# Function to replace IMPC code with column header in comments
def replace_impc_with_header(comment):
# Find all IMPC codes in the comment
impc_codes = re.findall(r'IMPC_\w+', comment)
for code in impc_codes:
if code in lookup_dict:
comment = comment.replace(code, lookup_dict[code])
return comment
# Apply the function to the Comments column in assay_data_df
updated_assay_data_df = assay_data_df.copy()
updated_assay_data_df['Comments'] = updated_assay_data_df['Comments'].apply(replace_impc_with_header)
# Merge mouse_df with assay_data_df on Mouse_ID
merged_df = pd.merge(updated_assay_data_df, mouse_df, on="Mouse_ID", how="outer")
# Merge the result with metadata_df on Experiment_Barcode
merged_df = pd.merge(merged_df, metadata_df, on="Experiment_Barcode", how="outer")
# Combine the Comments columns into a single column
merged_df['Comments'] = merged_df['Comments_x'].combine_first(merged_df['Comments_y']).combine_first(merged_df['Comments'])
# Drop the extra Comments columns
merged_df.drop(columns=['Comments_x', 'Comments_y'], inplace=True)
# Fill missing values with NaN
merged_df.fillna(value=pd.NA, inplace=True)
# Get the current date
current_date = datetime.now().strftime("%Y%m%d")
# Write the merged table to an Excel file with the current date as the filename
if len(sys.argv) > 1:
file_name = sys.argv[1]
else:
file_name = f"QC_validation_header_final0_{current_date}.csv"
merged_df.to_csv(file_name, index=False)