Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add scripts for TSV utilities #19

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
335 changes: 54 additions & 281 deletions csv/definitions/example/example_flattened.tsv

Large diffs are not rendered by default.

302 changes: 51 additions & 251 deletions csv/definitions/pcgl/pcgl_flattened.tsv

Large diffs are not rendered by default.

350 changes: 0 additions & 350 deletions csv/example/example_flattened.csv

This file was deleted.

2 changes: 1 addition & 1 deletion csv/template/example/comorbidity_template.tsv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
submitter_participant_id age_at_comorbidity_diagnosis comorbidity_code comorbidity_term comorbidity_treatment_status comorbidity_status
submitter_participant_id age_at_comorbidity_diagnosis comorbidity_code comorbidity_term comorbidity_treatment_status comorbidity_status
2 changes: 1 addition & 1 deletion csv/template/example/demographic_template.tsv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
study_id submitter_participant_id gender sex_at_birth ethnicity race country_of_birth ancestry height weight highest_education_level_achieved employment type_of_residence number_of_other_people_in_household pregnancy
study_id submitter_participant_id gender sex_at_birth ethnicity race country_of_birth ancestry height weight highest_education_level_achieved employment type_of_residence number_of_other_people_in_household pregnancy
2 changes: 1 addition & 1 deletion csv/template/example/diagnosis_template.tsv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
submitter_participant_id submitter_diagnosis_id age_at_diagnosis disease_code disease_term disease_category covid19_severity covid19_vaccine_doses
submitter_participant_id submitter_diagnosis_id age_at_diagnosis disease_code disease_term disease_category covid19_severity covid19_vaccine_doses
2 changes: 1 addition & 1 deletion csv/template/example/exposure_template.tsv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
submitter_participant_id alcohol_consumption smoking_status physical_activity
submitter_participant_id alcohol_consumption smoking_status physical_activity
2 changes: 1 addition & 1 deletion csv/template/example/follow_up_template.tsv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
submitter_participant_id age_at_followup disease_status_at_followup
submitter_participant_id age_at_followup disease_status_at_followup
2 changes: 1 addition & 1 deletion csv/template/example/imaging_template.tsv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
image_hosted_url image_hosted_format image_processing_pipeline image_processing_personel image_processing_null_reason
image_hosted_url image_hosted_format image_processing_pipeline image_processing_personel image_processing_null_reason
2 changes: 1 addition & 1 deletion csv/template/example/measurement_template.tsv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
submitter_participant_id age_at_measurement measurement_code measurement_term measurement_result_numeric measurement_unit measurement_result_categorical
submitter_participant_id age_at_measurement measurement_code measurement_term measurement_result_numeric measurement_unit measurement_result_categorical
2 changes: 1 addition & 1 deletion csv/template/example/medication_template.tsv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
submitter_treatment_id drug_code drug_term drug_dose_units prescribed_cumulative_drug_dose actual_cumulative_drug_dose
submitter_treatment_id drug_code drug_term drug_dose_units prescribed_cumulative_drug_dose actual_cumulative_drug_dose
2 changes: 1 addition & 1 deletion csv/template/example/participant_template.tsv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
study_id submitter_participant_id age_at_enrollment vital_status cause_of_death age_at_death
study_id submitter_participant_id age_at_enrollment vital_status cause_of_death age_at_death
2 changes: 1 addition & 1 deletion csv/template/example/phenotype_template.tsv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
submitter_participant_id age_at_phenotype phenotype_code phenotype_term phenotype_observed phenotype_duration phenotype_severity
submitter_participant_id age_at_phenotype phenotype_code phenotype_term phenotype_observed phenotype_duration phenotype_severity
2 changes: 1 addition & 1 deletion csv/template/example/procedure_template.tsv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
submitter_treatment_id procedure_code procedure_term procedure_body_site_code procedure_body_site_term
submitter_treatment_id procedure_code procedure_term procedure_body_site_code procedure_body_site_term
2 changes: 1 addition & 1 deletion csv/template/example/specimen_template.tsv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
submitter_participant_id submitter_specimen_id specimen_tissue_source_code specimen_tissue_source_term specimen_storage specimen_processing age_at_specimen_collection specimen_anatomic_location_code specimen_anatomic_location_label specimen_laterality
submitter_participant_id submitter_specimen_id specimen_tissue_source_code specimen_tissue_source_term specimen_storage specimen_processing age_at_specimen_collection specimen_anatomic_location_code specimen_anatomic_location_label specimen_laterality
2 changes: 1 addition & 1 deletion csv/template/example/treatment_template.tsv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
submitter_participant_id submitter_treatment_id treatment_type age_at_treatment treatment_duration treatment_intent treatment_response treatment_status
submitter_participant_id submitter_treatment_id treatment_type age_at_treatment treatment_duration treatment_intent treatment_response treatment_status
156 changes: 156 additions & 0 deletions scripts/generateFlatDefinitionsTsvFromFullLinkml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Copyright (C) 2022, icgc-argo

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

Authors:
Edmund Su
"""

import json
import glob
import urllib
import requests
import re
import numpy as np
import os
import random
import jsonschema
import string
import time
import random
import hashlib
import shutil
import argparse
import copy
import pandas as pd
from linkml_runtime.loaders import yaml_loader
from linkml_runtime.dumpers import yaml_dumper
from linkml_runtime.linkml_model import SchemaDefinition, ClassDefinition

def main():
    """
    Flatten a FULL LinkML schema into one TSV of slot definitions.

    Command-line options:
        -c / --custom_linkml: path to the full LinkML YAML schema (required).
        -o / --output_directory: directory that receives the TSV; defaults
            to the current working directory.

    The output file is named after the input file: a trailing "_full.yaml"
    is replaced by "_flattened.tsv". When the input does not carry that
    suffix a warning is printed and the file extension is replaced instead,
    so the TSV can never be written under the input's own file name.
    """
    expected_suffix = "_full.yaml"

    parser = argparse.ArgumentParser(description='The script aims to translate the FULL linkML model to flatten TSV for viewing.')
    parser.add_argument('-c', '--custom_linkml', dest="custom_linkml", help="The custom full LinkML schema", required=True, type=str)
    parser.add_argument('-o', '--output_directory', dest="output_directory", help="Output directory to save the flattened definitions TSV", default=os.getcwd(), type=str)

    cli_input = parser.parse_args()

    source_name = os.path.basename(cli_input.custom_linkml)
    if source_name.endswith(expected_suffix):
        stem = source_name[:-len(expected_suffix)]
    else:
        # Best effort: keep going as before, but derive a distinct output
        # name so the result cannot clobber the schema it was built from.
        print("%s does not end with the correct suffix. Please check the correct yaml was provided." % (cli_input.custom_linkml))
        stem = os.path.splitext(source_name)[0]
    output_path = os.path.join(cli_input.output_directory, stem + "_flattened.tsv")

    model = yaml_loader.load(cli_input.custom_linkml, SchemaDefinition)

    definitions = initialize_dataframe()
    populateDataFrame(model, definitions)

    # Class names are written in lower case in the "schema" column.
    definitions['schema'] = [val.lower() for val in definitions['schema'].values.tolist()]
    definitions.to_csv(output_path, index=False, sep='\t')

def initialize_dataframe():
    """
    Build the empty definitions table.

    Returns:
        A pandas DataFrame with no rows and the columns field, schema,
        required, dataType, description, comments and exact_mappings,
        in that order.
    """
    column_names = (
        'field',
        'schema',
        'required',
        'dataType',
        'description',
        'comments',
        'exact_mappings',
    )
    frame = pd.DataFrame()
    for name in column_names:
        frame[name] = None
    return frame

def _slot_value(slot_def, key):
    """Return ``slot_def[key]``, or None when the slot does not carry ``key``."""
    if key in slot_def:
        return slot_def[key]
    return None


def _joined_or_none(values):
    """Join a list of strings with ";"; None or an empty list yields None."""
    if not values:
        return None
    return ";".join(values)


def _validation_rules(model, slot_def):
    """Collect the "name:value" validation rules of one slot, in output order."""
    rules = []
    for key in ("pattern", "minimum_value", "maximum_value"):
        value = _slot_value(slot_def, key)
        if value is not None:
            rules.append("%s:%s" % (key, str(value)))

    # A range whose name contains "Menu" refers to an enum; list its
    # permissible values. A slot without a range has nothing to list.
    slot_range = _slot_value(slot_def, "range")
    if slot_range is not None and "Menu" in slot_range:
        permissible = model.enums[slot_range]['permissible_values']
        rules.append("%s:%s" % ("Enum", ",".join(list(permissible))))
    return rules


def populateDataFrame(model,definitions):
    """
    Fill ``definitions`` in place with one row per (class, slot) pair.

    Args:
        model: schema object exposing ``classes``, ``slots`` and ``enums``
            mappings; each class lists its slot names under "slots".
        definitions: DataFrame to fill; rows are written at labels
            0, 1, 2, ... in class/slot iteration order.

    Columns written per row:
        field, schema: slot name and owning class name.
        required: the slot's value, or False when absent.
        dataType: "string" for ranges containing "Menu", otherwise the
            range itself; None when the slot has no range.
        description: the slot's value or None.
        comments, exact_mappings: entries joined with ";", or None when
            there are none.
        validation: ";"-joined pattern / minimum_value / maximum_value /
            Enum rules; only written for slots that have at least one rule.
    """
    row = 0
    for lm_class in model.classes:
        for slot in model.classes[lm_class]['slots']:
            definitions.loc[row, "field"] = slot
            definitions.loc[row, "schema"] = lm_class
            row += 1

    for ind in definitions.index.tolist():
        slot_def = model.slots[definitions.loc[ind, "field"]]

        required = _slot_value(slot_def, "required")
        definitions.loc[ind, "required"] = False if required is None else required

        slot_range = _slot_value(slot_def, "range")
        if slot_range is None:
            data_type = None
        elif "Menu" in slot_range:
            # Enum-backed slots are plain strings in the flattened view.
            data_type = "string"
        else:
            data_type = slot_range
        definitions.loc[ind, "dataType"] = data_type

        definitions.loc[ind, "description"] = _slot_value(slot_def, "description")

        # Both multi-valued fields are flattened to one string so a single
        # cell never receives a list.
        definitions.loc[ind, "comments"] = _joined_or_none(_slot_value(slot_def, "comments"))
        definitions.loc[ind, "exact_mappings"] = _joined_or_none(_slot_value(slot_def, "exact_mappings"))

        validation_rules = _validation_rules(model, slot_def)
        if validation_rules:
            definitions.loc[ind, "validation"] = ";".join(validation_rules)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
82 changes: 82 additions & 0 deletions scripts/generateTemplateTsvFromFullLinkml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Copyright (C) 2022, icgc-argo

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

Authors:
Edmund Su
"""

import json
import glob
import urllib
import requests
import re
import numpy as np
import os
import random
import jsonschema
import string
import time
import random
import hashlib
import shutil
import argparse
import copy
import pandas as pd
from linkml_runtime.loaders import yaml_loader
from linkml_runtime.dumpers import yaml_dumper
from linkml_runtime.linkml_model import SchemaDefinition, ClassDefinition

def main():
    """
    Write one header-only template TSV per class of a FULL LinkML schema.

    Command-line options:
        -c / --custom_linkml: path to the full LinkML YAML schema (required).
        -o / --output_directory: directory that receives the templates;
            defaults to the current working directory.

    For every class in the schema a file "<class name in lower case>_template.tsv"
    is written whose columns are the class's slot names.
    """
    parser = argparse.ArgumentParser(description='The script aims to generate one empty template TSV per class of the FULL linkML model.')
    parser.add_argument('-c', '--custom_linkml', dest="custom_linkml", help="The custom full LinkML schema", required=True, type=str)
    parser.add_argument('-o', '--output_directory', dest="output_directory", help="Output directory to save the template TSV files", default=os.getcwd(), type=str)

    cli_input = parser.parse_args()

    # Warning only: output names come from the class names, not the input
    # file name, so an unexpected suffix is not fatal here.
    if not cli_input.custom_linkml.endswith("_full.yaml"):
        print("%s does not end with the correct suffix. Please check the correct yaml was provided." % (cli_input.custom_linkml))

    model = yaml_loader.load(cli_input.custom_linkml, SchemaDefinition)

    # Build every template before writing any file, so a failure while
    # reading the model leaves no partial output behind.
    templates = {}
    for lm_class in model.classes:
        templates[lm_class] = pd.DataFrame()
        populateDataFrame(model, templates[lm_class], lm_class)

    for class_name, template in templates.items():
        output_path = os.path.join(cli_input.output_directory, "%s_template.tsv" % class_name.lower())
        template.to_csv(output_path, sep='\t', index=False)


def populateDataFrame(model,template,key):
    """
    Add one empty column to ``template`` for each slot of class ``key``.

    Args:
        model: schema object whose ``classes[key]['slots']`` lists slot names.
        template: DataFrame modified in place; each slot name becomes a
            column set to None.
        key: name of the class whose slots are used.
    """
    slot_names = model.classes[key]['slots']
    for column in slot_names:
        template[column] = None

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()