Skip to content

Commit

Permalink
add db schema script processing and derived tables
Browse files Browse the repository at this point in the history
  • Loading branch information
importer system account committed Oct 17, 2024
1 parent af209c9 commit bc39952
Show file tree
Hide file tree
Showing 2 changed files with 163 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/usr/bin/env python3

import datetime
import re
import subprocess

def get_list_of_genetic_profile_ids():
get_genetic_profile_id_list_query="SELECT genetic_profile_id FROM genetic_profile WHERE genetic_alteration_type NOT IN ('GENERIC_ASSAY', 'MUTATION_EXTENDED', 'STRUCTURAL_VARIANT')"
query_argument_template="--query={0}"
query_argument = query_argument_template.format(get_genetic_profile_id_list_query)
clickhouse_client_obtain_genetic_profile_id_list = ["clickhouse", "client", "--config-file=clickhouse_client_config_2024-10-14-09-03-02.yaml", query_argument]
#TODO remove hardcode of clickhouse config file and accept (or generate) this file based on command like arguments
genetic_profile_query_result = subprocess.run(clickhouse_client_obtain_genetic_profile_id_list, shell=False, capture_output=True)#, stderr=stderr_file, stdout=stdout_file)
genetic_profile_id_list_string = genetic_profile_query_result.stdout.decode("utf-8")
genetic_profile_id_list = genetic_profile_id_list_string.splitlines()
return genetic_profile_id_list, genetic_profile_query_result.returncode

#TODO read SQL statement templates from external files rather than hardcoding
INSERT_EVENTS_INTO_GENETIC_ALTERATION_DERIVED_QUERY_TEMPLATE = '''
INSERT INTO TABLE genetic_alteration_derived
SELECT
sample_unique_id,
cancer_study_identifier,
hugo_gene_symbol,
replaceOne(stable_id, concat(sd.cancer_study_identifier, '_'), '') as profile_type,
alteration_value
FROM
(SELECT
sample_id,
hugo_gene_symbol,
stable_id,
alteration_value
FROM
(SELECT
g.hugo_gene_symbol AS hugo_gene_symbol,
gp.stable_id as stable_id,
arrayMap(x -> (x = '' ? NULL : x), splitByString(',', assumeNotNull(substring(ga.values,1,-1)))) AS alteration_value,
arrayMap(x -> (x = '' ? NULL : toInt32(x)), splitByString(',', assumeNotNull(substring(gps.ordered_sample_list,1,-1)))) AS sample_id
FROM
genetic_profile gp
JOIN genetic_profile_samples gps ON gp.genetic_profile_id = gps.genetic_profile_id
JOIN genetic_alteration ga ON gp.genetic_profile_id = ga.genetic_profile_id
JOIN gene g ON ga.genetic_entity_id = g.genetic_entity_id
WHERE
gp.genetic_profile_id={0})
ARRAY JOIN alteration_value, sample_id
WHERE alteration_value != 'NA') AS subquery
JOIN sample_derived sd ON sd.internal_id = subquery.sample_id'''
INSERT_EVENTS_INTO_GENETIC_ALTERATION_DERIVED_QUERY_TEMPLATE_1LINE = re.sub('\s+',' ', INSERT_EVENTS_INTO_GENETIC_ALTERATION_DERIVED_QUERY_TEMPLATE).strip()

#TODO add generic assay query too and allow selection of which to run by command line argument

def insert_event_records_for_profile_into_derived_table(derived_table_name, genetic_profile_id):
insert_events_query = INSERT_EVENTS_INTO_GENETIC_ALTERATION_DERIVED_QUERY_TEMPLATE_1LINE.format(genetic_profile_id)
query_argument_template="--query={0}"
query_argument = query_argument_template.format(insert_events_query)
clickhouse_client_insert_records = ["clickhouse", "client", "--config-file=clickhouse_client_config_2024-10-14-09-03-02.yaml", query_argument]
insert_events_query_result = subprocess.run(clickhouse_client_insert_records, shell=False, capture_output=True)#, stderr=stderr_file, stdout=stdout_file)
print("stderr from insert:")
print(insert_events_query_result.stderr.decode("utf-8"))
return insert_events_query_result.returncode

def main():
print(datetime.datetime.now())
genetic_profile_id_list, returncode = get_list_of_genetic_profile_ids()
print(genetic_profile_id_list)
print("return code was {0}".format(returncode))
for genetic_profile_id in genetic_profile_id_list:
print(datetime.datetime.now())
returncode = insert_event_records_for_profile_into_derived_table("genetic_alteration", genetic_profile_id)
if returncode != 0:
print("Error occurred during insertion of record for profile {0}".format(genetic_profile_id))
#TODO add a step where genetic profiles are mapped to cancer study stable id, and print that instead
print(datetime.datetime.now())
print("stop")

if __name__ == '__main__':
main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/env bash

output_filename_prefix="$1"
shift
unset input_filename
declare -a input_filename
input_filename=()
while [ $# -gt 0 ] ; do
next_arg="$1"
if ! [ -r "$next_arg" ] ; then
echo "Error : cannot read from file '$next_arg'" >&2
exit 1
fi
input_filename+=($next_arg)
shift
done

zero_padded_string=""

function set_zero_padded_string() {
string=$1
field_width=$2
if [ ${#string} -ge $field_width ] ; then
zero_padded_string="$string"
return 0
fi
pad=""
local total_length=${#string}
while [ $total_length -lt $field_width ] ; do
pad="0$pad"
total_length=$(($total_length+1))
done
zero_padded_string="$pad""$string"
return 0
}

semicolon_count=0
pos=0
while [ $pos -lt ${#input_filename[@]} ] ; do
inputfile="${input_filename[$pos]}"
file_semicolon_count=$( cat $inputfile | grep -c ';' )
semicolon_count=$(($semicolon_count+$file_semicolon_count))
pos=$(($pos+1))
done
output_filecount_field_width=${#semicolon_count}

MULTI_SEMICOLON_RE=".*;.*;.*"
EARLY_SEMICOLON_RE=".*;.*[[:graph:]]"
HAS_SEMICOLON_RE=".*;.*"
output_file_index=1
pos=0
set_zero_padded_string "$output_file_index" "$output_filecount_field_width"
outputfile="${output_filename_prefix}_${zero_padded_string}.sql"
rm -f "$outputfile"
while [ $pos -lt ${#input_filename[@]} ] ; do
inputfile="${input_filename[$pos]}"
IFS=''; while read line ; do
if [[ $line =~ $MULTI_SEMICOLON_RE ]] ; then
echo "Error : line encountered in file $inputfile with multiple semicolons (unparsable) : $line" >&2
exit 1
fi
if [[ $line =~ $EARLY_SEMICOLON_RE ]] ; then
echo "Error : line encountered in file $inputfile with content after the semicolon (unparsable) : $line" >&2
exit 1
fi
echo "$line" >> "$outputfile"
if [[ $line =~ $HAS_SEMICOLON_RE ]] ; then
# this is the final line of the output file
output_file_index=$(($output_file_index+1))
set_zero_padded_string "$output_file_index" "$output_filecount_field_width"
outputfile="${output_filename_prefix}_${zero_padded_string}.sql"
rm -f "$outputfile"
fi
done < "$inputfile"
if [ -e "$outputfile" ] && ! [ -s "$outputfile" ] ; then
# if we have written anything to the current output file, it is now done (even without a terminating semicolon)
output_file_index=$(($output_file_index+1))
set_zero_padded_string "$output_file_index" "$output_filecount_field_width"
outputfile="${output_filename_prefix}_${zero_padded_string}.sql"
rm -f "$outputfile"
fi
pos=$(($pos+1))
done


0 comments on commit bc39952

Please sign in to comment.