-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add db schema script processing and derived tables
- Loading branch information
importer system account
committed
Oct 17, 2024
1 parent
af209c9
commit bc39952
Showing
2 changed files
with
163 additions
and
0 deletions.
There are no files selected for viewing
78 changes: 78 additions & 0 deletions
78
scripts/clickhouse_import_support/create_derived_tables_in_clickhouse_database_by_profile.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import datetime | ||
import re | ||
import subprocess | ||
|
||
def get_list_of_genetic_profile_ids(config_file="clickhouse_client_config_2024-10-14-09-03-02.yaml"):
    """Query ClickHouse for the ids of the genetic profiles to be processed.

    Runs the ``clickhouse client`` command line tool and selects every
    genetic_profile_id whose genetic_alteration_type is not GENERIC_ASSAY,
    MUTATION_EXTENDED or STRUCTURAL_VARIANT.

    Args:
        config_file: path of the clickhouse client configuration file passed
            via ``--config-file``. Defaults to the file name this script
            originally hard-coded.

    Returns:
        A tuple ``(genetic_profile_id_list, returncode)`` where the list holds
        one string per non-blank line of the client's stdout and returncode is
        the exit status of the clickhouse client process.
    """
    #TODO accept (or generate) the clickhouse config file based on command line arguments
    get_genetic_profile_id_list_query = (
        "SELECT genetic_profile_id FROM genetic_profile WHERE genetic_alteration_type"
        " NOT IN ('GENERIC_ASSAY', 'MUTATION_EXTENDED', 'STRUCTURAL_VARIANT')"
    )
    clickhouse_client_obtain_genetic_profile_id_list = [
        "clickhouse",
        "client",
        "--config-file={0}".format(config_file),
        "--query={0}".format(get_genetic_profile_id_list_query),
    ]
    genetic_profile_query_result = subprocess.run(
        clickhouse_client_obtain_genetic_profile_id_list,
        shell=False,
        capture_output=True,
    )
    if genetic_profile_query_result.returncode != 0:
        # surface the client's diagnostics; otherwise the caller only sees a bare return code
        print("stderr from genetic profile id query:")
        print(genetic_profile_query_result.stderr.decode("utf-8", errors="replace"))
    genetic_profile_id_list_string = genetic_profile_query_result.stdout.decode("utf-8")
    # skip blank lines so that an empty id is never substituted into a later query
    genetic_profile_id_list = [
        line.strip() for line in genetic_profile_id_list_string.splitlines() if line.strip()
    ]
    return genetic_profile_id_list, genetic_profile_query_result.returncode
|
||
#TODO read SQL statement templates from external files rather than hardcoding
# Template for the per-profile insert into genetic_alteration_derived.
# The single placeholder {0} receives the genetic_profile_id to process.
# The comma separated ga.values and gps.ordered_sample_list columns are split
# into parallel arrays, unrolled with ARRAY JOIN, filtered to drop 'NA' values
# and joined to sample_derived on the internal sample id.
INSERT_EVENTS_INTO_GENETIC_ALTERATION_DERIVED_QUERY_TEMPLATE = '''
INSERT INTO TABLE genetic_alteration_derived
SELECT
    sample_unique_id,
    cancer_study_identifier,
    hugo_gene_symbol,
    replaceOne(stable_id, concat(sd.cancer_study_identifier, '_'), '') as profile_type,
    alteration_value
FROM
    (SELECT
        sample_id,
        hugo_gene_symbol,
        stable_id,
        alteration_value
    FROM
        (SELECT
            g.hugo_gene_symbol AS hugo_gene_symbol,
            gp.stable_id as stable_id,
            arrayMap(x -> (x = '' ? NULL : x), splitByString(',', assumeNotNull(substring(ga.values,1,-1)))) AS alteration_value,
            arrayMap(x -> (x = '' ? NULL : toInt32(x)), splitByString(',', assumeNotNull(substring(gps.ordered_sample_list,1,-1)))) AS sample_id
        FROM
            genetic_profile gp
            JOIN genetic_profile_samples gps ON gp.genetic_profile_id = gps.genetic_profile_id
            JOIN genetic_alteration ga ON gp.genetic_profile_id = ga.genetic_profile_id
            JOIN gene g ON ga.genetic_entity_id = g.genetic_entity_id
        WHERE
            gp.genetic_profile_id={0})
    ARRAY JOIN alteration_value, sample_id
    WHERE alteration_value != 'NA') AS subquery
    JOIN sample_derived sd ON sd.internal_id = subquery.sample_id'''
# Collapse every run of whitespace (including newlines) to one space so the
# statement can be passed as a single --query argument. The pattern is a raw
# string: '\s' in a normal string literal is an invalid escape sequence.
INSERT_EVENTS_INTO_GENETIC_ALTERATION_DERIVED_QUERY_TEMPLATE_1LINE = re.sub(r'\s+', ' ', INSERT_EVENTS_INTO_GENETIC_ALTERATION_DERIVED_QUERY_TEMPLATE).strip()
|
||
#TODO add generic assay query too and allow selection of which to run by command line argument | ||
|
||
def insert_event_records_for_profile_into_derived_table(derived_table_name, genetic_profile_id, config_file="clickhouse_client_config_2024-10-14-09-03-02.yaml"):
    """Insert the derived event records of one genetic profile via the clickhouse client.

    Fills the genetic_profile_id into
    INSERT_EVENTS_INTO_GENETIC_ALTERATION_DERIVED_QUERY_TEMPLATE_1LINE, runs the
    statement with ``clickhouse client`` and prints the client's stderr.

    Args:
        derived_table_name: currently unused -- the target table name is fixed
            inside the query template. Kept so existing callers keep working.
        genetic_profile_id: id substituted into the template's WHERE clause.
        config_file: path of the clickhouse client configuration file passed
            via ``--config-file``. Defaults to the file name this script
            originally hard-coded.

    Returns:
        The exit status of the clickhouse client process.
    """
    #TODO accept (or generate) the clickhouse config file based on command line arguments
    insert_events_query = INSERT_EVENTS_INTO_GENETIC_ALTERATION_DERIVED_QUERY_TEMPLATE_1LINE.format(genetic_profile_id)
    clickhouse_client_insert_records = [
        "clickhouse",
        "client",
        "--config-file={0}".format(config_file),
        "--query={0}".format(insert_events_query),
    ]
    insert_events_query_result = subprocess.run(
        clickhouse_client_insert_records,
        shell=False,
        capture_output=True,
    )
    print("stderr from insert:")
    # errors="replace" keeps diagnostics printable even if the client emits invalid UTF-8
    print(insert_events_query_result.stderr.decode("utf-8", errors="replace"))
    return insert_events_query_result.returncode
|
||
def main():
    """Populate the derived table for every selected genetic profile.

    Prints a timestamp, obtains the list of genetic profile ids, then runs the
    insert for each id in turn (printing a timestamp before each one). A
    failing insert is reported and processing continues with the next profile.

    Returns:
        0 when the profile id query and every insert returned exit status 0,
        otherwise 1.
    """
    print(datetime.datetime.now())
    genetic_profile_id_list, returncode = get_list_of_genetic_profile_ids()
    print(genetic_profile_id_list)
    print("return code was {0}".format(returncode))
    # remember any failure so it can be reflected in the process exit status
    failure_seen = returncode != 0
    for genetic_profile_id in genetic_profile_id_list:
        print(datetime.datetime.now())
        returncode = insert_event_records_for_profile_into_derived_table("genetic_alteration", genetic_profile_id)
        if returncode != 0:
            failure_seen = True
            print("Error occurred during insertion of record for profile {0}".format(genetic_profile_id))
            #TODO add a step where genetic profiles are mapped to cancer study stable id, and print that instead
    print(datetime.datetime.now())
    print("stop")
    return 1 if failure_seen else 0


if __name__ == '__main__':
    # propagate failures to the calling shell instead of always exiting with 0
    raise SystemExit(main())
85 changes: 85 additions & 0 deletions
85
scripts/clickhouse_import_support/split_input_sql_files_by_semicolon.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Arguments : output_filename_prefix [input_sql_file ...]
# The first argument is the prefix used for every generated output file name.
# All remaining arguments are input files; each must be readable.
if [ $# -lt 1 ] ; then
    echo "Usage : $0 output_filename_prefix [input_sql_file ...]" >&2
    exit 1
fi
output_filename_prefix="$1"
shift
declare -a input_filename=()
for next_arg in "$@" ; do
    if ! [ -r "$next_arg" ] ; then
        echo "Error : cannot read from file '$next_arg'" >&2
        exit 1
    fi
    # quoted so that a filename containing whitespace or glob characters stays one array element
    input_filename+=("$next_arg")
done
|
||
# Result variable written by set_zero_padded_string.
zero_padded_string=""

# Left-pad a string with zeros up to a minimum width.
#   $1 : the string to pad
#   $2 : the minimum field width (treated as 0 when omitted)
# The result is stored in the global variable zero_padded_string. A string
# that is already at least field-width characters long is stored unchanged.
# Always returns 0.
function set_zero_padded_string() {
    # all working variables are local so the caller's variables are not clobbered
    local string="$1"
    local field_width="${2:-0}"
    local pad=""
    local total_length=${#string}
    while [ "$total_length" -lt "$field_width" ] ; do
        pad="0$pad"
        total_length=$((total_length+1))
    done
    zero_padded_string="${pad}${string}"
    return 0
}
|
||
# Count the lines containing a semicolon across all input files. The number of
# digits in that total is used as the zero-padded width of the output file index.
semicolon_count=0
for inputfile in "${input_filename[@]}" ; do
    # grep reads the file directly; the quoted name keeps paths with whitespace intact
    file_semicolon_count=$( grep -c ';' "$inputfile" )
    semicolon_count=$((semicolon_count+file_semicolon_count))
done
output_filecount_field_width=${#semicolon_count}
|
||
# Patterns used to classify an input line by its semicolon content.
MULTI_SEMICOLON_RE=".*;.*;.*"
EARLY_SEMICOLON_RE=".*;.*[[:graph:]]"
HAS_SEMICOLON_RE=".*;.*"

# Point outputfile at the file for the current output_file_index and remove
# any stale file of that name so that appending starts from an empty file.
function select_current_output_file() {
    set_zero_padded_string "$output_file_index" "$output_filecount_field_width"
    outputfile="${output_filename_prefix}_${zero_padded_string}.sql"
    rm -f "$outputfile"
}

# Copy the input lines into numbered output files, starting a new output file
# after every line that contains a semicolon. A line with more than one
# semicolon, or with visible content after its semicolon, aborts the script.
# NOTE(review): every ';' is treated as a terminator, including one inside a
# string literal or comment -- confirm the inputs never contain such lines.
output_file_index=1
select_current_output_file
for inputfile in "${input_filename[@]}" ; do
    # IFS is cleared for read only (leading/trailing whitespace is preserved),
    # -r keeps backslashes literal, and the trailing test still processes a
    # final line that lacks a terminating newline
    while IFS='' read -r line || [ -n "$line" ] ; do
        if [[ $line =~ $MULTI_SEMICOLON_RE ]] ; then
            echo "Error : line encountered in file $inputfile with multiple semicolons (unparsable) : $line" >&2
            exit 1
        fi
        if [[ $line =~ $EARLY_SEMICOLON_RE ]] ; then
            echo "Error : line encountered in file $inputfile with content after the semicolon (unparsable) : $line" >&2
            exit 1
        fi
        # printf rather than echo so a line such as "-n" is written verbatim
        printf '%s\n' "$line" >> "$outputfile"
        if [[ $line =~ $HAS_SEMICOLON_RE ]] ; then
            # this is the final line of the output file
            output_file_index=$((output_file_index+1))
            select_current_output_file
        fi
    done < "$inputfile"
    if grep -q '[[:graph:]]' "$outputfile" 2>/dev/null ; then
        # if we have written anything to the current output file, it is now done (even without a terminating semicolon)
        output_file_index=$((output_file_index+1))
        select_current_output_file
    else
        # only blank lines followed the last semicolon : discard them rather than emit a statement-less file
        rm -f "$outputfile"
    fi
done
|
||
|