From 6d71f9adda1c4ed93f25ef925dcef77d69e8effb Mon Sep 17 00:00:00 2001 From: naumenko-sa Date: Wed, 6 Feb 2019 23:12:32 -0500 Subject: [PATCH] updates variant database generation scripts --- cre.database.header | 2 +- cre.database.header1 | 2 +- cre.database.py | 30 +++++++++++------------------- cre.database.sh | 8 +++++--- cre.database_merge.py | 7 +++---- 5 files changed, 21 insertions(+), 28 deletions(-) diff --git a/cre.database.header b/cre.database.header index f44fb89..59ae7aa 100644 --- a/cre.database.header +++ b/cre.database.header @@ -1 +1 @@ -Position,Ref,Alt,Variation,Zygosity,Protein_change_ensembl,Gene,Conserved_in_29_mammals,Sift_score,Polyphen_score,Cadd_score,Gnomad_maf,Info,Sample +Position,Ref,Alt,Variation,Zygosity,Refseq_change,Gene,Conserved_in_20_mammals,Sift_score,Polyphen_score,Cadd_score,Gnomad_af,Sample diff --git a/cre.database.header1 b/cre.database.header1 index c536bb0..17ec636 100644 --- a/cre.database.header1 +++ b/cre.database.header1 @@ -1 +1 @@ -Sample,Position,Ref,Alt,Variation,Zygosity,Protein_change_ensembl,Gene,Conserved_in_29_mammals,Sift_score,Polyphen_score,Cadd_score,Gnomad_maf,Info,Sample +Sample,Position,Ref,Alt,Variation,Zygosity,Refseq_change,Gene,Conserved_in_20_mammals,Sift_score,Polyphen_score,Cadd_score,Gnomad_af,Sample diff --git a/cre.database.py b/cre.database.py index d523649..6109845 100755 --- a/cre.database.py +++ b/cre.database.py @@ -5,37 +5,29 @@ import re family = sys.argv[1] +report = sys.argv[2] -with open('samples.txt','rb') as f_samples: +with open('samples.txt', 'rb') as f_samples: samples = f_samples.readlines() samples = [x.strip() for x in samples] n_samples = len(samples) for sample in samples: - with open(family+".csv",'rb') as f_csv: + with open(report, 'rb') as f_csv: reader = csv.DictReader(f_csv) - with open(family+'.'+sample+'.c4r','w') as f_sample: - zygosity_field = 'Zygosity.'+sample - #because R when building report substitutes - with . in column names - zygosity_field = zygosity_field.replace("-",".") - fieldnames=['Position','Ref','Alt','Variation',zygosity_field,'Protein_change_ensembl','Gene','Conserved_in_29_mammals','Sift_score','Polyphen_score','Cadd_score'] + with open(family + '.' + sample + '.c4r', 'w') as f_sample: + zygosity_field = 'Zygosity.' + sample + #because R when building report substitutes - with _ in column names + zygosity_field = zygosity_field.replace("-", "_") + fieldnames = ['Position', 'Ref', 'Alt', 'Variation', zygosity_field, 'Refseq_change', + 'Gene', 'Conserved_in_20_mammals', 'Sift_score', 'Polyphen_score', 'Cadd_score', 'Gnomad_af'] for row in reader: l = [] for key in fieldnames: l.append(row[key]) - #some reports have Exac_maf, some Gnomad_maf - if ('Exac_maf' in row): - l.append('"'+row['Exac_maf']+'"') - else: - l.append('"'+row['Gnomad_maf']+'"') - #some reports generated earlier does not have Info_refseq - Info (ensembl) only - if ('Info_refseq' in row): - l.append('"'+row['Info_refseq']+'"') - else: - l.append('"'+row['Info']+'"') - zygosity=row[zygosity_field] - if (zygosity != '-' and not re.search('Insufficient',zygosity)): + zygosity = row[zygosity_field] + if (zygosity != '-' and not re.search('Insufficient', zygosity)): f_sample.write(','.join(l)+','+sample) f_sample.write('\n') diff --git a/cre.database.sh b/cre.database.sh index c3798a0..de7a49d 100755 --- a/cre.database.sh +++ b/cre.database.sh @@ -14,7 +14,9 @@ do do echo $family cd $family - cre.database.py $family + #get the latest report + report=`ls -1 $family.wes*.csv | grep -v clinical | tail -n1` + [[ -f $report ]] && cre.database.py $family $report mv *.c4r $2 cd .. done @@ -29,6 +31,6 @@ cre.database_merge.py $prefix.c4r.sample_wise.csv $prefix.c4r.variant_wise.csv #rm *.c4r #create files for report generation -cat $prefix.c4r.variant_wise.csv | awk -F ',' '{print $1"-"$2"-"$3"\t"$(NF-1)}' > ~/Desktop/reference_tables/seen_in_c4r_counts.txt -cat $prefix.c4r.variant_wise.csv | awk -F ',' '{print $1"-"$2"-"$3"\t"$NF}' > ~/Desktop/reference_tables/seen_in_c4r_samples.txt +cat $prefix.c4r.variant_wise.csv | awk -F ',' '{print $1"-"$2"-"$3"\t"$(NF-1)}' > seen_in_c4r_counts.txt +cat $prefix.c4r.variant_wise.csv | awk -F ',' '{print $1"-"$2"-"$3"\t"$NF}' > seen_in_c4r_samples.txt diff --git a/cre.database_merge.py b/cre.database_merge.py index 065bee5..aeb9cdd 100755 --- a/cre.database_merge.py +++ b/cre.database_merge.py @@ -4,7 +4,7 @@ import sys from collections import defaultdict -fieldnames=['Position','Ref','Alt','Variation','Zygosity','Protein_change_ensembl','Gene','Conserved_in_29_mammals','Sift_score','Polyphen_score','Cadd_score'] +fieldnames = ['Position', 'Ref', 'Alt', 'Variation', 'Zygosity', 'Refseq_change', 'Gene', 'Conserved_in_20_mammals', 'Sift_score', 'Polyphen_score', 'Cadd_score', 'Gnomad_af'] #Frequency','Samples'] frequencies = defaultdict(list) @@ -14,7 +14,7 @@ with open(sys.argv[1],'r') as f_csv: reader = csv.DictReader(f_csv) for row in reader: - superkey=row['Position']+'-'+row['Ref']+'-'+row['Alt'] + superkey = row['Position']+'-'+row['Ref']+'-'+row['Alt'] if superkey in frequencies: frequencies[superkey] += 1 samples[superkey].append(row['Sample']) @@ -23,8 +23,7 @@ l = [] for key in fieldnames: l.append(row[key]) - l.append('"'+row['Gnomad_maf']+'"') - l.append('"'+row['Info']+'"') + l.append('"'+row['Gnomad_af']+'"') annotations[superkey] = ','.join(l) ll = [] ll.append(row['Sample'])