Skip to content

Commit

Permalink
updates variant database generation scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
naumenko-sa committed Feb 7, 2019
1 parent df806c8 commit 6d71f9a
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 28 deletions.
2 changes: 1 addition & 1 deletion cre.database.header
Original file line number Diff line number Diff line change
@@ -1 +1 @@
Position,Ref,Alt,Variation,Zygosity,Protein_change_ensembl,Gene,Conserved_in_29_mammals,Sift_score,Polyphen_score,Cadd_score,Gnomad_maf,Info,Sample
Position,Ref,Alt,Variation,Zygosity,Refseq_change,Gene,Conserved_in_20_mammals,Sift_score,Polyphen_score,Cadd_score,Gnomad_af,Sample
2 changes: 1 addition & 1 deletion cre.database.header1
Original file line number Diff line number Diff line change
@@ -1 +1 @@
Sample,Position,Ref,Alt,Variation,Zygosity,Protein_change_ensembl,Gene,Conserved_in_29_mammals,Sift_score,Polyphen_score,Cadd_score,Gnomad_maf,Info,Sample
Sample,Position,Ref,Alt,Variation,Zygosity,Refseq_change,Gene,Conserved_in_20_mammals,Sift_score,Polyphen_score,Cadd_score,Gnomad_af,Sample
30 changes: 11 additions & 19 deletions cre.database.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,37 +5,29 @@
import re

family = sys.argv[1]
report = sys.argv[2]

with open('samples.txt','rb') as f_samples:
with open('samples.txt', 'rb') as f_samples:
samples = f_samples.readlines()
samples = [x.strip() for x in samples]

n_samples = len(samples)

for sample in samples:
with open(family+".csv",'rb') as f_csv:
with open(report, 'rb') as f_csv:
reader = csv.DictReader(f_csv)
with open(family+'.'+sample+'.c4r','w') as f_sample:
zygosity_field = 'Zygosity.'+sample
#because R when building report substitutes - with . in column names
zygosity_field = zygosity_field.replace("-",".")
fieldnames=['Position','Ref','Alt','Variation',zygosity_field,'Protein_change_ensembl','Gene','Conserved_in_29_mammals','Sift_score','Polyphen_score','Cadd_score']
with open(family + '.' + sample + '.c4r', 'w') as f_sample:
zygosity_field = 'Zygosity.' + sample
#because R substitutes - with _ in column names when building the report
zygosity_field = zygosity_field.replace("-", "_")
fieldnames = ['Position', 'Ref', 'Alt', 'Variation', zygosity_field, 'Refseq_change',
'Gene', 'Conserved_in_20_mammals', 'Sift_score', 'Polyphen_score', 'Cadd_score', 'Gnomad_af']
for row in reader:
l = []
for key in fieldnames:
l.append(row[key])
#some reports have Exac_maf, some Gnomad_maf
if ('Exac_maf' in row):
l.append('"'+row['Exac_maf']+'"')
else:
l.append('"'+row['Gnomad_maf']+'"')
#some reports generated earlier do not have Info_refseq - only Info (ensembl)
if ('Info_refseq' in row):
l.append('"'+row['Info_refseq']+'"')
else:
l.append('"'+row['Info']+'"')
zygosity=row[zygosity_field]
if (zygosity != '-' and not re.search('Insufficient',zygosity)):
zygosity = row[zygosity_field]
if (zygosity != '-' and not re.search('Insufficient', zygosity)):
f_sample.write(','.join(l)+','+sample)
f_sample.write('\n')

Expand Down
8 changes: 5 additions & 3 deletions cre.database.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ do
do
echo $family
cd $family
cre.database.py $family
#get the latest report
report=`ls -1 $family.wes*.csv | grep -v clinical | tail -n1`
[[ -f $report ]] && cre.database.py $family $report
mv *.c4r $2
cd ..
done
Expand All @@ -29,6 +31,6 @@ cre.database_merge.py $prefix.c4r.sample_wise.csv $prefix.c4r.variant_wise.csv
#rm *.c4r

#create files for report generation
cat $prefix.c4r.variant_wise.csv | awk -F ',' '{print $1"-"$2"-"$3"\t"$(NF-1)}' > ~/Desktop/reference_tables/seen_in_c4r_counts.txt
cat $prefix.c4r.variant_wise.csv | awk -F ',' '{print $1"-"$2"-"$3"\t"$NF}' > ~/Desktop/reference_tables/seen_in_c4r_samples.txt
cat $prefix.c4r.variant_wise.csv | awk -F ',' '{print $1"-"$2"-"$3"\t"$(NF-1)}' > seen_in_c4r_counts.txt
cat $prefix.c4r.variant_wise.csv | awk -F ',' '{print $1"-"$2"-"$3"\t"$NF}' > seen_in_c4r_samples.txt

7 changes: 3 additions & 4 deletions cre.database_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import sys
from collections import defaultdict

fieldnames=['Position','Ref','Alt','Variation','Zygosity','Protein_change_ensembl','Gene','Conserved_in_29_mammals','Sift_score','Polyphen_score','Cadd_score']
fieldnames = ['Position', 'Ref', 'Alt', 'Variation', 'Zygosity', 'Refseq_change', 'Gene', 'Conserved_in_20_mammals', 'Sift_score', 'Polyphen_score', 'Cadd_score', 'Gnomad_af']
#Frequency','Samples']

frequencies = defaultdict(list)
Expand All @@ -14,7 +14,7 @@
with open(sys.argv[1],'r') as f_csv:
reader = csv.DictReader(f_csv)
for row in reader:
superkey=row['Position']+'-'+row['Ref']+'-'+row['Alt']
superkey = row['Position']+'-'+row['Ref']+'-'+row['Alt']
if superkey in frequencies:
frequencies[superkey] += 1
samples[superkey].append(row['Sample'])
Expand All @@ -23,8 +23,7 @@
l = []
for key in fieldnames:
l.append(row[key])
l.append('"'+row['Gnomad_maf']+'"')
l.append('"'+row['Info']+'"')
l.append('"'+row['Gnomad_af']+'"')
annotations[superkey] = ','.join(l)
ll = []
ll.append(row['Sample'])
Expand Down

0 comments on commit 6d71f9a

Please sign in to comment.