Skip to content

Commit

Permalink
modified: HISTORY.md
Browse files Browse the repository at this point in the history
	modified:   README.md
	new file:   cre.bam.validate.sh
	new file:   cre.database.header
	new file:   cre.database.py
	new file:   cre.database_merge.py
	new file:   cre.fixit.sh
	new file:   cre.gemini_load.sh
	new file:   cre.locate_sample.py
	modified:   cre.package.sh
	new file:   cre.phenomecentral.upload_vcf_attachment.prototype.py
	deleted:    bam.validate.sh
	deleted:    fixit.sh
	deleted:    gemini.vep2gemini.sh
  • Loading branch information
naumenko-sa committed Jul 13, 2017
1 parent 63eace9 commit dac884d
Show file tree
Hide file tree
Showing 11 changed files with 399 additions and 2 deletions.
2 changes: 1 addition & 1 deletion HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
- 2017-07-13: added cre.package.sh. It packages reports to send.
- 2017-07-07: added bam.validate.sh, it produces bam.check file with picard's validation status.
- 2017-07-04: bcbio uses vep-merged, a merged cache of refseq and vep, no need to annotate with refseq anymore.
- 2017-07-04: bcbio uses vep-merged, a merged cache of refseq and vep, no need to annotate with refseq anymore, removed refseq from cre.sh.
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,29 @@ vcf.platypus.getNV.sh ${family}-platypus-annotated-decomposed.vcf.gz
## 4.6 Rscript ~/cre/[cre.R](../master/cre.R) [family] - creates report family.csv.

# 5. List of all scripts

* bcbio.array.pbs
* bcbio.pbs
* bcbio.prepare_families.sh
* bcbio.prepare_family1.sh
* bcbio.rename_old_names.sh
* bcl2fastq.sh
* cre.bam.validate.sh
* cre.fixit.sh - fixes sample names
* cre.gemini_load.sh loads vep-annotated vcf to gemini db.
* cre.package.sh
* cre.sh
* gemini.gemini2txt.sh
* gemini.refseq.sh
* gemini.variant_impacts.sh
* gemini.vep.refseq.sh
* gemini.vep.sh
* omim.sh
* orphanet.sh
* vcf.freebayes.getAO.sh
* vcf.gatk.get_depth.sh
* vcf.platypus.getNV.sh
* vcf.samtools.get_depth.sh
* vcf.split_multi.sh
* vep4seqr_hg38.sh
* vep4seqr.sh
16 changes: 16 additions & 0 deletions cre.bam.validate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash

#PBS -l walltime=5:00:00,nodes=1:ppn=1
#PBS -joe .
#PBS -d .
#PBS -l vmem=20g,mem=20g

if [ -z $bam ]
then
bam=$1
fi

#if file is truncated, it would be the first error message in $bam.check
#uses picard wrapper from bcbio

picard ValidateSamFile I=$bam > $bam.check
1 change: 1 addition & 0 deletions cre.database.header
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Position,Ref,Alt,Variation,Zygosity,Protein_change_ensembl,Gene,Conserved_in_29_mammals,Sift_score,Polyphen_score,Cadd_score,Sample
34 changes: 34 additions & 0 deletions cre.database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/anaconda/bin/python

import csv
import sys
import re

family = sys.argv[1]

with open('samples.txt','rb') as f_samples:
samples = f_samples.readlines()
samples = [x.strip() for x in samples]

n_samples = len(samples)

for sample in samples:
with open(family+".csv",'rb') as f_csv:
reader = csv.DictReader(f_csv)
with open(family+'.'+sample+'.c4r','w') as f_sample:
zygosity_field = 'Zygosity.'+sample
#because R when building report substitutes - with . in column names
zygosity_field = zygosity_field.replace("-",".")
fieldnames=['Position','Ref','Alt','Variation',zygosity_field,'Protein_change_ensembl','Gene','Conserved_in_29_mammals','Sift_score','Polyphen_score','Cadd_score']
for row in reader:
l = []
for key in fieldnames:
l.append(row[key])
zygosity=row[zygosity_field]
if (zygosity != '-' and not re.search('Insufficient',zygosity)):
f_sample.write(','.join(l)+','+sample)
f_sample.write('\n')

f_sample.close()
f_samples.close()
f_csv.close()
39 changes: 39 additions & 0 deletions cre.database_merge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/anaconda/bin/python

import csv
import sys
from collections import defaultdict

fieldnames=['Position','Ref','Alt','Variation','Zygosity','Protein_change_ensembl','Gene','Conserved_in_29_mammals','Sift_score','Polyphen_score','Cadd_score']
#Frequency','Samples']

frequencies = defaultdict(list)
samples = defaultdict(list)
annotations = defaultdict(list)

with open(sys.argv[1],'r') as f_csv:
reader = csv.DictReader(f_csv)
for row in reader:
superkey=row['Position']+'-'+row['Ref']+'-'+row['Alt']
if superkey in frequencies:
frequencies[superkey] += 1
samples[superkey].append(row['Sample'])
else:
frequencies[superkey] = 1
l = []
for key in fieldnames:
l.append(row[key])
annotations[superkey] = ','.join(l)
ll = []
ll.append(row['Sample'])
samples[superkey] = ll

with open(sys.argv[2],'w') as f_out:
f_out.write(','.join(fieldnames)+',Frequency,Samples')
f_out.write('\n')
for key in sorted(frequencies.keys()):
f_out.write(annotations[key]+','+str(frequencies[key])+','+';'.join(samples[key]))
f_out.write('\n')

f_out.close()
f_csv.close()
25 changes: 25 additions & 0 deletions cre.fixit.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash

#fixes sample names to family_sample

#$1 = family_id

cat samples.txt | awk -v fam=$1 '{print fam"_"$1}' > samples.txt.fixed
rm samples.txt
mv samples.txt.fixed samples.txt

for f in *ready.bam.bai;do mv $f `echo $f | sed s/"-ready"//`;done;
for f in *ready.bam;do mv $f `echo $f | sed s/"-ready"//`;done;
for f in *.bam;do mv $f `echo ${1}_${f}`;done;
for f in *.bam.bai;do mv $f `echo ${1}_${f}`;done;


for f in *.vcf.gz;do bcftools reheader -s samples.txt $f > $f.reheader;done;
rm *.vcf.gz
for f in *.reheader;do mv $f `echo $f | sed s/.reheader//`;done;
for f in *.vcf.gz; do tabix $f;done;

vcf.split_multi.sh $1.vcf.gz

# rerun cre.gemini_load.sh
# rerun cre.sh
32 changes: 32 additions & 0 deletions cre.gemini_load.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#/bin/bash

#vep2gemini - loads vep annotated vcf file to gemini database
#based on bcbio.log

#PBS -l walltime=23:00:00,nodes=1:ppn=16
#PBS -joe .
#PBS -d .
#PBS -l vmem=50g,mem=50g

if [ -z $vcf ]
then
vcf=$1
fi

bname=`echo $vcf | sed s/.vcf.gz//`

#--skip-cadd if no cadd
#remove GL chromosomes - sometimes they cause problems
mv $vcf $bname.tmp.vcf.gz
gunzip -c $bname.tmp.vcf.gz | grep "^#" > $bname.vcf
gunzip -c $bname.tmp.vcf.gz | grep -v "^#" | grep -v -i "^GL00" >> $bname.vcf
bgzip $bname.vcf
tabix $bname.vcf.gz
rm $bname.tmp.vcf.gz

#remove --passonly to load all variants
#add --skip-cadd to remove cadd
gemini load --passonly --skip-gerp-bp -v $vcf -t VEP --cores 16 --tempdir . $bname.db



39 changes: 39 additions & 0 deletions cre.locate_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/anaconda/bin/python

"""
Looks for a specific sample
"""

import re
import sys
import os
import os.path

sample = sys.argv[1]

family,sample_only = sample.split("_")

match = re.match('\d*',family)

if match:
prefix=str(int(match.group(0))/100)
report_path = prefix+'x/'+family+'/'+family

report=0
bam=0

if os.path.isfile(report_path+'.csv'):
#print("Report exists")
report=1
else:
print("Error: no report")

if os.path.isfile(report_path+'_'+sample_only+'.bam'):
#print("Bam exists")
bam=1
else:
print("ERROR: no bam")
if (bam==1 and report==1):
print(os.getcwd()+"/"+family+"\t"+os.getcwd()+"/"+report_path+'.bam')
else:
print("Family ID is not starting with digital")
2 changes: 1 addition & 1 deletion cre.package.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

#package all reports to send as one archive
#package all reports to send as one archive CHEO_DATE.tar.gz

current_dir=`pwd`

Expand Down
Loading

0 comments on commit dac884d

Please sign in to comment.