modified: HISTORY.md

modified: README.md new file: cre.bam.validate.sh new file: cre.database.header new file: cre.database.py new file: cre.database_merge.py new file: cre.fixit.sh new file: cre.gemini_load.sh new file: cre.locate_sample.py modified: cre.package.sh new file: cre.phenomecentral.upload_vcf_attachment.prototype.py deleted: bam.validate.sh deleted: fixit.sh deleted: gemini.vep2gemini.sh
naumenko-sa · Jul 13, 2017 · dac884d · dac884d
1 parent 63eace9
commit dac884d
Show file tree

Hide file tree

Showing 11 changed files with 399 additions and 2 deletions.
diff --git a/HISTORY.md b/HISTORY.md
@@ -1,3 +1,3 @@
 - 2017-07-13: added cre.package.sh. It packages reports to send.
 - 2017-07-07: added bam.validate.sh, it produces bam.check file with picard's validation status.
-- 2017-07-04: bcbio uses vep-merged, a merged cache of refseq and vep, no need to annotate with refseq anymore.
+- 2017-07-04: bcbio uses vep-merged, a merged cache of refseq and vep, no need to annotate with refseq anymore, removed refseq from cre.sh.
diff --git a/README.md b/README.md
@@ -171,3 +171,29 @@ vcf.platypus.getNV.sh ${family}-platypus-annotated-decomposed.vcf.gz
 ## 4.6 Rscript ~/cre/[cre.R](../master/cre.R) [family] - creates report family.csv.
 
 # 5. List of all scripts
+
+* bcbio.array.pbs
+* bcbio.pbs
+* bcbio.prepare_families.sh
+* bcbio.prepare_family1.sh
+* bcbio.rename_old_names.sh
+* bcl2fastq.sh
+* cre.bam.validate.sh
+* cre.fixit.sh - fixes sample names
+* cre.gemini_load.sh loads vep-annotated vcf to gemini db.
+* cre.package.sh
+* cre.sh
+* gemini.gemini2txt.sh
+* gemini.refseq.sh
+* gemini.variant_impacts.sh
+* gemini.vep.refseq.sh
+* gemini.vep.sh
+* omim.sh
+* orphanet.sh
+* vcf.freebayes.getAO.sh
+* vcf.gatk.get_depth.sh
+* vcf.platypus.getNV.sh
+* vcf.samtools.get_depth.sh
+* vcf.split_multi.sh
+* vep4seqr_hg38.sh
+* vep4seqr.sh
diff --git a/cre.bam.validate.sh b/cre.bam.validate.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+#PBS -l walltime=5:00:00,nodes=1:ppn=1
+#PBS -joe .
+#PBS -d .
+#PBS -l vmem=20g,mem=20g
+
+if [ -z $bam ]
+then
+    bam=$1
+fi
+
+#if file is truncated, it would be the first error message in $bam.check
+#uses picard wrapper from bcbio
+
+picard ValidateSamFile I=$bam > $bam.check
diff --git a/cre.database.header b/cre.database.header
@@ -0,0 +1 @@
+Position,Ref,Alt,Variation,Zygosity,Protein_change_ensembl,Gene,Conserved_in_29_mammals,Sift_score,Polyphen_score,Cadd_score,Sample
diff --git a/cre.database.py b/cre.database.py
@@ -0,0 +1,34 @@
+#!/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/anaconda/bin/python
+
+import csv
+import sys
+import re
+
+family = sys.argv[1]
+
+with open('samples.txt','rb') as f_samples:
+    samples = f_samples.readlines()
+samples = [x.strip() for x in samples]
+
+n_samples = len(samples)
+
+for sample in samples:
+    with open(family+".csv",'rb') as f_csv:
+	reader = csv.DictReader(f_csv)
+	with open(family+'.'+sample+'.c4r','w') as f_sample:
+	    zygosity_field = 'Zygosity.'+sample
+	    #because R when building report substitutes - with . in column names
+	    zygosity_field = zygosity_field.replace("-",".")
+	    fieldnames=['Position','Ref','Alt','Variation',zygosity_field,'Protein_change_ensembl','Gene','Conserved_in_29_mammals','Sift_score','Polyphen_score','Cadd_score']
+	    for row in reader:
+		l = []
+		for key in fieldnames:
+		    l.append(row[key])
+		zygosity=row[zygosity_field]
+		if (zygosity != '-' and not re.search('Insufficient',zygosity)):
+		    f_sample.write(','.join(l)+','+sample)
+		    f_sample.write('\n')
+
+f_sample.close()
+f_samples.close()
+f_csv.close()
diff --git a/cre.database_merge.py b/cre.database_merge.py
@@ -0,0 +1,39 @@
+#!/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/anaconda/bin/python
+
+import csv
+import sys
+from collections import defaultdict
+
+fieldnames=['Position','Ref','Alt','Variation','Zygosity','Protein_change_ensembl','Gene','Conserved_in_29_mammals','Sift_score','Polyphen_score','Cadd_score']
+#Frequency','Samples']
+
+frequencies = defaultdict(list)
+samples = defaultdict(list)
+annotations = defaultdict(list)
+
+with open(sys.argv[1],'r') as f_csv:
+    reader = csv.DictReader(f_csv)
+    for row in reader:
+	superkey=row['Position']+'-'+row['Ref']+'-'+row['Alt']
+	if superkey in frequencies:
+	    frequencies[superkey] += 1
+	    samples[superkey].append(row['Sample'])
+	else:
+	    frequencies[superkey] = 1
+	    l = []
+	    for key in fieldnames:
+		l.append(row[key])
+	    annotations[superkey] = ','.join(l)
+	    ll = []
+	    ll.append(row['Sample'])
+	    samples[superkey] = ll
+
+with open(sys.argv[2],'w') as f_out:
+    f_out.write(','.join(fieldnames)+',Frequency,Samples')
+    f_out.write('\n')
+    for key in sorted(frequencies.keys()):
+	f_out.write(annotations[key]+','+str(frequencies[key])+','+';'.join(samples[key]))
+	f_out.write('\n')
+
+f_out.close()
+f_csv.close()
diff --git a/cre.fixit.sh b/cre.fixit.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+#fixes sample names to family_sample
+
+#$1 = family_id
+
+cat samples.txt | awk -v fam=$1 '{print fam"_"$1}' > samples.txt.fixed
+rm samples.txt
+mv samples.txt.fixed samples.txt
+
+for f in *ready.bam.bai;do mv $f `echo $f | sed s/"-ready"//`;done;
+for f in *ready.bam;do mv $f `echo $f | sed s/"-ready"//`;done;
+for f in *.bam;do mv $f `echo ${1}_${f}`;done;
+for f in *.bam.bai;do mv $f `echo ${1}_${f}`;done;
+
+
+for f in *.vcf.gz;do bcftools reheader -s samples.txt $f  > $f.reheader;done;
+rm *.vcf.gz
+for f in *.reheader;do mv $f `echo $f | sed s/.reheader//`;done;
+for f in *.vcf.gz; do tabix $f;done;
+
+vcf.split_multi.sh $1.vcf.gz
+
+# rerun cre.gemini_load.sh
+# rerun cre.sh
diff --git a/cre.gemini_load.sh b/cre.gemini_load.sh
@@ -0,0 +1,32 @@
+#/bin/bash
+
+#vep2gemini - loads vep annotated vcf file to gemini database 
+#based on  bcbio.log
+
+#PBS -l walltime=23:00:00,nodes=1:ppn=16
+#PBS -joe .
+#PBS -d .
+#PBS -l vmem=50g,mem=50g
+
+if [ -z $vcf ]
+then
+    vcf=$1
+fi
+
+bname=`echo $vcf | sed s/.vcf.gz//`
+
+#--skip-cadd if no cadd
+#remove GL chromosomes - sometimes they cause problems
+mv $vcf $bname.tmp.vcf.gz
+gunzip -c $bname.tmp.vcf.gz | grep "^#" > $bname.vcf
+gunzip -c $bname.tmp.vcf.gz | grep -v "^#" | grep -v -i "^GL00" >> $bname.vcf
+bgzip $bname.vcf
+tabix $bname.vcf.gz
+rm $bname.tmp.vcf.gz
+
+#remove --passonly to load all variants
+#add   --skip-cadd to remove cadd
+gemini load --passonly --skip-gerp-bp -v $vcf -t VEP --cores 16 --tempdir . $bname.db
+
+
+
diff --git a/cre.locate_sample.py b/cre.locate_sample.py
@@ -0,0 +1,39 @@
+#!/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/anaconda/bin/python
+
+"""
+Looks for a specific sample
+"""
+
+import re
+import sys
+import os
+import os.path
+
+sample = sys.argv[1]
+
+family,sample_only = sample.split("_")
+
+match = re.match('\d*',family)
+
+if match:
+    prefix=str(int(match.group(0))/100)
+    report_path = prefix+'x/'+family+'/'+family
+
+    report=0
+    bam=0
+
+    if os.path.isfile(report_path+'.csv'):
+	#print("Report exists")
+	report=1
+    else:
+	print("Error: no report")
+
+    if os.path.isfile(report_path+'_'+sample_only+'.bam'):
+	#print("Bam exists")
+	bam=1
+    else:
+	print("ERROR: no bam")
+    if (bam==1 and report==1):
+        print(os.getcwd()+"/"+family+"\t"+os.getcwd()+"/"+report_path+'.bam') 
+else:
+    print("Family ID is not starting with digital")
diff --git a/cre.package.sh b/cre.package.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-#package all reports to send as one archive
+#package all reports to send as one archive CHEO_DATE.tar.gz
 
 current_dir=`pwd`
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Position,Ref,Alt,Variation,Zygosity,Protein_change_ensembl,Gene,Conserved_in_29_mammals,Sift_score,Polyphen_score,Cadd_score,Sample