modified: cre.bcbio.upgrade.sh

new file: cre.vcf.has2dp.sh modified: cre.vcf2cre.sh
naumenko-sa · Oct 25, 2018 · 19cc1d9 · 19cc1d9
1 parent 8ac8286
commit 19cc1d9
Show file tree

Hide file tree

Showing 3 changed files with 36 additions and 2 deletions.
diff --git a/cre.bcbio.upgrade.sh b/cre.bcbio.upgrade.sh
@@ -41,5 +41,5 @@
 #bcbio_nextgen.py upgrade -u skip --genomes GRCh37 --datatarget rnaseq
 
 #mouse
-bcbio_nextgen.py upgrade -u skip --genomes mm10 --datatarget rnaseq --aligners star
+bcbio_nextgen.py upgrade -u skip --genomes mm10 --datatarget rnaseq --aligners star --cores 5
 
diff --git a/cre.vcf.has2dp.sh b/cre.vcf.has2dp.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# if input vcf is from TCAG (HAS) it does not have DP INFO field, we need to fake it from FORMAT DP for SNVs and from DPI for indels:
+
+bname=`basename $1 .vcf.gz`
+
+# remove chr
+gunzip -c $1 |  sed s/"ID=chrM"/"ID=MT"/ | sed s/"^chrM"/MT/ | sed s/"ID=chr"/"ID="/ | sed s/"^chr"// > $bname.nochr.vcf
+bgzip $bname.nochr.vcf
+tabix $bname.nochr.vcf.gz
+
+gunzip -c $bname.nochr.vcf.gz | grep "^#" | head -n1 > $bname.dp.vcf
+echo "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Approximate read depth; some reads may have been filtered\">" >> $bname.dp.vcf
+gunzip -c $bname.nochr.vcf.gz | grep "^#" | sed 1d >> $bname.dp.vcf
+
+#process SNVs
+gunzip -c $bname.nochr.vcf.gz | grep -v "^#"  | grep PASS | grep ":DP:" | awk -F ':' '{print $0"\tDP="$9}' | awk -F "\t" '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$11";"$8"\t"$9"\t"$10}' >> $bname.dp.vcf
+
+#process indels
+gunzip -c $bname.nochr.vcf.gz | grep -v "^#"  | grep PASS | grep ":DPI:" | awk -F ':' '{print $0"\tDP="$8}' | awk -F "\t" '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$11";"$8"\t"$9"\t"$10}' | sed s/":DPI:"/":DP:"/ >> $bname.dp.vcf
+
+bgzip $bname.dp.vcf
+tabix $bname.dp.vcf.gz
+
+bcftools sort -o $bname.dp.sorted.vcf.gz -Oz $bname.dp.vcf.gz
+tabix $bname.dp.sorted.vcf.gz
+
+rm $bname.nochr.vcf.gz $bname.dp.vcf.gz
diff --git a/cre.vcf2cre.sh b/cre.vcf2cre.sh
@@ -1,5 +1,4 @@
 #!/bin/bash
-
 #PBS -l walltime=23:00:00,nodes=1:ppn=1
 #PBS -joe .
 #PBS -d .
@@ -19,6 +18,13 @@
 # AC, AF, MLEAC, MLEAF Number=1 not A. Because of that vt is not properly decomposing multiallelic variants, and vcf2db can't create gemini database
 # solution is to put Number=A in the vcf header or rerun variant calling with the latest bcbio
 
+# if input is from TCAG (HAS) it does not have DP INFO field, we need to fake it from FORMAT DP for SNVs and from DPI for indels:
+# cre.vcf.has2dp.sh
+# gunzip -c 331606_S1.flt.nochr.vcf.gz | grep "^#" > 331606.vcf
+# add to header: 
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
+# gunzip -c 331606_S1.flt.nochr.vcf.gz | grep -v "^#"  | grep PASS | sed s/":DPI:"/":DP:"awk -F ':' '{print $0"\tDP="$9}' | awk -F "\t" '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$11";"$8"\t"$9"\t"$10}' >> 331606.vcf
+
 bname=`basename $original_vcf .vcf.gz`
 
 echo "###############################################"