modified: README.md

new file: gemini.vep.parse.pl new file: gemini.vep.refseq.sh
naumenko-sa · May 10, 2017 · 5d7d774 · 5d7d774
1 parent 2a4a2a1
commit 5d7d774
Show file tree

Hide file tree

Showing 3 changed files with 101 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -107,7 +107,21 @@ I can wait for 2-5 days for a project when working with cohorts. Faster processi
 or 
 ```
 qsub cre.sh -v family=[family],cleanup=1,make_report=1
-``` 
+```
+
+If you run a large cohort, some families maybe finished while some are still running.
+To delete unnecessary filed, cleanup finished projects:
+```
+mkdir ready4report
+for f in `cat all_dirs.txt`;do if [ -d ${f}/final/ ];then mv $f ready4report/;fi;done;
+
+```
+And then run cre for them:
+```
+cd ready4report
+ls | grep -v projects.txt > projects.txt
+for f in `cat projects.txt`;do qsub ~/cre/cre.sh -v family=$f;done;
+```
 
 What it does.
 During the cleanup step:

diff --git a/gemini.vep.parse.pl b/gemini.vep.parse.pl
@@ -0,0 +1,56 @@
+#!/usr/bin/perl
+
+# parses vcf with VEP annotation - CSQ field
+# works for VEP - refseq !
+# for ensembl fields are different:
+# f_exon = 9
+# f_hgvsc = 16
+# f_hgvsp = 17
+
+$f_gene=3;
+$f_exon=6;
+$f_hgvsc=13;
+$f_hgvsp=14;
+
+open(IN,$ARGV[0]);
+
+print "superindex\tInfo_refseq_no_gene\n";
+
+while (<IN>)
+{
+    chomp;
+    $impacts_string='';
+    if ($_ !~ /CHROM/)
+    {
+	@ar = split("\t",$_);
+	$id = "chr".$ar[0].":".$ar[1]."-".$ar[2]."-".$ar[3];
+	@ar2 = split(",",@ar[4]);
+	#print $id."\n";
+	foreach $impact (@ar2)
+	{
+	    #print $impact."\n";
+	    @ar3= split(/\|/,$impact);
+	    #if there is hvgsc
+	    if ($ar3[$f_hgvsc] ne '')
+	    {
+		if ($ar3[$f_hgvsp] eq '')
+		{
+		    $ar3[$f_hgvsp] = 'NA';
+		}
+		if ($ar3[$f_exon] eq '')
+		{
+		    $exon='NA';
+		}
+		else
+		{
+		    $exon = $ar3[$f_exon];
+		    $exon =~ s/"\-"/"NA"/;
+		}
+		$impacts_string .= "exon".$exon.":".$ar3[$f_hgvsc].":".$ar3[$f_hgvsp].",";
+	    }
+	}
+	print $id."\t".substr($impacts_string,0,-1)."\n";
+    }
+}
+
+close(IN);
diff --git a/gemini.vep.refseq.sh b/gemini.vep.refseq.sh
@@ -0,0 +1,30 @@
+#/bin/bash
+
+# gemini.vep.refseq.sh - annotate vcf with vep before loading into gemini database
+# based on  bcbio.log
+# uses hgvs notation with refseq transcript coordinates and no --pick - all effects for a gene
+# first you have to download refseq annotation
+# ftp://ftp.ensembl.org/pub/current_variation/VEP/homo_sapiens_refseq_vep_87_GRCh37.tar.gz
+# to
+# bcbio/genomes/Hsapiens/GRCh37/vep
+
+#PBS -l walltime=2:00:00,nodes=1:ppn=1
+#PBS -joe .
+#PBS -d .
+#PBS -l vmem=10g,mem=10g
+
+if [ -z $vcf ];
+then
+    vcf=$1
+fi
+
+bname=`echo $vcf | sed s/.vcf.gz//`
+
+unset PERL5LIB && export PATH=/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/anaconda/bin:$PATH && /home/naumenko/work/tools/bcbio/anaconda/bin/variant_effect_predictor.pl --vcf -o stdout \
+    -i $vcf --species homo_sapiens_refseq --no_stats --cache --offline --dir /hpf/largeprojects/ccmbio/naumenko/tools/bcbio/genomes/Hsapiens/GRCh37/vep --symbol --numbers --biotype --total_length \
+    --canonical --gene_phenotype --ccds --fields Consequence,Codons,Amino_acids,Gene,SYMBOL,Feature,EXON,PolyPhen,SIFT,Protein_position,BIOTYPE,CANONICAL,CCDS,HGVSc,HGVSp \
+    --plugin LoF,human_ancestor_fa:/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/genomes/Hsapiens/GRCh37/variation/human_ancestor.fa.gz --sift b --polyphen b --hgvs --shift_hgvs 1 \
+    --fasta /hpf/largeprojects/ccmbio/naumenko/tools/bcbio/genomes/Hsapiens/GRCh37/seq/GRCh37.fa \
+    | sed '/^#/! s/;;/;/g' | bgzip -c > $bname.vepeffects_refseq.vcf.gz
+
+tabix $bname.vepeffects_refseq.vcf.gz