diff --git a/README.md b/README.md index 641bd99..0992ee7 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,21 @@ I can wait for 2-5 days for a project when working with cohorts. Faster processi or ``` qsub cre.sh -v family=[family],cleanup=1,make_report=1 -``` +``` + +If you run a large cohort, some families maybe finished while some are still running. +To delete unnecessary filed, cleanup finished projects: +``` +mkdir ready4report +for f in `cat all_dirs.txt`;do if [ -d ${f}/final/ ];then mv $f ready4report/;fi;done; + +``` +And then run cre for them: +``` +cd ready4report +ls | grep -v projects.txt > projects.txt +for f in `cat projects.txt`;do qsub ~/cre/cre.sh -v family=$f;done; +``` What it does. During the cleanup step: diff --git a/gemini.vep.parse.pl b/gemini.vep.parse.pl new file mode 100755 index 0000000..8c71496 --- /dev/null +++ b/gemini.vep.parse.pl @@ -0,0 +1,56 @@ +#!/usr/bin/perl + +# parses vcf with VEP annotation - CSQ field +# works for VEP - refseq ! +# for ensembl fields are different: +# f_exon = 9 +# f_hgvsc = 16 +# f_hgvsp = 17 + +$f_gene=3; +$f_exon=6; +$f_hgvsc=13; +$f_hgvsp=14; + +open(IN,$ARGV[0]); + +print "superindex\tInfo_refseq_no_gene\n"; + +while () +{ + chomp; + $impacts_string=''; + if ($_ !~ /CHROM/) + { + @ar = split("\t",$_); + $id = "chr".$ar[0].":".$ar[1]."-".$ar[2]."-".$ar[3]; + @ar2 = split(",",@ar[4]); + #print $id."\n"; + foreach $impact (@ar2) + { + #print $impact."\n"; + @ar3= split(/\|/,$impact); + #if there is hvgsc + if ($ar3[$f_hgvsc] ne '') + { + if ($ar3[$f_hgvsp] eq '') + { + $ar3[$f_hgvsp] = 'NA'; + } + if ($ar3[$f_exon] eq '') + { + $exon='NA'; + } + else + { + $exon = $ar3[$f_exon]; + $exon =~ s/"\-"/"NA"/; + } + $impacts_string .= "exon".$exon.":".$ar3[$f_hgvsc].":".$ar3[$f_hgvsp].","; + } + } + print $id."\t".substr($impacts_string,0,-1)."\n"; + } +} + +close(IN); \ No newline at end of file diff --git a/gemini.vep.refseq.sh b/gemini.vep.refseq.sh new file mode 100755 index 0000000..3856299 --- /dev/null +++ b/gemini.vep.refseq.sh @@ -0,0 +1,30 @@ +#/bin/bash + +# gemini.vep.refseq.sh - annotate vcf with vep before loading into gemini database +# based on bcbio.log +# uses hgvs notation with refseq transcript coordinates and no --pick - all effects for a gene +# first you have to download refseq annotation +# ftp://ftp.ensembl.org/pub/current_variation/VEP/homo_sapiens_refseq_vep_87_GRCh37.tar.gz +# to +# bcbio/genomes/Hsapiens/GRCh37/vep + +#PBS -l walltime=2:00:00,nodes=1:ppn=1 +#PBS -joe . +#PBS -d . +#PBS -l vmem=10g,mem=10g + +if [ -z $vcf ]; +then + vcf=$1 +fi + +bname=`echo $vcf | sed s/.vcf.gz//` + +unset PERL5LIB && export PATH=/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/anaconda/bin:$PATH && /home/naumenko/work/tools/bcbio/anaconda/bin/variant_effect_predictor.pl --vcf -o stdout \ + -i $vcf --species homo_sapiens_refseq --no_stats --cache --offline --dir /hpf/largeprojects/ccmbio/naumenko/tools/bcbio/genomes/Hsapiens/GRCh37/vep --symbol --numbers --biotype --total_length \ + --canonical --gene_phenotype --ccds --fields Consequence,Codons,Amino_acids,Gene,SYMBOL,Feature,EXON,PolyPhen,SIFT,Protein_position,BIOTYPE,CANONICAL,CCDS,HGVSc,HGVSp \ + --plugin LoF,human_ancestor_fa:/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/genomes/Hsapiens/GRCh37/variation/human_ancestor.fa.gz --sift b --polyphen b --hgvs --shift_hgvs 1 \ + --fasta /hpf/largeprojects/ccmbio/naumenko/tools/bcbio/genomes/Hsapiens/GRCh37/seq/GRCh37.fa \ + | sed '/^#/! s/;;/;/g' | bgzip -c > $bname.vepeffects_refseq.vcf.gz + +tabix $bname.vepeffects_refseq.vcf.gz \ No newline at end of file