diff --git a/README.md b/README.md index b102453..b7bc130 100644 --- a/README.md +++ b/README.md @@ -2,4 +2,7 @@ clinical research exome - excel report generation using data from [bcbio variant2](https://bcbio-nextgen.readthedocs.io/en/latest/contents/pipelines.html#germline-variant-calling) germline variant calling pipeline. -1. Create a project to run with bcbio. +#1. Create a project to run with bcbio. + +##1a. If you start from bam files. +Suppose you have a trio, each sample is a bam file. diff --git a/bcbio.prepare_families.sh b/bcbio.prepare_families.sh new file mode 100755 index 0000000..d6f1ff7 --- /dev/null +++ b/bcbio.prepare_families.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# prepares a run of multiples families to run variant calling, one family may have several samples +# $1 - a file table.txt in the format +# sample_id family_id absolute_path_to_bam_file +# creates one project per family +# +# run with +# bcbio.prepare_families.sh table.txt &> file.log to track failed bams +# or +# qsub ~/bioscripts/bcbio.prepare_families.sh -v project_list=table.txt + +# the scripts supposes it is install in ~/cre/ and bcbio is installed and available in the PATH +# uses +# bcbio.sample_sheet_header.csv +# bcbio.templates.exome.yaml + +# to create table.txt from a directory of bam files with names family_sample.yyy.bam +# for f in *.bam;do echo $f | awk -F "." '{print $1"\t"$0}' | awk -F '_' '{print $2"\t"$0}' | awk -v dir=`pwd` '{print $1"\t"$2"\t"dir"/"$4}' >> ~/table.txt;done; + +#PBS -l walltime=20:00:00,nodes=1:ppn=1 +#PBS -joe . +#PBS -d . +#PBS -l vmem=10g,mem=10g + +prepare_family() +{ + local family=$1 + + mkdir -p ${family}/input + mkdir ${family}/work + + cp ~/cre/bcbio.sample_sheet_header.csv $family.csv + + while read sample fam bam + do + ln -s $bam ${family}/input/${sample}.bam + echo $sample","$sample","$family",,," >> $family.csv + done < $family.txt + + bcbio_nextgen.py -w template ~/cre/bcbio.templates.exome.yaml $family.csv ${family}/input/*.bam + + rm $family.csv +} + +if [ -z $project_list ]; +then + project_list=$1 +fi + +cat $project_list | awk '{print $2}' | sort | uniq > families.txt + +for family in `cat families.txt` +do + # not grep because two family names may overlap + cat $project_list | awk -v fam=$family '{if ($2==fam) print $0}' > ${family}.txt + prepare_family $family + rm $family.txt +done diff --git a/bcbio.sample_sheet_header.csv b/bcbio.sample_sheet_header.csv new file mode 100644 index 0000000..c26026b --- /dev/null +++ b/bcbio.sample_sheet_header.csv @@ -0,0 +1 @@ +samplename,description,batch,phenotype,sex,variant_regions diff --git a/bcbio.templates.exome.yaml b/bcbio.templates.exome.yaml new file mode 100644 index 0000000..a1d9228 --- /dev/null +++ b/bcbio.templates.exome.yaml @@ -0,0 +1,35 @@ +details: +- algorithm: + aligner: bwa + effects: vep + effects_transcripts: all + ensemble: + numpass: 2 + use_filtered: false + realign: true + recalibrate: true + save_diskspace: true + tools_on: + - svplots + - qualimap + variantcaller: + - gatk-haplotype + - samtools + - platypus + - freebayes + analysis: variant2 + description: '166.3_5' + files: + - /hpf/largeprojects/ccmbio/naumenko/project_c4r_run10/input/166.3_5.bam + genome_build: GRCh37 + metadata: + batch: 166 +resources: + default: + cores: 5 + jvm_opts: + - -Xms750m + - -Xmx7000m + memory: 7G +upload: + dir: ../final