#!/usr/bin/env bash
#BSUB -n 1 # number of tasks/CPUs in job
#BSUB -q standard # queue
#BSUB -e error.%J # error file name in which %J is replaced by the job ID
#BSUB -o output.%J # output file name in which %J is replaced by the job ID
#BSUB -J pipeline
#Data: High coverage 1KG trios
#https://www.internationalgenome.org/data-portal/data-collection/30x-grch38
#load R and Python modules
ml load r-4.0.3-gcc-9.3.0-4l6eluj
ml load python-3.9.0-gcc-9.3.0-5t75egs
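#optional sanity check (my addition, not in the original pipeline): confirm the module
#loads actually put Rscript and python on the PATH before anything downstream needs them
command -v Rscript >/dev/null 2>&1 || { echo "Rscript not found; check module load" >&2; exit 1; }
command -v python >/dev/null 2>&1 || { echo "python not found; check module load" >&2; exit 1; }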
#create a directory for data manipulation (mkdir -p is a no-op if it already exists)
mkdir -p data
#create a crams directory to store the cram files
mkdir -p crams
#download the reference fasta
wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa
#index the reference fasta
samtools faidx GRCh38_full_analysis_set_plus_decoy_hla.fa
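#optional check (my addition): make sure the reference and its .fai index exist and are non-empty
for f in GRCh38_full_analysis_set_plus_decoy_hla.fa GRCh38_full_analysis_set_plus_decoy_hla.fa.fai; do
	[ -s "$f" ] || { echo "missing or empty $f" >&2; exit 1; }
done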
cd data
#fetch pedigree file
wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/20130606_g1k_3202_samples_ped_population.txt
wd=$(pwd) # working directory passed to the R/Python scripts
#produce the children.txt, father.txt, mother.txt, inds.txt, pops.txt and sup_pops.txt files
Rscript ../scripts/data.R $wd
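#optional check (my addition): data.R should have written the six files listed above
for f in children.txt father.txt mother.txt inds.txt pops.txt sup_pops.txt; do
	[ -s "$f" ] || echo "warning: expected output $f is missing or empty" >&2
done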
#download the table containing the cram file links from https://www.internationalgenome.org/data-portal/data-collection/30x-grch38
#You can also get the same list from the github repo by uncommenting the following line (repo needs to be public):
#wget https://raw.githubusercontent.com/rabiafidan/denovoTS/master/data/igsr_30x%20GRCh38.tsv?token=ANT5TMV73QAXSOLCEELVBFTBMSRUG
#obtain the cram file links for all individuals in the trios
grep cram "igsr_30x GRCh38.tsv" | cut -f1 > links.txt
for i in $(cat inds.txt); do grep "$i" links.txt >> links2.txt; done
mv links2.txt links.txt
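#optional sanity check (my addition): the download array below assumes 1793 cram links
n=$(wc -l < links.txt)
[ "$n" -eq 1793 ] || echo "warning: links.txt has $n lines, expected 1793" >&2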
cd ..
#download the individual cram files (array of 1793 jobs, at most 40 running concurrently)
bsub -J "dwnld[1-1793]%40" < scripts/download.sh
#create the configuration file for Snakefile
python scripts/configfile.py $wd
#create a bed file to use for regional filtering of the vcf files
#download the UCSC hg38 RepeatMasker table (Dec 2013) as repeatmask.txt, keeping the fields genoName, genoStart, genoEnd and repClass
grep 'Low_complexity' repeatmask.txt | grep $'^chr[0-9X]*\t' | cut -f1,2,3 > low_complexity.bed
grep 'Simple_repeat' repeatmask.txt | grep $'^chr[0-9X]*\t' | cut -f1,2,3 > microsatellite.bed
grep 'DNA' repeatmask.txt | grep $'^chr[0-9X]*\t' | cut -f1,2,3 > DNArep.bed #includes 'DNA?' ones, too
sort -k1,1 -k2,2n low_complexity.bed > low_complexity_sorted.bed
sort -k1,1 -k2,2n DNArep.bed > DNArep_sorted.bed
sort -k1,1 -k2,2n microsatellite.bed > microsatellite_sorted.bed
rm low_complexity.bed microsatellite.bed DNArep.bed
#bedtools merge accepts a single -i input, so concatenate and re-sort the three bed files before merging
cat microsatellite_sorted.bed low_complexity_sorted.bed DNArep_sorted.bed | sort -k1,1 -k2,2n | bedtools merge -i - > region.bed
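#optional check (my addition): report how many merged intervals ended up in region.bed
echo "region.bed contains $(wc -l < region.bed) merged intervals" >&2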
#run the Snakefile
bsub < scripts/snakemake.sh
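#note (my addition): the download array above runs asynchronously, so if the Snakemake
#jobs need the crams to be present, one could instead submit with an LSF job dependency:
#bsub -w 'done(dwnld)' < scripts/snakemake.sh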