forked from TyckoLab/CloudASM
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreparation.sh
71 lines (48 loc) · 2.36 KB
/
preparation.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/bin/bash
# Shortcut to Picardtools
PICARD="/genomics-packages/picard-tools-1.97"
# Create directories
mkdir -p $(dirname "${OUTPUT_DIR}")/$GENOME/variants # For variants
mkdir -p $(dirname "${OUTPUT_DIR}")/$GENOME/ref_genome # For the ref genome
FOLDER=$(dirname "${OUTPUT_DIR}")/$GENOME
# Reference discussion on the ref genome https://lh3.github.io/2017/11/13/which-human-reference-genome-to-use
############################################ Reference genome
if [ $GENOME == "hg19" ] ; then
GENOME_URL="ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.gz"
else
# GRCh38
GENOME_URL="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz"
fi
# Download zipped ref genome
wget $GENOME_URL
# Unzip file
gunzip $(basename "$GENOME_URL")
# Obtain the name of the file of the reference genome
GENOME_NAME=$(basename "${GENOME_URL%.gz}")
# Create a name of the genome with the extension "fasta"
GENOME_NAME_FASTA=$(basename "${GENOME_NAME%.f*}".fasta)
# Move and rename the file with the extension fasta.
mv ${GENOME_NAME} ${FOLDER}/ref_genome/${GENOME_NAME_FASTA}
# Create the fasta sequence dictionary file (.dict file), required by Bis-SNP
java -jar ${PICARD}/CreateSequenceDictionary.jar \
R= ${FOLDER}/ref_genome/${GENOME_NAME_FASTA} \
O= ${FOLDER}/ref_genome/${GENOME_NAME_FASTA%.fasta}.dict
# Create a fasta index file, required by bis-snp (*.fai)
samtools faidx ${FOLDER}/ref_genome/${GENOME_NAME_FASTA}
echo "After dict and fai"
ls -lhR $(dirname "${OUTPUT_DIR}")
#Create bisulfite-converted genome, required by Bismark (Takes ~3 hours)
bismark_genome_preparation --bowtie2 --verbose ${FOLDER}/ref_genome
############################################ Variant database
if [ $GENOME == "hg19" ] ; then
VARIANTS_URL="https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh37p13/VCF/All_20180423.vcf.gz"
else
# GRCH38 -- the latest SNP database as of Sept 2019 is db151
VARIANTS_URL="https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/All_20180418.vcf.gz"
fi
# Download zipped variant database
wget $VARIANTS_URL
# Unzip file in the output folder
gunzip $(basename "$VARIANTS_URL")
# Move the file
mv $(basename "${VARIANTS_URL%.gz}") ${FOLDER}/variants/$(basename "${VARIANTS_URL%.gz}")