forked from reskejak/ATAC-seq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathATACseq_workflow.txt
257 lines (211 loc) · 14.6 KB
/
ATACseq_workflow.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
# ATACseq_workflow.txt
# ATAC-seq workflow from reads to peaks, in Unix/R
# Jake Reske
# Michigan State University, 2020
# https://github.com/reskejak
# Machine-readable version of Figure 4 workflow for ATAC-seq data analysis
# beginning with raw, paired-end Illumina reads
# example experimental design: n=2 mouse ATAC-seq biological replicates for two conditions: treat and control
#######################################
# Unix dependencies, in order of use:
# FastQC
# Trim Galore!
# MultiQC
# Bowtie2
# samtools
# Harvard ATAC-seq module (for removeChrom python script)
# Picard (for MarkDuplicates tool)
# bedtools
# MACS2
# R dependencies:
# ATACseqQC (preseqR)
# additional unix bash scripts used for custom functions:
# bedpeTn5shift (https://raw.githubusercontent.com/reskejak/ATAC-seq/master/bedpeTn5shift.sh)
# bedpeMinimalConvert (https://raw.githubusercontent.com/reskejak/ATAC-seq/master/bedpeMinimalConvert.sh)
# naiveOverlapBroad (https://raw.githubusercontent.com/reskejak/ATAC-seq/master/naiveOverlapBroad.sh)
########################################
########################################
########################################
### QUALITY CONTROL
# in unix
# Concatenate technical replicates between flowcells
cat flowcell1/treat1_R1.fastq.gz flowcell2/treat1_R1.fastq.gz > cat/treat1_R1.fastq.gz # treat1_R1
cat flowcell1/treat1_R2.fastq.gz flowcell2/treat1_R2.fastq.gz > cat/treat1_R2.fastq.gz # treat1_R2
cat flowcell1/treat2_R1.fastq.gz flowcell2/treat2_R1.fastq.gz > cat/treat2_R1.fastq.gz # treat2_R1
cat flowcell1/treat2_R2.fastq.gz flowcell2/treat2_R2.fastq.gz > cat/treat2_R2.fastq.gz # treat2_R2
cat flowcell1/control1_R1.fastq.gz flowcell2/control1_R1.fastq.gz > cat/control1_R1.fastq.gz # control1_R1
cat flowcell1/control1_R2.fastq.gz flowcell2/control1_R2.fastq.gz > cat/control1_R2.fastq.gz # control1_R2
cat flowcell1/control2_R1.fastq.gz flowcell2/control2_R1.fastq.gz > cat/control2_R1.fastq.gz # control2_R1
cat flowcell1/control2_R2.fastq.gz flowcell2/control2_R2.fastq.gz > cat/control2_R2.fastq.gz # control2_R2
# QC raw reads
fastqc treat1_R1.fastq.gz
fastqc treat1_R2.fastq.gz
fastqc treat2_R1.fastq.gz
fastqc treat2_R2.fastq.gz
fastqc control1_R1.fastq.gz
fastqc control1_R2.fastq.gz
fastqc control2_R1.fastq.gz
fastqc control2_R2.fastq.gz
# Trim reads
trim_galore --paired -o trimmed/ treat1_R1.fastq.gz treat1_R2.fastq.gz
trim_galore --paired -o trimmed/ treat2_R1.fastq.gz treat2_R2.fastq.gz
trim_galore --paired -o trimmed/ control1_R1.fastq.gz control1_R2.fastq.gz
trim_galore --paired -o trimmed/ control2_R1.fastq.gz control2_R2.fastq.gz
# rename output files e.g. treat1_R1_trimmed.fq.gz or modify script
# QC trimmed reads
fastqc treat1_R1_trimmed.fq.gz
fastqc treat1_R2_trimmed.fq.gz
fastqc treat2_R1_trimmed.fq.gz
fastqc treat2_R2_trimmed.fq.gz
fastqc control1_R1_trimmed.fq.gz
fastqc control1_R2_trimmed.fq.gz
fastqc control2_R1_trimmed.fq.gz
fastqc control2_R2_trimmed.fq.gz
# Combine QC data
python multiqc . # in the fastqc output directory; repeat for raw and trimmed
########################################
### Align
# Align to reference genome
# first have to prepare genome index; see bowtie2 manual for details
bowtie2 --very-sensitive -X 1000 -x mm10_bt2_index -1 treat1_R1_trimmed.fq.gz -2 treat1_R2_trimmed.fq.gz | samtools view -bS - > treat1.bam
bowtie2 --very-sensitive -X 1000 -x mm10_bt2_index -1 treat2_R1_trimmed.fq.gz -2 treat2_R2_trimmed.fq.gz | samtools view -bS - > treat2.bam
bowtie2 --very-sensitive -X 1000 -x mm10_bt2_index -1 control1_R1_trimmed.fq.gz -2 control1_R2_trimmed.fq.gz | samtools view -bS - > control1.bam
bowtie2 --very-sensitive -X 1000 -x mm10_bt2_index -1 control2_R1_trimmed.fq.gz -2 control2_R2_trimmed.fq.gz | samtools view -bS - > control2.bam
# Coordinate sort and index
samtools sort -o treat1.sorted.bam treat1.bam; samtools index treat1.sorted.bam
samtools sort -o treat2.sorted.bam treat2.bam; samtools index treat2.sorted.bam
samtools sort -o control1.sorted.bam control1.bam; samtools index control1.sorted.bam
samtools sort -o control2.sorted.bam control2.bam; samtools index control2.sorted.bam
########################################
### Filter
# Remove mtDNA reads
# using Harvard ATAC-seq module (python script removeChrom.py)
# https://github.com/harvardinformatics/ATAC-seq
samtools view -h treat1.sorted.bam | python removeChrom.py - - chrM | samtools view -bh - > treat1.noMT.bam
samtools view -h treat2.sorted.bam | python removeChrom.py - - chrM | samtools view -bh - > treat2.noMT.bam
samtools view -h control1.sorted.bam | python removeChrom.py - - chrM | samtools view -bh - > control1.noMT.bam
samtools view -h control2.sorted.bam | python removeChrom.py - - chrM | samtools view -bh - > control2.noMT.bam
# Coordinate sort and index
samtools sort -o treat1.sorted.noMT.bam treat1.noMT.bam; samtools index treat1.sorted.noMT.bam
samtools sort -o treat2.sorted.noMT.bam treat2.noMT.bam; samtools index treat2.sorted.noMT.bam
samtools sort -o control1.sorted.noMT.bam control1.noMT.bam; samtools index control1.sorted.noMT.bam
samtools sort -o control2.sorted.noMT.bam control2.noMT.bam; samtools index control2.sorted.noMT.bam
# Restrict to properly-paired reads only
# -f 3 specifies only properly-paired reads
samtools view -bh -f 3 treat1.sorted.noMT.bam > treat1.filt.noMT.bam
samtools view -bh -f 3 treat2.sorted.noMT.bam > treat2.filt.noMT.bam
samtools view -bh -f 3 control1.sorted.noMT.bam > control1.filt.noMT.bam
samtools view -bh -f 3 control2.sorted.noMT.bam > control2.filt.noMT.bam
# Coordinate sort and index
samtools sort -o treat1.sorted.filt.noMT.bam treat1.filt.noMT.bam; samtools index treat1.sorted.filt.noMT.bam
samtools sort -o treat2.sorted.filt.noMT.bam treat2.filt.noMT.bam; samtools index treat2.sorted.filt.noMT.bam
samtools sort -o control1.sorted.filt.noMT.bam control1.filt.noMT.bam; samtools index control1.sorted.filt.noMT.bam
samtools sort -o control2.sorted.filt.noMT.bam control2.filt.noMT.bam; samtools index control2.sorted.filt.noMT.bam
########################################
### Complexity
# in R
library(ATACseqQC)
# Define sample files
treat1 <- "treat1.sorted.filt.noMT.bam"
treat2 <- "treat2.sorted.filt.noMT.bam"
control1 <- "control1.sorted.filt.noMT.bam"
control2 <- "control2.sorted.filt.noMT.bam"
treat1.bai <- "treat1.sorted.filt.noMT.bam.bai"
treat2.bai <- "treat2.sorted.filt.noMT.bam.bai"
control1.bai <- "control1.sorted.filt.noMT.bam.bai"
control2.bai <- "control2.sorted.filt.noMT.bam.bai"
# Calculate duplication frequency matrix
treat1.dups <- readsDupFreq(treat1, index=treat1.bai)
treat2.dups <- readsDupFreq(treat2, index=treat2.bai)
control1.dups <- readsDupFreq(control1, index=control1.bai)
control2.dups <- readsDupFreq(control2, index=control2.bai)
# Estimate library complexity
treat1.complexity <- estimateLibComplexity(treat1.dups, times=100, interpolate.sample.sizes=seq(0.1, 1, by=0.01)
treat2.complexity <- estimateLibComplexity(treat2.dups, times=100, interpolate.sample.sizes=seq(0.1, 1, by=0.01)
control1.complexity <- estimateLibComplexity(control1.dups, times=100, interpolate.sample.sizes=seq(0.1, 1, by=0.01)
control2.complexity <- estimateLibComplexity(control2.dups, times=100, interpolate.sample.sizes=seq(0.1, 1, by=0.01)
# notes on interpretation:
# $relative.size = relative library size, i.e. =1 is the full library size supplied
# $values = number of unique fragments sequenced at a given $relative.size
# downstream:
# from all libaries in experiment, identify lowest $values integer at $relative.size==1
# estimate fraction of library to subsample to achieve uniform number of unique fragments (molecular complexity)
# back in unix
# Subsample based on library complexity estimates
# dummy example
samtools view -h -b -s 1.45 treat1.sorted.filt.noMT.bam > treat1_sub.filt.noMT.bam # -s 1.45 indicates 45% subsample, seed=1
samtools view -h -b -s 1.78 treat2.sorted.filt.noMT.bam > treat2_sub.filt.noMT.bam # -s 1.78 indicates 78% subsample, seed=1
# do not subsample control1; e.g. sample with lowest complexity at given read depth
samtools view -h -b -s 1.91 control2.sorted.filt.noMT.bam > control2_sub.filt.noMT.bam # -s 1.91 indicates 91% subsample, seed=1
samtools sort -o treat1_sub.sorted.filt.noMT.bam treat1_sub.filt.noMT.bam; samtools index treat1_sub.sorted.filt.noMT.bam
samtools sort -o treat2_sub.sorted.filt.noMT.bam treat2_sub.filt.noMT.bam; samtools index treat2_sub.sorted.filt.noMT.bam
# no need to sort/index control1 again; did not subsample
samtools sort -o control2_sub.sorted.filt.noMT.bam control2_sub.filt.noMT.bam; samtools index control2_sub.sorted.filt.noMT.bam
# will proceed forward with the subsampled BAMs
########################################
### Filter
# Remove PCR duplicates
java -jar picard.jar MarkDuplicates I=treat1_sub.sorted.filt.noMT.bam O=treat1_sub.noDups.filt.noMT.bam M=treat1_sub_dups.txt REMOVE_DUPLICATES=true
java -jar picard.jar MarkDuplicates I=treat2_sub.sorted.filt.noMT.bam O=treat2_sub.noDups.filt.noMT.bam M=treat2_sub_dups.txt REMOVE_DUPLICATES=true
java -jar picard.jar MarkDuplicates I=control1.sorted.filt.noMT.bam O=control1.noDups.filt.noMT.bam M=control1_dups.txt REMOVE_DUPLICATES=true
java -jar picard.jar MarkDuplicates I=control2_sub.sorted.filt.noMT.bam O=control2_sub.noDups.filt.noMT.bam M=control2_sub_dups.txt REMOVE_DUPLICATES=true
# Coordinate sort and index
samtools sort -o treat1_sub.sorted.noDups.filt.noMT.bam treat1_sub.noDups.filt.noMT.bam; samtools index treat1_sub.sorted.noDups.filt.noMT.bam
samtools sort -o treat2_sub.sorted.noDups.filt.noMT.bam treat2_sub.noDups.filt.noMT.bam; samtools index treat2_sub.sorted.noDups.filt.noMT.bam
samtools sort -o control1.sorted.noDups.filt.noMT.bam control1.noDups.filt.noMT.bam; samtools index control1.sorted.noDups.filt.noMT.bam
samtools sort -o control2_sub.sorted.noDups.filt.noMT.bam control2_sub.noDups.filt.noMT.bam; samtools index control2_sub.sorted.noDups.filt.noMT.bam
########################################
### Format
# Sort by read name
samtools sort -n -o treat1_sub.namesorted.noDups.filt.noMT.bam treat1_sub.sorted.noDups.filt.noMT.bam
samtools sort -n -o treat2_sub.namesorted.noDups.filt.noMT.bam treat2_sub.sorted.noDups.filt.noMT.bam
samtools sort -n -o control1.namesorted.noDups.filt.noMT.bam control1.sorted.noDups.filt.noMT.bam
samtools sort -n -o control2_sub.namesorted.noDups.filt.noMT.bam control2_sub.sorted.noDups.filt.noMT.bam
# Fix read mates
samtools fixmate treat1_sub.namesorted.noDups.filt.noMT.bam treat1_sub.fixed.bam
samtools fixmate treat2_sub.namesorted.noDups.filt.noMT.bam treat2_sub.fixed.bam
samtools fixmate control1.namesorted.noDups.filt.noMT.bam control1.fixed.bam
samtools fixmate control2_sub.namesorted.noDups.filt.noMT.bam control2_sub.fixed.bam
# BEDPE conversion
samtools view -bf 0x2 treat1_sub.fixed.bam | bedtools bamtobed -i stdin -bedpe > treat1_sub.fixed.bedpe
samtools view -bf 0x2 treat2_sub.fixed.bam | bedtools bamtobed -i stdin -bedpe > treat2_sub.fixed.bedpe
samtools view -bf 0x2 control1.fixed.bam | bedtools bamtobed -i stdin -bedpe > control1.fixed.bedpe
samtools view -bf 0x2 control2_sub.fixed.bam | bedtools bamtobed -i stdin -bedpe > control2_sub.fixed.bedpe
# Tn5 shift
bash bedpeTn5shift.sh treat1_sub.fixed.bedpe > treat1_sub.tn5.bedpe
bash bedpeTn5shift.sh treat2_sub.fixed.bedpe > treat2_sub.tn5.bedpe
bash bedpeTn5shift.sh control1.fixed.bedpe > control1.tn5.bedpe
bash bedpeTn5shift.sh control2_sub.fixed.bedpe > control2_sub.tn5.bedpe
# Minimal conversion (from standard BEDPE format to that accepted by MACS2)
bash bedpeMinimalConvert.sh treat1_sub.tn5.bedpe > treat1_sub.minimal.bedpe
bash bedpeMinimalConvert.sh treat2_sub.tn5.bedpe > treat2_sub.minimal.bedpe
bash bedpeMinimalConvert.sh control1.tn5.bedpe > control1.minimal.bedpe
bash bedpeMinimalConvert.sh control2_sub.tn5.bedpe > control2_sub.minimal.bedpe
########################################
### Peak calling
# Call significant broad peaks (FDR < 0.05) on each individual replicate
macs2 callpeak -t treat1_sub.minimal.bedpe -f BEDPE -n treat1 -g mm --broad --broad-cutoff 0.05 --keep-dup all
macs2 callpeak -t treat2_sub.minimal.bedpe -f BEDPE -n treat2 -g mm --broad --broad-cutoff 0.05 --keep-dup all
macs2 callpeak -t control1.minimal.bedpe -f BEDPE -n control1 -g mm --broad --broad-cutoff 0.05 --keep-dup all
macs2 callpeak -t control2_sub.minimal.bedpe -f BEDPE -n control2 -g mm --broad --broad-cutoff 0.05 --keep-dup all
# -g mm refers to Mus musculus genome size; use -g hs for Homo sapiens, or see MACS2 manual for manually setting genome size e.g. for other organisms
# for simplicity, the "_sub" suffix was omitted to MACS2 output here
# Filter ENCODE-defined blacklist regions and unplaced contigs
# Blacklist regions refer to highly repetitive/unstructured regions with artificially high signal in genomic experiments. Available for certain model organisms. See publication for details. https://github.com/Boyle-Lab/Blacklist/
# Amemiya, H.M., Kundaje, A. & Boyle, A.P. The ENCODE Blacklist: Identification of Problematic Regions of the Genome. Sci Rep 9, 9354 (2019). (PMID: 31249361)
bedtools intersect -v -a treat1_peaks.broadPeak -b mm10.blacklist.bed | grep -P 'chr[\dXY]+[ \t]' > treat1_peaks.filt.broadPeak
bedtools intersect -v -a treat2_peaks.broadPeak -b mm10.blacklist.bed | grep -P 'chr[\dXY]+[ \t]' > treat2_peaks.filt.broadPeak
bedtools intersect -v -a control1_peaks.broadPeak -b mm10.blacklist.bed | grep -P 'chr[\dXY]+[ \t]' > control1_peaks.filt.broadPeak
bedtools intersect -v -a control2_peaks.broadPeak -b mm10.blacklist.bed | grep -P 'chr[\dXY]+[ \t]' > control2_peaks.filt.broadPeak
########################################
# Compute naive overlap
# Definition from ENCODE / Kundaje et al.: "Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs >= 0.5".
# Call MACS2 broad peaks on pooled replicates and filter blacklist regions
macs2 callpeak -t treat1_sub.minimal.bedpe treat2_sub.minimal.bedpe -f BEDPE -n treat_pool -g mm --broad --broad-cutoff 0.05 --keep-dup all
macs2 callpeak -t control1.minimal.bedpe control2_sub.minimal.bedpe -f BEDPE -n control_pool -g mm --broad --broad-cutoff 0.05 --keep-dup all
bedtools intersect -v -a treat_pool_peaks.broadPeak -b mm10.blacklist.bed | grep -P 'chr[\dXY]+[ \t]' > treat_pool_peaks.filt.broadPeak
bedtools intersect -v -a control_pool_peaks.broadPeak -b mm10.blacklist.bed | grep -P 'chr[\dXY]+[ \t]' > control_pool_peaks.filt.broadPeak
# Compute naive overlap
bash naiveOverlapBroad.sh treat1_peaks.filt.broadPeak treat2_peaks.filt.broadPeak treat_pool_peaks.filt.broadPeak > treat_overlap_peaks.filt.broadPeak
bash naiveOverlapBroad.sh control1_peaks.filt.broadPeak control2_peaks.filt.broadPeak control_pool_peaks.filt.broadPeak > control_overlap_peaks.filt.broadPeak