This repository has been archived by the owner on Jun 23, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path3_RNAseqVis_Heatmap.Rmd
executable file
·329 lines (296 loc) · 10.5 KB
/
3_RNAseqVis_Heatmap.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
---
title: "Visualisation of DESeq2 results"
description: "DEG analysis based on DESeq2 and GSEA"
principal investigator: "Joaquín de Navascués"
researcher: "Aleix Puig, modified by Joaquín de Navascués"
output:
html_document:
toc: true
toc_float: true
code_folding: hide
theme: readable
df_print: paged
---
```{r set-publication-theme, echo=FALSE, cache=FALSE}
ggplot2::theme_set(ggpubr::theme_pubr(base_size=10))
```
```{r setup, echo = FALSE, cache = FALSE}
knitr::opts_chunk$set(dev = 'png',
fig.align = 'center', fig.height = 5, fig.width = 8.5,
pdf.options(encoding = "ISOLatin9.enc"),
fig.path='integration/figures/', warning=FALSE, message=FALSE)
```
**Libraries:**
```{r, warning=FALSE}
if (!require("librarian")) install.packages("librarian")
# data
librarian::shelf(tibble, stringr, purrr, dplyr, plyr, DescTools, limma)
# graphics
librarian::shelf(RColorBrewer, pheatmap, cetcolor)
# convenience
library(here)
setwd(here::here()) # to distinguish from plyr::here()
```
**Path to definitive images (outside repo):**
```{r}
figdir <- paste0(c(head(str_split(getwd(),'/')[[1]],-1),
paste0(tail(str_split(getwd(),'/')[[1]],1), '_figures')),
collapse='/')
dir.create(figdir, showWarnings = FALSE)
```
# RNAseq results visualisation: Samples heatmap
## 1 Gather the DEG data
```{r}
# experimental design and labels
targets <- readRDS('output/targets.RDS')
# add column to distinguish control batches
targets$sample_type <- targets$Condition
targets$sample_type[targets$sample_type=='Control'] <- str_c(
targets$Condition, "_", targets$Batch)[targets$Condition=='Control']
targets[,c('Condition', 'sampleIDs', 'sample_type')]
saveRDS(targets, file='output/targets.RDS')
```
```{r}
# TPM counts
tpmNormalisedCounts <- readRDS('output/tpmNormalisedCounts.RDS')
# pseudocounts
vsd_bcorr <- readRDS('output/vst_pseudocounts_batchCorrected.RDS')
# DEG lists
data_list <- list.files("output", pattern = "_vs_")
datasets <- pmap(list(file = paste("output", data_list, sep = "/")),
readRDS)
names(datasets) <- str_split_fixed(
str_split_fixed(data_list, ".RDS", n=2)[,1],
"_", n=3)[,3]
```
Total unique differentially expressed genes for | log2FC | > 2:
```{r}
# Filter for DEG with abs(log2FC) >= `fc_thresh`
fc_thresh <- 2.5
DE_genes <- datasets %>%
map( filter, padj <= 0.05 ) %>%
map( filter, log2FoldChange %][% c(-fc_thresh, fc_thresh) )
# get the union of DEGs sets
DE_unique_genes <- unique(unlist(lapply(DE_genes, '[[', "ensemblGeneID")))
length(DE_unique_genes)
```
## 2 Using TPM counts with and without batch-correction
Let us first try to use the data as 'raw' as possible: TPM-normalised counts, Z-scored (as the range of values is 0.2 - 10,000) - obviously with no batch correction.
```{r}
# filter by DEGs
DE_tpm <- tpmNormalisedCounts[match(DE_unique_genes,
rownames(tpmNormalisedCounts)),]
```
This gets us the TPMs of all DEGs, without Z-scoring -- this is done internally in the `pheatmap` function).
Prepare the heatmap customisation:
```{r}
main.title <- 'Clustered heatmap of DEGs'
# annotation labels
## for batch
ann_labels <- data.frame(batch = ifelse(test = targets$Batch == 'a',
yes = '1',
no = '2'))
## for genotype
ann_labels$condition <- mapvalues(targets$Condition,
from=unique(targets$Condition),
to=c('da RNAi', 'da ov/ex',
'control', 'da:da ov/ex',
'scute ov/ex'))
ann_labels$condition <- factor(ann_labels$condition,
levels=c('da RNAi', 'control',
'da ov/ex', 'da:da ov/ex',
'scute ov/ex'))
rownames(ann_labels) <- targets$sampleIDs # same as `names(DE_tpm)`
# annotation colours
ann_colors = list(
batch = c('1' = brewer.pal(12, 'Paired')[2],
'2' = brewer.pal(12, 'Paired')[8]),
condition = c("da RNAi" = brewer.pal(12, 'Paired')[9],
"da ov/ex" = brewer.pal(12, 'Paired')[5],
"control" = brewer.pal(12, 'Paired')[1],
"da:da ov/ex" = brewer.pal(12, 'Paired')[6],
"scute ov/ex" = brewer.pal(12, 'Paired')[4])
)
```
### Heatmap of scaled TPM counts
```{r, fig.height=12, fig.width=10}
hm <- pheatmap(
# data
mat = DE_tpm,
scale = "row", # z-scores the rows
# main
main = main.title,
fontsize = 14,
clustering_method = "ward.D2",
# rows
cluster_rows = TRUE,
clustering_distance_rows = 'manhattan',
treeheight_row = 25, # default is 50
show_rownames = FALSE,
# cols
cluster_cols = TRUE,
clustering_distance_cols = 'manhattan',
treeheight_col = 25,
labels_col = targets$sample_type,
fontsize_col = 9,
angle_col = 45,
# annotation
annotation = ann_labels,
annotation_colors = ann_colors,
# tiles
color = cet_pal(n = 256, name = "cbd1", alpha = 1),
border_color = NA,
cellwidth = 20,
cellheight = 0.5
)
hm
```
The 'looks' are ok, but the organisation of the columns itself is not. Let us try with a batch-corrected version.
We use the `DESeqTranform` object to mark the batch-effects, and `limma::removeBatchEffect` to remove them:
```{r}
vsd <- readRDS('output/vst_pseudocounts.RDS')
# min(tpmNormalisedCounts) is 0 so we need to add 1 for log transform
tpm_log_bcorr <- removeBatchEffect(
log2(tpmNormalisedCounts+1),
batch=vsd$batch,
design=model.matrix(~condition, colData(vsd) )
)
# reverse log transform
tpm_bcorr <- 2^tpm_log_bcorr
# save for MA plots
saveRDS(tpm_bcorr, file='output/tmp_batch_corrected.RDS')
# filter by DEGs
DE_tpm_bcorr <- tpm_bcorr[match(DE_unique_genes,
rownames(tpm_bcorr)),]
```
ALl annotations are the same -- we are now ready to plot.
### Heatmap of batch-corrected, z-scored TPM
```{r, fig.height=12, fig.width=10}
hm <- pheatmap(
# data
mat = DE_tpm_bcorr,
scale = "row", # z-scores the rows
# main
main = main.title,
fontsize = 14,
clustering_method = "ward.D2",
# rows
cluster_rows = TRUE,
clustering_distance_rows = 'minkowski',
treeheight_row = 25, # default is 50
cutree_rows = 6,
show_rownames = FALSE,
# cols
cluster_cols = TRUE,
clustering_distance_cols = 'canberra',
treeheight_col = 25,
labels_col = targets$sample_type,
fontsize_col = 9,
angle_col = 45,
# annotation
annotation = ann_labels,
annotation_colors = ann_colors,
# tiles
color = cet_pal(n = 256, name = "cbd1", alpha = 1),
border_color = NA,
cellwidth = 20,
cellheight = 0.5
)
hm
```
This is clearly more reasonable -- however, it requires transforming TPMs forth and back. We might just as well use the pseudocounts generated by the `DESeq2::vst` transformation.
## 3 Heatmap of batch-corrected pseudocounts
To do this, we need the batch-corrected, `DESeqTranform` object from the DGE analysis with `DESeq2`, filtered by `log2FoldChange` and `padj`:
```{r}
vsd_bcorr <- readRDS('output/vst_pseudocounts_batchCorrected.RDS')
DE_vsd_bcorr <- as.data.frame(assay(vsd_bcorr)[match(DE_unique_genes,
rownames(vsd_bcorr)),])
```
It is not needed to get the z-scores for the rows (`scale = 'row'`), as the range of values is ~3-18:
```{r, fig.height=12, fig.width=10}
hm <- pheatmap(
# data
mat = DE_vsd_bcorr,
# main
main = main.title,
fontsize = 14,
clustering_method = "ward.D2",
# rows
cluster_rows = TRUE,
clustering_distance_rows = 'euclidean', # "euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski"
treeheight_row = 25, # default is 50
show_rownames = FALSE,
# cols
cluster_cols = TRUE,
clustering_distance_cols = 'euclidean',
treeheight_col = 25,
labels_col = targets$sample_type,
fontsize_col = 9,
angle_col = 45,
# annotation
annotation = ann_labels,
annotation_colors = ann_colors,
# tiles
color = cet_pal(n = 256, name = "cbd1", alpha = 1),
border_color = NA,
cellwidth = 20,
cellheight = 0.5
)
hm
```
OK, I see the problem here. Clearly the way to go is do the reverse transformation $2^{`vsd`}$. But what is the point of doing this with TPMs? Normalising by transcript units loses its the "physicality" if applying a ${log~2~(tpm+1)}$ transform. So let us use the batch-corrected `DESeqTranform` object directly. The dataframe `DE_vsd_bcorr` already contains the matrix of `vst`-transformed values _for the DEGs_, so we just need to do:
```{r}
DE_bcorr <- 2^DE_vsd_bcorr
```
This has a range of ~ 12-30,000, so we have to plot with z-score scaling:
```{r, fig.height=12, fig.width=10}
hm <- pheatmap(
# data
mat = DE_bcorr,
scale = 'row',
# main
main = main.title,
fontsize = 14,
clustering_method = "ward.D2",
# rows
cluster_rows = TRUE,
clustering_distance_rows = 'minkowski', # "euclidean", "maximum", "manhattan", "canberra", "minkowski"
treeheight_row = 25, # default is 50
cutree_rows = 6,
show_rownames = FALSE,
# cols
cluster_cols = TRUE,
clustering_distance_cols = 'canberra',
treeheight_col = 25,
labels_col = targets$sample_type,
fontsize_col = 9,
angle_col = 45,
# annotation
annotation = ann_labels,
annotation_colors = ann_colors,
# tiles
color = cet_pal(n = 256, name = "cbd1", alpha = 1),
border_color = NA,
cellwidth = 20,
cellheight = 0.5,
)
# save it
tiff(file=paste0(figdir,'/Heatmap_vsd_bcorr.tiff'),
width=10, height=12, units="in", res=300)
hm
dev.off()
pdf(file=paste0(figdir,'/Heatmap_vsd_bcorr.pdf'),
width=10, height=12)
hm
dev.off()
png(file=paste0(figdir,'/Heatmap_vsd_bcorr.png'),
width=10, height=12, units="in", res=300)
hm
dev.off()
svg(file=paste0(figdir,'/Heatmap_vsd_bcorr.svg'),
width=10, height=12)
hm
dev.off()
hm
```
This is the clear winner, so I add the parameters to save it to file.