---
title: "11-batchcorrection-lab.Rmd"
author: "Orr Ashenberg"
date: "3/25/2020"
output: html_document
---
# Batch Correction Lab
In this lab, we will work with several single-cell RNA-seq datasets collected from pancreatic islets and examine how different batch correction methods affect our downstream analysis.
Note: you can increase the system memory available to Docker by going to Docker -> Preferences -> Advanced and shifting the Memory slider.
## Load settings and packages
```{r setup_batch}
knitr::opts_chunk$set(echo = TRUE)
library(Seurat)
library(Matrix)
library(fossil)
library(dplyr)
library(plyr)
library(liger)
library(cowplot)  # provides plot_grid(), used in the LIGER section below
# Set folder location for saving output files. This is also the same location as input data.
mydir <- "data/batch_correction/"
setwd("/home/rstudio/materials/")
# Objects to save.
Rda.sparse.path <- paste0(mydir, "pancreas_subsample.Rda")
Rda.path <- paste0(mydir, "pancreas_nobatchcorrect.Rda")
Rda.Seurat3.path <- paste0(mydir, "pancreas_Seurat3.Rda")
Rda.liger.path <- paste0(mydir, "pancreas_liger.Rda")
```
## Read in pancreas expression matrices
```{r read_data_batch, eval = FALSE}
# Read in all four input expression matrices
celseq.data <- read.table(paste0(mydir, "pancreas_multi_celseq_expression_matrix.txt.gz"))
celseq2.data <- read.table(paste0(mydir, "pancreas_multi_celseq2_expression_matrix.txt.gz"))
fluidigmc1.data <- read.table(paste0(mydir, "pancreas_multi_fluidigmc1_expression_matrix.txt.gz"))
smartseq2.data <- read.table(paste0(mydir, "pancreas_multi_smartseq2_expression_matrix.txt.gz"))
# Convert to sparse matrices for efficiency
celseq.data <- as(as.matrix(celseq.data), "dgCMatrix")
celseq2.data <- as(as.matrix(celseq2.data), "dgCMatrix")
fluidigmc1.data <- as(as.matrix(fluidigmc1.data), "dgCMatrix")
smartseq2.data <- as(as.matrix(smartseq2.data), "dgCMatrix")
```
## Preparing the individual Seurat objects for each pancreas dataset without batch correction
```{r prepare_seurat, eval = FALSE}
# What is the size of each single cell RNA-seq dataset?
# Briefly describe the technology used to collect each dataset.
# Which datasets do you expect to be different and which do you expect to be similar?
dim(celseq.data)
dim(celseq2.data)
dim(fluidigmc1.data)
dim(smartseq2.data)
# Create and setup Seurat objects for each dataset with the following 6 steps.
# 1. CreateSeuratObject
# 2. subset
# 3. NormalizeData
# 4. FindVariableFeatures
# 5. ScaleData
# 6. Update @meta.data slot in Seurat object with tech column (celseq, celseq2, fluidigmc1, smartseq2)
# Look at the distributions of number of genes per cell before and after filtering cells with subset().
# CEL-Seq (https://www.cell.com/cell-reports/fulltext/S2211-1247(12)00228-8)
# In subset, keep cells with nFeature_RNA > 1750.
celseq <- CreateSeuratObject(counts = celseq.data)
VlnPlot(celseq, "nFeature_RNA")
celseq <- subset(celseq, subset = nFeature_RNA > 1750)
VlnPlot(celseq, "nFeature_RNA")
celseq <- NormalizeData(celseq, normalization.method = "LogNormalize", scale.factor = 10000)
celseq <- FindVariableFeatures(celseq, selection.method = "vst", nfeatures = 2000)
celseq <- ScaleData(celseq)
celseq[["tech"]] <- "celseq"
# CEL-Seq2 https://www.cell.com/molecular-cell/fulltext/S1097-2765(09)00641-8
# In subset, keep cells with nFeature_RNA > 2500.
celseq2 <- CreateSeuratObject(counts = celseq2.data)
VlnPlot(celseq2, "nFeature_RNA")
celseq2 <- subset(celseq2, subset = nFeature_RNA > 2500)
VlnPlot(celseq2, "nFeature_RNA")
celseq2 <- NormalizeData(celseq2, normalization.method = "LogNormalize", scale.factor = 10000)
celseq2 <- FindVariableFeatures(celseq2, selection.method = "vst", nfeatures = 2000)
celseq2 <- ScaleData(celseq2)
celseq2[["tech"]] <- "celseq2"
# Fluidigm C1
# Omit subset function because cells are already high quality.
fluidigmc1 <- CreateSeuratObject(counts = fluidigmc1.data)
VlnPlot(fluidigmc1, "nFeature_RNA")
fluidigmc1 <- NormalizeData(fluidigmc1, normalization.method = "LogNormalize", scale.factor = 10000)
fluidigmc1 <- FindVariableFeatures(fluidigmc1, selection.method = "vst", nfeatures = 2000)
fluidigmc1 <- ScaleData(fluidigmc1)
fluidigmc1[["tech"]] <- "fluidigmc1"
# SMART-Seq2
# In subset, keep cells with nFeature_RNA > 2500.
smartseq2 <- CreateSeuratObject(counts = smartseq2.data)
VlnPlot(smartseq2, "nFeature_RNA")
smartseq2 <- subset(smartseq2, subset = nFeature_RNA > 2500)
VlnPlot(smartseq2, "nFeature_RNA")
smartseq2 <- NormalizeData(smartseq2, normalization.method = "LogNormalize", scale.factor = 10000)
smartseq2 <- FindVariableFeatures(smartseq2, selection.method = "vst", nfeatures = 2000)
smartseq2 <- ScaleData(smartseq2)
smartseq2[["tech"]] <- "smartseq2"
# This code sub-samples the data in order to speed up calculations and not use too much memory.
Idents(celseq) <- "tech"
celseq <- subset(celseq, downsample = 500, seed = 1)
Idents(celseq2) <- "tech"
celseq2 <- subset(celseq2, downsample = 500, seed = 1)
Idents(fluidigmc1) <- "tech"
fluidigmc1 <- subset(fluidigmc1, downsample = 500, seed = 1)
Idents(smartseq2) <- "tech"
smartseq2 <- subset(smartseq2, downsample = 500, seed = 1)
# Save the sub-sampled Seurat objects
save(celseq, celseq2, fluidigmc1, smartseq2, file = Rda.sparse.path)
```
## Cluster pancreatic datasets without batch correction
Let us cluster all the pancreatic islet datasets together and see whether there is a batch effect.
```{r no_batch_correction}
load(Rda.sparse.path)
# Merge Seurat objects. Original sample identities are stored in gcdata[["tech"]].
# Cell names will now have the format tech_cellID (smartseq2_cell1...)
add.cell.ids <- c("celseq", "celseq2", "fluidigmc1", "smartseq2")
gcdata <- merge(x = celseq, y = list(celseq2, fluidigmc1, smartseq2), add.cell.ids = add.cell.ids, merge.data = FALSE)
Idents(gcdata) <- "tech" # use identity based on sample identity
# Look at how the number of genes per cell varies across the different technologies.
VlnPlot(gcdata, "nFeature_RNA", group.by = "tech")
# The merged data must be normalized and scaled (but you only need to scale the variable genes).
# Let us also find the variable genes again this time using all the pancreas data.
gcdata <- NormalizeData(gcdata, normalization.method = "LogNormalize", scale.factor = 10000)
var.genes <- SelectIntegrationFeatures(SplitObject(gcdata, split.by = "tech"), nfeatures = 2000, verbose = TRUE, fvf.nfeatures = 2000, selection.method = "vst")
VariableFeatures(gcdata) <- var.genes
gcdata <- ScaleData(gcdata, features = VariableFeatures(gcdata))
# Do PCA on data including only the variable genes.
gcdata <- RunPCA(gcdata, features = VariableFeatures(gcdata), npcs = 40, ndims.print = 1:5, nfeatures.print = 5)
# Color the PC biplot by the scRNA-seq technology. Hint: use DimPlot
# Which technologies look similar to one another?
DimPlot(gcdata, reduction = "pca", dims = c(1, 2), group.by = "tech")
# Cluster the cells using the first twenty principal components.
gcdata <- FindNeighbors(gcdata, reduction = "pca", dims = 1:20, k.param = 20)
gcdata <- FindClusters(gcdata, resolution = 0.8, algorithm = 1, random.seed = 100)
# Create a UMAP visualization.
gcdata <- RunUMAP(gcdata, dims = 1:20, reduction = "pca", n.neighbors = 15, min.dist = 0.5, spread = 1, metric = "euclidean", seed.use = 1)
# Visualize the Louvain clustering and the batches on the UMAP.
# Remember, the clustering is stored in @meta.data in column seurat_clusters and the technology is
# stored in the column tech. Remember you can also use DimPlot
DimPlot(gcdata, reduction = "umap", group.by = "seurat_clusters")
DimPlot(gcdata, reduction = "umap", group.by = "tech")
# Are you surprised by the results? Compare to your expectations from the PC biplot of PC1 vs PC2.
# What explains these results?
# Adjusted rand index test for overlap between technology and cluster labelings.
# This goes between 0 (completely dissimilar clustering) to 1 (identical clustering).
# The adjustment corrects for chance grouping between cluster elements.
# https://davetang.org/muse/2017/09/21/adjusted-rand-index/
ari <- dplyr::select(gcdata[[]], tech, seurat_clusters)
ari$tech <- plyr::mapvalues(ari$tech, from = c("celseq", "celseq2", "fluidigmc1", "smartseq2"), to = c(0, 1, 2, 3))
adj.rand.index(as.numeric(ari$tech), as.numeric(ari$seurat_clusters))
# Save current progress.
save(gcdata, file = Rda.path)
# To load the data, run the following command.
# load(Rda.path)
```
### Batch correction: canonical correlation analysis (CCA) + mutual nearest neighbors (MNN) using Seurat v3
Here we use Seurat v3 to see to what extent it can remove potential batch effects.
```{r batchcorrect_Seurat3}
# load(Rda.sparse.path)
# The first piece of code will identify variable genes that are highly variable in at least 2/4 datasets. We will use these variable genes in our batch correction.
# Why would we implement such a requirement?
ob.list <- list(celseq, celseq2, fluidigmc1, smartseq2)
# Identify anchors on the 4 pancreatic islet datasets, commonly shared variable genes across samples,
# and integrate samples.
gcdata.anchors <- FindIntegrationAnchors(object.list = ob.list, anchor.features = 2000, dims = 1:30)
gcdata <- IntegrateData(anchorset = gcdata.anchors, dims = 1:30)
DefaultAssay(gcdata) <- "integrated"
# Run the standard workflow for visualization and clustering.
# The integrated data object only stores the commonly shared variable genes.
gcdata <- ScaleData(gcdata, do.center = T, do.scale = F)
gcdata <- RunPCA(gcdata, npcs = 40, ndims.print = 1:5, nfeatures.print = 5)
DimPlot(gcdata, dims = c(1, 2), reduction = "pca", split.by = "tech")
# Clustering. Choose the dimensional reduction type to use and the number of aligned
# canonical correlation vectors to use.
gcdata <- FindNeighbors(gcdata, reduction = "pca", dims = 1:20, k.param = 20)
gcdata <- FindClusters(gcdata, resolution = 0.8, algorithm = 1, random.seed = 100)
# UMAP. Choose the dimensional reduction type to use and the number of aligned
# canonical correlation vectors to use.
gcdata <- RunUMAP(gcdata, dims = 1:30, reduction = "pca", n.neighbors = 15, min.dist = 0.5, spread = 1, metric = "euclidean", seed.use = 1)
# After data integration, use the original expression data in all visualization and DE tests.
# The integrated data must not be used in DE tests, as it violates their assumptions of independence!
DefaultAssay(gcdata) <- "RNA"
# Visualize the Louvain clustering and the batches on the UMAP.
# Remember, the clustering is stored in @meta.data in column seurat_clusters
# and the technology is stored in the column tech. Remember you can also use DimPlot.
p1 <- DimPlot(gcdata, reduction = "umap", group.by = "seurat_clusters")
p2 <- DimPlot(gcdata, reduction = "umap", group.by = "tech")
p1 + p2
# Let's look to see how the adjusted rand index changed compared to using no batch correction.
ari <- dplyr::select(gcdata[[]], tech, seurat_clusters)
ari$tech <- plyr::mapvalues(ari$tech, from = c("celseq", "celseq2", "fluidigmc1", "smartseq2"), to = c(0, 1, 2, 3))
adj.rand.index(as.numeric(ari$tech), as.numeric(ari$seurat_clusters))
# We can also identify conserved marker genes across the batches. Differential gene expression is
# done across each batch, and the p-values are combined.
markers <- FindConservedMarkers(gcdata, ident.1 = 0, grouping.var = "tech", assay = "RNA", verbose = TRUE)
head(markers)
# Visualize the expression of the first 5 marker genes across the different batches using DoHeatmap.
gcdata <- ScaleData(gcdata, features = rownames(gcdata), do.center = T, do.scale = F)
DoHeatmap(gcdata, features = rownames(markers)[1:5], group.by = "tech", disp.max = 3)
# Markers for pancreatic cells from "A Single-Cell Transcriptome Atlas of the Human Pancreas"
# (https://www.cell.com/cell-systems/pdfExtended/S2405-4712(16)30292-7).
genes <- c("GCG", "INS", "SST", "PPY", "PRSS1", "KRT19", "PECAM1", "COL1A1")
FeaturePlot(gcdata, genes, ncol = 4)
# Save current progress.
save(gcdata, file = Rda.Seurat3.path)
# To load the data, run the following command.
# load(Rda.Seurat3.path)
```
### Batch correction: integrative non-negative matrix factorization (NMF) using LIGER
Here we use integrative non-negative matrix factorization to see to what extent it can remove potential batch effects.
The important parameters in the batch correction are the number of factors (k), the penalty parameter (lambda), and the clustering resolution. The number of factors k determines how many factors (shared and dataset-specific) are used in factorizing the matrix. The penalty parameter lambda balances factors shared across the batches against factors specific to individual batches; the Macosko lab usually uses the default of lambda = 5.0. A resolution of 1.0 is used in the Louvain clustering of cells in the quantile-normalized shared factor space.
```{r batchcorrect_liger, eval = FALSE}
# load(Rda.sparse.path)
ob.list <- list("celseq" = celseq, "celseq2" = celseq2, "fluidigmc1" = fluidigmc1, "smartseq2" = smartseq2)
# Identify variable genes that are variable across most samples.
var.genes <- SelectIntegrationFeatures(ob.list, nfeatures = 2000, verbose = TRUE, fvf.nfeatures = 2000, selection.method = "vst")
# Next we create a LIGER object with raw counts data from each batch.
data.liger <- createLiger(sapply(ob.list, function(data) data[['RNA']]@counts[, colnames(data)]), remove.missing = F)
# Normalize gene expression for each batch.
data.liger <- liger::normalize(data.liger)
# Use my method or Liger method for selecting variable genes (var.thresh changes number of variable genes).
data.liger@var.genes <- var.genes
# data.liger <- selectGenes(data.liger, var.thresh = 0.1, do.plot = F)
# Print out the number of variable genes for LIGER analysis.
print(length(data.liger@var.genes))
# Scale the gene expression across the datasets.
# Why does LIGER not center the data? Hint, think about the use of
# non-negative matrix factorization and the constraints that this imposes.
data.liger <- scaleNotCenter(data.liger)
# These two steps take 10-20 min. Only run them if you finish with the rest of the code.
# Use the `suggestK` function to determine the appropriate number of factors to use.
# Use the `suggestLambda` function to find the smallest lambda for which the alignment metric stabilizes.
# k.suggest <- suggestK(data.liger, k.test = seq(5, 30, 5), plot.log2 = T)
# lambda.suggest <- suggestLambda(data.liger, k.suggest)
# Use alternating least squares (ALS) to factorize the matrix.
# Next, quantile align the factor loadings across the datasets, and do clustering.
k.suggest <- 20 # with this line, we do not use the suggested k by suggestK()
lambda.suggest <- 5 # with this line, we do not use the suggested lambda by suggestLambda()
set.seed(100) # optimizeALS below is stochastic
data.liger <- optimizeALS(data.liger, k = k.suggest, lambda = lambda.suggest)
# What do matrices H, V, and W represent, and what are their dimensions?
dim(data.liger@H$celseq)
dim(data.liger@V$celseq)
dim(data.liger@W)
# Next, do clustering of cells in shared nearest factor space.
# Build SNF graph, do quantile normalization, cluster quantile normalized data
data.liger <- quantileAlignSNF(data.liger, resolution = 1) # SNF clustering and quantile alignment
# What are the dimensions of H.norm. What does this represent?
dim(data.liger@H.norm)
# Let's see what the liger data looks like mapped onto a UMAP visualization.
data.liger <- runUMAP(data.liger, n_neighbors = 15, min_dist = 0.5) # conda install -c conda-forge umap-learn
p <- plotByDatasetAndCluster(data.liger, return.plots = T)
print(p[[1]]) # plot by dataset
plot_grid(p[[1]], p[[2]])
# Let's look to see how the adjusted rand index changed compared to using no batch correction.
tech <- unlist(lapply(1:length(data.liger@H), function(x) {
rep(names(data.liger@H)[x], nrow(data.liger@H[[x]]))}))
clusters <- data.liger@clusters
ari <- data.frame("tech" = tech, "clusters" = clusters)
ari$tech <- plyr::mapvalues(ari$tech, from = c("celseq", "celseq2", "fluidigmc1", "smartseq2"), to = c(0, 1, 2, 3))
adj.rand.index(as.numeric(ari$tech), as.numeric(ari$clusters))
# Look at proportion of each batch in each cluster, and look at factor loadings across clusters
plotClusterProportions(data.liger)
plotClusterFactors(data.liger, use.aligned = T)
# Look at genes that are specific to a dataset and shared across datasets.
# Use the plotWordClouds function and choose 2 datasets.
pdf(paste0(mydir, "word_clouds.pdf"))
plotWordClouds(data.liger, dataset1 = "celseq2", dataset2 = "smartseq2")
dev.off()
# Look at factor loadings for each cell using plotFactors.
pdf(paste0(mydir, "plot_factors.pdf"))
plotFactors(data.liger)
dev.off()
# Identify shared and batch-specific marker genes from liger factorization.
# Use the getFactorMarkers function and choose 2 datasets.
# Then plot some genes of interest using plotGene.
markers <- getFactorMarkers(data.liger, dataset1 = "celseq2", dataset2 = "smartseq2", num.genes = 10)
plotGene(data.liger, gene = "INS")
# Save current progress.
save(data.liger, file = Rda.liger.path)
# To load the data, run the following command.
# load(Rda.liger.path)
```
## Additional exploration: Regressing out unwanted covariates
Learn how to regress out different technical covariates (number of UMIs, number of genes, percent mitochondrial reads) by studying [Seurat's PBMC tutorial](https://satijalab.org/seurat/pbmc3k_tutorial.html) and the ScaleData() function.
```{r regress_covars, eval = FALSE}
```
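A minimal sketch of one possible approach is shown below. It assumes the merged `gcdata` object from the earlier sections and that mitochondrial genes follow the human `MT-` naming convention; the chunk name and the choice of covariates are illustrative, not part of the original lab.
```{r regress_covars_sketch, eval = FALSE}
# Compute percent mitochondrial reads per cell (assumes the human "MT-" gene prefix).
gcdata[["percent.mt"]] <- PercentageFeatureSet(gcdata, pattern = "^MT-")
# Regress out technical covariates while scaling the variable genes.
gcdata <- ScaleData(gcdata, features = VariableFeatures(gcdata),
                    vars.to.regress = c("nCount_RNA", "nFeature_RNA", "percent.mt"))
# Re-run PCA (and downstream clustering/UMAP) to see how the embedding changes.
gcdata <- RunPCA(gcdata, features = VariableFeatures(gcdata), npcs = 40)
```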
## Additional exploration: kBET
Within your RStudio session, install [k-nearest neighbour batch effect test](https://github.com/theislab/kBET) and learn how to use its functionality to quantify batch effects in the pancreatic data.
```{r kBET, eval = FALSE}
```
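A minimal sketch, under the assumption that kBET is installed from GitHub and run on the PCA embedding of the uncorrected merged data with the technology labels as the batch variable (argument names follow the kBET README and may differ between versions):
```{r kBET_sketch, eval = FALSE}
# Install kBET from GitHub (requires devtools or remotes).
# devtools::install_github("theislab/kBET")
library(kBET)
# Cells in rows: use the first 20 PCs of the uncorrected merged data.
pca.embedding <- Embeddings(gcdata, reduction = "pca")[, 1:20]
batch <- gcdata$tech
# kBET tests whether the local batch composition around each cell matches the
# global batch composition; a high rejection rate indicates a strong batch effect.
batch.estimate <- kBET(pca.embedding, batch, plot = TRUE)
batch.estimate$summary  # observed vs. expected rejection rates
```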
## Additional exploration: Seurat 3
Read how the new version of Seurat does [data integration](https://satijalab.org/seurat/pancreas_integration_label_transfer.html).
## Acknowledgements
This document builds on a tutorial from the [Seurat website](https://www.dropbox.com/s/aji4ielg8gc70vj/multiple_pancreas_workflow.R?dl=1) and a tutorial from the [LIGER website](https://github.com/MacoskoLab/liger/blob/master/vignettes/liger-vignette.html).