-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPCs_scRNA-seq.Rmd
185 lines (138 loc) · 5.59 KB
/
PCs_scRNA-seq.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
---
title: "PCs 10x"
author: "Kin Lau"
date: '`r format(Sys.Date(), "%B %d, %Y")`'
output:
html_document:
code_folding: hide
self_contained: yes
toc: true
toc_depth: 5
toc_float:
collapsed: true
smooth_scroll: false
number_sections: true
---
```{r starttime}
# save start time for script
start_tm <- Sys.time()
start_tm
```
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo=TRUE, warning=TRUE, message=TRUE, cache=TRUE, cache.lazy = FALSE, dev=c('png','pdf'), fig.width=8, fig.height=8)
```
```{r make_outdir}
outdir <- "./PCs_scRNAseq_out_files/"
dir.create(outdir, recursive=TRUE)
```
# Packages loaded
```{r loadlibs, echo=TRUE, warning=TRUE, message=TRUE, cache=FALSE}
library(dplyr)
library(stringr)
library(readr)
library(DropletUtils)
library(scuttle)
library(scater)
library(Seurat)
library(future)
```
# Read in counts
```{r read_counts}
raw_counts_dirs <- list.dirs("../cellranger_output/", recursive = TRUE) %>%
grep("_PC\\/outs\\/filtered_feature_bc_matrix$", ., value=TRUE)
names(raw_counts_dirs) <- str_extract(raw_counts_dirs, "[^\\/]+(?=\\/outs\\/filtered_feature_bc_matrix)")
stopifnot(length(names(raw_counts_dirs)) == length(unique(names(raw_counts_dirs))))
# get decoder for ensembl gene id to gene symbol from one of the features.tsv files.
ens2gene <- read_tsv(paste0(raw_counts_dirs[1], "/features.tsv.gz"),
col_names = c("ens_gene","gene_sym", "assay_type"),
col_types="ccc") %>%
dplyr::select(ens_gene, gene_sym) %>%
unique()
sce <- read10xCounts(raw_counts_dirs)
rownames(sce) <- uniquifyFeatureNames(rowData(sce)$ID, rowData(sce)$Symbol)
```
# QC
```{r qc}
is.mito <- grepl("^mt-", rownames(sce))
#sce <- addPerCellQC(sce, subsets=list(Mito=is.mito))
df <- perCellQCMetrics(sce, subsets=list(Mito=is.mito))
reasons <- quickPerCellQC(df, sub.fields=c("subsets_Mito_percent"), batch=sce$Sample) # perCellQCFilters() will also identify outliers for the proportion-based metrics specified in the sub.fields= arguments. These distributions frequently exhibit a heavy right tail, but unlike the two previous metrics, it is the right tail itself that contains the putative low-quality cells. Thus, we do not perform any transformation to shrink the tail - rather, our hope is that the cells in the tail are identified as large outliers. (While it is theoretically possible to obtain a meaningless threshold above 100%, this is rare enough to not be of practical concern.) -- OSCA
# Cell removal reasons
colSums(as.matrix(reasons))
# add qc metrics to SCE
colData(sce) <- cbind(colData(sce), df)
sce$discard <- reasons$discard
table(sce$Sample, ifelse(sce$discard,"Discard","Keep"))
```
# QC plots
```{r qc_plots}
gridExtra::grid.arrange(
plotColData(sce, x="Sample", y="sum", colour_by="discard") +
scale_y_log10() + ggtitle("Total count"),
plotColData(sce, x="Sample", y="detected", colour_by="discard") +
scale_y_log10() + ggtitle("Detected features"),
plotColData(sce, x="Sample", y="subsets_Mito_percent",
colour_by="discard") + ggtitle("Mito percent"),
ncol=1
)
```
```{r qc_plots2, fig.width=4, fig.height=4}
# "The aim is to confirm that there are no cells with both large total counts and large mitochondrial counts, to ensure that we are not inadvertently removing high-quality cells that happen to be highly metabolically active" -OSCA
plotColData(sce, x="sum", y="subsets_Mito_percent", colour_by="discard")
```
# Filter cells and normalize
```{r filter_cells_and_normalize}
filtered <- sce[,!reasons$discard]
filtered <- logNormCounts(filtered)
write_rds(filtered, paste0(outdir, "filt_SCE.rds"))
```
# Convert to Seurat
```{r to_seurat}
filt.seurat <- as.Seurat(filtered, counts = "counts", data = "logcounts")
filt.seurat <- NormalizeData(filt.seurat) # renormalize counts using Seurat way
```
# Integrate across samples
```{r split_and_select_integration_feats}
# split the dataset into a list of two seurat objects (stim and CTRL)
filt.list <- SplitObject(filt.seurat, split.by = "Sample")
# normalize and identify variable features for each dataset independently
filt.list <- lapply(X = filt.list, FUN = function(x) {
x <- NormalizeData(x)
x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})
# select features that are repeatedly variable across datasets for integration
features <- SelectIntegrationFeatures(object.list = filt.list)
```
# Perform integration
```{r integrate}
plan("multisession", workers = 2)
options(future.globals.maxSize = 5000 * 1024^2)
integrate.anchors <- FindIntegrationAnchors(object.list = filt.list, anchor.features = features)
plan("sequential")
filt.combined <- IntegrateData(anchorset = integrate.anchors)
```
# Perform integrated analysis
```{r integrated_analysis}
# specify that we will perform downstream analysis on the corrected data note that the
# original unmodified data still resides in the 'RNA' assay
DefaultAssay(filt.combined) <- "integrated"
# Run the standard workflow for visualization and clustering
filt.combined <- ScaleData(filt.combined, verbose = FALSE)
filt.combined <- RunPCA(filt.combined, npcs = 60, verbose = FALSE)
ElbowPlot(filt.combined, ndims = 60)
filt.combined <- RunUMAP(filt.combined, reduction = "pca", dims = 1:30)
filt.combined <- FindNeighbors(filt.combined, reduction = "pca", dims = 1:30)
filt.combined <- FindClusters(filt.combined, resolution = c(seq(0.1, 0.9, 0.1)))
write_rds(filt.combined, paste0(outdir, "integrated_Seurat.rds"))
```
# SessionInfo
```{r sessioninfo}
sessionInfo()
```
# Time
```{r endtime}
# output time taken to run script
end_tm <- Sys.time()
end_tm
end_tm - start_tm
```