Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle large matrices and change default Seurat format #21

Merged
merged 5 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions R/make-seurat.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
#' @param dedup_method Method to handle duplicated gene symbols. If `unique`,
#' the gene symbols will be made unique following standard Seurat procedures.
#' If `sum`, the expression values for each gene symbols will be summed.
#' @param seurat_assay_version Whether to create Seurat `v3` or `v5` assay
#' objects. Default is `v3`.
#' @param seurat_assay_version Whether to create Seurat `v5` or `v3` assay
#' objects. Default is `v5`.
#'
#' @return a Seurat object
#' @export
Expand All @@ -37,16 +37,16 @@
#' # convert SCE to Seurat, merging duplicated gene names
#' seurat_obj <- sce_to_seurat(sce, dedup_method = "sum")
#'
#' # convert SCE to Seurat with v5 assay objects
#' seurat_obj <- sce_to_seurat(sce, seurat_assay_version = "v5")
#' # convert SCE to Seurat with v3 assay objects
#' seurat_obj <- sce_to_seurat(sce, seurat_assay_version = "v3")
#' }
#'
sce_to_seurat <- function(
sce,
use_symbols = TRUE,
reference = c("sce", "scpca", "10x2020", "10x2024"),
dedup_method = c("unique", "sum"),
seurat_assay_version = c("v3", "v5")) {
seurat_assay_version = c("v5", "v3")) {
reference <- match.arg(reference)
dedup_method <- match.arg(dedup_method)
seurat_assay_version <- match.arg(seurat_assay_version)
Expand Down Expand Up @@ -120,12 +120,12 @@ sce_to_seurat <- function(
}

# convert and set functions depending on assay version requested
if (seurat_assay_version == "v5") {
create_seurat_assay <- SeuratObject::CreateAssay5Object
suppressWarnings(sobj[["RNA"]] <- as(sobj[["RNA"]], "Assay5"))
} else {
if (seurat_assay_version == "v3") {
create_seurat_assay <- SeuratObject::CreateAssayObject
suppressWarnings(sobj[["RNA"]] <- as(sobj[["RNA"]], "Assay"))
} else {
create_seurat_assay <- SeuratObject::CreateAssay5Object
suppressWarnings(sobj[["RNA"]] <- as(sobj[["RNA"]], "Assay5"))
}

# add spliced data if present
Expand Down
39 changes: 31 additions & 8 deletions R/sum-duplicate-genes.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
#' PCA and UMAP. If FALSE, the input reduced dimensions are copied over. If
#' TRUE, the highly variable genes are also recalculated with the new values
#' stored in metadata. Default is FALSE.
#' @param cell_set_size Because very large matrices can cause trouble, we break
#' the summing into submatrices with this many cells. Default is 5000, which
#' should be fine for most datasets.
#'
#' @return a SingleCellExperiment object
#' @export
Expand All @@ -44,12 +47,16 @@
#' summed_sce <- sum_duplicate_genes(sce, recalculate_reduced_dims = TRUE)
#' }
#'
sum_duplicate_genes <- function(sce, normalize = TRUE, recalculate_reduced_dims = FALSE) {
sum_duplicate_genes <- function(sce,
normalize = TRUE,
recalculate_reduced_dims = FALSE,
cell_set_size = 5000) {
stopifnot(
"sce must be a SingleCellExperiment object" = is(sce, "SingleCellExperiment"),
"normalize must be a logical" = is.logical(normalize),
"recalculate_reduced_dims must be a logical" = is.logical(recalculate_reduced_dims),
"normalize can not be FALSE if recalculate_reduced_dims is TRUE" = normalize || !recalculate_reduced_dims
"normalize can not be FALSE if recalculate_reduced_dims is TRUE" = normalize || !recalculate_reduced_dims,
"cell_set_size should be an integer (much) greater than 1" = cell_set_size > 1 && cell_set_size %% 1 == 0
)
if (normalize) {
stopifnot(
Expand All @@ -67,14 +74,30 @@ sum_duplicate_genes <- function(sce, normalize = TRUE, recalculate_reduced_dims
return(sce)
}

# calculate the reduced matrices
# calculate the reduced matrices----------------------------------------------
unique_rows <- unique(rownames(sce)) # new row names
counts <- rowsum(counts(sce), rownames(sce))[unique_rows, ] |> # keep order, mostly
as("sparseMatrix")

# break up counts and assay matrices to avoid large non-sparse matrices during calculation
cell_sets <- seq(0, ncol(sce) - 1) %/% cell_set_size
# make sure the last set is not length one
cell_sets[ncol(sce)] <- cell_sets[ncol(sce) - 1]

counts <- unique(cell_sets) |>
purrr::map(\(x) {
rowsum(counts(sce)[, cell_sets == x], rownames(sce)) |>
as("sparseMatrix") |>
base::`[`(unique_rows, ) # reorder rows (faster with sparse)
}) |>
purrr::reduce(cbind)

if ("spliced" %in% assayNames(sce)) {
spliced_names <- rownames(assay(sce, "spliced"))
spliced <- rowsum(assay(sce, "spliced"), spliced_names)[unique(spliced_names), ] |>
as("sparseMatrix")
spliced <- unique(cell_sets) |>
purrr::map(\(x) {
rowsum(assay(sce, "spliced")[, cell_sets == x], rownames(sce)) |>
as("sparseMatrix") |>
base::`[`(unique_rows, )
}) |>
purrr::reduce(cbind)
assays <- list(counts = counts, spliced = spliced)
} else {
assays <- list(counts = counts)
Expand Down
10 changes: 5 additions & 5 deletions man/sce_to_seurat.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 10 additions & 1 deletion man/sum_duplicate_genes.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

63 changes: 32 additions & 31 deletions tests/testthat/test-make-seurat.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@ test_that("SCE to Seurat with no id conversion works", {
expect_equal(Seurat::DefaultAssay(seurat_obj), "RNA")

# assay types
expect_s4_class(seurat_obj[["RNA"]], "Assay")
expect_s4_class(seurat_obj[["spliced"]], "Assay")
expect_s4_class(seurat_obj[["RNA"]], "Assay5")
expect_s4_class(seurat_obj[["spliced"]], "Assay5")

# assay contents

# expected layers present
expect_contains(
slotNames(seurat_obj[["RNA"]]),
c("counts", "data", "scale.data", "var.features", "meta.features")
names(seurat_obj[["RNA"]]@layers),
c("counts", "data", "scale.data")
)

expect_equal(seurat_obj[["RNA"]]$counts, counts(sce))
Expand Down Expand Up @@ -66,13 +67,13 @@ test_that("SCE to Seurat with id conversion works as expected", {
expect_equal(Seurat::DefaultAssay(seurat_obj), "RNA")

# assay types
expect_s4_class(seurat_obj[["RNA"]], "Assay")
expect_s4_class(seurat_obj[["spliced"]], "Assay")
expect_s4_class(seurat_obj[["RNA"]], "Assay5")
expect_s4_class(seurat_obj[["spliced"]], "Assay5")

# assay contents
# expected layers present
expect_contains(
slotNames(seurat_obj[["RNA"]]),
c("counts", "data", "scale.data", "var.features", "meta.features")
names(seurat_obj[["RNA"]]@layers),
c("counts", "data", "scale.data")
)

expect_equal(
Expand Down Expand Up @@ -116,13 +117,13 @@ test_that("SCE to Seurat with id conversion and 10x reference works as expected"


# assay types
expect_s4_class(seurat_obj[["RNA"]], "Assay")
expect_s4_class(seurat_obj[["spliced"]], "Assay")
expect_s4_class(seurat_obj[["RNA"]], "Assay5")
expect_s4_class(seurat_obj[["spliced"]], "Assay5")

# assay contents
# expected layers present
expect_contains(
slotNames(seurat_obj[["RNA"]]),
c("counts", "data", "scale.data", "var.features", "meta.features")
names(seurat_obj[["RNA"]]@layers),
c("counts", "data", "scale.data")
)

expect_equal(
Expand Down Expand Up @@ -163,15 +164,15 @@ test_that("conversion works with altExps", {
expect_equal(Seurat::DefaultAssay(seurat_obj), "RNA")

# assay types
expect_s4_class(seurat_obj[["RNA"]], "Assay")
expect_s4_class(seurat_obj[["spliced"]], "Assay")
expect_s4_class(seurat_obj[["alt1"]], "Assay")
expect_s4_class(seurat_obj[["alt2"]], "Assay")
expect_s4_class(seurat_obj[["RNA"]], "Assay5")
expect_s4_class(seurat_obj[["spliced"]], "Assay5")
expect_s4_class(seurat_obj[["alt1"]], "Assay5")
expect_s4_class(seurat_obj[["alt2"]], "Assay5")
})

test_that("Seurat v5 conversion works", {
test_that("Seurat v3 conversion works", {
sce <- readRDS(test_path("data", "scpca_sce.rds"))
seurat_obj <- sce_to_seurat(sce, use_symbols = FALSE, seurat_assay_version = "v5")
seurat_obj <- sce_to_seurat(sce, use_symbols = FALSE, seurat_assay_version = "v3")
expect_s4_class(seurat_obj, "Seurat")

expect_equal(dim(seurat_obj), dim(sce))
Expand All @@ -182,13 +183,13 @@ test_that("Seurat v5 conversion works", {
expect_equal(Seurat::DefaultAssay(seurat_obj), "RNA")

# assay types
expect_s4_class(seurat_obj[["RNA"]], "Assay5")
expect_s4_class(seurat_obj[["spliced"]], "Assay5")
expect_s4_class(seurat_obj[["RNA"]], "Assay")
expect_s4_class(seurat_obj[["spliced"]], "Assay")

# expected layers present
# assay contents
expect_contains(
names(seurat_obj[["RNA"]]@layers),
c("counts", "data", "scale.data")
slotNames(seurat_obj[["RNA"]]),
c("counts", "data", "scale.data", "var.features")
)

expect_equal(seurat_obj[["RNA"]]$counts, counts(sce))
Expand Down Expand Up @@ -224,13 +225,13 @@ test_that("Conversion works for non-processed samples", {
expect_equal(Seurat::DefaultAssay(seurat_obj), "RNA")

# assay types
expect_s4_class(seurat_obj[["RNA"]], "Assay")
expect_s4_class(seurat_obj[["spliced"]], "Assay")
expect_s4_class(seurat_obj[["RNA"]], "Assay5")
expect_s4_class(seurat_obj[["spliced"]], "Assay5")

# assay contents
# expected layers present
expect_contains(
slotNames(seurat_obj[["RNA"]]),
c("counts", "data", "scale.data", "var.features", "meta.features")
names(seurat_obj[["RNA"]]@layers),
c("counts", "data", "scale.data")
)

expect_equal(seurat_obj[["RNA"]]$counts, counts(sce))
Expand Down
65 changes: 62 additions & 3 deletions tests/testthat/test-sum-duplicate-genes.R
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🐰
i was typing for a tada emoji but accidentally typed ra... , so enjoy this rabbit who likes the new tests!

Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ test_that("merging works as expected", {
expect_equal(dim(deduped_sce), dim(sce))

expect_equal(rownames(deduped_sce), rownames(sce))
expect_equal(colnames(deduped_sce), colnames(sce))

expect_contains(
colnames(rowData(deduped_sce)),
Expand All @@ -16,11 +17,11 @@ test_that("merging works as expected", {


expect_equal(
counts(deduped_sce)[rownames(sce), ], # need to make the row order identical
counts(deduped_sce),
2 * counts(sce) |> Matrix::drop0() # original matrices may have explicit 0s
)
expect_equal(
assay(deduped_sce, "spliced")[rownames(sce), ],
assay(deduped_sce, "spliced"),
2 * assay(sce, "spliced") |> Matrix::drop0()
)
})
Expand All @@ -33,6 +34,36 @@ test_that("merging works as expected with unprocessed SCE", {
expect_equal(dim(deduped_sce), dim(sce))

expect_equal(rownames(deduped_sce), rownames(sce))
expect_equal(colnames(deduped_sce), colnames(sce))

expect_contains(
colnames(rowData(deduped_sce)),
c("gene_ids", "gene_symbol", "mean", "detected")
)
expect_equal(rowData(deduped_sce)$gene_ids, rowData(sce)$gene_ids)
expect_equal(rowData(deduped_sce)$gene_symbol, rowData(sce)$gene_symbol)


expect_equal(
counts(deduped_sce), # need to make the row order identical
2 * counts(sce) |> Matrix::drop0() # original matrices may have explicit 0s
)
expect_equal(
assay(deduped_sce, "spliced"),
2 * assay(sce, "spliced") |> Matrix::drop0()
)
})


test_that("merging works with split and combined cell sets", {
sce <- readRDS(test_path("data", "filtered_sce.rds"))
double_sce <- rbind(sce, sce)
deduped_sce <- sum_duplicate_genes(double_sce, cell_set_size = 20)

expect_equal(dim(deduped_sce), dim(sce))

expect_equal(rownames(deduped_sce), rownames(sce))
expect_equal(colnames(deduped_sce), colnames(sce))

expect_contains(
colnames(rowData(deduped_sce)),
Expand All @@ -43,11 +74,39 @@ test_that("merging works as expected with unprocessed SCE", {


expect_equal(
counts(deduped_sce)[rownames(sce), ], # need to make the row order identical
counts(deduped_sce),
2 * counts(sce) |> Matrix::drop0() # original matrices may have explicit 0s
)
expect_equal(
assay(deduped_sce, "spliced")[rownames(sce), ],
2 * assay(sce, "spliced") |> Matrix::drop0()
)
})

test_that("test merging for split when cell set size leaves a set of size 1", {
sce <- readRDS(test_path("data", "filtered_sce.rds"))
double_sce <- rbind(sce, sce)
deduped_sce <- sum_duplicate_genes(double_sce, cell_set_size = ncol(sce) - 1)

expect_equal(dim(deduped_sce), dim(sce))

expect_equal(rownames(deduped_sce), rownames(sce))
expect_equal(colnames(deduped_sce), colnames(sce))

expect_contains(
colnames(rowData(deduped_sce)),
c("gene_ids", "gene_symbol", "mean", "detected")
)
expect_equal(rowData(deduped_sce)$gene_ids, rowData(sce)$gene_ids)
expect_equal(rowData(deduped_sce)$gene_symbol, rowData(sce)$gene_symbol)


expect_equal(
counts(deduped_sce),
2 * counts(sce) |> Matrix::drop0() # original matrices may have explicit 0s
)
expect_equal(
assay(deduped_sce, "spliced"),
2 * assay(sce, "spliced") |> Matrix::drop0()
)
})