diff --git a/DESCRIPTION b/DESCRIPTION index 1faa817e..888a9018 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -83,5 +83,5 @@ Config/Needs/website: pkgdown, tibble, knitr, rprojroot, stringr, readr, Config/testthat/edition: 3 Encoding: UTF-8 Roxygen: list(markdown = TRUE, r6 = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 biocViews: SingleCell, DataImport, DataRepresentation diff --git a/R/AbstractAnnData.R b/R/AbstractAnnData.R index 582a846f..d4903cf1 100644 --- a/R/AbstractAnnData.R +++ b/R/AbstractAnnData.R @@ -120,11 +120,11 @@ AbstractAnnData <- R6::R6Class("AbstractAnnData", # nolint }, #' @description Number of observations in the AnnData object. n_obs = function() { - length(self$obs_names) + nrow(self$obs) }, #' @description Number of variables in the AnnData object. n_vars = function() { - length(self$var_names) + nrow(self$var) }, #' @description Keys ('column names') of `obs`. obs_keys = function() { @@ -284,13 +284,6 @@ AbstractAnnData <- R6::R6Class("AbstractAnnData", # nolint )) } - if (has_row_names(df)) { - warning(wrap_message( - "'", label, "' should not have any rownames, removing them from the data frame." - )) - rownames(df) <- NULL - } - df }, diff --git a/R/AnnData.R b/R/AnnData.R index 534b6033..b102c095 100644 --- a/R/AnnData.R +++ b/R/AnnData.R @@ -1,21 +1,54 @@ -#' An in-memory AnnData object +#' An AnnData object #' -#' @description -#' This class is used to represent an AnnData object in memory. -#' AnnData stores a data matrix `X` together with annotations of -#' observations `obs` (`obsm`, `obsp`), variables `var` (`varm`, `varp`), and -#' unstructured annotations `uns`. +#' @description An AnnData object. This class can either be an in-memory +#' AnnData (InMemoryAnnData) or an HDF5-backed AnnData (HDF5AnnData). The +#' AnnData object stores a data matrix `X` together with annotations of +#' observations `obs` (`obsm`, `obsp`) and variables `var` (`varm`, `varp`). 
+#' Additional layers of data can be stored in `layers` and unstructured +#' annotations in `uns`. #' -#' To read an AnnData file from disk, use [read_h5ad()] instead. +#' @section Functions that can be used to create AnnData objects: +#' +#' * [AnnData()]: Create an in-memory AnnData object. +#' * [read_h5ad()]: Read an HDF5-backed AnnData file from disk. +#' * [from_SingleCellExperiment()]: Convert a SingleCellExperiment object to an AnnData object. +#' * [from_Seurat()]: Convert a Seurat object to an AnnData object. +#' +#' @section Slots: +#' +#' * `X`: A matrix of observations by variables. +#' * `obs`: A data frame of observations. +#' * `var`: A data frame of variables. +#' * `obs_names`: Names of observations (alias for `rownames(obs)`). +#' * `var_names`: Names of variables (alias for `rownames(var)`). +#' * `layers`: A named list of matrices with the same dimensions as `X`. +#' * `obsm`: A named list of matrices with the same number of rows as `obs`. +#' * `varm`: A named list of matrices with the same number of rows as `var`. +#' * `obsp`: A named list of sparse matrices with the same number of rows and columns as the number of observations. +#' * `varp`: A named list of sparse matrices with the same number of rows and columns as the number of variables. +#' * `uns`: A named list of unstructured annotations. +#' +#' @section Methods: +#' +#' * `print()`: Print a summary of the AnnData object. +#' * `shape()`: Dimensions (observations x variables) of the AnnData object. +#' * `n_obs()`: Number of observations in the AnnData object. +#' * `n_vars()`: Number of variables in the AnnData object. +#' * `obs_keys()`: Column names of `obs`. +#' * `var_keys()`: Column names of `var`. +#' * `layers_keys()`: Element names of `layers`. +#' * `obsm_keys()`: Element names of `obsm`. +#' * `varm_keys()`: Element names of `varm`. +#' * `obsp_keys()`: Element names of `obsp`. +#' * `varp_keys()`: Element names of `varp`. 
+#' +#' @section Conversion methods: +#' +#' * `to_SingleCellExperiment()`: Convert to SingleCellExperiment. +#' * `to_Seurat()`: Convert to Seurat. +#' * `to_InMemoryAnnData()`: Convert to an InMemory AnnData. +#' * `to_HDF5AnnData()`: Convert to an HDF5 Backed AnnData. #' -#' @param obs_names A vector of unique identifiers -#' used to identify each row of `obs` and to act as an index into the -#' observation dimension of the AnnData object. The length of `obs_names` -#' defines the observation dimension of the AnnData object. -#' @param var_names A vector of unique identifiers used to identify each row -#' of `var` and to act as an index into the variable dimension of the -#' AnnData object. The length of `var_names` defines the variable -#' dimension of the AnnData object. #' @param X Either `NULL` or a observation × variable matrix with #' dimensions consistent with `obs` and `var`. #' @param layers Either `NULL` or a named list, where each element is an @@ -41,27 +74,31 @@ #' element is a sparse matrix where each dimension has length `n_vars`. #' @param uns The uns slot is used to store unstructured annotation. It must #' be either `NULL` or a named list. +#' @param shape Shape tuple (#observations, #variables). Can be provided +#' if `X` or `obs` and `var` are not provided. +#' +#' @return An [AbstractAnnData] object. 
+#' +#' @seealso [AbstractAnnData] #' #' @export #' #' @examples #' adata <- AnnData( -#' obs_names = paste0("obs", 1:3), -#' var_names = paste0("var", 1:4), #' X = matrix(1:12, nrow = 3, ncol = 4), #' obs = data.frame( +#' row.names = paste0("obs", 1:3), #' n_counts = c(1, 2, 3), #' n_cells = c(1, 2, 3) #' ), #' var = data.frame( +#' row.names = paste0("var", 1:4), #' n_cells = c(1, 2, 3, 4) #' ) #' ) #' #' adata AnnData <- function( - obs_names = NULL, - var_names = NULL, X = NULL, obs = NULL, var = NULL, @@ -70,10 +107,9 @@ AnnData <- function( varm = NULL, obsp = NULL, varp = NULL, - uns = NULL) { + uns = NULL, + shape = NULL) { InMemoryAnnData$new( - obs_names = obs_names, - var_names = var_names, X = X, obs = obs, var = var, @@ -82,6 +118,7 @@ AnnData <- function( varm = varm, obsp = obsp, varp = varp, - uns = uns + uns = uns, + shape = shape ) } diff --git a/R/HDF5AnnData.R b/R/HDF5AnnData.R index 9d0e2127..a7d4c34b 100644 --- a/R/HDF5AnnData.R +++ b/R/HDF5AnnData.R @@ -6,10 +6,6 @@ HDF5AnnData <- R6::R6Class("HDF5AnnData", # nolint inherit = AbstractAnnData, private = list( .h5obj = NULL, - .n_obs = NULL, - .n_vars = NULL, - .obs_names = NULL, - .var_names = NULL, .compression = NULL ), active = list( @@ -124,7 +120,7 @@ HDF5AnnData <- R6::R6Class("HDF5AnnData", # nolint obs = function(value) { if (missing(value)) { # trackstatus: class=HDF5AnnData, feature=get_obs, status=done - read_h5ad_element(private$.h5obj, "/obs", include_index = FALSE) + read_h5ad_element(private$.h5obj, "/obs") } else { # trackstatus: class=HDF5AnnData, feature=set_obs, status=done value <- private$.validate_obsvar_dataframe(value, "obs") @@ -132,8 +128,7 @@ HDF5AnnData <- R6::R6Class("HDF5AnnData", # nolint value, private$.h5obj, "/obs", - private$.compression, - index = self$obs_names + private$.compression ) } }, @@ -141,15 +136,14 @@ HDF5AnnData <- R6::R6Class("HDF5AnnData", # nolint var = function(value) { if (missing(value)) { # trackstatus: class=HDF5AnnData, 
feature=get_var, status=done - read_h5ad_element(private$.h5obj, "/var", include_index = FALSE) + read_h5ad_element(private$.h5obj, "/var") } else { # trackstatus: class=HDF5AnnData, feature=set_var, status=done value <- private$.validate_obsvar_dataframe(value, "var") write_h5ad_element( value, private$.h5obj, - "/var", - index = self$var_names + "/var" ) } }, @@ -157,35 +151,20 @@ HDF5AnnData <- R6::R6Class("HDF5AnnData", # nolint obs_names = function(value) { if (missing(value)) { # trackstatus: class=HDF5AnnData, feature=get_obs_names, status=done - # obs names are cached to avoid reading all of obs whenever they are - # accessed - if (is.null(private$.obs_names)) { - private$.obs_names <- read_h5ad_data_frame_index(private$.h5obj, "obs") - } - private$.obs_names + rownames(self$obs) } else { # trackstatus: class=HDF5AnnData, feature=set_obs_names, status=done - value <- private$.validate_obsvar_names(value, "obs") - write_h5ad_data_frame_index(value, private$.h5obj, "obs", private$.compression, "_index") - private$.obs_names <- value + rownames(self$obs) <- value } }, #' @field var_names Names of variables var_names = function(value) { - # TODO: directly write to and read from /var/_index if (missing(value)) { # trackstatus: class=HDF5AnnData, feature=get_var_names, status=done - # var names are cached to avoid reading all of var whenever they are - # accessed - if (is.null(private$.var_names)) { - private$.var_names <- read_h5ad_data_frame_index(private$.h5obj, "var") - } - private$.var_names + rownames(self$var) } else { # trackstatus: class=HDF5AnnData, feature=set_var_names, status=done - value <- private$.validate_obsvar_names(value, "var") - write_h5ad_data_frame_index(value, private$.h5obj, "var", private$.compression, "_index") - private$.var_names <- value + rownames(self$var) <- value } }, #' @field uns The uns slot. Must be `NULL` or a named list. 
@@ -205,14 +184,6 @@ HDF5AnnData <- R6::R6Class("HDF5AnnData", # nolint #' #' @param file The filename (character) of the `.h5ad` file. If this - #' file does not exist yet, `obs_names` and `var_names` must be provided. + #' file does not exist yet, it is created from `obs`, `var`, `X` and/or `shape`. - #' @param obs_names A vector of unique identifiers - #' used to identify each row of `obs` and to act as an index into the - #' observation dimension of the AnnData object. The length of `obs_names` - #' defines the observation dimension of the AnnData object. - #' @param var_names A vector of unique identifiers used to identify each row - #' of `var` and to act as an index into the variable dimension of the - #' AnnData object. The length of `var_names` defines the variable - #' dimension of the AnnData object. #' @param X Either `NULL` or a observation × variable matrix with #' dimensions consistent with `obs` and `var`. #' @param layers Either `NULL` or a named list, where each element is an @@ -238,6 +209,8 @@ HDF5AnnData <- R6::R6Class("HDF5AnnData", # nolint #' element is a sparse matrix where each dimension has length `n_vars`. #' @param uns The uns slot is used to store unstructured annotation. It must #' be either `NULL` or a named list. + #' @param shape Shape tuple (#observations, #variables). Can be provided + #' if `X` or `obs` and `var` are not provided. #' @param compression The compression algorithm to use when writing the #' HDF5 file. Can be one of `"none"`, `"gzip"` or `"lzf"`. Defaults to #' `"none"`. @@ -250,8 +223,6 @@ HDF5AnnData <- R6::R6Class("HDF5AnnData", # nolint #' set on the created object. This will cause data to be overwritten if the #' file already exists. 
initialize = function(file, - obs_names = NULL, - var_names = NULL, X = NULL, obs = NULL, var = NULL, @@ -261,6 +232,7 @@ HDF5AnnData <- R6::R6Class("HDF5AnnData", # nolint obsp = NULL, varp = NULL, uns = NULL, + shape = NULL, compression = c("none", "gzip", "lzf")) { if (!requireNamespace("rhdf5", quietly = TRUE)) { stop("The HDF5 interface requires the 'rhdf5' package to be installed") @@ -270,23 +242,39 @@ HDF5AnnData <- R6::R6Class("HDF5AnnData", # nolint private$.compression <- compression if (!file.exists(file)) { - # Check obs_names and var_names have been provided - if (is.null(obs_names)) { - stop("When creating a new .h5ad file, `obs_names` must be defined.") - } - if (is.null(var_names)) { - stop("When creating a new .h5ad file, `var_names` must be defined.") - } + # Store filename + private$.h5obj <- file - # Create an empty H5AD using the provided obs/var names - write_empty_h5ad(file, obs_names, var_names, compression) + # Determine initial obs and var + shape <- get_shape(obs, var, X, shape) + obs <- get_initial_obs(obs, X, shape) + var <- get_initial_var(var, X, shape) - # Set private object slots - private$.h5obj <- file - private$.n_obs <- length(obs_names) - private$.n_vars <- length(var_names) - private$.obs_names <- obs_names - private$.var_names <- var_names + # Create an empty H5AD + write_empty_h5ad(private$.h5obj, obs, var, compression) + + # set other slots + if (!is.null(X)) { + self$X <- X + } + if (!is.null(layers)) { + self$layers <- layers + } + if (!is.null(obsm)) { + self$obsm <- obsm + } + if (!is.null(varm)) { + self$varm <- varm + } + if (!is.null(obsp)) { + self$obsp <- obsp + } + if (!is.null(varp)) { + self$varp <- varp + } + if (!is.null(uns)) { + self$uns <- uns + } } else { # Check the file is a valid H5AD attrs <- rhdf5::h5readAttributes(file, "/") @@ -301,68 +289,44 @@ HDF5AnnData <- R6::R6Class("HDF5AnnData", # nolint # Set the file path private$.h5obj <- file - # If obs or var names have been provided update those - if 
(!is.null(obs_names)) { - self$obs_names <- obs_names + if (!is.null(obs)) { + stop("obs must be NULL when loading an existing .h5ad file") } - - if (!is.null(var_names)) { - self$var_names <- var_names + if (!is.null(var)) { + stop("var must be NULL when loading an existing .h5ad file") + } + if (!is.null(X)) { + stop("X must be NULL when loading an existing .h5ad file") + } + if (!is.null(layers)) { + stop("layers must be NULL when loading an existing .h5ad file") + } + if (!is.null(obsm)) { + stop("obsm must be NULL when loading an existing .h5ad file") + } + if (!is.null(varm)) { + stop("varm must be NULL when loading an existing .h5ad file") + } + if (!is.null(obsp)) { + stop("obsp must be NULL when loading an existing .h5ad file") + } + if (!is.null(varp)) { + stop("varp must be NULL when loading an existing .h5ad file") + } + if (!is.null(uns)) { + stop("uns must be NULL when loading an existing .h5ad file") } - } - - # Update remaining slots - if (!is.null(X)) { - self$X <- X - } - - if (!is.null(obs)) { - self$obs <- obs - } - - if (!is.null(var)) { - self$var <- var - } - - if (!is.null(layers)) { - self$layers <- layers - } - - if (!is.null(obsm)) { - self$obsm <- obsm - } - - if (!is.null(varm)) { - self$varm <- varm - } - - if (!is.null(obsp)) { - self$obsp <- obsp - } - - if (!is.null(varp)) { - self$varp <- varp - } - - if (!is.null(uns)) { - self$uns <- uns } }, #' @description Number of observations in the AnnData object n_obs = function() { - if (is.null(private$.n_obs)) { - private$.n_obs <- length(self$obs_names) - } - private$.n_obs + nrow(self$obs) }, #' @description Number of variables in the AnnData object n_vars = function() { - if (is.null(private$.n_vars)) { - private$.n_vars <- length(self$var_names) - } - private$.n_vars + nrow(self$var) } ) ) @@ -390,15 +354,18 @@ HDF5AnnData <- R6::R6Class("HDF5AnnData", # nolint #' A = matrix(5:1, 3L, 5L), #' B = matrix(letters[1:5], 3L, 5L) #' ), -#' obs = data.frame(cell = 1:3), -#' var = 
data.frame(gene = 1:5), -#' obs_names = LETTERS[1:3], -#' var_names = letters[1:5] +#' obs = data.frame(row.names = LETTERS[1:3], cell = 1:3), +#' var = data.frame(row.names = letters[1:5], gene = 1:5) #' ) #' to_HDF5AnnData(ad, "test.h5ad") #' # remove file #' file.remove("test.h5ad") -to_HDF5AnnData <- function(adata, file, compression = c("none", "gzip", "lzf")) { # nolint +# nolint start: object_name_linter +to_HDF5AnnData <- function( + # nolint end: object_name_linter + adata, + file, + compression = c("none", "gzip", "lzf")) { stopifnot( inherits(adata, "AbstractAnnData") ) @@ -409,12 +376,11 @@ to_HDF5AnnData <- function(adata, file, compression = c("none", "gzip", "lzf")) var = adata$var, obsm = adata$obsm, varm = adata$varm, - obs_names = adata$obs_names, - var_names = adata$var_names, layers = adata$layers, obsp = adata$obsp, varp = adata$varp, uns = adata$uns, + shape = adata$shape(), compression = compression ) } diff --git a/R/InMemoryAnnData.R b/R/InMemoryAnnData.R index 6ab34501..02320fba 100644 --- a/R/InMemoryAnnData.R +++ b/R/InMemoryAnnData.R @@ -7,24 +7,22 @@ #' #' @examples #' ## complete example -#' ad <- InMemoryAnnData$new( +#' ad <- AnnData( #' X = matrix(1:15, 3L, 5L), #' layers = list( #' A = matrix(5:1, 3L, 5L), #' B = matrix(letters[1:5], 3L, 5L) #' ), -#' obs = data.frame(cell = 1:3), -#' var = data.frame(gene = 1:5), -#' obs_names = LETTERS[1:3], -#' var_names = letters[1:5] +#' obs = data.frame(row.names = LETTERS[1:3], cell = 1:3), +#' var = data.frame(row.names = letters[1:5], gene = 1:5) #' ) #' ad #' #' ## minimum example #' # -> using `AnnData()` is synonymous to `InMemoryAnnData$new()` #' ad <- AnnData( -#' obs_names = letters[1:10], -#' var_names = LETTERS[1:5] +#' obs = data.frame(row.names = letters[1:10]), +#' var = data.frame(row.names = LETTERS[1:5]) #' ) #' ad #' @export @@ -35,8 +33,6 @@ InMemoryAnnData <- R6::R6Class("InMemoryAnnData", # nolint .layers = NULL, .obs = NULL, .var = NULL, - .obs_names = NULL, - 
.var_names = NULL, .obsm = NULL, .varm = NULL, .obsp = NULL, @@ -115,10 +111,10 @@ InMemoryAnnData <- R6::R6Class("InMemoryAnnData", # nolint obs_names = function(value) { if (missing(value)) { # trackstatus: class=InMemoryAnnData, feature=get_obs_names, status=done - private$.obs_names + rownames(private$.obs) } else { # trackstatus: class=InMemoryAnnData, feature=set_obs_names, status=done - private$.obs_names <- private$.validate_obsvar_names(value, "obs") + rownames(private$.obs) <- private$.validate_obsvar_names(value, "obs") self } }, @@ -130,10 +126,10 @@ InMemoryAnnData <- R6::R6Class("InMemoryAnnData", # nolint var_names = function(value) { if (missing(value)) { # trackstatus: class=InMemoryAnnData, feature=get_var_names, status=done - private$.var_names + rownames(private$.var) } else { # trackstatus: class=InMemoryAnnData, feature=set_var_names, status=done - private$.var_names <- private$.validate_obsvar_names(value, "var") + rownames(private$.var) <- private$.validate_obsvar_names(value, "var") self } }, @@ -222,16 +218,6 @@ InMemoryAnnData <- R6::R6Class("InMemoryAnnData", # nolint public = list( #' @description Creates a new instance of an in memory AnnData object. #' Inherits from AbstractAnnData. - #' @param obs_names A vector of unique identifiers - #' used to identify each row of `obs` and to act as an index into - #' the observation dimension of the AnnData object. The length of - #' the `obs_names` defines the observation dimension of the AnnData - #' object. - #' @param var_names A vector of unique identifers - #' used to identify each row of `var` and to act as an index into - #' the variable dimension of the AnnData object. The length of - #' the `var_names` defines the variable dimension of the AnnData - #' object. #' @param X Either `NULL` or a observation × variable matrix with #' dimensions consistent with `obs` and `var`. 
#' @param layers Either `NULL` or a named list, where each element @@ -257,9 +243,9 @@ InMemoryAnnData <- R6::R6Class("InMemoryAnnData", # nolint #' element is a sparse matrix where each dimension has length `n_vars`. #' @param uns The uns slot is used to store unstructured annotation. #' It must be either `NULL` or a named list. - initialize = function(obs_names, - var_names, - X = NULL, + #' @param shape Shape tuple (#observations, #variables). Can be provided + #' if `X` or `obs` and `var` are not provided. + initialize = function(X = NULL, obs = NULL, var = NULL, layers = NULL, @@ -267,14 +253,24 @@ InMemoryAnnData <- R6::R6Class("InMemoryAnnData", # nolint varm = NULL, obsp = NULL, varp = NULL, - uns = NULL) { - # write obs and var first, because these are used by other validators - self$obs_names <- obs_names - self$var_names <- var_names + uns = NULL, + shape = NULL) { + # Determine initial obs and var + shape <- get_shape(obs, var, X, shape) + obs <- get_initial_obs(obs, X, shape) + var <- get_initial_var(var, X, shape) + + # set obs and var first + if (!is.data.frame(obs)) { + stop("obs must be a data.frame") + } + if (!is.data.frame(var)) { + stop("var must be a data.frame") + } + private$.obs <- obs + private$.var <- var # write other slots later - self$obs <- obs - self$var <- var self$X <- X self$layers <- layers self$obsm <- obsm @@ -305,10 +301,8 @@ InMemoryAnnData <- R6::R6Class("InMemoryAnnData", # nolint #' A = matrix(5:1, 3L, 5L), #' B = matrix(letters[1:5], 3L, 5L) #' ), -#' obs = data.frame(cell = 1:3), -#' var = data.frame(gene = 1:5), -#' obs_names = LETTERS[1:3], -#' var_names = letters[1:5] +#' obs = data.frame(cell = 1:3, row.names = LETTERS[1:3]), +#' var = data.frame(gene = 1:5, row.names = letters[1:5]) #' ) #' to_InMemoryAnnData(ad) to_InMemoryAnnData <- function(adata) { # nolint @@ -319,13 +313,12 @@ to_InMemoryAnnData <- function(adata) { # nolint X = adata$X, obs = adata$obs, var = adata$var, - obs_names = adata$obs_names, - 
var_names = adata$var_names, layers = adata$layers, obsm = adata$obsm, varm = adata$varm, obsp = adata$obsp, varp = adata$varp, - uns = adata$uns + uns = adata$uns, + shape = adata$shape() ) } diff --git a/R/Seurat.R b/R/Seurat.R index 9bdc236a..43f941b8 100644 --- a/R/Seurat.R +++ b/R/Seurat.R @@ -12,10 +12,8 @@ #' @examples #' ad <- AnnData( #' X = matrix(1:5, 3L, 5L), -#' obs = data.frame(cell = 1:3), -#' obs_names = letters[1:3], -#' var = data.frame(gene = 1:5), -#' var_names = letters[1:5] +#' obs = data.frame(row.names = LETTERS[1:3], cell = 1:3), +#' var = data.frame(row.names = letters[1:5], gene = 1:5) #' ) #' to_Seurat(ad) # TODO: Add parameters to choose which how X and layers are translated into counts, data and scaled.data @@ -145,31 +143,24 @@ from_Seurat <- function(seurat_obj, output_class = c("InMemoryAnnData", "HDF5Ann } } - # get obs_names - # trackstatus: class=Seurat, feature=set_obs_names, status=done - obs_names <- colnames(seurat_obj) # get obs + # trackstatus: class=Seurat, feature=set_obs_names, status=done # trackstatus: class=Seurat, feature=set_obs, status=done obs <- seurat_obj@meta.data - rownames(obs) <- NULL - - # construct var_names - # trackstatus: class=Seurat, feature=set_var_names, status=done - var_names <- rownames(seurat_obj@assays[[assay_name]]) + rownames(obs) <- colnames(seurat_obj) # TODO: this is probably not needed # construct var + # trackstatus: class=Seurat, feature=set_var_names, status=done # trackstatus: class=Seurat, feature=set_var, status=done var <- seurat_obj@assays[[assay_name]]@meta.features - rownames(var) <- NULL + rownames(var) <- rownames(seurat_obj@assays[[assay_name]]) # TODO: this is probably not needed # use generator to create new AnnData object generator <- get_anndata_constructor(output_class) ad <- generator$new( obs = obs, var = var, - obs_names = obs_names, - var_names = var_names, ... 
) diff --git a/R/SingleCellExperiment.R b/R/SingleCellExperiment.R index 7c3a7d6a..6090168f 100644 --- a/R/SingleCellExperiment.R +++ b/R/SingleCellExperiment.R @@ -16,15 +16,13 @@ #' library(SingleCellExperiment) #' } #' ad <- AnnData( -#' X = matrix(1:15, 3L, 5L), +#' X = matrix(1:5, 3L, 5L), #' layers = list( -#' A = matrix(15:1, 3L, 5L), -#' B = matrix(letters[1:15], 3L, 5L) +#' A = matrix(5:1, 3L, 5L), +#' B = matrix(letters[1:5], 3L, 5L) #' ), -#' obs = data.frame(cell = 1:3), -#' var = data.frame(gene = 1:5), -#' obs_names = LETTERS[1:3], -#' var_names = letters[1:5] +#' obs = data.frame(row.names = LETTERS[1:3], cell = 1:3), +#' var = data.frame(row.names = letters[1:5], gene = 1:5) #' ) #' #' ## construct a SingleCellExperiment from an AnnData object @@ -112,7 +110,12 @@ to_SingleCellExperiment <- function(object) { # nolint #' from_SingleCellExperiment(sce, "InMemory") #' #' @export -from_SingleCellExperiment <- function(sce, output_class = c("InMemory", "HDF5AnnData"), ...) { # nolint +# nolint start: object_name_linter +from_SingleCellExperiment <- function( + # nolint end: object_name_linter + sce, + output_class = c("InMemory", "HDF5AnnData"), + ...) 
{ stopifnot( inherits(sce, "SingleCellExperiment") ) @@ -121,30 +124,16 @@ from_SingleCellExperiment <- function(sce, output_class = c("InMemory", "HDF5Ann generator <- get_anndata_constructor(output_class) # trackstatus: class=SingleCellExperiment, feature=set_obs, status=done + # trackstatus: class=SingleCellExperiment, feature=set_obs_names, status=done obs <- as.data.frame( SummarizedExperiment::colData(sce) ) - rownames(obs) <- NULL # trackstatus: class=SingleCellExperiment, feature=set_var, status=done + # trackstatus: class=SingleCellExperiment, feature=set_var_names, status=done var <- as.data.frame( SummarizedExperiment::rowData(sce) ) - rownames(var) <- NULL - - # trackstatus: class=SingleCellExperiment, feature=set_obs_names, status=done - obs_names <- colnames(sce) - if (is.null(obs_names)) { - warning(wrap_message("colnames(sce) should not be NULL")) - obs_names <- as.character(seq_len(nrow(obs))) - } - - # trackstatus: class=SingleCellExperiment, feature=set_var_names, status=done - var_names <- rownames(sce) - if (is.null(var_names)) { - warning(wrap_message("rownames(sce) should not be NULL")) - var_names <- as.character(seq_len(nrow(var))) - } # trackstatus: class=SingleCellExperiment, feature=set_X, status=done # trackstatus: class=SingleCellExperiment, feature=set_layers, status=done @@ -178,8 +167,6 @@ from_SingleCellExperiment <- function(sce, output_class = c("InMemory", "HDF5Ann X = x, obs = obs, var = var, - obs_names = obs_names, - var_names = var_names, layers = layers, ... 
) diff --git a/R/generate_dataset.R b/R/generate_dataset.R index 8d0e3e4e..cfc2a7b1 100644 --- a/R/generate_dataset.R +++ b/R/generate_dataset.R @@ -170,9 +170,11 @@ generate_dataset <- function( # generate obs_names obs_names <- paste0("cell", seq_len(n_obs)) + rownames(obs) <- obs_names # generate var_names var_names <- paste0("gene", seq_len(n_vars)) + rownames(var) <- var_names # generate obsm obsm <- lapply(obsm_types, function(obsm_type) { @@ -235,11 +237,9 @@ generate_dataset <- function( list( X = X, obs = obs, - obs_names = obs_names, obsm = obsm, obsp = obsp, var = var, - var_names = var_names, varm = varm, varp = varp, layers = layers, @@ -333,11 +333,9 @@ generate_dataset <- function( obs = dataset_list$obs, obsm = dataset_list$obsm, obsp = dataset_list$obsp, - obs_names = dataset_list$obs_names, var = dataset_list$var, varm = dataset_list$varm, varp = dataset_list$varp, - var_names = dataset_list$var_names, layers = dataset_list$layers, uns = dataset_list$uns ) diff --git a/R/read_h5ad_helpers.R b/R/read_h5ad_helpers.R index 11f50ee1..7603c14f 100644 --- a/R/read_h5ad_helpers.R +++ b/R/read_h5ad_helpers.R @@ -398,68 +398,26 @@ read_h5ad_mapping <- function(file, name, version = "0.1.0") { #' @param file Path to a H5AD file or an open H5AD handle #' @param name Name of the element within the H5AD file #' @param version Encoding version of the element to read -#' @param include_index Whether or not to include the index as a column -#' -#' @details -#' If `include_index == TRUE` the index stored in the HDF5 file is added as a -#' column to output `data.frame` using the defined index name as the column -#' name and this is set as an attribute. If `include_index == FALSE` the index -#' is not provided in the output. In either case row names are not set. 
#' #' @return a data.frame #' #' @noRd -read_h5ad_data_frame <- function(file, name, include_index = FALSE, - version = "0.2.0") { +read_h5ad_data_frame <- function(file, name, version = "0.2.0") { version <- match.arg(version) attributes <- rhdf5::h5readAttributes(file, name) index_name <- attributes$`_index` column_order <- attributes$`column-order` - columns <- read_h5ad_collection(file, name, column_order) - - if (length(columns) == 0) { - index <- read_h5ad_data_frame_index(file, name) - df <- data.frame(row.names = seq_along(index)) - } else { - df <- data.frame(columns) - } - - if (isTRUE(include_index)) { - index <- read_h5ad_data_frame_index(file, name) - df <- cbind(index, df) - - # The default index name is not allowed as a column name so adjust it - if (index_name == "_index") { - index_name <- ".index" - colnames(df)[1] <- index_name - } + index <- read_h5ad_element(file, file.path(name, index_name)) + data <- read_h5ad_collection(file, name, column_order) - attr(df, "_index") <- index_name # nolint - } - - df -} - -#' Read H5AD data frame index -#' -#' Read the index of a data frame from an H5AD file -#' -#' @param file Path to a H5AD file or an open H5AD handle -#' @param name Name of the element within the H5AD file -#' @param version Encoding version of the element to read -#' -#' @return an object containing the index -#' -#' @noRd -read_h5ad_data_frame_index <- function(file, name, version = "0.2.0") { - version <- match.arg(version) - - attributes <- rhdf5::h5readAttributes(file, name) - index_name <- attributes$`_index` - - read_h5ad_element(file, file.path(name, index_name)) + as.data.frame( + row.names = index, + data, + check.names = FALSE, + fix.empty.names = FALSE + ) } #' Read multiple H5AD datatypes diff --git a/R/utils.R b/R/utils.R index 035449a4..640c14b8 100644 --- a/R/utils.R +++ b/R/utils.R @@ -10,3 +10,47 @@ has_row_names <- function(x) { !is.null(dimnames(x)[[1]]) } } + +get_shape <- function(obs, var, X, shape) { + n_obs <- + if 
(!is.null(obs)) { + nrow(obs) + } else if (!is.null(X)) { + nrow(X) + } else if (!is.null(shape)) { + shape[[1]] + } else { + 0L + } + n_vars <- + if (!is.null(var)) { + nrow(var) + } else if (!is.null(X)) { + ncol(X) + } else if (!is.null(shape)) { + shape[[2]] + } else { + 0L + } + c(n_obs, n_vars) +} + +get_initial_obs <- function(obs, X, shape) { + if (is.null(obs)) { + obs <- data.frame(matrix(NA, nrow = shape[[1]], ncol = 0)) + if (!is.null(X)) { + rownames(obs) <- rownames(X) + } + } + obs +} + +get_initial_var <- function(var, X, shape) { + if (is.null(var)) { + var <- data.frame(matrix(NA, nrow = shape[[2]], ncol = 0)) + if (!is.null(X)) { + rownames(var) <- colnames(X) + } + } + var +} diff --git a/R/write_h5ad.R b/R/write_h5ad.R index 438ea4ef..09c5ea60 100644 --- a/R/write_h5ad.R +++ b/R/write_h5ad.R @@ -14,15 +14,13 @@ #' #' @examples #' adata <- AnnData( -#' X = matrix(1:15, 3L, 5L), +#' X = matrix(1:5, 3L, 5L), #' layers = list( -#' A = matrix(15:1, 3L, 5L), -#' B = matrix(letters[1:15], 3L, 5L) +#' A = matrix(5:1, 3L, 5L), +#' B = matrix(letters[1:5], 3L, 5L) #' ), -#' obs = data.frame(cell = 1:3), -#' var = data.frame(gene = 1:5), -#' obs_names = LETTERS[1:3], -#' var_names = letters[1:5] +#' obs = data.frame(row.names = LETTERS[1:3], cell = 1:3), +#' var = data.frame(row.names = letters[1:5], gene = 1:5) #' ) #' h5ad_file <- tempfile(fileext = ".h5ad") #' write_h5ad(adata, h5ad_file) @@ -67,7 +65,10 @@ #' # h5ad_file <- tempfile(fileext = ".h5ad") #' # write_h5ad(obj, h5ad_file) #' } -write_h5ad <- function(object, path, compression = c("none", "gzip", "lzf")) { +write_h5ad <- function( + object, + path, + compression = c("none", "gzip", "lzf")) { if (inherits(object, "SingleCellExperiment")) { from_SingleCellExperiment( object, diff --git a/R/write_h5ad_helpers.R b/R/write_h5ad_helpers.R index 60e70dd9..cf9a8174 100644 --- a/R/write_h5ad_helpers.R +++ b/R/write_h5ad_helpers.R @@ -26,11 +26,6 @@ write_h5ad_element <- function( ...) 
{ compression <- match.arg(compression) - # Delete the path if it already exists - if (hdf5_path_exists(file, name)) { - rhdf5::h5delete(file, name) - } - # Sparse matrices write_fun <- if (inherits(value, "sparseMatrix")) { # Sparse matrices @@ -70,6 +65,11 @@ write_h5ad_element <- function( stop("Writing '", class(value), "' objects to H5AD files is not supported") } + # Delete the path if it already exists + if (hdf5_path_exists(file, name)) { + rhdf5::h5delete(file, name) + } + tryCatch( { write_fun( @@ -147,7 +147,8 @@ write_h5ad_attributes <- function(file, name, attributes, is_scalar = TRUE) { # } else { scalar_value <- attr_name %in% is_scalar rhdf5::h5writeAttribute( - attr_value, h5obj, attr_name, asScalar = scalar_value + attr_value, h5obj, attr_name, + asScalar = scalar_value ) # nolint } } @@ -224,8 +225,6 @@ write_h5ad_dense_array <- function(value, file, name, compression, version = "0. } if (!is.vector(value)) { - # Transpose the value because writing with native=TRUE does not - # seem to work as expected value <- t(value) } @@ -421,10 +420,10 @@ write_h5ad_categorical <- function(value, file, name, compression, version = "0. categories <- levels(value) # Use zero-indexed values - codes <- as.integer(value) - 1 + codes <- as.integer(value) - 1L # Set missing values to -1 - codes[is.na(codes)] <- -1 + codes[is.na(codes)] <- -1L # write values to file hdf5_write_compressed(file, paste0(name, "/categories"), categories, compression) @@ -604,19 +603,19 @@ write_h5ad_data_frame_index <- function(value, file, name, compression, index_na #' @noRd #' #' @param file Path to the H5AD file to write -#' @param obs_names Vector containing observation names -#' @param var_names Vector containing variable names +#' @param obs Data frame with observations +#' @param var Data frame with variables #' @param compression The compression to use when writing the element. Can be #' one of `"none"`, `"gzip"` or `"lzf"`. Defaults to `"none"`. 
#' @param version The H5AD version to write -write_empty_h5ad <- function(file, obs_names, var_names, compression, version = "0.1.0") { +write_empty_h5ad <- function(file, obs, var, compression, version = "0.1.0") { h5file <- rhdf5::H5Fcreate(file) rhdf5::H5Fclose(h5file) write_h5ad_encoding(file, "/", "anndata", "0.1.0") - write_h5ad_element(data.frame(row.names = obs_names), file, "/obs", compression) - write_h5ad_element(data.frame(row.names = var_names), file, "/var", compression) + write_h5ad_element(obs, file, "/obs", compression) + write_h5ad_element(var, file, "/var", compression) rhdf5::h5createGroup(file, "layers") write_h5ad_encoding(file, "/layers", "dict", "0.1.0") @@ -676,7 +675,6 @@ hdf5_path_exists <- function(file, target_path) { #' #' @return Whether the `path` exists in `file` hdf5_write_compressed <- function(file, name, value, compression = c("none", "gzip", "lzf")) { - compression <- match.arg(compression) if (!is.null(dim(value))) { diff --git a/man/AnnData.Rd b/man/AnnData.Rd index ba1d44c9..8a2bd37e 100644 --- a/man/AnnData.Rd +++ b/man/AnnData.Rd @@ -2,11 +2,9 @@ % Please edit documentation in R/AnnData.R \name{AnnData} \alias{AnnData} -\title{An in-memory AnnData object} +\title{An AnnData object} \usage{ AnnData( - obs_names = NULL, - var_names = NULL, X = NULL, obs = NULL, var = NULL, @@ -15,20 +13,11 @@ AnnData( varm = NULL, obsp = NULL, varp = NULL, - uns = NULL + uns = NULL, + shape = shape ) } \arguments{ -\item{obs_names}{A vector of unique identifiers -used to identify each row of \code{obs} and to act as an index into the -observation dimension of the AnnData object. The length of \code{obs_names} -defines the observation dimension of the AnnData object.} - -\item{var_names}{A vector of unique identifiers used to identify each row -of \code{var} and to act as an index into the variable dimension of the -AnnData object. 
The length of \code{var_names} defines the variable -dimension of the AnnData object.} - \item{X}{Either \code{NULL} or a observation × variable matrix with dimensions consistent with \code{obs} and \code{var}.} @@ -62,28 +51,91 @@ element is a sparse matrix where each dimension has length \code{n_vars}.} \item{uns}{The uns slot is used to store unstructured annotation. It must be either \code{NULL} or a named list.} + +\item{shape}{Shape tuple (#observations, #variables). Can be provided +if \code{X} or \code{obs} and \code{var} are not provided.} +} +\value{ +An \link{AbstractAnnData} object. } \description{ -This class is used to represent an AnnData object in memory. -AnnData stores a data matrix \code{X} together with annotations of -observations \code{obs} (\code{obsm}, \code{obsp}), variables \code{var} (\code{varm}, \code{varp}), and -unstructured annotations \code{uns}. +An AnnData object. This class can either be an in-memory +AnnData (InMemoryAnnData) or an HDF5-backed AnnData (HDF5AnnData). The +AnnData object stores a data matrix \code{X} together with annotations of +observations \code{obs} (\code{obsm}, \code{obsp}) and variables \code{var} (\code{varm}, \code{varp}). +Additional layers of data can be stored in \code{layers} and unstructured +annotations in \code{uns}. +} +\section{Functions that can be used to create AnnData objects}{ + +\itemize{ +\item \code{\link[=AnnData]{AnnData()}}: Create an in-memory AnnData object. +\item \code{\link[=read_h5ad]{read_h5ad()}}: Read an HDF5-backed AnnData file from disk. +\item \code{\link[=from_SingleCellExperiment]{from_SingleCellExperiment()}}: Convert a SingleCellExperiment object to an AnnData object. +\item \code{\link[=from_Seurat]{from_Seurat()}}: Convert a Seurat object to an AnnData object. +} +} + +\section{Slots}{ + +\itemize{ +\item \code{X}: A matrix of observations by variables. +\item \code{obs}: A data frame of observations. +\item \code{var}: A data frame of variables. 
+\item \code{obs_names}: Names of observations (alias for \code{rownames(obs)}). +\item \code{var_names}: Names of variables (alias for \code{rownames(var)}). +\item \code{layers}: A named list of matrices with the same dimensions as \code{X}. +\item \code{obsm}: A named list of matrices with the same number of rows as \code{obs}. +\item \code{varm}: A named list of matrices with the same number of rows as \code{var}. +\item \code{obsp}: A named list of sparse matrices with the same number of rows and columns as the number of observations. +\item \code{varp}: A named list of sparse matrices with the same number of rows and columns as the number of variables. +\item \code{uns}: A named list of unstructured annotations. +} +} + +\section{Methods}{ + +\itemize{ +\item \code{print()}: Print a summary of the AnnData object. +\item \code{shape()}: Dimensions (observations x variables) of the AnnData object. +\item \code{n_obs()}: Number of observations in the AnnData object. +\item \code{n_vars()}: Number of variables in the AnnData object. +\item \code{obs_keys()}: Column names of \code{obs}. +\item \code{var_keys()}: Column names of \code{var}. +\item \code{layers_keys()}: Element names of \code{layers}. +\item \code{obsm_keys()}: Element names of \code{obsm}. +\item \code{varm_keys()}: Element names of \code{varm}. +\item \code{obsp_keys()}: Element names of \code{obsp}. +\item \code{varp_keys()}: Element names of \code{varp}. +} +} + +\section{Conversion methods}{ -To read an AnnData file from disk, use \code{\link[=read_h5ad]{read_h5ad()}} instead. +\itemize{ +\item \code{to_SingleCellExperiment()}: Convert to SingleCellExperiment. +\item \code{to_Seurat()}: Convert to Seurat. +\item \code{to_InMemoryAnnData()}: Convert to an InMemory AnnData. +\item \code{to_HDF5AnnData()}: Convert to an HDF5 Backed AnnData. 
+} } + \examples{ adata <- AnnData( - obs_names = paste0("obs", 1:3), - var_names = paste0("var", 1:4), X = matrix(1:12, nrow = 3, ncol = 4), obs = data.frame( + row.names = paste0("obs", 1:3), n_counts = c(1, 2, 3), n_cells = c(1, 2, 3) ), var = data.frame( + row.names = paste0("var", 1:4), n_cells = c(1, 2, 3, 4) ) ) adata } +\seealso{ +\link{AbstractAnnData} +} diff --git a/man/HDF5AnnData.Rd b/man/HDF5AnnData.Rd index 33e9142f..5ef193ae 100644 --- a/man/HDF5AnnData.Rd +++ b/man/HDF5AnnData.Rd @@ -78,8 +78,6 @@ HDF5AnnData constructor \subsection{Usage}{ \if{html}{\out{