Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: update to anndata 0.11 and memory efficient reads + writes #1152

Merged
merged 20 commits into from
Jan 10, 2025
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
d9f6afc
feat: update to anndata 0.11 and memory efficient reads
nayib-jose-gloria Dec 6, 2024
0189f7e
use dask for memory-efficient writes
nayib-jose-gloria Dec 10, 2024
90441e7
remove TODOs for features anndata is not planning to support
nayib-jose-gloria Dec 10, 2024
743abd7
Merge branch 'main' into nayib/anndata-0-11
nayib-jose-gloria Dec 11, 2024
9e2ef74
use dask arrays
nayib-jose-gloria Dec 11, 2024
6a18d94
revert local changes
nayib-jose-gloria Dec 11, 2024
eec787b
replace custom chunk_matrix function with dask built-in chunking
nayib-jose-gloria Dec 16, 2024
ce7ac18
Merge branch 'main' into nayib/anndata-0-11
nayib-jose-gloria Dec 16, 2024
26abac4
use single-threaded
nayib-jose-gloria Dec 17, 2024
1fdeebe
filter columns in matrix outside of count_matrix_nonzero + fix test f…
nayib-jose-gloria Dec 18, 2024
f33e3c3
Merge branch 'main' into nayib/anndata-0-11
nayib-jose-gloria Dec 18, 2024
fa25f40
chunk dense arrays in same chunk_size as sparse arrays
nayib-jose-gloria Dec 20, 2024
2df0e0d
don't accidentally dask process metadata arrays with X in their name
nayib-jose-gloria Dec 21, 2024
6dddf18
don't pick up embeddings with 'layers' in the name
nayib-jose-gloria Dec 23, 2024
b8c8c29
leverage implicit 0 requirement in X matrix
nayib-jose-gloria Dec 23, 2024
4e20f61
configurable chunk_size in read_h5ad
nayib-jose-gloria Dec 23, 2024
0badeb7
lower default chunk_size + make number of parallel workers configurab…
nayib-jose-gloria Dec 23, 2024
eef37a9
pin below scipy 1.15
nayib-jose-gloria Jan 7, 2025
d580f8b
Merge branch 'main' into nayib/anndata-0-11
nayib-jose-gloria Jan 8, 2025
bde4829
fix requirements + unit test
nayib-jose-gloria Jan 8, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 42 additions & 12 deletions cellxgene_schema_cli/cellxgene_schema/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
from typing import Dict, List, Union

import anndata as ad
import h5py
import numpy as np
from anndata.compat import DaskArray
from anndata.experimental import read_dispatched, read_elem_as_dask
from cellxgene_ontology_guide.ontology_parser import OntologyParser
from scipy import sparse
from xxhash import xxh3_64_intdigest
Expand Down Expand Up @@ -68,7 +71,7 @@
return adata


def get_matrix_format(adata: ad.AnnData, matrix: Union[np.ndarray, sparse.spmatrix]) -> str:
def get_matrix_format(matrix: DaskArray) -> str:
"""
Given a matrix, returns the format as one of: csc, csr, coo, dense
or unknown.
Expand All @@ -84,15 +87,11 @@
# >>> return getattr(matrix, "format_str", "dense")
#
matrix_format = "unknown"
if adata.n_obs == 0 or adata.n_vars == 0:
matrix_slice = matrix[0:1, 0:1].compute()
if isinstance(matrix_slice, sparse.spmatrix):
matrix_format = matrix_slice.format
elif isinstance(matrix_slice, np.ndarray):
matrix_format = "dense"
else:
matrix_slice = matrix[0:1, 0:1]
if isinstance(matrix_slice, sparse.spmatrix):
matrix_format = matrix_slice.format
elif isinstance(matrix_slice, np.ndarray):
matrix_format = "dense"

assert matrix_format in ["unknown", "csr", "csc", "coo", "dense"]
return matrix_format

Expand All @@ -116,6 +115,36 @@
return getattr(adata, attr)


def read_backed(f: h5py.File, chunk_size: int = 10_000) -> ad.AnnData:
    """
    Read an AnnData object from a h5py.File object, reading in matrices (dense or sparse) as dask arrays. Does not
    read full matrices into memory.

    Only the expression matrices -- "/X", "/raw/X" and the entries stored under "/layers/" -- are read as dask
    arrays. Every other element is read with the reader function that read_dispatched hands to the callback.

    :param f: h5py.File object
    :param chunk_size: number of rows per chunk when reading sparse matrices
    :return: ad.AnnData object
    """

    def is_expression_matrix(elem_name: str) -> bool:
        # Match exact paths instead of substrings: a plain `"X" in elem_name` / `"layers" in elem_name` test also
        # catches unrelated metadata arrays or embeddings that merely have "X" or "layers" somewhere in their name.
        # Assumes elem_name is the element's absolute path inside the file (e.g. "/obsm/X_pca") -- TODO confirm
        # against the anndata read_dispatched documentation.
        return elem_name in ("/X", "/raw/X") or elem_name.startswith("/layers/")

    def callback(func, elem_name: str, elem, iospec):
        if not is_expression_matrix(elem_name):
            return func(elem)

        if iospec.encoding_type in ("csr_matrix", "csc_matrix"):
            # Chunk along rows only; each chunk spans every column.
            # NOTE(review): confirm that read_elem_as_dask accepts row-wise chunks for csc_matrix input.
            n_vars = elem.attrs.get("shape")[1]
            return read_elem_as_dask(elem, chunks=(chunk_size, n_vars))

        if iospec.encoding_type == "array" and len(elem.shape) > 1:
            # NOTE(review): Codecov reported this dense-array branch as not covered by tests.
            return read_elem_as_dask(elem)

        # Anything else at a matrix path (e.g. the "/layers" mapping itself, 1-D arrays) is read normally.
        return func(elem)

    return read_dispatched(f, callback=callback)


def read_h5ad(h5ad_path: Union[str, bytes, os.PathLike]) -> ad.AnnData:
"""
Reads h5ad into adata
Expand All @@ -124,13 +153,14 @@
:rtype: ad.AnnData
"""
try:
adata = ad.read_h5ad(h5ad_path, backed="r")
f = h5py.File(h5ad_path)
adata = read_backed(f)

# This code, and AnnData in general, is optimized for row access.
# Running backed, with CSC, is prohibitively slow. Read the entire
# AnnData into memory if it is CSC.
if (get_matrix_format(adata, adata.X) == "csc") or (
(adata.raw is not None) and (get_matrix_format(adata, adata.raw.X) == "csc")
if (get_matrix_format(adata.X) == "csc") or (
(adata.raw is not None) and (get_matrix_format(adata.raw.X) == "csc")
):
logger.warning("Matrices are in CSC format; loading entire dataset into memory.")
adata = adata.to_memory()
Expand Down
Loading
Loading