diff --git a/docs/api.md b/docs/api.md index 479fe346c7..656a6d1192 100644 --- a/docs/api.md +++ b/docs/api.md @@ -436,6 +436,26 @@ Collections of useful measurements for evaluating results. ``` +## Experimental + +```{eval-rst} +.. module:: scanpy.experimental +.. currentmodule:: scanpy +``` + +New methods that are in early development and are not yet +integrated into Scanpy core. + +```{eval-rst} +.. autosummary:: + :toctree: generated/ + + experimental.pp.normalize_pearson_residuals + experimental.pp.normalize_pearson_residuals_pca + experimental.pp.highly_variable_genes + experimental.pp.recipe_pearson_residuals +``` + ## Classes {class}`~anndata.AnnData` is reexported from {mod}`anndata`. diff --git a/docs/references.rst b/docs/references.rst index 6d05613328..7c09607f44 100644 --- a/docs/references.rst +++ b/docs/references.rst @@ -119,6 +119,10 @@ References *Laplacian Dynamics and Multiscale Modular Structure in Networks* `arXiv `__. +.. [Lause21] Lause *et al.* (2021) + *Analytic Pearson residuals for normalization of single-cell RNA-seq UMI data*, + `Genome Biology `__. + .. [Leek12] Leek *et al.* (2012), *sva: Surrogate Variable Analysis. R package* `Bioconductor `__. diff --git a/docs/release-notes/1.9.0.md b/docs/release-notes/1.9.0.md index 8a43cf1aff..52c33c42e6 100644 --- a/docs/release-notes/1.9.0.md +++ b/docs/release-notes/1.9.0.md @@ -8,3 +8,14 @@ - {func}`~scanpy.logging.print_versions` now uses `session_info` {pr}`2089` {smaller}`P Angerer` {smaller}`I Virshup` - `_choose_representation` now subsets the provided representation to n_pcs, regardless of the name of the provided representation (should affect mostly {func}`~scanpy.pp.neighbors`) {pr}`2179` {smaller}`I Virshup` {smaller}`PG Majev` - Embedding plots now have a `dimensions` argument, which lets users select which dimensions of their embedding to plot and uses the same broadcasting rules as other arguments {pr}`1538` {smaller}`I Virshup` + +```{rubric} Experimental module +``` + +- Added {mod}`scanpy.experimental` module! + + - Added {func}`scanpy.experimental.pp.normalize_pearson_residuals` for Pearson Residuals normalization {pr}`1715` {smaller}`J Lause, G Palla, I Virshup` + - Added {func}`scanpy.experimental.pp.normalize_pearson_residuals_pca` for Pearson Residuals normalization and dimensionality reduction with PCA {pr}`1715` {smaller}`J Lause, G Palla, I Virshup` + - Added {func}`scanpy.experimental.pp.highly_variable_genes` for HVG selection with Pearson Residuals {pr}`1715` {smaller}`J Lause, G Palla, I Virshup` + - Added {func}`scanpy.experimental.pp.recipe_pearson_residuals` for Pearson Residuals normalization, HVG selection and dimensionality reduction with PCA {pr}`1715` {smaller}`J Lause, G Palla, I Virshup` diff --git a/docs/tutorials.md b/docs/tutorials.md index 96600ba9f6..26b6998c54 100644 --- a/docs/tutorials.md +++ b/docs/tutorials.md @@ -94,6 +94,10 @@ See the [cell cycle] notebook. :width: 120px ``` +### Normalization with Pearson Residuals + +Normalization of scRNA-seq data with Pearson Residuals, from [^cite_lause21]: {tutorial}`tutorial_pearson_residuals` + ### Scaling Computations - Visualize and cluster [1.3M neurons] from 10x Genomics. diff --git a/scanpy/__init__.py b/scanpy/__init__.py index 37a05fdf98..ad7022c75e 100644 --- a/scanpy/__init__.py +++ b/scanpy/__init__.py @@ -14,7 +14,7 @@ from . 
import tools as tl from . import preprocessing as pp from . import plotting as pl - from . import datasets, logging, queries, external, get, metrics + from . import datasets, logging, queries, external, get, metrics, experimental from anndata import AnnData, concat from anndata import ( diff --git a/scanpy/experimental/__init__.py b/scanpy/experimental/__init__.py new file mode 100644 index 0000000000..8a00c90df0 --- /dev/null +++ b/scanpy/experimental/__init__.py @@ -0,0 +1 @@ +from . import pp diff --git a/scanpy/experimental/_docs.py b/scanpy/experimental/_docs.py new file mode 100644 index 0000000000..a1408adf01 --- /dev/null +++ b/scanpy/experimental/_docs.py @@ -0,0 +1,79 @@ +"""Shared docstrings for experimental function parameters. +""" + +doc_adata = """\ +adata + The annotated data matrix of shape `n_obs` × `n_vars`. + Rows correspond to cells and columns to genes. +""" + +doc_dist_params = """\ +theta + The negative binomial overdispersion parameter `theta` for Pearson residuals. + Higher values correspond to less overdispersion \ + (`var = mean + mean^2/theta`), and `theta=np.Inf` corresponds to a Poisson model. +clip + Determines if and how residuals are clipped: + + * If `None`, residuals are clipped to the interval \ + `[-sqrt(n_obs), sqrt(n_obs)]`, where `n_obs` is the number of cells in the dataset (default behavior). + * If any scalar `c`, residuals are clipped to the interval `[-c, c]`. Set \ + `clip=np.Inf` for no clipping. +""" + +doc_check_values = """\ +check_values + If `True`, checks if counts in the selected layer are integers as expected by this + function, and raises a warning if non-integers are found. Otherwise, proceeds + without checking. Setting this to `False` can speed up code for large datasets. +""" + +doc_layer = """\ +layer + Layer to use as input instead of `X`. If `None`, `X` is used. +""" + +doc_subset = """\ +subset + Inplace subset to highly-variable genes if `True`; otherwise, merely indicate + highly variable genes. +""" + +doc_genes_batch_chunk = """\ +n_top_genes + Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or + `flavor='pearson_residuals'`. +batch_key + If specified, highly-variable genes are selected within each batch separately + and merged. This simple process avoids the selection of batch-specific genes + and acts as a lightweight batch correction method. Genes are first sorted by + the number of batches in which they are highly variable. If + `flavor='pearson_residuals'`, ties are broken by the median rank (across + batches) based on within-batch residual variance. +chunksize + If `flavor='pearson_residuals'`, this determines how many genes are processed at + once while computing the residual variance. Choosing a smaller value will reduce + the required memory. +""" + +doc_pca_chunk = """\ +n_comps + Number of principal components to compute in the PCA step. +random_state + Random seed for setting the initial states for the optimization in the PCA step. +kwargs_pca + Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`. +""" + +doc_inplace = """\ +inplace + If `True`, update `adata` with results. Otherwise, return results. See below for + details of what is returned. +""" + +doc_copy = """\ +copy + If `True`, the function runs on a copy of the input object and returns the + modified copy. Otherwise, the input object is modified directly. Not compatible + with `inplace=False`. 
+""" diff --git a/scanpy/experimental/pp/__init__.py b/scanpy/experimental/pp/__init__.py new file mode 100644 index 0000000000..a5eaf9d9c2 --- /dev/null +++ b/scanpy/experimental/pp/__init__.py @@ -0,0 +1,8 @@ +from scanpy.experimental.pp._normalization import ( + normalize_pearson_residuals, + normalize_pearson_residuals_pca, +) + +from scanpy.experimental.pp._highly_variable_genes import highly_variable_genes + +from scanpy.experimental.pp._recipes import recipe_pearson_residuals diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py new file mode 100644 index 0000000000..166b6e06df --- /dev/null +++ b/scanpy/experimental/pp/_highly_variable_genes.py @@ -0,0 +1,319 @@ +from multiprocessing.sharedctypes import Value +import warnings +from typing import Optional + +import numpy as np +import pandas as pd +import scipy.sparse as sp_sparse +from anndata import AnnData + + +from scanpy import logging as logg +from scanpy._settings import settings, Verbosity +from scanpy._utils import check_nonnegative_integers, view_to_actual +from scanpy.get import _get_obs_rep +from scanpy._compat import Literal +from scanpy._utils import _doc_params +from scanpy.preprocessing._utils import _get_mean_var +from scanpy.preprocessing._distributed import materialize_as_ndarray +from scanpy.preprocessing._simple import filter_genes +from scanpy.experimental._docs import ( + doc_adata, + doc_dist_params, + doc_genes_batch_chunk, + doc_check_values, + doc_layer, + doc_copy, + doc_inplace, +) + + +def _highly_variable_pearson_residuals( + adata: AnnData, + theta: float = 100, + clip: Optional[float] = None, + n_top_genes: int = 1000, + batch_key: Optional[str] = None, + chunksize: int = 1000, + check_values: bool = True, + layer: Optional[str] = None, + subset: bool = False, + inplace: bool = True, +) -> Optional[pd.DataFrame]: + """\ + See `scanpy.experimental.pp.highly_variable_genes`. + + Returns + ------- + If `inplace=True`, `adata.var` is updated with the following fields. Otherwise, + returns the same fields as :class:`~pandas.DataFrame`. + + highly_variable : bool + boolean indicator of highly-variable genes + means : float + means per gene + variances : float + variance per gene + residual_variances : float + Residual variance per gene. Averaged in the case of multiple batches. + highly_variable_rank : float + Rank of the gene according to residual variance, median rank in the case of multiple batches + highly_variable_nbatches : int + If `batch_key` given, denotes in how many batches genes are detected as HVG + highly_variable_intersection : bool + If `batch_key` given, denotes the genes that are highly variable in all batches + """ + + view_to_actual(adata) + X = _get_obs_rep(adata, layer=layer) + computed_on = layer if layer else 'adata.X' + + # Check for raw counts + if check_values and (check_nonnegative_integers(X) is False): + warnings.warn( + "`flavor='pearson_residuals'` expects raw count data, but non-integers were found.", + UserWarning, + ) + # check theta + if theta <= 0: + # TODO: would "underdispersion" with negative theta make sense? + # then only theta=0 were undefined.. 
+ raise ValueError('Pearson residuals require theta > 0') + # prepare clipping + + if batch_key is None: + batch_info = np.zeros(adata.shape[0], dtype=int) + else: + batch_info = adata.obs[batch_key].values + n_batches = len(np.unique(batch_info)) + + # Get pearson residuals for each batch separately + residual_gene_vars = [] + for batch in np.unique(batch_info): + + adata_subset_prefilter = adata[batch_info == batch] + X_batch_prefilter = _get_obs_rep(adata_subset_prefilter, layer=layer) + + # Filter out zero genes + with settings.verbosity.override(Verbosity.error): + nonzero_genes = np.ravel(X_batch_prefilter.sum(axis=0)) != 0 + adata_subset = adata_subset_prefilter[:, nonzero_genes] + X_batch = _get_obs_rep(adata_subset, layer=layer) + + # Prepare clipping + if clip is None: + n = X_batch.shape[0] + clip = np.sqrt(n) + if clip < 0: + raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.") + + if sp_sparse.issparse(X_batch): + sums_genes = np.sum(X_batch, axis=0) + sums_cells = np.sum(X_batch, axis=1) + sum_total = np.sum(sums_genes).squeeze() + else: + sums_genes = np.sum(X_batch, axis=0, keepdims=True) + sums_cells = np.sum(X_batch, axis=1, keepdims=True) + sum_total = np.sum(sums_genes) + + # Compute pearson residuals in chunks + residual_gene_var = np.empty((X_batch.shape[1])) + for start in np.arange(0, X_batch.shape[1], chunksize): + stop = start + chunksize + mu = np.array(sums_cells @ sums_genes[:, start:stop] / sum_total) + X_dense = X_batch[:, start:stop].toarray() + residuals = (X_dense - mu) / np.sqrt(mu + mu**2 / theta) + residuals = np.clip(residuals, a_min=-clip, a_max=clip) + residual_gene_var[start:stop] = np.var(residuals, axis=0) + + # Add 0 values for genes that were filtered out + unmasked_residual_gene_var = np.zeros(len(nonzero_genes)) + unmasked_residual_gene_var[nonzero_genes] = residual_gene_var + residual_gene_vars.append(unmasked_residual_gene_var.reshape(1, -1)) + + residual_gene_vars = np.concatenate(residual_gene_vars, axis=0) + + # Get rank per gene within each batch + # argsort twice gives ranks, small rank means most variable + ranks_residual_var = np.argsort(np.argsort(-residual_gene_vars, axis=1), axis=1) + ranks_residual_var = ranks_residual_var.astype(np.float32) + # count in how many batches a genes was among the n_top_genes + highly_variable_nbatches = np.sum( + (ranks_residual_var < n_top_genes).astype(int), axis=0 + ) + # set non-top genes within each batch to nan + ranks_residual_var[ranks_residual_var >= n_top_genes] = np.nan + ranks_masked_array = np.ma.masked_invalid(ranks_residual_var) + # Median rank across batches, ignoring batches in which gene was not selected + medianrank_residual_var = np.ma.median(ranks_masked_array, axis=0).filled(np.nan) + + means, variances = materialize_as_ndarray(_get_mean_var(X)) + df = pd.DataFrame.from_dict( + dict( + means=means, + variances=variances, + residual_variances=np.mean(residual_gene_vars, axis=0), + highly_variable_rank=medianrank_residual_var, + highly_variable_nbatches=highly_variable_nbatches.astype(np.int64), + highly_variable_intersection=highly_variable_nbatches == n_batches, + ) + ) + df = df.set_index(adata.var_names) + + # Sort genes by how often they selected as hvg within each batch and + # break ties with median rank of residual variance across batches + df.sort_values( + ['highly_variable_nbatches', 'highly_variable_rank'], + ascending=[False, True], + na_position='last', + inplace=True, + ) + + high_var = np.zeros(df.shape[0], dtype=bool) + high_var[:n_top_genes] = 
True + df['highly_variable'] = high_var + df = df.loc[adata.var_names, :] + + if inplace: + adata.uns['hvg'] = {'flavor': 'pearson_residuals', 'computed_on': computed_on} + logg.hint( + 'added\n' + ' \'highly_variable\', boolean vector (adata.var)\n' + ' \'highly_variable_rank\', float vector (adata.var)\n' + ' \'highly_variable_nbatches\', int vector (adata.var)\n' + ' \'highly_variable_intersection\', boolean vector (adata.var)\n' + ' \'means\', float vector (adata.var)\n' + ' \'variances\', float vector (adata.var)\n' + ' \'residual_variances\', float vector (adata.var)' + ) + adata.var['means'] = df['means'].values + adata.var['variances'] = df['variances'].values + adata.var['residual_variances'] = df['residual_variances'] + adata.var['highly_variable_rank'] = df['highly_variable_rank'].values + if batch_key is not None: + adata.var['highly_variable_nbatches'] = df[ + 'highly_variable_nbatches' + ].values + adata.var['highly_variable_intersection'] = df[ + 'highly_variable_intersection' + ].values + adata.var['highly_variable'] = df['highly_variable'].values + + if subset: + adata._inplace_subset_var(df['highly_variable'].values) + + else: + if batch_key is None: + df = df.drop( + ['highly_variable_nbatches', 'highly_variable_intersection'], axis=1 + ) + if subset: + df = df.iloc[df.highly_variable.values, :] + + return df + + +@_doc_params( + adata=doc_adata, + dist_params=doc_dist_params, + genes_batch_chunk=doc_genes_batch_chunk, + check_values=doc_check_values, + layer=doc_layer, + inplace=doc_inplace, +) +def highly_variable_genes( + adata: AnnData, + *, + theta: float = 100, + clip: Optional[float] = None, + n_top_genes: Optional[int] = None, + batch_key: Optional[str] = None, + chunksize: int = 1000, + flavor: Literal['pearson_residuals'] = 'pearson_residuals', + check_values: bool = True, + layer: Optional[str] = None, + subset: bool = False, + inplace: bool = True, +) -> Optional[pd.DataFrame]: + """\ + Select highly variable genes using analytic Pearson residuals [Lause21]_. + + In [Lause21]_, Pearson residuals of a negative binomial offset model are computed + (with overdispersion `theta` shared across genes). By default, overdispersion + `theta=100` is used and residuals are clipped to `sqrt(n_obs)`. Finally, genes + are ranked by residual variance. + + Expects raw count input. + + Parameters + ---------- + {adata} + {dist_params} + {genes_batch_chunk} + flavor + Choose the flavor for identifying highly variable genes. In this experimental + version, only 'pearson_residuals' is functional. + {check_values} + {layer} + subset + If `True`, subset the data to highly-variable genes after finding them. + Otherwise, merely indicate highly variable genes in `adata.var` (see below). + {inplace} + + Returns + ------- + If `inplace=True`, `adata.var` is updated with the following fields. Otherwise, + returns the same fields as :class:`~pandas.DataFrame`. + + highly_variable : bool + boolean indicator of highly-variable genes. + means : float + means per gene. + variances : float + variance per gene. + residual_variances : float + For `flavor='pearson_residuals'`, residual variance per gene. Averaged in the + case of multiple batches. + highly_variable_rank : float + For `flavor='pearson_residuals'`, rank of the gene according to residual + variance; median rank in the case of multiple batches. + highly_variable_nbatches : int + If `batch_key` given, denotes in how many batches genes are detected as HVG. 
+ highly_variable_intersection : bool + If `batch_key` given, denotes the genes that are highly variable in all batches. + + Notes + ----- + Experimental version of `sc.pp.highly_variable_genes()` + """ + + logg.info('extracting highly variable genes') + + if not isinstance(adata, AnnData): + raise ValueError( + '`pp.highly_variable_genes` expects an `AnnData` argument, ' + 'pass `inplace=False` if you want to return a `pd.DataFrame`.' + ) + + if flavor == 'pearson_residuals': + if n_top_genes is None: + raise ValueError( + "`pp.highly_variable_genes` requires the argument `n_top_genes`" + " for `flavor='pearson_residuals'`" + ) + return _highly_variable_pearson_residuals( + adata, + layer=layer, + n_top_genes=n_top_genes, + batch_key=batch_key, + theta=theta, + clip=clip, + chunksize=chunksize, + subset=subset, + check_values=check_values, + inplace=inplace, + ) + else: + raise ValueError( + "This is an experimental API and only `flavor=pearson_residuals` is available." + ) diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py new file mode 100644 index 0000000000..ee116c8df1 --- /dev/null +++ b/scanpy/experimental/pp/_normalization.py @@ -0,0 +1,250 @@ +from typing import Optional, Dict +from warnings import warn + +import numpy as np +import pandas as pd +from anndata import AnnData +from scipy.sparse import issparse + +from scanpy import logging as logg + +from scanpy._utils import view_to_actual, check_nonnegative_integers +from scanpy.get import _get_obs_rep, _set_obs_rep +from scanpy._utils import _doc_params +from scanpy.preprocessing._pca import pca +from scanpy.experimental._docs import ( + doc_adata, + doc_dist_params, + doc_layer, + doc_check_values, + doc_copy, + doc_inplace, + doc_pca_chunk, +) + + +def _pearson_residuals(X, theta, clip, check_values, copy=False): + + X = X.copy() if copy else X + + # check theta + if theta <= 0: + # TODO: would "underdispersion" with negative theta make sense? + # then only theta=0 were undefined.. + raise ValueError('Pearson residuals require theta > 0') + # prepare clipping + if clip is None: + n = X.shape[0] + clip = np.sqrt(n) + if clip < 0: + raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.") + + if check_values and not check_nonnegative_integers(X): + warn( + "`normalize_pearson_residuals()` expects raw count data, but non-integers were found.", + UserWarning, + ) + + if issparse(X): + sums_genes = np.sum(X, axis=0) + sums_cells = np.sum(X, axis=1) + sum_total = np.sum(sums_genes).squeeze() + else: + sums_genes = np.sum(X, axis=0, keepdims=True) + sums_cells = np.sum(X, axis=1, keepdims=True) + sum_total = np.sum(sums_genes) + + mu = np.array(sums_cells @ sums_genes / sum_total) + diff = np.array(X - mu) + residuals = diff / np.sqrt(mu + mu**2 / theta) + + # clip + residuals = np.clip(residuals, a_min=-clip, a_max=clip) + + return residuals + + +@_doc_params( + adata=doc_adata, + dist_params=doc_dist_params, + check_values=doc_check_values, + layer=doc_layer, + inplace=doc_inplace, + copy=doc_copy, +) +def normalize_pearson_residuals( + adata: AnnData, + *, + theta: float = 100, + clip: Optional[float] = None, + check_values: bool = True, + layer: Optional[str] = None, + inplace: bool = True, + copy: bool = False, +) -> Optional[Dict[str, np.ndarray]]: + """\ + Applies analytic Pearson residual normalization, based on [Lause21]_. + + The residuals are based on a negative binomial offset model with overdispersion + `theta` shared across genes. 
By default, residuals are clipped to `sqrt(n_obs)` + and overdispersion `theta=100` is used. + + Expects raw count input. + + Params + ------ + {adata} + {dist_params} + {check_values} + {layer} + {inplace} + {copy} + + Returns + ------- + If `inplace=True`, `adata.X` or the selected layer in `adata.layers` is updated + with the normalized values. `adata.uns` is updated with the following fields. + If `inplace=False`, the same fields are returned as a dictionary with the + normalized values in `results_dict['X']`. + + `.uns['pearson_residuals_normalization']['theta']` + The used value of the overdispersion parameter theta. + `.uns['pearson_residuals_normalization']['clip']` + The used value of the clipping parameter. + `.uns['pearson_residuals_normalization']['computed_on']` + The name of the layer on which the residuals were computed. + """ + + if copy: + if not inplace: + raise ValueError("`copy=True` cannot be used with `inplace=False`.") + adata = adata.copy() + + view_to_actual(adata) + X = _get_obs_rep(adata, layer=layer) + computed_on = layer if layer else 'adata.X' + + msg = f'computing analytic Pearson residuals on {computed_on}' + start = logg.info(msg) + + residuals = _pearson_residuals(X, theta, clip, check_values, copy=not inplace) + settings_dict = dict(theta=theta, clip=clip, computed_on=computed_on) + + if inplace: + _set_obs_rep(adata, residuals, layer=layer) + adata.uns['pearson_residuals_normalization'] = settings_dict + else: + results_dict = dict(X=residuals, **settings_dict) + + logg.info(' finished ({time_passed})', time=start) + + if copy: + return adata + elif not inplace: + return results_dict + + +@_doc_params( + adata=doc_adata, + dist_params=doc_dist_params, + pca_chunk=doc_pca_chunk, + check_values=doc_check_values, + inplace=doc_inplace, +) +def normalize_pearson_residuals_pca( + adata: AnnData, + *, + theta: float = 100, + clip: Optional[float] = None, + n_comps: Optional[int] = 50, + random_state: Optional[float] = 0, + kwargs_pca: Optional[dict] = {}, + use_highly_variable: Optional[bool] = None, + check_values: bool = True, + inplace: bool = True, +) -> Optional[AnnData]: + """\ + Applies analytic Pearson residual normalization and PCA, based on [Lause21]_. + + The residuals are based on a negative binomial offset model with overdispersion + `theta` shared across genes. By default, residuals are clipped to `sqrt(n_obs)`, + overdispersion `theta=100` is used, and PCA is run with 50 components. + + Operates on the subset of highly variable genes in `adata.var['highly_variable']` + by default. Expects raw count input. + + Params + ------ + {adata} + {dist_params} + {pca_chunk} + use_highly_variable + If `True`, uses gene selection present in `adata.var['highly_variable']` to + subset the data before normalizing (default). Otherwise, proceed on the full + dataset. + {check_values} + {inplace} + + Returns + ------- + If `inplace=False`, returns the Pearson residual-based PCA results (as :class:`~anndata.AnnData` + object). If `inplace=True`, updates `adata` with the following fields: + + `.uns['pearson_residuals_normalization']['pearson_residuals_df']` + The subset of highly variable genes, normalized by Pearson residuals. + `.uns['pearson_residuals_normalization']['theta']` + The used value of the overdispersion parameter theta. + `.uns['pearson_residuals_normalization']['clip']` + The used value of the clipping parameter. + + `.obsm['X_pca']` + PCA representation of data after gene selection (if applicable) and Pearson + residual normalization. 
+ `.varm['PCs']` + The principal components containing the loadings. When `inplace=True` and + `use_highly_variable=True`, this will contain empty rows for the genes not + selected. + `.uns['pca']['variance_ratio']` + Ratio of explained variance. + `.uns['pca']['variance']` + Explained variance, equivalent to the eigenvalues of the covariance matrix. + """ + + # check if HVG selection is there if user wants to use it + if use_highly_variable and 'highly_variable' not in adata.var_keys(): + raise ValueError( + "You passed `use_highly_variable=True`, but no HVG selection was found " + "(e.g., there was no 'highly_variable' column in adata.var).'" + ) + + # default behavior: if there is a HVG selection, we will use it + if use_highly_variable is None and 'highly_variable' in adata.var_keys(): + use_highly_variable = True + + if use_highly_variable: + adata_sub = adata[:, adata.var['highly_variable']].copy() + adata_pca = AnnData( + adata_sub.X.copy(), obs=adata_sub.obs[[]], var=adata_sub.var[[]] + ) + else: + adata_pca = AnnData(adata.X.copy(), obs=adata.obs[[]], var=adata.var[[]]) + + normalize_pearson_residuals( + adata_pca, theta=theta, clip=clip, check_values=check_values + ) + pca(adata_pca, n_comps=n_comps, random_state=random_state, **kwargs_pca) + + if inplace: + norm_settings = adata_pca.uns['pearson_residuals_normalization'] + norm_dict = dict(**norm_settings, pearson_residuals_df=adata_pca.to_df()) + if use_highly_variable: + adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps)) + adata.varm['PCs'][adata.var['highly_variable']] = adata_pca.varm['PCs'] + else: + adata.varm['PCs'] = adata_pca.varm['PCs'] + adata.uns['pca'] = adata_pca.uns['pca'] + adata.uns['pearson_residuals_normalization'] = norm_dict + adata.obsm['X_pca'] = adata_pca.obsm['X_pca'] + return None + else: + return adata_pca diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py new file mode 100644 index 0000000000..5aba49345a --- /dev/null +++ b/scanpy/experimental/pp/_recipes.py @@ -0,0 +1,143 @@ +from typing import Optional, Tuple +from anndata import AnnData +import pandas as pd +import numpy as np +from scanpy import experimental +from scanpy.preprocessing import pca +from scanpy.experimental._docs import ( + doc_adata, + doc_dist_params, + doc_genes_batch_chunk, + doc_pca_chunk, + doc_layer, + doc_check_values, + doc_inplace, +) +from scanpy._utils import _doc_params + + +@_doc_params( + adata=doc_adata, + dist_params=doc_dist_params, + genes_batch_chunk=doc_genes_batch_chunk, + pca_chunk=doc_pca_chunk, + check_values=doc_check_values, + inplace=doc_inplace, +) +def recipe_pearson_residuals( + adata: AnnData, + *, + theta: float = 100, + clip: Optional[float] = None, + n_top_genes: int = 1000, + batch_key: Optional[str] = None, + chunksize: int = 1000, + n_comps: Optional[int] = 50, + random_state: Optional[float] = 0, + kwargs_pca: dict = {}, + check_values: bool = True, + inplace: bool = True, +) -> Optional[Tuple[AnnData, pd.DataFrame]]: + """\ + Full pipeline for HVG selection and normalization by analytic Pearson residuals ([Lause21]_). + + Applies gene selection based on Pearson residuals. On the resulting subset, + Pearson residual normalization and PCA are performed. + + Expects raw count input. 
+ + Params + ------ + {adata} + {dist_params} + {genes_batch_chunk} + {pca_chunk} + {check_values} + {inplace} + + Returns + ------- + If `inplace=False`, separately returns the gene selection results (as + :class:`~pandas.DataFrame`) and Pearson residual-based PCA results (as + :class:`~anndata.AnnData`). If `inplace=True`, updates `adata` with the + following fields for gene selection results: + + `.var['highly_variable']` : bool + boolean indicator of highly-variable genes. + `.var['means']` : float + means per gene. + `.var['variances']` : float + variances per gene. + `.var['residual_variances']` : float + Pearson residual variance per gene. Averaged in the case of multiple + batches. + `.var['highly_variable_rank']` : float + Rank of the gene according to residual variance, median rank in the + case of multiple batches. + `.var['highly_variable_nbatches']` : int + If batch_key is given, this denotes in how many batches genes are + detected as HVG. + `.var['highly_variable_intersection']` : bool + If batch_key is given, this denotes the genes that are highly variable + in all batches. + + The following fields contain Pearson residual-based PCA results and + normalization settings: + + `.uns['pearson_residuals_normalization']['pearson_residuals_df']` + The subset of highly variable genes, normalized by Pearson residuals. + `.uns['pearson_residuals_normalization']['theta']` + The used value of the overdisperion parameter theta. + `.uns['pearson_residuals_normalization']['clip']` + The used value of the clipping parameter. + + `.obsm['X_pca']` + PCA representation of data after gene selection and Pearson residual + normalization. + `.varm['PCs']` + The principal components containing the loadings. When `inplace=True` this + will contain empty rows for the genes not selected during HVG selection. + `.uns['pca']['variance_ratio']` + Ratio of explained variance. + `.uns['pca']['variance']` + Explained variance, equivalent to the eigenvalues of the covariance matrix. + """ + + hvg_args = dict( + flavor='pearson_residuals', + n_top_genes=n_top_genes, + batch_key=batch_key, + theta=theta, + clip=clip, + chunksize=chunksize, + check_values=check_values, + ) + + if inplace: + experimental.pp.highly_variable_genes(adata, **hvg_args, inplace=True) + # TODO: are these copies needed? + adata_pca = adata[:, adata.var['highly_variable']].copy() + else: + hvg = experimental.pp.highly_variable_genes(adata, **hvg_args, inplace=False) + # TODO: are these copies needed? 
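+ # hvg['highly_variable'] is the boolean gene selection returned above; the residual
+ # normalization and PCA below are then run on this HVG subset only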
+ adata_pca = adata[:, hvg['highly_variable']].copy() + + experimental.pp.normalize_pearson_residuals( + adata_pca, theta=theta, clip=clip, check_values=check_values + ) + pca(adata_pca, n_comps=n_comps, random_state=random_state, **kwargs_pca) + + if inplace: + normalization_param = adata_pca.uns['pearson_residuals_normalization'] + normalization_dict = dict( + **normalization_param, pearson_residuals_df=adata_pca.to_df() + ) + + adata.uns['pca'] = adata_pca.uns['pca'] + adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps)) + adata.varm['PCs'][adata.var['highly_variable']] = adata_pca.varm['PCs'] + adata.uns['pearson_residuals_normalization'] = normalization_dict + adata.obsm['X_pca'] = adata_pca.obsm['X_pca'] + return None + else: + return adata_pca, hvg diff --git a/scanpy/preprocessing/__init__.py b/scanpy/preprocessing/__init__.py index 7c2c4d7aca..b811a89cf0 100644 --- a/scanpy/preprocessing/__init__.py +++ b/scanpy/preprocessing/__init__.py @@ -8,5 +8,4 @@ from ._qc import calculate_qc_metrics from ._combat import combat from ._normalization import normalize_total - from ..neighbors import neighbors diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 19d21aad8f..7db4a098c3 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -33,20 +33,20 @@ def _highly_variable_genes_seurat_v3( Returns ------- Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) or - updates `.var` with the following fields + updates `.var` with the following fields: highly_variable : bool - boolean indicator of highly-variable genes + boolean indicator of highly-variable genes. **means** - means per gene + means per gene. **variances** - variance per gene + variance per gene. **variances_norm** - normalized variance per gene, averaged in the case of multiple batches + normalized variance per gene, averaged in the case of multiple batches. highly_variable_rank : float - Rank of the gene according to normalized variance, median rank in the case of multiple batches + Rank of the gene according to normalized variance, median rank in the case of multiple batches. highly_variable_nbatches : int - If batch_key is given, this denotes in how many batches genes are detected as HVG + If batch_key is given, this denotes in how many batches genes are detected as HVG. """ try: @@ -305,8 +305,8 @@ def highly_variable_genes( """\ Annotate highly variable genes [Satija15]_ [Zheng17]_ [Stuart19]_. - Expects logarithmized data, except when `flavor='seurat_v3'` in which - count data is expected. + Expects logarithmized data, except when `flavor='seurat_v3'`, in which count + data is expected. Depending on `flavor`, this reproduces the R-implementations of Seurat [Satija15]_, Cell Ranger [Zheng17]_, and Seurat v3 [Stuart19]_. @@ -322,6 +322,9 @@ standard deviation. Next, the normalized variance is computed as the variance of each gene after the transformation. Genes are ranked by the normalized variance. + See also `scanpy.experimental.pp.highly_variable_genes` for additional flavors + (e.g. Pearson residuals). 
+ Parameters ---------- adata @@ -506,9 +509,10 @@ def highly_variable_genes( na_position='last', inplace=True, ) - df['highly_variable'] = False - df.highly_variable.iloc[:n_top_genes] = True - df = df.loc[adata.var_names] + high_var = np.zeros(df.shape[0]) + high_var[:n_top_genes] = True + df['highly_variable'] = high_var.astype(bool) + df = df.loc[adata.var_names, :] else: df = df.loc[adata.var_names] dispersion_norm = df.dispersions_norm.values diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index aa88fdef2d..65fb79c844 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -11,9 +11,10 @@ except ImportError: DaskArray = None -from .. import logging as logg -from .._compat import Literal -from .._utils import view_to_actual +from scanpy import logging as logg +from scanpy._compat import Literal +from scanpy._utils import view_to_actual + from scanpy.get import _get_obs_rep, _set_obs_rep diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py index d211bcc20a..6abc04ed74 100644 --- a/scanpy/preprocessing/_recipes.py +++ b/scanpy/preprocessing/_recipes.py @@ -3,6 +3,7 @@ from anndata import AnnData + from .. import preprocessing as pp from ._deprecated.highly_variable_genes import ( filter_genes_dispersion, diff --git a/scanpy/tests/_data/_cached_datasets.py b/scanpy/tests/_data/_cached_datasets.py new file mode 100644 index 0000000000..f66bece206 --- /dev/null +++ b/scanpy/tests/_data/_cached_datasets.py @@ -0,0 +1,21 @@ +from functools import wraps +import scanpy as sc + + +def cached_dataset(func): + store = [] + + @wraps(func) + def wrapper(): + if len(store) < 1: + store.append(func()) + return store[0].copy() + + return wrapper + + +pbmc3k = cached_dataset(sc.datasets.pbmc3k) +pbmc68k_reduced = cached_dataset(sc.datasets.pbmc68k_reduced) +pbmc3k_processed = cached_dataset(sc.datasets.pbmc3k_processed) +krumsiek11 = cached_dataset(sc.datasets.krumsiek11) +paul15 = cached_dataset(sc.datasets.paul15) diff --git a/scanpy/tests/helpers.py b/scanpy/tests/helpers.py index 61fc35e23e..9bae484355 100644 --- a/scanpy/tests/helpers.py +++ b/scanpy/tests/helpers.py @@ -6,8 +6,10 @@ import scanpy as sc import numpy as np - +import warnings +import pytest from anndata.tests.helpers import asarray, assert_equal +from scanpy.tests._data._cached_datasets import pbmc3k # TODO: Report more context on the fields being compared on error # TODO: Allow specifying paths to ignore on comparison @@ -83,3 +85,43 @@ def check_rep_results(func, X, *, fields=["layer", "obsm"], **kwargs): assert_equal(adatas_proc[field_a], adatas_proc[field_b]) for field in fields: assert_equal(adata_X, adatas_proc[field]) + + +def _prepare_pbmc_testdata(sparsity_func, dtype, small=False): + """Prepares 3k PBMC dataset with batch key `batch` and defined datatype/sparsity. + + Params + ------ + sparsity_func + sparsity function applied to adata.X (e.g. csr_matrix.toarray for dense or csr_matrix for sparse) + dtype + numpy dtype applied to adata.X (e.g. 
'float32' or 'int64') + small + False (default) returns full data, True returns small subset of the data.""" + + adata = pbmc3k().copy() + + if small: + adata = adata[:1000, :500] + sc.pp.filter_cells(adata, min_genes=1) + np.random.seed(42) + adata.obs['batch'] = np.random.randint(0, 3, size=adata.shape[0]) + sc.pp.filter_genes(adata, min_cells=1) + adata.X = sparsity_func(adata.X.astype(dtype)) + return adata + + +def _check_check_values_warnings(function, adata, expected_warning, kwargs={}): + '''Runs `function` on `adata` with provided arguments `kwargs` twice: once with `check_values=True` and once with `check_values=False`. Checks that the `expected_warning` is only raised whtn `check_values=True`.''' + + # expecting 0 no-int warnings + with warnings.catch_warnings(record=True) as record: + function(adata.copy(), **kwargs, check_values=False) + warning_msgs = [w.message.args[0] for w in record] + assert expected_warning not in warning_msgs + + # expecting 1 no-int warning + with warnings.catch_warnings(record=True) as record: + function(adata.copy(), **kwargs, check_values=True) + warning_msgs = [w.message.args[0] for w in record] + assert expected_warning in warning_msgs diff --git a/scanpy/tests/test_clustering.py b/scanpy/tests/test_clustering.py index 1bb65d27ce..e0cbca4be2 100644 --- a/scanpy/tests/test_clustering.py +++ b/scanpy/tests/test_clustering.py @@ -1,10 +1,11 @@ import pytest import scanpy as sc +from scanpy.tests._data._cached_datasets import pbmc68k_reduced @pytest.fixture def adata_neighbors(): - return sc.datasets.pbmc68k_reduced() + return pbmc68k_reduced() def test_leiden_basic(adata_neighbors): diff --git a/scanpy/tests/test_dendrogram_key_added.py b/scanpy/tests/test_dendrogram_key_added.py index 6d43042914..a656b03c6a 100644 --- a/scanpy/tests/test_dendrogram_key_added.py +++ b/scanpy/tests/test_dendrogram_key_added.py @@ -1,6 +1,7 @@ import scanpy as sc import numpy as np import pytest +from scanpy.tests._data._cached_datasets import pbmc68k_reduced n_neighbors = 5 key = 'test' @@ -8,7 +9,7 @@ @pytest.fixture def adata(): - return sc.AnnData(sc.datasets.pbmc68k_reduced()) + return pbmc68k_reduced() @pytest.mark.parametrize('groupby', ['bulk_labels', ['bulk_labels', 'phase']]) diff --git a/scanpy/tests/test_deprecations.py b/scanpy/tests/test_deprecations.py index 620c774bd6..5006779c3b 100644 --- a/scanpy/tests/test_deprecations.py +++ b/scanpy/tests/test_deprecations.py @@ -1,10 +1,11 @@ import scanpy as sc +from scanpy.tests._data._cached_datasets import pbmc68k_reduced import pytest def test_deprecate_multicore_tsne(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() with pytest.warns( UserWarning, match="calling tsne with n_jobs > 1 would use MulticoreTSNE" diff --git a/scanpy/tests/test_embedding.py b/scanpy/tests/test_embedding.py index 1f778959c3..9120bd09e0 100644 --- a/scanpy/tests/test_embedding.py +++ b/scanpy/tests/test_embedding.py @@ -1,7 +1,7 @@ from importlib.util import find_spec from unittest.mock import patch import warnings - +from scanpy.tests._data._cached_datasets import pbmc68k_reduced import numpy as np import pytest from numpy.testing import assert_array_almost_equal, assert_array_equal, assert_raises @@ -10,7 +10,7 @@ def test_tsne(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() euclidean1 = sc.tl.tsne(pbmc, metric="euclidean", copy=True) with pytest.warns(UserWarning, match="In previous versions of scanpy"): @@ -32,7 +32,7 @@ def test_tsne(): def test_tsne_metric_warning(): - pbmc = 
sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() import sklearn with patch.object(sklearn, "__version__", "0.23.0"), pytest.warns( @@ -42,7 +42,7 @@ def test_tsne_metric_warning(): def test_umap_init_dtype(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() pbmc = pbmc[:100, :].copy() sc.tl.umap(pbmc, init_pos=pbmc.obsm["X_pca"][:, :2].astype(np.float32)) embed1 = pbmc.obsm["X_umap"].copy() @@ -57,7 +57,7 @@ def test_umap_init_dtype(): @pytest.mark.parametrize("layout", [pytest.param("fa", marks=needs_fa2), "fr"]) def test_umap_init_paga(layout): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() pbmc = pbmc[:100, :].copy() sc.tl.paga(pbmc) sc.pl.paga(pbmc, layout=layout, show=False) @@ -65,7 +65,7 @@ def test_umap_init_paga(layout): def test_diffmap(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.tl.diffmap(pbmc) d1 = pbmc.obsm['X_diffmap'].copy() diff --git a/scanpy/tests/test_embedding_density.py b/scanpy/tests/test_embedding_density.py index 3e49f45f3f..e38bdc65db 100644 --- a/scanpy/tests/test_embedding_density.py +++ b/scanpy/tests/test_embedding_density.py @@ -1,6 +1,6 @@ import numpy as np from anndata import AnnData - +from scanpy.tests._data._cached_datasets import pbmc68k_reduced import scanpy as sc @@ -22,6 +22,6 @@ def test_embedding_density(): def test_embedding_density_plot(): # Test that sc.pl.embedding_density() runs without error - adata = sc.datasets.pbmc68k_reduced() + adata = pbmc68k_reduced() sc.tl.embedding_density(adata, 'umap') sc.pl.embedding_density(adata, 'umap', 'umap_density', show=False) diff --git a/scanpy/tests/test_filter_rank_genes_groups.py b/scanpy/tests/test_filter_rank_genes_groups.py index b6d7bb474c..7325d07376 100644 --- a/scanpy/tests/test_filter_rank_genes_groups.py +++ b/scanpy/tests/test_filter_rank_genes_groups.py @@ -1,6 +1,6 @@ import numpy as np from scanpy.tools import rank_genes_groups, filter_rank_genes_groups -from scanpy.datasets import pbmc68k_reduced +from scanpy.tests._data._cached_datasets import pbmc68k_reduced names_no_reference = np.array( diff --git a/scanpy/tests/test_get.py b/scanpy/tests/test_get.py index 6fac0386c5..30a3eb84f5 100644 --- a/scanpy/tests/test_get.py +++ b/scanpy/tests/test_get.py @@ -9,7 +9,7 @@ import scanpy as sc from scanpy.datasets._utils import filter_oldformatwarning - +from scanpy.tests._data._cached_datasets import pbmc68k_reduced TRANSPOSE_PARAMS = pytest.mark.parametrize( "dim,transform,func", @@ -202,7 +202,7 @@ def test_backed_vs_memory(): def test_column_content(): "uses a larger dataset to test column order and content" - adata = sc.datasets.pbmc68k_reduced() + adata = pbmc68k_reduced() # test that columns content is correct for obs_df query = ['CST3', 'NKG7', 'GNLY', 'louvain', 'n_counts', 'n_genes'] diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py index 79a83c4d50..9483848051 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -3,6 +3,14 @@ import numpy as np import scanpy as sc from pathlib import Path +from scipy.sparse import csr_matrix +from scanpy.tests.helpers import ( + _prepare_pbmc_testdata, + _check_check_values_warnings, +) +import warnings + +from scanpy.tests._data._cached_datasets import pbmc3k, pbmc68k_reduced FILE = Path(__file__).parent / Path('_scripts/seurat_hvg.csv') FILE_V3 = Path(__file__).parent / Path('_scripts/seurat_hvg_v3.csv.gz') @@ -54,10 +62,220 @@ def test_highly_variable_genes_basic(): assert 
np.all(np.isin(colnames, hvg_df.columns)) +def _check_pearson_hvg_columns(output_df, n_top_genes): + + assert pd.api.types.is_float_dtype(output_df['residual_variances'].dtype) + + assert output_df['highly_variable'].values.dtype is np.dtype('bool') + assert np.sum(output_df['highly_variable']) == n_top_genes + + assert np.nanmax(output_df['highly_variable_rank'].values) <= n_top_genes - 1 + + +@pytest.mark.parametrize( + 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__ +) +@pytest.mark.parametrize('dtype', ['float32', 'int64']) +def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtype): + + adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True) + + # depending on check_values, warnings should be raised for non-integer data + if dtype == 'float32': + + adata_noninteger = adata.copy() + x, y = np.nonzero(adata_noninteger.X) + adata_noninteger.X[x[0], y[0]] = 0.5 + + _check_check_values_warnings( + function=sc.experimental.pp.highly_variable_genes, + adata=adata_noninteger, + expected_warning="`flavor='pearson_residuals'` expects raw count data, but non-integers were found.", + kwargs=dict( + flavor='pearson_residuals', + n_top_genes=100, + ), + ) + + # errors should be raised for invalid theta values + for theta in [0, -1]: + + with pytest.raises(ValueError, match='Pearson residuals require theta > 0'): + sc.experimental.pp.highly_variable_genes( + adata.copy(), theta=theta, flavor='pearson_residuals', n_top_genes=100 + ) + + with pytest.raises( + ValueError, match='Pearson residuals require `clip>=0` or `clip=None`.' + ): + sc.experimental.pp.highly_variable_genes( + adata.copy(), clip=-1, flavor='pearson_residuals', n_top_genes=100 + ) + + +@pytest.mark.parametrize( + 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__ +) +@pytest.mark.parametrize('dtype', ['float32', 'int64']) +@pytest.mark.parametrize('subset', [True, False]) +@pytest.mark.parametrize('clip', [None, np.Inf, 30]) +@pytest.mark.parametrize('theta', [100, np.Inf]) +@pytest.mark.parametrize('n_top_genes', [100, 200]) +def test_highly_variable_genes_pearson_residuals_general( + subset, sparsity_func, dtype, clip, theta, n_top_genes +): + adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True) + # cleanup var + del adata.var + + # compute reference output + residuals_reference = sc.experimental.pp.normalize_pearson_residuals( + adata, clip=clip, theta=theta, inplace=False + )['X'] + residual_variances_reference = np.var(residuals_reference, axis=0) + + if subset: + # lazyly sort by residual variance and take top N + top_n_idx = np.argsort(-residual_variances_reference)[:n_top_genes] + # (results in sorted "gene order" in reference) + residual_variances_reference = residual_variances_reference[top_n_idx] + + # compute output to be tested + output_df = sc.experimental.pp.highly_variable_genes( + adata, + flavor='pearson_residuals', + n_top_genes=n_top_genes, + subset=subset, + inplace=False, + clip=clip, + theta=theta, + ) + + sc.experimental.pp.highly_variable_genes( + adata, + flavor='pearson_residuals', + n_top_genes=n_top_genes, + subset=subset, + inplace=True, + clip=clip, + theta=theta, + ) + + # compare inplace=True and inplace=False output + pd.testing.assert_frame_equal(output_df, adata.var) + + # check output is complete + for key in [ + 'highly_variable', + 'means', + 'variances', + 'residual_variances', + 'highly_variable_rank', + ]: + assert key in output_df.keys() + + # check consistency with normalization method + if 
subset: + # sort values before comparing as reference is sorted as well for subset case + sort_output_idx = np.argsort(-output_df['residual_variances'].values) + assert np.allclose( + output_df['residual_variances'].values[sort_output_idx], + residual_variances_reference, + ) + else: + assert np.allclose( + output_df['residual_variances'].values, residual_variances_reference + ) + + # check hvg flag + hvg_idx = np.where(output_df['highly_variable'])[0] + topn_idx = np.sort( + np.argsort(-output_df['residual_variances'].values)[:n_top_genes] + ) + assert np.all(hvg_idx == topn_idx) + + # check ranks + assert np.nanmin(output_df['highly_variable_rank'].values) == 0 + + # more general checks on ranks, hvg flag and residual variance + _check_pearson_hvg_columns(output_df, n_top_genes) + + +@pytest.mark.parametrize( + 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__ +) +@pytest.mark.parametrize('dtype', ['float32', 'int64']) +@pytest.mark.parametrize('subset', [True, False]) +@pytest.mark.parametrize('n_top_genes', [100, 200]) +def test_highly_variable_genes_pearson_residuals_batch( + subset, n_top_genes, sparsity_func, dtype +): + adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True) + # cleanup var + del adata.var + n_genes = adata.shape[1] + + output_df = sc.experimental.pp.highly_variable_genes( + adata, + flavor='pearson_residuals', + n_top_genes=n_top_genes, + batch_key='batch', + subset=subset, + inplace=False, + ) + + sc.experimental.pp.highly_variable_genes( + adata, + flavor='pearson_residuals', + n_top_genes=n_top_genes, + batch_key='batch', + subset=subset, + inplace=True, + ) + + # compare inplace=True and inplace=False output + pd.testing.assert_frame_equal(output_df, adata.var) + + # check output is complete + for key in [ + 'highly_variable', + 'means', + 'variances', + 'residual_variances', + 'highly_variable_rank', + 'highly_variable_nbatches', + 'highly_variable_intersection', + ]: + assert key in output_df.keys() + + # general checks on ranks, hvg flag and residual variance + _check_pearson_hvg_columns(output_df, n_top_genes) + + # check intersection flag + nbatches = len(np.unique(adata.obs['batch'])) + assert output_df['highly_variable_intersection'].values.dtype is np.dtype('bool') + assert np.sum(output_df['highly_variable_intersection']) <= n_top_genes * nbatches + assert np.all(output_df['highly_variable'][output_df.highly_variable_intersection]) + + # check ranks (with batch_key these are the median of within-batch ranks) + assert pd.api.types.is_float_dtype(output_df['highly_variable_rank'].dtype) + + # check nbatches + assert output_df['highly_variable_nbatches'].values.dtype is np.dtype('int') + assert np.min(output_df['highly_variable_nbatches'].values) >= 0 + assert np.max(output_df['highly_variable_nbatches'].values) <= nbatches + + # check subsetting + if subset: + assert len(output_df) == n_top_genes + else: + assert len(output_df) == n_genes + + def test_higly_variable_genes_compare_to_seurat(): seurat_hvg_info = pd.read_csv(FILE, sep=' ') - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() pbmc.X = pbmc.raw.X pbmc.var_names_make_unique() @@ -98,7 +316,7 @@ def test_higly_variable_genes_compare_to_seurat_v3(): FILE_V3, sep=' ', dtype={"variances_norm": np.float64} ) - pbmc = sc.datasets.pbmc3k() + pbmc = pbmc3k() pbmc.var_names_make_unique() pbmc_dense = pbmc.copy() @@ -161,7 +379,7 @@ def test_higly_variable_genes_compare_to_seurat_v3(): def test_filter_genes_dispersion_compare_to_seurat(): seurat_hvg_info 
= pd.read_csv(FILE, sep=' ') - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() pbmc.X = pbmc.raw.X pbmc.var_names_make_unique() @@ -203,7 +421,7 @@ def test_filter_genes_dispersion_compare_to_seurat(): def test_highly_variable_genes_batches(): - adata = sc.datasets.pbmc68k_reduced() + adata = pbmc68k_reduced() adata[:100, :100].X = np.zeros((100, 100)) adata.obs['batch'] = ['0' if i < 100 else '1' for i in range(adata.n_obs)] @@ -252,7 +470,7 @@ def test_highly_variable_genes_batches(): def test_seurat_v3_mean_var_output_with_batchkey(): - pbmc = sc.datasets.pbmc3k() + pbmc = pbmc3k() pbmc.var_names_make_unique() n_cells = pbmc.shape[0] batch = np.zeros((n_cells), dtype=int) diff --git a/scanpy/tests/test_ingest.py b/scanpy/tests/test_ingest.py index a7ba765f98..8bd9c05be2 100644 --- a/scanpy/tests/test_ingest.py +++ b/scanpy/tests/test_ingest.py @@ -7,7 +7,7 @@ import scanpy as sc from scanpy import settings from scanpy._compat import pkg_version - +from scanpy.tests._data._cached_datasets import pbmc68k_reduced X = np.array( [ @@ -25,7 +25,7 @@ @pytest.fixture def adatas(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() n_split = 500 adata_ref = sc.AnnData(pbmc.X[:n_split, :], obs=pbmc.obs.iloc[:n_split]) adata_new = sc.AnnData(pbmc.X[n_split:, :]) diff --git a/scanpy/tests/test_metrics.py b/scanpy/tests/test_metrics.py index 025d788abb..4ace4a9c54 100644 --- a/scanpy/tests/test_metrics.py +++ b/scanpy/tests/test_metrics.py @@ -9,10 +9,11 @@ from anndata.tests.helpers import asarray import pytest +from scanpy.tests._data._cached_datasets import pbmc68k_reduced def test_gearys_c_consistency(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() pbmc.layers["raw"] = pbmc.raw.X.copy() g = pbmc.obsp["connectivities"] @@ -69,7 +70,7 @@ def test_gearys_c_correctness(): def test_morans_i_consistency(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() pbmc.layers["raw"] = pbmc.raw.X.copy() g = pbmc.obsp["connectivities"] @@ -133,7 +134,7 @@ def test_morans_i_correctness(): ) def test_graph_metrics_w_constant_values(metric, array_type): # https://github.com/theislab/scanpy/issues/1806 - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() XT = array_type(pbmc.raw.X.T.copy()) g = pbmc.obsp["connectivities"].copy() diff --git a/scanpy/tests/test_neighbors_key_added.py b/scanpy/tests/test_neighbors_key_added.py index 310d7e5d62..3b2ee68a2c 100644 --- a/scanpy/tests/test_neighbors_key_added.py +++ b/scanpy/tests/test_neighbors_key_added.py @@ -5,10 +5,12 @@ n_neighbors = 5 key = 'test' +from scanpy.tests._data._cached_datasets import pbmc68k_reduced + @pytest.fixture def adata(): - return sc.AnnData(sc.datasets.pbmc68k_reduced().X) + return sc.AnnData(pbmc68k_reduced().X) def test_neighbors_key_added(adata): diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index ebf2c796de..0308731d88 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -4,11 +4,18 @@ from scipy.sparse import csr_matrix import dask.array as da from scipy import sparse +import warnings import scanpy as sc -from scanpy.tests.helpers import check_rep_mutation, check_rep_results +from scanpy.tests.helpers import ( + check_rep_mutation, + check_rep_results, + _prepare_pbmc_testdata, + _check_check_values_warnings, +) from anndata.tests.helpers import assert_equal, asarray + X_total = [[1, 0], [3, 0], [5, 6]] X_frac = [[1, 0, 1], [3, 0, 1], [5, 6, 1]] @@ -69,3 +76,273 @@ def 
test_normalize_total_view(typ, dtype): assert not v.is_view assert_equal(adata, v) + + +@pytest.mark.parametrize( + 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__ +) +@pytest.mark.parametrize('dtype', ['float32', 'int64']) +def test_normalize_pearson_residuals_inputchecks(sparsity_func, dtype): + + adata = _prepare_pbmc_testdata(sparsity_func, dtype) + + # depending on check_values, warnings should be raised for non-integer data + if dtype == 'float32': + + adata_noninteger = adata.copy() + x, y = np.nonzero(adata_noninteger.X) + adata_noninteger.X[x[0], y[0]] = 0.5 + + _check_check_values_warnings( + function=sc.experimental.pp.normalize_pearson_residuals, + adata=adata_noninteger, + expected_warning="`normalize_pearson_residuals()` expects raw count data, but non-integers were found.", + ) + + # errors should be raised for invalid theta values + for theta in [0, -1]: + + with pytest.raises(ValueError, match='Pearson residuals require theta > 0'): + sc.experimental.pp.normalize_pearson_residuals(adata.copy(), theta=theta) + + with pytest.raises( + ValueError, match='Pearson residuals require `clip>=0` or `clip=None`.' + ): + sc.experimental.pp.normalize_pearson_residuals(adata.copy(), clip=-1) + + +@pytest.mark.parametrize( + 'sparsity_func', [np.array, csr_matrix], ids=lambda x: x.__name__ +) +@pytest.mark.parametrize('dtype', ['float32', 'int64']) +@pytest.mark.parametrize('theta', [0.01, 1, 100, np.Inf]) +@pytest.mark.parametrize('clip', [None, 1, np.Inf]) +def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip): + + # toy data + X = np.array([[3, 6], [2, 4], [1, 0]]) + ns = np.sum(X, axis=1) + ps = np.sum(X, axis=0) / np.sum(X) + mu = np.outer(ns, ps) + + # compute reference residuals + if np.isinf(theta): + # Poisson case + residuals_reference = (X - mu) / np.sqrt(mu) + else: + # NB case + residuals_reference = (X - mu) / np.sqrt(mu + mu**2 / theta) + + # compute output to test + adata = AnnData(sparsity_func(X), dtype=dtype) + output = sc.experimental.pp.normalize_pearson_residuals( + adata, theta=theta, clip=clip, inplace=False + ) + output_X = output['X'] + sc.experimental.pp.normalize_pearson_residuals( + adata, theta=theta, clip=clip, inplace=True + ) + + # check for correct new `adata.uns` keys + assert np.all(np.isin(['pearson_residuals_normalization'], list(adata.uns.keys()))) + assert np.all( + np.isin( + ['theta', 'clip', 'computed_on'], + list(adata.uns['pearson_residuals_normalization'].keys()), + ) + ) + # test against inplace + np.testing.assert_array_equal(adata.X, output_X) + + if clip is None: + # default clipping: compare to sqrt(n) threshold + clipping_threshold = np.sqrt(adata.shape[0]).astype(np.float32) + assert np.max(output_X) <= clipping_threshold + assert np.min(output_X) >= -clipping_threshold + elif np.isinf(clip): + # no clipping: compare to raw residuals + assert np.allclose(output_X, residuals_reference) + else: + # custom clipping: compare to custom threshold + assert np.max(output_X) <= clip + assert np.min(output_X) >= -clip + + +def _check_pearson_pca_fields(ad, n_cells, n_comps): + assert np.all( + np.isin( + ['pearson_residuals_normalization', 'pca'], + list(ad.uns.keys()), + ) + ), ( + """Missing `.uns` keys. 
Expected `['pearson_residuals_normalization', 'pca']`, but only %s were found""" % (list(ad.uns.keys())) )
+ assert 'X_pca' in list(
+ ad.obsm.keys()
+ ), """Missing `obsm` key `'X_pca'`, only %s were found""" % (list(ad.obsm.keys()))
+ assert 'PCs' in list(
+ ad.varm.keys()
+ ), """Missing `varm` key `'PCs'`, only %s were found""" % (list(ad.varm.keys()))
+ assert ad.obsm['X_pca'].shape == (
+ n_cells,
+ n_comps,
+ ), 'Wrong shape of PCA output in `X_pca`'
+
+
+@pytest.mark.parametrize(
+ 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__
+)
+@pytest.mark.parametrize('dtype', ['float32', 'int64'])
+@pytest.mark.parametrize('n_hvgs', [100, 200])
+@pytest.mark.parametrize('n_comps', [30, 50])
+def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps):
+
+ adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True)
+ n_cells, n_genes = adata.shape
+
+ adata_with_hvgs = adata.copy()
+ sc.experimental.pp.highly_variable_genes(
+ adata_with_hvgs, flavor='pearson_residuals', n_top_genes=n_hvgs
+ )
+ adata_not_using_hvgs = adata_with_hvgs.copy()
+
+ ### inplace = False ###
+ # outputs the (potentially HVG-restricted) adata_pca object
+ # PCA on all genes (no HVGs present)
+ adata_pca = sc.experimental.pp.normalize_pearson_residuals_pca(
+ adata.copy(), inplace=False, n_comps=n_comps
+ )
+ # PCA on HVGs only (HVGs present, and by default, `use_highly_variable=True`)
+ adata_pca_with_hvgs = sc.experimental.pp.normalize_pearson_residuals_pca(
+ adata_with_hvgs.copy(), inplace=False, n_comps=n_comps
+ )
+ # PCA again on all genes (HVGs present, but HVG use suppressed by `use_highly_variable=False`)
+ adata_pca_not_using_hvgs = sc.experimental.pp.normalize_pearson_residuals_pca(
+ adata_not_using_hvgs.copy(),
+ inplace=False,
+ n_comps=n_comps,
+ use_highly_variable=False,
+ )
+
+ # for all cases, check adata_pca keys are complete
+ for ad in [adata_pca, adata_pca_with_hvgs, adata_pca_not_using_hvgs]:
+ _check_pearson_pca_fields(ad, n_cells, n_comps)
+
+ # check adata shape to see if all genes or only HVGs are in the returned adata
+ assert adata_pca.shape == (n_cells, n_genes)
+ assert adata_pca_with_hvgs.shape == (n_cells, n_hvgs) # only HVGs retained
+ assert adata_pca_not_using_hvgs.shape == (n_cells, n_genes)
+
+ # check PC shapes to see whether or not HVGs were used for PCA
+ assert adata_pca.varm['PCs'].shape == (n_genes, n_comps)
+ assert adata_pca_with_hvgs.varm['PCs'].shape == (
+ n_hvgs,
+ n_comps,
+ )
+ assert adata_pca_not_using_hvgs.varm['PCs'].shape == (n_genes, n_comps)
+
+ ### inplace = True ###
+ # modifies the input adata object
+ # PCA on all genes (no HVGs present)
+ sc.experimental.pp.normalize_pearson_residuals_pca(
+ adata, inplace=True, n_comps=n_comps
+ )
+ # PCA on HVGs only (HVGs present, and by default, `use_highly_variable=True`)
+ sc.experimental.pp.normalize_pearson_residuals_pca(
+ adata_with_hvgs, inplace=True, n_comps=n_comps
+ )
+ # PCA again on all genes (HVGs present, but HVG use suppressed by `use_highly_variable=False`)
+ sc.experimental.pp.normalize_pearson_residuals_pca(
+ adata_not_using_hvgs,
+ inplace=True,
+ n_comps=n_comps,
+ use_highly_variable=False,
+ )
+
+ # for all cases, check adata keys are complete
+ for ad in [adata, adata_with_hvgs, adata_not_using_hvgs]:
+ _check_pearson_pca_fields(ad, n_cells, n_comps)
+
+ # check shapes: inplace adatas should always retain the original shape
+ assert ad.shape == (n_cells, n_genes)
+ assert ad.varm['PCs'].shape == (n_genes, n_comps)
+
+ # check for all-zero columns in `PCs` to see whether or not HVGs were used for PCA
+ # no all-zero columns should exist
+ assert sum(np.sum(np.abs(adata.varm['PCs']), axis=1) == 0) == 0
+ # number of all-zero columns should equal the number of non-HVGs
+ assert (
+ sum(np.sum(np.abs(adata_with_hvgs.varm['PCs']), axis=1) == 0)
+ == n_genes - n_hvgs
+ )
+ # no all-zero columns should exist
+ assert sum(np.sum(np.abs(adata_not_using_hvgs.varm['PCs']), axis=1) == 0) == 0
+
+ # compare PCA results between inplace and out-of-place runs
+ for ad_inplace, ad_outplace in zip(
+ [adata_pca, adata_pca_with_hvgs, adata_pca_not_using_hvgs],
+ [adata, adata_with_hvgs, adata_not_using_hvgs],
+ ):
+ np.testing.assert_array_equal(
+ ad_inplace.obsm['X_pca'],
+ ad_outplace.obsm['X_pca'],
+ )
+
+
+@pytest.mark.parametrize(
+ 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__
+)
+@pytest.mark.parametrize('dtype', ['float32', 'int64'])
+@pytest.mark.parametrize('n_hvgs', [100, 200])
+@pytest.mark.parametrize('n_comps', [30, 50])
+def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comps):
+ adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True)
+ n_cells, n_genes = adata.shape
+
+ ### inplace = False ###
+ # outputs the (potentially HVG-restricted) adata_pca object
+ # PCA on all genes
+ adata_pca, hvg = sc.experimental.pp.recipe_pearson_residuals(
+ adata.copy(), inplace=False, n_comps=n_comps, n_top_genes=n_hvgs
+ )
+
+ # check PCA fields
+ _check_pearson_pca_fields(adata_pca, n_cells, n_comps)
+ # check adata output shape (only HVGs in output)
+ assert adata_pca.shape == (n_cells, n_hvgs)
+ # check PC shape (non-HVGs are removed, so only `n_hvgs` genes)
+ assert adata_pca.varm['PCs'].shape == (n_hvgs, n_comps)
+
+ # check hvg df
+ assert np.all(
+ np.isin(
+ [
+ 'means',
+ 'variances',
+ 'residual_variances',
+ 'highly_variable_rank',
+ 'highly_variable',
+ ],
+ list(hvg.columns),
+ )
+ )
+ assert np.sum(hvg['highly_variable']) == n_hvgs
+ assert hvg.shape[0] == n_genes
+
+ ### inplace = True ###
+ # modifies the input adata object
+ # PCA on all genes
+ sc.experimental.pp.recipe_pearson_residuals(
+ adata, inplace=True, n_comps=n_comps, n_top_genes=n_hvgs
+ )
+
+ # check PCA fields and output shape
+ _check_pearson_pca_fields(adata, n_cells, n_comps)
+ # check adata shape (no change to input)
+ assert adata.shape == (n_cells, n_genes)
+ # check PC shape (non-HVGs are masked with 0s, so original number of genes)
+ assert adata.varm['PCs'].shape == (n_genes, n_comps)
+ # number of all-zero columns should equal the number of non-HVGs
+ assert sum(np.sum(np.abs(adata.varm['PCs']), axis=1) == 0) == n_genes - n_hvgs
diff --git a/scanpy/tests/test_paga.py b/scanpy/tests/test_paga.py
index d4440a2884..fabfe243c4 100644
--- a/scanpy/tests/test_paga.py
+++ b/scanpy/tests/test_paga.py
@@ -5,7 +5,7 @@
import numpy as np
import scanpy as sc
-
+from scanpy.tests._data._cached_datasets import pbmc68k_reduced, pbmc3k_processed
import pytest
HERE: Path = Path(__file__).parent
@@ -15,7 +15,7 @@
@pytest.fixture(scope="module")
def pbmc():
- pbmc = sc.datasets.pbmc68k_reduced()
+ pbmc = pbmc68k_reduced()
sc.tl.paga(pbmc, groups='bulk_labels')
pbmc.obs['cool_feature'] = pbmc[:, 'CST3'].X.squeeze()
return pbmc
@@ -81,7 +81,7 @@ def test_paga_compare(image_comparer):
# Tests that https://github.com/theislab/scanpy/issues/1887 is fixed
save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
- pbmc = sc.datasets.pbmc3k_processed()
+ pbmc = pbmc3k_processed()
sc.tl.paga(pbmc, 
groups="louvain") sc.pl.paga_compare(pbmc, basis="umap", show=False) @@ -92,7 +92,7 @@ def test_paga_compare(image_comparer): def test_paga_positions_reproducible(): """Check exact reproducibility and effect of random_state on paga positions""" # https://github.com/theislab/scanpy/issues/1859 - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.tl.paga(pbmc, "bulk_labels") a = pbmc.copy() diff --git a/scanpy/tests/test_plotting.py b/scanpy/tests/test_plotting.py index ca00ba3efb..701d02f0c1 100644 --- a/scanpy/tests/test_plotting.py +++ b/scanpy/tests/test_plotting.py @@ -6,7 +6,12 @@ import pytest from matplotlib.testing import setup from packaging import version - +from scanpy.tests._data._cached_datasets import ( + pbmc3k, + pbmc3k_processed, + pbmc68k_reduced, + krumsiek11, +) from scanpy._compat import pkg_version setup() @@ -38,7 +43,7 @@ def test_heatmap(image_comparer): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - adata = sc.datasets.krumsiek11() + adata = krumsiek11() sc.pl.heatmap( adata, adata.var_names, 'cell_type', use_raw=False, show=False, dendrogram=True ) @@ -100,7 +105,7 @@ def test_heatmap(image_comparer): save_and_compare_images('master_heatmap_std_scale_obs') # test var_names as dict - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.tl.leiden(pbmc, key_added="clusters", resolution=0.5) # call umap to trigger colors for the clusters sc.pl.umap(pbmc, color="clusters") @@ -152,7 +157,7 @@ def test_heatmap(image_comparer): ) def test_clustermap(image_comparer, obs_keys, name): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - adata = sc.datasets.krumsiek11() + adata = krumsiek11() sc.pl.clustermap(adata, obs_keys) save_and_compare_images(name) @@ -312,7 +317,7 @@ def test_clustermap(image_comparer, obs_keys, name): def test_dotplot_matrixplot_stacked_violin(image_comparer, id, fn): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - adata = sc.datasets.krumsiek11() + adata = krumsiek11() adata.obs['numeric_column'] = adata.X[:, 0] adata.layers['test'] = -1 * adata.X.copy() genes_dict = { @@ -331,7 +336,7 @@ def test_dotplot_matrixplot_stacked_violin(image_comparer, id, fn): def test_dotplot_obj(image_comparer): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) # test dotplot dot_min, dot_max, color_map, and var_groups - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() genes = [ 'CD79A', 'MS4A1', @@ -369,7 +374,7 @@ def test_dotplot_obj(image_comparer): def test_matrixplot_obj(image_comparer): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - adata = sc.datasets.pbmc68k_reduced() + adata = pbmc68k_reduced() marker_genes_dict = { "3": ["GNLY", "NKG7"], "1": ["FCER1A"], @@ -396,7 +401,7 @@ def test_matrixplot_obj(image_comparer): def test_stacked_violin_obj(image_comparer, plt): save_and_compare_images = image_comparer(ROOT, FIGS, tol=26) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() markers = { 'T-cell': ['CD3D', 'CD3E', 'IL32'], 'B-cell': ['CD79A', 'CD79B', 'MS4A1'], @@ -417,7 +422,7 @@ def test_stacked_violin_obj(image_comparer, plt): def test_tracksplot(image_comparer): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - adata = sc.datasets.krumsiek11() + adata = krumsiek11() sc.pl.tracksplot( adata, adata.var_names, 'cell_type', dendrogram=True, use_raw=False ) @@ -428,7 +433,7 @@ def test_multiple_plots(image_comparer): # only testing stacked_violin, matrixplot and dotplot save_and_compare_images = image_comparer(ROOT, FIGS, 
tol=15) - adata = sc.datasets.pbmc68k_reduced() + adata = pbmc68k_reduced() markers = { 'T-cell': ['CD3D', 'CD3E', 'IL32'], 'B-cell': ['CD79A', 'CD79B', 'MS4A1'], @@ -474,7 +479,7 @@ def test_violin(image_comparer): sc.pl.set_rcParams_defaults() sc.set_figure_params(dpi=50, color_map='viridis') - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pl.violin( pbmc, ['n_genes', 'percent_mito', 'n_counts'], @@ -523,7 +528,7 @@ def test_violin_without_raw(tmpdir): has_raw_pth = TESTDIR / "has_raw.png" no_raw_pth = TESTDIR / "no_raw.png" - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() pbmc_no_raw = pbmc.raw.to_adata().copy() sc.pl.violin(pbmc, 'CST3', groupby="bulk_labels", show=False, jitter=False) @@ -540,7 +545,7 @@ def test_violin_without_raw(tmpdir): def test_dendrogram(image_comparer): save_and_compare_images = image_comparer(ROOT, FIGS, tol=10) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pl.dendrogram(pbmc, 'bulk_labels') save_and_compare_images('dendrogram') @@ -548,7 +553,7 @@ def test_dendrogram(image_comparer): def test_correlation(image_comparer): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pl.correlation_matrix(pbmc, 'bulk_labels') save_and_compare_images('correlation') @@ -772,7 +777,7 @@ def test_correlation(image_comparer): def test_rank_genes_groups(image_comparer, name, fn): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.tl.rank_genes_groups(pbmc, 'louvain', n_genes=pbmc.raw.shape[1]) # add gene symbol @@ -791,8 +796,8 @@ def gene_symbols_adatas(): Both have ensembl ids and hgnc symbols as columns in var. The first has ensembl ids as var_names, the second has symbols. """ - pbmc = sc.datasets.pbmc3k_processed().raw.to_adata() - pbmc_counts = sc.datasets.pbmc3k() + pbmc = pbmc3k_processed().raw.to_adata() + pbmc_counts = pbmc3k() pbmc.layers["counts"] = pbmc_counts[pbmc.obs_names, pbmc.var_names].X.copy() pbmc.var["gene_symbol"] = pbmc.var_names @@ -877,7 +882,7 @@ def test_rank_genes_groups_plots_n_genes_vs_var_names(tmpdir, func, check_same_i var_names as a dict works. """ N = 3 - pbmc = sc.datasets.pbmc68k_reduced().raw.to_adata() + pbmc = pbmc68k_reduced().raw.to_adata() groups = pbmc.obs["louvain"].cat.categories[:3] pbmc = pbmc[pbmc.obs["louvain"].isin(groups)][::3].copy() @@ -929,7 +934,7 @@ def wrapped(pth, **kwargs): def test_genes_symbols(image_comparer, id, fn): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - adata = sc.datasets.krumsiek11() + adata = krumsiek11() # add a 'symbols' column adata.var['symbols'] = adata.var.index.map(lambda x: "symbol_{}".format(x)) @@ -939,10 +944,10 @@ def test_genes_symbols(image_comparer, id, fn): save_and_compare_images(f"master_{id}_gene_symbols") -@pytest.fixture(scope="session") +@pytest.fixture(scope="module") def _pbmc_scatterplots(): - # Wrapped in another fixture to avoid any mutation - pbmc = sc.datasets.pbmc68k_reduced() + # Wrapped in another fixture to avoid mutation + pbmc = pbmc68k_reduced() pbmc.layers["sparse"] = pbmc.raw.X / 2 pbmc.layers["test"] = pbmc.X.copy() + 100 pbmc.var["numbers"] = [str(x) for x in range(pbmc.shape[1])] @@ -1075,7 +1080,7 @@ def test_scatter_embedding_groups_and_size(image_comparer): # plotted on top. This new ordering requires that the size # vector is also ordered (if given). 
save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pl.embedding( pbmc, 'umap', @@ -1088,7 +1093,7 @@ def test_scatter_embedding_groups_and_size(image_comparer): def test_scatter_embedding_add_outline_vmin_vmax_norm(image_comparer, check_same_image): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pl.embedding( pbmc, @@ -1189,7 +1194,7 @@ def test_scatter_embedding_add_outline_vmin_vmax_norm(image_comparer, check_same def test_timeseries(): - adata = sc.datasets.pbmc68k_reduced() + adata = pbmc68k_reduced() sc.pp.neighbors(adata, n_neighbors=5, method='gauss', knn=False) sc.tl.diffmap(adata) sc.tl.dpt(adata, n_branchings=1, n_dcs=10) @@ -1197,7 +1202,7 @@ def test_timeseries(): def test_scatter_raw(tmp_path): - pbmc = sc.datasets.pbmc68k_reduced()[:100].copy() + pbmc = pbmc68k_reduced()[:100].copy() raw_pth = tmp_path / "raw.png" x_pth = tmp_path / "X.png" @@ -1214,7 +1219,7 @@ def test_scatter_raw(tmp_path): def test_scatter_specify_layer_and_raw(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() pbmc.layers["layer"] = pbmc.raw.X.copy() with pytest.raises(ValueError): sc.pl.umap(pbmc, color="HES4", use_raw=True, layer="layer") @@ -1223,7 +1228,7 @@ def test_scatter_specify_layer_and_raw(): def test_scatter_no_basis_per_obs(image_comparer): """Test scatterplot of per-obs points with no basis""" save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pl.scatter(pbmc, x="HES4", y="percent_mito", color="n_genes", use_raw=False) save_and_compare_images("scatter_HES_percent_mito_n_genes") @@ -1231,14 +1236,14 @@ def test_scatter_no_basis_per_obs(image_comparer): def test_scatter_no_basis_per_var(image_comparer): """Test scatterplot of per-var points with no basis""" save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pl.scatter(pbmc, x="AAAGCCTGGCTAAC-1", y="AAATTCGATGCACA-1", use_raw=False) save_and_compare_images("scatter_AAAGCCTGGCTAAC-1_vs_AAATTCGATGCACA-1") @pytest.fixture def pbmc_filtered(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pp.filter_genes(pbmc, min_cells=10) return pbmc @@ -1293,7 +1298,7 @@ def test_scatter_no_basis_value_error(pbmc_filtered, x, y, color, use_raw): def test_rankings(image_comparer): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pp.pca(pbmc) sc.pl.pca_loadings(pbmc) save_and_compare_images('master_pca_loadings') @@ -1363,7 +1368,7 @@ def test_no_copy(): # https://github.com/theislab/scanpy/issues/1000 # Tests that plotting functions don't make a copy from a view unless they # actually have to - actual = sc.datasets.pbmc68k_reduced() + actual = pbmc68k_reduced() sc.pl.umap(actual, color=["bulk_labels", "louvain"], show=False) # Set colors view = actual[np.random.choice(actual.obs_names, size=actual.shape[0] // 5), :] @@ -1401,7 +1406,7 @@ def test_no_copy(): def test_groupby_index(image_comparer): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() genes = [ 'CD79A', @@ -1426,7 +1431,7 @@ def test_groupby_index(image_comparer): # test category order when groupby is a list (#1735) def test_groupby_list(image_comparer): save_and_compare_images = 
image_comparer(ROOT, FIGS, tol=30) - adata = sc.datasets.krumsiek11() + adata = krumsiek11() np.random.seed(1) @@ -1447,7 +1452,7 @@ def test_color_cycler(caplog): # https://github.com/theislab/scanpy/issues/1885 import logging - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() colors = sns.color_palette("deep") cyl = sns.rcmod.cycler('color', sns.color_palette("deep")) @@ -1491,7 +1496,7 @@ def test_filter_rank_genes_groups_plots(tmpdir, plot, check_same_image): TESTDIR = Path(tmpdir) N_GENES = 4 - adata = sc.datasets.pbmc68k_reduced() + adata = pbmc68k_reduced() sc.tl.rank_genes_groups(adata, 'bulk_labels', method='wilcoxon', pts=True) @@ -1526,7 +1531,7 @@ def test_filter_rank_genes_groups_plots(tmpdir, plot, check_same_image): def test_scrublet_plots(image_comparer, plt): save_and_compare_images = image_comparer(ROOT, FIGS, tol=30) - adata = sc.datasets.pbmc3k() + adata = pbmc3k() sc.external.pp.scrublet(adata, use_approx_neighbors=False) sc.external.pl.scrublet_score_distribution(adata, return_fig=True) diff --git a/scanpy/tests/test_preprocessing.py b/scanpy/tests/test_preprocessing.py index b9decb0579..e3e6712400 100644 --- a/scanpy/tests/test_preprocessing.py +++ b/scanpy/tests/test_preprocessing.py @@ -11,6 +11,7 @@ from anndata.tests.helpers import assert_equal, asarray from scanpy.tests.helpers import check_rep_mutation, check_rep_results +from scanpy.tests._data._cached_datasets import pbmc68k_reduced def test_log1p(tmp_path): @@ -115,7 +116,7 @@ def test_subsample_copy(): def test_scale(): - adata = sc.datasets.pbmc68k_reduced() + adata = pbmc68k_reduced() adata.X = adata.raw.X v = adata[:, 0 : adata.shape[1] // 2] # Should turn view to copy https://github.com/theislab/anndata/issues/171#issuecomment-508689965 @@ -336,7 +337,7 @@ def test_downsample_total_counts(count_matrix_format, replace, dtype): def test_recipe_weinreb(): # Just tests for failure for now - adata = sc.datasets.pbmc68k_reduced().raw.to_adata() + adata = pbmc68k_reduced().raw.to_adata() adata.X = adata.X.toarray() orig = adata.copy() diff --git a/scanpy/tests/test_queries.py b/scanpy/tests/test_queries.py index e4ef9cc69d..09c9aef0db 100644 --- a/scanpy/tests/test_queries.py +++ b/scanpy/tests/test_queries.py @@ -1,11 +1,12 @@ import pandas as pd import pytest import scanpy as sc +from scanpy.tests._data._cached_datasets import pbmc68k_reduced @pytest.mark.internet def test_enrich(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.tl.rank_genes_groups(pbmc, "louvain", n_genes=pbmc.shape[1]) enrich_anndata = sc.queries.enrich(pbmc, "1") de = pd.DataFrame() @@ -29,7 +30,7 @@ def test_enrich(): @pytest.mark.internet def test_mito_genes(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() mt_genes = sc.queries.mitochondrial_genes("hsapiens") assert ( pbmc.var_names.isin(mt_genes["external_gene_name"]).sum() == 1 diff --git a/scanpy/tests/test_rank_genes_groups.py b/scanpy/tests/test_rank_genes_groups.py index febe80a4b4..9daa73de9c 100644 --- a/scanpy/tests/test_rank_genes_groups.py +++ b/scanpy/tests/test_rank_genes_groups.py @@ -16,7 +16,7 @@ from scanpy.tools import rank_genes_groups from scanpy.tools._rank_genes_groups import _RankGenes from scanpy.get import rank_genes_groups_df -from scanpy.datasets import pbmc68k_reduced +from scanpy.tests._data._cached_datasets import pbmc68k_reduced from scanpy._utils import select_groups @@ -216,12 +216,12 @@ def test_results_layers(): def test_rank_genes_groups_use_raw(): # 
https://github.com/theislab/scanpy/issues/1929 - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() assert pbmc.raw is not None sc.tl.rank_genes_groups(pbmc, groupby="bulk_labels", use_raw=True) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() del pbmc.raw assert pbmc.raw is None diff --git a/scanpy/tests/test_score_genes.py b/scanpy/tests/test_score_genes.py index 6f38237f86..56a9e4ac41 100644 --- a/scanpy/tests/test_score_genes.py +++ b/scanpy/tests/test_score_genes.py @@ -5,6 +5,7 @@ import pytest import pickle from pathlib import Path +from scanpy.tests._data._cached_datasets import paul15 HERE = Path(__file__).parent / Path('_data/') @@ -54,7 +55,7 @@ def test_score_with_reference(): and stored as a pickle object in ./data """ - adata = sc.datasets.paul15() + adata = paul15() sc.pp.normalize_per_cell(adata, counts_per_cell_after=10000) sc.pp.scale(adata)