From 12e751d9da9a8a7327b19985ce8bd64d8624a0e6 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Tue, 2 Mar 2021 20:10:35 +0100 Subject: [PATCH 01/96] adding core functions and documentation for pearson residual normalization and hvg selection --- scanpy/preprocessing/__init__.py | 4 +- .../preprocessing/_highly_variable_genes.py | 249 +++++++++++++++++- scanpy/preprocessing/_normalization.py | 143 +++++++++- 3 files changed, 383 insertions(+), 13 deletions(-) diff --git a/scanpy/preprocessing/__init__.py b/scanpy/preprocessing/__init__.py index 7c2c4d7aca..8bcad5ca03 100644 --- a/scanpy/preprocessing/__init__.py +++ b/scanpy/preprocessing/__init__.py @@ -7,6 +7,6 @@ from ._pca import pca from ._qc import calculate_qc_metrics from ._combat import combat -from ._normalization import normalize_total +from ._normalization import normalize_total, normalize_pearson_residuals -from ..neighbors import neighbors +from ..neighbors import neighbors \ No newline at end of file diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 157322f516..3ae163b3a4 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -1,5 +1,6 @@ import warnings -from typing import Optional +from typing import Optional, Union + import numpy as np import pandas as pd import scipy.sparse as sp_sparse @@ -174,6 +175,188 @@ def _highly_variable_genes_seurat_v3( df = df.drop(['highly_variable_nbatches'], axis=1) return df +def _highly_variable_pearson_residuals( + adata: AnnData, + layer: Optional[str] = None, + n_top_genes: int = 2000, + batch_key: Optional[str] = None, + theta: float = 100, + clip: Union[Literal['auto', 'none'], float] = 'auto', + chunksize: int = 100, + subset: bool = False, + inplace: bool = True, +) -> Optional[pd.DataFrame]: + """\ + See `highly_variable_genes`. 
+ + Returns + ------- + Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) or + updates `.var` with the following fields + + highly_variable + boolean indicator of highly-variable genes + means + means per gene + variances + variances per gene + residual_variances + Pearson residual variance per gene. Averaged in the case of multiple batches. + highly_variable_rank + Rank of the gene according to residual variance, median rank in the case of multiple batches + highly_variable_nbatches : int + If batch_key is given, this denotes in how many batches genes are detected as HVG + highly_variable_intersection : bool + If batch_key is given, this denotes the genes that are highly variable in all batches + """ + + X = adata.layers[layer] if layer is not None else adata.X + + # Check for raw counts + if check_nonnegative_integers(X) is False: + raise ValueError( + "`pp.highly_variable_genes` with `flavor='pearson_residuals'` expects " + "raw count data." + ) + + if batch_key is None: + batch_info = pd.Categorical(np.zeros(adata.shape[0], dtype=int)) + else: + batch_info = adata.obs[batch_key].values + n_batches = len(np.unique(batch_info)) + + # Get pearson residuals for each batch separately + residual_gene_vars = [] + for batch in np.unique(batch_info): + + adata_subset = adata[batch_info == batch] + + # Filter out zero genes + with settings.verbosity.override(Verbosity.error): + nonzero_genes = filter_genes(adata_subset, min_cells=1, inplace=False)[0] + adata_subset = adata_subset[:, nonzero_genes] + + + X_batch = adata_subset.layers[layer] if layer is not None else adata_subset.X + + # Prepare clipping + if clip == 'auto': + n = X_batch.shape[0] + clip = np.sqrt(n) + if clip < 0: + raise ValueError("Pearson residuals require `clip>=0` or `clip='auto'`.") + + if sp_sparse.issparse(X_batch): + sums_genes = np.sum(X_batch, axis=0) + sums_cells = np.sum(X_batch, axis=1) + sum_total = np.sum(sums_genes).squeeze() + else: + sums_genes = np.sum(X_batch, 
axis=0, keepdims=True) + sums_cells = np.sum(X_batch, axis=1, keepdims=True) + sum_total = np.sum(sums_genes) + + # Compute pearson residuals in chunks + residual_gene_var = np.ones((X_batch.shape[1]))*np.nan + for start in np.arange(0,X_batch.shape[1],chunksize): + stop = start + chunksize + mu = np.array(sums_cells @ sums_genes[:,start:stop] / sum_total) + X_dense = X_batch[:,start:stop].toarray() + residuals = (X_dense - mu) / np.sqrt(mu + mu**2/theta) + residuals = np.clip(residuals, a_min = -clip, a_max = clip) + residual_gene_var[start:stop] = np.var(residuals,axis=0) + + # Add 0 values for genes that were filtered out + zero_gene_var = np.zeros(np.sum(~nonzero_genes)) + residual_gene_var = np.concatenate((residual_gene_var, + zero_gene_var)) + # Order as before filtering + idxs = np.concatenate((np.where(nonzero_genes)[0], + np.where(~nonzero_genes)[0])) + residual_gene_var = residual_gene_var[np.argsort(idxs)] + residual_gene_vars.append(residual_gene_var.reshape(1, -1)) + + residual_gene_vars = np.concatenate(residual_gene_vars, axis=0) + + # Get cutoffs and define hvgs per batch + residual_gene_vars_sorted = np.sort(residual_gene_vars,axis=1)[:,::-1] + cutoffs_per_batch = residual_gene_vars_sorted[:,n_top_genes] + highly_variable_per_batch = np.greater(residual_gene_vars.T,cutoffs_per_batch).T + + # Merge hvgs across batches + highly_variable_nbatches = np.sum(highly_variable_per_batch,axis=0) + highly_variable_intersection = highly_variable_nbatches == n_batches + + # Get rank per gene within each batch + # argsort twice gives ranks, small rank means most variable + ranks_residual_var = np.argsort(np.argsort(-residual_gene_vars, axis=1), axis=1) + ranks_residual_var = ranks_residual_var.astype(np.float32) + ranks_residual_var[ranks_residual_var >= n_top_genes] = np.nan + ranks_masked_array = np.ma.masked_invalid(ranks_residual_var) + # Median rank across batches, ignoring batches in which gene was not selected + medianrank_residual_var = 
np.ma.median(ranks_masked_array, axis=0).filled(np.nan) + + means, variances = materialize_as_ndarray(_get_mean_var(X)) + df = pd.DataFrame.from_dict( + dict(means=means, + variances=variances, + residual_variances=np.mean(residual_gene_vars,axis=0), + highly_variable_rank=medianrank_residual_var, + highly_variable_nbatches=highly_variable_nbatches, + highly_variable_intersection=highly_variable_intersection, + ) + ) + df = df.set_index(adata.var_names) + + # Sort genes by how often they selected as hvg within each batch and + # break ties with median rank of residual variance across batches + df.sort_values( + ['highly_variable_nbatches', 'highly_variable_rank'], + ascending=[False,True], + na_position='last', + inplace=True, + ) + df['highly_variable'] = False + df.highly_variable.iloc[:n_top_genes] = True + ## TODO: following line raises a pandas warning (also for flavor = seurat and cellranger..) + df = df.loc[adata.var_names] + + if inplace or subset: + adata.uns['hvg'] = {'flavor': 'pearson_residuals'} + logg.hint( + 'added\n' + ' \'highly_variable\', boolean vector (adata.var)\n' + ' \'highly_variable_rank\', float vector (adata.var)\n' + ' \'highly_variable_nbatches\', int vector (adata.var)\n' + ' \'highly_variable_intersection\', boolean vector (adata.var)\n' + ' \'means\', float vector (adata.var)\n' + ' \'variances\', float vector (adata.var)\n' + ' \'residual_variances\', float vector (adata.var)' + ) + adata.var['highly_variable'] = df['highly_variable'].values + adata.var['highly_variable_rank'] = df['highly_variable_rank'].values + adata.var['means'] = df['means'].values + adata.var['variances'] = df['variances'].values + adata.var['residual_variances'] = df['residual_variances'].values.astype( + 'float64', copy=False + ) + if batch_key is not None: + adata.var['highly_variable_nbatches'] = df[ + 'highly_variable_nbatches' + ].values + adata.var['highly_variable_intersection'] = df[ + 'highly_variable_intersection' + ].values + if subset: + 
adata._inplace_subset_var(df['highly_variable'].values) + else: + if batch_key is None: + df = df.drop(['highly_variable_nbatches', + 'highly_variable_intersection'], + axis=1) + return df + + + def _highly_variable_genes_single_batch( adata: AnnData, @@ -288,6 +471,7 @@ def _highly_variable_genes_single_batch( return df + def highly_variable_genes( adata: AnnData, layer: Optional[str] = None, @@ -298,7 +482,10 @@ def highly_variable_genes( max_mean: Optional[float] = 3, span: Optional[float] = 0.3, n_bins: int = 20, - flavor: Literal['seurat', 'cell_ranger', 'seurat_v3'] = 'seurat', + theta: float = 100, + clip: Union[Literal['auto', 'none'], float] = 'auto', + chunksize: int = 1000, + flavor: Literal['seurat', 'cell_ranger', 'seurat_v3', 'pearson_residuals'] = 'seurat', subset: bool = False, inplace: bool = True, batch_key: Optional[str] = None, @@ -332,19 +519,24 @@ def highly_variable_genes( layer If provided, use `adata.layers[layer]` for expression values instead of `adata.X`. n_top_genes - Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'`. + Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or + `flavor='pearson_residuals'`. min_mean If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the - normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`. + normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or + `flavor='pearson_residuals'`. max_mean If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the - normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`. + normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or + `flavor='pearson_residuals'`. min_disp If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the - normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`. + normalized dispersions are ignored. 
Ignored if `flavor='seurat_v3'` or + `flavor='pearson_residuals'`. max_disp If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the - normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`. + normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or + `flavor='pearson_residuals'`. span The fraction of the data (cells) used when estimating the variance in the loess model fit if `flavor='seurat_v3'`. @@ -352,7 +544,24 @@ def highly_variable_genes( Number of bins for binning the mean gene expression. Normalization is done with respect to each bin. If just a single gene falls into a bin, the normalized dispersion is artificially set to 1. You'll be informed - about this if you set `settings.verbosity = 4`. + about this if you set `settings.verbosity = 4`. Ignored if + `flavor='pearson_residuals'`. + theta + If `flavor='pearson_residuals'`, this is the NB overdispersion parameter theta. + Higher values correspond to less overdispersion (var = mean + mean^2/theta), and + `theta=np.Inf` corresponds to a Poisson model. + clip + If `flavor='pearson_residuals'`, this determines if and how residuals are clipped: + + * If `'auto'`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], + where n is the number of cells in the dataset (default behavior). + * If any scalar c, residuals are clipped to the interval [-c, c]. Set + `clip=np.Inf` for no clipping. + + chunksize + If `flavor='pearson_residuals'`, this dertermines how many genes are processed at + once while computing the residual variance. Choosing a smaller value will reduce + the required memory. flavor Choose the flavor for identifying highly variable genes. For the dispersion based methods in their default workflows, Seurat passes the cutoffs whereas @@ -368,11 +577,13 @@ def highly_variable_genes( lightweight batch correction method. For all flavors, genes are first sorted by how many batches they are a HVG. 
For dispersion-based flavors ties are broken by normalized dispersion. If `flavor = 'seurat_v3'`, ties are broken by the median - (across batches) rank based on within-batch normalized variance. + (across batches) rank based on within-batch normalized variance. If + `flavor='pearson_residuals'`, ties are broken based on check_values Check if counts in selected layer are integers. A Warning is returned if set to True. Only used if `flavor='seurat_v3'`. + Returns ------- Depending on `inplace` returns calculated metrics (:class:`~pandas.DataFrame`) or @@ -387,13 +598,18 @@ def highly_variable_genes( **dispersions_norm** For dispersion-based flavors, normalized dispersions per gene **variances** - For `flavor='seurat_v3'`, variance per gene + For `flavor='seurat_v3'` and `flavor='pearson_residuals'`, variance per gene **variances_norm** For `flavor='seurat_v3'`, normalized variance per gene, averaged in the case of multiple batches + **residual_variances** + For `flavor='pearson_residuals'`, residual variance per gene. Averaged in the case of + multiple batches. 
highly_variable_rank : float For `flavor='seurat_v3'`, rank of the gene according to normalized variance, median rank in the case of multiple batches + For `flavor='pearson_residuals'`, rank of the gene according to residual + variance, median rank in the case of multiple batches highly_variable_nbatches : int If batch_key is given, this denotes in how many batches genes are detected as HVG highly_variable_intersection : bool @@ -428,6 +644,19 @@ def highly_variable_genes( subset=subset, inplace=inplace, ) + if flavor == 'pearson_residuals': + return _highly_variable_pearson_residuals( + adata, + layer = layer, + n_top_genes = n_top_genes, + batch_key = batch_key, + theta = theta, + clip = clip, + chunksize= chunksize, + subset = subset, + inplace = inplace, + ) + if batch_key is None: df = _highly_variable_genes_single_batch( diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index 542afd38ca..40b2ec2422 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -8,10 +8,12 @@ from .. import logging as logg from .._compat import Literal -from .._utils import view_to_actual + +from .._utils import view_to_actual, check_nonnegative_integers from scanpy.get import _get_obs_rep, _set_obs_rep + def _normalize_data(X, counts, after=None, copy=False): X = X.copy() if copy else X if issubclass(X.dtype.type, (int, np.integer)): @@ -27,6 +29,145 @@ def _normalize_data(X, counts, after=None, copy=False): return X +def _pearson_residuals(X, theta, clip, copy=False): + + X = X.copy() if copy else X + X = X.toarray() if issparse(X) else X + + #check theta + if theta <= 0: + ## TODO: would "underdispersion" with negative theta make sense? then only theta=0 were undefined.. 
+ raise ValueError('Pearson residuals require theta > 0') + #prepare clipping + if clip == 'auto': + n = X.shape[0] + clip = np.sqrt(n) + if clip < 0: + raise ValueError("Pearson residuals require `clip>=0` or `clip='auto'`.") + + if check_nonnegative_integers(X) is False: + raise ValueError( + "`pp.normalize_pearson_residuals` expects raw count data" + ) + + #get residuals + sums_genes = np.sum(X, axis=0, keepdims=True) + sums_cells = np.sum(X, axis=1, keepdims=True) + sum_total = np.sum(sums_genes) + mu = sums_cells @ sums_genes / sum_total + residuals = (X - mu) / np.sqrt(mu + mu**2/theta) + + #clip + residuals = np.clip(residuals, a_min = -clip, a_max = clip) + + return residuals + + +def normalize_pearson_residuals( + adata: AnnData, + theta: float = 100, + clip: Union[Literal['auto', 'none'], float] = 'auto', + layers: Union[Literal['all'], Iterable[str]] = None, + theta_per_layer: Optional[Dict[str, str]] = None, + clip_per_layer: Optional[Dict[str, Union[Literal['auto', 'none'], float]]] = None, ## TODO: Check if this is correct/needed + inplace: bool = True, +) -> Optional[Dict[str, np.ndarray]]: + """\ + Computes analytic Pearson residuals, assuming a negative binomial offset model + with overdispersion theta shared across genes. By default, residuals are + clipped to sqrt(n) and overdispersion theta=100 is used. + + Params + ------ + adata + The annotated data matrix of shape `n_obs` × `n_vars`. + Rows correspond to cells and columns to genes. + theta + The NB overdispersion parameter theta. Higher values correspond to less + overdispersion (var = mean + mean^2/theta), and `theta=np.Inf` corresponds + to a Poisson model. + clip + Determines if and how residuals are clipped: + + * If `'auto'`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], + where n is the number of cells in the dataset (default behavior). + * If any scalar c, residuals are clipped to the interval [-c, c]. Set + `clip=np.Inf` for no clipping. 
+ + layers + List of layers to compute Pearson residuals of. Set to `'all'` to + compute for all layers. + theta_per_layer + Dict that specifies which theta is used for each layer: + + * If `None`, the provided `theta` is used for all layers. + * Otherwise, each layer with key `layer_key` is processed with the theta + value in `theta_per_layer[layer_key]`. + clip_per_layer + Dict that specifies clipping behavior for each layer : + + * If `None`, the provided `clip` variable is used for all layers. + * Otherwise, each layer with key `layer_key` is clipped according to + `clip_per_layer[layer_key]`. See `clip` above for possible values. + + inplace + Whether to update `adata` or return dictionary with normalized copies of + `adata.X` and `adata.layers`. + + Returns + ------- + Returns dictionary with Pearson residuals of `adata.X` and `adata.layers` + or updates `adata` with normalized version of the original + `adata.X` and `adata.layers`, depending on `inplace`. + + """ + + if layers == 'all': + layers = adata.layers.keys() + + view_to_actual(adata) ### TODO: is this needed and if yes what for (normalize_total() has it so I used it..) 
+ + # Handle X + msg = 'computing analytic Pearson residuals for adata.X' + start = logg.info(msg) + if inplace: + adata.X = _pearson_residuals(adata.X, theta, clip) + settings = dict(theta=theta, clip=clip) + settings['theta_per_layer']=theta_per_layer if theta_per_layer is not None + settings['clip_per_layer']=clip_per_layer if clip_per_layer is not None + adata.uns['normalization_pearson_residuals'] = settings + + else: + dat = dict(X=_pearson_residuals(adata.X, theta, clip, copy=True)) + + # Handle layers + for layer_name in (layers or ()): + + msg = f'computing analytic Pearson residuals for layer {layer_name}' + _ = logg.info(msg) + + # Default to theta/clip if no layer-specific theta/clip given + layer_theta = theta if theta_per_layer is None else theta_per_layer[layer_name] + layer_clip = clip if clip_per_layer is None else clip_per_layer[layer_name] + + layer = adata.layers[layer_name] + + if inplace: + adata.layers[layer_name] = _pearson_residuals(layer, layer_theta, layer_clip) + else: + dat[layer_name] = _pearson_residuals(layer, layer_theta, layer_clip, copy=True) + + if not layers is None: + adata.uns['normalization_pearson_residuals'] = dict( + theta=theta, + clip=clip) + + logg.info(' finished ({time_passed})', time=start) + + return dat if not inplace else None + + + def normalize_total( adata: AnnData, target_sum: Optional[float] = None, From 5d57961a99a466b07a2de4aa928e8c5ef396905e Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Wed, 3 Mar 2021 20:49:06 +0100 Subject: [PATCH 02/96] adding Pearson residual+PCA bundles, minor bug fixes --- scanpy/preprocessing/__init__.py | 4 +- .../preprocessing/_highly_variable_genes.py | 8 +- scanpy/preprocessing/_normalization.py | 112 ++++++++++++++- scanpy/preprocessing/_recipes.py | 131 +++++++++++++++++- 4 files changed, 244 insertions(+), 11 deletions(-) diff --git a/scanpy/preprocessing/__init__.py b/scanpy/preprocessing/__init__.py index 8bcad5ca03..f1b4dad80c 100644 --- a/scanpy/preprocessing/__init__.py 
+++ b/scanpy/preprocessing/__init__.py @@ -1,4 +1,4 @@ -from ._recipes import recipe_zheng17, recipe_weinreb17, recipe_seurat +from ._recipes import recipe_zheng17, recipe_weinreb17, recipe_seurat, recipe_pearson_residuals from ._simple import filter_cells, filter_genes from ._deprecated.highly_variable_genes import filter_genes_dispersion from ._highly_variable_genes import highly_variable_genes @@ -7,6 +7,6 @@ from ._pca import pca from ._qc import calculate_qc_metrics from ._combat import combat -from ._normalization import normalize_total, normalize_pearson_residuals +from ._normalization import normalize_total, normalize_pearson_residuals, normalize_pearson_residuals_pca from ..neighbors import neighbors \ No newline at end of file diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 3ae163b3a4..88b3de9f29 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -578,7 +578,8 @@ def highly_variable_genes( by how many batches they are a HVG. For dispersion-based flavors ties are broken by normalized dispersion. If `flavor = 'seurat_v3'`, ties are broken by the median (across batches) rank based on within-batch normalized variance. If - `flavor='pearson_residuals'`, ties are broken based on + `flavor='pearson_residuals'`, ties are broken by the median rank (across batches) + based on within-batch residual variance. check_values Check if counts in selected layer are integers. A Warning is returned if set to True. Only used if `flavor='seurat_v3'`. 
@@ -645,6 +646,11 @@ def highly_variable_genes( inplace=inplace, ) if flavor == 'pearson_residuals': + if n_top_genes is None: + raise ValueError( + "`pp.highly_variable_genes` requires the argument `n_top_genes`" + " for `flavor='pearson_residuals'`" + ) return _highly_variable_pearson_residuals( adata, layer = layer, diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index 40b2ec2422..965f92075e 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -2,6 +2,7 @@ from warnings import warn import numpy as np +import pandas as pd from anndata import AnnData from scipy.sparse import issparse from sklearn.utils import sparsefuncs @@ -12,6 +13,9 @@ from .._utils import view_to_actual, check_nonnegative_integers from scanpy.get import _get_obs_rep, _set_obs_rep +from ._pca import pca + + def _normalize_data(X, counts, after=None, copy=False): @@ -67,9 +71,9 @@ def normalize_pearson_residuals( adata: AnnData, theta: float = 100, clip: Union[Literal['auto', 'none'], float] = 'auto', - layers: Union[Literal['all'], Iterable[str]] = None, + layers: Optional[Union[Literal['all'], Iterable[str]]] = None, theta_per_layer: Optional[Dict[str, str]] = None, - clip_per_layer: Optional[Dict[str, Union[Literal['auto', 'none'], float]]] = None, ## TODO: Check if this is correct/needed + clip_per_layer: Optional[Dict[str, Union[Literal['auto', 'none'], float]]] = None, inplace: bool = True, ) -> Optional[Dict[str, np.ndarray]]: """\ @@ -125,17 +129,19 @@ def normalize_pearson_residuals( if layers == 'all': layers = adata.layers.keys() - view_to_actual(adata) ### TODO: is this needed and if yes what for (normalize_total() has it so I used it..) + view_to_actual(adata) ### TODO: is this needed and if yes what for? 
normalize_total() has it so I used it here # Handle X msg = 'computing analytic Pearson residuals for adata.X' start = logg.info(msg) if inplace: adata.X = _pearson_residuals(adata.X, theta, clip) - settings = dict(theta=theta, clip=clip) - settings['theta_per_layer']=theta_per_layer if theta_per_layer is not None - settings['clip_per_layer']=clip_per_layer if clip_per_layer is not None - adata.uns['normalization_pearson_residuals'] = settings + settings_dict = dict(theta=theta, clip=clip) + if theta_per_layer is not None: + settings_dict['theta_per_layer']=theta_per_layer + if clip_per_layer is not None: + settings_dict['clip_per_layer']=clip_per_layer + adata.uns['normalization_pearson_residuals'] = settings_dict else: dat = dict(X=_pearson_residuals(adata.X, theta, clip, copy=True)) @@ -167,6 +173,98 @@ def normalize_pearson_residuals( return dat if not inplace else None + +def normalize_pearson_residuals_pca( + adata: AnnData, + theta: float = 100, + clip: Union[Literal['auto', 'none'], float] = 'auto', + n_comps_pca: Optional[int] = 50, + random_state_pca: Optional[float] = 0, + use_highly_variable: bool = True, + inplace: bool = False +) -> Optional[pd.DataFrame]: + + """\ + Applies PCA based on Pearson residual normalization. Operates on the subset of + highly variable genes in `adata.var['highly_variable']` by default. + + + Parameters + ---------- + adata + The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond + to cells and columns to genes. + use_highly_variable + Whether to use the gene selection in `adata.var['highly_variable']` to subset + the data before normalizing (default) or proceed on the full dataset. + theta + This is the NB overdispersion parameter theta for Pearson residual computations. + Higher values correspond to less overdispersion (var = mean + mean^2/theta), and + `theta=np.Inf` corresponds to a Poisson model. 
+ clip + This determines if and how Pearson residuals are clipped: + + * If `'auto'`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], + where n is the number of cells in the dataset (default behavior). + * If any scalar c, residuals are clipped to the interval [-c, c]. Set + `clip=np.Inf` for no clipping. + + n_comps_pca + Number of principal components to compute. + random_state_pca + Change to use different initial states for the optimization. + inplace + Whether to place results in `adata` or return them. + + + Returns + ------- + If `inplace=False`, returns the Pearson residual-based PCA results (`adata_pca`). + If `inplace=True`, updates `adata` with the following fields: + + `.uns['pearson_residuals_normalization']['pearson_residuals_df']` + The hvg-subset, normalized by Pearson residuals + `.uns['pearson_residuals_normalization']['theta']` + The used value of the overdisperion parameter theta + `.uns['pearson_residuals_normalization']['clip']` + The used value of the clipping parameter + + `.obsm['pearson_residuals_X_pca']` + PCA representation of data after gene selection and Pearson residual normalization. + `.uns['pearson_residuals_pca']['PCs']` + The principal components containing the loadings. + `.uns['pearson_residuals_pca']['variance_ratio']` + Ratio of explained variance. + `.uns['pearson_residuals_pca']['variance']` + Explained variance, equivalent to the eigenvalues of the + covariance matrix. + + """ + + + + if use_highly_variable and 'highly_variable' in adata.var_keys(): + adata_pca = adata[:,adata.var['highly_variable']].copy() ##TODO: are these copies needed? + else: + adata_pca = adata.copy()##TODO: are these copies needed? 
+ + normalize_pearson_residuals(adata_pca,theta=theta,clip=clip) + pca(adata_pca,n_comps=n_comps_pca,random_state=random_state_pca) + + if inplace: + normalization_settings = adata_pca.uns['normalization_pearson_residuals'] + normalization_dict = dict(**normalization_settings, + pearson_residuals_df = adata_pca.to_df()) + pca_settings = adata_pca.uns['pca'] + pca_dict = dict(**pca_settings, + PCs = adata_pca.varm['PCs']) + adata.uns['pearson_residuals_pca'] = pca_dict + adata.uns['pearson_residuals_normalization'] = normalization_dict + adata.obsm['pearson_residuals_X_pca'] = adata_pca.obsm['X_pca'] + return None + else: + return adata_pca + def normalize_total( adata: AnnData, diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py index d211bcc20a..75b1f42fc0 100644 --- a/scanpy/preprocessing/_recipes.py +++ b/scanpy/preprocessing/_recipes.py @@ -1,8 +1,10 @@ """Preprocessing recipes from the literature""" -from typing import Optional +from typing import Optional, Union, Literal, Tuple from anndata import AnnData +import pandas as pd + from .. import preprocessing as pp from ._deprecated.highly_variable_genes import ( filter_genes_dispersion, @@ -168,3 +170,130 @@ def recipe_zheng17( pp.scale(adata) logg.info(' finished', time=start) return adata if copy else None + + +def recipe_pearson_residuals( + adata: AnnData, + n_top_genes: int = 1000, + theta: float = 100, + clip: Union[Literal['auto', 'none'], float] = 'auto', + chunksize: int = 1000, + batch_key: Optional[str] = None, + n_comps_pca: Optional[int] = 50, + random_state_pca: Optional[float] = 0, + inplace: bool = False +) -> Optional[Tuple[pd.DataFrame,pd.DataFrame]]: + """\ + Applies gene selection based on Pearson residuals. On the resulting subset, + Pearson residual normalization and PCA are performed. + + + Parameters + ---------- + adata + The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond + to cells and columns to genes. 
+ n_top_genes + Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or + `flavor='pearson_residuals'`. + chunksize + This dertermines how many genes are processed at once while computing the + Pearson residual variance. Choosing a smaller value will reduce the required memory. + theta + This is the NB overdispersion parameter theta for Pearson residual computations. + Higher values correspond to less overdispersion (var = mean + mean^2/theta), and + `theta=np.Inf` corresponds to a Poisson model. + clip + This determines if and how Pearson residuals are clipped: + + * If `'auto'`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], + where n is the number of cells in the dataset (default behavior). + * If any scalar c, residuals are clipped to the interval [-c, c]. Set + `clip=np.Inf` for no clipping. + batch_key + If specified, highly-variable genes are selected within each batch separately and merged. + This simple process avoids the selection of batch-specific genes and acts as a + lightweight batch correction method. For all flavors, genes are first sorted + by how many batches they are a HVG. Ties are broken by the median rank (across batches) + based on within-batch residual variance. + + n_comps_pca + Number of principal components to compute. + random_state_pca + Change to use different initial states for the optimization. + inplace + Whether to place results in `adata` or return them. + + Returns + ------- + If `inplace=False`, separately returns the gene selection results (`hvg`) and Pearson + residual-based PCA results (`adata_pca`). If `inplace=True`, updates `adata` with the + following fields for gene selection results…: + + `.var['highly_variable']` + boolean indicator of highly-variable genes + `.var['means']` + means per gene + `.var['variances']` + variances per gene + `.var['residual_variances']` + Pearson residual variance per gene. Averaged in the case of multiple batches. 
+ `.var['highly_variable_rank']` + Rank of the gene according to residual variance, median rank in the case of multiple batches + `.var['highly_variable_nbatches']` + If batch_key is given, this denotes in how many batches genes are detected as HVG + `.var['highly_variable_intersection']` + If batch_key is given, this denotes the genes that are highly variable in all batches + + …and the following fields for Pearson residual-based PCA results and normalization settings: + + `.uns['pearson_residuals_normalization']['pearson_residuals_df']` + The hvg-subset, normalized by Pearson residuals + `.uns['pearson_residuals_normalization']['theta']` + The used value of the overdisperion parameter theta + `.uns['pearson_residuals_normalization']['clip']` + The used value of the clipping parameter + + `.obsm['pearson_residuals_X_pca']` + PCA representation of data after gene selection and Pearson residual normalization. + `.uns['pearson_residuals_pca']['PCs']` + The principal components containing the loadings. + `.uns['pearson_residuals_pca']['variance_ratio']` + Ratio of explained variance. + `.uns['pearson_residuals_pca']['variance']` + Explained variance, equivalent to the eigenvalues of the + covariance matrix. + + + """ + + hvg_args = dict(flavor = 'pearson_residuals', + n_top_genes = n_top_genes, + batch_key = batch_key, + theta = theta, + clip = clip, + chunksize = chunksize) + + if inplace: + pp.highly_variable_genes(adata,**hvg_args,inplace = True) + adata_pca = adata[:,adata.var['highly_variable']].copy() ##TODO: are these copies needed? + else: + hvg = pp.highly_variable_genes(adata,**hvg_args,inplace = False) + adata_pca = adata[:,hvg['highly_variable']].copy()##TODO: are these copies needed? 
+ + pp.normalize_pearson_residuals(adata_pca,theta = theta,clip = clip) + pp.pca(adata_pca,n_comps = n_comps_pca,random_state = random_state_pca) + + if inplace: + normalization_settings = adata_pca.uns['normalization_pearson_residuals'] + normalization_dict = dict(**normalization_settings, + pearson_residuals_df = adata_pca.to_df()) + pca_settings = adata_pca.uns['pca'] + pca_dict = dict(**pca_settings, + PCs = adata_pca.varm['PCs']) + adata.uns['pearson_residuals_pca'] = pca_dict + adata.uns['pearson_residuals_normalization'] = normalization_dict + adata.obsm['pearson_residuals_X_pca'] = adata_pca.obsm['X_pca'] + return None + else: + return adata_pca, hvg \ No newline at end of file From fced3f27ef8d4ee27c7af3ca3d7341cfd037c000 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Wed, 3 Mar 2021 22:39:29 +0100 Subject: [PATCH 03/96] some style cleanup, minor fixes --- .../preprocessing/_highly_variable_genes.py | 127 ++++++------ scanpy/preprocessing/_normalization.py | 183 ++++++++++-------- scanpy/preprocessing/_recipes.py | 133 +++++++------ 3 files changed, 239 insertions(+), 204 deletions(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 88b3de9f29..78347b6fc3 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -191,8 +191,8 @@ def _highly_variable_pearson_residuals( Returns ------- - Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) or - updates `.var` with the following fields + Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) + or updates `.var` with the following fields highly_variable boolean indicator of highly-variable genes @@ -201,43 +201,49 @@ def _highly_variable_pearson_residuals( variances variances per gene residual_variances - Pearson residual variance per gene. Averaged in the case of multiple batches. + Pearson residual variance per gene. 
Averaged in the case of multiple + batches. highly_variable_rank - Rank of the gene according to residual variance, median rank in the case of multiple batches + Rank of the gene according to residual variance, median rank in the + case of multiple batches highly_variable_nbatches : int - If batch_key is given, this denotes in how many batches genes are detected as HVG + If batch_key is given, this denotes in how many batches genes are + detected as HVG highly_variable_intersection : bool - If batch_key is given, this denotes the genes that are highly variable in all batches + If batch_key is given, this denotes the genes that are highly variable + in all batches """ - + X = adata.layers[layer] if layer is not None else adata.X - + # Check for raw counts if check_nonnegative_integers(X) is False: raise ValueError( - "`pp.highly_variable_genes` with `flavor='pearson_residuals'` expects " - "raw count data." + "`pp.highly_variable_genes` with `flavor='pearson_residuals'`" + "expects raw count data." 
) - + if batch_key is None: batch_info = pd.Categorical(np.zeros(adata.shape[0], dtype=int)) else: batch_info = adata.obs[batch_key].values n_batches = len(np.unique(batch_info)) - + # Get pearson residuals for each batch separately residual_gene_vars = [] for batch in np.unique(batch_info): adata_subset = adata[batch_info == batch] - + # Filter out zero genes with settings.verbosity.override(Verbosity.error): nonzero_genes = filter_genes(adata_subset, min_cells=1, inplace=False)[0] adata_subset = adata_subset[:, nonzero_genes] - - X_batch = adata_subset.layers[layer] if layer is not None else adata_subset.X + if layer is not None: + X_batch = adata_subset.layers[layer] + else: + X_batch = adata_subset.X # Prepare clipping if clip == 'auto': @@ -249,77 +255,78 @@ def _highly_variable_pearson_residuals( if sp_sparse.issparse(X_batch): sums_genes = np.sum(X_batch, axis=0) sums_cells = np.sum(X_batch, axis=1) - sum_total = np.sum(sums_genes).squeeze() + sum_total = np.sum(sums_genes).squeeze() else: sums_genes = np.sum(X_batch, axis=0, keepdims=True) sums_cells = np.sum(X_batch, axis=1, keepdims=True) - sum_total = np.sum(sums_genes) + sum_total = np.sum(sums_genes) # Compute pearson residuals in chunks - residual_gene_var = np.ones((X_batch.shape[1]))*np.nan - for start in np.arange(0,X_batch.shape[1],chunksize): + residual_gene_var = np.ones((X_batch.shape[1])) * np.nan + for start in np.arange(0, X_batch.shape[1], chunksize): stop = start + chunksize - mu = np.array(sums_cells @ sums_genes[:,start:stop] / sum_total) - X_dense = X_batch[:,start:stop].toarray() - residuals = (X_dense - mu) / np.sqrt(mu + mu**2/theta) - residuals = np.clip(residuals, a_min = -clip, a_max = clip) - residual_gene_var[start:stop] = np.var(residuals,axis=0) - - # Add 0 values for genes that were filtered out + mu = np.array(sums_cells @ sums_genes[:, start:stop] / sum_total) + X_dense = X_batch[:, start:stop].toarray() + residuals = (X_dense - mu) / np.sqrt(mu + mu ** 2 / theta) + 
residuals = np.clip(residuals, a_min=-clip, a_max=clip) + residual_gene_var[start:stop] = np.var(residuals, axis=0) + + # Add 0 values for genes that were filtered out zero_gene_var = np.zeros(np.sum(~nonzero_genes)) - residual_gene_var = np.concatenate((residual_gene_var, - zero_gene_var)) + residual_gene_var = np.concatenate((residual_gene_var, zero_gene_var)) # Order as before filtering - idxs = np.concatenate((np.where(nonzero_genes)[0], - np.where(~nonzero_genes)[0])) + idxs = np.concatenate((np.where(nonzero_genes)[0], np.where(~nonzero_genes)[0])) residual_gene_var = residual_gene_var[np.argsort(idxs)] residual_gene_vars.append(residual_gene_var.reshape(1, -1)) residual_gene_vars = np.concatenate(residual_gene_vars, axis=0) - + # Get cutoffs and define hvgs per batch - residual_gene_vars_sorted = np.sort(residual_gene_vars,axis=1)[:,::-1] - cutoffs_per_batch = residual_gene_vars_sorted[:,n_top_genes] - highly_variable_per_batch = np.greater(residual_gene_vars.T,cutoffs_per_batch).T - + residual_gene_vars_sorted = np.sort(residual_gene_vars, axis=1)[:, ::-1] + cutoffs_per_batch = residual_gene_vars_sorted[:, n_top_genes] + highly_variable_per_batch = np.greater(residual_gene_vars.T, cutoffs_per_batch).T + # Merge hvgs across batches - highly_variable_nbatches = np.sum(highly_variable_per_batch,axis=0) + highly_variable_nbatches = np.sum(highly_variable_per_batch, axis=0) highly_variable_intersection = highly_variable_nbatches == n_batches - + # Get rank per gene within each batch # argsort twice gives ranks, small rank means most variable ranks_residual_var = np.argsort(np.argsort(-residual_gene_vars, axis=1), axis=1) ranks_residual_var = ranks_residual_var.astype(np.float32) - ranks_residual_var[ranks_residual_var >= n_top_genes] = np.nan + ranks_residual_var[ranks_residual_var >= n_top_genes] = np.nan ranks_masked_array = np.ma.masked_invalid(ranks_residual_var) - # Median rank across batches, ignoring batches in which gene was not selected - 
medianrank_residual_var = np.ma.median(ranks_masked_array, axis=0).filled(np.nan) - + # Median rank across batches, + # ignoring batches in which gene was not selected + medianrank_residual_var = np.ma.median(ranks_masked_array, axis=0).filled(np.nan) + means, variances = materialize_as_ndarray(_get_mean_var(X)) df = pd.DataFrame.from_dict( - dict(means=means, - variances=variances, - residual_variances=np.mean(residual_gene_vars,axis=0), - highly_variable_rank=medianrank_residual_var, - highly_variable_nbatches=highly_variable_nbatches, - highly_variable_intersection=highly_variable_intersection, - ) - ) + dict( + means=means, + variances=variances, + residual_variances=np.mean(residual_gene_vars, axis=0), + highly_variable_rank=medianrank_residual_var, + highly_variable_nbatches=highly_variable_nbatches, + highly_variable_intersection=highly_variable_intersection, + ) + ) df = df.set_index(adata.var_names) - + # Sort genes by how often they selected as hvg within each batch and # break ties with median rank of residual variance across batches df.sort_values( - ['highly_variable_nbatches', 'highly_variable_rank'], - ascending=[False,True], - na_position='last', - inplace=True, - ) + ['highly_variable_nbatches', 'highly_variable_rank'], + ascending=[False, True], + na_position='last', + inplace=True, + ) df['highly_variable'] = False df.highly_variable.iloc[:n_top_genes] = True - ## TODO: following line raises a pandas warning (also for flavor = seurat and cellranger..) + # TODO: following line raises a pandas warning + # (also for flavor = seurat and cellranger..) 
df = df.loc[adata.var_names] - + if inplace or subset: adata.uns['hvg'] = {'flavor': 'pearson_residuals'} logg.hint( @@ -350,9 +357,9 @@ def _highly_variable_pearson_residuals( adata._inplace_subset_var(df['highly_variable'].values) else: if batch_key is None: - df = df.drop(['highly_variable_nbatches', - 'highly_variable_intersection'], - axis=1) + df = df.drop( + ['highly_variable_nbatches', 'highly_variable_intersection'], axis=1 + ) return df diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index 965f92075e..1479bcb9b6 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -38,34 +38,33 @@ def _pearson_residuals(X, theta, clip, copy=False): X = X.copy() if copy else X X = X.toarray() if issparse(X) else X - #check theta + # check theta if theta <= 0: - ## TODO: would "underdispersion" with negative theta make sense? then only theta=0 were undefined.. - raise ValueError('Pearson residuals require theta > 0') - #prepare clipping + # TODO: would "underdispersion" with negative theta make sense? + # then only theta=0 were undefined.. 
+ raise ValueError('Pearson residuals require theta > 0') + # prepare clipping if clip == 'auto': n = X.shape[0] clip = np.sqrt(n) if clip < 0: raise ValueError("Pearson residuals require `clip>=0` or `clip='auto'`.") - + if check_nonnegative_integers(X) is False: - raise ValueError( - "`pp.normalize_pearson_residuals` expects raw count data" - ) - - #get residuals + raise ValueError("`pp.normalize_pearson_residuals` expects raw count data") + + # get residuals sums_genes = np.sum(X, axis=0, keepdims=True) sums_cells = np.sum(X, axis=1, keepdims=True) - sum_total = np.sum(sums_genes) + sum_total = np.sum(sums_genes) mu = sums_cells @ sums_genes / sum_total - residuals = (X - mu) / np.sqrt(mu + mu**2/theta) + residuals = (X - mu) / np.sqrt(mu + mu ** 2 / theta) + + # clip + residuals = np.clip(residuals, a_min=-clip, a_max=clip) - #clip - residuals = np.clip(residuals, a_min = -clip, a_max = clip) - return residuals - + def normalize_pearson_residuals( adata: AnnData, @@ -77,9 +76,9 @@ def normalize_pearson_residuals( inplace: bool = True, ) -> Optional[Dict[str, np.ndarray]]: """\ - Computes analytic Pearson residuals, assuming a negative binomial offset model - with overdispersion theta shared across genes. By default, residuals are - clipped to sqrt(n) and overdispersion theta=100 is used. + Computes analytic Pearson residuals, assuming a negative binomial offset + model with overdispersion theta shared across genes. By default, residuals + are clipped to sqrt(n) and overdispersion theta=100 is used. Params ------ @@ -87,26 +86,27 @@ def normalize_pearson_residuals( The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. theta - The NB overdispersion parameter theta. Higher values correspond to less - overdispersion (var = mean + mean^2/theta), and `theta=np.Inf` corresponds - to a Poisson model. + The NB overdispersion parameter theta. 
Higher values correspond to + less overdispersion (var = mean + mean^2/theta), and `theta=np.Inf` + corresponds to a Poisson model. clip Determines if and how residuals are clipped: - * If `'auto'`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], - where n is the number of cells in the dataset (default behavior). + * If `'auto'`, residuals are clipped to the interval + [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset + (default behavior). * If any scalar c, residuals are clipped to the interval [-c, c]. Set `clip=np.Inf` for no clipping. layers - List of layers to compute Pearson residuals of. Set to `'all'` to + List of layers to compute Pearson residuals of. Set to `'all'` to compute for all layers. theta_per_layer Dict that specifies which theta is used for each layer: * If `None`, the provided `theta` is used for all layers. - * Otherwise, each layer with key `layer_key` is processed with the theta - value in `theta_per_layer[layer_key]`. + * Otherwise, each layer with key `layer_key` is processed with the + theta value in `theta_per_layer[layer_key]`. clip_per_layer Dict that specifies clipping behavior for each layer : @@ -115,8 +115,8 @@ def normalize_pearson_residuals( `clip_per_layer[layer_key]`. See `clip` above for possible values. inplace - Whether to update `adata` or return dictionary with normalized copies of - `adata.X` and `adata.layers`. + Whether to update `adata` or return dictionary with normalized copies + of `adata.X` and `adata.layers`. Returns ------- @@ -125,12 +125,13 @@ def normalize_pearson_residuals( `adata.X` and `adata.layers`, depending on `inplace`. """ - + if layers == 'all': layers = adata.layers.keys() - - view_to_actual(adata) ### TODO: is this needed and if yes what for? normalize_total() has it so I used it here - + # TODO: is this needed and if yes what for? 
+ # normalize_total() has it so I used it here + view_to_actual(adata) + # Handle X msg = 'computing analytic Pearson residuals for adata.X' start = logg.info(msg) @@ -138,74 +139,88 @@ def normalize_pearson_residuals( adata.X = _pearson_residuals(adata.X, theta, clip) settings_dict = dict(theta=theta, clip=clip) if theta_per_layer is not None: - settings_dict['theta_per_layer']=theta_per_layer + settings_dict['theta_per_layer'] = theta_per_layer if clip_per_layer is not None: - settings_dict['clip_per_layer']=clip_per_layer - adata.uns['normalization_pearson_residuals'] = settings_dict - + settings_dict['clip_per_layer'] = clip_per_layer + adata.uns['pearson_residuals_normalization'] = settings_dict + else: dat = dict(X=_pearson_residuals(adata.X, theta, clip, copy=True)) - + # Handle layers - for layer_name in (layers or ()): - + for layer_name in layers or (): + msg = f'computing analytic Pearson residuals for layer {layer_name}' _ = logg.info(msg) - + # Default to theta/clip if no layer-specific theta/clip given - layer_theta = theta if theta_per_layer is None else theta_per_layer[layer_name] - layer_clip = clip if clip_per_layer is None else clip_per_layer[layer_name] - + if theta_per_layer is None: + layer_theta = theta + else: + layer_theta = theta_per_layer[layer_name] + if clip_per_layer is None: + layer_clip = clip + else: + layer_clip = clip_per_layer[layer_name] + layer = adata.layers[layer_name] if inplace: - adata.layers[layer_name] = _pearson_residuals(layer, layer_theta, layer_clip) + adata.layers[layer_name] = _pearson_residuals( + layer, layer_theta, layer_clip + ) else: - dat[layer_name] = _pearson_residuals(layer, layer_theta, layer_clip, copy=True) - + dat[layer_name] = _pearson_residuals( + layer, layer_theta, layer_clip, copy=True + ) + if not layers is None: - adata.uns['normalization_pearson_residuals'] = dict( - theta=theta, - clip=clip) + adata.uns['pearson_residuals_normalization'] = dict( + theta=theta, + clip=clip, + ) logg.info(' 
finished ({time_passed})', time=start) return dat if not inplace else None - - - + + def normalize_pearson_residuals_pca( - adata: AnnData, + adata: AnnData, theta: float = 100, - clip: Union[Literal['auto', 'none'], float] = 'auto', + clip: Union[Literal['auto', 'none'], float] = 'auto', n_comps_pca: Optional[int] = 50, random_state_pca: Optional[float] = 0, use_highly_variable: bool = True, - inplace: bool = False + inplace: bool = False, ) -> Optional[pd.DataFrame]: """\ - Applies PCA based on Pearson residual normalization. Operates on the subset of - highly variable genes in `adata.var['highly_variable']` by default. - - + Applies PCA based on Pearson residual normalization. Operates on the + subset of highly variable genes in `adata.var['highly_variable']` by + default. + + Parameters ---------- adata The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. use_highly_variable - Whether to use the gene selection in `adata.var['highly_variable']` to subset - the data before normalizing (default) or proceed on the full dataset. + Whether to use the gene selection in `adata.var['highly_variable']` to + subset the data before normalizing (default) or proceed on the full + dataset. theta - This is the NB overdispersion parameter theta for Pearson residual computations. - Higher values correspond to less overdispersion (var = mean + mean^2/theta), and - `theta=np.Inf` corresponds to a Poisson model. + This is the NB overdispersion parameter theta for Pearson residual + computations. Higher values correspond to less overdispersion + (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a + Poisson model. clip This determines if and how Pearson residuals are clipped: - * If `'auto'`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], - where n is the number of cells in the dataset (default behavior). 
+ * If `'auto'`, residuals are clipped to the interval + [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset + (default behavior). * If any scalar c, residuals are clipped to the interval [-c, c]. Set `clip=np.Inf` for no clipping. @@ -219,7 +234,8 @@ def normalize_pearson_residuals_pca( Returns ------- - If `inplace=False`, returns the Pearson residual-based PCA results (`adata_pca`). + If `inplace=False`, returns the Pearson residual-based PCA results + (`adata_pca`). If `inplace=True`, updates `adata` with the following fields: `.uns['pearson_residuals_normalization']['pearson_residuals_df']` @@ -230,7 +246,8 @@ def normalize_pearson_residuals_pca( The used value of the clipping parameter `.obsm['pearson_residuals_X_pca']` - PCA representation of data after gene selection and Pearson residual normalization. + PCA representation of data after gene selection and Pearson residual + normalization. `.uns['pearson_residuals_pca']['PCs']` The principal components containing the loadings. `.uns['pearson_residuals_pca']['variance_ratio']` @@ -239,27 +256,25 @@ def normalize_pearson_residuals_pca( Explained variance, equivalent to the eigenvalues of the covariance matrix. - """ - + """ - if use_highly_variable and 'highly_variable' in adata.var_keys(): - adata_pca = adata[:,adata.var['highly_variable']].copy() ##TODO: are these copies needed? + # TODO: are these copies needed? + adata_pca = adata[:, adata.var['highly_variable']].copy() else: - adata_pca = adata.copy()##TODO: are these copies needed? - - normalize_pearson_residuals(adata_pca,theta=theta,clip=clip) - pca(adata_pca,n_comps=n_comps_pca,random_state=random_state_pca) - + # TODO: are these copies needed? 
+ adata_pca = adata.copy() + + normalize_pearson_residuals(adata_pca, theta=theta, clip=clip) + pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca) + if inplace: - normalization_settings = adata_pca.uns['normalization_pearson_residuals'] - normalization_dict = dict(**normalization_settings, - pearson_residuals_df = adata_pca.to_df()) + norm_settings = adata_pca.uns['pearson_residuals_normalization'] + norm_dict = dict(**norm_settings, pearson_residuals_df=adata_pca.to_df()) pca_settings = adata_pca.uns['pca'] - pca_dict = dict(**pca_settings, - PCs = adata_pca.varm['PCs']) + pca_dict = dict(**pca_settings, PCs=adata_pca.varm['PCs']) adata.uns['pearson_residuals_pca'] = pca_dict - adata.uns['pearson_residuals_normalization'] = normalization_dict + adata.uns['pearson_residuals_normalization'] = norm_dict adata.obsm['pearson_residuals_X_pca'] = adata_pca.obsm['X_pca'] return None else: diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py index 75b1f42fc0..0f2ce4e994 100644 --- a/scanpy/preprocessing/_recipes.py +++ b/scanpy/preprocessing/_recipes.py @@ -173,50 +173,54 @@ def recipe_zheng17( def recipe_pearson_residuals( - adata: AnnData, + adata: AnnData, n_top_genes: int = 1000, theta: float = 100, - clip: Union[Literal['auto', 'none'], float] = 'auto', + clip: Union[Literal['auto', 'none'], float] = 'auto', chunksize: int = 1000, batch_key: Optional[str] = None, n_comps_pca: Optional[int] = 50, random_state_pca: Optional[float] = 0, - inplace: bool = False -) -> Optional[Tuple[pd.DataFrame,pd.DataFrame]]: + inplace: bool = False, +) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]: """\ - Applies gene selection based on Pearson residuals. On the resulting subset, + Applies gene selection based on Pearson residuals. On the resulting subset, Pearson residual normalization and PCA are performed. - - + + Parameters ---------- adata The annotated data matrix of shape `n_obs` × `n_vars`. 
Rows correspond to cells and columns to genes. n_top_genes - Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or - `flavor='pearson_residuals'`. + Number of highly-variable genes to keep. Mandatory if + `flavor='seurat_v3'` or `flavor='pearson_residuals'`. chunksize - This dertermines how many genes are processed at once while computing the - Pearson residual variance. Choosing a smaller value will reduce the required memory. + This dertermines how many genes are processed at once while computing + the Pearson residual variance. Choosing a smaller value will reduce + the required memory. theta - This is the NB overdispersion parameter theta for Pearson residual computations. - Higher values correspond to less overdispersion (var = mean + mean^2/theta), and - `theta=np.Inf` corresponds to a Poisson model. + This is the NB overdispersion parameter theta for Pearson residual + computations. Higher values correspond to less overdispersion + (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a + Poisson model. clip This determines if and how Pearson residuals are clipped: - - * If `'auto'`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], - where n is the number of cells in the dataset (default behavior). + + * If `'auto'`, residuals are clipped to the interval + [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset + (default behavior). * If any scalar c, residuals are clipped to the interval [-c, c]. Set `clip=np.Inf` for no clipping. batch_key - If specified, highly-variable genes are selected within each batch separately and merged. - This simple process avoids the selection of batch-specific genes and acts as a - lightweight batch correction method. For all flavors, genes are first sorted - by how many batches they are a HVG. Ties are broken by the median rank (across batches) + If specified, highly-variable genes are selected within each batch + separately and merged. 
This simple process avoids the selection of + batch-specific genes and acts as a lightweight batch correction + method. For all flavors, genes are first sorted by how many batches + they are a HVG. Ties are broken by the median rank (across batches) based on within-batch residual variance. - + n_comps_pca Number of principal components to compute. random_state_pca @@ -226,36 +230,42 @@ def recipe_pearson_residuals( Returns ------- - If `inplace=False`, separately returns the gene selection results (`hvg`) and Pearson - residual-based PCA results (`adata_pca`). If `inplace=True`, updates `adata` with the - following fields for gene selection results…: + If `inplace=False`, separately returns the gene selection results (`hvg`) + and Pearson residual-based PCA results (`adata_pca`). If `inplace=True`, + updates `adata` with the following fields for gene selection results…: `.var['highly_variable']` boolean indicator of highly-variable genes `.var['means']` means per gene `.var['variances']` - variances per gene + variances per gene `.var['residual_variances']` - Pearson residual variance per gene. Averaged in the case of multiple batches. + Pearson residual variance per gene. Averaged in the case of multiple + batches. 
`.var['highly_variable_rank']` - Rank of the gene according to residual variance, median rank in the case of multiple batches + Rank of the gene according to residual variance, median rank in the + case of multiple batches `.var['highly_variable_nbatches']` - If batch_key is given, this denotes in how many batches genes are detected as HVG + If batch_key is given, this denotes in how many batches genes are + detected as HVG `.var['highly_variable_intersection']` - If batch_key is given, this denotes the genes that are highly variable in all batches - - …and the following fields for Pearson residual-based PCA results and normalization settings: - + If batch_key is given, this denotes the genes that are highly variable + in all batches + + …and the following fields for Pearson residual-based PCA results and + normalization settings: + `.uns['pearson_residuals_normalization']['pearson_residuals_df']` The hvg-subset, normalized by Pearson residuals `.uns['pearson_residuals_normalization']['theta']` The used value of the overdisperion parameter theta `.uns['pearson_residuals_normalization']['clip']` The used value of the clipping parameter - + `.obsm['pearson_residuals_X_pca']` - PCA representation of data after gene selection and Pearson residual normalization. + PCA representation of data after gene selection and Pearson residual + normalization. `.uns['pearson_residuals_pca']['PCs']` The principal components containing the loadings. `.uns['pearson_residuals_pca']['variance_ratio']` @@ -263,37 +273,40 @@ def recipe_pearson_residuals( `.uns['pearson_residuals_pca']['variance']` Explained variance, equivalent to the eigenvalues of the covariance matrix. 
- - + """ - - hvg_args = dict(flavor = 'pearson_residuals', - n_top_genes = n_top_genes, - batch_key = batch_key, - theta = theta, - clip = clip, - chunksize = chunksize) - + + hvg_args = dict( + flavor='pearson_residuals', + n_top_genes=n_top_genes, + batch_key=batch_key, + theta=theta, + clip=clip, + chunksize=chunksize, + ) + if inplace: - pp.highly_variable_genes(adata,**hvg_args,inplace = True) - adata_pca = adata[:,adata.var['highly_variable']].copy() ##TODO: are these copies needed? + pp.highly_variable_genes(adata, **hvg_args, inplace=True) + # TODO: are these copies needed? + adata_pca = adata[:, adata.var['highly_variable']].copy() else: - hvg = pp.highly_variable_genes(adata,**hvg_args,inplace = False) - adata_pca = adata[:,hvg['highly_variable']].copy()##TODO: are these copies needed? - - pp.normalize_pearson_residuals(adata_pca,theta = theta,clip = clip) - pp.pca(adata_pca,n_comps = n_comps_pca,random_state = random_state_pca) - + hvg = pp.highly_variable_genes(adata, **hvg_args, inplace=False) + # TODO: are these copies needed? 
+ adata_pca = adata[:, hvg['highly_variable']].copy() + + pp.normalize_pearson_residuals(adata_pca, theta=theta, clip=clip) + pp.pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca) + if inplace: - normalization_settings = adata_pca.uns['normalization_pearson_residuals'] - normalization_dict = dict(**normalization_settings, - pearson_residuals_df = adata_pca.to_df()) - pca_settings = adata_pca.uns['pca'] - pca_dict = dict(**pca_settings, - PCs = adata_pca.varm['PCs']) + normalization_param = adata_pca.uns['pearson_residuals_normalization'] + normalization_dict = dict( + **normalization_param, pearson_residuals_df=adata_pca.to_df() + ) + pca_param = adata_pca.uns['pca'] + pca_dict = dict(**pca_param, PCs=adata_pca.varm['PCs']) adata.uns['pearson_residuals_pca'] = pca_dict adata.uns['pearson_residuals_normalization'] = normalization_dict adata.obsm['pearson_residuals_X_pca'] = adata_pca.obsm['X_pca'] return None else: - return adata_pca, hvg \ No newline at end of file + return adata_pca, hvg From 977b6cf4fdb7487dc12b95b9644b6f832baa520e Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 5 Mar 2021 15:16:50 +0100 Subject: [PATCH 04/96] adapting _normalize_pearson_residuals() to cleaned-up _normalized_total() from #1667 --- scanpy/preprocessing/_normalization.py | 101 +++++++++---------------- scanpy/preprocessing/_recipes.py | 2 +- 2 files changed, 35 insertions(+), 68 deletions(-) diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index 1479bcb9b6..f78287e3a0 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -14,6 +14,8 @@ from scanpy.get import _get_obs_rep, _set_obs_rep from ._pca import pca +from scanpy.get import _get_obs_rep, _set_obs_rep + @@ -36,6 +38,7 @@ def _normalize_data(X, counts, after=None, copy=False): def _pearson_residuals(X, theta, clip, copy=False): X = X.copy() if copy else X + ##TODO can we avoid making this dense? 
X = X.toarray() if issparse(X) else X # check theta @@ -70,9 +73,8 @@ def normalize_pearson_residuals( adata: AnnData, theta: float = 100, clip: Union[Literal['auto', 'none'], float] = 'auto', - layers: Optional[Union[Literal['all'], Iterable[str]]] = None, - theta_per_layer: Optional[Dict[str, str]] = None, - clip_per_layer: Optional[Dict[str, Union[Literal['auto', 'none'], float]]] = None, + layer: Optional[str] = None, + copy: bool=False, inplace: bool = True, ) -> Optional[Dict[str, np.ndarray]]: """\ @@ -98,91 +100,56 @@ def normalize_pearson_residuals( * If any scalar c, residuals are clipped to the interval [-c, c]. Set `clip=np.Inf` for no clipping. - layers - List of layers to compute Pearson residuals of. Set to `'all'` to - compute for all layers. - theta_per_layer - Dict that specifies which theta is used for each layer: - - * If `None`, the provided `theta` is used for all layers. - * Otherwise, each layer with key `layer_key` is processed with the - theta value in `theta_per_layer[layer_key]`. - clip_per_layer - Dict that specifies clipping behavior for each layer : - - * If `None`, the provided `clip` variable is used for all layers. - * Otherwise, each layer with key `layer_key` is clipped according to - `clip_per_layer[layer_key]`. See `clip` above for possible values. - + layer + Layer to normalize instead of `X`. If `None`, `X` is normalized. inplace Whether to update `adata` or return dictionary with normalized copies of `adata.X` and `adata.layers`. + copy + Whether to modify copied input object. Not compatible with + `inplace=False`. Returns ------- - Returns dictionary with Pearson residuals of `adata.X` and `adata.layers` + Returns dictionary with Pearson residuals and settings or updates `adata` with normalized version of the original `adata.X` and `adata.layers`, depending on `inplace`. """ + + if copy: + if not inplace: + raise ValueError( + "`copy=True` cannot be used with `inplace=False`." 
+ ) + adata = adata.copy() - if layers == 'all': - layers = adata.layers.keys() # TODO: is this needed and if yes what for? # normalize_total() has it so I used it here + # TODO: add to other files as well?! view_to_actual(adata) + + X = _get_obs_rep(adata, layer=layer) ## TODO add to other files as well! + computed_on = layer if layer else 'adata.X' - # Handle X - msg = 'computing analytic Pearson residuals for adata.X' + msg = 'computing analytic Pearson residuals on %s' % computed_on start = logg.info(msg) + + residuals = _pearson_residuals(X, theta, clip, copy = ~inplace) + settings_dict = dict(theta=theta, clip=clip, computed_on=computed_on) + if inplace: - adata.X = _pearson_residuals(adata.X, theta, clip) - settings_dict = dict(theta=theta, clip=clip) - if theta_per_layer is not None: - settings_dict['theta_per_layer'] = theta_per_layer - if clip_per_layer is not None: - settings_dict['clip_per_layer'] = clip_per_layer + _set_obs_rep(adata,residuals,layer=layer) adata.uns['pearson_residuals_normalization'] = settings_dict - else: - dat = dict(X=_pearson_residuals(adata.X, theta, clip, copy=True)) - - # Handle layers - for layer_name in layers or (): - - msg = f'computing analytic Pearson residuals for layer {layer_name}' - _ = logg.info(msg) - - # Default to theta/clip if no layer-specific theta/clip given - if theta_per_layer is None: - layer_theta = theta - else: - layer_theta = theta_per_layer[layer_name] - if clip_per_layer is None: - layer_clip = clip - else: - layer_clip = clip_per_layer[layer_name] - - layer = adata.layers[layer_name] - - if inplace: - adata.layers[layer_name] = _pearson_residuals( - layer, layer_theta, layer_clip - ) - else: - dat[layer_name] = _pearson_residuals( - layer, layer_theta, layer_clip, copy=True - ) - - if not layers is None: - adata.uns['pearson_residuals_normalization'] = dict( - theta=theta, - clip=clip, - ) + results_dict = dict(X=residuals,**settings_dict) logg.info(' finished ({time_passed})', time=start) - return 
dat if not inplace else None + if copy: + return adata + elif not inplace: + return results_dict def normalize_pearson_residuals_pca( @@ -192,7 +159,7 @@ def normalize_pearson_residuals_pca( n_comps_pca: Optional[int] = 50, random_state_pca: Optional[float] = 0, use_highly_variable: bool = True, - inplace: bool = False, + inplace: bool = True, ) -> Optional[pd.DataFrame]: """\ diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py index 0f2ce4e994..9cffa0f2de 100644 --- a/scanpy/preprocessing/_recipes.py +++ b/scanpy/preprocessing/_recipes.py @@ -181,7 +181,7 @@ def recipe_pearson_residuals( batch_key: Optional[str] = None, n_comps_pca: Optional[int] = 50, random_state_pca: Optional[float] = 0, - inplace: bool = False, + inplace: bool = True, ) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]: """\ Applies gene selection based on Pearson residuals. On the resulting subset, From d8d724c2d96ac33ab40860b42a91ae4b92c66b8a Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 5 Mar 2021 15:40:33 +0100 Subject: [PATCH 05/96] updating layer management as in #1667 for _highly_variable_pearson_residuals() as well --- scanpy/preprocessing/_highly_variable_genes.py | 9 ++++++--- scanpy/preprocessing/_normalization.py | 8 ++------ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 78347b6fc3..9897ed4965 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -184,7 +184,7 @@ def _highly_variable_pearson_residuals( clip: Union[Literal['auto', 'none'], float] = 'auto', chunksize: int = 100, subset: bool = False, - inplace: bool = True, + inplace: bool = True ) -> Optional[pd.DataFrame]: """\ See `highly_variable_genes`. 
@@ -214,7 +214,9 @@ def _highly_variable_pearson_residuals( in all batches """ - X = adata.layers[layer] if layer is not None else adata.X + view_to_actual(adata) + X = _get_obs_rep(adata, layer=layer) + computed_on = layer if layer else 'adata.X' # Check for raw counts if check_nonnegative_integers(X) is False: @@ -328,7 +330,8 @@ def _highly_variable_pearson_residuals( df = df.loc[adata.var_names] if inplace or subset: - adata.uns['hvg'] = {'flavor': 'pearson_residuals'} + adata.uns['hvg'] = {'flavor': 'pearson_residuals', + 'computed_on':computed_on} logg.hint( 'added\n' ' \'highly_variable\', boolean vector (adata.var)\n' diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index f78287e3a0..5ff4f32049 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -124,12 +124,8 @@ def normalize_pearson_residuals( ) adata = adata.copy() - # TODO: is this needed and if yes what for? - # normalize_total() has it so I used it here - # TODO: add to other files as well?! - view_to_actual(adata) - - X = _get_obs_rep(adata, layer=layer) ## TODO add to other files as well! 
+ view_to_actual(adata) + X = _get_obs_rep(adata, layer=layer) computed_on = layer if layer else 'adata.X' msg = 'computing analytic Pearson residuals on %s' % computed_on From e23ea6cbbd243838ef0e26bc81be01cb939fd9fe Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 5 Mar 2021 16:07:22 +0100 Subject: [PATCH 06/96] slight performance improvement for sparse input --- scanpy/preprocessing/_normalization.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index 5ff4f32049..557f4d1b82 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -38,8 +38,6 @@ def _normalize_data(X, counts, after=None, copy=False): def _pearson_residuals(X, theta, clip, copy=False): X = X.copy() if copy else X - ##TODO can we avoid making this dense? - X = X.toarray() if issparse(X) else X # check theta if theta <= 0: @@ -56,12 +54,18 @@ def _pearson_residuals(X, theta, clip, copy=False): if check_nonnegative_integers(X) is False: raise ValueError("`pp.normalize_pearson_residuals` expects raw count data") - # get residuals - sums_genes = np.sum(X, axis=0, keepdims=True) - sums_cells = np.sum(X, axis=1, keepdims=True) - sum_total = np.sum(sums_genes) - mu = sums_cells @ sums_genes / sum_total - residuals = (X - mu) / np.sqrt(mu + mu ** 2 / theta) + if sp_sparse.issparse(X): + sums_genes = np.sum(X, axis=0) + sums_cells = np.sum(X, axis=1) + sum_total = np.sum(sums_genes).squeeze() + else: + sums_genes = np.sum(X, axis=0, keepdims=True) + sums_cells = np.sum(X, axis=1, keepdims=True) + sum_total = np.sum(sums_genes) + + mu = np.array(sums_cells @ sums_genes / sum_total) + diff = np.array(X - mu) + residuals = diff / np.sqrt(mu + mu ** 2 / theta) # clip residuals = np.clip(residuals, a_min=-clip, a_max=clip) From fc49c2580305705250e554a8f7cd3105747d240a Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Wed, 10 Mar 2021 14:57:11 +0100 
Subject: [PATCH 07/96] style cleanup --- scanpy/preprocessing/__init__.py | 15 ++++++-- .../preprocessing/_highly_variable_genes.py | 38 +++++++++---------- scanpy/preprocessing/_normalization.py | 25 +++++------- 3 files changed, 40 insertions(+), 38 deletions(-) diff --git a/scanpy/preprocessing/__init__.py b/scanpy/preprocessing/__init__.py index f1b4dad80c..8adee1f813 100644 --- a/scanpy/preprocessing/__init__.py +++ b/scanpy/preprocessing/__init__.py @@ -1,4 +1,9 @@ -from ._recipes import recipe_zheng17, recipe_weinreb17, recipe_seurat, recipe_pearson_residuals +from ._recipes import ( + recipe_zheng17, + recipe_weinreb17, + recipe_seurat, + recipe_pearson_residuals, +) from ._simple import filter_cells, filter_genes from ._deprecated.highly_variable_genes import filter_genes_dispersion from ._highly_variable_genes import highly_variable_genes @@ -7,6 +12,10 @@ from ._pca import pca from ._qc import calculate_qc_metrics from ._combat import combat -from ._normalization import normalize_total, normalize_pearson_residuals, normalize_pearson_residuals_pca +from ._normalization import ( + normalize_total, + normalize_pearson_residuals, + normalize_pearson_residuals_pca, +) -from ..neighbors import neighbors \ No newline at end of file +from ..neighbors import neighbors diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 9897ed4965..2c63b6161a 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -175,6 +175,7 @@ def _highly_variable_genes_seurat_v3( df = df.drop(['highly_variable_nbatches'], axis=1) return df + def _highly_variable_pearson_residuals( adata: AnnData, layer: Optional[str] = None, @@ -184,7 +185,7 @@ def _highly_variable_pearson_residuals( clip: Union[Literal['auto', 'none'], float] = 'auto', chunksize: int = 100, subset: bool = False, - inplace: bool = True + inplace: bool = True, ) -> Optional[pd.DataFrame]: """\ See 
`highly_variable_genes`. @@ -214,7 +215,7 @@ def _highly_variable_pearson_residuals( in all batches """ - view_to_actual(adata) + view_to_actual(adata) X = _get_obs_rep(adata, layer=layer) computed_on = layer if layer else 'adata.X' @@ -330,8 +331,7 @@ def _highly_variable_pearson_residuals( df = df.loc[adata.var_names] if inplace or subset: - adata.uns['hvg'] = {'flavor': 'pearson_residuals', - 'computed_on':computed_on} + adata.uns['hvg'] = {'flavor': 'pearson_residuals', 'computed_on': computed_on} logg.hint( 'added\n' ' \'highly_variable\', boolean vector (adata.var)\n' @@ -364,9 +364,7 @@ def _highly_variable_pearson_residuals( ['highly_variable_nbatches', 'highly_variable_intersection'], axis=1 ) return df - - - + def _highly_variable_genes_single_batch( adata: AnnData, @@ -481,7 +479,6 @@ def _highly_variable_genes_single_batch( return df - def highly_variable_genes( adata: AnnData, layer: Optional[str] = None, @@ -495,7 +492,9 @@ def highly_variable_genes( theta: float = 100, clip: Union[Literal['auto', 'none'], float] = 'auto', chunksize: int = 1000, - flavor: Literal['seurat', 'cell_ranger', 'seurat_v3', 'pearson_residuals'] = 'seurat', + flavor: Literal[ + 'seurat', 'cell_ranger', 'seurat_v3', 'pearson_residuals' + ] = 'seurat', subset: bool = False, inplace: bool = True, batch_key: Optional[str] = None, @@ -658,21 +657,20 @@ def highly_variable_genes( if flavor == 'pearson_residuals': if n_top_genes is None: raise ValueError( - "`pp.highly_variable_genes` requires the argument `n_top_genes`" - " for `flavor='pearson_residuals'`" + "`pp.highly_variable_genes` requires the argument `n_top_genes`" + " for `flavor='pearson_residuals'`" ) return _highly_variable_pearson_residuals( adata, - layer = layer, - n_top_genes = n_top_genes, - batch_key = batch_key, - theta = theta, - clip = clip, - chunksize= chunksize, - subset = subset, - inplace = inplace, + layer=layer, + n_top_genes=n_top_genes, + batch_key=batch_key, + theta=theta, + clip=clip, + 
chunksize=chunksize, + subset=subset, + inplace=inplace, ) - if batch_key is None: df = _highly_variable_genes_single_batch( diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index 557f4d1b82..be47494933 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -17,9 +17,6 @@ from scanpy.get import _get_obs_rep, _set_obs_rep - - - def _normalize_data(X, counts, after=None, copy=False): X = X.copy() if copy else X if issubclass(X.dtype.type, (int, np.integer)): @@ -78,7 +75,7 @@ def normalize_pearson_residuals( theta: float = 100, clip: Union[Literal['auto', 'none'], float] = 'auto', layer: Optional[str] = None, - copy: bool=False, + copy: bool = False, inplace: bool = True, ) -> Optional[Dict[str, np.ndarray]]: """\ @@ -120,29 +117,27 @@ def normalize_pearson_residuals( `adata.X` and `adata.layers`, depending on `inplace`. """ - + if copy: if not inplace: - raise ValueError( - "`copy=True` cannot be used with `inplace=False`." 
- ) + raise ValueError("`copy=True` cannot be used with `inplace=False`.") adata = adata.copy() - view_to_actual(adata) + view_to_actual(adata) X = _get_obs_rep(adata, layer=layer) computed_on = layer if layer else 'adata.X' msg = 'computing analytic Pearson residuals on %s' % computed_on start = logg.info(msg) - - residuals = _pearson_residuals(X, theta, clip, copy = ~inplace) + + residuals = _pearson_residuals(X, theta, clip, copy=~inplace) settings_dict = dict(theta=theta, clip=clip, computed_on=computed_on) - + if inplace: - _set_obs_rep(adata,residuals,layer=layer) + _set_obs_rep(adata, residuals, layer=layer) adata.uns['pearson_residuals_normalization'] = settings_dict else: - results_dict = dict(X=residuals,**settings_dict) + results_dict = dict(X=residuals, **settings_dict) logg.info(' finished ({time_passed})', time=start) @@ -246,7 +241,7 @@ def normalize_pearson_residuals_pca( return None else: return adata_pca - + def normalize_total( adata: AnnData, From f91f2fe11a95846506a820e84e6a3106628ce5ce Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 12 Mar 2021 16:18:46 +0100 Subject: [PATCH 08/96] fixing import issue, fixing docstring style, adding check_values param and warning as in #1642 --- .../preprocessing/_highly_variable_genes.py | 40 ++++++++++--------- scanpy/preprocessing/_recipes.py | 18 ++++----- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 2c63b6161a..7f0387e81f 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -9,7 +9,8 @@ from .. 
import logging as logg from .._settings import settings, Verbosity -from .._utils import sanitize_anndata, check_nonnegative_integers +from .._utils import sanitize_anndata, check_nonnegative_integers, view_to_actual +from scanpy.get import _get_obs_rep, _set_obs_rep from .._compat import Literal from ._utils import _get_mean_var from ._distributed import materialize_as_ndarray @@ -34,20 +35,20 @@ def _highly_variable_genes_seurat_v3( Returns ------- Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) or - updates `.var` with the following fields + updates `.var` with the following fields: highly_variable : bool - boolean indicator of highly-variable genes + boolean indicator of highly-variable genes. **means** - means per gene + means per gene. **variances** - variance per gene + variance per gene. **variances_norm** - normalized variance per gene, averaged in the case of multiple batches + normalized variance per gene, averaged in the case of multiple batches. highly_variable_rank : float - Rank of the gene according to normalized variance, median rank in the case of multiple batches + Rank of the gene according to normalized variance, median rank in the case of multiple batches. highly_variable_nbatches : int - If batch_key is given, this denotes in how many batches genes are detected as HVG + If batch_key is given, this denotes in how many batches genes are detected as HVG. 
""" try: @@ -184,6 +185,7 @@ def _highly_variable_pearson_residuals( theta: float = 100, clip: Union[Literal['auto', 'none'], float] = 'auto', chunksize: int = 100, + check_values: bool = True, subset: bool = False, inplace: bool = True, ) -> Optional[pd.DataFrame]: @@ -193,26 +195,26 @@ def _highly_variable_pearson_residuals( Returns ------- Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) - or updates `.var` with the following fields + or updates `.var` with the following fields: highly_variable - boolean indicator of highly-variable genes + boolean indicator of highly-variable genes. means - means per gene + means per gene. variances - variances per gene + variances per gene. residual_variances Pearson residual variance per gene. Averaged in the case of multiple batches. highly_variable_rank Rank of the gene according to residual variance, median rank in the - case of multiple batches + case of multiple batches. highly_variable_nbatches : int If batch_key is given, this denotes in how many batches genes are - detected as HVG + detected as HVG. highly_variable_intersection : bool If batch_key is given, this denotes the genes that are highly variable - in all batches + in all batches. """ view_to_actual(adata) @@ -220,10 +222,10 @@ def _highly_variable_pearson_residuals( computed_on = layer if layer else 'adata.X' # Check for raw counts - if check_nonnegative_integers(X) is False: - raise ValueError( - "`pp.highly_variable_genes` with `flavor='pearson_residuals'`" - "expects raw count data." 
+ if check_values and (check_nonnegative_integers(X) == False): + warnings.warn( + "`flavor='pearson_residuals'` expects raw count data, but non-integers were found.", + UserWarning, ) if batch_key is None: diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py index 9cffa0f2de..891bbb55ce 100644 --- a/scanpy/preprocessing/_recipes.py +++ b/scanpy/preprocessing/_recipes.py @@ -235,33 +235,33 @@ def recipe_pearson_residuals( updates `adata` with the following fields for gene selection results…: `.var['highly_variable']` - boolean indicator of highly-variable genes + boolean indicator of highly-variable genes. `.var['means']` - means per gene + means per gene. `.var['variances']` - variances per gene + variances per gene. `.var['residual_variances']` Pearson residual variance per gene. Averaged in the case of multiple batches. `.var['highly_variable_rank']` Rank of the gene according to residual variance, median rank in the - case of multiple batches + case of multiple batches. `.var['highly_variable_nbatches']` If batch_key is given, this denotes in how many batches genes are - detected as HVG + detected as HVG. `.var['highly_variable_intersection']` If batch_key is given, this denotes the genes that are highly variable - in all batches + in all batches. …and the following fields for Pearson residual-based PCA results and normalization settings: `.uns['pearson_residuals_normalization']['pearson_residuals_df']` - The hvg-subset, normalized by Pearson residuals + The hvg-subset, normalized by Pearson residuals. `.uns['pearson_residuals_normalization']['theta']` - The used value of the overdisperion parameter theta + The used value of the overdisperion parameter theta. `.uns['pearson_residuals_normalization']['clip']` - The used value of the clipping parameter + The used value of the clipping parameter. 
`.obsm['pearson_residuals_X_pca']` PCA representation of data after gene selection and Pearson residual From 60de21d3aaa7c236fe523d2b9ad737391ea4712e Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 12 Mar 2021 16:37:26 +0100 Subject: [PATCH 09/96] fixed small NameError, simplified clip argument --- scanpy/preprocessing/_highly_variable_genes.py | 10 +++++----- scanpy/preprocessing/_normalization.py | 14 +++++++------- scanpy/preprocessing/_recipes.py | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 7f0387e81f..1abc261c68 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -183,7 +183,7 @@ def _highly_variable_pearson_residuals( n_top_genes: int = 2000, batch_key: Optional[str] = None, theta: float = 100, - clip: Union[Literal['auto', 'none'], float] = 'auto', + clip: Optional[float] = None, chunksize: int = 100, check_values: bool = True, subset: bool = False, @@ -251,11 +251,11 @@ def _highly_variable_pearson_residuals( X_batch = adata_subset.X # Prepare clipping - if clip == 'auto': + if clip is None: n = X_batch.shape[0] clip = np.sqrt(n) if clip < 0: - raise ValueError("Pearson residuals require `clip>=0` or `clip='auto'`.") + raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.") if sp_sparse.issparse(X_batch): sums_genes = np.sum(X_batch, axis=0) @@ -492,7 +492,7 @@ def highly_variable_genes( span: Optional[float] = 0.3, n_bins: int = 20, theta: float = 100, - clip: Union[Literal['auto', 'none'], float] = 'auto', + clip: Optional[float] = None, chunksize: int = 1000, flavor: Literal[ 'seurat', 'cell_ranger', 'seurat_v3', 'pearson_residuals' @@ -564,7 +564,7 @@ def highly_variable_genes( clip If `flavor='pearson_residuals'`, this determines if and how residuals are clipped: - * If `'auto'`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], + * 
If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). * If any scalar c, residuals are clipped to the interval [-c, c]. Set `clip=np.Inf` for no clipping. diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index be47494933..5d1e65de1b 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -42,16 +42,16 @@ def _pearson_residuals(X, theta, clip, copy=False): # then only theta=0 were undefined.. raise ValueError('Pearson residuals require theta > 0') # prepare clipping - if clip == 'auto': + if clip is None: n = X.shape[0] clip = np.sqrt(n) if clip < 0: - raise ValueError("Pearson residuals require `clip>=0` or `clip='auto'`.") + raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.") if check_nonnegative_integers(X) is False: raise ValueError("`pp.normalize_pearson_residuals` expects raw count data") - if sp_sparse.issparse(X): + if issparse(X): sums_genes = np.sum(X, axis=0) sums_cells = np.sum(X, axis=1) sum_total = np.sum(sums_genes).squeeze() @@ -73,7 +73,7 @@ def _pearson_residuals(X, theta, clip, copy=False): def normalize_pearson_residuals( adata: AnnData, theta: float = 100, - clip: Union[Literal['auto', 'none'], float] = 'auto', + clip: Optional[float] = None, layer: Optional[str] = None, copy: bool = False, inplace: bool = True, @@ -95,7 +95,7 @@ def normalize_pearson_residuals( clip Determines if and how residuals are clipped: - * If `'auto'`, residuals are clipped to the interval + * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). * If any scalar c, residuals are clipped to the interval [-c, c]. 
Set @@ -150,7 +150,7 @@ def normalize_pearson_residuals( def normalize_pearson_residuals_pca( adata: AnnData, theta: float = 100, - clip: Union[Literal['auto', 'none'], float] = 'auto', + clip: Optional[float] = None, n_comps_pca: Optional[int] = 50, random_state_pca: Optional[float] = 0, use_highly_variable: bool = True, @@ -180,7 +180,7 @@ def normalize_pearson_residuals_pca( clip This determines if and how Pearson residuals are clipped: - * If `'auto'`, residuals are clipped to the interval + * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). * If any scalar c, residuals are clipped to the interval [-c, c]. Set diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py index 891bbb55ce..ef56cc48ca 100644 --- a/scanpy/preprocessing/_recipes.py +++ b/scanpy/preprocessing/_recipes.py @@ -176,7 +176,7 @@ def recipe_pearson_residuals( adata: AnnData, n_top_genes: int = 1000, theta: float = 100, - clip: Union[Literal['auto', 'none'], float] = 'auto', + clip: Optional[float] = None, chunksize: int = 1000, batch_key: Optional[str] = None, n_comps_pca: Optional[int] = 50, @@ -208,7 +208,7 @@ def recipe_pearson_residuals( clip This determines if and how Pearson residuals are clipped: - * If `'auto'`, residuals are clipped to the interval + * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). * If any scalar c, residuals are clipped to the interval [-c, c]. 
Set From 1f86989b991c32642fdc63742c4044caa25aa695 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 12 Mar 2021 17:53:02 +0100 Subject: [PATCH 10/96] remove pd.categorical() --- scanpy/preprocessing/_highly_variable_genes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 1abc261c68..83a93eb675 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -229,7 +229,7 @@ def _highly_variable_pearson_residuals( ) if batch_key is None: - batch_info = pd.Categorical(np.zeros(adata.shape[0], dtype=int)) + batch_info = np.zeros(adata.shape[0], dtype=int) else: batch_info = adata.obs[batch_key].values n_batches = len(np.unique(batch_info)) From 95ec0e5074c6ba5863315f19388514d3d7c0b5d2 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 12 Mar 2021 18:27:09 +0100 Subject: [PATCH 11/96] adding check_values to docstrings and remaining pearson residual functions --- .../preprocessing/_highly_variable_genes.py | 3 ++- scanpy/preprocessing/_normalization.py | 27 +++++++++++++------ scanpy/preprocessing/_recipes.py | 8 +++++- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 83a93eb675..e9ba0609bd 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -593,7 +593,7 @@ def highly_variable_genes( based on within-batch residual variance. check_values Check if counts in selected layer are integers. A Warning is returned if set to True. - Only used if `flavor='seurat_v3'`. + Only used if `flavor='seurat_v3'` or `flavor='pearson_residuals'`. 
Returns @@ -671,6 +671,7 @@ def highly_variable_genes( clip=clip, chunksize=chunksize, subset=subset, + check_values=check_values, inplace=inplace, ) diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index 5d1e65de1b..105b5d6aad 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -32,7 +32,7 @@ def _normalize_data(X, counts, after=None, copy=False): return X -def _pearson_residuals(X, theta, clip, copy=False): +def _pearson_residuals(X, theta, clip, check_values, copy=False): X = X.copy() if copy else X @@ -48,8 +48,11 @@ def _pearson_residuals(X, theta, clip, copy=False): if clip < 0: raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.") - if check_nonnegative_integers(X) is False: - raise ValueError("`pp.normalize_pearson_residuals` expects raw count data") + if check_values and (check_nonnegative_integers(X) == False): + warnings.warn( + "`normalize_pearson_residuals()` expects raw count data, but non-integers were found.", + UserWarning, + ) if issparse(X): sums_genes = np.sum(X, axis=0) @@ -76,6 +79,7 @@ def normalize_pearson_residuals( clip: Optional[float] = None, layer: Optional[str] = None, copy: bool = False, + check_values: bool = True, inplace: bool = True, ) -> Optional[Dict[str, np.ndarray]]: """\ @@ -103,12 +107,14 @@ def normalize_pearson_residuals( layer Layer to normalize instead of `X`. If `None`, `X` is normalized. - inplace - Whether to update `adata` or return dictionary with normalized copies - of `adata.X` and `adata.layers`. copy Whether to modify copied input object. Not compatible with `inplace=False`. + check_values + Check if counts in selected layer are integers. A Warning is returned if set to True. + inplace + Whether to update `adata` or return dictionary with normalized copies + of `adata.X` and `adata.layers`. 
Returns ------- @@ -130,7 +136,7 @@ def normalize_pearson_residuals( msg = 'computing analytic Pearson residuals on %s' % computed_on start = logg.info(msg) - residuals = _pearson_residuals(X, theta, clip, copy=~inplace) + residuals = _pearson_residuals(X, theta, clip, check_values, copy=~inplace) settings_dict = dict(theta=theta, clip=clip, computed_on=computed_on) if inplace: @@ -154,6 +160,7 @@ def normalize_pearson_residuals_pca( n_comps_pca: Optional[int] = 50, random_state_pca: Optional[float] = 0, use_highly_variable: bool = True, + check_values: bool = True, inplace: bool = True, ) -> Optional[pd.DataFrame]: @@ -190,6 +197,8 @@ def normalize_pearson_residuals_pca( Number of principal components to compute. random_state_pca Change to use different initial states for the optimization. + check_values + Check if counts in selected layer are integers. A Warning is returned if set to True. inplace Whether to place results in `adata` or return them. @@ -227,7 +236,9 @@ def normalize_pearson_residuals_pca( # TODO: are these copies needed? adata_pca = adata.copy() - normalize_pearson_residuals(adata_pca, theta=theta, clip=clip) + normalize_pearson_residuals( + adata_pca, theta=theta, clip=clip, check_values=check_values + ) pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca) if inplace: diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py index ef56cc48ca..5ecd58adea 100644 --- a/scanpy/preprocessing/_recipes.py +++ b/scanpy/preprocessing/_recipes.py @@ -181,6 +181,7 @@ def recipe_pearson_residuals( batch_key: Optional[str] = None, n_comps_pca: Optional[int] = 50, random_state_pca: Optional[float] = 0, + check_values: bool = True, inplace: bool = True, ) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]: """\ @@ -225,6 +226,8 @@ def recipe_pearson_residuals( Number of principal components to compute. random_state_pca Change to use different initial states for the optimization. 
+ check_values + Check if counts in selected layer are integers. A Warning is returned if set to True. inplace Whether to place results in `adata` or return them. @@ -283,6 +286,7 @@ def recipe_pearson_residuals( theta=theta, clip=clip, chunksize=chunksize, + check_values=check_values, ) if inplace: @@ -294,7 +298,9 @@ def recipe_pearson_residuals( # TODO: are these copies needed? adata_pca = adata[:, hvg['highly_variable']].copy() - pp.normalize_pearson_residuals(adata_pca, theta=theta, clip=clip) + pp.normalize_pearson_residuals( + adata_pca, theta=theta, clip=clip, check_values=check_values + ) pp.pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca) if inplace: From ff822905ad88be11dee82d760c1c8fdd88066b7f Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 12 Mar 2021 20:05:31 +0100 Subject: [PATCH 12/96] np.empty instead of np.nan --- scanpy/preprocessing/_highly_variable_genes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index e9ba0609bd..e971ff52d1 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -267,7 +267,7 @@ def _highly_variable_pearson_residuals( sum_total = np.sum(sums_genes) # Compute pearson residuals in chunks - residual_gene_var = np.ones((X_batch.shape[1])) * np.nan + residual_gene_var = np.empty((X_batch.shape[1])) for start in np.arange(0, X_batch.shape[1], chunksize): stop = start + chunksize mu = np.array(sums_cells @ sums_genes[:, start:stop] / sum_total) From f7f7dbdc9490c639233b49708ade4acd9844425a Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Mon, 15 Mar 2021 14:23:20 +0100 Subject: [PATCH 13/96] add references to docstrings, add HVG details to docstring --- docs/references.rst | 4 ++++ scanpy/preprocessing/_highly_variable_genes.py | 12 +++++++++--- scanpy/preprocessing/_normalization.py | 6 ++++++ scanpy/preprocessing/_recipes.py | 3 +++ 
4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/docs/references.rst b/docs/references.rst index 6d05613328..458534a3a3 100644 --- a/docs/references.rst +++ b/docs/references.rst @@ -119,6 +119,10 @@ References *Laplacian Dynamics and Multiscale Modular Structure in Networks* `arXiv `__. +.. [Lause20] Lause *et al.* (2020) + *Analytic Pearson residuals for normalization of single-cell RNA-seq UMI data*, + `BioRxiv `__. + .. [Leek12] Leek *et al.* (2012), *sva: Surrogate Variable Analysis. R package* `Bioconductor `__. diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index e971ff52d1..f899eb2f3b 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -505,11 +505,12 @@ def highly_variable_genes( """\ Annotate highly variable genes [Satija15]_ [Zheng17]_ [Stuart19]_. - Expects logarithmized data, except when `flavor='seurat_v3'` in which - count data is expected. + Expects logarithmized data, except when `flavor='seurat_v3'` or + `flavor='pearson_residuals'`, in which count data is expected. Depending on `flavor`, this reproduces the R-implementations of Seurat - [Satija15]_, Cell Ranger [Zheng17]_, and Seurat v3 [Stuart19]_. + [Satija15]_, Cell Ranger [Zheng17]_, and Seurat v3 [Stuart19]_, or uses + analytical Peason residuals [Lause20]_. For the dispersion-based methods ([Satija15]_ and [Zheng17]_), the normalized dispersion is obtained by scaling with the mean and standard deviation of @@ -522,6 +523,11 @@ def highly_variable_genes( standard deviation. Next, the normalized variance is computed as the variance of each gene after the transformation. Genes are ranked by the normalized variance. + For [Lause20]_, Pearson residuals of a negative binomial offset model (with + overdispersion theta shared across genes) are computed. By default, overdispersion + theta=100 is used and residuals are clipped to sqrt(n). 
Finally, genes are ranked + by residual variance. + Parameters ---------- adata diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index 105b5d6aad..01f051096d 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -87,6 +87,9 @@ def normalize_pearson_residuals( model with overdispersion theta shared across genes. By default, residuals are clipped to sqrt(n) and overdispersion theta=100 is used. + Based on "Analytic Pearson residuals for normalization of single-cell + RNA-seq UMI data", bioRxiv, [Lause20]_. + Params ------ adata @@ -169,6 +172,9 @@ def normalize_pearson_residuals_pca( subset of highly variable genes in `adata.var['highly_variable']` by default. + This workflow is based on "Analytic Pearson residuals for normalization of + single-cell RNA-seq UMI data", bioRxiv, [Lause20]_. + Parameters ---------- diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py index 5ecd58adea..c03d2af737 100644 --- a/scanpy/preprocessing/_recipes.py +++ b/scanpy/preprocessing/_recipes.py @@ -188,6 +188,9 @@ def recipe_pearson_residuals( Applies gene selection based on Pearson residuals. On the resulting subset, Pearson residual normalization and PCA are performed. + This recipe is based on "Analytic Pearson residuals for normalization of + single-cell RNA-seq UMI data", bioRxiv, [Lause20]_. 
+ Parameters ---------- From af0a8255cdfd487a5dd060e90a7ab5e95e8d8bef Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Mon, 15 Mar 2021 15:03:31 +0100 Subject: [PATCH 14/96] exposing pca keyword arguments to the user for the bundle/recipe functions --- scanpy/preprocessing/_normalization.py | 5 ++++- scanpy/preprocessing/_recipes.py | 7 +++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index 01f051096d..409fa7b7b7 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -163,6 +163,7 @@ def normalize_pearson_residuals_pca( n_comps_pca: Optional[int] = 50, random_state_pca: Optional[float] = 0, use_highly_variable: bool = True, + kwargs_pca: Optional[dict] = None, check_values: bool = True, inplace: bool = True, ) -> Optional[pd.DataFrame]: @@ -203,6 +204,8 @@ def normalize_pearson_residuals_pca( Number of principal components to compute. random_state_pca Change to use different initial states for the optimization. + kwargs_pca + Dictionary of further keyword arguments passed on to `sc.pp.pca()`. check_values Check if counts in selected layer are integers. A Warning is returned if set to True. 
inplace @@ -245,7 +248,7 @@ def normalize_pearson_residuals_pca( normalize_pearson_residuals( adata_pca, theta=theta, clip=clip, check_values=check_values ) - pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca) + pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca, **kwargs_pca) if inplace: norm_settings = adata_pca.uns['pearson_residuals_normalization'] diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py index c03d2af737..f68978ba41 100644 --- a/scanpy/preprocessing/_recipes.py +++ b/scanpy/preprocessing/_recipes.py @@ -181,6 +181,7 @@ def recipe_pearson_residuals( batch_key: Optional[str] = None, n_comps_pca: Optional[int] = 50, random_state_pca: Optional[float] = 0, + kwargs_pca: Optional[dict] = None, check_values: bool = True, inplace: bool = True, ) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]: @@ -229,13 +230,15 @@ def recipe_pearson_residuals( Number of principal components to compute. random_state_pca Change to use different initial states for the optimization. + kwargs_pca + Dictionary of further keyword arguments passed on to `sc.pp.pca()`. check_values Check if counts in selected layer are integers. A Warning is returned if set to True. inplace Whether to place results in `adata` or return them. Returns - ------- + ------ If `inplace=False`, separately returns the gene selection results (`hvg`) and Pearson residual-based PCA results (`adata_pca`). 
If `inplace=True`, updates `adata` with the following fields for gene selection results…: @@ -304,7 +307,7 @@ def recipe_pearson_residuals( pp.normalize_pearson_residuals( adata_pca, theta=theta, clip=clip, check_values=check_values ) - pp.pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca) + pp.pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca, **kwargs_pca) if inplace: normalization_param = adata_pca.uns['pearson_residuals_normalization'] From 142eaca0938dc153909dd6b2a7f06d3c4f5eadb1 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Mon, 15 Mar 2021 16:15:19 +0100 Subject: [PATCH 15/96] removed unneeded reversal in hvg, fix kwargs_pca bug, consistent defaults across files --- scanpy/preprocessing/_highly_variable_genes.py | 10 ++++++---- scanpy/preprocessing/_normalization.py | 2 +- scanpy/preprocessing/_recipes.py | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index f899eb2f3b..34a8e2ca5e 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -180,7 +180,7 @@ def _highly_variable_genes_seurat_v3( def _highly_variable_pearson_residuals( adata: AnnData, layer: Optional[str] = None, - n_top_genes: int = 2000, + n_top_genes: int = 1000, batch_key: Optional[str] = None, theta: float = 100, clip: Optional[float] = None, @@ -287,9 +287,11 @@ def _highly_variable_pearson_residuals( residual_gene_vars = np.concatenate(residual_gene_vars, axis=0) # Get cutoffs and define hvgs per batch - residual_gene_vars_sorted = np.sort(residual_gene_vars, axis=1)[:, ::-1] - cutoffs_per_batch = residual_gene_vars_sorted[:, n_top_genes] - highly_variable_per_batch = np.greater(residual_gene_vars.T, cutoffs_per_batch).T + residual_gene_vars_sorted = np.sort(residual_gene_vars, axis=1) + cutoffs_per_batch = residual_gene_vars_sorted[:, -n_top_genes] + highly_variable_per_batch = 
np.greater_equal( + residual_gene_vars.T, cutoffs_per_batch + ).T # Merge hvgs across batches highly_variable_nbatches = np.sum(highly_variable_per_batch, axis=0) diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index 409fa7b7b7..ffba1e771f 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -163,7 +163,7 @@ def normalize_pearson_residuals_pca( n_comps_pca: Optional[int] = 50, random_state_pca: Optional[float] = 0, use_highly_variable: bool = True, - kwargs_pca: Optional[dict] = None, + kwargs_pca: Optional[dict] = {}, check_values: bool = True, inplace: bool = True, ) -> Optional[pd.DataFrame]: diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py index f68978ba41..7393e1a4ac 100644 --- a/scanpy/preprocessing/_recipes.py +++ b/scanpy/preprocessing/_recipes.py @@ -181,7 +181,7 @@ def recipe_pearson_residuals( batch_key: Optional[str] = None, n_comps_pca: Optional[int] = 50, random_state_pca: Optional[float] = 0, - kwargs_pca: Optional[dict] = None, + kwargs_pca: dict = {}, check_values: bool = True, inplace: bool = True, ) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]: From 541b252deab0f99f50085d9802e810eb5bb46b47 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 11 Jun 2021 17:52:13 +0200 Subject: [PATCH 16/96] fixing handling of `inplace` and `subset` arguments (see issue #1886), explicit typing of output, adding theta input check --- .../preprocessing/_highly_variable_genes.py | 34 ++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 34a8e2ca5e..fe9277f3b7 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -197,18 +197,18 @@ def _highly_variable_pearson_residuals( Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) or updates 
`.var` with the following fields: - highly_variable + highly_variable : bool boolean indicator of highly-variable genes. - means + means : float means per gene. - variances + variances : float variances per gene. - residual_variances + residual_variances : float Pearson residual variance per gene. Averaged in the case of multiple batches. - highly_variable_rank + highly_variable_rank : float Rank of the gene according to residual variance, median rank in the - case of multiple batches. + case of multiple batches. NaN for non-HVGs. highly_variable_nbatches : int If batch_key is given, this denotes in how many batches genes are detected as HVG. @@ -227,6 +227,12 @@ def _highly_variable_pearson_residuals( "`flavor='pearson_residuals'` expects raw count data, but non-integers were found.", UserWarning, ) + # check theta + if theta <= 0: + # TODO: would "underdispersion" with negative theta make sense? + # then only theta=0 were undefined.. + raise ValueError('Pearson residuals require theta > 0') + # prepare clipping if batch_key is None: batch_info = np.zeros(adata.shape[0], dtype=int) @@ -312,9 +318,11 @@ def _highly_variable_pearson_residuals( dict( means=means, variances=variances, - residual_variances=np.mean(residual_gene_vars, axis=0), + residual_variances=np.mean(residual_gene_vars, axis=0).astype( + np.float32, copy=False + ), highly_variable_rank=medianrank_residual_var, - highly_variable_nbatches=highly_variable_nbatches, + highly_variable_nbatches=highly_variable_nbatches.astype(np.int64), highly_variable_intersection=highly_variable_intersection, ) ) @@ -334,7 +342,7 @@ def _highly_variable_pearson_residuals( # (also for flavor = seurat and cellranger..) 
df = df.loc[adata.var_names] - if inplace or subset: + if inplace: adata.uns['hvg'] = {'flavor': 'pearson_residuals', 'computed_on': computed_on} logg.hint( 'added\n' @@ -350,9 +358,8 @@ def _highly_variable_pearson_residuals( adata.var['highly_variable_rank'] = df['highly_variable_rank'].values adata.var['means'] = df['means'].values adata.var['variances'] = df['variances'].values - adata.var['residual_variances'] = df['residual_variances'].values.astype( - 'float64', copy=False - ) + adata.var['residual_variances'] = df['residual_variances'] + if batch_key is not None: adata.var['highly_variable_nbatches'] = df[ 'highly_variable_nbatches' @@ -367,6 +374,9 @@ def _highly_variable_pearson_residuals( df = df.drop( ['highly_variable_nbatches', 'highly_variable_intersection'], axis=1 ) + if subset: + df = df.iloc[df.highly_variable.values, :] + return df From fdd500be232260ad9e5c56c53813ea3a305edac8 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 11 Jun 2021 17:59:26 +0200 Subject: [PATCH 17/96] renaming output fields for consistency, fixing minor bug --- scanpy/preprocessing/_normalization.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index ffba1e771f..1f55e179e6 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -49,7 +49,7 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False): raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.") if check_values and (check_nonnegative_integers(X) == False): - warnings.warn( + warn( "`normalize_pearson_residuals()` expects raw count data, but non-integers were found.", UserWarning, ) @@ -225,7 +225,7 @@ def normalize_pearson_residuals_pca( `.uns['pearson_residuals_normalization']['clip']` The used value of the clipping parameter - `.obsm['pearson_residuals_X_pca']` + `.obsm['X_pearson_residuals_pca']` PCA representation of data after 
gene selection and Pearson residual normalization. `.uns['pearson_residuals_pca']['PCs']` @@ -257,9 +257,13 @@ def normalize_pearson_residuals_pca( pca_dict = dict(**pca_settings, PCs=adata_pca.varm['PCs']) adata.uns['pearson_residuals_pca'] = pca_dict adata.uns['pearson_residuals_normalization'] = norm_dict - adata.obsm['pearson_residuals_X_pca'] = adata_pca.obsm['X_pca'] + adata.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'] return None else: + adata_pca.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'].copy() + adata_pca.uns['pearson_residuals_pca'] = adata_pca.uns['pca'].copy() + del adata_pca.obsm['X_pca'] + del adata_pca.uns['pca'] return adata_pca From c6dfc1de5af8c7f000d9f5f3160b16be0b5395ef Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 11 Jun 2021 17:59:59 +0200 Subject: [PATCH 18/96] renaming output fields for consistency --- scanpy/preprocessing/_recipes.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py index 7393e1a4ac..6d61c24bec 100644 --- a/scanpy/preprocessing/_recipes.py +++ b/scanpy/preprocessing/_recipes.py @@ -318,7 +318,11 @@ def recipe_pearson_residuals( pca_dict = dict(**pca_param, PCs=adata_pca.varm['PCs']) adata.uns['pearson_residuals_pca'] = pca_dict adata.uns['pearson_residuals_normalization'] = normalization_dict - adata.obsm['pearson_residuals_X_pca'] = adata_pca.obsm['X_pca'] + adata.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'] return None else: + adata_pca.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'].copy() + adata_pca.uns['pearson_residuals_pca'] = adata_pca.uns['pca'].copy() + del adata_pca.obsm['X_pca'] + del adata_pca.uns['pca'] return adata_pca, hvg From dc27c9f6c9caef7ae4df0447c9fc8f5ead462411 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 11 Jun 2021 18:01:36 +0200 Subject: [PATCH 19/96] adding function that prepares testdata (used for pearson residual tests) --- 
scanpy/tests/helpers.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/scanpy/tests/helpers.py b/scanpy/tests/helpers.py index 61fc35e23e..b7e97c36dd 100644 --- a/scanpy/tests/helpers.py +++ b/scanpy/tests/helpers.py @@ -83,3 +83,26 @@ def check_rep_results(func, X, *, fields=["layer", "obsm"], **kwargs): assert_equal(adatas_proc[field_a], adatas_proc[field_b]) for field in fields: assert_equal(adata_X, adatas_proc[field]) + + +def _prepare_pbmc_testdata(sparsity_func, dtype, small=False): + """Prepares 3k PBMC dataset with batch key `batch` and defined datatype/sparsity. + + Params + ------ + sparsity_func + sparsity function applied to adata.X (e.g. csr_matrix.toarray for dense or csr_matrix for sparse) + dtype + numpy dtype applied to adata.X (e.g. 'float32' or 'int64') + small + False (default) returns full data, True returns small subset of the data.""" + + adata = sc.datasets.pbmc3k() + if small: + adata = adata[:1000, :500] + sc.pp.filter_cells(adata, min_genes=1) + np.random.seed(42) + adata.obs['batch'] = np.random.randint(0, 3, size=adata.shape[0]) + sc.pp.filter_genes(adata, min_cells=1) + adata.X = sparsity_func(adata.X.astype(dtype)) + return adata From aef44d8b1f8106976e6ceaca0fbe1a8e4a730220 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 11 Jun 2021 18:02:55 +0200 Subject: [PATCH 20/96] adding tests for all pearson residual functions --- scanpy/tests/test_highly_variable_genes.py | 270 +++++++++++++++++++ scanpy/tests/test_normalization.py | 293 ++++++++++++++++++++- 2 files changed, 562 insertions(+), 1 deletion(-) diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py index 8b3e4f52c2..69703376cf 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -3,6 +3,9 @@ import numpy as np import scanpy as sc from pathlib import Path +from scipy.sparse import csr_matrix +from scanpy.tests.helpers import _prepare_pbmc_testdata 
+import warnings FILE = Path(__file__).parent / Path('_scripts/seurat_hvg.csv') FILE_V3 = Path(__file__).parent / Path('_scripts/seurat_hvg_v3.csv.gz') @@ -54,6 +57,273 @@ def test_highly_variable_genes_basic(): assert np.all(np.isin(colnames, hvg_df.columns)) +def _residual_var_reference(adata, clip=None, theta=100): + sc.pp.normalize_pearson_residuals(adata, clip=clip, theta=theta) + residuals = adata.X + return np.var(residuals, axis=0) + + +@pytest.mark.parametrize( + 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__ +) +@pytest.mark.parametrize('dtype', ['float32', 'int64']) +def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtype): + + adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True) + + # depending on check_values, warnings should be raised for non-integer data + if dtype == 'float32': + + adata_noninteger = adata.copy() + x, y = np.nonzero(adata_noninteger.X) + adata_noninteger.X[x[0], y[0]] = 0.5 + nonint_warn_msg = "`flavor='pearson_residuals'` expects raw count data, but non-integers were found." 
+ + # expecting 0 no-int warnings + with warnings.catch_warnings(record=True) as record: + sc.pp.highly_variable_genes( + adata_noninteger.copy(), + flavor='pearson_residuals', + n_top_genes=100, + check_values=False, + ) + nonint_warnings = [ + warning.message.args[0] == nonint_warn_msg for warning in record + ] + assert np.sum(nonint_warnings) == 0 + + # expecting 1 no-int warning + with warnings.catch_warnings(record=True) as record: + sc.pp.highly_variable_genes( + adata_noninteger.copy(), + flavor='pearson_residuals', + n_top_genes=100, + check_values=True, + ) + nonint_warnings = np.array( + [warning.message.args[0] == nonint_warn_msg for warning in record] + ) + assert np.sum(nonint_warnings) == 1 + + # errors should be raised for invalid theta values + with pytest.raises(ValueError) as record: + sc.pp.highly_variable_genes( + adata.copy(), flavor='pearson_residuals', n_top_genes=100, theta=0 + ) + with pytest.raises(ValueError) as record: + sc.pp.highly_variable_genes( + adata.copy(), flavor='pearson_residuals', n_top_genes=100, theta=-1 + ) + + # error should be raised for invalid clipping values + with pytest.raises(ValueError) as record: + sc.pp.highly_variable_genes( + adata.copy(), flavor='pearson_residuals', n_top_genes=100, clip=-1 + ) + + +@pytest.mark.parametrize( + 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__ +) +@pytest.mark.parametrize('dtype', ['float32', 'int64']) +@pytest.mark.parametrize('subset', [True, False]) +@pytest.mark.parametrize('inplace', [True, False]) +@pytest.mark.parametrize('clip', [None, np.Inf, 30]) +@pytest.mark.parametrize('theta', [100, np.Inf]) +def test_highly_variable_genes_pearson_residuals_values( + subset, inplace, sparsity_func, dtype, clip, theta +): + + n_top_genes = 100 + adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True) + # compute reference output + residual_variances_reference = _residual_var_reference( + adata.copy(), clip=clip, theta=theta + ) + if subset: + # 
lazyly sort by residual variance and take top N + top_n_idx = np.argsort(-residual_variances_reference)[:n_top_genes] + # (results in sorted "gene order" in reference) + residual_variances_reference = residual_variances_reference[top_n_idx] + # compute output to be tested + output = sc.pp.highly_variable_genes( + adata, + flavor='pearson_residuals', + n_top_genes=n_top_genes, + subset=subset, + inplace=inplace, + clip=clip, + theta=theta, + ) + + # depending on inplace, check adata.var or output + if inplace: + assert output is None + output_df = adata.var + else: + output_df = output + + # consistency with normalization method + if subset: + # sort values before comparing as reference is sorted as well for subset case + sort_output_idx = np.argsort(-output_df['residual_variances'].values) + assert np.allclose( + output_df['residual_variances'].values[sort_output_idx], + residual_variances_reference, + ) + else: + assert np.allclose( + output_df['residual_variances'].values, residual_variances_reference + ) + + +@pytest.mark.parametrize( + 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__ +) +@pytest.mark.parametrize('dtype', ['float32', 'int64']) +@pytest.mark.parametrize('subset', [True, False]) +@pytest.mark.parametrize('inplace', [True, False]) +def test_highly_variable_genes_pearson_residuals_general( + subset, + inplace, + sparsity_func, + dtype, +): + + n_top_genes = 1000 + + adata = _prepare_pbmc_testdata(sparsity_func, dtype) + # compute reference output + residual_variances_reference = _residual_var_reference(adata.copy()) + if subset: + # lazyly sort by residual variance and take top N + top_n_idx = np.argsort(-residual_variances_reference)[:n_top_genes] + # (results in sorted "gene order" in reference) + residual_variances_reference = residual_variances_reference[top_n_idx] + # compute output to be tested + output = sc.pp.highly_variable_genes( + adata, + flavor='pearson_residuals', + n_top_genes=n_top_genes, + subset=subset, + 
inplace=inplace, + ) + + # depending on inplace, check adata.var or output + if inplace: + assert output is None + output_df = adata.var + else: + output_df = output + + # check output is complete + for key in [ + 'highly_variable', + 'means', + 'variances', + 'residual_variances', + 'highly_variable_rank', + ]: + assert key in output_df.keys() + + # check residual variances + assert output_df['residual_variances'].values.dtype is np.dtype('float32') + # consistency with normalization method + if subset: + # sort values before comparing as reference is sorted as well for subset case + sort_output_idx = np.argsort(-output_df['residual_variances'].values) + assert np.allclose( + output_df['residual_variances'].values[sort_output_idx], + residual_variances_reference, + ) + else: + assert np.allclose( + output_df['residual_variances'].values, residual_variances_reference + ) + + # check hvg flag + assert output_df['highly_variable'].values.dtype is np.dtype('bool') + assert np.sum(output_df['highly_variable']) == n_top_genes + hvg_idx = np.where(output_df['highly_variable'])[0] + topn_idx = np.sort( + np.argsort(-output_df['residual_variances'].values)[:n_top_genes] + ) + assert np.all(hvg_idx == topn_idx) + + # check ranks + assert np.nanmin(output_df['highly_variable_rank'].values) == 0 + assert np.nanmax(output_df['highly_variable_rank'].values) <= n_top_genes - 1 + + +@pytest.mark.parametrize( + 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__ +) +@pytest.mark.parametrize('dtype', ['float32', 'int64']) +@pytest.mark.parametrize('subset', [True, False]) +@pytest.mark.parametrize('inplace', [True, False]) +def test_highly_variable_genes_pearson_residuals_batch( + subset, inplace, sparsity_func, dtype +): + + n_top_genes = 1000 + + adata = _prepare_pbmc_testdata(sparsity_func, dtype) + n_genes = adata.shape[1] + + output = sc.pp.highly_variable_genes( + adata, + flavor='pearson_residuals', + n_top_genes=n_top_genes, + batch_key='batch', + 
subset=subset, + inplace=inplace, + ) + + # depending on inplace, check adata.var or output + if inplace: + assert output is None + output_df = adata.var + else: + output_df = output + + # check output is complete + for key in [ + 'highly_variable', + 'means', + 'variances', + 'residual_variances', + 'highly_variable_rank', + 'highly_variable_nbatches', + 'highly_variable_intersection', + ]: + assert key in output_df.keys() + + # check hvg flag + assert output_df['highly_variable'].values.dtype is np.dtype('bool') + assert np.sum(output_df['highly_variable']) == n_top_genes + + # check intersection flag + nbatches = len(np.unique(adata.obs['batch'])) + assert output_df['highly_variable_intersection'].values.dtype is np.dtype('bool') + assert np.sum(output_df['highly_variable_intersection']) <= n_top_genes * nbatches + assert np.all(output_df['highly_variable'][output_df.highly_variable_intersection]) + + # check ranks (with batch_key these are the median of within-batch ranks) + assert output_df['highly_variable_rank'].values.dtype is np.dtype('float32') + assert np.nanmax(output_df['highly_variable_rank'].values) <= n_top_genes - 1 + + # check nbatches + assert output_df['highly_variable_nbatches'].values.dtype is np.dtype('int') + assert np.min(output_df['highly_variable_nbatches'].values) >= 0 + assert np.max(output_df['highly_variable_nbatches'].values) <= nbatches + + # check subsetting + if subset: + assert len(output_df) == n_top_genes + else: + assert len(output_df) == n_genes + + def test_higly_variable_genes_compare_to_seurat(): seurat_hvg_info = pd.read_csv(FILE, sep=' ') diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index 0f5dbb102d..b840702fec 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -3,11 +3,17 @@ from anndata import AnnData from scipy.sparse import csr_matrix from scipy import sparse +import warnings import scanpy as sc -from scanpy.tests.helpers import 
check_rep_mutation, check_rep_results +from scanpy.tests.helpers import ( + check_rep_mutation, + check_rep_results, + _prepare_pbmc_testdata, +) from anndata.tests.helpers import assert_equal, asarray + X_total = [[1, 0], [3, 0], [5, 6]] X_frac = [[1, 0, 1], [3, 0, 1], [5, 6, 1]] @@ -56,3 +62,288 @@ def test_normalize_total_view(typ, dtype): assert not v.is_view assert_equal(adata, v) + + +@pytest.mark.parametrize( + 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__ +) +@pytest.mark.parametrize('dtype', ['float32', 'int64']) +def test_normalize_pearson_residuals_inputchecks(sparsity_func, dtype): + + adata = _prepare_pbmc_testdata(sparsity_func, dtype) + + # depending on check_values, warnings should be raised for non-integer data + if dtype == 'float32': + + adata_noninteger = adata.copy() + x, y = np.nonzero(adata_noninteger.X) + adata_noninteger.X[x[0], y[0]] = 0.5 + nonint_warn_msg = "`normalize_pearson_residuals()` expects raw count data, but non-integers were found." 
+ + # expecting 0 no-int warnings + with warnings.catch_warnings(record=True) as record: + sc.pp.normalize_pearson_residuals( + adata_noninteger.copy(), check_values=False + ) + nonint_warnings = [ + warning.message.args[0] == nonint_warn_msg for warning in record + ] + assert np.sum(nonint_warnings) == 0 + + # expecting 1 no-int warning + with warnings.catch_warnings(record=True) as record: + sc.pp.normalize_pearson_residuals( + adata_noninteger.copy(), check_values=True + ) + nonint_warnings = np.array( + [warning.message.args[0] == nonint_warn_msg for warning in record] + ) + assert np.sum(nonint_warnings) == 1 + + # errors should be raised for invalid theta values + with pytest.raises(ValueError) as record: + sc.pp.normalize_pearson_residuals(adata.copy(), theta=0) + with pytest.raises(ValueError) as record: + sc.pp.normalize_pearson_residuals(adata.copy(), theta=-1) + + # error should be raised for invalid clipping values + with pytest.raises(ValueError) as record: + sc.pp.normalize_pearson_residuals(adata.copy(), clip=-1) + + +@pytest.mark.parametrize( + 'sparsity_func', [np.array, csr_matrix], ids=lambda x: x.__name__ +) +@pytest.mark.parametrize('dtype', ['float32', 'int64']) +@pytest.mark.parametrize('theta', [0.01, 1, 100, np.Inf]) +@pytest.mark.parametrize('clip', [None, 1, np.Inf]) +@pytest.mark.parametrize('inplace', [True, False]) +def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip, inplace): + + # toy data + X = np.array([[3, 6], [2, 4], [1, 0]]) + ns = np.sum(X, axis=1) + ps = np.sum(X, axis=0) / np.sum(X) + mu = np.outer(ns, ps) + + # compute reference residuals + if np.isinf(theta): + # Poisson case + residuals_reference = (X - mu) / np.sqrt(mu) + else: + # NB case + residuals_reference = (X - mu) / np.sqrt(mu + mu ** 2 / theta) + + # compute output to test + adata = AnnData(sparsity_func(X), dtype=dtype) + output = sc.pp.normalize_pearson_residuals( + adata, theta=theta, clip=clip, inplace=inplace + ) + + # handle and 
test inplace argument + if inplace: + output_X = adata.X + assert output is None + # check for correct new `adata.uns` keys + assert np.all( + np.isin(['pearson_residuals_normalization'], list(adata.uns.keys())) + ) + assert np.all( + np.isin( + ['theta', 'clip', 'computed_on'], + list(adata.uns['pearson_residuals_normalization'].keys()), + ) + ) + + else: + output_X = output['X'] + + if clip is None: + # default clipping: compare to sqrt(n) threshold + clipping_threshold = np.sqrt(adata.shape[0]).astype(np.float32) + assert np.max(output_X) <= clipping_threshold + assert np.min(output_X) >= -clipping_threshold + elif np.isinf(clip): + # no clipping: compare to raw residuals + assert np.allclose(output_X, residuals_reference) + else: + # custom clipping: compare to custom threshold + assert np.max(output_X) <= clip + assert np.min(output_X) >= -clip + + +@pytest.mark.parametrize( + 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__ +) +@pytest.mark.parametrize('dtype', ['float32', 'int64']) +def test_normalize_pearson_residuals_pca(sparsity_func, dtype): + + adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True) + n_cells = adata.shape[0] + n_genes = adata.shape[1] + n_hvgs = 100 + n_comps_pca = 50 + adata_with_hvgs = adata.copy() + sc.pp.highly_variable_genes( + adata_with_hvgs, flavor='pearson_residuals', n_top_genes=n_hvgs + ) + adata_not_using_hvgs = adata_with_hvgs.copy() + + ### inplace = False ### + # outputs the (potentially hvg-restricted) adata_pca object + # PCA on all genes + adata_pca = sc.pp.normalize_pearson_residuals_pca( + adata.copy(), inplace=False, n_comps_pca=n_comps_pca + ) + # PCA on hvgs only + adata_pca_with_hvgs = sc.pp.normalize_pearson_residuals_pca( + adata_with_hvgs.copy(), inplace=False, n_comps_pca=n_comps_pca + ) + # PCA again on all genes (hvg use supressed) + adata_pca_not_using_hvgs = sc.pp.normalize_pearson_residuals_pca( + adata_not_using_hvgs.copy(), + inplace=False, + 
n_comps_pca=n_comps_pca, + use_highly_variable=False, + ) + + # for both cases, check adata_pca keys are complete + for ad in [adata_pca, adata_pca_with_hvgs, adata_pca_not_using_hvgs]: + assert np.all( + np.isin( + ['pearson_residuals_normalization', 'pearson_residuals_pca'], + list(ad.uns.keys()), + ) + ) + assert np.all(np.isin(['X_pearson_residuals_pca'], list(ad.obsm.keys()))) + assert np.all(np.isin(['PCs'], list(ad.varm.keys()))) + assert ad.obsm['X_pearson_residuals_pca'].shape == (n_cells, n_comps_pca) + + # check adata shape to see if all genes or only HVGs are in the returned adata + assert adata_pca.shape == (n_cells, n_genes) + assert adata_pca_with_hvgs.shape == (n_cells, n_hvgs) # only HVGs retained + assert adata_pca_not_using_hvgs.shape == (n_cells, n_genes) + + # check PC shapes to see whether or not HVGs were used for PCA + assert adata_pca.varm['PCs'].shape == (n_genes, n_comps_pca) + assert adata_pca_with_hvgs.varm['PCs'].shape == ( + n_hvgs, + n_comps_pca, + ) # only HVGs used + assert adata_pca_not_using_hvgs.varm['PCs'].shape == (n_genes, n_comps_pca) + + ### inplace = True ### + # modifies the input adata object + # PCA on all genes + sc.pp.normalize_pearson_residuals_pca(adata, inplace=True, n_comps_pca=n_comps_pca) + # PCA on hvgs only + sc.pp.normalize_pearson_residuals_pca( + adata_with_hvgs, inplace=True, n_comps_pca=n_comps_pca + ) + # PCA again on all genes (hvg use supressed) + sc.pp.normalize_pearson_residuals_pca( + adata_not_using_hvgs, + inplace=True, + n_comps_pca=n_comps_pca, + use_highly_variable=False, + ) + + for ad in [adata, adata_with_hvgs, adata_not_using_hvgs]: + # check adata_pca keys are complete + assert np.all( + np.isin( + ['pearson_residuals_normalization', 'pearson_residuals_pca'], + list(ad.uns.keys()), + ) + ) + assert np.all(np.isin(['X_pearson_residuals_pca'], list(ad.obsm.keys()))) + # check shapes: adata should always retains original shape + assert ad.shape == (n_cells, n_genes) + assert 
ad.obsm['X_pearson_residuals_pca'].shape == (n_cells, n_comps_pca) + + # check PC shapes to see whether or not HVGs were used for PCA + assert adata.uns['pearson_residuals_pca']['PCs'].shape == (n_genes, n_comps_pca) + assert adata_with_hvgs.uns['pearson_residuals_pca']['PCs'].shape == ( + n_hvgs, + n_comps_pca, + ) + assert adata_not_using_hvgs.uns['pearson_residuals_pca']['PCs'].shape == ( + n_genes, + n_comps_pca, + ) + + +@pytest.mark.parametrize( + 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__ +) +@pytest.mark.parametrize('dtype', ['float32', 'int64']) +def test_normalize_pearson_residuals_recipe(sparsity_func, dtype): + adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True) + n_cells = adata.shape[0] + n_genes = adata.shape[1] + n_hvgs = 100 + n_comps_pca = 50 + adata_with_hvgs = adata.copy() + sc.pp.highly_variable_genes( + adata_with_hvgs, flavor='pearson_residuals', n_top_genes=n_hvgs + ) + adata_not_using_hvgs = adata_with_hvgs.copy() + + ### inplace = False ### + # outputs the (potentially hvg-restricted) adata_pca object + # PCA on all genes + adata_pca, hvg = sc.pp.recipe_pearson_residuals( + adata.copy(), inplace=False, n_comps_pca=n_comps_pca, n_top_genes=n_hvgs + ) + + # for both cases, check adata_pca keys are complete + assert np.all( + np.isin( + ['pearson_residuals_normalization', 'pearson_residuals_pca'], + list(adata_pca.uns.keys()), + ) + ) + assert np.all(np.isin(['X_pearson_residuals_pca'], list(adata_pca.obsm.keys()))) + assert np.all(np.isin(['PCs'], list(adata_pca.varm.keys()))) + assert adata_pca.obsm['X_pearson_residuals_pca'].shape == (n_cells, n_comps_pca) + + # check adata shape + assert adata_pca.shape == (n_cells, n_hvgs) + # check PC shapes to check that HVGs were used for PCA + assert adata_pca.varm['PCs'].shape == (n_hvgs, n_comps_pca) + + # check hvg df + assert np.all( + np.isin( + [ + 'means', + 'variances', + 'residual_variances', + 'highly_variable_rank', + 'highly_variable', + ], + 
list(hvg.columns), + ) + ) + assert np.sum(hvg['highly_variable']) == n_hvgs + assert hvg.shape[0] == n_genes + + ### inplace = True ### + # modifies the input adata object + # PCA on all genes + sc.pp.recipe_pearson_residuals( + adata, inplace=True, n_comps_pca=n_comps_pca, n_top_genes=n_hvgs + ) + + assert np.all( + np.isin( + ['pearson_residuals_normalization', 'pearson_residuals_pca'], + list(adata.uns.keys()), + ) + ) + assert np.all(np.isin(['X_pearson_residuals_pca'], list(adata.obsm.keys()))) + assert adata.shape == (n_cells, n_genes) + assert adata.obsm['X_pearson_residuals_pca'].shape == (n_cells, n_comps_pca) + + # check PC shapes to see whether or not HVGs were used for PCA + assert adata.uns['pearson_residuals_pca']['PCs'].shape == (n_hvgs, n_comps_pca) From e76cf7b6bbc3b38c5bacf666a49ccaa3832075e8 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 28 Jun 2021 11:58:23 +0200 Subject: [PATCH 21/96] fix precommit high_var_genes --- scanpy/preprocessing/_highly_variable_genes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index fe9277f3b7..3231076780 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -222,7 +222,7 @@ def _highly_variable_pearson_residuals( computed_on = layer if layer else 'adata.X' # Check for raw counts - if check_values and (check_nonnegative_integers(X) == False): + if check_values and (check_nonnegative_integers(X) is False): warnings.warn( "`flavor='pearson_residuals'` expects raw count data, but non-integers were found.", UserWarning, From bdb7ce23bed8e7016f9f4caa928ea8e3c2b78152 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 28 Jun 2021 12:10:13 +0200 Subject: [PATCH 22/96] try to get precommit to work --- scanpy/preprocessing/_highly_variable_genes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py 
b/scanpy/preprocessing/_highly_variable_genes.py index 3231076780..6c262a3087 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -586,7 +586,6 @@ def highly_variable_genes( where n is the number of cells in the dataset (default behavior). * If any scalar c, residuals are clipped to the interval [-c, c]. Set `clip=np.Inf` for no clipping. - chunksize If `flavor='pearson_residuals'`, this dertermines how many genes are processed at once while computing the residual variance. Choosing a smaller value will reduce From 6cea0404901b5cd3c729cb87d50a8c3f2bce55b0 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 28 Jun 2021 12:12:44 +0200 Subject: [PATCH 23/96] try to get precommit to work --- .../preprocessing/_highly_variable_genes.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index ee26c7c5e6..bd32c2e653 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -546,23 +546,23 @@ def highly_variable_genes( layer If provided, use `adata.layers[layer]` for expression values instead of `adata.X`. n_top_genes - Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or + Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or `flavor='pearson_residuals'`. min_mean If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the - normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or + normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or `flavor='pearson_residuals'`. max_mean If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the - normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or + normalized dispersions are ignored. 
Ignored if `flavor='seurat_v3'` or `flavor='pearson_residuals'`. min_disp If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the - normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or + normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or `flavor='pearson_residuals'`. max_disp If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the - normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or + normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or `flavor='pearson_residuals'`. span The fraction of the data (cells) used when estimating the variance in the loess @@ -571,22 +571,22 @@ def highly_variable_genes( Number of bins for binning the mean gene expression. Normalization is done with respect to each bin. If just a single gene falls into a bin, the normalized dispersion is artificially set to 1. You'll be informed - about this if you set `settings.verbosity = 4`. Ignored if + about this if you set `settings.verbosity = 4`. Ignored if `flavor='pearson_residuals'`. theta If `flavor='pearson_residuals'`, this is the NB overdispersion parameter theta. - Higher values correspond to less overdispersion (var = mean + mean^2/theta), and + Higher values correspond to less overdispersion (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a Poisson model. clip If `flavor='pearson_residuals'`, this determines if and how residuals are clipped: - + * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). * If any scalar c, residuals are clipped to the interval [-c, c]. Set `clip=np.Inf` for no clipping. chunksize If `flavor='pearson_residuals'`, this dertermines how many genes are processed at - once while computing the residual variance. Choosing a smaller value will reduce + once while computing the residual variance. 
Choosing a smaller value will reduce the required memory. flavor Choose the flavor for identifying highly variable genes. For the dispersion @@ -603,7 +603,7 @@ def highly_variable_genes( lightweight batch correction method. For all flavors, genes are first sorted by how many batches they are a HVG. For dispersion-based flavors ties are broken by normalized dispersion. If `flavor = 'seurat_v3'`, ties are broken by the median - (across batches) rank based on within-batch normalized variance. If + (across batches) rank based on within-batch normalized variance. If `flavor='pearson_residuals'`, ties are broken by the median rank (across batches) based on within-batch residual variance. check_values From d7e63f78eb79815e2536653577f097225e03eb76 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 28 Jun 2021 13:16:23 +0200 Subject: [PATCH 24/96] fix recipes --- scanpy/preprocessing/_recipes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py index 6d61c24bec..2e49415874 100644 --- a/scanpy/preprocessing/_recipes.py +++ b/scanpy/preprocessing/_recipes.py @@ -1,5 +1,5 @@ """Preprocessing recipes from the literature""" -from typing import Optional, Union, Literal, Tuple +from typing import Optional, Tuple from anndata import AnnData From 0b5a02b086d59fa3b7aeb1512f6911e8aef3bdea Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 28 Jun 2021 13:21:03 +0200 Subject: [PATCH 25/96] fix normalization --- scanpy/preprocessing/_normalization.py | 26 ++++++++++++-------------- scanpy/tests/test_normalization.py | 1 - 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index 2f1bd66c4b..de105ee29f 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -7,14 +7,13 @@ from scipy.sparse import issparse from sklearn.utils import sparsefuncs -from .. 
import logging as logg -from .._compat import Literal +from scanpy import logging as logg +from scanpy._compat import Literal -from .._utils import view_to_actual, check_nonnegative_integers +from scanpy.preprocessing._utils import view_to_actual, check_nonnegative_integers from scanpy.get import _get_obs_rep, _set_obs_rep from ._pca import pca -from scanpy.get import _get_obs_rep, _set_obs_rep def _normalize_data(X, counts, after=None, copy=False): @@ -48,7 +47,7 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False): if clip < 0: raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.") - if check_values and (check_nonnegative_integers(X) == False): + if check_values and (check_nonnegative_integers(X) is False): warn( "`normalize_pearson_residuals()` expects raw count data, but non-integers were found.", UserWarning, @@ -101,13 +100,13 @@ def normalize_pearson_residuals( corresponds to a Poisson model. clip Determines if and how residuals are clipped: - + * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). * If any scalar c, residuals are clipped to the interval [-c, c]. Set `clip=np.Inf` for no clipping. - + layer Layer to normalize instead of `X`. If `None`, `X` is normalized. copy @@ -167,7 +166,6 @@ def normalize_pearson_residuals_pca( check_values: bool = True, inplace: bool = True, ) -> Optional[pd.DataFrame]: - """\ Applies PCA based on Pearson residual normalization. Operates on the subset of highly variable genes in `adata.var['highly_variable']` by @@ -193,13 +191,13 @@ def normalize_pearson_residuals_pca( Poisson model. clip This determines if and how Pearson residuals are clipped: - + * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). * If any scalar c, residuals are clipped to the interval [-c, c]. Set `clip=np.Inf` for no clipping. 
- + n_comps_pca Number of principal components to compute. random_state_pca @@ -217,14 +215,14 @@ def normalize_pearson_residuals_pca( If `inplace=False`, returns the Pearson residual-based PCA results (`adata_pca`). If `inplace=True`, updates `adata` with the following fields: - + `.uns['pearson_residuals_normalization']['pearson_residuals_df']` The hvg-subset, normalized by Pearson residuals `.uns['pearson_residuals_normalization']['theta']` The used value of the overdisperion parameter theta `.uns['pearson_residuals_normalization']['clip']` The used value of the clipping parameter - + `.obsm['X_pearson_residuals_pca']` PCA representation of data after gene selection and Pearson residual normalization. @@ -234,8 +232,8 @@ def normalize_pearson_residuals_pca( Ratio of explained variance. `.uns['pearson_residuals_pca']['variance']` Explained variance, equivalent to the eigenvalues of the - covariance matrix. - + covariance matrix. + """ if use_highly_variable and 'highly_variable' in adata.var_keys(): diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index b840702fec..faad18fe7d 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -287,7 +287,6 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype): sc.pp.highly_variable_genes( adata_with_hvgs, flavor='pearson_residuals', n_top_genes=n_hvgs ) - adata_not_using_hvgs = adata_with_hvgs.copy() ### inplace = False ### # outputs the (potentially hvg-restricted) adata_pca object From 6779d23acbc0c66ec0266d791a4c307c204d35f7 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 28 Jun 2021 13:23:28 +0200 Subject: [PATCH 26/96] remove relative imports --- scanpy/preprocessing/_normalization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index de105ee29f..633ef15337 100644 --- a/scanpy/preprocessing/_normalization.py +++ 
b/scanpy/preprocessing/_normalization.py @@ -10,10 +10,10 @@ from scanpy import logging as logg from scanpy._compat import Literal -from scanpy.preprocessing._utils import view_to_actual, check_nonnegative_integers +from scanpy._utils import view_to_actual, check_nonnegative_integers from scanpy.get import _get_obs_rep, _set_obs_rep -from ._pca import pca +from scanpy.preprocessing._pca import pca def _normalize_data(X, counts, after=None, copy=False): From 237e7cd5243dc3bde629b6fd5f64d20c3a61d3e9 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 28 Jun 2021 13:48:56 +0200 Subject: [PATCH 27/96] fix docstrings --- scanpy/preprocessing/_highly_variable_genes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index bd32c2e653..9eedc11310 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -584,6 +584,7 @@ def highly_variable_genes( where n is the number of cells in the dataset (default behavior). * If any scalar c, residuals are clipped to the interval [-c, c]. Set `clip=np.Inf` for no clipping. + chunksize If `flavor='pearson_residuals'`, this dertermines how many genes are processed at once while computing the residual variance. Choosing a smaller value will reduce From d75aa36b3252a63abb814dd04f0c34acfbec2a99 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 28 Jun 2021 13:59:58 +0200 Subject: [PATCH 28/96] retry to build docs --- scanpy/preprocessing/_highly_variable_genes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 9eedc11310..8f39a60990 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -10,7 +10,7 @@ from .. 
import logging as logg from .._settings import settings, Verbosity from .._utils import sanitize_anndata, check_nonnegative_integers, view_to_actual -from scanpy.get import _get_obs_rep, _set_obs_rep +from scanpy.get import _get_obs_rep from .._compat import Literal from ._utils import _get_mean_var from ._distributed import materialize_as_ndarray From 293b47d1144b9ff2a601ef88700810e62aa3e029 Mon Sep 17 00:00:00 2001 From: giovp Date: Tue, 29 Jun 2021 10:06:53 +0200 Subject: [PATCH 29/96] fix highvar docstring --- scanpy/preprocessing/_highly_variable_genes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 8f39a60990..56288627f8 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -580,10 +580,10 @@ def highly_variable_genes( clip If `flavor='pearson_residuals'`, this determines if and how residuals are clipped: - * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], - where n is the number of cells in the dataset (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. Set - `clip=np.Inf` for no clipping. + * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], + where n is the number of cells in the dataset (default behavior). + * If any scalar c, residuals are clipped to the interval [-c, c]. Set + `clip=np.Inf` for no clipping. 
chunksize If `flavor='pearson_residuals'`, this dertermines how many genes are processed at From a61496b47849e1ef657f6e4319ddc22837ac3db1 Mon Sep 17 00:00:00 2001 From: giovp Date: Tue, 29 Jun 2021 11:26:34 +0200 Subject: [PATCH 30/96] more fixing docstrings --- docs/api.rst | 3 ++ .../preprocessing/_highly_variable_genes.py | 10 +++---- scanpy/preprocessing/_normalization.py | 29 ++++++++----------- scanpy/preprocessing/_recipes.py | 15 +++++----- 4 files changed, 27 insertions(+), 30 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 3656bc24bf..2bc9283a75 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -39,6 +39,8 @@ For visual quality control, see :func:`~scanpy.pl.highest_expr_genes` and pp.log1p pp.pca pp.normalize_total + pp.normalize_pearson_residuals + pp.normalize_pearson_residuals_pca pp.regress_out pp.scale pp.subsample @@ -53,6 +55,7 @@ Recipes pp.recipe_zheng17 pp.recipe_weinreb17 pp.recipe_seurat + pp.recipe_pearson_residuals Batch effect correction ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 56288627f8..aec0fe3275 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -513,14 +513,14 @@ def highly_variable_genes( check_values: bool = True, ) -> Optional[pd.DataFrame]: """\ - Annotate highly variable genes [Satija15]_ [Zheng17]_ [Stuart19]_. + Annotate highly variable genes [Satija15]_ [Zheng17]_ [Stuart19]_ [Lause20]_. Expects logarithmized data, except when `flavor='seurat_v3'` or `flavor='pearson_residuals'`, in which count data is expected. Depending on `flavor`, this reproduces the R-implementations of Seurat [Satija15]_, Cell Ranger [Zheng17]_, and Seurat v3 [Stuart19]_, or uses - analytical Peason residuals [Lause20]_. + analytical Pearson residuals [Lause20]_. 
For the dispersion-based methods ([Satija15]_ and [Zheng17]_), the normalized dispersion is obtained by scaling with the mean and standard deviation of @@ -578,11 +578,11 @@ def highly_variable_genes( Higher values correspond to less overdispersion (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a Poisson model. clip - If `flavor='pearson_residuals'`, this determines if and how residuals are clipped: + If `flavor='pearson_residuals'`, this determines how residuals are clipped: - * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], + * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \ where n is the number of cells in the dataset (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. Set + * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ `clip=np.Inf` for no clipping. chunksize diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index 633ef15337..d5ea245663 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -82,12 +82,11 @@ def normalize_pearson_residuals( inplace: bool = True, ) -> Optional[Dict[str, np.ndarray]]: """\ - Computes analytic Pearson residuals, assuming a negative binomial offset - model with overdispersion theta shared across genes. By default, residuals - are clipped to sqrt(n) and overdispersion theta=100 is used. + Computes analytic Pearson residuals, based on [Lause20]_. - Based on "Analytic Pearson residuals for normalization of single-cell - RNA-seq UMI data", bioRxiv, [Lause20]_. + Assuming a negative binomial offset model with overdispersion + theta shared across genes, computes Pearson residuals. By default, residuals + are clipped to sqrt(n) and overdispersion theta=100 is used. 
Params ------ @@ -167,12 +166,9 @@ def normalize_pearson_residuals_pca( inplace: bool = True, ) -> Optional[pd.DataFrame]: """\ - Applies PCA based on Pearson residual normalization. Operates on the - subset of highly variable genes in `adata.var['highly_variable']` by - default. + Applies Pearson residual normalization and PCA, based on [Lause20]_. - This workflow is based on "Analytic Pearson residuals for normalization of - single-cell RNA-seq UMI data", bioRxiv, [Lause20]_. + Operates on the subset of highly variable genes in `adata.var['highly_variable']` by default. Parameters @@ -190,20 +186,19 @@ def normalize_pearson_residuals_pca( (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a Poisson model. clip - This determines if and how Pearson residuals are clipped: + This determines how Pearson residuals are clipped: - * If `None`, residuals are clipped to the interval - [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset - (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. Set - `clip=np.Inf` for no clipping. + * If `None`, residuals are clipped to the interval \ + [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). + * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ + `clip=np.Inf` for no clipping. n_comps_pca Number of principal components to compute. random_state_pca Change to use different initial states for the optimization. kwargs_pca - Dictionary of further keyword arguments passed on to `sc.pp.pca()`. + Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`. check_values Check if counts in selected layer are integers. A Warning is returned if set to True. 
inplace diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py index 2e49415874..fd28bea576 100644 --- a/scanpy/preprocessing/_recipes.py +++ b/scanpy/preprocessing/_recipes.py @@ -186,12 +186,11 @@ def recipe_pearson_residuals( inplace: bool = True, ) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]: """\ + Gene selection and normalization based on [Lause20]_. + Applies gene selection based on Pearson residuals. On the resulting subset, Pearson residual normalization and PCA are performed. - This recipe is based on "Analytic Pearson residuals for normalization of - single-cell RNA-seq UMI data", bioRxiv, [Lause20]_. - Parameters ---------- @@ -213,11 +212,11 @@ def recipe_pearson_residuals( clip This determines if and how Pearson residuals are clipped: - * If `None`, residuals are clipped to the interval - [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset - (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. Set - `clip=np.Inf` for no clipping. + * If `None`, residuals are clipped to the interval \ + [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). + * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ + `clip=np.Inf` for no clipping. + batch_key If specified, highly-variable genes are selected within each batch separately and merged. This simple process avoids the selection of From 7afb94f4ae05a8b35f415de6055ec07b0a9d7a6d Mon Sep 17 00:00:00 2001 From: giovp Date: Tue, 29 Jun 2021 11:40:24 +0200 Subject: [PATCH 31/96] docs build locally ? 
:hammer: --- scanpy/preprocessing/_normalization.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index d5ea245663..11dc9b8860 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -100,11 +100,10 @@ def normalize_pearson_residuals( clip Determines if and how residuals are clipped: - * If `None`, residuals are clipped to the interval - [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset - (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. Set - `clip=np.Inf` for no clipping. + * If `None`, residuals are clipped to the interval \ + [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). + * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ + `clip=np.Inf` for no clipping. layer Layer to normalize instead of `X`. If `None`, `X` is normalized. 
From e3e50457c451cab673f9328387e54add9648a9ae Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 5 Jul 2021 16:03:02 +0200 Subject: [PATCH 32/96] minor cleanup test normalization --- .../preprocessing/_highly_variable_genes.py | 2 +- scanpy/preprocessing/_normalization.py | 4 ++-- scanpy/tests/test_normalization.py | 22 ++++++------------- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index aec0fe3275..4bd00a98e9 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -1,5 +1,5 @@ import warnings -from typing import Optional, Union +from typing import Optional import numpy as np import pandas as pd diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index 11dc9b8860..adc8820b2e 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -47,7 +47,7 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False): if clip < 0: raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.") - if check_values and (check_nonnegative_integers(X) is False): + if check_values and not check_nonnegative_integers(X): warn( "`normalize_pearson_residuals()` expects raw count data, but non-integers were found.", UserWarning, @@ -133,7 +133,7 @@ def normalize_pearson_residuals( X = _get_obs_rep(adata, layer=layer) computed_on = layer if layer else 'adata.X' - msg = 'computing analytic Pearson residuals on %s' % computed_on + msg = f'computing analytic Pearson residuals on {computed_on}' start = logg.info(msg) residuals = _pearson_residuals(X, theta, clip, check_values, copy=~inplace) diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index faad18fe7d..7056443ce9 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -78,27 +78,19 @@ def 
test_normalize_pearson_residuals_inputchecks(sparsity_func, dtype): adata_noninteger = adata.copy() x, y = np.nonzero(adata_noninteger.X) adata_noninteger.X[x[0], y[0]] = 0.5 - nonint_warn_msg = "`normalize_pearson_residuals()` expects raw count data, but non-integers were found." - # expecting 0 no-int warnings - with warnings.catch_warnings(record=True) as record: + with pytest.warns(UserWarning) as record: sc.pp.normalize_pearson_residuals( - adata_noninteger.copy(), check_values=False + adata_noninteger.copy(), check_values=True ) - nonint_warnings = [ - warning.message.args[0] == nonint_warn_msg for warning in record - ] - assert np.sum(nonint_warnings) == 0 + assert len(record) == 1 + assert "expects raw count data" in record[0].message.args[0] - # expecting 1 no-int warning - with warnings.catch_warnings(record=True) as record: + with pytest.warns(None) as record: sc.pp.normalize_pearson_residuals( - adata_noninteger.copy(), check_values=True + adata_noninteger.copy(), check_values=False ) - nonint_warnings = np.array( - [warning.message.args[0] == nonint_warn_msg for warning in record] - ) - assert np.sum(nonint_warnings) == 1 + assert len(record) == 0 # errors should be raised for invalid theta values with pytest.raises(ValueError) as record: From e368b57eb1eadaa018ee1f3e94996a22bea5eb12 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 5 Jul 2021 16:21:57 +0200 Subject: [PATCH 33/96] more minor cleanups --- scanpy/tests/test_normalization.py | 32 ++++++++++++------------------ 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index 7056443ce9..c84fd40a60 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -109,8 +109,7 @@ def test_normalize_pearson_residuals_inputchecks(sparsity_func, dtype): @pytest.mark.parametrize('dtype', ['float32', 'int64']) @pytest.mark.parametrize('theta', [0.01, 1, 100, np.Inf]) 
@pytest.mark.parametrize('clip', [None, 1, np.Inf]) -@pytest.mark.parametrize('inplace', [True, False]) -def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip, inplace): +def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip): # toy data X = np.array([[3, 6], [2, 4], [1, 0]]) @@ -129,26 +128,21 @@ def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip, i # compute output to test adata = AnnData(sparsity_func(X), dtype=dtype) output = sc.pp.normalize_pearson_residuals( - adata, theta=theta, clip=clip, inplace=inplace + adata, theta=theta, clip=clip, inplace=False ) + output_X = output['X'] + sc.pp.normalize_pearson_residuals(adata, theta=theta, clip=clip, inplace=True) - # handle and test inplace argument - if inplace: - output_X = adata.X - assert output is None - # check for correct new `adata.uns` keys - assert np.all( - np.isin(['pearson_residuals_normalization'], list(adata.uns.keys())) - ) - assert np.all( - np.isin( - ['theta', 'clip', 'computed_on'], - list(adata.uns['pearson_residuals_normalization'].keys()), - ) + # check for correct new `adata.uns` keys + assert np.all(np.isin(['pearson_residuals_normalization'], list(adata.uns.keys()))) + assert np.all( + np.isin( + ['theta', 'clip', 'computed_on'], + list(adata.uns['pearson_residuals_normalization'].keys()), ) - - else: - output_X = output['X'] + ) + # test against inplace + np.testing.assert_array_equal(adata.X, output_X) if clip is None: # default clipping: compare to sqrt(n) threshold From bfbd4840e7e008967e1fd8e0ec8de3297f7beaaf Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 5 Jul 2021 16:43:36 +0200 Subject: [PATCH 34/96] final cleanup normalization --- scanpy/tests/test_normalization.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index c84fd40a60..eb9bbc8533 100644 --- 
a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -162,13 +162,13 @@ def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip): 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__ ) @pytest.mark.parametrize('dtype', ['float32', 'int64']) -def test_normalize_pearson_residuals_pca(sparsity_func, dtype): +@pytest.mark.parametrize('n_hvgs', [100, 200]) +@pytest.mark.parametrize('n_comps_pca', [30, 50]) +def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_pca): adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True) - n_cells = adata.shape[0] - n_genes = adata.shape[1] - n_hvgs = 100 - n_comps_pca = 50 + n_cells, n_genes = adata.shape + adata_with_hvgs = adata.copy() sc.pp.highly_variable_genes( adata_with_hvgs, flavor='pearson_residuals', n_top_genes=n_hvgs @@ -258,17 +258,27 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype): n_comps_pca, ) + # test for inplace/outplace + for ad_inplace, ad_outplace in zip( + [adata_pca, adata_pca_with_hvgs, adata_pca_not_using_hvgs], + [adata, adata_with_hvgs, adata_not_using_hvgs], + ): + np.testing.assert_array_equal( + ad_inplace.obsm['X_pearson_residuals_pca'], + ad_outplace.obsm['X_pearson_residuals_pca'], + ) + @pytest.mark.parametrize( 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__ ) @pytest.mark.parametrize('dtype', ['float32', 'int64']) -def test_normalize_pearson_residuals_recipe(sparsity_func, dtype): +@pytest.mark.parametrize('n_hvgs', [100, 200]) +@pytest.mark.parametrize('n_comps_pca', [30, 50]) +def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comps_pca): adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True) - n_cells = adata.shape[0] - n_genes = adata.shape[1] - n_hvgs = 100 - n_comps_pca = 50 + n_cells, n_genes = adata.shape + adata_with_hvgs = adata.copy() sc.pp.highly_variable_genes( adata_with_hvgs, 
flavor='pearson_residuals', n_top_genes=n_hvgs From a55e677d445323284ebae9ca1923d16516a95fa2 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 5 Jul 2021 18:19:03 +0200 Subject: [PATCH 35/96] fixes high var --- .../preprocessing/_highly_variable_genes.py | 23 ++-- scanpy/tests/test_highly_variable_genes.py | 107 +++++++++--------- 2 files changed, 67 insertions(+), 63 deletions(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 4bd00a98e9..513270c318 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -335,11 +335,11 @@ def _highly_variable_pearson_residuals( na_position='last', inplace=True, ) - df['highly_variable'] = False - df.highly_variable.iloc[:n_top_genes] = True - # TODO: following line raises a pandas warning - # (also for flavor = seurat and cellranger..) - df = df.loc[adata.var_names] + + high_var = np.zeros(df.shape[0]) + high_var[:n_top_genes] = True + df['highly_variable'] = high_var.astype(bool) + df = df.loc[adata.var_names, :] if inplace: adata.uns['hvg'] = {'flavor': 'pearson_residuals', 'computed_on': computed_on} @@ -353,11 +353,11 @@ def _highly_variable_pearson_residuals( ' \'variances\', float vector (adata.var)\n' ' \'residual_variances\', float vector (adata.var)' ) - adata.var['highly_variable'] = df['highly_variable'].values - adata.var['highly_variable_rank'] = df['highly_variable_rank'].values adata.var['means'] = df['means'].values adata.var['variances'] = df['variances'].values adata.var['residual_variances'] = df['residual_variances'] + adata.var['highly_variable_rank'] = df['highly_variable_rank'].values + adata.var['highly_variable'] = df['highly_variable'].values if batch_key is not None: adata.var['highly_variable_nbatches'] = df[ @@ -770,11 +770,12 @@ def highly_variable_genes( na_position='last', inplace=True, ) - df['highly_variable'] = False - df.highly_variable.iloc[:n_top_genes] = True - df = 
df.loc[adata.var_names] + high_var = np.zeros(df.shape[0]) + high_var[:n_top_genes] = True + df['highly_variable'] = high_var.astype(bool) + df = df.loc[adata.var_names, :] else: - df = df.loc[adata.var_names] + df = df.loc[adata.var_names, :] dispersion_norm = df.dispersions_norm.values dispersion_norm[np.isnan(dispersion_norm)] = 0 # similar to Seurat gene_subset = np.logical_and.reduce( diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py index 359d648e96..aa5517eed5 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -77,33 +77,27 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp adata_noninteger = adata.copy() x, y = np.nonzero(adata_noninteger.X) adata_noninteger.X[x[0], y[0]] = 0.5 - nonint_warn_msg = "`flavor='pearson_residuals'` expects raw count data, but non-integers were found." # expecting 0 no-int warnings - with warnings.catch_warnings(record=True) as record: + with pytest.warns(None) as record: sc.pp.highly_variable_genes( adata_noninteger.copy(), flavor='pearson_residuals', n_top_genes=100, check_values=False, ) - nonint_warnings = [ - warning.message.args[0] == nonint_warn_msg for warning in record - ] - assert np.sum(nonint_warnings) == 0 + assert len(record) == 0 # expecting 1 no-int warning - with warnings.catch_warnings(record=True) as record: + with pytest.warns(None) as record: sc.pp.highly_variable_genes( adata_noninteger.copy(), flavor='pearson_residuals', n_top_genes=100, check_values=True, ) - nonint_warnings = np.array( - [warning.message.args[0] == nonint_warn_msg for warning in record] - ) - assert np.sum(nonint_warnings) == 1 + assert len(record) == 1 + assert "expects raw count data" in record[0].message.args[0] # errors should be raised for invalid theta values with pytest.raises(ValueError) as record: @@ -127,15 +121,15 @@ def 
test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp ) @pytest.mark.parametrize('dtype', ['float32', 'int64']) @pytest.mark.parametrize('subset', [True, False]) -@pytest.mark.parametrize('inplace', [True, False]) @pytest.mark.parametrize('clip', [None, np.Inf, 30]) @pytest.mark.parametrize('theta', [100, np.Inf]) +@pytest.mark.parametrize('n_top_genes', [100, 200]) def test_highly_variable_genes_pearson_residuals_values( - subset, inplace, sparsity_func, dtype, clip, theta + subset, sparsity_func, dtype, clip, theta, n_top_genes ): - - n_top_genes = 100 adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True) + # cleanup var + adata.var.drop(columns=adata.var.columns, inplace=True) # compute reference output residual_variances_reference = _residual_var_reference( adata.copy(), clip=clip, theta=theta @@ -145,23 +139,29 @@ def test_highly_variable_genes_pearson_residuals_values( top_n_idx = np.argsort(-residual_variances_reference)[:n_top_genes] # (results in sorted "gene order" in reference) residual_variances_reference = residual_variances_reference[top_n_idx] + # compute output to be tested - output = sc.pp.highly_variable_genes( + output_df = sc.pp.highly_variable_genes( adata, flavor='pearson_residuals', n_top_genes=n_top_genes, subset=subset, - inplace=inplace, + inplace=False, clip=clip, theta=theta, ) - # depending on inplace, check adata.var or output - if inplace: - assert output is None - output_df = adata.var - else: - output_df = output + sc.pp.highly_variable_genes( + adata, + flavor='pearson_residuals', + n_top_genes=n_top_genes, + subset=subset, + inplace=True, + clip=clip, + theta=theta, + ) + + pd.testing.assert_frame_equal(output_df, adata.var) # consistency with normalization method if subset: @@ -182,39 +182,39 @@ def test_highly_variable_genes_pearson_residuals_values( ) @pytest.mark.parametrize('dtype', ['float32', 'int64']) @pytest.mark.parametrize('subset', [True, False]) -@pytest.mark.parametrize('inplace', 
[True, False]) +@pytest.mark.parametrize('n_top_genes', [1000, 500]) def test_highly_variable_genes_pearson_residuals_general( - subset, - inplace, - sparsity_func, - dtype, + subset, sparsity_func, dtype, n_top_genes ): - n_top_genes = 1000 - adata = _prepare_pbmc_testdata(sparsity_func, dtype) + # cleanup var + adata.var.drop(columns=adata.var.columns, inplace=True) # compute reference output residual_variances_reference = _residual_var_reference(adata.copy()) if subset: - # lazyly sort by residual variance and take top N + # lazily sort by residual variance and take top N top_n_idx = np.argsort(-residual_variances_reference)[:n_top_genes] # (results in sorted "gene order" in reference) residual_variances_reference = residual_variances_reference[top_n_idx] # compute output to be tested - output = sc.pp.highly_variable_genes( + output_df = sc.pp.highly_variable_genes( adata, flavor='pearson_residuals', n_top_genes=n_top_genes, subset=subset, - inplace=inplace, + inplace=False, ) - # depending on inplace, check adata.var or output - if inplace: - assert output is None - output_df = adata.var - else: - output_df = output + sc.pp.highly_variable_genes( + adata, + flavor='pearson_residuals', + n_top_genes=n_top_genes, + subset=subset, + inplace=True, + ) + + pd.testing.assert_frame_equal(output_df, adata.var) # check output is complete for key in [ @@ -260,31 +260,34 @@ def test_highly_variable_genes_pearson_residuals_general( ) @pytest.mark.parametrize('dtype', ['float32', 'int64']) @pytest.mark.parametrize('subset', [True, False]) -@pytest.mark.parametrize('inplace', [True, False]) +@pytest.mark.parametrize('n_top_genes', [1000, 500]) def test_highly_variable_genes_pearson_residuals_batch( - subset, inplace, sparsity_func, dtype + subset, n_top_genes, sparsity_func, dtype ): - - n_top_genes = 1000 - adata = _prepare_pbmc_testdata(sparsity_func, dtype) + # cleanup var + adata.var.drop(columns=adata.var.columns, inplace=True) n_genes = adata.shape[1] - output = 
sc.pp.highly_variable_genes( + output_df = sc.pp.highly_variable_genes( adata, flavor='pearson_residuals', n_top_genes=n_top_genes, batch_key='batch', subset=subset, - inplace=inplace, + inplace=False, ) - # depending on inplace, check adata.var or output - if inplace: - assert output is None - output_df = adata.var - else: - output_df = output + sc.pp.highly_variable_genes( + adata, + flavor='pearson_residuals', + n_top_genes=n_top_genes, + batch_key='batch', + subset=subset, + inplace=True, + ) + + # pd.testing.assert_frame_equal(output_df, adata.var) # check output is complete for key in [ From 4f47c1118550f93df4123a319d43a105626da9d9 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 5 Jul 2021 18:19:45 +0200 Subject: [PATCH 36/96] init experimental module --- scanpy/experimental/__init__.py | 0 scanpy/experimental/pp/__init__.py | 0 scanpy/experimental/pp/_highly_variable_genes.py | 0 scanpy/experimental/pp/_normalization.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 scanpy/experimental/__init__.py create mode 100644 scanpy/experimental/pp/__init__.py create mode 100644 scanpy/experimental/pp/_highly_variable_genes.py create mode 100644 scanpy/experimental/pp/_normalization.py diff --git a/scanpy/experimental/__init__.py b/scanpy/experimental/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scanpy/experimental/pp/__init__.py b/scanpy/experimental/pp/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py new file mode 100644 index 0000000000..e69de29bb2 From c32eafc8eb00a8f891ce4711639cc7cc0feabbb5 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Wed, 14 Jul 2021 15:23:03 +0200 Subject: [PATCH 37/96] fix column ordering for batch case --- 
scanpy/preprocessing/_highly_variable_genes.py | 5 +++-- scanpy/tests/test_highly_variable_genes.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 513270c318..1575f5b083 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -357,8 +357,6 @@ def _highly_variable_pearson_residuals( adata.var['variances'] = df['variances'].values adata.var['residual_variances'] = df['residual_variances'] adata.var['highly_variable_rank'] = df['highly_variable_rank'].values - adata.var['highly_variable'] = df['highly_variable'].values - if batch_key is not None: adata.var['highly_variable_nbatches'] = df[ 'highly_variable_nbatches' @@ -366,8 +364,11 @@ def _highly_variable_pearson_residuals( adata.var['highly_variable_intersection'] = df[ 'highly_variable_intersection' ].values + adata.var['highly_variable'] = df['highly_variable'].values + if subset: adata._inplace_subset_var(df['highly_variable'].values) + else: if batch_key is None: df = df.drop( diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py index aa5517eed5..c80e0a442a 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -287,7 +287,7 @@ def test_highly_variable_genes_pearson_residuals_batch( inplace=True, ) - # pd.testing.assert_frame_equal(output_df, adata.var) + pd.testing.assert_frame_equal(output_df, adata.var) # check output is complete for key in [ From f6d42865f4ff7f405b3ecd4957d9d85478a487bf Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Wed, 14 Jul 2021 16:46:54 +0200 Subject: [PATCH 38/96] moving to experimental, minor fix for experimental version of hvg selection --- scanpy/__init__.py | 2 +- scanpy/experimental/__init__.py | 1 + scanpy/experimental/pp/__init__.py | 8 + .../experimental/pp/_highly_variable_genes.py | 406 
++++++++++++++++++ scanpy/experimental/pp/_normalization.py | 242 +++++++++++ scanpy/preprocessing/__init__.py | 14 +- .../preprocessing/_highly_variable_genes.py | 291 +------------ scanpy/preprocessing/_normalization.py | 228 ---------- scanpy/preprocessing/_recipes.py | 155 ------- 9 files changed, 676 insertions(+), 671 deletions(-) diff --git a/scanpy/__init__.py b/scanpy/__init__.py index 31b7c7dba9..12e48314a1 100644 --- a/scanpy/__init__.py +++ b/scanpy/__init__.py @@ -14,7 +14,7 @@ from . import tools as tl from . import preprocessing as pp from . import plotting as pl - from . import datasets, logging, queries, external, get, metrics + from . import datasets, logging, queries, external, get, metrics, experimental from anndata import AnnData, concat from anndata import ( diff --git a/scanpy/experimental/__init__.py b/scanpy/experimental/__init__.py index e69de29bb2..8a00c90df0 100644 --- a/scanpy/experimental/__init__.py +++ b/scanpy/experimental/__init__.py @@ -0,0 +1 @@ +from . 
import pp diff --git a/scanpy/experimental/pp/__init__.py b/scanpy/experimental/pp/__init__.py index e69de29bb2..7ecf999363 100644 --- a/scanpy/experimental/pp/__init__.py +++ b/scanpy/experimental/pp/__init__.py @@ -0,0 +1,8 @@ +from ._normalization import ( + normalize_pearson_residuals, + normalize_pearson_residuals_pca, +) + +from ._highly_variable_genes import highly_variable_genes + +from ._recipes import recipe_pearson_residuals diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py index e69de29bb2..af5f81ed74 100644 --- a/scanpy/experimental/pp/_highly_variable_genes.py +++ b/scanpy/experimental/pp/_highly_variable_genes.py @@ -0,0 +1,406 @@ +import warnings +from typing import Optional + +import numpy as np +import pandas as pd +import scipy.sparse as sp_sparse +from anndata import AnnData + + +from scanpy import logging as logg +from scanpy._settings import settings, Verbosity +from scanpy._utils import check_nonnegative_integers, view_to_actual +from scanpy.get import _get_obs_rep +from scanpy._compat import Literal +from scanpy.preprocessing._utils import _get_mean_var +from scanpy.preprocessing._distributed import materialize_as_ndarray +from scanpy.preprocessing._simple import filter_genes + + +def _highly_variable_pearson_residuals( + adata: AnnData, + layer: Optional[str] = None, + n_top_genes: int = 1000, + batch_key: Optional[str] = None, + theta: float = 100, + clip: Optional[float] = None, + chunksize: int = 100, + check_values: bool = True, + subset: bool = False, + inplace: bool = True, +) -> Optional[pd.DataFrame]: + """\ + See `highly_variable_genes`. + + Returns + ------- + Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) + or updates `.var` with the following fields: + + highly_variable : bool + boolean indicator of highly-variable genes. + means : float + means per gene. + variances : float + variances per gene. 
+ residual_variances : float + Pearson residual variance per gene. Averaged in the case of multiple + batches. + highly_variable_rank : float + Rank of the gene according to residual variance, median rank in the + case of multiple batches. NaN for non-HVGs. + highly_variable_nbatches : int + If batch_key is given, this denotes in how many batches genes are + detected as HVG. + highly_variable_intersection : bool + If batch_key is given, this denotes the genes that are highly variable + in all batches. + """ + + view_to_actual(adata) + X = _get_obs_rep(adata, layer=layer) + computed_on = layer if layer else 'adata.X' + + # Check for raw counts + if check_values and (check_nonnegative_integers(X) is False): + warnings.warn( + "`flavor='pearson_residuals'` expects raw count data, but non-integers were found.", + UserWarning, + ) + # check theta + if theta <= 0: + # TODO: would "underdispersion" with negative theta make sense? + # then only theta=0 were undefined.. + raise ValueError('Pearson residuals require theta > 0') + # prepare clipping + + if batch_key is None: + batch_info = np.zeros(adata.shape[0], dtype=int) + else: + batch_info = adata.obs[batch_key].values + n_batches = len(np.unique(batch_info)) + + # Get pearson residuals for each batch separately + residual_gene_vars = [] + for batch in np.unique(batch_info): + + adata_subset = adata[batch_info == batch] + + # Filter out zero genes + with settings.verbosity.override(Verbosity.error): + nonzero_genes = filter_genes(adata_subset, min_cells=1, inplace=False)[0] + adata_subset = adata_subset[:, nonzero_genes] + + if layer is not None: + X_batch = adata_subset.layers[layer] + else: + X_batch = adata_subset.X + + # Prepare clipping + if clip is None: + n = X_batch.shape[0] + clip = np.sqrt(n) + if clip < 0: + raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.") + + if sp_sparse.issparse(X_batch): + sums_genes = np.sum(X_batch, axis=0) + sums_cells = np.sum(X_batch, axis=1) + sum_total = 
np.sum(sums_genes).squeeze() + else: + sums_genes = np.sum(X_batch, axis=0, keepdims=True) + sums_cells = np.sum(X_batch, axis=1, keepdims=True) + sum_total = np.sum(sums_genes) + + # Compute pearson residuals in chunks + residual_gene_var = np.empty((X_batch.shape[1])) + for start in np.arange(0, X_batch.shape[1], chunksize): + stop = start + chunksize + mu = np.array(sums_cells @ sums_genes[:, start:stop] / sum_total) + X_dense = X_batch[:, start:stop].toarray() + residuals = (X_dense - mu) / np.sqrt(mu + mu ** 2 / theta) + residuals = np.clip(residuals, a_min=-clip, a_max=clip) + residual_gene_var[start:stop] = np.var(residuals, axis=0) + + # Add 0 values for genes that were filtered out + zero_gene_var = np.zeros(np.sum(~nonzero_genes)) + residual_gene_var = np.concatenate((residual_gene_var, zero_gene_var)) + # Order as before filtering + idxs = np.concatenate((np.where(nonzero_genes)[0], np.where(~nonzero_genes)[0])) + residual_gene_var = residual_gene_var[np.argsort(idxs)] + residual_gene_vars.append(residual_gene_var.reshape(1, -1)) + + residual_gene_vars = np.concatenate(residual_gene_vars, axis=0) + + # Get cutoffs and define hvgs per batch + residual_gene_vars_sorted = np.sort(residual_gene_vars, axis=1) + cutoffs_per_batch = residual_gene_vars_sorted[:, -n_top_genes] + highly_variable_per_batch = np.greater_equal( + residual_gene_vars.T, cutoffs_per_batch + ).T + + # Merge hvgs across batches + highly_variable_nbatches = np.sum(highly_variable_per_batch, axis=0) + highly_variable_intersection = highly_variable_nbatches == n_batches + + # Get rank per gene within each batch + # argsort twice gives ranks, small rank means most variable + ranks_residual_var = np.argsort(np.argsort(-residual_gene_vars, axis=1), axis=1) + ranks_residual_var = ranks_residual_var.astype(np.float32) + ranks_residual_var[ranks_residual_var >= n_top_genes] = np.nan + ranks_masked_array = np.ma.masked_invalid(ranks_residual_var) + # Median rank across batches, + # ignoring batches 
in which gene was not selected + medianrank_residual_var = np.ma.median(ranks_masked_array, axis=0).filled(np.nan) + + means, variances = materialize_as_ndarray(_get_mean_var(X)) + df = pd.DataFrame.from_dict( + dict( + means=means, + variances=variances, + residual_variances=np.mean(residual_gene_vars, axis=0).astype( + np.float32, copy=False + ), + highly_variable_rank=medianrank_residual_var, + highly_variable_nbatches=highly_variable_nbatches.astype(np.int64), + highly_variable_intersection=highly_variable_intersection, + ) + ) + df = df.set_index(adata.var_names) + + # Sort genes by how often they selected as hvg within each batch and + # break ties with median rank of residual variance across batches + df.sort_values( + ['highly_variable_nbatches', 'highly_variable_rank'], + ascending=[False, True], + na_position='last', + inplace=True, + ) + + high_var = np.zeros(df.shape[0]) + high_var[:n_top_genes] = True + df['highly_variable'] = high_var.astype(bool) + df = df.loc[adata.var_names, :] + + if inplace: + adata.uns['hvg'] = {'flavor': 'pearson_residuals', 'computed_on': computed_on} + logg.hint( + 'added\n' + ' \'highly_variable\', boolean vector (adata.var)\n' + ' \'highly_variable_rank\', float vector (adata.var)\n' + ' \'highly_variable_nbatches\', int vector (adata.var)\n' + ' \'highly_variable_intersection\', boolean vector (adata.var)\n' + ' \'means\', float vector (adata.var)\n' + ' \'variances\', float vector (adata.var)\n' + ' \'residual_variances\', float vector (adata.var)' + ) + adata.var['means'] = df['means'].values + adata.var['variances'] = df['variances'].values + adata.var['residual_variances'] = df['residual_variances'] + adata.var['highly_variable_rank'] = df['highly_variable_rank'].values + if batch_key is not None: + adata.var['highly_variable_nbatches'] = df[ + 'highly_variable_nbatches' + ].values + adata.var['highly_variable_intersection'] = df[ + 'highly_variable_intersection' + ].values + adata.var['highly_variable'] = 
df['highly_variable'].values + + if subset: + adata._inplace_subset_var(df['highly_variable'].values) + + else: + if batch_key is None: + df = df.drop( + ['highly_variable_nbatches', 'highly_variable_intersection'], axis=1 + ) + if subset: + df = df.iloc[df.highly_variable.values, :] + + return df + + +def highly_variable_genes( + adata: AnnData, + layer: Optional[str] = None, + n_top_genes: Optional[int] = None, + min_disp: Optional[float] = 0.5, + max_disp: Optional[float] = np.inf, + min_mean: Optional[float] = 0.0125, + max_mean: Optional[float] = 3, + span: Optional[float] = 0.3, + n_bins: int = 20, + theta: float = 100, + clip: Optional[float] = None, + chunksize: int = 1000, + flavor: Literal[ + 'seurat', 'cell_ranger', 'seurat_v3', 'pearson_residuals' + ] = 'seurat', + subset: bool = False, + inplace: bool = True, + batch_key: Optional[str] = None, + check_values: bool = True, +) -> Optional[pd.DataFrame]: + """\ + Annotate highly variable genes [Satija15]_ [Zheng17]_ [Stuart19]_ [Lause20]_. + + Expects logarithmized data, except when `flavor='seurat_v3'` or + `flavor='pearson_residuals'`, in which count data is expected. + + Depending on `flavor`, this reproduces the R-implementations of Seurat + [Satija15]_, Cell Ranger [Zheng17]_, and Seurat v3 [Stuart19]_, or uses + analytical Pearson residuals [Lause20]_. + + For the dispersion-based methods ([Satija15]_ and [Zheng17]_), the normalized + dispersion is obtained by scaling with the mean and standard deviation of + the dispersions for genes falling into a given bin for mean expression of + genes. This means that for each bin of mean expression, highly variable + genes are selected. + + For [Stuart19]_, a normalized variance for each gene is computed. First, the data + are standardized (i.e., z-score normalization per feature) with a regularized + standard deviation. Next, the normalized variance is computed as the variance + of each gene after the transformation. 
Genes are ranked by the normalized variance. + + For [Lause20]_, Pearson residuals of a negative binomial offset model (with + overdispersion theta shared across genes) are computed. By default, overdispersion + theta=100 is used and residuals are clipped to sqrt(n). Finally, genes are ranked + by residual variance. + + Parameters + ---------- + adata + The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond + to cells and columns to genes. + layer + If provided, use `adata.layers[layer]` for expression values instead of `adata.X`. + n_top_genes + Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or + `flavor='pearson_residuals'`. + min_mean + If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the + normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or + `flavor='pearson_residuals'`. + max_mean + If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the + normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or + `flavor='pearson_residuals'`. + min_disp + If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the + normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or + `flavor='pearson_residuals'`. + max_disp + If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the + normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or + `flavor='pearson_residuals'`. + span + The fraction of the data (cells) used when estimating the variance in the loess + model fit if `flavor='seurat_v3'`. + n_bins + Number of bins for binning the mean gene expression. Normalization is + done with respect to each bin. If just a single gene falls into a bin, + the normalized dispersion is artificially set to 1. You'll be informed + about this if you set `settings.verbosity = 4`. Ignored if + `flavor='pearson_residuals'`. 
+ theta + If `flavor='pearson_residuals'`, this is the NB overdispersion parameter theta. + Higher values correspond to less overdispersion (var = mean + mean^2/theta), and + `theta=np.Inf` corresponds to a Poisson model. + clip + If `flavor='pearson_residuals'`, this determines how residuals are clipped: + + * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \ + where n is the number of cells in the dataset (default behavior). + * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ + `clip=np.Inf` for no clipping. + + chunksize + If `flavor='pearson_residuals'`, this dertermines how many genes are processed at + once while computing the residual variance. Choosing a smaller value will reduce + the required memory. + flavor + Choose the flavor for identifying highly variable genes. For the dispersion + based methods in their default workflows, Seurat passes the cutoffs whereas + Cell Ranger passes `n_top_genes`. + subset + Inplace subset to highly-variable genes if `True` otherwise merely indicate + highly variable genes. + inplace + Whether to place calculated metrics in `.var` or return them. + batch_key + If specified, highly-variable genes are selected within each batch separately and merged. + This simple process avoids the selection of batch-specific genes and acts as a + lightweight batch correction method. For all flavors, genes are first sorted + by how many batches they are a HVG. For dispersion-based flavors ties are broken + by normalized dispersion. If `flavor = 'seurat_v3'`, ties are broken by the median + (across batches) rank based on within-batch normalized variance. If + `flavor='pearson_residuals'`, ties are broken by the median rank (across batches) + based on within-batch residual variance. + check_values + Check if counts in selected layer are integers. A Warning is returned if set to True. + Only used if `flavor='seurat_v3'` or `flavor='pearson_residuals'`. 
+ + + Returns + ------- + Depending on `inplace` returns calculated metrics (:class:`~pandas.DataFrame`) or + updates `.var` with the following fields + + highly_variable : bool + boolean indicator of highly-variable genes + **means** + means per gene + **dispersions** + For dispersion-based flavors, dispersions per gene + **dispersions_norm** + For dispersion-based flavors, normalized dispersions per gene + **variances** + For `flavor='seurat_v3'` and `flavor='pearson_residuals'`, variance per gene + **variances_norm** + For `flavor='seurat_v3'`, normalized variance per gene, averaged in + the case of multiple batches + **residual_variances** + For `flavor='pearson_residuals'`, residual variance per gene. Averaged in the case of + multiple batches. + highly_variable_rank : float + For `flavor='seurat_v3'`, rank of the gene according to normalized + variance, median rank in the case of multiple batches + For `flavor='pearson_residuals'`, rank of the gene according to residual + variance, median rank in the case of multiple batches + highly_variable_nbatches : int + If batch_key is given, this denotes in how many batches genes are detected as HVG + highly_variable_intersection : bool + If batch_key is given, this denotes the genes that are highly variable in all batches + + Notes + ----- + This function replaces :func:`~scanpy.pp.filter_genes_dispersion`. + """ + + logg.info('extracting highly variable genes') + + if not isinstance(adata, AnnData): + raise ValueError( + '`pp.highly_variable_genes` expects an `AnnData` argument, ' + 'pass `inplace=False` if you want to return a `pd.DataFrame`.' 
+ ) + + if flavor == 'pearson_residuals': + if n_top_genes is None: + raise ValueError( + "`pp.highly_variable_genes` requires the argument `n_top_genes`" + " for `flavor='pearson_residuals'`" + ) + return _highly_variable_pearson_residuals( + adata, + layer=layer, + n_top_genes=n_top_genes, + batch_key=batch_key, + theta=theta, + clip=clip, + chunksize=chunksize, + subset=subset, + check_values=check_values, + inplace=inplace, + ) diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py index e69de29bb2..5e068db8db 100644 --- a/scanpy/experimental/pp/_normalization.py +++ b/scanpy/experimental/pp/_normalization.py @@ -0,0 +1,242 @@ +from typing import Optional, Union, Dict +from warnings import warn + +import numpy as np +import pandas as pd +from anndata import AnnData +from scipy.sparse import issparse + +from scanpy import logging as logg + +from scanpy._utils import view_to_actual, check_nonnegative_integers +from scanpy.get import _get_obs_rep, _set_obs_rep + +from scanpy.preprocessing._pca import pca + + +def _pearson_residuals(X, theta, clip, check_values, copy=False): + + X = X.copy() if copy else X + + # check theta + if theta <= 0: + # TODO: would "underdispersion" with negative theta make sense? + # then only theta=0 were undefined.. 
+ raise ValueError('Pearson residuals require theta > 0') + # prepare clipping + if clip is None: + n = X.shape[0] + clip = np.sqrt(n) + if clip < 0: + raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.") + + if check_values and not check_nonnegative_integers(X): + warn( + "`normalize_pearson_residuals()` expects raw count data, but non-integers were found.", + UserWarning, + ) + + if issparse(X): + sums_genes = np.sum(X, axis=0) + sums_cells = np.sum(X, axis=1) + sum_total = np.sum(sums_genes).squeeze() + else: + sums_genes = np.sum(X, axis=0, keepdims=True) + sums_cells = np.sum(X, axis=1, keepdims=True) + sum_total = np.sum(sums_genes) + + mu = np.array(sums_cells @ sums_genes / sum_total) + diff = np.array(X - mu) + residuals = diff / np.sqrt(mu + mu ** 2 / theta) + + # clip + residuals = np.clip(residuals, a_min=-clip, a_max=clip) + + return residuals + + +def normalize_pearson_residuals( + adata: AnnData, + theta: float = 100, + clip: Optional[float] = None, + layer: Optional[str] = None, + copy: bool = False, + check_values: bool = True, + inplace: bool = True, +) -> Optional[Dict[str, np.ndarray]]: + """\ + Computes analytic Pearson residuals, based on [Lause20]_. + + Assuming a negative binomial offset model with overdispersion + theta shared across genes, computes Pearson residuals. By default, residuals + are clipped to sqrt(n) and overdispersion theta=100 is used. + + Params + ------ + adata + The annotated data matrix of shape `n_obs` × `n_vars`. + Rows correspond to cells and columns to genes. + theta + The NB overdispersion parameter theta. Higher values correspond to + less overdispersion (var = mean + mean^2/theta), and `theta=np.Inf` + corresponds to a Poisson model. + clip + Determines if and how residuals are clipped: + + * If `None`, residuals are clipped to the interval \ + [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). 
+ * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ + `clip=np.Inf` for no clipping. + + layer + Layer to normalize instead of `X`. If `None`, `X` is normalized. + copy + Whether to modify copied input object. Not compatible with + `inplace=False`. + check_values + Check if counts in selected layer are integers. A Warning is returned if set to True. + inplace + Whether to update `adata` or return dictionary with normalized copies + of `adata.X` and `adata.layers`. + + Returns + ------- + Returns dictionary with Pearson residuals and settings + or updates `adata` with normalized version of the original + `adata.X` and `adata.layers`, depending on `inplace`. + + """ + + if copy: + if not inplace: + raise ValueError("`copy=True` cannot be used with `inplace=False`.") + adata = adata.copy() + + view_to_actual(adata) + X = _get_obs_rep(adata, layer=layer) + computed_on = layer if layer else 'adata.X' + + msg = f'computing analytic Pearson residuals on {computed_on}' + start = logg.info(msg) + + residuals = _pearson_residuals(X, theta, clip, check_values, copy=~inplace) + settings_dict = dict(theta=theta, clip=clip, computed_on=computed_on) + + if inplace: + _set_obs_rep(adata, residuals, layer=layer) + adata.uns['pearson_residuals_normalization'] = settings_dict + else: + results_dict = dict(X=residuals, **settings_dict) + + logg.info(' finished ({time_passed})', time=start) + + if copy: + return adata + elif not inplace: + return results_dict + + +def normalize_pearson_residuals_pca( + adata: AnnData, + theta: float = 100, + clip: Optional[float] = None, + n_comps_pca: Optional[int] = 50, + random_state_pca: Optional[float] = 0, + use_highly_variable: bool = True, + kwargs_pca: Optional[dict] = {}, + check_values: bool = True, + inplace: bool = True, +) -> Optional[pd.DataFrame]: + """\ + Applies Pearson residual normalization and PCA, based on [Lause20]_. 
+ + Operates on the subset of highly variable genes in `adata.var['highly_variable']` by default. + + + Parameters + ---------- + adata + The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond + to cells and columns to genes. + use_highly_variable + Whether to use the gene selection in `adata.var['highly_variable']` to + subset the data before normalizing (default) or proceed on the full + dataset. + theta + This is the NB overdispersion parameter theta for Pearson residual + computations. Higher values correspond to less overdispersion + (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a + Poisson model. + clip + This determines how Pearson residuals are clipped: + + * If `None`, residuals are clipped to the interval \ + [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). + * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ + `clip=np.Inf` for no clipping. + + n_comps_pca + Number of principal components to compute. + random_state_pca + Change to use different initial states for the optimization. + kwargs_pca + Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`. + check_values + Check if counts in selected layer are integers. A Warning is returned if set to True. + inplace + Whether to place results in `adata` or return them. + + + Returns + ------- + If `inplace=False`, returns the Pearson residual-based PCA results + (`adata_pca`). + If `inplace=True`, updates `adata` with the following fields: + + `.uns['pearson_residuals_normalization']['pearson_residuals_df']` + The hvg-subset, normalized by Pearson residuals + `.uns['pearson_residuals_normalization']['theta']` + The used value of the overdisperion parameter theta + `.uns['pearson_residuals_normalization']['clip']` + The used value of the clipping parameter + + `.obsm['X_pearson_residuals_pca']` + PCA representation of data after gene selection and Pearson residual + normalization. 
+ `.uns['pearson_residuals_pca']['PCs']` + The principal components containing the loadings. + `.uns['pearson_residuals_pca']['variance_ratio']` + Ratio of explained variance. + `.uns['pearson_residuals_pca']['variance']` + Explained variance, equivalent to the eigenvalues of the + covariance matrix. + + """ + + if use_highly_variable and 'highly_variable' in adata.var_keys(): + # TODO: are these copies needed? + adata_pca = adata[:, adata.var['highly_variable']].copy() + else: + # TODO: are these copies needed? + adata_pca = adata.copy() + + normalize_pearson_residuals( + adata_pca, theta=theta, clip=clip, check_values=check_values + ) + pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca, **kwargs_pca) + + if inplace: + norm_settings = adata_pca.uns['pearson_residuals_normalization'] + norm_dict = dict(**norm_settings, pearson_residuals_df=adata_pca.to_df()) + pca_settings = adata_pca.uns['pca'] + pca_dict = dict(**pca_settings, PCs=adata_pca.varm['PCs']) + adata.uns['pearson_residuals_pca'] = pca_dict + adata.uns['pearson_residuals_normalization'] = norm_dict + adata.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'] + return None + else: + adata_pca.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'].copy() + adata_pca.uns['pearson_residuals_pca'] = adata_pca.uns['pca'].copy() + del adata_pca.obsm['X_pca'] + del adata_pca.uns['pca'] + return adata_pca diff --git a/scanpy/preprocessing/__init__.py b/scanpy/preprocessing/__init__.py index 8adee1f813..b811a89cf0 100644 --- a/scanpy/preprocessing/__init__.py +++ b/scanpy/preprocessing/__init__.py @@ -1,9 +1,4 @@ -from ._recipes import ( - recipe_zheng17, - recipe_weinreb17, - recipe_seurat, - recipe_pearson_residuals, -) +from ._recipes import recipe_zheng17, recipe_weinreb17, recipe_seurat from ._simple import filter_cells, filter_genes from ._deprecated.highly_variable_genes import filter_genes_dispersion from ._highly_variable_genes import highly_variable_genes @@ -12,10 +7,5 @@ from 
._pca import pca from ._qc import calculate_qc_metrics from ._combat import combat -from ._normalization import ( - normalize_total, - normalize_pearson_residuals, - normalize_pearson_residuals_pca, -) - +from ._normalization import normalize_total from ..neighbors import neighbors diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 1575f5b083..d9c8aae568 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -176,210 +176,6 @@ def _highly_variable_genes_seurat_v3( return df -def _highly_variable_pearson_residuals( - adata: AnnData, - layer: Optional[str] = None, - n_top_genes: int = 1000, - batch_key: Optional[str] = None, - theta: float = 100, - clip: Optional[float] = None, - chunksize: int = 100, - check_values: bool = True, - subset: bool = False, - inplace: bool = True, -) -> Optional[pd.DataFrame]: - """\ - See `highly_variable_genes`. - - Returns - ------- - Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) - or updates `.var` with the following fields: - - highly_variable : bool - boolean indicator of highly-variable genes. - means : float - means per gene. - variances : float - variances per gene. - residual_variances : float - Pearson residual variance per gene. Averaged in the case of multiple - batches. - highly_variable_rank : float - Rank of the gene according to residual variance, median rank in the - case of multiple batches. NaN for non-HVGs. - highly_variable_nbatches : int - If batch_key is given, this denotes in how many batches genes are - detected as HVG. - highly_variable_intersection : bool - If batch_key is given, this denotes the genes that are highly variable - in all batches. 
- """ - - view_to_actual(adata) - X = _get_obs_rep(adata, layer=layer) - computed_on = layer if layer else 'adata.X' - - # Check for raw counts - if check_values and (check_nonnegative_integers(X) is False): - warnings.warn( - "`flavor='pearson_residuals'` expects raw count data, but non-integers were found.", - UserWarning, - ) - # check theta - if theta <= 0: - # TODO: would "underdispersion" with negative theta make sense? - # then only theta=0 were undefined.. - raise ValueError('Pearson residuals require theta > 0') - # prepare clipping - - if batch_key is None: - batch_info = np.zeros(adata.shape[0], dtype=int) - else: - batch_info = adata.obs[batch_key].values - n_batches = len(np.unique(batch_info)) - - # Get pearson residuals for each batch separately - residual_gene_vars = [] - for batch in np.unique(batch_info): - - adata_subset = adata[batch_info == batch] - - # Filter out zero genes - with settings.verbosity.override(Verbosity.error): - nonzero_genes = filter_genes(adata_subset, min_cells=1, inplace=False)[0] - adata_subset = adata_subset[:, nonzero_genes] - - if layer is not None: - X_batch = adata_subset.layers[layer] - else: - X_batch = adata_subset.X - - # Prepare clipping - if clip is None: - n = X_batch.shape[0] - clip = np.sqrt(n) - if clip < 0: - raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.") - - if sp_sparse.issparse(X_batch): - sums_genes = np.sum(X_batch, axis=0) - sums_cells = np.sum(X_batch, axis=1) - sum_total = np.sum(sums_genes).squeeze() - else: - sums_genes = np.sum(X_batch, axis=0, keepdims=True) - sums_cells = np.sum(X_batch, axis=1, keepdims=True) - sum_total = np.sum(sums_genes) - - # Compute pearson residuals in chunks - residual_gene_var = np.empty((X_batch.shape[1])) - for start in np.arange(0, X_batch.shape[1], chunksize): - stop = start + chunksize - mu = np.array(sums_cells @ sums_genes[:, start:stop] / sum_total) - X_dense = X_batch[:, start:stop].toarray() - residuals = (X_dense - mu) / np.sqrt(mu 
+ mu ** 2 / theta) - residuals = np.clip(residuals, a_min=-clip, a_max=clip) - residual_gene_var[start:stop] = np.var(residuals, axis=0) - - # Add 0 values for genes that were filtered out - zero_gene_var = np.zeros(np.sum(~nonzero_genes)) - residual_gene_var = np.concatenate((residual_gene_var, zero_gene_var)) - # Order as before filtering - idxs = np.concatenate((np.where(nonzero_genes)[0], np.where(~nonzero_genes)[0])) - residual_gene_var = residual_gene_var[np.argsort(idxs)] - residual_gene_vars.append(residual_gene_var.reshape(1, -1)) - - residual_gene_vars = np.concatenate(residual_gene_vars, axis=0) - - # Get cutoffs and define hvgs per batch - residual_gene_vars_sorted = np.sort(residual_gene_vars, axis=1) - cutoffs_per_batch = residual_gene_vars_sorted[:, -n_top_genes] - highly_variable_per_batch = np.greater_equal( - residual_gene_vars.T, cutoffs_per_batch - ).T - - # Merge hvgs across batches - highly_variable_nbatches = np.sum(highly_variable_per_batch, axis=0) - highly_variable_intersection = highly_variable_nbatches == n_batches - - # Get rank per gene within each batch - # argsort twice gives ranks, small rank means most variable - ranks_residual_var = np.argsort(np.argsort(-residual_gene_vars, axis=1), axis=1) - ranks_residual_var = ranks_residual_var.astype(np.float32) - ranks_residual_var[ranks_residual_var >= n_top_genes] = np.nan - ranks_masked_array = np.ma.masked_invalid(ranks_residual_var) - # Median rank across batches, - # ignoring batches in which gene was not selected - medianrank_residual_var = np.ma.median(ranks_masked_array, axis=0).filled(np.nan) - - means, variances = materialize_as_ndarray(_get_mean_var(X)) - df = pd.DataFrame.from_dict( - dict( - means=means, - variances=variances, - residual_variances=np.mean(residual_gene_vars, axis=0).astype( - np.float32, copy=False - ), - highly_variable_rank=medianrank_residual_var, - highly_variable_nbatches=highly_variable_nbatches.astype(np.int64), - 
highly_variable_intersection=highly_variable_intersection, - ) - ) - df = df.set_index(adata.var_names) - - # Sort genes by how often they selected as hvg within each batch and - # break ties with median rank of residual variance across batches - df.sort_values( - ['highly_variable_nbatches', 'highly_variable_rank'], - ascending=[False, True], - na_position='last', - inplace=True, - ) - - high_var = np.zeros(df.shape[0]) - high_var[:n_top_genes] = True - df['highly_variable'] = high_var.astype(bool) - df = df.loc[adata.var_names, :] - - if inplace: - adata.uns['hvg'] = {'flavor': 'pearson_residuals', 'computed_on': computed_on} - logg.hint( - 'added\n' - ' \'highly_variable\', boolean vector (adata.var)\n' - ' \'highly_variable_rank\', float vector (adata.var)\n' - ' \'highly_variable_nbatches\', int vector (adata.var)\n' - ' \'highly_variable_intersection\', boolean vector (adata.var)\n' - ' \'means\', float vector (adata.var)\n' - ' \'variances\', float vector (adata.var)\n' - ' \'residual_variances\', float vector (adata.var)' - ) - adata.var['means'] = df['means'].values - adata.var['variances'] = df['variances'].values - adata.var['residual_variances'] = df['residual_variances'] - adata.var['highly_variable_rank'] = df['highly_variable_rank'].values - if batch_key is not None: - adata.var['highly_variable_nbatches'] = df[ - 'highly_variable_nbatches' - ].values - adata.var['highly_variable_intersection'] = df[ - 'highly_variable_intersection' - ].values - adata.var['highly_variable'] = df['highly_variable'].values - - if subset: - adata._inplace_subset_var(df['highly_variable'].values) - - else: - if batch_key is None: - df = df.drop( - ['highly_variable_nbatches', 'highly_variable_intersection'], axis=1 - ) - if subset: - df = df.iloc[df.highly_variable.values, :] - - return df - - def _highly_variable_genes_single_batch( adata: AnnData, layer: Optional[str] = None, @@ -502,26 +298,20 @@ def highly_variable_genes( max_mean: Optional[float] = 3, span: 
Optional[float] = 0.3, n_bins: int = 20, - theta: float = 100, - clip: Optional[float] = None, - chunksize: int = 1000, - flavor: Literal[ - 'seurat', 'cell_ranger', 'seurat_v3', 'pearson_residuals' - ] = 'seurat', + flavor: Literal['seurat', 'cell_ranger', 'seurat_v3'] = 'seurat', subset: bool = False, inplace: bool = True, batch_key: Optional[str] = None, check_values: bool = True, ) -> Optional[pd.DataFrame]: """\ - Annotate highly variable genes [Satija15]_ [Zheng17]_ [Stuart19]_ [Lause20]_. + Annotate highly variable genes [Satija15]_ [Zheng17]_ [Stuart19]_. - Expects logarithmized data, except when `flavor='seurat_v3'` or - `flavor='pearson_residuals'`, in which count data is expected. + Expects logarithmized data, except when `flavor='seurat_v3'`, in which count + data is expected. Depending on `flavor`, this reproduces the R-implementations of Seurat - [Satija15]_, Cell Ranger [Zheng17]_, and Seurat v3 [Stuart19]_, or uses - analytical Pearson residuals [Lause20]_. + [Satija15]_, Cell Ranger [Zheng17]_, and Seurat v3 [Stuart19]_. For the dispersion-based methods ([Satija15]_ and [Zheng17]_), the normalized dispersion is obtained by scaling with the mean and standard deviation of @@ -534,10 +324,8 @@ def highly_variable_genes( standard deviation. Next, the normalized variance is computed as the variance of each gene after the transformation. Genes are ranked by the normalized variance. - For [Lause20]_, Pearson residuals of a negative binomial offset model (with - overdispersion theta shared across genes) are computed. By default, overdispersion - theta=100 is used and residuals are clipped to sqrt(n). Finally, genes are ranked - by residual variance. + See also `scanpy.experimental.pp._highly_variable_genes` for additional flavours + (e.g. Pearson residuals). Parameters ---------- @@ -547,24 +335,19 @@ def highly_variable_genes( layer If provided, use `adata.layers[layer]` for expression values instead of `adata.X`. 
n_top_genes - Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or - `flavor='pearson_residuals'`. + Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'`. min_mean If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the - normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or - `flavor='pearson_residuals'`. + normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`. max_mean If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the - normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or - `flavor='pearson_residuals'`. + normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`. min_disp If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the - normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or - `flavor='pearson_residuals'`. + normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`. max_disp If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the - normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or - `flavor='pearson_residuals'`. + normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`. span The fraction of the data (cells) used when estimating the variance in the loess model fit if `flavor='seurat_v3'`. @@ -572,24 +355,7 @@ def highly_variable_genes( Number of bins for binning the mean gene expression. Normalization is done with respect to each bin. If just a single gene falls into a bin, the normalized dispersion is artificially set to 1. You'll be informed - about this if you set `settings.verbosity = 4`. Ignored if - `flavor='pearson_residuals'`. - theta - If `flavor='pearson_residuals'`, this is the NB overdispersion parameter theta. - Higher values correspond to less overdispersion (var = mean + mean^2/theta), and - `theta=np.Inf` corresponds to a Poisson model. 
- clip - If `flavor='pearson_residuals'`, this determines how residuals are clipped: - - * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \ - where n is the number of cells in the dataset (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ - `clip=np.Inf` for no clipping. - - chunksize - If `flavor='pearson_residuals'`, this dertermines how many genes are processed at - once while computing the residual variance. Choosing a smaller value will reduce - the required memory. + about this if you set `settings.verbosity = 4`. flavor Choose the flavor for identifying highly variable genes. For the dispersion based methods in their default workflows, Seurat passes the cutoffs whereas @@ -605,12 +371,10 @@ def highly_variable_genes( lightweight batch correction method. For all flavors, genes are first sorted by how many batches they are a HVG. For dispersion-based flavors ties are broken by normalized dispersion. If `flavor = 'seurat_v3'`, ties are broken by the median - (across batches) rank based on within-batch normalized variance. If - `flavor='pearson_residuals'`, ties are broken by the median rank (across batches) - based on within-batch residual variance. + (across batches) rank based on within-batch normalized variance. check_values Check if counts in selected layer are integers. A Warning is returned if set to True. - Only used if `flavor='seurat_v3'` or `flavor='pearson_residuals'`. + Only used if `flavor='seurat_v3'`. Returns @@ -627,18 +391,13 @@ def highly_variable_genes( **dispersions_norm** For dispersion-based flavors, normalized dispersions per gene **variances** - For `flavor='seurat_v3'` and `flavor='pearson_residuals'`, variance per gene + For `flavor='seurat_v3'`, variance per gene **variances_norm** For `flavor='seurat_v3'`, normalized variance per gene, averaged in the case of multiple batches - **residual_variances** - For `flavor='pearson_residuals'`, residual variance per gene. 
Averaged in the case of - multiple batches. highly_variable_rank : float For `flavor='seurat_v3'`, rank of the gene according to normalized variance, median rank in the case of multiple batches - For `flavor='pearson_residuals'`, rank of the gene according to residual - variance, median rank in the case of multiple batches highly_variable_nbatches : int If batch_key is given, this denotes in how many batches genes are detected as HVG highly_variable_intersection : bool @@ -673,24 +432,6 @@ def highly_variable_genes( subset=subset, inplace=inplace, ) - if flavor == 'pearson_residuals': - if n_top_genes is None: - raise ValueError( - "`pp.highly_variable_genes` requires the argument `n_top_genes`" - " for `flavor='pearson_residuals'`" - ) - return _highly_variable_pearson_residuals( - adata, - layer=layer, - n_top_genes=n_top_genes, - batch_key=batch_key, - theta=theta, - clip=clip, - chunksize=chunksize, - subset=subset, - check_values=check_values, - inplace=inplace, - ) if batch_key is None: df = _highly_variable_genes_single_batch( diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index adc8820b2e..0a853d3c89 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -31,234 +31,6 @@ def _normalize_data(X, counts, after=None, copy=False): return X -def _pearson_residuals(X, theta, clip, check_values, copy=False): - - X = X.copy() if copy else X - - # check theta - if theta <= 0: - # TODO: would "underdispersion" with negative theta make sense? - # then only theta=0 were undefined.. 
- raise ValueError('Pearson residuals require theta > 0') - # prepare clipping - if clip is None: - n = X.shape[0] - clip = np.sqrt(n) - if clip < 0: - raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.") - - if check_values and not check_nonnegative_integers(X): - warn( - "`normalize_pearson_residuals()` expects raw count data, but non-integers were found.", - UserWarning, - ) - - if issparse(X): - sums_genes = np.sum(X, axis=0) - sums_cells = np.sum(X, axis=1) - sum_total = np.sum(sums_genes).squeeze() - else: - sums_genes = np.sum(X, axis=0, keepdims=True) - sums_cells = np.sum(X, axis=1, keepdims=True) - sum_total = np.sum(sums_genes) - - mu = np.array(sums_cells @ sums_genes / sum_total) - diff = np.array(X - mu) - residuals = diff / np.sqrt(mu + mu ** 2 / theta) - - # clip - residuals = np.clip(residuals, a_min=-clip, a_max=clip) - - return residuals - - -def normalize_pearson_residuals( - adata: AnnData, - theta: float = 100, - clip: Optional[float] = None, - layer: Optional[str] = None, - copy: bool = False, - check_values: bool = True, - inplace: bool = True, -) -> Optional[Dict[str, np.ndarray]]: - """\ - Computes analytic Pearson residuals, based on [Lause20]_. - - Assuming a negative binomial offset model with overdispersion - theta shared across genes, computes Pearson residuals. By default, residuals - are clipped to sqrt(n) and overdispersion theta=100 is used. - - Params - ------ - adata - The annotated data matrix of shape `n_obs` × `n_vars`. - Rows correspond to cells and columns to genes. - theta - The NB overdispersion parameter theta. Higher values correspond to - less overdispersion (var = mean + mean^2/theta), and `theta=np.Inf` - corresponds to a Poisson model. - clip - Determines if and how residuals are clipped: - - * If `None`, residuals are clipped to the interval \ - [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). 
- * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ - `clip=np.Inf` for no clipping. - - layer - Layer to normalize instead of `X`. If `None`, `X` is normalized. - copy - Whether to modify copied input object. Not compatible with - `inplace=False`. - check_values - Check if counts in selected layer are integers. A Warning is returned if set to True. - inplace - Whether to update `adata` or return dictionary with normalized copies - of `adata.X` and `adata.layers`. - - Returns - ------- - Returns dictionary with Pearson residuals and settings - or updates `adata` with normalized version of the original - `adata.X` and `adata.layers`, depending on `inplace`. - - """ - - if copy: - if not inplace: - raise ValueError("`copy=True` cannot be used with `inplace=False`.") - adata = adata.copy() - - view_to_actual(adata) - X = _get_obs_rep(adata, layer=layer) - computed_on = layer if layer else 'adata.X' - - msg = f'computing analytic Pearson residuals on {computed_on}' - start = logg.info(msg) - - residuals = _pearson_residuals(X, theta, clip, check_values, copy=~inplace) - settings_dict = dict(theta=theta, clip=clip, computed_on=computed_on) - - if inplace: - _set_obs_rep(adata, residuals, layer=layer) - adata.uns['pearson_residuals_normalization'] = settings_dict - else: - results_dict = dict(X=residuals, **settings_dict) - - logg.info(' finished ({time_passed})', time=start) - - if copy: - return adata - elif not inplace: - return results_dict - - -def normalize_pearson_residuals_pca( - adata: AnnData, - theta: float = 100, - clip: Optional[float] = None, - n_comps_pca: Optional[int] = 50, - random_state_pca: Optional[float] = 0, - use_highly_variable: bool = True, - kwargs_pca: Optional[dict] = {}, - check_values: bool = True, - inplace: bool = True, -) -> Optional[pd.DataFrame]: - """\ - Applies Pearson residual normalization and PCA, based on [Lause20]_. 
- - Operates on the subset of highly variable genes in `adata.var['highly_variable']` by default. - - - Parameters - ---------- - adata - The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond - to cells and columns to genes. - use_highly_variable - Whether to use the gene selection in `adata.var['highly_variable']` to - subset the data before normalizing (default) or proceed on the full - dataset. - theta - This is the NB overdispersion parameter theta for Pearson residual - computations. Higher values correspond to less overdispersion - (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a - Poisson model. - clip - This determines how Pearson residuals are clipped: - - * If `None`, residuals are clipped to the interval \ - [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ - `clip=np.Inf` for no clipping. - - n_comps_pca - Number of principal components to compute. - random_state_pca - Change to use different initial states for the optimization. - kwargs_pca - Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`. - check_values - Check if counts in selected layer are integers. A Warning is returned if set to True. - inplace - Whether to place results in `adata` or return them. - - - Returns - ------- - If `inplace=False`, returns the Pearson residual-based PCA results - (`adata_pca`). - If `inplace=True`, updates `adata` with the following fields: - - `.uns['pearson_residuals_normalization']['pearson_residuals_df']` - The hvg-subset, normalized by Pearson residuals - `.uns['pearson_residuals_normalization']['theta']` - The used value of the overdisperion parameter theta - `.uns['pearson_residuals_normalization']['clip']` - The used value of the clipping parameter - - `.obsm['X_pearson_residuals_pca']` - PCA representation of data after gene selection and Pearson residual - normalization. 
- `.uns['pearson_residuals_pca']['PCs']` - The principal components containing the loadings. - `.uns['pearson_residuals_pca']['variance_ratio']` - Ratio of explained variance. - `.uns['pearson_residuals_pca']['variance']` - Explained variance, equivalent to the eigenvalues of the - covariance matrix. - - """ - - if use_highly_variable and 'highly_variable' in adata.var_keys(): - # TODO: are these copies needed? - adata_pca = adata[:, adata.var['highly_variable']].copy() - else: - # TODO: are these copies needed? - adata_pca = adata.copy() - - normalize_pearson_residuals( - adata_pca, theta=theta, clip=clip, check_values=check_values - ) - pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca, **kwargs_pca) - - if inplace: - norm_settings = adata_pca.uns['pearson_residuals_normalization'] - norm_dict = dict(**norm_settings, pearson_residuals_df=adata_pca.to_df()) - pca_settings = adata_pca.uns['pca'] - pca_dict = dict(**pca_settings, PCs=adata_pca.varm['PCs']) - adata.uns['pearson_residuals_pca'] = pca_dict - adata.uns['pearson_residuals_normalization'] = norm_dict - adata.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'] - return None - else: - adata_pca.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'].copy() - adata_pca.uns['pearson_residuals_pca'] = adata_pca.uns['pca'].copy() - del adata_pca.obsm['X_pca'] - del adata_pca.uns['pca'] - return adata_pca - - def normalize_total( adata: AnnData, target_sum: Optional[float] = None, diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py index fd28bea576..a4696e0827 100644 --- a/scanpy/preprocessing/_recipes.py +++ b/scanpy/preprocessing/_recipes.py @@ -170,158 +170,3 @@ def recipe_zheng17( pp.scale(adata) logg.info(' finished', time=start) return adata if copy else None - - -def recipe_pearson_residuals( - adata: AnnData, - n_top_genes: int = 1000, - theta: float = 100, - clip: Optional[float] = None, - chunksize: int = 1000, - batch_key: Optional[str] = None, - 
n_comps_pca: Optional[int] = 50, - random_state_pca: Optional[float] = 0, - kwargs_pca: dict = {}, - check_values: bool = True, - inplace: bool = True, -) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]: - """\ - Gene selection and normalization based on [Lause20]_. - - Applies gene selection based on Pearson residuals. On the resulting subset, - Pearson residual normalization and PCA are performed. - - - Parameters - ---------- - adata - The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond - to cells and columns to genes. - n_top_genes - Number of highly-variable genes to keep. Mandatory if - `flavor='seurat_v3'` or `flavor='pearson_residuals'`. - chunksize - This dertermines how many genes are processed at once while computing - the Pearson residual variance. Choosing a smaller value will reduce - the required memory. - theta - This is the NB overdispersion parameter theta for Pearson residual - computations. Higher values correspond to less overdispersion - (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a - Poisson model. - clip - This determines if and how Pearson residuals are clipped: - - * If `None`, residuals are clipped to the interval \ - [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ - `clip=np.Inf` for no clipping. - - batch_key - If specified, highly-variable genes are selected within each batch - separately and merged. This simple process avoids the selection of - batch-specific genes and acts as a lightweight batch correction - method. For all flavors, genes are first sorted by how many batches - they are a HVG. Ties are broken by the median rank (across batches) - based on within-batch residual variance. - - n_comps_pca - Number of principal components to compute. - random_state_pca - Change to use different initial states for the optimization. 
- kwargs_pca - Dictionary of further keyword arguments passed on to `sc.pp.pca()`. - check_values - Check if counts in selected layer are integers. A Warning is returned if set to True. - inplace - Whether to place results in `adata` or return them. - - Returns - ------ - If `inplace=False`, separately returns the gene selection results (`hvg`) - and Pearson residual-based PCA results (`adata_pca`). If `inplace=True`, - updates `adata` with the following fields for gene selection results…: - - `.var['highly_variable']` - boolean indicator of highly-variable genes. - `.var['means']` - means per gene. - `.var['variances']` - variances per gene. - `.var['residual_variances']` - Pearson residual variance per gene. Averaged in the case of multiple - batches. - `.var['highly_variable_rank']` - Rank of the gene according to residual variance, median rank in the - case of multiple batches. - `.var['highly_variable_nbatches']` - If batch_key is given, this denotes in how many batches genes are - detected as HVG. - `.var['highly_variable_intersection']` - If batch_key is given, this denotes the genes that are highly variable - in all batches. - - …and the following fields for Pearson residual-based PCA results and - normalization settings: - - `.uns['pearson_residuals_normalization']['pearson_residuals_df']` - The hvg-subset, normalized by Pearson residuals. - `.uns['pearson_residuals_normalization']['theta']` - The used value of the overdisperion parameter theta. - `.uns['pearson_residuals_normalization']['clip']` - The used value of the clipping parameter. - - `.obsm['pearson_residuals_X_pca']` - PCA representation of data after gene selection and Pearson residual - normalization. - `.uns['pearson_residuals_pca']['PCs']` - The principal components containing the loadings. - `.uns['pearson_residuals_pca']['variance_ratio']` - Ratio of explained variance. 
- `.uns['pearson_residuals_pca']['variance']` - Explained variance, equivalent to the eigenvalues of the - covariance matrix. - - """ - - hvg_args = dict( - flavor='pearson_residuals', - n_top_genes=n_top_genes, - batch_key=batch_key, - theta=theta, - clip=clip, - chunksize=chunksize, - check_values=check_values, - ) - - if inplace: - pp.highly_variable_genes(adata, **hvg_args, inplace=True) - # TODO: are these copies needed? - adata_pca = adata[:, adata.var['highly_variable']].copy() - else: - hvg = pp.highly_variable_genes(adata, **hvg_args, inplace=False) - # TODO: are these copies needed? - adata_pca = adata[:, hvg['highly_variable']].copy() - - pp.normalize_pearson_residuals( - adata_pca, theta=theta, clip=clip, check_values=check_values - ) - pp.pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca, **kwargs_pca) - - if inplace: - normalization_param = adata_pca.uns['pearson_residuals_normalization'] - normalization_dict = dict( - **normalization_param, pearson_residuals_df=adata_pca.to_df() - ) - pca_param = adata_pca.uns['pca'] - pca_dict = dict(**pca_param, PCs=adata_pca.varm['PCs']) - adata.uns['pearson_residuals_pca'] = pca_dict - adata.uns['pearson_residuals_normalization'] = normalization_dict - adata.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'] - return None - else: - adata_pca.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'].copy() - adata_pca.uns['pearson_residuals_pca'] = adata_pca.uns['pca'].copy() - del adata_pca.obsm['X_pca'] - del adata_pca.uns['pca'] - return adata_pca, hvg From dd16140cdb95961d8d26497ce3fd20dfea6904d4 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Wed, 14 Jul 2021 16:49:43 +0200 Subject: [PATCH 39/96] linking tests to new experimental submodule, style cleanup --- scanpy/tests/test_highly_variable_genes.py | 24 +++++++------- scanpy/tests/test_normalization.py | 38 ++++++++++++---------- 2 files changed, 33 insertions(+), 29 deletions(-) diff --git a/scanpy/tests/test_highly_variable_genes.py 
b/scanpy/tests/test_highly_variable_genes.py index c80e0a442a..a92c59ebf5 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -58,7 +58,7 @@ def test_highly_variable_genes_basic(): def _residual_var_reference(adata, clip=None, theta=100): - sc.pp.normalize_pearson_residuals(adata, clip=clip, theta=theta) + sc.experimental.pp.normalize_pearson_residuals(adata, clip=clip, theta=theta) residuals = adata.X return np.var(residuals, axis=0) @@ -80,7 +80,7 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp # expecting 0 no-int warnings with pytest.warns(None) as record: - sc.pp.highly_variable_genes( + sc.experimental.pp.highly_variable_genes( adata_noninteger.copy(), flavor='pearson_residuals', n_top_genes=100, @@ -90,7 +90,7 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp # expecting 1 no-int warning with pytest.warns(None) as record: - sc.pp.highly_variable_genes( + sc.experimental.pp.highly_variable_genes( adata_noninteger.copy(), flavor='pearson_residuals', n_top_genes=100, @@ -101,17 +101,17 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp # errors should be raised for invalid theta values with pytest.raises(ValueError) as record: - sc.pp.highly_variable_genes( + sc.experimental.pp.highly_variable_genes( adata.copy(), flavor='pearson_residuals', n_top_genes=100, theta=0 ) with pytest.raises(ValueError) as record: - sc.pp.highly_variable_genes( + sc.experimental.pp.highly_variable_genes( adata.copy(), flavor='pearson_residuals', n_top_genes=100, theta=-1 ) # error should be raised for invalid clipping values with pytest.raises(ValueError) as record: - sc.pp.highly_variable_genes( + sc.experimental.pp.highly_variable_genes( adata.copy(), flavor='pearson_residuals', n_top_genes=100, clip=-1 ) @@ -141,7 +141,7 @@ def test_highly_variable_genes_pearson_residuals_values( residual_variances_reference = 
residual_variances_reference[top_n_idx] # compute output to be tested - output_df = sc.pp.highly_variable_genes( + output_df = sc.experimental.pp.highly_variable_genes( adata, flavor='pearson_residuals', n_top_genes=n_top_genes, @@ -151,7 +151,7 @@ def test_highly_variable_genes_pearson_residuals_values( theta=theta, ) - sc.pp.highly_variable_genes( + sc.experimental.pp.highly_variable_genes( adata, flavor='pearson_residuals', n_top_genes=n_top_genes, @@ -198,7 +198,7 @@ def test_highly_variable_genes_pearson_residuals_general( # (results in sorted "gene order" in reference) residual_variances_reference = residual_variances_reference[top_n_idx] # compute output to be tested - output_df = sc.pp.highly_variable_genes( + output_df = sc.experimental.pp.highly_variable_genes( adata, flavor='pearson_residuals', n_top_genes=n_top_genes, @@ -206,7 +206,7 @@ def test_highly_variable_genes_pearson_residuals_general( inplace=False, ) - sc.pp.highly_variable_genes( + sc.experimental.pp.highly_variable_genes( adata, flavor='pearson_residuals', n_top_genes=n_top_genes, @@ -269,7 +269,7 @@ def test_highly_variable_genes_pearson_residuals_batch( adata.var.drop(columns=adata.var.columns, inplace=True) n_genes = adata.shape[1] - output_df = sc.pp.highly_variable_genes( + output_df = sc.experimental.pp.highly_variable_genes( adata, flavor='pearson_residuals', n_top_genes=n_top_genes, @@ -278,7 +278,7 @@ def test_highly_variable_genes_pearson_residuals_batch( inplace=False, ) - sc.pp.highly_variable_genes( + sc.experimental.pp.highly_variable_genes( adata, flavor='pearson_residuals', n_top_genes=n_top_genes, diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index eb9bbc8533..ad7ba7bde7 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -80,27 +80,27 @@ def test_normalize_pearson_residuals_inputchecks(sparsity_func, dtype): adata_noninteger.X[x[0], y[0]] = 0.5 with pytest.warns(UserWarning) as record: - 
sc.pp.normalize_pearson_residuals( + sc.experimental.pp.normalize_pearson_residuals( adata_noninteger.copy(), check_values=True ) assert len(record) == 1 assert "expects raw count data" in record[0].message.args[0] with pytest.warns(None) as record: - sc.pp.normalize_pearson_residuals( + sc.experimental.pp.normalize_pearson_residuals( adata_noninteger.copy(), check_values=False ) assert len(record) == 0 # errors should be raised for invalid theta values with pytest.raises(ValueError) as record: - sc.pp.normalize_pearson_residuals(adata.copy(), theta=0) + sc.experimental.pp.normalize_pearson_residuals(adata.copy(), theta=0) with pytest.raises(ValueError) as record: - sc.pp.normalize_pearson_residuals(adata.copy(), theta=-1) + sc.experimental.pp.normalize_pearson_residuals(adata.copy(), theta=-1) # error should be raised for invalid clipping values with pytest.raises(ValueError) as record: - sc.pp.normalize_pearson_residuals(adata.copy(), clip=-1) + sc.experimental.pp.normalize_pearson_residuals(adata.copy(), clip=-1) @pytest.mark.parametrize( @@ -127,11 +127,13 @@ def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip): # compute output to test adata = AnnData(sparsity_func(X), dtype=dtype) - output = sc.pp.normalize_pearson_residuals( + output = sc.experimental.pp.normalize_pearson_residuals( adata, theta=theta, clip=clip, inplace=False ) output_X = output['X'] - sc.pp.normalize_pearson_residuals(adata, theta=theta, clip=clip, inplace=True) + sc.experimental.pp.normalize_pearson_residuals( + adata, theta=theta, clip=clip, inplace=True + ) # check for correct new `adata.uns` keys assert np.all(np.isin(['pearson_residuals_normalization'], list(adata.uns.keys()))) @@ -170,7 +172,7 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p n_cells, n_genes = adata.shape adata_with_hvgs = adata.copy() - sc.pp.highly_variable_genes( + sc.experimental.pp.highly_variable_genes( adata_with_hvgs, flavor='pearson_residuals', 
n_top_genes=n_hvgs ) adata_not_using_hvgs = adata_with_hvgs.copy() @@ -178,15 +180,15 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p ### inplace = False ### # outputs the (potentially hvg-restricted) adata_pca object # PCA on all genes - adata_pca = sc.pp.normalize_pearson_residuals_pca( + adata_pca = sc.experimental.pp.normalize_pearson_residuals_pca( adata.copy(), inplace=False, n_comps_pca=n_comps_pca ) # PCA on hvgs only - adata_pca_with_hvgs = sc.pp.normalize_pearson_residuals_pca( + adata_pca_with_hvgs = sc.experimental.pp.normalize_pearson_residuals_pca( adata_with_hvgs.copy(), inplace=False, n_comps_pca=n_comps_pca ) # PCA again on all genes (hvg use supressed) - adata_pca_not_using_hvgs = sc.pp.normalize_pearson_residuals_pca( + adata_pca_not_using_hvgs = sc.experimental.pp.normalize_pearson_residuals_pca( adata_not_using_hvgs.copy(), inplace=False, n_comps_pca=n_comps_pca, @@ -221,13 +223,15 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p ### inplace = True ### # modifies the input adata object # PCA on all genes - sc.pp.normalize_pearson_residuals_pca(adata, inplace=True, n_comps_pca=n_comps_pca) + sc.experimental.pp.normalize_pearson_residuals_pca( + adata, inplace=True, n_comps_pca=n_comps_pca + ) # PCA on hvgs only - sc.pp.normalize_pearson_residuals_pca( + sc.experimental.pp.normalize_pearson_residuals_pca( adata_with_hvgs, inplace=True, n_comps_pca=n_comps_pca ) # PCA again on all genes (hvg use supressed) - sc.pp.normalize_pearson_residuals_pca( + sc.experimental.pp.normalize_pearson_residuals_pca( adata_not_using_hvgs, inplace=True, n_comps_pca=n_comps_pca, @@ -280,14 +284,14 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp n_cells, n_genes = adata.shape adata_with_hvgs = adata.copy() - sc.pp.highly_variable_genes( + sc.experimental.pp.highly_variable_genes( adata_with_hvgs, flavor='pearson_residuals', n_top_genes=n_hvgs ) ### inplace = 
False ### # outputs the (potentially hvg-restricted) adata_pca object # PCA on all genes - adata_pca, hvg = sc.pp.recipe_pearson_residuals( + adata_pca, hvg = sc.experimental.pp.recipe_pearson_residuals( adata.copy(), inplace=False, n_comps_pca=n_comps_pca, n_top_genes=n_hvgs ) @@ -326,7 +330,7 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp ### inplace = True ### # modifies the input adata object # PCA on all genes - sc.pp.recipe_pearson_residuals( + sc.experimental.pp.recipe_pearson_residuals( adata, inplace=True, n_comps_pca=n_comps_pca, n_top_genes=n_hvgs ) From a19f90ec25f60f2421f257695ecbd0b3a8fb47c5 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Wed, 14 Jul 2021 17:18:50 +0200 Subject: [PATCH 40/96] adapt input arguments and docstring for experimental version of hvg selection function --- .../experimental/pp/_highly_variable_genes.py | 73 ++----------------- 1 file changed, 7 insertions(+), 66 deletions(-) diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py index af5f81ed74..c90b468027 100644 --- a/scanpy/experimental/pp/_highly_variable_genes.py +++ b/scanpy/experimental/pp/_highly_variable_genes.py @@ -225,43 +225,19 @@ def highly_variable_genes( adata: AnnData, layer: Optional[str] = None, n_top_genes: Optional[int] = None, - min_disp: Optional[float] = 0.5, - max_disp: Optional[float] = np.inf, - min_mean: Optional[float] = 0.0125, - max_mean: Optional[float] = 3, - span: Optional[float] = 0.3, - n_bins: int = 20, theta: float = 100, clip: Optional[float] = None, chunksize: int = 1000, - flavor: Literal[ - 'seurat', 'cell_ranger', 'seurat_v3', 'pearson_residuals' - ] = 'seurat', + flavor: Literal['pearson_residuals'] = 'pearson_residuals', subset: bool = False, inplace: bool = True, batch_key: Optional[str] = None, check_values: bool = True, ) -> Optional[pd.DataFrame]: """\ - Annotate highly variable genes [Satija15]_ [Zheng17]_ [Stuart19]_ [Lause20]_. 
+ Annotate highly variable genes using Analytical Pearson residuals [Lause20]_. - Expects logarithmized data, except when `flavor='seurat_v3'` or - `flavor='pearson_residuals'`, in which count data is expected. - - Depending on `flavor`, this reproduces the R-implementations of Seurat - [Satija15]_, Cell Ranger [Zheng17]_, and Seurat v3 [Stuart19]_, or uses - analytical Pearson residuals [Lause20]_. - - For the dispersion-based methods ([Satija15]_ and [Zheng17]_), the normalized - dispersion is obtained by scaling with the mean and standard deviation of - the dispersions for genes falling into a given bin for mean expression of - genes. This means that for each bin of mean expression, highly variable - genes are selected. - - For [Stuart19]_, a normalized variance for each gene is computed. First, the data - are standardized (i.e., z-score normalization per feature) with a regularized - standard deviation. Next, the normalized variance is computed as the variance - of each gene after the transformation. Genes are ranked by the normalized variance. + Expects count data input. For [Lause20]_, Pearson residuals of a negative binomial offset model (with overdispersion theta shared across genes) are computed. By default, overdispersion @@ -278,31 +254,6 @@ def highly_variable_genes( n_top_genes Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or `flavor='pearson_residuals'`. - min_mean - If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the - normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or - `flavor='pearson_residuals'`. - max_mean - If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the - normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or - `flavor='pearson_residuals'`. - min_disp - If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the - normalized dispersions are ignored. 
Ignored if `flavor='seurat_v3'` or - `flavor='pearson_residuals'`. - max_disp - If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the - normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or - `flavor='pearson_residuals'`. - span - The fraction of the data (cells) used when estimating the variance in the loess - model fit if `flavor='seurat_v3'`. - n_bins - Number of bins for binning the mean gene expression. Normalization is - done with respect to each bin. If just a single gene falls into a bin, - the normalized dispersion is artificially set to 1. You'll be informed - about this if you set `settings.verbosity = 4`. Ignored if - `flavor='pearson_residuals'`. theta If `flavor='pearson_residuals'`, this is the NB overdispersion parameter theta. Higher values correspond to less overdispersion (var = mean + mean^2/theta), and @@ -320,9 +271,8 @@ def highly_variable_genes( once while computing the residual variance. Choosing a smaller value will reduce the required memory. flavor - Choose the flavor for identifying highly variable genes. For the dispersion - based methods in their default workflows, Seurat passes the cutoffs whereas - Cell Ranger passes `n_top_genes`. + Choose the flavor for identifying highly variable genes. In this experimental + version, only 'pearson_residuals' is functional. subset Inplace subset to highly-variable genes if `True` otherwise merely indicate highly variable genes. 
@@ -351,21 +301,12 @@ def highly_variable_genes( boolean indicator of highly-variable genes **means** means per gene - **dispersions** - For dispersion-based flavors, dispersions per gene - **dispersions_norm** - For dispersion-based flavors, normalized dispersions per gene **variances** - For `flavor='seurat_v3'` and `flavor='pearson_residuals'`, variance per gene - **variances_norm** - For `flavor='seurat_v3'`, normalized variance per gene, averaged in - the case of multiple batches + variance per gene **residual_variances** For `flavor='pearson_residuals'`, residual variance per gene. Averaged in the case of multiple batches. highly_variable_rank : float - For `flavor='seurat_v3'`, rank of the gene according to normalized - variance, median rank in the case of multiple batches For `flavor='pearson_residuals'`, rank of the gene according to residual variance, median rank in the case of multiple batches highly_variable_nbatches : int @@ -375,7 +316,7 @@ def highly_variable_genes( Notes ----- - This function replaces :func:`~scanpy.pp.filter_genes_dispersion`. 
+ Experimental version of `sc.pp.highly_variable_genes()` """ logg.info('extracting highly variable genes') From 659da16c498bed4ed8b7e6e34b1f85b5b142168f Mon Sep 17 00:00:00 2001 From: giovp Date: Sun, 1 Aug 2021 19:18:45 +0200 Subject: [PATCH 41/96] add recipes --- scanpy/experimental/pp/__init__.py | 6 +- scanpy/experimental/pp/_recipes.py | 148 +++++++++++++++++++++++++++++ 2 files changed, 151 insertions(+), 3 deletions(-) create mode 100644 scanpy/experimental/pp/_recipes.py diff --git a/scanpy/experimental/pp/__init__.py b/scanpy/experimental/pp/__init__.py index 7ecf999363..a5eaf9d9c2 100644 --- a/scanpy/experimental/pp/__init__.py +++ b/scanpy/experimental/pp/__init__.py @@ -1,8 +1,8 @@ -from ._normalization import ( +from scanpy.experimental.pp._normalization import ( normalize_pearson_residuals, normalize_pearson_residuals_pca, ) -from ._highly_variable_genes import highly_variable_genes +from scanpy.experimental.pp._highly_variable_genes import highly_variable_genes -from ._recipes import recipe_pearson_residuals +from scanpy.experimental.pp._recipes import recipe_pearson_residuals diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py new file mode 100644 index 0000000000..ba557abc8b --- /dev/null +++ b/scanpy/experimental/pp/_recipes.py @@ -0,0 +1,148 @@ +from typing import Optional, Tuple +from anndata import AnnData +import pandas as pd +from scanpy import experimental +from scanpy.preprocessing import pca + + +def recipe_pearson_residuals( + adata: AnnData, + n_top_genes: int = 1000, + theta: float = 100, + clip: Optional[float] = None, + chunksize: int = 1000, + batch_key: Optional[str] = None, + n_comps_pca: Optional[int] = 50, + random_state_pca: Optional[float] = 0, + kwargs_pca: dict = {}, + check_values: bool = True, + inplace: bool = True, +) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]: + """\ + Gene selection and normalization based on [Lause20]_. + Applies gene selection based on Pearson residuals. 
On the resulting subset, + Pearson residual normalization and PCA are performed. + Parameters + ---------- + adata + The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond + to cells and columns to genes. + n_top_genes + Number of highly-variable genes to keep. Mandatory if + `flavor='seurat_v3'` or `flavor='pearson_residuals'`. + chunksize + This dertermines how many genes are processed at once while computing + the Pearson residual variance. Choosing a smaller value will reduce + the required memory. + theta + This is the NB overdispersion parameter theta for Pearson residual + computations. Higher values correspond to less overdispersion + (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a + Poisson model. + clip + This determines if and how Pearson residuals are clipped: + * If `None`, residuals are clipped to the interval \ + [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). + * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ + `clip=np.Inf` for no clipping. + batch_key + If specified, highly-variable genes are selected within each batch + separately and merged. This simple process avoids the selection of + batch-specific genes and acts as a lightweight batch correction + method. For all flavors, genes are first sorted by how many batches + they are a HVG. Ties are broken by the median rank (across batches) + based on within-batch residual variance. + n_comps_pca + Number of principal components to compute. + random_state_pca + Change to use different initial states for the optimization. + kwargs_pca + Dictionary of further keyword arguments passed on to `sc.pp.pca()`. + check_values + Check if counts in selected layer are integers. A Warning is returned if set to True. + inplace + Whether to place results in `adata` or return them. 
+ Returns + ------ + If `inplace=False`, separately returns the gene selection results (`hvg`) + and Pearson residual-based PCA results (`adata_pca`). If `inplace=True`, + updates `adata` with the following fields for gene selection results…: + `.var['highly_variable']` + boolean indicator of highly-variable genes. + `.var['means']` + means per gene. + `.var['variances']` + variances per gene. + `.var['residual_variances']` + Pearson residual variance per gene. Averaged in the case of multiple + batches. + `.var['highly_variable_rank']` + Rank of the gene according to residual variance, median rank in the + case of multiple batches. + `.var['highly_variable_nbatches']` + If batch_key is given, this denotes in how many batches genes are + detected as HVG. + `.var['highly_variable_intersection']` + If batch_key is given, this denotes the genes that are highly variable + in all batches. + …and the following fields for Pearson residual-based PCA results and + normalization settings: + `.uns['pearson_residuals_normalization']['pearson_residuals_df']` + The hvg-subset, normalized by Pearson residuals. + `.uns['pearson_residuals_normalization']['theta']` + The used value of the overdisperion parameter theta. + `.uns['pearson_residuals_normalization']['clip']` + The used value of the clipping parameter. + `.obsm['pearson_residuals_X_pca']` + PCA representation of data after gene selection and Pearson residual + normalization. + `.uns['pearson_residuals_pca']['PCs']` + The principal components containing the loadings. + `.uns['pearson_residuals_pca']['variance_ratio']` + Ratio of explained variance. + `.uns['pearson_residuals_pca']['variance']` + Explained variance, equivalent to the eigenvalues of the + covariance matrix. 
+ """ + + hvg_args = dict( + flavor='pearson_residuals', + n_top_genes=n_top_genes, + batch_key=batch_key, + theta=theta, + clip=clip, + chunksize=chunksize, + check_values=check_values, + ) + + if inplace: + experimental.pp.highly_variable_genes(adata, **hvg_args, inplace=True) + # TODO: are these copies needed? + adata_pca = adata[:, adata.var['highly_variable']].copy() + else: + hvg = experimental.pp.highly_variable_genes(adata, **hvg_args, inplace=False) + # TODO: are these copies needed? + adata_pca = adata[:, hvg['highly_variable']].copy() + + experimental.pp.normalize_pearson_residuals( + adata_pca, theta=theta, clip=clip, check_values=check_values + ) + pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca, **kwargs_pca) + + if inplace: + normalization_param = adata_pca.uns['pearson_residuals_normalization'] + normalization_dict = dict( + **normalization_param, pearson_residuals_df=adata_pca.to_df() + ) + pca_param = adata_pca.uns['pca'] + pca_dict = dict(**pca_param, PCs=adata_pca.varm['PCs']) + adata.uns['pearson_residuals_pca'] = pca_dict + adata.uns['pearson_residuals_normalization'] = normalization_dict + adata.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'] + return None + else: + adata_pca.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'].copy() + adata_pca.uns['pearson_residuals_pca'] = adata_pca.uns['pca'].copy() + del adata_pca.obsm['X_pca'] + del adata_pca.uns['pca'] + return adata_pca, hvg From bf0bb8e86b1187649b4612e1558c29f06c5c556b Mon Sep 17 00:00:00 2001 From: giovp Date: Sun, 1 Aug 2021 19:29:51 +0200 Subject: [PATCH 42/96] fix docs --- docs/api.rst | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 2bc9283a75..cdcb2c7b87 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -39,8 +39,6 @@ For visual quality control, see :func:`~scanpy.pl.highest_expr_genes` and pp.log1p pp.pca pp.normalize_total - pp.normalize_pearson_residuals - 
pp.normalize_pearson_residuals_pca pp.regress_out pp.scale pp.subsample @@ -55,7 +53,6 @@ Recipes pp.recipe_zheng17 pp.recipe_weinreb17 pp.recipe_seurat - pp.recipe_pearson_residuals Batch effect correction ~~~~~~~~~~~~~~~~~~~~~~~ @@ -372,6 +369,24 @@ Collections of useful measurements for evaluating results. metrics.morans_i +Experimental +------------ + +.. module:: scanpy.experimental +.. currentmodule:: scanpy + +New methods that are in early development which are not (yet) +integrated in Scanpy core. + +.. autosummary:: + :toctree: generated/ + + pp.normalize_pearson_residuals + pp.normalize_pearson_residuals_pca + pp.highly_variable_genes + pp.recipe_pearson_residuals + + Classes ------- From 191c449bbd8d24990cc87db1fc2c2b96f0f1fd04 Mon Sep 17 00:00:00 2001 From: giovp Date: Sun, 1 Aug 2021 19:52:22 +0200 Subject: [PATCH 43/96] add correct module docs --- docs/api.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index cdcb2c7b87..1400e3807b 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -381,10 +381,10 @@ integrated in Scanpy core. .. 
autosummary:: :toctree: generated/ - pp.normalize_pearson_residuals - pp.normalize_pearson_residuals_pca - pp.highly_variable_genes - pp.recipe_pearson_residuals + experimental.pp.normalize_pearson_residuals + experimental.pp.normalize_pearson_residuals_pca + experimental.pp.highly_variable_genes + experimental.pp.recipe_pearson_residuals Classes From 7f3d6ed9bff995736c00fcafdea5dd6a4920fdc2 Mon Sep 17 00:00:00 2001 From: giovp Date: Sun, 1 Aug 2021 20:37:08 +0200 Subject: [PATCH 44/96] fix recipe docstrings --- scanpy/experimental/pp/_recipes.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index ba557abc8b..e8c143f9d9 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -20,8 +20,10 @@ def recipe_pearson_residuals( ) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]: """\ Gene selection and normalization based on [Lause20]_. + Applies gene selection based on Pearson residuals. On the resulting subset, Pearson residual normalization and PCA are performed. + Parameters ---------- adata @@ -40,11 +42,13 @@ def recipe_pearson_residuals( (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a Poisson model. clip - This determines if and how Pearson residuals are clipped: + Determines if and how residuals are clipped: + * If `None`, residuals are clipped to the interval \ [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ `clip=np.Inf` for no clipping. + batch_key If specified, highly-variable genes are selected within each batch separately and merged. This simple process avoids the selection of @@ -62,6 +66,7 @@ def recipe_pearson_residuals( Check if counts in selected layer are integers. A Warning is returned if set to True. inplace Whether to place results in `adata` or return them. 
+ Returns ------ If `inplace=False`, separately returns the gene selection results (`hvg`) @@ -103,6 +108,7 @@ def recipe_pearson_residuals( `.uns['pearson_residuals_pca']['variance']` Explained variance, equivalent to the eigenvalues of the covariance matrix. + """ hvg_args = dict( From 87bf42506e8719745d3fd3cfe40478aa7d5a0d76 Mon Sep 17 00:00:00 2001 From: giovp Date: Sun, 1 Aug 2021 20:48:40 +0200 Subject: [PATCH 45/96] try fix indentation --- scanpy/experimental/pp/_normalization.py | 4 ++-- scanpy/experimental/pp/_recipes.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py index 5e068db8db..b43240569b 100644 --- a/scanpy/experimental/pp/_normalization.py +++ b/scanpy/experimental/pp/_normalization.py @@ -71,8 +71,8 @@ def normalize_pearson_residuals( theta shared across genes, computes Pearson residuals. By default, residuals are clipped to sqrt(n) and overdispersion theta=100 is used. - Params - ------ + Parameters + ---------- adata The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index e8c143f9d9..79aec9fbba 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -44,9 +44,9 @@ def recipe_pearson_residuals( clip Determines if and how residuals are clipped: - * If `None`, residuals are clipped to the interval \ + * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ + * If any scalar c, residuals are clipped to the interval [-c, c]. Set `clip=np.Inf` for no clipping. 
batch_key From 0b8ba5f7744a2a5cd22763105a228ddec1736b92 Mon Sep 17 00:00:00 2001 From: giovp Date: Sun, 1 Aug 2021 22:06:52 +0200 Subject: [PATCH 46/96] fix indentation --- scanpy/experimental/pp/_recipes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index 79aec9fbba..10cdb72400 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -42,11 +42,11 @@ def recipe_pearson_residuals( (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a Poisson model. clip - Determines if and how residuals are clipped: + This determines how Pearson residuals are clipped: - * If `None`, residuals are clipped to the interval + * If `None`, residuals are clipped to the interval \ [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. Set + * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ `clip=np.Inf` for no clipping. batch_key @@ -68,7 +68,7 @@ def recipe_pearson_residuals( Whether to place results in `adata` or return them. Returns - ------ + ------- If `inplace=False`, separately returns the gene selection results (`hvg`) and Pearson residual-based PCA results (`adata_pca`). 
If `inplace=True`, updates `adata` with the following fields for gene selection results…: From 88bf93a41adfa0784ab1eb48f24900c066c233d6 Mon Sep 17 00:00:00 2001 From: giovp Date: Sun, 1 Aug 2021 22:20:43 +0200 Subject: [PATCH 47/96] fix --- scanpy/experimental/pp/_recipes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index 10cdb72400..b22ffe5496 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -44,10 +44,10 @@ def recipe_pearson_residuals( clip This determines how Pearson residuals are clipped: - * If `None`, residuals are clipped to the interval \ - [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ - `clip=np.Inf` for no clipping. + * If `None`, residuals are clipped to the interval \ + [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). + * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ + `clip=np.Inf` for no clipping. batch_key If specified, highly-variable genes are selected within each batch From ef81b72133938381f4a623f0ba76bad95a60cb60 Mon Sep 17 00:00:00 2001 From: giovp Date: Sun, 1 Aug 2021 22:26:10 +0200 Subject: [PATCH 48/96] new indentation --- scanpy/experimental/pp/_recipes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index b22ffe5496..014c13d203 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -44,10 +44,10 @@ def recipe_pearson_residuals( clip This determines how Pearson residuals are clipped: - * If `None`, residuals are clipped to the interval \ - [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). 
- * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ - `clip=np.Inf` for no clipping. + * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)] + where n is the number of cells in the dataset (default behavior). + * If any scalar c, residuals are clipped to the interval [-c, c]. + Set`clip=np.Inf` for no clipping. batch_key If specified, highly-variable genes are selected within each batch From 900c12c6c5f71485ff7113fcaf1fd6907a620b09 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 2 Aug 2021 08:31:39 +0200 Subject: [PATCH 49/96] add space --- scanpy/experimental/pp/_recipes.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index 014c13d203..ab64aa79b6 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -24,6 +24,7 @@ def recipe_pearson_residuals( Applies gene selection based on Pearson residuals. On the resulting subset, Pearson residual normalization and PCA are performed. + Parameters ---------- adata @@ -44,10 +45,10 @@ def recipe_pearson_residuals( clip This determines how Pearson residuals are clipped: - * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)] - where n is the number of cells in the dataset (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. - Set`clip=np.Inf` for no clipping. + * If `None`, residuals are clipped to the interval \ + [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). + * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ + `clip=np.Inf` for no clipping. 
batch_key If specified, highly-variable genes are selected within each batch From b00a0b627ef67eb00015643cd6c74e7f672980c8 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Mon, 2 Aug 2021 13:25:26 +0200 Subject: [PATCH 50/96] fixing typo in docstring --- scanpy/experimental/pp/_recipes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index ab64aa79b6..2f297a307d 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -99,7 +99,7 @@ def recipe_pearson_residuals( The used value of the overdisperion parameter theta. `.uns['pearson_residuals_normalization']['clip']` The used value of the clipping parameter. - `.obsm['pearson_residuals_X_pca']` + `.obsm['X_pearson_residuals_pca']` PCA representation of data after gene selection and Pearson residual normalization. `.uns['pearson_residuals_pca']['PCs']` From 617aff1f4b22d6a67c11628a4333ea038f484d08 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Mon, 2 Aug 2021 13:46:39 +0200 Subject: [PATCH 51/96] renaming pca output fields --- scanpy/experimental/pp/_recipes.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index 2f297a307d..38e2923bcf 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -99,14 +99,14 @@ def recipe_pearson_residuals( The used value of the overdisperion parameter theta. `.uns['pearson_residuals_normalization']['clip']` The used value of the clipping parameter. - `.obsm['X_pearson_residuals_pca']` + `.obsm['X_pca']` PCA representation of data after gene selection and Pearson residual normalization. - `.uns['pearson_residuals_pca']['PCs']` + `.uns['pca']['PCs']` The principal components containing the loadings. - `.uns['pearson_residuals_pca']['variance_ratio']` + `.uns['pca']['variance_ratio']` Ratio of explained variance. 
- `.uns['pearson_residuals_pca']['variance']` + `.uns['pca']['variance']` Explained variance, equivalent to the eigenvalues of the covariance matrix. @@ -143,13 +143,9 @@ def recipe_pearson_residuals( ) pca_param = adata_pca.uns['pca'] pca_dict = dict(**pca_param, PCs=adata_pca.varm['PCs']) - adata.uns['pearson_residuals_pca'] = pca_dict + adata.uns['pca'] = pca_dict adata.uns['pearson_residuals_normalization'] = normalization_dict - adata.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'] + adata.obsm['X_pca'] = adata_pca.obsm['X_pca'] return None else: - adata_pca.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'].copy() - adata_pca.uns['pearson_residuals_pca'] = adata_pca.uns['pca'].copy() - del adata_pca.obsm['X_pca'] - del adata_pca.uns['pca'] return adata_pca, hvg From 4dabfcdb5415fd95fe300f5e6ee10cef744fb252 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Mon, 2 Aug 2021 22:55:59 +0200 Subject: [PATCH 52/96] adapting tests to new output fieldname --- scanpy/tests/test_normalization.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index ad7ba7bde7..09068ced3f 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -298,13 +298,13 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp # for both cases, check adata_pca keys are complete assert np.all( np.isin( - ['pearson_residuals_normalization', 'pearson_residuals_pca'], + ['pearson_residuals_normalization', 'pca'], list(adata_pca.uns.keys()), ) ) - assert np.all(np.isin(['X_pearson_residuals_pca'], list(adata_pca.obsm.keys()))) + assert np.all(np.isin(['X_pca'], list(adata_pca.obsm.keys()))) assert np.all(np.isin(['PCs'], list(adata_pca.varm.keys()))) - assert adata_pca.obsm['X_pearson_residuals_pca'].shape == (n_cells, n_comps_pca) + assert adata_pca.obsm['X_pca'].shape == (n_cells, n_comps_pca) # check adata shape assert 
adata_pca.shape == (n_cells, n_hvgs) @@ -336,13 +336,13 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp assert np.all( np.isin( - ['pearson_residuals_normalization', 'pearson_residuals_pca'], + ['pearson_residuals_normalization', 'pca'], list(adata.uns.keys()), ) ) - assert np.all(np.isin(['X_pearson_residuals_pca'], list(adata.obsm.keys()))) + assert np.all(np.isin(['X_pca'], list(adata.obsm.keys()))) assert adata.shape == (n_cells, n_genes) - assert adata.obsm['X_pearson_residuals_pca'].shape == (n_cells, n_comps_pca) + assert adata.obsm['X_pca'].shape == (n_cells, n_comps_pca) # check PC shapes to see whether or not HVGs were used for PCA - assert adata.uns['pearson_residuals_pca']['PCs'].shape == (n_hvgs, n_comps_pca) + assert adata.uns['pca']['PCs'].shape == (n_hvgs, n_comps_pca) From 58ac8e0825d2903594cdfad0e61b00ee4607c9b5 Mon Sep 17 00:00:00 2001 From: giovp Date: Fri, 6 Aug 2021 11:15:04 +0200 Subject: [PATCH 53/96] fix docs :hammer: --- scanpy/experimental/pp/_highly_variable_genes.py | 2 +- scanpy/experimental/pp/_recipes.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py index c90b468027..110913e932 100644 --- a/scanpy/experimental/pp/_highly_variable_genes.py +++ b/scanpy/experimental/pp/_highly_variable_genes.py @@ -30,7 +30,7 @@ def _highly_variable_pearson_residuals( inplace: bool = True, ) -> Optional[pd.DataFrame]: """\ - See `highly_variable_genes`. + See `scanpy.pp.highly_variable_genes`. Returns ------- diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index 38e2923bcf..35b5338878 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -21,8 +21,8 @@ def recipe_pearson_residuals( """\ Gene selection and normalization based on [Lause20]_. - Applies gene selection based on Pearson residuals. 
On the resulting subset, - Pearson residual normalization and PCA are performed. + Applies gene selection based on Pearson residuals. + On the resulting subset, Pearson residual normalization and PCA are performed. Parameters From 8ae83380ec37a48374bcb443de704f045a754f03 Mon Sep 17 00:00:00 2001 From: giovp Date: Fri, 6 Aug 2021 11:56:12 +0200 Subject: [PATCH 54/96] update docs --- scanpy/experimental/pp/_normalization.py | 27 ++++++++++-------------- scanpy/experimental/pp/_recipes.py | 25 ++++++++++++---------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py index b43240569b..9689eb8476 100644 --- a/scanpy/experimental/pp/_normalization.py +++ b/scanpy/experimental/pp/_normalization.py @@ -1,4 +1,4 @@ -from typing import Optional, Union, Dict +from typing import Optional, Dict from warnings import warn import numpy as np @@ -71,8 +71,8 @@ def normalize_pearson_residuals( theta shared across genes, computes Pearson residuals. By default, residuals are clipped to sqrt(n) and overdispersion theta=100 is used. - Parameters - ---------- + Params + ------ adata The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. @@ -152,9 +152,8 @@ def normalize_pearson_residuals_pca( Operates on the subset of highly variable genes in `adata.var['highly_variable']` by default. - - Parameters - ---------- + Params + ------ adata The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. @@ -200,14 +199,14 @@ def normalize_pearson_residuals_pca( `.uns['pearson_residuals_normalization']['clip']` The used value of the clipping parameter - `.obsm['X_pearson_residuals_pca']` + `.obsm['X_pca']` PCA representation of data after gene selection and Pearson residual normalization. - `.uns['pearson_residuals_pca']['PCs']` + `.uns['pca']['PCs']` The principal components containing the loadings. 
- `.uns['pearson_residuals_pca']['variance_ratio']` + `.uns['pca']['variance_ratio']` Ratio of explained variance. - `.uns['pearson_residuals_pca']['variance']` + `.uns['pca']['variance']` Explained variance, equivalent to the eigenvalues of the covariance matrix. @@ -230,13 +229,9 @@ def normalize_pearson_residuals_pca( norm_dict = dict(**norm_settings, pearson_residuals_df=adata_pca.to_df()) pca_settings = adata_pca.uns['pca'] pca_dict = dict(**pca_settings, PCs=adata_pca.varm['PCs']) - adata.uns['pearson_residuals_pca'] = pca_dict + adata.uns['pca'] = pca_dict adata.uns['pearson_residuals_normalization'] = norm_dict - adata.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'] + adata.obsm['X_pca'] = adata_pca.obsm['X_pca'] return None else: - adata_pca.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'].copy() - adata_pca.uns['pearson_residuals_pca'] = adata_pca.uns['pca'].copy() - del adata_pca.obsm['X_pca'] - del adata_pca.uns['pca'] return adata_pca diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index 35b5338878..45c2c41725 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -21,15 +21,14 @@ def recipe_pearson_residuals( """\ Gene selection and normalization based on [Lause20]_. - Applies gene selection based on Pearson residuals. - On the resulting subset, Pearson residual normalization and PCA are performed. + Applies gene selection based on Pearson residuals. On the resulting subset, + Pearson residual normalization and PCA are performed. - - Parameters - ---------- + Params + ------ adata - The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond - to cells and columns to genes. + The annotated data matrix of shape `n_obs` × `n_vars`. + Rows correspond to cells and columns to genes. n_top_genes Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or `flavor='pearson_residuals'`. 
@@ -43,7 +42,7 @@ def recipe_pearson_residuals( (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a Poisson model. clip - This determines how Pearson residuals are clipped: + Determines if and how residuals are clipped: * If `None`, residuals are clipped to the interval \ [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). @@ -62,17 +61,19 @@ def recipe_pearson_residuals( random_state_pca Change to use different initial states for the optimization. kwargs_pca - Dictionary of further keyword arguments passed on to `sc.pp.pca()`. + Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`. check_values - Check if counts in selected layer are integers. A Warning is returned if set to True. + Check if counts in selected layer are integers. A `Warning` is returned if set to True. inplace Whether to place results in `adata` or return them. + Returns ------- If `inplace=False`, separately returns the gene selection results (`hvg`) and Pearson residual-based PCA results (`adata_pca`). If `inplace=True`, - updates `adata` with the following fields for gene selection results…: + updates `adata` with the following fields for gene selection results: + `.var['highly_variable']` boolean indicator of highly-variable genes. `.var['means']` @@ -91,8 +92,10 @@ def recipe_pearson_residuals( `.var['highly_variable_intersection']` If batch_key is given, this denotes the genes that are highly variable in all batches. + …and the following fields for Pearson residual-based PCA results and normalization settings: + `.uns['pearson_residuals_normalization']['pearson_residuals_df']` The hvg-subset, normalized by Pearson residuals. 
`.uns['pearson_residuals_normalization']['theta']` From 535129cb3c2c971cd7b5128573da93afb8881490 Mon Sep 17 00:00:00 2001 From: giovp Date: Fri, 6 Aug 2021 12:20:56 +0200 Subject: [PATCH 55/96] fix test :hammer: --- scanpy/tests/test_normalization.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index 09068ced3f..b2defaeead 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -199,13 +199,13 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p for ad in [adata_pca, adata_pca_with_hvgs, adata_pca_not_using_hvgs]: assert np.all( np.isin( - ['pearson_residuals_normalization', 'pearson_residuals_pca'], + ['pearson_residuals_normalization', 'pca'], list(ad.uns.keys()), ) ) - assert np.all(np.isin(['X_pearson_residuals_pca'], list(ad.obsm.keys()))) + assert np.all(np.isin(['X_pca'], list(ad.obsm.keys()))) assert np.all(np.isin(['PCs'], list(ad.varm.keys()))) - assert ad.obsm['X_pearson_residuals_pca'].shape == (n_cells, n_comps_pca) + assert ad.obsm['X_pca'].shape == (n_cells, n_comps_pca) # check adata shape to see if all genes or only HVGs are in the returned adata assert adata_pca.shape == (n_cells, n_genes) @@ -242,22 +242,24 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p # check adata_pca keys are complete assert np.all( np.isin( - ['pearson_residuals_normalization', 'pearson_residuals_pca'], + [ + 'pearson_residuals_normalization', + ], list(ad.uns.keys()), ) ) - assert np.all(np.isin(['X_pearson_residuals_pca'], list(ad.obsm.keys()))) + assert np.all(np.isin(['X_pca'], list(ad.obsm.keys()))) # check shapes: adata should always retains original shape assert ad.shape == (n_cells, n_genes) - assert ad.obsm['X_pearson_residuals_pca'].shape == (n_cells, n_comps_pca) + assert ad.obsm['X_pca'].shape == (n_cells, n_comps_pca) # check PC shapes to see 
whether or not HVGs were used for PCA - assert adata.uns['pearson_residuals_pca']['PCs'].shape == (n_genes, n_comps_pca) - assert adata_with_hvgs.uns['pearson_residuals_pca']['PCs'].shape == ( + assert adata.uns['pca']['PCs'].shape == (n_genes, n_comps_pca) + assert adata_with_hvgs.uns['pca']['PCs'].shape == ( n_hvgs, n_comps_pca, ) - assert adata_not_using_hvgs.uns['pearson_residuals_pca']['PCs'].shape == ( + assert adata_not_using_hvgs.uns['pca']['PCs'].shape == ( n_genes, n_comps_pca, ) @@ -268,8 +270,8 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p [adata, adata_with_hvgs, adata_not_using_hvgs], ): np.testing.assert_array_equal( - ad_inplace.obsm['X_pearson_residuals_pca'], - ad_outplace.obsm['X_pearson_residuals_pca'], + ad_inplace.obsm['X_pca'], + ad_outplace.obsm['X_pca'], ) From 3addbe75216d42bf495c616e7be1b9663f4aeaad Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Tue, 10 Aug 2021 16:49:36 +0200 Subject: [PATCH 56/96] ensure argument and docstring consistency --- .../experimental/pp/_highly_variable_genes.py | 102 +++++++++--------- scanpy/experimental/pp/_normalization.py | 76 +++++++------ scanpy/experimental/pp/_recipes.py | 64 +++++------ 3 files changed, 122 insertions(+), 120 deletions(-) diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py index 110913e932..a0d344d390 100644 --- a/scanpy/experimental/pp/_highly_variable_genes.py +++ b/scanpy/experimental/pp/_highly_variable_genes.py @@ -19,18 +19,18 @@ def _highly_variable_pearson_residuals( adata: AnnData, - layer: Optional[str] = None, - n_top_genes: int = 1000, - batch_key: Optional[str] = None, theta: float = 100, clip: Optional[float] = None, - chunksize: int = 100, + n_top_genes: int = 1000, + batch_key: Optional[str] = None, + chunksize: int = 1000, check_values: bool = True, + layer: Optional[str] = None, subset: bool = False, inplace: bool = True, ) -> Optional[pd.DataFrame]: """\ - See 
`scanpy.pp.highly_variable_genes`. + See `scanpy.experimental.pp.highly_variable_genes`. Returns ------- @@ -38,23 +38,19 @@ def _highly_variable_pearson_residuals( or updates `.var` with the following fields: highly_variable : bool - boolean indicator of highly-variable genes. + boolean indicator of highly-variable genes means : float - means per gene. + means per gene variances : float - variances per gene. + variance per gene residual_variances : float - Pearson residual variance per gene. Averaged in the case of multiple - batches. + Residual variance per gene. Averaged in the case of multiple batches. highly_variable_rank : float - Rank of the gene according to residual variance, median rank in the - case of multiple batches. NaN for non-HVGs. + Rank of the gene according to residual variance, median rank in the case of multiple batches highly_variable_nbatches : int - If batch_key is given, this denotes in how many batches genes are - detected as HVG. + If `batch_key` given, denotes in how many batches genes are detected as HVG highly_variable_intersection : bool - If batch_key is given, this denotes the genes that are highly variable - in all batches. + If `batch_key` given, denotes the genes that are highly variable in all batches """ view_to_actual(adata) @@ -223,49 +219,55 @@ def _highly_variable_pearson_residuals( def highly_variable_genes( adata: AnnData, - layer: Optional[str] = None, - n_top_genes: Optional[int] = None, theta: float = 100, clip: Optional[float] = None, + n_top_genes: Optional[int] = None, + batch_key: Optional[str] = None, chunksize: int = 1000, flavor: Literal['pearson_residuals'] = 'pearson_residuals', + check_values: bool = True, + layer: Optional[str] = None, subset: bool = False, inplace: bool = True, - batch_key: Optional[str] = None, - check_values: bool = True, ) -> Optional[pd.DataFrame]: """\ - Annotate highly variable genes using Analytical Pearson residuals [Lause20]_. - - Expects count data input. 
+ Annotate highly variable genes using analytic Pearson residuals [Lause20]_. For [Lause20]_, Pearson residuals of a negative binomial offset model (with overdispersion theta shared across genes) are computed. By default, overdispersion theta=100 is used and residuals are clipped to sqrt(n). Finally, genes are ranked by residual variance. + Expects raw count input. + + Parameters ---------- adata - The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond - to cells and columns to genes. - layer - If provided, use `adata.layers[layer]` for expression values instead of `adata.X`. - n_top_genes - Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or - `flavor='pearson_residuals'`. + The annotated data matrix of shape `n_obs` × `n_vars`. + Rows correspond to cells and columns to genes. theta - If `flavor='pearson_residuals'`, this is the NB overdispersion parameter theta. - Higher values correspond to less overdispersion (var = mean + mean^2/theta), and - `theta=np.Inf` corresponds to a Poisson model. + The negative binomial overdispersion parameter theta for Pearson residuals. + Higher values correspond to less overdispersion (var = mean + mean^2/theta), + and `theta=np.Inf` corresponds to a Poisson model. clip - If `flavor='pearson_residuals'`, this determines how residuals are clipped: + If `flavor='pearson_residuals'`, determines if and how residuals are clipped: * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \ where n is the number of cells in the dataset (default behavior). * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ `clip=np.Inf` for no clipping. + n_top_genes + Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or + `flavor='pearson_residuals'`. + batch_key + If specified, highly-variable genes are selected within each batch separately + and merged. 
This simple process avoids the selection of batch-specific genes + and acts as a lightweight batch correction method. Genes are first sorted by + how many batches they are a HVG. If `flavor='pearson_residuals'`, ties are + broken by the median rank (across batches) based on within-batch residual + variance. chunksize If `flavor='pearson_residuals'`, this dertermines how many genes are processed at once while computing the residual variance. Choosing a smaller value will reduce @@ -273,24 +275,16 @@ def highly_variable_genes( flavor Choose the flavor for identifying highly variable genes. In this experimental version, only 'pearson_residuals' is functional. + check_values + Check if counts in selected layer are integers. A Warning is returned if set to + True. Only used if `flavor='pearson_residuals'`. + layer + If provided, use `adata.layers[layer]` for expression values instead of `adata.X`. subset Inplace subset to highly-variable genes if `True` otherwise merely indicate highly variable genes. inplace Whether to place calculated metrics in `.var` or return them. - batch_key - If specified, highly-variable genes are selected within each batch separately and merged. - This simple process avoids the selection of batch-specific genes and acts as a - lightweight batch correction method. For all flavors, genes are first sorted - by how many batches they are a HVG. For dispersion-based flavors ties are broken - by normalized dispersion. If `flavor = 'seurat_v3'`, ties are broken by the median - (across batches) rank based on within-batch normalized variance. If - `flavor='pearson_residuals'`, ties are broken by the median rank (across batches) - based on within-batch residual variance. - check_values - Check if counts in selected layer are integers. A Warning is returned if set to True. - Only used if `flavor='seurat_v3'` or `flavor='pearson_residuals'`. 
- Returns ------- @@ -299,20 +293,20 @@ def highly_variable_genes( highly_variable : bool boolean indicator of highly-variable genes - **means** + means : float means per gene - **variances** + variances : float variance per gene - **residual_variances** - For `flavor='pearson_residuals'`, residual variance per gene. Averaged in the case of - multiple batches. + residual_variances : float + For `flavor='pearson_residuals'`, residual variance per gene. Averaged in the + case of multiple batches. highly_variable_rank : float For `flavor='pearson_residuals'`, rank of the gene according to residual variance, median rank in the case of multiple batches highly_variable_nbatches : int - If batch_key is given, this denotes in how many batches genes are detected as HVG + If `batch_key` given, denotes in how many batches genes are detected as HVG highly_variable_intersection : bool - If batch_key is given, this denotes the genes that are highly variable in all batches + If `batch_key` given, denotes the genes that are highly variable in all batches Notes ----- diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py index 9689eb8476..dbdf5e59d0 100644 --- a/scanpy/experimental/pp/_normalization.py +++ b/scanpy/experimental/pp/_normalization.py @@ -59,17 +59,19 @@ def normalize_pearson_residuals( adata: AnnData, theta: float = 100, clip: Optional[float] = None, + check_values: bool = True, layer: Optional[str] = None, copy: bool = False, - check_values: bool = True, inplace: bool = True, ) -> Optional[Dict[str, np.ndarray]]: """\ - Computes analytic Pearson residuals, based on [Lause20]_. + Applies analytic Pearson residual normalization, based on [Lause20]_. - Assuming a negative binomial offset model with overdispersion - theta shared across genes, computes Pearson residuals. By default, residuals - are clipped to sqrt(n) and overdispersion theta=100 is used. 
+ The residuals are based on a negative binomial offset model with overdispersion + `theta` shared across genes. By default, residuals are clipped to sqrt(n) and + overdispersion `theta=100` is used. + + Expects raw count input. Params ------ @@ -77,24 +79,24 @@ def normalize_pearson_residuals( The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. theta - The NB overdispersion parameter theta. Higher values correspond to - less overdispersion (var = mean + mean^2/theta), and `theta=np.Inf` - corresponds to a Poisson model. + The negative binomial overdispersion parameter theta for Pearson residuals. + Higher values correspond to less overdispersion (var = mean + mean^2/theta), + and `theta=np.Inf` corresponds to a Poisson model. clip Determines if and how residuals are clipped: - * If `None`, residuals are clipped to the interval \ - [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). + * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \ + where n is the number of cells in the dataset (default behavior). * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ `clip=np.Inf` for no clipping. + check_values + Check if counts in selected layer are integers. A Warning is returned if set to + True. layer Layer to normalize instead of `X`. If `None`, `X` is normalized. copy - Whether to modify copied input object. Not compatible with - `inplace=False`. - check_values - Check if counts in selected layer are integers. A Warning is returned if set to True. + Whether to modify copied input object. Not compatible with `inplace=False`. inplace Whether to update `adata` or return dictionary with normalized copies of `adata.X` and `adata.layers`. 
@@ -142,35 +144,36 @@ def normalize_pearson_residuals_pca( clip: Optional[float] = None, n_comps_pca: Optional[int] = 50, random_state_pca: Optional[float] = 0, - use_highly_variable: bool = True, kwargs_pca: Optional[dict] = {}, + use_highly_variable: bool = True, check_values: bool = True, inplace: bool = True, ) -> Optional[pd.DataFrame]: """\ - Applies Pearson residual normalization and PCA, based on [Lause20]_. + Applies analytic Pearson residual normalization and PCA, based on [Lause20]_. + + The residuals are based on a negative binomial offset model with overdispersion + `theta` shared across genes. By default, residuals are clipped to sqrt(n), + overdispersion `theta=100` is used, and PCA is run with 50 components. + + Operates on the subset of highly variable genes in `adata.var['highly_variable']` + by default. Expects raw count input. - Operates on the subset of highly variable genes in `adata.var['highly_variable']` by default. Params ------ adata - The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond - to cells and columns to genes. - use_highly_variable - Whether to use the gene selection in `adata.var['highly_variable']` to - subset the data before normalizing (default) or proceed on the full - dataset. + The annotated data matrix of shape `n_obs` × `n_vars`. + Rows correspond to cells and columns to genes. theta - This is the NB overdispersion parameter theta for Pearson residual - computations. Higher values correspond to less overdispersion - (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a - Poisson model. + The negative binomial overdispersion parameter theta for Pearson residuals. + Higher values correspond to less overdispersion (var = mean + mean^2/theta), + and `theta=np.Inf` corresponds to a Poisson model. 
clip - This determines how Pearson residuals are clipped: + Determines if and how residuals are clipped: - * If `None`, residuals are clipped to the interval \ - [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). + * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \ + where n is the number of cells in the dataset (default behavior). * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ `clip=np.Inf` for no clipping. @@ -180,8 +183,12 @@ def normalize_pearson_residuals_pca( Change to use different initial states for the optimization. kwargs_pca Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`. + use_highly_variable + Whether to use the gene selection in `adata.var['highly_variable']` to subset + the data before normalizing (default) or proceed on the full dataset. check_values - Check if counts in selected layer are integers. A Warning is returned if set to True. + Check if counts in selected layer are integers. A Warning is returned if set to + True. inplace Whether to place results in `adata` or return them. @@ -200,15 +207,14 @@ def normalize_pearson_residuals_pca( The used value of the clipping parameter `.obsm['X_pca']` - PCA representation of data after gene selection and Pearson residual - normalization. + PCA representation of data after gene selection (if applicable) and Pearson + residual normalization. `.uns['pca']['PCs']` The principal components containing the loadings. `.uns['pca']['variance_ratio']` Ratio of explained variance. `.uns['pca']['variance']` - Explained variance, equivalent to the eigenvalues of the - covariance matrix. + Explained variance, equivalent to the eigenvalues of the covariance matrix. 
""" diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index 45c2c41725..67fc6e81a9 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -7,11 +7,11 @@ def recipe_pearson_residuals( adata: AnnData, - n_top_genes: int = 1000, theta: float = 100, clip: Optional[float] = None, - chunksize: int = 1000, + n_top_genes: int = 1000, batch_key: Optional[str] = None, + chunksize: int = 1000, n_comps_pca: Optional[int] = 50, random_state_pca: Optional[float] = 0, kwargs_pca: dict = {}, @@ -24,38 +24,39 @@ def recipe_pearson_residuals( Applies gene selection based on Pearson residuals. On the resulting subset, Pearson residual normalization and PCA are performed. + Expects raw count input. + + Params ------ adata The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. - n_top_genes - Number of highly-variable genes to keep. Mandatory if - `flavor='seurat_v3'` or `flavor='pearson_residuals'`. - chunksize - This dertermines how many genes are processed at once while computing - the Pearson residual variance. Choosing a smaller value will reduce - the required memory. theta - This is the NB overdispersion parameter theta for Pearson residual - computations. Higher values correspond to less overdispersion - (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a - Poisson model. + The negative binomial overdispersion parameter theta for Pearson residuals. + Higher values correspond to less overdispersion (var = mean + mean^2/theta), + and `theta=np.Inf` corresponds to a Poisson model. clip Determines if and how residuals are clipped: - * If `None`, residuals are clipped to the interval \ - [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior). + * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \ + where n is the number of cells in the dataset (default behavior). 
* If any scalar c, residuals are clipped to the interval [-c, c]. Set \ `clip=np.Inf` for no clipping. + n_top_genes + Number of highly-variable genes to keep. batch_key - If specified, highly-variable genes are selected within each batch - separately and merged. This simple process avoids the selection of - batch-specific genes and acts as a lightweight batch correction - method. For all flavors, genes are first sorted by how many batches - they are a HVG. Ties are broken by the median rank (across batches) - based on within-batch residual variance. + If specified, highly-variable genes are selected within each batch separately + and merged. This simple process avoids the selection of batch-specific genes + and acts as a lightweight batch correction method. Genes are first sorted by + how many batches they are a HVG. Ties are broken by the median rank (across + batches) based on within-batch residual variance. + chunksize + This dertermines how many genes are processed at once while computing + the Pearson residual variance. Choosing a smaller value will reduce + the required memory. + n_comps_pca Number of principal components to compute. random_state_pca @@ -63,7 +64,8 @@ def recipe_pearson_residuals( kwargs_pca Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`. check_values - Check if counts in selected layer are integers. A `Warning` is returned if set to True. + Check if counts in selected layer are integers. A Warning is returned if set to + True. inplace Whether to place results in `adata` or return them. @@ -74,22 +76,22 @@ def recipe_pearson_residuals( and Pearson residual-based PCA results (`adata_pca`). If `inplace=True`, updates `adata` with the following fields for gene selection results: - `.var['highly_variable']` + `.var['highly_variable']` : bool boolean indicator of highly-variable genes. - `.var['means']` + `.var['means']` : float means per gene. - `.var['variances']` + `.var['variances']` : float variances per gene. 
- `.var['residual_variances']` + `.var['residual_variances']` : float Pearson residual variance per gene. Averaged in the case of multiple batches. - `.var['highly_variable_rank']` + `.var['highly_variable_rank']` : float Rank of the gene according to residual variance, median rank in the case of multiple batches. - `.var['highly_variable_nbatches']` + `.var['highly_variable_nbatches']` : int If batch_key is given, this denotes in how many batches genes are detected as HVG. - `.var['highly_variable_intersection']` + `.var['highly_variable_intersection']` : bool If batch_key is given, this denotes the genes that are highly variable in all batches. @@ -102,6 +104,7 @@ def recipe_pearson_residuals( The used value of the overdisperion parameter theta. `.uns['pearson_residuals_normalization']['clip']` The used value of the clipping parameter. + `.obsm['X_pca']` PCA representation of data after gene selection and Pearson residual normalization. @@ -110,8 +113,7 @@ def recipe_pearson_residuals( `.uns['pca']['variance_ratio']` Ratio of explained variance. `.uns['pca']['variance']` - Explained variance, equivalent to the eigenvalues of the - covariance matrix. + Explained variance, equivalent to the eigenvalues of the covariance matrix. """ From 92159836926cfc44722c2d811dfae448479483ba Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Tue, 10 Aug 2021 16:52:47 +0200 Subject: [PATCH 57/96] update citation year --- docs/references.rst | 2 +- scanpy/experimental/pp/_highly_variable_genes.py | 4 ++-- scanpy/experimental/pp/_normalization.py | 4 ++-- scanpy/experimental/pp/_recipes.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/references.rst b/docs/references.rst index 458534a3a3..84bdd7f7fe 100644 --- a/docs/references.rst +++ b/docs/references.rst @@ -119,7 +119,7 @@ References *Laplacian Dynamics and Multiscale Modular Structure in Networks* `arXiv `__. -.. [Lause20] Lause *et al.* (2020) +.. 
[Lause21] Lause *et al.* (2021) *Analytic Pearson residuals for normalization of single-cell RNA-seq UMI data*, `BioRxiv `__. diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py index a0d344d390..9d357a7466 100644 --- a/scanpy/experimental/pp/_highly_variable_genes.py +++ b/scanpy/experimental/pp/_highly_variable_genes.py @@ -231,9 +231,9 @@ def highly_variable_genes( inplace: bool = True, ) -> Optional[pd.DataFrame]: """\ - Annotate highly variable genes using analytic Pearson residuals [Lause20]_. + Annotate highly variable genes using analytic Pearson residuals [Lause21]_. - For [Lause20]_, Pearson residuals of a negative binomial offset model (with + For [Lause21]_, Pearson residuals of a negative binomial offset model (with overdispersion theta shared across genes) are computed. By default, overdispersion theta=100 is used and residuals are clipped to sqrt(n). Finally, genes are ranked by residual variance. diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py index dbdf5e59d0..39c8c4141a 100644 --- a/scanpy/experimental/pp/_normalization.py +++ b/scanpy/experimental/pp/_normalization.py @@ -65,7 +65,7 @@ def normalize_pearson_residuals( inplace: bool = True, ) -> Optional[Dict[str, np.ndarray]]: """\ - Applies analytic Pearson residual normalization, based on [Lause20]_. + Applies analytic Pearson residual normalization, based on [Lause21]_. The residuals are based on a negative binomial offset model with overdispersion `theta` shared across genes. By default, residuals are clipped to sqrt(n) and @@ -150,7 +150,7 @@ def normalize_pearson_residuals_pca( inplace: bool = True, ) -> Optional[pd.DataFrame]: """\ - Applies analytic Pearson residual normalization and PCA, based on [Lause20]_. + Applies analytic Pearson residual normalization and PCA, based on [Lause21]_. 
The residuals are based on a negative binomial offset model with overdispersion `theta` shared across genes. By default, residuals are clipped to sqrt(n), diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index 67fc6e81a9..243e2b8379 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -19,7 +19,7 @@ def recipe_pearson_residuals( inplace: bool = True, ) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]: """\ - Gene selection and normalization based on [Lause20]_. + Gene selection and normalization based on [Lause21]_. Applies gene selection based on Pearson residuals. On the resulting subset, Pearson residual normalization and PCA are performed. From 37695a9ed96f2ca1062bf0599456b547465c542d Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Tue, 17 Aug 2021 21:57:12 +0200 Subject: [PATCH 58/96] cleaning imports in `preprocessing` functions --- scanpy/preprocessing/_highly_variable_genes.py | 5 +---- scanpy/preprocessing/_normalization.py | 5 +---- scanpy/preprocessing/_recipes.py | 3 +-- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index d9c8aae568..7cede9e528 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -1,6 +1,5 @@ import warnings from typing import Optional - import numpy as np import pandas as pd import scipy.sparse as sp_sparse @@ -9,8 +8,7 @@ from .. import logging as logg from .._settings import settings, Verbosity -from .._utils import sanitize_anndata, check_nonnegative_integers, view_to_actual -from scanpy.get import _get_obs_rep +from .._utils import sanitize_anndata, check_nonnegative_integers from .._compat import Literal from ._utils import _get_mean_var from ._distributed import materialize_as_ndarray @@ -376,7 +374,6 @@ def highly_variable_genes( Check if counts in selected layer are integers. 
A Warning is returned if set to True. Only used if `flavor='seurat_v3'`. - Returns ------- Depending on `inplace` returns calculated metrics (:class:`~pandas.DataFrame`) or diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index 0a853d3c89..c4ab816085 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -2,7 +2,6 @@ from warnings import warn import numpy as np -import pandas as pd from anndata import AnnData from scipy.sparse import issparse from sklearn.utils import sparsefuncs @@ -10,11 +9,9 @@ from scanpy import logging as logg from scanpy._compat import Literal -from scanpy._utils import view_to_actual, check_nonnegative_integers +from scanpy._utils import view_to_actual from scanpy.get import _get_obs_rep, _set_obs_rep -from scanpy.preprocessing._pca import pca - def _normalize_data(X, counts, after=None, copy=False): X = X.copy() if copy else X diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py index a4696e0827..6abc04ed74 100644 --- a/scanpy/preprocessing/_recipes.py +++ b/scanpy/preprocessing/_recipes.py @@ -1,9 +1,8 @@ """Preprocessing recipes from the literature""" -from typing import Optional, Tuple +from typing import Optional from anndata import AnnData -import pandas as pd from .. 
import preprocessing as pp from ._deprecated.highly_variable_genes import ( From f42f4b8d4c8398bb89fba975165be7e1213c71c9 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Tue, 17 Aug 2021 23:23:41 +0200 Subject: [PATCH 59/96] making inputcheck tests specific to error/warning messages --- scanpy/tests/test_highly_variable_genes.py | 31 +++++++++++++++------- scanpy/tests/test_normalization.py | 29 ++++++++++++++------ 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py index a92c59ebf5..ce3d343e0e 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -79,38 +79,51 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp adata_noninteger.X[x[0], y[0]] = 0.5 # expecting 0 no-int warnings - with pytest.warns(None) as record: + with warnings.catch_warnings(record=True) as record: sc.experimental.pp.highly_variable_genes( adata_noninteger.copy(), flavor='pearson_residuals', n_top_genes=100, - check_values=False, + check_values=True, ) - assert len(record) == 0 + + warning_msgs = [w.message.args[0] for w in record] + assert ( + "`flavor='pearson_residuals'` expects raw count data, but non-integers were found." 
+ not in warning_msgs + ) # expecting 1 no-int warning - with pytest.warns(None) as record: + with pytest.warns( + UserWarning, + match="`flavor='pearson_residuals'` expects raw count data, but non-integers were found.", + ) as record: sc.experimental.pp.highly_variable_genes( adata_noninteger.copy(), flavor='pearson_residuals', n_top_genes=100, check_values=True, ) - assert len(record) == 1 - assert "expects raw count data" in record[0].message.args[0] # errors should be raised for invalid theta values - with pytest.raises(ValueError) as record: + with pytest.raises( + ValueError, match='Pearson residuals require theta > 0' + ) as record: sc.experimental.pp.highly_variable_genes( adata.copy(), flavor='pearson_residuals', n_top_genes=100, theta=0 ) - with pytest.raises(ValueError) as record: + + with pytest.raises( + ValueError, match='Pearson residuals require theta > 0' + ) as record: sc.experimental.pp.highly_variable_genes( adata.copy(), flavor='pearson_residuals', n_top_genes=100, theta=-1 ) # error should be raised for invalid clipping values - with pytest.raises(ValueError) as record: + with pytest.raises( + ValueError, match='Pearson residuals require `clip>=0` or `clip=None`.' 
+ ) as record: sc.experimental.pp.highly_variable_genes( adata.copy(), flavor='pearson_residuals', n_top_genes=100, clip=-1 ) diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index b2defaeead..53141419e0 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -79,27 +79,40 @@ def test_normalize_pearson_residuals_inputchecks(sparsity_func, dtype): x, y = np.nonzero(adata_noninteger.X) adata_noninteger.X[x[0], y[0]] = 0.5 - with pytest.warns(UserWarning) as record: + with warnings.catch_warnings(record=True) as record: sc.experimental.pp.normalize_pearson_residuals( adata_noninteger.copy(), check_values=True ) - assert len(record) == 1 - assert "expects raw count data" in record[0].message.args[0] + warning_msgs = [w.message.args[0] for w in record] + assert ( + "`normalize_pearson_residuals()` expects raw count data, but non-integers were found." + in warning_msgs + ) - with pytest.warns(None) as record: + with warnings.catch_warnings(record=True) as record: sc.experimental.pp.normalize_pearson_residuals( adata_noninteger.copy(), check_values=False ) - assert len(record) == 0 + warning_msgs = [w.message.args[0] for w in record] + assert ( + "`normalize_pearson_residuals()` expects raw count data, but non-integers were found." 
+ not in warning_msgs + ) # errors should be raised for invalid theta values - with pytest.raises(ValueError) as record: + with pytest.raises( + ValueError, match='Pearson residuals require theta > 0' + ) as record: sc.experimental.pp.normalize_pearson_residuals(adata.copy(), theta=0) - with pytest.raises(ValueError) as record: + with pytest.raises( + ValueError, match='Pearson residuals require theta > 0' + ) as record: sc.experimental.pp.normalize_pearson_residuals(adata.copy(), theta=-1) # error should be raised for invalid clipping values - with pytest.raises(ValueError) as record: + with pytest.raises( + ValueError, match='Pearson residuals require `clip>=0` or `clip=None`.' + ) as record: sc.experimental.pp.normalize_pearson_residuals(adata.copy(), clip=-1) From 1e20c3bda7c30dbf520b30035cee6660ba562f94 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Tue, 17 Aug 2021 23:50:27 +0200 Subject: [PATCH 60/96] making inputcheck tests specific to error/warning messages --- scanpy/tests/test_highly_variable_genes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py index ce3d343e0e..a8cbf0d9f0 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -84,7 +84,7 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp adata_noninteger.copy(), flavor='pearson_residuals', n_top_genes=100, - check_values=True, + check_values=False, ) warning_msgs = [w.message.args[0] for w in record] From 1f02e2c457d0f6df9636d5c36997b14784096201 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 20 Aug 2021 17:55:35 +0200 Subject: [PATCH 61/96] resolve HVGs across batches more cleanly, fix dtype issue --- .../experimental/pp/_highly_variable_genes.py | 34 ++++++------------- scanpy/tests/test_highly_variable_genes.py | 4 +-- 2 files changed, 13 insertions(+), 25 deletions(-) diff --git 
a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py index 9d357a7466..03fc078ac8 100644 --- a/scanpy/experimental/pp/_highly_variable_genes.py +++ b/scanpy/experimental/pp/_highly_variable_genes.py @@ -119,34 +119,24 @@ def _highly_variable_pearson_residuals( residual_gene_var[start:stop] = np.var(residuals, axis=0) # Add 0 values for genes that were filtered out - zero_gene_var = np.zeros(np.sum(~nonzero_genes)) - residual_gene_var = np.concatenate((residual_gene_var, zero_gene_var)) - # Order as before filtering - idxs = np.concatenate((np.where(nonzero_genes)[0], np.where(~nonzero_genes)[0])) - residual_gene_var = residual_gene_var[np.argsort(idxs)] - residual_gene_vars.append(residual_gene_var.reshape(1, -1)) + unmasked_residual_gene_var = np.zeros(len(nonzero_genes)) + unmasked_residual_gene_var[nonzero_genes] = residual_gene_var + residual_gene_vars.append(unmasked_residual_gene_var.reshape(1, -1)) residual_gene_vars = np.concatenate(residual_gene_vars, axis=0) - # Get cutoffs and define hvgs per batch - residual_gene_vars_sorted = np.sort(residual_gene_vars, axis=1) - cutoffs_per_batch = residual_gene_vars_sorted[:, -n_top_genes] - highly_variable_per_batch = np.greater_equal( - residual_gene_vars.T, cutoffs_per_batch - ).T - - # Merge hvgs across batches - highly_variable_nbatches = np.sum(highly_variable_per_batch, axis=0) - highly_variable_intersection = highly_variable_nbatches == n_batches - # Get rank per gene within each batch # argsort twice gives ranks, small rank means most variable ranks_residual_var = np.argsort(np.argsort(-residual_gene_vars, axis=1), axis=1) ranks_residual_var = ranks_residual_var.astype(np.float32) + # count in how many batches a genes was among the n_top_genes + highly_variable_nbatches = np.sum( + (ranks_residual_var < n_top_genes).astype(int), axis=0 + ) + # set non-top genes within each batch to nan ranks_residual_var[ranks_residual_var >= n_top_genes] = np.nan 
ranks_masked_array = np.ma.masked_invalid(ranks_residual_var) - # Median rank across batches, - # ignoring batches in which gene was not selected + # Median rank across batches, ignoring batches in which gene was not selected medianrank_residual_var = np.ma.median(ranks_masked_array, axis=0).filled(np.nan) means, variances = materialize_as_ndarray(_get_mean_var(X)) @@ -154,12 +144,10 @@ def _highly_variable_pearson_residuals( dict( means=means, variances=variances, - residual_variances=np.mean(residual_gene_vars, axis=0).astype( - np.float32, copy=False - ), + residual_variances=np.mean(residual_gene_vars, axis=0), highly_variable_rank=medianrank_residual_var, highly_variable_nbatches=highly_variable_nbatches.astype(np.int64), - highly_variable_intersection=highly_variable_intersection, + highly_variable_intersection=highly_variable_nbatches == n_batches, ) ) df = df.set_index(adata.var_names) diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py index a8cbf0d9f0..d58a1840b9 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -240,7 +240,7 @@ def test_highly_variable_genes_pearson_residuals_general( assert key in output_df.keys() # check residual variances - assert output_df['residual_variances'].values.dtype is np.dtype('float32') + assert pd.api.types.is_float_dtype(output_df['residual_variances'].dtype) # consistency with normalization method if subset: # sort values before comparing as reference is sorted as well for subset case @@ -325,7 +325,7 @@ def test_highly_variable_genes_pearson_residuals_batch( assert np.all(output_df['highly_variable'][output_df.highly_variable_intersection]) # check ranks (with batch_key these are the median of within-batch ranks) - assert output_df['highly_variable_rank'].values.dtype is np.dtype('float32') + assert pd.api.types.is_float_dtype(output_df['highly_variable_rank'].dtype) assert 
np.nanmax(output_df['highly_variable_rank'].values) <= n_top_genes - 1 # check nbatches From 0add1b7ba4e1bd139fb2b9de81be868bfb5632e9 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 20 Aug 2021 18:33:26 +0200 Subject: [PATCH 62/96] renaming pca input arguments --- scanpy/experimental/pp/_normalization.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py index 39c8c4141a..29fb1cf60d 100644 --- a/scanpy/experimental/pp/_normalization.py +++ b/scanpy/experimental/pp/_normalization.py @@ -142,8 +142,8 @@ def normalize_pearson_residuals_pca( adata: AnnData, theta: float = 100, clip: Optional[float] = None, - n_comps_pca: Optional[int] = 50, - random_state_pca: Optional[float] = 0, + n_comps: Optional[int] = 50, + random_state: Optional[float] = 0, kwargs_pca: Optional[dict] = {}, use_highly_variable: bool = True, check_values: bool = True, @@ -177,10 +177,10 @@ def normalize_pearson_residuals_pca( * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ `clip=np.Inf` for no clipping. - n_comps_pca - Number of principal components to compute. - random_state_pca - Change to use different initial states for the optimization. + n_comps + Number of principal components to compute for the PCA step. + random_state + Change to use different initial states for the optimization of the PCA step. kwargs_pca Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`. 
use_highly_variable @@ -228,7 +228,7 @@ def normalize_pearson_residuals_pca( normalize_pearson_residuals( adata_pca, theta=theta, clip=clip, check_values=check_values ) - pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca, **kwargs_pca) + pca(adata_pca, n_comps=n_comps, random_state=random_state, **kwargs_pca) if inplace: norm_settings = adata_pca.uns['pearson_residuals_normalization'] From 2a2b98a10fdd6799a3a4c73fc51b332aa3cb7c77 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 20 Aug 2021 19:04:37 +0200 Subject: [PATCH 63/96] renaming pca input arguments --- scanpy/experimental/pp/_recipes.py | 14 ++++----- scanpy/tests/test_normalization.py | 48 +++++++++++++++--------------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index 243e2b8379..066dd2d590 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -12,8 +12,8 @@ def recipe_pearson_residuals( n_top_genes: int = 1000, batch_key: Optional[str] = None, chunksize: int = 1000, - n_comps_pca: Optional[int] = 50, - random_state_pca: Optional[float] = 0, + n_comps: Optional[int] = 50, + random_state: Optional[float] = 0, kwargs_pca: dict = {}, check_values: bool = True, inplace: bool = True, @@ -57,10 +57,10 @@ def recipe_pearson_residuals( the Pearson residual variance. Choosing a smaller value will reduce the required memory. - n_comps_pca - Number of principal components to compute. - random_state_pca - Change to use different initial states for the optimization. + n_comps + Number of principal components to compute in the PCA step. + random_state + Change to use different initial states for the optimization in the PCA step. kwargs_pca Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`. 
check_values @@ -139,7 +139,7 @@ def recipe_pearson_residuals( experimental.pp.normalize_pearson_residuals( adata_pca, theta=theta, clip=clip, check_values=check_values ) - pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca, **kwargs_pca) + pca(adata_pca, n_comps=n_comps, random_state=random_state, **kwargs_pca) if inplace: normalization_param = adata_pca.uns['pearson_residuals_normalization'] diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index 53141419e0..53e0da37b7 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -178,8 +178,8 @@ def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip): ) @pytest.mark.parametrize('dtype', ['float32', 'int64']) @pytest.mark.parametrize('n_hvgs', [100, 200]) -@pytest.mark.parametrize('n_comps_pca', [30, 50]) -def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_pca): +@pytest.mark.parametrize('n_comps', [30, 50]) +def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps): adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True) n_cells, n_genes = adata.shape @@ -194,17 +194,17 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p # outputs the (potentially hvg-restricted) adata_pca object # PCA on all genes adata_pca = sc.experimental.pp.normalize_pearson_residuals_pca( - adata.copy(), inplace=False, n_comps_pca=n_comps_pca + adata.copy(), inplace=False, n_comps=n_comps ) # PCA on hvgs only adata_pca_with_hvgs = sc.experimental.pp.normalize_pearson_residuals_pca( - adata_with_hvgs.copy(), inplace=False, n_comps_pca=n_comps_pca + adata_with_hvgs.copy(), inplace=False, n_comps=n_comps ) # PCA again on all genes (hvg use supressed) adata_pca_not_using_hvgs = sc.experimental.pp.normalize_pearson_residuals_pca( adata_not_using_hvgs.copy(), inplace=False, - n_comps_pca=n_comps_pca, + n_comps=n_comps, use_highly_variable=False, ) @@ 
-218,7 +218,7 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p ) assert np.all(np.isin(['X_pca'], list(ad.obsm.keys()))) assert np.all(np.isin(['PCs'], list(ad.varm.keys()))) - assert ad.obsm['X_pca'].shape == (n_cells, n_comps_pca) + assert ad.obsm['X_pca'].shape == (n_cells, n_comps) # check adata shape to see if all genes or only HVGs are in the returned adata assert adata_pca.shape == (n_cells, n_genes) @@ -226,28 +226,28 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p assert adata_pca_not_using_hvgs.shape == (n_cells, n_genes) # check PC shapes to see whether or not HVGs were used for PCA - assert adata_pca.varm['PCs'].shape == (n_genes, n_comps_pca) + assert adata_pca.varm['PCs'].shape == (n_genes, n_comps) assert adata_pca_with_hvgs.varm['PCs'].shape == ( n_hvgs, - n_comps_pca, + n_comps, ) # only HVGs used - assert adata_pca_not_using_hvgs.varm['PCs'].shape == (n_genes, n_comps_pca) + assert adata_pca_not_using_hvgs.varm['PCs'].shape == (n_genes, n_comps) ### inplace = True ### # modifies the input adata object # PCA on all genes sc.experimental.pp.normalize_pearson_residuals_pca( - adata, inplace=True, n_comps_pca=n_comps_pca + adata, inplace=True, n_comps=n_comps ) # PCA on hvgs only sc.experimental.pp.normalize_pearson_residuals_pca( - adata_with_hvgs, inplace=True, n_comps_pca=n_comps_pca + adata_with_hvgs, inplace=True, n_comps=n_comps ) # PCA again on all genes (hvg use supressed) sc.experimental.pp.normalize_pearson_residuals_pca( adata_not_using_hvgs, inplace=True, - n_comps_pca=n_comps_pca, + n_comps=n_comps, use_highly_variable=False, ) @@ -264,17 +264,17 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p assert np.all(np.isin(['X_pca'], list(ad.obsm.keys()))) # check shapes: adata should always retains original shape assert ad.shape == (n_cells, n_genes) - assert ad.obsm['X_pca'].shape == (n_cells, n_comps_pca) + assert ad.obsm['X_pca'].shape 
== (n_cells, n_comps) # check PC shapes to see whether or not HVGs were used for PCA - assert adata.uns['pca']['PCs'].shape == (n_genes, n_comps_pca) + assert adata.uns['pca']['PCs'].shape == (n_genes, n_comps) assert adata_with_hvgs.uns['pca']['PCs'].shape == ( n_hvgs, - n_comps_pca, + n_comps, ) assert adata_not_using_hvgs.uns['pca']['PCs'].shape == ( n_genes, - n_comps_pca, + n_comps, ) # test for inplace/outplace @@ -293,8 +293,8 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p ) @pytest.mark.parametrize('dtype', ['float32', 'int64']) @pytest.mark.parametrize('n_hvgs', [100, 200]) -@pytest.mark.parametrize('n_comps_pca', [30, 50]) -def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comps_pca): +@pytest.mark.parametrize('n_comps', [30, 50]) +def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comps): adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True) n_cells, n_genes = adata.shape @@ -307,7 +307,7 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp # outputs the (potentially hvg-restricted) adata_pca object # PCA on all genes adata_pca, hvg = sc.experimental.pp.recipe_pearson_residuals( - adata.copy(), inplace=False, n_comps_pca=n_comps_pca, n_top_genes=n_hvgs + adata.copy(), inplace=False, n_comps=n_comps, n_top_genes=n_hvgs ) # for both cases, check adata_pca keys are complete @@ -319,12 +319,12 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp ) assert np.all(np.isin(['X_pca'], list(adata_pca.obsm.keys()))) assert np.all(np.isin(['PCs'], list(adata_pca.varm.keys()))) - assert adata_pca.obsm['X_pca'].shape == (n_cells, n_comps_pca) + assert adata_pca.obsm['X_pca'].shape == (n_cells, n_comps) # check adata shape assert adata_pca.shape == (n_cells, n_hvgs) # check PC shapes to check that HVGs were used for PCA - assert adata_pca.varm['PCs'].shape == (n_hvgs, n_comps_pca) + assert 
adata_pca.varm['PCs'].shape == (n_hvgs, n_comps) # check hvg df assert np.all( @@ -346,7 +346,7 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp # modifies the input adata object # PCA on all genes sc.experimental.pp.recipe_pearson_residuals( - adata, inplace=True, n_comps_pca=n_comps_pca, n_top_genes=n_hvgs + adata, inplace=True, n_comps=n_comps, n_top_genes=n_hvgs ) assert np.all( @@ -357,7 +357,7 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp ) assert np.all(np.isin(['X_pca'], list(adata.obsm.keys()))) assert adata.shape == (n_cells, n_genes) - assert adata.obsm['X_pca'].shape == (n_cells, n_comps_pca) + assert adata.obsm['X_pca'].shape == (n_cells, n_comps) # check PC shapes to see whether or not HVGs were used for PCA - assert adata.uns['pca']['PCs'].shape == (n_hvgs, n_comps_pca) + assert adata.uns['pca']['PCs'].shape == (n_hvgs, n_comps) From 01500570f6a917415afecf84f951f7504642ef68 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 20 Aug 2021 23:53:28 +0200 Subject: [PATCH 64/96] _pca bundle: more efficient copy handling, added input check. 
both _pca and _recipe: varm field for PCs, adapted tests and docs --- scanpy/experimental/pp/_normalization.py | 38 +++++++++++++++------- scanpy/experimental/pp/_recipes.py | 13 +++++--- scanpy/tests/test_normalization.py | 41 +++++++++++++----------- 3 files changed, 57 insertions(+), 35 deletions(-) diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py index 29fb1cf60d..097620ce33 100644 --- a/scanpy/experimental/pp/_normalization.py +++ b/scanpy/experimental/pp/_normalization.py @@ -145,7 +145,7 @@ def normalize_pearson_residuals_pca( n_comps: Optional[int] = 50, random_state: Optional[float] = 0, kwargs_pca: Optional[dict] = {}, - use_highly_variable: bool = True, + use_highly_variable: Optional[bool] = None, check_values: bool = True, inplace: bool = True, ) -> Optional[pd.DataFrame]: @@ -209,8 +209,10 @@ def normalize_pearson_residuals_pca( `.obsm['X_pca']` PCA representation of data after gene selection (if applicable) and Pearson residual normalization. - `.uns['pca']['PCs']` - The principal components containing the loadings. + `.varm['PCs']` + The principal components containing the loadings. When `inplace=True` and + `use_highly_variable=True`, this will contain empty rows for the genes not + selected. `.uns['pca']['variance_ratio']` Ratio of explained variance. `.uns['pca']['variance']` @@ -218,12 +220,23 @@ def normalize_pearson_residuals_pca( """ - if use_highly_variable and 'highly_variable' in adata.var_keys(): - # TODO: are these copies needed? - adata_pca = adata[:, adata.var['highly_variable']].copy() + # check if HVG selection is there if user wants to use it + if use_highly_variable and 'highly_variable' not in adata.var_keys(): + raise ValueError( + 'You passed `use_highly_variable=True`, but no HVG selection was found (`highly_variable` missing in `adata.var_keys()`.' 
+ ) + + # default behavior: if there is a HVG selection, we will use it + if use_highly_variable is None and 'highly_variable' in adata.var_keys(): + use_highly_variable = True + + if use_highly_variable: + adata_sub = adata[:, adata.var['highly_variable']].copy() + adata_pca = AnnData( + adata_sub.X.copy(), obs=adata_sub.obs[[]], var=adata_sub.var[[]] + ) else: - # TODO: are these copies needed? - adata_pca = adata.copy() + adata_pca = AnnData(adata.X.copy(), obs=adata.obs[[]], var=adata.var[[]]) normalize_pearson_residuals( adata_pca, theta=theta, clip=clip, check_values=check_values @@ -233,9 +246,12 @@ def normalize_pearson_residuals_pca( if inplace: norm_settings = adata_pca.uns['pearson_residuals_normalization'] norm_dict = dict(**norm_settings, pearson_residuals_df=adata_pca.to_df()) - pca_settings = adata_pca.uns['pca'] - pca_dict = dict(**pca_settings, PCs=adata_pca.varm['PCs']) - adata.uns['pca'] = pca_dict + if use_highly_variable: + adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps)) + adata.varm['PCs'][adata.var['highly_variable']] = adata_pca.varm['PCs'] + else: + adata.varm['PCs'] = adata_pca.varm['PCs'] + adata.uns['pca'] = adata_pca.uns['pca'] adata.uns['pearson_residuals_normalization'] = norm_dict adata.obsm['X_pca'] = adata_pca.obsm['X_pca'] return None diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index 066dd2d590..5589e816aa 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -1,6 +1,7 @@ from typing import Optional, Tuple from anndata import AnnData import pandas as pd +import numpy as np from scanpy import experimental from scanpy.preprocessing import pca @@ -108,8 +109,9 @@ def recipe_pearson_residuals( `.obsm['X_pca']` PCA representation of data after gene selection and Pearson residual normalization. - `.uns['pca']['PCs']` - The principal components containing the loadings. + `.varm['PCs']` + The principal components containing the loadings. 
When `inplace=True` this + will contain empty rows for the genes not selected during HVG selection. `.uns['pca']['variance_ratio']` Ratio of explained variance. `.uns['pca']['variance']` @@ -146,9 +148,10 @@ def recipe_pearson_residuals( normalization_dict = dict( **normalization_param, pearson_residuals_df=adata_pca.to_df() ) - pca_param = adata_pca.uns['pca'] - pca_dict = dict(**pca_param, PCs=adata_pca.varm['PCs']) - adata.uns['pca'] = pca_dict + + adata.uns['pca'] = adata_pca.uns['pca'] + adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps)) + adata.varm['PCs'][adata.var['highly_variable']] = adata_pca.varm['PCs'] adata.uns['pearson_residuals_normalization'] = normalization_dict adata.obsm['X_pca'] = adata_pca.obsm['X_pca'] return None diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index 53e0da37b7..73005fea3c 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -235,15 +235,15 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps): ### inplace = True ### # modifies the input adata object - # PCA on all genes + # PCA on all genes (no HVG supplied) sc.experimental.pp.normalize_pearson_residuals_pca( adata, inplace=True, n_comps=n_comps ) - # PCA on hvgs only + # PCA on hvgs only (HVGs supplied and automatically used) sc.experimental.pp.normalize_pearson_residuals_pca( adata_with_hvgs, inplace=True, n_comps=n_comps ) - # PCA again on all genes (hvg use supressed) + # PCA again on all genes (HVGs supplied and NOT used) sc.experimental.pp.normalize_pearson_residuals_pca( adata_not_using_hvgs, inplace=True, @@ -266,17 +266,23 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps): assert ad.shape == (n_cells, n_genes) assert ad.obsm['X_pca'].shape == (n_cells, n_comps) - # check PC shapes to see whether or not HVGs were used for PCA - assert adata.uns['pca']['PCs'].shape == (n_genes, n_comps) - assert 
adata_with_hvgs.uns['pca']['PCs'].shape == ( - n_hvgs, - n_comps, - ) - assert adata_not_using_hvgs.uns['pca']['PCs'].shape == ( - n_genes, - n_comps, + # check if there are columns of all-zeros in the PCs shapes + # to see whether or not HVGs were used for PCA + assert adata.varm['PCs'].shape == (n_genes, n_comps) + # no all-zero-colums should exist + assert sum(np.sum(np.abs(adata.varm['PCs']), axis=1) == 0) == 0 + + assert adata_with_hvgs.varm['PCs'].shape == (n_genes, n_comps) + # number of all-zero-colums should be number of non-hvgs + assert ( + sum(np.sum(np.abs(adata_with_hvgs.varm['PCs']), axis=1) == 0) + == n_genes - n_hvgs ) + assert adata_not_using_hvgs.varm['PCs'].shape == (n_genes, n_comps) + # no all-zero-colums should exist + assert sum(np.sum(np.abs(adata_not_using_hvgs.varm['PCs']), axis=1) == 0) == 0 + # test for inplace/outplace for ad_inplace, ad_outplace in zip( [adata_pca, adata_pca_with_hvgs, adata_pca_not_using_hvgs], @@ -298,11 +304,6 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True) n_cells, n_genes = adata.shape - adata_with_hvgs = adata.copy() - sc.experimental.pp.highly_variable_genes( - adata_with_hvgs, flavor='pearson_residuals', n_top_genes=n_hvgs - ) - ### inplace = False ### # outputs the (potentially hvg-restricted) adata_pca object # PCA on all genes @@ -359,5 +360,7 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp assert adata.shape == (n_cells, n_genes) assert adata.obsm['X_pca'].shape == (n_cells, n_comps) - # check PC shapes to see whether or not HVGs were used for PCA - assert adata.uns['pca']['PCs'].shape == (n_hvgs, n_comps) + # check PC shape + assert adata.varm['PCs'].shape == (n_genes, n_comps) + # number of all-zero-colums should be number of non-hvgs + assert sum(np.sum(np.abs(adata.varm['PCs']), axis=1) == 0) == n_genes - n_hvgs From e9c0b89be81e99fd23c499aaf5d7fb78017657ab Mon Sep 17 
00:00:00 2001 From: Jan Lause Date: Sun, 22 Aug 2021 21:53:41 +0200 Subject: [PATCH 65/96] move repeated inputcheck code to helpers --- scanpy/tests/helpers.py | 35 +++++++++++ scanpy/tests/test_highly_variable_genes.py | 71 ++++++++-------------- scanpy/tests/test_normalization.py | 55 +++++++---------- 3 files changed, 81 insertions(+), 80 deletions(-) diff --git a/scanpy/tests/helpers.py b/scanpy/tests/helpers.py index b7e97c36dd..aa47bef971 100644 --- a/scanpy/tests/helpers.py +++ b/scanpy/tests/helpers.py @@ -6,6 +6,8 @@ import scanpy as sc import numpy as np +import warnings +import pytest from anndata.tests.helpers import asarray, assert_equal @@ -106,3 +108,36 @@ def _prepare_pbmc_testdata(sparsity_func, dtype, small=False): sc.pp.filter_genes(adata, min_cells=1) adata.X = sparsity_func(adata.X.astype(dtype)) return adata + + +def _make_noninteger_data(adata): + '''Adds a single non-integer to the data matrix, e.g. for testing `check_value` arguments.''' + + adata_noninteger = adata.copy() + x, y = np.nonzero(adata_noninteger.X) + adata_noninteger.X[x[0], y[0]] = 0.5 + + return adata_noninteger + + +def _test_check_values_warnings(function, adata, expected_warning, kwargs={}): + '''Runs `function` on `adata` with provided arguments `kwargs` twice: once with `check_values=True` and once with `check_values=False`. 
Checks that the `expected_warning` is only raised whtn `check_values=True`.''' + + # expecting 0 no-int warnings + with warnings.catch_warnings(record=True) as record: + function(adata.copy(), **kwargs, check_values=False) + warning_msgs = [w.message.args[0] for w in record] + assert expected_warning not in warning_msgs + + # expecting 1 no-int warning + with warnings.catch_warnings(record=True) as record: + function(adata.copy(), **kwargs, check_values=True) + warning_msgs = [w.message.args[0] for w in record] + assert expected_warning in warning_msgs + + +def _test_value_error(function, adata, expected_error, kwargs={}): + '''Runs `function` on `adata` with provided arguments `kwargs` and checks if `error_msg` is raised as an `ValueError`.''' + + with pytest.raises(ValueError, match=expected_error): + function(adata.copy(), **kwargs) diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py index d58a1840b9..32c86dde60 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -4,7 +4,12 @@ import scanpy as sc from pathlib import Path from scipy.sparse import csr_matrix -from scanpy.tests.helpers import _prepare_pbmc_testdata +from scanpy.tests.helpers import ( + _prepare_pbmc_testdata, + _make_noninteger_data, + _test_check_values_warnings, + _test_value_error, +) import warnings FILE = Path(__file__).parent / Path('_scripts/seurat_hvg.csv') @@ -74,59 +79,33 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp # depending on check_values, warnings should be raised for non-integer data if dtype == 'float32': - adata_noninteger = adata.copy() - x, y = np.nonzero(adata_noninteger.X) - adata_noninteger.X[x[0], y[0]] = 0.5 + adata_noninteger = _make_noninteger_data(adata) - # expecting 0 no-int warnings - with warnings.catch_warnings(record=True) as record: - sc.experimental.pp.highly_variable_genes( - adata_noninteger.copy(), + 
_test_check_values_warnings( + function=sc.experimental.pp.highly_variable_genes, + adata=adata_noninteger, + expected_warning="`flavor='pearson_residuals'` expects raw count data, but non-integers were found.", + kwargs=dict( flavor='pearson_residuals', n_top_genes=100, - check_values=False, - ) - - warning_msgs = [w.message.args[0] for w in record] - assert ( - "`flavor='pearson_residuals'` expects raw count data, but non-integers were found." - not in warning_msgs + ), ) - # expecting 1 no-int warning - with pytest.warns( - UserWarning, - match="`flavor='pearson_residuals'` expects raw count data, but non-integers were found.", - ) as record: - sc.experimental.pp.highly_variable_genes( - adata_noninteger.copy(), - flavor='pearson_residuals', - n_top_genes=100, - check_values=True, - ) - # errors should be raised for invalid theta values - with pytest.raises( - ValueError, match='Pearson residuals require theta > 0' - ) as record: - sc.experimental.pp.highly_variable_genes( - adata.copy(), flavor='pearson_residuals', n_top_genes=100, theta=0 + for theta in [0, -1]: + _test_value_error( + function=sc.experimental.pp.highly_variable_genes, + adata=adata, + expected_error='Pearson residuals require theta > 0', + kwargs=dict(theta=theta, flavor='pearson_residuals', n_top_genes=100), ) - with pytest.raises( - ValueError, match='Pearson residuals require theta > 0' - ) as record: - sc.experimental.pp.highly_variable_genes( - adata.copy(), flavor='pearson_residuals', n_top_genes=100, theta=-1 - ) - - # error should be raised for invalid clipping values - with pytest.raises( - ValueError, match='Pearson residuals require `clip>=0` or `clip=None`.' 
- ) as record: - sc.experimental.pp.highly_variable_genes( - adata.copy(), flavor='pearson_residuals', n_top_genes=100, clip=-1 - ) + _test_value_error( + function=sc.experimental.pp.highly_variable_genes, + adata=adata, + expected_error='Pearson residuals require `clip>=0` or `clip=None`.', + kwargs=dict(clip=-1, flavor='pearson_residuals', n_top_genes=100), + ) @pytest.mark.parametrize( diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index 73005fea3c..47131952e3 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -10,6 +10,9 @@ check_rep_mutation, check_rep_results, _prepare_pbmc_testdata, + _make_noninteger_data, + _test_check_values_warnings, + _test_value_error, ) from anndata.tests.helpers import assert_equal, asarray @@ -75,45 +78,29 @@ def test_normalize_pearson_residuals_inputchecks(sparsity_func, dtype): # depending on check_values, warnings should be raised for non-integer data if dtype == 'float32': - adata_noninteger = adata.copy() - x, y = np.nonzero(adata_noninteger.X) - adata_noninteger.X[x[0], y[0]] = 0.5 + adata_noninteger = _make_noninteger_data(adata) - with warnings.catch_warnings(record=True) as record: - sc.experimental.pp.normalize_pearson_residuals( - adata_noninteger.copy(), check_values=True - ) - warning_msgs = [w.message.args[0] for w in record] - assert ( - "`normalize_pearson_residuals()` expects raw count data, but non-integers were found." 
- in warning_msgs + _test_check_values_warnings( + function=sc.experimental.pp.normalize_pearson_residuals, + adata=adata_noninteger, + expected_warning="`normalize_pearson_residuals()` expects raw count data, but non-integers were found.", ) - with warnings.catch_warnings(record=True) as record: - sc.experimental.pp.normalize_pearson_residuals( - adata_noninteger.copy(), check_values=False - ) - warning_msgs = [w.message.args[0] for w in record] - assert ( - "`normalize_pearson_residuals()` expects raw count data, but non-integers were found." - not in warning_msgs + # errors should be raised for invalid theta values + for theta in [0, -1]: + _test_value_error( + function=sc.experimental.pp.normalize_pearson_residuals, + adata=adata, + expected_error='Pearson residuals require theta > 0', + kwargs=dict(theta=theta), ) - # errors should be raised for invalid theta values - with pytest.raises( - ValueError, match='Pearson residuals require theta > 0' - ) as record: - sc.experimental.pp.normalize_pearson_residuals(adata.copy(), theta=0) - with pytest.raises( - ValueError, match='Pearson residuals require theta > 0' - ) as record: - sc.experimental.pp.normalize_pearson_residuals(adata.copy(), theta=-1) - - # error should be raised for invalid clipping values - with pytest.raises( - ValueError, match='Pearson residuals require `clip>=0` or `clip=None`.' 
- ) as record: - sc.experimental.pp.normalize_pearson_residuals(adata.copy(), clip=-1) + _test_value_error( + function=sc.experimental.pp.normalize_pearson_residuals, + adata=adata, + expected_error='Pearson residuals require `clip>=0` or `clip=None`.', + kwargs=dict(clip=-1), + ) @pytest.mark.parametrize( From 3e02b056ab37d51ef11aa5c65932739ba0dd413a Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Mon, 23 Aug 2021 13:42:37 +0200 Subject: [PATCH 66/96] merging tests *_values and *_general --- scanpy/tests/test_highly_variable_genes.py | 60 ++-------------------- 1 file changed, 4 insertions(+), 56 deletions(-) diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py index 32c86dde60..926e50bda3 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -116,12 +116,12 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp @pytest.mark.parametrize('clip', [None, np.Inf, 30]) @pytest.mark.parametrize('theta', [100, np.Inf]) @pytest.mark.parametrize('n_top_genes', [100, 200]) -def test_highly_variable_genes_pearson_residuals_values( +def test_highly_variable_genes_pearson_residuals_general( subset, sparsity_func, dtype, clip, theta, n_top_genes ): adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True) # cleanup var - adata.var.drop(columns=adata.var.columns, inplace=True) + del adata.var # compute reference output residual_variances_reference = _residual_var_reference( adata.copy(), clip=clip, theta=theta @@ -153,59 +153,7 @@ def test_highly_variable_genes_pearson_residuals_values( theta=theta, ) - pd.testing.assert_frame_equal(output_df, adata.var) - - # consistency with normalization method - if subset: - # sort values before comparing as reference is sorted as well for subset case - sort_output_idx = np.argsort(-output_df['residual_variances'].values) - assert np.allclose( - output_df['residual_variances'].values[sort_output_idx], - 
residual_variances_reference, - ) - else: - assert np.allclose( - output_df['residual_variances'].values, residual_variances_reference - ) - - -@pytest.mark.parametrize( - 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__ -) -@pytest.mark.parametrize('dtype', ['float32', 'int64']) -@pytest.mark.parametrize('subset', [True, False]) -@pytest.mark.parametrize('n_top_genes', [1000, 500]) -def test_highly_variable_genes_pearson_residuals_general( - subset, sparsity_func, dtype, n_top_genes -): - - adata = _prepare_pbmc_testdata(sparsity_func, dtype) - # cleanup var - adata.var.drop(columns=adata.var.columns, inplace=True) - # compute reference output - residual_variances_reference = _residual_var_reference(adata.copy()) - if subset: - # lazily sort by residual variance and take top N - top_n_idx = np.argsort(-residual_variances_reference)[:n_top_genes] - # (results in sorted "gene order" in reference) - residual_variances_reference = residual_variances_reference[top_n_idx] - # compute output to be tested - output_df = sc.experimental.pp.highly_variable_genes( - adata, - flavor='pearson_residuals', - n_top_genes=n_top_genes, - subset=subset, - inplace=False, - ) - - sc.experimental.pp.highly_variable_genes( - adata, - flavor='pearson_residuals', - n_top_genes=n_top_genes, - subset=subset, - inplace=True, - ) - + # compare inplace=True and inplace=False output pd.testing.assert_frame_equal(output_df, adata.var) # check output is complete @@ -258,7 +206,7 @@ def test_highly_variable_genes_pearson_residuals_batch( ): adata = _prepare_pbmc_testdata(sparsity_func, dtype) # cleanup var - adata.var.drop(columns=adata.var.columns, inplace=True) + del adata.var n_genes = adata.shape[1] output_df = sc.experimental.pp.highly_variable_genes( From 720578de421093e3c13b0a99590026db68beff84 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Mon, 23 Aug 2021 14:23:02 +0200 Subject: [PATCH 67/96] condense code in pearson hvg selection test, smaller test data for 
speedup --- scanpy/tests/test_highly_variable_genes.py | 31 +++++++++++++--------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py index 926e50bda3..d32237df4d 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -68,6 +68,16 @@ def _residual_var_reference(adata, clip=None, theta=100): return np.var(residuals, axis=0) +def _check_pearson_hvg_columns(output_df, n_top_genes): + + assert pd.api.types.is_float_dtype(output_df['residual_variances'].dtype) + + assert output_df['highly_variable'].values.dtype is np.dtype('bool') + assert np.sum(output_df['highly_variable']) == n_top_genes + + assert np.nanmax(output_df['highly_variable_rank'].values) <= n_top_genes - 1 + + @pytest.mark.parametrize( 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__ ) @@ -166,9 +176,7 @@ def test_highly_variable_genes_pearson_residuals_general( ]: assert key in output_df.keys() - # check residual variances - assert pd.api.types.is_float_dtype(output_df['residual_variances'].dtype) - # consistency with normalization method + # check consistency with normalization method if subset: # sort values before comparing as reference is sorted as well for subset case sort_output_idx = np.argsort(-output_df['residual_variances'].values) @@ -182,8 +190,6 @@ def test_highly_variable_genes_pearson_residuals_general( ) # check hvg flag - assert output_df['highly_variable'].values.dtype is np.dtype('bool') - assert np.sum(output_df['highly_variable']) == n_top_genes hvg_idx = np.where(output_df['highly_variable'])[0] topn_idx = np.sort( np.argsort(-output_df['residual_variances'].values)[:n_top_genes] @@ -192,7 +198,9 @@ def test_highly_variable_genes_pearson_residuals_general( # check ranks assert np.nanmin(output_df['highly_variable_rank'].values) == 0 - assert np.nanmax(output_df['highly_variable_rank'].values) <= 
n_top_genes - 1 + + # more general checks on ranks, hvg flag and residual variance + _check_pearson_hvg_columns(output_df, n_top_genes) @pytest.mark.parametrize( @@ -200,11 +208,11 @@ def test_highly_variable_genes_pearson_residuals_general( ) @pytest.mark.parametrize('dtype', ['float32', 'int64']) @pytest.mark.parametrize('subset', [True, False]) -@pytest.mark.parametrize('n_top_genes', [1000, 500]) +@pytest.mark.parametrize('n_top_genes', [100, 200]) def test_highly_variable_genes_pearson_residuals_batch( subset, n_top_genes, sparsity_func, dtype ): - adata = _prepare_pbmc_testdata(sparsity_func, dtype) + adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True) # cleanup var del adata.var n_genes = adata.shape[1] @@ -227,6 +235,7 @@ def test_highly_variable_genes_pearson_residuals_batch( inplace=True, ) + # compare inplace=True and inplace=False output pd.testing.assert_frame_equal(output_df, adata.var) # check output is complete @@ -241,9 +250,8 @@ def test_highly_variable_genes_pearson_residuals_batch( ]: assert key in output_df.keys() - # check hvg flag - assert output_df['highly_variable'].values.dtype is np.dtype('bool') - assert np.sum(output_df['highly_variable']) == n_top_genes + # general checks on ranks, hvg flag and residual variance + _check_pearson_hvg_columns(output_df, n_top_genes) # check intersection flag nbatches = len(np.unique(adata.obs['batch'])) @@ -253,7 +261,6 @@ def test_highly_variable_genes_pearson_residuals_batch( # check ranks (with batch_key these are the median of within-batch ranks) assert pd.api.types.is_float_dtype(output_df['highly_variable_rank'].dtype) - assert np.nanmax(output_df['highly_variable_rank'].values) <= n_top_genes - 1 # check nbatches assert output_df['highly_variable_nbatches'].values.dtype is np.dtype('int') From 83b7338cc78fb85c41fbd3a866bb791e8eb5050f Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Mon, 23 Aug 2021 15:30:40 +0200 Subject: [PATCH 68/96] condensing code in normalization tests --- 
scanpy/tests/test_normalization.py | 103 +++++++++++++---------------- 1 file changed, 45 insertions(+), 58 deletions(-) diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index 47131952e3..3d41effe05 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -160,6 +160,28 @@ def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip): assert np.min(output_X) >= -clip +def _check_pearson_pca_fields(ad, n_cells, n_comps): + assert np.all( + np.isin( + ['pearson_residuals_normalization', 'pca'], + list(ad.uns.keys()), + ) + ), ( + """Missing `.uns` keys. Expected `['pearson_residuals_normalization', 'pca']`, but only %s were found""" + % (list(ad.uns.keys())) + ) + assert 'X_pca' in list( + ad.obsm.keys() + ), """Missing `obsm` key `'X_pca'`, only %s were found""" % (list(ad.obsm.keys())) + assert 'PCs' in list( + ad.varm.keys() + ), """Missing `varm` key `'PCs'`, only %s were found""" % (list(ad.varm.keys())) + assert ad.obsm['X_pca'].shape == ( + n_cells, + n_comps, + ), 'Wrong shape of PCA output in `X_pca`' + + @pytest.mark.parametrize( 'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__ ) @@ -179,15 +201,15 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps): ### inplace = False ### # outputs the (potentially hvg-restricted) adata_pca object - # PCA on all genes + # PCA on all genes (no HVGs present) adata_pca = sc.experimental.pp.normalize_pearson_residuals_pca( adata.copy(), inplace=False, n_comps=n_comps ) - # PCA on hvgs only + # PCA on hvgs only (HVGs present, and by default, `use_highly_variable=True`) adata_pca_with_hvgs = sc.experimental.pp.normalize_pearson_residuals_pca( adata_with_hvgs.copy(), inplace=False, n_comps=n_comps ) - # PCA again on all genes (hvg use supressed) + # PCA again on all genes (HVGs present, but hvg use supressed by `use_highly_variable=False`) adata_pca_not_using_hvgs = 
sc.experimental.pp.normalize_pearson_residuals_pca( adata_not_using_hvgs.copy(), inplace=False, @@ -195,17 +217,9 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps): use_highly_variable=False, ) - # for both cases, check adata_pca keys are complete + # for all cases, check adata_pca keys are complete for ad in [adata_pca, adata_pca_with_hvgs, adata_pca_not_using_hvgs]: - assert np.all( - np.isin( - ['pearson_residuals_normalization', 'pca'], - list(ad.uns.keys()), - ) - ) - assert np.all(np.isin(['X_pca'], list(ad.obsm.keys()))) - assert np.all(np.isin(['PCs'], list(ad.varm.keys()))) - assert ad.obsm['X_pca'].shape == (n_cells, n_comps) + _check_pearson_pca_fields(ad, n_cells, n_comps) # check adata shape to see if all genes or only HVGs are in the returned adata assert adata_pca.shape == (n_cells, n_genes) @@ -217,20 +231,20 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps): assert adata_pca_with_hvgs.varm['PCs'].shape == ( n_hvgs, n_comps, - ) # only HVGs used + ) assert adata_pca_not_using_hvgs.varm['PCs'].shape == (n_genes, n_comps) ### inplace = True ### # modifies the input adata object - # PCA on all genes (no HVG supplied) + # PCA on all genes (no HVGs present) sc.experimental.pp.normalize_pearson_residuals_pca( adata, inplace=True, n_comps=n_comps ) - # PCA on hvgs only (HVGs supplied and automatically used) + # PCA on hvgs only (HVGs present, and by default, `use_highly_variable=True`) sc.experimental.pp.normalize_pearson_residuals_pca( adata_with_hvgs, inplace=True, n_comps=n_comps ) - # PCA again on all genes (HVGs supplied and NOT used) + # PCA again on all genes (HVGs present, but hvg use supressed by `use_highly_variable=False`) sc.experimental.pp.normalize_pearson_residuals_pca( adata_not_using_hvgs, inplace=True, @@ -238,39 +252,27 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps): use_highly_variable=False, ) + # for all cases, check adata_pca keys 
are complete for ad in [adata, adata_with_hvgs, adata_not_using_hvgs]: - # check adata_pca keys are complete - assert np.all( - np.isin( - [ - 'pearson_residuals_normalization', - ], - list(ad.uns.keys()), - ) - ) - assert np.all(np.isin(['X_pca'], list(ad.obsm.keys()))) - # check shapes: adata should always retains original shape + _check_pearson_pca_fields(ad, n_cells, n_comps) + + # check shapes: inplace adata's should always retains original shape assert ad.shape == (n_cells, n_genes) - assert ad.obsm['X_pca'].shape == (n_cells, n_comps) + assert ad.varm['PCs'].shape == (n_genes, n_comps) # check if there are columns of all-zeros in the PCs shapes # to see whether or not HVGs were used for PCA - assert adata.varm['PCs'].shape == (n_genes, n_comps) # no all-zero-colums should exist assert sum(np.sum(np.abs(adata.varm['PCs']), axis=1) == 0) == 0 - - assert adata_with_hvgs.varm['PCs'].shape == (n_genes, n_comps) # number of all-zero-colums should be number of non-hvgs assert ( sum(np.sum(np.abs(adata_with_hvgs.varm['PCs']), axis=1) == 0) == n_genes - n_hvgs ) - - assert adata_not_using_hvgs.varm['PCs'].shape == (n_genes, n_comps) # no all-zero-colums should exist assert sum(np.sum(np.abs(adata_not_using_hvgs.varm['PCs']), axis=1) == 0) == 0 - # test for inplace/outplace + # compare PCA results beteen inplace/outplace for ad_inplace, ad_outplace in zip( [adata_pca, adata_pca_with_hvgs, adata_pca_not_using_hvgs], [adata, adata_with_hvgs, adata_not_using_hvgs], @@ -298,20 +300,11 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp adata.copy(), inplace=False, n_comps=n_comps, n_top_genes=n_hvgs ) - # for both cases, check adata_pca keys are complete - assert np.all( - np.isin( - ['pearson_residuals_normalization', 'pca'], - list(adata_pca.uns.keys()), - ) - ) - assert np.all(np.isin(['X_pca'], list(adata_pca.obsm.keys()))) - assert np.all(np.isin(['PCs'], list(adata_pca.varm.keys()))) - assert adata_pca.obsm['X_pca'].shape == (n_cells, 
n_comps) - - # check adata shape + # check PCA fields + _check_pearson_pca_fields(adata_pca, n_cells, n_comps) + # check adata output shape (only HVGs in output) assert adata_pca.shape == (n_cells, n_hvgs) - # check PC shapes to check that HVGs were used for PCA + # check PC shape (non-hvgs are removed, so only `n_hvgs` genes) assert adata_pca.varm['PCs'].shape == (n_hvgs, n_comps) # check hvg df @@ -337,17 +330,11 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp adata, inplace=True, n_comps=n_comps, n_top_genes=n_hvgs ) - assert np.all( - np.isin( - ['pearson_residuals_normalization', 'pca'], - list(adata.uns.keys()), - ) - ) - assert np.all(np.isin(['X_pca'], list(adata.obsm.keys()))) + # check PCA fields and output shape + _check_pearson_pca_fields(adata, n_cells, n_comps) + # check adata shape (no change to input) assert adata.shape == (n_cells, n_genes) - assert adata.obsm['X_pca'].shape == (n_cells, n_comps) - - # check PC shape + # check PC shape (non-hvgs are masked with 0s, so original number of genes) assert adata.varm['PCs'].shape == (n_genes, n_comps) # number of all-zero-colums should be number of non-hvgs assert sum(np.sum(np.abs(adata.varm['PCs']), axis=1) == 0) == n_genes - n_hvgs From a616419a4f1c95a19a81f2878f7ad91c45a1eaa8 Mon Sep 17 00:00:00 2001 From: giovp Date: Tue, 31 Aug 2021 10:56:19 +0200 Subject: [PATCH 69/96] add asteriks for keyword --- scanpy/experimental/pp/_highly_variable_genes.py | 1 + scanpy/experimental/pp/_normalization.py | 1 + 2 files changed, 2 insertions(+) diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py index 03fc078ac8..d3fed085b1 100644 --- a/scanpy/experimental/pp/_highly_variable_genes.py +++ b/scanpy/experimental/pp/_highly_variable_genes.py @@ -207,6 +207,7 @@ def _highly_variable_pearson_residuals( def highly_variable_genes( adata: AnnData, + *, theta: float = 100, clip: Optional[float] = None, n_top_genes: 
Optional[int] = None, diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py index 097620ce33..463d3230b2 100644 --- a/scanpy/experimental/pp/_normalization.py +++ b/scanpy/experimental/pp/_normalization.py @@ -57,6 +57,7 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False): def normalize_pearson_residuals( adata: AnnData, + *, theta: float = 100, clip: Optional[float] = None, check_values: bool = True, From 62660a2f525f7ba08200ece6791f0dae3792edb0 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Tue, 14 Sep 2021 12:02:41 +0200 Subject: [PATCH 70/96] updating refs to Genome Biology publication --- docs/references.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/references.rst b/docs/references.rst index 84bdd7f7fe..7c09607f44 100644 --- a/docs/references.rst +++ b/docs/references.rst @@ -121,7 +121,7 @@ References .. [Lause21] Lause *et al.* (2021) *Analytic Pearson residuals for normalization of single-cell RNA-seq UMI data*, - `BioRxiv `__. + `Genome Biology `__. .. [Leek12] Leek *et al.* (2012), *sva: Surrogate Variable Analysis. 
R package* From b5cb3aa6ccc1b79b76e857776c3d371cf201b934 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 24 Dec 2021 09:53:40 +0100 Subject: [PATCH 71/96] cleanup helpers.py --- scanpy/tests/helpers.py | 27 ++++-------- scanpy/tests/test_highly_variable_genes.py | 48 ++++++++++------------ scanpy/tests/test_normalization.py | 29 ++++++------- 3 files changed, 42 insertions(+), 62 deletions(-) diff --git a/scanpy/tests/helpers.py b/scanpy/tests/helpers.py index aa47bef971..bdbbe7b156 100644 --- a/scanpy/tests/helpers.py +++ b/scanpy/tests/helpers.py @@ -99,7 +99,13 @@ def _prepare_pbmc_testdata(sparsity_func, dtype, small=False): small False (default) returns full data, True returns small subset of the data.""" - adata = sc.datasets.pbmc3k() + # loading from disk takes long, so cache raw data after loading it once + if 'ADATA_PBMC_RAW' not in globals(): + global ADATA_PBMC_RAW + ADATA_PBMC_RAW = sc.datasets.pbmc3k() + + adata = ADATA_PBMC_RAW.copy() + if small: adata = adata[:1000, :500] sc.pp.filter_cells(adata, min_genes=1) @@ -110,17 +116,7 @@ def _prepare_pbmc_testdata(sparsity_func, dtype, small=False): return adata -def _make_noninteger_data(adata): - '''Adds a single non-integer to the data matrix, e.g. for testing `check_value` arguments.''' - - adata_noninteger = adata.copy() - x, y = np.nonzero(adata_noninteger.X) - adata_noninteger.X[x[0], y[0]] = 0.5 - - return adata_noninteger - - -def _test_check_values_warnings(function, adata, expected_warning, kwargs={}): +def _check_check_values_warnings(function, adata, expected_warning, kwargs={}): '''Runs `function` on `adata` with provided arguments `kwargs` twice: once with `check_values=True` and once with `check_values=False`. 
Checks that the `expected_warning` is only raised whtn `check_values=True`.''' # expecting 0 no-int warnings @@ -134,10 +130,3 @@ def _test_check_values_warnings(function, adata, expected_warning, kwargs={}): function(adata.copy(), **kwargs, check_values=True) warning_msgs = [w.message.args[0] for w in record] assert expected_warning in warning_msgs - - -def _test_value_error(function, adata, expected_error, kwargs={}): - '''Runs `function` on `adata` with provided arguments `kwargs` and checks if `error_msg` is raised as an `ValueError`.''' - - with pytest.raises(ValueError, match=expected_error): - function(adata.copy(), **kwargs) diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py index d32237df4d..31addb225c 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -6,9 +6,7 @@ from scipy.sparse import csr_matrix from scanpy.tests.helpers import ( _prepare_pbmc_testdata, - _make_noninteger_data, - _test_check_values_warnings, - _test_value_error, + _check_check_values_warnings, ) import warnings @@ -62,12 +60,6 @@ def test_highly_variable_genes_basic(): assert np.all(np.isin(colnames, hvg_df.columns)) -def _residual_var_reference(adata, clip=None, theta=100): - sc.experimental.pp.normalize_pearson_residuals(adata, clip=clip, theta=theta) - residuals = adata.X - return np.var(residuals, axis=0) - - def _check_pearson_hvg_columns(output_df, n_top_genes): assert pd.api.types.is_float_dtype(output_df['residual_variances'].dtype) @@ -89,9 +81,11 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp # depending on check_values, warnings should be raised for non-integer data if dtype == 'float32': - adata_noninteger = _make_noninteger_data(adata) + adata_noninteger = adata.copy() + x, y = np.nonzero(adata_noninteger.X) + adata_noninteger.X[x[0], y[0]] = 0.5 - _test_check_values_warnings( + _check_check_values_warnings( 
function=sc.experimental.pp.highly_variable_genes, adata=adata_noninteger, expected_warning="`flavor='pearson_residuals'` expects raw count data, but non-integers were found.", @@ -103,19 +97,18 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp # errors should be raised for invalid theta values for theta in [0, -1]: - _test_value_error( - function=sc.experimental.pp.highly_variable_genes, - adata=adata, - expected_error='Pearson residuals require theta > 0', - kwargs=dict(theta=theta, flavor='pearson_residuals', n_top_genes=100), - ) - _test_value_error( - function=sc.experimental.pp.highly_variable_genes, - adata=adata, - expected_error='Pearson residuals require `clip>=0` or `clip=None`.', - kwargs=dict(clip=-1, flavor='pearson_residuals', n_top_genes=100), - ) + with pytest.raises(ValueError, match='Pearson residuals require theta > 0'): + sc.experimental.pp.highly_variable_genes( + adata.copy(), theta=theta, flavor='pearson_residuals', n_top_genes=100 + ) + + with pytest.raises( + ValueError, match='Pearson residuals require `clip>=0` or `clip=None`.' 
+ ): + sc.experimental.pp.highly_variable_genes( + adata.copy(), clip=-1, flavor='pearson_residuals', n_top_genes=100 + ) @pytest.mark.parametrize( @@ -132,10 +125,13 @@ def test_highly_variable_genes_pearson_residuals_general( adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True) # cleanup var del adata.var + # compute reference output - residual_variances_reference = _residual_var_reference( - adata.copy(), clip=clip, theta=theta - ) + residuals_reference = sc.experimental.pp.normalize_pearson_residuals( + adata, clip=clip, theta=theta, inplace=False + )['X'] + residual_variances_reference = np.var(residuals_reference, axis=0) + if subset: # lazyly sort by residual variance and take top N top_n_idx = np.argsort(-residual_variances_reference)[:n_top_genes] diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index 3d41effe05..21588a2610 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -10,9 +10,7 @@ check_rep_mutation, check_rep_results, _prepare_pbmc_testdata, - _make_noninteger_data, - _test_check_values_warnings, - _test_value_error, + _check_check_values_warnings, ) from anndata.tests.helpers import assert_equal, asarray @@ -78,9 +76,11 @@ def test_normalize_pearson_residuals_inputchecks(sparsity_func, dtype): # depending on check_values, warnings should be raised for non-integer data if dtype == 'float32': - adata_noninteger = _make_noninteger_data(adata) + adata_noninteger = adata.copy() + x, y = np.nonzero(adata_noninteger.X) + adata_noninteger.X[x[0], y[0]] = 0.5 - _test_check_values_warnings( + _check_check_values_warnings( function=sc.experimental.pp.normalize_pearson_residuals, adata=adata_noninteger, expected_warning="`normalize_pearson_residuals()` expects raw count data, but non-integers were found.", @@ -88,19 +88,14 @@ def test_normalize_pearson_residuals_inputchecks(sparsity_func, dtype): # errors should be raised for invalid theta values for theta in [0, -1]: - 
_test_value_error( - function=sc.experimental.pp.normalize_pearson_residuals, - adata=adata, - expected_error='Pearson residuals require theta > 0', - kwargs=dict(theta=theta), - ) - _test_value_error( - function=sc.experimental.pp.normalize_pearson_residuals, - adata=adata, - expected_error='Pearson residuals require `clip>=0` or `clip=None`.', - kwargs=dict(clip=-1), - ) + with pytest.raises(ValueError, match='Pearson residuals require theta > 0'): + sc.experimental.pp.normalize_pearson_residuals(adata.copy(), theta=theta) + + with pytest.raises( + ValueError, match='Pearson residuals require `clip>=0` or `clip=None`.' + ): + sc.experimental.pp.normalize_pearson_residuals(adata.copy(), clip=-1) @pytest.mark.parametrize( From aa9037f1ef57df9da98e0f2f1da5517935feb2f3 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 24 Dec 2021 10:51:24 +0100 Subject: [PATCH 72/96] cleanup main files as requested by @ivirshup --- .../experimental/pp/_highly_variable_genes.py | 17 +++++++---------- scanpy/experimental/pp/_normalization.py | 4 +++- scanpy/experimental/pp/_recipes.py | 2 +- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py index d3fed085b1..f98370544f 100644 --- a/scanpy/experimental/pp/_highly_variable_genes.py +++ b/scanpy/experimental/pp/_highly_variable_genes.py @@ -80,17 +80,14 @@ def _highly_variable_pearson_residuals( residual_gene_vars = [] for batch in np.unique(batch_info): - adata_subset = adata[batch_info == batch] + adata_subset_prefilter = adata[batch_info == batch] + X_batch_prefilter = _get_obs_rep(adata_subset_prefilter, layer=layer) # Filter out zero genes with settings.verbosity.override(Verbosity.error): - nonzero_genes = filter_genes(adata_subset, min_cells=1, inplace=False)[0] - adata_subset = adata_subset[:, nonzero_genes] - - if layer is not None: - X_batch = adata_subset.layers[layer] - else: - X_batch = adata_subset.X + 
nonzero_genes = np.ravel(X_batch_prefilter.sum(axis=0)) != 0 + adata_subset = adata_subset_prefilter[:, nonzero_genes] + X_batch = _get_obs_rep(adata_subset, layer=layer) # Prepare clipping if clip is None: @@ -161,9 +158,9 @@ def _highly_variable_pearson_residuals( inplace=True, ) - high_var = np.zeros(df.shape[0]) + high_var = np.zeros(df.shape[0], dtype=bool) high_var[:n_top_genes] = True - df['highly_variable'] = high_var.astype(bool) + df['highly_variable'] = high_var df = df.loc[adata.var_names, :] if inplace: diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py index 463d3230b2..449ca02f1d 100644 --- a/scanpy/experimental/pp/_normalization.py +++ b/scanpy/experimental/pp/_normalization.py @@ -141,6 +141,7 @@ def normalize_pearson_residuals( def normalize_pearson_residuals_pca( adata: AnnData, + *, theta: float = 100, clip: Optional[float] = None, n_comps: Optional[int] = 50, @@ -224,7 +225,8 @@ def normalize_pearson_residuals_pca( # check if HVG selection is there if user wants to use it if use_highly_variable and 'highly_variable' not in adata.var_keys(): raise ValueError( - 'You passed `use_highly_variable=True`, but no HVG selection was found (`highly_variable` missing in `adata.var_keys()`.' + "You passed `use_highly_variable=True`, but no HVG selection was found " + "(e.g., there was no 'highly_variable' column in adata.var).'" ) # default behavior: if there is a HVG selection, we will use it diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index 5589e816aa..ea7da74f59 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -8,6 +8,7 @@ def recipe_pearson_residuals( adata: AnnData, + *, theta: float = 100, clip: Optional[float] = None, n_top_genes: int = 1000, @@ -57,7 +58,6 @@ def recipe_pearson_residuals( This dertermines how many genes are processed at once while computing the Pearson residual variance. 
Choosing a smaller value will reduce the required memory. - n_comps Number of principal components to compute in the PCA step. random_state From e972daf7f4e124efac8e7797e49d4a31bb69ab54 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Fri, 24 Dec 2021 11:48:52 +0100 Subject: [PATCH 73/96] revert unneeded settingWithCopy fix --- scanpy/preprocessing/_highly_variable_genes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 7cede9e528..561a7c7bf0 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -514,7 +514,7 @@ def highly_variable_genes( df['highly_variable'] = high_var.astype(bool) df = df.loc[adata.var_names, :] else: - df = df.loc[adata.var_names, :] + df = df.loc[adata.var_names] dispersion_norm = df.dispersions_norm.values dispersion_norm[np.isnan(dispersion_norm)] = 0 # similar to Seurat gene_subset = np.logical_and.reduce( From 47bd877a681d83ade8f4abb030e95d5fa3d21c79 Mon Sep 17 00:00:00 2001 From: giovp Date: Wed, 23 Feb 2022 19:04:09 +0100 Subject: [PATCH 74/96] cache data --- scanpy/tests/helpers.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scanpy/tests/helpers.py b/scanpy/tests/helpers.py index bdbbe7b156..d1dec31995 100644 --- a/scanpy/tests/helpers.py +++ b/scanpy/tests/helpers.py @@ -8,7 +8,7 @@ import numpy as np import warnings import pytest - +from functools import cache from anndata.tests.helpers import asarray, assert_equal # TODO: Report more context on the fields being compared on error @@ -87,6 +87,11 @@ def check_rep_results(func, X, *, fields=["layer", "obsm"], **kwargs): assert_equal(adata_X, adatas_proc[field]) +@cache +def _get_pbmc3k(): + return sc.datasets.pbmc3k() + + def _prepare_pbmc_testdata(sparsity_func, dtype, small=False): """Prepares 3k PBMC dataset with batch key `batch` and defined datatype/sparsity. 
@@ -95,16 +100,11 @@ def _prepare_pbmc_testdata(sparsity_func, dtype, small=False): sparsity_func sparsity function applied to adata.X (e.g. csr_matrix.toarray for dense or csr_matrix for sparse) dtype - numpy dtype applied to adata.X (e.g. 'float32' or 'int64') + numpy dtype applied to adata.X (e.g. 'float32' or 'int64') small False (default) returns full data, True returns small subset of the data.""" - # loading from disk takes long, so cache raw data after loading it once - if 'ADATA_PBMC_RAW' not in globals(): - global ADATA_PBMC_RAW - ADATA_PBMC_RAW = sc.datasets.pbmc3k() - - adata = ADATA_PBMC_RAW.copy() + adata = _get_pbmc3k() if small: adata = adata[:1000, :500] From 13a44be8fb07814e4a44b6388418dae363cc66d1 Mon Sep 17 00:00:00 2001 From: giovp Date: Wed, 23 Feb 2022 19:36:13 +0100 Subject: [PATCH 75/96] use doc_params for doc --- scanpy/experimental/_docs.py | 47 +++++++++++++++++++++++ scanpy/experimental/pp/_normalization.py | 49 ++++++++++-------------- scanpy/experimental/pp/_recipes.py | 41 +++++++++----------- 3 files changed, 87 insertions(+), 50 deletions(-) create mode 100644 scanpy/experimental/_docs.py diff --git a/scanpy/experimental/_docs.py b/scanpy/experimental/_docs.py new file mode 100644 index 0000000000..e8fecf19be --- /dev/null +++ b/scanpy/experimental/_docs.py @@ -0,0 +1,47 @@ +"""Shared docstrings for experimental function parameters. +""" + +doc_adata = """\ +adata + The annotated data matrix of shape `n_obs` × `n_vars`. + Rows correspond to cells and columns to genes. +""" + +doc_norm_params = """\ +theta + The negative binomial overdispersion parameter theta for Pearson residuals. + Higher values correspond to less overdispersion (var = mean + mean^2/theta), + and `theta=np.Inf` corresponds to a Poisson model. +clip + Determines if and how residuals are clipped: + + * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \ + where n is the number of cells in the dataset (default behavior). 
+ * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ + `clip=np.Inf` for no clipping.0 + +""" + +doc_layer_copy = """\ +check_values + Check if counts in selected layer are integers. A Warning is returned if set to + True. +layer + Layer to normalize instead of `X`. If `None`, `X` is normalized. +copy + Whether to modify copied input object. Not compatible with `inplace=False`. +inplace + Whether to update `adata` or return dictionary with normalized copies + of `adata.X` and `adata.layers`. +""" + +doc_inplace = """\ +inplace + Whether to update `adata` or return dictionary with normalized copies + of `adata.X` and `adata.layers`. +""" + +doc_copy = """\ +copy + Whether to modify copied input object. Not compatible with `inplace=False`. +""" diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py index 449ca02f1d..254950b2e1 100644 --- a/scanpy/experimental/pp/_normalization.py +++ b/scanpy/experimental/pp/_normalization.py @@ -10,8 +10,15 @@ from scanpy._utils import view_to_actual, check_nonnegative_integers from scanpy.get import _get_obs_rep, _set_obs_rep - +from scanpy._utils import _doc_params from scanpy.preprocessing._pca import pca +from scanpy.experimental._docs import ( + doc_adata, + doc_norm_params, + doc_layer, + doc_copy, + doc_inplace, +) def _pearson_residuals(X, theta, clip, check_values, copy=False): @@ -55,6 +62,13 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False): return residuals +@_doc_params( + adata=doc_adata, + norm_params=doc_norm_params, + layer=doc_layer, + inplace=doc_inplace, + copy=doc_copy, +) def normalize_pearson_residuals( adata: AnnData, *, @@ -62,8 +76,8 @@ def normalize_pearson_residuals( clip: Optional[float] = None, check_values: bool = True, layer: Optional[str] = None, - copy: bool = False, inplace: bool = True, + copy: bool = False, ) -> Optional[Dict[str, np.ndarray]]: """\ Applies analytic Pearson residual normalization, based on 
[Lause21]_. @@ -76,38 +90,17 @@ def normalize_pearson_residuals( Params ------ - adata - The annotated data matrix of shape `n_obs` × `n_vars`. - Rows correspond to cells and columns to genes. - theta - The negative binomial overdispersion parameter theta for Pearson residuals. - Higher values correspond to less overdispersion (var = mean + mean^2/theta), - and `theta=np.Inf` corresponds to a Poisson model. - clip - Determines if and how residuals are clipped: - - * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \ - where n is the number of cells in the dataset (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ - `clip=np.Inf` for no clipping. - - check_values - Check if counts in selected layer are integers. A Warning is returned if set to - True. - layer - Layer to normalize instead of `X`. If `None`, `X` is normalized. - copy - Whether to modify copied input object. Not compatible with `inplace=False`. - inplace - Whether to update `adata` or return dictionary with normalized copies - of `adata.X` and `adata.layers`. + {adata} + {norm_params} + {layer} + {inplace} + {copy} Returns ------- Returns dictionary with Pearson residuals and settings or updates `adata` with normalized version of the original `adata.X` and `adata.layers`, depending on `inplace`. 
- """ if copy: diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index ea7da74f59..e4fa4cec5c 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -4,8 +4,21 @@ import numpy as np from scanpy import experimental from scanpy.preprocessing import pca - - +from scanpy.experimental._docs import ( + doc_adata, + doc_norm_params, + doc_layer, + doc_inplace, +) +from scanpy._utils import _doc_params + + +@_doc_params( + adata=doc_adata, + norm_params=doc_norm_params, + layer=doc_layer, + inplace=doc_inplace, +) def recipe_pearson_residuals( adata: AnnData, *, @@ -31,21 +44,8 @@ def recipe_pearson_residuals( Params ------ - adata - The annotated data matrix of shape `n_obs` × `n_vars`. - Rows correspond to cells and columns to genes. - theta - The negative binomial overdispersion parameter theta for Pearson residuals. - Higher values correspond to less overdispersion (var = mean + mean^2/theta), - and `theta=np.Inf` corresponds to a Poisson model. - clip - Determines if and how residuals are clipped: - - * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \ - where n is the number of cells in the dataset (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ - `clip=np.Inf` for no clipping. - + {adata} + {norm_params} n_top_genes Number of highly-variable genes to keep. batch_key @@ -64,11 +64,8 @@ def recipe_pearson_residuals( Change to use different initial states for the optimization in the PCA step. kwargs_pca Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`. - check_values - Check if counts in selected layer are integers. A Warning is returned if set to - True. - inplace - Whether to place results in `adata` or return them. 
+ {layer} + {inplace} Returns From 0e4711d60964c8f9c3b8c630aa7ac4e2b8af6fe5 Mon Sep 17 00:00:00 2001 From: giovp Date: Wed, 23 Feb 2022 20:10:22 +0100 Subject: [PATCH 76/96] fix doc_params var --- scanpy/experimental/_docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/experimental/_docs.py b/scanpy/experimental/_docs.py index e8fecf19be..a767bf9a06 100644 --- a/scanpy/experimental/_docs.py +++ b/scanpy/experimental/_docs.py @@ -22,7 +22,7 @@ """ -doc_layer_copy = """\ +doc_layer = """\ check_values Check if counts in selected layer are integers. A Warning is returned if set to True. From aa55183dadfd732c6d2c2a1b897a32a3e056f8e0 Mon Sep 17 00:00:00 2001 From: giovp Date: Thu, 24 Feb 2022 09:30:50 +0100 Subject: [PATCH 77/96] finalize docs --- scanpy/experimental/_docs.py | 30 +++++++-- .../experimental/pp/_highly_variable_genes.py | 63 ++++++++----------- scanpy/experimental/pp/_normalization.py | 6 +- scanpy/experimental/pp/_recipes.py | 22 ++----- 4 files changed, 59 insertions(+), 62 deletions(-) diff --git a/scanpy/experimental/_docs.py b/scanpy/experimental/_docs.py index a767bf9a06..ad05c56c7e 100644 --- a/scanpy/experimental/_docs.py +++ b/scanpy/experimental/_docs.py @@ -7,7 +7,7 @@ Rows correspond to cells and columns to genes. """ -doc_norm_params = """\ +doc_dist_params = """\ theta The negative binomial overdispersion parameter theta for Pearson residuals. Higher values correspond to less overdispersion (var = mean + mean^2/theta), @@ -28,11 +28,29 @@ True. layer Layer to normalize instead of `X`. If `None`, `X` is normalized. -copy - Whether to modify copied input object. Not compatible with `inplace=False`. -inplace - Whether to update `adata` or return dictionary with normalized copies - of `adata.X` and `adata.layers`. +""" + +doc_subset = """\ +subset + Inplace subset to highly-variable genes if `True` otherwise merely indicate + highly variable genes. 
+""" + +doc_genes_batch_chunk = """\ +n_top_genes + Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or + `flavor='pearson_residuals'`. +batch_key + If specified, highly-variable genes are selected within each batch separately + and merged. This simple process avoids the selection of batch-specific genes + and acts as a lightweight batch correction method. Genes are first sorted by + how many batches they are a HVG. If `flavor='pearson_residuals'`, ties are + broken by the median rank (across batches) based on within-batch residual + variance. +chunksize + If `flavor='pearson_residuals'`, this dertermines how many genes are processed at + once while computing the residual variance. Choosing a smaller value will reduce + the required memory. """ doc_inplace = """\ diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py index f98370544f..aed70f6e5f 100644 --- a/scanpy/experimental/pp/_highly_variable_genes.py +++ b/scanpy/experimental/pp/_highly_variable_genes.py @@ -1,3 +1,4 @@ +from multiprocessing.sharedctypes import Value import warnings from typing import Optional @@ -12,9 +13,18 @@ from scanpy._utils import check_nonnegative_integers, view_to_actual from scanpy.get import _get_obs_rep from scanpy._compat import Literal +from scanpy._utils import _doc_params from scanpy.preprocessing._utils import _get_mean_var from scanpy.preprocessing._distributed import materialize_as_ndarray from scanpy.preprocessing._simple import filter_genes +from scanpy.experimental._docs import ( + doc_adata, + doc_dist_params, + doc_genes_batch_chunk, + doc_layer, + doc_copy, + doc_inplace, +) def _highly_variable_pearson_residuals( @@ -202,6 +212,13 @@ def _highly_variable_pearson_residuals( return df +@_doc_params( + adata=doc_adata, + dist_params=doc_dist_params, + genes_batch_chunk=doc_genes_batch_chunk, + layer=doc_layer, + inplace=doc_inplace, +) def highly_variable_genes( adata: AnnData, *, @@ 
-226,51 +243,19 @@ def highly_variable_genes( Expects raw count input. - Parameters ---------- - adata - The annotated data matrix of shape `n_obs` × `n_vars`. - Rows correspond to cells and columns to genes. - theta - The negative binomial overdispersion parameter theta for Pearson residuals. - Higher values correspond to less overdispersion (var = mean + mean^2/theta), - and `theta=np.Inf` corresponds to a Poisson model. - clip - If `flavor='pearson_residuals'`, determines if and how residuals are clipped: - - * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \ - where n is the number of cells in the dataset (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ - `clip=np.Inf` for no clipping. - - n_top_genes - Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or - `flavor='pearson_residuals'`. - batch_key - If specified, highly-variable genes are selected within each batch separately - and merged. This simple process avoids the selection of batch-specific genes - and acts as a lightweight batch correction method. Genes are first sorted by - how many batches they are a HVG. If `flavor='pearson_residuals'`, ties are - broken by the median rank (across batches) based on within-batch residual - variance. - chunksize - If `flavor='pearson_residuals'`, this dertermines how many genes are processed at - once while computing the residual variance. Choosing a smaller value will reduce - the required memory. + {adata} + {dist_params} + {genes_batch_chunk} flavor Choose the flavor for identifying highly variable genes. In this experimental version, only 'pearson_residuals' is functional. - check_values - Check if counts in selected layer are integers. A Warning is returned if set to - True. Only used if `flavor='pearson_residuals'`. - layer - If provided, use `adata.layers[layer]` for expression values instead of `adata.X`. 
+ {layer} subset Inplace subset to highly-variable genes if `True` otherwise merely indicate highly variable genes. - inplace - Whether to place calculated metrics in `.var` or return them. + {in_place} Returns ------- @@ -325,3 +310,7 @@ def highly_variable_genes( check_values=check_values, inplace=inplace, ) + else: + raise ValueError( + "This is an experimental API and only `flavor=pearson_residuals` is available." + ) diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py index 254950b2e1..6ef838c5cb 100644 --- a/scanpy/experimental/pp/_normalization.py +++ b/scanpy/experimental/pp/_normalization.py @@ -14,7 +14,7 @@ from scanpy.preprocessing._pca import pca from scanpy.experimental._docs import ( doc_adata, - doc_norm_params, + doc_dist_params, doc_layer, doc_copy, doc_inplace, @@ -64,7 +64,7 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False): @_doc_params( adata=doc_adata, - norm_params=doc_norm_params, + norm_params=doc_dist_params, layer=doc_layer, inplace=doc_inplace, copy=doc_copy, @@ -91,7 +91,7 @@ def normalize_pearson_residuals( Params ------ {adata} - {norm_params} + {dist_params} {layer} {inplace} {copy} diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index e4fa4cec5c..76336a1e4d 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -6,7 +6,8 @@ from scanpy.preprocessing import pca from scanpy.experimental._docs import ( doc_adata, - doc_norm_params, + doc_dist_params, + doc_genes_batch_chunk, doc_layer, doc_inplace, ) @@ -15,7 +16,8 @@ @_doc_params( adata=doc_adata, - norm_params=doc_norm_params, + dist_params=doc_dist_params, + genes_batch_chunk=doc_genes_batch_chunk, layer=doc_layer, inplace=doc_inplace, ) @@ -45,19 +47,8 @@ def recipe_pearson_residuals( Params ------ {adata} - {norm_params} - n_top_genes - Number of highly-variable genes to keep. 
- batch_key - If specified, highly-variable genes are selected within each batch separately - and merged. This simple process avoids the selection of batch-specific genes - and acts as a lightweight batch correction method. Genes are first sorted by - how many batches they are a HVG. Ties are broken by the median rank (across - batches) based on within-batch residual variance. - chunksize - This dertermines how many genes are processed at once while computing - the Pearson residual variance. Choosing a smaller value will reduce - the required memory. + {dist_params} + {genes_batch_chunk} n_comps Number of principal components to compute in the PCA step. random_state @@ -67,7 +58,6 @@ def recipe_pearson_residuals( {layer} {inplace} - Returns ------- If `inplace=False`, separately returns the gene selection results (`hvg`) From 8e9b07b4d154ae567bc80da88fc7c0d05e448885 Mon Sep 17 00:00:00 2001 From: giovp Date: Thu, 24 Feb 2022 10:21:39 +0100 Subject: [PATCH 78/96] fix param doc --- scanpy/experimental/pp/_normalization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py index 6ef838c5cb..ef0ca75b65 100644 --- a/scanpy/experimental/pp/_normalization.py +++ b/scanpy/experimental/pp/_normalization.py @@ -64,7 +64,7 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False): @_doc_params( adata=doc_adata, - norm_params=doc_dist_params, + dist_params=doc_dist_params, layer=doc_layer, inplace=doc_inplace, copy=doc_copy, From dce90b2920b0dfb7d0dbfba227a60c4e9b8c2633 Mon Sep 17 00:00:00 2001 From: giovp Date: Thu, 24 Feb 2022 12:02:55 +0100 Subject: [PATCH 79/96] wrong var still --- scanpy/experimental/pp/_highly_variable_genes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py index aed70f6e5f..e03beae206 100644 --- 
a/scanpy/experimental/pp/_highly_variable_genes.py +++ b/scanpy/experimental/pp/_highly_variable_genes.py @@ -255,7 +255,7 @@ def highly_variable_genes( subset Inplace subset to highly-variable genes if `True` otherwise merely indicate highly variable genes. - {in_place} + {inplace} Returns ------- From ca65af525d03b0938ff4c7e0dd7c144078e43c9a Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 28 Feb 2022 14:08:56 +0100 Subject: [PATCH 80/96] add cached datasets module and test on high_var_genes tests --- scanpy/tests/_data/_cached_datasets.py | 19 +++++++++++++++++++ scanpy/tests/helpers.py | 9 ++------- scanpy/tests/test_highly_variable_genes.py | 12 +++++++----- 3 files changed, 28 insertions(+), 12 deletions(-) create mode 100644 scanpy/tests/_data/_cached_datasets.py diff --git a/scanpy/tests/_data/_cached_datasets.py b/scanpy/tests/_data/_cached_datasets.py new file mode 100644 index 0000000000..0ccdf00b75 --- /dev/null +++ b/scanpy/tests/_data/_cached_datasets.py @@ -0,0 +1,19 @@ +from functools import wraps +import scanpy as sc + + +def cached_dataset(func): + store = [] + + @wraps(func) + def wrapper(): + if len(store) < 1: + store.append(func()) + return store[0].copy() + + return wrapper + + +pbmc3k = cached_dataset(sc.datasets.pbmc3k) +pbmc68k_reduced = cached_dataset(sc.datasets.pbmc68k_reduced) +pbmc3k_processed = cached_dataset(sc.datasets.pbmc3k_processed) diff --git a/scanpy/tests/helpers.py b/scanpy/tests/helpers.py index d1dec31995..9bae484355 100644 --- a/scanpy/tests/helpers.py +++ b/scanpy/tests/helpers.py @@ -8,8 +8,8 @@ import numpy as np import warnings import pytest -from functools import cache from anndata.tests.helpers import asarray, assert_equal +from scanpy.tests._data._cached_datasets import pbmc3k # TODO: Report more context on the fields being compared on error # TODO: Allow specifying paths to ignore on comparison @@ -87,11 +87,6 @@ def check_rep_results(func, X, *, fields=["layer", "obsm"], **kwargs): assert_equal(adata_X, 
adatas_proc[field]) -@cache -def _get_pbmc3k(): - return sc.datasets.pbmc3k() - - def _prepare_pbmc_testdata(sparsity_func, dtype, small=False): """Prepares 3k PBMC dataset with batch key `batch` and defined datatype/sparsity. @@ -104,7 +99,7 @@ def _prepare_pbmc_testdata(sparsity_func, dtype, small=False): small False (default) returns full data, True returns small subset of the data.""" - adata = _get_pbmc3k() + adata = pbmc3k().copy() if small: adata = adata[:1000, :500] diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py index 31addb225c..068ebd3280 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -10,6 +10,8 @@ ) import warnings +from scanpy.tests._data._cached_datasets import pbmc3k, pbmc68k_reduced + FILE = Path(__file__).parent / Path('_scripts/seurat_hvg.csv') FILE_V3 = Path(__file__).parent / Path('_scripts/seurat_hvg_v3.csv.gz') FILE_V3_BATCH = Path(__file__).parent / Path('_scripts/seurat_hvg_v3_batch.csv') @@ -273,7 +275,7 @@ def test_highly_variable_genes_pearson_residuals_batch( def test_higly_variable_genes_compare_to_seurat(): seurat_hvg_info = pd.read_csv(FILE, sep=' ') - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced().copy() pbmc.X = pbmc.raw.X pbmc.var_names_make_unique() @@ -314,7 +316,7 @@ def test_higly_variable_genes_compare_to_seurat_v3(): FILE_V3, sep=' ', dtype={"variances_norm": np.float64} ) - pbmc = sc.datasets.pbmc3k() + pbmc = pbmc3k().copy() pbmc.var_names_make_unique() pbmc_dense = pbmc.copy() @@ -377,7 +379,7 @@ def test_higly_variable_genes_compare_to_seurat_v3(): def test_filter_genes_dispersion_compare_to_seurat(): seurat_hvg_info = pd.read_csv(FILE, sep=' ') - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced().copy() pbmc.X = pbmc.raw.X pbmc.var_names_make_unique() @@ -419,7 +421,7 @@ def test_filter_genes_dispersion_compare_to_seurat(): def test_highly_variable_genes_batches(): - adata = 
sc.datasets.pbmc68k_reduced() + adata = pbmc68k_reduced().copy() adata[:100, :100].X = np.zeros((100, 100)) adata.obs['batch'] = ['0' if i < 100 else '1' for i in range(adata.n_obs)] @@ -468,7 +470,7 @@ def test_highly_variable_genes_batches(): def test_seurat_v3_mean_var_output_with_batchkey(): - pbmc = sc.datasets.pbmc3k() + pbmc = pbmc3k().copy() pbmc.var_names_make_unique() n_cells = pbmc.shape[0] batch = np.zeros((n_cells), dtype=int) From d3a07cb9aa3e14449c7fea2cf7dbfb513e1ddfc5 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 28 Feb 2022 15:35:49 +0100 Subject: [PATCH 81/96] use new cache dataset module for tests --- scanpy/tests/_data/_cached_datasets.py | 2 + scanpy/tests/test_clustering.py | 3 +- scanpy/tests/test_dendrogram_key_added.py | 3 +- scanpy/tests/test_deprecations.py | 3 +- scanpy/tests/test_embedding.py | 12 +-- scanpy/tests/test_embedding_density.py | 4 +- scanpy/tests/test_filter_rank_genes_groups.py | 2 +- scanpy/tests/test_get.py | 4 +- scanpy/tests/test_highly_variable_genes.py | 10 +-- scanpy/tests/test_ingest.py | 4 +- scanpy/tests/test_metrics.py | 7 +- scanpy/tests/test_neighbors_key_added.py | 4 +- scanpy/tests/test_paga.py | 8 +- scanpy/tests/test_plotting.py | 75 ++++++++++--------- scanpy/tests/test_preprocessing.py | 5 +- scanpy/tests/test_queries.py | 5 +- scanpy/tests/test_rank_genes_groups.py | 6 +- scanpy/tests/test_score_genes.py | 3 +- 18 files changed, 88 insertions(+), 72 deletions(-) diff --git a/scanpy/tests/_data/_cached_datasets.py b/scanpy/tests/_data/_cached_datasets.py index 0ccdf00b75..f66bece206 100644 --- a/scanpy/tests/_data/_cached_datasets.py +++ b/scanpy/tests/_data/_cached_datasets.py @@ -17,3 +17,5 @@ def wrapper(): pbmc3k = cached_dataset(sc.datasets.pbmc3k) pbmc68k_reduced = cached_dataset(sc.datasets.pbmc68k_reduced) pbmc3k_processed = cached_dataset(sc.datasets.pbmc3k_processed) +krumsiek11 = cached_dataset(sc.datasets.krumsiek11) +paul15 = cached_dataset(sc.datasets.paul15) diff --git 
a/scanpy/tests/test_clustering.py b/scanpy/tests/test_clustering.py index 1bb65d27ce..e0cbca4be2 100644 --- a/scanpy/tests/test_clustering.py +++ b/scanpy/tests/test_clustering.py @@ -1,10 +1,11 @@ import pytest import scanpy as sc +from scanpy.tests._data._cached_datasets import pbmc68k_reduced @pytest.fixture def adata_neighbors(): - return sc.datasets.pbmc68k_reduced() + return pbmc68k_reduced() def test_leiden_basic(adata_neighbors): diff --git a/scanpy/tests/test_dendrogram_key_added.py b/scanpy/tests/test_dendrogram_key_added.py index 6d43042914..a656b03c6a 100644 --- a/scanpy/tests/test_dendrogram_key_added.py +++ b/scanpy/tests/test_dendrogram_key_added.py @@ -1,6 +1,7 @@ import scanpy as sc import numpy as np import pytest +from scanpy.tests._data._cached_datasets import pbmc68k_reduced n_neighbors = 5 key = 'test' @@ -8,7 +9,7 @@ @pytest.fixture def adata(): - return sc.AnnData(sc.datasets.pbmc68k_reduced()) + return pbmc68k_reduced() @pytest.mark.parametrize('groupby', ['bulk_labels', ['bulk_labels', 'phase']]) diff --git a/scanpy/tests/test_deprecations.py b/scanpy/tests/test_deprecations.py index 620c774bd6..5006779c3b 100644 --- a/scanpy/tests/test_deprecations.py +++ b/scanpy/tests/test_deprecations.py @@ -1,10 +1,11 @@ import scanpy as sc +from scanpy.tests._data._cached_datasets import pbmc68k_reduced import pytest def test_deprecate_multicore_tsne(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() with pytest.warns( UserWarning, match="calling tsne with n_jobs > 1 would use MulticoreTSNE" diff --git a/scanpy/tests/test_embedding.py b/scanpy/tests/test_embedding.py index 1f778959c3..9120bd09e0 100644 --- a/scanpy/tests/test_embedding.py +++ b/scanpy/tests/test_embedding.py @@ -1,7 +1,7 @@ from importlib.util import find_spec from unittest.mock import patch import warnings - +from scanpy.tests._data._cached_datasets import pbmc68k_reduced import numpy as np import pytest from numpy.testing import assert_array_almost_equal, 
assert_array_equal, assert_raises @@ -10,7 +10,7 @@ def test_tsne(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() euclidean1 = sc.tl.tsne(pbmc, metric="euclidean", copy=True) with pytest.warns(UserWarning, match="In previous versions of scanpy"): @@ -32,7 +32,7 @@ def test_tsne(): def test_tsne_metric_warning(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() import sklearn with patch.object(sklearn, "__version__", "0.23.0"), pytest.warns( @@ -42,7 +42,7 @@ def test_tsne_metric_warning(): def test_umap_init_dtype(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() pbmc = pbmc[:100, :].copy() sc.tl.umap(pbmc, init_pos=pbmc.obsm["X_pca"][:, :2].astype(np.float32)) embed1 = pbmc.obsm["X_umap"].copy() @@ -57,7 +57,7 @@ def test_umap_init_dtype(): @pytest.mark.parametrize("layout", [pytest.param("fa", marks=needs_fa2), "fr"]) def test_umap_init_paga(layout): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() pbmc = pbmc[:100, :].copy() sc.tl.paga(pbmc) sc.pl.paga(pbmc, layout=layout, show=False) @@ -65,7 +65,7 @@ def test_umap_init_paga(layout): def test_diffmap(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.tl.diffmap(pbmc) d1 = pbmc.obsm['X_diffmap'].copy() diff --git a/scanpy/tests/test_embedding_density.py b/scanpy/tests/test_embedding_density.py index 3e49f45f3f..e38bdc65db 100644 --- a/scanpy/tests/test_embedding_density.py +++ b/scanpy/tests/test_embedding_density.py @@ -1,6 +1,6 @@ import numpy as np from anndata import AnnData - +from scanpy.tests._data._cached_datasets import pbmc68k_reduced import scanpy as sc @@ -22,6 +22,6 @@ def test_embedding_density(): def test_embedding_density_plot(): # Test that sc.pl.embedding_density() runs without error - adata = sc.datasets.pbmc68k_reduced() + adata = pbmc68k_reduced() sc.tl.embedding_density(adata, 'umap') sc.pl.embedding_density(adata, 'umap', 'umap_density', show=False) diff --git 
a/scanpy/tests/test_filter_rank_genes_groups.py b/scanpy/tests/test_filter_rank_genes_groups.py index b6d7bb474c..7325d07376 100644 --- a/scanpy/tests/test_filter_rank_genes_groups.py +++ b/scanpy/tests/test_filter_rank_genes_groups.py @@ -1,6 +1,6 @@ import numpy as np from scanpy.tools import rank_genes_groups, filter_rank_genes_groups -from scanpy.datasets import pbmc68k_reduced +from scanpy.tests._data._cached_datasets import pbmc68k_reduced names_no_reference = np.array( diff --git a/scanpy/tests/test_get.py b/scanpy/tests/test_get.py index 6fac0386c5..30a3eb84f5 100644 --- a/scanpy/tests/test_get.py +++ b/scanpy/tests/test_get.py @@ -9,7 +9,7 @@ import scanpy as sc from scanpy.datasets._utils import filter_oldformatwarning - +from scanpy.tests._data._cached_datasets import pbmc68k_reduced TRANSPOSE_PARAMS = pytest.mark.parametrize( "dim,transform,func", @@ -202,7 +202,7 @@ def test_backed_vs_memory(): def test_column_content(): "uses a larger dataset to test column order and content" - adata = sc.datasets.pbmc68k_reduced() + adata = pbmc68k_reduced() # test that columns content is correct for obs_df query = ['CST3', 'NKG7', 'GNLY', 'louvain', 'n_counts', 'n_genes'] diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py index 068ebd3280..9483848051 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -275,7 +275,7 @@ def test_highly_variable_genes_pearson_residuals_batch( def test_higly_variable_genes_compare_to_seurat(): seurat_hvg_info = pd.read_csv(FILE, sep=' ') - pbmc = pbmc68k_reduced().copy() + pbmc = pbmc68k_reduced() pbmc.X = pbmc.raw.X pbmc.var_names_make_unique() @@ -316,7 +316,7 @@ def test_higly_variable_genes_compare_to_seurat_v3(): FILE_V3, sep=' ', dtype={"variances_norm": np.float64} ) - pbmc = pbmc3k().copy() + pbmc = pbmc3k() pbmc.var_names_make_unique() pbmc_dense = pbmc.copy() @@ -379,7 +379,7 @@ def 
test_higly_variable_genes_compare_to_seurat_v3(): def test_filter_genes_dispersion_compare_to_seurat(): seurat_hvg_info = pd.read_csv(FILE, sep=' ') - pbmc = pbmc68k_reduced().copy() + pbmc = pbmc68k_reduced() pbmc.X = pbmc.raw.X pbmc.var_names_make_unique() @@ -421,7 +421,7 @@ def test_filter_genes_dispersion_compare_to_seurat(): def test_highly_variable_genes_batches(): - adata = pbmc68k_reduced().copy() + adata = pbmc68k_reduced() adata[:100, :100].X = np.zeros((100, 100)) adata.obs['batch'] = ['0' if i < 100 else '1' for i in range(adata.n_obs)] @@ -470,7 +470,7 @@ def test_highly_variable_genes_batches(): def test_seurat_v3_mean_var_output_with_batchkey(): - pbmc = pbmc3k().copy() + pbmc = pbmc3k() pbmc.var_names_make_unique() n_cells = pbmc.shape[0] batch = np.zeros((n_cells), dtype=int) diff --git a/scanpy/tests/test_ingest.py b/scanpy/tests/test_ingest.py index a7ba765f98..8bd9c05be2 100644 --- a/scanpy/tests/test_ingest.py +++ b/scanpy/tests/test_ingest.py @@ -7,7 +7,7 @@ import scanpy as sc from scanpy import settings from scanpy._compat import pkg_version - +from scanpy.tests._data._cached_datasets import pbmc68k_reduced X = np.array( [ @@ -25,7 +25,7 @@ @pytest.fixture def adatas(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() n_split = 500 adata_ref = sc.AnnData(pbmc.X[:n_split, :], obs=pbmc.obs.iloc[:n_split]) adata_new = sc.AnnData(pbmc.X[n_split:, :]) diff --git a/scanpy/tests/test_metrics.py b/scanpy/tests/test_metrics.py index 025d788abb..4ace4a9c54 100644 --- a/scanpy/tests/test_metrics.py +++ b/scanpy/tests/test_metrics.py @@ -9,10 +9,11 @@ from anndata.tests.helpers import asarray import pytest +from scanpy.tests._data._cached_datasets import pbmc68k_reduced def test_gearys_c_consistency(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() pbmc.layers["raw"] = pbmc.raw.X.copy() g = pbmc.obsp["connectivities"] @@ -69,7 +70,7 @@ def test_gearys_c_correctness(): def test_morans_i_consistency(): - pbmc = 
sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() pbmc.layers["raw"] = pbmc.raw.X.copy() g = pbmc.obsp["connectivities"] @@ -133,7 +134,7 @@ def test_morans_i_correctness(): ) def test_graph_metrics_w_constant_values(metric, array_type): # https://github.com/theislab/scanpy/issues/1806 - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() XT = array_type(pbmc.raw.X.T.copy()) g = pbmc.obsp["connectivities"].copy() diff --git a/scanpy/tests/test_neighbors_key_added.py b/scanpy/tests/test_neighbors_key_added.py index 310d7e5d62..3b2ee68a2c 100644 --- a/scanpy/tests/test_neighbors_key_added.py +++ b/scanpy/tests/test_neighbors_key_added.py @@ -5,10 +5,12 @@ n_neighbors = 5 key = 'test' +from scanpy.tests._data._cached_datasets import pbmc68k_reduced + @pytest.fixture def adata(): - return sc.AnnData(sc.datasets.pbmc68k_reduced().X) + return sc.AnnData(pbmc68k_reduced().X) def test_neighbors_key_added(adata): diff --git a/scanpy/tests/test_paga.py b/scanpy/tests/test_paga.py index d4440a2884..fabfe243c4 100644 --- a/scanpy/tests/test_paga.py +++ b/scanpy/tests/test_paga.py @@ -5,7 +5,7 @@ import numpy as np import scanpy as sc - +from scanpy.tests._data._cached_datasets import pbmc68k_reduced, pbmc3k_processed import pytest HERE: Path = Path(__file__).parent @@ -15,7 +15,7 @@ @pytest.fixture(scope="module") def pbmc(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.tl.paga(pbmc, groups='bulk_labels') pbmc.obs['cool_feature'] = pbmc[:, 'CST3'].X.squeeze() return pbmc @@ -81,7 +81,7 @@ def test_paga_compare(image_comparer): # Tests that https://github.com/theislab/scanpy/issues/1887 is fixed save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - pbmc = sc.datasets.pbmc3k_processed() + pbmc = pbmc3k_processed() sc.tl.paga(pbmc, groups="louvain") sc.pl.paga_compare(pbmc, basis="umap", show=False) @@ -92,7 +92,7 @@ def test_paga_compare(image_comparer): def test_paga_positions_reproducible(): """Check exact reproducibility and 
effect of random_state on paga positions""" # https://github.com/theislab/scanpy/issues/1859 - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.tl.paga(pbmc, "bulk_labels") a = pbmc.copy() diff --git a/scanpy/tests/test_plotting.py b/scanpy/tests/test_plotting.py index b0acd839f3..98611ab6f4 100644 --- a/scanpy/tests/test_plotting.py +++ b/scanpy/tests/test_plotting.py @@ -6,7 +6,12 @@ import pytest from matplotlib.testing import setup from packaging import version - +from scanpy.tests._data._cached_datasets import ( + pbmc3k, + pbmc3k_processed, + pbmc68k_reduced, + krumsiek11, +) from scanpy._compat import pkg_version setup() @@ -38,7 +43,7 @@ def test_heatmap(image_comparer): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - adata = sc.datasets.krumsiek11() + adata = krumsiek11() sc.pl.heatmap( adata, adata.var_names, 'cell_type', use_raw=False, show=False, dendrogram=True ) @@ -100,7 +105,7 @@ def test_heatmap(image_comparer): save_and_compare_images('master_heatmap_std_scale_obs') # test var_names as dict - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.tl.leiden(pbmc, key_added="clusters", resolution=0.5) # call umap to trigger colors for the clusters sc.pl.umap(pbmc, color="clusters") @@ -152,7 +157,7 @@ def test_heatmap(image_comparer): ) def test_clustermap(image_comparer, obs_keys, name): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - adata = sc.datasets.krumsiek11() + adata = krumsiek11() sc.pl.clustermap(adata, obs_keys) save_and_compare_images(name) @@ -312,7 +317,7 @@ def test_clustermap(image_comparer, obs_keys, name): def test_dotplot_matrixplot_stacked_violin(image_comparer, id, fn): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - adata = sc.datasets.krumsiek11() + adata = krumsiek11() adata.obs['numeric_column'] = adata.X[:, 0] adata.layers['test'] = -1 * adata.X.copy() genes_dict = { @@ -331,7 +336,7 @@ def test_dotplot_matrixplot_stacked_violin(image_comparer, id, 
fn): def test_dotplot_obj(image_comparer): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) # test dotplot dot_min, dot_max, color_map, and var_groups - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() genes = [ 'CD79A', 'MS4A1', @@ -369,7 +374,7 @@ def test_dotplot_obj(image_comparer): def test_matrixplot_obj(image_comparer): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - adata = sc.datasets.pbmc68k_reduced() + adata = pbmc68k_reduced() marker_genes_dict = { "3": ["GNLY", "NKG7"], "1": ["FCER1A"], @@ -396,7 +401,7 @@ def test_matrixplot_obj(image_comparer): def test_stacked_violin_obj(image_comparer, plt): save_and_compare_images = image_comparer(ROOT, FIGS, tol=26) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() markers = { 'T-cell': ['CD3D', 'CD3E', 'IL32'], 'B-cell': ['CD79A', 'CD79B', 'MS4A1'], @@ -417,7 +422,7 @@ def test_stacked_violin_obj(image_comparer, plt): def test_tracksplot(image_comparer): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - adata = sc.datasets.krumsiek11() + adata = krumsiek11() sc.pl.tracksplot( adata, adata.var_names, 'cell_type', dendrogram=True, use_raw=False ) @@ -428,7 +433,7 @@ def test_multiple_plots(image_comparer): # only testing stacked_violin, matrixplot and dotplot save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - adata = sc.datasets.pbmc68k_reduced() + adata = pbmc68k_reduced() markers = { 'T-cell': ['CD3D', 'CD3E', 'IL32'], 'B-cell': ['CD79A', 'CD79B', 'MS4A1'], @@ -474,7 +479,7 @@ def test_violin(image_comparer): sc.pl.set_rcParams_defaults() sc.set_figure_params(dpi=50, color_map='viridis') - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pl.violin( pbmc, ['n_genes', 'percent_mito', 'n_counts'], @@ -523,7 +528,7 @@ def test_violin_without_raw(tmpdir): has_raw_pth = TESTDIR / "has_raw.png" no_raw_pth = TESTDIR / "no_raw.png" - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() pbmc_no_raw = 
pbmc.raw.to_adata().copy() sc.pl.violin(pbmc, 'CST3', groupby="bulk_labels", show=False, jitter=False) @@ -540,7 +545,7 @@ def test_violin_without_raw(tmpdir): def test_dendrogram(image_comparer): save_and_compare_images = image_comparer(ROOT, FIGS, tol=10) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pl.dendrogram(pbmc, 'bulk_labels') save_and_compare_images('dendrogram') @@ -548,7 +553,7 @@ def test_dendrogram(image_comparer): def test_correlation(image_comparer): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pl.correlation_matrix(pbmc, 'bulk_labels') save_and_compare_images('correlation') @@ -772,7 +777,7 @@ def test_correlation(image_comparer): def test_rank_genes_groups(image_comparer, name, fn): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.tl.rank_genes_groups(pbmc, 'louvain', n_genes=pbmc.raw.shape[1]) # add gene symbol @@ -791,8 +796,8 @@ def gene_symbols_adatas(): Both have ensembl ids and hgnc symbols as columns in var. The first has ensembl ids as var_names, the second has symbols. """ - pbmc = sc.datasets.pbmc3k_processed().raw.to_adata() - pbmc_counts = sc.datasets.pbmc3k() + pbmc = pbmc3k_processed().raw.to_adata() + pbmc_counts = pbmc3k() pbmc.layers["counts"] = pbmc_counts[pbmc.obs_names, pbmc.var_names].X.copy() pbmc.var["gene_symbol"] = pbmc.var_names @@ -877,7 +882,7 @@ def test_rank_genes_groups_plots_n_genes_vs_var_names(tmpdir, func, check_same_i var_names as a dict works. 
""" N = 3 - pbmc = sc.datasets.pbmc68k_reduced().raw.to_adata() + pbmc = pbmc68k_reduced().raw.to_adata() groups = pbmc.obs["louvain"].cat.categories[:3] pbmc = pbmc[pbmc.obs["louvain"].isin(groups)][::3].copy() @@ -929,7 +934,7 @@ def wrapped(pth, **kwargs): def test_genes_symbols(image_comparer, id, fn): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - adata = sc.datasets.krumsiek11() + adata = krumsiek11() # add a 'symbols' column adata.var['symbols'] = adata.var.index.map(lambda x: "symbol_{}".format(x)) @@ -941,7 +946,7 @@ def test_genes_symbols(image_comparer, id, fn): @pytest.fixture(scope="module") def pbmc_scatterplots(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() pbmc.layers["sparse"] = pbmc.raw.X / 2 pbmc.layers["test"] = pbmc.X.copy() + 100 pbmc.var["numbers"] = [str(x) for x in range(pbmc.shape[1])] @@ -1069,7 +1074,7 @@ def test_scatter_embedding_groups_and_size(image_comparer): # plotted on top. This new ordering requires that the size # vector is also ordered (if given). 
save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pl.embedding( pbmc, 'umap', @@ -1082,7 +1087,7 @@ def test_scatter_embedding_groups_and_size(image_comparer): def test_scatter_embedding_add_outline_vmin_vmax_norm(image_comparer, check_same_image): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pl.embedding( pbmc, @@ -1183,7 +1188,7 @@ def test_scatter_embedding_add_outline_vmin_vmax_norm(image_comparer, check_same def test_timeseries(): - adata = sc.datasets.pbmc68k_reduced() + adata = pbmc68k_reduced() sc.pp.neighbors(adata, n_neighbors=5, method='gauss', knn=False) sc.tl.diffmap(adata) sc.tl.dpt(adata, n_branchings=1, n_dcs=10) @@ -1191,7 +1196,7 @@ def test_timeseries(): def test_scatter_raw(tmp_path): - pbmc = sc.datasets.pbmc68k_reduced()[:100].copy() + pbmc = pbmc68k_reduced()[:100].copy() raw_pth = tmp_path / "raw.png" x_pth = tmp_path / "X.png" @@ -1208,7 +1213,7 @@ def test_scatter_raw(tmp_path): def test_scatter_specify_layer_and_raw(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() pbmc.layers["layer"] = pbmc.raw.X.copy() with pytest.raises(ValueError): sc.pl.umap(pbmc, color="HES4", use_raw=True, layer="layer") @@ -1217,7 +1222,7 @@ def test_scatter_specify_layer_and_raw(): def test_scatter_no_basis_per_obs(image_comparer): """Test scatterplot of per-obs points with no basis""" save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pl.scatter(pbmc, x="HES4", y="percent_mito", color="n_genes", use_raw=False) save_and_compare_images("scatter_HES_percent_mito_n_genes") @@ -1225,14 +1230,14 @@ def test_scatter_no_basis_per_obs(image_comparer): def test_scatter_no_basis_per_var(image_comparer): """Test scatterplot of per-var points with no basis""" save_and_compare_images = image_comparer(ROOT, FIGS, 
tol=15) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pl.scatter(pbmc, x="AAAGCCTGGCTAAC-1", y="AAATTCGATGCACA-1", use_raw=False) save_and_compare_images("scatter_AAAGCCTGGCTAAC-1_vs_AAATTCGATGCACA-1") @pytest.fixture def pbmc_filtered(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pp.filter_genes(pbmc, min_cells=10) return pbmc @@ -1287,7 +1292,7 @@ def test_scatter_no_basis_value_error(pbmc_filtered, x, y, color, use_raw): def test_rankings(image_comparer): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.pp.pca(pbmc) sc.pl.pca_loadings(pbmc) save_and_compare_images('master_pca_loadings') @@ -1357,7 +1362,7 @@ def test_no_copy(): # https://github.com/theislab/scanpy/issues/1000 # Tests that plotting functions don't make a copy from a view unless they # actually have to - actual = sc.datasets.pbmc68k_reduced() + actual = pbmc68k_reduced() sc.pl.umap(actual, color=["bulk_labels", "louvain"], show=False) # Set colors view = actual[np.random.choice(actual.obs_names, size=actual.shape[0] // 5), :] @@ -1395,7 +1400,7 @@ def test_no_copy(): def test_groupby_index(image_comparer): save_and_compare_images = image_comparer(ROOT, FIGS, tol=15) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() genes = [ 'CD79A', @@ -1420,7 +1425,7 @@ def test_groupby_index(image_comparer): # test category order when groupby is a list (#1735) def test_groupby_list(image_comparer): save_and_compare_images = image_comparer(ROOT, FIGS, tol=30) - adata = sc.datasets.krumsiek11() + adata = krumsiek11() np.random.seed(1) @@ -1441,7 +1446,7 @@ def test_color_cycler(caplog): # https://github.com/theislab/scanpy/issues/1885 import logging - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() colors = sns.color_palette("deep") cyl = sns.rcmod.cycler('color', sns.color_palette("deep")) @@ -1469,7 +1474,7 @@ def 
test_filter_rank_genes_groups_plots(tmpdir, plot, check_same_image): TESTDIR = Path(tmpdir) N_GENES = 4 - adata = sc.datasets.pbmc68k_reduced() + adata = pbmc68k_reduced() sc.tl.rank_genes_groups(adata, 'bulk_labels', method='wilcoxon', pts=True) @@ -1504,7 +1509,7 @@ def test_filter_rank_genes_groups_plots(tmpdir, plot, check_same_image): def test_scrublet_plots(image_comparer, plt): save_and_compare_images = image_comparer(ROOT, FIGS, tol=30) - adata = sc.datasets.pbmc3k() + adata = pbmc3k() sc.external.pp.scrublet(adata, use_approx_neighbors=False) sc.external.pl.scrublet_score_distribution(adata, return_fig=True) diff --git a/scanpy/tests/test_preprocessing.py b/scanpy/tests/test_preprocessing.py index b9decb0579..e3e6712400 100644 --- a/scanpy/tests/test_preprocessing.py +++ b/scanpy/tests/test_preprocessing.py @@ -11,6 +11,7 @@ from anndata.tests.helpers import assert_equal, asarray from scanpy.tests.helpers import check_rep_mutation, check_rep_results +from scanpy.tests._data._cached_datasets import pbmc68k_reduced def test_log1p(tmp_path): @@ -115,7 +116,7 @@ def test_subsample_copy(): def test_scale(): - adata = sc.datasets.pbmc68k_reduced() + adata = pbmc68k_reduced() adata.X = adata.raw.X v = adata[:, 0 : adata.shape[1] // 2] # Should turn view to copy https://github.com/theislab/anndata/issues/171#issuecomment-508689965 @@ -336,7 +337,7 @@ def test_downsample_total_counts(count_matrix_format, replace, dtype): def test_recipe_weinreb(): # Just tests for failure for now - adata = sc.datasets.pbmc68k_reduced().raw.to_adata() + adata = pbmc68k_reduced().raw.to_adata() adata.X = adata.X.toarray() orig = adata.copy() diff --git a/scanpy/tests/test_queries.py b/scanpy/tests/test_queries.py index e4ef9cc69d..09c9aef0db 100644 --- a/scanpy/tests/test_queries.py +++ b/scanpy/tests/test_queries.py @@ -1,11 +1,12 @@ import pandas as pd import pytest import scanpy as sc +from scanpy.tests._data._cached_datasets import pbmc68k_reduced @pytest.mark.internet def 
test_enrich(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() sc.tl.rank_genes_groups(pbmc, "louvain", n_genes=pbmc.shape[1]) enrich_anndata = sc.queries.enrich(pbmc, "1") de = pd.DataFrame() @@ -29,7 +30,7 @@ def test_enrich(): @pytest.mark.internet def test_mito_genes(): - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() mt_genes = sc.queries.mitochondrial_genes("hsapiens") assert ( pbmc.var_names.isin(mt_genes["external_gene_name"]).sum() == 1 diff --git a/scanpy/tests/test_rank_genes_groups.py b/scanpy/tests/test_rank_genes_groups.py index febe80a4b4..9daa73de9c 100644 --- a/scanpy/tests/test_rank_genes_groups.py +++ b/scanpy/tests/test_rank_genes_groups.py @@ -16,7 +16,7 @@ from scanpy.tools import rank_genes_groups from scanpy.tools._rank_genes_groups import _RankGenes from scanpy.get import rank_genes_groups_df -from scanpy.datasets import pbmc68k_reduced +from scanpy.tests._data._cached_datasets import pbmc68k_reduced from scanpy._utils import select_groups @@ -216,12 +216,12 @@ def test_results_layers(): def test_rank_genes_groups_use_raw(): # https://github.com/theislab/scanpy/issues/1929 - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() assert pbmc.raw is not None sc.tl.rank_genes_groups(pbmc, groupby="bulk_labels", use_raw=True) - pbmc = sc.datasets.pbmc68k_reduced() + pbmc = pbmc68k_reduced() del pbmc.raw assert pbmc.raw is None diff --git a/scanpy/tests/test_score_genes.py b/scanpy/tests/test_score_genes.py index 6f38237f86..56a9e4ac41 100644 --- a/scanpy/tests/test_score_genes.py +++ b/scanpy/tests/test_score_genes.py @@ -5,6 +5,7 @@ import pytest import pickle from pathlib import Path +from scanpy.tests._data._cached_datasets import paul15 HERE = Path(__file__).parent / Path('_data/') @@ -54,7 +55,7 @@ def test_score_with_reference(): and stored as a pickle object in ./data """ - adata = sc.datasets.paul15() + adata = paul15() sc.pp.normalize_per_cell(adata, counts_per_cell_after=10000) 
sc.pp.scale(adata) From bdd37cdf329d81157e271561a6028fe956f260a8 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 28 Feb 2022 15:43:32 +0100 Subject: [PATCH 82/96] fix precommit --- scanpy/experimental/pp/_highly_variable_genes.py | 2 +- scanpy/experimental/pp/_normalization.py | 2 +- scanpy/tests/test_normalization.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py index e03beae206..9bf5bfbd06 100644 --- a/scanpy/experimental/pp/_highly_variable_genes.py +++ b/scanpy/experimental/pp/_highly_variable_genes.py @@ -121,7 +121,7 @@ def _highly_variable_pearson_residuals( stop = start + chunksize mu = np.array(sums_cells @ sums_genes[:, start:stop] / sum_total) X_dense = X_batch[:, start:stop].toarray() - residuals = (X_dense - mu) / np.sqrt(mu + mu ** 2 / theta) + residuals = (X_dense - mu) / np.sqrt(mu + mu**2 / theta) residuals = np.clip(residuals, a_min=-clip, a_max=clip) residual_gene_var[start:stop] = np.var(residuals, axis=0) diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py index ef0ca75b65..d076f03ffd 100644 --- a/scanpy/experimental/pp/_normalization.py +++ b/scanpy/experimental/pp/_normalization.py @@ -54,7 +54,7 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False): mu = np.array(sums_cells @ sums_genes / sum_total) diff = np.array(X - mu) - residuals = diff / np.sqrt(mu + mu ** 2 / theta) + residuals = diff / np.sqrt(mu + mu**2 / theta) # clip residuals = np.clip(residuals, a_min=-clip, a_max=clip) diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index 21588a2610..8a3fc9d357 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -118,7 +118,7 @@ def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip): residuals_reference = (X - mu) / np.sqrt(mu) else: # NB case - residuals_reference = 
(X - mu) / np.sqrt(mu + mu ** 2 / theta) + residuals_reference = (X - mu) / np.sqrt(mu + mu**2 / theta) # compute output to test adata = AnnData(sparsity_func(X), dtype=dtype) From aba3906d3959cc397fee1264e530300b36b162cf Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 28 Feb 2022 16:25:35 +0100 Subject: [PATCH 83/96] fix docs --- scanpy/experimental/_docs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scanpy/experimental/_docs.py b/scanpy/experimental/_docs.py index ad05c56c7e..40dd6f119a 100644 --- a/scanpy/experimental/_docs.py +++ b/scanpy/experimental/_docs.py @@ -18,8 +18,7 @@ * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \ where n is the number of cells in the dataset (default behavior). * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ - `clip=np.Inf` for no clipping.0 - + `clip=np.Inf` for no clipping. """ doc_layer = """\ From c9dbf489bb465b385958e2b4a7736b65c8a56b43 Mon Sep 17 00:00:00 2001 From: giovp Date: Wed, 9 Mar 2022 21:30:58 +0100 Subject: [PATCH 84/96] fix reference and add notebook to tutorials --- docs/references.rst | 4 ++++ docs/tutorials.rst | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/docs/references.rst b/docs/references.rst index 7c09607f44..e1adc6012c 100644 --- a/docs/references.rst +++ b/docs/references.rst @@ -268,3 +268,7 @@ References .. [Zunder15] Zunder *et al.* (2015), *A continuous molecular roadmap to iPSC reprogramming through progression analysis of single-cell mass cytometry*, `Cell Stem Cell `__. + +.. [Lause21] Lause *et al.* (2021), + *Analytic Pearson residuals for normalization of single-cell RNA-seq UMI data*, + `Genome Biology `__. diff --git a/docs/tutorials.rst b/docs/tutorials.rst index 8c304332c2..95f2527093 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -101,6 +101,11 @@ See the `cell cycle`_ notebook. 
:width: 120px :align: right +Normalization with Pearson Residuals +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Normalization of scRNA-seq data with Pearson Residuals, from [Lause21]_: :tutorial:`tutorial_pearson_residuals` + Scaling Computations ~~~~~~~~~~~~~~~~~~~~ From e335966b762c745937117e72786cba2401176714 Mon Sep 17 00:00:00 2001 From: giovp Date: Wed, 9 Mar 2022 21:33:02 +0100 Subject: [PATCH 85/96] add release note --- docs/release-notes/1.9.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/release-notes/1.9.0.rst b/docs/release-notes/1.9.0.rst index 3d1c73beda..a0061736c5 100644 --- a/docs/release-notes/1.9.0.rst +++ b/docs/release-notes/1.9.0.rst @@ -5,3 +5,4 @@ - :func:`~scanpy.tl.filter_rank_genes_groups` now allows to filter with absolute values of log fold change :pr:`1649` :smaller:`S Rybakov` - :func:`~scanpy.pl.embedding_density` now allows more than 10 groups :pr:`1936` :smaller:`A Wolf` +- :mod:`~scanpy.experimental` new Scanpy experimental module with Pearson Residuals method for normalization and HVG selection :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` From bf7fb25cd5f825f095dbdac7f2d820c27093099a Mon Sep 17 00:00:00 2001 From: giovp Date: Wed, 9 Mar 2022 21:43:14 +0100 Subject: [PATCH 86/96] add release note --- docs/release-notes/1.9.0.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/release-notes/1.9.0.rst b/docs/release-notes/1.9.0.rst index a0061736c5..842cf95ff2 100644 --- a/docs/release-notes/1.9.0.rst +++ b/docs/release-notes/1.9.0.rst @@ -5,4 +5,11 @@ - :func:`~scanpy.tl.filter_rank_genes_groups` now allows to filter with absolute values of log fold change :pr:`1649` :smaller:`S Rybakov` - :func:`~scanpy.pl.embedding_density` now allows more than 10 groups :pr:`1936` :smaller:`A Wolf` -- :mod:`~scanpy.experimental` new Scanpy experimental module with Pearson Residuals method for normalization and HVG selection :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` + +.. 
rubric:: Experimental module + +- Added :mod:`scanpy.experimental` module! + + - Added :func:`scanpy.experimental.pp.normalization_pearson_residuals` for Perason Residuals normalization :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` + - Added :func:`scanpy.experimental.pp.highly_variable_genes` for HVG selection with Pearson Residuals :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` + - Added :func:`scanpy.experimental.pp.recipe_pearson_residuals` for Pearson Residuals normalization, HVG selection and dimensionality reduction with PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` From 1045d984973eed5b6b48c953caea909739e2a074 Mon Sep 17 00:00:00 2001 From: giovp Date: Wed, 9 Mar 2022 21:45:00 +0100 Subject: [PATCH 87/96] fix release note --- docs/release-notes/1.9.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release-notes/1.9.0.rst b/docs/release-notes/1.9.0.rst index 842cf95ff2..304341e220 100644 --- a/docs/release-notes/1.9.0.rst +++ b/docs/release-notes/1.9.0.rst @@ -10,6 +10,6 @@ - Added :mod:`scanpy.experimental` module! 
- - Added :func:`scanpy.experimental.pp.normalization_pearson_residuals` for Perason Residuals normalization :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` + - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals` for Perason Residuals normalization :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` - Added :func:`scanpy.experimental.pp.highly_variable_genes` for HVG selection with Pearson Residuals :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` - Added :func:`scanpy.experimental.pp.recipe_pearson_residuals` for Pearson Residuals normalization, HVG selection and dimensionality reduction with PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` From f7d4c49cf3a38d498a4bde120b55c017d0d98a3b Mon Sep 17 00:00:00 2001 From: giovp Date: Wed, 9 Mar 2022 21:45:23 +0100 Subject: [PATCH 88/96] typo --- docs/release-notes/1.9.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release-notes/1.9.0.rst b/docs/release-notes/1.9.0.rst index 304341e220..d225cf09f2 100644 --- a/docs/release-notes/1.9.0.rst +++ b/docs/release-notes/1.9.0.rst @@ -10,6 +10,6 @@ - Added :mod:`scanpy.experimental` module! 
- - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals` for Perason Residuals normalization :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` + - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals` for Pearson Residuals normalization :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` - Added :func:`scanpy.experimental.pp.highly_variable_genes` for HVG selection with Pearson Residuals :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` - Added :func:`scanpy.experimental.pp.recipe_pearson_residuals` for Pearson Residuals normalization, HVG selection and dimensionality reduction with PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` From 5f76cdfb4bd94760d406c7bed7619df6d538363b Mon Sep 17 00:00:00 2001 From: giovp Date: Wed, 9 Mar 2022 22:05:31 +0100 Subject: [PATCH 89/96] remove duplicate reference --- docs/references.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/references.rst b/docs/references.rst index e1adc6012c..7c09607f44 100644 --- a/docs/references.rst +++ b/docs/references.rst @@ -268,7 +268,3 @@ References .. [Zunder15] Zunder *et al.* (2015), *A continuous molecular roadmap to iPSC reprogramming through progression analysis of single-cell mass cytometry*, `Cell Stem Cell `__. - -.. [Lause21] Lause *et al.* (2021), - *Analytic Pearson residuals for normalization of single-cell RNA-seq UMI data*, - `Genome Biology `__. 
From 19b018cf79d74fc938eff201596ee18f79b04268 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Sat, 12 Mar 2022 13:53:31 +0100 Subject: [PATCH 90/96] fixing black flake etc requirements --- scanpy/experimental/_docs.py | 36 ++++++--- .../experimental/pp/_highly_variable_genes.py | 17 +++-- scanpy/experimental/pp/_normalization.py | 75 +++++++++---------- scanpy/experimental/pp/_recipes.py | 25 +++---- 4 files changed, 84 insertions(+), 69 deletions(-) diff --git a/scanpy/experimental/_docs.py b/scanpy/experimental/_docs.py index 40dd6f119a..d040bd1262 100644 --- a/scanpy/experimental/_docs.py +++ b/scanpy/experimental/_docs.py @@ -10,21 +10,24 @@ doc_dist_params = """\ theta The negative binomial overdispersion parameter theta for Pearson residuals. - Higher values correspond to less overdispersion (var = mean + mean^2/theta), - and `theta=np.Inf` corresponds to a Poisson model. + Higher values correspond to less overdispersion \ + (`var = mean + mean^2/theta`), and `theta=np.Inf` corresponds to a Poisson model. clip Determines if and how residuals are clipped: - * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \ - where n is the number of cells in the dataset (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ + * If `None`, residuals are clipped to the interval \ + `[-sqrt(n_obs), sqrt(n_obs)]`, where `n_obs` is the number of cells in the dataset (default behavior). + * If any scalar `c`, residuals are clipped to the interval `[-c, c]`. Set \ `clip=np.Inf` for no clipping. """ -doc_layer = """\ +doc_check_values = """\ check_values - Check if counts in selected layer are integers. A Warning is returned if set to - True. + Check if counts in selected layer are integers. A warning is returned if set to + `True`. +""" + +doc_layer = """\ layer Layer to normalize instead of `X`. If `None`, `X` is normalized. """ @@ -52,13 +55,24 @@ the required memory. 
""" +doc_pca_chunk = """\ +n_comps + Number of principal components to compute in the PCA step. +random_state + Change to use different initial states for the optimization in the PCA step. +kwargs_pca + Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`. +""" + doc_inplace = """\ inplace - Whether to update `adata` or return dictionary with normalized copies - of `adata.X` and `adata.layers`. + If `True`, update `adata` with results. Otherwise, return results. See below for + details of what is returned. """ doc_copy = """\ copy - Whether to modify copied input object. Not compatible with `inplace=False`. + If `True`, the function runs on a copy of the input object and returns the + modified copy. Otherwise, the input object is modified directly. Not compatible + with `inplace=False`. """ diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py index 9bf5bfbd06..1becb2a1d1 100644 --- a/scanpy/experimental/pp/_highly_variable_genes.py +++ b/scanpy/experimental/pp/_highly_variable_genes.py @@ -21,6 +21,7 @@ doc_adata, doc_dist_params, doc_genes_batch_chunk, + doc_check_values, doc_layer, doc_copy, doc_inplace, @@ -44,8 +45,8 @@ def _highly_variable_pearson_residuals( Returns ------- - Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) - or updates `.var` with the following fields: + If `inplace=True`, `adata.var` is updated with the following fields. Otherwise, + returns the same fields as :class:`~pandas.DataFrame`. highly_variable : bool boolean indicator of highly-variable genes @@ -216,6 +217,7 @@ def _highly_variable_pearson_residuals( adata=doc_adata, dist_params=doc_dist_params, genes_batch_chunk=doc_genes_batch_chunk, + check_values=doc_check_values, layer=doc_layer, inplace=doc_inplace, ) @@ -237,9 +239,9 @@ def highly_variable_genes( Annotate highly variable genes using analytic Pearson residuals [Lause21]_.
For [Lause21]_, Pearson residuals of a negative binomial offset model (with - overdispersion theta shared across genes) are computed. By default, overdispersion - theta=100 is used and residuals are clipped to sqrt(n). Finally, genes are ranked - by residual variance. + overdispersion theta shared across genes) are computed. By default, + overdispersion `theta=100` is used and residuals are clipped to `sqrt(n_obs)`. + Finally, genes are ranked by residual variance. Expects raw count input. @@ -251,6 +253,7 @@ def highly_variable_genes( flavor Choose the flavor for identifying highly variable genes. In this experimental version, only 'pearson_residuals' is functional. + {check_values} {layer} subset Inplace subset to highly-variable genes if `True` otherwise merely indicate @@ -259,8 +262,8 @@ def highly_variable_genes( Returns ------- - Depending on `inplace` returns calculated metrics (:class:`~pandas.DataFrame`) or - updates `.var` with the following fields + If `inplace=True`, `adata.var` is updated with the following fields. Otherwise, + returns the same fields as :class:`~pandas.DataFrame`. highly_variable : bool boolean indicator of highly-variable genes diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py index d076f03ffd..ab26ae2af3 100644 --- a/scanpy/experimental/pp/_normalization.py +++ b/scanpy/experimental/pp/_normalization.py @@ -16,8 +16,10 @@ doc_adata, doc_dist_params, doc_layer, + doc_check_values, doc_copy, doc_inplace, + doc_pca_chunk, ) @@ -65,6 +67,7 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False): @_doc_params( adata=doc_adata, dist_params=doc_dist_params, + check_values=doc_check_values, layer=doc_layer, inplace=doc_inplace, copy=doc_copy, @@ -83,8 +86,8 @@ def normalize_pearson_residuals( Applies analytic Pearson residual normalization, based on [Lause21]_. The residuals are based on a negative binomial offset model with overdispersion - `theta` shared across genes. 
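By default, residuals are clipped to sqrt(n) and - overdispersion `theta=100` is used. + `theta` shared across genes. By default, residuals are clipped to `sqrt(n_obs)` + and overdispersion `theta=100` is used. Expects raw count input. @@ -92,15 +95,24 @@ def normalize_pearson_residuals( ------ {adata} {dist_params} + {check_values} {layer} {inplace} {copy} Returns ------- - Returns dictionary with Pearson residuals and settings - or updates `adata` with normalized version of the original - `adata.X` and `adata.layers`, depending on `inplace`. + If `inplace=True`, `adata.X` or the selected layer in `adata.layers` is updated + with the normalized values. `adata.uns` is updated with the following fields. + If `inplace=False`, the same fields are returned as dictionary with the + normalized values in `results_dict['X']`. + + `.uns['pearson_residuals_normalization']['theta']` + The used value of the overdispersion parameter theta + `.uns['pearson_residuals_normalization']['clip']` + The used value of the clipping parameter + `.uns['pearson_residuals_normalization']['computed_on']` + The name of the layer on which the residuals were computed. """ if copy: @@ -132,6 +144,13 @@ def normalize_pearson_residuals( return results_dict +@_doc_params( + adata=doc_adata, + dist_params=doc_dist_params, + pca_chunk=doc_pca_chunk, + check_values=doc_check_values, + inplace=doc_inplace, +) def normalize_pearson_residuals_pca( adata: AnnData, *, @@ -143,12 +162,12 @@ def normalize_pearson_residuals_pca( use_highly_variable: Optional[bool] = None, check_values: bool = True, inplace: bool = True, -) -> Optional[pd.DataFrame]: +) -> Optional[AnnData]: """\ Applies analytic Pearson residual normalization and PCA, based on [Lause21]_. The residuals are based on a negative binomial offset model with overdispersion - `theta` shared across genes. By default, residuals are clipped to sqrt(n), + `theta` shared across genes. By default, residuals are clipped to `sqrt(n_obs)`, overdispersion `theta=100` is used, and PCA is run with 50 components. Operates on the subset of highly variable genes in `adata.var['highly_variable']` @@ -157,42 +176,22 @@ def normalize_pearson_residuals_pca( Params ------ - adata - The annotated data matrix of shape `n_obs` × `n_vars`. - Rows correspond to cells and columns to genes. - theta - The negative binomial overdispersion parameter theta for Pearson residuals. - Higher values correspond to less overdispersion (var = mean + mean^2/theta), - and `theta=np.Inf` corresponds to a Poisson model. - clip - Determines if and how residuals are clipped: - - * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \ - where n is the number of cells in the dataset (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ - `clip=np.Inf` for no clipping. - - n_comps - Number of principal components to compute for the PCA step. - random_state - Change to use different initial states for the optimization of the PCA step. - kwargs_pca - Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`. + {adata} + {dist_params} + {pca_chunk} use_highly_variable - Whether to use the gene selection in `adata.var['highly_variable']` to subset - the data before normalizing (default) or proceed on the full dataset. - check_values - Check if counts in selected layer are integers. A Warning is returned if set to - True. - inplace - Whether to place results in `adata` or return them. + If `True`, use gene selection present in `adata.var['highly_variable']` to + subset the data before normalizing (default). Otherwise, proceed on the full + dataset. + {check_values} + {inplace} Returns ------- - If `inplace=False`, returns the Pearson residual-based PCA results - (`adata_pca`).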
By default, residuals are clipped to `sqrt(n_obs)`, overdispersion `theta=100` is used, and PCA is run with 50 components. Operates on the subset of highly variable genes in `adata.var['highly_variable']` @@ -157,42 +176,22 @@ def normalize_pearson_residuals_pca( Params ------ - adata - The annotated data matrix of shape `n_obs` × `n_vars`. - Rows correspond to cells and columns to genes. - theta - The negative binomial overdispersion parameter theta for Pearson residuals. - Higher values correspond to less overdispersion (var = mean + mean^2/theta), - and `theta=np.Inf` corresponds to a Poisson model. - clip - Determines if and how residuals are clipped: - - * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \ - where n is the number of cells in the dataset (default behavior). - * If any scalar c, residuals are clipped to the interval [-c, c]. Set \ - `clip=np.Inf` for no clipping. - - n_comps - Number of principal components to compute for the PCA step. - random_state - Change to use different initial states for the optimization of the PCA step. - kwargs_pca - Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`. + {adata} + {dist_params} + {pca_chunk} use_highly_variable - Whether to use the gene selection in `adata.var['highly_variable']` to subset - the data before normalizing (default) or proceed on the full dataset. - check_values - Check if counts in selected layer are integers. A Warning is returned if set to - True. - inplace - Whether to place results in `adata` or return them. + If `True`, use gene selection present in `adata.var['highly_variable']` to + subset the data before normalizing (default). Otherwise, proceed on the full + dataset. + {check_values} + {inplace} Returns ------- - If `inplace=False`, returns the Pearson residual-based PCA results - (`adata_pca`). 
- If `inplace=True`, updates `adata` with the following fields: + If `inplace=False`, returns the Pearson residual-based PCA results (`adata_pca`, + :class:`~anndata.AnnData`). If `inplace=True`, updates `adata` with the following + fields: `.uns['pearson_residuals_normalization']['pearson_residuals_df']` The hvg-subset, normalized by Pearson residuals diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index 76336a1e4d..b2ccbf6335 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -8,7 +8,9 @@ doc_adata, doc_dist_params, doc_genes_batch_chunk, + doc_pca_chunk, doc_layer, + doc_check_values, doc_inplace, ) from scanpy._utils import _doc_params @@ -18,7 +20,8 @@ adata=doc_adata, dist_params=doc_dist_params, genes_batch_chunk=doc_genes_batch_chunk, - layer=doc_layer, + pca_chunk=doc_pca_chunk, + check_values=doc_check_values, inplace=doc_inplace, ) def recipe_pearson_residuals( @@ -34,7 +37,7 @@ def recipe_pearson_residuals( kwargs_pca: dict = {}, check_values: bool = True, inplace: bool = True, -) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]: +) -> Optional[Tuple[AnnData, pd.DataFrame]]: """\ Gene selection and normalization based on [Lause21]_. @@ -49,20 +52,16 @@ def recipe_pearson_residuals( {adata} {dist_params} {genes_batch_chunk} - n_comps - Number of principal components to compute in the PCA step. - random_state - Change to use different initial states for the optimization in the PCA step. - kwargs_pca - Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`. - {layer} + {pca_chunk} + {check_values} {inplace} Returns ------- - If `inplace=False`, separately returns the gene selection results (`hvg`) - and Pearson residual-based PCA results (`adata_pca`). 
If `inplace=True`, - updates `adata` with the following fields for gene selection results: + If `inplace=False`, separately returns the gene selection results (`hvg`, + :class:`~pandas.DataFrame`) and Pearson residual-based PCA results (`adata_pca`, + :class:`~anndata.AnnData`). If `inplace=True`, updates `adata` with the + following fields for gene selection results: `.var['highly_variable']` : bool boolean indicator of highly-variable genes. @@ -83,7 +82,7 @@ def recipe_pearson_residuals( If batch_key is given, this denotes the genes that are highly variable in all batches. - …and the following fields for Pearson residual-based PCA results and + The following fields contain Pearson residual-based PCA results and normalization settings: `.uns['pearson_residuals_normalization']['pearson_residuals_df']` From ce9ee43d43b53205717fa29352510f0603ca50b9 Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Sun, 13 Mar 2022 00:42:16 +0100 Subject: [PATCH 91/96] add _pca function to release note --- docs/release-notes/1.9.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/release-notes/1.9.0.rst b/docs/release-notes/1.9.0.rst index d225cf09f2..51963e6f72 100644 --- a/docs/release-notes/1.9.0.rst +++ b/docs/release-notes/1.9.0.rst @@ -12,4 +12,5 @@ - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals` for Pearson Residuals normalization :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` - Added :func:`scanpy.experimental.pp.highly_variable_genes` for HVG selection with Pearson Residuals :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` + - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals_pca` for Pearson Residuals normalization and dimensionality reduction with PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` - Added :func:`scanpy.experimental.pp.recipe_pearson_residuals` for Pearson Residuals normalization, HVG selection and dimensionality reduction with PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` From 
7ffdec3b391d039dd9991788232f3f534e614dee Mon Sep 17 00:00:00 2001 From: Jan Lause Date: Sun, 13 Mar 2022 01:43:05 +0100 Subject: [PATCH 92/96] last edits to docs --- scanpy/experimental/_docs.py | 11 ++++---- .../experimental/pp/_highly_variable_genes.py | 28 +++++++++---------- scanpy/experimental/pp/_normalization.py | 17 ++++++----- scanpy/experimental/pp/_recipes.py | 8 +++--- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/scanpy/experimental/_docs.py b/scanpy/experimental/_docs.py index d040bd1262..a1408adf01 100644 --- a/scanpy/experimental/_docs.py +++ b/scanpy/experimental/_docs.py @@ -9,7 +9,7 @@ doc_dist_params = """\ theta - The negative binomial overdispersion parameter theta for Pearson residuals. + The negative binomial overdispersion parameter `theta` for Pearson residuals. Higher values correspond to less overdispersion \ (`var = mean + mean^2/theta`), and `theta=np.Inf` corresponds to a Poisson model. clip @@ -23,13 +23,14 @@ doc_check_values = """\ check_values - Check if counts in selected layer are integers. A warning is returned if set to - `True`. + If `True`, checks if counts in selected layer are integers as expected by this + function, and returns a warning if non-integers are found. Otherwise, proceeds + without checking. Setting this to `False` can speed up code for large datasets. """ doc_layer = """\ layer - Layer to normalize instead of `X`. If `None`, `X` is normalized. + Layer to use as input instead of `X`. If `None`, `X` is used. """ doc_subset = """\ @@ -59,7 +60,7 @@ n_comps Number of principal components to compute in the PCA step. random_state - Change to use different initial states for the optimization in the PCA step. + Random seed for setting the initial states for the optimization in the PCA step. kwargs_pca Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`.
""" diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py index 1becb2a1d1..166b6e06df 100644 --- a/scanpy/experimental/pp/_highly_variable_genes.py +++ b/scanpy/experimental/pp/_highly_variable_genes.py @@ -236,12 +236,12 @@ def highly_variable_genes( inplace: bool = True, ) -> Optional[pd.DataFrame]: """\ - Annotate highly variable genes using analytic Pearson residuals [Lause21]_. + Select highly variable genes using analytic Pearson residuals [Lause21]_. - For [Lause21]_, Pearson residuals of a negative binomial offset model (with - overdispersion theta shared across genes) are computed. By default, - overdispersion `theta=100` is used and residuals are clipped to `sqrt(n_obs)`. - Finally, genes are ranked by residual variance. + In [Lause21]_, Pearson residuals of a negative binomial offset model are computed + (with overdispersion `theta` shared across genes). By default, overdispersion + `theta=100` is used and residuals are clipped to `sqrt(n_obs)`. Finally, genes + are ranked by residual variance. Expects raw count input. @@ -256,8 +256,8 @@ def highly_variable_genes( {check_values} {layer} subset - Inplace subset to highly-variable genes if `True` otherwise merely indicate - highly variable genes. + If `True`, subset the data to highly-variable genes after finding them. + Otherwise merely indicate highly variable genes in `adata.var` (see below). {inplace} Returns @@ -266,21 +266,21 @@ def highly_variable_genes( returns the same fields as :class:`~pandas.DataFrame`. highly_variable : bool - boolean indicator of highly-variable genes + boolean indicator of highly-variable genes. means : float - means per gene + means per gene. variances : float - variance per gene + variance per gene. residual_variances : float For `flavor='pearson_residuals'`, residual variance per gene. Averaged in the case of multiple batches. 
highly_variable_rank : float - For `flavor='pearson_residuals'`, rank of the gene according to residual - variance, median rank in the case of multiple batches + For `flavor='pearson_residuals'`, rank of the gene according to residual + variance, median rank in the case of multiple batches. highly_variable_nbatches : int - If `batch_key` given, denotes in how many batches genes are detected as HVG + If `batch_key` given, denotes in how many batches genes are detected as HVG. highly_variable_intersection : bool - If `batch_key` given, denotes the genes that are highly variable in all batches + If `batch_key` given, denotes the genes that are highly variable in all batches. Notes ----- diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py index ab26ae2af3..4445b55d21 100644 --- a/scanpy/experimental/pp/_normalization.py +++ b/scanpy/experimental/pp/_normalization.py @@ -108,9 +108,9 @@ def normalize_pearson_residuals( normalized values in `results_dict['X']`. `.uns['pearson_residuals_normalization']['theta']` - The used value of the overdisperion parameter theta + The used value of the overdispersion parameter theta. `.uns['pearson_residuals_normalization']['clip']` - The used value of the clipping parameter + The used value of the clipping parameter. `.uns['pearson_residuals_normalization']['computed_on']` The name of the layer on which the residuals were computed. """ @@ -180,7 +180,7 @@ def normalize_pearson_residuals_pca( {dist_params} {pca_chunk} use_highly_variable - If `True`, use gene selection present in `adata.var['highly_variable']` to + If `True`, uses gene selection present in `adata.var['highly_variable']` to subset the data before normalizing (default). Otherwise, proceed on the full dataset. {check_values} @@ -189,16 +189,15 @@ def normalize_pearson_residuals_pca( Returns ------- - If `inplace=False`, returns the Pearson residual-based PCA results (`adata_pca`, - :class:`~anndata.AnnData`).
If `inplace=True`, updates `adata` with the following - fields: + If `inplace=False`, returns the Pearson residual-based PCA results (as :class:`~anndata.AnnData` + object). If `inplace=True`, updates `adata` with the following fields: `.uns['pearson_residuals_normalization']['pearson_residuals_df']` - The hvg-subset, normalized by Pearson residuals + The subset of highly variable genes, normalized by Pearson residuals. `.uns['pearson_residuals_normalization']['theta']` - The used value of the overdisperion parameter theta + The used value of the overdispersion parameter theta. `.uns['pearson_residuals_normalization']['clip']` - The used value of the clipping parameter + The used value of the clipping parameter. `.obsm['X_pca']` PCA representation of data after gene selection (if applicable) and Pearson diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index b2ccbf6335..7f11beaeac 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -39,7 +39,7 @@ def recipe_pearson_residuals( inplace: bool = True, ) -> Optional[Tuple[AnnData, pd.DataFrame]]: """\ - Gene selection and normalization based on [Lause21]_. + Full pipeline for HVG selection and normalization by analytic Pearson residuals ([Lause21]_). Applies gene selection based on Pearson residuals. On the resulting subset, Pearson residual normalization and PCA are performed. @@ -58,8 +58,8 @@ def recipe_pearson_residuals( Returns ------- - If `inplace=False`, separately returns the gene selection results (`hvg`, - :class:`~pandas.DataFrame`) and Pearson residual-based PCA results (`adata_pca`, + If `inplace=False`, separately returns the gene selection results (as + :class:`~pandas.DataFrame`) and Pearson residual-based PCA results (as :class:`~anndata.AnnData`).
If `inplace=True`, updates `adata` with the following fields for gene selection results: @@ -86,7 +86,7 @@ def recipe_pearson_residuals( normalization settings: `.uns['pearson_residuals_normalization']['pearson_residuals_df']` - The hvg-subset, normalized by Pearson residuals. + The subset of highly variable genes, normalized by Pearson residuals. `.uns['pearson_residuals_normalization']['theta']` The used value of the overdisperion parameter theta. `.uns['pearson_residuals_normalization']['clip']` From a0aaf966b854285463a0fd90483005337d367d19 Mon Sep 17 00:00:00 2001 From: giovp Date: Wed, 16 Mar 2022 19:37:29 +0100 Subject: [PATCH 93/96] fix release and tutorial image --- docs/release-notes/1.9.0.rst | 1 + docs/tutorials.rst | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/release-notes/1.9.0.rst b/docs/release-notes/1.9.0.rst index 51963e6f72..634199b464 100644 --- a/docs/release-notes/1.9.0.rst +++ b/docs/release-notes/1.9.0.rst @@ -11,6 +11,7 @@ - Added :mod:`scanpy.experimental` module! 
- Added :func:`scanpy.experimental.pp.normalize_pearson_residuals` for Pearson Residuals normalization :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` + - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals_pca` for Pearson Residuals normalization and PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` - Added :func:`scanpy.experimental.pp.highly_variable_genes` for HVG selection with Pearson Residuals :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals_pca` for Pearson Residuals normalization and dimensionality reduction with PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` - Added :func:`scanpy.experimental.pp.recipe_pearson_residuals` for Pearson Residuals normalization, HVG selection and dimensionality reduction with PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` diff --git a/docs/tutorials.rst b/docs/tutorials.rst index 95f2527093..5412675b6b 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -97,15 +97,14 @@ See the `cell cycle`_ notebook. .. _cell cycle: https://nbviewer.jupyter.org/github/theislab/scanpy_usage/blob/master/180209_cell_cycle/cell_cycle.ipynb -.. image:: _static/img/tutorials/170522_visualizing_one_million_cells/tsne_1.3M.png - :width: 120px - :align: right - Normalization with Pearson Residuals ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Normalization of scRNA-seq data with Pearson Residuals, from [Lause21]_: :tutorial:`tutorial_pearson_residuals` +.. 
image:: _static/img/tutorials/170522_visualizing_one_million_cells/tsne_1.3M.png + :width: 120px + :align: right Scaling Computations ~~~~~~~~~~~~~~~~~~~~ From ad81e29dc26b53af44834b8df4ab0afa912dbe1c Mon Sep 17 00:00:00 2001 From: giovp Date: Wed, 16 Mar 2022 20:12:09 +0100 Subject: [PATCH 94/96] try fix pre-commit --- docs/release-notes/1.9.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/release-notes/1.9.0.rst b/docs/release-notes/1.9.0.rst index 756357394e..4ff60a7cd8 100644 --- a/docs/release-notes/1.9.0.rst +++ b/docs/release-notes/1.9.0.rst @@ -16,4 +16,3 @@ - Added :func:`scanpy.experimental.pp.highly_variable_genes` for HVG selection with Pearson Residuals :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals_pca` for Pearson Residuals normalization and dimensionality reduction with PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` - Added :func:`scanpy.experimental.pp.recipe_pearson_residuals` for Pearson Residuals normalization, HVG selection and dimensionality reduction with PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` - \ No newline at end of file From d74a0e6811b234b99acf32281a1ce30b2dc94f33 Mon Sep 17 00:00:00 2001 From: giovp Date: Wed, 16 Mar 2022 20:43:44 +0100 Subject: [PATCH 95/96] minor docs --- scanpy/experimental/pp/_normalization.py | 3 --- scanpy/experimental/pp/_recipes.py | 2 -- 2 files changed, 5 deletions(-) diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py index 4445b55d21..ee116c8df1 100644 --- a/scanpy/experimental/pp/_normalization.py +++ b/scanpy/experimental/pp/_normalization.py @@ -173,7 +173,6 @@ def normalize_pearson_residuals_pca( Operates on the subset of highly variable genes in `adata.var['highly_variable']` by default. Expects raw count input. 
- Params ------ {adata} @@ -186,7 +185,6 @@ def normalize_pearson_residuals_pca( {check_values} {inplace} - Returns ------- If `inplace=False`, returns the Pearson residual-based PCA results (as :class:`~anndata.AnnData` @@ -210,7 +208,6 @@ def normalize_pearson_residuals_pca( Ratio of explained variance. `.uns['pca']['variance']` Explained variance, equivalent to the eigenvalues of the covariance matrix. - """ # check if HVG selection is there if user wants to use it diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py index 7f11beaeac..5aba49345a 100644 --- a/scanpy/experimental/pp/_recipes.py +++ b/scanpy/experimental/pp/_recipes.py @@ -46,7 +46,6 @@ def recipe_pearson_residuals( Expects raw count input. - Params ------ {adata} @@ -102,7 +101,6 @@ def recipe_pearson_residuals( Ratio of explained variance. `.uns['pca']['variance']` Explained variance, equivalent to the eigenvalues of the covariance matrix. - """ hvg_args = dict( From 970b0fadba479e83b0644e45f873f2a677c86bf0 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 29 Mar 2022 17:51:29 +0200 Subject: [PATCH 96/96] Remove accidentally included files from merge --- docs/release-notes/1.9.0.rst | 18 ----- docs/tutorials.rst | 147 ----------------------------------- 2 files changed, 165 deletions(-) delete mode 100644 docs/release-notes/1.9.0.rst delete mode 100644 docs/tutorials.rst diff --git a/docs/release-notes/1.9.0.rst b/docs/release-notes/1.9.0.rst deleted file mode 100644 index 4ff60a7cd8..0000000000 --- a/docs/release-notes/1.9.0.rst +++ /dev/null @@ -1,18 +0,0 @@ -1.9.0 :small:`the future` -~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
rubric:: Features - -- :func:`~scanpy.tl.filter_rank_genes_groups` now allows to filter with absolute values of log fold change :pr:`1649` :smaller:`S Rybakov` -- :func:`~scanpy.pl.embedding_density` now allows more than 10 groups :pr:`1936` :smaller:`A Wolf` -- :func:`~scanpy.logging.print_versions` now uses `session_info` :pr:`2089` :smaller:`P Angerer` :smaller:`I Virshup` - -.. rubric:: Experimental module - -- Added :mod:`scanpy.experimental` module! - - - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals` for Pearson Residuals normalization :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` - - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals_pca` for Pearson Residuals normalization and PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` - - Added :func:`scanpy.experimental.pp.highly_variable_genes` for HVG selection with Pearson Residuals :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` - - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals_pca` for Pearson Residuals normalization and dimensionality reduction with PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` - - Added :func:`scanpy.experimental.pp.recipe_pearson_residuals` for Pearson Residuals normalization, HVG selection and dimensionality reduction with PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup` diff --git a/docs/tutorials.rst b/docs/tutorials.rst deleted file mode 100644 index 5412675b6b..0000000000 --- a/docs/tutorials.rst +++ /dev/null @@ -1,147 +0,0 @@ -Tutorials -========= - -Clustering ----------- - -For getting started, we recommend Scanpy’s reimplementation :tutorial:`pbmc3k` -of Seurat’s [Satija15]_ clustering tutorial for 3k PBMCs from 10x Genomics, -containing preprocessing, clustering and the identification of cell types via -known marker genes. - -.. image:: _static/img/tutorials/170505_seurat/filter_genes_dispersion.png - :width: 100px -.. image:: _static/img/tutorials/170505_seurat/louvain.png - :width: 100px -.. 
image:: _static/img/tutorials/170505_seurat/NKG7.png - :width: 100px -.. image:: _static/img/tutorials/170505_seurat/violin.png - :width: 100px -.. image:: _static/img/tutorials/170505_seurat/cell_types.png - :width: 200px - - -Visualization -------------- - -This tutorial shows how to visually explore genes using scanpy. :tutorial:`plotting/core` - -.. image:: _static/img/stacked_violin_dotplot_matrixplot.png - :width: 550px - - -Trajectory inference --------------------- - -Get started with the following example for hematopoiesis for data of [Paul15]_: :tutorial:`paga-paul15` - -.. image:: _static/img/tutorials/paga_paul15.png - :width: 450px - -More examples for trajectory inference on complex datasets can be found in the -`PAGA `_ repository [Wolf19]_, for instance, multi-resolution analyses of whole -animals, such as for planaria_ for data of [Plass18]_. - -.. image:: _static/img/tutorials/paga_planaria.png - :width: 350px - -As a reference for simple pseudotime analyses, we provide the diffusion pseudotime (DPT) analyses of [Haghverdi16]_ -for two hematopoiesis datasets: `DPT example 1`_ [Paul15]_ and `DPT example 2`_ [Moignard15]_. - -.. _planaria: https://nbviewer.jupyter.org/github/theislab/paga/blob/master/planaria/planaria.ipynb -.. _DPT example 1: https://nbviewer.jupyter.org/github/theislab/scanpy_usage/blob/master/170502_paul15/paul15.ipynb -.. _DPT example 2: https://nbviewer.jupyter.org/github/theislab/scanpy_usage/blob/master/170501_moignard15/moignard15.ipynb - - -Integrating datasets --------------------- - -Map labels and embeddings of reference data to new data: :tutorial:`integrating-data-using-ingest` - -.. image:: https://scanpy-tutorials.readthedocs.io/en/latest/_images/integrating-data-using-ingest_21_0.png - :width: 350px - - -Spatial data ------------- - -* Basic analysis of spatial data: :tutorial:`spatial/basic-analysis` -* Integrating spatial data with scRNA-seq using scanorama: :tutorial:`spatial/integration-scanorama` - -.. 
image:: _static/img/spatial-basic-analysis.png - :width: 250px - - -Further Tutorials ------------------ - -.. _conversion_to_r: - -Conversion: AnnData, SingleCellExperiment, and Seurat objects -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. image:: https://github.com/theislab/scanpy-in-R/raw/master/logo.png - :width: 200px - :align: right - -* See `Seurat to AnnData`_ for a tutorial on `anndata2ri`. -* See the `Scanpy in R`_ guide for a tutorial on interacting with Scanpy from R. - -.. _Seurat to AnnData: https://github.com/LuckyMD/Code_snippets/blob/master/Seurat_to_anndata.ipynb -.. _Scanpy in R: https://theislab.github.io/scanpy-in-R/ - -Regressing out cell cycle -~~~~~~~~~~~~~~~~~~~~~~~~~ - -See the `cell cycle`_ notebook. - -.. _cell cycle: https://nbviewer.jupyter.org/github/theislab/scanpy_usage/blob/master/180209_cell_cycle/cell_cycle.ipynb - -Normalization with Pearson Residuals -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Normalization of scRNA-seq data with Pearson Residuals, from [Lause21]_: :tutorial:`tutorial_pearson_residuals` - -.. image:: _static/img/tutorials/170522_visualizing_one_million_cells/tsne_1.3M.png - :width: 120px - :align: right - -Scaling Computations -~~~~~~~~~~~~~~~~~~~~ - -* Visualize and cluster `1.3M neurons`_ from 10x Genomics. - -.. _1.3M neurons: https://github.com/theislab/scanpy_usage/tree/master/170522_visualizing_one_million_cells - -Simulations -~~~~~~~~~~~ - -Simulating single cells using literature-curated gene regulatory networks [Wittmann09]_. - -.. image:: _static/img/tutorials/170430_krumsiek11/timeseries.png - :width: 200px - :align: right - -* Notebook for `myeloid differentiation`_ -* Notebook for simple toggleswitch_ - -.. _myeloid differentiation: https://nbviewer.jupyter.org/github/theislab/scanpy_usage/blob/master/170430_krumsiek11/krumsiek11.ipynb -.. 
_toggleswitch: https://nbviewer.jupyter.org/github/theislab/scanpy_usage/blob/master/170430_krumsiek11/toggleswitch.ipynb - -Images -~~~~~~ - -See pseudotime-time inference on deep-learning based features for `cell cycle reconstruction`_ from image data [Eulenberg17]_. - -.. _cell cycle reconstruction: https://github.com/theislab/scanpy_usage/tree/master/170529_images - -.. - User Examples - ~~~~~~~~~~~~~ - - January 12, 2018: `Exploring the mouse cell atlas`_ by `David P. Cook`_. - Data by `Tabula Muris Consortium`_. - - .. _Exploring the mouse cell atlas: https://github.com/dpcook/fun_analysis/blob/master/tabula_muris/mouse_atlas_scanpy.ipynb - .. _David P. Cook: https://twitter.com/DavidPCook - .. _Tabula Muris Consortium: https://www.biorxiv.org/content/early/2017/12/20/237446