From 12e751d9da9a8a7327b19985ce8bd64d8624a0e6 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Tue, 2 Mar 2021 20:10:35 +0100
Subject: [PATCH 01/96] adding core functions and documentation for pearson
 residual normalization and hvg selection

---
 scanpy/preprocessing/__init__.py              |   4 +-
 .../preprocessing/_highly_variable_genes.py   | 249 +++++++++++++++++-
 scanpy/preprocessing/_normalization.py        | 143 +++++++++-
 3 files changed, 383 insertions(+), 13 deletions(-)

diff --git a/scanpy/preprocessing/__init__.py b/scanpy/preprocessing/__init__.py
index 7c2c4d7aca..8bcad5ca03 100644
--- a/scanpy/preprocessing/__init__.py
+++ b/scanpy/preprocessing/__init__.py
@@ -7,6 +7,6 @@
 from ._pca import pca
 from ._qc import calculate_qc_metrics
 from ._combat import combat
-from ._normalization import normalize_total
+from ._normalization import normalize_total, normalize_pearson_residuals
 
-from ..neighbors import neighbors
+from ..neighbors import neighbors
\ No newline at end of file
diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 157322f516..3ae163b3a4 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -1,5 +1,6 @@
 import warnings
-from typing import Optional
+from typing import Optional, Union
+
 import numpy as np
 import pandas as pd
 import scipy.sparse as sp_sparse
@@ -174,6 +175,188 @@ def _highly_variable_genes_seurat_v3(
             df = df.drop(['highly_variable_nbatches'], axis=1)
         return df
 
+def _highly_variable_pearson_residuals(
+    adata: AnnData,
+    layer: Optional[str] = None,
+    n_top_genes: int = 2000,
+    batch_key: Optional[str] = None,
+    theta: float = 100,
+    clip: Union[Literal['auto', 'none'], float] = 'auto',
+    chunksize: int = 100,
+    subset: bool = False,
+    inplace: bool = True,
+) -> Optional[pd.DataFrame]:
+    """\
+    See `highly_variable_genes`.
+
+    Returns
+    -------
+    Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) or
+    updates `.var` with the following fields
+
+    highly_variable
+        boolean indicator of highly-variable genes
+    means
+        means per gene
+    variances
+        variances per gene 
+    residual_variances
+        Pearson residual variance per gene. Averaged in the case of multiple batches.
+    highly_variable_rank
+        Rank of the gene according to residual variance, median rank in the case of multiple batches
+    highly_variable_nbatches : int
+        If batch_key is given, this denotes in how many batches genes are detected as HVG
+    highly_variable_intersection : bool
+        If batch_key is given, this denotes the genes that are highly variable in all batches
+    """
+    
+    X = adata.layers[layer] if layer is not None else adata.X
+    
+    # Check for raw counts
+    if check_nonnegative_integers(X) is False:
+        raise ValueError(
+            "`pp.highly_variable_genes` with `flavor='pearson_residuals'` expects "
+            "raw count data."
+        )
+    
+    if batch_key is None:
+        batch_info = pd.Categorical(np.zeros(adata.shape[0], dtype=int))
+    else:
+        batch_info = adata.obs[batch_key].values
+    n_batches = len(np.unique(batch_info))
+        
+    # Get pearson residuals for each batch separately
+    residual_gene_vars = []
+    for batch in np.unique(batch_info):
+
+        adata_subset = adata[batch_info == batch]
+       
+        # Filter out zero genes
+        with settings.verbosity.override(Verbosity.error):
+            nonzero_genes = filter_genes(adata_subset, min_cells=1, inplace=False)[0]
+        adata_subset = adata_subset[:, nonzero_genes]
+
+
+        X_batch = adata_subset.layers[layer] if layer is not None else adata_subset.X 
+
+        # Prepare clipping
+        if clip == 'auto':
+            n = X_batch.shape[0]
+            clip = np.sqrt(n)
+        if clip < 0:
+            raise ValueError("Pearson residuals require `clip>=0` or `clip='auto'`.")
+
+        if sp_sparse.issparse(X_batch):
+            sums_genes = np.sum(X_batch, axis=0)
+            sums_cells = np.sum(X_batch, axis=1)
+            sum_total  = np.sum(sums_genes).squeeze()
+        else:
+            sums_genes = np.sum(X_batch, axis=0, keepdims=True)
+            sums_cells = np.sum(X_batch, axis=1, keepdims=True)
+            sum_total  = np.sum(sums_genes)
+
+        # Compute pearson residuals in chunks
+        residual_gene_var = np.ones((X_batch.shape[1]))*np.nan
+        for start in np.arange(0,X_batch.shape[1],chunksize):
+            stop = start + chunksize
+            mu = np.array(sums_cells @ sums_genes[:,start:stop] / sum_total)
+            X_dense = X_batch[:,start:stop].toarray()
+            residuals = (X_dense - mu) / np.sqrt(mu + mu**2/theta)
+            residuals = np.clip(residuals, a_min = -clip, a_max = clip)
+            residual_gene_var[start:stop] = np.var(residuals,axis=0)
+            
+        # Add 0 values for genes that were filtered out 
+        zero_gene_var = np.zeros(np.sum(~nonzero_genes))
+        residual_gene_var = np.concatenate((residual_gene_var,
+                                            zero_gene_var))
+        # Order as before filtering
+        idxs = np.concatenate((np.where(nonzero_genes)[0],
+                               np.where(~nonzero_genes)[0]))
+        residual_gene_var = residual_gene_var[np.argsort(idxs)]
+        residual_gene_vars.append(residual_gene_var.reshape(1, -1))
+
+    residual_gene_vars = np.concatenate(residual_gene_vars, axis=0)
+    
+    # Get cutoffs and define hvgs per batch
+    residual_gene_vars_sorted = np.sort(residual_gene_vars,axis=1)[:,::-1]
+    cutoffs_per_batch = residual_gene_vars_sorted[:,n_top_genes]
+    highly_variable_per_batch = np.greater(residual_gene_vars.T,cutoffs_per_batch).T
+    
+    # Merge hvgs across batches
+    highly_variable_nbatches = np.sum(highly_variable_per_batch,axis=0)
+    highly_variable_intersection = highly_variable_nbatches == n_batches
+    
+    # Get rank per gene within each batch
+    # argsort twice gives ranks, small rank means most variable
+    ranks_residual_var = np.argsort(np.argsort(-residual_gene_vars, axis=1), axis=1)
+    ranks_residual_var = ranks_residual_var.astype(np.float32)
+    ranks_residual_var[ranks_residual_var >= n_top_genes] = np.nan 
+    ranks_masked_array = np.ma.masked_invalid(ranks_residual_var)
+    # Median rank across batches, ignoring batches in which gene was not selected
+    medianrank_residual_var = np.ma.median(ranks_masked_array, axis=0).filled(np.nan) 
+    
+    means, variances = materialize_as_ndarray(_get_mean_var(X))
+    df = pd.DataFrame.from_dict(
+                dict(means=means,
+                     variances=variances,
+                     residual_variances=np.mean(residual_gene_vars,axis=0),
+                     highly_variable_rank=medianrank_residual_var,
+                     highly_variable_nbatches=highly_variable_nbatches,
+                     highly_variable_intersection=highly_variable_intersection,
+                )
+         )
+    df = df.set_index(adata.var_names)
+    
+    # Sort genes by how often they are selected as hvg within each batch and
+    # break ties with median rank of residual variance across batches
+    df.sort_values(
+                    ['highly_variable_nbatches', 'highly_variable_rank'],
+                    ascending=[False,True],
+                    na_position='last',
+                    inplace=True,
+                  )
+    df['highly_variable'] = False
+    df.highly_variable.iloc[:n_top_genes] = True
+    ## TODO: following line raises a pandas warning (also for flavor = seurat and cellranger..)
+    df = df.loc[adata.var_names]
+    
+    if inplace or subset:
+        adata.uns['hvg'] = {'flavor': 'pearson_residuals'}
+        logg.hint(
+            'added\n'
+            '    \'highly_variable\', boolean vector (adata.var)\n'
+            '    \'highly_variable_rank\', float vector (adata.var)\n'
+            '    \'highly_variable_nbatches\', int vector (adata.var)\n'
+            '    \'highly_variable_intersection\', boolean vector (adata.var)\n'
+            '    \'means\', float vector (adata.var)\n'
+            '    \'variances\', float vector (adata.var)\n'
+            '    \'residual_variances\', float vector (adata.var)'
+        )
+        adata.var['highly_variable'] = df['highly_variable'].values
+        adata.var['highly_variable_rank'] = df['highly_variable_rank'].values
+        adata.var['means'] = df['means'].values
+        adata.var['variances'] = df['variances'].values
+        adata.var['residual_variances'] = df['residual_variances'].values.astype(
+            'float64', copy=False
+        )
+        if batch_key is not None:
+            adata.var['highly_variable_nbatches'] = df[
+                'highly_variable_nbatches'
+            ].values
+            adata.var['highly_variable_intersection'] = df[
+                'highly_variable_intersection'
+            ].values
+        if subset:
+            adata._inplace_subset_var(df['highly_variable'].values)
+    else:
+        if batch_key is None:
+            df = df.drop(['highly_variable_nbatches',
+                          'highly_variable_intersection'],
+                         axis=1)
+        return df
+    
+    
+    
 
 def _highly_variable_genes_single_batch(
     adata: AnnData,
@@ -288,6 +471,7 @@ def _highly_variable_genes_single_batch(
     return df
 
 
+
 def highly_variable_genes(
     adata: AnnData,
     layer: Optional[str] = None,
@@ -298,7 +482,10 @@ def highly_variable_genes(
     max_mean: Optional[float] = 3,
     span: Optional[float] = 0.3,
     n_bins: int = 20,
-    flavor: Literal['seurat', 'cell_ranger', 'seurat_v3'] = 'seurat',
+    theta: float = 100,
+    clip: Union[Literal['auto', 'none'], float] = 'auto',
+    chunksize: int = 1000,
+    flavor: Literal['seurat', 'cell_ranger', 'seurat_v3', 'pearson_residuals'] = 'seurat',
     subset: bool = False,
     inplace: bool = True,
     batch_key: Optional[str] = None,
@@ -332,19 +519,24 @@ def highly_variable_genes(
     layer
         If provided, use `adata.layers[layer]` for expression values instead of `adata.X`.
     n_top_genes
-        Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'`.
+        Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or 
+        `flavor='pearson_residuals'`.
     min_mean
         If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
-        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`.
+        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or 
+        `flavor='pearson_residuals'`.
     max_mean
         If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
-        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`.
+        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or 
+        `flavor='pearson_residuals'`.
     min_disp
         If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
-        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`.
+        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or 
+        `flavor='pearson_residuals'`.
     max_disp
         If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
-        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`.
+        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or 
+        `flavor='pearson_residuals'`.
     span
         The fraction of the data (cells) used when estimating the variance in the loess
         model fit if `flavor='seurat_v3'`.
@@ -352,7 +544,24 @@ def highly_variable_genes(
         Number of bins for binning the mean gene expression. Normalization is
         done with respect to each bin. If just a single gene falls into a bin,
         the normalized dispersion is artificially set to 1. You'll be informed
-        about this if you set `settings.verbosity = 4`.
+        about this if you set `settings.verbosity = 4`. Ignored if 
+        `flavor='pearson_residuals'`.
+    theta
+        If `flavor='pearson_residuals'`, this is the NB overdispersion parameter theta.
+        Higher values correspond to less overdispersion (var = mean + mean^2/theta), and 
+        `theta=np.Inf` corresponds to a Poisson model.
+    clip
+        If `flavor='pearson_residuals'`, this determines if and how residuals are clipped:
+        
+        * If `'auto'`, residuals are clipped to the interval [-sqrt(n), sqrt(n)],
+        where n is the number of cells in the dataset (default behavior).
+        * If any scalar c, residuals are clipped to the interval [-c, c]. Set
+        `clip=np.Inf` for no clipping.
+        
+    chunksize
+        If `flavor='pearson_residuals'`, this determines how many genes are processed at
+        once while computing the residual variance. Choosing a smaller value will reduce 
+        the required memory.
     flavor
         Choose the flavor for identifying highly variable genes. For the dispersion
         based methods in their default workflows, Seurat passes the cutoffs whereas
@@ -368,11 +577,13 @@ def highly_variable_genes(
         lightweight batch correction method. For all flavors, genes are first sorted
         by how many batches they are a HVG. For dispersion-based flavors ties are broken
         by normalized dispersion. If `flavor = 'seurat_v3'`, ties are broken by the median
-        (across batches) rank based on within-batch normalized variance.
+        (across batches) rank based on within-batch normalized variance. If 
+        `flavor='pearson_residuals'`, ties are broken based on
     check_values
         Check if counts in selected layer are integers. A Warning is returned if set to True.
         Only used if `flavor='seurat_v3'`.
 
+
     Returns
     -------
     Depending on `inplace` returns calculated metrics (:class:`~pandas.DataFrame`) or
@@ -387,13 +598,18 @@ def highly_variable_genes(
     **dispersions_norm**
         For dispersion-based flavors, normalized dispersions per gene
     **variances**
-        For `flavor='seurat_v3'`, variance per gene
+        For `flavor='seurat_v3'` and `flavor='pearson_residuals'`, variance per gene
     **variances_norm**
         For `flavor='seurat_v3'`, normalized variance per gene, averaged in
         the case of multiple batches
+    **residual_variances**
+        For `flavor='pearson_residuals'`, residual variance per gene. Averaged in the case of
+        multiple batches.
     highly_variable_rank : float
         For `flavor='seurat_v3'`, rank of the gene according to normalized
         variance, median rank in the case of multiple batches
+        For `flavor='pearson_residuals'`, rank of the gene according to residual
+        variance, median rank in the case of multiple batches
     highly_variable_nbatches : int
         If batch_key is given, this denotes in how many batches genes are detected as HVG
     highly_variable_intersection : bool
@@ -428,6 +644,19 @@ def highly_variable_genes(
             subset=subset,
             inplace=inplace,
         )
+    if flavor == 'pearson_residuals':
+        return _highly_variable_pearson_residuals(
+            adata,
+            layer = layer,
+            n_top_genes = n_top_genes,
+            batch_key = batch_key,
+            theta = theta,
+            clip = clip,
+            chunksize= chunksize,
+            subset = subset,
+            inplace = inplace,
+        )
+        
 
     if batch_key is None:
         df = _highly_variable_genes_single_batch(
diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index 542afd38ca..40b2ec2422 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -8,10 +8,12 @@
 
 from .. import logging as logg
 from .._compat import Literal
-from .._utils import view_to_actual
+
+from .._utils import view_to_actual, check_nonnegative_integers
 from scanpy.get import _get_obs_rep, _set_obs_rep
 
 
+
 def _normalize_data(X, counts, after=None, copy=False):
     X = X.copy() if copy else X
     if issubclass(X.dtype.type, (int, np.integer)):
@@ -27,6 +29,145 @@ def _normalize_data(X, counts, after=None, copy=False):
     return X
 
 
+def _pearson_residuals(X, theta, clip, copy=False):
+
+    X = X.copy() if copy else X
+    X = X.toarray() if issparse(X) else X
+
+    #check theta
+    if theta <= 0:
+        ## TODO: would "underdispersion" with negative theta make sense? then only theta=0 were undefined..
+        raise ValueError('Pearson residuals require theta > 0')        
+    #prepare clipping
+    if clip == 'auto':
+        n = X.shape[0]
+        clip = np.sqrt(n)
+    if clip < 0:
+        raise ValueError("Pearson residuals require `clip>=0` or `clip='auto'`.")
+     
+    if check_nonnegative_integers(X) is False:
+        raise ValueError(
+            "`pp.normalize_pearson_residuals` expects raw count data"
+        )      
+    
+    #get residuals
+    sums_genes = np.sum(X, axis=0, keepdims=True)
+    sums_cells = np.sum(X, axis=1, keepdims=True)
+    sum_total  = np.sum(sums_genes)
+    mu = sums_cells @ sums_genes / sum_total
+    residuals = (X - mu) / np.sqrt(mu + mu**2/theta)
+
+    #clip
+    residuals = np.clip(residuals, a_min = -clip, a_max = clip)
+    
+    return residuals
+    
+
+def normalize_pearson_residuals(
+    adata: AnnData,
+    theta: float = 100,
+    clip: Union[Literal['auto', 'none'], float] = 'auto',
+    layers: Union[Literal['all'], Iterable[str]] = None,
+    theta_per_layer: Optional[Dict[str, str]] = None,
+    clip_per_layer: Optional[Dict[str, Union[Literal['auto', 'none'], float]]] = None,  ## TODO: Check if this is correct/needed
+    inplace: bool = True,
+) -> Optional[Dict[str, np.ndarray]]:
+    """\
+    Computes analytic Pearson residuals, assuming a negative binomial offset model
+    with overdispersion theta shared across genes. By default, residuals are
+    clipped to sqrt(n) and overdispersion theta=100 is used.
+
+    Params
+    ------
+    adata
+        The annotated data matrix of shape `n_obs` × `n_vars`.
+        Rows correspond to cells and columns to genes.
+    theta
+        The NB overdispersion parameter theta. Higher values correspond to less
+        overdispersion (var = mean + mean^2/theta), and `theta=np.Inf` corresponds
+        to a Poisson model.
+    clip
+        Determines if and how residuals are clipped:
+        
+        * If `'auto'`, residuals are clipped to the interval [-sqrt(n), sqrt(n)],
+        where n is the number of cells in the dataset (default behavior).
+        * If any scalar c, residuals are clipped to the interval [-c, c]. Set
+        `clip=np.Inf` for no clipping.
+        
+    layers
+        List of layers to compute Pearson residuals of. Set to `'all'` to 
+        compute for all layers.
+    theta_per_layer
+        Dict that specifies which theta is used for each layer:
+
+        * If `None`, the provided `theta` is used for all layers.
+        * Otherwise, each layer with key `layer_key` is processed with the theta
+          value in `theta_per_layer[layer_key]`.
+    clip_per_layer
+        Dict that specifies clipping behavior for each layer :
+
+        * If `None`, the provided `clip` variable is used for all layers.
+        * Otherwise, each layer with key `layer_key` is clipped according to
+          `clip_per_layer[layer_key]`. See `clip` above for possible values.
+          
+    inplace
+        Whether to update `adata` or return dictionary with normalized copies of
+        `adata.X` and `adata.layers`.
+
+    Returns
+    -------
+    Returns dictionary with Pearson residuals of `adata.X` and `adata.layers`
+    or updates `adata` with normalized version of the original
+    `adata.X` and `adata.layers`, depending on `inplace`.
+
+    """
+    
+    if layers == 'all':
+        layers = adata.layers.keys()
+        
+    view_to_actual(adata) ### TODO: is this needed and if yes what for (normalize_total() has it so I used it..)
+    
+    # Handle X
+    msg = 'computing analytic Pearson residuals for adata.X'
+    start = logg.info(msg)
+    if inplace:
+        adata.X = _pearson_residuals(adata.X, theta, clip)
+        settings = dict(theta=theta, clip=clip)
+        settings['theta_per_layer']=theta_per_layer if theta_per_layer is not None
+        settings['clip_per_layer']=clip_per_layer if clip_per_layer is not None
+        adata.uns['normalization_pearson_residuals'] = settings
+        
+    else:
+        dat = dict(X=_pearson_residuals(adata.X, theta, clip, copy=True))
+        
+    # Handle layers
+    for layer_name in (layers or ()):
+        
+        msg = f'computing analytic Pearson residuals for layer {layer_name}'
+        _ = logg.info(msg)
+                
+        # Default to theta/clip if no layer-specific theta/clip given
+        layer_theta = theta if theta_per_layer is None else theta_per_layer[layer_name]
+        layer_clip = clip if clip_per_layer is None else clip_per_layer[layer_name]
+        
+        layer = adata.layers[layer_name]
+
+        if inplace:
+            adata.layers[layer_name] = _pearson_residuals(layer, layer_theta, layer_clip)
+        else:
+            dat[layer_name] = _pearson_residuals(layer, layer_theta, layer_clip, copy=True)
+            
+    if not layers is None:
+        adata.uns['normalization_pearson_residuals'] = dict(
+                theta=theta,
+                clip=clip)
+
+    logg.info('    finished ({time_passed})', time=start)
+
+    return dat if not inplace else None
+    
+    
+
 def normalize_total(
     adata: AnnData,
     target_sum: Optional[float] = None,

From 5d57961a99a466b07a2de4aa928e8c5ef396905e Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Wed, 3 Mar 2021 20:49:06 +0100
Subject: [PATCH 02/96] adding Pearson residual+PCA bundles, minor bug fixes

---
 scanpy/preprocessing/__init__.py              |   4 +-
 .../preprocessing/_highly_variable_genes.py   |   8 +-
 scanpy/preprocessing/_normalization.py        | 112 ++++++++++++++-
 scanpy/preprocessing/_recipes.py              | 131 +++++++++++++++++-
 4 files changed, 244 insertions(+), 11 deletions(-)

diff --git a/scanpy/preprocessing/__init__.py b/scanpy/preprocessing/__init__.py
index 8bcad5ca03..f1b4dad80c 100644
--- a/scanpy/preprocessing/__init__.py
+++ b/scanpy/preprocessing/__init__.py
@@ -1,4 +1,4 @@
-from ._recipes import recipe_zheng17, recipe_weinreb17, recipe_seurat
+from ._recipes import recipe_zheng17, recipe_weinreb17, recipe_seurat, recipe_pearson_residuals
 from ._simple import filter_cells, filter_genes
 from ._deprecated.highly_variable_genes import filter_genes_dispersion
 from ._highly_variable_genes import highly_variable_genes
@@ -7,6 +7,6 @@
 from ._pca import pca
 from ._qc import calculate_qc_metrics
 from ._combat import combat
-from ._normalization import normalize_total, normalize_pearson_residuals
+from ._normalization import normalize_total, normalize_pearson_residuals, normalize_pearson_residuals_pca
 
 from ..neighbors import neighbors
\ No newline at end of file
diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 3ae163b3a4..88b3de9f29 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -578,7 +578,8 @@ def highly_variable_genes(
         by how many batches they are a HVG. For dispersion-based flavors ties are broken
         by normalized dispersion. If `flavor = 'seurat_v3'`, ties are broken by the median
         (across batches) rank based on within-batch normalized variance. If 
-        `flavor='pearson_residuals'`, ties are broken based on
+        `flavor='pearson_residuals'`, ties are broken by the median rank (across batches)
+        based on within-batch residual variance.
     check_values
         Check if counts in selected layer are integers. A Warning is returned if set to True.
         Only used if `flavor='seurat_v3'`.
@@ -645,6 +646,11 @@ def highly_variable_genes(
             inplace=inplace,
         )
     if flavor == 'pearson_residuals':
+        if n_top_genes is None:
+            raise ValueError(
+            "`pp.highly_variable_genes` requires the argument `n_top_genes`"
+            " for `flavor='pearson_residuals'`"
+            )
         return _highly_variable_pearson_residuals(
             adata,
             layer = layer,
diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index 40b2ec2422..965f92075e 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -2,6 +2,7 @@
 from warnings import warn
 
 import numpy as np
+import pandas as pd
 from anndata import AnnData
 from scipy.sparse import issparse
 from sklearn.utils import sparsefuncs
@@ -12,6 +13,9 @@
 from .._utils import view_to_actual, check_nonnegative_integers
 from scanpy.get import _get_obs_rep, _set_obs_rep
 
+from ._pca import pca
+
+
 
 
 def _normalize_data(X, counts, after=None, copy=False):
@@ -67,9 +71,9 @@ def normalize_pearson_residuals(
     adata: AnnData,
     theta: float = 100,
     clip: Union[Literal['auto', 'none'], float] = 'auto',
-    layers: Union[Literal['all'], Iterable[str]] = None,
+    layers: Optional[Union[Literal['all'], Iterable[str]]] = None,
     theta_per_layer: Optional[Dict[str, str]] = None,
-    clip_per_layer: Optional[Dict[str, Union[Literal['auto', 'none'], float]]] = None,  ## TODO: Check if this is correct/needed
+    clip_per_layer: Optional[Dict[str, Union[Literal['auto', 'none'], float]]] = None,
     inplace: bool = True,
 ) -> Optional[Dict[str, np.ndarray]]:
     """\
@@ -125,17 +129,19 @@ def normalize_pearson_residuals(
     if layers == 'all':
         layers = adata.layers.keys()
         
-    view_to_actual(adata) ### TODO: is this needed and if yes what for (normalize_total() has it so I used it..)
+    view_to_actual(adata) ### TODO: is this needed and if yes what for? normalize_total() has it so I used it here
     
     # Handle X
     msg = 'computing analytic Pearson residuals for adata.X'
     start = logg.info(msg)
     if inplace:
         adata.X = _pearson_residuals(adata.X, theta, clip)
-        settings = dict(theta=theta, clip=clip)
-        settings['theta_per_layer']=theta_per_layer if theta_per_layer is not None
-        settings['clip_per_layer']=clip_per_layer if clip_per_layer is not None
-        adata.uns['normalization_pearson_residuals'] = settings
+        settings_dict = dict(theta=theta, clip=clip)
+        if theta_per_layer is not None:
+            settings_dict['theta_per_layer']=theta_per_layer 
+        if clip_per_layer is not None:
+            settings_dict['clip_per_layer']=clip_per_layer 
+        adata.uns['normalization_pearson_residuals'] = settings_dict
         
     else:
         dat = dict(X=_pearson_residuals(adata.X, theta, clip, copy=True))
@@ -167,6 +173,98 @@ def normalize_pearson_residuals(
     return dat if not inplace else None
     
     
+    
+def normalize_pearson_residuals_pca(
+    adata: AnnData, 
+    theta: float = 100,
+    clip: Union[Literal['auto', 'none'], float] = 'auto',    
+    n_comps_pca: Optional[int] = 50,
+    random_state_pca: Optional[float] = 0,
+    use_highly_variable: bool = True,
+    inplace: bool = False
+) -> Optional[pd.DataFrame]:
+
+    """\
+    Applies PCA based on Pearson residual normalization. Operates on the subset of
+    highly variable genes in `adata.var['highly_variable']` by default.
+    
+    
+    Parameters
+    ----------
+    adata
+        The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
+        to cells and columns to genes.
+    use_highly_variable
+        Whether to use the gene selection in `adata.var['highly_variable']` to subset
+        the data before normalizing (default) or proceed on the full dataset.
+    theta
+        This is the NB overdispersion parameter theta for Pearson residual computations.
+        Higher values correspond to less overdispersion (var = mean + mean^2/theta), and 
+        `theta=np.Inf` corresponds to a Poisson model.
+    clip
+        This determines if and how Pearson residuals are clipped:
+        
+        * If `'auto'`, residuals are clipped to the interval [-sqrt(n), sqrt(n)],
+        where n is the number of cells in the dataset (default behavior).
+        * If any scalar c, residuals are clipped to the interval [-c, c]. Set
+        `clip=np.Inf` for no clipping.
+        
+    n_comps_pca
+        Number of principal components to compute.
+    random_state_pca
+        Change to use different initial states for the optimization.
+    inplace
+        Whether to place results in `adata` or return them.
+
+
+    Returns
+    -------
+    If `inplace=False`, returns the Pearson residual-based PCA results (`adata_pca`).
+    If `inplace=True`, updates `adata` with the following fields:
+    
+    `.uns['pearson_residuals_normalization']['pearson_residuals_df']`
+         The hvg-subset, normalized by Pearson residuals
+    `.uns['pearson_residuals_normalization']['theta']`
+         The used value of the overdispersion parameter theta
+    `.uns['pearson_residuals_normalization']['clip']`
+         The used value of the clipping parameter
+    
+    `.obsm['pearson_residuals_X_pca']`
+        PCA representation of data after gene selection and Pearson residual normalization.
+    `.uns['pearson_residuals_pca']['PCs']`
+         The principal components containing the loadings.
+    `.uns['pearson_residuals_pca']['variance_ratio']`
+         Ratio of explained variance.
+    `.uns['pearson_residuals_pca']['variance']`
+         Explained variance, equivalent to the eigenvalues of the
+         covariance matrix.        
+    
+    """    
+    
+
+    
+    if use_highly_variable and 'highly_variable' in adata.var_keys():
+        adata_pca = adata[:,adata.var['highly_variable']].copy() ##TODO: are these copies needed?
+    else:
+        adata_pca = adata.copy()##TODO: are these copies needed?
+    
+    normalize_pearson_residuals(adata_pca,theta=theta,clip=clip)
+    pca(adata_pca,n_comps=n_comps_pca,random_state=random_state_pca)
+    
+    if inplace:
+        normalization_settings = adata_pca.uns['normalization_pearson_residuals']
+        normalization_dict = dict(**normalization_settings,
+                                pearson_residuals_df = adata_pca.to_df())
+        pca_settings = adata_pca.uns['pca']
+        pca_dict = dict(**pca_settings, 
+                         PCs = adata_pca.varm['PCs'])
+        adata.uns['pearson_residuals_pca'] = pca_dict
+        adata.uns['pearson_residuals_normalization'] = normalization_dict
+        adata.obsm['pearson_residuals_X_pca'] = adata_pca.obsm['X_pca']
+        return None
+    else:
+        return adata_pca
+    
 
 def normalize_total(
     adata: AnnData,
diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py
index d211bcc20a..75b1f42fc0 100644
--- a/scanpy/preprocessing/_recipes.py
+++ b/scanpy/preprocessing/_recipes.py
@@ -1,8 +1,10 @@
 """Preprocessing recipes from the literature"""
-from typing import Optional
+from typing import Optional, Union, Literal, Tuple
 
 from anndata import AnnData
 
+import pandas as pd
+
 from .. import preprocessing as pp
 from ._deprecated.highly_variable_genes import (
     filter_genes_dispersion,
@@ -168,3 +170,130 @@ def recipe_zheng17(
     pp.scale(adata)
     logg.info('    finished', time=start)
     return adata if copy else None
+
+
+def recipe_pearson_residuals(
+    adata: AnnData, 
+    n_top_genes: int = 1000,
+    theta: float = 100,
+    clip: Union[Literal['auto', 'none'], float] = 'auto',    
+    chunksize: int = 1000,
+    batch_key: Optional[str] = None,
+    n_comps_pca: Optional[int] = 50,
+    random_state_pca: Optional[float] = 0,
+    inplace: bool = False
+) -> Optional[Tuple[pd.DataFrame,pd.DataFrame]]:
+    """\
+    Applies gene selection based on Pearson residuals. On the resulting subset, 
+    Pearson residual normalization and PCA are performed.
+    
+    
+    Parameters
+    ----------
+    adata
+        The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
+        to cells and columns to genes.
+    n_top_genes
+        Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or 
+        `flavor='pearson_residuals'`.
+    chunksize
+        This dertermines how many genes are processed at once while computing the 
+        Pearson residual variance. Choosing a smaller value will reduce the required memory.
+    theta
+        This is the NB overdispersion parameter theta for Pearson residual computations.
+        Higher values correspond to less overdispersion (var = mean + mean^2/theta), and 
+        `theta=np.Inf` corresponds to a Poisson model.
+    clip
+        This determines if and how Pearson residuals are clipped:
+        
+        * If `'auto'`, residuals are clipped to the interval [-sqrt(n), sqrt(n)],
+        where n is the number of cells in the dataset (default behavior).
+        * If any scalar c, residuals are clipped to the interval [-c, c]. Set
+        `clip=np.Inf` for no clipping.
+    batch_key
+        If specified, highly-variable genes are selected within each batch separately and merged.
+        This simple process avoids the selection of batch-specific genes and acts as a
+        lightweight batch correction method. For all flavors, genes are first sorted
+        by how many batches they are a HVG. Ties are broken by the median rank (across batches)
+        based on within-batch residual variance.
+        
+    n_comps_pca
+        Number of principal components to compute.
+    random_state_pca
+        Change to use different initial states for the optimization.
+    inplace
+        Whether to place results in `adata` or return them.
+
+    Returns
+    -------
+    If `inplace=False`, separately returns the gene selection results (`hvg`) and Pearson 
+    residual-based PCA results (`adata_pca`). If `inplace=True`, updates `adata` with the
+    following fields for gene selection results…:
+
+    `.var['highly_variable']`
+        boolean indicator of highly-variable genes
+    `.var['means']`
+        means per gene
+    `.var['variances']`
+        variances per gene 
+    `.var['residual_variances']`
+        Pearson residual variance per gene. Averaged in the case of multiple batches.
+    `.var['highly_variable_rank']`
+        Rank of the gene according to residual variance, median rank in the case of multiple batches
+    `.var['highly_variable_nbatches']`
+        If batch_key is given, this denotes in how many batches genes are detected as HVG
+    `.var['highly_variable_intersection']`
+        If batch_key is given, this denotes the genes that are highly variable in all batches
+        
+    …and the following fields for Pearson residual-based PCA results and normalization settings:    
+    
+    `.uns['pearson_residuals_normalization']['pearson_residuals_df']`
+         The hvg-subset, normalized by Pearson residuals
+    `.uns['pearson_residuals_normalization']['theta']`
+         The used value of the overdisperion parameter theta
+    `.uns['pearson_residuals_normalization']['clip']`
+         The used value of the clipping parameter
+    
+    `.obsm['pearson_residuals_X_pca']`
+        PCA representation of data after gene selection and Pearson residual normalization.
+    `.uns['pearson_residuals_pca']['PCs']`
+         The principal components containing the loadings.
+    `.uns['pearson_residuals_pca']['variance_ratio']`
+         Ratio of explained variance.
+    `.uns['pearson_residuals_pca']['variance']`
+         Explained variance, equivalent to the eigenvalues of the
+         covariance matrix.
+        
+    
+    """
+        
+    hvg_args = dict(flavor = 'pearson_residuals',
+                    n_top_genes = n_top_genes,
+                    batch_key = batch_key,
+                    theta = theta,
+                    clip = clip,
+                    chunksize = chunksize)
+    
+    if inplace:
+        pp.highly_variable_genes(adata,**hvg_args,inplace = True)
+        adata_pca = adata[:,adata.var['highly_variable']].copy() ##TODO: are these copies needed?
+    else:
+        hvg = pp.highly_variable_genes(adata,**hvg_args,inplace = False)
+        adata_pca = adata[:,hvg['highly_variable']].copy()##TODO: are these copies needed?
+    
+    pp.normalize_pearson_residuals(adata_pca,theta = theta,clip = clip)
+    pp.pca(adata_pca,n_comps = n_comps_pca,random_state = random_state_pca)
+    
+    if inplace:
+        normalization_settings = adata_pca.uns['normalization_pearson_residuals']
+        normalization_dict = dict(**normalization_settings,
+                                pearson_residuals_df = adata_pca.to_df())
+        pca_settings = adata_pca.uns['pca']
+        pca_dict = dict(**pca_settings, 
+                         PCs = adata_pca.varm['PCs'])
+        adata.uns['pearson_residuals_pca'] = pca_dict
+        adata.uns['pearson_residuals_normalization'] = normalization_dict
+        adata.obsm['pearson_residuals_X_pca'] = adata_pca.obsm['X_pca']
+        return None
+    else:
+        return adata_pca, hvg
\ No newline at end of file

From fced3f27ef8d4ee27c7af3ca3d7341cfd037c000 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Wed, 3 Mar 2021 22:39:29 +0100
Subject: [PATCH 03/96] some style cleanup, minor fixes

---
 .../preprocessing/_highly_variable_genes.py   | 127 ++++++------
 scanpy/preprocessing/_normalization.py        | 183 ++++++++++--------
 scanpy/preprocessing/_recipes.py              | 133 +++++++------
 3 files changed, 239 insertions(+), 204 deletions(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 88b3de9f29..78347b6fc3 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -191,8 +191,8 @@ def _highly_variable_pearson_residuals(
 
     Returns
     -------
-    Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) or
-    updates `.var` with the following fields
+    Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`)
+    or updates `.var` with the following fields
 
     highly_variable
         boolean indicator of highly-variable genes
@@ -201,43 +201,49 @@ def _highly_variable_pearson_residuals(
     variances
         variances per gene 
     residual_variances
-        Pearson residual variance per gene. Averaged in the case of multiple batches.
+        Pearson residual variance per gene. Averaged in the case of multiple
+        batches.
     highly_variable_rank
-        Rank of the gene according to residual variance, median rank in the case of multiple batches
+        Rank of the gene according to residual variance, median rank in the
+        case of multiple batches
     highly_variable_nbatches : int
-        If batch_key is given, this denotes in how many batches genes are detected as HVG
+        If batch_key is given, this denotes in how many batches genes are
+        detected as HVG
     highly_variable_intersection : bool
-        If batch_key is given, this denotes the genes that are highly variable in all batches
+        If batch_key is given, this denotes the genes that are highly variable
+        in all batches
     """
-    
+
     X = adata.layers[layer] if layer is not None else adata.X
-    
+
     # Check for raw counts
     if check_nonnegative_integers(X) is False:
         raise ValueError(
-            "`pp.highly_variable_genes` with `flavor='pearson_residuals'` expects "
-            "raw count data."
+            "`pp.highly_variable_genes` with `flavor='pearson_residuals'` "
+            "expects raw count data."
         )
-    
+
     if batch_key is None:
         batch_info = pd.Categorical(np.zeros(adata.shape[0], dtype=int))
     else:
         batch_info = adata.obs[batch_key].values
     n_batches = len(np.unique(batch_info))
-        
+
     # Get pearson residuals for each batch separately
     residual_gene_vars = []
     for batch in np.unique(batch_info):
 
         adata_subset = adata[batch_info == batch]
-       
+
         # Filter out zero genes
         with settings.verbosity.override(Verbosity.error):
             nonzero_genes = filter_genes(adata_subset, min_cells=1, inplace=False)[0]
         adata_subset = adata_subset[:, nonzero_genes]
 
-
-        X_batch = adata_subset.layers[layer] if layer is not None else adata_subset.X 
+        if layer is not None:
+            X_batch = adata_subset.layers[layer]
+        else:
+            X_batch = adata_subset.X
 
         # Prepare clipping
         if clip == 'auto':
@@ -249,77 +255,78 @@ def _highly_variable_pearson_residuals(
         if sp_sparse.issparse(X_batch):
             sums_genes = np.sum(X_batch, axis=0)
             sums_cells = np.sum(X_batch, axis=1)
-            sum_total  = np.sum(sums_genes).squeeze()
+            sum_total = np.sum(sums_genes).squeeze()
         else:
             sums_genes = np.sum(X_batch, axis=0, keepdims=True)
             sums_cells = np.sum(X_batch, axis=1, keepdims=True)
-            sum_total  = np.sum(sums_genes)
+            sum_total = np.sum(sums_genes)
 
         # Compute pearson residuals in chunks
-        residual_gene_var = np.ones((X_batch.shape[1]))*np.nan
-        for start in np.arange(0,X_batch.shape[1],chunksize):
+        residual_gene_var = np.ones((X_batch.shape[1])) * np.nan
+        for start in np.arange(0, X_batch.shape[1], chunksize):
             stop = start + chunksize
-            mu = np.array(sums_cells @ sums_genes[:,start:stop] / sum_total)
-            X_dense = X_batch[:,start:stop].toarray()
-            residuals = (X_dense - mu) / np.sqrt(mu + mu**2/theta)
-            residuals = np.clip(residuals, a_min = -clip, a_max = clip)
-            residual_gene_var[start:stop] = np.var(residuals,axis=0)
-            
-        # Add 0 values for genes that were filtered out 
+            mu = np.array(sums_cells @ sums_genes[:, start:stop] / sum_total)
+            X_dense = X_batch[:, start:stop].toarray()
+            residuals = (X_dense - mu) / np.sqrt(mu + mu ** 2 / theta)
+            residuals = np.clip(residuals, a_min=-clip, a_max=clip)
+            residual_gene_var[start:stop] = np.var(residuals, axis=0)
+
+        # Add 0 values for genes that were filtered out
         zero_gene_var = np.zeros(np.sum(~nonzero_genes))
-        residual_gene_var = np.concatenate((residual_gene_var,
-                                            zero_gene_var))
+        residual_gene_var = np.concatenate((residual_gene_var, zero_gene_var))
         # Order as before filtering
-        idxs = np.concatenate((np.where(nonzero_genes)[0],
-                               np.where(~nonzero_genes)[0]))
+        idxs = np.concatenate((np.where(nonzero_genes)[0], np.where(~nonzero_genes)[0]))
         residual_gene_var = residual_gene_var[np.argsort(idxs)]
         residual_gene_vars.append(residual_gene_var.reshape(1, -1))
 
     residual_gene_vars = np.concatenate(residual_gene_vars, axis=0)
-    
+
     # Get cutoffs and define hvgs per batch
-    residual_gene_vars_sorted = np.sort(residual_gene_vars,axis=1)[:,::-1]
-    cutoffs_per_batch = residual_gene_vars_sorted[:,n_top_genes]
-    highly_variable_per_batch = np.greater(residual_gene_vars.T,cutoffs_per_batch).T
-    
+    residual_gene_vars_sorted = np.sort(residual_gene_vars, axis=1)[:, ::-1]
+    cutoffs_per_batch = residual_gene_vars_sorted[:, n_top_genes]
+    highly_variable_per_batch = np.greater(residual_gene_vars.T, cutoffs_per_batch).T
+
     # Merge hvgs across batches
-    highly_variable_nbatches = np.sum(highly_variable_per_batch,axis=0)
+    highly_variable_nbatches = np.sum(highly_variable_per_batch, axis=0)
     highly_variable_intersection = highly_variable_nbatches == n_batches
-    
+
     # Get rank per gene within each batch
     # argsort twice gives ranks, small rank means most variable
     ranks_residual_var = np.argsort(np.argsort(-residual_gene_vars, axis=1), axis=1)
     ranks_residual_var = ranks_residual_var.astype(np.float32)
-    ranks_residual_var[ranks_residual_var >= n_top_genes] = np.nan 
+    ranks_residual_var[ranks_residual_var >= n_top_genes] = np.nan
     ranks_masked_array = np.ma.masked_invalid(ranks_residual_var)
-    # Median rank across batches, ignoring batches in which gene was not selected
-    medianrank_residual_var = np.ma.median(ranks_masked_array, axis=0).filled(np.nan) 
-    
+    # Median rank across batches,
+    # ignoring batches in which gene was not selected
+    medianrank_residual_var = np.ma.median(ranks_masked_array, axis=0).filled(np.nan)
+
     means, variances = materialize_as_ndarray(_get_mean_var(X))
     df = pd.DataFrame.from_dict(
-                dict(means=means,
-                     variances=variances,
-                     residual_variances=np.mean(residual_gene_vars,axis=0),
-                     highly_variable_rank=medianrank_residual_var,
-                     highly_variable_nbatches=highly_variable_nbatches,
-                     highly_variable_intersection=highly_variable_intersection,
-                )
-         )
+        dict(
+            means=means,
+            variances=variances,
+            residual_variances=np.mean(residual_gene_vars, axis=0),
+            highly_variable_rank=medianrank_residual_var,
+            highly_variable_nbatches=highly_variable_nbatches,
+            highly_variable_intersection=highly_variable_intersection,
+        )
+    )
     df = df.set_index(adata.var_names)
-    
+
     # Sort genes by how often they selected as hvg within each batch and
     # break ties with median rank of residual variance across batches
     df.sort_values(
-                    ['highly_variable_nbatches', 'highly_variable_rank'],
-                    ascending=[False,True],
-                    na_position='last',
-                    inplace=True,
-                  )
+        ['highly_variable_nbatches', 'highly_variable_rank'],
+        ascending=[False, True],
+        na_position='last',
+        inplace=True,
+    )
     df['highly_variable'] = False
     df.highly_variable.iloc[:n_top_genes] = True
-    ## TODO: following line raises a pandas warning (also for flavor = seurat and cellranger..)
+    # TODO: following line raises a pandas warning
+    # (also for flavor = seurat and cellranger..)
     df = df.loc[adata.var_names]
-    
+
     if inplace or subset:
         adata.uns['hvg'] = {'flavor': 'pearson_residuals'}
         logg.hint(
@@ -350,9 +357,9 @@ def _highly_variable_pearson_residuals(
             adata._inplace_subset_var(df['highly_variable'].values)
     else:
         if batch_key is None:
-            df = df.drop(['highly_variable_nbatches',
-                          'highly_variable_intersection'],
-                         axis=1)
+            df = df.drop(
+                ['highly_variable_nbatches', 'highly_variable_intersection'], axis=1
+            )
         return df
     
     
diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index 965f92075e..1479bcb9b6 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -38,34 +38,33 @@ def _pearson_residuals(X, theta, clip, copy=False):
     X = X.copy() if copy else X
     X = X.toarray() if issparse(X) else X
 
-    #check theta
+    # check theta
     if theta <= 0:
-        ## TODO: would "underdispersion" with negative theta make sense? then only theta=0 were undefined..
-        raise ValueError('Pearson residuals require theta > 0')        
-    #prepare clipping
+        # TODO: would "underdispersion" with negative theta make sense?
+        # then only theta=0 were undefined..
+        raise ValueError('Pearson residuals require theta > 0')
+    # prepare clipping
     if clip == 'auto':
         n = X.shape[0]
         clip = np.sqrt(n)
     if clip < 0:
         raise ValueError("Pearson residuals require `clip>=0` or `clip='auto'`.")
-     
+
     if check_nonnegative_integers(X) is False:
-        raise ValueError(
-            "`pp.normalize_pearson_residuals` expects raw count data"
-        )      
-    
-    #get residuals
+        raise ValueError("`pp.normalize_pearson_residuals` expects raw count data")
+
+    # get residuals
     sums_genes = np.sum(X, axis=0, keepdims=True)
     sums_cells = np.sum(X, axis=1, keepdims=True)
-    sum_total  = np.sum(sums_genes)
+    sum_total = np.sum(sums_genes)
     mu = sums_cells @ sums_genes / sum_total
-    residuals = (X - mu) / np.sqrt(mu + mu**2/theta)
+    residuals = (X - mu) / np.sqrt(mu + mu ** 2 / theta)
+
+    # clip
+    residuals = np.clip(residuals, a_min=-clip, a_max=clip)
 
-    #clip
-    residuals = np.clip(residuals, a_min = -clip, a_max = clip)
-    
     return residuals
-    
+
 
 def normalize_pearson_residuals(
     adata: AnnData,
@@ -77,9 +76,9 @@ def normalize_pearson_residuals(
     inplace: bool = True,
 ) -> Optional[Dict[str, np.ndarray]]:
     """\
-    Computes analytic Pearson residuals, assuming a negative binomial offset model
-    with overdispersion theta shared across genes. By default, residuals are
-    clipped to sqrt(n) and overdispersion theta=100 is used.
+    Computes analytic Pearson residuals, assuming a negative binomial offset
+    model with overdispersion theta shared across genes. By default, residuals
+    are clipped to sqrt(n) and overdispersion theta=100 is used.
 
     Params
     ------
@@ -87,26 +86,27 @@ def normalize_pearson_residuals(
         The annotated data matrix of shape `n_obs` × `n_vars`.
         Rows correspond to cells and columns to genes.
     theta
-        The NB overdispersion parameter theta. Higher values correspond to less
-        overdispersion (var = mean + mean^2/theta), and `theta=np.Inf` corresponds
-        to a Poisson model.
+        The NB overdispersion parameter theta. Higher values correspond to
+        less overdispersion (var = mean + mean^2/theta), and `theta=np.Inf`
+        corresponds to a Poisson model.
     clip
         Determines if and how residuals are clipped:
         
-        * If `'auto'`, residuals are clipped to the interval [-sqrt(n), sqrt(n)],
-        where n is the number of cells in the dataset (default behavior).
+        * If `'auto'`, residuals are clipped to the interval
+        [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset
+        (default behavior).
         * If any scalar c, residuals are clipped to the interval [-c, c]. Set
         `clip=np.Inf` for no clipping.
         
     layers
-        List of layers to compute Pearson residuals of. Set to `'all'` to 
+        List of layers to compute Pearson residuals of. Set to `'all'` to
         compute for all layers.
     theta_per_layer
         Dict that specifies which theta is used for each layer:
 
         * If `None`, the provided `theta` is used for all layers.
-        * Otherwise, each layer with key `layer_key` is processed with the theta
-          value in `theta_per_layer[layer_key]`.
+        * Otherwise, each layer with key `layer_key` is processed with the
+          theta value in `theta_per_layer[layer_key]`.
     clip_per_layer
         Dict that specifies clipping behavior for each layer :
 
@@ -115,8 +115,8 @@ def normalize_pearson_residuals(
           `clip_per_layer[layer_key]`. See `clip` above for possible values.
           
     inplace
-        Whether to update `adata` or return dictionary with normalized copies of
-        `adata.X` and `adata.layers`.
+        Whether to update `adata` or return dictionary with normalized copies
+        of `adata.X` and `adata.layers`.
 
     Returns
     -------
@@ -125,12 +125,13 @@ def normalize_pearson_residuals(
     `adata.X` and `adata.layers`, depending on `inplace`.
 
     """
-    
+
     if layers == 'all':
         layers = adata.layers.keys()
-        
-    view_to_actual(adata) ### TODO: is this needed and if yes what for? normalize_total() has it so I used it here
-    
+    # TODO: is this needed and if yes what for?
+    # normalize_total() has it so I used it here
+    view_to_actual(adata)
+
     # Handle X
     msg = 'computing analytic Pearson residuals for adata.X'
     start = logg.info(msg)
@@ -138,74 +139,88 @@ def normalize_pearson_residuals(
         adata.X = _pearson_residuals(adata.X, theta, clip)
         settings_dict = dict(theta=theta, clip=clip)
         if theta_per_layer is not None:
-            settings_dict['theta_per_layer']=theta_per_layer 
+            settings_dict['theta_per_layer'] = theta_per_layer
         if clip_per_layer is not None:
-            settings_dict['clip_per_layer']=clip_per_layer 
-        adata.uns['normalization_pearson_residuals'] = settings_dict
-        
+            settings_dict['clip_per_layer'] = clip_per_layer
+        adata.uns['pearson_residuals_normalization'] = settings_dict
+
     else:
         dat = dict(X=_pearson_residuals(adata.X, theta, clip, copy=True))
-        
+
     # Handle layers
-    for layer_name in (layers or ()):
-        
+    for layer_name in layers or ():
+
         msg = f'computing analytic Pearson residuals for layer {layer_name}'
         _ = logg.info(msg)
-                
+
         # Default to theta/clip if no layer-specific theta/clip given
-        layer_theta = theta if theta_per_layer is None else theta_per_layer[layer_name]
-        layer_clip = clip if clip_per_layer is None else clip_per_layer[layer_name]
-        
+        if theta_per_layer is None:
+            layer_theta = theta
+        else:
+            layer_theta = theta_per_layer[layer_name]
+        if clip_per_layer is None:
+            layer_clip = clip
+        else:
+            layer_clip = clip_per_layer[layer_name]
+
         layer = adata.layers[layer_name]
 
         if inplace:
-            adata.layers[layer_name] = _pearson_residuals(layer, layer_theta, layer_clip)
+            adata.layers[layer_name] = _pearson_residuals(
+                layer, layer_theta, layer_clip
+            )
         else:
-            dat[layer_name] = _pearson_residuals(layer, layer_theta, layer_clip, copy=True)
-            
+            dat[layer_name] = _pearson_residuals(
+                layer, layer_theta, layer_clip, copy=True
+            )
+
     if not layers is None:
-        adata.uns['normalization_pearson_residuals'] = dict(
-                theta=theta,
-                clip=clip)
+        adata.uns['pearson_residuals_normalization'] = dict(
+            theta=theta,
+            clip=clip,
+        )
 
     logg.info('    finished ({time_passed})', time=start)
 
     return dat if not inplace else None
-    
-    
-    
+
+
 def normalize_pearson_residuals_pca(
-    adata: AnnData, 
+    adata: AnnData,
     theta: float = 100,
-    clip: Union[Literal['auto', 'none'], float] = 'auto',    
+    clip: Union[Literal['auto', 'none'], float] = 'auto',
     n_comps_pca: Optional[int] = 50,
     random_state_pca: Optional[float] = 0,
     use_highly_variable: bool = True,
-    inplace: bool = False
+    inplace: bool = False,
 ) -> Optional[pd.DataFrame]:
 
     """\
-    Applies PCA based on Pearson residual normalization. Operates on the subset of
-    highly variable genes in `adata.var['highly_variable']` by default.
-    
-    
+    Applies PCA based on Pearson residual normalization. Operates on the
+    subset of highly variable genes in `adata.var['highly_variable']` by
+    default.
+
+
     Parameters
     ----------
     adata
         The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
         to cells and columns to genes.
     use_highly_variable
-        Whether to use the gene selection in `adata.var['highly_variable']` to subset
-        the data before normalizing (default) or proceed on the full dataset.
+        Whether to use the gene selection in `adata.var['highly_variable']` to
+        subset the data before normalizing (default) or proceed on the full
+        dataset.
     theta
-        This is the NB overdispersion parameter theta for Pearson residual computations.
-        Higher values correspond to less overdispersion (var = mean + mean^2/theta), and 
-        `theta=np.Inf` corresponds to a Poisson model.
+        This is the NB overdispersion parameter theta for Pearson residual
+        computations. Higher values correspond to less overdispersion
+        (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a
+        Poisson model.
     clip
         This determines if and how Pearson residuals are clipped:
         
-        * If `'auto'`, residuals are clipped to the interval [-sqrt(n), sqrt(n)],
-        where n is the number of cells in the dataset (default behavior).
+        * If `'auto'`, residuals are clipped to the interval
+        [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset
+        (default behavior).
         * If any scalar c, residuals are clipped to the interval [-c, c]. Set
         `clip=np.Inf` for no clipping.
         
@@ -219,7 +234,8 @@ def normalize_pearson_residuals_pca(
 
     Returns
     -------
-    If `inplace=False`, returns the Pearson residual-based PCA results (`adata_pca`).
+    If `inplace=False`, returns the Pearson residual-based PCA results
+    (`adata_pca`).
     If `inplace=True`, updates `adata` with the following fields:
     
     `.uns['pearson_residuals_normalization']['pearson_residuals_df']`
@@ -230,7 +246,8 @@ def normalize_pearson_residuals_pca(
          The used value of the clipping parameter
     
     `.obsm['pearson_residuals_X_pca']`
-        PCA representation of data after gene selection and Pearson residual normalization.
+        PCA representation of data after gene selection and Pearson residual
+        normalization.
     `.uns['pearson_residuals_pca']['PCs']`
          The principal components containing the loadings.
     `.uns['pearson_residuals_pca']['variance_ratio']`
@@ -239,27 +256,25 @@ def normalize_pearson_residuals_pca(
          Explained variance, equivalent to the eigenvalues of the
          covariance matrix.        
     
-    """    
-    
+    """
 
-    
     if use_highly_variable and 'highly_variable' in adata.var_keys():
-        adata_pca = adata[:,adata.var['highly_variable']].copy() ##TODO: are these copies needed?
+        # TODO: are these copies needed?
+        adata_pca = adata[:, adata.var['highly_variable']].copy()
     else:
-        adata_pca = adata.copy()##TODO: are these copies needed?
-    
-    normalize_pearson_residuals(adata_pca,theta=theta,clip=clip)
-    pca(adata_pca,n_comps=n_comps_pca,random_state=random_state_pca)
-    
+        # TODO: are these copies needed?
+        adata_pca = adata.copy()
+
+    normalize_pearson_residuals(adata_pca, theta=theta, clip=clip)
+    pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca)
+
     if inplace:
-        normalization_settings = adata_pca.uns['normalization_pearson_residuals']
-        normalization_dict = dict(**normalization_settings,
-                                pearson_residuals_df = adata_pca.to_df())
+        norm_settings = adata_pca.uns['pearson_residuals_normalization']
+        norm_dict = dict(**norm_settings, pearson_residuals_df=adata_pca.to_df())
         pca_settings = adata_pca.uns['pca']
-        pca_dict = dict(**pca_settings, 
-                         PCs = adata_pca.varm['PCs'])
+        pca_dict = dict(**pca_settings, PCs=adata_pca.varm['PCs'])
         adata.uns['pearson_residuals_pca'] = pca_dict
-        adata.uns['pearson_residuals_normalization'] = normalization_dict
+        adata.uns['pearson_residuals_normalization'] = norm_dict
         adata.obsm['pearson_residuals_X_pca'] = adata_pca.obsm['X_pca']
         return None
     else:
diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py
index 75b1f42fc0..0f2ce4e994 100644
--- a/scanpy/preprocessing/_recipes.py
+++ b/scanpy/preprocessing/_recipes.py
@@ -173,50 +173,54 @@ def recipe_zheng17(
 
 
 def recipe_pearson_residuals(
-    adata: AnnData, 
+    adata: AnnData,
     n_top_genes: int = 1000,
     theta: float = 100,
-    clip: Union[Literal['auto', 'none'], float] = 'auto',    
+    clip: Union[Literal['auto', 'none'], float] = 'auto',
     chunksize: int = 1000,
     batch_key: Optional[str] = None,
     n_comps_pca: Optional[int] = 50,
     random_state_pca: Optional[float] = 0,
-    inplace: bool = False
-) -> Optional[Tuple[pd.DataFrame,pd.DataFrame]]:
+    inplace: bool = False,
+) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]:
     """\
-    Applies gene selection based on Pearson residuals. On the resulting subset, 
+    Applies gene selection based on Pearson residuals. On the resulting subset,
     Pearson residual normalization and PCA are performed.
-    
-    
+
+
     Parameters
     ----------
     adata
         The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
         to cells and columns to genes.
     n_top_genes
-        Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or 
-        `flavor='pearson_residuals'`.
+        Number of highly-variable genes to keep. Genes are always selected
+        with `pp.highly_variable_genes` using `flavor='pearson_residuals'`.
     chunksize
-        This dertermines how many genes are processed at once while computing the 
-        Pearson residual variance. Choosing a smaller value will reduce the required memory.
+        This determines how many genes are processed at once while computing
+        the Pearson residual variance. Choosing a smaller value will reduce
+        the required memory.
     theta
-        This is the NB overdispersion parameter theta for Pearson residual computations.
-        Higher values correspond to less overdispersion (var = mean + mean^2/theta), and 
-        `theta=np.Inf` corresponds to a Poisson model.
+        This is the NB overdispersion parameter theta for Pearson residual
+        computations. Higher values correspond to less overdispersion
+        (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a
+        Poisson model.
     clip
         This determines if and how Pearson residuals are clipped:
-        
-        * If `'auto'`, residuals are clipped to the interval [-sqrt(n), sqrt(n)],
-        where n is the number of cells in the dataset (default behavior).
+
+        * If `'auto'`, residuals are clipped to the interval
+        [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset
+        (default behavior).
         * If any scalar c, residuals are clipped to the interval [-c, c]. Set
         `clip=np.Inf` for no clipping.
     batch_key
-        If specified, highly-variable genes are selected within each batch separately and merged.
-        This simple process avoids the selection of batch-specific genes and acts as a
-        lightweight batch correction method. For all flavors, genes are first sorted
-        by how many batches they are a HVG. Ties are broken by the median rank (across batches)
+        If specified, highly-variable genes are selected within each batch
+        separately and merged. This simple process avoids the selection of
+        batch-specific genes and acts as a lightweight batch correction
+        method. For all flavors, genes are first sorted by how many batches
+        they are a HVG. Ties are broken by the median rank (across batches)
         based on within-batch residual variance.
-        
+
     n_comps_pca
         Number of principal components to compute.
     random_state_pca
@@ -226,36 +230,42 @@ def recipe_pearson_residuals(
 
     Returns
     -------
-    If `inplace=False`, separately returns the gene selection results (`hvg`) and Pearson 
-    residual-based PCA results (`adata_pca`). If `inplace=True`, updates `adata` with the
-    following fields for gene selection results…:
+    If `inplace=False`, separately returns the gene selection results (`hvg`)
+    and Pearson residual-based PCA results (`adata_pca`). If `inplace=True`,
+    updates `adata` with the following fields for gene selection results…:
 
     `.var['highly_variable']`
         boolean indicator of highly-variable genes
     `.var['means']`
         means per gene
     `.var['variances']`
-        variances per gene 
+        variances per gene
     `.var['residual_variances']`
-        Pearson residual variance per gene. Averaged in the case of multiple batches.
+        Pearson residual variance per gene. Averaged in the case of multiple
+        batches.
     `.var['highly_variable_rank']`
-        Rank of the gene according to residual variance, median rank in the case of multiple batches
+        Rank of the gene according to residual variance, median rank in the
+        case of multiple batches
     `.var['highly_variable_nbatches']`
-        If batch_key is given, this denotes in how many batches genes are detected as HVG
+        If batch_key is given, this denotes in how many batches genes are
+        detected as HVG
     `.var['highly_variable_intersection']`
-        If batch_key is given, this denotes the genes that are highly variable in all batches
-        
-    …and the following fields for Pearson residual-based PCA results and normalization settings:    
-    
+        If batch_key is given, this denotes the genes that are highly variable
+        in all batches
+
+    …and the following fields for Pearson residual-based PCA results and
+    normalization settings:
+
     `.uns['pearson_residuals_normalization']['pearson_residuals_df']`
          The hvg-subset, normalized by Pearson residuals
     `.uns['pearson_residuals_normalization']['theta']`
          The used value of the overdisperion parameter theta
     `.uns['pearson_residuals_normalization']['clip']`
          The used value of the clipping parameter
-    
+
     `.obsm['pearson_residuals_X_pca']`
-        PCA representation of data after gene selection and Pearson residual normalization.
+        PCA representation of data after gene selection and Pearson residual
+        normalization.
     `.uns['pearson_residuals_pca']['PCs']`
          The principal components containing the loadings.
     `.uns['pearson_residuals_pca']['variance_ratio']`
@@ -263,37 +273,40 @@ def recipe_pearson_residuals(
     `.uns['pearson_residuals_pca']['variance']`
          Explained variance, equivalent to the eigenvalues of the
          covariance matrix.
-        
-    
+
     """
-        
-    hvg_args = dict(flavor = 'pearson_residuals',
-                    n_top_genes = n_top_genes,
-                    batch_key = batch_key,
-                    theta = theta,
-                    clip = clip,
-                    chunksize = chunksize)
-    
+
+    hvg_args = dict(
+        flavor='pearson_residuals',
+        n_top_genes=n_top_genes,
+        batch_key=batch_key,
+        theta=theta,
+        clip=clip,
+        chunksize=chunksize,
+    )
+
     if inplace:
-        pp.highly_variable_genes(adata,**hvg_args,inplace = True)
-        adata_pca = adata[:,adata.var['highly_variable']].copy() ##TODO: are these copies needed?
+        pp.highly_variable_genes(adata, **hvg_args, inplace=True)
+        # TODO: are these copies needed?
+        adata_pca = adata[:, adata.var['highly_variable']].copy()
     else:
-        hvg = pp.highly_variable_genes(adata,**hvg_args,inplace = False)
-        adata_pca = adata[:,hvg['highly_variable']].copy()##TODO: are these copies needed?
-    
-    pp.normalize_pearson_residuals(adata_pca,theta = theta,clip = clip)
-    pp.pca(adata_pca,n_comps = n_comps_pca,random_state = random_state_pca)
-    
+        hvg = pp.highly_variable_genes(adata, **hvg_args, inplace=False)
+        # TODO: are these copies needed?
+        adata_pca = adata[:, hvg['highly_variable']].copy()
+
+    pp.normalize_pearson_residuals(adata_pca, theta=theta, clip=clip)
+    pp.pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca)
+
     if inplace:
-        normalization_settings = adata_pca.uns['normalization_pearson_residuals']
-        normalization_dict = dict(**normalization_settings,
-                                pearson_residuals_df = adata_pca.to_df())
-        pca_settings = adata_pca.uns['pca']
-        pca_dict = dict(**pca_settings, 
-                         PCs = adata_pca.varm['PCs'])
+        normalization_param = adata_pca.uns['pearson_residuals_normalization']
+        normalization_dict = dict(
+            **normalization_param, pearson_residuals_df=adata_pca.to_df()
+        )
+        pca_param = adata_pca.uns['pca']
+        pca_dict = dict(**pca_param, PCs=adata_pca.varm['PCs'])
         adata.uns['pearson_residuals_pca'] = pca_dict
         adata.uns['pearson_residuals_normalization'] = normalization_dict
         adata.obsm['pearson_residuals_X_pca'] = adata_pca.obsm['X_pca']
         return None
     else:
-        return adata_pca, hvg
\ No newline at end of file
+        return adata_pca, hvg

From 977b6cf4fdb7487dc12b95b9644b6f832baa520e Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 5 Mar 2021 15:16:50 +0100
Subject: [PATCH 04/96] adapting normalize_pearson_residuals() to cleaned-up
 normalize_total() from #1667

---
 scanpy/preprocessing/_normalization.py | 101 +++++++++----------------
 scanpy/preprocessing/_recipes.py       |   2 +-
 2 files changed, 35 insertions(+), 68 deletions(-)

diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index 1479bcb9b6..f78287e3a0 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -14,6 +14,8 @@
 from scanpy.get import _get_obs_rep, _set_obs_rep
 
 from ._pca import pca
+from scanpy.get import _get_obs_rep, _set_obs_rep
+
 
 
 
@@ -36,6 +38,7 @@ def _normalize_data(X, counts, after=None, copy=False):
 def _pearson_residuals(X, theta, clip, copy=False):
 
     X = X.copy() if copy else X
+    ##TODO can we avoid making this dense?
     X = X.toarray() if issparse(X) else X
 
     # check theta
@@ -70,9 +73,8 @@ def normalize_pearson_residuals(
     adata: AnnData,
     theta: float = 100,
     clip: Union[Literal['auto', 'none'], float] = 'auto',
-    layers: Optional[Union[Literal['all'], Iterable[str]]] = None,
-    theta_per_layer: Optional[Dict[str, str]] = None,
-    clip_per_layer: Optional[Dict[str, Union[Literal['auto', 'none'], float]]] = None,
+    layer: Optional[str] = None,
+    copy: bool=False,
     inplace: bool = True,
 ) -> Optional[Dict[str, np.ndarray]]:
     """\
@@ -98,91 +100,56 @@ def normalize_pearson_residuals(
         * If any scalar c, residuals are clipped to the interval [-c, c]. Set
         `clip=np.Inf` for no clipping.
         
-    layers
-        List of layers to compute Pearson residuals of. Set to `'all'` to
-        compute for all layers.
-    theta_per_layer
-        Dict that specifies which theta is used for each layer:
-
-        * If `None`, the provided `theta` is used for all layers.
-        * Otherwise, each layer with key `layer_key` is processed with the
-        theta value in `theta_per_layer[layer_key]`.
-    clip_per_layer
-        Dict that specifies clipping behavior for each layer :
-
-        * If `None`, the provided `clip` variable is used for all layers.
-        * Otherwise, each layer with key `layer_key` is clipped according to
-          `clip_per_layer[layer_key]`. See `clip` above for possible values.
-          
+    layer
+        Layer to normalize instead of `X`. If `None`, `X` is normalized.
     inplace
         Whether to update `adata` or return dictionary with normalized copies
         of `adata.X` and `adata.layers`.
+    copy
+        Whether to modify copied input object. Not compatible with
+        `inplace=False`.
 
     Returns
     -------
-    Returns dictionary with Pearson residuals of `adata.X` and `adata.layers`
+    Returns dictionary with Pearson residuals and settings
     or updates `adata` with normalized version of the original
     `adata.X` and `adata.layers`, depending on `inplace`.
 
     """
+    
+    if copy:
+        if not inplace:
+            raise ValueError(
+                "`copy=True` cannot be used with `inplace=False`."
+            )
+        adata = adata.copy()
 
-    if layers == 'all':
-        layers = adata.layers.keys()
     # TODO: is this needed and if yes what for?
     # normalize_total() has it so I used it here
+    # TODO: add to other files as well?!
     view_to_actual(adata)
+        
+    X = _get_obs_rep(adata, layer=layer)  ## TODO add to other files as well!
+    computed_on = layer if layer else 'adata.X'
 
-    # Handle X
-    msg = 'computing analytic Pearson residuals for adata.X'
+    msg = 'computing analytic Pearson residuals on %s' % computed_on
     start = logg.info(msg)
+    
+    residuals = _pearson_residuals(X, theta, clip, copy = ~inplace)
+    settings_dict = dict(theta=theta, clip=clip, computed_on=computed_on)
+    
     if inplace:
-        adata.X = _pearson_residuals(adata.X, theta, clip)
-        settings_dict = dict(theta=theta, clip=clip)
-        if theta_per_layer is not None:
-            settings_dict['theta_per_layer'] = theta_per_layer
-        if clip_per_layer is not None:
-            settings_dict['clip_per_layer'] = clip_per_layer
+        _set_obs_rep(adata,residuals,layer=layer)      
         adata.uns['pearson_residuals_normalization'] = settings_dict
-
     else:
-        dat = dict(X=_pearson_residuals(adata.X, theta, clip, copy=True))
-
-    # Handle layers
-    for layer_name in layers or ():
-
-        msg = f'computing analytic Pearson residuals for layer {layer_name}'
-        _ = logg.info(msg)
-
-        # Default to theta/clip if no layer-specific theta/clip given
-        if theta_per_layer is None:
-            layer_theta = theta
-        else:
-            layer_theta = theta_per_layer[layer_name]
-        if clip_per_layer is None:
-            layer_clip = clip
-        else:
-            layer_clip = clip_per_layer[layer_name]
-
-        layer = adata.layers[layer_name]
-
-        if inplace:
-            adata.layers[layer_name] = _pearson_residuals(
-                layer, layer_theta, layer_clip
-            )
-        else:
-            dat[layer_name] = _pearson_residuals(
-                layer, layer_theta, layer_clip, copy=True
-            )
-
-    if not layers is None:
-        adata.uns['pearson_residuals_normalization'] = dict(
-            theta=theta,
-            clip=clip,
-        )
+        results_dict = dict(X=residuals,**settings_dict)
 
     logg.info('    finished ({time_passed})', time=start)
 
-    return dat if not inplace else None
+    if copy:
+        return adata
+    elif not inplace:
+        return results_dict
 
 
 def normalize_pearson_residuals_pca(
@@ -192,7 +159,7 @@ def normalize_pearson_residuals_pca(
     n_comps_pca: Optional[int] = 50,
     random_state_pca: Optional[float] = 0,
     use_highly_variable: bool = True,
-    inplace: bool = False,
+    inplace: bool = True,
 ) -> Optional[pd.DataFrame]:
 
     """\
diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py
index 0f2ce4e994..9cffa0f2de 100644
--- a/scanpy/preprocessing/_recipes.py
+++ b/scanpy/preprocessing/_recipes.py
@@ -181,7 +181,7 @@ def recipe_pearson_residuals(
     batch_key: Optional[str] = None,
     n_comps_pca: Optional[int] = 50,
     random_state_pca: Optional[float] = 0,
-    inplace: bool = False,
+    inplace: bool = True,
 ) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]:
     """\
     Applies gene selection based on Pearson residuals. On the resulting subset,

From d8d724c2d96ac33ab40860b42a91ae4b92c66b8a Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 5 Mar 2021 15:40:33 +0100
Subject: [PATCH 05/96] updating layer management as in #1667 for
 _highly_variable_pearson_residuals() as well

---
 scanpy/preprocessing/_highly_variable_genes.py | 9 ++++++---
 scanpy/preprocessing/_normalization.py         | 8 ++------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 78347b6fc3..9897ed4965 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -184,7 +184,7 @@ def _highly_variable_pearson_residuals(
     clip: Union[Literal['auto', 'none'], float] = 'auto',
     chunksize: int = 100,
     subset: bool = False,
-    inplace: bool = True,
+    inplace: bool = True
 ) -> Optional[pd.DataFrame]:
     """\
     See `highly_variable_genes`.
@@ -214,7 +214,9 @@ def _highly_variable_pearson_residuals(
         in all batches
     """
 
-    X = adata.layers[layer] if layer is not None else adata.X
+    view_to_actual(adata)        
+    X = _get_obs_rep(adata, layer=layer)
+    computed_on = layer if layer else 'adata.X'
 
     # Check for raw counts
     if check_nonnegative_integers(X) is False:
@@ -328,7 +330,8 @@ def _highly_variable_pearson_residuals(
     df = df.loc[adata.var_names]
 
     if inplace or subset:
-        adata.uns['hvg'] = {'flavor': 'pearson_residuals'}
+        adata.uns['hvg'] = {'flavor': 'pearson_residuals',
+                            'computed_on':computed_on}
         logg.hint(
             'added\n'
             '    \'highly_variable\', boolean vector (adata.var)\n'
diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index f78287e3a0..5ff4f32049 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -124,12 +124,8 @@ def normalize_pearson_residuals(
             )
         adata = adata.copy()
 
-    # TODO: is this needed and if yes what for?
-    # normalize_total() has it so I used it here
-    # TODO: add to other files as well?!
-    view_to_actual(adata)
-        
-    X = _get_obs_rep(adata, layer=layer)  ## TODO add to other files as well!
+    view_to_actual(adata)        
+    X = _get_obs_rep(adata, layer=layer)
     computed_on = layer if layer else 'adata.X'
 
     msg = 'computing analytic Pearson residuals on %s' % computed_on

From e23ea6cbbd243838ef0e26bc81be01cb939fd9fe Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 5 Mar 2021 16:07:22 +0100
Subject: [PATCH 06/96] slight performance improvement for sparse input

---
 scanpy/preprocessing/_normalization.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index 5ff4f32049..557f4d1b82 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -38,8 +38,6 @@ def _normalize_data(X, counts, after=None, copy=False):
 def _pearson_residuals(X, theta, clip, copy=False):
 
     X = X.copy() if copy else X
-    ##TODO can we avoid making this dense?
-    X = X.toarray() if issparse(X) else X
 
     # check theta
     if theta <= 0:
@@ -56,12 +54,18 @@ def _pearson_residuals(X, theta, clip, copy=False):
     if check_nonnegative_integers(X) is False:
         raise ValueError("`pp.normalize_pearson_residuals` expects raw count data")
 
-    # get residuals
-    sums_genes = np.sum(X, axis=0, keepdims=True)
-    sums_cells = np.sum(X, axis=1, keepdims=True)
-    sum_total = np.sum(sums_genes)
-    mu = sums_cells @ sums_genes / sum_total
-    residuals = (X - mu) / np.sqrt(mu + mu ** 2 / theta)
+    if sp_sparse.issparse(X):
+        sums_genes = np.sum(X, axis=0)
+        sums_cells = np.sum(X, axis=1)
+        sum_total = np.sum(sums_genes).squeeze()
+    else:
+        sums_genes = np.sum(X, axis=0, keepdims=True)
+        sums_cells = np.sum(X, axis=1, keepdims=True)
+        sum_total = np.sum(sums_genes)
+
+    mu = np.array(sums_cells @ sums_genes / sum_total)
+    diff = np.array(X - mu)
+    residuals = diff / np.sqrt(mu + mu ** 2 / theta)
 
     # clip
     residuals = np.clip(residuals, a_min=-clip, a_max=clip)

From fc49c2580305705250e554a8f7cd3105747d240a Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Wed, 10 Mar 2021 14:57:11 +0100
Subject: [PATCH 07/96] style cleanup

---
 scanpy/preprocessing/__init__.py              | 15 ++++++--
 .../preprocessing/_highly_variable_genes.py   | 38 +++++++++----------
 scanpy/preprocessing/_normalization.py        | 25 +++++-------
 3 files changed, 40 insertions(+), 38 deletions(-)

diff --git a/scanpy/preprocessing/__init__.py b/scanpy/preprocessing/__init__.py
index f1b4dad80c..8adee1f813 100644
--- a/scanpy/preprocessing/__init__.py
+++ b/scanpy/preprocessing/__init__.py
@@ -1,4 +1,9 @@
-from ._recipes import recipe_zheng17, recipe_weinreb17, recipe_seurat, recipe_pearson_residuals
+from ._recipes import (
+    recipe_zheng17,
+    recipe_weinreb17,
+    recipe_seurat,
+    recipe_pearson_residuals,
+)
 from ._simple import filter_cells, filter_genes
 from ._deprecated.highly_variable_genes import filter_genes_dispersion
 from ._highly_variable_genes import highly_variable_genes
@@ -7,6 +12,10 @@
 from ._pca import pca
 from ._qc import calculate_qc_metrics
 from ._combat import combat
-from ._normalization import normalize_total, normalize_pearson_residuals, normalize_pearson_residuals_pca
+from ._normalization import (
+    normalize_total,
+    normalize_pearson_residuals,
+    normalize_pearson_residuals_pca,
+)
 
-from ..neighbors import neighbors
\ No newline at end of file
+from ..neighbors import neighbors
diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 9897ed4965..2c63b6161a 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -175,6 +175,7 @@ def _highly_variable_genes_seurat_v3(
             df = df.drop(['highly_variable_nbatches'], axis=1)
         return df
 
+
 def _highly_variable_pearson_residuals(
     adata: AnnData,
     layer: Optional[str] = None,
@@ -184,7 +185,7 @@ def _highly_variable_pearson_residuals(
     clip: Union[Literal['auto', 'none'], float] = 'auto',
     chunksize: int = 100,
     subset: bool = False,
-    inplace: bool = True
+    inplace: bool = True,
 ) -> Optional[pd.DataFrame]:
     """\
     See `highly_variable_genes`.
@@ -214,7 +215,7 @@ def _highly_variable_pearson_residuals(
         in all batches
     """
 
-    view_to_actual(adata)        
+    view_to_actual(adata)
     X = _get_obs_rep(adata, layer=layer)
     computed_on = layer if layer else 'adata.X'
 
@@ -330,8 +331,7 @@ def _highly_variable_pearson_residuals(
     df = df.loc[adata.var_names]
 
     if inplace or subset:
-        adata.uns['hvg'] = {'flavor': 'pearson_residuals',
-                            'computed_on':computed_on}
+        adata.uns['hvg'] = {'flavor': 'pearson_residuals', 'computed_on': computed_on}
         logg.hint(
             'added\n'
             '    \'highly_variable\', boolean vector (adata.var)\n'
@@ -364,9 +364,7 @@ def _highly_variable_pearson_residuals(
                 ['highly_variable_nbatches', 'highly_variable_intersection'], axis=1
             )
         return df
-    
-    
-    
+
 
 def _highly_variable_genes_single_batch(
     adata: AnnData,
@@ -481,7 +479,6 @@ def _highly_variable_genes_single_batch(
     return df
 
 
-
 def highly_variable_genes(
     adata: AnnData,
     layer: Optional[str] = None,
@@ -495,7 +492,9 @@ def highly_variable_genes(
     theta: float = 100,
     clip: Union[Literal['auto', 'none'], float] = 'auto',
     chunksize: int = 1000,
-    flavor: Literal['seurat', 'cell_ranger', 'seurat_v3', 'pearson_residuals'] = 'seurat',
+    flavor: Literal[
+        'seurat', 'cell_ranger', 'seurat_v3', 'pearson_residuals'
+    ] = 'seurat',
     subset: bool = False,
     inplace: bool = True,
     batch_key: Optional[str] = None,
@@ -658,21 +657,20 @@ def highly_variable_genes(
     if flavor == 'pearson_residuals':
         if n_top_genes is None:
             raise ValueError(
-            "`pp.highly_variable_genes` requires the argument `n_top_genes`"
-            " for `flavor='pearson_residuals'`"
+                "`pp.highly_variable_genes` requires the argument `n_top_genes`"
+                " for `flavor='pearson_residuals'`"
             )
         return _highly_variable_pearson_residuals(
             adata,
-            layer = layer,
-            n_top_genes = n_top_genes,
-            batch_key = batch_key,
-            theta = theta,
-            clip = clip,
-            chunksize= chunksize,
-            subset = subset,
-            inplace = inplace,
+            layer=layer,
+            n_top_genes=n_top_genes,
+            batch_key=batch_key,
+            theta=theta,
+            clip=clip,
+            chunksize=chunksize,
+            subset=subset,
+            inplace=inplace,
         )
-        
 
     if batch_key is None:
         df = _highly_variable_genes_single_batch(
diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index 557f4d1b82..be47494933 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -17,9 +17,6 @@
 from scanpy.get import _get_obs_rep, _set_obs_rep
 
 
-
-
-
 def _normalize_data(X, counts, after=None, copy=False):
     X = X.copy() if copy else X
     if issubclass(X.dtype.type, (int, np.integer)):
@@ -78,7 +75,7 @@ def normalize_pearson_residuals(
     theta: float = 100,
     clip: Union[Literal['auto', 'none'], float] = 'auto',
     layer: Optional[str] = None,
-    copy: bool=False,
+    copy: bool = False,
     inplace: bool = True,
 ) -> Optional[Dict[str, np.ndarray]]:
     """\
@@ -120,29 +117,27 @@ def normalize_pearson_residuals(
     `adata.X` and `adata.layers`, depending on `inplace`.
 
     """
-    
+
     if copy:
         if not inplace:
-            raise ValueError(
-                "`copy=True` cannot be used with `inplace=False`."
-            )
+            raise ValueError("`copy=True` cannot be used with `inplace=False`.")
         adata = adata.copy()
 
-    view_to_actual(adata)        
+    view_to_actual(adata)
     X = _get_obs_rep(adata, layer=layer)
     computed_on = layer if layer else 'adata.X'
 
     msg = 'computing analytic Pearson residuals on %s' % computed_on
     start = logg.info(msg)
-    
-    residuals = _pearson_residuals(X, theta, clip, copy = ~inplace)
+
+    residuals = _pearson_residuals(X, theta, clip, copy=~inplace)
     settings_dict = dict(theta=theta, clip=clip, computed_on=computed_on)
-    
+
     if inplace:
-        _set_obs_rep(adata,residuals,layer=layer)      
+        _set_obs_rep(adata, residuals, layer=layer)
         adata.uns['pearson_residuals_normalization'] = settings_dict
     else:
-        results_dict = dict(X=residuals,**settings_dict)
+        results_dict = dict(X=residuals, **settings_dict)
 
     logg.info('    finished ({time_passed})', time=start)
 
@@ -246,7 +241,7 @@ def normalize_pearson_residuals_pca(
         return None
     else:
         return adata_pca
-    
+
 
 def normalize_total(
     adata: AnnData,

From f91f2fe11a95846506a820e84e6a3106628ce5ce Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 12 Mar 2021 16:18:46 +0100
Subject: [PATCH 08/96] fixing import issue, fixing docstring style, adding
 check_values param and warning as in #1642

---
 .../preprocessing/_highly_variable_genes.py   | 40 ++++++++++---------
 scanpy/preprocessing/_recipes.py              | 18 ++++-----
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 2c63b6161a..7f0387e81f 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -9,7 +9,8 @@
 
 from .. import logging as logg
 from .._settings import settings, Verbosity
-from .._utils import sanitize_anndata, check_nonnegative_integers
+from .._utils import sanitize_anndata, check_nonnegative_integers, view_to_actual
+from scanpy.get import _get_obs_rep, _set_obs_rep
 from .._compat import Literal
 from ._utils import _get_mean_var
 from ._distributed import materialize_as_ndarray
@@ -34,20 +35,20 @@ def _highly_variable_genes_seurat_v3(
     Returns
     -------
     Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) or
-    updates `.var` with the following fields
+    updates `.var` with the following fields:
 
     highly_variable : bool
-        boolean indicator of highly-variable genes
+        boolean indicator of highly-variable genes.
     **means**
-        means per gene
+        means per gene.
     **variances**
-        variance per gene
+        variance per gene.
     **variances_norm**
-        normalized variance per gene, averaged in the case of multiple batches
+        normalized variance per gene, averaged in the case of multiple batches.
     highly_variable_rank : float
-        Rank of the gene according to normalized variance, median rank in the case of multiple batches
+        Rank of the gene according to normalized variance, median rank in the case of multiple batches.
     highly_variable_nbatches : int
-        If batch_key is given, this denotes in how many batches genes are detected as HVG
+        If batch_key is given, this denotes in how many batches genes are detected as HVG.
     """
 
     try:
@@ -184,6 +185,7 @@ def _highly_variable_pearson_residuals(
     theta: float = 100,
     clip: Union[Literal['auto', 'none'], float] = 'auto',
     chunksize: int = 100,
+    check_values: bool = True,
     subset: bool = False,
     inplace: bool = True,
 ) -> Optional[pd.DataFrame]:
@@ -193,26 +195,26 @@ def _highly_variable_pearson_residuals(
     Returns
     -------
     Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`)
-    or updates `.var` with the following fields
+    or updates `.var` with the following fields:
 
     highly_variable
-        boolean indicator of highly-variable genes
+        boolean indicator of highly-variable genes.
     means
-        means per gene
+        means per gene.
     variances
-        variances per gene 
+        variances per gene.
     residual_variances
         Pearson residual variance per gene. Averaged in the case of multiple
         batches.
     highly_variable_rank
         Rank of the gene according to residual variance, median rank in the
-        case of multiple batches
+        case of multiple batches.
     highly_variable_nbatches : int
         If batch_key is given, this denotes in how many batches genes are
-        detected as HVG
+        detected as HVG.
     highly_variable_intersection : bool
         If batch_key is given, this denotes the genes that are highly variable
-        in all batches
+        in all batches.
     """
 
     view_to_actual(adata)
@@ -220,10 +222,10 @@ def _highly_variable_pearson_residuals(
     computed_on = layer if layer else 'adata.X'
 
     # Check for raw counts
-    if check_nonnegative_integers(X) is False:
-        raise ValueError(
-            "`pp.highly_variable_genes` with `flavor='pearson_residuals'`"
-            "expects raw count data."
+    if check_values and (check_nonnegative_integers(X) == False):
+        warnings.warn(
+            "`flavor='pearson_residuals'` expects raw count data, but non-integers were found.",
+            UserWarning,
         )
 
     if batch_key is None:
diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py
index 9cffa0f2de..891bbb55ce 100644
--- a/scanpy/preprocessing/_recipes.py
+++ b/scanpy/preprocessing/_recipes.py
@@ -235,33 +235,33 @@ def recipe_pearson_residuals(
     updates `adata` with the following fields for gene selection results…:
 
     `.var['highly_variable']`
-        boolean indicator of highly-variable genes
+        boolean indicator of highly-variable genes.
     `.var['means']`
-        means per gene
+        means per gene.
     `.var['variances']`
-        variances per gene
+        variances per gene.
     `.var['residual_variances']`
         Pearson residual variance per gene. Averaged in the case of multiple
         batches.
     `.var['highly_variable_rank']`
         Rank of the gene according to residual variance, median rank in the
-        case of multiple batches
+        case of multiple batches.
     `.var['highly_variable_nbatches']`
         If batch_key is given, this denotes in how many batches genes are
-        detected as HVG
+        detected as HVG.
     `.var['highly_variable_intersection']`
         If batch_key is given, this denotes the genes that are highly variable
-        in all batches
+        in all batches.
 
     …and the following fields for Pearson residual-based PCA results and
     normalization settings:
 
     `.uns['pearson_residuals_normalization']['pearson_residuals_df']`
-         The hvg-subset, normalized by Pearson residuals
+         The hvg-subset, normalized by Pearson residuals.
     `.uns['pearson_residuals_normalization']['theta']`
-         The used value of the overdisperion parameter theta
+         The used value of the overdisperion parameter theta.
     `.uns['pearson_residuals_normalization']['clip']`
-         The used value of the clipping parameter
+         The used value of the clipping parameter.
 
     `.obsm['pearson_residuals_X_pca']`
         PCA representation of data after gene selection and Pearson residual

From 60de21d3aaa7c236fe523d2b9ad737391ea4712e Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 12 Mar 2021 16:37:26 +0100
Subject: [PATCH 09/96] fixed small NameError, simplified clip argument

---
 scanpy/preprocessing/_highly_variable_genes.py | 10 +++++-----
 scanpy/preprocessing/_normalization.py         | 14 +++++++-------
 scanpy/preprocessing/_recipes.py               |  4 ++--
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 7f0387e81f..1abc261c68 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -183,7 +183,7 @@ def _highly_variable_pearson_residuals(
     n_top_genes: int = 2000,
     batch_key: Optional[str] = None,
     theta: float = 100,
-    clip: Union[Literal['auto', 'none'], float] = 'auto',
+    clip: Optional[float] = None,
     chunksize: int = 100,
     check_values: bool = True,
     subset: bool = False,
@@ -251,11 +251,11 @@ def _highly_variable_pearson_residuals(
             X_batch = adata_subset.X
 
         # Prepare clipping
-        if clip == 'auto':
+        if clip is None:
             n = X_batch.shape[0]
             clip = np.sqrt(n)
         if clip < 0:
-            raise ValueError("Pearson residuals require `clip>=0` or `clip='auto'`.")
+            raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")
 
         if sp_sparse.issparse(X_batch):
             sums_genes = np.sum(X_batch, axis=0)
@@ -492,7 +492,7 @@ def highly_variable_genes(
     span: Optional[float] = 0.3,
     n_bins: int = 20,
     theta: float = 100,
-    clip: Union[Literal['auto', 'none'], float] = 'auto',
+    clip: Optional[float] = None,
     chunksize: int = 1000,
     flavor: Literal[
         'seurat', 'cell_ranger', 'seurat_v3', 'pearson_residuals'
@@ -564,7 +564,7 @@ def highly_variable_genes(
     clip
         If `flavor='pearson_residuals'`, this determines if and how residuals are clipped:
         
-        * If `'auto'`, residuals are clipped to the interval [-sqrt(n), sqrt(n)],
+        * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)],
         where n is the number of cells in the dataset (default behavior).
         * If any scalar c, residuals are clipped to the interval [-c, c]. Set
         `clip=np.Inf` for no clipping.
diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index be47494933..5d1e65de1b 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -42,16 +42,16 @@ def _pearson_residuals(X, theta, clip, copy=False):
         # then only theta=0 were undefined..
         raise ValueError('Pearson residuals require theta > 0')
     # prepare clipping
-    if clip == 'auto':
+    if clip is None:
         n = X.shape[0]
         clip = np.sqrt(n)
     if clip < 0:
-        raise ValueError("Pearson residuals require `clip>=0` or `clip='auto'`.")
+        raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")
 
     if check_nonnegative_integers(X) is False:
         raise ValueError("`pp.normalize_pearson_residuals` expects raw count data")
 
-    if sp_sparse.issparse(X):
+    if issparse(X):
         sums_genes = np.sum(X, axis=0)
         sums_cells = np.sum(X, axis=1)
         sum_total = np.sum(sums_genes).squeeze()
@@ -73,7 +73,7 @@ def _pearson_residuals(X, theta, clip, copy=False):
 def normalize_pearson_residuals(
     adata: AnnData,
     theta: float = 100,
-    clip: Union[Literal['auto', 'none'], float] = 'auto',
+    clip: Optional[float] = None,
     layer: Optional[str] = None,
     copy: bool = False,
     inplace: bool = True,
@@ -95,7 +95,7 @@ def normalize_pearson_residuals(
     clip
         Determines if and how residuals are clipped:
         
-        * If `'auto'`, residuals are clipped to the interval
+        * If `None`, residuals are clipped to the interval
         [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset
         (default behavior).
         * If any scalar c, residuals are clipped to the interval [-c, c]. Set
@@ -150,7 +150,7 @@ def normalize_pearson_residuals(
 def normalize_pearson_residuals_pca(
     adata: AnnData,
     theta: float = 100,
-    clip: Union[Literal['auto', 'none'], float] = 'auto',
+    clip: Optional[float] = None,
     n_comps_pca: Optional[int] = 50,
     random_state_pca: Optional[float] = 0,
     use_highly_variable: bool = True,
@@ -180,7 +180,7 @@ def normalize_pearson_residuals_pca(
     clip
         This determines if and how Pearson residuals are clipped:
         
-        * If `'auto'`, residuals are clipped to the interval
+        * If `None`, residuals are clipped to the interval
         [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset
         (default behavior).
         * If any scalar c, residuals are clipped to the interval [-c, c]. Set
diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py
index 891bbb55ce..ef56cc48ca 100644
--- a/scanpy/preprocessing/_recipes.py
+++ b/scanpy/preprocessing/_recipes.py
@@ -176,7 +176,7 @@ def recipe_pearson_residuals(
     adata: AnnData,
     n_top_genes: int = 1000,
     theta: float = 100,
-    clip: Union[Literal['auto', 'none'], float] = 'auto',
+    clip: Optional[float] = None,
     chunksize: int = 1000,
     batch_key: Optional[str] = None,
     n_comps_pca: Optional[int] = 50,
@@ -208,7 +208,7 @@ def recipe_pearson_residuals(
     clip
         This determines if and how Pearson residuals are clipped:
 
-        * If `'auto'`, residuals are clipped to the interval
+        * If `None`, residuals are clipped to the interval
         [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset
         (default behavior).
         * If any scalar c, residuals are clipped to the interval [-c, c]. Set

From 1f86989b991c32642fdc63742c4044caa25aa695 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 12 Mar 2021 17:53:02 +0100
Subject: [PATCH 10/96] remove pd.Categorical()

---
 scanpy/preprocessing/_highly_variable_genes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 1abc261c68..83a93eb675 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -229,7 +229,7 @@ def _highly_variable_pearson_residuals(
         )
 
     if batch_key is None:
-        batch_info = pd.Categorical(np.zeros(adata.shape[0], dtype=int))
+        batch_info = np.zeros(adata.shape[0], dtype=int)
     else:
         batch_info = adata.obs[batch_key].values
     n_batches = len(np.unique(batch_info))

From 95ec0e5074c6ba5863315f19388514d3d7c0b5d2 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 12 Mar 2021 18:27:09 +0100
Subject: [PATCH 11/96] adding check_values to docstrings and remaining pearson
 residual functions

---
 .../preprocessing/_highly_variable_genes.py   |  3 ++-
 scanpy/preprocessing/_normalization.py        | 27 +++++++++++++------
 scanpy/preprocessing/_recipes.py              |  8 +++++-
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 83a93eb675..e9ba0609bd 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -593,7 +593,7 @@ def highly_variable_genes(
         based on within-batch residual variance.
     check_values
         Check if counts in selected layer are integers. A Warning is returned if set to True.
-        Only used if `flavor='seurat_v3'`.
+        Only used if `flavor='seurat_v3'` or `flavor='pearson_residuals'`.
 
 
     Returns
@@ -671,6 +671,7 @@ def highly_variable_genes(
             clip=clip,
             chunksize=chunksize,
             subset=subset,
+            check_values=check_values,
             inplace=inplace,
         )
 
diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index 5d1e65de1b..105b5d6aad 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -32,7 +32,7 @@ def _normalize_data(X, counts, after=None, copy=False):
     return X
 
 
-def _pearson_residuals(X, theta, clip, copy=False):
+def _pearson_residuals(X, theta, clip, check_values, copy=False):
 
     X = X.copy() if copy else X
 
@@ -48,8 +48,11 @@ def _pearson_residuals(X, theta, clip, copy=False):
     if clip < 0:
         raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")
 
-    if check_nonnegative_integers(X) is False:
-        raise ValueError("`pp.normalize_pearson_residuals` expects raw count data")
+    if check_values and (check_nonnegative_integers(X) == False):
+        warnings.warn(
+            "`normalize_pearson_residuals()` expects raw count data, but non-integers were found.",
+            UserWarning,
+        )
 
     if issparse(X):
         sums_genes = np.sum(X, axis=0)
@@ -76,6 +79,7 @@ def normalize_pearson_residuals(
     clip: Optional[float] = None,
     layer: Optional[str] = None,
     copy: bool = False,
+    check_values: bool = True,
     inplace: bool = True,
 ) -> Optional[Dict[str, np.ndarray]]:
     """\
@@ -103,12 +107,14 @@ def normalize_pearson_residuals(
         
     layer
         Layer to normalize instead of `X`. If `None`, `X` is normalized.
-    inplace
-        Whether to update `adata` or return dictionary with normalized copies
-        of `adata.X` and `adata.layers`.
     copy
         Whether to modify copied input object. Not compatible with
         `inplace=False`.
+    check_values
+        Check if counts in selected layer are integers. A Warning is returned if set to True.
+    inplace
+        Whether to update `adata` or return dictionary with normalized copies
+        of `adata.X` and `adata.layers`.
 
     Returns
     -------
@@ -130,7 +136,7 @@ def normalize_pearson_residuals(
     msg = 'computing analytic Pearson residuals on %s' % computed_on
     start = logg.info(msg)
 
-    residuals = _pearson_residuals(X, theta, clip, copy=~inplace)
+    residuals = _pearson_residuals(X, theta, clip, check_values, copy=~inplace)
     settings_dict = dict(theta=theta, clip=clip, computed_on=computed_on)
 
     if inplace:
@@ -154,6 +160,7 @@ def normalize_pearson_residuals_pca(
     n_comps_pca: Optional[int] = 50,
     random_state_pca: Optional[float] = 0,
     use_highly_variable: bool = True,
+    check_values: bool = True,
     inplace: bool = True,
 ) -> Optional[pd.DataFrame]:
 
@@ -190,6 +197,8 @@ def normalize_pearson_residuals_pca(
         Number of principal components to compute.
     random_state_pca
         Change to use different initial states for the optimization.
+    check_values
+        Check if counts in selected layer are integers. A Warning is returned if set to True.
     inplace
         Whether to place results in `adata` or return them.
 
@@ -227,7 +236,9 @@ def normalize_pearson_residuals_pca(
         # TODO: are these copies needed?
         adata_pca = adata.copy()
 
-    normalize_pearson_residuals(adata_pca, theta=theta, clip=clip)
+    normalize_pearson_residuals(
+        adata_pca, theta=theta, clip=clip, check_values=check_values
+    )
     pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca)
 
     if inplace:
diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py
index ef56cc48ca..5ecd58adea 100644
--- a/scanpy/preprocessing/_recipes.py
+++ b/scanpy/preprocessing/_recipes.py
@@ -181,6 +181,7 @@ def recipe_pearson_residuals(
     batch_key: Optional[str] = None,
     n_comps_pca: Optional[int] = 50,
     random_state_pca: Optional[float] = 0,
+    check_values: bool = True,
     inplace: bool = True,
 ) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]:
     """\
@@ -225,6 +226,8 @@ def recipe_pearson_residuals(
         Number of principal components to compute.
     random_state_pca
         Change to use different initial states for the optimization.
+    check_values
+        Check if counts in selected layer are integers. A Warning is returned if set to True.
     inplace
         Whether to place results in `adata` or return them.
 
@@ -283,6 +286,7 @@ def recipe_pearson_residuals(
         theta=theta,
         clip=clip,
         chunksize=chunksize,
+        check_values=check_values,
     )
 
     if inplace:
@@ -294,7 +298,9 @@ def recipe_pearson_residuals(
         # TODO: are these copies needed?
         adata_pca = adata[:, hvg['highly_variable']].copy()
 
-    pp.normalize_pearson_residuals(adata_pca, theta=theta, clip=clip)
+    pp.normalize_pearson_residuals(
+        adata_pca, theta=theta, clip=clip, check_values=check_values
+    )
     pp.pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca)
 
     if inplace:

From ff822905ad88be11dee82d760c1c8fdd88066b7f Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 12 Mar 2021 20:05:31 +0100
Subject: [PATCH 12/96] np.empty instead of np.nan

---
 scanpy/preprocessing/_highly_variable_genes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index e9ba0609bd..e971ff52d1 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -267,7 +267,7 @@ def _highly_variable_pearson_residuals(
             sum_total = np.sum(sums_genes)
 
         # Compute pearson residuals in chunks
-        residual_gene_var = np.ones((X_batch.shape[1])) * np.nan
+        residual_gene_var = np.empty((X_batch.shape[1]))
         for start in np.arange(0, X_batch.shape[1], chunksize):
             stop = start + chunksize
             mu = np.array(sums_cells @ sums_genes[:, start:stop] / sum_total)

From f7f7dbdc9490c639233b49708ade4acd9844425a Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Mon, 15 Mar 2021 14:23:20 +0100
Subject: [PATCH 13/96] add references to docstrings, add HVG details to
 docstring

---
 docs/references.rst                            |  4 ++++
 scanpy/preprocessing/_highly_variable_genes.py | 12 +++++++++---
 scanpy/preprocessing/_normalization.py         |  6 ++++++
 scanpy/preprocessing/_recipes.py               |  3 +++
 4 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/docs/references.rst b/docs/references.rst
index 6d05613328..458534a3a3 100644
--- a/docs/references.rst
+++ b/docs/references.rst
@@ -119,6 +119,10 @@ References
    *Laplacian Dynamics and Multiscale Modular Structure in Networks*
    `arXiv <https://arxiv.org/abs/0812.1770>`__.
 
+.. [Lause20] Lause *et al.* (2020)
+   *Analytic Pearson residuals for normalization of single-cell RNA-seq UMI data*,
+   `BioRxiv <https://doi.org/10.1101/2020.12.01.405886>`__.
+
 .. [Leek12] Leek *et al.* (2012),
    *sva: Surrogate Variable Analysis. R package*
    `Bioconductor <https://doi.org/10.18129/B9.bioc.sva>`__.
diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index e971ff52d1..f899eb2f3b 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -505,11 +505,12 @@ def highly_variable_genes(
     """\
     Annotate highly variable genes [Satija15]_ [Zheng17]_ [Stuart19]_.
 
-    Expects logarithmized data, except when `flavor='seurat_v3'` in which
-    count data is expected.
+    Expects logarithmized data, except when `flavor='seurat_v3'` or
+    `flavor='pearson_residuals'`, in which count data is expected.
 
     Depending on `flavor`, this reproduces the R-implementations of Seurat
-    [Satija15]_, Cell Ranger [Zheng17]_, and Seurat v3 [Stuart19]_.
+    [Satija15]_, Cell Ranger [Zheng17]_, and Seurat v3 [Stuart19]_, or uses
+    analytical Peason residuals [Lause20]_.
 
     For the dispersion-based methods ([Satija15]_ and [Zheng17]_), the normalized
     dispersion is obtained by scaling with the mean and standard deviation of
@@ -522,6 +523,11 @@ def highly_variable_genes(
     standard deviation. Next, the normalized variance is computed as the variance
     of each gene after the transformation. Genes are ranked by the normalized variance.
 
+    For [Lause20]_, Pearson residuals of a negative binomial offset model (with
+    overdispersion theta shared across genes) are computed. By default, overdispersion
+    theta=100 is used and residuals are clipped to sqrt(n). Finally, genes are ranked
+    by residual variance.
+
     Parameters
     ----------
     adata
diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index 105b5d6aad..01f051096d 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -87,6 +87,9 @@ def normalize_pearson_residuals(
     model with overdispersion theta shared across genes. By default, residuals
     are clipped to sqrt(n) and overdispersion theta=100 is used.
 
+    Based on "Analytic Pearson residuals for normalization of single-cell
+    RNA-seq UMI data", bioRxiv, [Lause20]_.
+
     Params
     ------
     adata
@@ -169,6 +172,9 @@ def normalize_pearson_residuals_pca(
     subset of highly variable genes in `adata.var['highly_variable']` by
     default.
 
+    This workflow is based on "Analytic Pearson residuals for normalization of
+    single-cell RNA-seq UMI data", bioRxiv, [Lause20]_.
+
 
     Parameters
     ----------
diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py
index 5ecd58adea..c03d2af737 100644
--- a/scanpy/preprocessing/_recipes.py
+++ b/scanpy/preprocessing/_recipes.py
@@ -188,6 +188,9 @@ def recipe_pearson_residuals(
     Applies gene selection based on Pearson residuals. On the resulting subset,
     Pearson residual normalization and PCA are performed.
 
+    This recipe is based on "Analytic Pearson residuals for normalization of
+    single-cell RNA-seq UMI data", bioRxiv, [Lause20]_.
+
 
     Parameters
     ----------

From af0a8255cdfd487a5dd060e90a7ab5e95e8d8bef Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Mon, 15 Mar 2021 15:03:31 +0100
Subject: [PATCH 14/96] exposing pca keyword arguments to the user for the
 bundle/recipe functions

---
 scanpy/preprocessing/_normalization.py | 5 ++++-
 scanpy/preprocessing/_recipes.py       | 7 +++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index 01f051096d..409fa7b7b7 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -163,6 +163,7 @@ def normalize_pearson_residuals_pca(
     n_comps_pca: Optional[int] = 50,
     random_state_pca: Optional[float] = 0,
     use_highly_variable: bool = True,
+    kwargs_pca: Optional[dict] = None,
     check_values: bool = True,
     inplace: bool = True,
 ) -> Optional[pd.DataFrame]:
@@ -203,6 +204,8 @@ def normalize_pearson_residuals_pca(
         Number of principal components to compute.
     random_state_pca
         Change to use different initial states for the optimization.
+    kwargs_pca
+        Dictionary of further keyword arguments passed on to `sc.pp.pca()`.
     check_values
         Check if counts in selected layer are integers. A Warning is returned if set to True.
     inplace
@@ -245,7 +248,7 @@ def normalize_pearson_residuals_pca(
     normalize_pearson_residuals(
         adata_pca, theta=theta, clip=clip, check_values=check_values
     )
-    pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca)
+    pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca, **kwargs_pca)
 
     if inplace:
         norm_settings = adata_pca.uns['pearson_residuals_normalization']
diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py
index c03d2af737..f68978ba41 100644
--- a/scanpy/preprocessing/_recipes.py
+++ b/scanpy/preprocessing/_recipes.py
@@ -181,6 +181,7 @@ def recipe_pearson_residuals(
     batch_key: Optional[str] = None,
     n_comps_pca: Optional[int] = 50,
     random_state_pca: Optional[float] = 0,
+    kwargs_pca: Optional[dict] = None,
     check_values: bool = True,
     inplace: bool = True,
 ) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]:
@@ -229,13 +230,15 @@ def recipe_pearson_residuals(
         Number of principal components to compute.
     random_state_pca
         Change to use different initial states for the optimization.
+    kwargs_pca
+        Dictionary of further keyword arguments passed on to `sc.pp.pca()`.
     check_values
         Check if counts in selected layer are integers. A Warning is returned if set to True.
     inplace
         Whether to place results in `adata` or return them.
 
     Returns
-    -------
+    ------
     If `inplace=False`, separately returns the gene selection results (`hvg`)
     and Pearson residual-based PCA results (`adata_pca`). If `inplace=True`,
     updates `adata` with the following fields for gene selection results…:
@@ -304,7 +307,7 @@ def recipe_pearson_residuals(
     pp.normalize_pearson_residuals(
         adata_pca, theta=theta, clip=clip, check_values=check_values
     )
-    pp.pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca)
+    pp.pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca, **kwargs_pca)
 
     if inplace:
         normalization_param = adata_pca.uns['pearson_residuals_normalization']

From 142eaca0938dc153909dd6b2a7f06d3c4f5eadb1 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Mon, 15 Mar 2021 16:15:19 +0100
Subject: [PATCH 15/96] removed unneeded reversal in hvg, fix kwargs_pca bug,
 consistent defaults across files

---
 scanpy/preprocessing/_highly_variable_genes.py | 10 ++++++----
 scanpy/preprocessing/_normalization.py         |  2 +-
 scanpy/preprocessing/_recipes.py               |  2 +-
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index f899eb2f3b..34a8e2ca5e 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -180,7 +180,7 @@ def _highly_variable_genes_seurat_v3(
 def _highly_variable_pearson_residuals(
     adata: AnnData,
     layer: Optional[str] = None,
-    n_top_genes: int = 2000,
+    n_top_genes: int = 1000,
     batch_key: Optional[str] = None,
     theta: float = 100,
     clip: Optional[float] = None,
@@ -287,9 +287,11 @@ def _highly_variable_pearson_residuals(
     residual_gene_vars = np.concatenate(residual_gene_vars, axis=0)
 
     # Get cutoffs and define hvgs per batch
-    residual_gene_vars_sorted = np.sort(residual_gene_vars, axis=1)[:, ::-1]
-    cutoffs_per_batch = residual_gene_vars_sorted[:, n_top_genes]
-    highly_variable_per_batch = np.greater(residual_gene_vars.T, cutoffs_per_batch).T
+    residual_gene_vars_sorted = np.sort(residual_gene_vars, axis=1)
+    cutoffs_per_batch = residual_gene_vars_sorted[:, -n_top_genes]
+    highly_variable_per_batch = np.greater_equal(
+        residual_gene_vars.T, cutoffs_per_batch
+    ).T
 
     # Merge hvgs across batches
     highly_variable_nbatches = np.sum(highly_variable_per_batch, axis=0)
diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index 409fa7b7b7..ffba1e771f 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -163,7 +163,7 @@ def normalize_pearson_residuals_pca(
     n_comps_pca: Optional[int] = 50,
     random_state_pca: Optional[float] = 0,
     use_highly_variable: bool = True,
-    kwargs_pca: Optional[dict] = None,
+    kwargs_pca: Optional[dict] = {},
     check_values: bool = True,
     inplace: bool = True,
 ) -> Optional[pd.DataFrame]:
diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py
index f68978ba41..7393e1a4ac 100644
--- a/scanpy/preprocessing/_recipes.py
+++ b/scanpy/preprocessing/_recipes.py
@@ -181,7 +181,7 @@ def recipe_pearson_residuals(
     batch_key: Optional[str] = None,
     n_comps_pca: Optional[int] = 50,
     random_state_pca: Optional[float] = 0,
-    kwargs_pca: Optional[dict] = None,
+    kwargs_pca: dict = {},
     check_values: bool = True,
     inplace: bool = True,
 ) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]:

From 541b252deab0f99f50085d9802e810eb5bb46b47 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 11 Jun 2021 17:52:13 +0200
Subject: [PATCH 16/96] fixing handling of `inplace` and `subset` arguments
 (see issue #1886), explicit typing of output, adding theta input check

---
 .../preprocessing/_highly_variable_genes.py   | 34 ++++++++++++-------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 34a8e2ca5e..fe9277f3b7 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -197,18 +197,18 @@ def _highly_variable_pearson_residuals(
     Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`)
     or updates `.var` with the following fields:
 
-    highly_variable
+    highly_variable : bool
         boolean indicator of highly-variable genes.
-    means
+    means : float
         means per gene.
-    variances
+    variances : float
         variances per gene.
-    residual_variances
+    residual_variances : float
         Pearson residual variance per gene. Averaged in the case of multiple
         batches.
-    highly_variable_rank
+    highly_variable_rank : float
         Rank of the gene according to residual variance, median rank in the
-        case of multiple batches.
+        case of multiple batches. NaN for non-HVGs.
     highly_variable_nbatches : int
         If batch_key is given, this denotes in how many batches genes are
         detected as HVG.
@@ -227,6 +227,12 @@ def _highly_variable_pearson_residuals(
             "`flavor='pearson_residuals'` expects raw count data, but non-integers were found.",
             UserWarning,
         )
+    # check theta
+    if theta <= 0:
+        # TODO: would "underdispersion" with negative theta make sense?
+        # then only theta=0 were undefined..
+        raise ValueError('Pearson residuals require theta > 0')
+    # prepare clipping
 
     if batch_key is None:
         batch_info = np.zeros(adata.shape[0], dtype=int)
@@ -312,9 +318,11 @@ def _highly_variable_pearson_residuals(
         dict(
             means=means,
             variances=variances,
-            residual_variances=np.mean(residual_gene_vars, axis=0),
+            residual_variances=np.mean(residual_gene_vars, axis=0).astype(
+                np.float32, copy=False
+            ),
             highly_variable_rank=medianrank_residual_var,
-            highly_variable_nbatches=highly_variable_nbatches,
+            highly_variable_nbatches=highly_variable_nbatches.astype(np.int64),
             highly_variable_intersection=highly_variable_intersection,
         )
     )
@@ -334,7 +342,7 @@ def _highly_variable_pearson_residuals(
     # (also for flavor = seurat and cellranger..)
     df = df.loc[adata.var_names]
 
-    if inplace or subset:
+    if inplace:
         adata.uns['hvg'] = {'flavor': 'pearson_residuals', 'computed_on': computed_on}
         logg.hint(
             'added\n'
@@ -350,9 +358,8 @@ def _highly_variable_pearson_residuals(
         adata.var['highly_variable_rank'] = df['highly_variable_rank'].values
         adata.var['means'] = df['means'].values
         adata.var['variances'] = df['variances'].values
-        adata.var['residual_variances'] = df['residual_variances'].values.astype(
-            'float64', copy=False
-        )
+        adata.var['residual_variances'] = df['residual_variances']
+
         if batch_key is not None:
             adata.var['highly_variable_nbatches'] = df[
                 'highly_variable_nbatches'
@@ -367,6 +374,9 @@ def _highly_variable_pearson_residuals(
             df = df.drop(
                 ['highly_variable_nbatches', 'highly_variable_intersection'], axis=1
             )
+        if subset:
+            df = df.iloc[df.highly_variable.values, :]
+
         return df
 
 

From fdd500be232260ad9e5c56c53813ea3a305edac8 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 11 Jun 2021 17:59:26 +0200
Subject: [PATCH 17/96] renaming output fields for consistency, fixing minor
 bug

---
 scanpy/preprocessing/_normalization.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index ffba1e771f..1f55e179e6 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -49,7 +49,7 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False):
         raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")
 
     if check_values and (check_nonnegative_integers(X) == False):
-        warnings.warn(
+        warn(
             "`normalize_pearson_residuals()` expects raw count data, but non-integers were found.",
             UserWarning,
         )
@@ -225,7 +225,7 @@ def normalize_pearson_residuals_pca(
     `.uns['pearson_residuals_normalization']['clip']`
          The used value of the clipping parameter
     
-    `.obsm['pearson_residuals_X_pca']`
+    `.obsm['X_pearson_residuals_pca']`
         PCA representation of data after gene selection and Pearson residual
         normalization.
     `.uns['pearson_residuals_pca']['PCs']`
@@ -257,9 +257,13 @@ def normalize_pearson_residuals_pca(
         pca_dict = dict(**pca_settings, PCs=adata_pca.varm['PCs'])
         adata.uns['pearson_residuals_pca'] = pca_dict
         adata.uns['pearson_residuals_normalization'] = norm_dict
-        adata.obsm['pearson_residuals_X_pca'] = adata_pca.obsm['X_pca']
+        adata.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca']
         return None
     else:
+        adata_pca.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'].copy()
+        adata_pca.uns['pearson_residuals_pca'] = adata_pca.uns['pca'].copy()
+        del adata_pca.obsm['X_pca']
+        del adata_pca.uns['pca']
         return adata_pca
 
 

From c6dfc1de5af8c7f000d9f5f3160b16be0b5395ef Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 11 Jun 2021 17:59:59 +0200
Subject: [PATCH 18/96] renaming output fields for consistency

---
 scanpy/preprocessing/_recipes.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py
index 7393e1a4ac..6d61c24bec 100644
--- a/scanpy/preprocessing/_recipes.py
+++ b/scanpy/preprocessing/_recipes.py
@@ -318,7 +318,11 @@ def recipe_pearson_residuals(
         pca_dict = dict(**pca_param, PCs=adata_pca.varm['PCs'])
         adata.uns['pearson_residuals_pca'] = pca_dict
         adata.uns['pearson_residuals_normalization'] = normalization_dict
-        adata.obsm['pearson_residuals_X_pca'] = adata_pca.obsm['X_pca']
+        adata.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca']
         return None
     else:
+        adata_pca.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'].copy()
+        adata_pca.uns['pearson_residuals_pca'] = adata_pca.uns['pca'].copy()
+        del adata_pca.obsm['X_pca']
+        del adata_pca.uns['pca']
         return adata_pca, hvg

From dc27c9f6c9caef7ae4df0447c9fc8f5ead462411 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 11 Jun 2021 18:01:36 +0200
Subject: [PATCH 19/96] adding function that prepares testdata (used for
 pearson residual tests)

---
 scanpy/tests/helpers.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/scanpy/tests/helpers.py b/scanpy/tests/helpers.py
index 61fc35e23e..b7e97c36dd 100644
--- a/scanpy/tests/helpers.py
+++ b/scanpy/tests/helpers.py
@@ -83,3 +83,26 @@ def check_rep_results(func, X, *, fields=["layer", "obsm"], **kwargs):
         assert_equal(adatas_proc[field_a], adatas_proc[field_b])
     for field in fields:
         assert_equal(adata_X, adatas_proc[field])
+
+
+def _prepare_pbmc_testdata(sparsity_func, dtype, small=False):
+    """Prepares 3k PBMC dataset with batch key `batch` and defined datatype/sparsity.
+
+    Params
+    ------
+    sparsity_func
+        sparsity function applied to adata.X (e.g. csr_matrix.toarray for dense or csr_matrix for sparse)
+    dtype
+        numpy dtype applied to adata.X (e.g.  'float32' or 'int64')
+    small
+        False (default) returns full data, True returns small subset of the data."""
+
+    adata = sc.datasets.pbmc3k()
+    if small:
+        adata = adata[:1000, :500]
+        sc.pp.filter_cells(adata, min_genes=1)
+    np.random.seed(42)
+    adata.obs['batch'] = np.random.randint(0, 3, size=adata.shape[0])
+    sc.pp.filter_genes(adata, min_cells=1)
+    adata.X = sparsity_func(adata.X.astype(dtype))
+    return adata

From aef44d8b1f8106976e6ceaca0fbe1a8e4a730220 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 11 Jun 2021 18:02:55 +0200
Subject: [PATCH 20/96] adding tests for all pearson residual functions

---
 scanpy/tests/test_highly_variable_genes.py | 270 +++++++++++++++++++
 scanpy/tests/test_normalization.py         | 293 ++++++++++++++++++++-
 2 files changed, 562 insertions(+), 1 deletion(-)

diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py
index 8b3e4f52c2..69703376cf 100644
--- a/scanpy/tests/test_highly_variable_genes.py
+++ b/scanpy/tests/test_highly_variable_genes.py
@@ -3,6 +3,9 @@
 import numpy as np
 import scanpy as sc
 from pathlib import Path
+from scipy.sparse import csr_matrix
+from scanpy.tests.helpers import _prepare_pbmc_testdata
+import warnings
 
 FILE = Path(__file__).parent / Path('_scripts/seurat_hvg.csv')
 FILE_V3 = Path(__file__).parent / Path('_scripts/seurat_hvg_v3.csv.gz')
@@ -54,6 +57,273 @@ def test_highly_variable_genes_basic():
     assert np.all(np.isin(colnames, hvg_df.columns))
 
 
+def _residual_var_reference(adata, clip=None, theta=100):
+    sc.pp.normalize_pearson_residuals(adata, clip=clip, theta=theta)  # presumably writes residuals to adata.X; callers pass a copy
+    residuals = adata.X
+    return np.var(residuals, axis=0)  # reference values: variance along axis 0 (one per column)
+
+
+@pytest.mark.parametrize(
+    'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__
+)
+@pytest.mark.parametrize('dtype', ['float32', 'int64'])
+def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtype):
+    """Check the non-integer warning and the ValueErrors for invalid `theta`/`clip`."""
+    adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True)
+
+    # depending on check_values, warnings should be raised for non-integer data
+    if dtype == 'float32':
+
+        adata_noninteger = adata.copy()
+        x, y = np.nonzero(adata_noninteger.X)
+        adata_noninteger.X[x[0], y[0]] = 0.5
+        nonint_warn_msg = "`flavor='pearson_residuals'` expects raw count data, but non-integers were found."
+
+        # expecting 0 non-integer warnings
+        with warnings.catch_warnings(record=True) as record:
+            sc.pp.highly_variable_genes(
+                adata_noninteger.copy(),
+                flavor='pearson_residuals',
+                n_top_genes=100,
+                check_values=False,
+            )
+        nonint_warnings = [
+            warning.message.args[0] == nonint_warn_msg for warning in record
+        ]
+        assert np.sum(nonint_warnings) == 0
+
+        # expecting 1 non-integer warning
+        with warnings.catch_warnings(record=True) as record:
+            sc.pp.highly_variable_genes(
+                adata_noninteger.copy(),
+                flavor='pearson_residuals',
+                n_top_genes=100,
+                check_values=True,
+            )
+        nonint_warnings = np.array(
+            [warning.message.args[0] == nonint_warn_msg for warning in record]
+        )
+        assert np.sum(nonint_warnings) == 1
+
+    # errors should be raised for invalid theta values
+    with pytest.raises(ValueError):
+        sc.pp.highly_variable_genes(
+            adata.copy(), flavor='pearson_residuals', n_top_genes=100, theta=0
+        )
+    with pytest.raises(ValueError):
+        sc.pp.highly_variable_genes(
+            adata.copy(), flavor='pearson_residuals', n_top_genes=100, theta=-1
+        )
+
+    # error should be raised for invalid clipping values
+    with pytest.raises(ValueError):
+        sc.pp.highly_variable_genes(
+            adata.copy(), flavor='pearson_residuals', n_top_genes=100, clip=-1
+        )
+
+
+@pytest.mark.parametrize(
+    'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__
+)
+@pytest.mark.parametrize('dtype', ['float32', 'int64'])
+@pytest.mark.parametrize('subset', [True, False])
+@pytest.mark.parametrize('inplace', [True, False])
+@pytest.mark.parametrize('clip', [None, np.inf, 30])
+@pytest.mark.parametrize('theta', [100, np.inf])
+def test_highly_variable_genes_pearson_residuals_values(
+    subset, inplace, sparsity_func, dtype, clip, theta
+):
+    """Compare reported `residual_variances` with `_residual_var_reference`."""
+    n_top_genes = 100
+    adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True)
+    # compute reference output
+    residual_variances_reference = _residual_var_reference(
+        adata.copy(), clip=clip, theta=theta
+    )
+    if subset:
+        # lazily sort by residual variance and take top N
+        top_n_idx = np.argsort(-residual_variances_reference)[:n_top_genes]
+        # (results in sorted "gene order" in reference)
+        residual_variances_reference = residual_variances_reference[top_n_idx]
+    # compute output to be tested
+    output = sc.pp.highly_variable_genes(
+        adata,
+        flavor='pearson_residuals',
+        n_top_genes=n_top_genes,
+        subset=subset,
+        inplace=inplace,
+        clip=clip,
+        theta=theta,
+    )
+
+    # depending on inplace, check adata.var or output
+    if inplace:
+        assert output is None
+        output_df = adata.var
+    else:
+        output_df = output
+
+    # consistency with normalization method
+    if subset:
+        # sort values before comparing as reference is sorted as well for subset case
+        sort_output_idx = np.argsort(-output_df['residual_variances'].values)
+        assert np.allclose(
+            output_df['residual_variances'].values[sort_output_idx],
+            residual_variances_reference,
+        )
+    else:
+        assert np.allclose(
+            output_df['residual_variances'].values, residual_variances_reference
+        )
+
+
+@pytest.mark.parametrize(
+    'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__
+)
+@pytest.mark.parametrize('dtype', ['float32', 'int64'])
+@pytest.mark.parametrize('subset', [True, False])
+@pytest.mark.parametrize('inplace', [True, False])
+def test_highly_variable_genes_pearson_residuals_general(
+    subset,
+    inplace,
+    sparsity_func,
+    dtype,
+):
+    """Check output columns, dtypes, HVG flags and ranks on the full test data."""
+    n_top_genes = 1000
+
+    adata = _prepare_pbmc_testdata(sparsity_func, dtype)
+    # compute reference output
+    residual_variances_reference = _residual_var_reference(adata.copy())
+    if subset:
+        # lazily sort by residual variance and take top N
+        top_n_idx = np.argsort(-residual_variances_reference)[:n_top_genes]
+        # (results in sorted "gene order" in reference)
+        residual_variances_reference = residual_variances_reference[top_n_idx]
+    # compute output to be tested
+    output = sc.pp.highly_variable_genes(
+        adata,
+        flavor='pearson_residuals',
+        n_top_genes=n_top_genes,
+        subset=subset,
+        inplace=inplace,
+    )
+
+    # depending on inplace, check adata.var or output
+    if inplace:
+        assert output is None
+        output_df = adata.var
+    else:
+        output_df = output
+
+    # check output is complete
+    for key in [
+        'highly_variable',
+        'means',
+        'variances',
+        'residual_variances',
+        'highly_variable_rank',
+    ]:
+        assert key in output_df.keys()
+
+    # check residual variances
+    assert output_df['residual_variances'].values.dtype is np.dtype('float32')
+    # consistency with normalization method
+    if subset:
+        # sort values before comparing as reference is sorted as well for subset case
+        sort_output_idx = np.argsort(-output_df['residual_variances'].values)
+        assert np.allclose(
+            output_df['residual_variances'].values[sort_output_idx],
+            residual_variances_reference,
+        )
+    else:
+        assert np.allclose(
+            output_df['residual_variances'].values, residual_variances_reference
+        )
+
+    # check hvg flag: flagged genes must be exactly the top-N by residual variance
+    assert output_df['highly_variable'].values.dtype is np.dtype('bool')
+    assert np.sum(output_df['highly_variable']) == n_top_genes
+    hvg_idx = np.where(output_df['highly_variable'])[0]
+    topn_idx = np.sort(
+        np.argsort(-output_df['residual_variances'].values)[:n_top_genes]
+    )
+    assert np.all(hvg_idx == topn_idx)
+
+    # check ranks (nan-aware min/max, so NaN entries are ignored)
+    assert np.nanmin(output_df['highly_variable_rank'].values) == 0
+    assert np.nanmax(output_df['highly_variable_rank'].values) <= n_top_genes - 1
+
+
+@pytest.mark.parametrize(
+    'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__
+)
+@pytest.mark.parametrize('dtype', ['float32', 'int64'])
+@pytest.mark.parametrize('subset', [True, False])
+@pytest.mark.parametrize('inplace', [True, False])
+def test_highly_variable_genes_pearson_residuals_batch(
+    subset, inplace, sparsity_func, dtype
+):
+    """Check output columns, dtypes and value ranges when `batch_key='batch'`."""
+    n_top_genes = 1000
+
+    adata = _prepare_pbmc_testdata(sparsity_func, dtype)
+    n_genes = adata.shape[1]
+
+    output = sc.pp.highly_variable_genes(
+        adata,
+        flavor='pearson_residuals',
+        n_top_genes=n_top_genes,
+        batch_key='batch',
+        subset=subset,
+        inplace=inplace,
+    )
+
+    # depending on inplace, check adata.var or output
+    if inplace:
+        assert output is None
+        output_df = adata.var
+    else:
+        output_df = output
+
+    # check output is complete
+    for key in [
+        'highly_variable',
+        'means',
+        'variances',
+        'residual_variances',
+        'highly_variable_rank',
+        'highly_variable_nbatches',
+        'highly_variable_intersection',
+    ]:
+        assert key in output_df.keys()
+
+    # check hvg flag
+    assert output_df['highly_variable'].values.dtype is np.dtype('bool')
+    assert np.sum(output_df['highly_variable']) == n_top_genes
+
+    # check intersection flag (every intersection gene must also be flagged as HVG)
+    nbatches = len(np.unique(adata.obs['batch']))
+    assert output_df['highly_variable_intersection'].values.dtype is np.dtype('bool')
+    assert np.sum(output_df['highly_variable_intersection']) <= n_top_genes * nbatches
+    assert np.all(output_df['highly_variable'][output_df.highly_variable_intersection])
+
+    # check ranks (with batch_key these are the median of within-batch ranks)
+    assert output_df['highly_variable_rank'].values.dtype is np.dtype('float32')
+    assert np.nanmax(output_df['highly_variable_rank'].values) <= n_top_genes - 1
+
+    # check nbatches
+    assert output_df['highly_variable_nbatches'].values.dtype is np.dtype('int')
+    assert np.min(output_df['highly_variable_nbatches'].values) >= 0
+    assert np.max(output_df['highly_variable_nbatches'].values) <= nbatches
+
+    # check subsetting
+    if subset:
+        assert len(output_df) == n_top_genes
+    else:
+        assert len(output_df) == n_genes
+
+
 def test_higly_variable_genes_compare_to_seurat():
     seurat_hvg_info = pd.read_csv(FILE, sep=' ')
 
diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py
index 0f5dbb102d..b840702fec 100644
--- a/scanpy/tests/test_normalization.py
+++ b/scanpy/tests/test_normalization.py
@@ -3,11 +3,17 @@
 from anndata import AnnData
 from scipy.sparse import csr_matrix
 from scipy import sparse
+import warnings
 
 import scanpy as sc
-from scanpy.tests.helpers import check_rep_mutation, check_rep_results
+from scanpy.tests.helpers import (
+    check_rep_mutation,
+    check_rep_results,
+    _prepare_pbmc_testdata,
+)
 from anndata.tests.helpers import assert_equal, asarray
 
+
 X_total = [[1, 0], [3, 0], [5, 6]]
 X_frac = [[1, 0, 1], [3, 0, 1], [5, 6, 1]]
 
@@ -56,3 +62,288 @@ def test_normalize_total_view(typ, dtype):
 
     assert not v.is_view
     assert_equal(adata, v)
+
+
+@pytest.mark.parametrize(
+    'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__
+)
+@pytest.mark.parametrize('dtype', ['float32', 'int64'])
+def test_normalize_pearson_residuals_inputchecks(sparsity_func, dtype):
+    """Check the non-integer warning and the ValueErrors for invalid `theta`/`clip`."""
+    adata = _prepare_pbmc_testdata(sparsity_func, dtype)
+
+    # depending on check_values, warnings should be raised for non-integer data
+    if dtype == 'float32':
+
+        adata_noninteger = adata.copy()
+        x, y = np.nonzero(adata_noninteger.X)
+        adata_noninteger.X[x[0], y[0]] = 0.5
+        nonint_warn_msg = "`normalize_pearson_residuals()` expects raw count data, but non-integers were found."
+
+        # expecting 0 non-integer warnings
+        with warnings.catch_warnings(record=True) as record:
+            sc.pp.normalize_pearson_residuals(
+                adata_noninteger.copy(), check_values=False
+            )
+        nonint_warnings = [
+            warning.message.args[0] == nonint_warn_msg for warning in record
+        ]
+        assert np.sum(nonint_warnings) == 0
+
+        # expecting 1 non-integer warning
+        with warnings.catch_warnings(record=True) as record:
+            sc.pp.normalize_pearson_residuals(
+                adata_noninteger.copy(), check_values=True
+            )
+        nonint_warnings = np.array(
+            [warning.message.args[0] == nonint_warn_msg for warning in record]
+        )
+        assert np.sum(nonint_warnings) == 1
+
+    # errors should be raised for invalid theta values
+    with pytest.raises(ValueError):
+        sc.pp.normalize_pearson_residuals(adata.copy(), theta=0)
+    with pytest.raises(ValueError):
+        sc.pp.normalize_pearson_residuals(adata.copy(), theta=-1)
+
+    # error should be raised for invalid clipping values
+    with pytest.raises(ValueError):
+        sc.pp.normalize_pearson_residuals(adata.copy(), clip=-1)
+
+
+@pytest.mark.parametrize(
+    'sparsity_func', [np.array, csr_matrix], ids=lambda x: x.__name__
+)
+@pytest.mark.parametrize('dtype', ['float32', 'int64'])
+@pytest.mark.parametrize('theta', [0.01, 1, 100, np.inf])
+@pytest.mark.parametrize('clip', [None, 1, np.inf])
+@pytest.mark.parametrize('inplace', [True, False])
+def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip, inplace):
+    """Compare residuals on a 3x2 toy matrix with hand-computed NB/Poisson values."""
+    # toy data
+    X = np.array([[3, 6], [2, 4], [1, 0]])
+    ns = np.sum(X, axis=1)
+    ps = np.sum(X, axis=0) / np.sum(X)
+    mu = np.outer(ns, ps)
+
+    # compute reference residuals
+    if np.isinf(theta):
+        # Poisson case
+        residuals_reference = (X - mu) / np.sqrt(mu)
+    else:
+        # NB case
+        residuals_reference = (X - mu) / np.sqrt(mu + mu ** 2 / theta)
+
+    # compute output to test
+    adata = AnnData(sparsity_func(X), dtype=dtype)
+    output = sc.pp.normalize_pearson_residuals(
+        adata, theta=theta, clip=clip, inplace=inplace
+    )
+
+    # handle and test inplace argument
+    if inplace:
+        output_X = adata.X
+        assert output is None
+        # check for correct new `adata.uns` keys
+        assert np.all(
+            np.isin(['pearson_residuals_normalization'], list(adata.uns.keys()))
+        )
+        assert np.all(
+            np.isin(
+                ['theta', 'clip', 'computed_on'],
+                list(adata.uns['pearson_residuals_normalization'].keys()),
+            )
+        )
+
+    else:
+        output_X = output['X']
+
+    if clip is None:
+        # default clipping: compare to sqrt(n) threshold
+        clipping_threshold = np.sqrt(adata.shape[0]).astype(np.float32)
+        assert np.max(output_X) <= clipping_threshold
+        assert np.min(output_X) >= -clipping_threshold
+    elif np.isinf(clip):
+        # no clipping: compare to raw residuals
+        assert np.allclose(output_X, residuals_reference)
+    else:
+        # custom clipping: compare to custom threshold
+        assert np.max(output_X) <= clip
+        assert np.min(output_X) >= -clip
+
+
+@pytest.mark.parametrize(
+    'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__
+)
+@pytest.mark.parametrize('dtype', ['float32', 'int64'])
+def test_normalize_pearson_residuals_pca(sparsity_func, dtype):
+    """Check keys and shapes of `normalize_pearson_residuals_pca` output, with/without HVGs."""
+    adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True)
+    n_cells = adata.shape[0]
+    n_genes = adata.shape[1]
+    n_hvgs = 100
+    n_comps_pca = 50
+    adata_with_hvgs = adata.copy()
+    sc.pp.highly_variable_genes(
+        adata_with_hvgs, flavor='pearson_residuals', n_top_genes=n_hvgs
+    )
+    adata_not_using_hvgs = adata_with_hvgs.copy()
+
+    ### inplace = False ###
+    # outputs the (potentially hvg-restricted) adata_pca object
+    # PCA on all genes
+    adata_pca = sc.pp.normalize_pearson_residuals_pca(
+        adata.copy(), inplace=False, n_comps_pca=n_comps_pca
+    )
+    # PCA on hvgs only
+    adata_pca_with_hvgs = sc.pp.normalize_pearson_residuals_pca(
+        adata_with_hvgs.copy(), inplace=False, n_comps_pca=n_comps_pca
+    )
+    # PCA again on all genes (hvg use suppressed)
+    adata_pca_not_using_hvgs = sc.pp.normalize_pearson_residuals_pca(
+        adata_not_using_hvgs.copy(),
+        inplace=False,
+        n_comps_pca=n_comps_pca,
+        use_highly_variable=False,
+    )
+
+    # for all three cases, check adata_pca keys are complete
+    for ad in [adata_pca, adata_pca_with_hvgs, adata_pca_not_using_hvgs]:
+        assert np.all(
+            np.isin(
+                ['pearson_residuals_normalization', 'pearson_residuals_pca'],
+                list(ad.uns.keys()),
+            )
+        )
+        assert np.all(np.isin(['X_pearson_residuals_pca'], list(ad.obsm.keys())))
+        assert np.all(np.isin(['PCs'], list(ad.varm.keys())))
+        assert ad.obsm['X_pearson_residuals_pca'].shape == (n_cells, n_comps_pca)
+
+    # check adata shape to see if all genes or only HVGs are in the returned adata
+    assert adata_pca.shape == (n_cells, n_genes)
+    assert adata_pca_with_hvgs.shape == (n_cells, n_hvgs)  # only HVGs retained
+    assert adata_pca_not_using_hvgs.shape == (n_cells, n_genes)
+
+    # check PC shapes to see whether or not HVGs were used for PCA
+    assert adata_pca.varm['PCs'].shape == (n_genes, n_comps_pca)
+    assert adata_pca_with_hvgs.varm['PCs'].shape == (
+        n_hvgs,
+        n_comps_pca,
+    )  # only HVGs used
+    assert adata_pca_not_using_hvgs.varm['PCs'].shape == (n_genes, n_comps_pca)
+
+    ### inplace = True ###
+    # modifies the input adata object
+    # PCA on all genes
+    sc.pp.normalize_pearson_residuals_pca(adata, inplace=True, n_comps_pca=n_comps_pca)
+    # PCA on hvgs only
+    sc.pp.normalize_pearson_residuals_pca(
+        adata_with_hvgs, inplace=True, n_comps_pca=n_comps_pca
+    )
+    # PCA again on all genes (hvg use suppressed)
+    sc.pp.normalize_pearson_residuals_pca(
+        adata_not_using_hvgs,
+        inplace=True,
+        n_comps_pca=n_comps_pca,
+        use_highly_variable=False,
+    )
+
+    for ad in [adata, adata_with_hvgs, adata_not_using_hvgs]:
+        # check adata_pca keys are complete
+        assert np.all(
+            np.isin(
+                ['pearson_residuals_normalization', 'pearson_residuals_pca'],
+                list(ad.uns.keys()),
+            )
+        )
+        assert np.all(np.isin(['X_pearson_residuals_pca'], list(ad.obsm.keys())))
+        # check shapes: adata should always retain its original shape
+        assert ad.shape == (n_cells, n_genes)
+        assert ad.obsm['X_pearson_residuals_pca'].shape == (n_cells, n_comps_pca)
+
+    # check PC shapes to see whether or not HVGs were used for PCA
+    assert adata.uns['pearson_residuals_pca']['PCs'].shape == (n_genes, n_comps_pca)
+    assert adata_with_hvgs.uns['pearson_residuals_pca']['PCs'].shape == (
+        n_hvgs,
+        n_comps_pca,
+    )
+    assert adata_not_using_hvgs.uns['pearson_residuals_pca']['PCs'].shape == (
+        n_genes,
+        n_comps_pca,
+    )
+
+
+@pytest.mark.parametrize(
+    'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__
+)
+@pytest.mark.parametrize('dtype', ['float32', 'int64'])
+def test_normalize_pearson_residuals_recipe(sparsity_func, dtype):
+    adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True)
+    n_cells = adata.shape[0]
+    n_genes = adata.shape[1]
+    n_hvgs = 100
+    n_comps_pca = 50
+    adata_with_hvgs = adata.copy()
+    sc.pp.highly_variable_genes(
+        adata_with_hvgs, flavor='pearson_residuals', n_top_genes=n_hvgs
+    )
+    adata_not_using_hvgs = adata_with_hvgs.copy()
+
+    ### inplace = False ###
+    # outputs the (potentially hvg-restricted) adata_pca object
+    # recipe call with n_top_genes=n_hvgs; returns (adata_pca, hvg dataframe)
+    adata_pca, hvg = sc.pp.recipe_pearson_residuals(
+        adata.copy(), inplace=False, n_comps_pca=n_comps_pca, n_top_genes=n_hvgs
+    )
+
+    # check adata_pca keys are complete
+    assert np.all(
+        np.isin(
+            ['pearson_residuals_normalization', 'pearson_residuals_pca'],
+            list(adata_pca.uns.keys()),
+        )
+    )
+    assert np.all(np.isin(['X_pearson_residuals_pca'], list(adata_pca.obsm.keys())))
+    assert np.all(np.isin(['PCs'], list(adata_pca.varm.keys())))
+    assert adata_pca.obsm['X_pearson_residuals_pca'].shape == (n_cells, n_comps_pca)
+
+    # check adata shape
+    assert adata_pca.shape == (n_cells, n_hvgs)
+    # check PC shapes to check that HVGs were used for PCA
+    assert adata_pca.varm['PCs'].shape == (n_hvgs, n_comps_pca)
+
+    # check hvg df
+    assert np.all(
+        np.isin(
+            [
+                'means',
+                'variances',
+                'residual_variances',
+                'highly_variable_rank',
+                'highly_variable',
+            ],
+            list(hvg.columns),
+        )
+    )
+    assert np.sum(hvg['highly_variable']) == n_hvgs
+    assert hvg.shape[0] == n_genes
+
+    ### inplace = True ###
+    # modifies the input adata object
+    # same recipe call, results are written to `adata` instead of being returned
+    sc.pp.recipe_pearson_residuals(
+        adata, inplace=True, n_comps_pca=n_comps_pca, n_top_genes=n_hvgs
+    )
+
+    assert np.all(
+        np.isin(
+            ['pearson_residuals_normalization', 'pearson_residuals_pca'],
+            list(adata.uns.keys()),
+        )
+    )
+    assert np.all(np.isin(['X_pearson_residuals_pca'], list(adata.obsm.keys())))
+    assert adata.shape == (n_cells, n_genes)
+    assert adata.obsm['X_pearson_residuals_pca'].shape == (n_cells, n_comps_pca)
+
+    # check PC shapes to confirm that only the HVGs were used for PCA
+    assert adata.uns['pearson_residuals_pca']['PCs'].shape == (n_hvgs, n_comps_pca)

From e76cf7b6bbc3b38c5bacf666a49ccaa3832075e8 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 28 Jun 2021 11:58:23 +0200
Subject: [PATCH 21/96] fix precommit high_var_genes

---
 scanpy/preprocessing/_highly_variable_genes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index fe9277f3b7..3231076780 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -222,7 +222,7 @@ def _highly_variable_pearson_residuals(
     computed_on = layer if layer else 'adata.X'
 
     # Check for raw counts
-    if check_values and (check_nonnegative_integers(X) == False):
+    if check_values and (check_nonnegative_integers(X) is False):
         warnings.warn(
             "`flavor='pearson_residuals'` expects raw count data, but non-integers were found.",
             UserWarning,

From bdb7ce23bed8e7016f9f4caa928ea8e3c2b78152 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 28 Jun 2021 12:10:13 +0200
Subject: [PATCH 22/96] try to get precommit to work

---
 scanpy/preprocessing/_highly_variable_genes.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 3231076780..6c262a3087 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -586,7 +586,6 @@ def highly_variable_genes(
         where n is the number of cells in the dataset (default behavior).
         * If any scalar c, residuals are clipped to the interval [-c, c]. Set
         `clip=np.Inf` for no clipping.
-        
     chunksize
         If `flavor='pearson_residuals'`, this dertermines how many genes are processed at
         once while computing the residual variance. Choosing a smaller value will reduce 

From 6cea0404901b5cd3c729cb87d50a8c3f2bce55b0 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 28 Jun 2021 12:12:44 +0200
Subject: [PATCH 23/96] try to get precommit to work

---
 .../preprocessing/_highly_variable_genes.py   | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index ee26c7c5e6..bd32c2e653 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -546,23 +546,23 @@ def highly_variable_genes(
     layer
         If provided, use `adata.layers[layer]` for expression values instead of `adata.X`.
     n_top_genes
-        Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or 
+        Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or
         `flavor='pearson_residuals'`.
     min_mean
         If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
-        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or 
+        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or
         `flavor='pearson_residuals'`.
     max_mean
         If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
-        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or 
+        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or
         `flavor='pearson_residuals'`.
     min_disp
         If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
-        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or 
+        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or
         `flavor='pearson_residuals'`.
     max_disp
         If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
-        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or 
+        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or
         `flavor='pearson_residuals'`.
     span
         The fraction of the data (cells) used when estimating the variance in the loess
@@ -571,22 +571,22 @@ def highly_variable_genes(
         Number of bins for binning the mean gene expression. Normalization is
         done with respect to each bin. If just a single gene falls into a bin,
         the normalized dispersion is artificially set to 1. You'll be informed
-        about this if you set `settings.verbosity = 4`. Ignored if 
+        about this if you set `settings.verbosity = 4`. Ignored if
         `flavor='pearson_residuals'`.
     theta
         If `flavor='pearson_residuals'`, this is the NB overdispersion parameter theta.
-        Higher values correspond to less overdispersion (var = mean + mean^2/theta), and 
+        Higher values correspond to less overdispersion (var = mean + mean^2/theta), and
         `theta=np.Inf` corresponds to a Poisson model.
     clip
         If `flavor='pearson_residuals'`, this determines if and how residuals are clipped:
-        
+
         * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)],
         where n is the number of cells in the dataset (default behavior).
         * If any scalar c, residuals are clipped to the interval [-c, c]. Set
         `clip=np.Inf` for no clipping.
     chunksize
         If `flavor='pearson_residuals'`, this dertermines how many genes are processed at
-        once while computing the residual variance. Choosing a smaller value will reduce 
+        once while computing the residual variance. Choosing a smaller value will reduce
         the required memory.
     flavor
         Choose the flavor for identifying highly variable genes. For the dispersion
@@ -603,7 +603,7 @@ def highly_variable_genes(
         lightweight batch correction method. For all flavors, genes are first sorted
         by how many batches they are a HVG. For dispersion-based flavors ties are broken
         by normalized dispersion. If `flavor = 'seurat_v3'`, ties are broken by the median
-        (across batches) rank based on within-batch normalized variance. If 
+        (across batches) rank based on within-batch normalized variance. If
         `flavor='pearson_residuals'`, ties are broken by the median rank (across batches)
         based on within-batch residual variance.
     check_values

From d7e63f78eb79815e2536653577f097225e03eb76 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 28 Jun 2021 13:16:23 +0200
Subject: [PATCH 24/96] fix recipes

---
 scanpy/preprocessing/_recipes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py
index 6d61c24bec..2e49415874 100644
--- a/scanpy/preprocessing/_recipes.py
+++ b/scanpy/preprocessing/_recipes.py
@@ -1,5 +1,5 @@
 """Preprocessing recipes from the literature"""
-from typing import Optional, Union, Literal, Tuple
+from typing import Optional, Tuple
 
 from anndata import AnnData
 

From 0b5a02b086d59fa3b7aeb1512f6911e8aef3bdea Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 28 Jun 2021 13:21:03 +0200
Subject: [PATCH 25/96] fix normalization

---
 scanpy/preprocessing/_normalization.py | 26 ++++++++++++--------------
 scanpy/tests/test_normalization.py     |  1 -
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index 2f1bd66c4b..de105ee29f 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -7,14 +7,13 @@
 from scipy.sparse import issparse
 from sklearn.utils import sparsefuncs
 
-from .. import logging as logg
-from .._compat import Literal
+from scanpy import logging as logg
+from scanpy._compat import Literal
 
-from .._utils import view_to_actual, check_nonnegative_integers
+from scanpy.preprocessing._utils import view_to_actual, check_nonnegative_integers
 from scanpy.get import _get_obs_rep, _set_obs_rep
 
 from ._pca import pca
-from scanpy.get import _get_obs_rep, _set_obs_rep
 
 
 def _normalize_data(X, counts, after=None, copy=False):
@@ -48,7 +47,7 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False):
     if clip < 0:
         raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")
 
-    if check_values and (check_nonnegative_integers(X) == False):
+    if check_values and (check_nonnegative_integers(X) is False):
         warn(
             "`normalize_pearson_residuals()` expects raw count data, but non-integers were found.",
             UserWarning,
@@ -101,13 +100,13 @@ def normalize_pearson_residuals(
         corresponds to a Poisson model.
     clip
         Determines if and how residuals are clipped:
-        
+
         * If `None`, residuals are clipped to the interval
         [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset
         (default behavior).
         * If any scalar c, residuals are clipped to the interval [-c, c]. Set
         `clip=np.Inf` for no clipping.
-        
+
     layer
         Layer to normalize instead of `X`. If `None`, `X` is normalized.
     copy
@@ -167,7 +166,6 @@ def normalize_pearson_residuals_pca(
     check_values: bool = True,
     inplace: bool = True,
 ) -> Optional[pd.DataFrame]:
-
     """\
     Applies PCA based on Pearson residual normalization. Operates on the
     subset of highly variable genes in `adata.var['highly_variable']` by
@@ -193,13 +191,13 @@ def normalize_pearson_residuals_pca(
         Poisson model.
     clip
         This determines if and how Pearson residuals are clipped:
-        
+
         * If `None`, residuals are clipped to the interval
         [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset
         (default behavior).
         * If any scalar c, residuals are clipped to the interval [-c, c]. Set
         `clip=np.Inf` for no clipping.
-        
+
     n_comps_pca
         Number of principal components to compute.
     random_state_pca
@@ -217,14 +215,14 @@ def normalize_pearson_residuals_pca(
     If `inplace=False`, returns the Pearson residual-based PCA results
     (`adata_pca`).
     If `inplace=True`, updates `adata` with the following fields:
-    
+
     `.uns['pearson_residuals_normalization']['pearson_residuals_df']`
          The hvg-subset, normalized by Pearson residuals
     `.uns['pearson_residuals_normalization']['theta']`
          The used value of the overdisperion parameter theta
     `.uns['pearson_residuals_normalization']['clip']`
          The used value of the clipping parameter
-    
+
     `.obsm['X_pearson_residuals_pca']`
         PCA representation of data after gene selection and Pearson residual
         normalization.
@@ -234,8 +232,8 @@ def normalize_pearson_residuals_pca(
          Ratio of explained variance.
     `.uns['pearson_residuals_pca']['variance']`
          Explained variance, equivalent to the eigenvalues of the
-         covariance matrix.        
-    
+         covariance matrix.
+
     """
 
     if use_highly_variable and 'highly_variable' in adata.var_keys():
diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py
index b840702fec..faad18fe7d 100644
--- a/scanpy/tests/test_normalization.py
+++ b/scanpy/tests/test_normalization.py
@@ -287,7 +287,6 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype):
     sc.pp.highly_variable_genes(
         adata_with_hvgs, flavor='pearson_residuals', n_top_genes=n_hvgs
     )
-    adata_not_using_hvgs = adata_with_hvgs.copy()
 
     ### inplace = False ###
     # outputs the (potentially hvg-restricted) adata_pca object

From 6779d23acbc0c66ec0266d791a4c307c204d35f7 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 28 Jun 2021 13:23:28 +0200
Subject: [PATCH 26/96] remove relative imports

---
 scanpy/preprocessing/_normalization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index de105ee29f..633ef15337 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -10,10 +10,10 @@
 from scanpy import logging as logg
 from scanpy._compat import Literal
 
-from scanpy.preprocessing._utils import view_to_actual, check_nonnegative_integers
+from scanpy._utils import view_to_actual, check_nonnegative_integers
 from scanpy.get import _get_obs_rep, _set_obs_rep
 
-from ._pca import pca
+from scanpy.preprocessing._pca import pca
 
 
 def _normalize_data(X, counts, after=None, copy=False):

From 237e7cd5243dc3bde629b6fd5f64d20c3a61d3e9 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 28 Jun 2021 13:48:56 +0200
Subject: [PATCH 27/96] fix docstrings

---
 scanpy/preprocessing/_highly_variable_genes.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index bd32c2e653..9eedc11310 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -584,6 +584,7 @@ def highly_variable_genes(
         where n is the number of cells in the dataset (default behavior).
         * If any scalar c, residuals are clipped to the interval [-c, c]. Set
         `clip=np.Inf` for no clipping.
+
     chunksize
         If `flavor='pearson_residuals'`, this dertermines how many genes are processed at
         once while computing the residual variance. Choosing a smaller value will reduce

From d75aa36b3252a63abb814dd04f0c34acfbec2a99 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 28 Jun 2021 13:59:58 +0200
Subject: [PATCH 28/96] retry docs build: drop unused _set_obs_rep import

---
 scanpy/preprocessing/_highly_variable_genes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 9eedc11310..8f39a60990 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -10,7 +10,7 @@
 from .. import logging as logg
 from .._settings import settings, Verbosity
 from .._utils import sanitize_anndata, check_nonnegative_integers, view_to_actual
-from scanpy.get import _get_obs_rep, _set_obs_rep
+from scanpy.get import _get_obs_rep
 from .._compat import Literal
 from ._utils import _get_mean_var
 from ._distributed import materialize_as_ndarray

From 293b47d1144b9ff2a601ef88700810e62aa3e029 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Tue, 29 Jun 2021 10:06:53 +0200
Subject: [PATCH 29/96] fix highvar docstring

---
 scanpy/preprocessing/_highly_variable_genes.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 8f39a60990..56288627f8 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -580,10 +580,10 @@ def highly_variable_genes(
     clip
         If `flavor='pearson_residuals'`, this determines if and how residuals are clipped:
 
-        * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)],
-        where n is the number of cells in the dataset (default behavior).
-        * If any scalar c, residuals are clipped to the interval [-c, c]. Set
-        `clip=np.Inf` for no clipping.
+            * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)],
+            where n is the number of cells in the dataset (default behavior).
+            * If any scalar c, residuals are clipped to the interval [-c, c]. Set
+            `clip=np.Inf` for no clipping.
 
     chunksize
         If `flavor='pearson_residuals'`, this dertermines how many genes are processed at

From a61496b47849e1ef657f6e4319ddc22837ac3db1 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Tue, 29 Jun 2021 11:26:34 +0200
Subject: [PATCH 30/96] more fixing docstrings

---
 docs/api.rst                                  |  3 ++
 .../preprocessing/_highly_variable_genes.py   | 10 +++----
 scanpy/preprocessing/_normalization.py        | 29 ++++++++-----------
 scanpy/preprocessing/_recipes.py              | 15 +++++-----
 4 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/docs/api.rst b/docs/api.rst
index 3656bc24bf..2bc9283a75 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -39,6 +39,8 @@ For visual quality control, see :func:`~scanpy.pl.highest_expr_genes` and
    pp.log1p
    pp.pca
    pp.normalize_total
+   pp.normalize_pearson_residuals
+   pp.normalize_pearson_residuals_pca
    pp.regress_out
    pp.scale
    pp.subsample
@@ -53,6 +55,7 @@ Recipes
    pp.recipe_zheng17
    pp.recipe_weinreb17
    pp.recipe_seurat
+   pp.recipe_pearson_residuals
 
 Batch effect correction
 ~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 56288627f8..aec0fe3275 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -513,14 +513,14 @@ def highly_variable_genes(
     check_values: bool = True,
 ) -> Optional[pd.DataFrame]:
     """\
-    Annotate highly variable genes [Satija15]_ [Zheng17]_ [Stuart19]_.
+    Annotate highly variable genes [Satija15]_ [Zheng17]_ [Stuart19]_ [Lause20]_.
 
     Expects logarithmized data, except when `flavor='seurat_v3'` or
     `flavor='pearson_residuals'`, in which count data is expected.
 
     Depending on `flavor`, this reproduces the R-implementations of Seurat
     [Satija15]_, Cell Ranger [Zheng17]_, and Seurat v3 [Stuart19]_, or uses
-    analytical Peason residuals [Lause20]_.
+    analytical Pearson residuals [Lause20]_.
 
     For the dispersion-based methods ([Satija15]_ and [Zheng17]_), the normalized
     dispersion is obtained by scaling with the mean and standard deviation of
@@ -578,11 +578,11 @@ def highly_variable_genes(
         Higher values correspond to less overdispersion (var = mean + mean^2/theta), and
         `theta=np.Inf` corresponds to a Poisson model.
     clip
-        If `flavor='pearson_residuals'`, this determines if and how residuals are clipped:
+        If `flavor='pearson_residuals'`, this determines how residuals are clipped:
 
-            * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)],
+            * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \
             where n is the number of cells in the dataset (default behavior).
-            * If any scalar c, residuals are clipped to the interval [-c, c]. Set
+            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
             `clip=np.Inf` for no clipping.
 
     chunksize
diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index 633ef15337..d5ea245663 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -82,12 +82,11 @@ def normalize_pearson_residuals(
     inplace: bool = True,
 ) -> Optional[Dict[str, np.ndarray]]:
     """\
-    Computes analytic Pearson residuals, assuming a negative binomial offset
-    model with overdispersion theta shared across genes. By default, residuals
-    are clipped to sqrt(n) and overdispersion theta=100 is used.
+    Computes analytic Pearson residuals, based on [Lause20]_.
 
-    Based on "Analytic Pearson residuals for normalization of single-cell
-    RNA-seq UMI data", bioRxiv, [Lause20]_.
+    Assuming a negative binomial offset model with overdispersion
+    theta shared across genes, computes Pearson residuals. By default, residuals
+    are clipped to sqrt(n) and overdispersion theta=100 is used.
 
     Params
     ------
@@ -167,12 +166,9 @@ def normalize_pearson_residuals_pca(
     inplace: bool = True,
 ) -> Optional[pd.DataFrame]:
     """\
-    Applies PCA based on Pearson residual normalization. Operates on the
-    subset of highly variable genes in `adata.var['highly_variable']` by
-    default.
+    Applies Pearson residual normalization and PCA, based on [Lause20]_.
 
-    This workflow is based on "Analytic Pearson residuals for normalization of
-    single-cell RNA-seq UMI data", bioRxiv, [Lause20]_.
+    Operates on the subset of highly variable genes in `adata.var['highly_variable']` by default.
 
 
     Parameters
@@ -190,20 +186,19 @@ def normalize_pearson_residuals_pca(
         (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a
         Poisson model.
     clip
-        This determines if and how Pearson residuals are clipped:
+        This determines how Pearson residuals are clipped:
 
-        * If `None`, residuals are clipped to the interval
-        [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset
-        (default behavior).
-        * If any scalar c, residuals are clipped to the interval [-c, c]. Set
-        `clip=np.Inf` for no clipping.
+            * If `None`, residuals are clipped to the interval \
+            [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
+            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
+            `clip=np.Inf` for no clipping.
 
     n_comps_pca
         Number of principal components to compute.
     random_state_pca
         Change to use different initial states for the optimization.
     kwargs_pca
-        Dictionary of further keyword arguments passed on to `sc.pp.pca()`.
+        Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`.
     check_values
         Check if counts in selected layer are integers. A Warning is returned if set to True.
     inplace
diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py
index 2e49415874..fd28bea576 100644
--- a/scanpy/preprocessing/_recipes.py
+++ b/scanpy/preprocessing/_recipes.py
@@ -186,12 +186,11 @@ def recipe_pearson_residuals(
     inplace: bool = True,
 ) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]:
     """\
+    Gene selection and normalization based on [Lause20]_.
+
     Applies gene selection based on Pearson residuals. On the resulting subset,
     Pearson residual normalization and PCA are performed.
 
-    This recipe is based on "Analytic Pearson residuals for normalization of
-    single-cell RNA-seq UMI data", bioRxiv, [Lause20]_.
-
 
     Parameters
     ----------
@@ -213,11 +212,11 @@ def recipe_pearson_residuals(
     clip
         This determines if and how Pearson residuals are clipped:
 
-        * If `None`, residuals are clipped to the interval
-        [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset
-        (default behavior).
-        * If any scalar c, residuals are clipped to the interval [-c, c]. Set
-        `clip=np.Inf` for no clipping.
+            * If `None`, residuals are clipped to the interval \
+            [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
+            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
+            `clip=np.Inf` for no clipping.
+
     batch_key
         If specified, highly-variable genes are selected within each batch
         separately and merged. This simple process avoids the selection of

From 7afb94f4ae05a8b35f415de6055ec07b0a9d7a6d Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Tue, 29 Jun 2021 11:40:24 +0200
Subject: [PATCH 31/96] reindent clip bullets in normalize_pearson_residuals docstring (docs build locally ? :hammer:)

---
 scanpy/preprocessing/_normalization.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index d5ea245663..11dc9b8860 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -100,11 +100,10 @@ def normalize_pearson_residuals(
     clip
         Determines if and how residuals are clipped:
 
-        * If `None`, residuals are clipped to the interval
-        [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset
-        (default behavior).
-        * If any scalar c, residuals are clipped to the interval [-c, c]. Set
-        `clip=np.Inf` for no clipping.
+            * If `None`, residuals are clipped to the interval \
+            [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
+            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
+            `clip=np.Inf` for no clipping.
 
     layer
         Layer to normalize instead of `X`. If `None`, `X` is normalized.

From e3e50457c451cab673f9328387e54add9648a9ae Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 5 Jul 2021 16:03:02 +0200
Subject: [PATCH 32/96] minor cleanup test normalization

---
 .../preprocessing/_highly_variable_genes.py   |  2 +-
 scanpy/preprocessing/_normalization.py        |  4 ++--
 scanpy/tests/test_normalization.py            | 22 ++++++-------------
 3 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index aec0fe3275..4bd00a98e9 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -1,5 +1,5 @@
 import warnings
-from typing import Optional, Union
+from typing import Optional
 
 import numpy as np
 import pandas as pd
diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index 11dc9b8860..adc8820b2e 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -47,7 +47,7 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False):
     if clip < 0:
         raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")
 
-    if check_values and (check_nonnegative_integers(X) is False):
+    if check_values and not check_nonnegative_integers(X):
         warn(
             "`normalize_pearson_residuals()` expects raw count data, but non-integers were found.",
             UserWarning,
@@ -133,7 +133,7 @@ def normalize_pearson_residuals(
     X = _get_obs_rep(adata, layer=layer)
     computed_on = layer if layer else 'adata.X'
 
-    msg = 'computing analytic Pearson residuals on %s' % computed_on
+    msg = f'computing analytic Pearson residuals on {computed_on}'
     start = logg.info(msg)
 
     residuals = _pearson_residuals(X, theta, clip, check_values, copy=~inplace)
diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py
index faad18fe7d..7056443ce9 100644
--- a/scanpy/tests/test_normalization.py
+++ b/scanpy/tests/test_normalization.py
@@ -78,27 +78,19 @@ def test_normalize_pearson_residuals_inputchecks(sparsity_func, dtype):
         adata_noninteger = adata.copy()
         x, y = np.nonzero(adata_noninteger.X)
         adata_noninteger.X[x[0], y[0]] = 0.5
-        nonint_warn_msg = "`normalize_pearson_residuals()` expects raw count data, but non-integers were found."
 
-        # expecting 0 no-int warnings
-        with warnings.catch_warnings(record=True) as record:
+        with pytest.warns(UserWarning) as record:
             sc.pp.normalize_pearson_residuals(
-                adata_noninteger.copy(), check_values=False
+                adata_noninteger.copy(), check_values=True
             )
-        nonint_warnings = [
-            warning.message.args[0] == nonint_warn_msg for warning in record
-        ]
-        assert np.sum(nonint_warnings) == 0
+        assert len(record) == 1
+        assert "expects raw count data" in record[0].message.args[0]
 
-        # expecting 1 no-int warning
-        with warnings.catch_warnings(record=True) as record:
+        with pytest.warns(None) as record:
             sc.pp.normalize_pearson_residuals(
-                adata_noninteger.copy(), check_values=True
+                adata_noninteger.copy(), check_values=False
             )
-        nonint_warnings = np.array(
-            [warning.message.args[0] == nonint_warn_msg for warning in record]
-        )
-        assert np.sum(nonint_warnings) == 1
+        assert len(record) == 0
 
     # errors should be raised for invalid theta values
     with pytest.raises(ValueError) as record:

From e368b57eb1eadaa018ee1f3e94996a22bea5eb12 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 5 Jul 2021 16:21:57 +0200
Subject: [PATCH 33/96] more minor cleanups

---
 scanpy/tests/test_normalization.py | 32 ++++++++++++------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py
index 7056443ce9..c84fd40a60 100644
--- a/scanpy/tests/test_normalization.py
+++ b/scanpy/tests/test_normalization.py
@@ -109,8 +109,7 @@ def test_normalize_pearson_residuals_inputchecks(sparsity_func, dtype):
 @pytest.mark.parametrize('dtype', ['float32', 'int64'])
 @pytest.mark.parametrize('theta', [0.01, 1, 100, np.Inf])
 @pytest.mark.parametrize('clip', [None, 1, np.Inf])
-@pytest.mark.parametrize('inplace', [True, False])
-def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip, inplace):
+def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip):
 
     # toy data
     X = np.array([[3, 6], [2, 4], [1, 0]])
@@ -129,26 +128,21 @@ def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip, i
     # compute output to test
     adata = AnnData(sparsity_func(X), dtype=dtype)
     output = sc.pp.normalize_pearson_residuals(
-        adata, theta=theta, clip=clip, inplace=inplace
+        adata, theta=theta, clip=clip, inplace=False
     )
+    output_X = output['X']
+    sc.pp.normalize_pearson_residuals(adata, theta=theta, clip=clip, inplace=True)
 
-    # handle and test inplace argument
-    if inplace:
-        output_X = adata.X
-        assert output is None
-        # check for correct new `adata.uns` keys
-        assert np.all(
-            np.isin(['pearson_residuals_normalization'], list(adata.uns.keys()))
-        )
-        assert np.all(
-            np.isin(
-                ['theta', 'clip', 'computed_on'],
-                list(adata.uns['pearson_residuals_normalization'].keys()),
-            )
+    # check for correct new `adata.uns` keys
+    assert np.all(np.isin(['pearson_residuals_normalization'], list(adata.uns.keys())))
+    assert np.all(
+        np.isin(
+            ['theta', 'clip', 'computed_on'],
+            list(adata.uns['pearson_residuals_normalization'].keys()),
         )
-
-    else:
-        output_X = output['X']
+    )
+    # test against inplace
+    np.testing.assert_array_equal(adata.X, output_X)
 
     if clip is None:
         # default clipping: compare to sqrt(n) threshold

From bfbd4840e7e008967e1fd8e0ec8de3297f7beaaf Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 5 Jul 2021 16:43:36 +0200
Subject: [PATCH 34/96] final cleanup normalization

---
 scanpy/tests/test_normalization.py | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py
index c84fd40a60..eb9bbc8533 100644
--- a/scanpy/tests/test_normalization.py
+++ b/scanpy/tests/test_normalization.py
@@ -162,13 +162,13 @@ def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip):
     'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__
 )
 @pytest.mark.parametrize('dtype', ['float32', 'int64'])
-def test_normalize_pearson_residuals_pca(sparsity_func, dtype):
+@pytest.mark.parametrize('n_hvgs', [100, 200])
+@pytest.mark.parametrize('n_comps_pca', [30, 50])
+def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_pca):
 
     adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True)
-    n_cells = adata.shape[0]
-    n_genes = adata.shape[1]
-    n_hvgs = 100
-    n_comps_pca = 50
+    n_cells, n_genes = adata.shape
+
     adata_with_hvgs = adata.copy()
     sc.pp.highly_variable_genes(
         adata_with_hvgs, flavor='pearson_residuals', n_top_genes=n_hvgs
@@ -258,17 +258,27 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype):
         n_comps_pca,
     )
 
+    # test for inplace/outplace
+    for ad_inplace, ad_outplace in zip(
+        [adata_pca, adata_pca_with_hvgs, adata_pca_not_using_hvgs],
+        [adata, adata_with_hvgs, adata_not_using_hvgs],
+    ):
+        np.testing.assert_array_equal(
+            ad_inplace.obsm['X_pearson_residuals_pca'],
+            ad_outplace.obsm['X_pearson_residuals_pca'],
+        )
+
 
 @pytest.mark.parametrize(
     'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__
 )
 @pytest.mark.parametrize('dtype', ['float32', 'int64'])
-def test_normalize_pearson_residuals_recipe(sparsity_func, dtype):
+@pytest.mark.parametrize('n_hvgs', [100, 200])
+@pytest.mark.parametrize('n_comps_pca', [30, 50])
+def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comps_pca):
     adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True)
-    n_cells = adata.shape[0]
-    n_genes = adata.shape[1]
-    n_hvgs = 100
-    n_comps_pca = 50
+    n_cells, n_genes = adata.shape
+
     adata_with_hvgs = adata.copy()
     sc.pp.highly_variable_genes(
         adata_with_hvgs, flavor='pearson_residuals', n_top_genes=n_hvgs

From a55e677d445323284ebae9ca1923d16516a95fa2 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 5 Jul 2021 18:19:03 +0200
Subject: [PATCH 35/96] fix highly_variable flag assignment; compare inplace vs. non-inplace output in hvg tests

---
 .../preprocessing/_highly_variable_genes.py   |  23 ++--
 scanpy/tests/test_highly_variable_genes.py    | 107 +++++++++---------
 2 files changed, 67 insertions(+), 63 deletions(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 4bd00a98e9..513270c318 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -335,11 +335,11 @@ def _highly_variable_pearson_residuals(
         na_position='last',
         inplace=True,
     )
-    df['highly_variable'] = False
-    df.highly_variable.iloc[:n_top_genes] = True
-    # TODO: following line raises a pandas warning
-    # (also for flavor = seurat and cellranger..)
-    df = df.loc[adata.var_names]
+
+    high_var = np.zeros(df.shape[0])
+    high_var[:n_top_genes] = True
+    df['highly_variable'] = high_var.astype(bool)
+    df = df.loc[adata.var_names, :]
 
     if inplace:
         adata.uns['hvg'] = {'flavor': 'pearson_residuals', 'computed_on': computed_on}
@@ -353,11 +353,11 @@ def _highly_variable_pearson_residuals(
             '    \'variances\', float vector (adata.var)\n'
             '    \'residual_variances\', float vector (adata.var)'
         )
-        adata.var['highly_variable'] = df['highly_variable'].values
-        adata.var['highly_variable_rank'] = df['highly_variable_rank'].values
         adata.var['means'] = df['means'].values
         adata.var['variances'] = df['variances'].values
         adata.var['residual_variances'] = df['residual_variances']
+        adata.var['highly_variable_rank'] = df['highly_variable_rank'].values
+        adata.var['highly_variable'] = df['highly_variable'].values
 
         if batch_key is not None:
             adata.var['highly_variable_nbatches'] = df[
@@ -770,11 +770,12 @@ def highly_variable_genes(
                 na_position='last',
                 inplace=True,
             )
-            df['highly_variable'] = False
-            df.highly_variable.iloc[:n_top_genes] = True
-            df = df.loc[adata.var_names]
+            high_var = np.zeros(df.shape[0])
+            high_var[:n_top_genes] = True
+            df['highly_variable'] = high_var.astype(bool)
+            df = df.loc[adata.var_names, :]
         else:
-            df = df.loc[adata.var_names]
+            df = df.loc[adata.var_names, :]
             dispersion_norm = df.dispersions_norm.values
             dispersion_norm[np.isnan(dispersion_norm)] = 0  # similar to Seurat
             gene_subset = np.logical_and.reduce(
diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py
index 359d648e96..aa5517eed5 100644
--- a/scanpy/tests/test_highly_variable_genes.py
+++ b/scanpy/tests/test_highly_variable_genes.py
@@ -77,33 +77,27 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp
         adata_noninteger = adata.copy()
         x, y = np.nonzero(adata_noninteger.X)
         adata_noninteger.X[x[0], y[0]] = 0.5
-        nonint_warn_msg = "`flavor='pearson_residuals'` expects raw count data, but non-integers were found."
 
         # expecting 0 no-int warnings
-        with warnings.catch_warnings(record=True) as record:
+        with pytest.warns(None) as record:
             sc.pp.highly_variable_genes(
                 adata_noninteger.copy(),
                 flavor='pearson_residuals',
                 n_top_genes=100,
                 check_values=False,
             )
-        nonint_warnings = [
-            warning.message.args[0] == nonint_warn_msg for warning in record
-        ]
-        assert np.sum(nonint_warnings) == 0
+        assert len(record) == 0
 
         # expecting 1 no-int warning
-        with warnings.catch_warnings(record=True) as record:
+        with pytest.warns(None) as record:
             sc.pp.highly_variable_genes(
                 adata_noninteger.copy(),
                 flavor='pearson_residuals',
                 n_top_genes=100,
                 check_values=True,
             )
-        nonint_warnings = np.array(
-            [warning.message.args[0] == nonint_warn_msg for warning in record]
-        )
-        assert np.sum(nonint_warnings) == 1
+        assert len(record) == 1
+        assert "expects raw count data" in record[0].message.args[0]
 
     # errors should be raised for invalid theta values
     with pytest.raises(ValueError) as record:
@@ -127,15 +121,15 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp
 )
 @pytest.mark.parametrize('dtype', ['float32', 'int64'])
 @pytest.mark.parametrize('subset', [True, False])
-@pytest.mark.parametrize('inplace', [True, False])
 @pytest.mark.parametrize('clip', [None, np.Inf, 30])
 @pytest.mark.parametrize('theta', [100, np.Inf])
+@pytest.mark.parametrize('n_top_genes', [100, 200])
 def test_highly_variable_genes_pearson_residuals_values(
-    subset, inplace, sparsity_func, dtype, clip, theta
+    subset, sparsity_func, dtype, clip, theta, n_top_genes
 ):
-
-    n_top_genes = 100
     adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True)
+    # cleanup var
+    adata.var.drop(columns=adata.var.columns, inplace=True)
     # compute reference output
     residual_variances_reference = _residual_var_reference(
         adata.copy(), clip=clip, theta=theta
@@ -145,23 +139,29 @@ def test_highly_variable_genes_pearson_residuals_values(
         top_n_idx = np.argsort(-residual_variances_reference)[:n_top_genes]
         # (results in sorted "gene order" in reference)
         residual_variances_reference = residual_variances_reference[top_n_idx]
+
     # compute output to be tested
-    output = sc.pp.highly_variable_genes(
+    output_df = sc.pp.highly_variable_genes(
         adata,
         flavor='pearson_residuals',
         n_top_genes=n_top_genes,
         subset=subset,
-        inplace=inplace,
+        inplace=False,
         clip=clip,
         theta=theta,
     )
 
-    # depending on inplace, check adata.var or output
-    if inplace:
-        assert output is None
-        output_df = adata.var
-    else:
-        output_df = output
+    sc.pp.highly_variable_genes(
+        adata,
+        flavor='pearson_residuals',
+        n_top_genes=n_top_genes,
+        subset=subset,
+        inplace=True,
+        clip=clip,
+        theta=theta,
+    )
+
+    pd.testing.assert_frame_equal(output_df, adata.var)
 
     # consistency with normalization method
     if subset:
@@ -182,39 +182,39 @@ def test_highly_variable_genes_pearson_residuals_values(
 )
 @pytest.mark.parametrize('dtype', ['float32', 'int64'])
 @pytest.mark.parametrize('subset', [True, False])
-@pytest.mark.parametrize('inplace', [True, False])
+@pytest.mark.parametrize('n_top_genes', [1000, 500])
 def test_highly_variable_genes_pearson_residuals_general(
-    subset,
-    inplace,
-    sparsity_func,
-    dtype,
+    subset, sparsity_func, dtype, n_top_genes
 ):
 
-    n_top_genes = 1000
-
     adata = _prepare_pbmc_testdata(sparsity_func, dtype)
+    # cleanup var
+    adata.var.drop(columns=adata.var.columns, inplace=True)
     # compute reference output
     residual_variances_reference = _residual_var_reference(adata.copy())
     if subset:
-        # lazyly sort by residual variance and take top N
+        # lazily sort by residual variance and take top N
         top_n_idx = np.argsort(-residual_variances_reference)[:n_top_genes]
         # (results in sorted "gene order" in reference)
         residual_variances_reference = residual_variances_reference[top_n_idx]
     # compute output to be tested
-    output = sc.pp.highly_variable_genes(
+    output_df = sc.pp.highly_variable_genes(
         adata,
         flavor='pearson_residuals',
         n_top_genes=n_top_genes,
         subset=subset,
-        inplace=inplace,
+        inplace=False,
     )
 
-    # depending on inplace, check adata.var or output
-    if inplace:
-        assert output is None
-        output_df = adata.var
-    else:
-        output_df = output
+    sc.pp.highly_variable_genes(
+        adata,
+        flavor='pearson_residuals',
+        n_top_genes=n_top_genes,
+        subset=subset,
+        inplace=True,
+    )
+
+    pd.testing.assert_frame_equal(output_df, adata.var)
 
     # check output is complete
     for key in [
@@ -260,31 +260,34 @@ def test_highly_variable_genes_pearson_residuals_general(
 )
 @pytest.mark.parametrize('dtype', ['float32', 'int64'])
 @pytest.mark.parametrize('subset', [True, False])
-@pytest.mark.parametrize('inplace', [True, False])
+@pytest.mark.parametrize('n_top_genes', [1000, 500])
 def test_highly_variable_genes_pearson_residuals_batch(
-    subset, inplace, sparsity_func, dtype
+    subset, n_top_genes, sparsity_func, dtype
 ):
-
-    n_top_genes = 1000
-
     adata = _prepare_pbmc_testdata(sparsity_func, dtype)
+    # cleanup var
+    adata.var.drop(columns=adata.var.columns, inplace=True)
     n_genes = adata.shape[1]
 
-    output = sc.pp.highly_variable_genes(
+    output_df = sc.pp.highly_variable_genes(
         adata,
         flavor='pearson_residuals',
         n_top_genes=n_top_genes,
         batch_key='batch',
         subset=subset,
-        inplace=inplace,
+        inplace=False,
     )
 
-    # depending on inplace, check adata.var or output
-    if inplace:
-        assert output is None
-        output_df = adata.var
-    else:
-        output_df = output
+    sc.pp.highly_variable_genes(
+        adata,
+        flavor='pearson_residuals',
+        n_top_genes=n_top_genes,
+        batch_key='batch',
+        subset=subset,
+        inplace=True,
+    )
+
+    # pd.testing.assert_frame_equal(output_df, adata.var)
 
     # check output is complete
     for key in [

From 4f47c1118550f93df4123a319d43a105626da9d9 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 5 Jul 2021 18:19:45 +0200
Subject: [PATCH 36/96] init experimental module

---
 scanpy/experimental/__init__.py                  | 0
 scanpy/experimental/pp/__init__.py               | 0
 scanpy/experimental/pp/_highly_variable_genes.py | 0
 scanpy/experimental/pp/_normalization.py         | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 scanpy/experimental/__init__.py
 create mode 100644 scanpy/experimental/pp/__init__.py
 create mode 100644 scanpy/experimental/pp/_highly_variable_genes.py
 create mode 100644 scanpy/experimental/pp/_normalization.py

diff --git a/scanpy/experimental/__init__.py b/scanpy/experimental/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scanpy/experimental/pp/__init__.py b/scanpy/experimental/pp/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py
new file mode 100644
index 0000000000..e69de29bb2

From c32eafc8eb00a8f891ce4711639cc7cc0feabbb5 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Wed, 14 Jul 2021 15:23:03 +0200
Subject: [PATCH 37/96] fix column ordering for batch case

---
 scanpy/preprocessing/_highly_variable_genes.py | 5 +++--
 scanpy/tests/test_highly_variable_genes.py     | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 513270c318..1575f5b083 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -357,8 +357,6 @@ def _highly_variable_pearson_residuals(
         adata.var['variances'] = df['variances'].values
         adata.var['residual_variances'] = df['residual_variances']
         adata.var['highly_variable_rank'] = df['highly_variable_rank'].values
-        adata.var['highly_variable'] = df['highly_variable'].values
-
         if batch_key is not None:
             adata.var['highly_variable_nbatches'] = df[
                 'highly_variable_nbatches'
@@ -366,8 +364,11 @@ def _highly_variable_pearson_residuals(
             adata.var['highly_variable_intersection'] = df[
                 'highly_variable_intersection'
             ].values
+        adata.var['highly_variable'] = df['highly_variable'].values
+
         if subset:
             adata._inplace_subset_var(df['highly_variable'].values)
+
     else:
         if batch_key is None:
             df = df.drop(
diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py
index aa5517eed5..c80e0a442a 100644
--- a/scanpy/tests/test_highly_variable_genes.py
+++ b/scanpy/tests/test_highly_variable_genes.py
@@ -287,7 +287,7 @@ def test_highly_variable_genes_pearson_residuals_batch(
         inplace=True,
     )
 
-    # pd.testing.assert_frame_equal(output_df, adata.var)
+    pd.testing.assert_frame_equal(output_df, adata.var)
 
     # check output is complete
     for key in [

From f6d42865f4ff7f405b3ecd4957d9d85478a487bf Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Wed, 14 Jul 2021 16:46:54 +0200
Subject: [PATCH 38/96] moving to experimental, minor fix for experimental
 version of hvg selection

---
 scanpy/__init__.py                            |   2 +-
 scanpy/experimental/__init__.py               |   1 +
 scanpy/experimental/pp/__init__.py            |   8 +
 .../experimental/pp/_highly_variable_genes.py | 406 ++++++++++++++++++
 scanpy/experimental/pp/_normalization.py      | 242 +++++++++++
 scanpy/preprocessing/__init__.py              |  14 +-
 .../preprocessing/_highly_variable_genes.py   | 291 +------------
 scanpy/preprocessing/_normalization.py        | 228 ----------
 scanpy/preprocessing/_recipes.py              | 155 -------
 9 files changed, 676 insertions(+), 671 deletions(-)

diff --git a/scanpy/__init__.py b/scanpy/__init__.py
index 31b7c7dba9..12e48314a1 100644
--- a/scanpy/__init__.py
+++ b/scanpy/__init__.py
@@ -14,7 +14,7 @@
     from . import tools as tl
     from . import preprocessing as pp
     from . import plotting as pl
-    from . import datasets, logging, queries, external, get, metrics
+    from . import datasets, logging, queries, external, get, metrics, experimental
 
     from anndata import AnnData, concat
     from anndata import (
diff --git a/scanpy/experimental/__init__.py b/scanpy/experimental/__init__.py
index e69de29bb2..8a00c90df0 100644
--- a/scanpy/experimental/__init__.py
+++ b/scanpy/experimental/__init__.py
@@ -0,0 +1 @@
+from . import pp
diff --git a/scanpy/experimental/pp/__init__.py b/scanpy/experimental/pp/__init__.py
index e69de29bb2..7ecf999363 100644
--- a/scanpy/experimental/pp/__init__.py
+++ b/scanpy/experimental/pp/__init__.py
@@ -0,0 +1,8 @@
+from ._normalization import (
+    normalize_pearson_residuals,
+    normalize_pearson_residuals_pca,
+)
+
+from ._highly_variable_genes import highly_variable_genes
+
+from ._recipes import recipe_pearson_residuals
diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py
index e69de29bb2..af5f81ed74 100644
--- a/scanpy/experimental/pp/_highly_variable_genes.py
+++ b/scanpy/experimental/pp/_highly_variable_genes.py
@@ -0,0 +1,406 @@
+import warnings
+from typing import Optional
+
+import numpy as np
+import pandas as pd
+import scipy.sparse as sp_sparse
+from anndata import AnnData
+
+
+from scanpy import logging as logg
+from scanpy._settings import settings, Verbosity
+from scanpy._utils import check_nonnegative_integers, view_to_actual
+from scanpy.get import _get_obs_rep
+from scanpy._compat import Literal
+from scanpy.preprocessing._utils import _get_mean_var
+from scanpy.preprocessing._distributed import materialize_as_ndarray
+from scanpy.preprocessing._simple import filter_genes
+
+
+def _highly_variable_pearson_residuals(
+    adata: AnnData,
+    layer: Optional[str] = None,
+    n_top_genes: int = 1000,
+    batch_key: Optional[str] = None,
+    theta: float = 100,
+    clip: Optional[float] = None,
+    chunksize: int = 100,
+    check_values: bool = True,
+    subset: bool = False,
+    inplace: bool = True,
+) -> Optional[pd.DataFrame]:
+    """\
+    See `highly_variable_genes`.
+
+    Returns
+    -------
+    Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`)
+    or updates `.var` with the following fields:
+
+    highly_variable : bool
+        boolean indicator of highly-variable genes.
+    means : float
+        means per gene.
+    variances : float
+        variances per gene.
+    residual_variances : float
+        Pearson residual variance per gene. Averaged in the case of multiple
+        batches.
+    highly_variable_rank : float
+        Rank of the gene according to residual variance, median rank in the
+        case of multiple batches. NaN for non-HVGs.
+    highly_variable_nbatches : int
+        If batch_key is given, this denotes in how many batches genes are
+        detected as HVG.
+    highly_variable_intersection : bool
+        If batch_key is given, this denotes the genes that are highly variable
+        in all batches.
+    """
+
+    view_to_actual(adata)
+    X = _get_obs_rep(adata, layer=layer)
+    computed_on = layer if layer else 'adata.X'
+
+    # Check for raw counts
+    if check_values and (check_nonnegative_integers(X) is False):
+        warnings.warn(
+            "`flavor='pearson_residuals'` expects raw count data, but non-integers were found.",
+            UserWarning,
+        )
+    # check theta
+    if theta <= 0:
+        # TODO: would "underdispersion" with negative theta make sense?
+        # then only theta=0 were undefined..
+        raise ValueError('Pearson residuals require theta > 0')
+    # (the clipping bound is resolved separately for each batch below)
+
+    if batch_key is None:
+        batch_info = np.zeros(adata.shape[0], dtype=int)
+    else:
+        batch_info = adata.obs[batch_key].values
+    n_batches = len(np.unique(batch_info))
+
+    # Get pearson residuals for each batch separately
+    residual_gene_vars = []
+    for batch in np.unique(batch_info):
+
+        adata_subset = adata[batch_info == batch]
+
+        # Filter out zero genes
+        with settings.verbosity.override(Verbosity.error):
+            nonzero_genes = filter_genes(adata_subset, min_cells=1, inplace=False)[0]
+        adata_subset = adata_subset[:, nonzero_genes]
+
+        if layer is not None:
+            X_batch = adata_subset.layers[layer]
+        else:
+            X_batch = adata_subset.X
+
+        # Prepare clipping (local per batch, so the sqrt(n) default cannot leak)
+        n = X_batch.shape[0]
+        clip_batch = np.sqrt(n) if clip is None else clip
+        if clip_batch < 0:
+            raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")
+
+        if sp_sparse.issparse(X_batch):
+            sums_genes = np.sum(X_batch, axis=0)
+            sums_cells = np.sum(X_batch, axis=1)
+            sum_total = np.sum(sums_genes).squeeze()
+        else:
+            sums_genes = np.sum(X_batch, axis=0, keepdims=True)
+            sums_cells = np.sum(X_batch, axis=1, keepdims=True)
+            sum_total = np.sum(sums_genes)
+
+        # Compute pearson residuals in chunks
+        residual_gene_var = np.empty((X_batch.shape[1]))
+        for start in np.arange(0, X_batch.shape[1], chunksize):
+            stop = start + chunksize
+            mu = np.array(sums_cells @ sums_genes[:, start:stop] / sum_total)
+            X_dense = X_batch[:, start:stop]
+            X_dense = X_dense.toarray() if sp_sparse.issparse(X_dense) else X_dense
+            residuals = (X_dense - mu) / np.sqrt(mu + mu ** 2 / theta)
+            residuals = np.clip(residuals, a_min=-clip_batch, a_max=clip_batch)
+            residual_gene_var[start:stop] = np.var(residuals, axis=0)
+
+        # Add 0 values for genes that were filtered out
+        zero_gene_var = np.zeros(np.sum(~nonzero_genes))
+        residual_gene_var = np.concatenate((residual_gene_var, zero_gene_var))
+        # Order as before filtering
+        idxs = np.concatenate((np.where(nonzero_genes)[0], np.where(~nonzero_genes)[0]))
+        residual_gene_var = residual_gene_var[np.argsort(idxs)]
+        residual_gene_vars.append(residual_gene_var.reshape(1, -1))
+
+    residual_gene_vars = np.concatenate(residual_gene_vars, axis=0)
+
+    # Get cutoffs and define hvgs per batch
+    residual_gene_vars_sorted = np.sort(residual_gene_vars, axis=1)
+    cutoffs_per_batch = residual_gene_vars_sorted[:, -n_top_genes]
+    highly_variable_per_batch = np.greater_equal(
+        residual_gene_vars.T, cutoffs_per_batch
+    ).T
+
+    # Merge hvgs across batches
+    highly_variable_nbatches = np.sum(highly_variable_per_batch, axis=0)
+    highly_variable_intersection = highly_variable_nbatches == n_batches
+
+    # Get rank per gene within each batch
+    # argsort twice gives ranks, small rank means most variable
+    ranks_residual_var = np.argsort(np.argsort(-residual_gene_vars, axis=1), axis=1)
+    ranks_residual_var = ranks_residual_var.astype(np.float32)
+    ranks_residual_var[ranks_residual_var >= n_top_genes] = np.nan
+    ranks_masked_array = np.ma.masked_invalid(ranks_residual_var)
+    # Median rank across batches,
+    # ignoring batches in which gene was not selected
+    medianrank_residual_var = np.ma.median(ranks_masked_array, axis=0).filled(np.nan)
+
+    means, variances = materialize_as_ndarray(_get_mean_var(X))
+    df = pd.DataFrame.from_dict(
+        dict(
+            means=means,
+            variances=variances,
+            residual_variances=np.mean(residual_gene_vars, axis=0).astype(
+                np.float32, copy=False
+            ),
+            highly_variable_rank=medianrank_residual_var,
+            highly_variable_nbatches=highly_variable_nbatches.astype(np.int64),
+            highly_variable_intersection=highly_variable_intersection,
+        )
+    )
+    df = df.set_index(adata.var_names)
+
+    # Sort genes by how often they are selected as hvg across batches and
+    # break ties with median rank of residual variance across batches
+    df.sort_values(
+        ['highly_variable_nbatches', 'highly_variable_rank'],
+        ascending=[False, True],
+        na_position='last',
+        inplace=True,
+    )
+
+    high_var = np.zeros(df.shape[0])
+    high_var[:n_top_genes] = True
+    df['highly_variable'] = high_var.astype(bool)
+    df = df.loc[adata.var_names, :]
+
+    if inplace:
+        adata.uns['hvg'] = {'flavor': 'pearson_residuals', 'computed_on': computed_on}
+        logg.hint(
+            'added\n'
+            '    \'highly_variable\', boolean vector (adata.var)\n'
+            '    \'highly_variable_rank\', float vector (adata.var)\n'
+            '    \'highly_variable_nbatches\', int vector (adata.var)\n'
+            '    \'highly_variable_intersection\', boolean vector (adata.var)\n'
+            '    \'means\', float vector (adata.var)\n'
+            '    \'variances\', float vector (adata.var)\n'
+            '    \'residual_variances\', float vector (adata.var)'
+        )
+        adata.var['means'] = df['means'].values
+        adata.var['variances'] = df['variances'].values
+        adata.var['residual_variances'] = df['residual_variances']
+        adata.var['highly_variable_rank'] = df['highly_variable_rank'].values
+        if batch_key is not None:
+            adata.var['highly_variable_nbatches'] = df[
+                'highly_variable_nbatches'
+            ].values
+            adata.var['highly_variable_intersection'] = df[
+                'highly_variable_intersection'
+            ].values
+        adata.var['highly_variable'] = df['highly_variable'].values
+
+        if subset:
+            adata._inplace_subset_var(df['highly_variable'].values)
+
+    else:
+        if batch_key is None:
+            df = df.drop(
+                ['highly_variable_nbatches', 'highly_variable_intersection'], axis=1
+            )
+        if subset:
+            df = df.iloc[df.highly_variable.values, :]
+
+        return df
+
+
+def highly_variable_genes(
+    adata: AnnData,
+    layer: Optional[str] = None,
+    n_top_genes: Optional[int] = None,
+    min_disp: Optional[float] = 0.5,
+    max_disp: Optional[float] = np.inf,
+    min_mean: Optional[float] = 0.0125,
+    max_mean: Optional[float] = 3,
+    span: Optional[float] = 0.3,
+    n_bins: int = 20,
+    theta: float = 100,
+    clip: Optional[float] = None,
+    chunksize: int = 1000,
+    flavor: Literal[
+        'seurat', 'cell_ranger', 'seurat_v3', 'pearson_residuals'
+    ] = 'seurat',
+    subset: bool = False,
+    inplace: bool = True,
+    batch_key: Optional[str] = None,
+    check_values: bool = True,
+) -> Optional[pd.DataFrame]:
+    """\
+    Annotate highly variable genes [Satija15]_ [Zheng17]_ [Stuart19]_ [Lause20]_.
+
+    Expects logarithmized data, except when `flavor='seurat_v3'` or
+    `flavor='pearson_residuals'`, in which count data is expected.
+
+    Depending on `flavor`, this reproduces the R-implementations of Seurat
+    [Satija15]_, Cell Ranger [Zheng17]_, and Seurat v3 [Stuart19]_, or uses
+    analytical Pearson residuals [Lause20]_.
+
+    For the dispersion-based methods ([Satija15]_ and [Zheng17]_), the normalized
+    dispersion is obtained by scaling with the mean and standard deviation of
+    the dispersions for genes falling into a given bin for mean expression of
+    genes. This means that for each bin of mean expression, highly variable
+    genes are selected.
+
+    For [Stuart19]_, a normalized variance for each gene is computed. First, the data
+    are standardized (i.e., z-score normalization per feature) with a regularized
+    standard deviation. Next, the normalized variance is computed as the variance
+    of each gene after the transformation. Genes are ranked by the normalized variance.
+
+    For [Lause20]_, Pearson residuals of a negative binomial offset model (with
+    overdispersion theta shared across genes) are computed. By default, overdispersion
+    theta=100 is used and residuals are clipped to sqrt(n). Finally, genes are ranked
+    by residual variance.
+
+    Parameters
+    ----------
+    adata
+        The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
+        to cells and columns to genes.
+    layer
+        If provided, use `adata.layers[layer]` for expression values instead of `adata.X`.
+    n_top_genes
+        Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or
+        `flavor='pearson_residuals'`.
+    min_mean
+        If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
+        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or
+        `flavor='pearson_residuals'`.
+    max_mean
+        If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
+        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or
+        `flavor='pearson_residuals'`.
+    min_disp
+        If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
+        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or
+        `flavor='pearson_residuals'`.
+    max_disp
+        If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
+        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or
+        `flavor='pearson_residuals'`.
+    span
+        The fraction of the data (cells) used when estimating the variance in the loess
+        model fit if `flavor='seurat_v3'`.
+    n_bins
+        Number of bins for binning the mean gene expression. Normalization is
+        done with respect to each bin. If just a single gene falls into a bin,
+        the normalized dispersion is artificially set to 1. You'll be informed
+        about this if you set `settings.verbosity = 4`. Ignored if
+        `flavor='pearson_residuals'`.
+    theta
+        If `flavor='pearson_residuals'`, this is the NB overdispersion parameter theta.
+        Higher values correspond to less overdispersion (var = mean + mean^2/theta), and
+        `theta=np.inf` corresponds to a Poisson model.
+    clip
+        If `flavor='pearson_residuals'`, this determines how residuals are clipped:
+
+            * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \
+            where n is the number of cells in the dataset (default behavior).
+            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
+            `clip=np.inf` for no clipping.
+
+    chunksize
+        If `flavor='pearson_residuals'`, this determines how many genes are processed at
+        once while computing the residual variance. Choosing a smaller value will reduce
+        the required memory.
+    flavor
+        Choose the flavor for identifying highly variable genes. For the dispersion
+        based methods in their default workflows, Seurat passes the cutoffs whereas
+        Cell Ranger passes `n_top_genes`.
+    subset
+        Inplace subset to highly-variable genes if `True` otherwise merely indicate
+        highly variable genes.
+    inplace
+        Whether to place calculated metrics in `.var` or return them.
+    batch_key
+        If specified, highly-variable genes are selected within each batch separately and merged.
+        This simple process avoids the selection of batch-specific genes and acts as a
+        lightweight batch correction method. For all flavors, genes are first sorted
+        by how many batches they are a HVG. For dispersion-based flavors ties are broken
+        by normalized dispersion. If `flavor = 'seurat_v3'`, ties are broken by the median
+        (across batches) rank based on within-batch normalized variance. If
+        `flavor='pearson_residuals'`, ties are broken by the median rank (across batches)
+        based on within-batch residual variance.
+    check_values
+        If `True`, check that the selected layer holds integer counts and warn if it does not.
+        Only used if `flavor='seurat_v3'` or `flavor='pearson_residuals'`.
+
+
+    Returns
+    -------
+    Depending on `inplace` returns calculated metrics (:class:`~pandas.DataFrame`) or
+    updates `.var` with the following fields
+
+    highly_variable : bool
+        boolean indicator of highly-variable genes
+    **means**
+        means per gene
+    **dispersions**
+        For dispersion-based flavors, dispersions per gene
+    **dispersions_norm**
+        For dispersion-based flavors, normalized dispersions per gene
+    **variances**
+        For `flavor='seurat_v3'` and `flavor='pearson_residuals'`, variance per gene
+    **variances_norm**
+        For `flavor='seurat_v3'`, normalized variance per gene, averaged in
+        the case of multiple batches
+    **residual_variances**
+        For `flavor='pearson_residuals'`, residual variance per gene. Averaged in the case of
+        multiple batches.
+    highly_variable_rank : float
+        For `flavor='seurat_v3'`, rank of the gene according to normalized
+        variance, median rank in the case of multiple batches
+        For `flavor='pearson_residuals'`, rank of the gene according to residual
+        variance, median rank in the case of multiple batches
+    highly_variable_nbatches : int
+        If batch_key is given, this denotes in how many batches genes are detected as HVG
+    highly_variable_intersection : bool
+        If batch_key is given, this denotes the genes that are highly variable in all batches
+
+    Notes
+    -----
+    This function replaces :func:`~scanpy.pp.filter_genes_dispersion`.
+    """
+
+    logg.info('extracting highly variable genes')
+
+    if not isinstance(adata, AnnData):
+        raise ValueError(
+            '`pp.highly_variable_genes` expects an `AnnData` argument, '
+            'pass `inplace=False` if you want to return a `pd.DataFrame`.'
+        )
+
+    if flavor == 'pearson_residuals':
+        if n_top_genes is None:
+            raise ValueError(
+                "`pp.highly_variable_genes` requires the argument `n_top_genes`"
+                " for `flavor='pearson_residuals'`"
+            )
+        return _highly_variable_pearson_residuals(
+            adata,
+            layer=layer,
+            n_top_genes=n_top_genes,
+            batch_key=batch_key,
+            theta=theta,
+            clip=clip,
+            chunksize=chunksize,
+            subset=subset,
+            check_values=check_values,
+            inplace=inplace,
+        )
diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py
index e69de29bb2..5e068db8db 100644
--- a/scanpy/experimental/pp/_normalization.py
+++ b/scanpy/experimental/pp/_normalization.py
@@ -0,0 +1,242 @@
+from typing import Optional, Union, Dict
+from warnings import warn
+
+import numpy as np
+import pandas as pd
+from anndata import AnnData
+from scipy.sparse import issparse
+
+from scanpy import logging as logg
+
+from scanpy._utils import view_to_actual, check_nonnegative_integers
+from scanpy.get import _get_obs_rep, _set_obs_rep
+
+from scanpy.preprocessing._pca import pca
+
+
+def _pearson_residuals(X, theta, clip, check_values, copy=False):
+    """Return clipped Pearson residuals of `X` for overdispersion `theta`."""
+    X = X.copy() if copy else X
+
+    # theta must be strictly positive
+    if theta <= 0:
+        # TODO: would "underdispersion" with negative theta make sense?
+        # then only theta=0 were undefined..
+        raise ValueError('Pearson residuals require theta > 0')
+    # default clipping bound is sqrt of the number of rows of X
+    if clip is None:
+        n = X.shape[0]
+        clip = np.sqrt(n)
+    if clip < 0:
+        raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")
+
+    if check_values and not check_nonnegative_integers(X):
+        warn(
+            "`normalize_pearson_residuals()` expects raw count data, but non-integers were found.",
+            UserWarning,
+        )
+
+    if issparse(X):
+        sums_genes = np.sum(X, axis=0)
+        sums_cells = np.sum(X, axis=1)
+        sum_total = np.sum(sums_genes).squeeze()
+    else:
+        sums_genes = np.sum(X, axis=0, keepdims=True)
+        sums_cells = np.sum(X, axis=1, keepdims=True)
+        sum_total = np.sum(sums_genes)
+
+    mu = np.array(sums_cells @ sums_genes / sum_total)
+    diff = np.array(X - mu)
+    residuals = diff / np.sqrt(mu + mu ** 2 / theta)
+
+    # limit residuals to the interval [-clip, clip]
+    residuals = np.clip(residuals, a_min=-clip, a_max=clip)
+
+    return residuals
+
+
+def normalize_pearson_residuals(
+    adata: AnnData,
+    theta: float = 100,
+    clip: Optional[float] = None,
+    layer: Optional[str] = None,
+    copy: bool = False,
+    check_values: bool = True,
+    inplace: bool = True,
+) -> Optional[Dict[str, np.ndarray]]:
+    """\
+    Computes analytic Pearson residuals, based on [Lause20]_.
+
+    Assuming a negative binomial offset model with overdispersion
+    theta shared across genes, computes Pearson residuals. By default, residuals
+    are clipped to sqrt(n) and overdispersion theta=100 is used.
+
+    Params
+    ------
+    adata
+        The annotated data matrix of shape `n_obs` × `n_vars`.
+        Rows correspond to cells and columns to genes.
+    theta
+        The NB overdispersion parameter theta. Higher values correspond to
+        less overdispersion (var = mean + mean^2/theta), and `theta=np.inf`
+        corresponds to a Poisson model.
+    clip
+        Determines if and how residuals are clipped:
+
+            * If `None`, residuals are clipped to the interval \
+            [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
+            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
+            `clip=np.inf` for no clipping.
+
+    layer
+        Layer to normalize instead of `X`. If `None`, `X` is normalized.
+    copy
+        Whether to modify copied input object. Not compatible with
+        `inplace=False`.
+    check_values
+        If `True`, check that the selected layer holds integer counts and warn if it does not.
+    inplace
+        Whether to update `adata` or return dictionary with normalized copies
+        of `adata.X` and `adata.layers`.
+
+    Returns
+    -------
+    Returns dictionary with Pearson residuals and settings
+    or updates `adata` with normalized version of the original
+    `adata.X` and `adata.layers`, depending on `inplace`.
+
+    """
+
+    if copy:
+        if not inplace:
+            raise ValueError("`copy=True` cannot be used with `inplace=False`.")
+        adata = adata.copy()
+
+    view_to_actual(adata)
+    X = _get_obs_rep(adata, layer=layer)
+    computed_on = layer if layer else 'adata.X'
+
+    msg = f'computing analytic Pearson residuals on {computed_on}'
+    start = logg.info(msg)
+
+    residuals = _pearson_residuals(X, theta, clip, check_values, copy=not inplace)
+    settings_dict = dict(theta=theta, clip=clip, computed_on=computed_on)
+
+    if inplace:
+        _set_obs_rep(adata, residuals, layer=layer)
+        adata.uns['pearson_residuals_normalization'] = settings_dict
+    else:
+        results_dict = dict(X=residuals, **settings_dict)
+
+    logg.info('    finished ({time_passed})', time=start)
+
+    if copy:
+        return adata
+    elif not inplace:
+        return results_dict
+
+
+def normalize_pearson_residuals_pca(
+    adata: AnnData,
+    theta: float = 100,
+    clip: Optional[float] = None,
+    n_comps_pca: Optional[int] = 50,
+    random_state_pca: Optional[float] = 0,
+    use_highly_variable: bool = True,
+    kwargs_pca: Optional[dict] = None,
+    check_values: bool = True,
+    inplace: bool = True,
+) -> Optional[AnnData]:
+    """\
+    Applies Pearson residual normalization and PCA, based on [Lause20]_.
+
+    Operates on the subset of highly variable genes in `adata.var['highly_variable']` by default.
+
+
+    Parameters
+    ----------
+    adata
+        The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
+        to cells and columns to genes.
+    use_highly_variable
+        Whether to use the gene selection in `adata.var['highly_variable']` to
+        subset the data before normalizing (default) or proceed on the full
+        dataset.
+    theta
+        This is the NB overdispersion parameter theta for Pearson residual
+        computations. Higher values correspond to less overdispersion
+        (var = mean + mean^2/theta), and `theta=np.inf` corresponds to a
+        Poisson model.
+    clip
+        This determines how Pearson residuals are clipped:
+
+            * If `None`, residuals are clipped to the interval \
+            [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
+            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
+            `clip=np.inf` for no clipping.
+
+    n_comps_pca
+        Number of principal components to compute.
+    random_state_pca
+        Change to use different initial states for the optimization.
+    kwargs_pca
+        Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`.
+    check_values
+        If `True`, check that the selected layer holds integer counts and warn if it does not.
+    inplace
+        Whether to place results in `adata` or return them.
+
+
+    Returns
+    -------
+    If `inplace=False`, returns the Pearson residual-based PCA results
+    (`adata_pca`).
+    If `inplace=True`, updates `adata` with the following fields:
+
+    `.uns['pearson_residuals_normalization']['pearson_residuals_df']`
+         The hvg-subset, normalized by Pearson residuals
+    `.uns['pearson_residuals_normalization']['theta']`
+         The used value of the overdispersion parameter theta
+    `.uns['pearson_residuals_normalization']['clip']`
+         The used value of the clipping parameter
+
+    `.obsm['X_pearson_residuals_pca']`
+        PCA representation of data after gene selection and Pearson residual
+        normalization.
+    `.uns['pearson_residuals_pca']['PCs']`
+         The principal components containing the loadings.
+    `.uns['pearson_residuals_pca']['variance_ratio']`
+         Ratio of explained variance.
+    `.uns['pearson_residuals_pca']['variance']`
+         Explained variance, equivalent to the eigenvalues of the
+         covariance matrix.
+
+    """
+
+    if use_highly_variable and 'highly_variable' in adata.var_keys():
+        # TODO: are these copies needed?
+        adata_pca = adata[:, adata.var['highly_variable']].copy()
+    else:
+        # TODO: are these copies needed?
+        adata_pca = adata.copy()
+
+    normalize_pearson_residuals(
+        adata_pca, theta=theta, clip=clip, check_values=check_values
+    )
+    pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca, **(kwargs_pca or {}))
+
+    if inplace:
+        norm_settings = adata_pca.uns['pearson_residuals_normalization']
+        norm_dict = dict(**norm_settings, pearson_residuals_df=adata_pca.to_df())
+        pca_settings = adata_pca.uns['pca']
+        pca_dict = dict(**pca_settings, PCs=adata_pca.varm['PCs'])
+        adata.uns['pearson_residuals_pca'] = pca_dict
+        adata.uns['pearson_residuals_normalization'] = norm_dict
+        adata.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca']
+        return None
+    else:
+        adata_pca.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'].copy()
+        adata_pca.uns['pearson_residuals_pca'] = adata_pca.uns['pca'].copy()
+        del adata_pca.obsm['X_pca']
+        del adata_pca.uns['pca']
+        return adata_pca
diff --git a/scanpy/preprocessing/__init__.py b/scanpy/preprocessing/__init__.py
index 8adee1f813..b811a89cf0 100644
--- a/scanpy/preprocessing/__init__.py
+++ b/scanpy/preprocessing/__init__.py
@@ -1,9 +1,4 @@
-from ._recipes import (
-    recipe_zheng17,
-    recipe_weinreb17,
-    recipe_seurat,
-    recipe_pearson_residuals,
-)
+from ._recipes import recipe_zheng17, recipe_weinreb17, recipe_seurat
 from ._simple import filter_cells, filter_genes
 from ._deprecated.highly_variable_genes import filter_genes_dispersion
 from ._highly_variable_genes import highly_variable_genes
@@ -12,10 +7,5 @@
 from ._pca import pca
 from ._qc import calculate_qc_metrics
 from ._combat import combat
-from ._normalization import (
-    normalize_total,
-    normalize_pearson_residuals,
-    normalize_pearson_residuals_pca,
-)
-
+from ._normalization import normalize_total
 from ..neighbors import neighbors
diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 1575f5b083..d9c8aae568 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -176,210 +176,6 @@ def _highly_variable_genes_seurat_v3(
         return df
 
 
-def _highly_variable_pearson_residuals(
-    adata: AnnData,
-    layer: Optional[str] = None,
-    n_top_genes: int = 1000,
-    batch_key: Optional[str] = None,
-    theta: float = 100,
-    clip: Optional[float] = None,
-    chunksize: int = 100,
-    check_values: bool = True,
-    subset: bool = False,
-    inplace: bool = True,
-) -> Optional[pd.DataFrame]:
-    """\
-    See `highly_variable_genes`.
-
-    Returns
-    -------
-    Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`)
-    or updates `.var` with the following fields:
-
-    highly_variable : bool
-        boolean indicator of highly-variable genes.
-    means : float
-        means per gene.
-    variances : float
-        variances per gene.
-    residual_variances : float
-        Pearson residual variance per gene. Averaged in the case of multiple
-        batches.
-    highly_variable_rank : float
-        Rank of the gene according to residual variance, median rank in the
-        case of multiple batches. NaN for non-HVGs.
-    highly_variable_nbatches : int
-        If batch_key is given, this denotes in how many batches genes are
-        detected as HVG.
-    highly_variable_intersection : bool
-        If batch_key is given, this denotes the genes that are highly variable
-        in all batches.
-    """
-
-    view_to_actual(adata)
-    X = _get_obs_rep(adata, layer=layer)
-    computed_on = layer if layer else 'adata.X'
-
-    # Check for raw counts
-    if check_values and (check_nonnegative_integers(X) is False):
-        warnings.warn(
-            "`flavor='pearson_residuals'` expects raw count data, but non-integers were found.",
-            UserWarning,
-        )
-    # check theta
-    if theta <= 0:
-        # TODO: would "underdispersion" with negative theta make sense?
-        # then only theta=0 were undefined..
-        raise ValueError('Pearson residuals require theta > 0')
-    # prepare clipping
-
-    if batch_key is None:
-        batch_info = np.zeros(adata.shape[0], dtype=int)
-    else:
-        batch_info = adata.obs[batch_key].values
-    n_batches = len(np.unique(batch_info))
-
-    # Get pearson residuals for each batch separately
-    residual_gene_vars = []
-    for batch in np.unique(batch_info):
-
-        adata_subset = adata[batch_info == batch]
-
-        # Filter out zero genes
-        with settings.verbosity.override(Verbosity.error):
-            nonzero_genes = filter_genes(adata_subset, min_cells=1, inplace=False)[0]
-        adata_subset = adata_subset[:, nonzero_genes]
-
-        if layer is not None:
-            X_batch = adata_subset.layers[layer]
-        else:
-            X_batch = adata_subset.X
-
-        # Prepare clipping
-        if clip is None:
-            n = X_batch.shape[0]
-            clip = np.sqrt(n)
-        if clip < 0:
-            raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")
-
-        if sp_sparse.issparse(X_batch):
-            sums_genes = np.sum(X_batch, axis=0)
-            sums_cells = np.sum(X_batch, axis=1)
-            sum_total = np.sum(sums_genes).squeeze()
-        else:
-            sums_genes = np.sum(X_batch, axis=0, keepdims=True)
-            sums_cells = np.sum(X_batch, axis=1, keepdims=True)
-            sum_total = np.sum(sums_genes)
-
-        # Compute pearson residuals in chunks
-        residual_gene_var = np.empty((X_batch.shape[1]))
-        for start in np.arange(0, X_batch.shape[1], chunksize):
-            stop = start + chunksize
-            mu = np.array(sums_cells @ sums_genes[:, start:stop] / sum_total)
-            X_dense = X_batch[:, start:stop].toarray()
-            residuals = (X_dense - mu) / np.sqrt(mu + mu ** 2 / theta)
-            residuals = np.clip(residuals, a_min=-clip, a_max=clip)
-            residual_gene_var[start:stop] = np.var(residuals, axis=0)
-
-        # Add 0 values for genes that were filtered out
-        zero_gene_var = np.zeros(np.sum(~nonzero_genes))
-        residual_gene_var = np.concatenate((residual_gene_var, zero_gene_var))
-        # Order as before filtering
-        idxs = np.concatenate((np.where(nonzero_genes)[0], np.where(~nonzero_genes)[0]))
-        residual_gene_var = residual_gene_var[np.argsort(idxs)]
-        residual_gene_vars.append(residual_gene_var.reshape(1, -1))
-
-    residual_gene_vars = np.concatenate(residual_gene_vars, axis=0)
-
-    # Get cutoffs and define hvgs per batch
-    residual_gene_vars_sorted = np.sort(residual_gene_vars, axis=1)
-    cutoffs_per_batch = residual_gene_vars_sorted[:, -n_top_genes]
-    highly_variable_per_batch = np.greater_equal(
-        residual_gene_vars.T, cutoffs_per_batch
-    ).T
-
-    # Merge hvgs across batches
-    highly_variable_nbatches = np.sum(highly_variable_per_batch, axis=0)
-    highly_variable_intersection = highly_variable_nbatches == n_batches
-
-    # Get rank per gene within each batch
-    # argsort twice gives ranks, small rank means most variable
-    ranks_residual_var = np.argsort(np.argsort(-residual_gene_vars, axis=1), axis=1)
-    ranks_residual_var = ranks_residual_var.astype(np.float32)
-    ranks_residual_var[ranks_residual_var >= n_top_genes] = np.nan
-    ranks_masked_array = np.ma.masked_invalid(ranks_residual_var)
-    # Median rank across batches,
-    # ignoring batches in which gene was not selected
-    medianrank_residual_var = np.ma.median(ranks_masked_array, axis=0).filled(np.nan)
-
-    means, variances = materialize_as_ndarray(_get_mean_var(X))
-    df = pd.DataFrame.from_dict(
-        dict(
-            means=means,
-            variances=variances,
-            residual_variances=np.mean(residual_gene_vars, axis=0).astype(
-                np.float32, copy=False
-            ),
-            highly_variable_rank=medianrank_residual_var,
-            highly_variable_nbatches=highly_variable_nbatches.astype(np.int64),
-            highly_variable_intersection=highly_variable_intersection,
-        )
-    )
-    df = df.set_index(adata.var_names)
-
-    # Sort genes by how often they selected as hvg within each batch and
-    # break ties with median rank of residual variance across batches
-    df.sort_values(
-        ['highly_variable_nbatches', 'highly_variable_rank'],
-        ascending=[False, True],
-        na_position='last',
-        inplace=True,
-    )
-
-    high_var = np.zeros(df.shape[0])
-    high_var[:n_top_genes] = True
-    df['highly_variable'] = high_var.astype(bool)
-    df = df.loc[adata.var_names, :]
-
-    if inplace:
-        adata.uns['hvg'] = {'flavor': 'pearson_residuals', 'computed_on': computed_on}
-        logg.hint(
-            'added\n'
-            '    \'highly_variable\', boolean vector (adata.var)\n'
-            '    \'highly_variable_rank\', float vector (adata.var)\n'
-            '    \'highly_variable_nbatches\', int vector (adata.var)\n'
-            '    \'highly_variable_intersection\', boolean vector (adata.var)\n'
-            '    \'means\', float vector (adata.var)\n'
-            '    \'variances\', float vector (adata.var)\n'
-            '    \'residual_variances\', float vector (adata.var)'
-        )
-        adata.var['means'] = df['means'].values
-        adata.var['variances'] = df['variances'].values
-        adata.var['residual_variances'] = df['residual_variances']
-        adata.var['highly_variable_rank'] = df['highly_variable_rank'].values
-        if batch_key is not None:
-            adata.var['highly_variable_nbatches'] = df[
-                'highly_variable_nbatches'
-            ].values
-            adata.var['highly_variable_intersection'] = df[
-                'highly_variable_intersection'
-            ].values
-        adata.var['highly_variable'] = df['highly_variable'].values
-
-        if subset:
-            adata._inplace_subset_var(df['highly_variable'].values)
-
-    else:
-        if batch_key is None:
-            df = df.drop(
-                ['highly_variable_nbatches', 'highly_variable_intersection'], axis=1
-            )
-        if subset:
-            df = df.iloc[df.highly_variable.values, :]
-
-        return df
-
-
 def _highly_variable_genes_single_batch(
     adata: AnnData,
     layer: Optional[str] = None,
@@ -502,26 +298,20 @@ def highly_variable_genes(
     max_mean: Optional[float] = 3,
     span: Optional[float] = 0.3,
     n_bins: int = 20,
-    theta: float = 100,
-    clip: Optional[float] = None,
-    chunksize: int = 1000,
-    flavor: Literal[
-        'seurat', 'cell_ranger', 'seurat_v3', 'pearson_residuals'
-    ] = 'seurat',
+    flavor: Literal['seurat', 'cell_ranger', 'seurat_v3'] = 'seurat',
     subset: bool = False,
     inplace: bool = True,
     batch_key: Optional[str] = None,
     check_values: bool = True,
 ) -> Optional[pd.DataFrame]:
     """\
-    Annotate highly variable genes [Satija15]_ [Zheng17]_ [Stuart19]_ [Lause20]_.
+    Annotate highly variable genes [Satija15]_ [Zheng17]_ [Stuart19]_.
 
-    Expects logarithmized data, except when `flavor='seurat_v3'` or
-    `flavor='pearson_residuals'`, in which count data is expected.
+    Expects logarithmized data, except when `flavor='seurat_v3'`, in which count
+    data is expected.
 
     Depending on `flavor`, this reproduces the R-implementations of Seurat
-    [Satija15]_, Cell Ranger [Zheng17]_, and Seurat v3 [Stuart19]_, or uses
-    analytical Pearson residuals [Lause20]_.
+    [Satija15]_, Cell Ranger [Zheng17]_, and Seurat v3 [Stuart19]_.
 
     For the dispersion-based methods ([Satija15]_ and [Zheng17]_), the normalized
     dispersion is obtained by scaling with the mean and standard deviation of
@@ -534,10 +324,8 @@ def highly_variable_genes(
     standard deviation. Next, the normalized variance is computed as the variance
     of each gene after the transformation. Genes are ranked by the normalized variance.
 
-    For [Lause20]_, Pearson residuals of a negative binomial offset model (with
-    overdispersion theta shared across genes) are computed. By default, overdispersion
-    theta=100 is used and residuals are clipped to sqrt(n). Finally, genes are ranked
-    by residual variance.
+    See also `scanpy.experimental.pp.highly_variable_genes` for additional flavors
+    (e.g. Pearson residuals).
 
     Parameters
     ----------
@@ -547,24 +335,19 @@ def highly_variable_genes(
     layer
         If provided, use `adata.layers[layer]` for expression values instead of `adata.X`.
     n_top_genes
-        Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or
-        `flavor='pearson_residuals'`.
+        Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'`.
     min_mean
         If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
-        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or
-        `flavor='pearson_residuals'`.
+        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`.
     max_mean
         If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
-        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or
-        `flavor='pearson_residuals'`.
+        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`.
     min_disp
         If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
-        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or
-        `flavor='pearson_residuals'`.
+        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`.
     max_disp
         If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
-        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or
-        `flavor='pearson_residuals'`.
+        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'`.
     span
         The fraction of the data (cells) used when estimating the variance in the loess
         model fit if `flavor='seurat_v3'`.
@@ -572,24 +355,7 @@ def highly_variable_genes(
         Number of bins for binning the mean gene expression. Normalization is
         done with respect to each bin. If just a single gene falls into a bin,
         the normalized dispersion is artificially set to 1. You'll be informed
-        about this if you set `settings.verbosity = 4`. Ignored if
-        `flavor='pearson_residuals'`.
-    theta
-        If `flavor='pearson_residuals'`, this is the NB overdispersion parameter theta.
-        Higher values correspond to less overdispersion (var = mean + mean^2/theta), and
-        `theta=np.Inf` corresponds to a Poisson model.
-    clip
-        If `flavor='pearson_residuals'`, this determines how residuals are clipped:
-
-            * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \
-            where n is the number of cells in the dataset (default behavior).
-            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
-            `clip=np.Inf` for no clipping.
-
-    chunksize
-        If `flavor='pearson_residuals'`, this dertermines how many genes are processed at
-        once while computing the residual variance. Choosing a smaller value will reduce
-        the required memory.
+        about this if you set `settings.verbosity = 4`.
     flavor
         Choose the flavor for identifying highly variable genes. For the dispersion
         based methods in their default workflows, Seurat passes the cutoffs whereas
@@ -605,12 +371,10 @@ def highly_variable_genes(
         lightweight batch correction method. For all flavors, genes are first sorted
         by how many batches they are a HVG. For dispersion-based flavors ties are broken
         by normalized dispersion. If `flavor = 'seurat_v3'`, ties are broken by the median
-        (across batches) rank based on within-batch normalized variance. If
-        `flavor='pearson_residuals'`, ties are broken by the median rank (across batches)
-        based on within-batch residual variance.
+        (across batches) rank based on within-batch normalized variance.
     check_values
         Check if counts in selected layer are integers. A Warning is returned if set to True.
-        Only used if `flavor='seurat_v3'` or `flavor='pearson_residuals'`.
+        Only used if `flavor='seurat_v3'`.
 
 
     Returns
@@ -627,18 +391,13 @@ def highly_variable_genes(
     **dispersions_norm**
         For dispersion-based flavors, normalized dispersions per gene
     **variances**
-        For `flavor='seurat_v3'` and `flavor='pearson_residuals'`, variance per gene
+        For `flavor='seurat_v3'`, variance per gene
     **variances_norm**
         For `flavor='seurat_v3'`, normalized variance per gene, averaged in
         the case of multiple batches
-    **residual_variances**
-        For `flavor='pearson_residuals'`, residual variance per gene. Averaged in the case of
-        multiple batches.
     highly_variable_rank : float
         For `flavor='seurat_v3'`, rank of the gene according to normalized
         variance, median rank in the case of multiple batches
-        For `flavor='pearson_residuals'`, rank of the gene according to residual
-        variance, median rank in the case of multiple batches
     highly_variable_nbatches : int
         If batch_key is given, this denotes in how many batches genes are detected as HVG
     highly_variable_intersection : bool
@@ -673,24 +432,6 @@ def highly_variable_genes(
             subset=subset,
             inplace=inplace,
         )
-    if flavor == 'pearson_residuals':
-        if n_top_genes is None:
-            raise ValueError(
-                "`pp.highly_variable_genes` requires the argument `n_top_genes`"
-                " for `flavor='pearson_residuals'`"
-            )
-        return _highly_variable_pearson_residuals(
-            adata,
-            layer=layer,
-            n_top_genes=n_top_genes,
-            batch_key=batch_key,
-            theta=theta,
-            clip=clip,
-            chunksize=chunksize,
-            subset=subset,
-            check_values=check_values,
-            inplace=inplace,
-        )
 
     if batch_key is None:
         df = _highly_variable_genes_single_batch(
diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index adc8820b2e..0a853d3c89 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -31,234 +31,6 @@ def _normalize_data(X, counts, after=None, copy=False):
     return X
 
 
-def _pearson_residuals(X, theta, clip, check_values, copy=False):
-
-    X = X.copy() if copy else X
-
-    # check theta
-    if theta <= 0:
-        # TODO: would "underdispersion" with negative theta make sense?
-        # then only theta=0 were undefined..
-        raise ValueError('Pearson residuals require theta > 0')
-    # prepare clipping
-    if clip is None:
-        n = X.shape[0]
-        clip = np.sqrt(n)
-    if clip < 0:
-        raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")
-
-    if check_values and not check_nonnegative_integers(X):
-        warn(
-            "`normalize_pearson_residuals()` expects raw count data, but non-integers were found.",
-            UserWarning,
-        )
-
-    if issparse(X):
-        sums_genes = np.sum(X, axis=0)
-        sums_cells = np.sum(X, axis=1)
-        sum_total = np.sum(sums_genes).squeeze()
-    else:
-        sums_genes = np.sum(X, axis=0, keepdims=True)
-        sums_cells = np.sum(X, axis=1, keepdims=True)
-        sum_total = np.sum(sums_genes)
-
-    mu = np.array(sums_cells @ sums_genes / sum_total)
-    diff = np.array(X - mu)
-    residuals = diff / np.sqrt(mu + mu ** 2 / theta)
-
-    # clip
-    residuals = np.clip(residuals, a_min=-clip, a_max=clip)
-
-    return residuals
-
-
-def normalize_pearson_residuals(
-    adata: AnnData,
-    theta: float = 100,
-    clip: Optional[float] = None,
-    layer: Optional[str] = None,
-    copy: bool = False,
-    check_values: bool = True,
-    inplace: bool = True,
-) -> Optional[Dict[str, np.ndarray]]:
-    """\
-    Computes analytic Pearson residuals, based on [Lause20]_.
-
-    Assuming a negative binomial offset model with overdispersion
-    theta shared across genes, computes Pearson residuals. By default, residuals
-    are clipped to sqrt(n) and overdispersion theta=100 is used.
-
-    Params
-    ------
-    adata
-        The annotated data matrix of shape `n_obs` × `n_vars`.
-        Rows correspond to cells and columns to genes.
-    theta
-        The NB overdispersion parameter theta. Higher values correspond to
-        less overdispersion (var = mean + mean^2/theta), and `theta=np.Inf`
-        corresponds to a Poisson model.
-    clip
-        Determines if and how residuals are clipped:
-
-            * If `None`, residuals are clipped to the interval \
-            [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
-            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
-            `clip=np.Inf` for no clipping.
-
-    layer
-        Layer to normalize instead of `X`. If `None`, `X` is normalized.
-    copy
-        Whether to modify copied input object. Not compatible with
-        `inplace=False`.
-    check_values
-        Check if counts in selected layer are integers. A Warning is returned if set to True.
-    inplace
-        Whether to update `adata` or return dictionary with normalized copies
-        of `adata.X` and `adata.layers`.
-
-    Returns
-    -------
-    Returns dictionary with Pearson residuals and settings
-    or updates `adata` with normalized version of the original
-    `adata.X` and `adata.layers`, depending on `inplace`.
-
-    """
-
-    if copy:
-        if not inplace:
-            raise ValueError("`copy=True` cannot be used with `inplace=False`.")
-        adata = adata.copy()
-
-    view_to_actual(adata)
-    X = _get_obs_rep(adata, layer=layer)
-    computed_on = layer if layer else 'adata.X'
-
-    msg = f'computing analytic Pearson residuals on {computed_on}'
-    start = logg.info(msg)
-
-    residuals = _pearson_residuals(X, theta, clip, check_values, copy=~inplace)
-    settings_dict = dict(theta=theta, clip=clip, computed_on=computed_on)
-
-    if inplace:
-        _set_obs_rep(adata, residuals, layer=layer)
-        adata.uns['pearson_residuals_normalization'] = settings_dict
-    else:
-        results_dict = dict(X=residuals, **settings_dict)
-
-    logg.info('    finished ({time_passed})', time=start)
-
-    if copy:
-        return adata
-    elif not inplace:
-        return results_dict
-
-
-def normalize_pearson_residuals_pca(
-    adata: AnnData,
-    theta: float = 100,
-    clip: Optional[float] = None,
-    n_comps_pca: Optional[int] = 50,
-    random_state_pca: Optional[float] = 0,
-    use_highly_variable: bool = True,
-    kwargs_pca: Optional[dict] = {},
-    check_values: bool = True,
-    inplace: bool = True,
-) -> Optional[pd.DataFrame]:
-    """\
-    Applies Pearson residual normalization and PCA, based on [Lause20]_.
-
-    Operates on the subset of highly variable genes in `adata.var['highly_variable']` by default.
-
-
-    Parameters
-    ----------
-    adata
-        The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
-        to cells and columns to genes.
-    use_highly_variable
-        Whether to use the gene selection in `adata.var['highly_variable']` to
-        subset the data before normalizing (default) or proceed on the full
-        dataset.
-    theta
-        This is the NB overdispersion parameter theta for Pearson residual
-        computations. Higher values correspond to less overdispersion
-        (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a
-        Poisson model.
-    clip
-        This determines how Pearson residuals are clipped:
-
-            * If `None`, residuals are clipped to the interval \
-            [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
-            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
-            `clip=np.Inf` for no clipping.
-
-    n_comps_pca
-        Number of principal components to compute.
-    random_state_pca
-        Change to use different initial states for the optimization.
-    kwargs_pca
-        Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`.
-    check_values
-        Check if counts in selected layer are integers. A Warning is returned if set to True.
-    inplace
-        Whether to place results in `adata` or return them.
-
-
-    Returns
-    -------
-    If `inplace=False`, returns the Pearson residual-based PCA results
-    (`adata_pca`).
-    If `inplace=True`, updates `adata` with the following fields:
-
-    `.uns['pearson_residuals_normalization']['pearson_residuals_df']`
-         The hvg-subset, normalized by Pearson residuals
-    `.uns['pearson_residuals_normalization']['theta']`
-         The used value of the overdisperion parameter theta
-    `.uns['pearson_residuals_normalization']['clip']`
-         The used value of the clipping parameter
-
-    `.obsm['X_pearson_residuals_pca']`
-        PCA representation of data after gene selection and Pearson residual
-        normalization.
-    `.uns['pearson_residuals_pca']['PCs']`
-         The principal components containing the loadings.
-    `.uns['pearson_residuals_pca']['variance_ratio']`
-         Ratio of explained variance.
-    `.uns['pearson_residuals_pca']['variance']`
-         Explained variance, equivalent to the eigenvalues of the
-         covariance matrix.
-
-    """
-
-    if use_highly_variable and 'highly_variable' in adata.var_keys():
-        # TODO: are these copies needed?
-        adata_pca = adata[:, adata.var['highly_variable']].copy()
-    else:
-        # TODO: are these copies needed?
-        adata_pca = adata.copy()
-
-    normalize_pearson_residuals(
-        adata_pca, theta=theta, clip=clip, check_values=check_values
-    )
-    pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca, **kwargs_pca)
-
-    if inplace:
-        norm_settings = adata_pca.uns['pearson_residuals_normalization']
-        norm_dict = dict(**norm_settings, pearson_residuals_df=adata_pca.to_df())
-        pca_settings = adata_pca.uns['pca']
-        pca_dict = dict(**pca_settings, PCs=adata_pca.varm['PCs'])
-        adata.uns['pearson_residuals_pca'] = pca_dict
-        adata.uns['pearson_residuals_normalization'] = norm_dict
-        adata.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca']
-        return None
-    else:
-        adata_pca.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'].copy()
-        adata_pca.uns['pearson_residuals_pca'] = adata_pca.uns['pca'].copy()
-        del adata_pca.obsm['X_pca']
-        del adata_pca.uns['pca']
-        return adata_pca
-
-
 def normalize_total(
     adata: AnnData,
     target_sum: Optional[float] = None,
diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py
index fd28bea576..a4696e0827 100644
--- a/scanpy/preprocessing/_recipes.py
+++ b/scanpy/preprocessing/_recipes.py
@@ -170,158 +170,3 @@ def recipe_zheng17(
     pp.scale(adata)
     logg.info('    finished', time=start)
     return adata if copy else None
-
-
-def recipe_pearson_residuals(
-    adata: AnnData,
-    n_top_genes: int = 1000,
-    theta: float = 100,
-    clip: Optional[float] = None,
-    chunksize: int = 1000,
-    batch_key: Optional[str] = None,
-    n_comps_pca: Optional[int] = 50,
-    random_state_pca: Optional[float] = 0,
-    kwargs_pca: dict = {},
-    check_values: bool = True,
-    inplace: bool = True,
-) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]:
-    """\
-    Gene selection and normalization based on [Lause20]_.
-
-    Applies gene selection based on Pearson residuals. On the resulting subset,
-    Pearson residual normalization and PCA are performed.
-
-
-    Parameters
-    ----------
-    adata
-        The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
-        to cells and columns to genes.
-    n_top_genes
-        Number of highly-variable genes to keep. Mandatory if
-        `flavor='seurat_v3'` or `flavor='pearson_residuals'`.
-    chunksize
-        This dertermines how many genes are processed at once while computing
-        the Pearson residual variance. Choosing a smaller value will reduce
-        the required memory.
-    theta
-        This is the NB overdispersion parameter theta for Pearson residual
-        computations. Higher values correspond to less overdispersion
-        (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a
-        Poisson model.
-    clip
-        This determines if and how Pearson residuals are clipped:
-
-            * If `None`, residuals are clipped to the interval \
-            [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
-            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
-            `clip=np.Inf` for no clipping.
-
-    batch_key
-        If specified, highly-variable genes are selected within each batch
-        separately and merged. This simple process avoids the selection of
-        batch-specific genes and acts as a lightweight batch correction
-        method. For all flavors, genes are first sorted by how many batches
-        they are a HVG. Ties are broken by the median rank (across batches)
-        based on within-batch residual variance.
-
-    n_comps_pca
-        Number of principal components to compute.
-    random_state_pca
-        Change to use different initial states for the optimization.
-    kwargs_pca
-        Dictionary of further keyword arguments passed on to `sc.pp.pca()`.
-    check_values
-        Check if counts in selected layer are integers. A Warning is returned if set to True.
-    inplace
-        Whether to place results in `adata` or return them.
-
-    Returns
-    ------
-    If `inplace=False`, separately returns the gene selection results (`hvg`)
-    and Pearson residual-based PCA results (`adata_pca`). If `inplace=True`,
-    updates `adata` with the following fields for gene selection results…:
-
-    `.var['highly_variable']`
-        boolean indicator of highly-variable genes.
-    `.var['means']`
-        means per gene.
-    `.var['variances']`
-        variances per gene.
-    `.var['residual_variances']`
-        Pearson residual variance per gene. Averaged in the case of multiple
-        batches.
-    `.var['highly_variable_rank']`
-        Rank of the gene according to residual variance, median rank in the
-        case of multiple batches.
-    `.var['highly_variable_nbatches']`
-        If batch_key is given, this denotes in how many batches genes are
-        detected as HVG.
-    `.var['highly_variable_intersection']`
-        If batch_key is given, this denotes the genes that are highly variable
-        in all batches.
-
-    …and the following fields for Pearson residual-based PCA results and
-    normalization settings:
-
-    `.uns['pearson_residuals_normalization']['pearson_residuals_df']`
-         The hvg-subset, normalized by Pearson residuals.
-    `.uns['pearson_residuals_normalization']['theta']`
-         The used value of the overdisperion parameter theta.
-    `.uns['pearson_residuals_normalization']['clip']`
-         The used value of the clipping parameter.
-
-    `.obsm['pearson_residuals_X_pca']`
-        PCA representation of data after gene selection and Pearson residual
-        normalization.
-    `.uns['pearson_residuals_pca']['PCs']`
-         The principal components containing the loadings.
-    `.uns['pearson_residuals_pca']['variance_ratio']`
-         Ratio of explained variance.
-    `.uns['pearson_residuals_pca']['variance']`
-         Explained variance, equivalent to the eigenvalues of the
-         covariance matrix.
-
-    """
-
-    hvg_args = dict(
-        flavor='pearson_residuals',
-        n_top_genes=n_top_genes,
-        batch_key=batch_key,
-        theta=theta,
-        clip=clip,
-        chunksize=chunksize,
-        check_values=check_values,
-    )
-
-    if inplace:
-        pp.highly_variable_genes(adata, **hvg_args, inplace=True)
-        # TODO: are these copies needed?
-        adata_pca = adata[:, adata.var['highly_variable']].copy()
-    else:
-        hvg = pp.highly_variable_genes(adata, **hvg_args, inplace=False)
-        # TODO: are these copies needed?
-        adata_pca = adata[:, hvg['highly_variable']].copy()
-
-    pp.normalize_pearson_residuals(
-        adata_pca, theta=theta, clip=clip, check_values=check_values
-    )
-    pp.pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca, **kwargs_pca)
-
-    if inplace:
-        normalization_param = adata_pca.uns['pearson_residuals_normalization']
-        normalization_dict = dict(
-            **normalization_param, pearson_residuals_df=adata_pca.to_df()
-        )
-        pca_param = adata_pca.uns['pca']
-        pca_dict = dict(**pca_param, PCs=adata_pca.varm['PCs'])
-        adata.uns['pearson_residuals_pca'] = pca_dict
-        adata.uns['pearson_residuals_normalization'] = normalization_dict
-        adata.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca']
-        return None
-    else:
-        adata_pca.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'].copy()
-        adata_pca.uns['pearson_residuals_pca'] = adata_pca.uns['pca'].copy()
-        del adata_pca.obsm['X_pca']
-        del adata_pca.uns['pca']
-        return adata_pca, hvg

From dd16140cdb95961d8d26497ce3fd20dfea6904d4 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Wed, 14 Jul 2021 16:49:43 +0200
Subject: [PATCH 39/96] linking tests to new experimental submodule, style
 cleanup

---
 scanpy/tests/test_highly_variable_genes.py | 24 +++++++-------
 scanpy/tests/test_normalization.py         | 38 ++++++++++++----------
 2 files changed, 33 insertions(+), 29 deletions(-)

diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py
index c80e0a442a..a92c59ebf5 100644
--- a/scanpy/tests/test_highly_variable_genes.py
+++ b/scanpy/tests/test_highly_variable_genes.py
@@ -58,7 +58,7 @@ def test_highly_variable_genes_basic():
 
 
 def _residual_var_reference(adata, clip=None, theta=100):
-    sc.pp.normalize_pearson_residuals(adata, clip=clip, theta=theta)
+    sc.experimental.pp.normalize_pearson_residuals(adata, clip=clip, theta=theta)
     residuals = adata.X
     return np.var(residuals, axis=0)
 
@@ -80,7 +80,7 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp
 
         # expecting 0 no-int warnings
         with pytest.warns(None) as record:
-            sc.pp.highly_variable_genes(
+            sc.experimental.pp.highly_variable_genes(
                 adata_noninteger.copy(),
                 flavor='pearson_residuals',
                 n_top_genes=100,
@@ -90,7 +90,7 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp
 
         # expecting 1 no-int warning
         with pytest.warns(None) as record:
-            sc.pp.highly_variable_genes(
+            sc.experimental.pp.highly_variable_genes(
                 adata_noninteger.copy(),
                 flavor='pearson_residuals',
                 n_top_genes=100,
@@ -101,17 +101,17 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp
 
     # errors should be raised for invalid theta values
     with pytest.raises(ValueError) as record:
-        sc.pp.highly_variable_genes(
+        sc.experimental.pp.highly_variable_genes(
             adata.copy(), flavor='pearson_residuals', n_top_genes=100, theta=0
         )
     with pytest.raises(ValueError) as record:
-        sc.pp.highly_variable_genes(
+        sc.experimental.pp.highly_variable_genes(
             adata.copy(), flavor='pearson_residuals', n_top_genes=100, theta=-1
         )
 
     # error should be raised for invalid clipping values
     with pytest.raises(ValueError) as record:
-        sc.pp.highly_variable_genes(
+        sc.experimental.pp.highly_variable_genes(
             adata.copy(), flavor='pearson_residuals', n_top_genes=100, clip=-1
         )
 
@@ -141,7 +141,7 @@ def test_highly_variable_genes_pearson_residuals_values(
         residual_variances_reference = residual_variances_reference[top_n_idx]
 
     # compute output to be tested
-    output_df = sc.pp.highly_variable_genes(
+    output_df = sc.experimental.pp.highly_variable_genes(
         adata,
         flavor='pearson_residuals',
         n_top_genes=n_top_genes,
@@ -151,7 +151,7 @@ def test_highly_variable_genes_pearson_residuals_values(
         theta=theta,
     )
 
-    sc.pp.highly_variable_genes(
+    sc.experimental.pp.highly_variable_genes(
         adata,
         flavor='pearson_residuals',
         n_top_genes=n_top_genes,
@@ -198,7 +198,7 @@ def test_highly_variable_genes_pearson_residuals_general(
         # (results in sorted "gene order" in reference)
         residual_variances_reference = residual_variances_reference[top_n_idx]
     # compute output to be tested
-    output_df = sc.pp.highly_variable_genes(
+    output_df = sc.experimental.pp.highly_variable_genes(
         adata,
         flavor='pearson_residuals',
         n_top_genes=n_top_genes,
@@ -206,7 +206,7 @@ def test_highly_variable_genes_pearson_residuals_general(
         inplace=False,
     )
 
-    sc.pp.highly_variable_genes(
+    sc.experimental.pp.highly_variable_genes(
         adata,
         flavor='pearson_residuals',
         n_top_genes=n_top_genes,
@@ -269,7 +269,7 @@ def test_highly_variable_genes_pearson_residuals_batch(
     adata.var.drop(columns=adata.var.columns, inplace=True)
     n_genes = adata.shape[1]
 
-    output_df = sc.pp.highly_variable_genes(
+    output_df = sc.experimental.pp.highly_variable_genes(
         adata,
         flavor='pearson_residuals',
         n_top_genes=n_top_genes,
@@ -278,7 +278,7 @@ def test_highly_variable_genes_pearson_residuals_batch(
         inplace=False,
     )
 
-    sc.pp.highly_variable_genes(
+    sc.experimental.pp.highly_variable_genes(
         adata,
         flavor='pearson_residuals',
         n_top_genes=n_top_genes,
diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py
index eb9bbc8533..ad7ba7bde7 100644
--- a/scanpy/tests/test_normalization.py
+++ b/scanpy/tests/test_normalization.py
@@ -80,27 +80,27 @@ def test_normalize_pearson_residuals_inputchecks(sparsity_func, dtype):
         adata_noninteger.X[x[0], y[0]] = 0.5
 
         with pytest.warns(UserWarning) as record:
-            sc.pp.normalize_pearson_residuals(
+            sc.experimental.pp.normalize_pearson_residuals(
                 adata_noninteger.copy(), check_values=True
             )
         assert len(record) == 1
         assert "expects raw count data" in record[0].message.args[0]
 
         with pytest.warns(None) as record:
-            sc.pp.normalize_pearson_residuals(
+            sc.experimental.pp.normalize_pearson_residuals(
                 adata_noninteger.copy(), check_values=False
             )
         assert len(record) == 0
 
     # errors should be raised for invalid theta values
     with pytest.raises(ValueError) as record:
-        sc.pp.normalize_pearson_residuals(adata.copy(), theta=0)
+        sc.experimental.pp.normalize_pearson_residuals(adata.copy(), theta=0)
     with pytest.raises(ValueError) as record:
-        sc.pp.normalize_pearson_residuals(adata.copy(), theta=-1)
+        sc.experimental.pp.normalize_pearson_residuals(adata.copy(), theta=-1)
 
     # error should be raised for invalid clipping values
     with pytest.raises(ValueError) as record:
-        sc.pp.normalize_pearson_residuals(adata.copy(), clip=-1)
+        sc.experimental.pp.normalize_pearson_residuals(adata.copy(), clip=-1)
 
 
 @pytest.mark.parametrize(
@@ -127,11 +127,13 @@ def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip):
 
     # compute output to test
     adata = AnnData(sparsity_func(X), dtype=dtype)
-    output = sc.pp.normalize_pearson_residuals(
+    output = sc.experimental.pp.normalize_pearson_residuals(
         adata, theta=theta, clip=clip, inplace=False
     )
     output_X = output['X']
-    sc.pp.normalize_pearson_residuals(adata, theta=theta, clip=clip, inplace=True)
+    sc.experimental.pp.normalize_pearson_residuals(
+        adata, theta=theta, clip=clip, inplace=True
+    )
 
     # check for correct new `adata.uns` keys
     assert np.all(np.isin(['pearson_residuals_normalization'], list(adata.uns.keys())))
@@ -170,7 +172,7 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p
     n_cells, n_genes = adata.shape
 
     adata_with_hvgs = adata.copy()
-    sc.pp.highly_variable_genes(
+    sc.experimental.pp.highly_variable_genes(
         adata_with_hvgs, flavor='pearson_residuals', n_top_genes=n_hvgs
     )
     adata_not_using_hvgs = adata_with_hvgs.copy()
@@ -178,15 +180,15 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p
     ### inplace = False ###
     # outputs the (potentially hvg-restricted) adata_pca object
     # PCA on all genes
-    adata_pca = sc.pp.normalize_pearson_residuals_pca(
+    adata_pca = sc.experimental.pp.normalize_pearson_residuals_pca(
         adata.copy(), inplace=False, n_comps_pca=n_comps_pca
     )
     # PCA on hvgs only
-    adata_pca_with_hvgs = sc.pp.normalize_pearson_residuals_pca(
+    adata_pca_with_hvgs = sc.experimental.pp.normalize_pearson_residuals_pca(
         adata_with_hvgs.copy(), inplace=False, n_comps_pca=n_comps_pca
     )
     # PCA again on all genes (hvg use supressed)
-    adata_pca_not_using_hvgs = sc.pp.normalize_pearson_residuals_pca(
+    adata_pca_not_using_hvgs = sc.experimental.pp.normalize_pearson_residuals_pca(
         adata_not_using_hvgs.copy(),
         inplace=False,
         n_comps_pca=n_comps_pca,
@@ -221,13 +223,15 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p
     ### inplace = True ###
     # modifies the input adata object
     # PCA on all genes
-    sc.pp.normalize_pearson_residuals_pca(adata, inplace=True, n_comps_pca=n_comps_pca)
+    sc.experimental.pp.normalize_pearson_residuals_pca(
+        adata, inplace=True, n_comps_pca=n_comps_pca
+    )
     # PCA on hvgs only
-    sc.pp.normalize_pearson_residuals_pca(
+    sc.experimental.pp.normalize_pearson_residuals_pca(
         adata_with_hvgs, inplace=True, n_comps_pca=n_comps_pca
     )
     # PCA again on all genes (hvg use supressed)
-    sc.pp.normalize_pearson_residuals_pca(
+    sc.experimental.pp.normalize_pearson_residuals_pca(
         adata_not_using_hvgs,
         inplace=True,
         n_comps_pca=n_comps_pca,
@@ -280,14 +284,14 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp
     n_cells, n_genes = adata.shape
 
     adata_with_hvgs = adata.copy()
-    sc.pp.highly_variable_genes(
+    sc.experimental.pp.highly_variable_genes(
         adata_with_hvgs, flavor='pearson_residuals', n_top_genes=n_hvgs
     )
 
     ### inplace = False ###
     # outputs the (potentially hvg-restricted) adata_pca object
     # PCA on all genes
-    adata_pca, hvg = sc.pp.recipe_pearson_residuals(
+    adata_pca, hvg = sc.experimental.pp.recipe_pearson_residuals(
         adata.copy(), inplace=False, n_comps_pca=n_comps_pca, n_top_genes=n_hvgs
     )
 
@@ -326,7 +330,7 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp
     ### inplace = True ###
     # modifies the input adata object
     # PCA on all genes
-    sc.pp.recipe_pearson_residuals(
+    sc.experimental.pp.recipe_pearson_residuals(
         adata, inplace=True, n_comps_pca=n_comps_pca, n_top_genes=n_hvgs
     )
 

From a19f90ec25f60f2421f257695ecbd0b3a8fb47c5 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Wed, 14 Jul 2021 17:18:50 +0200
Subject: [PATCH 40/96] adapt input arguments and docstring for experimental
 version of hvg selection function

---
 .../experimental/pp/_highly_variable_genes.py | 73 ++-----------------
 1 file changed, 7 insertions(+), 66 deletions(-)

diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py
index af5f81ed74..c90b468027 100644
--- a/scanpy/experimental/pp/_highly_variable_genes.py
+++ b/scanpy/experimental/pp/_highly_variable_genes.py
@@ -225,43 +225,19 @@ def highly_variable_genes(
     adata: AnnData,
     layer: Optional[str] = None,
     n_top_genes: Optional[int] = None,
-    min_disp: Optional[float] = 0.5,
-    max_disp: Optional[float] = np.inf,
-    min_mean: Optional[float] = 0.0125,
-    max_mean: Optional[float] = 3,
-    span: Optional[float] = 0.3,
-    n_bins: int = 20,
     theta: float = 100,
     clip: Optional[float] = None,
     chunksize: int = 1000,
-    flavor: Literal[
-        'seurat', 'cell_ranger', 'seurat_v3', 'pearson_residuals'
-    ] = 'seurat',
+    flavor: Literal['pearson_residuals'] = 'pearson_residuals',
     subset: bool = False,
     inplace: bool = True,
     batch_key: Optional[str] = None,
     check_values: bool = True,
 ) -> Optional[pd.DataFrame]:
     """\
-    Annotate highly variable genes [Satija15]_ [Zheng17]_ [Stuart19]_ [Lause20]_.
+    Annotate highly variable genes using Analytical Pearson residuals [Lause20]_.
 
-    Expects logarithmized data, except when `flavor='seurat_v3'` or
-    `flavor='pearson_residuals'`, in which count data is expected.
-
-    Depending on `flavor`, this reproduces the R-implementations of Seurat
-    [Satija15]_, Cell Ranger [Zheng17]_, and Seurat v3 [Stuart19]_, or uses
-    analytical Pearson residuals [Lause20]_.
-
-    For the dispersion-based methods ([Satija15]_ and [Zheng17]_), the normalized
-    dispersion is obtained by scaling with the mean and standard deviation of
-    the dispersions for genes falling into a given bin for mean expression of
-    genes. This means that for each bin of mean expression, highly variable
-    genes are selected.
-
-    For [Stuart19]_, a normalized variance for each gene is computed. First, the data
-    are standardized (i.e., z-score normalization per feature) with a regularized
-    standard deviation. Next, the normalized variance is computed as the variance
-    of each gene after the transformation. Genes are ranked by the normalized variance.
+    Expects count data input.
 
     For [Lause20]_, Pearson residuals of a negative binomial offset model (with
     overdispersion theta shared across genes) are computed. By default, overdispersion
@@ -278,31 +254,6 @@ def highly_variable_genes(
     n_top_genes
         Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or
         `flavor='pearson_residuals'`.
-    min_mean
-        If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
-        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or
-        `flavor='pearson_residuals'`.
-    max_mean
-        If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
-        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or
-        `flavor='pearson_residuals'`.
-    min_disp
-        If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
-        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or
-        `flavor='pearson_residuals'`.
-    max_disp
-        If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
-        normalized dispersions are ignored. Ignored if `flavor='seurat_v3'` or
-        `flavor='pearson_residuals'`.
-    span
-        The fraction of the data (cells) used when estimating the variance in the loess
-        model fit if `flavor='seurat_v3'`.
-    n_bins
-        Number of bins for binning the mean gene expression. Normalization is
-        done with respect to each bin. If just a single gene falls into a bin,
-        the normalized dispersion is artificially set to 1. You'll be informed
-        about this if you set `settings.verbosity = 4`. Ignored if
-        `flavor='pearson_residuals'`.
     theta
         If `flavor='pearson_residuals'`, this is the NB overdispersion parameter theta.
         Higher values correspond to less overdispersion (var = mean + mean^2/theta), and
@@ -320,9 +271,8 @@ def highly_variable_genes(
         once while computing the residual variance. Choosing a smaller value will reduce
         the required memory.
     flavor
-        Choose the flavor for identifying highly variable genes. For the dispersion
-        based methods in their default workflows, Seurat passes the cutoffs whereas
-        Cell Ranger passes `n_top_genes`.
+        Choose the flavor for identifying highly variable genes. In this experimental
+        version, only 'pearson_residuals' is functional.
     subset
         Inplace subset to highly-variable genes if `True` otherwise merely indicate
         highly variable genes.
@@ -351,21 +301,12 @@ def highly_variable_genes(
         boolean indicator of highly-variable genes
     **means**
         means per gene
-    **dispersions**
-        For dispersion-based flavors, dispersions per gene
-    **dispersions_norm**
-        For dispersion-based flavors, normalized dispersions per gene
     **variances**
-        For `flavor='seurat_v3'` and `flavor='pearson_residuals'`, variance per gene
-    **variances_norm**
-        For `flavor='seurat_v3'`, normalized variance per gene, averaged in
-        the case of multiple batches
+        variance per gene
     **residual_variances**
         For `flavor='pearson_residuals'`, residual variance per gene. Averaged in the case of
         multiple batches.
     highly_variable_rank : float
-        For `flavor='seurat_v3'`, rank of the gene according to normalized
-        variance, median rank in the case of multiple batches
         For `flavor='pearson_residuals'`, rank of the gene according to residual
         variance, median rank in the case of multiple batches
     highly_variable_nbatches : int
@@ -375,7 +316,7 @@ def highly_variable_genes(
 
     Notes
     -----
-    This function replaces :func:`~scanpy.pp.filter_genes_dispersion`.
+    Experimental version of `sc.pp.highly_variable_genes()`
     """
 
     logg.info('extracting highly variable genes')

From 659da16c498bed4ed8b7e6e34b1f85b5b142168f Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Sun, 1 Aug 2021 19:18:45 +0200
Subject: [PATCH 41/96] add recipes

---
 scanpy/experimental/pp/__init__.py |   6 +-
 scanpy/experimental/pp/_recipes.py | 148 +++++++++++++++++++++++++++++
 2 files changed, 151 insertions(+), 3 deletions(-)
 create mode 100644 scanpy/experimental/pp/_recipes.py

diff --git a/scanpy/experimental/pp/__init__.py b/scanpy/experimental/pp/__init__.py
index 7ecf999363..a5eaf9d9c2 100644
--- a/scanpy/experimental/pp/__init__.py
+++ b/scanpy/experimental/pp/__init__.py
@@ -1,8 +1,8 @@
-from ._normalization import (
+from scanpy.experimental.pp._normalization import (
     normalize_pearson_residuals,
     normalize_pearson_residuals_pca,
 )
 
-from ._highly_variable_genes import highly_variable_genes
+from scanpy.experimental.pp._highly_variable_genes import highly_variable_genes
 
-from ._recipes import recipe_pearson_residuals
+from scanpy.experimental.pp._recipes import recipe_pearson_residuals
diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
new file mode 100644
index 0000000000..ba557abc8b
--- /dev/null
+++ b/scanpy/experimental/pp/_recipes.py
@@ -0,0 +1,148 @@
+from typing import Optional, Tuple
+from anndata import AnnData
+import pandas as pd
+from scanpy import experimental
+from scanpy.preprocessing import pca
+
+
+def recipe_pearson_residuals(
+    adata: AnnData,
+    n_top_genes: int = 1000,
+    theta: float = 100,
+    clip: Optional[float] = None,
+    chunksize: int = 1000,
+    batch_key: Optional[str] = None,
+    n_comps_pca: Optional[int] = 50,
+    random_state_pca: Optional[float] = 0,
+    kwargs_pca: dict = {},
+    check_values: bool = True,
+    inplace: bool = True,
+) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]:
+    """\
+    Gene selection and normalization based on [Lause20]_.
+    Applies gene selection based on Pearson residuals. On the resulting subset,
+    Pearson residual normalization and PCA are performed.
+    Parameters
+    ----------
+    adata
+        The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
+        to cells and columns to genes.
+    n_top_genes
+        Number of highly-variable genes to keep. Passed on to the gene
+        selection step, which always uses `flavor='pearson_residuals'`.
+    chunksize
+        This determines how many genes are processed at once while computing
+        the Pearson residual variance. Choosing a smaller value will reduce
+        the required memory.
+    theta
+        This is the NB overdispersion parameter theta for Pearson residual
+        computations. Higher values correspond to less overdispersion
+        (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a
+        Poisson model.
+    clip
+        This determines if and how Pearson residuals are clipped:
+            * If `None`, residuals are clipped to the interval \
+            [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
+            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
+            `clip=np.Inf` for no clipping.
+    batch_key
+        If specified, highly-variable genes are selected within each batch
+        separately and merged. This simple process avoids the selection of
+        batch-specific genes and acts as a lightweight batch correction
+        method. For all flavors, genes are first sorted by how many batches
+        they are a HVG. Ties are broken by the median rank (across batches)
+        based on within-batch residual variance.
+    n_comps_pca
+        Number of principal components to compute.
+    random_state_pca
+        Change to use different initial states for the optimization.
+    kwargs_pca
+        Dictionary of further keyword arguments passed on to `sc.pp.pca()`.
+    check_values
+        Check if counts in selected layer are integers. A Warning is returned if set to True.
+    inplace
+        Whether to place results in `adata` or return them.
+    Returns
+    ------
+    If `inplace=False`, separately returns the gene selection results (`hvg`)
+    and Pearson residual-based PCA results (`adata_pca`). If `inplace=True`,
+    updates `adata` with the following fields for gene selection results…:
+    `.var['highly_variable']`
+        boolean indicator of highly-variable genes.
+    `.var['means']`
+        means per gene.
+    `.var['variances']`
+        variances per gene.
+    `.var['residual_variances']`
+        Pearson residual variance per gene. Averaged in the case of multiple
+        batches.
+    `.var['highly_variable_rank']`
+        Rank of the gene according to residual variance, median rank in the
+        case of multiple batches.
+    `.var['highly_variable_nbatches']`
+        If batch_key is given, this denotes in how many batches genes are
+        detected as HVG.
+    `.var['highly_variable_intersection']`
+        If batch_key is given, this denotes the genes that are highly variable
+        in all batches.
+    …and the following fields for Pearson residual-based PCA results and
+    normalization settings:
+    `.uns['pearson_residuals_normalization']['pearson_residuals_df']`
+         The hvg-subset, normalized by Pearson residuals.
+    `.uns['pearson_residuals_normalization']['theta']`
+         The used value of the overdispersion parameter theta.
+    `.uns['pearson_residuals_normalization']['clip']`
+         The used value of the clipping parameter.
+    `.obsm['X_pearson_residuals_pca']`
+        PCA representation of data after gene selection and Pearson residual
+        normalization.
+    `.uns['pearson_residuals_pca']['PCs']`
+         The principal components containing the loadings.
+    `.uns['pearson_residuals_pca']['variance_ratio']`
+         Ratio of explained variance.
+    `.uns['pearson_residuals_pca']['variance']`
+         Explained variance, equivalent to the eigenvalues of the
+         covariance matrix.
+    """
+
+    hvg_args = dict(
+        flavor='pearson_residuals',
+        n_top_genes=n_top_genes,
+        batch_key=batch_key,
+        theta=theta,
+        clip=clip,
+        chunksize=chunksize,
+        check_values=check_values,
+    )
+
+    if inplace:
+        experimental.pp.highly_variable_genes(adata, **hvg_args, inplace=True)
+        # TODO: are these copies needed?
+        adata_pca = adata[:, adata.var['highly_variable']].copy()
+    else:
+        hvg = experimental.pp.highly_variable_genes(adata, **hvg_args, inplace=False)
+        # TODO: are these copies needed?
+        adata_pca = adata[:, hvg['highly_variable']].copy()
+
+    experimental.pp.normalize_pearson_residuals(
+        adata_pca, theta=theta, clip=clip, check_values=check_values
+    )
+    pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca, **kwargs_pca)
+
+    if inplace:
+        normalization_param = adata_pca.uns['pearson_residuals_normalization']
+        normalization_dict = dict(
+            **normalization_param, pearson_residuals_df=adata_pca.to_df()
+        )
+        pca_param = adata_pca.uns['pca']
+        pca_dict = dict(**pca_param, PCs=adata_pca.varm['PCs'])
+        adata.uns['pearson_residuals_pca'] = pca_dict
+        adata.uns['pearson_residuals_normalization'] = normalization_dict
+        adata.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca']
+        return None
+    else:
+        adata_pca.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'].copy()
+        adata_pca.uns['pearson_residuals_pca'] = adata_pca.uns['pca'].copy()
+        del adata_pca.obsm['X_pca']
+        del adata_pca.uns['pca']
+        return adata_pca, hvg

From bf0bb8e86b1187649b4612e1558c29f06c5c556b Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Sun, 1 Aug 2021 19:29:51 +0200
Subject: [PATCH 42/96] fix docs

---
 docs/api.rst | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/docs/api.rst b/docs/api.rst
index 2bc9283a75..cdcb2c7b87 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -39,8 +39,6 @@ For visual quality control, see :func:`~scanpy.pl.highest_expr_genes` and
    pp.log1p
    pp.pca
    pp.normalize_total
-   pp.normalize_pearson_residuals
-   pp.normalize_pearson_residuals_pca
    pp.regress_out
    pp.scale
    pp.subsample
@@ -55,7 +53,6 @@ Recipes
    pp.recipe_zheng17
    pp.recipe_weinreb17
    pp.recipe_seurat
-   pp.recipe_pearson_residuals
 
 Batch effect correction
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -372,6 +369,24 @@ Collections of useful measurements for evaluating results.
    metrics.morans_i
 
 
+Experimental
+------------
+
+.. module:: scanpy.experimental
+.. currentmodule:: scanpy
+
+New methods that are in early development which are not (yet)
+integrated in Scanpy core.
+
+.. autosummary::
+   :toctree: generated/
+
+   pp.normalize_pearson_residuals
+   pp.normalize_pearson_residuals_pca
+   pp.highly_variable_genes
+   pp.recipe_pearson_residuals
+
+
 Classes
 -------
 

From 191c449bbd8d24990cc87db1fc2c2b96f0f1fd04 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Sun, 1 Aug 2021 19:52:22 +0200
Subject: [PATCH 43/96] add correct module docs

---
 docs/api.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/api.rst b/docs/api.rst
index cdcb2c7b87..1400e3807b 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -381,10 +381,10 @@ integrated in Scanpy core.
 .. autosummary::
    :toctree: generated/
 
-   pp.normalize_pearson_residuals
-   pp.normalize_pearson_residuals_pca
-   pp.highly_variable_genes
-   pp.recipe_pearson_residuals
+   experimental.pp.normalize_pearson_residuals
+   experimental.pp.normalize_pearson_residuals_pca
+   experimental.pp.highly_variable_genes
+   experimental.pp.recipe_pearson_residuals
 
 
 Classes

From 7f3d6ed9bff995736c00fcafdea5dd6a4920fdc2 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Sun, 1 Aug 2021 20:37:08 +0200
Subject: [PATCH 44/96] fix recipe docstrings

---
 scanpy/experimental/pp/_recipes.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index ba557abc8b..e8c143f9d9 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -20,8 +20,10 @@ def recipe_pearson_residuals(
 ) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]:
     """\
     Gene selection and normalization based on [Lause20]_.
+
     Applies gene selection based on Pearson residuals. On the resulting subset,
     Pearson residual normalization and PCA are performed.
+
     Parameters
     ----------
     adata
@@ -40,11 +42,13 @@ def recipe_pearson_residuals(
         (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a
         Poisson model.
     clip
-        This determines if and how Pearson residuals are clipped:
+        Determines if and how residuals are clipped:
+
             * If `None`, residuals are clipped to the interval \
             [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
             * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
             `clip=np.Inf` for no clipping.
+
     batch_key
         If specified, highly-variable genes are selected within each batch
         separately and merged. This simple process avoids the selection of
@@ -62,6 +66,7 @@ def recipe_pearson_residuals(
         Check if counts in selected layer are integers. A Warning is returned if set to True.
     inplace
         Whether to place results in `adata` or return them.
+
     Returns
     ------
     If `inplace=False`, separately returns the gene selection results (`hvg`)
@@ -103,6 +108,7 @@ def recipe_pearson_residuals(
     `.uns['pearson_residuals_pca']['variance']`
          Explained variance, equivalent to the eigenvalues of the
          covariance matrix.
+
     """
 
     hvg_args = dict(

From 87bf42506e8719745d3fd3cfe40478aa7d5a0d76 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Sun, 1 Aug 2021 20:48:40 +0200
Subject: [PATCH 45/96] try fix indentation

---
 scanpy/experimental/pp/_normalization.py | 4 ++--
 scanpy/experimental/pp/_recipes.py       | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py
index 5e068db8db..b43240569b 100644
--- a/scanpy/experimental/pp/_normalization.py
+++ b/scanpy/experimental/pp/_normalization.py
@@ -71,8 +71,8 @@ def normalize_pearson_residuals(
     theta shared across genes, computes Pearson residuals. By default, residuals
     are clipped to sqrt(n) and overdispersion theta=100 is used.
 
-    Params
-    ------
+    Parameters
+    ----------
     adata
         The annotated data matrix of shape `n_obs` × `n_vars`.
         Rows correspond to cells and columns to genes.
diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index e8c143f9d9..79aec9fbba 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -44,9 +44,9 @@ def recipe_pearson_residuals(
     clip
         Determines if and how residuals are clipped:
 
-            * If `None`, residuals are clipped to the interval \
+            * If `None`, residuals are clipped to the interval
             [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
-            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
+            * If any scalar c, residuals are clipped to the interval [-c, c]. Set
             `clip=np.Inf` for no clipping.
 
     batch_key

From 0b8ba5f7744a2a5cd22763105a228ddec1736b92 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Sun, 1 Aug 2021 22:06:52 +0200
Subject: [PATCH 46/96] fix indentation

---
 scanpy/experimental/pp/_recipes.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index 79aec9fbba..10cdb72400 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -42,11 +42,11 @@ def recipe_pearson_residuals(
         (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a
         Poisson model.
     clip
-        Determines if and how residuals are clipped:
+        This determines how Pearson residuals are clipped:
 
-            * If `None`, residuals are clipped to the interval
+            * If `None`, residuals are clipped to the interval \
             [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
-            * If any scalar c, residuals are clipped to the interval [-c, c]. Set
+            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
             `clip=np.Inf` for no clipping.
 
     batch_key
@@ -68,7 +68,7 @@ def recipe_pearson_residuals(
         Whether to place results in `adata` or return them.
 
     Returns
-    ------
+    -------
     If `inplace=False`, separately returns the gene selection results (`hvg`)
     and Pearson residual-based PCA results (`adata_pca`). If `inplace=True`,
     updates `adata` with the following fields for gene selection results…:

From 88bf93a41adfa0784ab1eb48f24900c066c233d6 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Sun, 1 Aug 2021 22:20:43 +0200
Subject: [PATCH 47/96] fix

---
 scanpy/experimental/pp/_recipes.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index 10cdb72400..b22ffe5496 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -44,10 +44,10 @@ def recipe_pearson_residuals(
     clip
         This determines how Pearson residuals are clipped:
 
-            * If `None`, residuals are clipped to the interval \
-            [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
-            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
-            `clip=np.Inf` for no clipping.
+        * If `None`, residuals are clipped to the interval \
+        [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
+        * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
+        `clip=np.Inf` for no clipping.
 
     batch_key
         If specified, highly-variable genes are selected within each batch

From ef81b72133938381f4a623f0ba76bad95a60cb60 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Sun, 1 Aug 2021 22:26:10 +0200
Subject: [PATCH 48/96] new indentation

---
 scanpy/experimental/pp/_recipes.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index b22ffe5496..014c13d203 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -44,10 +44,10 @@ def recipe_pearson_residuals(
     clip
         This determines how Pearson residuals are clipped:
 
-        * If `None`, residuals are clipped to the interval \
-        [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
-        * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
-        `clip=np.Inf` for no clipping.
+        * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)]
+        where n is the number of cells in the dataset (default behavior).
+        * If any scalar c, residuals are clipped to the interval [-c, c].
+        Set`clip=np.Inf` for no clipping.
 
     batch_key
         If specified, highly-variable genes are selected within each batch

From 900c12c6c5f71485ff7113fcaf1fd6907a620b09 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 2 Aug 2021 08:31:39 +0200
Subject: [PATCH 49/96] add space

---
 scanpy/experimental/pp/_recipes.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index 014c13d203..ab64aa79b6 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -24,6 +24,7 @@ def recipe_pearson_residuals(
     Applies gene selection based on Pearson residuals. On the resulting subset,
     Pearson residual normalization and PCA are performed.
 
+
     Parameters
     ----------
     adata
@@ -44,10 +45,10 @@ def recipe_pearson_residuals(
     clip
         This determines how Pearson residuals are clipped:
 
-        * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)]
-        where n is the number of cells in the dataset (default behavior).
-        * If any scalar c, residuals are clipped to the interval [-c, c].
-        Set`clip=np.Inf` for no clipping.
+            * If `None`, residuals are clipped to the interval \
+            [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
+            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
+            `clip=np.Inf` for no clipping.
 
     batch_key
         If specified, highly-variable genes are selected within each batch

From b00a0b627ef67eb00015643cd6c74e7f672980c8 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Mon, 2 Aug 2021 13:25:26 +0200
Subject: [PATCH 50/96] fixing typo in docstring

---
 scanpy/experimental/pp/_recipes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index ab64aa79b6..2f297a307d 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -99,7 +99,7 @@ def recipe_pearson_residuals(
          The used value of the overdisperion parameter theta.
     `.uns['pearson_residuals_normalization']['clip']`
          The used value of the clipping parameter.
-    `.obsm['pearson_residuals_X_pca']`
+    `.obsm['X_pearson_residuals_pca']`
         PCA representation of data after gene selection and Pearson residual
         normalization.
     `.uns['pearson_residuals_pca']['PCs']`

From 617aff1f4b22d6a67c11628a4333ea038f484d08 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Mon, 2 Aug 2021 13:46:39 +0200
Subject: [PATCH 51/96] renaming pca output fields

---
 scanpy/experimental/pp/_recipes.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index 2f297a307d..38e2923bcf 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -99,14 +99,14 @@ def recipe_pearson_residuals(
          The used value of the overdisperion parameter theta.
     `.uns['pearson_residuals_normalization']['clip']`
          The used value of the clipping parameter.
-    `.obsm['X_pearson_residuals_pca']`
+    `.obsm['X_pca']`
         PCA representation of data after gene selection and Pearson residual
         normalization.
-    `.uns['pearson_residuals_pca']['PCs']`
+    `.uns['pca']['PCs']`
          The principal components containing the loadings.
-    `.uns['pearson_residuals_pca']['variance_ratio']`
+    `.uns['pca']['variance_ratio']`
          Ratio of explained variance.
-    `.uns['pearson_residuals_pca']['variance']`
+    `.uns['pca']['variance']`
          Explained variance, equivalent to the eigenvalues of the
          covariance matrix.
 
@@ -143,13 +143,9 @@ def recipe_pearson_residuals(
         )
         pca_param = adata_pca.uns['pca']
         pca_dict = dict(**pca_param, PCs=adata_pca.varm['PCs'])
-        adata.uns['pearson_residuals_pca'] = pca_dict
+        adata.uns['pca'] = pca_dict
         adata.uns['pearson_residuals_normalization'] = normalization_dict
-        adata.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca']
+        adata.obsm['X_pca'] = adata_pca.obsm['X_pca']
         return None
     else:
-        adata_pca.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'].copy()
-        adata_pca.uns['pearson_residuals_pca'] = adata_pca.uns['pca'].copy()
-        del adata_pca.obsm['X_pca']
-        del adata_pca.uns['pca']
         return adata_pca, hvg

From 4dabfcdb5415fd95fe300f5e6ee10cef744fb252 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Mon, 2 Aug 2021 22:55:59 +0200
Subject: [PATCH 52/96] adapting tests to new output fieldname

---
 scanpy/tests/test_normalization.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py
index ad7ba7bde7..09068ced3f 100644
--- a/scanpy/tests/test_normalization.py
+++ b/scanpy/tests/test_normalization.py
@@ -298,13 +298,13 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp
     # for both cases, check adata_pca keys are complete
     assert np.all(
         np.isin(
-            ['pearson_residuals_normalization', 'pearson_residuals_pca'],
+            ['pearson_residuals_normalization', 'pca'],
             list(adata_pca.uns.keys()),
         )
     )
-    assert np.all(np.isin(['X_pearson_residuals_pca'], list(adata_pca.obsm.keys())))
+    assert np.all(np.isin(['X_pca'], list(adata_pca.obsm.keys())))
     assert np.all(np.isin(['PCs'], list(adata_pca.varm.keys())))
-    assert adata_pca.obsm['X_pearson_residuals_pca'].shape == (n_cells, n_comps_pca)
+    assert adata_pca.obsm['X_pca'].shape == (n_cells, n_comps_pca)
 
     # check adata shape
     assert adata_pca.shape == (n_cells, n_hvgs)
@@ -336,13 +336,13 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp
 
     assert np.all(
         np.isin(
-            ['pearson_residuals_normalization', 'pearson_residuals_pca'],
+            ['pearson_residuals_normalization', 'pca'],
             list(adata.uns.keys()),
         )
     )
-    assert np.all(np.isin(['X_pearson_residuals_pca'], list(adata.obsm.keys())))
+    assert np.all(np.isin(['X_pca'], list(adata.obsm.keys())))
     assert adata.shape == (n_cells, n_genes)
-    assert adata.obsm['X_pearson_residuals_pca'].shape == (n_cells, n_comps_pca)
+    assert adata.obsm['X_pca'].shape == (n_cells, n_comps_pca)
 
     # check PC shapes to see whether or not HVGs were used for PCA
-    assert adata.uns['pearson_residuals_pca']['PCs'].shape == (n_hvgs, n_comps_pca)
+    assert adata.uns['pca']['PCs'].shape == (n_hvgs, n_comps_pca)

From 58ac8e0825d2903594cdfad0e61b00ee4607c9b5 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Fri, 6 Aug 2021 11:15:04 +0200
Subject: [PATCH 53/96] fix docs :hammer:

---
 scanpy/experimental/pp/_highly_variable_genes.py | 2 +-
 scanpy/experimental/pp/_recipes.py               | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py
index c90b468027..110913e932 100644
--- a/scanpy/experimental/pp/_highly_variable_genes.py
+++ b/scanpy/experimental/pp/_highly_variable_genes.py
@@ -30,7 +30,7 @@ def _highly_variable_pearson_residuals(
     inplace: bool = True,
 ) -> Optional[pd.DataFrame]:
     """\
-    See `highly_variable_genes`.
+    See `scanpy.pp.highly_variable_genes`.
 
     Returns
     -------
diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index 38e2923bcf..35b5338878 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -21,8 +21,8 @@ def recipe_pearson_residuals(
     """\
     Gene selection and normalization based on [Lause20]_.
 
-    Applies gene selection based on Pearson residuals. On the resulting subset,
-    Pearson residual normalization and PCA are performed.
+    Applies gene selection based on Pearson residuals.
+    On the resulting subset, Pearson residual normalization and PCA are performed.
 
 
     Parameters

From 8ae83380ec37a48374bcb443de704f045a754f03 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Fri, 6 Aug 2021 11:56:12 +0200
Subject: [PATCH 54/96] update docs

---
 scanpy/experimental/pp/_normalization.py | 27 ++++++++++--------------
 scanpy/experimental/pp/_recipes.py       | 25 ++++++++++++----------
 2 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py
index b43240569b..9689eb8476 100644
--- a/scanpy/experimental/pp/_normalization.py
+++ b/scanpy/experimental/pp/_normalization.py
@@ -1,4 +1,4 @@
-from typing import Optional, Union, Dict
+from typing import Optional, Dict
 from warnings import warn
 
 import numpy as np
@@ -71,8 +71,8 @@ def normalize_pearson_residuals(
     theta shared across genes, computes Pearson residuals. By default, residuals
     are clipped to sqrt(n) and overdispersion theta=100 is used.
 
-    Parameters
-    ----------
+    Params
+    ------
     adata
         The annotated data matrix of shape `n_obs` × `n_vars`.
         Rows correspond to cells and columns to genes.
@@ -152,9 +152,8 @@ def normalize_pearson_residuals_pca(
 
     Operates on the subset of highly variable genes in `adata.var['highly_variable']` by default.
 
-
-    Parameters
-    ----------
+    Params
+    ------
     adata
         The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
         to cells and columns to genes.
@@ -200,14 +199,14 @@ def normalize_pearson_residuals_pca(
     `.uns['pearson_residuals_normalization']['clip']`
          The used value of the clipping parameter
 
-    `.obsm['X_pearson_residuals_pca']`
+    `.obsm['X_pca']`
         PCA representation of data after gene selection and Pearson residual
         normalization.
-    `.uns['pearson_residuals_pca']['PCs']`
+    `.uns['pca']['PCs']`
          The principal components containing the loadings.
-    `.uns['pearson_residuals_pca']['variance_ratio']`
+    `.uns['pca']['variance_ratio']`
          Ratio of explained variance.
-    `.uns['pearson_residuals_pca']['variance']`
+    `.uns['pca']['variance']`
          Explained variance, equivalent to the eigenvalues of the
          covariance matrix.
 
@@ -230,13 +229,9 @@ def normalize_pearson_residuals_pca(
         norm_dict = dict(**norm_settings, pearson_residuals_df=adata_pca.to_df())
         pca_settings = adata_pca.uns['pca']
         pca_dict = dict(**pca_settings, PCs=adata_pca.varm['PCs'])
-        adata.uns['pearson_residuals_pca'] = pca_dict
+        adata.uns['pca'] = pca_dict
         adata.uns['pearson_residuals_normalization'] = norm_dict
-        adata.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca']
+        adata.obsm['X_pca'] = adata_pca.obsm['X_pca']
         return None
     else:
-        adata_pca.obsm['X_pearson_residuals_pca'] = adata_pca.obsm['X_pca'].copy()
-        adata_pca.uns['pearson_residuals_pca'] = adata_pca.uns['pca'].copy()
-        del adata_pca.obsm['X_pca']
-        del adata_pca.uns['pca']
         return adata_pca
diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index 35b5338878..45c2c41725 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -21,15 +21,14 @@ def recipe_pearson_residuals(
     """\
     Gene selection and normalization based on [Lause20]_.
 
-    Applies gene selection based on Pearson residuals.
-    On the resulting subset, Pearson residual normalization and PCA are performed.
+    Applies gene selection based on Pearson residuals. On the resulting subset,
+    Pearson residual normalization and PCA are performed.
 
-
-    Parameters
-    ----------
+    Params
+    ------
     adata
-        The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
-        to cells and columns to genes.
+        The annotated data matrix of shape `n_obs` × `n_vars`.
+        Rows correspond to cells and columns to genes.
     n_top_genes
         Number of highly-variable genes to keep. Mandatory if
         `flavor='seurat_v3'` or `flavor='pearson_residuals'`.
@@ -43,7 +42,7 @@ def recipe_pearson_residuals(
         (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a
         Poisson model.
     clip
-        This determines how Pearson residuals are clipped:
+        Determines if and how residuals are clipped:
 
             * If `None`, residuals are clipped to the interval \
             [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
@@ -62,17 +61,19 @@ def recipe_pearson_residuals(
     random_state_pca
         Change to use different initial states for the optimization.
     kwargs_pca
-        Dictionary of further keyword arguments passed on to `sc.pp.pca()`.
+        Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`.
     check_values
-        Check if counts in selected layer are integers. A Warning is returned if set to True.
+        Check if counts in selected layer are integers. A `Warning` is returned if set to True.
     inplace
         Whether to place results in `adata` or return them.
 
+
     Returns
     -------
     If `inplace=False`, separately returns the gene selection results (`hvg`)
     and Pearson residual-based PCA results (`adata_pca`). If `inplace=True`,
-    updates `adata` with the following fields for gene selection results…:
+    updates `adata` with the following fields for gene selection results:
+
     `.var['highly_variable']`
         boolean indicator of highly-variable genes.
     `.var['means']`
@@ -91,8 +92,10 @@ def recipe_pearson_residuals(
     `.var['highly_variable_intersection']`
         If batch_key is given, this denotes the genes that are highly variable
         in all batches.
+
     …and the following fields for Pearson residual-based PCA results and
     normalization settings:
+
     `.uns['pearson_residuals_normalization']['pearson_residuals_df']`
          The hvg-subset, normalized by Pearson residuals.
     `.uns['pearson_residuals_normalization']['theta']`

From 535129cb3c2c971cd7b5128573da93afb8881490 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Fri, 6 Aug 2021 12:20:56 +0200
Subject: [PATCH 55/96] fix test :hammer:

---
 scanpy/tests/test_normalization.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py
index 09068ced3f..b2defaeead 100644
--- a/scanpy/tests/test_normalization.py
+++ b/scanpy/tests/test_normalization.py
@@ -199,13 +199,13 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p
     for ad in [adata_pca, adata_pca_with_hvgs, adata_pca_not_using_hvgs]:
         assert np.all(
             np.isin(
-                ['pearson_residuals_normalization', 'pearson_residuals_pca'],
+                ['pearson_residuals_normalization', 'pca'],
                 list(ad.uns.keys()),
             )
         )
-        assert np.all(np.isin(['X_pearson_residuals_pca'], list(ad.obsm.keys())))
+        assert np.all(np.isin(['X_pca'], list(ad.obsm.keys())))
         assert np.all(np.isin(['PCs'], list(ad.varm.keys())))
-        assert ad.obsm['X_pearson_residuals_pca'].shape == (n_cells, n_comps_pca)
+        assert ad.obsm['X_pca'].shape == (n_cells, n_comps_pca)
 
     # check adata shape to see if all genes or only HVGs are in the returned adata
     assert adata_pca.shape == (n_cells, n_genes)
@@ -242,22 +242,24 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p
         # check adata_pca keys are complete
         assert np.all(
             np.isin(
-                ['pearson_residuals_normalization', 'pearson_residuals_pca'],
+                [
+                    'pearson_residuals_normalization',
+                ],
                 list(ad.uns.keys()),
             )
         )
-        assert np.all(np.isin(['X_pearson_residuals_pca'], list(ad.obsm.keys())))
+        assert np.all(np.isin(['X_pca'], list(ad.obsm.keys())))
         # check shapes: adata should always retains original shape
         assert ad.shape == (n_cells, n_genes)
-        assert ad.obsm['X_pearson_residuals_pca'].shape == (n_cells, n_comps_pca)
+        assert ad.obsm['X_pca'].shape == (n_cells, n_comps_pca)
 
     # check PC shapes to see whether or not HVGs were used for PCA
-    assert adata.uns['pearson_residuals_pca']['PCs'].shape == (n_genes, n_comps_pca)
-    assert adata_with_hvgs.uns['pearson_residuals_pca']['PCs'].shape == (
+    assert adata.uns['pca']['PCs'].shape == (n_genes, n_comps_pca)
+    assert adata_with_hvgs.uns['pca']['PCs'].shape == (
         n_hvgs,
         n_comps_pca,
     )
-    assert adata_not_using_hvgs.uns['pearson_residuals_pca']['PCs'].shape == (
+    assert adata_not_using_hvgs.uns['pca']['PCs'].shape == (
         n_genes,
         n_comps_pca,
     )
@@ -268,8 +270,8 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p
         [adata, adata_with_hvgs, adata_not_using_hvgs],
     ):
         np.testing.assert_array_equal(
-            ad_inplace.obsm['X_pearson_residuals_pca'],
-            ad_outplace.obsm['X_pearson_residuals_pca'],
+            ad_inplace.obsm['X_pca'],
+            ad_outplace.obsm['X_pca'],
         )
 
 

From 3addbe75216d42bf495c616e7be1b9663f4aeaad Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Tue, 10 Aug 2021 16:49:36 +0200
Subject: [PATCH 56/96] ensure argument and docstring consistency

---
 .../experimental/pp/_highly_variable_genes.py | 102 +++++++++---------
 scanpy/experimental/pp/_normalization.py      |  76 +++++++------
 scanpy/experimental/pp/_recipes.py            |  64 +++++------
 3 files changed, 122 insertions(+), 120 deletions(-)

diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py
index 110913e932..a0d344d390 100644
--- a/scanpy/experimental/pp/_highly_variable_genes.py
+++ b/scanpy/experimental/pp/_highly_variable_genes.py
@@ -19,18 +19,18 @@
 
 def _highly_variable_pearson_residuals(
     adata: AnnData,
-    layer: Optional[str] = None,
-    n_top_genes: int = 1000,
-    batch_key: Optional[str] = None,
     theta: float = 100,
     clip: Optional[float] = None,
-    chunksize: int = 100,
+    n_top_genes: int = 1000,
+    batch_key: Optional[str] = None,
+    chunksize: int = 1000,
     check_values: bool = True,
+    layer: Optional[str] = None,
     subset: bool = False,
     inplace: bool = True,
 ) -> Optional[pd.DataFrame]:
     """\
-    See `scanpy.pp.highly_variable_genes`.
+    See `scanpy.experimental.pp.highly_variable_genes`.
 
     Returns
     -------
@@ -38,23 +38,19 @@ def _highly_variable_pearson_residuals(
     or updates `.var` with the following fields:
 
     highly_variable : bool
-        boolean indicator of highly-variable genes.
+        boolean indicator of highly-variable genes
     means : float
-        means per gene.
+        means per gene
     variances : float
-        variances per gene.
+        variance per gene
     residual_variances : float
-        Pearson residual variance per gene. Averaged in the case of multiple
-        batches.
+        Residual variance per gene. Averaged in the case of multiple batches.
     highly_variable_rank : float
-        Rank of the gene according to residual variance, median rank in the
-        case of multiple batches. NaN for non-HVGs.
+        Rank of the gene according to residual variance, median rank in the case of multiple batches
     highly_variable_nbatches : int
-        If batch_key is given, this denotes in how many batches genes are
-        detected as HVG.
+        If `batch_key` given, denotes in how many batches genes are detected as HVG
     highly_variable_intersection : bool
-        If batch_key is given, this denotes the genes that are highly variable
-        in all batches.
+        If `batch_key` given, denotes the genes that are highly variable in all batches
     """
 
     view_to_actual(adata)
@@ -223,49 +219,55 @@ def _highly_variable_pearson_residuals(
 
 def highly_variable_genes(
     adata: AnnData,
-    layer: Optional[str] = None,
-    n_top_genes: Optional[int] = None,
     theta: float = 100,
     clip: Optional[float] = None,
+    n_top_genes: Optional[int] = None,
+    batch_key: Optional[str] = None,
     chunksize: int = 1000,
     flavor: Literal['pearson_residuals'] = 'pearson_residuals',
+    check_values: bool = True,
+    layer: Optional[str] = None,
     subset: bool = False,
     inplace: bool = True,
-    batch_key: Optional[str] = None,
-    check_values: bool = True,
 ) -> Optional[pd.DataFrame]:
     """\
-    Annotate highly variable genes using Analytical Pearson residuals [Lause20]_.
-
-    Expects count data input.
+    Annotate highly variable genes using analytic Pearson residuals [Lause20]_.
 
     For [Lause20]_, Pearson residuals of a negative binomial offset model (with
     overdispersion theta shared across genes) are computed. By default, overdispersion
     theta=100 is used and residuals are clipped to sqrt(n). Finally, genes are ranked
     by residual variance.
 
+    Expects raw count input.
+
+
     Parameters
     ----------
     adata
-        The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
-        to cells and columns to genes.
-    layer
-        If provided, use `adata.layers[layer]` for expression values instead of `adata.X`.
-    n_top_genes
-        Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or
-        `flavor='pearson_residuals'`.
+        The annotated data matrix of shape `n_obs` × `n_vars`.
+        Rows correspond to cells and columns to genes.
     theta
-        If `flavor='pearson_residuals'`, this is the NB overdispersion parameter theta.
-        Higher values correspond to less overdispersion (var = mean + mean^2/theta), and
-        `theta=np.Inf` corresponds to a Poisson model.
+        The negative binomial overdispersion parameter theta for Pearson residuals.
+        Higher values correspond to less overdispersion (var = mean + mean^2/theta),
+        and `theta=np.Inf` corresponds to a Poisson model.
     clip
-        If `flavor='pearson_residuals'`, this determines how residuals are clipped:
+        If `flavor='pearson_residuals'`, determines if and how residuals are clipped:
 
             * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \
             where n is the number of cells in the dataset (default behavior).
             * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
             `clip=np.Inf` for no clipping.
 
+    n_top_genes
+        Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or
+        `flavor='pearson_residuals'`.
+    batch_key
+        If specified, highly-variable genes are selected within each batch separately
+        and merged. This simple process avoids the selection of batch-specific genes
+        and acts as a lightweight batch correction method. Genes are first sorted by
+        how many batches they are a HVG. If `flavor='pearson_residuals'`, ties are
+        broken by the median rank (across batches) based on within-batch residual
+        variance.
     chunksize
         If `flavor='pearson_residuals'`, this dertermines how many genes are processed at
         once while computing the residual variance. Choosing a smaller value will reduce
@@ -273,24 +275,16 @@ def highly_variable_genes(
     flavor
         Choose the flavor for identifying highly variable genes. In this experimental
         version, only 'pearson_residuals' is functional.
+    check_values
+        Check if counts in selected layer are integers. A Warning is returned if set to
+        True. Only used if `flavor='pearson_residuals'`.
+    layer
+        If provided, use `adata.layers[layer]` for expression values instead of `adata.X`.
     subset
         Inplace subset to highly-variable genes if `True` otherwise merely indicate
         highly variable genes.
     inplace
         Whether to place calculated metrics in `.var` or return them.
-    batch_key
-        If specified, highly-variable genes are selected within each batch separately and merged.
-        This simple process avoids the selection of batch-specific genes and acts as a
-        lightweight batch correction method. For all flavors, genes are first sorted
-        by how many batches they are a HVG. For dispersion-based flavors ties are broken
-        by normalized dispersion. If `flavor = 'seurat_v3'`, ties are broken by the median
-        (across batches) rank based on within-batch normalized variance. If
-        `flavor='pearson_residuals'`, ties are broken by the median rank (across batches)
-        based on within-batch residual variance.
-    check_values
-        Check if counts in selected layer are integers. A Warning is returned if set to True.
-        Only used if `flavor='seurat_v3'` or `flavor='pearson_residuals'`.
-
 
     Returns
     -------
@@ -299,20 +293,20 @@ def highly_variable_genes(
 
     highly_variable : bool
         boolean indicator of highly-variable genes
-    **means**
+    means : float
         means per gene
-    **variances**
+    variances : float
         variance per gene
-    **residual_variances**
-        For `flavor='pearson_residuals'`, residual variance per gene. Averaged in the case of
-        multiple batches.
+    residual_variances : float
+        For `flavor='pearson_residuals'`, residual variance per gene. Averaged in the
+        case of multiple batches.
     highly_variable_rank : float
         For `flavor='pearson_residuals'`, rank of the gene according to residual
         variance, median rank in the case of multiple batches
     highly_variable_nbatches : int
-        If batch_key is given, this denotes in how many batches genes are detected as HVG
+        If `batch_key` given, denotes in how many batches genes are detected as HVG
     highly_variable_intersection : bool
-        If batch_key is given, this denotes the genes that are highly variable in all batches
+        If `batch_key` given, denotes the genes that are highly variable in all batches
 
     Notes
     -----
diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py
index 9689eb8476..dbdf5e59d0 100644
--- a/scanpy/experimental/pp/_normalization.py
+++ b/scanpy/experimental/pp/_normalization.py
@@ -59,17 +59,19 @@ def normalize_pearson_residuals(
     adata: AnnData,
     theta: float = 100,
     clip: Optional[float] = None,
+    check_values: bool = True,
     layer: Optional[str] = None,
     copy: bool = False,
-    check_values: bool = True,
     inplace: bool = True,
 ) -> Optional[Dict[str, np.ndarray]]:
     """\
-    Computes analytic Pearson residuals, based on [Lause20]_.
+    Applies analytic Pearson residual normalization, based on [Lause20]_.
 
-    Assuming a negative binomial offset model with overdispersion
-    theta shared across genes, computes Pearson residuals. By default, residuals
-    are clipped to sqrt(n) and overdispersion theta=100 is used.
+    The residuals are based on a negative binomial offset model with overdispersion
+    `theta` shared across genes. By default, residuals are clipped to sqrt(n) and
+    overdispersion `theta=100` is used.
+
+    Expects raw count input.
 
     Params
     ------
@@ -77,24 +79,24 @@ def normalize_pearson_residuals(
         The annotated data matrix of shape `n_obs` × `n_vars`.
         Rows correspond to cells and columns to genes.
     theta
-        The NB overdispersion parameter theta. Higher values correspond to
-        less overdispersion (var = mean + mean^2/theta), and `theta=np.Inf`
-        corresponds to a Poisson model.
+        The negative binomial overdispersion parameter theta for Pearson residuals.
+        Higher values correspond to less overdispersion (var = mean + mean^2/theta),
+        and `theta=np.Inf` corresponds to a Poisson model.
     clip
         Determines if and how residuals are clipped:
 
-            * If `None`, residuals are clipped to the interval \
-            [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
+            * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \
+            where n is the number of cells in the dataset (default behavior).
             * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
             `clip=np.Inf` for no clipping.
 
+    check_values
+        Check if counts in selected layer are integers. A Warning is returned if set to
+        True.
     layer
         Layer to normalize instead of `X`. If `None`, `X` is normalized.
     copy
-        Whether to modify copied input object. Not compatible with
-        `inplace=False`.
-    check_values
-        Check if counts in selected layer are integers. A Warning is returned if set to True.
+        Whether to modify copied input object. Not compatible with `inplace=False`.
     inplace
         Whether to update `adata` or return dictionary with normalized copies
         of `adata.X` and `adata.layers`.
@@ -142,35 +144,36 @@ def normalize_pearson_residuals_pca(
     clip: Optional[float] = None,
     n_comps_pca: Optional[int] = 50,
     random_state_pca: Optional[float] = 0,
-    use_highly_variable: bool = True,
     kwargs_pca: Optional[dict] = {},
+    use_highly_variable: bool = True,
     check_values: bool = True,
     inplace: bool = True,
 ) -> Optional[pd.DataFrame]:
     """\
-    Applies Pearson residual normalization and PCA, based on [Lause20]_.
+    Applies analytic Pearson residual normalization and PCA, based on [Lause20]_.
+
+    The residuals are based on a negative binomial offset model with overdispersion
+    `theta` shared across genes. By default, residuals are clipped to sqrt(n),
+    overdispersion `theta=100` is used, and PCA is run with 50 components.
+
+    Operates on the subset of highly variable genes in `adata.var['highly_variable']`
+    by default. Expects raw count input.
 
-    Operates on the subset of highly variable genes in `adata.var['highly_variable']` by default.
 
     Params
     ------
     adata
-        The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
-        to cells and columns to genes.
-    use_highly_variable
-        Whether to use the gene selection in `adata.var['highly_variable']` to
-        subset the data before normalizing (default) or proceed on the full
-        dataset.
+        The annotated data matrix of shape `n_obs` × `n_vars`.
+        Rows correspond to cells and columns to genes.
     theta
-        This is the NB overdispersion parameter theta for Pearson residual
-        computations. Higher values correspond to less overdispersion
-        (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a
-        Poisson model.
+        The negative binomial overdispersion parameter theta for Pearson residuals.
+        Higher values correspond to less overdispersion (var = mean + mean^2/theta),
+        and `theta=np.Inf` corresponds to a Poisson model.
     clip
-        This determines how Pearson residuals are clipped:
+        Determines if and how residuals are clipped:
 
-            * If `None`, residuals are clipped to the interval \
-            [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
+            * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \
+            where n is the number of cells in the dataset (default behavior).
             * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
             `clip=np.Inf` for no clipping.
 
@@ -180,8 +183,12 @@ def normalize_pearson_residuals_pca(
         Change to use different initial states for the optimization.
     kwargs_pca
         Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`.
+    use_highly_variable
+        Whether to use the gene selection in `adata.var['highly_variable']` to subset
+        the data before normalizing (default) or proceed on the full dataset.
     check_values
-        Check if counts in selected layer are integers. A Warning is returned if set to True.
+        Check if counts in selected layer are integers. A Warning is returned if set to
+        True.
     inplace
         Whether to place results in `adata` or return them.
 
@@ -200,15 +207,14 @@ def normalize_pearson_residuals_pca(
          The used value of the clipping parameter
 
     `.obsm['X_pca']`
-        PCA representation of data after gene selection and Pearson residual
-        normalization.
+        PCA representation of data after gene selection (if applicable) and Pearson
+        residual normalization.
     `.uns['pca']['PCs']`
          The principal components containing the loadings.
     `.uns['pca']['variance_ratio']`
          Ratio of explained variance.
     `.uns['pca']['variance']`
-         Explained variance, equivalent to the eigenvalues of the
-         covariance matrix.
+         Explained variance, equivalent to the eigenvalues of the covariance matrix.
 
     """
 
diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index 45c2c41725..67fc6e81a9 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -7,11 +7,11 @@
 
 def recipe_pearson_residuals(
     adata: AnnData,
-    n_top_genes: int = 1000,
     theta: float = 100,
     clip: Optional[float] = None,
-    chunksize: int = 1000,
+    n_top_genes: int = 1000,
     batch_key: Optional[str] = None,
+    chunksize: int = 1000,
     n_comps_pca: Optional[int] = 50,
     random_state_pca: Optional[float] = 0,
     kwargs_pca: dict = {},
@@ -24,38 +24,39 @@ def recipe_pearson_residuals(
     Applies gene selection based on Pearson residuals. On the resulting subset,
     Pearson residual normalization and PCA are performed.
 
+    Expects raw count input.
+
+
     Params
     ------
     adata
         The annotated data matrix of shape `n_obs` × `n_vars`.
         Rows correspond to cells and columns to genes.
-    n_top_genes
-        Number of highly-variable genes to keep. Mandatory if
-        `flavor='seurat_v3'` or `flavor='pearson_residuals'`.
-    chunksize
-        This dertermines how many genes are processed at once while computing
-        the Pearson residual variance. Choosing a smaller value will reduce
-        the required memory.
     theta
-        This is the NB overdispersion parameter theta for Pearson residual
-        computations. Higher values correspond to less overdispersion
-        (var = mean + mean^2/theta), and `theta=np.Inf` corresponds to a
-        Poisson model.
+        The negative binomial overdispersion parameter theta for Pearson residuals.
+        Higher values correspond to less overdispersion (var = mean + mean^2/theta),
+        and `theta=np.Inf` corresponds to a Poisson model.
     clip
         Determines if and how residuals are clipped:
 
-            * If `None`, residuals are clipped to the interval \
-            [-sqrt(n), sqrt(n)], where n is the number of cells in the dataset (default behavior).
+            * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \
+            where n is the number of cells in the dataset (default behavior).
             * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
             `clip=np.Inf` for no clipping.
 
+    n_top_genes
+        Number of highly-variable genes to keep.
     batch_key
-        If specified, highly-variable genes are selected within each batch
-        separately and merged. This simple process avoids the selection of
-        batch-specific genes and acts as a lightweight batch correction
-        method. For all flavors, genes are first sorted by how many batches
-        they are a HVG. Ties are broken by the median rank (across batches)
-        based on within-batch residual variance.
+        If specified, highly-variable genes are selected within each batch separately
+        and merged. This simple process avoids the selection of batch-specific genes
+        and acts as a lightweight batch correction method. Genes are first sorted by
+        how many batches they are a HVG. Ties are broken by the median rank (across
+        batches) based on within-batch residual variance.
+    chunksize
+        This dertermines how many genes are processed at once while computing
+        the Pearson residual variance. Choosing a smaller value will reduce
+        the required memory.
+
     n_comps_pca
         Number of principal components to compute.
     random_state_pca
@@ -63,7 +64,8 @@ def recipe_pearson_residuals(
     kwargs_pca
         Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`.
     check_values
-        Check if counts in selected layer are integers. A `Warning` is returned if set to True.
+        Check if counts in selected layer are integers. A Warning is returned if set to
+        True.
     inplace
         Whether to place results in `adata` or return them.
 
@@ -74,22 +76,22 @@ def recipe_pearson_residuals(
     and Pearson residual-based PCA results (`adata_pca`). If `inplace=True`,
     updates `adata` with the following fields for gene selection results:
 
-    `.var['highly_variable']`
+    `.var['highly_variable']` : bool
         boolean indicator of highly-variable genes.
-    `.var['means']`
+    `.var['means']` : float
         means per gene.
-    `.var['variances']`
+    `.var['variances']` : float
         variances per gene.
-    `.var['residual_variances']`
+    `.var['residual_variances']` : float
         Pearson residual variance per gene. Averaged in the case of multiple
         batches.
-    `.var['highly_variable_rank']`
+    `.var['highly_variable_rank']` : float
         Rank of the gene according to residual variance, median rank in the
         case of multiple batches.
-    `.var['highly_variable_nbatches']`
+    `.var['highly_variable_nbatches']` : int
         If batch_key is given, this denotes in how many batches genes are
         detected as HVG.
-    `.var['highly_variable_intersection']`
+    `.var['highly_variable_intersection']` : bool
         If batch_key is given, this denotes the genes that are highly variable
         in all batches.
 
@@ -102,6 +104,7 @@ def recipe_pearson_residuals(
          The used value of the overdisperion parameter theta.
     `.uns['pearson_residuals_normalization']['clip']`
          The used value of the clipping parameter.
+
     `.obsm['X_pca']`
         PCA representation of data after gene selection and Pearson residual
         normalization.
@@ -110,8 +113,7 @@ def recipe_pearson_residuals(
     `.uns['pca']['variance_ratio']`
          Ratio of explained variance.
     `.uns['pca']['variance']`
-         Explained variance, equivalent to the eigenvalues of the
-         covariance matrix.
+         Explained variance, equivalent to the eigenvalues of the covariance matrix.
 
     """
 

From 92159836926cfc44722c2d811dfae448479483ba Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Tue, 10 Aug 2021 16:52:47 +0200
Subject: [PATCH 57/96] update citation year

---
 docs/references.rst                              | 2 +-
 scanpy/experimental/pp/_highly_variable_genes.py | 4 ++--
 scanpy/experimental/pp/_normalization.py         | 4 ++--
 scanpy/experimental/pp/_recipes.py               | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/references.rst b/docs/references.rst
index 458534a3a3..84bdd7f7fe 100644
--- a/docs/references.rst
+++ b/docs/references.rst
@@ -119,7 +119,7 @@ References
    *Laplacian Dynamics and Multiscale Modular Structure in Networks*
    `arXiv <https://arxiv.org/abs/0812.1770>`__.
 
-.. [Lause20] Lause *et al.* (2020)
+.. [Lause21] Lause *et al.* (2021)
    *Analytic Pearson residuals for normalization of single-cell RNA-seq UMI data*,
    `BioRxiv <https://doi.org/10.1101/2020.12.01.405886>`__.
 
diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py
index a0d344d390..9d357a7466 100644
--- a/scanpy/experimental/pp/_highly_variable_genes.py
+++ b/scanpy/experimental/pp/_highly_variable_genes.py
@@ -231,9 +231,9 @@ def highly_variable_genes(
     inplace: bool = True,
 ) -> Optional[pd.DataFrame]:
     """\
-    Annotate highly variable genes using analytic Pearson residuals [Lause20]_.
+    Annotate highly variable genes using analytic Pearson residuals [Lause21]_.
 
-    For [Lause20]_, Pearson residuals of a negative binomial offset model (with
+    For [Lause21]_, Pearson residuals of a negative binomial offset model (with
     overdispersion theta shared across genes) are computed. By default, overdispersion
     theta=100 is used and residuals are clipped to sqrt(n). Finally, genes are ranked
     by residual variance.
diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py
index dbdf5e59d0..39c8c4141a 100644
--- a/scanpy/experimental/pp/_normalization.py
+++ b/scanpy/experimental/pp/_normalization.py
@@ -65,7 +65,7 @@ def normalize_pearson_residuals(
     inplace: bool = True,
 ) -> Optional[Dict[str, np.ndarray]]:
     """\
-    Applies analytic Pearson residual normalization, based on [Lause20]_.
+    Applies analytic Pearson residual normalization, based on [Lause21]_.
 
     The residuals are based on a negative binomial offset model with overdispersion
     `theta` shared across genes. By default, residuals are clipped to sqrt(n) and
@@ -150,7 +150,7 @@ def normalize_pearson_residuals_pca(
     inplace: bool = True,
 ) -> Optional[pd.DataFrame]:
     """\
-    Applies analytic Pearson residual normalization and PCA, based on [Lause20]_.
+    Applies analytic Pearson residual normalization and PCA, based on [Lause21]_.
 
     The residuals are based on a negative binomial offset model with overdispersion
     `theta` shared across genes. By default, residuals are clipped to sqrt(n),
diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index 67fc6e81a9..243e2b8379 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -19,7 +19,7 @@ def recipe_pearson_residuals(
     inplace: bool = True,
 ) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]:
     """\
-    Gene selection and normalization based on [Lause20]_.
+    Gene selection and normalization based on [Lause21]_.
 
     Applies gene selection based on Pearson residuals. On the resulting subset,
     Pearson residual normalization and PCA are performed.

From 37695a9ed96f2ca1062bf0599456b547465c542d Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Tue, 17 Aug 2021 21:57:12 +0200
Subject: [PATCH 58/96] cleaning imports in `preprocessing` functions

---
 scanpy/preprocessing/_highly_variable_genes.py | 5 +----
 scanpy/preprocessing/_normalization.py         | 5 +----
 scanpy/preprocessing/_recipes.py               | 3 +--
 3 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index d9c8aae568..7cede9e528 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -1,6 +1,5 @@
 import warnings
 from typing import Optional
-
 import numpy as np
 import pandas as pd
 import scipy.sparse as sp_sparse
@@ -9,8 +8,7 @@
 
 from .. import logging as logg
 from .._settings import settings, Verbosity
-from .._utils import sanitize_anndata, check_nonnegative_integers, view_to_actual
-from scanpy.get import _get_obs_rep
+from .._utils import sanitize_anndata, check_nonnegative_integers
 from .._compat import Literal
 from ._utils import _get_mean_var
 from ._distributed import materialize_as_ndarray
@@ -376,7 +374,6 @@ def highly_variable_genes(
         Check if counts in selected layer are integers. A Warning is returned if set to True.
         Only used if `flavor='seurat_v3'`.
 
-
     Returns
     -------
     Depending on `inplace` returns calculated metrics (:class:`~pandas.DataFrame`) or
diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py
index 0a853d3c89..c4ab816085 100644
--- a/scanpy/preprocessing/_normalization.py
+++ b/scanpy/preprocessing/_normalization.py
@@ -2,7 +2,6 @@
 from warnings import warn
 
 import numpy as np
-import pandas as pd
 from anndata import AnnData
 from scipy.sparse import issparse
 from sklearn.utils import sparsefuncs
@@ -10,11 +9,9 @@
 from scanpy import logging as logg
 from scanpy._compat import Literal
 
-from scanpy._utils import view_to_actual, check_nonnegative_integers
+from scanpy._utils import view_to_actual
 from scanpy.get import _get_obs_rep, _set_obs_rep
 
-from scanpy.preprocessing._pca import pca
-
 
 def _normalize_data(X, counts, after=None, copy=False):
     X = X.copy() if copy else X
diff --git a/scanpy/preprocessing/_recipes.py b/scanpy/preprocessing/_recipes.py
index a4696e0827..6abc04ed74 100644
--- a/scanpy/preprocessing/_recipes.py
+++ b/scanpy/preprocessing/_recipes.py
@@ -1,9 +1,8 @@
 """Preprocessing recipes from the literature"""
-from typing import Optional, Tuple
+from typing import Optional
 
 from anndata import AnnData
 
-import pandas as pd
 
 from .. import preprocessing as pp
 from ._deprecated.highly_variable_genes import (

From f42f4b8d4c8398bb89fba975165be7e1213c71c9 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Tue, 17 Aug 2021 23:23:41 +0200
Subject: [PATCH 59/96] making inputcheck tests specific to error/warning
 messages

---
 scanpy/tests/test_highly_variable_genes.py | 31 +++++++++++++++-------
 scanpy/tests/test_normalization.py         | 29 ++++++++++++++------
 2 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py
index a92c59ebf5..ce3d343e0e 100644
--- a/scanpy/tests/test_highly_variable_genes.py
+++ b/scanpy/tests/test_highly_variable_genes.py
@@ -79,38 +79,51 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp
         adata_noninteger.X[x[0], y[0]] = 0.5
 
         # expecting 0 no-int warnings
-        with pytest.warns(None) as record:
+        with warnings.catch_warnings(record=True) as record:
             sc.experimental.pp.highly_variable_genes(
                 adata_noninteger.copy(),
                 flavor='pearson_residuals',
                 n_top_genes=100,
-                check_values=False,
+                check_values=True,
             )
-        assert len(record) == 0
+
+        warning_msgs = [w.message.args[0] for w in record]
+        assert (
+            "`flavor='pearson_residuals'` expects raw count data, but non-integers were found."
+            not in warning_msgs
+        )
 
         # expecting 1 no-int warning
-        with pytest.warns(None) as record:
+        with pytest.warns(
+            UserWarning,
+            match="`flavor='pearson_residuals'` expects raw count data, but non-integers were found.",
+        ) as record:
             sc.experimental.pp.highly_variable_genes(
                 adata_noninteger.copy(),
                 flavor='pearson_residuals',
                 n_top_genes=100,
                 check_values=True,
             )
-        assert len(record) == 1
-        assert "expects raw count data" in record[0].message.args[0]
 
     # errors should be raised for invalid theta values
-    with pytest.raises(ValueError) as record:
+    with pytest.raises(
+        ValueError, match='Pearson residuals require theta > 0'
+    ) as record:
         sc.experimental.pp.highly_variable_genes(
             adata.copy(), flavor='pearson_residuals', n_top_genes=100, theta=0
         )
-    with pytest.raises(ValueError) as record:
+
+    with pytest.raises(
+        ValueError, match='Pearson residuals require theta > 0'
+    ) as record:
         sc.experimental.pp.highly_variable_genes(
             adata.copy(), flavor='pearson_residuals', n_top_genes=100, theta=-1
         )
 
     # error should be raised for invalid clipping values
-    with pytest.raises(ValueError) as record:
+    with pytest.raises(
+        ValueError, match='Pearson residuals require `clip>=0` or `clip=None`.'
+    ) as record:
         sc.experimental.pp.highly_variable_genes(
             adata.copy(), flavor='pearson_residuals', n_top_genes=100, clip=-1
         )
diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py
index b2defaeead..53141419e0 100644
--- a/scanpy/tests/test_normalization.py
+++ b/scanpy/tests/test_normalization.py
@@ -79,27 +79,40 @@ def test_normalize_pearson_residuals_inputchecks(sparsity_func, dtype):
         x, y = np.nonzero(adata_noninteger.X)
         adata_noninteger.X[x[0], y[0]] = 0.5
 
-        with pytest.warns(UserWarning) as record:
+        with warnings.catch_warnings(record=True) as record:
             sc.experimental.pp.normalize_pearson_residuals(
                 adata_noninteger.copy(), check_values=True
             )
-        assert len(record) == 1
-        assert "expects raw count data" in record[0].message.args[0]
+        warning_msgs = [w.message.args[0] for w in record]
+        assert (
+            "`normalize_pearson_residuals()` expects raw count data, but non-integers were found."
+            in warning_msgs
+        )
 
-        with pytest.warns(None) as record:
+        with warnings.catch_warnings(record=True) as record:
             sc.experimental.pp.normalize_pearson_residuals(
                 adata_noninteger.copy(), check_values=False
             )
-        assert len(record) == 0
+        warning_msgs = [w.message.args[0] for w in record]
+        assert (
+            "`normalize_pearson_residuals()` expects raw count data, but non-integers were found."
+            not in warning_msgs
+        )
 
     # errors should be raised for invalid theta values
-    with pytest.raises(ValueError) as record:
+    with pytest.raises(
+        ValueError, match='Pearson residuals require theta > 0'
+    ) as record:
         sc.experimental.pp.normalize_pearson_residuals(adata.copy(), theta=0)
-    with pytest.raises(ValueError) as record:
+    with pytest.raises(
+        ValueError, match='Pearson residuals require theta > 0'
+    ) as record:
         sc.experimental.pp.normalize_pearson_residuals(adata.copy(), theta=-1)
 
     # error should be raised for invalid clipping values
-    with pytest.raises(ValueError) as record:
+    with pytest.raises(
+        ValueError, match='Pearson residuals require `clip>=0` or `clip=None`.'
+    ) as record:
         sc.experimental.pp.normalize_pearson_residuals(adata.copy(), clip=-1)
 
 

From 1e20c3bda7c30dbf520b30035cee6660ba562f94 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Tue, 17 Aug 2021 23:50:27 +0200
Subject: [PATCH 60/96] making inputcheck tests specific to error/warning
 messages

---
 scanpy/tests/test_highly_variable_genes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py
index ce3d343e0e..a8cbf0d9f0 100644
--- a/scanpy/tests/test_highly_variable_genes.py
+++ b/scanpy/tests/test_highly_variable_genes.py
@@ -84,7 +84,7 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp
                 adata_noninteger.copy(),
                 flavor='pearson_residuals',
                 n_top_genes=100,
-                check_values=True,
+                check_values=False,
             )
 
         warning_msgs = [w.message.args[0] for w in record]

From 1f02e2c457d0f6df9636d5c36997b14784096201 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 20 Aug 2021 17:55:35 +0200
Subject: [PATCH 61/96] resolve HVGs across batches more cleanly, fix dtype
 issue

---
 .../experimental/pp/_highly_variable_genes.py | 34 ++++++-------------
 scanpy/tests/test_highly_variable_genes.py    |  4 +--
 2 files changed, 13 insertions(+), 25 deletions(-)

diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py
index 9d357a7466..03fc078ac8 100644
--- a/scanpy/experimental/pp/_highly_variable_genes.py
+++ b/scanpy/experimental/pp/_highly_variable_genes.py
@@ -119,34 +119,24 @@ def _highly_variable_pearson_residuals(
             residual_gene_var[start:stop] = np.var(residuals, axis=0)
 
         # Add 0 values for genes that were filtered out
-        zero_gene_var = np.zeros(np.sum(~nonzero_genes))
-        residual_gene_var = np.concatenate((residual_gene_var, zero_gene_var))
-        # Order as before filtering
-        idxs = np.concatenate((np.where(nonzero_genes)[0], np.where(~nonzero_genes)[0]))
-        residual_gene_var = residual_gene_var[np.argsort(idxs)]
-        residual_gene_vars.append(residual_gene_var.reshape(1, -1))
+        unmasked_residual_gene_var = np.zeros(len(nonzero_genes))
+        unmasked_residual_gene_var[nonzero_genes] = residual_gene_var
+        residual_gene_vars.append(unmasked_residual_gene_var.reshape(1, -1))
 
     residual_gene_vars = np.concatenate(residual_gene_vars, axis=0)
 
-    # Get cutoffs and define hvgs per batch
-    residual_gene_vars_sorted = np.sort(residual_gene_vars, axis=1)
-    cutoffs_per_batch = residual_gene_vars_sorted[:, -n_top_genes]
-    highly_variable_per_batch = np.greater_equal(
-        residual_gene_vars.T, cutoffs_per_batch
-    ).T
-
-    # Merge hvgs across batches
-    highly_variable_nbatches = np.sum(highly_variable_per_batch, axis=0)
-    highly_variable_intersection = highly_variable_nbatches == n_batches
-
     # Get rank per gene within each batch
     # argsort twice gives ranks, small rank means most variable
     ranks_residual_var = np.argsort(np.argsort(-residual_gene_vars, axis=1), axis=1)
     ranks_residual_var = ranks_residual_var.astype(np.float32)
+    # count in how many batches a genes was among the n_top_genes
+    highly_variable_nbatches = np.sum(
+        (ranks_residual_var < n_top_genes).astype(int), axis=0
+    )
+    # set non-top genes within each batch to nan
     ranks_residual_var[ranks_residual_var >= n_top_genes] = np.nan
     ranks_masked_array = np.ma.masked_invalid(ranks_residual_var)
-    # Median rank across batches,
-    # ignoring batches in which gene was not selected
+    # Median rank across batches, ignoring batches in which gene was not selected
     medianrank_residual_var = np.ma.median(ranks_masked_array, axis=0).filled(np.nan)
 
     means, variances = materialize_as_ndarray(_get_mean_var(X))
@@ -154,12 +144,10 @@ def _highly_variable_pearson_residuals(
         dict(
             means=means,
             variances=variances,
-            residual_variances=np.mean(residual_gene_vars, axis=0).astype(
-                np.float32, copy=False
-            ),
+            residual_variances=np.mean(residual_gene_vars, axis=0),
             highly_variable_rank=medianrank_residual_var,
             highly_variable_nbatches=highly_variable_nbatches.astype(np.int64),
-            highly_variable_intersection=highly_variable_intersection,
+            highly_variable_intersection=highly_variable_nbatches == n_batches,
         )
     )
     df = df.set_index(adata.var_names)
diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py
index a8cbf0d9f0..d58a1840b9 100644
--- a/scanpy/tests/test_highly_variable_genes.py
+++ b/scanpy/tests/test_highly_variable_genes.py
@@ -240,7 +240,7 @@ def test_highly_variable_genes_pearson_residuals_general(
         assert key in output_df.keys()
 
     # check residual variances
-    assert output_df['residual_variances'].values.dtype is np.dtype('float32')
+    assert pd.api.types.is_float_dtype(output_df['residual_variances'].dtype)
     # consistency with normalization method
     if subset:
         # sort values before comparing as reference is sorted as well for subset case
@@ -325,7 +325,7 @@ def test_highly_variable_genes_pearson_residuals_batch(
     assert np.all(output_df['highly_variable'][output_df.highly_variable_intersection])
 
     # check ranks (with batch_key these are the median of within-batch ranks)
-    assert output_df['highly_variable_rank'].values.dtype is np.dtype('float32')
+    assert pd.api.types.is_float_dtype(output_df['highly_variable_rank'].dtype)
     assert np.nanmax(output_df['highly_variable_rank'].values) <= n_top_genes - 1
 
     # check nbatches

From 0add1b7ba4e1bd139fb2b9de81be868bfb5632e9 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 20 Aug 2021 18:33:26 +0200
Subject: [PATCH 62/96] renaming pca input arguments

---
 scanpy/experimental/pp/_normalization.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py
index 39c8c4141a..29fb1cf60d 100644
--- a/scanpy/experimental/pp/_normalization.py
+++ b/scanpy/experimental/pp/_normalization.py
@@ -142,8 +142,8 @@ def normalize_pearson_residuals_pca(
     adata: AnnData,
     theta: float = 100,
     clip: Optional[float] = None,
-    n_comps_pca: Optional[int] = 50,
-    random_state_pca: Optional[float] = 0,
+    n_comps: Optional[int] = 50,
+    random_state: Optional[float] = 0,
     kwargs_pca: Optional[dict] = {},
     use_highly_variable: bool = True,
     check_values: bool = True,
@@ -177,10 +177,10 @@ def normalize_pearson_residuals_pca(
             * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
             `clip=np.Inf` for no clipping.
 
-    n_comps_pca
-        Number of principal components to compute.
-    random_state_pca
-        Change to use different initial states for the optimization.
+    n_comps
+        Number of principal components to compute for the PCA step.
+    random_state
+        Change to use different initial states for the optimization of the PCA step.
     kwargs_pca
         Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`.
     use_highly_variable
@@ -228,7 +228,7 @@ def normalize_pearson_residuals_pca(
     normalize_pearson_residuals(
         adata_pca, theta=theta, clip=clip, check_values=check_values
     )
-    pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca, **kwargs_pca)
+    pca(adata_pca, n_comps=n_comps, random_state=random_state, **kwargs_pca)
 
     if inplace:
         norm_settings = adata_pca.uns['pearson_residuals_normalization']

From 2a2b98a10fdd6799a3a4c73fc51b332aa3cb7c77 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 20 Aug 2021 19:04:37 +0200
Subject: [PATCH 63/96] renaming pca input arguments

---
 scanpy/experimental/pp/_recipes.py | 14 ++++-----
 scanpy/tests/test_normalization.py | 48 +++++++++++++++---------------
 2 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index 243e2b8379..066dd2d590 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -12,8 +12,8 @@ def recipe_pearson_residuals(
     n_top_genes: int = 1000,
     batch_key: Optional[str] = None,
     chunksize: int = 1000,
-    n_comps_pca: Optional[int] = 50,
-    random_state_pca: Optional[float] = 0,
+    n_comps: Optional[int] = 50,
+    random_state: Optional[float] = 0,
     kwargs_pca: dict = {},
     check_values: bool = True,
     inplace: bool = True,
@@ -57,10 +57,10 @@ def recipe_pearson_residuals(
         the Pearson residual variance. Choosing a smaller value will reduce
         the required memory.
 
-    n_comps_pca
-        Number of principal components to compute.
-    random_state_pca
-        Change to use different initial states for the optimization.
+    n_comps
+        Number of principal components to compute in the PCA step.
+    random_state
+        Change to use different initial states for the optimization in the PCA step.
     kwargs_pca
         Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`.
     check_values
@@ -139,7 +139,7 @@ def recipe_pearson_residuals(
     experimental.pp.normalize_pearson_residuals(
         adata_pca, theta=theta, clip=clip, check_values=check_values
     )
-    pca(adata_pca, n_comps=n_comps_pca, random_state=random_state_pca, **kwargs_pca)
+    pca(adata_pca, n_comps=n_comps, random_state=random_state, **kwargs_pca)
 
     if inplace:
         normalization_param = adata_pca.uns['pearson_residuals_normalization']
diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py
index 53141419e0..53e0da37b7 100644
--- a/scanpy/tests/test_normalization.py
+++ b/scanpy/tests/test_normalization.py
@@ -178,8 +178,8 @@ def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip):
 )
 @pytest.mark.parametrize('dtype', ['float32', 'int64'])
 @pytest.mark.parametrize('n_hvgs', [100, 200])
-@pytest.mark.parametrize('n_comps_pca', [30, 50])
-def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_pca):
+@pytest.mark.parametrize('n_comps', [30, 50])
+def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps):
 
     adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True)
     n_cells, n_genes = adata.shape
@@ -194,17 +194,17 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p
     # outputs the (potentially hvg-restricted) adata_pca object
     # PCA on all genes
     adata_pca = sc.experimental.pp.normalize_pearson_residuals_pca(
-        adata.copy(), inplace=False, n_comps_pca=n_comps_pca
+        adata.copy(), inplace=False, n_comps=n_comps
     )
     # PCA on hvgs only
     adata_pca_with_hvgs = sc.experimental.pp.normalize_pearson_residuals_pca(
-        adata_with_hvgs.copy(), inplace=False, n_comps_pca=n_comps_pca
+        adata_with_hvgs.copy(), inplace=False, n_comps=n_comps
     )
     # PCA again on all genes (hvg use supressed)
     adata_pca_not_using_hvgs = sc.experimental.pp.normalize_pearson_residuals_pca(
         adata_not_using_hvgs.copy(),
         inplace=False,
-        n_comps_pca=n_comps_pca,
+        n_comps=n_comps,
         use_highly_variable=False,
     )
 
@@ -218,7 +218,7 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p
         )
         assert np.all(np.isin(['X_pca'], list(ad.obsm.keys())))
         assert np.all(np.isin(['PCs'], list(ad.varm.keys())))
-        assert ad.obsm['X_pca'].shape == (n_cells, n_comps_pca)
+        assert ad.obsm['X_pca'].shape == (n_cells, n_comps)
 
     # check adata shape to see if all genes or only HVGs are in the returned adata
     assert adata_pca.shape == (n_cells, n_genes)
@@ -226,28 +226,28 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p
     assert adata_pca_not_using_hvgs.shape == (n_cells, n_genes)
 
     # check PC shapes to see whether or not HVGs were used for PCA
-    assert adata_pca.varm['PCs'].shape == (n_genes, n_comps_pca)
+    assert adata_pca.varm['PCs'].shape == (n_genes, n_comps)
     assert adata_pca_with_hvgs.varm['PCs'].shape == (
         n_hvgs,
-        n_comps_pca,
+        n_comps,
     )  # only HVGs used
-    assert adata_pca_not_using_hvgs.varm['PCs'].shape == (n_genes, n_comps_pca)
+    assert adata_pca_not_using_hvgs.varm['PCs'].shape == (n_genes, n_comps)
 
     ### inplace = True ###
     # modifies the input adata object
     # PCA on all genes
     sc.experimental.pp.normalize_pearson_residuals_pca(
-        adata, inplace=True, n_comps_pca=n_comps_pca
+        adata, inplace=True, n_comps=n_comps
     )
     # PCA on hvgs only
     sc.experimental.pp.normalize_pearson_residuals_pca(
-        adata_with_hvgs, inplace=True, n_comps_pca=n_comps_pca
+        adata_with_hvgs, inplace=True, n_comps=n_comps
     )
     # PCA again on all genes (hvg use supressed)
     sc.experimental.pp.normalize_pearson_residuals_pca(
         adata_not_using_hvgs,
         inplace=True,
-        n_comps_pca=n_comps_pca,
+        n_comps=n_comps,
         use_highly_variable=False,
     )
 
@@ -264,17 +264,17 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p
         assert np.all(np.isin(['X_pca'], list(ad.obsm.keys())))
         # check shapes: adata should always retains original shape
         assert ad.shape == (n_cells, n_genes)
-        assert ad.obsm['X_pca'].shape == (n_cells, n_comps_pca)
+        assert ad.obsm['X_pca'].shape == (n_cells, n_comps)
 
     # check PC shapes to see whether or not HVGs were used for PCA
-    assert adata.uns['pca']['PCs'].shape == (n_genes, n_comps_pca)
+    assert adata.uns['pca']['PCs'].shape == (n_genes, n_comps)
     assert adata_with_hvgs.uns['pca']['PCs'].shape == (
         n_hvgs,
-        n_comps_pca,
+        n_comps,
     )
     assert adata_not_using_hvgs.uns['pca']['PCs'].shape == (
         n_genes,
-        n_comps_pca,
+        n_comps,
     )
 
     # test for inplace/outplace
@@ -293,8 +293,8 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps_p
 )
 @pytest.mark.parametrize('dtype', ['float32', 'int64'])
 @pytest.mark.parametrize('n_hvgs', [100, 200])
-@pytest.mark.parametrize('n_comps_pca', [30, 50])
-def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comps_pca):
+@pytest.mark.parametrize('n_comps', [30, 50])
+def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comps):
     adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True)
     n_cells, n_genes = adata.shape
 
@@ -307,7 +307,7 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp
     # outputs the (potentially hvg-restricted) adata_pca object
     # PCA on all genes
     adata_pca, hvg = sc.experimental.pp.recipe_pearson_residuals(
-        adata.copy(), inplace=False, n_comps_pca=n_comps_pca, n_top_genes=n_hvgs
+        adata.copy(), inplace=False, n_comps=n_comps, n_top_genes=n_hvgs
     )
 
     # for both cases, check adata_pca keys are complete
@@ -319,12 +319,12 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp
     )
     assert np.all(np.isin(['X_pca'], list(adata_pca.obsm.keys())))
     assert np.all(np.isin(['PCs'], list(adata_pca.varm.keys())))
-    assert adata_pca.obsm['X_pca'].shape == (n_cells, n_comps_pca)
+    assert adata_pca.obsm['X_pca'].shape == (n_cells, n_comps)
 
     # check adata shape
     assert adata_pca.shape == (n_cells, n_hvgs)
     # check PC shapes to check that HVGs were used for PCA
-    assert adata_pca.varm['PCs'].shape == (n_hvgs, n_comps_pca)
+    assert adata_pca.varm['PCs'].shape == (n_hvgs, n_comps)
 
     # check hvg df
     assert np.all(
@@ -346,7 +346,7 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp
     # modifies the input adata object
     # PCA on all genes
     sc.experimental.pp.recipe_pearson_residuals(
-        adata, inplace=True, n_comps_pca=n_comps_pca, n_top_genes=n_hvgs
+        adata, inplace=True, n_comps=n_comps, n_top_genes=n_hvgs
     )
 
     assert np.all(
@@ -357,7 +357,7 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp
     )
     assert np.all(np.isin(['X_pca'], list(adata.obsm.keys())))
     assert adata.shape == (n_cells, n_genes)
-    assert adata.obsm['X_pca'].shape == (n_cells, n_comps_pca)
+    assert adata.obsm['X_pca'].shape == (n_cells, n_comps)
 
     # check PC shapes to see whether or not HVGs were used for PCA
-    assert adata.uns['pca']['PCs'].shape == (n_hvgs, n_comps_pca)
+    assert adata.uns['pca']['PCs'].shape == (n_hvgs, n_comps)

From 01500570f6a917415afecf84f951f7504642ef68 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 20 Aug 2021 23:53:28 +0200
Subject: [PATCH 64/96] _pca bundle: more efficient copy handling, added input
 check. both _pca and _recipe: varm field for PCs, adapted tests and docs

---
 scanpy/experimental/pp/_normalization.py | 38 +++++++++++++++-------
 scanpy/experimental/pp/_recipes.py       | 13 +++++---
 scanpy/tests/test_normalization.py       | 41 +++++++++++++-----------
 3 files changed, 57 insertions(+), 35 deletions(-)

diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py
index 29fb1cf60d..097620ce33 100644
--- a/scanpy/experimental/pp/_normalization.py
+++ b/scanpy/experimental/pp/_normalization.py
@@ -145,7 +145,7 @@ def normalize_pearson_residuals_pca(
     n_comps: Optional[int] = 50,
     random_state: Optional[float] = 0,
     kwargs_pca: Optional[dict] = {},
-    use_highly_variable: bool = True,
+    use_highly_variable: Optional[bool] = None,
     check_values: bool = True,
     inplace: bool = True,
 ) -> Optional[pd.DataFrame]:
@@ -209,8 +209,10 @@ def normalize_pearson_residuals_pca(
     `.obsm['X_pca']`
         PCA representation of data after gene selection (if applicable) and Pearson
         residual normalization.
-    `.uns['pca']['PCs']`
-         The principal components containing the loadings.
+    `.varm['PCs']`
+         The principal components containing the loadings. When `inplace=True` and
+         `use_highly_variable=True`, this will contain empty rows for the genes not
+         selected.
     `.uns['pca']['variance_ratio']`
          Ratio of explained variance.
     `.uns['pca']['variance']`
@@ -218,12 +220,23 @@ def normalize_pearson_residuals_pca(
 
     """
 
-    if use_highly_variable and 'highly_variable' in adata.var_keys():
-        # TODO: are these copies needed?
-        adata_pca = adata[:, adata.var['highly_variable']].copy()
+    # check if HVG selection is there if user wants to use it
+    if use_highly_variable and 'highly_variable' not in adata.var_keys():
+        raise ValueError(
+            'You passed `use_highly_variable=True`, but no HVG selection was found (`highly_variable` missing in `adata.var_keys()`.'
+        )
+
+    # default behavior: if there is a HVG selection, we will use it
+    if use_highly_variable is None and 'highly_variable' in adata.var_keys():
+        use_highly_variable = True
+
+    if use_highly_variable:
+        adata_sub = adata[:, adata.var['highly_variable']].copy()
+        adata_pca = AnnData(
+            adata_sub.X.copy(), obs=adata_sub.obs[[]], var=adata_sub.var[[]]
+        )
     else:
-        # TODO: are these copies needed?
-        adata_pca = adata.copy()
+        adata_pca = AnnData(adata.X.copy(), obs=adata.obs[[]], var=adata.var[[]])
 
     normalize_pearson_residuals(
         adata_pca, theta=theta, clip=clip, check_values=check_values
@@ -233,9 +246,12 @@ def normalize_pearson_residuals_pca(
     if inplace:
         norm_settings = adata_pca.uns['pearson_residuals_normalization']
         norm_dict = dict(**norm_settings, pearson_residuals_df=adata_pca.to_df())
-        pca_settings = adata_pca.uns['pca']
-        pca_dict = dict(**pca_settings, PCs=adata_pca.varm['PCs'])
-        adata.uns['pca'] = pca_dict
+        if use_highly_variable:
+            adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps))
+            adata.varm['PCs'][adata.var['highly_variable']] = adata_pca.varm['PCs']
+        else:
+            adata.varm['PCs'] = adata_pca.varm['PCs']
+        adata.uns['pca'] = adata_pca.uns['pca']
         adata.uns['pearson_residuals_normalization'] = norm_dict
         adata.obsm['X_pca'] = adata_pca.obsm['X_pca']
         return None
diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index 066dd2d590..5589e816aa 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -1,6 +1,7 @@
 from typing import Optional, Tuple
 from anndata import AnnData
 import pandas as pd
+import numpy as np
 from scanpy import experimental
 from scanpy.preprocessing import pca
 
@@ -108,8 +109,9 @@ def recipe_pearson_residuals(
     `.obsm['X_pca']`
         PCA representation of data after gene selection and Pearson residual
         normalization.
-    `.uns['pca']['PCs']`
-         The principal components containing the loadings.
+    `.varm['PCs']`
+         The principal components containing the loadings. When `inplace=True` this
+         will contain empty rows for the genes not selected during HVG selection.
     `.uns['pca']['variance_ratio']`
          Ratio of explained variance.
     `.uns['pca']['variance']`
@@ -146,9 +148,10 @@ def recipe_pearson_residuals(
         normalization_dict = dict(
             **normalization_param, pearson_residuals_df=adata_pca.to_df()
         )
-        pca_param = adata_pca.uns['pca']
-        pca_dict = dict(**pca_param, PCs=adata_pca.varm['PCs'])
-        adata.uns['pca'] = pca_dict
+
+        adata.uns['pca'] = adata_pca.uns['pca']
+        adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps))
+        adata.varm['PCs'][adata.var['highly_variable']] = adata_pca.varm['PCs']
         adata.uns['pearson_residuals_normalization'] = normalization_dict
         adata.obsm['X_pca'] = adata_pca.obsm['X_pca']
         return None
diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py
index 53e0da37b7..73005fea3c 100644
--- a/scanpy/tests/test_normalization.py
+++ b/scanpy/tests/test_normalization.py
@@ -235,15 +235,15 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps):
 
     ### inplace = True ###
     # modifies the input adata object
-    # PCA on all genes
+    # PCA on all genes (no HVG supplied)
     sc.experimental.pp.normalize_pearson_residuals_pca(
         adata, inplace=True, n_comps=n_comps
     )
-    # PCA on hvgs only
+    # PCA on hvgs only (HVGs supplied and automatically used)
     sc.experimental.pp.normalize_pearson_residuals_pca(
         adata_with_hvgs, inplace=True, n_comps=n_comps
     )
-    # PCA again on all genes (hvg use supressed)
+    # PCA again on all genes (HVGs supplied and NOT used)
     sc.experimental.pp.normalize_pearson_residuals_pca(
         adata_not_using_hvgs,
         inplace=True,
@@ -266,17 +266,23 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps):
         assert ad.shape == (n_cells, n_genes)
         assert ad.obsm['X_pca'].shape == (n_cells, n_comps)
 
-    # check PC shapes to see whether or not HVGs were used for PCA
-    assert adata.uns['pca']['PCs'].shape == (n_genes, n_comps)
-    assert adata_with_hvgs.uns['pca']['PCs'].shape == (
-        n_hvgs,
-        n_comps,
-    )
-    assert adata_not_using_hvgs.uns['pca']['PCs'].shape == (
-        n_genes,
-        n_comps,
+    # check if there are columns of all-zeros in the PCs shapes
+    # to see whether or not HVGs were used for PCA
+    assert adata.varm['PCs'].shape == (n_genes, n_comps)
+    # no all-zero-colums should exist
+    assert sum(np.sum(np.abs(adata.varm['PCs']), axis=1) == 0) == 0
+
+    assert adata_with_hvgs.varm['PCs'].shape == (n_genes, n_comps)
+    # number of all-zero-colums should be number of non-hvgs
+    assert (
+        sum(np.sum(np.abs(adata_with_hvgs.varm['PCs']), axis=1) == 0)
+        == n_genes - n_hvgs
     )
 
+    assert adata_not_using_hvgs.varm['PCs'].shape == (n_genes, n_comps)
+    # no all-zero-colums should exist
+    assert sum(np.sum(np.abs(adata_not_using_hvgs.varm['PCs']), axis=1) == 0) == 0
+
     # test for inplace/outplace
     for ad_inplace, ad_outplace in zip(
         [adata_pca, adata_pca_with_hvgs, adata_pca_not_using_hvgs],
@@ -298,11 +304,6 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp
     adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True)
     n_cells, n_genes = adata.shape
 
-    adata_with_hvgs = adata.copy()
-    sc.experimental.pp.highly_variable_genes(
-        adata_with_hvgs, flavor='pearson_residuals', n_top_genes=n_hvgs
-    )
-
     ### inplace = False ###
     # outputs the (potentially hvg-restricted) adata_pca object
     # PCA on all genes
@@ -359,5 +360,7 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp
     assert adata.shape == (n_cells, n_genes)
     assert adata.obsm['X_pca'].shape == (n_cells, n_comps)
 
-    # check PC shapes to see whether or not HVGs were used for PCA
-    assert adata.uns['pca']['PCs'].shape == (n_hvgs, n_comps)
+    # check PC shape
+    assert adata.varm['PCs'].shape == (n_genes, n_comps)
+    # number of all-zero-colums should be number of non-hvgs
+    assert sum(np.sum(np.abs(adata.varm['PCs']), axis=1) == 0) == n_genes - n_hvgs

From e9c0b89be81e99fd23c499aaf5d7fb78017657ab Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Sun, 22 Aug 2021 21:53:41 +0200
Subject: [PATCH 65/96] move repeated inputcheck code to helpers

---
 scanpy/tests/helpers.py                    | 35 +++++++++++
 scanpy/tests/test_highly_variable_genes.py | 71 ++++++++--------------
 scanpy/tests/test_normalization.py         | 55 +++++++----------
 3 files changed, 81 insertions(+), 80 deletions(-)

diff --git a/scanpy/tests/helpers.py b/scanpy/tests/helpers.py
index b7e97c36dd..aa47bef971 100644
--- a/scanpy/tests/helpers.py
+++ b/scanpy/tests/helpers.py
@@ -6,6 +6,8 @@
 
 import scanpy as sc
 import numpy as np
+import warnings
+import pytest
 
 from anndata.tests.helpers import asarray, assert_equal
 
@@ -106,3 +108,36 @@ def _prepare_pbmc_testdata(sparsity_func, dtype, small=False):
     sc.pp.filter_genes(adata, min_cells=1)
     adata.X = sparsity_func(adata.X.astype(dtype))
     return adata
+
+
+def _make_noninteger_data(adata):
+    '''Adds a single non-integer to the data matrix, e.g. for testing `check_value` arguments.'''
+
+    adata_noninteger = adata.copy()
+    x, y = np.nonzero(adata_noninteger.X)
+    adata_noninteger.X[x[0], y[0]] = 0.5
+
+    return adata_noninteger
+
+
+def _test_check_values_warnings(function, adata, expected_warning, kwargs={}):
+    '''Runs `function` on `adata` with provided arguments `kwargs` twice: once with `check_values=True` and once with `check_values=False`. Checks that the `expected_warning` is only raised whtn `check_values=True`.'''
+
+    # expecting 0 no-int warnings
+    with warnings.catch_warnings(record=True) as record:
+        function(adata.copy(), **kwargs, check_values=False)
+    warning_msgs = [w.message.args[0] for w in record]
+    assert expected_warning not in warning_msgs
+
+    # expecting 1 no-int warning
+    with warnings.catch_warnings(record=True) as record:
+        function(adata.copy(), **kwargs, check_values=True)
+    warning_msgs = [w.message.args[0] for w in record]
+    assert expected_warning in warning_msgs
+
+
+def _test_value_error(function, adata, expected_error, kwargs={}):
+    '''Runs `function` on `adata` with provided arguments `kwargs` and checks if `error_msg` is raised as an `ValueError`.'''
+
+    with pytest.raises(ValueError, match=expected_error):
+        function(adata.copy(), **kwargs)
diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py
index d58a1840b9..32c86dde60 100644
--- a/scanpy/tests/test_highly_variable_genes.py
+++ b/scanpy/tests/test_highly_variable_genes.py
@@ -4,7 +4,12 @@
 import scanpy as sc
 from pathlib import Path
 from scipy.sparse import csr_matrix
-from scanpy.tests.helpers import _prepare_pbmc_testdata
+from scanpy.tests.helpers import (
+    _prepare_pbmc_testdata,
+    _make_noninteger_data,
+    _test_check_values_warnings,
+    _test_value_error,
+)
 import warnings
 
 FILE = Path(__file__).parent / Path('_scripts/seurat_hvg.csv')
@@ -74,59 +79,33 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp
     # depending on check_values, warnings should be raised for non-integer data
     if dtype == 'float32':
 
-        adata_noninteger = adata.copy()
-        x, y = np.nonzero(adata_noninteger.X)
-        adata_noninteger.X[x[0], y[0]] = 0.5
+        adata_noninteger = _make_noninteger_data(adata)
 
-        # expecting 0 no-int warnings
-        with warnings.catch_warnings(record=True) as record:
-            sc.experimental.pp.highly_variable_genes(
-                adata_noninteger.copy(),
+        _test_check_values_warnings(
+            function=sc.experimental.pp.highly_variable_genes,
+            adata=adata_noninteger,
+            expected_warning="`flavor='pearson_residuals'` expects raw count data, but non-integers were found.",
+            kwargs=dict(
                 flavor='pearson_residuals',
                 n_top_genes=100,
-                check_values=False,
-            )
-
-        warning_msgs = [w.message.args[0] for w in record]
-        assert (
-            "`flavor='pearson_residuals'` expects raw count data, but non-integers were found."
-            not in warning_msgs
+            ),
         )
 
-        # expecting 1 no-int warning
-        with pytest.warns(
-            UserWarning,
-            match="`flavor='pearson_residuals'` expects raw count data, but non-integers were found.",
-        ) as record:
-            sc.experimental.pp.highly_variable_genes(
-                adata_noninteger.copy(),
-                flavor='pearson_residuals',
-                n_top_genes=100,
-                check_values=True,
-            )
-
     # errors should be raised for invalid theta values
-    with pytest.raises(
-        ValueError, match='Pearson residuals require theta > 0'
-    ) as record:
-        sc.experimental.pp.highly_variable_genes(
-            adata.copy(), flavor='pearson_residuals', n_top_genes=100, theta=0
+    for theta in [0, -1]:
+        _test_value_error(
+            function=sc.experimental.pp.highly_variable_genes,
+            adata=adata,
+            expected_error='Pearson residuals require theta > 0',
+            kwargs=dict(theta=theta, flavor='pearson_residuals', n_top_genes=100),
         )
 
-    with pytest.raises(
-        ValueError, match='Pearson residuals require theta > 0'
-    ) as record:
-        sc.experimental.pp.highly_variable_genes(
-            adata.copy(), flavor='pearson_residuals', n_top_genes=100, theta=-1
-        )
-
-    # error should be raised for invalid clipping values
-    with pytest.raises(
-        ValueError, match='Pearson residuals require `clip>=0` or `clip=None`.'
-    ) as record:
-        sc.experimental.pp.highly_variable_genes(
-            adata.copy(), flavor='pearson_residuals', n_top_genes=100, clip=-1
-        )
+    _test_value_error(
+        function=sc.experimental.pp.highly_variable_genes,
+        adata=adata,
+        expected_error='Pearson residuals require `clip>=0` or `clip=None`.',
+        kwargs=dict(clip=-1, flavor='pearson_residuals', n_top_genes=100),
+    )
 
 
 @pytest.mark.parametrize(
diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py
index 73005fea3c..47131952e3 100644
--- a/scanpy/tests/test_normalization.py
+++ b/scanpy/tests/test_normalization.py
@@ -10,6 +10,9 @@
     check_rep_mutation,
     check_rep_results,
     _prepare_pbmc_testdata,
+    _make_noninteger_data,
+    _test_check_values_warnings,
+    _test_value_error,
 )
 from anndata.tests.helpers import assert_equal, asarray
 
@@ -75,45 +78,29 @@ def test_normalize_pearson_residuals_inputchecks(sparsity_func, dtype):
     # depending on check_values, warnings should be raised for non-integer data
     if dtype == 'float32':
 
-        adata_noninteger = adata.copy()
-        x, y = np.nonzero(adata_noninteger.X)
-        adata_noninteger.X[x[0], y[0]] = 0.5
+        adata_noninteger = _make_noninteger_data(adata)
 
-        with warnings.catch_warnings(record=True) as record:
-            sc.experimental.pp.normalize_pearson_residuals(
-                adata_noninteger.copy(), check_values=True
-            )
-        warning_msgs = [w.message.args[0] for w in record]
-        assert (
-            "`normalize_pearson_residuals()` expects raw count data, but non-integers were found."
-            in warning_msgs
+        _test_check_values_warnings(
+            function=sc.experimental.pp.normalize_pearson_residuals,
+            adata=adata_noninteger,
+            expected_warning="`normalize_pearson_residuals()` expects raw count data, but non-integers were found.",
         )
 
-        with warnings.catch_warnings(record=True) as record:
-            sc.experimental.pp.normalize_pearson_residuals(
-                adata_noninteger.copy(), check_values=False
-            )
-        warning_msgs = [w.message.args[0] for w in record]
-        assert (
-            "`normalize_pearson_residuals()` expects raw count data, but non-integers were found."
-            not in warning_msgs
+    # errors should be raised for invalid theta values
+    for theta in [0, -1]:
+        _test_value_error(
+            function=sc.experimental.pp.normalize_pearson_residuals,
+            adata=adata,
+            expected_error='Pearson residuals require theta > 0',
+            kwargs=dict(theta=theta),
         )
 
-    # errors should be raised for invalid theta values
-    with pytest.raises(
-        ValueError, match='Pearson residuals require theta > 0'
-    ) as record:
-        sc.experimental.pp.normalize_pearson_residuals(adata.copy(), theta=0)
-    with pytest.raises(
-        ValueError, match='Pearson residuals require theta > 0'
-    ) as record:
-        sc.experimental.pp.normalize_pearson_residuals(adata.copy(), theta=-1)
-
-    # error should be raised for invalid clipping values
-    with pytest.raises(
-        ValueError, match='Pearson residuals require `clip>=0` or `clip=None`.'
-    ) as record:
-        sc.experimental.pp.normalize_pearson_residuals(adata.copy(), clip=-1)
+    _test_value_error(
+        function=sc.experimental.pp.normalize_pearson_residuals,
+        adata=adata,
+        expected_error='Pearson residuals require `clip>=0` or `clip=None`.',
+        kwargs=dict(clip=-1),
+    )
 
 
 @pytest.mark.parametrize(

From 3e02b056ab37d51ef11aa5c65932739ba0dd413a Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Mon, 23 Aug 2021 13:42:37 +0200
Subject: [PATCH 66/96] merging tests *_values and *_general

---
 scanpy/tests/test_highly_variable_genes.py | 60 ++--------------------
 1 file changed, 4 insertions(+), 56 deletions(-)

diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py
index 32c86dde60..926e50bda3 100644
--- a/scanpy/tests/test_highly_variable_genes.py
+++ b/scanpy/tests/test_highly_variable_genes.py
@@ -116,12 +116,12 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp
 @pytest.mark.parametrize('clip', [None, np.Inf, 30])
 @pytest.mark.parametrize('theta', [100, np.Inf])
 @pytest.mark.parametrize('n_top_genes', [100, 200])
-def test_highly_variable_genes_pearson_residuals_values(
+def test_highly_variable_genes_pearson_residuals_general(
     subset, sparsity_func, dtype, clip, theta, n_top_genes
 ):
     adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True)
     # cleanup var
-    adata.var.drop(columns=adata.var.columns, inplace=True)
+    del adata.var
     # compute reference output
     residual_variances_reference = _residual_var_reference(
         adata.copy(), clip=clip, theta=theta
@@ -153,59 +153,7 @@ def test_highly_variable_genes_pearson_residuals_values(
         theta=theta,
     )
 
-    pd.testing.assert_frame_equal(output_df, adata.var)
-
-    # consistency with normalization method
-    if subset:
-        # sort values before comparing as reference is sorted as well for subset case
-        sort_output_idx = np.argsort(-output_df['residual_variances'].values)
-        assert np.allclose(
-            output_df['residual_variances'].values[sort_output_idx],
-            residual_variances_reference,
-        )
-    else:
-        assert np.allclose(
-            output_df['residual_variances'].values, residual_variances_reference
-        )
-
-
-@pytest.mark.parametrize(
-    'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__
-)
-@pytest.mark.parametrize('dtype', ['float32', 'int64'])
-@pytest.mark.parametrize('subset', [True, False])
-@pytest.mark.parametrize('n_top_genes', [1000, 500])
-def test_highly_variable_genes_pearson_residuals_general(
-    subset, sparsity_func, dtype, n_top_genes
-):
-
-    adata = _prepare_pbmc_testdata(sparsity_func, dtype)
-    # cleanup var
-    adata.var.drop(columns=adata.var.columns, inplace=True)
-    # compute reference output
-    residual_variances_reference = _residual_var_reference(adata.copy())
-    if subset:
-        # lazily sort by residual variance and take top N
-        top_n_idx = np.argsort(-residual_variances_reference)[:n_top_genes]
-        # (results in sorted "gene order" in reference)
-        residual_variances_reference = residual_variances_reference[top_n_idx]
-    # compute output to be tested
-    output_df = sc.experimental.pp.highly_variable_genes(
-        adata,
-        flavor='pearson_residuals',
-        n_top_genes=n_top_genes,
-        subset=subset,
-        inplace=False,
-    )
-
-    sc.experimental.pp.highly_variable_genes(
-        adata,
-        flavor='pearson_residuals',
-        n_top_genes=n_top_genes,
-        subset=subset,
-        inplace=True,
-    )
-
+    # compare inplace=True and inplace=False output
     pd.testing.assert_frame_equal(output_df, adata.var)
 
     # check output is complete
@@ -258,7 +206,7 @@ def test_highly_variable_genes_pearson_residuals_batch(
 ):
     adata = _prepare_pbmc_testdata(sparsity_func, dtype)
     # cleanup var
-    adata.var.drop(columns=adata.var.columns, inplace=True)
+    del adata.var
     n_genes = adata.shape[1]
 
     output_df = sc.experimental.pp.highly_variable_genes(

From 720578de421093e3c13b0a99590026db68beff84 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Mon, 23 Aug 2021 14:23:02 +0200
Subject: [PATCH 67/96] condense code in pearson hvg selection test, smaller
 test data for speedup

---
 scanpy/tests/test_highly_variable_genes.py | 31 +++++++++++++---------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py
index 926e50bda3..d32237df4d 100644
--- a/scanpy/tests/test_highly_variable_genes.py
+++ b/scanpy/tests/test_highly_variable_genes.py
@@ -68,6 +68,16 @@ def _residual_var_reference(adata, clip=None, theta=100):
     return np.var(residuals, axis=0)
 
 
+def _check_pearson_hvg_columns(output_df, n_top_genes):
+
+    assert pd.api.types.is_float_dtype(output_df['residual_variances'].dtype)
+
+    assert output_df['highly_variable'].values.dtype is np.dtype('bool')
+    assert np.sum(output_df['highly_variable']) == n_top_genes
+
+    assert np.nanmax(output_df['highly_variable_rank'].values) <= n_top_genes - 1
+
+
 @pytest.mark.parametrize(
     'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__
 )
@@ -166,9 +176,7 @@ def test_highly_variable_genes_pearson_residuals_general(
     ]:
         assert key in output_df.keys()
 
-    # check residual variances
-    assert pd.api.types.is_float_dtype(output_df['residual_variances'].dtype)
-    # consistency with normalization method
+    # check consistency with normalization method
     if subset:
         # sort values before comparing as reference is sorted as well for subset case
         sort_output_idx = np.argsort(-output_df['residual_variances'].values)
@@ -182,8 +190,6 @@ def test_highly_variable_genes_pearson_residuals_general(
         )
 
     # check hvg flag
-    assert output_df['highly_variable'].values.dtype is np.dtype('bool')
-    assert np.sum(output_df['highly_variable']) == n_top_genes
     hvg_idx = np.where(output_df['highly_variable'])[0]
     topn_idx = np.sort(
         np.argsort(-output_df['residual_variances'].values)[:n_top_genes]
@@ -192,7 +198,9 @@ def test_highly_variable_genes_pearson_residuals_general(
 
     # check ranks
     assert np.nanmin(output_df['highly_variable_rank'].values) == 0
-    assert np.nanmax(output_df['highly_variable_rank'].values) <= n_top_genes - 1
+
+    # more general checks on ranks, hvg flag and residual variance
+    _check_pearson_hvg_columns(output_df, n_top_genes)
 
 
 @pytest.mark.parametrize(
@@ -200,11 +208,11 @@ def test_highly_variable_genes_pearson_residuals_general(
 )
 @pytest.mark.parametrize('dtype', ['float32', 'int64'])
 @pytest.mark.parametrize('subset', [True, False])
-@pytest.mark.parametrize('n_top_genes', [1000, 500])
+@pytest.mark.parametrize('n_top_genes', [100, 200])
 def test_highly_variable_genes_pearson_residuals_batch(
     subset, n_top_genes, sparsity_func, dtype
 ):
-    adata = _prepare_pbmc_testdata(sparsity_func, dtype)
+    adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True)
     # cleanup var
     del adata.var
     n_genes = adata.shape[1]
@@ -227,6 +235,7 @@ def test_highly_variable_genes_pearson_residuals_batch(
         inplace=True,
     )
 
+    # compare inplace=True and inplace=False output
     pd.testing.assert_frame_equal(output_df, adata.var)
 
     # check output is complete
@@ -241,9 +250,8 @@ def test_highly_variable_genes_pearson_residuals_batch(
     ]:
         assert key in output_df.keys()
 
-    # check hvg flag
-    assert output_df['highly_variable'].values.dtype is np.dtype('bool')
-    assert np.sum(output_df['highly_variable']) == n_top_genes
+    # general checks on ranks, hvg flag and residual variance
+    _check_pearson_hvg_columns(output_df, n_top_genes)
 
     # check intersection flag
     nbatches = len(np.unique(adata.obs['batch']))
@@ -253,7 +261,6 @@ def test_highly_variable_genes_pearson_residuals_batch(
 
     # check ranks (with batch_key these are the median of within-batch ranks)
     assert pd.api.types.is_float_dtype(output_df['highly_variable_rank'].dtype)
-    assert np.nanmax(output_df['highly_variable_rank'].values) <= n_top_genes - 1
 
     # check nbatches
     assert output_df['highly_variable_nbatches'].values.dtype is np.dtype('int')

From 83b7338cc78fb85c41fbd3a866bb791e8eb5050f Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Mon, 23 Aug 2021 15:30:40 +0200
Subject: [PATCH 68/96] condensing code in normalization tests

---
 scanpy/tests/test_normalization.py | 103 +++++++++++++----------------
 1 file changed, 45 insertions(+), 58 deletions(-)

diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py
index 47131952e3..3d41effe05 100644
--- a/scanpy/tests/test_normalization.py
+++ b/scanpy/tests/test_normalization.py
@@ -160,6 +160,28 @@ def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip):
         assert np.min(output_X) >= -clip
 
 
+def _check_pearson_pca_fields(ad, n_cells, n_comps):
+    assert np.all(
+        np.isin(
+            ['pearson_residuals_normalization', 'pca'],
+            list(ad.uns.keys()),
+        )
+    ), (
+        """Missing `.uns` keys. Expected `['pearson_residuals_normalization', 'pca']`, but only %s were found"""
+        % (list(ad.uns.keys()))
+    )
+    assert 'X_pca' in list(
+        ad.obsm.keys()
+    ), """Missing `obsm` key `'X_pca'`, only %s were found""" % (list(ad.obsm.keys()))
+    assert 'PCs' in list(
+        ad.varm.keys()
+    ), """Missing `varm` key `'PCs'`, only %s were found""" % (list(ad.varm.keys()))
+    assert ad.obsm['X_pca'].shape == (
+        n_cells,
+        n_comps,
+    ), 'Wrong shape of PCA output in `X_pca`'
+
+
 @pytest.mark.parametrize(
     'sparsity_func', [csr_matrix.toarray, csr_matrix], ids=lambda x: x.__name__
 )
@@ -179,15 +201,15 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps):
 
     ### inplace = False ###
     # outputs the (potentially hvg-restricted) adata_pca object
-    # PCA on all genes
+    # PCA on all genes (no HVGs present)
     adata_pca = sc.experimental.pp.normalize_pearson_residuals_pca(
         adata.copy(), inplace=False, n_comps=n_comps
     )
-    # PCA on hvgs only
+    # PCA on hvgs only (HVGs present, and by default, `use_highly_variable=True`)
     adata_pca_with_hvgs = sc.experimental.pp.normalize_pearson_residuals_pca(
         adata_with_hvgs.copy(), inplace=False, n_comps=n_comps
     )
-    # PCA again on all genes (hvg use supressed)
+    # PCA again on all genes (HVGs present, but hvg use supressed by `use_highly_variable=False`)
     adata_pca_not_using_hvgs = sc.experimental.pp.normalize_pearson_residuals_pca(
         adata_not_using_hvgs.copy(),
         inplace=False,
@@ -195,17 +217,9 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps):
         use_highly_variable=False,
     )
 
-    # for both cases, check adata_pca keys are complete
+    # for all cases, check adata_pca keys are complete
     for ad in [adata_pca, adata_pca_with_hvgs, adata_pca_not_using_hvgs]:
-        assert np.all(
-            np.isin(
-                ['pearson_residuals_normalization', 'pca'],
-                list(ad.uns.keys()),
-            )
-        )
-        assert np.all(np.isin(['X_pca'], list(ad.obsm.keys())))
-        assert np.all(np.isin(['PCs'], list(ad.varm.keys())))
-        assert ad.obsm['X_pca'].shape == (n_cells, n_comps)
+        _check_pearson_pca_fields(ad, n_cells, n_comps)
 
     # check adata shape to see if all genes or only HVGs are in the returned adata
     assert adata_pca.shape == (n_cells, n_genes)
@@ -217,20 +231,20 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps):
     assert adata_pca_with_hvgs.varm['PCs'].shape == (
         n_hvgs,
         n_comps,
-    )  # only HVGs used
+    )
     assert adata_pca_not_using_hvgs.varm['PCs'].shape == (n_genes, n_comps)
 
     ### inplace = True ###
     # modifies the input adata object
-    # PCA on all genes (no HVG supplied)
+    # PCA on all genes (no HVGs present)
     sc.experimental.pp.normalize_pearson_residuals_pca(
         adata, inplace=True, n_comps=n_comps
     )
-    # PCA on hvgs only (HVGs supplied and automatically used)
+    # PCA on hvgs only (HVGs present, and by default, `use_highly_variable=True`)
     sc.experimental.pp.normalize_pearson_residuals_pca(
         adata_with_hvgs, inplace=True, n_comps=n_comps
     )
-    # PCA again on all genes (HVGs supplied and NOT used)
+    # PCA again on all genes (HVGs present, but hvg use supressed by `use_highly_variable=False`)
     sc.experimental.pp.normalize_pearson_residuals_pca(
         adata_not_using_hvgs,
         inplace=True,
@@ -238,39 +252,27 @@ def test_normalize_pearson_residuals_pca(sparsity_func, dtype, n_hvgs, n_comps):
         use_highly_variable=False,
     )
 
+    # for all cases, check adata_pca keys are complete
     for ad in [adata, adata_with_hvgs, adata_not_using_hvgs]:
-        # check adata_pca keys are complete
-        assert np.all(
-            np.isin(
-                [
-                    'pearson_residuals_normalization',
-                ],
-                list(ad.uns.keys()),
-            )
-        )
-        assert np.all(np.isin(['X_pca'], list(ad.obsm.keys())))
-        # check shapes: adata should always retains original shape
+        _check_pearson_pca_fields(ad, n_cells, n_comps)
+
+        # check shapes: inplace adata's should always retains original shape
         assert ad.shape == (n_cells, n_genes)
-        assert ad.obsm['X_pca'].shape == (n_cells, n_comps)
+        assert ad.varm['PCs'].shape == (n_genes, n_comps)
 
     # check if there are columns of all-zeros in the PCs shapes
     # to see whether or not HVGs were used for PCA
-    assert adata.varm['PCs'].shape == (n_genes, n_comps)
     # no all-zero-colums should exist
     assert sum(np.sum(np.abs(adata.varm['PCs']), axis=1) == 0) == 0
-
-    assert adata_with_hvgs.varm['PCs'].shape == (n_genes, n_comps)
     # number of all-zero-colums should be number of non-hvgs
     assert (
         sum(np.sum(np.abs(adata_with_hvgs.varm['PCs']), axis=1) == 0)
         == n_genes - n_hvgs
     )
-
-    assert adata_not_using_hvgs.varm['PCs'].shape == (n_genes, n_comps)
     # no all-zero-colums should exist
     assert sum(np.sum(np.abs(adata_not_using_hvgs.varm['PCs']), axis=1) == 0) == 0
 
-    # test for inplace/outplace
+    # compare PCA results beteen inplace/outplace
     for ad_inplace, ad_outplace in zip(
         [adata_pca, adata_pca_with_hvgs, adata_pca_not_using_hvgs],
         [adata, adata_with_hvgs, adata_not_using_hvgs],
@@ -298,20 +300,11 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp
         adata.copy(), inplace=False, n_comps=n_comps, n_top_genes=n_hvgs
     )
 
-    # for both cases, check adata_pca keys are complete
-    assert np.all(
-        np.isin(
-            ['pearson_residuals_normalization', 'pca'],
-            list(adata_pca.uns.keys()),
-        )
-    )
-    assert np.all(np.isin(['X_pca'], list(adata_pca.obsm.keys())))
-    assert np.all(np.isin(['PCs'], list(adata_pca.varm.keys())))
-    assert adata_pca.obsm['X_pca'].shape == (n_cells, n_comps)
-
-    # check adata shape
+    # check PCA fields
+    _check_pearson_pca_fields(adata_pca, n_cells, n_comps)
+    # check adata output shape (only HVGs in output)
     assert adata_pca.shape == (n_cells, n_hvgs)
-    # check PC shapes to check that HVGs were used for PCA
+    # check PC shape (non-hvgs are removed, so only `n_hvgs` genes)
     assert adata_pca.varm['PCs'].shape == (n_hvgs, n_comps)
 
     # check hvg df
@@ -337,17 +330,11 @@ def test_normalize_pearson_residuals_recipe(sparsity_func, dtype, n_hvgs, n_comp
         adata, inplace=True, n_comps=n_comps, n_top_genes=n_hvgs
     )
 
-    assert np.all(
-        np.isin(
-            ['pearson_residuals_normalization', 'pca'],
-            list(adata.uns.keys()),
-        )
-    )
-    assert np.all(np.isin(['X_pca'], list(adata.obsm.keys())))
+    # check PCA fields and output shape
+    _check_pearson_pca_fields(adata, n_cells, n_comps)
+    # check adata shape (no change to input)
     assert adata.shape == (n_cells, n_genes)
-    assert adata.obsm['X_pca'].shape == (n_cells, n_comps)
-
-    # check PC shape
+    # check PC shape (non-hvgs are masked with 0s, so original number of genes)
     assert adata.varm['PCs'].shape == (n_genes, n_comps)
     # number of all-zero-colums should be number of non-hvgs
     assert sum(np.sum(np.abs(adata.varm['PCs']), axis=1) == 0) == n_genes - n_hvgs

From a616419a4f1c95a19a81f2878f7ad91c45a1eaa8 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Tue, 31 Aug 2021 10:56:19 +0200
Subject: [PATCH 69/96] add asterisks for keyword-only arguments

---
 scanpy/experimental/pp/_highly_variable_genes.py | 1 +
 scanpy/experimental/pp/_normalization.py         | 1 +
 2 files changed, 2 insertions(+)

diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py
index 03fc078ac8..d3fed085b1 100644
--- a/scanpy/experimental/pp/_highly_variable_genes.py
+++ b/scanpy/experimental/pp/_highly_variable_genes.py
@@ -207,6 +207,7 @@ def _highly_variable_pearson_residuals(
 
 def highly_variable_genes(
     adata: AnnData,
+    *,
     theta: float = 100,
     clip: Optional[float] = None,
     n_top_genes: Optional[int] = None,
diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py
index 097620ce33..463d3230b2 100644
--- a/scanpy/experimental/pp/_normalization.py
+++ b/scanpy/experimental/pp/_normalization.py
@@ -57,6 +57,7 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False):
 
 def normalize_pearson_residuals(
     adata: AnnData,
+    *,
     theta: float = 100,
     clip: Optional[float] = None,
     check_values: bool = True,

From 62660a2f525f7ba08200ece6791f0dae3792edb0 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Tue, 14 Sep 2021 12:02:41 +0200
Subject: [PATCH 70/96] updating refs to Genome Biology publication

---
 docs/references.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/references.rst b/docs/references.rst
index 84bdd7f7fe..7c09607f44 100644
--- a/docs/references.rst
+++ b/docs/references.rst
@@ -121,7 +121,7 @@ References
 
 .. [Lause21] Lause *et al.* (2021)
    *Analytic Pearson residuals for normalization of single-cell RNA-seq UMI data*,
-   `BioRxiv <https://doi.org/10.1101/2020.12.01.405886>`__.
+   `Genome Biology <https://doi.org/10.1186/s13059-021-02451-7>`__.
 
 .. [Leek12] Leek *et al.* (2012),
    *sva: Surrogate Variable Analysis. R package*

From b5cb3aa6ccc1b79b76e857776c3d371cf201b934 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 24 Dec 2021 09:53:40 +0100
Subject: [PATCH 71/96] cleanup helpers.py

---
 scanpy/tests/helpers.py                    | 27 ++++--------
 scanpy/tests/test_highly_variable_genes.py | 48 ++++++++++------------
 scanpy/tests/test_normalization.py         | 29 ++++++-------
 3 files changed, 42 insertions(+), 62 deletions(-)

diff --git a/scanpy/tests/helpers.py b/scanpy/tests/helpers.py
index aa47bef971..bdbbe7b156 100644
--- a/scanpy/tests/helpers.py
+++ b/scanpy/tests/helpers.py
@@ -99,7 +99,13 @@ def _prepare_pbmc_testdata(sparsity_func, dtype, small=False):
     small
         False (default) returns full data, True returns small subset of the data."""
 
-    adata = sc.datasets.pbmc3k()
+    # loading from disk takes long, so cache raw data after loading it once
+    if 'ADATA_PBMC_RAW' not in globals():
+        global ADATA_PBMC_RAW
+        ADATA_PBMC_RAW = sc.datasets.pbmc3k()
+
+    adata = ADATA_PBMC_RAW.copy()
+
     if small:
         adata = adata[:1000, :500]
         sc.pp.filter_cells(adata, min_genes=1)
@@ -110,17 +116,7 @@ def _prepare_pbmc_testdata(sparsity_func, dtype, small=False):
     return adata
 
 
-def _make_noninteger_data(adata):
-    '''Adds a single non-integer to the data matrix, e.g. for testing `check_value` arguments.'''
-
-    adata_noninteger = adata.copy()
-    x, y = np.nonzero(adata_noninteger.X)
-    adata_noninteger.X[x[0], y[0]] = 0.5
-
-    return adata_noninteger
-
-
-def _test_check_values_warnings(function, adata, expected_warning, kwargs={}):
+def _check_check_values_warnings(function, adata, expected_warning, kwargs={}):
     '''Runs `function` on `adata` with provided arguments `kwargs` twice: once with `check_values=True` and once with `check_values=False`. Checks that the `expected_warning` is only raised whtn `check_values=True`.'''
 
     # expecting 0 no-int warnings
@@ -134,10 +130,3 @@ def _test_check_values_warnings(function, adata, expected_warning, kwargs={}):
         function(adata.copy(), **kwargs, check_values=True)
     warning_msgs = [w.message.args[0] for w in record]
     assert expected_warning in warning_msgs
-
-
-def _test_value_error(function, adata, expected_error, kwargs={}):
-    '''Runs `function` on `adata` with provided arguments `kwargs` and checks if `error_msg` is raised as an `ValueError`.'''
-
-    with pytest.raises(ValueError, match=expected_error):
-        function(adata.copy(), **kwargs)
diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py
index d32237df4d..31addb225c 100644
--- a/scanpy/tests/test_highly_variable_genes.py
+++ b/scanpy/tests/test_highly_variable_genes.py
@@ -6,9 +6,7 @@
 from scipy.sparse import csr_matrix
 from scanpy.tests.helpers import (
     _prepare_pbmc_testdata,
-    _make_noninteger_data,
-    _test_check_values_warnings,
-    _test_value_error,
+    _check_check_values_warnings,
 )
 import warnings
 
@@ -62,12 +60,6 @@ def test_highly_variable_genes_basic():
     assert np.all(np.isin(colnames, hvg_df.columns))
 
 
-def _residual_var_reference(adata, clip=None, theta=100):
-    sc.experimental.pp.normalize_pearson_residuals(adata, clip=clip, theta=theta)
-    residuals = adata.X
-    return np.var(residuals, axis=0)
-
-
 def _check_pearson_hvg_columns(output_df, n_top_genes):
 
     assert pd.api.types.is_float_dtype(output_df['residual_variances'].dtype)
@@ -89,9 +81,11 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp
     # depending on check_values, warnings should be raised for non-integer data
     if dtype == 'float32':
 
-        adata_noninteger = _make_noninteger_data(adata)
+        adata_noninteger = adata.copy()
+        x, y = np.nonzero(adata_noninteger.X)
+        adata_noninteger.X[x[0], y[0]] = 0.5
 
-        _test_check_values_warnings(
+        _check_check_values_warnings(
             function=sc.experimental.pp.highly_variable_genes,
             adata=adata_noninteger,
             expected_warning="`flavor='pearson_residuals'` expects raw count data, but non-integers were found.",
@@ -103,19 +97,18 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(sparsity_func, dtyp
 
     # errors should be raised for invalid theta values
     for theta in [0, -1]:
-        _test_value_error(
-            function=sc.experimental.pp.highly_variable_genes,
-            adata=adata,
-            expected_error='Pearson residuals require theta > 0',
-            kwargs=dict(theta=theta, flavor='pearson_residuals', n_top_genes=100),
-        )
 
-    _test_value_error(
-        function=sc.experimental.pp.highly_variable_genes,
-        adata=adata,
-        expected_error='Pearson residuals require `clip>=0` or `clip=None`.',
-        kwargs=dict(clip=-1, flavor='pearson_residuals', n_top_genes=100),
-    )
+        with pytest.raises(ValueError, match='Pearson residuals require theta > 0'):
+            sc.experimental.pp.highly_variable_genes(
+                adata.copy(), theta=theta, flavor='pearson_residuals', n_top_genes=100
+            )
+
+    with pytest.raises(
+        ValueError, match='Pearson residuals require `clip>=0` or `clip=None`.'
+    ):
+        sc.experimental.pp.highly_variable_genes(
+            adata.copy(), clip=-1, flavor='pearson_residuals', n_top_genes=100
+        )
 
 
 @pytest.mark.parametrize(
@@ -132,10 +125,13 @@ def test_highly_variable_genes_pearson_residuals_general(
     adata = _prepare_pbmc_testdata(sparsity_func, dtype, small=True)
     # cleanup var
     del adata.var
+
     # compute reference output
-    residual_variances_reference = _residual_var_reference(
-        adata.copy(), clip=clip, theta=theta
-    )
+    residuals_reference = sc.experimental.pp.normalize_pearson_residuals(
+        adata, clip=clip, theta=theta, inplace=False
+    )['X']
+    residual_variances_reference = np.var(residuals_reference, axis=0)
+
     if subset:
         # lazyly sort by residual variance and take top N
         top_n_idx = np.argsort(-residual_variances_reference)[:n_top_genes]
diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py
index 3d41effe05..21588a2610 100644
--- a/scanpy/tests/test_normalization.py
+++ b/scanpy/tests/test_normalization.py
@@ -10,9 +10,7 @@
     check_rep_mutation,
     check_rep_results,
     _prepare_pbmc_testdata,
-    _make_noninteger_data,
-    _test_check_values_warnings,
-    _test_value_error,
+    _check_check_values_warnings,
 )
 from anndata.tests.helpers import assert_equal, asarray
 
@@ -78,9 +76,11 @@ def test_normalize_pearson_residuals_inputchecks(sparsity_func, dtype):
     # depending on check_values, warnings should be raised for non-integer data
     if dtype == 'float32':
 
-        adata_noninteger = _make_noninteger_data(adata)
+        adata_noninteger = adata.copy()
+        x, y = np.nonzero(adata_noninteger.X)
+        adata_noninteger.X[x[0], y[0]] = 0.5
 
-        _test_check_values_warnings(
+        _check_check_values_warnings(
             function=sc.experimental.pp.normalize_pearson_residuals,
             adata=adata_noninteger,
             expected_warning="`normalize_pearson_residuals()` expects raw count data, but non-integers were found.",
@@ -88,19 +88,14 @@ def test_normalize_pearson_residuals_inputchecks(sparsity_func, dtype):
 
     # errors should be raised for invalid theta values
     for theta in [0, -1]:
-        _test_value_error(
-            function=sc.experimental.pp.normalize_pearson_residuals,
-            adata=adata,
-            expected_error='Pearson residuals require theta > 0',
-            kwargs=dict(theta=theta),
-        )
 
-    _test_value_error(
-        function=sc.experimental.pp.normalize_pearson_residuals,
-        adata=adata,
-        expected_error='Pearson residuals require `clip>=0` or `clip=None`.',
-        kwargs=dict(clip=-1),
-    )
+        with pytest.raises(ValueError, match='Pearson residuals require theta > 0'):
+            sc.experimental.pp.normalize_pearson_residuals(adata.copy(), theta=theta)
+
+    with pytest.raises(
+        ValueError, match='Pearson residuals require `clip>=0` or `clip=None`.'
+    ):
+        sc.experimental.pp.normalize_pearson_residuals(adata.copy(), clip=-1)
 
 
 @pytest.mark.parametrize(

From aa9037f1ef57df9da98e0f2f1da5517935feb2f3 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 24 Dec 2021 10:51:24 +0100
Subject: [PATCH 72/96] cleanup main files as requested by @ivirshup

---
 .../experimental/pp/_highly_variable_genes.py   | 17 +++++++----------
 scanpy/experimental/pp/_normalization.py        |  4 +++-
 scanpy/experimental/pp/_recipes.py              |  2 +-
 3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py
index d3fed085b1..f98370544f 100644
--- a/scanpy/experimental/pp/_highly_variable_genes.py
+++ b/scanpy/experimental/pp/_highly_variable_genes.py
@@ -80,17 +80,14 @@ def _highly_variable_pearson_residuals(
     residual_gene_vars = []
     for batch in np.unique(batch_info):
 
-        adata_subset = adata[batch_info == batch]
+        adata_subset_prefilter = adata[batch_info == batch]
+        X_batch_prefilter = _get_obs_rep(adata_subset_prefilter, layer=layer)
 
         # Filter out zero genes
         with settings.verbosity.override(Verbosity.error):
-            nonzero_genes = filter_genes(adata_subset, min_cells=1, inplace=False)[0]
-        adata_subset = adata_subset[:, nonzero_genes]
-
-        if layer is not None:
-            X_batch = adata_subset.layers[layer]
-        else:
-            X_batch = adata_subset.X
+            nonzero_genes = np.ravel(X_batch_prefilter.sum(axis=0)) != 0
+        adata_subset = adata_subset_prefilter[:, nonzero_genes]
+        X_batch = _get_obs_rep(adata_subset, layer=layer)
 
         # Prepare clipping
         if clip is None:
@@ -161,9 +158,9 @@ def _highly_variable_pearson_residuals(
         inplace=True,
     )
 
-    high_var = np.zeros(df.shape[0])
+    high_var = np.zeros(df.shape[0], dtype=bool)
     high_var[:n_top_genes] = True
-    df['highly_variable'] = high_var.astype(bool)
+    df['highly_variable'] = high_var
     df = df.loc[adata.var_names, :]
 
     if inplace:
diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py
index 463d3230b2..449ca02f1d 100644
--- a/scanpy/experimental/pp/_normalization.py
+++ b/scanpy/experimental/pp/_normalization.py
@@ -141,6 +141,7 @@ def normalize_pearson_residuals(
 
 def normalize_pearson_residuals_pca(
     adata: AnnData,
+    *,
     theta: float = 100,
     clip: Optional[float] = None,
     n_comps: Optional[int] = 50,
@@ -224,7 +225,8 @@ def normalize_pearson_residuals_pca(
     # check if HVG selection is there if user wants to use it
     if use_highly_variable and 'highly_variable' not in adata.var_keys():
         raise ValueError(
-            'You passed `use_highly_variable=True`, but no HVG selection was found (`highly_variable` missing in `adata.var_keys()`.'
+            "You passed `use_highly_variable=True`, but no HVG selection was found "
+            "(e.g., there was no 'highly_variable' column in adata.var).'"
         )
 
     # default behavior: if there is a HVG selection, we will use it
diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index 5589e816aa..ea7da74f59 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -8,6 +8,7 @@
 
 def recipe_pearson_residuals(
     adata: AnnData,
+    *,
     theta: float = 100,
     clip: Optional[float] = None,
     n_top_genes: int = 1000,
@@ -57,7 +58,6 @@ def recipe_pearson_residuals(
         This dertermines how many genes are processed at once while computing
         the Pearson residual variance. Choosing a smaller value will reduce
         the required memory.
-
     n_comps
         Number of principal components to compute in the PCA step.
     random_state

From e972daf7f4e124efac8e7797e49d4a31bb69ab54 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Fri, 24 Dec 2021 11:48:52 +0100
Subject: [PATCH 73/96] revert unneeded settingWithCopy fix

---
 scanpy/preprocessing/_highly_variable_genes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
index 7cede9e528..561a7c7bf0 100644
--- a/scanpy/preprocessing/_highly_variable_genes.py
+++ b/scanpy/preprocessing/_highly_variable_genes.py
@@ -514,7 +514,7 @@ def highly_variable_genes(
             df['highly_variable'] = high_var.astype(bool)
             df = df.loc[adata.var_names, :]
         else:
-            df = df.loc[adata.var_names, :]
+            df = df.loc[adata.var_names]
             dispersion_norm = df.dispersions_norm.values
             dispersion_norm[np.isnan(dispersion_norm)] = 0  # similar to Seurat
             gene_subset = np.logical_and.reduce(

From 47bd877a681d83ade8f4abb030e95d5fa3d21c79 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Wed, 23 Feb 2022 19:04:09 +0100
Subject: [PATCH 74/96] cache data

---
 scanpy/tests/helpers.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/scanpy/tests/helpers.py b/scanpy/tests/helpers.py
index bdbbe7b156..d1dec31995 100644
--- a/scanpy/tests/helpers.py
+++ b/scanpy/tests/helpers.py
@@ -8,7 +8,7 @@
 import numpy as np
 import warnings
 import pytest
-
+from functools import cache
 from anndata.tests.helpers import asarray, assert_equal
 
 # TODO: Report more context on the fields being compared on error
@@ -87,6 +87,11 @@ def check_rep_results(func, X, *, fields=["layer", "obsm"], **kwargs):
         assert_equal(adata_X, adatas_proc[field])
 
 
+@cache
+def _get_pbmc3k():
+    return sc.datasets.pbmc3k()
+
+
 def _prepare_pbmc_testdata(sparsity_func, dtype, small=False):
     """Prepares 3k PBMC dataset with batch key `batch` and defined datatype/sparsity.
 
@@ -95,16 +100,11 @@ def _prepare_pbmc_testdata(sparsity_func, dtype, small=False):
     sparsity_func
         sparsity function applied to adata.X (e.g. csr_matrix.toarray for dense or csr_matrix for sparse)
     dtype
-        numpy dtype applied to adata.X (e.g.  'float32' or 'int64')
+        numpy dtype applied to adata.X (e.g. 'float32' or 'int64')
     small
         False (default) returns full data, True returns small subset of the data."""
 
-    # loading from disk takes long, so cache raw data after loading it once
-    if 'ADATA_PBMC_RAW' not in globals():
-        global ADATA_PBMC_RAW
-        ADATA_PBMC_RAW = sc.datasets.pbmc3k()
-
-    adata = ADATA_PBMC_RAW.copy()
+    adata = _get_pbmc3k()
 
     if small:
         adata = adata[:1000, :500]

From 13a44be8fb07814e4a44b6388418dae363cc66d1 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Wed, 23 Feb 2022 19:36:13 +0100
Subject: [PATCH 75/96] use doc_params for doc

---
 scanpy/experimental/_docs.py             | 47 +++++++++++++++++++++++
 scanpy/experimental/pp/_normalization.py | 49 ++++++++++--------------
 scanpy/experimental/pp/_recipes.py       | 41 +++++++++-----------
 3 files changed, 87 insertions(+), 50 deletions(-)
 create mode 100644 scanpy/experimental/_docs.py

diff --git a/scanpy/experimental/_docs.py b/scanpy/experimental/_docs.py
new file mode 100644
index 0000000000..e8fecf19be
--- /dev/null
+++ b/scanpy/experimental/_docs.py
@@ -0,0 +1,47 @@
+"""Shared docstrings for experimental function parameters.
+"""
+
+doc_adata = """\
+adata
+    The annotated data matrix of shape `n_obs` × `n_vars`.
+    Rows correspond to cells and columns to genes.
+"""
+
+doc_norm_params = """\
+theta
+    The negative binomial overdispersion parameter theta for Pearson residuals.
+    Higher values correspond to less overdispersion (var = mean + mean^2/theta),
+    and `theta=np.Inf` corresponds to a Poisson model.
+clip
+    Determines if and how residuals are clipped:
+
+    * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \
+    where n is the number of cells in the dataset (default behavior).
+    * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
+    `clip=np.Inf` for no clipping.0
+
+"""
+
+doc_layer_copy = """\
+check_values
+    Check if counts in selected layer are integers. A Warning is returned if set to
+    True.
+layer
+    Layer to normalize instead of `X`. If `None`, `X` is normalized.
+copy
+    Whether to modify copied input object. Not compatible with `inplace=False`.
+inplace
+    Whether to update `adata` or return dictionary with normalized copies
+    of `adata.X` and `adata.layers`.
+"""
+
+doc_inplace = """\
+inplace
+    Whether to update `adata` or return dictionary with normalized copies
+    of `adata.X` and `adata.layers`.
+"""
+
+doc_copy = """\
+copy
+    Whether to modify copied input object. Not compatible with `inplace=False`.
+"""
diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py
index 449ca02f1d..254950b2e1 100644
--- a/scanpy/experimental/pp/_normalization.py
+++ b/scanpy/experimental/pp/_normalization.py
@@ -10,8 +10,15 @@
 
 from scanpy._utils import view_to_actual, check_nonnegative_integers
 from scanpy.get import _get_obs_rep, _set_obs_rep
-
+from scanpy._utils import _doc_params
 from scanpy.preprocessing._pca import pca
+from scanpy.experimental._docs import (
+    doc_adata,
+    doc_norm_params,
+    doc_layer,
+    doc_copy,
+    doc_inplace,
+)
 
 
 def _pearson_residuals(X, theta, clip, check_values, copy=False):
@@ -55,6 +62,13 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False):
     return residuals
 
 
+@_doc_params(
+    adata=doc_adata,
+    norm_params=doc_norm_params,
+    layer=doc_layer,
+    inplace=doc_inplace,
+    copy=doc_copy,
+)
 def normalize_pearson_residuals(
     adata: AnnData,
     *,
@@ -62,8 +76,8 @@ def normalize_pearson_residuals(
     clip: Optional[float] = None,
     check_values: bool = True,
     layer: Optional[str] = None,
-    copy: bool = False,
     inplace: bool = True,
+    copy: bool = False,
 ) -> Optional[Dict[str, np.ndarray]]:
     """\
     Applies analytic Pearson residual normalization, based on [Lause21]_.
@@ -76,38 +90,17 @@ def normalize_pearson_residuals(
 
     Params
     ------
-    adata
-        The annotated data matrix of shape `n_obs` × `n_vars`.
-        Rows correspond to cells and columns to genes.
-    theta
-        The negative binomial overdispersion parameter theta for Pearson residuals.
-        Higher values correspond to less overdispersion (var = mean + mean^2/theta),
-        and `theta=np.Inf` corresponds to a Poisson model.
-    clip
-        Determines if and how residuals are clipped:
-
-            * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \
-            where n is the number of cells in the dataset (default behavior).
-            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
-            `clip=np.Inf` for no clipping.
-
-    check_values
-        Check if counts in selected layer are integers. A Warning is returned if set to
-        True.
-    layer
-        Layer to normalize instead of `X`. If `None`, `X` is normalized.
-    copy
-        Whether to modify copied input object. Not compatible with `inplace=False`.
-    inplace
-        Whether to update `adata` or return dictionary with normalized copies
-        of `adata.X` and `adata.layers`.
+    {adata}
+    {norm_params}
+    {layer}
+    {inplace}
+    {copy}
 
     Returns
     -------
     Returns dictionary with Pearson residuals and settings
     or updates `adata` with normalized version of the original
     `adata.X` and `adata.layers`, depending on `inplace`.
-
     """
 
     if copy:
diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index ea7da74f59..e4fa4cec5c 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -4,8 +4,21 @@
 import numpy as np
 from scanpy import experimental
 from scanpy.preprocessing import pca
-
-
+from scanpy.experimental._docs import (
+    doc_adata,
+    doc_norm_params,
+    doc_layer,
+    doc_inplace,
+)
+from scanpy._utils import _doc_params
+
+
+@_doc_params(
+    adata=doc_adata,
+    norm_params=doc_norm_params,
+    layer=doc_layer,
+    inplace=doc_inplace,
+)
 def recipe_pearson_residuals(
     adata: AnnData,
     *,
@@ -31,21 +44,8 @@ def recipe_pearson_residuals(
 
     Params
     ------
-    adata
-        The annotated data matrix of shape `n_obs` × `n_vars`.
-        Rows correspond to cells and columns to genes.
-    theta
-        The negative binomial overdispersion parameter theta for Pearson residuals.
-        Higher values correspond to less overdispersion (var = mean + mean^2/theta),
-        and `theta=np.Inf` corresponds to a Poisson model.
-    clip
-        Determines if and how residuals are clipped:
-
-            * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \
-            where n is the number of cells in the dataset (default behavior).
-            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
-            `clip=np.Inf` for no clipping.
-
+    {adata}
+    {norm_params}
     n_top_genes
         Number of highly-variable genes to keep.
     batch_key
@@ -64,11 +64,8 @@ def recipe_pearson_residuals(
         Change to use different initial states for the optimization in the PCA step.
     kwargs_pca
         Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`.
-    check_values
-        Check if counts in selected layer are integers. A Warning is returned if set to
-        True.
-    inplace
-        Whether to place results in `adata` or return them.
+    {layer}
+    {inplace}
 
 
     Returns

From 0e4711d60964c8f9c3b8c630aa7ac4e2b8af6fe5 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Wed, 23 Feb 2022 20:10:22 +0100
Subject: [PATCH 76/96] fix doc_params var

---
 scanpy/experimental/_docs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scanpy/experimental/_docs.py b/scanpy/experimental/_docs.py
index e8fecf19be..a767bf9a06 100644
--- a/scanpy/experimental/_docs.py
+++ b/scanpy/experimental/_docs.py
@@ -22,7 +22,7 @@
 
 """
 
-doc_layer_copy = """\
+doc_layer = """\
 check_values
     Check if counts in selected layer are integers. A Warning is returned if set to
     True.

From aa55183dadfd732c6d2c2a1b897a32a3e056f8e0 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Thu, 24 Feb 2022 09:30:50 +0100
Subject: [PATCH 77/96] finalize docs

---
 scanpy/experimental/_docs.py                  | 30 +++++++--
 .../experimental/pp/_highly_variable_genes.py | 63 ++++++++-----------
 scanpy/experimental/pp/_normalization.py      |  6 +-
 scanpy/experimental/pp/_recipes.py            | 22 ++-----
 4 files changed, 59 insertions(+), 62 deletions(-)

diff --git a/scanpy/experimental/_docs.py b/scanpy/experimental/_docs.py
index a767bf9a06..ad05c56c7e 100644
--- a/scanpy/experimental/_docs.py
+++ b/scanpy/experimental/_docs.py
@@ -7,7 +7,7 @@
     Rows correspond to cells and columns to genes.
 """
 
-doc_norm_params = """\
+doc_dist_params = """\
 theta
     The negative binomial overdispersion parameter theta for Pearson residuals.
     Higher values correspond to less overdispersion (var = mean + mean^2/theta),
@@ -28,11 +28,29 @@
     True.
 layer
     Layer to normalize instead of `X`. If `None`, `X` is normalized.
-copy
-    Whether to modify copied input object. Not compatible with `inplace=False`.
-inplace
-    Whether to update `adata` or return dictionary with normalized copies
-    of `adata.X` and `adata.layers`.
+"""
+
+doc_subset = """\
+subset
+    Inplace subset to highly-variable genes if `True` otherwise merely indicate
+    highly variable genes.
+"""
+
+doc_genes_batch_chunk = """\
+n_top_genes
+    Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or
+    `flavor='pearson_residuals'`.
+batch_key
+    If specified, highly-variable genes are selected within each batch separately
+    and merged. This simple process avoids the selection of batch-specific genes
+    and acts as a lightweight batch correction method. Genes are first sorted by
+    how many batches they are a HVG. If `flavor='pearson_residuals'`, ties are
+    broken by the median rank (across batches) based on within-batch residual
+    variance.
+chunksize
+    If `flavor='pearson_residuals'`, this dertermines how many genes are processed at
+    once while computing the residual variance. Choosing a smaller value will reduce
+    the required memory.
 """
 
 doc_inplace = """\
diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py
index f98370544f..aed70f6e5f 100644
--- a/scanpy/experimental/pp/_highly_variable_genes.py
+++ b/scanpy/experimental/pp/_highly_variable_genes.py
@@ -1,3 +1,4 @@
+from multiprocessing.sharedctypes import Value
 import warnings
 from typing import Optional
 
@@ -12,9 +13,18 @@
 from scanpy._utils import check_nonnegative_integers, view_to_actual
 from scanpy.get import _get_obs_rep
 from scanpy._compat import Literal
+from scanpy._utils import _doc_params
 from scanpy.preprocessing._utils import _get_mean_var
 from scanpy.preprocessing._distributed import materialize_as_ndarray
 from scanpy.preprocessing._simple import filter_genes
+from scanpy.experimental._docs import (
+    doc_adata,
+    doc_dist_params,
+    doc_genes_batch_chunk,
+    doc_layer,
+    doc_copy,
+    doc_inplace,
+)
 
 
 def _highly_variable_pearson_residuals(
@@ -202,6 +212,13 @@ def _highly_variable_pearson_residuals(
         return df
 
 
+@_doc_params(
+    adata=doc_adata,
+    dist_params=doc_dist_params,
+    genes_batch_chunk=doc_genes_batch_chunk,
+    layer=doc_layer,
+    inplace=doc_inplace,
+)
 def highly_variable_genes(
     adata: AnnData,
     *,
@@ -226,51 +243,19 @@ def highly_variable_genes(
 
     Expects raw count input.
 
-
     Parameters
     ----------
-    adata
-        The annotated data matrix of shape `n_obs` × `n_vars`.
-        Rows correspond to cells and columns to genes.
-    theta
-        The negative binomial overdispersion parameter theta for Pearson residuals.
-        Higher values correspond to less overdispersion (var = mean + mean^2/theta),
-        and `theta=np.Inf` corresponds to a Poisson model.
-    clip
-        If `flavor='pearson_residuals'`, determines if and how residuals are clipped:
-
-            * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \
-            where n is the number of cells in the dataset (default behavior).
-            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
-            `clip=np.Inf` for no clipping.
-
-    n_top_genes
-        Number of highly-variable genes to keep. Mandatory if `flavor='seurat_v3'` or
-        `flavor='pearson_residuals'`.
-    batch_key
-        If specified, highly-variable genes are selected within each batch separately
-        and merged. This simple process avoids the selection of batch-specific genes
-        and acts as a lightweight batch correction method. Genes are first sorted by
-        how many batches they are a HVG. If `flavor='pearson_residuals'`, ties are
-        broken by the median rank (across batches) based on within-batch residual
-        variance.
-    chunksize
-        If `flavor='pearson_residuals'`, this dertermines how many genes are processed at
-        once while computing the residual variance. Choosing a smaller value will reduce
-        the required memory.
+    {adata}
+    {dist_params}
+    {genes_batch_chunk}
     flavor
         Choose the flavor for identifying highly variable genes. In this experimental
         version, only 'pearson_residuals' is functional.
-    check_values
-        Check if counts in selected layer are integers. A Warning is returned if set to
-        True. Only used if `flavor='pearson_residuals'`.
-    layer
-        If provided, use `adata.layers[layer]` for expression values instead of `adata.X`.
+    {layer}
     subset
         Inplace subset to highly-variable genes if `True` otherwise merely indicate
         highly variable genes.
-    inplace
-        Whether to place calculated metrics in `.var` or return them.
+    {in_place}
 
     Returns
     -------
@@ -325,3 +310,7 @@ def highly_variable_genes(
             check_values=check_values,
             inplace=inplace,
         )
+    else:
+        raise ValueError(
+            "This is an experimental API and only `flavor=pearson_residuals` is available."
+        )
diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py
index 254950b2e1..6ef838c5cb 100644
--- a/scanpy/experimental/pp/_normalization.py
+++ b/scanpy/experimental/pp/_normalization.py
@@ -14,7 +14,7 @@
 from scanpy.preprocessing._pca import pca
 from scanpy.experimental._docs import (
     doc_adata,
-    doc_norm_params,
+    doc_dist_params,
     doc_layer,
     doc_copy,
     doc_inplace,
@@ -64,7 +64,7 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False):
 
 @_doc_params(
     adata=doc_adata,
-    norm_params=doc_norm_params,
+    norm_params=doc_dist_params,
     layer=doc_layer,
     inplace=doc_inplace,
     copy=doc_copy,
@@ -91,7 +91,7 @@ def normalize_pearson_residuals(
     Params
     ------
     {adata}
-    {norm_params}
+    {dist_params}
     {layer}
     {inplace}
     {copy}
diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index e4fa4cec5c..76336a1e4d 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -6,7 +6,8 @@
 from scanpy.preprocessing import pca
 from scanpy.experimental._docs import (
     doc_adata,
-    doc_norm_params,
+    doc_dist_params,
+    doc_genes_batch_chunk,
     doc_layer,
     doc_inplace,
 )
@@ -15,7 +16,8 @@
 
 @_doc_params(
     adata=doc_adata,
-    norm_params=doc_norm_params,
+    dist_params=doc_dist_params,
+    genes_batch_chunk=doc_genes_batch_chunk,
     layer=doc_layer,
     inplace=doc_inplace,
 )
@@ -45,19 +47,8 @@ def recipe_pearson_residuals(
     Params
     ------
     {adata}
-    {norm_params}
-    n_top_genes
-        Number of highly-variable genes to keep.
-    batch_key
-        If specified, highly-variable genes are selected within each batch separately
-        and merged. This simple process avoids the selection of batch-specific genes
-        and acts as a lightweight batch correction method. Genes are first sorted by
-        how many batches they are a HVG. Ties are broken by the median rank (across
-        batches) based on within-batch residual variance.
-    chunksize
-        This dertermines how many genes are processed at once while computing
-        the Pearson residual variance. Choosing a smaller value will reduce
-        the required memory.
+    {dist_params}
+    {genes_batch_chunk}
     n_comps
         Number of principal components to compute in the PCA step.
     random_state
@@ -67,7 +58,6 @@ def recipe_pearson_residuals(
     {layer}
     {inplace}
 
-
     Returns
     -------
     If `inplace=False`, separately returns the gene selection results (`hvg`)

From 8e9b07b4d154ae567bc80da88fc7c0d05e448885 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Thu, 24 Feb 2022 10:21:39 +0100
Subject: [PATCH 78/96] fix param doc

---
 scanpy/experimental/pp/_normalization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py
index 6ef838c5cb..ef0ca75b65 100644
--- a/scanpy/experimental/pp/_normalization.py
+++ b/scanpy/experimental/pp/_normalization.py
@@ -64,7 +64,7 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False):
 
 @_doc_params(
     adata=doc_adata,
-    norm_params=doc_dist_params,
+    dist_params=doc_dist_params,
     layer=doc_layer,
     inplace=doc_inplace,
     copy=doc_copy,

From dce90b2920b0dfb7d0dbfba227a60c4e9b8c2633 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Thu, 24 Feb 2022 12:02:55 +0100
Subject: [PATCH 79/96] wrong var still

---
 scanpy/experimental/pp/_highly_variable_genes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py
index aed70f6e5f..e03beae206 100644
--- a/scanpy/experimental/pp/_highly_variable_genes.py
+++ b/scanpy/experimental/pp/_highly_variable_genes.py
@@ -255,7 +255,7 @@ def highly_variable_genes(
     subset
         Inplace subset to highly-variable genes if `True` otherwise merely indicate
         highly variable genes.
-    {in_place}
+    {inplace}
 
     Returns
     -------

From ca65af525d03b0938ff4c7e0dd7c144078e43c9a Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 28 Feb 2022 14:08:56 +0100
Subject: [PATCH 80/96] add cached datasets module and test on high_var_genes
 tests

---
 scanpy/tests/_data/_cached_datasets.py     | 19 +++++++++++++++++++
 scanpy/tests/helpers.py                    |  9 ++-------
 scanpy/tests/test_highly_variable_genes.py | 12 +++++++-----
 3 files changed, 28 insertions(+), 12 deletions(-)
 create mode 100644 scanpy/tests/_data/_cached_datasets.py

diff --git a/scanpy/tests/_data/_cached_datasets.py b/scanpy/tests/_data/_cached_datasets.py
new file mode 100644
index 0000000000..0ccdf00b75
--- /dev/null
+++ b/scanpy/tests/_data/_cached_datasets.py
@@ -0,0 +1,19 @@
+from functools import wraps
+import scanpy as sc
+
+
+def cached_dataset(func):
+    store = []
+
+    @wraps(func)
+    def wrapper():
+        if len(store) < 1:
+            store.append(func())
+        return store[0].copy()
+
+    return wrapper
+
+
+pbmc3k = cached_dataset(sc.datasets.pbmc3k)
+pbmc68k_reduced = cached_dataset(sc.datasets.pbmc68k_reduced)
+pbmc3k_processed = cached_dataset(sc.datasets.pbmc3k_processed)
diff --git a/scanpy/tests/helpers.py b/scanpy/tests/helpers.py
index d1dec31995..9bae484355 100644
--- a/scanpy/tests/helpers.py
+++ b/scanpy/tests/helpers.py
@@ -8,8 +8,8 @@
 import numpy as np
 import warnings
 import pytest
-from functools import cache
 from anndata.tests.helpers import asarray, assert_equal
+from scanpy.tests._data._cached_datasets import pbmc3k
 
 # TODO: Report more context on the fields being compared on error
 # TODO: Allow specifying paths to ignore on comparison
@@ -87,11 +87,6 @@ def check_rep_results(func, X, *, fields=["layer", "obsm"], **kwargs):
         assert_equal(adata_X, adatas_proc[field])
 
 
-@cache
-def _get_pbmc3k():
-    return sc.datasets.pbmc3k()
-
-
 def _prepare_pbmc_testdata(sparsity_func, dtype, small=False):
     """Prepares 3k PBMC dataset with batch key `batch` and defined datatype/sparsity.
 
@@ -104,7 +99,7 @@ def _prepare_pbmc_testdata(sparsity_func, dtype, small=False):
     small
         False (default) returns full data, True returns small subset of the data."""
 
-    adata = _get_pbmc3k()
+    adata = pbmc3k().copy()
 
     if small:
         adata = adata[:1000, :500]
diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py
index 31addb225c..068ebd3280 100644
--- a/scanpy/tests/test_highly_variable_genes.py
+++ b/scanpy/tests/test_highly_variable_genes.py
@@ -10,6 +10,8 @@
 )
 import warnings
 
+from scanpy.tests._data._cached_datasets import pbmc3k, pbmc68k_reduced
+
 FILE = Path(__file__).parent / Path('_scripts/seurat_hvg.csv')
 FILE_V3 = Path(__file__).parent / Path('_scripts/seurat_hvg_v3.csv.gz')
 FILE_V3_BATCH = Path(__file__).parent / Path('_scripts/seurat_hvg_v3_batch.csv')
@@ -273,7 +275,7 @@ def test_highly_variable_genes_pearson_residuals_batch(
 def test_higly_variable_genes_compare_to_seurat():
     seurat_hvg_info = pd.read_csv(FILE, sep=' ')
 
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced().copy()
     pbmc.X = pbmc.raw.X
     pbmc.var_names_make_unique()
 
@@ -314,7 +316,7 @@ def test_higly_variable_genes_compare_to_seurat_v3():
         FILE_V3, sep=' ', dtype={"variances_norm": np.float64}
     )
 
-    pbmc = sc.datasets.pbmc3k()
+    pbmc = pbmc3k().copy()
     pbmc.var_names_make_unique()
 
     pbmc_dense = pbmc.copy()
@@ -377,7 +379,7 @@ def test_higly_variable_genes_compare_to_seurat_v3():
 def test_filter_genes_dispersion_compare_to_seurat():
     seurat_hvg_info = pd.read_csv(FILE, sep=' ')
 
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced().copy()
     pbmc.X = pbmc.raw.X
     pbmc.var_names_make_unique()
 
@@ -419,7 +421,7 @@ def test_filter_genes_dispersion_compare_to_seurat():
 
 
 def test_highly_variable_genes_batches():
-    adata = sc.datasets.pbmc68k_reduced()
+    adata = pbmc68k_reduced().copy()
     adata[:100, :100].X = np.zeros((100, 100))
 
     adata.obs['batch'] = ['0' if i < 100 else '1' for i in range(adata.n_obs)]
@@ -468,7 +470,7 @@ def test_highly_variable_genes_batches():
 
 
 def test_seurat_v3_mean_var_output_with_batchkey():
-    pbmc = sc.datasets.pbmc3k()
+    pbmc = pbmc3k().copy()
     pbmc.var_names_make_unique()
     n_cells = pbmc.shape[0]
     batch = np.zeros((n_cells), dtype=int)

From d3a07cb9aa3e14449c7fea2cf7dbfb513e1ddfc5 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 28 Feb 2022 15:35:49 +0100
Subject: [PATCH 81/96] use new cache dataset module for tests

---
 scanpy/tests/_data/_cached_datasets.py        |  2 +
 scanpy/tests/test_clustering.py               |  3 +-
 scanpy/tests/test_dendrogram_key_added.py     |  3 +-
 scanpy/tests/test_deprecations.py             |  3 +-
 scanpy/tests/test_embedding.py                | 12 +--
 scanpy/tests/test_embedding_density.py        |  4 +-
 scanpy/tests/test_filter_rank_genes_groups.py |  2 +-
 scanpy/tests/test_get.py                      |  4 +-
 scanpy/tests/test_highly_variable_genes.py    | 10 +--
 scanpy/tests/test_ingest.py                   |  4 +-
 scanpy/tests/test_metrics.py                  |  7 +-
 scanpy/tests/test_neighbors_key_added.py      |  4 +-
 scanpy/tests/test_paga.py                     |  8 +-
 scanpy/tests/test_plotting.py                 | 75 ++++++++++---------
 scanpy/tests/test_preprocessing.py            |  5 +-
 scanpy/tests/test_queries.py                  |  5 +-
 scanpy/tests/test_rank_genes_groups.py        |  6 +-
 scanpy/tests/test_score_genes.py              |  3 +-
 18 files changed, 88 insertions(+), 72 deletions(-)

diff --git a/scanpy/tests/_data/_cached_datasets.py b/scanpy/tests/_data/_cached_datasets.py
index 0ccdf00b75..f66bece206 100644
--- a/scanpy/tests/_data/_cached_datasets.py
+++ b/scanpy/tests/_data/_cached_datasets.py
@@ -17,3 +17,5 @@ def wrapper():
 pbmc3k = cached_dataset(sc.datasets.pbmc3k)
 pbmc68k_reduced = cached_dataset(sc.datasets.pbmc68k_reduced)
 pbmc3k_processed = cached_dataset(sc.datasets.pbmc3k_processed)
+krumsiek11 = cached_dataset(sc.datasets.krumsiek11)
+paul15 = cached_dataset(sc.datasets.paul15)
diff --git a/scanpy/tests/test_clustering.py b/scanpy/tests/test_clustering.py
index 1bb65d27ce..e0cbca4be2 100644
--- a/scanpy/tests/test_clustering.py
+++ b/scanpy/tests/test_clustering.py
@@ -1,10 +1,11 @@
 import pytest
 import scanpy as sc
+from scanpy.tests._data._cached_datasets import pbmc68k_reduced
 
 
 @pytest.fixture
 def adata_neighbors():
-    return sc.datasets.pbmc68k_reduced()
+    return pbmc68k_reduced()
 
 
 def test_leiden_basic(adata_neighbors):
diff --git a/scanpy/tests/test_dendrogram_key_added.py b/scanpy/tests/test_dendrogram_key_added.py
index 6d43042914..a656b03c6a 100644
--- a/scanpy/tests/test_dendrogram_key_added.py
+++ b/scanpy/tests/test_dendrogram_key_added.py
@@ -1,6 +1,7 @@
 import scanpy as sc
 import numpy as np
 import pytest
+from scanpy.tests._data._cached_datasets import pbmc68k_reduced
 
 n_neighbors = 5
 key = 'test'
@@ -8,7 +9,7 @@
 
 @pytest.fixture
 def adata():
-    return sc.AnnData(sc.datasets.pbmc68k_reduced())
+    return pbmc68k_reduced()
 
 
 @pytest.mark.parametrize('groupby', ['bulk_labels', ['bulk_labels', 'phase']])
diff --git a/scanpy/tests/test_deprecations.py b/scanpy/tests/test_deprecations.py
index 620c774bd6..5006779c3b 100644
--- a/scanpy/tests/test_deprecations.py
+++ b/scanpy/tests/test_deprecations.py
@@ -1,10 +1,11 @@
 import scanpy as sc
+from scanpy.tests._data._cached_datasets import pbmc68k_reduced
 
 import pytest
 
 
 def test_deprecate_multicore_tsne():
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
 
     with pytest.warns(
         UserWarning, match="calling tsne with n_jobs > 1 would use MulticoreTSNE"
diff --git a/scanpy/tests/test_embedding.py b/scanpy/tests/test_embedding.py
index 1f778959c3..9120bd09e0 100644
--- a/scanpy/tests/test_embedding.py
+++ b/scanpy/tests/test_embedding.py
@@ -1,7 +1,7 @@
 from importlib.util import find_spec
 from unittest.mock import patch
 import warnings
-
+from scanpy.tests._data._cached_datasets import pbmc68k_reduced
 import numpy as np
 import pytest
 from numpy.testing import assert_array_almost_equal, assert_array_equal, assert_raises
@@ -10,7 +10,7 @@
 
 
 def test_tsne():
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
 
     euclidean1 = sc.tl.tsne(pbmc, metric="euclidean", copy=True)
     with pytest.warns(UserWarning, match="In previous versions of scanpy"):
@@ -32,7 +32,7 @@ def test_tsne():
 
 
 def test_tsne_metric_warning():
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     import sklearn
 
     with patch.object(sklearn, "__version__", "0.23.0"), pytest.warns(
@@ -42,7 +42,7 @@ def test_tsne_metric_warning():
 
 
 def test_umap_init_dtype():
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     pbmc = pbmc[:100, :].copy()
     sc.tl.umap(pbmc, init_pos=pbmc.obsm["X_pca"][:, :2].astype(np.float32))
     embed1 = pbmc.obsm["X_umap"].copy()
@@ -57,7 +57,7 @@ def test_umap_init_dtype():
 
 @pytest.mark.parametrize("layout", [pytest.param("fa", marks=needs_fa2), "fr"])
 def test_umap_init_paga(layout):
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     pbmc = pbmc[:100, :].copy()
     sc.tl.paga(pbmc)
     sc.pl.paga(pbmc, layout=layout, show=False)
@@ -65,7 +65,7 @@ def test_umap_init_paga(layout):
 
 
 def test_diffmap():
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
 
     sc.tl.diffmap(pbmc)
     d1 = pbmc.obsm['X_diffmap'].copy()
diff --git a/scanpy/tests/test_embedding_density.py b/scanpy/tests/test_embedding_density.py
index 3e49f45f3f..e38bdc65db 100644
--- a/scanpy/tests/test_embedding_density.py
+++ b/scanpy/tests/test_embedding_density.py
@@ -1,6 +1,6 @@
 import numpy as np
 from anndata import AnnData
-
+from scanpy.tests._data._cached_datasets import pbmc68k_reduced
 import scanpy as sc
 
 
@@ -22,6 +22,6 @@ def test_embedding_density():
 
 def test_embedding_density_plot():
     # Test that sc.pl.embedding_density() runs without error
-    adata = sc.datasets.pbmc68k_reduced()
+    adata = pbmc68k_reduced()
     sc.tl.embedding_density(adata, 'umap')
     sc.pl.embedding_density(adata, 'umap', 'umap_density', show=False)
diff --git a/scanpy/tests/test_filter_rank_genes_groups.py b/scanpy/tests/test_filter_rank_genes_groups.py
index b6d7bb474c..7325d07376 100644
--- a/scanpy/tests/test_filter_rank_genes_groups.py
+++ b/scanpy/tests/test_filter_rank_genes_groups.py
@@ -1,6 +1,6 @@
 import numpy as np
 from scanpy.tools import rank_genes_groups, filter_rank_genes_groups
-from scanpy.datasets import pbmc68k_reduced
+from scanpy.tests._data._cached_datasets import pbmc68k_reduced
 
 
 names_no_reference = np.array(
diff --git a/scanpy/tests/test_get.py b/scanpy/tests/test_get.py
index 6fac0386c5..30a3eb84f5 100644
--- a/scanpy/tests/test_get.py
+++ b/scanpy/tests/test_get.py
@@ -9,7 +9,7 @@
 
 import scanpy as sc
 from scanpy.datasets._utils import filter_oldformatwarning
-
+from scanpy.tests._data._cached_datasets import pbmc68k_reduced
 
 TRANSPOSE_PARAMS = pytest.mark.parametrize(
     "dim,transform,func",
@@ -202,7 +202,7 @@ def test_backed_vs_memory():
 
 def test_column_content():
     "uses a larger dataset to test column order and content"
-    adata = sc.datasets.pbmc68k_reduced()
+    adata = pbmc68k_reduced()
 
     # test that columns content is correct for obs_df
     query = ['CST3', 'NKG7', 'GNLY', 'louvain', 'n_counts', 'n_genes']
diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py
index 068ebd3280..9483848051 100644
--- a/scanpy/tests/test_highly_variable_genes.py
+++ b/scanpy/tests/test_highly_variable_genes.py
@@ -275,7 +275,7 @@ def test_highly_variable_genes_pearson_residuals_batch(
 def test_higly_variable_genes_compare_to_seurat():
     seurat_hvg_info = pd.read_csv(FILE, sep=' ')
 
-    pbmc = pbmc68k_reduced().copy()
+    pbmc = pbmc68k_reduced()
     pbmc.X = pbmc.raw.X
     pbmc.var_names_make_unique()
 
@@ -316,7 +316,7 @@ def test_higly_variable_genes_compare_to_seurat_v3():
         FILE_V3, sep=' ', dtype={"variances_norm": np.float64}
     )
 
-    pbmc = pbmc3k().copy()
+    pbmc = pbmc3k()
     pbmc.var_names_make_unique()
 
     pbmc_dense = pbmc.copy()
@@ -379,7 +379,7 @@ def test_higly_variable_genes_compare_to_seurat_v3():
 def test_filter_genes_dispersion_compare_to_seurat():
     seurat_hvg_info = pd.read_csv(FILE, sep=' ')
 
-    pbmc = pbmc68k_reduced().copy()
+    pbmc = pbmc68k_reduced()
     pbmc.X = pbmc.raw.X
     pbmc.var_names_make_unique()
 
@@ -421,7 +421,7 @@ def test_filter_genes_dispersion_compare_to_seurat():
 
 
 def test_highly_variable_genes_batches():
-    adata = pbmc68k_reduced().copy()
+    adata = pbmc68k_reduced()
     adata[:100, :100].X = np.zeros((100, 100))
 
     adata.obs['batch'] = ['0' if i < 100 else '1' for i in range(adata.n_obs)]
@@ -470,7 +470,7 @@ def test_highly_variable_genes_batches():
 
 
 def test_seurat_v3_mean_var_output_with_batchkey():
-    pbmc = pbmc3k().copy()
+    pbmc = pbmc3k()
     pbmc.var_names_make_unique()
     n_cells = pbmc.shape[0]
     batch = np.zeros((n_cells), dtype=int)
diff --git a/scanpy/tests/test_ingest.py b/scanpy/tests/test_ingest.py
index a7ba765f98..8bd9c05be2 100644
--- a/scanpy/tests/test_ingest.py
+++ b/scanpy/tests/test_ingest.py
@@ -7,7 +7,7 @@
 import scanpy as sc
 from scanpy import settings
 from scanpy._compat import pkg_version
-
+from scanpy.tests._data._cached_datasets import pbmc68k_reduced
 
 X = np.array(
     [
@@ -25,7 +25,7 @@
 
 @pytest.fixture
 def adatas():
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     n_split = 500
     adata_ref = sc.AnnData(pbmc.X[:n_split, :], obs=pbmc.obs.iloc[:n_split])
     adata_new = sc.AnnData(pbmc.X[n_split:, :])
diff --git a/scanpy/tests/test_metrics.py b/scanpy/tests/test_metrics.py
index 025d788abb..4ace4a9c54 100644
--- a/scanpy/tests/test_metrics.py
+++ b/scanpy/tests/test_metrics.py
@@ -9,10 +9,11 @@
 
 from anndata.tests.helpers import asarray
 import pytest
+from scanpy.tests._data._cached_datasets import pbmc68k_reduced
 
 
 def test_gearys_c_consistency():
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     pbmc.layers["raw"] = pbmc.raw.X.copy()
     g = pbmc.obsp["connectivities"]
 
@@ -69,7 +70,7 @@ def test_gearys_c_correctness():
 
 
 def test_morans_i_consistency():
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     pbmc.layers["raw"] = pbmc.raw.X.copy()
     g = pbmc.obsp["connectivities"]
 
@@ -133,7 +134,7 @@ def test_morans_i_correctness():
 )
 def test_graph_metrics_w_constant_values(metric, array_type):
     # https://github.com/theislab/scanpy/issues/1806
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     XT = array_type(pbmc.raw.X.T.copy())
     g = pbmc.obsp["connectivities"].copy()
 
diff --git a/scanpy/tests/test_neighbors_key_added.py b/scanpy/tests/test_neighbors_key_added.py
index 310d7e5d62..3b2ee68a2c 100644
--- a/scanpy/tests/test_neighbors_key_added.py
+++ b/scanpy/tests/test_neighbors_key_added.py
@@ -5,10 +5,12 @@
 n_neighbors = 5
 key = 'test'
 
+from scanpy.tests._data._cached_datasets import pbmc68k_reduced
+
 
 @pytest.fixture
 def adata():
-    return sc.AnnData(sc.datasets.pbmc68k_reduced().X)
+    return sc.AnnData(pbmc68k_reduced().X)
 
 
 def test_neighbors_key_added(adata):
diff --git a/scanpy/tests/test_paga.py b/scanpy/tests/test_paga.py
index d4440a2884..fabfe243c4 100644
--- a/scanpy/tests/test_paga.py
+++ b/scanpy/tests/test_paga.py
@@ -5,7 +5,7 @@
 import numpy as np
 
 import scanpy as sc
-
+from scanpy.tests._data._cached_datasets import pbmc68k_reduced, pbmc3k_processed
 import pytest
 
 HERE: Path = Path(__file__).parent
@@ -15,7 +15,7 @@
 
 @pytest.fixture(scope="module")
 def pbmc():
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     sc.tl.paga(pbmc, groups='bulk_labels')
     pbmc.obs['cool_feature'] = pbmc[:, 'CST3'].X.squeeze()
     return pbmc
@@ -81,7 +81,7 @@ def test_paga_compare(image_comparer):
     # Tests that https://github.com/theislab/scanpy/issues/1887 is fixed
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
 
-    pbmc = sc.datasets.pbmc3k_processed()
+    pbmc = pbmc3k_processed()
     sc.tl.paga(pbmc, groups="louvain")
 
     sc.pl.paga_compare(pbmc, basis="umap", show=False)
@@ -92,7 +92,7 @@ def test_paga_compare(image_comparer):
 def test_paga_positions_reproducible():
     """Check exact reproducibility and effect of random_state on paga positions"""
     # https://github.com/theislab/scanpy/issues/1859
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     sc.tl.paga(pbmc, "bulk_labels")
 
     a = pbmc.copy()
diff --git a/scanpy/tests/test_plotting.py b/scanpy/tests/test_plotting.py
index b0acd839f3..98611ab6f4 100644
--- a/scanpy/tests/test_plotting.py
+++ b/scanpy/tests/test_plotting.py
@@ -6,7 +6,12 @@
 import pytest
 from matplotlib.testing import setup
 from packaging import version
-
+from scanpy.tests._data._cached_datasets import (
+    pbmc3k,
+    pbmc3k_processed,
+    pbmc68k_reduced,
+    krumsiek11,
+)
 from scanpy._compat import pkg_version
 
 setup()
@@ -38,7 +43,7 @@
 def test_heatmap(image_comparer):
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
 
-    adata = sc.datasets.krumsiek11()
+    adata = krumsiek11()
     sc.pl.heatmap(
         adata, adata.var_names, 'cell_type', use_raw=False, show=False, dendrogram=True
     )
@@ -100,7 +105,7 @@ def test_heatmap(image_comparer):
     save_and_compare_images('master_heatmap_std_scale_obs')
 
     # test var_names as dict
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     sc.tl.leiden(pbmc, key_added="clusters", resolution=0.5)
     # call umap to trigger colors for the clusters
     sc.pl.umap(pbmc, color="clusters")
@@ -152,7 +157,7 @@ def test_heatmap(image_comparer):
 )
 def test_clustermap(image_comparer, obs_keys, name):
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
-    adata = sc.datasets.krumsiek11()
+    adata = krumsiek11()
     sc.pl.clustermap(adata, obs_keys)
     save_and_compare_images(name)
 
@@ -312,7 +317,7 @@ def test_clustermap(image_comparer, obs_keys, name):
 def test_dotplot_matrixplot_stacked_violin(image_comparer, id, fn):
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
 
-    adata = sc.datasets.krumsiek11()
+    adata = krumsiek11()
     adata.obs['numeric_column'] = adata.X[:, 0]
     adata.layers['test'] = -1 * adata.X.copy()
     genes_dict = {
@@ -331,7 +336,7 @@ def test_dotplot_matrixplot_stacked_violin(image_comparer, id, fn):
 def test_dotplot_obj(image_comparer):
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
     # test dotplot dot_min, dot_max, color_map, and var_groups
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     genes = [
         'CD79A',
         'MS4A1',
@@ -369,7 +374,7 @@ def test_dotplot_obj(image_comparer):
 
 def test_matrixplot_obj(image_comparer):
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
-    adata = sc.datasets.pbmc68k_reduced()
+    adata = pbmc68k_reduced()
     marker_genes_dict = {
         "3": ["GNLY", "NKG7"],
         "1": ["FCER1A"],
@@ -396,7 +401,7 @@ def test_matrixplot_obj(image_comparer):
 def test_stacked_violin_obj(image_comparer, plt):
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=26)
 
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     markers = {
         'T-cell': ['CD3D', 'CD3E', 'IL32'],
         'B-cell': ['CD79A', 'CD79B', 'MS4A1'],
@@ -417,7 +422,7 @@ def test_stacked_violin_obj(image_comparer, plt):
 def test_tracksplot(image_comparer):
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
 
-    adata = sc.datasets.krumsiek11()
+    adata = krumsiek11()
     sc.pl.tracksplot(
         adata, adata.var_names, 'cell_type', dendrogram=True, use_raw=False
     )
@@ -428,7 +433,7 @@ def test_multiple_plots(image_comparer):
     # only testing stacked_violin, matrixplot and dotplot
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
 
-    adata = sc.datasets.pbmc68k_reduced()
+    adata = pbmc68k_reduced()
     markers = {
         'T-cell': ['CD3D', 'CD3E', 'IL32'],
         'B-cell': ['CD79A', 'CD79B', 'MS4A1'],
@@ -474,7 +479,7 @@ def test_violin(image_comparer):
         sc.pl.set_rcParams_defaults()
         sc.set_figure_params(dpi=50, color_map='viridis')
 
-        pbmc = sc.datasets.pbmc68k_reduced()
+        pbmc = pbmc68k_reduced()
         sc.pl.violin(
             pbmc,
             ['n_genes', 'percent_mito', 'n_counts'],
@@ -523,7 +528,7 @@ def test_violin_without_raw(tmpdir):
     has_raw_pth = TESTDIR / "has_raw.png"
     no_raw_pth = TESTDIR / "no_raw.png"
 
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     pbmc_no_raw = pbmc.raw.to_adata().copy()
 
     sc.pl.violin(pbmc, 'CST3', groupby="bulk_labels", show=False, jitter=False)
@@ -540,7 +545,7 @@ def test_violin_without_raw(tmpdir):
 def test_dendrogram(image_comparer):
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=10)
 
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     sc.pl.dendrogram(pbmc, 'bulk_labels')
     save_and_compare_images('dendrogram')
 
@@ -548,7 +553,7 @@ def test_dendrogram(image_comparer):
 def test_correlation(image_comparer):
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
 
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     sc.pl.correlation_matrix(pbmc, 'bulk_labels')
     save_and_compare_images('correlation')
 
@@ -772,7 +777,7 @@ def test_correlation(image_comparer):
 def test_rank_genes_groups(image_comparer, name, fn):
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
 
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     sc.tl.rank_genes_groups(pbmc, 'louvain', n_genes=pbmc.raw.shape[1])
 
     # add gene symbol
@@ -791,8 +796,8 @@ def gene_symbols_adatas():
     Both have ensembl ids and hgnc symbols as columns in var. The first has ensembl
     ids as var_names, the second has symbols.
     """
-    pbmc = sc.datasets.pbmc3k_processed().raw.to_adata()
-    pbmc_counts = sc.datasets.pbmc3k()
+    pbmc = pbmc3k_processed().raw.to_adata()
+    pbmc_counts = pbmc3k()
 
     pbmc.layers["counts"] = pbmc_counts[pbmc.obs_names, pbmc.var_names].X.copy()
     pbmc.var["gene_symbol"] = pbmc.var_names
@@ -877,7 +882,7 @@ def test_rank_genes_groups_plots_n_genes_vs_var_names(tmpdir, func, check_same_i
     var_names as a dict works.
     """
     N = 3
-    pbmc = sc.datasets.pbmc68k_reduced().raw.to_adata()
+    pbmc = pbmc68k_reduced().raw.to_adata()
     groups = pbmc.obs["louvain"].cat.categories[:3]
     pbmc = pbmc[pbmc.obs["louvain"].isin(groups)][::3].copy()
 
@@ -929,7 +934,7 @@ def wrapped(pth, **kwargs):
 def test_genes_symbols(image_comparer, id, fn):
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
 
-    adata = sc.datasets.krumsiek11()
+    adata = krumsiek11()
 
     # add a 'symbols' column
     adata.var['symbols'] = adata.var.index.map(lambda x: "symbol_{}".format(x))
@@ -941,7 +946,7 @@ def test_genes_symbols(image_comparer, id, fn):
 
 @pytest.fixture(scope="module")
 def pbmc_scatterplots():
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     pbmc.layers["sparse"] = pbmc.raw.X / 2
     pbmc.layers["test"] = pbmc.X.copy() + 100
     pbmc.var["numbers"] = [str(x) for x in range(pbmc.shape[1])]
@@ -1069,7 +1074,7 @@ def test_scatter_embedding_groups_and_size(image_comparer):
     # plotted on top. This new ordering requires that the size
     # vector is also ordered (if given).
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     sc.pl.embedding(
         pbmc,
         'umap',
@@ -1082,7 +1087,7 @@ def test_scatter_embedding_groups_and_size(image_comparer):
 
 def test_scatter_embedding_add_outline_vmin_vmax_norm(image_comparer, check_same_image):
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
 
     sc.pl.embedding(
         pbmc,
@@ -1183,7 +1188,7 @@ def test_scatter_embedding_add_outline_vmin_vmax_norm(image_comparer, check_same
 
 
 def test_timeseries():
-    adata = sc.datasets.pbmc68k_reduced()
+    adata = pbmc68k_reduced()
     sc.pp.neighbors(adata, n_neighbors=5, method='gauss', knn=False)
     sc.tl.diffmap(adata)
     sc.tl.dpt(adata, n_branchings=1, n_dcs=10)
@@ -1191,7 +1196,7 @@ def test_timeseries():
 
 
 def test_scatter_raw(tmp_path):
-    pbmc = sc.datasets.pbmc68k_reduced()[:100].copy()
+    pbmc = pbmc68k_reduced()[:100].copy()
     raw_pth = tmp_path / "raw.png"
     x_pth = tmp_path / "X.png"
 
@@ -1208,7 +1213,7 @@ def test_scatter_raw(tmp_path):
 
 
 def test_scatter_specify_layer_and_raw():
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     pbmc.layers["layer"] = pbmc.raw.X.copy()
     with pytest.raises(ValueError):
         sc.pl.umap(pbmc, color="HES4", use_raw=True, layer="layer")
@@ -1217,7 +1222,7 @@ def test_scatter_specify_layer_and_raw():
 def test_scatter_no_basis_per_obs(image_comparer):
     """Test scatterplot of per-obs points with no basis"""
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     sc.pl.scatter(pbmc, x="HES4", y="percent_mito", color="n_genes", use_raw=False)
     save_and_compare_images("scatter_HES_percent_mito_n_genes")
 
@@ -1225,14 +1230,14 @@ def test_scatter_no_basis_per_obs(image_comparer):
 def test_scatter_no_basis_per_var(image_comparer):
     """Test scatterplot of per-var points with no basis"""
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     sc.pl.scatter(pbmc, x="AAAGCCTGGCTAAC-1", y="AAATTCGATGCACA-1", use_raw=False)
     save_and_compare_images("scatter_AAAGCCTGGCTAAC-1_vs_AAATTCGATGCACA-1")
 
 
 @pytest.fixture
 def pbmc_filtered():
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     sc.pp.filter_genes(pbmc, min_cells=10)
     return pbmc
 
@@ -1287,7 +1292,7 @@ def test_scatter_no_basis_value_error(pbmc_filtered, x, y, color, use_raw):
 def test_rankings(image_comparer):
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
 
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     sc.pp.pca(pbmc)
     sc.pl.pca_loadings(pbmc)
     save_and_compare_images('master_pca_loadings')
@@ -1357,7 +1362,7 @@ def test_no_copy():
     # https://github.com/theislab/scanpy/issues/1000
     # Tests that plotting functions don't make a copy from a view unless they
     # actually have to
-    actual = sc.datasets.pbmc68k_reduced()
+    actual = pbmc68k_reduced()
     sc.pl.umap(actual, color=["bulk_labels", "louvain"], show=False)  # Set colors
 
     view = actual[np.random.choice(actual.obs_names, size=actual.shape[0] // 5), :]
@@ -1395,7 +1400,7 @@ def test_no_copy():
 
 def test_groupby_index(image_comparer):
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
 
     genes = [
         'CD79A',
@@ -1420,7 +1425,7 @@ def test_groupby_index(image_comparer):
 # test category order when groupby is a list (#1735)
 def test_groupby_list(image_comparer):
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=30)
-    adata = sc.datasets.krumsiek11()
+    adata = krumsiek11()
 
     np.random.seed(1)
 
@@ -1441,7 +1446,7 @@ def test_color_cycler(caplog):
     # https://github.com/theislab/scanpy/issues/1885
     import logging
 
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     colors = sns.color_palette("deep")
     cyl = sns.rcmod.cycler('color', sns.color_palette("deep"))
 
@@ -1469,7 +1474,7 @@ def test_filter_rank_genes_groups_plots(tmpdir, plot, check_same_image):
     TESTDIR = Path(tmpdir)
     N_GENES = 4
 
-    adata = sc.datasets.pbmc68k_reduced()
+    adata = pbmc68k_reduced()
 
     sc.tl.rank_genes_groups(adata, 'bulk_labels', method='wilcoxon', pts=True)
 
@@ -1504,7 +1509,7 @@ def test_filter_rank_genes_groups_plots(tmpdir, plot, check_same_image):
 def test_scrublet_plots(image_comparer, plt):
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=30)
 
-    adata = sc.datasets.pbmc3k()
+    adata = pbmc3k()
     sc.external.pp.scrublet(adata, use_approx_neighbors=False)
 
     sc.external.pl.scrublet_score_distribution(adata, return_fig=True)
diff --git a/scanpy/tests/test_preprocessing.py b/scanpy/tests/test_preprocessing.py
index b9decb0579..e3e6712400 100644
--- a/scanpy/tests/test_preprocessing.py
+++ b/scanpy/tests/test_preprocessing.py
@@ -11,6 +11,7 @@
 from anndata.tests.helpers import assert_equal, asarray
 
 from scanpy.tests.helpers import check_rep_mutation, check_rep_results
+from scanpy.tests._data._cached_datasets import pbmc68k_reduced
 
 
 def test_log1p(tmp_path):
@@ -115,7 +116,7 @@ def test_subsample_copy():
 
 
 def test_scale():
-    adata = sc.datasets.pbmc68k_reduced()
+    adata = pbmc68k_reduced()
     adata.X = adata.raw.X
     v = adata[:, 0 : adata.shape[1] // 2]
     # Should turn view to copy https://github.com/theislab/anndata/issues/171#issuecomment-508689965
@@ -336,7 +337,7 @@ def test_downsample_total_counts(count_matrix_format, replace, dtype):
 
 def test_recipe_weinreb():
     # Just tests for failure for now
-    adata = sc.datasets.pbmc68k_reduced().raw.to_adata()
+    adata = pbmc68k_reduced().raw.to_adata()
     adata.X = adata.X.toarray()
 
     orig = adata.copy()
diff --git a/scanpy/tests/test_queries.py b/scanpy/tests/test_queries.py
index e4ef9cc69d..09c9aef0db 100644
--- a/scanpy/tests/test_queries.py
+++ b/scanpy/tests/test_queries.py
@@ -1,11 +1,12 @@
 import pandas as pd
 import pytest
 import scanpy as sc
+from scanpy.tests._data._cached_datasets import pbmc68k_reduced
 
 
 @pytest.mark.internet
 def test_enrich():
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     sc.tl.rank_genes_groups(pbmc, "louvain", n_genes=pbmc.shape[1])
     enrich_anndata = sc.queries.enrich(pbmc, "1")
     de = pd.DataFrame()
@@ -29,7 +30,7 @@ def test_enrich():
 
 @pytest.mark.internet
 def test_mito_genes():
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     mt_genes = sc.queries.mitochondrial_genes("hsapiens")
     assert (
         pbmc.var_names.isin(mt_genes["external_gene_name"]).sum() == 1
diff --git a/scanpy/tests/test_rank_genes_groups.py b/scanpy/tests/test_rank_genes_groups.py
index febe80a4b4..9daa73de9c 100644
--- a/scanpy/tests/test_rank_genes_groups.py
+++ b/scanpy/tests/test_rank_genes_groups.py
@@ -16,7 +16,7 @@
 from scanpy.tools import rank_genes_groups
 from scanpy.tools._rank_genes_groups import _RankGenes
 from scanpy.get import rank_genes_groups_df
-from scanpy.datasets import pbmc68k_reduced
+from scanpy.tests._data._cached_datasets import pbmc68k_reduced
 from scanpy._utils import select_groups
 
 
@@ -216,12 +216,12 @@ def test_results_layers():
 
 def test_rank_genes_groups_use_raw():
     # https://github.com/theislab/scanpy/issues/1929
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     assert pbmc.raw is not None
 
     sc.tl.rank_genes_groups(pbmc, groupby="bulk_labels", use_raw=True)
 
-    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc = pbmc68k_reduced()
     del pbmc.raw
     assert pbmc.raw is None
 
diff --git a/scanpy/tests/test_score_genes.py b/scanpy/tests/test_score_genes.py
index 6f38237f86..56a9e4ac41 100644
--- a/scanpy/tests/test_score_genes.py
+++ b/scanpy/tests/test_score_genes.py
@@ -5,6 +5,7 @@
 import pytest
 import pickle
 from pathlib import Path
+from scanpy.tests._data._cached_datasets import paul15
 
 HERE = Path(__file__).parent / Path('_data/')
 
@@ -54,7 +55,7 @@ def test_score_with_reference():
     and stored as a pickle object in ./data
     """
 
-    adata = sc.datasets.paul15()
+    adata = paul15()
     sc.pp.normalize_per_cell(adata, counts_per_cell_after=10000)
     sc.pp.scale(adata)
 

From bdd37cdf329d81157e271561a6028fe956f260a8 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 28 Feb 2022 15:43:32 +0100
Subject: [PATCH 82/96] fix precommit

---
 scanpy/experimental/pp/_highly_variable_genes.py | 2 +-
 scanpy/experimental/pp/_normalization.py         | 2 +-
 scanpy/tests/test_normalization.py               | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py
index e03beae206..9bf5bfbd06 100644
--- a/scanpy/experimental/pp/_highly_variable_genes.py
+++ b/scanpy/experimental/pp/_highly_variable_genes.py
@@ -121,7 +121,7 @@ def _highly_variable_pearson_residuals(
             stop = start + chunksize
             mu = np.array(sums_cells @ sums_genes[:, start:stop] / sum_total)
             X_dense = X_batch[:, start:stop].toarray()
-            residuals = (X_dense - mu) / np.sqrt(mu + mu ** 2 / theta)
+            residuals = (X_dense - mu) / np.sqrt(mu + mu**2 / theta)
             residuals = np.clip(residuals, a_min=-clip, a_max=clip)
             residual_gene_var[start:stop] = np.var(residuals, axis=0)
 
diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py
index ef0ca75b65..d076f03ffd 100644
--- a/scanpy/experimental/pp/_normalization.py
+++ b/scanpy/experimental/pp/_normalization.py
@@ -54,7 +54,7 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False):
 
     mu = np.array(sums_cells @ sums_genes / sum_total)
     diff = np.array(X - mu)
-    residuals = diff / np.sqrt(mu + mu ** 2 / theta)
+    residuals = diff / np.sqrt(mu + mu**2 / theta)
 
     # clip
     residuals = np.clip(residuals, a_min=-clip, a_max=clip)
diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py
index 21588a2610..8a3fc9d357 100644
--- a/scanpy/tests/test_normalization.py
+++ b/scanpy/tests/test_normalization.py
@@ -118,7 +118,7 @@ def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip):
         residuals_reference = (X - mu) / np.sqrt(mu)
     else:
         # NB case
-        residuals_reference = (X - mu) / np.sqrt(mu + mu ** 2 / theta)
+        residuals_reference = (X - mu) / np.sqrt(mu + mu**2 / theta)
 
     # compute output to test
     adata = AnnData(sparsity_func(X), dtype=dtype)

From aba3906d3959cc397fee1264e530300b36b162cf Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Mon, 28 Feb 2022 16:25:35 +0100
Subject: [PATCH 83/96] fix docs

---
 scanpy/experimental/_docs.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/scanpy/experimental/_docs.py b/scanpy/experimental/_docs.py
index ad05c56c7e..40dd6f119a 100644
--- a/scanpy/experimental/_docs.py
+++ b/scanpy/experimental/_docs.py
@@ -18,8 +18,7 @@
     * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \
     where n is the number of cells in the dataset (default behavior).
     * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
-    `clip=np.Inf` for no clipping.0
-
+    `clip=np.Inf` for no clipping.
 """
 
 doc_layer = """\

From c9dbf489bb465b385958e2b4a7736b65c8a56b43 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Wed, 9 Mar 2022 21:30:58 +0100
Subject: [PATCH 84/96] fix reference and add notebook to tutorials

---
 docs/references.rst | 4 ++++
 docs/tutorials.rst  | 5 +++++
 2 files changed, 9 insertions(+)

diff --git a/docs/references.rst b/docs/references.rst
index 7c09607f44..e1adc6012c 100644
--- a/docs/references.rst
+++ b/docs/references.rst
@@ -268,3 +268,7 @@ References
 .. [Zunder15] Zunder *et al.* (2015),
    *A continuous molecular roadmap to iPSC reprogramming through progression analysis of single-cell mass cytometry*,
    `Cell Stem Cell <https://doi.org/10.1016/j.stem.2015.01.015>`__.
+
+.. [Lause21] Lause *et al.* (2021),
+   *Analytic Pearson residuals for normalization of single-cell RNA-seq UMI data*,
+   `Genome Biology <https://doi.org/10.1186/s13059-021-02451-7>`__.
diff --git a/docs/tutorials.rst b/docs/tutorials.rst
index 8c304332c2..95f2527093 100644
--- a/docs/tutorials.rst
+++ b/docs/tutorials.rst
@@ -101,6 +101,11 @@ See the `cell cycle`_ notebook.
    :width: 120px
    :align: right
 
+Normalization with Pearson Residuals
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Normalization of scRNA-seq data with Pearson Residuals, from [Lause21]_: :tutorial:`tutorial_pearson_residuals`
+
 
 Scaling Computations
 ~~~~~~~~~~~~~~~~~~~~

From e335966b762c745937117e72786cba2401176714 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Wed, 9 Mar 2022 21:33:02 +0100
Subject: [PATCH 85/96] add release note

---
 docs/release-notes/1.9.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/release-notes/1.9.0.rst b/docs/release-notes/1.9.0.rst
index 3d1c73beda..a0061736c5 100644
--- a/docs/release-notes/1.9.0.rst
+++ b/docs/release-notes/1.9.0.rst
@@ -5,3 +5,4 @@
 
 - :func:`~scanpy.tl.filter_rank_genes_groups` now allows to filter with absolute values of log fold change :pr:`1649` :smaller:`S Rybakov`
 - :func:`~scanpy.pl.embedding_density` now allows more than 10 groups :pr:`1936` :smaller:`A Wolf`
+- :mod:`~scanpy.experimental` new Scanpy experimental module with Pearson Residuals method for normalization and HVG selection :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`

From bf7fb25cd5f825f095dbdac7f2d820c27093099a Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Wed, 9 Mar 2022 21:43:14 +0100
Subject: [PATCH 86/96] add release note

---
 docs/release-notes/1.9.0.rst | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/docs/release-notes/1.9.0.rst b/docs/release-notes/1.9.0.rst
index a0061736c5..842cf95ff2 100644
--- a/docs/release-notes/1.9.0.rst
+++ b/docs/release-notes/1.9.0.rst
@@ -5,4 +5,11 @@
 
 - :func:`~scanpy.tl.filter_rank_genes_groups` now allows to filter with absolute values of log fold change :pr:`1649` :smaller:`S Rybakov`
 - :func:`~scanpy.pl.embedding_density` now allows more than 10 groups :pr:`1936` :smaller:`A Wolf`
-- :mod:`~scanpy.experimental` new Scanpy experimental module with Pearson Residuals method for normalization and HVG selection :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
+
+.. rubric:: Experimental module
+
+- Added :mod:`scanpy.experimental` module!
+
+    - Added :func:`scanpy.experimental.pp.normalization_pearson_residuals` for Perason Residuals normalization :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
+    - Added :func:`scanpy.experimental.pp.highly_variable_genes` for HVG selection with Pearson Residuals :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
+    - Added :func:`scanpy.experimental.pp.recipe_pearson_residuals` for Pearson Residuals normalization, HVG selection and dimensionality reduction with PCA  :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`

From 1045d984973eed5b6b48c953caea909739e2a074 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Wed, 9 Mar 2022 21:45:00 +0100
Subject: [PATCH 87/96] fix release note

---
 docs/release-notes/1.9.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/release-notes/1.9.0.rst b/docs/release-notes/1.9.0.rst
index 842cf95ff2..304341e220 100644
--- a/docs/release-notes/1.9.0.rst
+++ b/docs/release-notes/1.9.0.rst
@@ -10,6 +10,6 @@
 
 - Added :mod:`scanpy.experimental` module!
 
-    - Added :func:`scanpy.experimental.pp.normalization_pearson_residuals` for Perason Residuals normalization :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
+    - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals` for Perason Residuals normalization :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
     - Added :func:`scanpy.experimental.pp.highly_variable_genes` for HVG selection with Pearson Residuals :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
     - Added :func:`scanpy.experimental.pp.recipe_pearson_residuals` for Pearson Residuals normalization, HVG selection and dimensionality reduction with PCA  :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`

From f7d4c49cf3a38d498a4bde120b55c017d0d98a3b Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Wed, 9 Mar 2022 21:45:23 +0100
Subject: [PATCH 88/96] typo

---
 docs/release-notes/1.9.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/release-notes/1.9.0.rst b/docs/release-notes/1.9.0.rst
index 304341e220..d225cf09f2 100644
--- a/docs/release-notes/1.9.0.rst
+++ b/docs/release-notes/1.9.0.rst
@@ -10,6 +10,6 @@
 
 - Added :mod:`scanpy.experimental` module!
 
-    - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals` for Perason Residuals normalization :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
+    - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals` for Pearson Residuals normalization :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
     - Added :func:`scanpy.experimental.pp.highly_variable_genes` for HVG selection with Pearson Residuals :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
     - Added :func:`scanpy.experimental.pp.recipe_pearson_residuals` for Pearson Residuals normalization, HVG selection and dimensionality reduction with PCA  :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`

From 5f76cdfb4bd94760d406c7bed7619df6d538363b Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Wed, 9 Mar 2022 22:05:31 +0100
Subject: [PATCH 89/96] remove duplicate reference

---
 docs/references.rst | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/docs/references.rst b/docs/references.rst
index e1adc6012c..7c09607f44 100644
--- a/docs/references.rst
+++ b/docs/references.rst
@@ -268,7 +268,3 @@ References
 .. [Zunder15] Zunder *et al.* (2015),
    *A continuous molecular roadmap to iPSC reprogramming through progression analysis of single-cell mass cytometry*,
    `Cell Stem Cell <https://doi.org/10.1016/j.stem.2015.01.015>`__.
-
-.. [Lause21] Lause *et al.* (2021),
-   *Analytic Pearson residuals for normalization of single-cell RNA-seq UMI data*,
-   `Genome Biology <https://doi.org/10.1186/s13059-021-02451-7>`__.

From 19b018cf79d74fc938eff201596ee18f79b04268 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Sat, 12 Mar 2022 13:53:31 +0100
Subject: [PATCH 90/96] fixing black flake etc requirements

---
 scanpy/experimental/_docs.py                  | 36 ++++++---
 .../experimental/pp/_highly_variable_genes.py | 17 +++--
 scanpy/experimental/pp/_normalization.py      | 75 +++++++++----------
 scanpy/experimental/pp/_recipes.py            | 25 +++----
 4 files changed, 84 insertions(+), 69 deletions(-)

diff --git a/scanpy/experimental/_docs.py b/scanpy/experimental/_docs.py
index 40dd6f119a..d040bd1262 100644
--- a/scanpy/experimental/_docs.py
+++ b/scanpy/experimental/_docs.py
@@ -10,21 +10,24 @@
 doc_dist_params = """\
 theta
     The negative binomial overdispersion parameter theta for Pearson residuals.
-    Higher values correspond to less overdispersion (var = mean + mean^2/theta),
-    and `theta=np.Inf` corresponds to a Poisson model.
+    Higher values correspond to less overdispersion \
+    (`var = mean + mean^2/theta`), and `theta=np.Inf` corresponds to a Poisson model.
 clip
     Determines if and how residuals are clipped:
 
-    * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \
-    where n is the number of cells in the dataset (default behavior).
-    * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
+    * If `None`, residuals are clipped to the interval \
+    `[-sqrt(n_obs), sqrt(n_obs)]`, where `n_obs` is the number of cells in the dataset (default behavior).
+    * If any scalar `c`, residuals are clipped to the interval `[-c, c]`. Set \
     `clip=np.Inf` for no clipping.
 """
 
-doc_layer = """\
+doc_check_values = """\
 check_values
-    Check if counts in selected layer are integers. A Warning is returned if set to
-    True.
+    Check if counts in selected layer are integers. A warning is returned if set to
+    `True`.
+"""
+
+doc_layer = """\
 layer
     Layer to normalize instead of `X`. If `None`, `X` is normalized.
 """
@@ -52,13 +55,24 @@
     the required memory.
 """
 
+doc_pca_chunk = """\
+n_comps
+    Number of principal components to compute in the PCA step.
+random_state
+    Change to use different initial states for the optimization in the PCA step.
+kwargs_pca
+    Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`.
+"""
+
 doc_inplace = """\
 inplace
-    Whether to update `adata` or return dictionary with normalized copies
-    of `adata.X` and `adata.layers`.
+    If `True`, update `adata` with results. Otherwise, return results. See below for
+    details of what is returned.
 """
 
 doc_copy = """\
 copy
-    Whether to modify copied input object. Not compatible with `inplace=False`.
+    If `True`, the function runs on a copy of the input object and returns the
+    modified copy. Otherwise, the input object is modified directly. Not compatible
+    with `inplace=False`.
 """
diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py
index 9bf5bfbd06..1becb2a1d1 100644
--- a/scanpy/experimental/pp/_highly_variable_genes.py
+++ b/scanpy/experimental/pp/_highly_variable_genes.py
@@ -21,6 +21,7 @@
     doc_adata,
     doc_dist_params,
     doc_genes_batch_chunk,
+    doc_check_values,
     doc_layer,
     doc_copy,
     doc_inplace,
@@ -44,8 +45,8 @@ def _highly_variable_pearson_residuals(
 
     Returns
     -------
-    Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`)
-    or updates `.var` with the following fields:
+    If `inplace=True`, `adata.var` is updated with the following fields. Otherwise,
+    returns the same fields as :class:`~pandas.DataFrame`.
 
     highly_variable : bool
         boolean indicator of highly-variable genes
@@ -216,6 +217,7 @@ def _highly_variable_pearson_residuals(
     adata=doc_adata,
     dist_params=doc_dist_params,
     genes_batch_chunk=doc_genes_batch_chunk,
+    check_values=doc_check_values,
     layer=doc_layer,
     inplace=doc_inplace,
 )
@@ -237,9 +239,9 @@ def highly_variable_genes(
     Annotate highly variable genes using analytic Pearson residuals [Lause21]_.
 
     For [Lause21]_, Pearson residuals of a negative binomial offset model (with
-    overdispersion theta shared across genes) are computed. By default, overdispersion
-    theta=100 is used and residuals are clipped to sqrt(n). Finally, genes are ranked
-    by residual variance.
+    overdispersion theta shared across genes) are computed. By default,
+    overdispersion `theta=100` is used and residuals are clipped to `sqrt(n_obs)`.
+    Finally, genes are ranked by residual variance.
 
     Expects raw count input.
 
@@ -251,6 +253,7 @@ def highly_variable_genes(
     flavor
         Choose the flavor for identifying highly variable genes. In this experimental
         version, only 'pearson_residuals' is functional.
+    {check_values}
     {layer}
     subset
         Inplace subset to highly-variable genes if `True` otherwise merely indicate
@@ -259,8 +262,8 @@ def highly_variable_genes(
 
     Returns
     -------
-    Depending on `inplace` returns calculated metrics (:class:`~pandas.DataFrame`) or
-    updates `.var` with the following fields
+    If `inplace=True`, `adata.var` is updated with the following fields. Otherwise,
+    returns the same fields as :class:`~pandas.DataFrame`.
 
     highly_variable : bool
         boolean indicator of highly-variable genes
diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py
index d076f03ffd..ab26ae2af3 100644
--- a/scanpy/experimental/pp/_normalization.py
+++ b/scanpy/experimental/pp/_normalization.py
@@ -16,8 +16,10 @@
     doc_adata,
     doc_dist_params,
     doc_layer,
+    doc_check_values,
     doc_copy,
     doc_inplace,
+    doc_pca_chunk,
 )
 
 
@@ -65,6 +67,7 @@ def _pearson_residuals(X, theta, clip, check_values, copy=False):
 @_doc_params(
     adata=doc_adata,
     dist_params=doc_dist_params,
+    check_values=doc_check_values,
     layer=doc_layer,
     inplace=doc_inplace,
     copy=doc_copy,
@@ -83,8 +86,8 @@ def normalize_pearson_residuals(
     Applies analytic Pearson residual normalization, based on [Lause21]_.
 
     The residuals are based on a negative binomial offset model with overdispersion
-    `theta` shared across genes. By default, residuals are clipped to sqrt(n) and
-    overdispersion `theta=100` is used.
+    `theta` shared across genes. By default, residuals are clipped to `sqrt(n_obs)`
+    and overdispersion `theta=100` is used.
 
     Expects raw count input.
 
@@ -92,15 +95,24 @@ def normalize_pearson_residuals(
     ------
     {adata}
     {dist_params}
+    {check_values}
     {layer}
     {inplace}
     {copy}
 
     Returns
     -------
-    Returns dictionary with Pearson residuals and settings
-    or updates `adata` with normalized version of the original
-    `adata.X` and `adata.layers`, depending on `inplace`.
+    If `inplace=True`, `adata.X` or the selected layer in `adata.layers` is updated
+    with the normalized values. `adata.uns` is updated with the following fields.
+    If `inplace=False`, the same fields are returned as dictionary with the
+    normalized values in `results_dict['X']`.
+
+    `.uns['pearson_residuals_normalization']['theta']`
+         The used value of the overdispersion parameter theta
+    `.uns['pearson_residuals_normalization']['clip']`
+         The used value of the clipping parameter
+    `.uns['pearson_residuals_normalization']['computed_on']`
+         The name of the layer on which the residuals were computed.
     """
 
     if copy:
@@ -132,6 +144,13 @@ def normalize_pearson_residuals(
         return results_dict
 
 
+@_doc_params(
+    adata=doc_adata,
+    dist_params=doc_dist_params,
+    pca_chunk=doc_pca_chunk,
+    check_values=doc_check_values,
+    inplace=doc_inplace,
+)
 def normalize_pearson_residuals_pca(
     adata: AnnData,
     *,
@@ -143,12 +162,12 @@ def normalize_pearson_residuals_pca(
     use_highly_variable: Optional[bool] = None,
     check_values: bool = True,
     inplace: bool = True,
-) -> Optional[pd.DataFrame]:
+) -> Optional[AnnData]:
     """\
     Applies analytic Pearson residual normalization and PCA, based on [Lause21]_.
 
     The residuals are based on a negative binomial offset model with overdispersion
-    `theta` shared across genes. By default, residuals are clipped to sqrt(n),
+    `theta` shared across genes. By default, residuals are clipped to `sqrt(n_obs)`,
     overdispersion `theta=100` is used, and PCA is run with 50 components.
 
     Operates on the subset of highly variable genes in `adata.var['highly_variable']`
@@ -157,42 +176,22 @@ def normalize_pearson_residuals_pca(
 
     Params
     ------
-    adata
-        The annotated data matrix of shape `n_obs` × `n_vars`.
-        Rows correspond to cells and columns to genes.
-    theta
-        The negative binomial overdispersion parameter theta for Pearson residuals.
-        Higher values correspond to less overdispersion (var = mean + mean^2/theta),
-        and `theta=np.Inf` corresponds to a Poisson model.
-    clip
-        Determines if and how residuals are clipped:
-
-            * If `None`, residuals are clipped to the interval [-sqrt(n), sqrt(n)], \
-            where n is the number of cells in the dataset (default behavior).
-            * If any scalar c, residuals are clipped to the interval [-c, c]. Set \
-            `clip=np.Inf` for no clipping.
-
-    n_comps
-        Number of principal components to compute for the PCA step.
-    random_state
-        Change to use different initial states for the optimization of the PCA step.
-    kwargs_pca
-        Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`.
+    {adata}
+    {dist_params}
+    {pca_chunk}
     use_highly_variable
-        Whether to use the gene selection in `adata.var['highly_variable']` to subset
-        the data before normalizing (default) or proceed on the full dataset.
-    check_values
-        Check if counts in selected layer are integers. A Warning is returned if set to
-        True.
-    inplace
-        Whether to place results in `adata` or return them.
+        If `True`, use gene selection present in `adata.var['highly_variable']` to
+        subset the data before normalizing (default). Otherwise, proceed on the full
+        dataset.
+    {check_values}
+    {inplace}
 
 
     Returns
     -------
-    If `inplace=False`, returns the Pearson residual-based PCA results
-    (`adata_pca`).
-    If `inplace=True`, updates `adata` with the following fields:
+    If `inplace=False`, returns the Pearson residual-based PCA results (`adata_pca`,
+    :class:`~anndata.AnnData`). If `inplace=True`, updates `adata` with the following
+    fields:
 
     `.uns['pearson_residuals_normalization']['pearson_residuals_df']`
          The hvg-subset, normalized by Pearson residuals
diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index 76336a1e4d..b2ccbf6335 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -8,7 +8,9 @@
     doc_adata,
     doc_dist_params,
     doc_genes_batch_chunk,
+    doc_pca_chunk,
     doc_layer,
+    doc_check_values,
     doc_inplace,
 )
 from scanpy._utils import _doc_params
@@ -18,7 +20,8 @@
     adata=doc_adata,
     dist_params=doc_dist_params,
     genes_batch_chunk=doc_genes_batch_chunk,
-    layer=doc_layer,
+    pca_chunk=doc_pca_chunk,
+    check_values=doc_check_values,
     inplace=doc_inplace,
 )
 def recipe_pearson_residuals(
@@ -34,7 +37,7 @@ def recipe_pearson_residuals(
     kwargs_pca: dict = {},
     check_values: bool = True,
     inplace: bool = True,
-) -> Optional[Tuple[pd.DataFrame, pd.DataFrame]]:
+) -> Optional[Tuple[AnnData, pd.DataFrame]]:
     """\
     Gene selection and normalization based on [Lause21]_.
 
@@ -49,20 +52,16 @@ def recipe_pearson_residuals(
     {adata}
     {dist_params}
     {genes_batch_chunk}
-    n_comps
-        Number of principal components to compute in the PCA step.
-    random_state
-        Change to use different initial states for the optimization in the PCA step.
-    kwargs_pca
-        Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`.
-    {layer}
+    {pca_chunk}
+    {check_values}
     {inplace}
 
     Returns
     -------
-    If `inplace=False`, separately returns the gene selection results (`hvg`)
-    and Pearson residual-based PCA results (`adata_pca`). If `inplace=True`,
-    updates `adata` with the following fields for gene selection results:
+    If `inplace=False`, separately returns the gene selection results (`hvg`,
+    :class:`~pandas.DataFrame`) and Pearson residual-based PCA results (`adata_pca`,
+    :class:`~anndata.AnnData`). If `inplace=True`, updates `adata` with the
+    following fields for gene selection results:
 
     `.var['highly_variable']` : bool
         boolean indicator of highly-variable genes.
@@ -83,7 +82,7 @@ def recipe_pearson_residuals(
         If batch_key is given, this denotes the genes that are highly variable
         in all batches.
 
-    …and the following fields for Pearson residual-based PCA results and
+    The following fields contain Pearson residual-based PCA results and
     normalization settings:
 
     `.uns['pearson_residuals_normalization']['pearson_residuals_df']`

From ce9ee43d43b53205717fa29352510f0603ca50b9 Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Sun, 13 Mar 2022 00:42:16 +0100
Subject: [PATCH 91/96] add _pca function to release note

---
 docs/release-notes/1.9.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/release-notes/1.9.0.rst b/docs/release-notes/1.9.0.rst
index d225cf09f2..51963e6f72 100644
--- a/docs/release-notes/1.9.0.rst
+++ b/docs/release-notes/1.9.0.rst
@@ -12,4 +12,5 @@
 
     - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals` for Pearson Residuals normalization :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
     - Added :func:`scanpy.experimental.pp.highly_variable_genes` for HVG selection with Pearson Residuals :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
+    - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals_pca` for Pearson Residuals normalization and dimensionality reduction with PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
     - Added :func:`scanpy.experimental.pp.recipe_pearson_residuals` for Pearson Residuals normalization, HVG selection and dimensionality reduction with PCA  :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`

From 7ffdec3b391d039dd9991788232f3f534e614dee Mon Sep 17 00:00:00 2001
From: Jan Lause <jan.lause@uni-tuebingen.de>
Date: Sun, 13 Mar 2022 01:43:05 +0100
Subject: [PATCH 92/96] last edits to docs

---
 scanpy/experimental/_docs.py                  | 11 ++++----
 .../experimental/pp/_highly_variable_genes.py | 28 +++++++++----------
 scanpy/experimental/pp/_normalization.py      | 17 ++++++-----
 scanpy/experimental/pp/_recipes.py            |  8 +++---
 4 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/scanpy/experimental/_docs.py b/scanpy/experimental/_docs.py
index d040bd1262..a1408adf01 100644
--- a/scanpy/experimental/_docs.py
+++ b/scanpy/experimental/_docs.py
@@ -9,7 +9,7 @@
 
 doc_dist_params = """\
 theta
-    The negative binomial overdispersion parameter theta for Pearson residuals.
+    The negative binomial overdispersion parameter `theta` for Pearson residuals.
     Higher values correspond to less overdispersion \
     (`var = mean + mean^2/theta`), and `theta=np.Inf` corresponds to a Poisson model.
 clip
@@ -23,13 +23,14 @@
 
 doc_check_values = """\
 check_values
-    Check if counts in selected layer are integers. A warning is returned if set to
-    `True`.
+    If `True`, checks if counts in selected layer are integers as expected by this
+    function, and returns a warning if non-integers are found. Otherwise, proceeds
+    without checking. Setting this to `False` can speed up code for large datasets.
 """
 
 doc_layer = """\
 layer
-    Layer to normalize instead of `X`. If `None`, `X` is normalized.
+    Layer to use as input instead of `X`. If `None`, `X` is used.
 """
 
 doc_subset = """\
@@ -59,7 +60,7 @@
 n_comps
     Number of principal components to compute in the PCA step.
 random_state
-    Change to use different initial states for the optimization in the PCA step.
+    Random seed for setting the initial states for the optimization in the PCA step.
 kwargs_pca
     Dictionary of further keyword arguments passed on to `scanpy.pp.pca()`.
 """
diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py
index 1becb2a1d1..166b6e06df 100644
--- a/scanpy/experimental/pp/_highly_variable_genes.py
+++ b/scanpy/experimental/pp/_highly_variable_genes.py
@@ -236,12 +236,12 @@ def highly_variable_genes(
     inplace: bool = True,
 ) -> Optional[pd.DataFrame]:
     """\
-    Annotate highly variable genes using analytic Pearson residuals [Lause21]_.
+    Select highly variable genes using analytic Pearson residuals [Lause21]_.
 
-    For [Lause21]_, Pearson residuals of a negative binomial offset model (with
-    overdispersion theta shared across genes) are computed. By default,
-    overdispersion `theta=100` is used and residuals are clipped to `sqrt(n_obs)`.
-    Finally, genes are ranked by residual variance.
+    In [Lause21]_, Pearson residuals of a negative binomial offset model are computed
+    (with overdispersion `theta` shared across genes). By default, overdispersion
+    `theta=100` is used and residuals are clipped to `sqrt(n_obs)`. Finally, genes
+    are ranked by residual variance.
 
     Expects raw count input.
 
@@ -256,8 +256,8 @@ def highly_variable_genes(
     {check_values}
     {layer}
     subset
-        Inplace subset to highly-variable genes if `True` otherwise merely indicate
-        highly variable genes.
+        If `True`, subset the data to highly-variable genes after finding them.
+        Otherwise merely indicate highly variable genes in `adata.var` (see below).
     {inplace}
 
     Returns
@@ -266,21 +266,21 @@ def highly_variable_genes(
     returns the same fields as :class:`~pandas.DataFrame`.
 
     highly_variable : bool
-        boolean indicator of highly-variable genes
+        boolean indicator of highly-variable genes.
     means : float
-        means per gene
+        means per gene.
     variances : float
-        variance per gene
+        variance per gene.
     residual_variances : float
         For `flavor='pearson_residuals'`, residual variance per gene. Averaged in the
         case of multiple batches.
     highly_variable_rank : float
-        For `flavor='pearson_residuals'`, rank of the gene according to residual
-        variance, median rank in the case of multiple batches
+        For `flavor='pearson_residuals'`, rank of the gene according to residual
+        variance, median rank in the case of multiple batches.
     highly_variable_nbatches : int
-        If `batch_key` given, denotes in how many batches genes are detected as HVG
+        If `batch_key` given, denotes in how many batches genes are detected as HVG.
     highly_variable_intersection : bool
-        If `batch_key` given, denotes the genes that are highly variable in all batches
+        If `batch_key` given, denotes the genes that are highly variable in all batches.
 
     Notes
     -----
diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py
index ab26ae2af3..4445b55d21 100644
--- a/scanpy/experimental/pp/_normalization.py
+++ b/scanpy/experimental/pp/_normalization.py
@@ -108,9 +108,9 @@ def normalize_pearson_residuals(
     normalized values in `results_dict['X']`.
 
     `.uns['pearson_residuals_normalization']['theta']`
-         The used value of the overdisperion parameter theta
+         The used value of the overdispersion parameter theta.
     `.uns['pearson_residuals_normalization']['clip']`
-         The used value of the clipping parameter
+         The used value of the clipping parameter.
     `.uns['pearson_residuals_normalization']['computed_on']`
          The name of the layer on which the residuals were computed.
     """
@@ -180,7 +180,7 @@ def normalize_pearson_residuals_pca(
     {dist_params}
     {pca_chunk}
     use_highly_variable
-        If `True`, use gene selection present in `adata.var['highly_variable']` to
+        If `True`, uses gene selection present in `adata.var['highly_variable']` to
         subset the data before normalizing (default). Otherwise, proceed on the full
         dataset.
     {check_values}
@@ -189,16 +189,15 @@ def normalize_pearson_residuals_pca(
 
     Returns
     -------
-    If `inplace=False`, returns the Pearson residual-based PCA results (`adata_pca`,
-    :class:`~anndata.AnnData`). If `inplace=True`, updates `adata` with the following
-    fields:
+    If `inplace=False`, returns the Pearson residual-based PCA results (as :class:`~anndata.AnnData`
+    object). If `inplace=True`, updates `adata` with the following fields:
 
     `.uns['pearson_residuals_normalization']['pearson_residuals_df']`
-         The hvg-subset, normalized by Pearson residuals
+         The subset of highly variable genes, normalized by Pearson residuals.
     `.uns['pearson_residuals_normalization']['theta']`
-         The used value of the overdisperion parameter theta
+         The used value of the overdispersion parameter theta.
     `.uns['pearson_residuals_normalization']['clip']`
-         The used value of the clipping parameter
+         The used value of the clipping parameter.
 
     `.obsm['X_pca']`
         PCA representation of data after gene selection (if applicable) and Pearson
diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index b2ccbf6335..7f11beaeac 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -39,7 +39,7 @@ def recipe_pearson_residuals(
     inplace: bool = True,
 ) -> Optional[Tuple[AnnData, pd.DataFrame]]:
     """\
-    Gene selection and normalization based on [Lause21]_.
+    Full pipeline for HVG selection and normalization by analytic Pearson residuals ([Lause21]_).
 
     Applies gene selection based on Pearson residuals. On the resulting subset,
     Pearson residual normalization and PCA are performed.
@@ -58,8 +58,8 @@ def recipe_pearson_residuals(
 
     Returns
     -------
-    If `inplace=False`, separately returns the gene selection results (`hvg`,
-    :class:`~pandas.DataFrame`) and Pearson residual-based PCA results (`adata_pca`,
+    If `inplace=False`, separately returns the gene selection results (as
+    :class:`~pandas.DataFrame`) and Pearson residual-based PCA results (as
     :class:`~anndata.AnnData`). If `inplace=True`, updates `adata` with the
     following fields for gene selection results:
 
@@ -86,7 +86,7 @@ def recipe_pearson_residuals(
     normalization settings:
 
     `.uns['pearson_residuals_normalization']['pearson_residuals_df']`
-         The hvg-subset, normalized by Pearson residuals.
+         The subset of highly variable genes, normalized by Pearson residuals.
     `.uns['pearson_residuals_normalization']['theta']`
          The used value of the overdisperion parameter theta.
     `.uns['pearson_residuals_normalization']['clip']`

From a0aaf966b854285463a0fd90483005337d367d19 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Wed, 16 Mar 2022 19:37:29 +0100
Subject: [PATCH 93/96] fix release and tutorial image

---
 docs/release-notes/1.9.0.rst | 1 +
 docs/tutorials.rst           | 7 +++----
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/release-notes/1.9.0.rst b/docs/release-notes/1.9.0.rst
index 51963e6f72..634199b464 100644
--- a/docs/release-notes/1.9.0.rst
+++ b/docs/release-notes/1.9.0.rst
@@ -11,6 +11,7 @@
 - Added :mod:`scanpy.experimental` module!
 
     - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals` for Pearson Residuals normalization :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
+    - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals_pca` for Pearson Residuals normalization and PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
     - Added :func:`scanpy.experimental.pp.highly_variable_genes` for HVG selection with Pearson Residuals :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
     - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals_pca` for Pearson Residuals normalization and dimensionality reduction with PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
     - Added :func:`scanpy.experimental.pp.recipe_pearson_residuals` for Pearson Residuals normalization, HVG selection and dimensionality reduction with PCA  :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
diff --git a/docs/tutorials.rst b/docs/tutorials.rst
index 95f2527093..5412675b6b 100644
--- a/docs/tutorials.rst
+++ b/docs/tutorials.rst
@@ -97,15 +97,14 @@ See the `cell cycle`_ notebook.
 
 .. _cell cycle: https://nbviewer.jupyter.org/github/theislab/scanpy_usage/blob/master/180209_cell_cycle/cell_cycle.ipynb
 
-.. image:: _static/img/tutorials/170522_visualizing_one_million_cells/tsne_1.3M.png
-   :width: 120px
-   :align: right
-
 Normalization with Pearson Residuals
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Normalization of scRNA-seq data with Pearson Residuals, from [Lause21]_: :tutorial:`tutorial_pearson_residuals`
 
+.. image:: _static/img/tutorials/170522_visualizing_one_million_cells/tsne_1.3M.png
+   :width: 120px
+   :align: right
 
 Scaling Computations
 ~~~~~~~~~~~~~~~~~~~~

From ad81e29dc26b53af44834b8df4ab0afa912dbe1c Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Wed, 16 Mar 2022 20:12:09 +0100
Subject: [PATCH 94/96] try fix pre-commit

---
 docs/release-notes/1.9.0.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/release-notes/1.9.0.rst b/docs/release-notes/1.9.0.rst
index 756357394e..4ff60a7cd8 100644
--- a/docs/release-notes/1.9.0.rst
+++ b/docs/release-notes/1.9.0.rst
@@ -16,4 +16,3 @@
     - Added :func:`scanpy.experimental.pp.highly_variable_genes` for HVG selection with Pearson Residuals :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
     - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals_pca` for Pearson Residuals normalization and dimensionality reduction with PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
     - Added :func:`scanpy.experimental.pp.recipe_pearson_residuals` for Pearson Residuals normalization, HVG selection and dimensionality reduction with PCA  :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
-    
\ No newline at end of file

From d74a0e6811b234b99acf32281a1ce30b2dc94f33 Mon Sep 17 00:00:00 2001
From: giovp <giov.pll@gmail.com>
Date: Wed, 16 Mar 2022 20:43:44 +0100
Subject: [PATCH 95/96] minor docs

---
 scanpy/experimental/pp/_normalization.py | 3 ---
 scanpy/experimental/pp/_recipes.py       | 2 --
 2 files changed, 5 deletions(-)

diff --git a/scanpy/experimental/pp/_normalization.py b/scanpy/experimental/pp/_normalization.py
index 4445b55d21..ee116c8df1 100644
--- a/scanpy/experimental/pp/_normalization.py
+++ b/scanpy/experimental/pp/_normalization.py
@@ -173,7 +173,6 @@ def normalize_pearson_residuals_pca(
     Operates on the subset of highly variable genes in `adata.var['highly_variable']`
     by default. Expects raw count input.
 
-
     Params
     ------
     {adata}
@@ -186,7 +185,6 @@ def normalize_pearson_residuals_pca(
     {check_values}
     {inplace}
 
-
     Returns
     -------
     If `inplace=False`, returns the Pearson residual-based PCA results (as :class:`~anndata.AnnData`
@@ -210,7 +208,6 @@ def normalize_pearson_residuals_pca(
          Ratio of explained variance.
     `.uns['pca']['variance']`
          Explained variance, equivalent to the eigenvalues of the covariance matrix.
-
     """
 
     # check if HVG selection is there if user wants to use it
diff --git a/scanpy/experimental/pp/_recipes.py b/scanpy/experimental/pp/_recipes.py
index 7f11beaeac..5aba49345a 100644
--- a/scanpy/experimental/pp/_recipes.py
+++ b/scanpy/experimental/pp/_recipes.py
@@ -46,7 +46,6 @@ def recipe_pearson_residuals(
 
     Expects raw count input.
 
-
     Params
     ------
     {adata}
@@ -102,7 +101,6 @@ def recipe_pearson_residuals(
          Ratio of explained variance.
     `.uns['pca']['variance']`
          Explained variance, equivalent to the eigenvalues of the covariance matrix.
-
     """
 
     hvg_args = dict(

From 970b0fadba479e83b0644e45f873f2a677c86bf0 Mon Sep 17 00:00:00 2001
From: Isaac Virshup <ivirshup@gmail.com>
Date: Tue, 29 Mar 2022 17:51:29 +0200
Subject: [PATCH 96/96] Remove accidentally included files from merge

---
 docs/release-notes/1.9.0.rst |  18 -----
 docs/tutorials.rst           | 147 -----------------------------------
 2 files changed, 165 deletions(-)
 delete mode 100644 docs/release-notes/1.9.0.rst
 delete mode 100644 docs/tutorials.rst

diff --git a/docs/release-notes/1.9.0.rst b/docs/release-notes/1.9.0.rst
deleted file mode 100644
index 4ff60a7cd8..0000000000
--- a/docs/release-notes/1.9.0.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-1.9.0 :small:`the future`
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. rubric:: Features
-
-- :func:`~scanpy.tl.filter_rank_genes_groups` now allows to filter with absolute values of log fold change :pr:`1649` :smaller:`S Rybakov`
-- :func:`~scanpy.pl.embedding_density` now allows more than 10 groups :pr:`1936` :smaller:`A Wolf`
-- :func:`~scanpy.logging.print_versions` now uses `session_info` :pr:`2089` :smaller:`P Angerer` :smaller:`I Virshup`
-
-.. rubric:: Experimental module
-
-- Added :mod:`scanpy.experimental` module!
-
-    - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals` for Pearson Residuals normalization :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
-    - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals_pca` for Pearson Residuals normalization and PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
-    - Added :func:`scanpy.experimental.pp.highly_variable_genes` for HVG selection with Pearson Residuals :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
-    - Added :func:`scanpy.experimental.pp.normalize_pearson_residuals_pca` for Pearson Residuals normalization and dimensionality reduction with PCA :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
-    - Added :func:`scanpy.experimental.pp.recipe_pearson_residuals` for Pearson Residuals normalization, HVG selection and dimensionality reduction with PCA  :pr:`1715` :smaller:`J Lause, G Palla, I Virshup`
diff --git a/docs/tutorials.rst b/docs/tutorials.rst
deleted file mode 100644
index 5412675b6b..0000000000
--- a/docs/tutorials.rst
+++ /dev/null
@@ -1,147 +0,0 @@
-Tutorials
-=========
-
-Clustering
-----------
-
-For getting started, we recommend Scanpy’s reimplementation :tutorial:`pbmc3k`
-of Seurat’s [Satija15]_ clustering tutorial for 3k PBMCs from 10x Genomics,
-containing preprocessing, clustering and the identification of cell types via
-known marker genes.
-
-.. image:: _static/img/tutorials/170505_seurat/filter_genes_dispersion.png
-   :width: 100px
-.. image:: _static/img/tutorials/170505_seurat/louvain.png
-   :width: 100px
-.. image:: _static/img/tutorials/170505_seurat/NKG7.png
-   :width: 100px
-.. image:: _static/img/tutorials/170505_seurat/violin.png
-   :width: 100px
-.. image:: _static/img/tutorials/170505_seurat/cell_types.png
-   :width: 200px
-
-
-Visualization
--------------
-
-This tutorial shows how to visually explore genes using scanpy. :tutorial:`plotting/core`
-
-.. image:: _static/img/stacked_violin_dotplot_matrixplot.png
-   :width: 550px
-
-
-Trajectory inference
---------------------
-
-Get started with the following example for hematopoiesis for data of [Paul15]_: :tutorial:`paga-paul15`
-
-.. image:: _static/img/tutorials/paga_paul15.png
-   :width: 450px
-
-More examples for trajectory inference on complex datasets can be found in the
-`PAGA <https://github.com/theislab/paga>`_ repository [Wolf19]_, for instance, multi-resolution analyses of whole
-animals, such as for planaria_ for data of [Plass18]_.
-
-.. image:: _static/img/tutorials/paga_planaria.png
-   :width: 350px
-
-As a reference for simple pseudotime analyses, we provide the diffusion pseudotime (DPT) analyses of [Haghverdi16]_
-for two hematopoiesis datasets: `DPT example 1`_ [Paul15]_ and `DPT example 2`_ [Moignard15]_.
-
-.. _planaria: https://nbviewer.jupyter.org/github/theislab/paga/blob/master/planaria/planaria.ipynb
-.. _DPT example 1: https://nbviewer.jupyter.org/github/theislab/scanpy_usage/blob/master/170502_paul15/paul15.ipynb
-.. _DPT example 2: https://nbviewer.jupyter.org/github/theislab/scanpy_usage/blob/master/170501_moignard15/moignard15.ipynb
-
-
-Integrating datasets
---------------------
-
-Map labels and embeddings of reference data to new data: :tutorial:`integrating-data-using-ingest`
-
-.. image:: https://scanpy-tutorials.readthedocs.io/en/latest/_images/integrating-data-using-ingest_21_0.png
-   :width: 350px
-
-
-Spatial data
-------------
-
-* Basic analysis of spatial data: :tutorial:`spatial/basic-analysis`
-* Integrating spatial data with scRNA-seq using scanorama: :tutorial:`spatial/integration-scanorama`
-
-.. image:: _static/img/spatial-basic-analysis.png
-   :width: 250px
-
-
-Further Tutorials
------------------
-
-.. _conversion_to_r:
-
-Conversion: AnnData, SingleCellExperiment, and Seurat objects
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. image:: https://github.com/theislab/scanpy-in-R/raw/master/logo.png
-   :width: 200px
-   :align: right
-
-* See `Seurat to AnnData`_ for a tutorial on `anndata2ri`.
-* See the `Scanpy in R`_ guide for a tutorial on interacting with Scanpy from R.
-
-.. _Seurat to AnnData: https://github.com/LuckyMD/Code_snippets/blob/master/Seurat_to_anndata.ipynb
-.. _Scanpy in R: https://theislab.github.io/scanpy-in-R/
-
-Regressing out cell cycle
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-See the `cell cycle`_ notebook.
-
-.. _cell cycle: https://nbviewer.jupyter.org/github/theislab/scanpy_usage/blob/master/180209_cell_cycle/cell_cycle.ipynb
-
-Normalization with Pearson Residuals
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Normalization of scRNA-seq data with Pearson Residuals, from [Lause21]_: :tutorial:`tutorial_pearson_residuals`
-
-.. image:: _static/img/tutorials/170522_visualizing_one_million_cells/tsne_1.3M.png
-   :width: 120px
-   :align: right
-
-Scaling Computations
-~~~~~~~~~~~~~~~~~~~~
-
-* Visualize and cluster `1.3M neurons`_ from 10x Genomics.
-
-.. _1.3M neurons: https://github.com/theislab/scanpy_usage/tree/master/170522_visualizing_one_million_cells
-
-Simulations
-~~~~~~~~~~~
-
-Simulating single cells using literature-curated gene regulatory networks [Wittmann09]_.
-
-.. image:: _static/img/tutorials/170430_krumsiek11/timeseries.png
-   :width: 200px
-   :align: right
-
-* Notebook for `myeloid differentiation`_
-* Notebook for simple toggleswitch_
-
-.. _myeloid differentiation: https://nbviewer.jupyter.org/github/theislab/scanpy_usage/blob/master/170430_krumsiek11/krumsiek11.ipynb
-.. _toggleswitch: https://nbviewer.jupyter.org/github/theislab/scanpy_usage/blob/master/170430_krumsiek11/toggleswitch.ipynb
-
-Images
-~~~~~~
-
-See pseudotime-time inference on deep-learning based features for `cell cycle reconstruction`_ from image data [Eulenberg17]_.
-
-.. _cell cycle reconstruction: https://github.com/theislab/scanpy_usage/tree/master/170529_images
-
-..
-    User Examples
-    ~~~~~~~~~~~~~
-
-    January 12, 2018: `Exploring the mouse cell atlas`_ by `David P. Cook`_.
-    Data by `Tabula Muris Consortium`_.
-
-    .. _Exploring the mouse cell atlas: https://github.com/dpcook/fun_analysis/blob/master/tabula_muris/mouse_atlas_scanpy.ipynb
-    .. _David P. Cook: https://twitter.com/DavidPCook
-    .. _Tabula Muris Consortium: https://www.biorxiv.org/content/early/2017/12/20/237446