From bfd569ee3d6ec5383cce3ee8cc86ce945d77a6c9 Mon Sep 17 00:00:00 2001
From: James Mathews
Date: Thu, 1 Aug 2024 15:02:12 -0400
Subject: [PATCH] Refine Ripley metric extraction (#343)

* Fix 1-p issue in ripley implementation, report weighted scale value
* Update feature description
* Update test artifact
---
 .../db/data_model/feature_descriptions.tsv  |   2 +-
 spatialprofilingtoolbox/ondemand/worker.py  |   1 +
 .../workflow/common/squidpy.py              | 148 ++++++++++++++++--
 .../module_tests/expected_squidpy5.json     |  14 +-
 4 files changed, 147 insertions(+), 18 deletions(-)

diff --git a/spatialprofilingtoolbox/db/data_model/feature_descriptions.tsv b/spatialprofilingtoolbox/db/data_model/feature_descriptions.tsv
index 5359aafd1..0d4688ce1 100644
--- a/spatialprofilingtoolbox/db/data_model/feature_descriptions.tsv
+++ b/spatialprofilingtoolbox/db/data_model/feature_descriptions.tsv
@@ -1,7 +1,7 @@
 spatial autocorrelation For a given cell phenotype (first specifier), i.e. cell set, the p-value associated with the Moran I statistic value for this cell set, with respect to the standard null distribution for this statistic.
 neighborhood enrichment For two given cell phenotypes (first and second specifiers), the estimated p-value (CDF of z-score) of the number of occurrences of neighbor relations between two cells of the given respective phenotypes, the z-score computed in reference to a bootstrapped empirical distribution constructed using 1000 random permutations of the cell set with labels fixed.
 co-occurrence For two given cell phenotypes (first and second specifiers), and a given radius (third specifier), the ratio of the two probabilities of occurrence of the first phenotype within the given radius around some cell, calculated with and without conditioning on at least one occurrence of the second phenotype in the same radius.
-ripley For a given cell phenotype (first specifier), the p-value associated with the Ripley F-statistic for the point set of cell locations, at the radius scale for which this p-value is smallest.
+ripley For a given cell phenotype (first specifier), the expected value of the radius scale, weighted by the p-values associated with an elevated Ripley F-statistic for the point set of cell locations; that is, a representative radius scale at which non-trivial clustering is observed in the Ripley F sense.
 gnn importance score For a given cohort stratification variable (the specifier), the integer rank of each cell (the subjects of the feature) with respect to the importance scores derived from a GNN trained on this variable.
 population fractions For a given cell phenotype, the average number of cells of that phenotype in the given sample relative to the number of cells in the sample.
 proximity For a given cell phenotype (first specifier), the average number of cells of a second phenotype (second specifier) within a specified radius (third specifier).
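The updated "ripley" description amounts to a p-value-weighted average over the radius support: sum_i p_i * r_i / sum_i p_i, where the r_i are the radius bins and the p_i the associated p-values. A minimal standalone sketch of that weighted-scale computation, using invented toy values for `bins` and `pvalues` rather than output from the package itself:

    import numpy as np

    # Toy inputs (hypothetical): radius support and per-radius p-values for an
    # elevated Ripley F-statistic, of the kind produced by the patched routine below.
    bins = np.linspace(0.0, 100.0, 5)
    pvalues = np.array([0.05, 0.10, 0.60, 0.90, 0.40])

    # Expected radius scale, weighting each radius by its p-value.
    weighted_scale = np.inner(pvalues, bins) / np.sum(pvalues)
    print(round(float(weighted_scale), 1))  # 68.3 with these toy numbers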
diff --git a/spatialprofilingtoolbox/ondemand/worker.py b/spatialprofilingtoolbox/ondemand/worker.py
index a34833eaa..bd370ad43 100644
--- a/spatialprofilingtoolbox/ondemand/worker.py
+++ b/spatialprofilingtoolbox/ondemand/worker.py
@@ -86,6 +86,7 @@ def _compute(self, job: Job) -> None:
         except Exception as error:
             logger.error(error)
             print_exception(type(error), error, error.__traceback__)
+            self._no_value_wrapup(job)
         finally:
             generic_handler.disalarm()

diff --git a/spatialprofilingtoolbox/workflow/common/squidpy.py b/spatialprofilingtoolbox/workflow/common/squidpy.py
index c51a20a69..75c8e3d6a 100644
--- a/spatialprofilingtoolbox/workflow/common/squidpy.py
+++ b/spatialprofilingtoolbox/workflow/common/squidpy.py
@@ -2,22 +2,48 @@
 from typing import Any
 from typing import cast
+from typing import (
+    TYPE_CHECKING,
+    Literal,
+)
 from warnings import filterwarnings

 from numpy.typing import NDArray
 from numpy import isnan
+from numpy import inner
+from numpy import sum as np_sum
 from pandas import DataFrame, Series
 from squidpy.gr import (  # type: ignore
     spatial_neighbors,
     nhood_enrichment,
     co_occurrence,
-    ripley,
     spatial_autocorr,
+    # ripley,
 )
 from anndata import AnnData  # type: ignore
 from scipy.stats import norm  # type: ignore

+import numpy as np
+import pandas as pd
+from scanpy import logging as logg
+
+from scipy.spatial import ConvexHull
+from scipy.spatial.distance import pdist
+from sklearn.neighbors import NearestNeighbors
+from sklearn.preprocessing import LabelEncoder
+from spatialdata import SpatialData
+
+from squidpy._constants._constants import RipleyStat
+from squidpy._constants._pkg_constants import Key
+from squidpy._utils import NDArrayA
+from squidpy.gr._utils import _assert_categorical_obs, _assert_spatial_basis, _save_data
+
+from squidpy.gr._ripley import _reshape_res
+from squidpy.gr._ripley import _f_g_function
+from squidpy.gr._ripley import _l_function
+from squidpy.gr._ripley import _ppp
+
 from spatialprofilingtoolbox.db.exchange_data_formats.metrics import PhenotypeCriteria
 from spatialprofilingtoolbox.db.describe_features import get_feature_description
 from spatialprofilingtoolbox.db.describe_features import squidpy_feature_classnames
@@ -28,6 +54,112 @@
 filterwarnings('ignore', message='1 variables were constant, will return nan for these.')


+def ripley_custom(
+    adata: AnnData | SpatialData,
+    cluster_key: str,
+    mode: Literal["F", "G", "L"] = "F",
+    spatial_key: str = Key.obsm.spatial,
+    metric: str = "euclidean",
+    n_neigh: int = 2,
+    n_simulations: int = 100,
+    n_observations: int = 1000,
+    max_dist: float | None = None,
+    n_steps: int = 50,
+    seed: int | None = None,
+    copy: bool = False,
+) -> dict[str, pd.DataFrame | NDArrayA]:
+    """
+    Copied from squidpy in order to alter the treatment of p values.
+    https://github.com/scverse/squidpy/blob/93ee854fe4ab18583bd8bb6f1a45e9e052a0d8fa/src/squidpy/gr/_ripley.py
+    """
+    if isinstance(adata, SpatialData):
+        adata = adata.table
+    _assert_categorical_obs(adata, key=cluster_key)
+    _assert_spatial_basis(adata, key=spatial_key)
+    coordinates = adata.obsm[spatial_key]
+    clusters = adata.obs[cluster_key].values
+
+    mode = RipleyStat(mode)  # type: ignore[assignment]
+    if TYPE_CHECKING:
+        assert isinstance(mode, RipleyStat)
+
+    # prepare support
+    N = coordinates.shape[0]
+    hull = ConvexHull(coordinates)
+    area = hull.volume
+    if max_dist is None:
+        max_dist = ((area / 2) ** 0.5) / 4
+    support = np.linspace(0, max_dist, n_steps)
+
+    # prepare labels
+    le = LabelEncoder().fit(clusters)
+    cluster_idx = le.transform(clusters)
+    obs_arr = np.empty((le.classes_.shape[0], n_steps))
+
+    start = logg.info(
+        f"Calculating Ripley's {mode} statistic for `{le.classes_.shape[0]}` clusters and `{n_simulations}` simulations"
+    )
+
+    for i in np.arange(np.max(cluster_idx) + 1):
+        coord_c = coordinates[cluster_idx == i, :]
+        if mode == RipleyStat.F:
+            random = _ppp(hull, n_simulations=1, n_observations=n_observations, seed=seed)
+            tree_c = NearestNeighbors(metric=metric, n_neighbors=n_neigh).fit(coord_c)
+            distances, _ = tree_c.kneighbors(random, n_neighbors=n_neigh)
+            bins, obs_stats = _f_g_function(distances.squeeze(), support)
+        elif mode == RipleyStat.G:
+            tree_c = NearestNeighbors(metric=metric, n_neighbors=n_neigh).fit(coord_c)
+            distances, _ = tree_c.kneighbors(coordinates[cluster_idx != i, :], n_neighbors=n_neigh)
+            bins, obs_stats = _f_g_function(distances.squeeze(), support)
+        elif mode == RipleyStat.L:
+            distances = pdist(coord_c, metric=metric)
+            bins, obs_stats = _l_function(distances, support, N, area)
+        else:
+            raise NotImplementedError(f"Mode `{mode.s!r}` is not yet implemented.")
+        obs_arr[i] = obs_stats
+
+    sims = np.empty((n_simulations, len(bins)))
+    pvalues = np.ones((le.classes_.shape[0], len(bins)))
+
+    for i in range(n_simulations):
+        random_i = _ppp(hull, n_simulations=1, n_observations=n_observations, seed=seed)
+        if mode == RipleyStat.F:
+            tree_i = NearestNeighbors(metric=metric, n_neighbors=n_neigh).fit(random_i)
+            distances_i, _ = tree_i.kneighbors(random, n_neighbors=1)
+            _, stats_i = _f_g_function(distances_i.squeeze(), support)
+        elif mode == RipleyStat.G:
+            tree_i = NearestNeighbors(metric=metric, n_neighbors=n_neigh).fit(random_i)
+            distances_i, _ = tree_i.kneighbors(coordinates, n_neighbors=1)
+            _, stats_i = _f_g_function(distances_i.squeeze(), support)
+        elif mode == RipleyStat.L:
+            distances_i = pdist(random_i, metric=metric)
+            _, stats_i = _l_function(distances_i, support, N, area)
+        else:
+            raise NotImplementedError(f"Mode `{mode.s!r}` is not yet implemented.")
+
+        for j in range(obs_arr.shape[0]):
+            pvalues[j] += stats_i < obs_arr[j]
+        sims[i] = stats_i
+
+    pvalues /= n_simulations + 1
+    # pvalues = np.minimum(pvalues, 1 - pvalues)
+
+    obs_df = _reshape_res(obs_arr.T, columns=le.classes_, index=bins, var_name=cluster_key)
+    sims_df = _reshape_res(sims.T, columns=np.arange(n_simulations), index=bins, var_name="simulations")
+
+    res = {f"{mode}_stat": obs_df, "sims_stat": sims_df, "bins": bins, "pvalues": pvalues}
+
+    if TYPE_CHECKING:
+        assert isinstance(res, dict)
+
+    if copy:
+        logg.info("Finish", time=start)
+        return res
+
+    _save_data(adata, attr="uns", key=Key.uns.ripley(cluster_key, mode), data=res, time=start)
+    return None
+
+
 def lookup_squidpy_feature_class(method: str) -> str | None:
     for key in squidpy_feature_classnames():
         if get_feature_description(key) == method:
@@ -95,14 +227,10 @@ def _summarize_co_occurrence(unstructured_metrics: dict[str, NDArray[Any]] | Non
 def _summarize_ripley(unstructured_metrics: dict[str, NDArray[Any]]) -> float | None:
     bins = unstructured_metrics['bins']
     pvalues = unstructured_metrics['pvalues']
-    pairs = list(zip(bins.tolist(), pvalues.tolist()))
-    if len(bins) != len(pvalues) or len(pairs) == 0:
+    total_p = np_sum(pvalues)
+    if len(bins) != len(pvalues) or total_p == 0:
         return None
-    filtered = [pair for pair in pairs if pair[1] > 0]
-    if len(filtered) == 0:
-        return 1.0
-    sorted_pairs = sorted(filtered, key=lambda pair: pair[1])
-    return float(sorted_pairs[0][1])
+    return round(inner(pvalues, bins) / total_p, 1)


 def _summarize_spatial_autocorrelation(unstructured_metrics: DataFrame) -> float | None:
@@ -184,8 +312,8 @@ def _co_occurrence(adata: AnnData, radius: float) -> dict[str, NDArray[Any]] | N


 def _ripley(adata: AnnData) -> dict[str, NDArray[Any]]:
-    r"""Compute various Ripley\'s statistics for point processes."""
-    result = ripley(adata, 'cluster', copy=True)
+    r"""Compute Ripley statistics."""
+    result = ripley_custom(adata, 'cluster', copy=True, n_steps=100)
     bins = cast(NDArray[Any], result['bins'])
     pvalues = cast(NDArray[Any], result['pvalues'])
     return {
diff --git a/test/apiserver/module_tests/expected_squidpy5.json b/test/apiserver/module_tests/expected_squidpy5.json
index c8441a2a1..13d2fa340 100644
--- a/test/apiserver/module_tests/expected_squidpy5.json
+++ b/test/apiserver/module_tests/expected_squidpy5.json
@@ -1,12 +1,12 @@
 {
     "values": {
-        "lesion 0_1": 1.0,
-        "lesion 0_2": 1.0,
-        "lesion 0_3": 1.0,
-        "lesion 6_1": 1.0,
-        "lesion 6_2": 1.0,
-        "lesion 6_3": 1.0,
-        "lesion 6_4": 1.0
+        "lesion 0_1": 34.1,
+        "lesion 0_2": 36.5,
+        "lesion 0_3": 38.3,
+        "lesion 6_1": 52.4,
+        "lesion 6_2": 90.9,
+        "lesion 6_3": 205.5,
+        "lesion 6_4": 56.6
     },
     "is_pending": false
 }
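For context on the two behavioral changes above: `ripley_custom` accumulates one-sided empirical p-values, counting simulations whose statistic falls below the observed one and dropping the min(p, 1 - p) folding (the commented-out line), and `_summarize_ripley` then reports the p-value-weighted radius instead of the smallest p-value. A self-contained sketch with synthetic curves; all arrays below are invented for illustration and are not the test data:

    import numpy as np

    rng = np.random.default_rng(0)
    n_steps, n_simulations = 50, 100
    support = np.linspace(0.0, 200.0, n_steps)      # hypothetical radius support
    obs_stats = np.clip(support / 150.0, 0.0, 1.0)  # hypothetical observed F-type curve
    sim_stats = np.clip(support / 200.0, 0.0, 1.0) + rng.normal(0.0, 0.02, (n_simulations, n_steps))

    # One-sided empirical p-values, mirroring the patched accumulation: start at 1,
    # count simulations whose statistic falls below the observed one, normalize by
    # n_simulations + 1, and skip the old min(p, 1 - p) folding step.
    pvalues = (1 + (sim_stats < obs_stats).sum(axis=0)) / (n_simulations + 1)

    # Weighted radius scale reported as the feature value, rounded to one decimal.
    weighted_scale = round(float(np.inner(pvalues, support) / np.sum(pvalues)), 1)
    print(weighted_scale)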