Skip to content

Commit

Permalink
Refine Ripley metric extraction (#343)
Browse files Browse the repository at this point in the history
* Fix 1-p issue in ripley implementation, report weighted scale value

* Update feature description

* Update test artifact
  • Loading branch information
jimmymathews authored Aug 1, 2024
1 parent cda3a87 commit bfd569e
Show file tree
Hide file tree
Showing 4 changed files with 147 additions and 18 deletions.
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
spatial autocorrelation For a given cell phenotype (first specifier), i.e. cell set, the p-value associated with the Moran I statistic value for this cell set, with respect to the standard null distribution for this statistic.
neighborhood enrichment For two given cell phenotypes (first and second specifiers), the estimated p-value (CDF of z-score) of the number of occurrences of neighbor relations between two cells of the given respective phenotypes, the z-score computed in reference to a bootstrapped empirical distribution constructed using 1000 random permutations of the cell set with labels fixed.
co-occurrence For two given cell phenotypes (first and second specifiers), and a given radius (third specifier), the ratio of the two probabilities of occurrence of the first phenotype within the given radius around some cell, calculated with and without conditioning on at least one occurrence of the second phenotype in the same radius.
ripley For a given cell phenotype (first specifier), the p-value associated with the Ripley F-statistic for the point set of cell locations, at the radius scale for which this p-value is smallest.
ripley For a given cell phenotype (first specifier), the radius scale which is the expected value with respect to the p-values associated with elevated Ripley F-statistic for the point set of cell locations, that is, a radius scale at which non-trivial clustering is observed in the Ripley F sense.
gnn importance score For a given cohort stratification variable (the specifier), the integer rank of each cell (the subjects of the feature) with respect to the importance scores derived from a GNN trained on this variable.
population fractions For a given cell phenotype, the average number of cells of that phenotype in the given sample relative to the number of cells in the sample.
proximity For a given cell phenotype (first specifier), the average number of cells of a second phenotype (second specifier) within a specified radius (third specifier).
1 change: 1 addition & 0 deletions spatialprofilingtoolbox/ondemand/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def _compute(self, job: Job) -> None:
except Exception as error:
logger.error(error)
print_exception(type(error), error, error.__traceback__)
self._no_value_wrapup(job)
finally:
generic_handler.disalarm()

Expand Down
148 changes: 138 additions & 10 deletions spatialprofilingtoolbox/workflow/common/squidpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,48 @@

from typing import Any
from typing import cast
from typing import (
TYPE_CHECKING,
Literal,
)
from warnings import filterwarnings

from numpy.typing import NDArray
from numpy import isnan
from numpy import inner
from numpy import sum as np_sum
from pandas import DataFrame, Series
from squidpy.gr import ( # type: ignore
spatial_neighbors,
nhood_enrichment,
co_occurrence,
ripley,
spatial_autocorr,
# ripley,
)

from anndata import AnnData # type: ignore
from scipy.stats import norm # type: ignore

import numpy as np
import pandas as pd
from scanpy import logging as logg

from scipy.spatial import ConvexHull
from scipy.spatial.distance import pdist
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from spatialdata import SpatialData

from squidpy._constants._constants import RipleyStat
from squidpy._constants._pkg_constants import Key
from squidpy._utils import NDArrayA
from squidpy.gr._utils import _assert_categorical_obs, _assert_spatial_basis, _save_data

from squidpy.gr._ripley import _reshape_res
from squidpy.gr._ripley import _f_g_function
from squidpy.gr._ripley import _l_function
from squidpy.gr._ripley import _ppp

from spatialprofilingtoolbox.db.exchange_data_formats.metrics import PhenotypeCriteria
from spatialprofilingtoolbox.db.describe_features import get_feature_description
from spatialprofilingtoolbox.db.describe_features import squidpy_feature_classnames
Expand All @@ -28,6 +54,112 @@
filterwarnings('ignore', message='1 variables were constant, will return nan for these.')


def ripley_custom(
adata: AnnData | SpatialData,
cluster_key: str,
mode: Literal["F", "G", "L"] = "F",
spatial_key: str = Key.obsm.spatial,
metric: str = "euclidean",
n_neigh: int = 2,
n_simulations: int = 100,
n_observations: int = 1000,
max_dist: float | None = None,
n_steps: int = 50,
seed: int | None = None,
copy: bool = False,
) -> dict[str, pd.DataFrame | NDArrayA]:
"""
Copied from squidpy in order to alter the treatement of p values.
https://github.com/scverse/squidpy/blob/93ee854fe4ab18583bd8bb6f1a45e9e052a0d8fa/src/squidpy/gr/_ripley.py
"""
if isinstance(adata, SpatialData):
adata = adata.table
_assert_categorical_obs(adata, key=cluster_key)
_assert_spatial_basis(adata, key=spatial_key)
coordinates = adata.obsm[spatial_key]
clusters = adata.obs[cluster_key].values

mode = RipleyStat(mode) # type: ignore[assignment]
if TYPE_CHECKING:
assert isinstance(mode, RipleyStat)

# prepare support
N = coordinates.shape[0]
hull = ConvexHull(coordinates)
area = hull.volume
if max_dist is None:
max_dist = ((area / 2) ** 0.5) / 4
support = np.linspace(0, max_dist, n_steps)

# prepare labels
le = LabelEncoder().fit(clusters)
cluster_idx = le.transform(clusters)
obs_arr = np.empty((le.classes_.shape[0], n_steps))

start = logg.info(
f"Calculating Ripley's {mode} statistic for `{le.classes_.shape[0]}` clusters and `{n_simulations}` simulations"
)

for i in np.arange(np.max(cluster_idx) + 1):
coord_c = coordinates[cluster_idx == i, :]
if mode == RipleyStat.F:
random = _ppp(hull, n_simulations=1, n_observations=n_observations, seed=seed)
tree_c = NearestNeighbors(metric=metric, n_neighbors=n_neigh).fit(coord_c)
distances, _ = tree_c.kneighbors(random, n_neighbors=n_neigh)
bins, obs_stats = _f_g_function(distances.squeeze(), support)
elif mode == RipleyStat.G:
tree_c = NearestNeighbors(metric=metric, n_neighbors=n_neigh).fit(coord_c)
distances, _ = tree_c.kneighbors(coordinates[cluster_idx != i, :], n_neighbors=n_neigh)
bins, obs_stats = _f_g_function(distances.squeeze(), support)
elif mode == RipleyStat.L:
distances = pdist(coord_c, metric=metric)
bins, obs_stats = _l_function(distances, support, N, area)
else:
raise NotImplementedError(f"Mode `{mode.s!r}` is not yet implemented.")
obs_arr[i] = obs_stats

sims = np.empty((n_simulations, len(bins)))
pvalues = np.ones((le.classes_.shape[0], len(bins)))

for i in range(n_simulations):
random_i = _ppp(hull, n_simulations=1, n_observations=n_observations, seed=seed)
if mode == RipleyStat.F:
tree_i = NearestNeighbors(metric=metric, n_neighbors=n_neigh).fit(random_i)
distances_i, _ = tree_i.kneighbors(random, n_neighbors=1)
_, stats_i = _f_g_function(distances_i.squeeze(), support)
elif mode == RipleyStat.G:
tree_i = NearestNeighbors(metric=metric, n_neighbors=n_neigh).fit(random_i)
distances_i, _ = tree_i.kneighbors(coordinates, n_neighbors=1)
_, stats_i = _f_g_function(distances_i.squeeze(), support)
elif mode == RipleyStat.L:
distances_i = pdist(random_i, metric=metric)
_, stats_i = _l_function(distances_i, support, N, area)
else:
raise NotImplementedError(f"Mode `{mode.s!r}` is not yet implemented.")

for j in range(obs_arr.shape[0]):
pvalues[j] += stats_i < obs_arr[j]
sims[i] = stats_i

pvalues /= n_simulations + 1
# pvalues = np.minimum(pvalues, 1 - pvalues)

obs_df = _reshape_res(obs_arr.T, columns=le.classes_, index=bins, var_name=cluster_key)
sims_df = _reshape_res(sims.T, columns=np.arange(n_simulations), index=bins, var_name="simulations")

res = {f"{mode}_stat": obs_df, "sims_stat": sims_df, "bins": bins, "pvalues": pvalues}

if TYPE_CHECKING:
assert isinstance(res, dict)

if copy:
logg.info("Finish", time=start)
return res

_save_data(adata, attr="uns", key=Key.uns.ripley(cluster_key, mode), data=res, time=start)
return None


def lookup_squidpy_feature_class(method: str) -> str | None:
for key in squidpy_feature_classnames():
if get_feature_description(key) == method:
Expand Down Expand Up @@ -95,14 +227,10 @@ def _summarize_co_occurrence(unstructured_metrics: dict[str, NDArray[Any]] | Non
def _summarize_ripley(unstructured_metrics: dict[str, NDArray[Any]]) -> float | None:
bins = unstructured_metrics['bins']
pvalues = unstructured_metrics['pvalues']
pairs = list(zip(bins.tolist(), pvalues.tolist()))
if len(bins) != len(pvalues) or len(pairs) == 0:
total_p = np_sum(pvalues)
if len(bins) != len(pvalues) or total_p == 0:
return None
filtered = [pair for pair in pairs if pair[1] > 0]
if len(filtered) == 0:
return 1.0
sorted_pairs = sorted(filtered, key=lambda pair: pair[1])
return float(sorted_pairs[0][1])
return round(inner(pvalues, bins) / total_p, 1)


def _summarize_spatial_autocorrelation(unstructured_metrics: DataFrame) -> float | None:
Expand Down Expand Up @@ -184,8 +312,8 @@ def _co_occurrence(adata: AnnData, radius: float) -> dict[str, NDArray[Any]] | N


def _ripley(adata: AnnData) -> dict[str, NDArray[Any]]:
r"""Compute various Ripley\'s statistics for point processes."""
result = ripley(adata, 'cluster', copy=True)
r"""Compute Ripley statistics."""
result = ripley_custom(adata, 'cluster', copy=True, n_steps=100)
bins = cast(NDArray[Any], result['bins'])
pvalues = cast(NDArray[Any], result['pvalues'])
return {
Expand Down
14 changes: 7 additions & 7 deletions test/apiserver/module_tests/expected_squidpy5.json
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
{
"values": {
"lesion 0_1": 1.0,
"lesion 0_2": 1.0,
"lesion 0_3": 1.0,
"lesion 6_1": 1.0,
"lesion 6_2": 1.0,
"lesion 6_3": 1.0,
"lesion 6_4": 1.0
"lesion 0_1": 34.1,
"lesion 0_2": 36.5,
"lesion 0_3": 38.3,
"lesion 6_1": 52.4,
"lesion 6_2": 90.9,
"lesion 6_3": 205.5,
"lesion 6_4": 56.6
},
"is_pending": false
}

0 comments on commit bfd569e

Please sign in to comment.