From bfd569ee3d6ec5383cce3ee8cc86ce945d77a6c9 Mon Sep 17 00:00:00 2001
From: James Mathews
Date: Thu, 1 Aug 2024 15:02:12 -0400
Subject: [PATCH] Refine Ripley metric extraction (#343)

* Fix 1-p issue in ripley implementation, report weighted scale value
* Update feature description
* Update test artifact
---
 .../db/data_model/feature_descriptions.tsv  |   2 +-
 spatialprofilingtoolbox/ondemand/worker.py  |   1 +
 .../workflow/common/squidpy.py              | 148 ++++++++++++++++--
 .../module_tests/expected_squidpy5.json     |  14 +-
 4 files changed, 147 insertions(+), 18 deletions(-)

diff --git a/spatialprofilingtoolbox/db/data_model/feature_descriptions.tsv b/spatialprofilingtoolbox/db/data_model/feature_descriptions.tsv
index 5359aafd1..0d4688ce1 100644
--- a/spatialprofilingtoolbox/db/data_model/feature_descriptions.tsv
+++ b/spatialprofilingtoolbox/db/data_model/feature_descriptions.tsv
@@ -1,7 +1,7 @@
 spatial autocorrelation For a given cell phenotype (first specifier), i.e. cell set, the p-value associated with the Moran I statistic value for this cell set, with respect to the standard null distribution for this statistic.
 neighborhood enrichment For two given cell phenotypes (first and second specifiers), the estimated p-value (CDF of z-score) of the number of occurrences of neighbor relations between two cells of the given respective phenotypes, the z-score computed in reference to a bootstrapped empirical distribution constructed using 1000 random permutations of the cell set with labels fixed.
 co-occurrence For two given cell phenotypes (first and second specifiers), and a given radius (third specifier), the ratio of the two probabilities of occurrence of the first phenotype within the given radius around some cell, calculated with and without conditioning on at least one occurrence of the second phenotype in the same radius.
-ripley For a given cell phenotype (first specifier), the p-value associated with the Ripley F-statistic for the point set of cell locations, at the radius scale for which this p-value is smallest.
+ripley For a given cell phenotype (first specifier), the expected value of the radius scale, weighted by the p-values associated with an elevated Ripley F-statistic for the point set of cell locations; that is, a representative radius scale at which non-trivial clustering is observed in the Ripley F sense.
 gnn importance score For a given cohort stratification variable (the specifier), the integer rank of each cell (the subjects of the feature) with respect to the importance scores derived from a GNN trained on this variable.
 population fractions For a given cell phenotype, the average number of cells of that phenotype in the given sample relative to the number of cells in the sample.
 proximity For a given cell phenotype (first specifier), the average number of cells of a second phenotype (second specifier) within a specified radius (third specifier).
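The updated "ripley" description amounts to a p-value-weighted average over the radius support: sum_i p_i * r_i / sum_i p_i, where the r_i are the radius bins and the p_i the associated p-values. A minimal standalone sketch of that weighted-scale computation, using invented toy values for `bins` and `pvalues` rather than output from the package itself:

    import numpy as np

    # Toy inputs (hypothetical): radius support and per-radius p-values for an
    # elevated Ripley F-statistic, of the kind produced by the patched routine below.
    bins = np.linspace(0.0, 100.0, 5)
    pvalues = np.array([0.05, 0.10, 0.60, 0.90, 0.40])

    # Expected radius scale, weighting each radius by its p-value.
    weighted_scale = np.inner(pvalues, bins) / np.sum(pvalues)
    print(round(float(weighted_scale), 1))  # 68.3 with these toy numbers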
diff --git a/spatialprofilingtoolbox/ondemand/worker.py b/spatialprofilingtoolbox/ondemand/worker.py
index a34833eaa..bd370ad43 100644
--- a/spatialprofilingtoolbox/ondemand/worker.py
+++ b/spatialprofilingtoolbox/ondemand/worker.py
@@ -86,6 +86,7 @@ def _compute(self, job: Job) -> None:
         except Exception as error:
             logger.error(error)
             print_exception(type(error), error, error.__traceback__)
+            self._no_value_wrapup(job)
         finally:
             generic_handler.disalarm()

diff --git a/spatialprofilingtoolbox/workflow/common/squidpy.py b/spatialprofilingtoolbox/workflow/common/squidpy.py
index c51a20a69..75c8e3d6a 100644
--- a/spatialprofilingtoolbox/workflow/common/squidpy.py
+++ b/spatialprofilingtoolbox/workflow/common/squidpy.py
@@ -2,22 +2,48 @@
 from typing import Any
 from typing import cast
+from typing import (
+    TYPE_CHECKING,
+    Literal,
+)
 from warnings import filterwarnings

 from numpy.typing import NDArray
 from numpy import isnan
+from numpy import inner
+from numpy import sum as np_sum
 from pandas import DataFrame, Series
 from squidpy.gr import (  # type: ignore
     spatial_neighbors,
     nhood_enrichment,
     co_occurrence,
-    ripley,
     spatial_autocorr,
+    # ripley,
 )
 from anndata import AnnData  # type: ignore
 from scipy.stats import norm  # type: ignore

+import numpy as np
+import pandas as pd
+from scanpy import logging as logg
+
+from scipy.spatial import ConvexHull
+from scipy.spatial.distance import pdist
+from sklearn.neighbors import NearestNeighbors
+from sklearn.preprocessing import LabelEncoder
+from spatialdata import SpatialData
+
+from squidpy._constants._constants import RipleyStat
+from squidpy._constants._pkg_constants import Key
+from squidpy._utils import NDArrayA
+from squidpy.gr._utils import _assert_categorical_obs, _assert_spatial_basis, _save_data
+
+from squidpy.gr._ripley import _reshape_res
+from squidpy.gr._ripley import _f_g_function
+from squidpy.gr._ripley import _l_function
+from squidpy.gr._ripley import _ppp
+
 from spatialprofilingtoolbox.db.exchange_data_formats.metrics import PhenotypeCriteria
 from spatialprofilingtoolbox.db.describe_features import get_feature_description
 from spatialprofilingtoolbox.db.describe_features import squidpy_feature_classnames
@@ -28,6 +54,112 @@
 filterwarnings('ignore', message='1 variables were constant, will return nan for these.')


+def ripley_custom(
+    adata: AnnData | SpatialData,
+    cluster_key: str,
+    mode: Literal["F", "G", "L"] = "F",
+    spatial_key: str = Key.obsm.spatial,
+    metric: str = "euclidean",
+    n_neigh: int = 2,
+    n_simulations: int = 100,
+    n_observations: int = 1000,
+    max_dist: float | None = None,
+    n_steps: int = 50,
+    seed: int | None = None,
+    copy: bool = False,
+) -> dict[str, pd.DataFrame | NDArrayA]:
+    """
+    Copied from squidpy in order to alter the treatment of p values.
+    https://github.com/scverse/squidpy/blob/93ee854fe4ab18583bd8bb6f1a45e9e052a0d8fa/src/squidpy/gr/_ripley.py
+    """
+    if isinstance(adata, SpatialData):
+        adata = adata.table
+    _assert_categorical_obs(adata, key=cluster_key)
+    _assert_spatial_basis(adata, key=spatial_key)
+    coordinates = adata.obsm[spatial_key]
+    clusters = adata.obs[cluster_key].values
+
+    mode = RipleyStat(mode)  # type: ignore[assignment]
+    if TYPE_CHECKING:
+        assert isinstance(mode, RipleyStat)
+
+    # prepare support
+    N = coordinates.shape[0]
+    hull = ConvexHull(coordinates)
+    area = hull.volume
+    if max_dist is None:
+        max_dist = ((area / 2) ** 0.5) / 4
+    support = np.linspace(0, max_dist, n_steps)
+
+    # prepare labels
+    le = LabelEncoder().fit(clusters)
+    cluster_idx = le.transform(clusters)
+    obs_arr = np.empty((le.classes_.shape[0], n_steps))
+
+    start = logg.info(
+        f"Calculating Ripley's {mode} statistic for `{le.classes_.shape[0]}` clusters and `{n_simulations}` simulations"
+    )
+
+    for i in np.arange(np.max(cluster_idx) + 1):
+        coord_c = coordinates[cluster_idx == i, :]
+        if mode == RipleyStat.F:
+            random = _ppp(hull, n_simulations=1, n_observations=n_observations, seed=seed)
+            tree_c = NearestNeighbors(metric=metric, n_neighbors=n_neigh).fit(coord_c)
+            distances, _ = tree_c.kneighbors(random, n_neighbors=n_neigh)
+            bins, obs_stats = _f_g_function(distances.squeeze(), support)
+        elif mode == RipleyStat.G:
+            tree_c = NearestNeighbors(metric=metric, n_neighbors=n_neigh).fit(coord_c)
+            distances, _ = tree_c.kneighbors(coordinates[cluster_idx != i, :], n_neighbors=n_neigh)
+            bins, obs_stats = _f_g_function(distances.squeeze(), support)
+        elif mode == RipleyStat.L:
+            distances = pdist(coord_c, metric=metric)
+            bins, obs_stats = _l_function(distances, support, N, area)
+        else:
+            raise NotImplementedError(f"Mode `{mode.s!r}` is not yet implemented.")
+        obs_arr[i] = obs_stats
+
+    sims = np.empty((n_simulations, len(bins)))
+    pvalues = np.ones((le.classes_.shape[0], len(bins)))
+
+    for i in range(n_simulations):
+        random_i = _ppp(hull, n_simulations=1, n_observations=n_observations, seed=seed)
+        if mode == RipleyStat.F:
+            tree_i = NearestNeighbors(metric=metric, n_neighbors=n_neigh).fit(random_i)
+            distances_i, _ = tree_i.kneighbors(random, n_neighbors=1)
+            _, stats_i = _f_g_function(distances_i.squeeze(), support)
+        elif mode == RipleyStat.G:
+            tree_i = NearestNeighbors(metric=metric, n_neighbors=n_neigh).fit(random_i)
+            distances_i, _ = tree_i.kneighbors(coordinates, n_neighbors=1)
+            _, stats_i = _f_g_function(distances_i.squeeze(), support)
+        elif mode == RipleyStat.L:
+            distances_i = pdist(random_i, metric=metric)
+            _, stats_i = _l_function(distances_i, support, N, area)
+        else:
+            raise NotImplementedError(f"Mode `{mode.s!r}` is not yet implemented.")
+
+        for j in range(obs_arr.shape[0]):
+            pvalues[j] += stats_i < obs_arr[j]
+        sims[i] = stats_i
+
+    pvalues /= n_simulations + 1
+    # pvalues = np.minimum(pvalues, 1 - pvalues)
+
+    obs_df = _reshape_res(obs_arr.T, columns=le.classes_, index=bins, var_name=cluster_key)
+    sims_df = _reshape_res(sims.T, columns=np.arange(n_simulations), index=bins, var_name="simulations")
+
+    res = {f"{mode}_stat": obs_df, "sims_stat": sims_df, "bins": bins, "pvalues": pvalues}
+
+    if TYPE_CHECKING:
+        assert isinstance(res, dict)
+
+    if copy:
+        logg.info("Finish", time=start)
+        return res
+
+    _save_data(adata, attr="uns", key=Key.uns.ripley(cluster_key, mode), data=res, time=start)
+    return None
+
+
 def lookup_squidpy_feature_class(method: str) -> str | None:
     for key in squidpy_feature_classnames():
         if get_feature_description(key) == method:
@@ -95,14 +227,10 @@ def _summarize_co_occurrence(unstructured_metrics: dict[str, NDArray[Any]] | Non
 def _summarize_ripley(unstructured_metrics: dict[str, NDArray[Any]]) -> float | None:
     bins = unstructured_metrics['bins']
     pvalues = unstructured_metrics['pvalues']
-    pairs = list(zip(bins.tolist(), pvalues.tolist()))
-    if len(bins) != len(pvalues) or len(pairs) == 0:
+    total_p = np_sum(pvalues)
+    if len(bins) != len(pvalues) or total_p == 0:
         return None
-    filtered = [pair for pair in pairs if pair[1] > 0]
-    if len(filtered) == 0:
-        return 1.0
-    sorted_pairs = sorted(filtered, key=lambda pair: pair[1])
-    return float(sorted_pairs[0][1])
+    return round(inner(pvalues, bins) / total_p, 1)


 def _summarize_spatial_autocorrelation(unstructured_metrics: DataFrame) -> float | None:
@@ -184,8 +312,8 @@ def _co_occurrence(adata: AnnData, radius: float) -> dict[str, NDArray[Any]] | N


 def _ripley(adata: AnnData) -> dict[str, NDArray[Any]]:
-    r"""Compute various Ripley\'s statistics for point processes."""
-    result = ripley(adata, 'cluster', copy=True)
+    r"""Compute Ripley statistics."""
+    result = ripley_custom(adata, 'cluster', copy=True, n_steps=100)
     bins = cast(NDArray[Any], result['bins'])
     pvalues = cast(NDArray[Any], result['pvalues'])
     return {
diff --git a/test/apiserver/module_tests/expected_squidpy5.json b/test/apiserver/module_tests/expected_squidpy5.json
index c8441a2a1..13d2fa340 100644
--- a/test/apiserver/module_tests/expected_squidpy5.json
+++ b/test/apiserver/module_tests/expected_squidpy5.json
@@ -1,12 +1,12 @@
 {
     "values": {
-        "lesion 0_1": 1.0,
-        "lesion 0_2": 1.0,
-        "lesion 0_3": 1.0,
-        "lesion 6_1": 1.0,
-        "lesion 6_2": 1.0,
-        "lesion 6_3": 1.0,
-        "lesion 6_4": 1.0
+        "lesion 0_1": 34.1,
+        "lesion 0_2": 36.5,
+        "lesion 0_3": 38.3,
+        "lesion 6_1": 52.4,
+        "lesion 6_2": 90.9,
+        "lesion 6_3": 205.5,
+        "lesion 6_4": 56.6
     },
     "is_pending": false
 }
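For context on the two behavioral changes above: `ripley_custom` accumulates one-sided empirical p-values, counting simulations whose statistic falls below the observed one and dropping the min(p, 1 - p) folding (the commented-out line), and `_summarize_ripley` then reports the p-value-weighted radius instead of the smallest p-value. A self-contained sketch with synthetic curves; all arrays below are invented for illustration and are not the test data:

    import numpy as np

    rng = np.random.default_rng(0)
    n_steps, n_simulations = 50, 100
    support = np.linspace(0.0, 200.0, n_steps)      # hypothetical radius support
    obs_stats = np.clip(support / 150.0, 0.0, 1.0)  # hypothetical observed F-type curve
    sim_stats = np.clip(support / 200.0, 0.0, 1.0) + rng.normal(0.0, 0.02, (n_simulations, n_steps))

    # One-sided empirical p-values, mirroring the patched accumulation: start at 1,
    # count simulations whose statistic falls below the observed one, normalize by
    # n_simulations + 1, and skip the old min(p, 1 - p) folding step.
    pvalues = (1 + (sim_stats < obs_stats).sum(axis=0)) / (n_simulations + 1)

    # Weighted radius scale reported as the feature value, rounded to one decimal.
    weighted_scale = round(float(np.inner(pvalues, support) / np.sum(pvalues)), 1)
    print(weighted_scale)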