gustaveroussy · quentinblampey · Jul 4, 2024 · May 25, 2024 · Jun 5, 2024 · Jun 5, 2024
diff --git a/docs/cli.md b/docs/cli.md
@@ -347,12 +347,13 @@ $ sopa patchify [OPTIONS] COMMAND [ARGS]...
 
 **Commands**:
 
-* `baysor`: Prepare the patches for Baysor segmentation
+* `baysor`: Prepare patches for transcript-based segmentation with Baysor
+* `comseg`: Prepare patches for transcript-based segmentation with ComSeg
 * `image`: Prepare patches for staining-based...
 
 #### `sopa patchify baysor`
 
-Prepare the patches for Baysor segmentation
+Prepare patches for transcript-based segmentation with Baysor
 
 **Usage**:
 
@@ -376,6 +377,31 @@ $ sopa patchify baysor [OPTIONS] SDATA_PATH
 * `--use-prior / --no-use-prior`: Whether to use cellpose segmentation as a prior for baysor (if True, make sure to first run Cellpose)  [default: no-use-prior]
 * `--help`: Show this message and exit.
 
+#### `sopa patchify comseg`
+
+Prepare patches for transcript-based segmentation with ComSeg
+
+**Usage**:
+
+```console
+$ sopa patchify comseg [OPTIONS] SDATA_PATH
+```
+
+**Arguments**:
+
+* `SDATA_PATH`: Path to the SpatialData `.zarr` directory  [required]
+
+**Options**:
+
+* `--patch-width-microns FLOAT`: Width (and height) of each patch in microns  [required]
+* `--patch-overlap-microns FLOAT`: Number of overlapping microns between the patches. We advise to choose approximately twice the diameter of a cell  [required]
+* `--baysor-temp-dir TEXT`: Temporary directory where ComSeg inputs and outputs will be saved. By default, uses `.sopa_cache/comseg_boundaries`
+* `--config-path TEXT`: Path to the ComSeg JSON config file (you can also directly provide the argument via the `config` option)
+* `--config TEXT`: Dictionary of ComSeg parameters  [default: {}]
+* `--cell-key TEXT`: Optional column of the transcripts dataframe that indicates in which cell-id each transcript is, in order to use prior segmentation. Default is 'cell' if cell_key=None
+* `--unassigned-value INTEGER`: If --cell-key is provided, this is the value given to transcripts that are not inside any cell (if it's already 0, don't provide this argument)
+* `--help`: Show this message and exit.
+
 #### `sopa patchify image`
 
 Prepare patches for staining-based segmentation (including Cellpose)
@@ -500,6 +526,28 @@ $ sopa resolve cellpose [OPTIONS] SDATA_PATH
 * `--patch-dir TEXT`: Directory containing the cellpose segmentation on patches (or multiple directories if using multi-step segmentation). By default, uses the `.sopa_cache/cellpose_boundaries` directory
 * `--help`: Show this message and exit.
 
+#### `sopa resolve comseg`
+
+Resolve patches conflicts after ComSeg segmentation. Provide either `--comseg-temp-dir` or `--patches-dirs`
+
+**Usage**:
+
+```console
+$ sopa resolve comseg [OPTIONS] SDATA_PATH
+```
+
+**Arguments**:
+
+* `SDATA_PATH`: Path to the SpatialData `.zarr` directory  [required]
+
+**Options**:
+
+* `--gene-column TEXT`: Column of the transcripts dataframe containing the genes names  [required]
+* `--comseg-temp-dir TEXT`: Path to the directory containing all the comseg patches (see `sopa patchify`). By default, uses the `.sopa_cache/comseg_boundaries` directory
+* `--min-area FLOAT`: Cells with an area less than this value (in microns^2) will be filtered  [default: 0]
+* `--patches-dirs TEXT`: List of patches directories inside `comseg_temp_dir`
+* `--help`: Show this message and exit.
+
 #### `sopa resolve generic`
 
 Resolve patches conflicts after generic segmentation
@@ -537,6 +585,7 @@ $ sopa segmentation [OPTIONS] COMMAND [ARGS]...
 **Commands**:
 
 * `cellpose`: Perform cellpose segmentation.
+* `comseg`: Perform ComSeg segmentation.
 * `generic-staining`: Perform generic staining-based segmentation.
 
 #### `sopa segmentation cellpose`
@@ -575,6 +624,31 @@ $ sopa segmentation cellpose [OPTIONS] SDATA_PATH
 * `--method-kwargs TEXT`: Kwargs for the cellpose method builder. This should be a dictionnary, in inline string format.  [default: {}]
 * `--help`: Show this message and exit.
 
+#### `sopa segmentation comseg`
+
+Perform ComSeg segmentation. This can be done on all patches directly, or on one individual patch.
+
+!!! note "Usage"
+
+    - [On one patch] Use this mode to run patches in parallel. Provide `--patch-index` to run one patch, and execute all patches in a parallel manner (you need to define your own parallelization, else, use the Snakemake pipeline).
+
+    - [On all patches at once] For small images, you can run the segmentation method sequentially (`--patch-index` is not needed)
+
+**Usage**:
+
+```console
+$ sopa segmentation comseg [OPTIONS] SDATA_PATH
+```
+
+**Arguments**:
+
+* `SDATA_PATH`: Path to the SpatialData `.zarr` directory  [required]
+
+**Options**:
+
+* `--patch-dir TEXT`: Path to the temporary comseg directory inside which we will store each individual patch segmentation. By default, saves into the `.sopa_cache/comseg_boundaries` directory
+* `--patch-index INTEGER`: Index of the patch on which ComSeg should be run. NB: the number of patches is `len(sdata['sopa_patches'])`
+
 #### `sopa segmentation generic-staining`
 
 Perform generic staining-based segmentation. This can be done on all patches directly, or on one individual patch.

diff --git a/docs/tutorials/cli_other_segmentation.md b/docs/tutorials/cli_other_segmentation.md
@@ -0,0 +1,76 @@
+
+### Option 3: ComSeg
+
+
+[ComSeg](https://github.com/fish-quant/ComSeg) is a transcript-based segmentation method. It uses a segmentation prior (here, Cellpose) and improves it using the transcript information.
+
+#### Run Cellpose to segment nuclei
+
+```
+sopa patchify image tuto.zarr --patch-width-pixel 1500 --patch-overlap-pixel 50
+sopa segmentation cellpose tuto.zarr --channels DAPI --diameter 35 --min-area 2000
+sopa resolve cellpose tuto.zarr
+```
+
+#### Save a ComSeg config file as `config.json`
+More information on the parameters can be found in the [ComSeg documentation](https://comseg.readthedocs.io/en/latest/userguide/Minimal_example.html).
+Below we display a minimal example of a ComSeg config file.
+
+
+```json
+{"dict_scale": {"x": 1, "y": 1, "z": 1},
+"mean_cell_diameter": 15,
+"max_cell_radius": 50,
+"alpha": 0.5,
+"min_rna_per_cell": 5,
+"gene_column": "genes"}
+```
+
+####  Run ComSeg with the sopa command line tool
+
+1) Create the ComSeg patches
+On the toy dataset, we will generate 4 patches.
+```
+sopa patchify comseg tuto.zarr --config-path config.json --patch-width-microns 200 --patch-overlap-microns 50
+```
+
+2) Run ComSeg on all patches
+
+!!! tip
+    Manually running the commands below can involve using many consecutive commands, so we recommend automating it. For instance, this can be done using Snakemake or Nextflow. This will help you parallelize it since you can run each task on separate jobs or using multithreading. You can also see how we do it in the [Sopa Snakemake pipeline](https://github.com/gustaveroussy/sopa/blob/master/workflow/Snakefile).
+
+    To automatically get the number of patches, you can open the `tuto.zarr/.sopa_cache/patches_file_comseg` file. This lists the names of the directories inside `tuto.zarr/.sopa_cache/comseg_boundaries` related to each patch. If you selected an ROI, the excluded patches are not listed in the `patches_file_comseg` file.
+
+=== "Patch 0"
+    ```sh
+    # Run this from the directory that contains tuto.zarr
+
+    # --patch-index selects the patch to segment (one command per patch)
+    sopa segmentation comseg tuto.zarr --patch-index 0
+    ```
+=== "Patch 1"
+    ```sh
+    # Run this from the directory that contains tuto.zarr
+
+    # --patch-index selects the patch to segment (one command per patch)
+    sopa segmentation comseg tuto.zarr --patch-index 1
+    ```
+=== "Patch 2"
+    ```sh
+    # Run this from the directory that contains tuto.zarr
+
+    # --patch-index selects the patch to segment (one command per patch)
+    sopa segmentation comseg tuto.zarr --patch-index 2
+    ```
+=== "Patch 3"
+    ```sh
+    # Run this from the directory that contains tuto.zarr
+
+    # --patch-index selects the patch to segment (one command per patch)
+    sopa segmentation comseg tuto.zarr --patch-index 3
+    ```
+
+3) Merge the results
+```sh
+sopa resolve comseg tuto.zarr --gene-column genes
+```
diff --git a/sopa/_constants.py b/sopa/_constants.py
@@ -51,6 +51,7 @@ class SopaFiles:
     SOPA_CACHE_DIR = ".sopa_cache"
     PATCHES_FILE_IMAGE = "patches_file_image"
     PATCHES_DIRS_BAYSOR = "patches_file_baysor"
+    PATCHES_DIRS_COMSEG = "patches_file_comseg"
     TRANSCRIPTS_FILE = "transcripts.csv"
     CENTROIDS_FILE = "centroids.csv"
     JSON_CONFIG_FILE = "config.json"

diff --git a/sopa/cli/patchify.py b/sopa/cli/patchify.py
@@ -4,6 +4,7 @@
 
 import typer
 
+from .._constants import SopaKeys
 from .utils import SDATA_HELPER
 
 app_patchify = typer.Typer()
@@ -59,7 +60,8 @@ def baysor(
     ),
     cell_key: str = typer.Option(
         None,
-        help="Optional column of the transcripts dataframe that indicates in which cell-id each transcript is, in order to use prior segmentation",
+        help="Optional column of the transcripts dataframe that indicates in which cell-id each transcript is, in order to use prior segmentation"
+        f" Default is '{SopaKeys.DEFAULT_CELL_KEY}' if cell_key=None",
     ),
     unassigned_value: int = typer.Option(
         None,
@@ -70,7 +72,108 @@ def baysor(
         help="Whether to use cellpose segmentation as a prior for baysor (if True, make sure to first run Cellpose)",
     ),
 ):
-    """Prepare the patches for Baysor segmentation"""
+    """Prepare patches for transcript-based segmentation with baysor"""
+    return transcript_segmentation(
+        sdata_path=sdata_path,
+        method="baysor",
+        patch_width_microns=patch_width_microns,
+        patch_overlap_microns=patch_overlap_microns,
+        temp_dir=baysor_temp_dir,
+        config_path=config_path,
+        config=config,
+        cell_key=cell_key,
+        unassigned_value=unassigned_value,
+        use_prior=use_prior,
+    )
+
+
+@app_patchify.command()
+def comseg(
+    sdata_path: str = typer.Argument(help=SDATA_HELPER),
+    patch_width_microns: float = typer.Option(help="Width (and height) of each patch in microns"),
+    patch_overlap_microns: float = typer.Option(
+        help="Number of overlapping microns between the patches. We advise to choose approximately twice the diameter of a cell"
+    ),
+    baysor_temp_dir: str = typer.Option(
+        None,
+        help="Temporary directory where baysor inputs and outputs will be saved. By default, uses `.sopa_cache/comseg_boundaries`",
+    ),
+    config_path: str = typer.Option(
+        None,
+        help="Path to the ComSeg json config file (you can also directly provide the argument via the `config` option)",
+    ),
+    config: str = typer.Option(
+        default={},
+        callback=ast.literal_eval,
+        help="Dictionnary of ComSeg parameters",
+    ),
+    cell_key: str = typer.Option(
+        None,
+        help="Optional column of the transcripts dataframe that indicates in which cell-id each transcript is, in order to use prior segmentation."
+        f" Default is {SopaKeys.DEFAULT_CELL_KEY} if cell_key=None",
+    ),
+    unassigned_value: int = typer.Option(
+        None,
+        help="If --cell-key is provided, this is the value given to transcripts that are not inside any cell (if it's already 0, don't provide this argument)",
+    ),
+):
+    """Prepare patches for transcript-based segmentation with ComSeg"""
+
+    return transcript_segmentation(
+        sdata_path=sdata_path,
+        method="comseg",
+        patch_width_microns=patch_width_microns,
+        patch_overlap_microns=patch_overlap_microns,
+        temp_dir=baysor_temp_dir,
+        config_path=config_path,
+        config=config,
+        cell_key=cell_key,
+        unassigned_value=unassigned_value,
+        use_prior=True,
+    )
+
+
+@app_patchify.command()
+def transcript_segmentation(
+    sdata_path: str = typer.Argument(help=SDATA_HELPER),
+    method: str = typer.Option(
+        "baysor",
+        help="Name of the method to use, choose in ['baysor', 'comseg']. for ComSeg, make sure to first run Cellpose or "
+        f"manually add the segmentation boundaries to the sdata.shapes as {SopaKeys.CELLPOSE_BOUNDARIES} key",
+    ),
+    patch_width_microns: float = typer.Option(help="Width (and height) of each patch in microns"),
+    patch_overlap_microns: float = typer.Option(
+        help="Number of overlapping microns between the patches. We advise to choose approximately twice the diameter of a cell"
+    ),
+    temp_dir: str = typer.Option(
+        None,
+        help="Temporary directory where baysor inputs and outputs will be saved. By default, uses `.sopa_cache/baysor_boundaries`",
+    ),
+    config_path: str = typer.Option(
+        None,
+        help="Path to the baysor config (you can also directly provide the argument via the `config` option)",
+    ),
+    config: str = typer.Option(
+        default={},
+        callback=ast.literal_eval,
+        help="Dictionnary of baysor parameters",
+    ),
+    cell_key: str = typer.Option(
+        None,
+        help="Optional column of the transcripts dataframe that indicates in which cell-id each transcript is, in order to use prior segmentation. "
+        f" Default is {SopaKeys.DEFAULT_CELL_KEY} if cell_key=None",
+    ),
+    unassigned_value: int = typer.Option(
+        None,
+        help="If --cell-key is provided, this is the value given to transcripts that are not inside any cell (if it's already 0, don't provide this argument)",
+    ),
+    use_prior: bool = typer.Option(
+        False,
+        help="Whether to use cellpose segmentation as a prior for baysor and comseg (if True, make sure to first run Cellpose or "
+        f"manually add the segmentation boundaries to the sdata.shapes as {SopaKeys.CELLPOSE_BOUNDARIES} key)",
+    ),
+):
+    """Prepare patches for transcript-based segmentation for the different available methods (baysor, comseg)"""
     from sopa._constants import SopaFiles, SopaKeys
     from sopa._sdata import get_key
     from sopa.io.standardize import read_zarr_standardized, sanity_check
@@ -83,18 +186,38 @@ def baysor(
 
     assert (
         config or config_path is not None
-    ), "Provide '--config-path', the path to a Baysor config file (toml)"
-
-    if baysor_temp_dir is None:
-        baysor_temp_dir = _default_boundary_dir(sdata_path, SopaKeys.BAYSOR_BOUNDARIES)
+    ), "Provide '--config-path', the path to a Baysor config file (toml) or comseg file (jsons)"
+    assert method in ["baysor", "comseg"], "method must be either 'baysor' or 'comseg'"
+
+    if temp_dir is None:
+        if method == "baysor":
+            temp_dir = _default_boundary_dir(sdata_path, SopaKeys.BAYSOR_BOUNDARIES)
+            filename = SopaFiles.PATCHES_DIRS_BAYSOR
+            config_name = SopaFiles.TOML_CONFIG_FILE
+        elif method == "comseg":
+            temp_dir = _default_boundary_dir(sdata_path, SopaKeys.COMSEG_BOUNDARIES)
+            filename = SopaFiles.PATCHES_DIRS_COMSEG
+            config_name = SopaFiles.JSON_CONFIG_FILE
+        else:
+            raise ValueError("method must be either 'baysor' or 'comseg'")
 
     df_key = get_key(sdata, "points")
     patches = Patches2D(sdata, df_key, patch_width_microns, patch_overlap_microns)
+    if method == "comseg":
+        patches.patchify_centroids(temp_dir)
+        assert (
+            use_prior
+        ), "For ComSeg, you must use the prior segmentation of nuclei or from other staining"
     valid_indices = patches.patchify_transcripts(
-        baysor_temp_dir, cell_key, unassigned_value, use_prior, config, config_path
+        temp_dir,
+        cell_key,
+        unassigned_value,
+        use_prior,
+        config,
+        config_path,
+        config_name=config_name,
     )
-
-    _save_cache(sdata_path, SopaFiles.PATCHES_DIRS_BAYSOR, "\n".join(map(str, valid_indices)))
+    _save_cache(sdata_path, filename, "\n".join(map(str, valid_indices)))
 
 
 def _save_cache(sdata_path: str, filename: str, content: str):