Merge pull request #66 from gustaveroussy/dev

Dev
gustaveroussy · May 17, 2024 · 372969a · 372969a
2 parents 04af5c9 + 850213d
commit 372969a
Show file tree

Hide file tree

Showing 9 changed files with 106 additions and 56 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,15 @@
-## [1.x.x] - xxxx-xx-xx
+## [1.0.12] - 2024-05-17
+
+### Fix
+- Fix polygon selection when no channel is provided
+- Fix CosMX reader for proteins
+- Fix FOV column issue for CosMX data (#65)
+
+### Added
+- Check the columns of CosMX data to see if the correct export module was used
+
+### Changed
+- Ensure categorical variables are used for patches clustering
 
 ## [1.0.11] - 2024-04-26
 

diff --git a/docs/api/embedding.md b/docs/api/embedding.md
@@ -1 +1,3 @@
 ::: sopa.embedding.embed_wsi_patches
+
+::: sopa.embedding.cluster_embeddings
diff --git a/docs/tutorials/api_usage.ipynb b/docs/tutorials/api_usage.ipynb
@@ -670,7 +670,7 @@
     }
    ],
    "source": [
-    "sopa.io.explorer.write(\"tuto.explorer\", sdata, image_key, points_key=points_key, gene_column=gene_column)"
+    "sopa.io.write(\"tuto.explorer\", sdata, image_key, points_key=points_key, gene_column=gene_column)"
    ]
   },
   {
@@ -805,7 +805,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.10.13"
   }
  },
  "nbformat": 4,

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sopa"
-version = "1.0.11"
+version = "1.0.12"
 description = "Spatial-omics pipeline and analysis"
 documentation = "https://gustaveroussy.github.io/sopa"
 homepage = "https://gustaveroussy.github.io/sopa"

diff --git a/sopa/cli/app.py b/sopa/cli/app.py
@@ -147,7 +147,12 @@ def crop(
     sdata = read_zarr_standardized(sdata_path)
 
     polygon_selection(
-        sdata, intermediate_image, intermediate_polygon, list(channels), scale_factor, margin_ratio
+        sdata,
+        intermediate_image,
+        intermediate_polygon,
+        None if channels is None else list(channels),
+        scale_factor,
+        margin_ratio,
     )
 
 

diff --git a/sopa/embedding/cluster.py b/sopa/embedding/cluster.py
@@ -59,5 +59,6 @@ def cluster_embeddings(
     embeddings = element.compute().data[:, ilocs[:, 1], ilocs[:, 0]].T
 
     gdf_patches[key_added] = method(embeddings, **method_kwargs)
+    gdf_patches[key_added] = gdf_patches[key_added].astype("category")
 
     return gdf_patches
diff --git a/sopa/io/reader/cosmx.py b/sopa/io/reader/cosmx.py
@@ -6,14 +6,16 @@
 from typing import Optional
 
 import dask.array as da
+import numpy as np
 import pandas as pd
 import tifffile
+import xarray as xr
 from dask_image.imread import imread
 from spatialdata import SpatialData
 from spatialdata.models import Image2DModel, PointsModel
 from spatialdata_io._constants._constants import CosmxKeys
 
-from .utils import _default_image_kwargs
+from .utils import _deduplicate_c_coords, _default_image_kwargs
 
 log = logging.getLogger(__name__)
 
@@ -51,7 +53,7 @@ def cosmx(
     image_models_kwargs, imread_kwargs = _default_image_kwargs(image_models_kwargs, imread_kwargs)
 
     dataset_id = _infer_dataset_id(path, dataset_id)
-    fov_locs = _read_cosmx_fov_locs(path, dataset_id)
+    fov_locs = _read_fov_locs(path, dataset_id)
     fov_id, fov = _check_fov_id(fov)
 
     protein_dir_dict = {}
@@ -64,14 +66,16 @@ def cosmx(
 
     ### Read image(s)
     images_dir = _find_dir(path, "Morphology2D")
+    morphology_coords = _cosmx_morphology_coords(path)
+
     if fov is None:
-        image, protein_names = _read_stitched_image(
+        image, c_coords = _read_stitched_image(
             images_dir,
             fov_locs,
             protein_dir_dict,
+            morphology_coords,
             **imread_kwargs,
         )
-        image = image.rechunk(image_models_kwargs["chunks"])
         image_name = "stitched_image"
     else:
         pattern = f"*{fov_id}.TIF"
@@ -82,13 +86,11 @@ def cosmx(
             len(fov_files) == 1
         ), f"Multiple files match the pattern {pattern}: {', '.join(fov_files)}"
 
-        image, protein_names = _read_fov_image(
-            images_dir / fov_files[0], protein_dir_dict.get(fov), **imread_kwargs
+        image, c_coords = _read_fov_image(
+            fov_files[0], protein_dir_dict.get(fov), morphology_coords, **imread_kwargs
         )
         image_name = f"{fov}_image"
 
-    c_coords = _cosmx_c_coords(path, image.shape[0], protein_names)
-
     parsed_image = Image2DModel.parse(
         image, dims=("c", "y", "x"), c_coords=c_coords, **image_models_kwargs
     )
@@ -97,7 +99,7 @@ def cosmx(
         return SpatialData(images={image_name: parsed_image})
 
     ### Read transcripts
-    transcripts_data = _read_cosmx_csv(path, dataset_id)
+    transcripts_data = _read_transcripts_csv(path, dataset_id)
 
     if fov is None:
         transcripts_data["x"] = transcripts_data["x_global_px"] - fov_locs["xmin"].min()
@@ -118,25 +120,12 @@ def cosmx(
     return SpatialData(images={image_name: parsed_image}, points={points_name: transcripts})
 
 
-def _read_fov_image(
-    morphology_path: Path, protein_path: Path | None, **imread_kwargs
-) -> tuple[da.Array, list[str] | None]:
-    image = imread(morphology_path, **imread_kwargs)
-
-    protein_names = None
-    if protein_path is not None:
-        protein_image, protein_names = _read_protein_fov(protein_path)
-        image = da.concatenate([image, protein_image], axis=0)
-
-    return image, protein_names
-
-
 def _infer_dataset_id(path: Path, dataset_id: str | None) -> str:
     if isinstance(dataset_id, str):
         return dataset_id
 
     for suffix in [".csv", ".csv.gz"]:
-        counts_files = list(path.rglob(f"*_fov_positions_file{suffix}"))
+        counts_files = list(path.rglob(f"[!\\.]*_fov_positions_file{suffix}"))
 
         if len(counts_files) == 1:
             found = re.match(rf"(.*)_fov_positions_file{suffix}", str(counts_files[0]))
@@ -148,15 +137,35 @@ def _infer_dataset_id(path: Path, dataset_id: str | None) -> str:
     )
 
 
-def _read_cosmx_fov_locs(path: Path, dataset_id: str) -> pd.DataFrame:
+def _read_fov_image(
+    morphology_path: Path, protein_path: Path | None, morphology_coords: list[str], **imread_kwargs
+) -> tuple[da.Array, list[str]]:
+    image = imread(morphology_path, **imread_kwargs)
+
+    protein_names = []
+    if protein_path is not None:
+        protein_image, protein_names = _read_protein_fov(protein_path)
+        image = da.concatenate([image, protein_image], axis=0)
+
+    return image, _deduplicate_c_coords(morphology_coords + protein_names)
+
+
+def _read_fov_locs(path: Path, dataset_id: str) -> pd.DataFrame:
     fov_file = path / f"{dataset_id}_fov_positions_file.csv"
 
     if not fov_file.exists():
         fov_file = path / f"{dataset_id}_fov_positions_file.csv.gz"
 
     assert fov_file.exists(), f"Missing field of view file: {fov_file}"
 
-    fov_locs = pd.read_csv(fov_file, index_col=1)
+    fov_locs = pd.read_csv(fov_file)
+
+    FOV_COLUMNS = ["X_mm", "Y_mm", "FOV"]
+    assert np.isin(
+        FOV_COLUMNS, fov_locs.columns
+    ).all(), f"The file {fov_file} must contain the following columns: {', '.join(FOV_COLUMNS)}. Consider using a different export module."
+
+    fov_locs.index = fov_locs["FOV"]
 
     pixel_size = 0.120280945  # size of a pixel in microns
 
@@ -170,21 +179,27 @@ def _read_cosmx_fov_locs(path: Path, dataset_id: str) -> pd.DataFrame:
 
 
 def _read_stitched_image(
-    images_dir: Path, fov_locs: pd.DataFrame, protein_dir_dict: dict, **imread_kwargs
+    images_dir: Path,
+    fov_locs: pd.DataFrame,
+    protein_dir_dict: dict,
+    morphology_coords: list[str],
+    **imread_kwargs,
 ) -> tuple[da.Array, list[str] | None]:
     log.warn("Image stitching is currently experimental")
 
     fov_images = {}
-    protein_names = None
+    c_coords_dict = {}
     pattern = re.compile(r".*_F(\d+)")
     for image_path in images_dir.iterdir():
         if image_path.suffix == ".TIF":
             fov = int(pattern.findall(image_path.name)[0])
 
-            image, protein_names = _read_fov_image(
-                image_path, protein_dir_dict.get(fov), **imread_kwargs
+            image, c_coords = _read_fov_image(
+                image_path, protein_dir_dict.get(fov), morphology_coords, **imread_kwargs
             )
 
+            c_coords_dict[fov] = c_coords
+
             fov_images[fov] = da.flip(image, axis=1)
 
             fov_locs.loc[fov, "xmax"] = fov_locs.loc[fov, "xmin"] + image.shape[2]
@@ -195,16 +210,24 @@ def _read_stitched_image(
         fov_locs[f"{dim}0"] = (fov_locs[f"{dim}min"] - shift).round().astype(int)
         fov_locs[f"{dim}1"] = (fov_locs[f"{dim}max"] - shift).round().astype(int)
 
+    c_coords = list(set.union(*[set(names) for names in c_coords_dict.values()]))
+
     stitched_image = da.zeros(
-        shape=(image.shape[0], fov_locs["y1"].max(), fov_locs["x1"].max()), dtype=image.dtype
+        shape=(len(c_coords), fov_locs["y1"].max(), fov_locs["x1"].max()), dtype=image.dtype
     )
+    stitched_image = xr.DataArray(stitched_image, dims=("c", "y", "x"), coords={"c": c_coords})
 
     for fov, im in fov_images.items():
         xmin, xmax = fov_locs.loc[fov, "x0"], fov_locs.loc[fov, "x1"]
         ymin, ymax = fov_locs.loc[fov, "y0"], fov_locs.loc[fov, "y1"]
-        stitched_image[:, ymin:ymax, xmin:xmax] = im
+        stitched_image.loc[
+            {"c": c_coords_dict[fov], "y": slice(ymin, ymax), "x": slice(xmin, xmax)}
+        ] = im
+
+        if len(c_coords_dict[fov]) < len(c_coords):
+            log.warn(f"Missing channels ({len(c_coords) - len(c_coords_dict[fov])}) for FOV {fov}")
 
-    return stitched_image, protein_names
+    return stitched_image.data, c_coords
 
 
 def _check_fov_id(fov: str | int | None) -> tuple[str, int]:
@@ -221,33 +244,29 @@ def _check_fov_id(fov: str | int | None) -> tuple[str, int]:
     return fov, int(fov[1:])
 
 
-def _read_cosmx_csv(path: Path, dataset_id: str) -> pd.DataFrame:
+def _read_transcripts_csv(path: Path, dataset_id: str) -> pd.DataFrame:
     transcripts_file = path / f"{dataset_id}_tx_file.csv.gz"
 
     if transcripts_file.exists():
-        return pd.read_csv(transcripts_file, compression="gzip")
-
-    transcripts_file = path / f"{dataset_id}_tx_file.csv"
+        df = pd.read_csv(transcripts_file, compression="gzip")
+    else:
+        transcripts_file = path / f"{dataset_id}_tx_file.csv"
+        assert transcripts_file.exists(), f"Transcript file {transcripts_file} not found."
+        df = pd.read_csv(transcripts_file)
 
-    assert transcripts_file.exists(), f"Transcript file {transcripts_file} not found."
+    TRANSCRIPT_COLUMNS = ["x_global_px", "y_global_px", "target"]
+    assert np.isin(
+        TRANSCRIPT_COLUMNS, df.columns
+    ).all(), f"The file {transcripts_file} must contain the following columns: {', '.join(TRANSCRIPT_COLUMNS)}. Consider using a different export module."
 
-    return pd.read_csv(transcripts_file)
+    return df
 
 
-def _cosmx_c_coords(path: Path, n_channels: int, protein_names: list[str] | None) -> list[str]:
+def _cosmx_morphology_coords(path: Path) -> list[str]:
     channel_ids_path = path / "Morphology_ChannelID_Dictionary.txt"
 
-    if channel_ids_path.exists():
-        channel_names = list(pd.read_csv(channel_ids_path, delimiter="\t")["BiologicalTarget"])
-    else:
-        n_channels = n_channels - len(protein_names) if protein_names is not None else n_channels
-        channel_names = [str(i) for i in range(n_channels)]
-        log.warn(f"Channel file not found at {channel_ids_path}, using {channel_names=} instead.")
-
-    if protein_names is not None:
-        channel_names += protein_names
-
-    return channel_names
+    assert channel_ids_path.exists(), f"Channel file not found at {channel_ids_path}"
+    return list(pd.read_csv(channel_ids_path, delimiter="\t")["BiologicalTarget"])
 
 
 def _find_dir(path: Path, name: str):

diff --git a/sopa/io/reader/utils.py b/sopa/io/reader/utils.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import logging
+from collections import defaultdict
 from pathlib import Path
 from typing import Callable
 
@@ -36,6 +37,17 @@ def _deduplicate_names(df):
     return df[0].values
 
 
+def _deduplicate_c_coords(c_coords: list[str]) -> list[str]:
+    counter, res = defaultdict(int), []
+    for channel in c_coords:
+        if channel not in counter:
+            res.append(channel)
+        else:
+            res.append(f"{channel} ({counter[channel]})")
+        counter[channel] += 1
+    return res
+
+
 def _get_files_stem(files: list[Path]):
     return [file.stem for file in files]
 

diff --git a/sopa/utils/polygon_crop.py b/sopa/utils/polygon_crop.py
@@ -40,7 +40,7 @@ def _prepare(sdata: SpatialData, channels: list[str], scale_factor: float):
     else:
         assert (
             len(image.coords["c"]) in VALID_N_CHANNELS
-        ), f"Choose one or three channels among {image.c.values} by using the -c argument"
+        ), f"Choose one or three channels among {image.c.values} by using the --channels argument"
 
     log.info(f"Resizing image by a factor of {scale_factor}")
     return image_key, resize(image, scale_factor).compute()
Original file line number	Diff line number	Diff line change
		@@ -1 +1,3 @@
		::: sopa.embedding.embed_wsi_patches

		::: sopa.embedding.cluster_embeddings