diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f9637e5..3c8c5e19 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,15 @@ -## [1.x.x] - xxxx-xx-xx +## [1.0.12] - 2024-05-17 + +### Fix +- Fix polygon selection when no channel is provided +- Fix CosMX reader for proteins +- Fix FOV column issue for CosMX data (#65) + +### Added +- Check the columns of CosMX data to see if the correct export module was used + +### Changed +- Ensure categorical variables are used for patches clustering ## [1.0.11] - 2024-04-26 diff --git a/docs/api/embedding.md b/docs/api/embedding.md index 023255ad..68885e9e 100644 --- a/docs/api/embedding.md +++ b/docs/api/embedding.md @@ -1 +1,3 @@ ::: sopa.embedding.embed_wsi_patches + +::: sopa.embedding.cluster_embeddings diff --git a/docs/tutorials/api_usage.ipynb b/docs/tutorials/api_usage.ipynb index a1ee9f06..7cff0ba0 100644 --- a/docs/tutorials/api_usage.ipynb +++ b/docs/tutorials/api_usage.ipynb @@ -670,7 +670,7 @@ } ], "source": [ - "sopa.io.explorer.write(\"tuto.explorer\", sdata, image_key, points_key=points_key, gene_column=gene_column)" + "sopa.io.write(\"tuto.explorer\", sdata, image_key, points_key=points_key, gene_column=gene_column)" ] }, { @@ -805,7 +805,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index dbe2a0ea..8a94b851 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "sopa" -version = "1.0.11" +version = "1.0.12" description = "Spatial-omics pipeline and analysis" documentation = "https://gustaveroussy.github.io/sopa" homepage = "https://gustaveroussy.github.io/sopa" diff --git a/sopa/cli/app.py b/sopa/cli/app.py index 12ea3379..e0cbb4fc 100644 --- a/sopa/cli/app.py +++ b/sopa/cli/app.py @@ -147,7 +147,12 @@ def crop( sdata = read_zarr_standardized(sdata_path) polygon_selection( - sdata, intermediate_image, intermediate_polygon, list(channels), scale_factor, margin_ratio + sdata, + intermediate_image, + intermediate_polygon, + None if channels is None else list(channels), + scale_factor, + margin_ratio, ) diff --git a/sopa/embedding/cluster.py b/sopa/embedding/cluster.py index 4ed3b64d..dd86a1d0 100644 --- a/sopa/embedding/cluster.py +++ b/sopa/embedding/cluster.py @@ -59,5 +59,6 @@ def cluster_embeddings( embeddings = element.compute().data[:, ilocs[:, 1], ilocs[:, 0]].T gdf_patches[key_added] = method(embeddings, **method_kwargs) + gdf_patches[key_added] = gdf_patches[key_added].astype("category") return gdf_patches diff --git a/sopa/io/reader/cosmx.py b/sopa/io/reader/cosmx.py index efb62207..a1db9097 100644 --- a/sopa/io/reader/cosmx.py +++ b/sopa/io/reader/cosmx.py @@ -6,14 +6,16 @@ from typing import Optional import dask.array as da +import numpy as np import pandas as pd import tifffile +import xarray as xr from dask_image.imread import imread from spatialdata import SpatialData from spatialdata.models import Image2DModel, PointsModel from spatialdata_io._constants._constants import CosmxKeys -from .utils import _default_image_kwargs +from .utils import _deduplicate_c_coords, _default_image_kwargs log = logging.getLogger(__name__) @@ -51,7 +53,7 @@ def cosmx( image_models_kwargs, imread_kwargs = _default_image_kwargs(image_models_kwargs, imread_kwargs) dataset_id = _infer_dataset_id(path, dataset_id) - fov_locs = _read_cosmx_fov_locs(path, dataset_id) + fov_locs = _read_fov_locs(path, dataset_id) fov_id, fov = _check_fov_id(fov) protein_dir_dict = {} @@ -64,14 +66,16 @@ def cosmx( ### Read image(s) images_dir = _find_dir(path, "Morphology2D") + morphology_coords = _cosmx_morphology_coords(path) + if fov is None: - image, protein_names = _read_stitched_image( + image, c_coords = _read_stitched_image( images_dir, fov_locs, protein_dir_dict, + morphology_coords, **imread_kwargs, ) - image = image.rechunk(image_models_kwargs["chunks"]) image_name = "stitched_image" else: pattern = f"*{fov_id}.TIF" @@ -82,13 +86,11 @@ def cosmx( len(fov_files) == 1 ), f"Multiple files match the pattern {pattern}: {', '.join(fov_files)}" - image, protein_names = _read_fov_image( - images_dir / fov_files[0], protein_dir_dict.get(fov), **imread_kwargs + image, c_coords = _read_fov_image( + fov_files[0], protein_dir_dict.get(fov), morphology_coords, **imread_kwargs ) image_name = f"{fov}_image" - c_coords = _cosmx_c_coords(path, image.shape[0], protein_names) - parsed_image = Image2DModel.parse( image, dims=("c", "y", "x"), c_coords=c_coords, **image_models_kwargs ) @@ -97,7 +99,7 @@ def cosmx( return SpatialData(images={image_name: parsed_image}) ### Read transcripts - transcripts_data = _read_cosmx_csv(path, dataset_id) + transcripts_data = _read_transcripts_csv(path, dataset_id) if fov is None: transcripts_data["x"] = transcripts_data["x_global_px"] - fov_locs["xmin"].min() @@ -118,25 +120,12 @@ def cosmx( return SpatialData(images={image_name: parsed_image}, points={points_name: transcripts}) -def _read_fov_image( - morphology_path: Path, protein_path: Path | None, **imread_kwargs -) -> tuple[da.Array, list[str] | None]: - image = imread(morphology_path, **imread_kwargs) - - protein_names = None - if protein_path is not None: - protein_image, protein_names = _read_protein_fov(protein_path) - image = da.concatenate([image, protein_image], axis=0) - - return image, protein_names - - def _infer_dataset_id(path: Path, dataset_id: str | None) -> str: if isinstance(dataset_id, str): return dataset_id for suffix in [".csv", ".csv.gz"]: - counts_files = list(path.rglob(f"*_fov_positions_file{suffix}")) + counts_files = list(path.rglob(f"[!\\.]*_fov_positions_file{suffix}")) if len(counts_files) == 1: found = re.match(rf"(.*)_fov_positions_file{suffix}", str(counts_files[0])) @@ -148,7 +137,20 @@ def _infer_dataset_id(path: Path, dataset_id: str | None) -> str: ) -def _read_cosmx_fov_locs(path: Path, dataset_id: str) -> pd.DataFrame: +def _read_fov_image( + morphology_path: Path, protein_path: Path | None, morphology_coords: list[str], **imread_kwargs +) -> tuple[da.Array, list[str]]: + image = imread(morphology_path, **imread_kwargs) + + protein_names = [] + if protein_path is not None: + protein_image, protein_names = _read_protein_fov(protein_path) + image = da.concatenate([image, protein_image], axis=0) + + return image, _deduplicate_c_coords(morphology_coords + protein_names) + + +def _read_fov_locs(path: Path, dataset_id: str) -> pd.DataFrame: fov_file = path / f"{dataset_id}_fov_positions_file.csv" if not fov_file.exists(): @@ -156,7 +158,14 @@ def _read_cosmx_fov_locs(path: Path, dataset_id: str) -> pd.DataFrame: assert fov_file.exists(), f"Missing field of view file: {fov_file}" - fov_locs = pd.read_csv(fov_file, index_col=1) + fov_locs = pd.read_csv(fov_file) + + FOV_COLUMNS = ["X_mm", "Y_mm", "FOV"] + assert np.isin( + FOV_COLUMNS, fov_locs.columns + ).all(), f"The file {fov_file} must contain the following columns: {', '.join(FOV_COLUMNS)}. Consider using a different export module." + + fov_locs.index = fov_locs["FOV"] pixel_size = 0.120280945 # size of a pixel in microns @@ -170,21 +179,27 @@ def _read_cosmx_fov_locs(path: Path, dataset_id: str) -> pd.DataFrame: def _read_stitched_image( - images_dir: Path, fov_locs: pd.DataFrame, protein_dir_dict: dict, **imread_kwargs + images_dir: Path, + fov_locs: pd.DataFrame, + protein_dir_dict: dict, + morphology_coords: list[str], + **imread_kwargs, ) -> tuple[da.Array, list[str] | None]: log.warn("Image stitching is currently experimental") fov_images = {} - protein_names = None + c_coords_dict = {} pattern = re.compile(r".*_F(\d+)") for image_path in images_dir.iterdir(): if image_path.suffix == ".TIF": fov = int(pattern.findall(image_path.name)[0]) - image, protein_names = _read_fov_image( - image_path, protein_dir_dict.get(fov), **imread_kwargs + image, c_coords = _read_fov_image( + image_path, protein_dir_dict.get(fov), morphology_coords, **imread_kwargs ) + c_coords_dict[fov] = c_coords + fov_images[fov] = da.flip(image, axis=1) fov_locs.loc[fov, "xmax"] = fov_locs.loc[fov, "xmin"] + image.shape[2] @@ -195,16 +210,24 @@ def _read_stitched_image( fov_locs[f"{dim}0"] = (fov_locs[f"{dim}min"] - shift).round().astype(int) fov_locs[f"{dim}1"] = (fov_locs[f"{dim}max"] - shift).round().astype(int) + c_coords = list(set.union(*[set(names) for names in c_coords_dict.values()])) + stitched_image = da.zeros( - shape=(image.shape[0], fov_locs["y1"].max(), fov_locs["x1"].max()), dtype=image.dtype + shape=(len(c_coords), fov_locs["y1"].max(), fov_locs["x1"].max()), dtype=image.dtype ) + stitched_image = xr.DataArray(stitched_image, dims=("c", "y", "x"), coords={"c": c_coords}) for fov, im in fov_images.items(): xmin, xmax = fov_locs.loc[fov, "x0"], fov_locs.loc[fov, "x1"] ymin, ymax = fov_locs.loc[fov, "y0"], fov_locs.loc[fov, "y1"] - stitched_image[:, ymin:ymax, xmin:xmax] = im + stitched_image.loc[ + {"c": c_coords_dict[fov], "y": slice(ymin, ymax), "x": slice(xmin, xmax)} + ] = im + + if len(c_coords_dict[fov]) < len(c_coords): + log.warn(f"Missing channels ({len(c_coords) - len(c_coords_dict[fov])}) for FOV {fov}") - return stitched_image, protein_names + return stitched_image.data, c_coords def _check_fov_id(fov: str | int | None) -> tuple[str, int]: @@ -221,33 +244,29 @@ def _check_fov_id(fov: str | int | None) -> tuple[str, int]: return fov, int(fov[1:]) -def _read_cosmx_csv(path: Path, dataset_id: str) -> pd.DataFrame: +def _read_transcripts_csv(path: Path, dataset_id: str) -> pd.DataFrame: transcripts_file = path / f"{dataset_id}_tx_file.csv.gz" if transcripts_file.exists(): - return pd.read_csv(transcripts_file, compression="gzip") - - transcripts_file = path / f"{dataset_id}_tx_file.csv" + df = pd.read_csv(transcripts_file, compression="gzip") + else: + transcripts_file = path / f"{dataset_id}_tx_file.csv" + assert transcripts_file.exists(), f"Transcript file {transcripts_file} not found." + df = pd.read_csv(transcripts_file) - assert transcripts_file.exists(), f"Transcript file {transcripts_file} not found." + TRANSCRIPT_COLUMNS = ["x_global_px", "y_global_px", "target"] + assert np.isin( + TRANSCRIPT_COLUMNS, df.columns + ).all(), f"The file {transcripts_file} must contain the following columns: {', '.join(TRANSCRIPT_COLUMNS)}. Consider using a different export module." - return pd.read_csv(transcripts_file) + return df -def _cosmx_c_coords(path: Path, n_channels: int, protein_names: list[str] | None) -> list[str]: +def _cosmx_morphology_coords(path: Path) -> list[str]: channel_ids_path = path / "Morphology_ChannelID_Dictionary.txt" - if channel_ids_path.exists(): - channel_names = list(pd.read_csv(channel_ids_path, delimiter="\t")["BiologicalTarget"]) - else: - n_channels = n_channels - len(protein_names) if protein_names is not None else n_channels - channel_names = [str(i) for i in range(n_channels)] - log.warn(f"Channel file not found at {channel_ids_path}, using {channel_names=} instead.") - - if protein_names is not None: - channel_names += protein_names - - return channel_names + assert channel_ids_path.exists(), f"Channel file not found at {channel_ids_path}" + return list(pd.read_csv(channel_ids_path, delimiter="\t")["BiologicalTarget"]) def _find_dir(path: Path, name: str): diff --git a/sopa/io/reader/utils.py b/sopa/io/reader/utils.py index 82f3342a..f2b7042a 100644 --- a/sopa/io/reader/utils.py +++ b/sopa/io/reader/utils.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +from collections import defaultdict from pathlib import Path from typing import Callable @@ -36,6 +37,17 @@ def _deduplicate_names(df): return df[0].values +def _deduplicate_c_coords(c_coords: list[str]) -> list[str]: + counter, res = defaultdict(int), [] + for channel in c_coords: + if channel not in counter: + res.append(channel) + else: + res.append(f"{channel} ({counter[channel]})") + counter[channel] += 1 + return res + + def _get_files_stem(files: list[Path]): return [file.stem for file in files] diff --git a/sopa/utils/polygon_crop.py b/sopa/utils/polygon_crop.py index ebc8ff55..2e5df32f 100644 --- a/sopa/utils/polygon_crop.py +++ b/sopa/utils/polygon_crop.py @@ -40,7 +40,7 @@ def _prepare(sdata: SpatialData, channels: list[str], scale_factor: float): else: assert ( len(image.coords["c"]) in VALID_N_CHANNELS - ), f"Choose one or three channels among {image.c.values} by using the -c argument" + ), f"Choose one or three channels among {image.c.values} by using the --channels argument" log.info(f"Resizing image by a factor of {scale_factor}") return image_key, resize(image, scale_factor).compute()