Skip to content

Commit

Permalink
Merge pull request #66 from gustaveroussy/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
quentinblampey authored May 17, 2024
2 parents 04af5c9 + 850213d commit 372969a
Show file tree
Hide file tree
Showing 9 changed files with 106 additions and 56 deletions.
13 changes: 12 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,15 @@
## [1.x.x] - xxxx-xx-xx
## [1.0.12] - 2024-05-17

### Fix
- Fix polygon selection when no channel is provided
- Fix CosMX reader for proteins
- Fix FOV column issue for CosMX data (#65)

### Added
- Check the columns of CosMX data to see if the correct export module was used

### Changed
- Ensure categorical variables are used for patches clustering

## [1.0.11] - 2024-04-26

Expand Down
2 changes: 2 additions & 0 deletions docs/api/embedding.md
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
::: sopa.embedding.embed_wsi_patches

::: sopa.embedding.cluster_embeddings
4 changes: 2 additions & 2 deletions docs/tutorials/api_usage.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -670,7 +670,7 @@
}
],
"source": [
"sopa.io.explorer.write(\"tuto.explorer\", sdata, image_key, points_key=points_key, gene_column=gene_column)"
"sopa.io.write(\"tuto.explorer\", sdata, image_key, points_key=points_key, gene_column=gene_column)"
]
},
{
Expand Down Expand Up @@ -805,7 +805,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.10.13"
}
},
"nbformat": 4,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "sopa"
version = "1.0.11"
version = "1.0.12"
description = "Spatial-omics pipeline and analysis"
documentation = "https://gustaveroussy.github.io/sopa"
homepage = "https://gustaveroussy.github.io/sopa"
Expand Down
7 changes: 6 additions & 1 deletion sopa/cli/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,12 @@ def crop(
sdata = read_zarr_standardized(sdata_path)

polygon_selection(
sdata, intermediate_image, intermediate_polygon, list(channels), scale_factor, margin_ratio
sdata,
intermediate_image,
intermediate_polygon,
None if channels is None else list(channels),
scale_factor,
margin_ratio,
)


Expand Down
1 change: 1 addition & 0 deletions sopa/embedding/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,5 +59,6 @@ def cluster_embeddings(
embeddings = element.compute().data[:, ilocs[:, 1], ilocs[:, 0]].T

gdf_patches[key_added] = method(embeddings, **method_kwargs)
gdf_patches[key_added] = gdf_patches[key_added].astype("category")

return gdf_patches
119 changes: 69 additions & 50 deletions sopa/io/reader/cosmx.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,16 @@
from typing import Optional

import dask.array as da
import numpy as np
import pandas as pd
import tifffile
import xarray as xr
from dask_image.imread import imread
from spatialdata import SpatialData
from spatialdata.models import Image2DModel, PointsModel
from spatialdata_io._constants._constants import CosmxKeys

from .utils import _default_image_kwargs
from .utils import _deduplicate_c_coords, _default_image_kwargs

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -51,7 +53,7 @@ def cosmx(
image_models_kwargs, imread_kwargs = _default_image_kwargs(image_models_kwargs, imread_kwargs)

dataset_id = _infer_dataset_id(path, dataset_id)
fov_locs = _read_cosmx_fov_locs(path, dataset_id)
fov_locs = _read_fov_locs(path, dataset_id)
fov_id, fov = _check_fov_id(fov)

protein_dir_dict = {}
Expand All @@ -64,14 +66,16 @@ def cosmx(

### Read image(s)
images_dir = _find_dir(path, "Morphology2D")
morphology_coords = _cosmx_morphology_coords(path)

if fov is None:
image, protein_names = _read_stitched_image(
image, c_coords = _read_stitched_image(
images_dir,
fov_locs,
protein_dir_dict,
morphology_coords,
**imread_kwargs,
)
image = image.rechunk(image_models_kwargs["chunks"])
image_name = "stitched_image"
else:
pattern = f"*{fov_id}.TIF"
Expand All @@ -82,13 +86,11 @@ def cosmx(
len(fov_files) == 1
), f"Multiple files match the pattern {pattern}: {', '.join(fov_files)}"

image, protein_names = _read_fov_image(
images_dir / fov_files[0], protein_dir_dict.get(fov), **imread_kwargs
image, c_coords = _read_fov_image(
fov_files[0], protein_dir_dict.get(fov), morphology_coords, **imread_kwargs
)
image_name = f"{fov}_image"

c_coords = _cosmx_c_coords(path, image.shape[0], protein_names)

parsed_image = Image2DModel.parse(
image, dims=("c", "y", "x"), c_coords=c_coords, **image_models_kwargs
)
Expand All @@ -97,7 +99,7 @@ def cosmx(
return SpatialData(images={image_name: parsed_image})

### Read transcripts
transcripts_data = _read_cosmx_csv(path, dataset_id)
transcripts_data = _read_transcripts_csv(path, dataset_id)

if fov is None:
transcripts_data["x"] = transcripts_data["x_global_px"] - fov_locs["xmin"].min()
Expand All @@ -118,25 +120,12 @@ def cosmx(
return SpatialData(images={image_name: parsed_image}, points={points_name: transcripts})


def _read_fov_image(
morphology_path: Path, protein_path: Path | None, **imread_kwargs
) -> tuple[da.Array, list[str] | None]:
image = imread(morphology_path, **imread_kwargs)

protein_names = None
if protein_path is not None:
protein_image, protein_names = _read_protein_fov(protein_path)
image = da.concatenate([image, protein_image], axis=0)

return image, protein_names


def _infer_dataset_id(path: Path, dataset_id: str | None) -> str:
if isinstance(dataset_id, str):
return dataset_id

for suffix in [".csv", ".csv.gz"]:
counts_files = list(path.rglob(f"*_fov_positions_file{suffix}"))
counts_files = list(path.rglob(f"[!\\.]*_fov_positions_file{suffix}"))

if len(counts_files) == 1:
found = re.match(rf"(.*)_fov_positions_file{suffix}", str(counts_files[0]))
Expand All @@ -148,15 +137,35 @@ def _infer_dataset_id(path: Path, dataset_id: str | None) -> str:
)


def _read_cosmx_fov_locs(path: Path, dataset_id: str) -> pd.DataFrame:
def _read_fov_image(
morphology_path: Path, protein_path: Path | None, morphology_coords: list[str], **imread_kwargs
) -> tuple[da.Array, list[str]]:
image = imread(morphology_path, **imread_kwargs)

protein_names = []
if protein_path is not None:
protein_image, protein_names = _read_protein_fov(protein_path)
image = da.concatenate([image, protein_image], axis=0)

return image, _deduplicate_c_coords(morphology_coords + protein_names)


def _read_fov_locs(path: Path, dataset_id: str) -> pd.DataFrame:
fov_file = path / f"{dataset_id}_fov_positions_file.csv"

if not fov_file.exists():
fov_file = path / f"{dataset_id}_fov_positions_file.csv.gz"

assert fov_file.exists(), f"Missing field of view file: {fov_file}"

fov_locs = pd.read_csv(fov_file, index_col=1)
fov_locs = pd.read_csv(fov_file)

FOV_COLUMNS = ["X_mm", "Y_mm", "FOV"]
assert np.isin(
FOV_COLUMNS, fov_locs.columns
).all(), f"The file {fov_file} must contain the following columns: {', '.join(FOV_COLUMNS)}. Consider using a different export module."

fov_locs.index = fov_locs["FOV"]

pixel_size = 0.120280945 # size of a pixel in microns

Expand All @@ -170,21 +179,27 @@ def _read_cosmx_fov_locs(path: Path, dataset_id: str) -> pd.DataFrame:


def _read_stitched_image(
images_dir: Path, fov_locs: pd.DataFrame, protein_dir_dict: dict, **imread_kwargs
images_dir: Path,
fov_locs: pd.DataFrame,
protein_dir_dict: dict,
morphology_coords: list[str],
**imread_kwargs,
) -> tuple[da.Array, list[str] | None]:
log.warn("Image stitching is currently experimental")

fov_images = {}
protein_names = None
c_coords_dict = {}
pattern = re.compile(r".*_F(\d+)")
for image_path in images_dir.iterdir():
if image_path.suffix == ".TIF":
fov = int(pattern.findall(image_path.name)[0])

image, protein_names = _read_fov_image(
image_path, protein_dir_dict.get(fov), **imread_kwargs
image, c_coords = _read_fov_image(
image_path, protein_dir_dict.get(fov), morphology_coords, **imread_kwargs
)

c_coords_dict[fov] = c_coords

fov_images[fov] = da.flip(image, axis=1)

fov_locs.loc[fov, "xmax"] = fov_locs.loc[fov, "xmin"] + image.shape[2]
Expand All @@ -195,16 +210,24 @@ def _read_stitched_image(
fov_locs[f"{dim}0"] = (fov_locs[f"{dim}min"] - shift).round().astype(int)
fov_locs[f"{dim}1"] = (fov_locs[f"{dim}max"] - shift).round().astype(int)

c_coords = list(set.union(*[set(names) for names in c_coords_dict.values()]))

stitched_image = da.zeros(
shape=(image.shape[0], fov_locs["y1"].max(), fov_locs["x1"].max()), dtype=image.dtype
shape=(len(c_coords), fov_locs["y1"].max(), fov_locs["x1"].max()), dtype=image.dtype
)
stitched_image = xr.DataArray(stitched_image, dims=("c", "y", "x"), coords={"c": c_coords})

for fov, im in fov_images.items():
xmin, xmax = fov_locs.loc[fov, "x0"], fov_locs.loc[fov, "x1"]
ymin, ymax = fov_locs.loc[fov, "y0"], fov_locs.loc[fov, "y1"]
stitched_image[:, ymin:ymax, xmin:xmax] = im
stitched_image.loc[
{"c": c_coords_dict[fov], "y": slice(ymin, ymax), "x": slice(xmin, xmax)}
] = im

if len(c_coords_dict[fov]) < len(c_coords):
log.warn(f"Missing channels ({len(c_coords) - len(c_coords_dict[fov])}) for FOV {fov}")

return stitched_image, protein_names
return stitched_image.data, c_coords


def _check_fov_id(fov: str | int | None) -> tuple[str, int]:
Expand All @@ -221,33 +244,29 @@ def _check_fov_id(fov: str | int | None) -> tuple[str, int]:
return fov, int(fov[1:])


def _read_cosmx_csv(path: Path, dataset_id: str) -> pd.DataFrame:
def _read_transcripts_csv(path: Path, dataset_id: str) -> pd.DataFrame:
transcripts_file = path / f"{dataset_id}_tx_file.csv.gz"

if transcripts_file.exists():
return pd.read_csv(transcripts_file, compression="gzip")

transcripts_file = path / f"{dataset_id}_tx_file.csv"
df = pd.read_csv(transcripts_file, compression="gzip")
else:
transcripts_file = path / f"{dataset_id}_tx_file.csv"
assert transcripts_file.exists(), f"Transcript file {transcripts_file} not found."
df = pd.read_csv(transcripts_file)

assert transcripts_file.exists(), f"Transcript file {transcripts_file} not found."
TRANSCRIPT_COLUMNS = ["x_global_px", "y_global_px", "target"]
assert np.isin(
TRANSCRIPT_COLUMNS, df.columns
).all(), f"The file {transcripts_file} must contain the following columns: {', '.join(TRANSCRIPT_COLUMNS)}. Consider using a different export module."

return pd.read_csv(transcripts_file)
return df


def _cosmx_c_coords(path: Path, n_channels: int, protein_names: list[str] | None) -> list[str]:
def _cosmx_morphology_coords(path: Path) -> list[str]:
channel_ids_path = path / "Morphology_ChannelID_Dictionary.txt"

if channel_ids_path.exists():
channel_names = list(pd.read_csv(channel_ids_path, delimiter="\t")["BiologicalTarget"])
else:
n_channels = n_channels - len(protein_names) if protein_names is not None else n_channels
channel_names = [str(i) for i in range(n_channels)]
log.warn(f"Channel file not found at {channel_ids_path}, using {channel_names=} instead.")

if protein_names is not None:
channel_names += protein_names

return channel_names
assert channel_ids_path.exists(), f"Channel file not found at {channel_ids_path}"
return list(pd.read_csv(channel_ids_path, delimiter="\t")["BiologicalTarget"])


def _find_dir(path: Path, name: str):
Expand Down
12 changes: 12 additions & 0 deletions sopa/io/reader/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import logging
from collections import defaultdict
from pathlib import Path
from typing import Callable

Expand Down Expand Up @@ -36,6 +37,17 @@ def _deduplicate_names(df):
return df[0].values


def _deduplicate_c_coords(c_coords: list[str]) -> list[str]:
counter, res = defaultdict(int), []
for channel in c_coords:
if channel not in counter:
res.append(channel)
else:
res.append(f"{channel} ({counter[channel]})")
counter[channel] += 1
return res


def _get_files_stem(files: list[Path]):
return [file.stem for file in files]

Expand Down
2 changes: 1 addition & 1 deletion sopa/utils/polygon_crop.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def _prepare(sdata: SpatialData, channels: list[str], scale_factor: float):
else:
assert (
len(image.coords["c"]) in VALID_N_CHANNELS
), f"Choose one or three channels among {image.c.values} by using the -c argument"
), f"Choose one or three channels among {image.c.values} by using the --channels argument"

log.info(f"Resizing image by a factor of {scale_factor}")
return image_key, resize(image, scale_factor).compute()
Expand Down

0 comments on commit 372969a

Please sign in to comment.