From 575bfdb93835c88836a7ceb59106a07f1c184069 Mon Sep 17 00:00:00 2001
From: Laurent Guerard
Date: Tue, 24 Sep 2024 09:18:03 +0200
Subject: [PATCH 1/5] Fix compatibility with latest version of Baysor

The latest version of Baysor outputs either `segmentation_polygons2D.json` or
`segmentation_polygons3D.json`, depending on the input data. This very basic
fix reestablishes compatibility, as long as users aren't running both 2D and
3D and writing the results to the same folder.
---
 sopa/segmentation/transcripts.py | 65 ++++++++++++++++++++++++--------
 1 file changed, 49 insertions(+), 16 deletions(-)

diff --git a/sopa/segmentation/transcripts.py b/sopa/segmentation/transcripts.py
index 2cb397c..54ef99b 100644
--- a/sopa/segmentation/transcripts.py
+++ b/sopa/segmentation/transcripts.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import glob
 import json
 import logging
 from pathlib import Path
@@ -42,7 +43,9 @@ def resolve(
     if min_area > 0:
         log.info(f"Cells whose area is less than {min_area} microns^2 will be removed")
 
-    patches_cells, adatas = _read_all_segmented_patches(temp_dir, min_area, patches_dirs)
+    patches_cells, adatas = _read_all_segmented_patches(
+        temp_dir, min_area, patches_dirs
+    )
     geo_df, cells_indices, new_ids = _resolve_patches(patches_cells, adatas)
 
     image_key, _ = get_spatial_image(sdata, return_key=True)
@@ -58,22 +61,31 @@ def resolve(
         geo_df_new = ShapesModel.parse(geo_df_new, transformations=transformations)
 
         log.info("Aggregating transcripts on merged cells")
-        table_conflicts = aggregation.count_transcripts(sdata, gene_column, geo_df=geo_df_new)
+        table_conflicts = aggregation.count_transcripts(
+            sdata, gene_column, geo_df=geo_df_new
+        )
         table_conflicts.obs_names = new_ids
         table_conflicts = [table_conflicts]
 
     valid_ids = set(list(geo_df.index))
     table = anndata.concat(
-        [adata[list(valid_ids & set(list(adata.obs_names)))] for adata in adatas] + table_conflicts,
+        [adata[list(valid_ids & set(list(adata.obs_names)))] for adata in adatas]
+        + table_conflicts,
         join="outer",
     )
     table.obs.dropna(axis="columns", inplace=True)
 
     geo_df = geo_df.loc[table.obs_names]
 
-    table.obsm["spatial"] = np.array([[centroid.x, centroid.y] for centroid in geo_df.centroid])
-    table.obs[SopaKeys.REGION_KEY] = pd.Series(shapes_key, index=table.obs_names, dtype="category")
-    table.obs[SopaKeys.SLIDE_KEY] = pd.Series(image_key, index=table.obs_names, dtype="category")
+    table.obsm["spatial"] = np.array(
+        [[centroid.x, centroid.y] for centroid in geo_df.centroid]
+    )
+    table.obs[SopaKeys.REGION_KEY] = pd.Series(
+        shapes_key, index=table.obs_names, dtype="category"
+    )
+    table.obs[SopaKeys.SLIDE_KEY] = pd.Series(
+        image_key, index=table.obs_names, dtype="category"
+    )
     table.obs[SopaKeys.INSTANCE_KEY] = geo_df.index
 
     table = TableModel.parse(
@@ -86,7 +98,9 @@ def resolve(
     add_spatial_element(sdata, shapes_key, geo_df)
     add_spatial_element(sdata, SopaKeys.TABLE, table)
 
-    log.info(f"Added sdata.tables['{SopaKeys.TABLE}'], and {len(geo_df)} cell boundaries to sdata['{shapes_key}']")
+    log.info(
+        f"Added sdata.tables['{SopaKeys.TABLE}'], and {len(geo_df)} cell boundaries to sdata['{shapes_key}']"
+    )
 
 
 def _read_one_segmented_patch(
@@ -96,7 +110,9 @@ def _read_one_segmented_patch(
 
     loom_file = directory / "segmentation_counts.loom"
     if loom_file.exists():
-        adata = anndata.read_loom(directory / "segmentation_counts.loom", obs_names="Name", var_names="Name")
+        adata = anndata.read_loom(
+            directory / "segmentation_counts.loom", obs_names="Name", var_names="Name"
+        )
     else:
         adata = anndata.read_h5ad(directory / "segmentation_counts.h5ad")
 
@@ -105,13 +121,20 @@ def _read_one_segmented_patch(
     cells_num = pd.Series(adata.obs["CellID"].astype(int), index=adata.obs_names)
     del adata.obs["CellID"]
 
-    with open(directory / "segmentation_polygons.json") as f:
+    with open(glob.glob(directory / "segmentation_polygons*.json")[0]) as f:
         polygons_dict = json.load(f)
         polygons_dict = {c["cell"]: c for c in polygons_dict["geometries"]}
 
-    cells_num = cells_num[cells_num.map(lambda num: len(polygons_dict[num]["coordinates"][0]) >= min_vertices)]
+    cells_num = cells_num[
+        cells_num.map(
+            lambda num: len(polygons_dict[num]["coordinates"][0]) >= min_vertices
+        )
+    ]
 
-    gdf = gpd.GeoDataFrame(index=cells_num.index, geometry=[shape(polygons_dict[cell_num]) for cell_num in cells_num])
+    gdf = gpd.GeoDataFrame(
+        index=cells_num.index,
+        geometry=[shape(polygons_dict[cell_num]) for cell_num in cells_num],
+    )
 
     gdf.geometry = gdf.geometry.map(lambda cell: shapes._ensure_polygon(cell))
     gdf = gdf[~gdf.geometry.isna()]
@@ -131,7 +154,9 @@ def _read_all_segmented_patches(
     patches_dirs: list[str] | None = None,
 ) -> tuple[list[list[Polygon]], list[AnnData]]:
     if patches_dirs is None or not len(patches_dirs):
-        patches_dirs = [subdir for subdir in Path(temp_dir).iterdir() if subdir.is_dir()]
+        patches_dirs = [
+            subdir for subdir in Path(temp_dir).iterdir() if subdir.is_dir()
+        ]
 
     outs = [
         _read_one_segmented_patch(path, min_area)
@@ -157,14 +182,20 @@ def _resolve_patches(
     """
     patch_ids = [adata.obs_names for adata in adatas]
 
-    patch_indices = np.arange(len(patches_cells)).repeat([len(cells) for cells in patches_cells])
+    patch_indices = np.arange(len(patches_cells)).repeat(
+        [len(cells) for cells in patches_cells]
+    )
    cells = [cell for cells in patches_cells for cell in cells]
     segmentation_ids = np.array([cell_id for ids in patch_ids for cell_id in ids])
 
-    cells_resolved, cells_indices = shapes.solve_conflicts(cells, patch_indices=patch_indices, return_indices=True)
+    cells_resolved, cells_indices = shapes.solve_conflicts(
+        cells, patch_indices=patch_indices, return_indices=True
+    )
 
     existing_ids = segmentation_ids[cells_indices[cells_indices >= 0]]
-    new_ids = np.char.add("merged_cell_", np.arange((cells_indices == -1).sum()).astype(str))
+    new_ids = np.char.add(
+        "merged_cell_", np.arange((cells_indices == -1).sum()).astype(str)
+    )
     index = np.concatenate([existing_ids, new_ids])
 
     return (
@@ -206,4 +237,6 @@ def copy_segmentation_config(path: Path, config: dict, config_path: str | None):
             toml.dump(config, f)
         return
 
-    raise ValueError(f"Config file must be either a .json or a .toml file. Found: {path.suffix}")
+    raise ValueError(
+        f"Config file must be either a .json or a .toml file. Found: {path.suffix}"
+    )
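
A note on the wildcard lookup introduced above: `glob.glob(directory / "segmentation_polygons*.json")[0]` matches whichever of the two filenames Baysor wrote, but it silently takes the first hit — which is exactly why the commit message warns against mixing 2D and 3D outputs in one folder. Below is a minimal sketch of the same lookup with that single-match assumption made explicit; the directory name is hypothetical, and `str()` is applied because `glob.glob` expects a string-like pattern:

```python
import glob
from pathlib import Path

directory = Path("tuto.zarr/.sopa_cache/baysor/0")  # hypothetical patch folder

# Matches segmentation_polygons2D.json or segmentation_polygons3D.json
matches = glob.glob(str(directory / "segmentation_polygons*.json"))
assert len(matches) == 1, f"expected exactly one polygons file, found: {matches}"
polygon_file = matches[0]
```
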
"segmentation_counts.h5ad") @@ -105,13 +121,20 @@ def _read_one_segmented_patch( cells_num = pd.Series(adata.obs["CellID"].astype(int), index=adata.obs_names) del adata.obs["CellID"] - with open(directory / "segmentation_polygons.json") as f: + with open(glob.glob(directory / "segmentation_polygons*.json")[0]) as f: polygons_dict = json.load(f) polygons_dict = {c["cell"]: c for c in polygons_dict["geometries"]} - cells_num = cells_num[cells_num.map(lambda num: len(polygons_dict[num]["coordinates"][0]) >= min_vertices)] + cells_num = cells_num[ + cells_num.map( + lambda num: len(polygons_dict[num]["coordinates"][0]) >= min_vertices + ) + ] - gdf = gpd.GeoDataFrame(index=cells_num.index, geometry=[shape(polygons_dict[cell_num]) for cell_num in cells_num]) + gdf = gpd.GeoDataFrame( + index=cells_num.index, + geometry=[shape(polygons_dict[cell_num]) for cell_num in cells_num], + ) gdf.geometry = gdf.geometry.map(lambda cell: shapes._ensure_polygon(cell)) gdf = gdf[~gdf.geometry.isna()] @@ -131,7 +154,9 @@ def _read_all_segmented_patches( patches_dirs: list[str] | None = None, ) -> tuple[list[list[Polygon]], list[AnnData]]: if patches_dirs is None or not len(patches_dirs): - patches_dirs = [subdir for subdir in Path(temp_dir).iterdir() if subdir.is_dir()] + patches_dirs = [ + subdir for subdir in Path(temp_dir).iterdir() if subdir.is_dir() + ] outs = [ _read_one_segmented_patch(path, min_area) @@ -157,14 +182,20 @@ def _resolve_patches( """ patch_ids = [adata.obs_names for adata in adatas] - patch_indices = np.arange(len(patches_cells)).repeat([len(cells) for cells in patches_cells]) + patch_indices = np.arange(len(patches_cells)).repeat( + [len(cells) for cells in patches_cells] + ) cells = [cell for cells in patches_cells for cell in cells] segmentation_ids = np.array([cell_id for ids in patch_ids for cell_id in ids]) - cells_resolved, cells_indices = shapes.solve_conflicts(cells, patch_indices=patch_indices, return_indices=True) + cells_resolved, cells_indices = shapes.solve_conflicts( + cells, patch_indices=patch_indices, return_indices=True + ) existing_ids = segmentation_ids[cells_indices[cells_indices >= 0]] - new_ids = np.char.add("merged_cell_", np.arange((cells_indices == -1).sum()).astype(str)) + new_ids = np.char.add( + "merged_cell_", np.arange((cells_indices == -1).sum()).astype(str) + ) index = np.concatenate([existing_ids, new_ids]) return ( @@ -206,4 +237,6 @@ def copy_segmentation_config(path: Path, config: dict, config_path: str | None): toml.dump(config, f) return - raise ValueError(f"Config file must be either a .json or a .toml file. Found: {path.suffix}") + raise ValueError( + f"Config file must be either a .json or a .toml file. 
Found: {path.suffix}" + ) From 69f048e6192a2ee0a4186027c1620578ca2f04af Mon Sep 17 00:00:00 2001 From: Laurent Guerard Date: Tue, 24 Sep 2024 10:00:29 +0200 Subject: [PATCH 2/5] Fix issue with WindowPath and glob --- sopa/segmentation/transcripts.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sopa/segmentation/transcripts.py b/sopa/segmentation/transcripts.py index 54ef99b..35091ef 100644 --- a/sopa/segmentation/transcripts.py +++ b/sopa/segmentation/transcripts.py @@ -1,6 +1,5 @@ from __future__ import annotations -import glob import json import logging from pathlib import Path @@ -121,7 +120,7 @@ def _read_one_segmented_patch( cells_num = pd.Series(adata.obs["CellID"].astype(int), index=adata.obs_names) del adata.obs["CellID"] - with open(glob.glob(directory / "segmentation_polygons*.json")[0]) as f: + with open(list(directory.glob("segmentation_polygons*.json"))[0]) as f: polygons_dict = json.load(f) polygons_dict = {c["cell"]: c for c in polygons_dict["geometries"]} @@ -141,7 +140,9 @@ def _read_one_segmented_patch( ratio_filtered = (gdf.area <= min_area).mean() if ratio_filtered > 0.2: - log.warning(f"{ratio_filtered:.2%} of cells will be filtered due to {min_area=}") + log.warning( + f"{ratio_filtered:.2%} of cells will be filtered due to {min_area=}" + ) gdf = gdf[gdf.area > min_area] From b05e3ce04d6c5e0365c1ca0cb545978fc6635327 Mon Sep 17 00:00:00 2001 From: Laurent Guerard Date: Tue, 24 Sep 2024 15:00:39 +0200 Subject: [PATCH 3/5] Revert last commit and update API documentation Some Baysor options were removed and the output format of the polygons needs to be changed to fit with the next steps of the pipeline. --- docs/tutorials/api_usage.ipynb | 60 +++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/docs/tutorials/api_usage.ipynb b/docs/tutorials/api_usage.ipynb index 2dd0fc1..fc985f7 100644 --- a/docs/tutorials/api_usage.ipynb +++ b/docs/tutorials/api_usage.ipynb @@ -78,8 +78,10 @@ "outputs": [], "source": [ "image_key = \"image\"\n", - "points_key = \"transcripts\" # (ignore this for multiplex imaging)\n", - "gene_column = \"genes\" # (optional) column of sdata[points_key] containing the gene names" + "points_key = \"transcripts\" # (ignore this for multiplex imaging)\n", + "gene_column = (\n", + " \"genes\" # (optional) column of sdata[points_key] containing the gene names\n", + ")" ] }, { @@ -117,7 +119,9 @@ } ], "source": [ - "patches = sopa.segmentation.Patches2D(sdata, image_key, patch_width=1200, patch_overlap=50)\n", + "patches = sopa.segmentation.Patches2D(\n", + " sdata, image_key, patch_width=1200, patch_overlap=50\n", + ")\n", "patches.write();" ] }, @@ -163,8 +167,12 @@ "source": [ "channels = [\"DAPI\"]\n", "\n", - "method = sopa.segmentation.methods.cellpose_patch(diameter=35, channels=channels, flow_threshold=2, cellprob_threshold=-6)\n", - "segmentation = sopa.segmentation.StainingSegmentation(sdata, method, channels, min_area=2500)\n", + "method = sopa.segmentation.methods.cellpose_patch(\n", + " diameter=35, channels=channels, flow_threshold=2, cellprob_threshold=-6\n", + ")\n", + "segmentation = sopa.segmentation.StainingSegmentation(\n", + " sdata, method, channels, min_area=2500\n", + ")\n", "\n", "# The cellpose boundaries will be temporary saved here. 
From 938b7dbf82666c5a5025f7e901a7feca2640fdb7 Mon Sep 17 00:00:00 2001
From: Laurent Guerard
Date: Tue, 24 Sep 2024 16:57:42 +0200
Subject: [PATCH 4/5] Fix parsing of Cell number from loom file
---
 sopa/segmentation/transcripts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sopa/segmentation/transcripts.py b/sopa/segmentation/transcripts.py
index 35091ef..462b6f0 100644
--- a/sopa/segmentation/transcripts.py
+++ b/sopa/segmentation/transcripts.py
@@ -122,7 +122,7 @@ def _read_one_segmented_patch(
 
     with open(list(directory.glob("segmentation_polygons*.json"))[0]) as f:
         polygons_dict = json.load(f)
-        polygons_dict = {c["cell"]: c for c in polygons_dict["geometries"]}
+        polygons_dict = {cells_num[c["cell"]]: c for c in polygons_dict["geometries"]}
 
     cells_num = cells_num[
         cells_num.map(
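
The one-line change above re-keys the polygons through `cells_num`, so the later `polygons_dict[num]` lookups (keyed by integer cell number) keep working. An illustrative sketch with assumed id formats — newer Baysor tags each geometry with a string cell name, while the loom-derived `cells_num` maps obs names to integer `CellID` values; all names and coordinates below are hypothetical, and note that patch 5 replaces this re-keying with an explicit string/int check:

```python
import pandas as pd

# Hypothetical loom-derived mapping: obs name -> integer CellID
cells_num = pd.Series([1, 2], index=["cell-a", "cell-b"])

# Hypothetical, heavily trimmed GeometryCollection entries
geometries = [
    {"cell": "cell-a", "type": "Polygon", "coordinates": [[[0, 0], [1, 0], [1, 1], [0, 0]]]},
    {"cell": "cell-b", "type": "Polygon", "coordinates": [[[2, 2], [3, 2], [3, 3], [2, 2]]]},
]

# Re-key by integer cell number, as in the patch
polygons_dict = {cells_num[c["cell"]]: c for c in geometries}
assert set(polygons_dict) == {1, 2}
```
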
From ba537a37c343f9438a9345fcb9a04c5957ff6ad7 Mon Sep 17 00:00:00 2001
From: Blampey Quentin
Date: Wed, 25 Sep 2024 12:59:47 +0200
Subject: [PATCH 5/5] check whether the cell ids are string or int
---
 sopa/segmentation/transcripts.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/sopa/segmentation/transcripts.py b/sopa/segmentation/transcripts.py
index 1a5d6b8..8a30909 100644
--- a/sopa/segmentation/transcripts.py
+++ b/sopa/segmentation/transcripts.py
@@ -97,6 +97,7 @@ def _read_one_segmented_patch(
     directory: str, min_area: float = 0, min_vertices: int = 4
 ) -> tuple[list[Polygon], AnnData]:
     directory: Path = Path(directory)
+    id_as_string, polygon_file = _find_polygon_file(directory)
 
     loom_file = directory / "segmentation_counts.loom"
     if loom_file.exists():
@@ -106,12 +107,12 @@ def _read_one_segmented_patch(
 
     adata.obs.rename(columns={"area": SopaKeys.ORIGINAL_AREA_OBS}, inplace=True)
 
-    cells_num = pd.Series(adata.obs["CellID"].astype(int), index=adata.obs_names)
+    cells_num = pd.Series(adata.obs_names if id_as_string else adata.obs["CellID"].astype(int), index=adata.obs_names)
     del adata.obs["CellID"]
 
-    with open(list(directory.glob("segmentation_polygons*.json"))[0]) as f:
+    with open(polygon_file) as f:
         polygons_dict = json.load(f)
-        polygons_dict = {cells_num[c["cell"]]: c for c in polygons_dict["geometries"]}
+        polygons_dict = {c["cell"]: c for c in polygons_dict["geometries"]}
 
     cells_num = cells_num[cells_num.map(lambda num: len(polygons_dict[num]["coordinates"][0]) >= min_vertices)]
 
@@ -132,6 +133,15 @@ def _read_one_segmented_patch(
     return gdf.geometry.values, adata[gdf.index].copy()
 
 
+def _find_polygon_file(directory: Path) -> tuple[bool, Path]:
+    old_baysor_path = directory / "segmentation_polygons.json"
+    if old_baysor_path.exists():
+        return False, old_baysor_path
+    new_baysor_path = directory / "segmentation_polygons_2d.json"
+    assert new_baysor_path.exists(), f"Could not find the segmentation polygons file in {directory}"
+    return True, new_baysor_path
+
+
 def _read_all_segmented_patches(
     temp_dir: str,
     min_area: float = 0,
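
To make the final behavior concrete, here is a short usage sketch of the helper added above. It assumes `_find_polygon_file` is imported from its (private) location in `sopa.segmentation.transcripts` as defined by this patch, and the patch directory name is hypothetical:

```python
from pathlib import Path

from sopa.segmentation.transcripts import _find_polygon_file  # private helper

directory = Path("tuto.zarr/.sopa_cache/baysor/0")  # hypothetical patch folder

id_as_string, polygon_file = _find_polygon_file(directory)
# Old Baysor layout: (False, .../segmentation_polygons.json)    -> integer CellID ids
# New Baysor layout: (True,  .../segmentation_polygons_2d.json) -> string obs-name ids
```
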