From 11500315ea85bb3c39f211e0d39270efe5364b6e Mon Sep 17 00:00:00 2001 From: Raymond Lim Date: Wed, 1 Nov 2023 13:12:44 -0400 Subject: [PATCH] docs: update docstrings --- src/luna/pathology/cli/dsa_upload.py | 27 ++- src/luna/pathology/cli/dsa_viz.py | 182 ++++++++++++++++-- .../pathology/cli/extract_shape_features.py | 4 + .../cli/extract_tile_shape_features.py | 31 ++- src/luna/pathology/cli/infer_tile_labels.py | 45 ++++- .../cli/run_stardist_cell_detection.py | 63 +++++- .../pathology/cli/run_tissue_detection.py | 25 ++- src/luna/pathology/cli/save_tiles.py | 17 +- src/luna/pathology/cli/slide_etl.py | 10 +- 9 files changed, 348 insertions(+), 56 deletions(-) diff --git a/src/luna/pathology/cli/dsa_upload.py b/src/luna/pathology/cli/dsa_upload.py index 7a68c1fd..7a2d9e35 100644 --- a/src/luna/pathology/cli/dsa_upload.py +++ b/src/luna/pathology/cli/dsa_upload.py @@ -117,9 +117,27 @@ def upload_annotation_to_dsa( insecure: bool = False, storage_options: dict = {}, ): - uuids = [] + """Upload annotation to DSA + + Upload json annotation file as a new annotation to the image in the DSA collection. + + Args: + dsa_endpoint_url (string): DSA API endpoint e.g. http://localhost:8080/api/v1 + slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl + annotation_column (string): annotation column of slide_manifest containing the dsa url + collection_name (string): name of the collection in DSA + image_filename (string): name of the image file in DSA e.g. 123.svs. 
If not specified, infer from annotation_file_urlpath + username (string): DSA username (defaults to environment variable DSA_USERNAME) + password (string): DSA password (defaults to environment variable DSA_PASSWORD) + force (bool): upload even if annotation with same name exists for the slide + insecure (bool): insecure ssl + storage_options (dict): options to pass to reading functions + + Returns: + DataFrame[SlideSchema]: slide manifest + """ for slide in slide_manifest.itertuples(name="Slide"): - uuids += _upload_annotation_to_dsa( + uuids = _upload_annotation_to_dsa( dsa_endpoint_url, slide[annotation_column], collection_name, @@ -130,7 +148,10 @@ def upload_annotation_to_dsa( insecure, storage_options, ) - return uuids + slide_manifest.at[ + slide.Index, annotation_column.replace("url", "uuid") + ] = uuids[0] + return slide_manifest def _upload_annotation_to_dsa( diff --git a/src/luna/pathology/cli/dsa_viz.py b/src/luna/pathology/cli/dsa_viz.py index 21611522..6f2612ab 100644 --- a/src/luna/pathology/cli/dsa_viz.py +++ b/src/luna/pathology/cli/dsa_viz.py @@ -207,9 +207,7 @@ def stardist_polygon_cli( def stardist_polygon( slide_manifest: DataFrame[SlideSchema], - object_urlpath: str, output_urlpath: str, - image_filename: str, annotation_name: str, line_colors: Dict[str, str], fill_colors: Dict[str, str], @@ -218,6 +216,22 @@ def stardist_polygon( annotation_column: str = "stardist_polygon_geojson_url", output_column: str = "regional_dsa_url", ): + """Build DSA annotation json from stardist geojson classification results + + Args: + slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl + output_urlpath (string): URL/path prefix to save annotations + annotation_name (string): name of the annotation to be displayed in DSA + line_colors (dict): user-provided line color map with {feature name:rgb values} + fill_colors (dict): user-provided fill color map with {feature name:rgba values} + storage_options (dict): storage options to pass to read 
functions + output_storage_options (dict): storage options to pass to write functions + annotation_column (string): column containing url to stardist polygon geojson + output_column (string): column with result url to add to slide_manifest + + Returns: + DataFrame[SlideSchema]: slide manifest + """ if annotation_column not in slide_manifest.columns: raise ValueError(f"{annotation_column} not found in slide manifest") client = get_or_create_dask_client() @@ -322,6 +336,7 @@ def stardist_polygon_tile_cli( line_colors: dict[str, str] = {}, fill_colors: dict[str, str] = {}, storage_options: dict = {}, + output_storage_options: dict = {}, local_config: str = "", ): """Build DSA annotation json from stardist geojson classification and labeled tiles @@ -334,7 +349,8 @@ def stardist_polygon_tile_cli( output_urlpath (string): URL/path prefix to save annotations line_colors (dict): user-provided line color map with {feature name:rgb values} fill_colors (dict): user-provided fill color map with {feature name:rgba values} - storage_options (dict): storage options to pass to read/write functions + storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions local_config (string): local config YAML file Returns: @@ -357,10 +373,7 @@ def stardist_polygon_tile_cli( def stardist_polygon_tile( slide_manifest: DataFrame[SlideSchema], - object_urlpath: str, - tiles_urlpath: str, output_urlpath: str, - image_filename: str, annotation_name_prefix: str, line_colors: Dict[str, str], fill_colors: Dict[str, str], @@ -369,6 +382,22 @@ def stardist_polygon_tile( annotation_column: str = "stardist_polygon_geojson_url", output_column_suffix: str = "regional_dsa_url", ): + """Build DSA annotation json from stardist geojson classification and labeled tiles + + Args: + slide_manifest (DataFrame[SlideSchema]): slide manifest + annotation_name_prefix (string): name of the annotation to be displayed in DSA + 
output_urlpath (string): URL/path prefix to save annotations + line_colors (dict): user-provided line color map with {feature name:rgb values} + fill_colors (dict): user-provided fill color map with {feature name:rgba values} + storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions + annotation_column (string): column containing url to stardist polygon geojson + output_column_suffix (string): column suffix with result url to add to slide_manifest + + Returns: + dict[str,str]: annotation file path + """ if annotation_column not in slide_manifest.columns: raise ValueError(f"{annotation_column} not found in slide manifest") client = get_or_create_dask_client() @@ -415,11 +444,13 @@ def __stardist_polygon_tile( Args: object_urlpath (string): URL/path to stardist geojson classification results tiles_urlpath (string): URL/path to tiles manifest parquet - annotation_name_prefix (string): name of the annotation to be displayed in DSA output_urlpath (string): URL/path prefix to save annotations + image_filename (string): name of the image file in DSA e.g. 
123.svs + annotation_name_prefix (string): name of the annotation to be displayed in DSA line_colors (dict): user-provided line color map with {feature name:rgb values} fill_colors (dict): user-provided fill color map with {feature name:rgba values} - storage_options (dict): storage options to pass to read/write functions + storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions Returns: dict: DSA annotations @@ -503,6 +534,7 @@ def stardist_cell_cli( line_colors: Optional[dict[str, str]] = None, fill_colors: Optional[dict[str, str]] = None, storage_options: dict = {}, + output_storage_options: dict = {}, local_config: str = "", ): """Build DSA annotation json from TSV classification data generated by @@ -519,7 +551,8 @@ def stardist_cell_cli( annotation_name (string): name of the annotation to be displayed in DSA line_colors (dict, optional): line color map with {feature name:rgb values} fill_colors (dict, optional): fill color map with {feature name:rgba values} - storage_options (dict): storage options to pass to read/write functions + storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions local_config (string): local config YAML file Returns: @@ -542,7 +575,6 @@ def stardist_cell_cli( def stardist_cell( slide_manifest: DataFrame[SlideSchema], output_urlpath: str, - image_filename: str, annotation_name: str, line_colors: Optional[Dict[str, str]], fill_colors: Optional[Dict[str, str]], @@ -551,6 +583,27 @@ def stardist_cell( annotation_column: str = "stardist_cell_tsv_url", output_column: str = "stardist_cell_dsa_url", ): + """Build DSA annotation json from TSV classification data generated by + stardist + + Processes a cell classification data generated by Qupath/stardist and + adds the center coordinates of the cells + as annotation elements. 
+ + Args: + slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl + output_urlpath (string): URL/path prefix for saving dsa annotation json + annotation_name (string): name of the annotation to be displayed in DSA + line_colors (dict, optional): line color map with {feature name:rgb values} + fill_colors (dict, optional): fill color map with {feature name:rgba values} + storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions + annotation_column (string): column containing url to TSV classification data generated by stardist + output_column (string): column with result url to add to slide_manifest + + Returns: + DataFrame[SlideSchema]: slide manifest + """ if annotation_column not in slide_manifest.columns: raise ValueError(f"{annotation_column} not found in slide manifest") client = get_or_create_dask_client() @@ -671,6 +724,7 @@ def regional_polygon_cli( line_colors: Optional[dict[str, str]] = None, fill_colors: Optional[dict[str, str]] = None, storage_options: dict = {}, + output_storage_options: dict = {}, local_config: str = "", ): """Build DSA annotation json from regional annotation geojson @@ -681,7 +735,8 @@ def regional_polygon_cli( annotation_name (string): name of the annotation to be displayed in DSA line_colors (dict, optional): line color map with {feature name:rgb values} fill_colors (dict, optional): fill color map with {feature name:rgba values} - storage_options (dict): storage options to pass to read/write functions + storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions local_config (string): local config yaml file Returns: @@ -707,9 +762,7 @@ def regional_polygon_cli( def regional_polygon( slide_manifest: DataFrame[SlideSchema], output_urlpath: str, - image_filename: str, annotation_name: str, - classes_to_include: List, line_colors: Optional[Dict[str, 
str]], fill_colors: Optional[Dict[str, str]], storage_options: Dict, @@ -717,6 +770,23 @@ def regional_polygon( annotation_column: str = "regional_geojson_url", output_column: str = "regional_dsa_url", ): + """Build DSA annotation json from regional annotation geojson + + Args: + slide_manifest (DataFrame[SlideSchema]): slide manifest + output_urlpath (string): URL/path prefix for saving dsa annotation json + annotation_name (string): name of the annotation to be displayed in DSA + line_colors (dict, optional): line color map with {feature name:rgb values} + fill_colors (dict, optional): fill color map with {feature name:rgba values} + storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions + annotation_column (string): column containing url to regional geojson + output_column (string): column with result url to add to slide_manifest + + Returns: + DataFrame[SlideSchema]: slide manifest + """ + if annotation_column not in slide_manifest.columns: raise ValueError(f"{annotation_column} not found in slide manifest") client = get_or_create_dask_client() @@ -812,6 +882,7 @@ def qupath_polygon_cli( line_colors: Optional[dict[str, str]] = None, fill_colors: Optional[dict[str, str]] = None, storage_options: dict = {}, + output_storage_options: dict = {}, local_config: str = "", ): """Build DSA annotation json from Qupath polygon geojson @@ -826,8 +897,8 @@ def qupath_polygon_cli( e.g. ["Tumor", "Stroma", ...] 
line_colors (dict, optional): line color map with {feature name:rgb values} fill_colors (dict, optional): fill color map with {feature name:rgba values} - storage_options (dict): storage options to pass to read/write functions - output_storage_options (dict): storage options to pass to read/write functions + storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions local_config (string): local config yaml file Returns: @@ -862,6 +933,26 @@ def qupath_polygon( annotation_column: str = "qupath_geojson_url", output_column: str = "qupath_dsa_url", ): + """Build DSA annotation json from Qupath polygon geojson + + Args: + slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl + output_urlpath (string): URL/path prefix for saving the DSA compatible annotation + json + image_filename (string): name of the image file in DSA e.g. 123.svs + annotation_name (string): name of the annotation to be displayed in DSA + classes_to_include (list): list of classification labels to visualize + e.g. ["Tumor", "Stroma", ...] 
+ line_colors (dict, optional): line color map with {feature name:rgb values} + fill_colors (dict, optional): fill color map with {feature name:rgba values} + storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions + annotation_column (string): column containing url to qupath geojson + output_column (string): column with result url to add to slide_manifest + + Returns: + DataFrame[SlideSchema]: slide manifest + """ if annotation_column not in slide_manifest.columns: raise ValueError(f"{annotation_column} not found in slide manifest") client = get_or_create_dask_client() @@ -987,14 +1078,16 @@ def bitmask_polygon_cli( Args: input_map (map): map of {label:path_to_bitmask_png} - output_dir (string): directory to save the DSA compatible annotation + output_urlpath (string): url/path to save the DSA compatible annotation json image_filename (string): name of the image file in DSA e.g. 123.svs annotation_name (string): name of the annotation to be displayed in DSA line_colors (dict, optional): line color map with {feature name:rgb values} fill_colors (dict, optional): fill color map with {feature name:rgba values} scale_factor (int, optional): scale to match the image on DSA. - storage_options (dict): storage options to pass to read/write functions + storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions + local_config (string): local config yaml file Returns: dict: annotation file path @@ -1144,6 +1237,28 @@ def heatmap( storage_options: Dict, output_storage_options: Dict, ): + """Generate heatmap based on the tile scores + + Creates a heatmap for the given column, using the color palette `viridis` + to set a fill value + - the color ranges from purple to yellow, for scores from 0 to 1. 
+ + Args: + slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl + output_urlpath (string): URL/path prefix to save the DSA compatible annotation + json + annotation_name (string): name of the annotation to be displayed in DSA + column (string): column to visualize e.g. tile_score + tile_size (int): size of tiles + scale_factor (int, optional): scale to match the image on DSA. + line_colors (dict, optional): line color map with {feature name:rgb values} + fill_colors (dict, optional): fill color map with {feature name:rgba values} + storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions + + Returns: + dict: annotation file path. None if error in writing the file. + """ if "tiles_url" not in slide_manifest.columns: raise ValueError("tiles_url not found in slide manifest") client = get_or_create_dask_client() @@ -1199,9 +1314,10 @@ def __heatmap( column (list[string]): columns to visualize e.g. tile_score tile_size (int): size of tiles scale_factor (int, optional): scale to match the image on DSA. - line_colors (Optional[dict[str,str]]): line color map with {feature name:rgb values} fill_colors (Optional[dict[str,str]]): fill color map with {feature name:rgba values} - storage_options (dict): storage options to pass to read/write functions + line_colors (Optional[dict[str,str]]): line color map with {feature name:rgb values} + storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions Returns: dict: DSA annotation @@ -1272,6 +1388,7 @@ def bmp_polygon_cli( fill_colors: Optional[Dict[str, str]] = None, scale_factor: Optional[int] = 1, storage_options: Dict = {}, + output_storage_options: Dict = {}, local_config: str = "", ): """Build DSA annotation json from a BMP with multiple labels. 
@@ -1288,7 +1405,8 @@ def bmp_polygon_cli( line_colors (dict[str,str], optional): line color map with {feature name:rgb values} fill_colors (dict[str,str], optional): fill color map with {feature name:rgba values} scale_factor (int, optional): scale to match image DSA. - storage_options (dict): storage options to pass to read/write functions + storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions Returns: dict: annotation file path @@ -1323,6 +1441,27 @@ def bmp_polygon( annotation_column: str = "bmp_polygon_url", output_column: str = "bmp_polygon_dsa_url", ): + """Build DSA annotation json from a BMP with multiple labels. + + Vectorizes and simplifies contours per label. + + Args: + slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl + output_urlpath (string): url/path prefix to save the DSA compatible annotation + json + label_map (dict[int,str]): map of label number to label name + annotation_name (string): name of the annotation to be displayed in DSA + line_colors (dict[str,str], optional): line color map with {feature name:rgb values} + fill_colors (dict[str,str], optional): fill color map with {feature name:rgba values} + scale_factor (int, optional): scale to match image DSA. 
+ storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions + annotation_column (string): column containing url to BMP polygon + output_column (string): column with result url to add to slide_manifest + + Returns: + dict: annotation file path + """ if annotation_column not in slide_manifest.columns: raise ValueError(f"{annotation_column} not found in slide manifest") client = get_or_create_dask_client() @@ -1374,7 +1513,8 @@ def __bmp_polygon( line_colors (dict[str,str], optional): line color map with {feature name:rgb values} fill_colors (dict[str,str], optional): fill color map with {feature name:rgba values} scale_factor (int, optional): scale to match image DSA. - storage_options (dict): storage options to pass to read/write functions + storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions Returns: dict: DSA annotation diff --git a/src/luna/pathology/cli/extract_shape_features.py b/src/luna/pathology/cli/extract_shape_features.py index db092514..35a82a1a 100644 --- a/src/luna/pathology/cli/extract_shape_features.py +++ b/src/luna/pathology/cli/extract_shape_features.py @@ -43,6 +43,10 @@ def cli( slide_mask_urlpath (str): URL/path to slide mask (*.tif) label_cols (List[str]): list of labels that coorespond to those in slide_mask_urlpath output_urlpath (str): output URL/path prefix + include_smaller_regions (bool): include the smaller regions (not just largest) + storage_options (dict): storage options to pass to read functions + output_storage_options (dict): storage options to pass to write functions + local_config (str): local config YAML file Returns: dict: output .tif path and the number of shapes for which features were generated diff --git a/src/luna/pathology/cli/extract_tile_shape_features.py b/src/luna/pathology/cli/extract_tile_shape_features.py index 0ab7ff49..91a62d55 
100644 --- a/src/luna/pathology/cli/extract_tile_shape_features.py +++ b/src/luna/pathology/cli/extract_tile_shape_features.py @@ -135,7 +135,6 @@ def cli( def extract_tile_shape_features( slide_manifest: DataFrame[SlideSchema], - slide_urlpath: str, output_urlpath: str, resize_factor: int = 16, detection_probability_threshold: Optional[float] = None, @@ -161,6 +160,27 @@ def extract_tile_shape_features( "solidity", ], ): + """Extracts shape and spatial features (HIF features) from a slide mask. + + Args: + slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl + output_urlpath (str): output URL/path + resize_factor (int): factor to downsample slide image + detection_probability_threshold (Optional[float]): detection probability threshold + statistical_descriptors (str): statistical descriptors to calculate. One of All, Quantiles, Stats, or Density + cellular_features (str): cellular features to include. One of All, Nucleus, Cell, Cytoplasm, and Membrane + property_type (str): properties to include. One of All, Geometric, or Stain + include_smaller_regions (bool): include smaller regions in output + label_cols (List[str]): list of score columns to use for the classification. Tile is classified as the column with the max score + storage_options (dict): storage options to pass to reading functions + output_storage_options (dict): storage options to pass to writing functions + local_config (str): local config yaml file + objects_column (str): slide manifest column name with stardist geoJSON URLs + properties (List[str]): properties to extract + + Returns: + DataFrame[SlideSchema]: slide manifest + """ client = get_or_create_dask_client() futures = [] @@ -225,8 +245,10 @@ def __extract_tile_shape_features( """Extracts shape and spatial features (HIF features) from a slide mask. 
Args: - objects (Union[str, gpd.GeoDataFrame]): URL/path to slide (tiffslide supported formats) - tiles (Union[str, pd.DataFrame]): URL/path to object file (geopandas supported formats) + objects_urlpath (str): URL/path to object file (geopandas supported formats) + tiles_urlpath (str): URL/path to tiles manifest (parquet) + slide_urlpath (str): URL/path to slide (tiffslide supported formats) + output_urlpath (str): output URL/path resize_factor (int): factor to downsample slide image detection_probability_threshold (Optional[float]): detection probability threshold @@ -234,7 +256,10 @@ def __extract_tile_shape_features( statistical_descriptors (StatisticalDescriptors): statistical descriptors to calculate cellular_features (CellularFeatures): cellular features to include property_type (PropertyType): properties to include + include_smaller_regions (bool): include smaller regions label_cols (List[str]): list of score columns to use for the classification. Tile is classified as the column with the max score + storage_options (dict): storage options to pass to reading functions + output_storage_options (dict): storage options to pass to writing functions properties (List[str]): list of whole slide image properties to extract. Needs to be parquet compatible (numeric). Returns: diff --git a/src/luna/pathology/cli/infer_tile_labels.py b/src/luna/pathology/cli/infer_tile_labels.py index 259595fb..1b73c74f 100644 --- a/src/luna/pathology/cli/infer_tile_labels.py +++ b/src/luna/pathology/cli/infer_tile_labels.py @@ -16,6 +16,7 @@ from tqdm import tqdm from luna.common.dask import configure_dask_client, get_or_create_dask_client +from luna.common.models import SlideSchema from luna.common.utils import get_config, make_temp_directory, save_metadata, timed from luna.pathology.analysis.ml import ( HDF5Dataset, @@ -54,8 +55,9 @@ def cli( Args: slide_urlpath (str): url/path to slide image (virtual slide formats compatible with TiffSlide, .svs, .tif, .scn, ...) 
tiles_urlpath (str): path to a slide-tile manifest file (.tiles.csv) - tile_size (int): size of tiles to use (at the requested magnification) + tile_size (Optional[int]): size of tiles to use (at the requested magnification) filter_query (str): pandas query by which to filter tiles based on their various tissue detection scores + requested_magnification (Optional[int]): Magnification scale at which to perform computation torch_model_repo_or_dir (str): repository root name like (namespace/repo) at github.com to serve torch.hub models. Or path to a local model (e.g. msk-mind/luna-ml) model_name (str): torch hub model name (a nn.Module at the repo repo_name) num_cores (int): Number of cores to use for CPU parallelization @@ -72,6 +74,7 @@ def cli( dict: metadata """ config = get_config(vars()) + configure_dask_client(**config["dask_options"]) if not config["slide_urlpath"] and not config["tiles_urlpath"]: raise fire.core.FireError("Specify either tiles_urlpath or slide_urlpath") @@ -130,7 +133,7 @@ def cli( def infer_tile_labels( - slide_manifest: DataFrame, + slide_manifest: DataFrame[SlideSchema], tile_size: Optional[int] = None, filter_query: str = "", thumbnail_magnification: Optional[int] = None, @@ -142,13 +145,35 @@ def infer_tile_labels( output_urlpath: str = ".", kwargs: dict = {}, use_gpu: bool = False, - dask_options: dict = {}, insecure: bool = False, storage_options: dict = {}, output_storage_options: dict = {}, -) -> pd.DataFrame: +) -> DataFrame[SlideSchema]: + """Run inference using a model and transform definition (either local or using torch.hub) + + Decorates existing tiles manifests with additional columns corresponding to class prediction/scores from the model + + Args: + slide_manifest (DataFrame): slide manifest from slide_etl + tile_size (Optional[int]): size of tiles to use (at the requested magnification) + filter_query (str): pandas query by which to filter tiles based on their various tissue detection scores + thumbnail_magnification 
(Optional[int]): Magnification scale at which to detect tissue + tile_magnification (Optional[int]): Magnification scale at which to generate tiles + torch_model_repo_or_dir (str): repository root name like (namespace/repo) at github.com to serve torch.hub models. Or path to a local model (e.g. msk-mind/luna-ml) + model_name (str): torch hub model name (a nn.Module at the repo repo_name) + num_cores (int): Number of cores to use for CPU parallelization + batch_size (int): size in batch dimension to chunk inference (8-256 recommended, depending on memory usage) + output_urlpath (str): output/working directory + kwargs (dict): additional keywords to pass to model initialization + use_gpu (bool): use GPU if available + insecure (bool): insecure SSL + storage_options (dict): storage options to pass to reading functions + output_storage_options (dict): storage options to pass to writing functions + + Returns: + DataFrame[SlideSchema]: slide manifest + """ client = get_or_create_dask_client() - configure_dask_client(**dask_options) if "tiles_url" not in slide_manifest.columns: if tile_size is None: @@ -221,20 +246,20 @@ def __infer_tile_labels( Args: tiles_urlpath (str): path to a slide-tile manifest file (.tiles.parquet) - tile_size (int): size of tiles to use (at the requested magnification) - filter_query (str): pandas query by which to filter tiles based on their various tissue detection scores - requested_magnification (Optional[int]): Magnification scale at which to perform computation + slide_id (str): slide ID + output_urlpath (str): output/working directory torch_model_repo_or_dir (str): repository root name like (namespace/repo) at github.com to serve torch.hub models. Or path to a local model (e.g. 
msk-mind/luna-ml) model_name (str): torch hub model name (a nn.Module at the repo repo_name) num_cores (int): Number of cores to use for CPU parallelization batch_size (int): size in batch dimension to chuck inference (8-256 recommended, depending on memory usage) - output_urlpath (str): output/working directory kwargs (dict): additional keywords to pass to model initialization + use_gpu (bool): use GPU if available + insecure (bool): insecure SSL storage_options (dict): storage options to pass to reading functions output_storage_options (dict): storage options to pass to writing functions Returns: - pd.DataFrame: augmented tiles dataframe + dict: metadata """ if insecure: ssl._create_default_https_context = ssl._create_unverified_context diff --git a/src/luna/pathology/cli/run_stardist_cell_detection.py b/src/luna/pathology/cli/run_stardist_cell_detection.py index a9924850..a944d5ca 100644 --- a/src/luna/pathology/cli/run_stardist_cell_detection.py +++ b/src/luna/pathology/cli/run_stardist_cell_detection.py @@ -46,7 +46,7 @@ def stardist_simple_cli( local_config (str): local config yaml file Returns: - pd.DataFrame: metadata about function call + dict: metadata about function call """ config = get_config(vars()) @@ -79,7 +79,29 @@ def stardist_simple( storage_options: dict, output_storage_options: dict, annotation_column: str = "stardist_geojson_url", -) -> pd.DataFrame: +) -> DataFrame[SlideSchema]: + """Run stardist using qupath CLI on slides in a slide manifest from + slide_etl. URIs to resulting GeoJSON will be stored in a specified column + of the returned slide manifest. 
+ + Args: + slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl + cell_expansion_size (float): size in pixels to expand cell cytoplasm + image_type (str): qupath image type (BRIGHTFIELD_H_DAB) + output_urlpath (str): output url/path + debug_opts (str): debug options passed as arguments to groovy script + num_cores (int): Number of cores to use for CPU parallelization + image (str): docker/singularity image + use_singularity (bool): use singularity instead of docker + max_heap_size (str): maximum heap size to pass to java options + storage_options (dict): storage options to pass to reading functions + output_storage_options (dict): storage options to pass to writing functions + annotation_column (str): name of column in resulting slide manifest to store GeoJson URIs + + Returns: + DataFrame[SlideSchema]: slide manifest + """ + client = get_or_create_dask_client() futures = [] @@ -122,8 +144,10 @@ def __stardist_simple( max_heap_size: str, storage_options: dict, output_storage_options: dict, -) -> pd.DataFrame: - """Run stardist using qupath CLI +) -> dict: + """Run stardist using qupath CLI on slides in a slide manifest from + slide_etl. URIs to resulting GeoJSON will be stored in a specified column + of the returned slide manifest. Args: slide_urlpath (str): path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...) 
@@ -139,7 +163,7 @@ def __stardist_simple( output_storage_options (dict): storage options to pass to writing functions Returns: - pd.DataFrame: cell detections + dict: run metadata """ fs, slide_path = fsspec.core.url_to_fs(slide_urlpath, **storage_options) ofs, output_path = fsspec.core.url_to_fs(output_urlpath, **output_storage_options) @@ -228,7 +252,7 @@ def stardist_cell_lymphocyte_cli( max_heap_size: str = "64G", storage_options: dict = {}, output_storage_options: dict = {}, -): +) -> dict: """Run stardist using qupath CLI Args: @@ -236,13 +260,14 @@ def stardist_cell_lymphocyte_cli( output_urlpath (str): output url/path num_cores (int): Number of cores to use for CPU parallelization use_gpu (bool): use GPU + image (str): docker/singularity image use_singularity (bool): use singularity instead of docker max_heap_size (str): maximum heap size to pass to java options storage_options (dict): storage options to pass to reading functions output_storage_options (dict): storage options to pass to writing functions Returns: - pd.DataFrame: cell detections + dict: run metadata """ config = get_config(vars()) slide_id = Path(config["slide_urlpath"]).stem @@ -272,7 +297,24 @@ def stardist_cell_lymphocyte( storage_options: dict = {}, output_storage_options: dict = {}, annotation_column: str = "lymphocyte_geojson_url", -): +) -> DataFrame[SlideSchema]: + """Run stardist using qupath CLI + + Args: + slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl + output_urlpath (str): output url/path + num_cores (int): Number of cores to use for CPU parallelization + use_gpu (bool): use GPU + image (str): docker/singularity image + use_singularity (bool): use singularity instead of docker + max_heap_size (str): maximum heap size to pass to java options + storage_options (dict): storage options to pass to reading functions + output_storage_options (dict): storage options to pass to writing functions + annotation_column (str): name of column in resulting slide 
manifest to store GeoJson URIs + + Returns: + DataFrame[SlideSchema]: slide manifest + """ client = get_or_create_dask_client() futures = [] @@ -313,7 +355,7 @@ def __stardist_cell_lymphocyte( max_heap_size: str = "64G", storage_options: dict = {}, output_storage_options: dict = {}, -) -> pd.DataFrame: +) -> dict: """Run stardist using qupath CLI Args: @@ -321,12 +363,13 @@ def __stardist_cell_lymphocyte( output_urlpath (str): output url/path num_cores (int): Number of cores to use for CPU parallelization use_gpu (bool): use GPU + image (str): docker/singularity image use_singularity (bool): use singularity instead of docker max_heap_size (str): maximum heap size to pass to java options storage_options (dict): storage options to pass to reading functions Returns: - pd.DataFrame: cell detections + dict: run metadata """ fs, slide_path = fsspec.core.url_to_fs(slide_urlpath, **storage_options) ofs, output_path = fsspec.core.url_to_fs(output_urlpath, **output_storage_options) diff --git a/src/luna/pathology/cli/run_tissue_detection.py b/src/luna/pathology/cli/run_tissue_detection.py index 76088cdf..0f888418 100644 --- a/src/luna/pathology/cli/run_tissue_detection.py +++ b/src/luna/pathology/cli/run_tissue_detection.py @@ -17,7 +17,7 @@ from tiffslide import TiffSlide from luna.common.dask import configure_dask_client, get_or_create_dask_client -from luna.common.models import Tile +from luna.common.models import SlideSchema, Tile from luna.common.utils import ( get_config, local_cache_urlpath, @@ -115,7 +115,9 @@ def cli( tiles_urlpath (str): url/path to tiles manifest (parquet) filter_query (str): pandas query by which to filter tiles based on their various tissue detection scores tile_size (int): size of tiles to use (at the requested magnification) - thumbnail_magnification (Optional[int]): Magnification scale at which to perform computation + thumbnail_magnification (Optional[int]): Magnification scale at which to create thumbnail for tissue detection + 
tile_magnification (Optional[int]): Magnification scale at which to generate tiles + batch_size (int): batch size for processing output_urlpath (str): Output url/path dask_options (dict): dask options storage_options (dict): storage options to pass to reading functions @@ -163,7 +165,7 @@ def cli( def detect_tissue( - slide_manifest: DataFrame, + slide_manifest: DataFrame[SlideSchema], tile_size: Optional[int] = None, thumbnail_magnification: Optional[int] = None, tile_magnification: Optional[int] = None, @@ -172,7 +174,22 @@ def detect_tissue( storage_options: dict = {}, output_urlpath: str = ".", output_storage_options: dict = {}, -) -> pd.DataFrame: +) -> DataFrame[SlideSchema]: + """Run simple/deterministic tissue detection algorithms based on a filter query, to reduce tiles to those (likely) to contain actual tissue + Args: + slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl + tile_size (int): size of tiles to use (at the requested magnification) + thumbnail_magnification (Optional[int]): Magnification scale at which to create thumbnail for tissue detection + tile_magnification (Optional[int]): Magnification scale at which to generate tiles + filter_query (str): pandas query by which to filter tiles based on their various tissue detection scores + batch_size (int): batch size for processing + storage_options (dict): storage options to pass to reading functions + output_urlpath (str): Output url/path + output_storage_options (dict): storage options to pass to writing functions + Returns: + DataFrame[SlideSchema]: slide manifest + + """ client = get_or_create_dask_client() with make_temp_directory() as temp_dir: diff --git a/src/luna/pathology/cli/save_tiles.py b/src/luna/pathology/cli/save_tiles.py index 4af71b87..b771edae 100644 --- a/src/luna/pathology/cli/save_tiles.py +++ b/src/luna/pathology/cli/save_tiles.py @@ -68,7 +68,22 @@ def save_tiles( batch_size: int = 2000, storage_options: dict = {}, output_storage_options: dict = {}, -): +) 
-> DataFrame[SlideSchema]: + """Saves tiles to disk + + Tile addresses and arrays are saved as key-value pairs in (tiles.h5), + and the corresponding manifest/header file (tiles.parquet) is also generated + + Args: + slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl + output_urlpath (str): output url/path prefix + batch_size (int): size in batch dimension to chunk jobs + storage_options (dict): storage options to pass to reading functions + output_storage_options (dict): storage options to pass to writing functions + + Returns: + DataFrame[SlideSchema]: slide manifest + """ client = get_or_create_dask_client() if "tiles_url" not in slide_manifest.columns: diff --git a/src/luna/pathology/cli/slide_etl.py b/src/luna/pathology/cli/slide_etl.py index b02eeafc..93042e1b 100644 --- a/src/luna/pathology/cli/slide_etl.py +++ b/src/luna/pathology/cli/slide_etl.py @@ -42,12 +42,13 @@ def cli( Args: - slide_url (str): path to slide image + slide_urlpath (str): path to slide image project_name (str): project name underwhich the slides should reside comment (str): comment and description of dataset subset_csv_urlpath (str): url/path to subset csv - storage_options (dict): storage options to pass to reading functions + debug_limit (int): limit number of slides output_urlpath (str): url/path to output table + storage_options (dict): storage options to pass to reading functions output_storage_options (dict): storage options to pass to writing functions local_config (str): url/path to YAML config file no_copy (bool): determines whether we copy slides to output_urlpath @@ -120,16 +121,17 @@ def slide_etl( """Ingest slides by adding them to a file or s3 based storage location and generating metadata about them Args: - slide_url (str): path to slide image + slide_urls (Union[str, List[str]]): path to slide image(s) project_name (str): project name underwhich the slides should reside comment (str): comment and description of dataset storage_options (dict): storage options to pass 
to reading functions output_urlpath (str): url/path to output table output_storage_options (dict): storage options to pass to writing functions + no_copy (bool): do not copy slides to output path Returns: - df (DataFrame): dataframe containing the metadata of all the slides + DataFrame[SlideSchema]: dataframe containing the metadata of all the slides """ if isinstance(slide_urls, str): slide_urls = [slide_urls]