From ea79b81f64b4dbfc19c93a5af5073960fb5de6c0 Mon Sep 17 00:00:00 2001 From: armaank Date: Fri, 1 Dec 2023 16:59:22 -0500 Subject: [PATCH] chore: cleanup stale files, unused code (#385) Co-authored-by: kohlia --- src/luna/.cli_template.py | 70 ----- src/luna/common/constants.py | 82 ------ src/luna/common/utils.py | 134 --------- src/luna/pathology/examples/__init__.py | 0 .../examples/extract_feature_vectors.py | 256 ------------------ .../import_point_geojson_into_qupath.groovy | 54 ---- ...import_regional_geojson_into_qupath.groovy | 57 ---- tests/luna/common/test_constants.py | 65 ----- .../common/test_slideviewer_client.py | 3 +- 9 files changed, 1 insertion(+), 720 deletions(-) delete mode 100644 src/luna/.cli_template.py delete mode 100644 src/luna/common/constants.py delete mode 100644 src/luna/pathology/examples/__init__.py delete mode 100644 src/luna/pathology/examples/extract_feature_vectors.py delete mode 100644 src/luna/pathology/examples/qupath/import_point_geojson_into_qupath.groovy delete mode 100644 src/luna/pathology/examples/qupath/import_regional_geojson_into_qupath.groovy delete mode 100644 tests/luna/common/test_constants.py diff --git a/src/luna/.cli_template.py b/src/luna/.cli_template.py deleted file mode 100644 index 59cf81a2..00000000 --- a/src/luna/.cli_template.py +++ /dev/null @@ -1,70 +0,0 @@ -# General imports -import json -import logging -import os - -import click -import yaml - -from luna.common.custom_logger import init_logger - -init_logger() -logger = logging.getLogger() ### Add CLI tool name - -from luna.common.utils import cli_runner - -_params_ = [("input_data", str), ("output_dir", str)] - - -@click.command() -@click.argument("input_data", nargs=1) -@click.option( - "-o", - "--output_dir", - required=False, - help="path to output directory to save results", -) -### Additional options -@click.option( - "-m", - "--method_param_path", - required=False, - help="path to a metadata json/yaml file with method parameters to reproduce results", -) -def cli(**cli_kwargs): - """A cli tool - - \b - Inputs: - input: input data - \b - Outputs: - output data - \b - Example: - CLI_TOOL ./slides/10001.svs ./halo/10001.job18484.annotations - -an Tumor - -o ./masks/10001/ - """ - cli_runner(cli_kwargs, _params_, transform_method) - - -### Transform imports -def transform_method(input_data, output_dir): - """CLI tool method - - Args: - input_data (str): path to input data - output_dir (str): output/working directory - - Returns: - dict: metadata about function call - """ - - properties = {} - - return properties - - -if __name__ == "__main__": - cli() diff --git a/src/luna/common/constants.py b/src/luna/common/constants.py deleted file mode 100644 index 50cf8920..00000000 --- a/src/luna/common/constants.py +++ /dev/null @@ -1,82 +0,0 @@ -""" -Created on November 16, 2020 - -@author: rosed2@mskcc.org -""" -import os - -# Table Names -from luna.common.utils import get_absolute_path - -# Application Constants - - -TABLE_DIR = "tables/" -# clinical -DIAGNOSIS_TABLE = TABLE_DIR + "diagnosis" -MEDICATION_TABLE = TABLE_DIR + "medication" -PATIENT_TABLE = TABLE_DIR + "patient" - -# radiology -DICOM_TABLE = TABLE_DIR + "dicom" -SCAN_TABLE = TABLE_DIR + "scan" -SCAN_ANNOTATION_TABLE = TABLE_DIR + "scan_annotation" -FEATURE_TABLE = TABLE_DIR + "feature" - -# Raw Data Directories -DICOMS = "dicoms" -SCANS = "scans" -SCAN_ANNOTATIONS = "scan_annotations" -PATHOLOGY_ANNOTATIONS = "pathology_annotations" -FEATURES = "features" - -# Configurations -APP_CFG = "APP_CFG" -DATA_CFG = "DATA_CFG" -SCHEMA_FILE = get_absolute_path(__file__, "../data_ingestion_template_schema.yml") -PUBLIC_DIR = "/gpfs/mskmind_ess/mind_public" - -ANNOTATION_TABLE_MAPPINGS = { - "regional": { - "DATA_TYPE": "REGIONAL_METADATA_RESULTS", - "GEOJSON_COLUMN_NAME": "geojson", - }, - "point": {"DATA_TYPE": "POINT_GEOJSON", "GEOJSON_COLUMN_NAME": "geojson"}, -} - - -def PROJECT_LOCATION(cfg): - """ - ROOT_PATH is a path to mind data e.g. /gpfs/mind/data or hdfs://server:port/data - - :param cfg: - :return: ROOT_PATH/PROJECT_NAME - """ - # This function assumes /data as "ROOT_PATH" - return os.path.join( - cfg.get_value(path=DATA_CFG + "::ROOT_PATH"), - cfg.get_value(path=DATA_CFG + "::PROJECT"), - ) - - -def CONFIG_LOCATION(cfg): - return "{0}/configs/{1}".format(PROJECT_LOCATION(cfg), TABLE_NAME(cfg)) - - -def TABLE_LOCATION(cfg, is_source=False): - return "{0}/tables/{1}".format(PROJECT_LOCATION(cfg), TABLE_NAME(cfg, is_source)) - - -def TABLE_NAME(cfg, is_source=False): - - if is_source: - table_name = cfg.get_value(path=DATA_CFG + "::SOURCE_DATA_TYPE").upper() - else: - table_name = cfg.get_value(path=DATA_CFG + "::DATA_TYPE").upper() - - dataset_name = cfg.get_value(path=DATA_CFG + "::DATASET_NAME") - - if dataset_name != "" and dataset_name is not None: - table_name += "_{0}".format(dataset_name) - - return table_name diff --git a/src/luna/common/utils.py b/src/luna/common/utils.py index 7b0d6a87..80f7934f 100644 --- a/src/luna/common/utils.py +++ b/src/luna/common/utils.py @@ -159,31 +159,6 @@ def wrapper(*args, **kwargs): return wrapper -def to_sql_field(s): - filter1 = s.replace(".", "_").replace(" ", "_") - filter2 = "".join(e for e in filter1 if e.isalnum() or e == "_") - return filter2 - - -def to_sql_value(s): - if isinstance(s, str): - return f"'{s}'" - if not s == s: - return "Null" - if s is None: - return "Null" - else: - return f"{s}" - - -def clean_nested_colname(s): - """ - Removes map name for MapType columns. - e.g. metadata.SeriesInstanceUID -> SeriesInstanceUID - """ - return s[s.find(".") + 1 :] - - def generate_uuid(urlpath: str, prefix, storage_options={}): """ Returns hash of the file given path, preceded by the prefix. @@ -199,39 +174,6 @@ def generate_uuid(urlpath: str, prefix, storage_options={}): return "-".join(prefix) -def rebase_schema_numeric(df): - """ - Tries to convert all columns in a dataframe to numeric types, if possible, with integer types taking precident - - Note: this is an in-place operation - - Args: - df (pd.DataFrame): dataframe to convert columns - """ - for col in df.columns: - if df[col].dtype != object: - continue - - df[col] = df[col].astype(float, errors="ignore") - - -def rebase_schema_mixed(df): - """ - Tries to convert all columns with mixed types to strings. - - Note: this is an in-place operation - - Args: - df (pd.DataFrame): dataframe to convert columns - """ - for col in df.columns: - mixed = (df[[col]].applymap(type) != df[[col]].iloc[0].apply(type)).any(axis=1) - if len(df[mixed]) > 0: - df[col] = df[col].astype(str) - if df[col].dtype == list: - df[col] = df[col].astype(str) - - def generate_uuid_binary(content, prefix): """ Returns hash of the binary, preceded by the prefix. @@ -267,68 +209,6 @@ def generate_uuid_dict(json_str, prefix): return "-".join(prefix) -def does_not_contain(token, value): - """ - Validate that `token` is not a substring of `value` - - :param: token: string e.g. : | . - :param: value: dictionary, list, or str - """ - if isinstance(value, str): - if token in value: - raise ValueError(f"{value} cannot contain {token}") - - if isinstance(value, list): - if any([token in v for v in value]): - raise ValueError(str(value) + f" cannot contain {token}") - - if isinstance(value, dict): - if any( - [ - isinstance(key, str) - and token in key - or isinstance(val, str) - and token in val - for key, val in value.items() - ] - ): - raise ValueError(str(value) + f" cannot contain {token}") - - return True - - -def replace_token(token, token_replacement, value): - """ - Replace `token` with `token_replacement` in `value` - - :param: token: string e.g. : | . - :param: token_replacement: string e.g. _ - - :param: value: dictionary, list, or str - """ - if isinstance(value, str): - return value.replace(token, token_replacement) - - if isinstance(value, list): - new_value = [] - for v in value: - new_value.append(v.replace(token, token_replacement)) - return new_value - - if isinstance(value, dict): - new_value = {} - for key, val in value.items(): - new_key, new_val = key, val - if isinstance(key, str): - new_key = key.replace(token, token_replacement) - if isinstance(val, str): - new_val = val.replace(token, token_replacement) - new_value[new_key] = new_val - - return new_value - - return value - - def grouper(iterable, n): """Turn an iterable into an iterable of iterables @@ -349,20 +229,6 @@ def grouper(iterable, n): ] -def get_method_data(cohort_id, method_id): - """ - Return method dict - - :param: cohort_id: string - :param: method_id: string - """ - - method_dir = os.path.join(os.environ["MIND_GPFS_DIR"], "data", cohort_id, "methods") - with open(os.path.join(method_dir, f"{method_id}.json")) as json_file: - method_config = json.load(json_file)["params"] - return method_config - - def get_absolute_path(module_path, relative_path): """Given the path to a module file and the path, relative to the module file, of another file that needs to be referenced in the module, this method returns the absolute path of the file diff --git a/src/luna/pathology/examples/__init__.py b/src/luna/pathology/examples/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/luna/pathology/examples/extract_feature_vectors.py b/src/luna/pathology/examples/extract_feature_vectors.py deleted file mode 100644 index 161d5afd..00000000 --- a/src/luna/pathology/examples/extract_feature_vectors.py +++ /dev/null @@ -1,256 +0,0 @@ -import logging -import os -import time - -import click -import h5py -import numpy as np -import torch -import torch.nn as nn -from CLAM.datasets.dataset_h5 import Dataset_All_Bags, eval_transforms -from CLAM.models.resnet_custom import resnet50_baseline -from CLAM.utils.file_utils import save_hdf5 -from CLAM.utils.utils import collate_features -from PIL import Image -from torch.utils.data import DataLoader, Dataset - -from luna.common.custom_logger import init_logger -from luna.common.utils import cli_runner -from luna.pathology.common.utils import address_to_coord - -init_logger() -logger = logging.getLogger("extract_feature_vectors") - -_params_ = [ - ("csv_path", str), - ("output_dir", str), - ("batch_size", int), - ("no_auto_skip", bool), -] - -device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") - - -class Luna_Bag(Dataset): - """ - Dataset with Luna tile images. - Luna tile images are saved in h5 file with address-tile image array as key-value pairs. - """ - - def __init__(self, file_path, pretrained=False, custom_transforms=None): - """ - Args: - file_path (string): Path to the .h5 file containing patched data. - pretrained (bool): Use ImageNet transforms - custom_transforms (callable, optional): Optional transform to be applied on a sample - """ - self.pretrained = pretrained - if not custom_transforms: - self.roi_transforms = eval_transforms(pretrained=pretrained) - else: - self.roi_transforms = custom_transforms - - self.file_path = file_path - - with h5py.File(self.file_path, "r") as f: - dset = list(f.keys()) - self.length = len(dset) - - self.summary() - - def __len__(self): - return self.length - - def summary(self): - logger.info("\nfeature extraction settings") - logger.info(f"pretrained: {self.pretrained}") - logger.info(f"transformations: {self.roi_transforms}") - - def __getitem__(self, idx): - with h5py.File(self.file_path, "r") as hdf5_file: - addr = list(hdf5_file.keys())[idx] - img = np.array(hdf5_file[addr]) - img = Image.fromarray(img) - img = self.roi_transforms(img).unsqueeze(0) - return img, address_to_coord(addr) - - -def compute_w_loader( - file_path, - output_path, - model, - batch_size=8, - verbose=0, - print_every=20, - pretrained=True, -): - """ - Extract features and save the vectors. - - Args: - file_path: directory of bag (.h5 file) - output_path: directory to save computed features (.h5 file) - model: pytorch model - batch_size: batch_size for computing features in batches - verbose: level of feedback - pretrained: use weights pretrained on imagenet - """ - dataset = Luna_Bag(file_path=file_path, pretrained=pretrained) - - kwargs = {"num_workers": 4, "pin_memory": True} if device.type == "cuda" else {} - loader = DataLoader( - dataset=dataset, batch_size=batch_size, **kwargs, collate_fn=collate_features - ) - - if verbose > 0: - logger.info("processing {}: total of {} batches".format(file_path, len(loader))) - - mode = "w" - for count, (batch, coords) in enumerate(loader): - with torch.no_grad(): - if count % print_every == 0: - logger.info( - "batch {}/{}, {} files processed".format( - count, len(loader), count * batch_size - ) - ) - batch = batch.to(device, non_blocking=True) - - features = model(batch) - features = features.cpu().numpy() - - asset_dict = {"features": features, "coords": coords} - save_hdf5(output_path, asset_dict, attr_dict=None, mode=mode) - mode = "a" - - return output_path - - -@click.command() -@click.option( - "-c", - "--csv_path", - type=str, - required=False, - help="path to csv with slide_id, tile_image_file columns", -) -@click.option( - "-o", - "--output_dir", - type=str, - required=False, - help="path to save extracted features", -) -@click.option("-bs", "--batch_size", type=int, default=256) -@click.option( - "-as", - "--no_auto_skip", - default=False, - is_flag=True, - help="If true, override existing output. By default, skip if output exists.", -) -@click.option( - "-m", - "--method_param_path", - required=False, - help="path to a metadata json/yaml file with method parameters to reproduce results", -) -def cli(**cli_kwargs): - """An example showing how Luna tile images can be used with CLAM. - Extract 1024-dimensional feature vector from each tile, using a pre-trained ResNet50. - - Note: - Adapted from feature extraction CLI: https://github.com/msk-mind/CLAM/blob/master/extract_features_fp.py - - Setup (utilizes CLAM): - pip install pyluna[pathology] - git clone https://github.com/msk-mind/CLAM.git - export PYTHONPATH=$PYTONPATH:/path/to/CLAM:. - - Example: - !python3 -m luna.pathology.examples.extract_feature_vectors \ - --csv_path dataset.csv \ - --output_dir /path/to/output \ - """ - cli_runner(cli_kwargs, _params_, extract_feature_vectors) - - -def extract_feature_vectors(csv_path, output_dir, batch_size, no_auto_skip): - """ - Extract 1024-dimensional feature vector from each tile, using a pre-trained ResNet50. - - Args: - csv_path (str): path to csv with slide_id, tile_image_file columns - output_dir (str): path to save extracted features - batch_size (int): batch size - no_auto_skip (bool): If true, override existing output. By default, skip if output exists. - - Returns: - dict: metadata about function call - """ - if csv_path is None: - raise NotImplementedError - - bags_dataset = Dataset_All_Bags(csv_path) - - os.makedirs(os.path.join(output_dir, "pt_files"), exist_ok=True) - os.makedirs(os.path.join(output_dir, "h5_files"), exist_ok=True) - dest_files = os.listdir(os.path.join(output_dir, "pt_files")) - - print("loading model checkpoint") - model = resnet50_baseline(pretrained=True) - model = model.to(device) - - # print_network(model) - if torch.cuda.device_count() > 1: - model = nn.DataParallel(model) - - model.eval() - total = len(bags_dataset) - - for bag_candidate_idx in range(total): - slide_id, h5_file_path = bags_dataset[bag_candidate_idx] - - bag_name = slide_id + ".h5" - logger.info("\nprogress: {}/{}".format(bag_candidate_idx + 1, total)) - logger.info(f"processing {slide_id}") - if not no_auto_skip and slide_id + ".pt" in dest_files: - logger.info("skipped {}".format(slide_id)) - continue - - output_path = os.path.join(output_dir, "h5_files", bag_name) - time_start = time.time() - output_file_path = compute_w_loader( - h5_file_path, - output_path, - model=model, - batch_size=batch_size, - verbose=1, - print_every=20, - ) - time_elapsed = time.time() - time_start - logger.info( - "\ncomputing features for {} took {} s".format( - output_file_path, time_elapsed - ) - ) - file = h5py.File(output_file_path, "r") - - features = file["features"][:] - logger.info(f"features size: {features.shape}") - logger.info(f"coordinates size: {file['coords'].shape}") - features = torch.from_numpy(features) - bag_base, _ = os.path.splitext(bag_name) - torch.save(features, os.path.join(output_dir, "pt_files", bag_base + ".pt")) - - properties = { - "dataset_csv": csv_path, - "feat_dir": output_dir, - "batch_size": batch_size, - } - - return properties - - -if __name__ == "__main__": - cli() diff --git a/src/luna/pathology/examples/qupath/import_point_geojson_into_qupath.groovy b/src/luna/pathology/examples/qupath/import_point_geojson_into_qupath.groovy deleted file mode 100644 index 0ed6530b..00000000 --- a/src/luna/pathology/examples/qupath/import_point_geojson_into_qupath.groovy +++ /dev/null @@ -1,54 +0,0 @@ -import qupath.lib.io.GsonTools -import static qupath.lib.gui.scripting.QPEx.* - -// Name of the subdirectory containing the TMA grid -def subDirectory = "Manual points" - -// If true, don't check for existing points -boolean ignoreExisting = false - -// Check we have an image open from a project -def hierarchy = getCurrentHierarchy() -if (hierarchy == null) { - println "No image is available!" - return -} -def name = getProjectEntry()?.getImageName() -if (name == null) { - println "No image name found! Be sure to run this script with a project and image open." - return -} - -// Resist adding (potentially duplicate) points unless the user explicitly requests this -def existingPoints = getAnnotationObjects().findAll {it.getROI()?.isPoint()} -if (!ignoreExisting && !existingPoints.isEmpty()) { - println "Point annotations are already present! Please delete these first, or set ignoreExisting = true at the start of this script." - return -} - -def imageData = getCurrentImageData() -def filename = GeneralTools.getNameWithoutExtension(imageData.getServer().getMetadata().getName()) - -def image_id = filename.replace(".svs", "").toString() -// make sure to change URL, details can be found on confluence on how to configure the API's url accordingly. -// API FORMAT http://{SERVER}:{PORT}/mind/api/v1/getPathologyAnnotation/{PROJECT}/{image_id}/point/{labelset_name} -def url = "http://SERVER:PORT/mind/api/v1/getPathologyAnnotation/OV_16-158/" + image_id + "/point/lymphocyte_detection_labelset" -print(url) - -def get = new URL(url).openConnection(); -def getRC = get.getResponseCode(); - -if(getRC.equals(200)) { - - def text = get.getInputStream().getText(); - - - def type = new com.google.gson.reflect.TypeToken>() {}.getType() - def points = GsonTools.getInstance().fromJson(text, type) - hierarchy.insertPathObjects(points) - println(hierarchy.getAnnotationObjects().size() + " point annotations added to" + filename) - -} - -fireHierarchyUpdate() - diff --git a/src/luna/pathology/examples/qupath/import_regional_geojson_into_qupath.groovy b/src/luna/pathology/examples/qupath/import_regional_geojson_into_qupath.groovy deleted file mode 100644 index c620b368..00000000 --- a/src/luna/pathology/examples/qupath/import_regional_geojson_into_qupath.groovy +++ /dev/null @@ -1,57 +0,0 @@ -import qupath.lib.objects.* -import qupath.lib.roi.* -import com.google.gson.Gson -import java.io.FileReader; -import groovy.io.FileType -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.util.stream.Collectors; -import qupath.lib.geom.Point2 - -def imageData = QPEx.getCurrentImageData() -def server = getCurrentServer() -def filename = GeneralTools.getNameWithoutExtension(server.getMetadata().getName()) -def image_id = filename.replace(".svs", "").toString() - -// make sure to change URL, details can be found on confluence on how to configure the API's url accordingly. -// API FORMAT http://{SERVER}:{PORT}/mind/api/v1/getPathologyAnnotation/{PROJECT}/{image_id}/regional/{labelset_name} -def url = "http://SERVER:PORT/mind/api/v1/getPathologyAnnotation/OV_16-158/" + image_id + "/regional" + "/simplified_pixel_classifier_labels" -print(url) - -def get = new URL(url).openConnection(); -def getRC = get.getResponseCode(); - -if(getRC.equals(200)) { - - - def hierarchy = imageData.getHierarchy() - def text = get.getInputStream().getText(); - - //Read into a map - def map = new Gson().fromJson(text, Map) - - - annotations = [] - - for (feat in map['features']) { - def name = feat['properties']['label_name'].toString() - def vertices = feat['geometry']['coordinates'][0] - def points = vertices.collect {new Point2(it[0], it[1])} - def polygon = new PolygonROI(points) - def pathAnnotation = new PathAnnotationObject(polygon) - pathAnnotation.setPathClass(getPathClass(name)) - annotations << pathAnnotation - } - - hierarchy.addPathObjects(annotations) - println(hierarchy.getAnnotationObjects().size() + " annotations added to" + filename) - - -} - - -fireHierarchyUpdate() - - - diff --git a/tests/luna/common/test_constants.py b/tests/luna/common/test_constants.py deleted file mode 100644 index ed64e28d..00000000 --- a/tests/luna/common/test_constants.py +++ /dev/null @@ -1,65 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on October 17, 2019 - -@author: aukermaa@mskcc.org -""" -import luna.common.constants as const -from luna.common.config import ConfigSet - - -def test_table_name(): - c1 = ConfigSet( - name=const.DATA_CFG, - config_file="tests/testdata/common/data_ingestion_template_valid.yml", - ) - - assert const.TABLE_NAME(c1) == "CT_OV_16-158_CT_20201028" - - -def test_table_location(): - c1 = ConfigSet( - name=const.DATA_CFG, - config_file="tests/testdata/common/data_ingestion_template_valid.yml", - ) - - assert ( - const.TABLE_LOCATION(c1) - == "tests/luna/radiology/proxy_table/test_data/OV_16-158/tables/CT_OV_16-158_CT_20201028" - ) - - -def test_table_location_emptystring(): - c1 = ConfigSet( - name=const.DATA_CFG, - config_file="tests/testdata/common/data_ingestion_template_valid_empty_dataset.yml", - ) - - assert ( - const.TABLE_LOCATION(c1) - == "tests/luna/radiology/proxy_table/test_data/OV_16-158/tables/CT" - ) - - -def test_table_location_none(): - c1 = ConfigSet( - name=const.DATA_CFG, - config_file="tests/testdata/common/data_ingestion_template_valid_empty_dataset_2.yml", - ) - - assert ( - const.TABLE_LOCATION(c1) - == "tests/luna/radiology/proxy_table/test_data/OV_16-158/tables/CT" - ) - - -def test_project_location(): - c1 = ConfigSet( - name=const.DATA_CFG, - config_file="tests/testdata/common/data_ingestion_template_valid.yml", - ) - - assert ( - const.PROJECT_LOCATION(c1) - == "tests/luna/radiology/proxy_table/test_data/OV_16-158" - ) diff --git a/tests/luna/pathology/common/test_slideviewer_client.py b/tests/luna/pathology/common/test_slideviewer_client.py index 0811219c..6257482c 100644 --- a/tests/luna/pathology/common/test_slideviewer_client.py +++ b/tests/luna/pathology/common/test_slideviewer_client.py @@ -9,7 +9,6 @@ from pathlib import Path from luna.common.config import ConfigSet -from luna.common.constants import DATA_CFG from luna.pathology.common.slideviewer_client import ( download_sv_point_annotation, download_zip, @@ -26,7 +25,7 @@ zipfile_path = None PROJECT_PATH = None ROOT_PATH = None - +DATA_CFG = "DATA_CFG" def setup_module(module): """setup any state specific to the execution of the given module."""