diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2acc7b5b..f1c8467a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -6,16 +6,23 @@ jobs: build: runs-on: ubuntu-20.04 + + services: + localstack: + image: localstack/localstack + ports: + - "4566:4566" + - "4510-4559:4510-4559" + container: image: mambaorg/micromamba:1.0.0 options: --user root - steps: - name: Install required packages run: | apt-get update - apt-get install -y build-essential libgdal-dev liblapack-dev libblas-dev gfortran libgl1 git curl make + apt-get install -y build-essential libgdal-dev liblapack-dev libblas-dev gfortran libgl1 git curl make python3-pip git config --system --add safe.directory * - uses: actions/checkout@v3 @@ -52,6 +59,7 @@ jobs: run: | . ./venv/bin/activate export LUNA_HOME=$PWD + export LOCALSTACK_ENDPOINT_URL=http://localstack:4566 pytest -v --capture=tee-sys --show-capture=all tests --cov-report=xml --junitxml=./luna-tests/results.xml - name: Build mkdocs diff --git a/docker/localstack/docker-compose.yml b/docker/localstack/docker-compose.yml new file mode 100644 index 00000000..62b48e77 --- /dev/null +++ b/docker/localstack/docker-compose.yml @@ -0,0 +1,17 @@ +version: "3.8" + +services: + localstack: + container_name: "${LOCALSTACK_DOCKER_NAME-localstack_main}" + image: localstack/localstack + ports: + - "4566:4566" # LocalStack Gateway + - "4510-4559:4510-4559" # external services port range + environment: + - DEBUG=${DEBUG-} + - DOCKER_HOST=unix:///var/run/docker.sock + volumes: + - "${LOCALSTACK_VOLUME_DIR:-./volume}:/var/lib/localstack" + - "/var/run/docker.sock:/var/run/docker.sock" + healthcheck: + test: "bash -c 'AWS_ACCESS_KEY_ID=fake AWS_SECRET_ACCESS_KEY=fake aws --endpoint-url=http://localhost:4566 s3 ls'" diff --git a/poetry.lock b/poetry.lock index 0b801c93..a1e37fa6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -19,21 +19,21 @@ docs = ["furo", "myst-parser", "numpydoc", "sphinx"] [[package]] name = 
"aiobotocore" -version = "2.4.2" +version = "2.5.4" description = "Async client for aws services using botocore and aiohttp" category = "main" optional = false python-versions = ">=3.7" [package.dependencies] -aiohttp = ">=3.3.1" -aioitertools = ">=0.5.1" -botocore = ">=1.27.59,<1.27.60" -wrapt = ">=1.10.10" +aiohttp = ">=3.3.1,<4.0.0" +aioitertools = ">=0.5.1,<1.0.0" +botocore = ">=1.31.17,<1.31.18" +wrapt = ">=1.10.10,<2.0.0" [package.extras] -awscli = ["awscli (>=1.25.60,<1.25.61)"] -boto3 = ["boto3 (>=1.24.59,<1.24.60)"] +awscli = ["awscli (>=1.29.17,<1.29.18)"] +boto3 = ["boto3 (>=1.28.17,<1.28.18)"] [[package]] name = "aiohttp" @@ -373,7 +373,7 @@ typing-extensions = ">=3.10.0" [[package]] name = "botocore" -version = "1.27.59" +version = "1.31.17" description = "Low-level, data-driven core of boto 3." category = "main" optional = false @@ -385,7 +385,7 @@ python-dateutil = ">=2.1,<3.0.0" urllib3 = ">=1.25.4,<1.27" [package.extras] -crt = ["awscrt (==0.14.0)"] +crt = ["awscrt (==0.16.26)"] [[package]] name = "bracex" @@ -1091,19 +1091,20 @@ python-versions = ">=3.7" [[package]] name = "fsspec" -version = "2022.10.0" +version = "2023.6.0" description = "File-system specification" category = "main" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" [package.extras] abfs = ["adlfs"] adl = ["adlfs"] arrow = ["pyarrow (>=1)"] dask = ["dask", "distributed"] +devel = ["pytest", "pytest-cov"] dropbox = ["dropbox", "dropboxdrivefs", "requests"] -entrypoints = ["importlib-metadata"] +full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"] fuse = ["fusepy"] gcs = ["gcsfs"] git = ["pygit2"] @@ -1959,7 +1960,7 @@ python-versions = ">=3.7" [[package]] name = "large-image" -version = "1.23.5" +version = "1.23.6" description = "Python modules to work with large, multiresolution 
images." category = "main" optional = false @@ -1967,51 +1968,51 @@ python-versions = ">=3.6" [package.dependencies] cachetools = "*" -large-image-source-pil = {version = ">=1.23.5", optional = true, markers = "extra == \"pil\""} +large-image-source-pil = {version = ">=1.23.6", optional = true, markers = "extra == \"pil\""} numpy = "*" palettable = "*" Pillow = "*" [package.extras] -all = ["large-image-converter (>=1.23.5)", "large-image-source-bioformats (>=1.23.5)", "large-image-source-deepzoom (>=1.23.5)", "large-image-source-dicom (>=1.23.5)", "large-image-source-dummy (>=1.23.5)", "large-image-source-gdal (>=1.23.5)", "large-image-source-mapnik (>=1.23.5)", "large-image-source-multi (>=1.23.5)", "large-image-source-nd2 (>=1.23.5)", "large-image-source-ometiff (>=1.23.5)", "large-image-source-openjpeg (>=1.23.5)", "large-image-source-openslide (>=1.23.5)", "large-image-source-pil (>=1.23.5)", "large-image-source-pil[all] (>=1.23.5)", "large-image-source-rasterio (>=1.23.5)", "large-image-source-rasterio[all] (>=1.23.5)", "large-image-source-test (>=1.23.5)", "large-image-source-tiff (>=1.23.5)", "large-image-source-tifffile (>=1.23.5)", "large-image-source-vips (>=1.23.5)", "matplotlib", "psutil (>=4.2.0)", "pylibmc (>=1.5.1)", "pyvips", "simplejpeg", "simplejpeg (<1.6.6)"] -bioformats = ["large-image-source-bioformats (>=1.23.5)"] +all = ["large-image-converter (>=1.23.6)", "large-image-source-bioformats (>=1.23.6)", "large-image-source-deepzoom (>=1.23.6)", "large-image-source-dicom (>=1.23.6)", "large-image-source-dummy (>=1.23.6)", "large-image-source-gdal (>=1.23.6)", "large-image-source-mapnik (>=1.23.6)", "large-image-source-multi (>=1.23.6)", "large-image-source-nd2 (>=1.23.6)", "large-image-source-ometiff (>=1.23.6)", "large-image-source-openjpeg (>=1.23.6)", "large-image-source-openslide (>=1.23.6)", "large-image-source-pil (>=1.23.6)", "large-image-source-pil[all] (>=1.23.6)", "large-image-source-rasterio (>=1.23.6)", 
"large-image-source-rasterio[all] (>=1.23.6)", "large-image-source-test (>=1.23.6)", "large-image-source-tiff (>=1.23.6)", "large-image-source-tifffile (>=1.23.6)", "large-image-source-vips (>=1.23.6)", "matplotlib", "psutil (>=4.2.0)", "pylibmc (>=1.5.1)", "pyvips", "simplejpeg", "simplejpeg (<1.6.6)"] +bioformats = ["large-image-source-bioformats (>=1.23.6)"] colormaps = ["matplotlib"] -converter = ["large-image-converter (>=1.23.5)"] -deepzoom = ["large-image-source-deepzoom (>=1.23.5)"] -dicom = ["large-image-source-dicom (>=1.23.5)"] -dummy = ["large-image-source-dummy (>=1.23.5)"] -gdal = ["large-image-source-gdal (>=1.23.5)"] -mapnik = ["large-image-source-mapnik (>=1.23.5)"] +converter = ["large-image-converter (>=1.23.6)"] +deepzoom = ["large-image-source-deepzoom (>=1.23.6)"] +dicom = ["large-image-source-dicom (>=1.23.6)"] +dummy = ["large-image-source-dummy (>=1.23.6)"] +gdal = ["large-image-source-gdal (>=1.23.6)"] +mapnik = ["large-image-source-mapnik (>=1.23.6)"] memcached = ["pylibmc (>=1.5.1)"] -multi = ["large-image-source-multi (>=1.23.5)"] -nd2 = ["large-image-source-nd2 (>=1.23.5)"] -ometiff = ["large-image-source-ometiff (>=1.23.5)"] -openjpeg = ["large-image-source-openjpeg (>=1.23.5)"] -openslide = ["large-image-source-openslide (>=1.23.5)"] +multi = ["large-image-source-multi (>=1.23.6)"] +nd2 = ["large-image-source-nd2 (>=1.23.6)"] +ometiff = ["large-image-source-ometiff (>=1.23.6)"] +openjpeg = ["large-image-source-openjpeg (>=1.23.6)"] +openslide = ["large-image-source-openslide (>=1.23.6)"] performance = ["psutil (>=4.2.0)", "simplejpeg", "simplejpeg (<1.6.6)"] -pil = ["large-image-source-pil (>=1.23.5)"] -rasterio = ["large-image-source-rasterio (>=1.23.5)"] -sources = ["large-image-source-bioformats (>=1.23.5)", "large-image-source-deepzoom (>=1.23.5)", "large-image-source-dicom (>=1.23.5)", "large-image-source-dummy (>=1.23.5)", "large-image-source-gdal (>=1.23.5)", "large-image-source-mapnik (>=1.23.5)", "large-image-source-multi 
(>=1.23.5)", "large-image-source-nd2 (>=1.23.5)", "large-image-source-ometiff (>=1.23.5)", "large-image-source-openjpeg (>=1.23.5)", "large-image-source-openslide (>=1.23.5)", "large-image-source-pil (>=1.23.5)", "large-image-source-rasterio (>=1.23.5)", "large-image-source-test (>=1.23.5)", "large-image-source-tiff (>=1.23.5)", "large-image-source-tifffile (>=1.23.5)", "large-image-source-vips (>=1.23.5)"] -test = ["large-image-source-test (>=1.23.5)"] -tiff = ["large-image-source-tiff (>=1.23.5)"] -tifffile = ["large-image-source-tifffile (>=1.23.5)"] +pil = ["large-image-source-pil (>=1.23.6)"] +rasterio = ["large-image-source-rasterio (>=1.23.6)"] +sources = ["large-image-source-bioformats (>=1.23.6)", "large-image-source-deepzoom (>=1.23.6)", "large-image-source-dicom (>=1.23.6)", "large-image-source-dummy (>=1.23.6)", "large-image-source-gdal (>=1.23.6)", "large-image-source-mapnik (>=1.23.6)", "large-image-source-multi (>=1.23.6)", "large-image-source-nd2 (>=1.23.6)", "large-image-source-ometiff (>=1.23.6)", "large-image-source-openjpeg (>=1.23.6)", "large-image-source-openslide (>=1.23.6)", "large-image-source-pil (>=1.23.6)", "large-image-source-rasterio (>=1.23.6)", "large-image-source-test (>=1.23.6)", "large-image-source-tiff (>=1.23.6)", "large-image-source-tifffile (>=1.23.6)", "large-image-source-vips (>=1.23.6)"] +test = ["large-image-source-test (>=1.23.6)"] +tiff = ["large-image-source-tiff (>=1.23.6)"] +tifffile = ["large-image-source-tifffile (>=1.23.6)"] tiledoutput = ["pyvips"] -vips = ["large-image-source-vips (>=1.23.5)"] +vips = ["large-image-source-vips (>=1.23.6)"] [[package]] name = "large-image-source-pil" -version = "1.23.5" +version = "1.23.6" description = "A Pillow tilesource for large_image." 
category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -large-image = ">=1.23.5" +large-image = ">=1.23.6" [package.extras] all = ["pillow-heif", "rawpy"] -girder = ["girder-large-image (>=1.23.5)"] +girder = ["girder-large-image (>=1.23.6)"] [[package]] name = "locket" @@ -3852,20 +3853,20 @@ python-versions = ">=3.5" [[package]] name = "s3fs" -version = "2022.10.0" +version = "2023.6.0" description = "Convenient Filesystem interface over S3" category = "main" optional = false -python-versions = ">= 3.7" +python-versions = ">= 3.8" [package.dependencies] -aiobotocore = ">=2.4.0,<2.5.0" +aiobotocore = ">=2.5.0,<2.6.0" aiohttp = "<4.0.0a0 || >4.0.0a0,<4.0.0a1 || >4.0.0a1" -fsspec = "2022.10.0" +fsspec = "2023.6.0" [package.extras] -awscli = ["aiobotocore[awscli] (>=2.4.0,<2.5.0)"] -boto3 = ["aiobotocore[boto3] (>=2.4.0,<2.5.0)"] +awscli = ["aiobotocore[awscli] (>=2.5.0,<2.6.0)"] +boto3 = ["aiobotocore[boto3] (>=2.5.0,<2.6.0)"] [[package]] name = "scantree" @@ -4887,7 +4888,7 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools" [metadata] lock-version = "1.1" python-versions = ">=3.9,<3.12" -content-hash = "0e4ea0ad97396e576795f5f34622b94bdf191a6b3d175375d2f278e54aaaa1db" +content-hash = "94d643c68985a9b1bb7e6276c0c31c4f34e7ce7a418cf42367d81624d8d93041" [metadata.files] adlfs = [ @@ -4895,8 +4896,8 @@ adlfs = [ {file = "adlfs-2023.8.0.tar.gz", hash = "sha256:07e804f6df4593acfcaf01025b162e30ac13e523d3570279c98b2d91a18026d9"}, ] aiobotocore = [ - {file = "aiobotocore-2.4.2-py3-none-any.whl", hash = "sha256:4acd1ebe2e44be4b100aa553910bda899f6dc090b3da2bc1cf3d5de2146ed208"}, - {file = "aiobotocore-2.4.2.tar.gz", hash = "sha256:0603b74a582dffa7511ce7548d07dc9b10ec87bc5fb657eb0b34f9bd490958bf"}, + {file = "aiobotocore-2.5.4-py3-none-any.whl", hash = "sha256:4b32218728ca3d0be83835b604603a0cd6c329066e884bb78149334267f92440"}, + {file = "aiobotocore-2.5.4.tar.gz", hash = 
"sha256:60341f19eda77e41e1ab11eef171b5a98b5dbdb90804f5334b6f90e560e31fae"}, ] aiohttp = [ {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a94159871304770da4dd371f4291b20cac04e8c94f11bdea1c3478e557fbe0d8"}, @@ -5115,8 +5116,8 @@ bokeh = [ {file = "bokeh-2.4.3.tar.gz", hash = "sha256:ef33801161af379665ab7a34684f2209861e3aefd5c803a21fbbb99d94874b03"}, ] botocore = [ - {file = "botocore-1.27.59-py3-none-any.whl", hash = "sha256:69d756791fc024bda54f6c53f71ae34e695ee41bbbc1743d9179c4837a4929da"}, - {file = "botocore-1.27.59.tar.gz", hash = "sha256:eda4aed6ee719a745d1288eaf1beb12f6f6448ad1fa12f159405db14ba9c92cf"}, + {file = "botocore-1.31.17-py3-none-any.whl", hash = "sha256:6ac34a1d34aa3750e78b77b8596617e2bab938964694d651939dba2cbde2c12b"}, + {file = "botocore-1.31.17.tar.gz", hash = "sha256:396459065dba4339eb4da4ec8b4e6599728eb89b7caaceea199e26f7d824a41c"}, ] bracex = [ {file = "bracex-2.3.post1-py3-none-any.whl", hash = "sha256:351b7f20d56fb9ea91f9b9e9e7664db466eb234188c175fd943f8f755c807e73"}, @@ -5650,8 +5651,8 @@ frozenlist = [ {file = "frozenlist-1.3.3.tar.gz", hash = "sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a"}, ] fsspec = [ - {file = "fsspec-2022.10.0-py3-none-any.whl", hash = "sha256:6b7c6ab3b476cdf17efcfeccde7fca28ef5a48f73a71010aaceec5fc15bf9ebf"}, - {file = "fsspec-2022.10.0.tar.gz", hash = "sha256:cb6092474e90487a51de768170f3afa50ca8982c26150a59072b16433879ff1d"}, + {file = "fsspec-2023.6.0-py3-none-any.whl", hash = "sha256:1cbad1faef3e391fba6dc005ae9b5bdcbf43005c9167ce78c915549c352c869a"}, + {file = "fsspec-2023.6.0.tar.gz", hash = "sha256:d0b2f935446169753e7a5c5c55681c54ea91996cc67be93c39a154fb3a2742af"}, ] future = [ {file = "future-0.18.2.tar.gz", hash = "sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"}, @@ -6299,12 +6300,12 @@ kiwisolver = [ {file = "kiwisolver-1.4.4.tar.gz", hash = "sha256:d41997519fcba4a1e46eb4a2fe31bc12f0ff957b2b81bac28db24744f333e955"}, ] 
large-image = [ - {file = "large-image-1.23.5.tar.gz", hash = "sha256:20b5540317f147b6ee185c33de2e06a9b9bd4e852e1ef0950008efacc00de57d"}, - {file = "large_image-1.23.5-py3-none-any.whl", hash = "sha256:eda204ef87ff5de0540a98f02024ad92d7bf73f5116bb791fa4741b59e951885"}, + {file = "large-image-1.23.6.tar.gz", hash = "sha256:2141b9b84822c913a6db10f82b5c7a1a273189bd409032732cfc5904ba916977"}, + {file = "large_image-1.23.6-py3-none-any.whl", hash = "sha256:7087f5ef436918d648e944b032d9d275349d13860243f00cc9367f7c31e15050"}, ] large-image-source-pil = [ - {file = "large-image-source-pil-1.23.5.tar.gz", hash = "sha256:53be3a219fdd255dab27388f4718672bc5df835083c5ed17869ef79666656a3e"}, - {file = "large_image_source_pil-1.23.5-py3-none-any.whl", hash = "sha256:a2eaa6d6eaea1b116dbd77fcc58d907cfa5c6c4a66be0525140f3a3b3085f356"}, + {file = "large-image-source-pil-1.23.6.tar.gz", hash = "sha256:362f89f2477a893dcd2dafa25c6eec607182e3556c2997859471949c88b16077"}, + {file = "large_image_source_pil-1.23.6-py3-none-any.whl", hash = "sha256:aca9a6cf9f46994288a6257b785e572e23f47669a94ea7d9200cfdaec261068b"}, ] locket = [ {file = "locket-1.0.0-py2.py3-none-any.whl", hash = "sha256:b6c819a722f7b6bd955b80781788e4a66a55628b858d347536b7e81325a3a5e3"}, @@ -7806,8 +7807,8 @@ rtree = [ {file = "ruamel.yaml.clib-0.2.7.tar.gz", hash = "sha256:1f08fd5a2bea9c4180db71678e850b995d2a5f4537be0e94557668cf0f5f9497"}, ] s3fs = [ - {file = "s3fs-2022.10.0-py3-none-any.whl", hash = "sha256:1e134c3577171699feb7c1a0c4713260d5b48296e1708737ff940baef6e2c153"}, - {file = "s3fs-2022.10.0.tar.gz", hash = "sha256:e8deb80f20bd0b2059141b874fdb9d6aeb8cce35312ea5f2c02b225a78a00406"}, + {file = "s3fs-2023.6.0-py3-none-any.whl", hash = "sha256:d1a0a423d0d2e17fb2a193d9531935dc3f45ba742693448a461b6b34f6a92a24"}, + {file = "s3fs-2023.6.0.tar.gz", hash = "sha256:63fd8ddf05eb722de784b7b503196107f2a518061298cf005a8a4715b4d49117"}, ] scantree = [ {file = "scantree-0.0.1.tar.gz", hash = 
"sha256:2a8b163de0e4b2f9e4f37f8caf3f0b265172bbf174111e1bebc7955581895b39"}, diff --git a/pyproject.toml b/pyproject.toml index 06632e96..ad776700 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,7 @@ appdirs = "^1.4.4" loguru = "^0.6.0" pyvips = "^2.2.1" tiffslide = "^2.1.0" -s3fs = "^2022.10.0" +s3fs = "^2023.6.0" pandera = {extras = ["io"], version = "^0.14.5"} multimethod = "^1.9.1" trimesh = "^3.22.0" diff --git a/src/luna/common/utils.py b/src/luna/common/utils.py index cb8cd202..bc6fd8a4 100644 --- a/src/luna/common/utils.py +++ b/src/luna/common/utils.py @@ -42,7 +42,7 @@ def validate_dask_address(addr: str) -> bool: The typical format for this will be something like 'tcp://192.168.0.37:8786', but there could be a hostname instead of an IP address, and maybe some other - URL schemes are supported. This function will be used to check whether a + URL schemes are supported. This function will be used to check whether a user-defined dask scheduler address is plausible, or obviously invalid. 
""" HOSTPORT_RE = re.compile( @@ -51,7 +51,7 @@ def validate_dask_address(addr: str) -> bool: [A-Za-z][A-Za-z0-9.-]*[A-Za-z0-9] | [A-Za-z]) : (?P\d+)$""", - re.VERBOSE + re.VERBOSE, ) return bool(HOSTPORT_RE.match(addr)) @@ -88,7 +88,6 @@ def wrapper(*args, **kwargs): args_dict = _get_args_dict(func, args, kwargs) new_args_dict = args_dict.copy() - filesystem = None tmp_dir_dest = [] for key, write_mode in dir_key_write_mode.items(): if not args_dict[key]: @@ -99,11 +98,11 @@ def wrapper(*args, **kwargs): fs, dir = fsspec.core.url_to_fs( args_dict[key], **args_dict.get(storage_options_key, {}) ) - if fs.protocol != "file" and 'cache' not in fs.protocol: + if fs.protocol != "file" and "cache" not in fs.protocol: new_args_dict[storage_options_key] = {"auto_mkdir": True} tmp_dir = tempfile.TemporaryDirectory() new_args_dict[key] = tmp_dir.name - tmp_dir_dest.append((tmp_dir, dir)) + tmp_dir_dest.append((tmp_dir, dir, fs)) result = None with ExitStack() as stack: @@ -116,7 +115,7 @@ def wrapper(*args, **kwargs): fs, path = fsspec.core.url_to_fs( args_dict[key], **args_dict.get(storage_options_key, {}) ) - if 'cache' not in fs.protocol: + if "cache" not in fs.protocol: simplecache_fs = fsspec.filesystem("simplecache", fs=fs) of = simplecache_fs.open(path, write_mode) @@ -125,8 +124,8 @@ def wrapper(*args, **kwargs): result = func(**new_args_dict) - for tmp_dir, dest in tmp_dir_dest: - copy_files(tmp_dir.name, dest, destination_filesystem=filesystem) + for tmp_dir, dest, fs in tmp_dir_dest: + copy_files(tmp_dir.name, dest, destination_filesystem=fs) return result @@ -204,6 +203,7 @@ def rebase_schema_numeric(df): df[col] = df[col].astype(float, errors="ignore") + def rebase_schema_mixed(df): """ Tries to convert all columns with mixed types to strings. @@ -220,6 +220,7 @@ def rebase_schema_mixed(df): if df[col].dtype == list: df[col] = df[col].astype(str) + def generate_uuid_binary(content, prefix): """ Returns hash of the binary, preceded by the prefix. 
diff --git a/src/luna/pathology/cli/dsa_viz.py b/src/luna/pathology/cli/dsa_viz.py index 6ae3b252..3cd18499 100644 --- a/src/luna/pathology/cli/dsa_viz.py +++ b/src/luna/pathology/cli/dsa_viz.py @@ -144,17 +144,17 @@ def save_dsa_annotation( annotation_name_replaced = dsa_annotation["name"].replace(" ", "_") fs, output_urlpath_prefix = fsspec.core.url_to_fs(output_urlpath, **storage_options) - output_url = ( + output_path = ( Path(output_urlpath_prefix) / f"{annotation_name_replaced}_{image_id}.json" ) try: - with open(output_url, "w", **storage_options).open() as outfile: + with fs.open(output_path, "w") as outfile: json.dump(dsa_annotation, outfile) logger.info( - f"Saved {len(dsa_annotation['elements'])} to {fs.unstrip_protocol(str(output_url))}" + f"Saved {len(dsa_annotation['elements'])} to {fs.unstrip_protocol(str(output_path))}" ) - return output_url + return fs.unstrip_protocol(str(output_path)) except Exception as exc: logger.error(exc) return None diff --git a/src/luna/pathology/cli/extract_shape_features.py b/src/luna/pathology/cli/extract_shape_features.py index 2bf21150..db092514 100644 --- a/src/luna/pathology/cli/extract_shape_features.py +++ b/src/luna/pathology/cli/extract_shape_features.py @@ -7,16 +7,11 @@ import numpy as np import pandas as pd import tifffile -import tiffslide from fsspec import open from loguru import logger -from pandera.typing import DataFrame from skimage import measure -from scipy.stats import entropy -from luna.common.models import LabeledTileSchema from luna.common.utils import get_config, save_metadata, timed -from luna.pathology.cli.generate_tile_mask import convert_tiles_to_mask @timed @@ -59,7 +54,9 @@ def cli( mask = tifffile.imread(of) mask_values = {k: v + 1 for v, k in enumerate(config["label_cols"])} - result_df = extract_shape_features(mask, mask_values, config['include_smaller_regions']) + result_df = extract_shape_features( + mask, mask_values, config["include_smaller_regions"] + ) fs, urlpath = 
fsspec.core.url_to_fs( config["output_urlpath"], **config["output_storage_options"] @@ -114,7 +111,6 @@ def extract_whole_slide_features( value, counts = np.unique(mask, return_counts=True) - logger.info("Extracting whole slide features") # gathering whole slide features, one vector per label whole_slide_features = measure.regionprops_table( @@ -122,14 +118,19 @@ def extract_whole_slide_features( ) whole_slide_features_df = pd.DataFrame.from_dict(whole_slide_features) - if 'perimeter' in whole_slide_features_df.columns and 'area' in whole_slide_features_df.columns: - whole_slide_features_df['perimeter_area_ratio'] = whole_slide_features_df['perimeter'] / whole_slide_features_df['area'] + if ( + "perimeter" in whole_slide_features_df.columns + and "area" in whole_slide_features_df.columns + ): + whole_slide_features_df["perimeter_area_ratio"] = ( + whole_slide_features_df["perimeter"] / whole_slide_features_df["area"] + ) # add column with label name whole_slide_features_df["Class"] = whole_slide_features_df["label"].map( label_mapper ) - whole_slide_features_df = whole_slide_features_df.drop('label', axis=1) + whole_slide_features_df = whole_slide_features_df.drop("label", axis=1) logger.info( f"Extracted whole slide features for {len(whole_slide_features_df)} labels" ) @@ -181,8 +182,13 @@ def extract_regional_features( ) regional_features_df = pd.DataFrame.from_dict(regional_features) - if 'perimeter' in regional_features_df.columns and 'area' in regional_features_df.columns: - regional_features_df['perimeter_area_ratio'] = regional_features_df['perimeter'] / regional_features_df['area'] + if ( + "perimeter" in regional_features_df.columns + and "area" in regional_features_df.columns + ): + regional_features_df["perimeter_area_ratio"] = ( + regional_features_df["perimeter"] / regional_features_df["area"] + ) # add column with label name regional_features_df["Class"] = regional_features_df["min_intensity"].map( @@ -191,7 +197,7 @@ def extract_regional_features( 
regional_features_df = regional_features_df.drop( columns=["max_intensity", "min_intensity"] ) - regional_features_df = regional_features_df.drop('label', axis=1) + regional_features_df = regional_features_df.drop("label", axis=1) logger.info(f"Extracted regional features for {len(regional_features_df)} regions") @@ -201,7 +207,7 @@ def extract_regional_features( def extract_shape_features( mask: np.ndarray, mask_values: Dict[int, str], - include_smaller_regions = False, + include_smaller_regions=False, properties: List[str] = [ "area", "bbox", @@ -246,11 +252,17 @@ def extract_shape_features( logger.info(f"Mask shape={mask.shape}") logger.info("Extracting regional features based on connectivity") - whole_slide_features_df = extract_whole_slide_features(mask, mask_values, properties) - whole_slide_features_df['Parent'] = 'whole_region' - whole_slide_features_df = whole_slide_features_df.set_index('Class') - whole_slide_features_df['area_fraction'] = whole_slide_features_df['area'] / whole_slide_features_df['area'].sum() - whole_slide_features_mdf = pd.melt(whole_slide_features_df.reset_index(), id_vars=['Parent', 'Class']) + whole_slide_features_df = extract_whole_slide_features( + mask, mask_values, properties + ) + whole_slide_features_df["Parent"] = "whole_region" + whole_slide_features_df = whole_slide_features_df.set_index("Class") + whole_slide_features_df["area_fraction"] = ( + whole_slide_features_df["area"] / whole_slide_features_df["area"].sum() + ) + whole_slide_features_mdf = pd.melt( + whole_slide_features_df.reset_index(), id_vars=["Parent", "Class"] + ) area_col = whole_slide_features_df.columns.get_loc("area") idx0, idx1 = np.triu_indices(len(whole_slide_features_df), 1) @@ -260,7 +272,8 @@ def extract_shape_features( "Parent": "whole_region", "variable": np.array( [ - f"area_log_ratio_to_{row}" for row in whole_slide_features_df.index.values + f"area_log_ratio_to_{row}" + for row in whole_slide_features_df.index.values ] )[idx1], "value": 
np.log(whole_slide_features_df.iloc[idx0, area_col].values) @@ -270,18 +283,29 @@ def extract_shape_features( ) whole_slide_ratio_df = whole_slide_ratio_df.reset_index() - properties += ['min_intensity', 'max_intensity'] - regional_features_df = extract_regional_features(mask, mask_values, properties) - regional_features_df = regional_features_df.assign(Parent=[f'region_{x}' for x in range(len(regional_features_df))]) - regional_features_df = regional_features_df.set_index(['Parent', 'Class']) - regional_features_df['area_fraction'] = regional_features_df['area'] / whole_slide_features_df['area'] - regional_features_mdf = pd.melt(regional_features_df.reset_index(), id_vars=['Parent', 'Class']) + regional_features_df = extract_regional_features( + mask, mask_values, properties + ["min_intensity", "max_intensity"] + ) + regional_features_df = regional_features_df.assign( + Parent=[f"region_{x}" for x in range(len(regional_features_df))] + ) + regional_features_df = regional_features_df.set_index(["Parent", "Class"]) + regional_features_df["area_fraction"] = ( + regional_features_df["area"] / whole_slide_features_df["area"] + ) + regional_features_mdf = pd.melt( + regional_features_df.reset_index(), id_vars=["Parent", "Class"] + ) regional_features_df = regional_features_df.reset_index() - largest_regional_features_df = regional_features_df.loc[regional_features_df.groupby('Class')['area'].idxmax()] - largest_regional_features_df['Parent'] = 'largest_region' - largest_regional_features_df = largest_regional_features_df.set_index('Class') - largest_regional_features_mdf = pd.melt(largest_regional_features_df.reset_index(), id_vars=['Parent', 'Class']) + largest_regional_features_df = regional_features_df.loc[ + regional_features_df.groupby("Class")["area"].idxmax() + ] + largest_regional_features_df["Parent"] = "largest_region" + largest_regional_features_df = largest_regional_features_df.set_index("Class") + largest_regional_features_mdf = pd.melt( + 
largest_regional_features_df.reset_index(), id_vars=["Parent", "Class"] + ) area_col = largest_regional_features_df.columns.get_loc("area") idx0, idx1 = np.triu_indices(len(largest_regional_features_df), 1) @@ -291,7 +315,8 @@ def extract_shape_features( "Parent": "largest_region", "variable": np.array( [ - f"area_log_ratio_to_{row}" for row in largest_regional_features_df.index.values + f"area_log_ratio_to_{row}" + for row in largest_regional_features_df.index.values ] )[idx1], "value": np.log(largest_regional_features_df.iloc[idx0, area_col].values) @@ -301,8 +326,14 @@ def extract_shape_features( ) ratio_df = ratio_df.reset_index() - result_df = pd.concat([whole_slide_features_mdf, whole_slide_ratio_df, - largest_regional_features_mdf, ratio_df]) + result_df = pd.concat( + [ + whole_slide_features_mdf, + whole_slide_ratio_df, + largest_regional_features_mdf, + ratio_df, + ] + ) if include_smaller_regions: result_df = pd.concat([result_df, regional_features_mdf]) diff --git a/src/luna/pathology/cli/extract_tile_shape_features.py b/src/luna/pathology/cli/extract_tile_shape_features.py index de8deb2c..1258205a 100644 --- a/src/luna/pathology/cli/extract_tile_shape_features.py +++ b/src/luna/pathology/cli/extract_tile_shape_features.py @@ -1,13 +1,13 @@ # General imports import itertools +import json from enum import Enum from pathlib import Path -from typing import Dict, Optional, List +from typing import List, Optional import fire import fsspec import geopandas as gpd -import json import numpy as np import pandas as pd import tiffslide @@ -22,39 +22,48 @@ from luna.pathology.cli.generate_tile_mask import convert_tiles_to_mask from luna.pathology.common.utils import resize_array + class StatisticalDescriptors(str, Enum): - ALL = 'All' - QUANTILES = 'Quantiles' - STATS = 'Stats' - DENSITY = 'Density' + ALL = "All" + QUANTILES = "Quantiles" + STATS = "Stats" + DENSITY = "Density" + STATISTICAL_DESCRIPTOR_PERCENTILES = np.arange(0.1, 1, 0.1) STATISTICAL_DESCRIPTOR_MAP 
= { StatisticalDescriptors.STATS: ["min", "mean", "median", "max", "sum"], StatisticalDescriptors.DENSITY: ["var", "skew", ("kurt", kurtosis)], - StatisticalDescriptors.QUANTILES: [(f"{p:.0%}", lambda x: x.quantile(p)) for p in STATISTICAL_DESCRIPTOR_PERCENTILES], + StatisticalDescriptors.QUANTILES: [ + (f"{p:.0%}", lambda x: x.quantile(p)) + for p in STATISTICAL_DESCRIPTOR_PERCENTILES + ], } STATISTICAL_DESCRIPTOR_MAP[StatisticalDescriptors.ALL] = list( - itertools.chain(*STATISTICAL_DESCRIPTOR_MAP.values())) + itertools.chain(*STATISTICAL_DESCRIPTOR_MAP.values()) +) + class CellularFeatures(str, Enum): - ALL = 'All' - NUCLEUS = 'Nucleus' - CELL = 'Cell' - CYTOPLASM = 'Cytoplasm' - MEMBRANE = 'Membrane' + ALL = "All" + NUCLEUS = "Nucleus" + CELL = "Cell" + CYTOPLASM = "Cytoplasm" + MEMBRANE = "Membrane" class PropertyType(str, Enum): - ALL = 'All' - GEOMETRIC = 'Geometric' - STAIN = 'Stain' + ALL = "All" + GEOMETRIC = "Geometric" + STAIN = "Stain" + PROPERTY_TYPE_MAP = { - PropertyType.GEOMETRIC: ['Cell', 'Nucleus'], - PropertyType.STAIN: ['Hematoxylin', 'Eosin', 'DAB'], + PropertyType.GEOMETRIC: ["Cell", "Nucleus"], + PropertyType.STAIN: ["Hematoxylin", "Eosin", "DAB"], } + @timed @save_metadata def cli( @@ -64,7 +73,7 @@ def cli( output_urlpath: str = ".", resize_factor: int = 16, detection_probability_threshold: Optional[float] = None, - statistical_descriptors: str = StatisticalDescriptors.ALL, + statistical_descriptors: str = StatisticalDescriptors.ALL, cellular_features: str = CellularFeatures.ALL, property_type: str = PropertyType.ALL, storage_options: dict = {}, @@ -117,9 +126,9 @@ def cli( slide_id = Path(config["slide_urlpath"]).stem - statistical_descriptors = config['statistical_descriptors'].capitalize() - cellular_features = config['cellular_features'].capitalize() - property_type = config['property_type'].capitalize() + statistical_descriptors = config["statistical_descriptors"].capitalize() + cellular_features = 
config["cellular_features"].capitalize() + property_type = config["property_type"].capitalize() df = extract_tile_shape_features( object_gdf, @@ -154,7 +163,7 @@ def extract_tile_shape_features( resize_factor: int = 16, detection_probability_threshold: Optional[float] = None, slide_id: str = "", - statistical_descriptors: StatisticalDescriptors = StatisticalDescriptors.ALL, + statistical_descriptors: StatisticalDescriptors = StatisticalDescriptors.ALL, cellular_features: CellularFeatures = CellularFeatures.ALL, property_type: PropertyType = PropertyType.ALL, include_smaller_regions: bool = False, @@ -205,21 +214,24 @@ def extract_tile_shape_features( ent["Parent"] = "whole_region" ent["Class"] = i ent["variable"] = f"Joint Entropy to {j}" - ent["value"] = entropy(counts[[i,j]], base=2) + ent["value"] = entropy(counts[[i, j]], base=2) joint_entropy.append(ent) entropy_df = pd.DataFrame(joint_entropy) shannon_entropy = entropy(counts, base=2) - entropy_df = entropy_df.append({ - "Parent": "whole_region", - "Class": "All", - "variable": "Entropy", - "value": shannon_entropy - }, ignore_index=True) + entropy_df = entropy_df.append( + { + "Parent": "whole_region", + "Class": "All", + "variable": "Entropy", + "value": shannon_entropy, + }, + ignore_index=True, + ) slide_area = counts * tile_area - slide_area.index.name = 'Parent' + slide_area.index.name = "Parent" mask, mask_values = convert_tiles_to_mask( tiles_df, slide_width, slide_height, "Classification" @@ -248,8 +260,10 @@ def extract_tile_shape_features( try: measurement_keys = list(gdf.measurements.iloc[0].keys()) gdf = gdf.join(gdf.measurements.apply(lambda x: pd.Series(x))) - except: - measurements = gdf.measurements.apply(lambda x: 
pd.DataFrame(json.loads(x)).set_index('name').squeeze()) + except Exception: + measurements = gdf.measurements.apply( + lambda x: pd.DataFrame(json.loads(x)).set_index("name").squeeze() + ) measurement_keys = list(measurements.columns.values) gdf = gdf.join(measurements) gdf = gdf.join(gdf.classification.apply(lambda x: pd.Series(x))) @@ -271,10 +288,8 @@ def extract_tile_shape_features( agg_df["Object Counts"] = gb.size() agg_df["Normalized Cell Density"] = agg_df["Object Counts"] / slide_area - if 'Cell: Area µm^2 sum' in agg_df.columns: - agg_df["Cell Density"] = agg_df["Cell: Area µm^2 sum"] / ( - slide_area / 4 - ) + if "Cell: Area µm^2 sum" in agg_df.columns: + agg_df["Cell Density"] = agg_df["Cell: Area µm^2 sum"] / (slide_area / 4) logger.info( "Calculating obj count log ratios between all tile label obj classification groups" @@ -301,8 +316,7 @@ def extract_tile_shape_features( if property_type != PropertyType.ALL: property_types = PROPERTY_TYPE_MAP[property_type] - agg_df = agg_df.filter(regex='|'.join(property_types)) - + agg_df = agg_df.filter(regex="|".join(property_types)) mdf = pd.melt(agg_df.reset_index(), id_vars=["Parent", "Class"]).dropna() mdf = pd.concat([mdf, ratio_df.reset_index(), shape_features_df, entropy_df]) @@ -310,7 +324,9 @@ def extract_tile_shape_features( if slide_id: mdf.insert(loc=0, column="slide_id", value=slide_id) - mdf[['Parent', 'Class', 'variable']] = mdf[['Parent', 'Class', 'variable']].replace(r"_", " ", regex=True) + mdf[["Parent", "Class", "variable"]] = mdf[["Parent", "Class", "variable"]].replace( + r"_", " ", regex=True + ) return mdf diff --git a/src/luna/pathology/cli/generate_mask.py b/src/luna/pathology/cli/generate_mask.py index 25fb7461..ccd0e8c9 100644 --- a/src/luna/pathology/cli/generate_mask.py +++ b/src/luna/pathology/cli/generate_mask.py @@ -13,7 +13,7 @@ from PIL import Image from skimage.measure import block_reduce -from luna.common.utils import get_config, save_metadata, timed +from luna.common.utils 
import get_config, local_cache_urlpath, save_metadata, timed from luna.pathology.common.utils import convert_xml_to_mask, get_layer_names @@ -71,6 +71,11 @@ def cli( return properties +@local_cache_urlpath( + dir_key_write_mode={ + "output_urlpath": "w", + } +) def generate_mask( slide_urlpath: str, roi_urlpath: str, @@ -99,11 +104,8 @@ def generate_mask( slide = tiffslide.TiffSlide(of) thumbnail = slide.get_thumbnail((1000, 1000)) - fs, output_urlpath_prefix = fsspec.core.url_to_fs( - output_urlpath, **output_storage_options - ) - with fs.open(Path(output_urlpath_prefix) / "slide_thumbnail.png", "wb") as of: - thumbnail.save(of) + with open(Path(output_urlpath) / "slide_thumbnail.png", "wb") as of: + thumbnail.save(of, format="PNG") wsi_shape = ( slide.dimensions[1], @@ -130,11 +132,11 @@ def generate_mask( ) ).get_thumbnail((1000, 1000)) - with fs.open(Path(output_urlpath_prefix) / "mask_thumbnail.png", "wb") as of: - mask_thumbnail.save(of) + with open(Path(output_urlpath) / "mask_thumbnail.png", "wb") as of: + mask_thumbnail.save(of, format="PNG") - slide_mask_file = Path(output_urlpath_prefix) / "mask_full_res.tif" - with fs.open(slide_mask_file, "wb") as of: + slide_mask_file = Path(output_urlpath) / "mask_full_res.tif" + with open(slide_mask_file, "wb") as of: tifffile.imwrite(of, mask_arr) return pd.DataFrame(mask_properties) diff --git a/src/luna/pathology/cli/generate_tile_mask.py b/src/luna/pathology/cli/generate_tile_mask.py index 185c657e..6a80a444 100644 --- a/src/luna/pathology/cli/generate_tile_mask.py +++ b/src/luna/pathology/cli/generate_tile_mask.py @@ -13,7 +13,7 @@ from multimethod import multimethod from luna.common.models import TileSchema -from luna.common.utils import get_config, save_metadata, timed +from luna.common.utils import get_config, local_cache_urlpath, save_metadata, timed @timed @@ -56,19 +56,15 @@ def cli( slide_width, slide_height, config["label_cols"], + config["output_urlpath"], + config["output_storage_options"], ) - fs, 
output_urlpath_prefix = fsspec.core.url_to_fs( - config["output_urlpath"], **config["output_storage_options"] - ) - - slide_mask = Path(output_urlpath_prefix) / "tile_mask.tif" - logger.info(f"Saving output mask to {slide_mask}") - with fs.open(slide_mask, "wb") as of: - tifffile.imwrite(of, mask_arr) + fs, output_path = fsspec.core.url_to_fs(config["output_urlpath"]) + slide_mask = Path(output_path) / "tile_mask.tif" properties = { - "slide_mask": slide_mask, + "slide_mask": fs.unstrip_protocol(str(slide_mask)), "mask_values": mask_values, "mask_size": mask_arr.shape, } @@ -81,6 +77,8 @@ def convert_tiles_to_mask( tiles_df: pd.DataFrame, slide: tiffslide.TiffSlide, label_cols: Union[str, List[str]], + output_urlpath: str, + output_storage_options: dict, ): """Converts categorical tile labels to a slide image mask. This mask can be used for feature extraction and spatial analysis. @@ -96,15 +94,27 @@ def convert_tiles_to_mask( slide_width = slide.dimensions[0] slide_height = slide.dimensions[1] - return convert_tiles_to_mask(tiles_df, slide_width, slide_height, label_cols) + return convert_tiles_to_mask( + tiles_df, + slide_width, + slide_height, + label_cols, + output_urlpath, + output_storage_options, + ) @multimethod +@local_cache_urlpath( + dir_key_write_mode={"output_urlpath": "w"}, +) def convert_tiles_to_mask( tiles_df: pd.DataFrame, slide_width: int, slide_height: int, label_cols: Union[str, List[str]], + output_urlpath: str, + output_storage_options: dict, ): """Converts categorical tile labels to a slide image mask. This mask can be used for feature extraction and spatial analysis. 
@@ -144,6 +154,11 @@ def convert_tiles_to_mask( logger.info(f"{address}, {row['mask']}, {value}") + slide_mask = Path(output_urlpath) / "tile_mask.tif" + logger.info(f"Saving output mask to {slide_mask}") + with open(slide_mask, "wb") as of: + tifffile.imwrite(of, mask_arr) + return mask_arr, mask_values diff --git a/src/luna/pathology/cli/generate_tiles.py b/src/luna/pathology/cli/generate_tiles.py index a491e94d..22a16d6b 100644 --- a/src/luna/pathology/cli/generate_tiles.py +++ b/src/luna/pathology/cli/generate_tiles.py @@ -7,14 +7,13 @@ import fire import fsspec import pandas as pd -from dask.distributed import Client, progress -from fsspec import open # type: ignore +from dask.distributed import progress from loguru import logger from multimethod import multimethod from pandera.typing import DataFrame from tiffslide import TiffSlide -from luna.common.dask import get_or_create_dask_client, configure_dask_client +from luna.common.dask import configure_dask_client, get_or_create_dask_client from luna.common.models import SlideSchema, Tile, TileSchema from luna.common.utils import get_config, save_metadata, timed from luna.pathology.common.utils import ( @@ -54,7 +53,7 @@ def cli( """ config = get_config(vars()) - configure_dask_client(**config['dask_options']) + configure_dask_client(**config["dask_options"]) output_filesystem, output_urlpath_prefix = fsspec.core.url_to_fs( config["output_urlpath"], **config["output_storage_options"] @@ -72,8 +71,6 @@ def cli( print(f"saving to {output_header_file}") df.to_parquet(of) - df.to_parquet(output_header_file) - properties = { "slide_tiles": output_header_file, # "Tiles" are the metadata that describe them "total_tiles": len(df), diff --git a/src/luna/pathology/cli/run_tissue_detection.py b/src/luna/pathology/cli/run_tissue_detection.py index b042dfd8..e2accb9f 100644 --- a/src/luna/pathology/cli/run_tissue_detection.py +++ b/src/luna/pathology/cli/run_tissue_detection.py @@ -8,7 +8,7 @@ import fsspec # type: ignore 
import numpy as np import pandas as pd -from dask.distributed import Client, progress +from dask.distributed import progress from fsspec import open # type: ignore from loguru import logger from multimethod import multimethod @@ -18,9 +18,15 @@ from skimage.filters import threshold_otsu # type: ignore from tiffslide import TiffSlide -from luna.common.dask import get_or_create_dask_client, configure_dask_client +from luna.common.dask import configure_dask_client, get_or_create_dask_client from luna.common.models import SlideSchema, Tile -from luna.common.utils import get_config, grouper, local_cache_urlpath, save_metadata, timed +from luna.common.utils import ( + get_config, + grouper, + local_cache_urlpath, + save_metadata, + timed, +) from luna.pathology.cli.generate_tiles import generate_tiles from luna.pathology.common.utils import ( get_array_from_tile, @@ -120,7 +126,7 @@ def cli( """ config = get_config(vars()) - configure_dask_client(**config['dask_options']) + configure_dask_client(**config["dask_options"]) if not config["tile_size"] and not config["tiles_urlpath"]: raise fire.core.FireError("Specify either tiles_urlpath or tile_size") @@ -130,9 +136,7 @@ def cli( ) slide_id = Path(urlparse(config["slide_urlpath"]).path).stem - output_header_file = ( - Path(output_urlpath_prefix) / f"{slide_id}.tiles.parquet" - ) + output_header_file = Path(output_urlpath_prefix) / f"{slide_id}.tiles.parquet" df = detect_tissue( config["slide_urlpath"], @@ -146,8 +150,8 @@ def cli( config["output_urlpath"], config["output_storage_options"], ) - with open(output_header_file, "wb", **config["output_storage_options"]) as of: - print(f"saving to {output_header_file}") + with output_filesystem.open(output_header_file, "wb") as of: + logger.info(f"saving to {output_header_file}") df.to_parquet(of) properties = { "tiles_manifest": output_header_file, @@ -281,7 +285,9 @@ def detect_tissue( with open(tiles_urlpath, **storage_options) as of: tiles_df = pd.read_parquet(of) elif 
type(tile_size) == int: - tiles_df = generate_tiles(slide_urlpath, tile_size, storage_options, tile_magnification) + tiles_df = generate_tiles( + slide_urlpath, tile_size, storage_options, tile_magnification + ) else: raise RuntimeError("Specify tile_size or tile_urlpath") @@ -399,8 +405,12 @@ def detect_tissue( deconv_sample_arr = pull_stain_channel( sample_arr_filtered, vectors=stain_vectors ) - with open(output_urlpath_prefix + "/deconv_sample_arr.png", "wb") as f: - Image.fromarray(deconv_sample_arr).save(f, "png", **output_storage_options) + with open( + output_urlpath_prefix + "/deconv_sample_arr.png", + "wb", + **output_storage_options, + ) as f: + Image.fromarray(deconv_sample_arr).save(f, "png") logger.info("Saving stain masks") stain0_mask = np.where( diff --git a/src/luna/pathology/cli/visualize_tile_labels_png.py b/src/luna/pathology/cli/visualize_tile_labels_png.py index baf5b8b4..05964e1f 100644 --- a/src/luna/pathology/cli/visualize_tile_labels_png.py +++ b/src/luna/pathology/cli/visualize_tile_labels_png.py @@ -71,11 +71,14 @@ def cli( images = {} for score_type, thumbnail_overlayed in thumbnails_overlayed.items(): - output_file = f"{output_path_prefix}/tile_scores_and_labels_visualization_{score_type}.png" + output_file = ( + Path(output_path_prefix) + / f"tile_scores_and_labels_visualization_{score_type}.png" + ) thumbnail_overlayed = Image.fromarray(thumbnail_overlayed) with fs.open(output_file, "wb") as of: - thumbnail_overlayed.save(of) - images[score_type] = output_file + thumbnail_overlayed.save(of, format="PNG") + images[score_type] = str(output_file) logger.info(f"Saved {score_type} visualization at {output_file}") properties = { @@ -142,7 +145,9 @@ def visualize_tiles( if unit_sf: to_mag_scale_factor *= unit_sf else: - logger.warning("No MPP scale factor was recognized in slide properties.") + logger.warning( + "No MPP scale factor was recognized in slide properties." 
+ ) # only visualize tile scores that were able to be computed all_score_types = set(plot_labels) diff --git a/tests/luna/pathology/cli/conftest.py b/tests/luna/pathology/cli/conftest.py index b78b881e..feaf852d 100644 --- a/tests/luna/pathology/cli/conftest.py +++ b/tests/luna/pathology/cli/conftest.py @@ -1,4 +1,7 @@ +import os + import pytest +import s3fs from dask.distributed import Client, LocalCluster @@ -9,3 +12,13 @@ def dask_client(): yield client client.close() cluster.close() + + +@pytest.fixture(scope="module") +def s3fs_client(): + localstack_endpoint = os.getenv( + "LOCALSTACK_ENDPOINT_URL", default="http://localhost:4566" + ) + return s3fs.core.S3FileSystem( + key="", secret="", client_kwargs={"endpoint_url": localstack_endpoint} + ) diff --git a/tests/luna/pathology/cli/test_dsa_upload.py b/tests/luna/pathology/cli/test_dsa_upload.py index e1592ff0..545906bf 100644 --- a/tests/luna/pathology/cli/test_dsa_upload.py +++ b/tests/luna/pathology/cli/test_dsa_upload.py @@ -4,7 +4,7 @@ from luna.pathology.cli.dsa_upload import cli -def test_upload(monkeypatch): +def test_upload(monkeypatch, s3fs_client): def mock_get(*args, **kwargs): if args[1] == "/system/check": return {} @@ -64,3 +64,30 @@ def mock_auth(*args, **kwargs): "pw", ], ) + + s3fs_client.mkdirs("testupload", exist_ok=True) + s3fs_client.put( + "tests/luna/pathology/cli/testouts/Tile-Based_Pixel_Classifier_Inference_123.json", + "testupload/json/", + ) + fire.Fire( + cli, + [ + "--dsa_endpoint_url", + "http://localhost:8080/", + "--annotation_file_urlpath", + "s3://testupload/json/" + "Tile-Based_Pixel_Classifier_Inference_123.json", + "--image_filename", + "123.svs", + "--collection_name", + "test_collection", + "--username", + "user", + "--password", + "pw", + "--storage_options", + "{'key': '', 'secret': '', 'client_kwargs': {'endpoint_url': '" + + s3fs_client.client_kwargs["endpoint_url"] + + "'}}", + ], + ) diff --git a/tests/luna/pathology/cli/test_dsa_viz.py 
b/tests/luna/pathology/cli/test_dsa_viz.py index 36a3bf94..fc4b4aba 100644 --- a/tests/luna/pathology/cli/test_dsa_viz.py +++ b/tests/luna/pathology/cli/test_dsa_viz.py @@ -21,6 +21,29 @@ def verify_cleanup(output_file): os.remove(str(Path(output_file).parent) + "/metadata.yml") +def test_stardist_polygon_s3(s3fs_client): + s3fs_client.mkdirs("dsatest", exist_ok=True) + s3fs_client.put( + "tests/testdata/pathology/test_object_classification.geojson", + "s3://dsatest/test/", + ) + fire.Fire( + stardist_polygon, + [ + "--local_config", + "tests/testdata/pathology/stardist_polygon_s3.yml", + "--storage_options", + "{'key': '', 'secret': '', 'client_kwargs': {'endpoint_url': '" + + s3fs_client.client_kwargs["endpoint_url"] + + "'}}", + ], + ) + + assert s3fs_client.exists( + "s3://dsatest/out/StarDist_Segmentations_with_Lymphocyte_Classifications_123.json" + ) + + def test_stardist_polygon(): fire.Fire( stardist_polygon, diff --git a/tests/luna/pathology/cli/test_extract_kfunction_statistics.py b/tests/luna/pathology/cli/test_extract_kfunction_statistics.py index 064df16a..dd3b52b9 100644 --- a/tests/luna/pathology/cli/test_extract_kfunction_statistics.py +++ b/tests/luna/pathology/cli/test_extract_kfunction_statistics.py @@ -41,3 +41,35 @@ def test_cli(tmp_path, dask_client): df = pd.read_parquet(f"{tmp_path}/test_tile_stats_kfunction_supertiles.parquet") assert "ikfunction_r160.0_stainCentroid_X_µm" in df.columns assert df["ikfunction_r160.0_stainCentroid_X_µm_norm"].values[0] == 1.0 + + +def test_cli_s3(s3fs_client, dask_client): + s3fs_client.mkdirs("teststat", exist_ok=True) + s3fs_client.put( + "tests/testdata/pathology/test_tile_stats.parquet", "teststat/test/" + ) + fire.Fire( + cli, + [ + "--input_cell_objects_urlpath", + "s3://teststat/test/test_tile_stats.parquet", + "--output_urlpath", + "s3://teststat/out/", + "--intensity_label", + "Centroid X µm", + "--radius", + str(160.0), + "--tile_stride", + str(300), + "--tile_size", + str(300), + 
"--storage_options", + "{'key': '', 'secret': '', 'client_kwargs': {'endpoint_url': '" + + s3fs_client.client_kwargs["endpoint_url"] + + "'}}", + ], + ) + + assert s3fs_client.exists( + "teststat/out/test_tile_stats_kfunction_supertiles.parquet" + ) diff --git a/tests/luna/pathology/cli/test_extract_shape_features.py b/tests/luna/pathology/cli/test_extract_shape_features.py index bc0177cb..2acb7c99 100644 --- a/tests/luna/pathology/cli/test_extract_shape_features.py +++ b/tests/luna/pathology/cli/test_extract_shape_features.py @@ -6,6 +6,30 @@ from luna.pathology.cli.extract_shape_features import cli +def test_cli_generate_mask_s3(s3fs_client): + s3fs_client.mkdirs("testmask", exist_ok=True) + s3fs_client.put( + "tests/testdata/pathology/generate_tile_mask/tile_mask.tif", "testmask/test/" + ) + fire.Fire( + cli, + [ + "--slide_mask_urlpath", + "s3://testmask/test/tile_mask.tif", + "--output_urlpath", + "s3://testmask/out", + "--label_cols", + "Background,Tumor", + "--storage_options", + "{'key': '', 'secret': '', 'client_kwargs': {'endpoint_url': '" + + s3fs_client.client_kwargs["endpoint_url"] + + "'}}", + ], + ) + + assert s3fs_client.exists("testmask/out/shape_features.csv") + + def test_cli_generate_mask(tmp_path): fire.Fire( cli, diff --git a/tests/luna/pathology/cli/test_extract_stain_texture.py b/tests/luna/pathology/cli/test_extract_stain_texture.py index 15ae4854..e2770337 100644 --- a/tests/luna/pathology/cli/test_extract_stain_texture.py +++ b/tests/luna/pathology/cli/test_extract_stain_texture.py @@ -31,3 +31,35 @@ def test_cli(tmp_path): assert bool(df.notnull().values.any()) is True assert df.shape == (1, 216) + + +def test_cli_s3(s3fs_client): + s3fs_client.mkdirs("teststain", exist_ok=True) + s3fs_client.put("tests/testdata/pathology/123.svs", "teststain/test/") + s3fs_client.put( + "tests/testdata/pathology/generate_mask/mask_full_res.tif", "teststain/test/" + ) + + fire.Fire( + cli, + [ + "--slide_image_urlpath", + "s3://teststain/test/123.svs", + 
"--slide_mask_urlpath", + "s3://teststain/test/mask_full_res.tif", + "--output_urlpath", + "s3://teststain/out", + "--tile_size", + str(512), + "--stain_channel", + str(0), + "--stain_sample_factor", + str(10), + "--storage_options", + "{'key': '', 'secret': '', 'client_kwargs': {'endpoint_url': '" + + s3fs_client.client_kwargs["endpoint_url"] + + "'}}", + ], + ) + + assert s3fs_client.exists("teststain/out/stainomics.parquet") diff --git a/tests/luna/pathology/cli/test_extract_tile_statistics.py b/tests/luna/pathology/cli/test_extract_tile_statistics.py index e645cc43..cf86b7ab 100644 --- a/tests/luna/pathology/cli/test_extract_tile_statistics.py +++ b/tests/luna/pathology/cli/test_extract_tile_statistics.py @@ -33,3 +33,25 @@ def test_cli_extract_tile_statistics(tmp_path): "Centroid X µm_pct100", ]: assert col in cols + + +def test_cli_extract_tile_statistics_s3(s3fs_client): + s3fs_client.mkdirs("tiletest", exist_ok=True) + s3fs_client.put( + "tests/testdata/pathology/test_tile_stats.parquet", "tiletest/test/" + ) + fire.Fire( + cli, + [ + "--tiles-urlpath", + "s3://tiletest/test/test_tile_stats.parquet", + "--output-urlpath", + "s3://tiletest/out/", + "--storage_options", + "{'key': '', 'secret': '', 'client_kwargs': {'endpoint_url': '" + + s3fs_client.client_kwargs["endpoint_url"] + + "'}}", + ], + ) + + assert s3fs_client.exists("s3://tiletest/out/test_tile_stats_tile_stats.parquet") diff --git a/tests/luna/pathology/cli/test_generate_mask.py b/tests/luna/pathology/cli/test_generate_mask.py index 9b03bf31..616c85c5 100644 --- a/tests/luna/pathology/cli/test_generate_mask.py +++ b/tests/luna/pathology/cli/test_generate_mask.py @@ -25,3 +25,32 @@ def test_cli(tmp_path): assert os.path.exists(f"{tmp_path}/metadata.yml") openslide.ImageSlide(f"{tmp_path}/mask_full_res.tif") + + +def test_cli_s3(s3fs_client): + s3fs_client.mkdirs("masktest", exist_ok=True) + s3fs_client.put("tests/testdata/pathology/123.svs", "masktest/test/") + s3fs_client.put( + 
"tests/testdata/pathology/test-project/pathology_annotations/123456_annotation_from_halo.xml", + "masktest/test/", + ) + fire.Fire( + cli, + [ + "--slide-urlpath", + "s3://masktest/test/123.svs", + "--roi-urlpath", + "s3://masktest/test/123456_annotation_from_halo.xml", + "--output-urlpath", + "s3://masktest/out/", + "--annotation-name", + "Tumor", + "--storage_options", + "{'key': '', 'secret': '', 'client_kwargs': {'endpoint_url': '" + + s3fs_client.client_kwargs["endpoint_url"] + + "'}}", + ], + ) + + assert s3fs_client.exists("s3://masktest/out/mask_full_res.tif") + assert s3fs_client.exists("s3://masktest/out/metadata.yml") diff --git a/tests/luna/pathology/cli/test_generate_tile_labels.py b/tests/luna/pathology/cli/test_generate_tile_labels.py index 04e8fe3a..d9586370 100644 --- a/tests/luna/pathology/cli/test_generate_tile_labels.py +++ b/tests/luna/pathology/cli/test_generate_tile_labels.py @@ -27,3 +27,33 @@ def test_cli(tmp_path): assert out_tile.loc["x1_y1_z10.0", "regional_label"] == "Other" assert out_tile.loc["x3_y4_z10.0", "regional_label"] == "Tumor" + + +def test_cli_s3(s3fs_client): + s3fs_client.mkdirs("tilelabel", exist_ok=True) + s3fs_client.put( + "tests/testdata/pathology/dsa_annots/slide_annotation_dataset_lung-project_Tissue-clf.parquet", + "tilelabel/test/", + ) + s3fs_client.put( + "tests/testdata/pathology/save_tiles/123/123.tiles.parquet", "tilelabel/test/" + ) + fire.Fire( + cli, + [ + "--annotation-urlpath", + "s3://tilelabel/test/slide_annotation_dataset_lung-project_Tissue-clf.parquet", + "--tiles-urlpath", + "s3://tilelabel/test/123.tiles.parquet", + "--output-urlpath", + "s3://tilelabel/out/", + "--slide-id", + "123", + "--storage_options", + "{'key': '', 'secret': '', 'client_kwargs': {'endpoint_url': '" + + s3fs_client.client_kwargs["endpoint_url"] + + "'}}", + ], + ) + + assert s3fs_client.exists("tilelabel/out/123.regional_label.tiles.parquet") diff --git a/tests/luna/pathology/cli/test_generate_tile_mask.py 
b/tests/luna/pathology/cli/test_generate_tile_mask.py index a53efa92..ab892252 100644 --- a/tests/luna/pathology/cli/test_generate_tile_mask.py +++ b/tests/luna/pathology/cli/test_generate_tile_mask.py @@ -28,3 +28,32 @@ def test_cli_generate_mask(tmp_path): mask = tifffile.imread(f"{tmp_path}/tile_mask.tif") assert np.array_equal(np.unique(mask), [0, 1]) + + +def test_cli_generate_mask_s3(s3fs_client): + s3fs_client.mkdirs("tilemask", exist_ok=True) + s3fs_client.put("tests/testdata/pathology/123.svs", "tilemask/test/") + s3fs_client.put( + "tests/testdata/pathology/infer_tumor_background/123/tile_scores_and_labels_pytorch_inference.parquet", + "tilemask/test/", + ) + fire.Fire( + cli, + [ + "--slide_urlpath", + "s3://tilemask/test/123.svs", + "--tiles_urlpath", + "s3://tilemask/test/tile_scores_and_labels_pytorch_inference.parquet", + "--output-urlpath", + "s3://tilemask/out/", + "--label_cols", + "Background,Tumor", + "--storage_options", + "{'key': '', 'secret': '', 'client_kwargs': {'endpoint_url': '" + + s3fs_client.client_kwargs["endpoint_url"] + + "'}}", + ], + ) + + assert s3fs_client.exists("s3://tilemask/out/tile_mask.tif") + assert s3fs_client.exists("s3://tilemask/out/metadata.yml") diff --git a/tests/luna/pathology/cli/test_generate_tiles.py b/tests/luna/pathology/cli/test_generate_tiles.py index e2560697..0b5801f7 100644 --- a/tests/luna/pathology/cli/test_generate_tiles.py +++ b/tests/luna/pathology/cli/test_generate_tiles.py @@ -8,6 +8,28 @@ from luna.pathology.cli.generate_tiles import cli +def test_cli_s3(s3fs_client, dask_client): + s3fs_client.mkdirs("test2", exist_ok=True) + fire.Fire( + cli, + [ + "--slide-urlpath", + "tests/testdata/pathology/123.svs", + "--output-urlpath", + "s3://test2/test", + "--tile-size", + "256", + "--requested-magnification", + "10", + "--output_storage_options", + "{'key': '', 'secret': '', 'client_kwargs': {'endpoint_url': '" + + s3fs_client.client_kwargs["endpoint_url"] + + "'}}", + ], + ) + assert 
s3fs_client.exists("test2/test/123.tiles.parquet") + + def test_cli(tmp_path, dask_client): fire.Fire( cli, diff --git a/tests/luna/pathology/cli/test_infer_tile_labels.py b/tests/luna/pathology/cli/test_infer_tile_labels.py index d3d1e013..7e1be23e 100644 --- a/tests/luna/pathology/cli/test_infer_tile_labels.py +++ b/tests/luna/pathology/cli/test_infer_tile_labels.py @@ -30,6 +30,33 @@ def test_cli(tmp_path): ) +def test_cli_s3(s3fs_client): + s3fs_client.mkdirs("infertile", exist_ok=True) + s3fs_client.put( + "tests/testdata/pathology/save_tiles/123/123.tiles.parquet", "infertile/test/" + ) + fire.Fire( + cli, + [ + "--tiles-urlpath", + "s3://infertile/test/123.tiles.parquet", + "--insecure", + "--output-urlpath", + "s3://infertile/out", + "--torch-model-repo-or-dir", + "tests/testdata/pathology/testhub", + "--model-name", + "test_custom_model", + "--storage_options", + "{'key': '', 'secret': '', 'client_kwargs': {'endpoint_url': '" + + s3fs_client.client_kwargs["endpoint_url"] + + "'}}", + ], + ) + + assert s3fs_client.exists("s3://infertile/out/123.tiles.parquet") + + def test_cli_kwargs(tmp_path): fire.Fire( cli, diff --git a/tests/luna/pathology/cli/test_run_tissue_detection.py b/tests/luna/pathology/cli/test_run_tissue_detection.py index acbf101e..eef3069d 100644 --- a/tests/luna/pathology/cli/test_run_tissue_detection.py +++ b/tests/luna/pathology/cli/test_run_tissue_detection.py @@ -5,6 +5,32 @@ from luna.pathology.cli.run_tissue_detection import cli +def test_otsu_s3(dask_client, s3fs_client): + s3fs_client.mkdirs("tissuetest", exist_ok=True) + s3fs_client.put("tests/testdata/pathology/123.svs", "tissuetest/test/") + fire.Fire( + cli, + [ + "--slide-urlpath", + "s3://tissuetest/test/123.svs", + "--tile-size", + str(256), + "--batch-size", + str(8), + "--output-urlpath", + "s3://tissuetest/test", + "--filter-query", + "otsu_score > 0.5", + "--storage_options", + "{'key': '', 'secret': '', 'client_kwargs': {'endpoint_url': '" + + 
s3fs_client.client_kwargs["endpoint_url"] + + "'}}", + ], + ) + assert s3fs_client.exists("tissuetest/test/123.tiles.parquet") + assert s3fs_client.exists("tissuetest/test/metadata.yml") + + def test_otsu(tmp_path, dask_client): fire.Fire( cli, diff --git a/tests/luna/pathology/cli/test_save_tiles.py b/tests/luna/pathology/cli/test_save_tiles.py index ca4785b0..6c0886c5 100644 --- a/tests/luna/pathology/cli/test_save_tiles.py +++ b/tests/luna/pathology/cli/test_save_tiles.py @@ -7,6 +7,33 @@ from luna.pathology.cli.save_tiles import cli +def test_save_cli_s3(tmp_path, dask_client, s3fs_client): + s3fs_client.mkdirs("mybucket", exist_ok=True) + s3fs_client.put("tests/testdata/pathology/123.svs", "mybucket/test/") + s3fs_client.put( + "tests/testdata/pathology/generate_tiles/123/123.tiles.parquet", + "mybucket/test/", + ) + fire.Fire( + cli, + [ + "--slide_urlpath", + "s3://mybucket/test/123.svs", + "--tiles_urlpath", + "s3://mybucket/test/123.tiles.parquet", + "--output_urlpath", + "s3://mybucket/test", + "--storage_options", + "{'key': '', 'secret': '', 'client_kwargs': {'endpoint_url': '" + + s3fs_client.client_kwargs["endpoint_url"] + + "'}}", + "--batch_size", + "16", + ], + ) + assert s3fs_client.exists("mybucket/test/123.tiles.parquet") + + def test_save_cli(tmp_path, dask_client): fire.Fire( cli, diff --git a/tests/luna/pathology/cli/test_slide_etl.py b/tests/luna/pathology/cli/test_slide_etl.py index d59366e2..ffe6dc33 100644 --- a/tests/luna/pathology/cli/test_slide_etl.py +++ b/tests/luna/pathology/cli/test_slide_etl.py @@ -14,6 +14,35 @@ def env(monkeypatch): monkeypatch.setenv("HOSTNAME", "localhost") +def test_slide_etl_s3(s3fs_client, dask_client): + s3fs_client.mkdirs("etltest", exist_ok=True) + s3fs_client.put( + "tests/testdata/pathology/test-project/wsi/123.svs", "etltest/test/" + ) + s3fs_client.put( + "tests/testdata/pathology/test-project/wsi/fake_slide.svs", "etltest/test/" + ) + fire.Fire( + cli, + [ + "--slide_urlpath", + 
"s3://etltest/test/", + "--output-urlpath", + "s3://etltest/out", + "--project_name", + "TEST-00-000", + "--comment", + "Test ingestion", + "--storage_options", + "{'key': '', 'secret': '', 'client_kwargs': {'endpoint_url': '" + + s3fs_client.client_kwargs["endpoint_url"] + + "'}}", + ], + ) + + assert s3fs_client.exists("etltest/out/slide_ingest_TEST-00-000.parquet") + + def test_slide_etl(tmp_path, dask_client): fire.Fire( cli, diff --git a/tests/luna/pathology/cli/test_visualize_tile_labels_png.py b/tests/luna/pathology/cli/test_visualize_tile_labels_png.py index bc7375c0..ae2e3f15 100644 --- a/tests/luna/pathology/cli/test_visualize_tile_labels_png.py +++ b/tests/luna/pathology/cli/test_visualize_tile_labels_png.py @@ -29,3 +29,43 @@ def test_viz(tmp_path): ) assert os.path.exists(f"{tmp_path}/tile_scores_and_labels_visualization_random.png") + + +def test_viz_s3(s3fs_client): + s3fs_client.mkdirs("viz", exist_ok=True) + s3fs_client.put("tests/testdata/pathology/123.svs", "viz/test/") + + df = pd.read_csv("tests/testdata/pathology/generate_tiles/123/123.tiles.csv") + df["random"] = np.random.rand(len(df)) + df.to_parquet( + "s3://viz/test/input_tiles.parquet", + storage_options={ + "key": "", + "secret": "", + "endpoint_url": s3fs_client.client_kwargs["endpoint_url"], + }, + ) + + fire.Fire( + cli, + [ + "--slide-urlpath", + "s3://viz/test/123.svs", + "--tiles-urlpath", + "s3://viz/test/input_tiles.parquet", + "--output-urlpath", + "s3://viz/out/", + "--plot_labels", + "random", + "--requested-magnification", + "5", + "--storage_options", + "{'key': '', 'secret': '', 'client_kwargs': {'endpoint_url': '" + + s3fs_client.client_kwargs["endpoint_url"] + + "'}}", + ], + ) + + assert s3fs_client.exists( + "s3://viz/out/tile_scores_and_labels_visualization_random.png" + ) diff --git a/tests/testdata/pathology/stardist_polygon_s3.yml b/tests/testdata/pathology/stardist_polygon_s3.yml new file mode 100644 index 00000000..828c5c82 --- /dev/null +++ 
b/tests/testdata/pathology/stardist_polygon_s3.yml @@ -0,0 +1,10 @@ +input_urlpath: s3://dsatest/test/test_object_classification.geojson +image_filename: 123.svs +output_urlpath: s3://dsatest/out/ +annotation_name: StarDist Segmentations with Lymphocyte Classifications +line_colors: + Other: rgb(0, 255, 0) + Lymphocyte: rgb(255, 0, 0) +fill_colors: + Other: rgba(0, 255, 0, 0) + Lymphocyte: rgba(255, 0, 0, 0)