diff --git a/.github/workflows/stage.yaml b/.github/workflows/stage.yaml new file mode 100644 index 00000000..4ad172de --- /dev/null +++ b/.github/workflows/stage.yaml @@ -0,0 +1,26 @@ +name: Stage new contribution + +on: + workflow_dispatch: + inputs: + resource_id: + description: "Bioimage.io resource identifier" + required: true + type: string + package_url: + description: "Download URL of the resource package zip-file" + required: true + type: string + +concurrency: ${{inputs.resource_id}} + +jobs: + call: + uses: ./.github/workflows/stage_call.yaml + with: + resource_id: ${{inputs.resource_id}} + package_url: ${{inputs.package_url}} + S3_HOST: ${{vars.S3_HOST}} + S3_BUCKET: ${{vars.S3_BUCKET}} + S3_FOLDER: ${{vars.S3_FOLDER}} + secrets: inherit diff --git a/.github/workflows/stage_call.yaml b/.github/workflows/stage_call.yaml new file mode 100644 index 00000000..9adb295a --- /dev/null +++ b/.github/workflows/stage_call.yaml @@ -0,0 +1,97 @@ +name: stage + +on: + workflow_call: + inputs: + resource_id: + description: "Bioimage.io resource identifier" + required: true + type: string + package_url: + description: "Download URL of the resource package zip-file" + required: true + type: string + S3_HOST: + required: true + type: string + S3_BUCKET: + required: true + type: string + S3_FOLDER: + required: true + type: string + +concurrency: ${{inputs.resource_id}} + +env: + S3_HOST: ${{ inputs.S3_HOST }} + S3_BUCKET: ${{ inputs.S3_BUCKET }} + S3_FOLDER: ${{ inputs.S3_FOLDER }} + S3_ACCESS_KEY_ID: ${{secrets.S3_ACCESS_KEY_ID}} + S3_SECRET_ACCESS_KEY: ${{secrets.S3_SECRET_ACCESS_KEY}} + +jobs: + stage: + runs-on: ubuntu-latest + outputs: + version: ${{ steps.stage.outputs.version }} + dynamic_test_cases: ${{ steps.stage.outputs.dynamic_test_cases }} + has_dynamic_test_cases: ${{ steps.stage.outputs.has_dynamic_test_cases }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: "pip" # caching pip dependencies + - run: pip install -r requirements.txt + - name: stage + run: | + python scripts/stage.py "${{ inputs.resource_id }}" "${{ inputs.package_url }}" + + test: + needs: stage + if: needs.stage.outputs.has_dynamic_test_cases == 'yes' + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: ${{ fromJson(needs.stage.outputs.dynamic_test_cases) }} # [{env_name: ..., weight_format: ...}, ...]
+ max-parallel: 1 # avoid parallel updates to log.json + steps: + - uses: actions/checkout@v4 + - name: install validation dependencies + id: create_env + uses: mamba-org/setup-micromamba@v1 + with: + cache-downloads: true + environment-name: ${{ matrix.env_name }} + environment-file: conda_env_${{ matrix.weight_format }}.yaml + create-args: >- # script dependencies + typer + conda-forge::bioimageio.spec + minio + loguru + continue-on-error: true # we inspect this step's outcome in test_dynamically.py + timeout-minutes: 60 + - name: install minimal script dependencies if val env failed + if: ${{ steps.create_env.outcome != 'success' }} + run: pip install typer bioimageio.spec minio loguru + - name: dynamic validation + shell: bash -l {0} + run: python scripts/test_dynamically.py "${{inputs.resource_id}}" "${{needs.stage.outputs.version}}" "${{ matrix.weight_format }}" "${{ steps.create_env.outcome }}" + timeout-minutes: 60 + + conclude: + needs: [stage, test] + if: always() # run even if test job fails + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: "pip" # caching pip dependencies + - run: pip install -r scripts/requirements.txt + - run: | + python scripts/conclude.py "${{ inputs.resource_id }}" "${{needs.stage.outputs.version}}" + + # TODO: call emailer diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml new file mode 100644 index 00000000..9b5f2d8a --- /dev/null +++ b/.github/workflows/test.yaml @@ -0,0 +1,55 @@ +name: test stage call + +on: push + +concurrency: ${{inputs.resource_id}} + +jobs: + test-stage-wf: + uses: ./.github/workflows/stage_call.yaml + with: + resource_id: ${{vars.TEST_PACKAGE_ID}} + package_url: ${{vars.TEST_PACKAGE_URL}} + S3_HOST: ${{vars.S3_HOST}} + S3_BUCKET: ${{vars.S3_TEST_BUCKET}} # using test bucket + S3_FOLDER: ${{vars.S3_TEST_FOLDER}}/ci # using test folder + secrets: inherit + + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: "pip" # caching pip dependencies + - run: pip install -r .github/scripts/requirements.txt + - run: black . + - run: pyright -p pyproject.toml + - run: pytest + - name: export documentation + if: ${{ github.ref == 'refs/heads/main' }} + run: pdoc . -o ./docs
+ - uses: actions/upload-pages-artifact@v3 + if: ${{ github.ref == 'refs/heads/main' }} + with: + path: docs/ + + deploy: + needs: build + if: ${{ github.ref == 'refs/heads/main' }} + # Grant GITHUB_TOKEN the permissions required to make a Pages deployment + permissions: + pages: write # to deploy to Pages + id-token: write # to verify the deployment originates from an appropriate source + + # Deploy to the github-pages environment + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + + runs-on: ubuntu-latest + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..c61f6af0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +.env diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..c44d070e --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,14 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "python", + "type": "debugpy", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "args": [], + "justMyCode": false, + } + ] +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..32f3c48d --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,20 @@ +{ + "window.title": "collection", + "python.analysis.typeCheckingMode": "strict", + "python.analysis.extraPaths": [ + "../spec-bioimage-io", + "../core-bioimage-io", + "../core-bioimage-io-python" + ], + "notebook.codeActionsOnSave": { + "source.organizeImports": true + }, + "editor.codeActionsOnSave": { + "source.organizeImports": "explicit" + }, + "python.testing.pytestArgs": [ + "tests" + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..da9cb67a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,23 @@ +[tool.black] +target-version = ["py312"] + +[tool.pyright] +include = ["scripts"] +pythonPlatform = "All" +pythonVersion = "3.12" +reportIncompatibleMethodOverride = true +reportMissingSuperCall = "error" +reportMissingTypeArgument = true +reportMissingTypeStubs = "warning" +reportUninitializedInstanceVariable = "error" +reportUnknownMemberType = false +reportUnnecessaryIsInstance = false +reportUnnecessaryTypeIgnoreComment = "error" +reportUnusedCallResult = "error" +reportUnusedVariable = "error" +typeCheckingMode = "strict" +useLibraryCodeForTypes = true + +[tool.pytest.ini_options] +addopts = "--capture=no --doctest-modules --failed-first" +testpaths = ["scripts"] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..fae6f6fd --- /dev/null +++ b/requirements.txt @@ -0,0 +1,12 @@ +bioimageio.spec @ git+https://github.com/bioimage-io/spec-bioimage-io@0b707b83b061da7584e554cedbf0c6d725980619 # TODO: change to released version +bioimageio.core @ git+https://github.com/bioimage-io/core-bioimage-io-python@fcb6bcd674e211c61161105eaf3552cf3cb804ec # TODO: change to released version +black==24.2.0 +loguru==0.7.2 +minio==7.2.3 +packaging==23.2 +pdoc==14.4.0 +pyright==1.1.351 +pytest==8.0.0 +ruyaml==0.91.0 +spdx-license-list==3.22 +typer==0.9.0 diff --git a/scripts/stage.py b/scripts/stage.py new file mode 100644 index 00000000..1e1f8fbc --- /dev/null +++ b/scripts/stage.py @@ -0,0 +1,14 @@ +import typer +from utils.remote_resource import RemoteResource +from utils.s3_client import Client +from utils.validate_format import 
validate_format + + +def stage(resource_id: str, package_url: str): + resource = RemoteResource(client=Client(), id=resource_id) + staged = resource.stage_new_version(package_url) + validate_format(staged) + + +if __name__ == "__main__": + typer.run(stage) diff --git a/scripts/test_dynamically.py b/scripts/test_dynamically.py new file mode 100644 index 00000000..37523a95 --- /dev/null +++ b/scripts/test_dynamically.py @@ -0,0 +1,130 @@ +import traceback +from functools import partialmethod +from pathlib import Path +from typing import Optional + +import bioimageio.core +import bioimageio.spec +import typer +from bioimageio.spec.model.v0_5 import WeightsFormat +from bioimageio.spec.summary import ( + ErrorEntry, + InstalledPackage, + ValidationDetail, + ValidationSummary, +) +from ruyaml import YAML +from utils.remote_resource import StagedVersion +from utils.s3_client import Client + +try: + from tqdm import tqdm +except ImportError: + pass +else: + # silence tqdm + tqdm.__init__ = partialmethod(tqdm.__init__, disable=True) # type: ignore + +yaml = YAML(typ="safe") + + +def get_summary_detail_from_exception(name: str, exception: Exception): + return ValidationDetail( + name=name, + status="failed", + errors=[ + ErrorEntry( + loc=(), + msg=str(exception), + type="exception", + traceback=traceback.format_tb(exception.__traceback__), + ) + ], + ) + + +def test_dynamically( + resource_id: str, + version: int, + weight_format: Optional[WeightsFormat] = typer.Argument( + ..., help="weight format to test model with." + ), + create_env_outcome: str = "success", +): + staged = StagedVersion(client=Client(), id=resource_id, version=version) + staged.set_status( + "testing", + "Testing" + ("" if weight_format is None else f" {weight_format} weights"), + ) + rdf_source = staged.get_rdf_url() + if weight_format is None: + # no dynamic tests for non-model resources yet... 
+ return + + summary = ValidationSummary( + name="bioimageio.core.test_description", + status="passed", + details=[], + env=[ + InstalledPackage( + name="bioimageio.spec", version=bioimageio.spec.__version__ + ), + InstalledPackage( + name="bioimageio.core", version=bioimageio.core.__version__ + ), + ], + source_name=rdf_source, + ) + + if create_env_outcome == "success": + try: + from bioimageio.core import test_description + except Exception as e: + summary.add_detail( + get_summary_detail_from_exception( + "import test_description from test environment", e + ) + ) + else: + try: + rdf = yaml.load(rdf_source) + test_kwargs = ( + rdf.get("config", {}) + .get("bioimageio", {}) + .get("test_kwargs", {}) + .get(weight_format, {}) + ) + except Exception as e: + summary.add_detail( + get_summary_detail_from_exception("check for test kwargs", e) + ) + else: + try: + summary = test_description( + rdf_source, weight_format=weight_format, **test_kwargs + ) + except Exception as e: + summary.add_detail( + get_summary_detail_from_exception("call 'test_resource'", e) + ) + + else: + env_path = Path(f"conda_env_{weight_format}.yaml") + if env_path.exists(): + error = "Failed to install conda environment:\n" + env_path.read_text() + else: + error = f"Conda environment yaml file not found: {env_path}" + + summary.add_detail( + ValidationDetail( + name="install test environment", + status="failed", + errors=[ErrorEntry(loc=(), msg=error, type="env")], + ) + ) + + staged.add_log_entry("bioimageio.core", summary.model_dump(mode="json")) + + +if __name__ == "__main__": + typer.run(test_dynamically) diff --git a/scripts/upload_model_to_zenodo.py b/scripts/upload_model_to_zenodo.py new file mode 100644 index 00000000..fd89b449 --- /dev/null +++ b/scripts/upload_model_to_zenodo.py @@ -0,0 +1,313 @@ +import argparse +import logging +import os +import pprint +from datetime import datetime +from io import BytesIO +from pathlib import Path +from typing import Optional +from urllib.parse import quote_plus, urljoin, urlparse + +import requests # type: ignore +import spdx_license_list # type: ignore +from loguru import logger # type: ignore +from packaging.version import parse as parse_version +from ruyaml import YAML # type: ignore +from s3_client import create_client, version_from_resource_path_or_s3 + +from scripts.conclude import update_status + +yaml = YAML(typ="safe") + +spdx_licenses = [item.id for item in spdx_license_list.LICENSES.values()] + +GOOD_STATUS_CODES = ( + 200, # OK Request succeeded. Response included. Usually sent for GET/PUT/PATCH requests. + 201, # Created Request succeeded. Response included. Usually sent for POST requests + 202, # Accepted Request succeeded. Response included. Usually sent for POST requests, + # where background processing is needed to fulfill the request. + 204, # No Content Request succeeded. No response included. Usually sent for DELETE requests. 
+) +ACCESS_TOKEN = os.getenv("ZENODO_API_ACCESS_TOKEN") +S3_HOST = os.getenv("S3_HOST") +S3_ACCESS_KEY = os.getenv("S3_ACCESS_KEY_ID") +S3_SECRET_KEY = os.getenv("S3_SECRET_ACCESS_KEY") +S3_BUCKET = os.getenv("S3_BUCKET") +S3_FOLDER = os.getenv("S3_FOLDER") +ZENODO_URL = os.getenv("ZENODO_URL") + +MAX_RDF_VERSION = parse_version("0.5.0") + + +logging.basicConfig() +logging.getLogger().setLevel(logging.DEBUG) +requests_log = logging.getLogger("requests.packages.urllib3") +requests_log.setLevel(logging.DEBUG) +requests_log.propagate = True + + +def assert_good_response(response, message, info=None): + if response.status_code not in GOOD_STATUS_CODES: + pprint.pprint(response) + pprint.pprint(response.content) + if info: + pprint.pprint(info) + raise Exception(message) + + +def create_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser() + parser.add_argument("resource_path", help="Resource path") + return parser + + +def get_args(argv: Optional[list] = None): + """ + $Get command-line arguments + """ + parser = create_parser() + return parser.parse_args(argv) + + +def main(): + args = get_args() + headers = {"Content-Type": "application/json"} + params = {"access_token": ACCESS_TOKEN} + + client = create_client() + resource_path, version = version_from_resource_path_or_s3( + args.resource_path, client + ) + + s3_path = f"{resource_path}/{version}/files" + + # List the files at the model URL + file_urls = client.get_file_urls(path=s3_path) + logger.info("Using file URLs:\n{}", "\n".join((str(obj) for obj in file_urls))) + + # Create empty deposition + response = requests.post( + f"{ZENODO_URL}/api/deposit/depositions", params=params, json={}, headers=headers + ) + assert_good_response(response, "Failed to create deposition") + + # Use the bucket link + deposition_info = response.json() + bucket_url = deposition_info["links"]["bucket"] + + rdf_text = client.load_file(Path(s3_path, "rdf.yaml")) + rdf = yaml.load(rdf_text) + if not isinstance(rdf, dict): + raise Exception("Failed to load rdf.yaml from S3") + + # PUT files to the deposition + for file_url in file_urls: + response = put_file_from_url(file_url, bucket_url, params) + assert_good_response(response, f"Failed to PUT file from {file_url}") + + # Report deposition URL + deposition_id = deposition_info["id"] + deposition_doi = deposition_info["metadata"]["prereserve_doi"]["doi"] + + docstring = rdf.get("documentation", "") + if not docstring.startswith("http") and docstring.endswith(".md"): + # docstring should point to one of the files present... + + # Get the file URL + docstring = docstring.replace("./", "") + text = client.load_file(Path(s3_path, docstring)) + # Load markdown? 
+ docstring = text + + # const file = this.zipPackage.files[ + # this.rdf.documentation.replace("./", "") + # ]; + # if (file) { + # docstring = await file.async("string"); // get markdown + # docstring = DOMPurify.sanitize(marked(docstring)); + # } + + base_url = f"{ZENODO_URL}/record/{deposition_id}/files/" + + metadata = rdf_to_metadata(rdf, base_url, deposition_info, docstring) + + response = requests.put( + f"{ZENODO_URL}/api/deposit/depositions/%s" % deposition_id, + params={"access_token": ACCESS_TOKEN}, + json={"metadata": metadata}, + headers=headers, + ) + assert_good_response( + response, "Failed to put metadata", info={"metadata": metadata} + ) + + update_status( + args.resource_path, + "Would be publishing now...(but leaving as draft)", + step=None, + num_steps=None, + ) + return + + response = requests.post( + f"{ZENODO_URL}/api/deposit/depositions/%s/actions/publish" % deposition_id, + params=params, + ) + + assert_good_response(response, "Failed to publish deposition") + + update_status( + args.resource_path, + f"The deposition DOI is {deposition_doi}", + step=None, + num_steps=None, + ) + + +def put_file_from_url(file_url: str, destination_url: str, params: dict) -> dict: + """Gets a remote file and pushes it up to a destination""" + # TODO: Can we use stream=True and pass response.raw into requests.put? + filename = Path(urlparse(file_url).path).name + response = requests.get(file_url) + file_like = BytesIO(response.content) + return put_file(file_like, filename, destination_url, params) + # response = requests.get(file_url, stream=True) + # return put_file(response.raw, filename, destination_url, params) + + +def put_file_path(path: str | Path, url: str, params: dict) -> dict: + """PUT file to url with params, given a file-path""" + path = Path(path) + filename = path.name + with path.open(mode="rb") as fileobj: + response = put_file(fileobj, filename, url, params) + return response + + +def put_file(file_object, name, url, params): + response = requests.put( + "%s/%s" % (url, name), + data=file_object, + params=params, + ) + return response + + +def rdf_authors_to_metadata_creators(rdf): + if "authors" not in rdf: + return [] + authors = rdf["authors"] + + creators = [] + for author in authors: + if isinstance(author, str): + creator = {"name": author.split(";")[0], "affiliation": ""} + else: + creator = { + "name": author["name"].split(";")[0], + "affiliation": author["affiliation"], + } + if "orcid" in author: + creator["orcid"] = author["orcid"] + creators.append(creator) + return creators + + +def rdf_to_metadata( + rdf: dict, + base_url: str, + deposition_info: dict, + docstring: str, + additional_note="(Uploaded via https://bioimage.io)", +) -> dict: + + creators = rdf_authors_to_metadata_creators(rdf) + rdf["config"]["_deposit"] = deposition_info + url = quote_plus(f"{rdf['config']['_deposit']['id']}") + docstring_html = "" + if docstring: + docstring_html = f"
<p>{docstring}</p>" + description = f"""<a href="https://bioimage.io/#/?id={url}">Download RDF Package</a>{docstring_html}
""" + keywords = ["bioimage.io", "bioimage.io:" + rdf["type"]] + related_identifiers = generate_related_identifiers_from_rdf(rdf, base_url) + metadata = { + "title": rdf["name"], + "description": description, + "access_right": "open", + "license": rdf["license"], + "upload_type": "other", + "creators": creators, + "publication_date": datetime.now().date().isoformat(), + "keywords": keywords + rdf["tags"], + "notes": rdf["description"] + additional_note, + "related_identifiers": related_identifiers, + "communities": [], + } + return metadata + + +def generate_related_identifiers_from_rdf(rdf, base_url): + related_identifiers = [] + covers = [] + for cover in rdf.get("covers", ()): + if not cover.startswith("http"): + cover = urljoin(base_url, cover) + covers.append(cover) + + related_identifiers.append( + { + "relation": "hasPart", # is part of this upload + "identifier": cover, + "resource_type": "image-figure", + "scheme": "url", + } + ) + + for link in rdf.get("links", ()): + related_identifiers.append( + { + "identifier": f"https://bioimage.io/#/r/{quote_plus(link)}", + "relation": "references", # // is referenced by this upload + "resource_type": "other", + "scheme": "url", + } + ) + + # rdf.yaml or model.yaml + if rdf["rdf_source"].startswith("http"): + rdf_file = rdf["rdf_source"] + else: + rdf_file = urljoin(base_url, rdf["rdf_source"]) + # When we update an existing deposit, make sure we save the relative link + if rdf_file.startswith("http") and ("api/files" in rdf_file): + rdf_file = rdf_file.split("/") + rdf_file = rdf_file[-1] + rdf_file = urljoin(base_url, rdf_file) + + related_identifiers.append( + { + "identifier": rdf_file, + "relation": "isCompiledBy", # // compiled/created this upload + "resource_type": "other", + "scheme": "url", + } + ) + + documentation = rdf.get("documentation") + if documentation: + if not documentation.startswith("http"): + documentation = urljoin(base_url, documentation) + + related_identifiers.append( + { + "identifier": documentation, + "relation": "isDocumentedBy", # is referenced by this upload + "resource_type": "publication-technicalnote", + "scheme": "url", + } + ) + return related_identifiers + + +if __name__ == "__main__": + main() diff --git a/scripts/utils/__init__.py b/scripts/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/utils/remote_resource.py b/scripts/utils/remote_resource.py new file mode 100644 index 00000000..132c6f70 --- /dev/null +++ b/scripts/utils/remote_resource.py @@ -0,0 +1,233 @@ +from __future__ import annotations + +import io +import json +import urllib.request +import zipfile +from dataclasses import dataclass +from datetime import datetime +from typing import ( + Any, + ClassVar, + Optional, + assert_never, +) + +from loguru import logger +from ruyaml import YAML + +from .s3_client import Client +from .s3_structure import Details, Log, LogCategory, Status, StatusName + +yaml = YAML(typ="safe") + + +@dataclass +class RemoteResource: + client: Client + id: str + + def _get_latest_staged_version_id(self) -> Optional[int]: + staged = list(map(int, self.client.ls(f"{self.id}/staged/", only_folders=True))) + if not staged: + return None + else: + return max(staged) + + def get_latest_staged_version(self) -> Optional[StagedVersion]: + v = self._get_latest_staged_version_id() + if v is None: + return None + else: + return StagedVersion(client=self.client, id=self.id, version=v) + + def stage_new_version(self, package_url: str) -> StagedVersion: + v = self._get_latest_staged_version_id() 
+ if v is None: + v = 1 + else: + v += 1 + + ret = StagedVersion(client=self.client, id=self.id, version=v) + ret.set_status("staging", f"unzipping {package_url} to {ret.folder}") + + # Download the model zip file + try: + remotezip = urllib.request.urlopen(package_url) + except Exception: + logger.error("failed to open {}", package_url) + raise + + zipinmemory = io.BytesIO(remotezip.read()) + + # Unzip the zip file + zipobj = zipfile.ZipFile(zipinmemory) + + rdf = yaml.load(zipobj.open("rdf.yaml").read().decode()) + if (rdf_id := rdf.get("id")) is None: + rdf["id"] = ret.id + elif rdf_id != ret.id: + raise ValueError( + f"Expected package for {ret.id}, but got packaged {rdf_id} ({package_url})" + ) + + # overwrite version information + rdf["version"] = ret.version + + if rdf.get("id_emoji") is None: + # TODO: set `id_emoji` according to id + raise ValueError(f"RDF in {package_url} is missing `id_emoji`") + + for filename in zipobj.namelist(): + file_data = zipobj.open(filename).read() + path = f"{ret.folder}files/{filename}" + self.client.put(path, io.BytesIO(file_data), length=len(file_data)) + + return ret + + +@dataclass +class _RemoteResourceVersion(RemoteResource): + version: int + version_prefix: ClassVar[str] + + @property + def folder(self) -> str: + return f"{self.id}/{self.version_prefix}{self.version}/" + + def get_rdf_url(self) -> str: + return self.client.get_file_url(f"{self.folder}files/rdf.yaml") + + def get_log(self) -> Log: + path = f"{self.folder}log.json" + log_data = self.client.load_file(path) + if log_data is None: + log: Log = {} + else: + log = json.loads(log_data) + assert isinstance(log, dict) + + return log + + def _get_details(self) -> Details: + details_data = self.client.load_file(f"{self.folder}details.json") + if details_data is None: + details: Details = { + "messages": [], + "status": self._create_status("unknown", "no status information found"), + } + else: + details = json.load(io.BytesIO(details_data)) + + return details + + def _set_details(self, details: Details): + self.client.put_json(f"{self.folder}details.json", details) + + def get_messages(self): + details = self._get_details() + return details["messages"] + + def add_message(self, author: str, text: str): + logger.info("msg from {}: {}", author, text) + details = self._get_details() + now = datetime.now().isoformat() + details["messages"].append({"author": author, "text": text, "time": now}) + self._set_details(details) + + def set_status(self, name: StatusName, description: str) -> None: + details = self._get_details() + details["status"] = self._create_status(name, description) + self._set_details(details) + + @staticmethod + def _create_status(name: StatusName, description: str) -> Status: + num_steps = 5 + if name == "unknown": + step = 1 + num_steps = 1 + elif name == "staging": + step = 1 + elif name == "testing": + step = 2 + elif name == "awaiting review": + step = 3 + elif name == "publishing": + step = 4 + elif name == "published": + step = 5 + else: + assert_never(name) + + return Status( + name=name, description=description, step=step, num_steps=num_steps + ) + + def get_status(self) -> Status: + details = self._get_details() + return details["status"] + + def add_log_entry( + self, + category: LogCategory, + content: list[Any] | dict[Any, Any] | int | float | str | None | bool, + ): + log = self.get_log() + entries = log.setdefault(category, []) + now = datetime.now().isoformat() + entries.append({"timestamp": now, "log": content}) + self._set_log(log) + + def _set_log(self, log: Log) -> None: 
self.client.put_json(f"{self.folder}log.json", log) + + +@dataclass +class StagedVersion(_RemoteResourceVersion): + version_prefix: ClassVar[str] = "staged/" + + def publish(self) -> PublishedVersion: + logger.debug("Publishing {}", self.folder) + # get next version and update versions.json + versions_path = f"{self.id}/versions.json" + versions_data = self.client.load_file(versions_path) + if versions_data is None: + versions: dict[str, Any] = {"1": {}} + else: + versions = json.loads(versions_data) + + next_version = max(map(int, versions)) + 1 + + assert str(next_version) not in versions, (next_version, versions) + + versions[str(next_version)] = {} + updated_versions_data = json.dumps(versions).encode() + self.client.put( + versions_path, + io.BytesIO(updated_versions_data), + length=len(updated_versions_data), + ) + ret = PublishedVersion(client=self.client, id=self.id, version=next_version) + + # move rdf.yaml and set version in it + staged_rdf_path = f"{self.folder}files/rdf.yaml" + rdf_data = self.client.load_file(staged_rdf_path) + rdf = yaml.load(rdf_data) + rdf["version"] = ret.version + stream = io.StringIO() + yaml.dump(rdf, stream) + rdf_data = stream.getvalue().encode() + self.client.put( + f"{ret.folder}files/rdf.yaml", io.BytesIO(rdf_data), length=len(rdf_data) + ) + self.client.rm_obj(staged_rdf_path) + + # move all other files + self.client.mv_dir(self.folder, ret.folder) + + # remove all preceding staged versions + self.client.rm_dir(f"{self.id}/{self.version_prefix}") + return ret + + +@dataclass +class PublishedVersion(_RemoteResourceVersion): + version_prefix: ClassVar[str] = "" diff --git a/scripts/utils/s3_client.py b/scripts/utils/s3_client.py new file mode 100644 index 00000000..9288b41e --- /dev/null +++ b/scripts/utils/s3_client.py @@ -0,0 +1,197 @@ +from __future__ import annotations + +import io +import json +import os +from collections.abc import Sequence +from dataclasses import dataclass, field +from datetime import timedelta +from pathlib import Path +from typing import Any, BinaryIO, Iterator, Optional + +from dotenv import load_dotenv +from loguru import logger +from minio import Minio, S3Error +from minio.commonconfig import CopySource +from minio.datatypes import Object +from minio.deleteobjects import DeleteObject + +_ = load_dotenv() + + +@dataclass +class Client: + host: str = os.environ["S3_HOST"] + bucket: str = os.environ["S3_BUCKET"] + prefix: str = os.environ["S3_FOLDER"] + access_key: str = field(default=os.environ["S3_ACCESS_KEY_ID"], repr=False) + secret_key: str = field(default=os.environ["S3_SECRET_ACCESS_KEY"], repr=False) + _client: Minio = field(init=False, repr=False) + + def __post_init__(self): + self.prefix = self.prefix.strip("/") + self._client = Minio( + self.host, + access_key=self.access_key, + secret_key=self.secret_key, + ) + found = self.bucket_exists(self.bucket) + if not found: + raise Exception(f"target bucket does not exist: {self.bucket}") + logger.debug("Created S3-Client: {}", self) + + def bucket_exists(self, bucket: str) -> bool: + return self._client.bucket_exists(bucket) + + def put( + self, path: str, file_object: io.BytesIO | BinaryIO, length: Optional[int] + ) -> None: + # For unknown length (ie without reading file into mem) give `part_size` + part_size = 0 + if length is None: + length = -1 + + if length == -1: + part_size = 10 * 1024 * 1024 + + path = f"{self.prefix}/{path}" + _ = self._client.put_object( + self.bucket, + path, + file_object, + length=length, + part_size=part_size, + ) + + def put_json(self, path: 
str, json_value: Any): + data = json.dumps(json_value).encode() + self.put(path, io.BytesIO(data), length=len(data)) + + def get_file_urls( + self, + path: str = "", + exclude_files: Sequence[str] = ("details.json",), + lifetime: timedelta = timedelta(hours=1), + ) -> list[str]: + """Checks an S3 'folder' for its list of files""" + logger.debug("Getting file list using {}, at {}", self, path) + path = f"{self.prefix}/{path}" + objects = self._client.list_objects(self.bucket, prefix=path, recursive=True) + file_urls: list[str] = [] + for obj in objects: + if obj.is_dir or obj.object_name is None: + continue + if Path(obj.object_name).name in exclude_files: + continue + # Option 1: + url = self._client.get_presigned_url( + "GET", + obj.bucket_name, + obj.object_name, + expires=lifetime, + ) + file_urls.append(url) + # Option 2: Work with minio.datatypes.Object directly + return file_urls + + def ls( + self, path: str, only_folders: bool = False, only_files: bool = False + ) -> Iterator[str]: + """ + List folder contents, non-recursive, ala `ls` + but no "." or ".." + """ + # path = str(Path(self.prefix, path)) + path = f"{self.prefix}/{path}" + logger.debug("Running ls at path: {}", path) + objects = self._client.list_objects(self.bucket, prefix=path, recursive=False) + for obj in objects: + if ( + (only_files and obj.is_dir) + or (only_folders and not obj.is_dir) + or obj.object_name is None + ): + continue + + yield Path(obj.object_name).name + + def mv_dir(self, src: str, tgt: str, *, bypass_governance_mode: bool = False): + assert src.endswith("/") + assert tgt.endswith("/") + objects = list( + self._client.list_objects( + self.bucket, f"{self.prefix}/{src}", recursive=True + ) + ) + self._cp_objs(objects, src, tgt) + self._rm_objs(objects, bypass_governance_mode=bypass_governance_mode) + + def rm_dir(self, prefix: str, *, bypass_governance_mode: bool = False): + assert prefix == "" or prefix.endswith("/") + objects = list( + self._client.list_objects( + self.bucket, f"{self.prefix}/{prefix}", recursive=True + ) + ) + self._rm_objs(objects, bypass_governance_mode=bypass_governance_mode) + + def _cp_objs(self, objects: Sequence[Object], src: str, tgt: str) -> None: + assert src.endswith("/") + assert tgt.endswith("/") + src = f"{self.prefix}/{src}" + tgt = f"{self.prefix}/{tgt}" + # copy + for obj in objects: + assert obj.object_name is not None and obj.object_name.startswith(src) + tgt_obj_name = f"{tgt}{obj.object_name[len(src) :]}" + + _ = self._client.copy_object( + self.bucket, + tgt_obj_name, + CopySource(self.bucket, obj.object_name), + ) + + def rm_obj(self, name: str) -> None: + self._client.remove_object(self.bucket, name) + + def _rm_objs( + self, objects: Sequence[Object], *, bypass_governance_mode: bool + ) -> None: + _ = list( + self._client.remove_objects( + self.bucket, + ( + DeleteObject(obj.object_name) + for obj in objects + if obj.object_name is not None + ), + bypass_governance_mode=bypass_governance_mode, + ) + ) + + def load_file(self, path: str) -> bytes | None: + """Load file from S3""" + try: + response = self._client.get_object(self.bucket, f"{self.prefix}/{path}") + content = response.read() + except Exception as e: + if isinstance(e, S3Error) and e.code == "NoSuchKey": + logger.info("Object {} not found with {}", path, self) + content = None + else: + logger.critical("Failed to get object {} with {}", path, self) + raise + + else: + logger.debug("Loaded {}", path) + + try: + response.close() + response.release_conn() + except Exception: + pass + + return 
content + + def get_file_url(self, path: str) -> str: + return f"https://{self.host}/{self.bucket}/{self.prefix}/{path}" diff --git a/scripts/utils/s3_structure.py b/scripts/utils/s3_structure.py new file mode 100644 index 00000000..c15bbfd3 --- /dev/null +++ b/scripts/utils/s3_structure.py @@ -0,0 +1,37 @@ +from typing import Any, Literal, TypedDict + +LogCategory = Literal[ + "bioimageio.spec", "bioimageio.core", "ilastik", "deepimagej", "icy", "biapy" +] + + +class LogEntry(TypedDict): + timestamp: str + log: Any + + +Log = dict[LogCategory, list[LogEntry]] + + +class Message(TypedDict): + author: str + text: str + time: str + + +StatusName = Literal["unknown", "staging", "testing", "awaiting review", "publishing", "published"] + + +class Status(TypedDict): + name: StatusName + description: str + step: int + num_steps: int + + +class Details(TypedDict): + """version specific details at `