diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..9d999d5 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,43 @@ +image: + name: python:3.10 + +stages: + - docs + - update + +pages: + stage: docs + before_script: + - pip install -r mkdocs-requirements.txt + script: + - mkdocs build --strict --verbose --site-dir public + artifacts: + paths: + - public + rules: + - if: $CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH + +update:onprem-runai: + stage: update + tags: + - on-prem + #variables: + # REPO_URL_ONPREM_RUNAI: url-in-settings + # REPO_USERNAME_ONPREM_RUNAI: username-in-settings + # REPO_PASSWORD_ONPREM_RUNAI: password-in-settings + before_script: + - pip install cookiecutter + - mkdir -p /tmp/kapitan-hull && cd $_ + - git config --global user.name "Kapitan Hull Bot" + - git config --global user.email "mlops@aisingapore.org" + script: + - cookiecutter --replay-file $CI_PROJECT_DIR/cookiecutter-onprem-runai.json $CI_PROJECT_DIR + - git clone https://$REPO_USERNAME_ONPREM_RUNAI:$REPO_PASSWORD_ONPREM_RUNAI@$REPO_URL_ONPREM_RUNAI git-repo + - cd git-repo + - git checkout -B $CI_COMMIT_BRANCH + - git pull origin $CI_COMMIT_BRANCH || echo "Nothing to pull" + - cd ../kapitan-hull-onprem-runai-test + - cp -rv ../git-repo/.git . + - git add . 
+ - git commit -m "$CI_COMMIT_MESSAGE" || echo "Nothing to commit" + - git push origin $CI_COMMIT_BRANCH || echo "Nothing to push" \ No newline at end of file diff --git a/README.md b/README.md index aa9a0f2..a6188d0 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ -# AI Singapore's Cookiecutter Template for End-to-end ML Projects (On-prem | Run:ai) +# Kapitan Hull ![AI Singapore's Kapitan Hull EPTG Onprem Run:ai Banner](./assets/kapitan-hull-eptg-onprem-runai-banner.png) ## Table of Contents -- [AI Singapore's Cookiecutter Template for End-to-end ML Projects (On-prem | Run:ai)](#ai-singapores-cookiecutter-template-for-end-to-end-ml-projects-on-prem--runai) +- [Kapitan Hull](#kapitan-hull) - [Table of Contents](#table-of-contents) - [Preface](#preface) - [Usage](#usage) @@ -13,12 +13,16 @@ ## Preface -This repository contains the -[`cookiecutter`](https://cookiecutter.readthedocs.io/en/stable/) -template for generating a repository that provides boilerplates touching -on the differing components of an end-to-end ML project. This template -is dedicated for AI Singapore's on-premise environment, and where -Run:ai is used as the MLOps platform. +This repository contains the [`cookiecutter`][ccutter] template for +generating a repository that provides boilerplates touching on the +differing components of an end-to-end ML project. + +For now, this template is dedicated for AI Singapore's on-premise +environment, and where Run:ai is used as the MLOps platform. Other +platforms and orchestrators would be integrated to this repository in +the near future. + +[ccutter]: https://cookiecutter.readthedocs.io/en/stable/ ## Usage @@ -35,22 +39,25 @@ You will then be prompted to provide inputs.These inputs will be used to populate different parts of the repository to be generated by `cookiecutter`. +> **Note**: Your cookiecutter version must be at least ***2.2.0***. 
+ ### Input Parameters -| Parameter | Detail | Default | Regex Reference | -|------------------------------- |-------------------------------------------------------------------------------------------------------------------------------- |-------------------------------------------------------------------------------------------- |-------------------------------------------------------------------------------------------------------------------- | -| `project_name` | Name of project that will be the header for the `README.md`. Input to start with alphabet. Only whitespace as separators. | Name of project (not name of repository). Use whitespace instead of underscores or hyphens. | [Link](https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/hooks/pre_gen_project.py#L8) | -| `description` | A short description of the project that will be populated in `README.md`. Max of 72 characters. | A short description of the project. | NIL | -| `repo_name` | Name of the repository folder. Input to start with alphabet characters. No whitespaces or underscores are allowed. | `project_name` where whitespaces and underscores are replaced with hyphens. | [Link](https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/hooks/pre_gen_project.py#L13) | -| `src_package_name` | Name of the source code's package under `src`. Input to start with alphabet characters. No whitespaces or hyphens are allowed. | `repo_name` where hyphens are replaced with underscores. | [Link](https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/hooks/pre_gen_project.py#L16) | -| `src_package_name_short` | The alias for the source code's package. Input to start with alphabet characters. No whitespaces or hyphens are allowed. 
| `src_package_name` | [Link](https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/hooks/pre_gen_project.py#L19) | -| `runai_proj_name` | The RunAI namespace used by the project - the project name that you're sending jobs to. | The RunAI namespace used by the project. | NIL -| `harbor_registry_project_path` | Path of the Harbor registry repository for your container images to be located under. Cannot end with a slash character. | Path of the project's container registry on Harbor. | [Link](https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/hooks/pre_gen_project.py#L22) | -| `author_name` | Your alias or project team's name. Relatively arbitrary. No hyphens are allowed. | Your alias or project team's name. | [Link](https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/hooks/pre_gen_project.py#L25) | +| Parameter | Detail | Default | Regex Reference | +|------------------------- |--------------------------------------------------------------------------------------------------------------------------------- |---------------------------------------------------------------------------- |-------------------------------------------------------------------------------------------------------------------- | +| `project_name` | Name of project that will be the header for the `README.md`. Input to start with alphabet. Only whitespace as separators. | My Project | [Link](https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/hooks/pre_gen_project.py#L8) | +| `description` | A short description of the project that will be populated in `README.md`. Max of 72 characters. | A short description of the project. | NIL | +| `repo_name` | Name of the repository folder. Input to start with alphabet characters. No whitespaces or underscores are allowed. | `project_name` where whitespaces and underscores are replaced with hyphens. 
| [Link](https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/hooks/pre_gen_project.py#L13) | +| `src_package_name` | Name of the source code's package under `src`. Input to start with alphabet characters. No whitespaces or hyphens are allowed. | `repo_name` where hyphens are replaced with underscores. | [Link](https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/hooks/pre_gen_project.py#L16) | +| `src_package_name_short` | The alias for the source code's package. Input to start with alphabet characters. No whitespaces or hyphens are allowed. | `src_package_name` | [Link](https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/hooks/pre_gen_project.py#L19) | +| `platform` | The platform the project is running on. (Choose between "on-premise" or "Google Cloud Platform") | `onprem` or `gcp` | NIL | +| `orchestrator` | The orchestrator the project is using. (Choose between "Run:AI", "Polyaxon" or "No orchestrator") | `runai` or `polyaxon` or `none` | NIL | +| `proj_name` | The project name used in by the repository. If you're using Run:AI, this will be the Run:AI project name used by the repository. | `sample-project` | NIL | +| `registry_project_path` | Path of the registry repository for your container images to be located under. Cannot end with a slash character. | `registry.domain.tld/sample-project/my-project` | [Link](https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/hooks/pre_gen_project.py#L22) | +| `author_name` | Your alias or project team's name. Relatively arbitrary. No hyphens are allowed. | `AISG` | [Link](https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/hooks/pre_gen_project.py#L25) | ### Version Control -Following the creation of your repository, -initialise it with Git, push it to a -remote, and follow its -`README.md` document for a full guide on its usage. 
+Following the creation of your repository, initialise it with Git, push +it to a remote, and follow its `README.md` document for a full guide on +its usage. diff --git a/cookiecutter-gcp-runai.json b/cookiecutter-gcp-runai.json new file mode 100644 index 0000000..fc873ba --- /dev/null +++ b/cookiecutter-gcp-runai.json @@ -0,0 +1,14 @@ +{ + "cookiecutter": { + "project_name": "Kapitan Hull GCP RunAI Test", + "description": "Testing Grounds for Kapitan Hull on GCP using RunAI.", + "repo_name": "kapitan-hull-gcp-runai-test", + "src_package_name": "kapitan_hull_gcp_runai_test", + "src_package_name_short": "khgr_test", + "platform": "gcp", + "orchestrator": "runai", + "proj_name": "mlops-test", + "registry_project_path": "registry.aisingapore.net/mlops/kapitan-hull-gcp-runai", + "author_name": "mlops" + } +} \ No newline at end of file diff --git a/cookiecutter-onprem-runai.json b/cookiecutter-onprem-runai.json new file mode 100644 index 0000000..f662699 --- /dev/null +++ b/cookiecutter-onprem-runai.json @@ -0,0 +1,14 @@ +{ + "cookiecutter": { + "project_name": "Kapitan Hull Onprem RunAI Test", + "description": "Testing Grounds for Kapitan Hull on premise using RunAI.", + "repo_name": "kapitan-hull-onprem-runai-test", + "src_package_name": "kapitan_hull_onprem_runai_test", + "src_package_name_short": "khor_test", + "platform": "onprem", + "orchestrator": "runai", + "proj_name": "mlops-test", + "registry_project_path": "registry.aisingapore.net/mlops/kapitan-hull-onprem-runai", + "author_name": "mlops" + } +} \ No newline at end of file diff --git a/cookiecutter.json b/cookiecutter.json index a3ff5a1..89dd199 100644 --- a/cookiecutter.json +++ b/cookiecutter.json @@ -1,10 +1,33 @@ { - "project_name": "Name of project (not name of repository). 
Use whitespace instead of underscores or hyphens.", + "project_name": "My Project", "description": "A short description of the project.", "repo_name": "{{ cookiecutter.project_name.lower().replace(' ', '-').replace('_', '-') }}", "src_package_name": "{{ cookiecutter.repo_name.replace('-', '_') }}", "src_package_name_short": "{{ cookiecutter.src_package_name }}", - "runai_proj_name": "The RunAI namespace used by the project.", - "harbor_registry_project_path": "Path of the project's container registry on Harbor.", - "author_name": "Your alias or project team's name." + "platform": ["onprem", "gcp"], + "orchestrator": ["runai", "polyaxon", "noorch"], + "proj_name": "sample-project", + "registry_project_path": "registry.domain.tld/sample-project/my-project", + "author_name": "AISG", + "__prompts__": { + "project_name": "Name of project that will be the header for the README.md. Input to start with alphabet. Only whitespace as separators.", + "description": "A short description of the project that will be populated in README.md. Max of 72 characters.", + "repo_name": "Name of repository. Name of the repository folder. Input to start with alphabet characters. No whitespaces or underscores are allowed.", + "src_package_name": "Name of the source code's package under the src folder. Input to start with alphabet characters. No whitespaces or hyphens are allowed.", + "src_package_name_short": "The alias for the source code's package. Input to start with alphabet characters. No whitespaces or hyphens are allowed.", + "platform": { + "__prompt__": "The platform the project is running on.", + "onprem": "On premise", + "gcp": "Google Cloud Platform (not implemented yet)" + }, + "orchestrator": { + "__prompt__": "The orchestrator the project is using.", + "runai": "Run:AI", + "polyaxon": "Polyaxon (not implemented yet)", + "noorch": "No orchestrator (not implemented yet)" + }, + "proj_name": "The project name used by the repository. 
If you're using Run:AI, this will be the Run:AI project name used by the repository.", + "registry_project_path": "Path of the registry repository for your container images to be located under. Cannot end with a slash character.", + "author_name": "Your alias or project team's name. Relatively arbitrary. No hyphens are allowed." + } } \ No newline at end of file diff --git a/hooks/pre_gen_project.py b/hooks/pre_gen_project.py index 5b676c0..2719b13 100644 --- a/hooks/pre_gen_project.py +++ b/hooks/pre_gen_project.py @@ -17,8 +17,8 @@ "src_package_name_short": { "user_input": "{{cookiecutter.src_package_name_short}}", "regex": r"^[a-z](?:_?[a-z0-9]+)*$"}, - "harbor_registry_project_path": { - "user_input": "{{cookiecutter.harbor_registry_project_path}}", + "registry_project_path": { + "user_input": "{{cookiecutter.registry_project_path}}", "regex": r"^registry\.aisingapore\.net[\-\/\w\d]+(?:[a-z0-9]+)$"}, "author_name": { "user_input": "{{cookiecutter.author_name}}", @@ -52,7 +52,7 @@ def check_input_regex(cookie_input_key, cookie_input_val): ERROR_MSG_LIST.append("ERROR: %s - '%s' is not a valid Python package name." % (cookie_input_key, cookie_input_val["user_input"])) - if cookie_input_key == "harbor_registry_project_path": + if cookie_input_key == "registry_project_path": ERROR_MSG_LIST.append("ERROR: %s - '%s' is not a valid Harbor path." 
% (cookie_input_key, cookie_input_val["user_input"])) diff --git a/{{cookiecutter.repo_name}}/.gitlab-ci.yml b/{{cookiecutter.repo_name}}/.gitlab-ci.yml index f4f2eac..04676df 100644 --- a/{{cookiecutter.repo_name}}/.gitlab-ci.yml +++ b/{{cookiecutter.repo_name}}/.gitlab-ci.yml @@ -46,7 +46,7 @@ build:data-prep-image: /kaniko/executor --context "${CI_PROJECT_DIR}" --dockerfile "${CI_PROJECT_DIR}/docker/{{cookiecutter.repo_name}}-data-prep.Dockerfile" - --destination "{{cookiecutter.harbor_registry_project_path}}/data-prep:${CI_COMMIT_SHORT_SHA}" + --destination "{{cookiecutter.registry_project_path}}/data-prep:${CI_COMMIT_SHORT_SHA}" rules: - if: $CI_MERGE_REQUEST_IID changes: @@ -54,6 +54,8 @@ build:data-prep-image: - src/**/* - conf/**/* - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + - if: $CI_PIPELINE_SOURCE == "web" && $BUILD_ALL + - if: $CI_PIPELINE_SOURCE == "web" && $BUILD_DATAPREP build:model-training-image: stage: build @@ -67,7 +69,7 @@ build:model-training-image: /kaniko/executor --context "${CI_PROJECT_DIR}" --dockerfile "${CI_PROJECT_DIR}/docker/{{cookiecutter.repo_name}}-model-training.Dockerfile" - --destination "{{cookiecutter.harbor_registry_project_path}}/model-training:${CI_COMMIT_SHORT_SHA}" + --destination "{{cookiecutter.registry_project_path}}/model-training:${CI_COMMIT_SHORT_SHA}" rules: - if: $CI_MERGE_REQUEST_IID changes: @@ -75,6 +77,8 @@ build:model-training-image: - src/**/* - conf/**/* - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + - if: $CI_PIPELINE_SOURCE == "web" && $BUILD_ALL + - if: $CI_PIPELINE_SOURCE == "web" && $BUILD_MODEL build:retag-images: stage: build @@ -83,8 +87,8 @@ build:retag-images: entrypoint: [""] script: - cat $HARBOR_ROBOT_CREDS_JSON > /root/.docker/config.json - - crane tag {{cookiecutter.harbor_registry_project_path}}/data-prep:${CI_COMMIT_SHORT_SHA} ${CI_COMMIT_TAG} - - crane tag {{cookiecutter.harbor_registry_project_path}}/model-training:${CI_COMMIT_SHORT_SHA} ${CI_COMMIT_TAG} + - crane tag 
{{cookiecutter.registry_project_path}}/data-prep:${CI_COMMIT_SHORT_SHA} ${CI_COMMIT_TAG} + - crane tag {{cookiecutter.registry_project_path}}/model-training:${CI_COMMIT_SHORT_SHA} ${CI_COMMIT_TAG} rules: - if: $CI_COMMIT_TAG && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH diff --git a/{{cookiecutter.repo_name}}/README.md b/{{cookiecutter.repo_name}}/README.md index b4f916c..90c0452 100644 --- a/{{cookiecutter.repo_name}}/README.md +++ b/{{cookiecutter.repo_name}}/README.md @@ -1,53 +1,48 @@ -# End-to-end Project Template (On-prem | Run:ai) +# {{cookiecutter.project_name}} -![AI Singapore's Kapitan Hull EPTG Onprem Run:ai Banner](./aisg-context/guide-site/docs/assets/images/kapitan-hull-eptg-onprem-runai-banner.png) +![AI Singapore's Kapitan Hull EPTG Onprem Run:ai Banner](./aisg-context/guide-site/docs/kapitan-hull-eptg-onprem-runai-banner.png) -__Customised for `{{cookiecutter.project_name}}`__. +_{{cookiecutter.description}}_ -__Project Description:__ {{cookiecutter.description}} +__A project generated using AI Singapore's Kapitan Hull, an end-to-end +ML project template.__ This template that is also accompanied with an end-to-end guide was -generated and customised using the -following -[`cookiecutter`](https://cookiecutter.readthedocs.io/en/stable/) +generated and customised using the following [`cookiecutter`][ccutter] template: https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai -The contents of the guide have been customised -according to the inputs provided upon generation of this repository -through the usage of `cookiecutter` CLI, -following instructions detailed -[here](https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/README.md) -. +The contents of the guide have been customised according to the inputs +provided upon generation of this repository through the usage of +`cookiecutter` CLI, following instructions detailed [here][kh-readme]. 
-Inputs provided to `cookiecutter` for the generation of this -template: +Inputs provided to `cookiecutter` for the generation of this template: - __`project_name`:__ {{cookiecutter.project_name}} - __`description`:__ {{cookiecutter.description}} - __`repo_name`:__ {{cookiecutter.repo_name}} - __`src_package_name`:__ {{cookiecutter.src_package_name}} - __`src_package_name_short`:__ {{cookiecutter.src_package_name_short}} -- __`runai_proj_name`:__ {{cookiecutter.runai_proj_name}} -- __`harbor_registry_project_path`:__ {{cookiecutter.harbor_registry_project_path}} +- __`platform`:__ {{cookiecutter.platform}} +- __`orchestrator`:__ {{cookiecutter.orchestrator}} +- __`proj_name`:__ {{cookiecutter.proj_name}} +- __`registry_project_path`:__ {{cookiecutter.registry_project_path}} - __`author_name`:__ {{cookiecutter.author_name}} +[ccutter]: https://cookiecutter.readthedocs.io/en/stable/ +[kh-readme]: https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/README.md + ## End-to-end Guide This repository contains a myriad of boilerplate codes and configuration -files. On how to make use of these boilerplates, this repository -has an end-to-end guide on that. +files. On how to make use of these boilerplates, this repository has an +end-to-end guide on that. The guide's contents are written in Markdown formatted files, located within `aisg-context/guide-site` and its subdirectories. While the Markdown files can be viewed directly through text editors or IDEs, -the contents are optimised for viewing through -[`mkdocs`](https://www.mkdocs.org) (or -[`mkdocs-material`](https://squidfunk.github.io/mkdocs-material) -specifically) -. -A demo of the site for the guide can be viewed -[here](https://aisingapore.github.io/ml-project-cookiecutter-onprem-runai) -. +the contents are optimised for viewing through [`mkdocs`][mkdocs] (or +[`mkdocs-material`][mkdocs-material] specifically). +A demo of the site for the guide can be viewed [here][kh-site]. 
To spin up the site on your local machine, you can create a virtual environment to install the dependencies first: @@ -58,12 +53,17 @@ $ conda activate aisg-eptg-onprem-runai-guide $ pip install -r aisg-context/guide-site/mkdocs-requirements.txt ``` -After creating the virtual environment and installing the required +After creating the virtual environment and installing the required dependencies, serve it like so: ```bash $ mkdocs serve --config-file aisg-context/guide-site/mkdocs.yml ``` -The site for the guide will then be viewable on -[`http://localhost:8000`](http://localhost:8000). +The site for the guide should then be viewable on +[`http://localhost:8000`][lhost]. + +[mkdocs]: https://www.mkdocs.org +[mkdocs-material]: https://squidfunk.github.io/mkdocs-material +[kh-site]: https://aisingapore.github.io/ml-project-cookiecutter-onprem-runai +[lhost]: http://localhost:8000 \ No newline at end of file diff --git a/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/01-prerequisites.md b/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/01-prerequisites.md index 0264c3f..afaa06b 100644 --- a/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/01-prerequisites.md +++ b/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/01-prerequisites.md @@ -8,58 +8,62 @@ follow through with the guide: - NUS Staff/Student account. - Azure account provisioned by AI Singapore. - PC with the following installed: - - If your machine is with a Windows OS, use - [__PowerShell__](https://docs.microsoft.com/en-us/powershell/scripting/install/installing-powershell-on-windows?view=powershell-7.2) + - If your machine is with a Windows OS, use [__PowerShell__][pshell] instead of the default Command (`cmd.exe`) shell. Best if you - resort to - [Windows Terminal](https://docs.microsoft.com/en-us/windows/terminal/). + resort to [Windows Terminal][winterm]. 
- __Pulse Secure__ - - Refer to [NUS IT eGuides](https://nusit.nus.edu.sg/eguides/) - for installation guides. + - Refer to [NUS IT eGuides][nus-it] for installation guides. - __Web browser__ - __Terminal__ - - __[Git](https://git-scm.com/downloads)__ - - __[Rancher Desktop](https://rancherdesktop.io)__ or - __[Docker Engine](https://docs.docker.com/engine/install):__ + - __[Git][git]__ + - __[Rancher Desktop][rancher]__ or __[Docker Engine][docker]:__ Client-server application for containerising applications as well as interacting with the Docker daemon. - - For Linux users, you may install the Docker Engine (Docker daemon) - directly. + - For Linux users, you may install the Docker Engine (Docker + daemon) directly. - For Windows or macOS users, the Docker daemon can be installed - through [Rancher Desktop](https://rancherdesktop.io). - - __[miniconda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html):__ - for Python virtual environment management. - - __[`kubectl`](https://kubernetes.io/docs/tasks/tools/):__ - CLI for Kubernetes. - - __[AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html):__ CLI for AWS services, but we will specifically be using it - for interacting with the AI Singapore's Elastic Cloud Storage - (ECS) service through the S3 protocol. - - You may choose to just use - [`boto3`](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html), - the Python SDK for AWS instead, to interact with the ECS - service within a Python environment. However, this does - not fall under the scope of this guide. - - *(Optional)* __[`helm`](https://helm.sh/docs/intro/install/):__ - CLI for Kubernetes' package manager. -- Access to a project on AI Singapore's Run:ai cluster. - See [here](./03-mlops-components-platform.md#runai) for more information. -- Credentials for AI Singapore's Elastic Cloud Storage (ECS) service. 
- See [here](./03-mlops-components-platform.md#elastic-cloud-storage-ecs) for more information. -- Credentials for AI Singapore's Harbor registry. - See [here](./03-mlops-components-platform.md#harbor) for more information. -- Credentials for an MLflow Tracking server. - See [here](./03-mlops-components-platform.md#mlflow) for more information. + through [Rancher Desktop][rancher]. + - __[miniconda][mcond]:__ for Python virtual environment management. + - __[`kubectl`][kubectl]:__ CLI for Kubernetes. +{% if cookiecutter.platform == 'onprem' -%} + - __[AWS CLI][awscli]:__ CLI for AWS services, but we will + specifically be using it for interacting with the AI Singapore's + Elastic Cloud Storage (ECS) service through the S3 protocol. + - You may choose to just use [`boto3`][boto3], the Python SDK + for AWS instead, to interact with the ECS service within a + Python environment. However, this does not fall under the + scope of this guide. +{% elif cookiecutter.platform == 'gcp' -%} + - __[`gcloud` CLI][gcloud]:__ CLI for interacting with GCP services. +{% endif -%} + - *(Optional)* __[`helm`][helm]:__ CLI for Kubernetes' package + manager. +{% if cookiecutter.orchestrator == 'runai' -%} +- Access to a project on AI Singapore's Run:ai cluster. + See [here][runai-page] for more information. +{% elif cookiecutter.orchestrator == 'polyaxon' -%} +{% elif cookiecutter.orchestrator == 'noorch' -%} +{% endif -%} +{% if cookiecutter.platform == 'onprem' -%} +- Credentials for AI Singapore's Elastic Cloud Storage (ECS) service. + See [here][ecs-page] for more information. +- Credentials for AI Singapore's Harbor registry. + See [here][harbor-page] for more information. +{% elif cookiecutter.platform == 'gcp' -%} +- Access to a project on [Google Cloud Platform][gcp]. + See [here][gcp-page] for more information. +{% endif -%} +- Credentials for an MLflow Tracking server. + See [here][mlflow-page] for more information. !!! 
note - If you do not have any of the required credentials, - please verify with or notify the MLOps team at - `mlops@aisingapore.org`. + If you do not have any of the required credentials, please verify + with or notify the MLOps team at `mlops@aisingapore.org`. !!! info Wherever relevant, you can toggle between the different commands - that need to be executed - for either Linux/macOS or the Windows environment (PowerShell). - See below for an example: + that need to be executed for either Linux/macOS or the Windows + environment (PowerShell). See below for an example: === "Linux/macOS" @@ -76,13 +80,40 @@ follow through with the guide: ``` !!! warning - If you are on Windows OS, you would need to ensure that the - files you've cloned or written on your machine be with - `LF` line endings. Otherwise, issues would arise when Docker - containers are being built or run. See - [here](https://stackoverflow.com/questions/48692741/how-can-i-make-all-line-endings-eols-in-all-files-in-visual-studio-code-unix) - on how to configure consistent line endings for a whole folder - or workspace using VSCode. + If you are on Windows OS, you would need to ensure that the + files you've cloned or written on your machine be with `LF` line + endings. Otherwise, issues would arise when Docker containers + are being built or run. See [here][lf-set] on how to configure + consistent line endings for a whole folder or workspace using + VSCode. 
+ +[pshell]: https://docs.microsoft.com/en-us/powershell/scripting/install/installing-powershell-on-windows?view=powershell-7.2 +[winterm]: https://docs.microsoft.com/en-us/windows/terminal/ +[nus-it]: https://nusit.nus.edu.sg/eguides/ +[git]: https://git-scm.com/downloads +[rancher]: https://rancherdesktop.io +[docker]: https://docs.docker.com/engine/install +[mcond]: https://conda.io/projects/conda/en/latest/user-guide/install/index.html +[kubectl]: https://kubernetes.io/docs/tasks/tools/ +[helm]: https://helm.sh/docs/intro/install/ +[runai-page]: ./03-mlops-components-platform.md#runai +[mlflow-page]: ./03-mlops-components-platform.md#mlflow +[lf-set]: https://stackoverflow.com/questions/48692741/how-can-i-make-all-line-endings-eols-in-all-files-in-visual-studio-code-unix +{%- if cookiecutter.platform == 'onprem' -%} +[awscli]: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html +[boto3]: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html +[ecs-page]: ./03-mlops-components-platform.md#elastic-cloud-storage-ecs +[harbor-page]: ./03-mlops-components-platform.md#harbor +{%- elif cookiecutter.platform == 'gcp' -%} +[gcloud]: https://cloud.google.com/sdk/docs/install +[gcp]: https://console.cloud.google.com +[gcp-page]: ./02-preface.md#google-cloud-platform-gcp-projects +{% endif %} +{%- if cookiecutter.orchestrator == 'runai' -%} +[runai-page]: ./03-mlops-components-platform.md#runai +{%- elif cookiecutter.orchestrator == 'polyaxon' -%} +{%- elif cookiecutter.orchestrator == 'noorch' -%} +{% endif %} ## Tips and Tricks @@ -92,8 +123,8 @@ follow through with the guide: button on the top right. Then, proceed to the Virtual Machines section and increase your CPU and memory resources directly. 
- - For Windows users, create a `.wslconfig` file user `%UserProfile%` - with the following content: + - For Windows users, create a `.wslconfig` file user + `%UserProfile%` with the following content: ```toml [wsl2] memory=8GB diff --git a/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/02-preface.md b/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/02-preface.md index 2481ee3..46d39a7 100644 --- a/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/02-preface.md +++ b/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/02-preface.md @@ -2,17 +2,15 @@ ## Repository Setup -This repository provides an end-to-end template for AI -Singapore's AI engineers to onboard their AI projects. -Instructions for generating this template is detailed in the -[`cookiecutter`](https://github.com/cookiecutter/cookiecutter) -template's repository's -[`README.md`](https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/README.md). +This repository provides an end-to-end template for AI Singapore's AI +engineers to onboard their AI projects. Instructions for generating +this template is detailed in the [`cookiecutter`][ccutter] template's +repository's [`README.md`][readme]. -While this repository provides users with a set of boilerplates, -here you are also presented with a linear guide on -how to use them. The boilerplates are rendered and customised -when you generated this repository using `cookiecutter`. +While this repository provides users with a set of boilerplates, here +you are also presented with a linear guide on how to use them. The +boilerplates are rendered and customised when you generated this +repository using `cookiecutter`. !!! info You can begin by following along the guide as it brings you through @@ -20,15 +18,11 @@ when you generated this repository using `cookiecutter`. 
has to offer, you can deviate from it as much as you wish and customise it to your needs. -Since we will be making use of this repository in multiple -environments, __ensure that this repository is pushed to a -remote__. -Most probably you will be resorting to -[AI Singapore's GitLab instance](https://gitlab.aisingapore.net/) as -the remote. -Refer to -[here](https://docs.gitlab.com/ee/user/project/working_with_projects.html#create-a-project) -on creating a blank remote repository (or project in GitLab's term). +Since we will be making use of this repository in multiple environments, +__ensure that this repository is pushed to a remote__. Most probably you +will be resorting to [AI Singapore's GitLab instance][aisg-gitlab] as +the remote. Refer to [this section][cr8-proj] on creating a blank remote +_repository_ (or _project_ using GitLab's terminology). After creating the remote repository, retrieve the remote URL and push the local repository to remote: @@ -42,23 +36,30 @@ $ git commit -m "Initial commit." $ git push -u origin main ``` -Go to [this section](./03-mlops-components-platform.md#gitlab) for more -information on interacting with the on-premise GitLab instance. +Go to [this section][gitlab-page] for more information on interacting +with the on-premise GitLab instance. + +[ccutter]: https://github.com/cookiecutter/cookiecutter +[readme]: https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/README.md +[aisg-gitlab]: https://gitlab.aisingapore.net/ +[cr8-proj]: https://docs.gitlab.com/ee/user/project/working_with_projects.html#create-a-project +[gitlab-page]: ./03-mlops-components-platform.md#gitlab ## Guide's Problem Statement For this guide, we will work towards building a neural network that is -able to classify handwritten digits, widely known as the MNIST -use-case. +able to classify handwritten digits, widely known as the MNIST use-case. 
The model is then to be deployed through a REST API and used for batch -inferencing as well. +inferencing as well. The raw dataset to be used is obtainable through a Google Cloud Storage bucket; instructions for downloading the data into your development -environment are detailed under -["Data Storage & Versioning"](./06-data-storage-versioning.md), +environment are detailed under ["Data Storage & Versioning"][data-page], to be referred to later on. !!! info __License:__ Yann LeCun and Corinna Cortes hold the copyright of MNIST dataset. MNIST dataset is made available under the terms of the - [Creative Commons Attribution-Share Alike 3.0 license](https://creativecommons.org/licenses/by-sa/3.0/). + [Creative Commons Attribution-Share Alike 3.0 license][cc-sa3.0]. + +[data-page]: ./06-data-storage-versioning.md +[cc-sa3.0]: https://creativecommons.org/licenses/by-sa/3.0/ \ No newline at end of file diff --git a/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/03-mlops-components-platform.md b/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/03-mlops-components-platform.md index f7c81f3..eb291c8 100644 --- a/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/03-mlops-components-platform.md +++ b/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/03-mlops-components-platform.md @@ -14,31 +14,85 @@ Your credentials for your NUS Staff/Student account is needed to login to NUS' VPN for access to the following: - [AI Singapore's GitLab instance](#gitlab) +{% if cookiecutter.platform == 'onprem' -%} - [AI Singapore's Kubernetes clusters](#kubernetes) - [AI Singapore's Run:ai platform](#runai) - [AI Singapore's Harbor registry](#harbor) - [AI Singapore's Elastic Cloud Storage (ECS)](#elastic-cloud-storage-ecs) - [Your project's on-premise MLflow Tracking server](#mlflow) +{% endif -%} - Other miscellaneous NUS resources +{% if cookiecutter.platform == 'gcp' -%} +## Google Cloud Platform (GCP) 
Projects + +Each project in AI Singapore that requires the usage of GCP resources +would be provided with a [GCP project][gcp-proj]. Such projects are +accessible through the [GCP console][gcp-csl] once you've logged into +your AI Singapore Google account. + +!!! info + Projects are managed and provisioned by AI Singapore's Platforms + team. If you'd like to request for a project to be created (or for + any other enquiries as well), please contact `mlops@aisingapore.org`. + +[gcp-proj]: https://cloud.google.com/docs/overview#projects +[gcp-csl]: https://console.cloud.google.com/home + +### Authorisation + +You can use GCP's [Cloud SDK][gcp-sdk] to interact with the varying GCP +services. When you're using the SDK for the first time, you are to +provide authorisation using a user or service account. In AI Singapore's +context, unless your use case concerns some automation or CI/CD +pipelines, you will probably be using your user account (i.e. Google +accounts with AI Singapore domains such as `@aisingapore.org` or +`@aiap.sg`). +See [here][sdk-auth] for more information on authorising your SDK. 
+ +A simple command to authorise access: + +```bash +$ gcloud auth login +``` + +To register `gcloud` for Docker so you can push to Google Container +Registry: + +```bash +$ gcloud auth configure-docker \ + asia-southeast1-docker.pkg.dev +``` + +With your user account, you should have access to the following GCP +products/services: + +- [Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) +- [Cloud Storage (GCS)](https://cloud.google.com/storage) +- [Artifact Registry](https://cloud.google.com/artifact-registry) + +[gcp-sdk]: https://cloud.google.com/sdk +[sdk-auth]: https://cloud.google.com/sdk/docs/authorizing +{%- set kubeplat = 'GKE' -%} +{% endif -%} + ## Kubernetes Before we dive into the different MLOps components that you will be -interacting with in the context of this guide, we have to first -introduce -[Kubernetes](https://kubernetes.io/) -as the underlying orchestration tool to execute -pipelines and manage containerised applications and environments. +interacting with in the context of this guide, we have to first +introduce [Kubernetes][k8s] as the underlying orchestration tool to +execute pipelines and manage containerised applications and +environments. From the Kubernetes site: -> _Kubernetes, also known as K8s, is an open-source system for automating_ -> _deployment, scaling, and management of containerized applications._ -> _It groups containers that make up an application into logical units_ -> _for easy management and discovery._ +> _Kubernetes, also known as K8s, is an open-source system for_ +> _automating deployment, scaling, and management of containerized_ +> _applications. It groups containers that make up an application into_ +> _logical units for easy management and discovery._ A number of services and applications that you will be interacting with -(or deploying) are deployed (to be deployed) within a Kubernetes +(or deploying) are deployed (to be deployed) within a Kubernetes cluster. 
Some of the MLOps components which the Kubernetes cluster(s) will be relevant for are: @@ -54,50 +108,79 @@ These components will be further elaborated in the upcoming sections. - [IBM - What is Kubernetes?](https://www.ibm.com/topics/kubernetes) +[k8s]: https://kubernetes.io/ + +{% if cookiecutter.platform == 'onprem' -%} ### Rancher Upon one's assignment to a project, any relevant clusters that one has -access to would be viewable on -[AI Singapore's Rancher dashboard](https://rancher.aisingapore.net). +access to would be viewable on +[AI Singapore's Rancher dashboard][aisg-rke]. ![AISG's Rancher Dashboard - Home Dashboard](assets/screenshots/rancher-home-dashboard.png) -[Rancher](https://www.rancher.com) is a Kubernetes management platform -that provides cluster administrators or users to manage Kubernetes -clusters or facilitate Kubernetes workflows. -To login, use your Azure account i.e. the same set of credentials that -you use for your GitLab account. +[Rancher][rancher] is a Kubernetes management platform that provides +cluster administrators or users to manage Kubernetes clusters or +facilitate Kubernetes workflows. To login, use your Azure account i.e. +the same set of credentials that you use for your GitLab account. !!! note If you do not have rightful access to a cluster, please notify the MLOps team at `mlops@aisingapore.org`. 
-### Kubernetes VS Rancher VS Run:ai +[aisg-rke]: https://rancher.aisingapore.net +[rancher]: https://www.rancher.com +{%- set kubeplat = 'Rancher' -%} +{% endif -%} + +{% if cookiecutter.orchestrator == 'runai' -%} + {%- set orch = 'Run:AI' -%} + {%- set vs_orch = " VS " + orch -%} + {%- set and_orch = " and " + orch -%} +{% elif cookiecutter.orchestrator == 'polyaxon' -%} + {%- set orch = 'Polyaxon' -%} + {%- set vs_orch = " VS " + orch -%} + {%- set and_orch = " and " + orch -%} +{% elif cookiecutter.orchestrator == "noorch" -%} + {%- set vs_orch = " " -%} + {%- set and_orch = " " -%} +{% endif %} + +### Kubernetes VS {{kubeplat}}{{vs_orch}} One might be confused as to how each of the aforementioned tools and platforms differ from each other. To put it simply, Kubernetes lies -underneath the Rancher and Run:ai platform/interface. Rancher and Run:ai -are abstraction layers on top of Kubernetes; they both -essentially communicate with the -[Kubernetes API server](https://kubernetes.io/docs/concepts/overview/kubernetes-api) -to carry out actions or orchestrate workloads through each of their own -interface. - -Developers can use Rancher's interface or Run:ai's interface/CLI to -spin up workspaces, jobs or deploy applications. However, the latter can -better serve machine learning engineers in carrying out their machine -learning workflows as that was the intended usage of the platform. -Besides, Run:ai's unique selling point is its better utilisation of -GPU resources (through fractionalisation and other features) -so when it comes to workloads that require GPU, like -model training and evaluation, the usage of Run:ai is recommended. -Also, on the surface, it is easier for one to spin up developer -workspaces on Run:ai. +underneath the {{kubeplat}}{{and_orch}} platform/interface. 
+{{kubeplat}}{{and_orch}} are abstraction layers on top of Kubernetes; +they both essentially communicate with the +[Kubernetes API server][kubeapi] to carry out actions or orchestrate +workloads through each of their own interface. + +{% if cookiecutter.orchestrator == "runai" -%} +Developers can use {{kubeplat ~ "\'s"}} interface or Run:AI\'s +interface/CLI to spin up workspaces, jobs or deploy applications. +However, the latter can better serve machine learning engineers in +carrying out their machine learning workflows as that was the intended +usage of the platform. Moreover, Run:AI\'s unique selling point is its +better utilisation of GPU resources (through fractionalisation and other +features) so when it comes to workloads that require GPU, like model +training and evaluation, the usage of Run:AI is recommended. Also, on +the surface, it is easier for one to spin up developer workspaces on +Run:AI. +{%- endif %} ??? info "Reference Link(s)" +{% if cookiecutter.platform == "onprem" -%} - [Rancher Docs - Rancher Server and Components](https://ranchermanager.docs.rancher.com/reference-guides/rancher-manager-architecture/rancher-server-and-components) +{% elif cookiecutter.platform == "gcp" -%} + - [GKE Overview](https://cloud.google.com/kubernetes-engine/docs/concepts/kubernetes-engine-overview) +{%- endif -%} +{% if cookiecutter.orchestrator == "runai" -%} - [Run:ai Docs - System Components](https://docs.run.ai/home/components) +{%- endif %} + +[kubeapi]: https://kubernetes.io/docs/concepts/overview/kubernetes-api ## MLOps Components @@ -116,22 +199,12 @@ will cover as well as how each of them relate to each other. Developers begin by having the client (laptop/VM) to be authenticated by whichever platform they have been provided access to. 
-- Developers with access to Google Cloud projects would have to - authenticate through the `gcloud` CLI which allows them to access the - Google Kubernetes Engine (GKE) cluster, which in turn would allow them - access to the default orchestration platform, Polyaxon. -- Developers with access to AI Singapore’s on-premise clusters would - have to authenticate through the default orchestration platform that - runs on top of the on-premise Kubernetes clusters, Run:ai. - This is done through Run:ai’s CLI. - __Following authentication, developers can make use of templates__ provided by the MLOps team __to spin up developer workspaces__ -([VSCode server](https://github.com/coder/code-server), -[JupyterLab](https://jupyter.org/), etc.) on the respective platforms. -Within these developer workspaces, developers can work on their -codebase, execute light workloads, and carry out other steps of the -end-to-end machine learning workflow. +([VSCode server][vscode], [JupyterLab][jlab], etc.) on the respective +platforms. Within these developer workspaces, developers can work on +their codebase, execute light workloads, and carry out other steps of +the end-to-end machine learning workflow. A typical machine learning or AI project would require the team to carry out exploratory data analysis (EDA) on whatever domain-specific data is @@ -141,13 +214,18 @@ managers__. ??? info "Reference Link(s)" +{%- if cookiecutter.orchestrator == 'runai' %} + - [Run:ai Docs - Workspaces Introduction](https://docs.run.ai/v2.13/Researcher/user-interface/workspaces/overview) +{%- endif %} + +[vscode]: https://github.com/coder/code-server +[jlab]: https://jupyter.org/ ### Version Control Within a developer workspace and environment, developers can interact -(pull, push, etc.) with AI Singapore’s -[__GitLab__](https://about.gitlab.com) instance, +(pull, push, etc.) with AI Singapore’s [__GitLab__][gitlab] instance, which serves as __the organisation’s default version control (Git) remote server__. 
@@ -155,16 +233,16 @@ remote server__. - [Atlassian Tutorials - What is Git?](https://www.atlassian.com/git/tutorials/what-is-git) +[gitlab]: https://about.gitlab.com + ### Continuous X GitLab also serves as a DevOps platform where the Continuous X of things (Continuous Integration, Continuous Delivery, etc.) can be implemented -and automated. This is done through -[GitLab CI/CD](https://docs.gitlab.com/ee/ci). -Interactions made with -repositories on GitLab can be made to trigger CI/CD workflows. The -__purpose of such workflows are to facilitate the development lifecycle -and streamline the process of delivering quality codebase__. +and automated. This is done through [GitLab CI/CD][gl-ci]. Interactions +made with repositories on GitLab can be made to trigger CI/CD workflows. +The __purpose of such workflows are to facilitate the development +lifecycle and streamline the process of delivering quality codebase__. - The workflows at the very least should include unit and integration testing where the __codebase is subjected to tests and linting tools @@ -175,10 +253,9 @@ and streamline the process of delivering quality codebase__. (SAST)__ where application security tools are utilised to __identify any vulnerabilities that exist within the codebase__. - GitLab CI/CD can also __invoke interactions with other MLOps - components__ such as submitting jobs - (model training, data processing, etc.) to the - aforementioned orchestration platforms or even deploy applications. - This fulfils the __Continuous Delivery (CD)__ and + components__ such as submitting jobs (model training, data processing, + etc.) to the aforementioned orchestration platforms or even deploy + applications. This fulfils the __Continuous Delivery (CD)__ and __Continuous Training (CT)__ portion. @@ -189,35 +266,42 @@ and streamline the process of delivering quality codebase__. 
- [Google Cloud - MLOps: Continuous delivery and automation pipelines in machine learning](https://cloud.google.com/architecture/mlops-continuous-delivery-and-automation-pipelines-in-machine-learning) - [GitLab - What is DevOps?](https://about.gitlab.com/topics/devops) +[gl-ci]: https://docs.gitlab.com/ee/ci + ### Container Image Registry AI Singapore has a strong __emphasis on containerising pipelines for the purpose of reproducibility and ease of delivery__. Images built through -CI/CD workflows or manual builds can be pushed to container image -registries, be it Google Cloud’s -[Artifact Registry](https://cloud.google.com/artifact-registry) -or AI Singapore’s on-premise [Harbor](https://goharbor.io) registry. +CI/CD workflows or manual builds can be pushed to container image +registries, be it Google Cloud’s [Artifact Registry][gcp-ar] or AI +Singapore’s on-premise [Harbor][harbor] registry. +{% if cookiecutter.platform == 'onprem' -%} ![Harbor Registry - Sample Screenshot](assets/screenshots/external/goharbor-blog-harbor-2.0-artifacts.png)

Harbor Registry

- +{% elif cookiecutter.platform == 'gcp' -%} +![GCP Artifact Registry - Sample Screenshot](assets/screenshots/gcp-artifact-registry.png) +

GCP Artifact Registry

+{% endif %} ??? info "Reference Link(s)" - [Red Hat - What is a container registry?](https://www.redhat.com/en/topics/cloud-native-apps/what-is-a-container-registry) +[gcp-ar]: https://cloud.google.com/artifact-registry +[harbor]: https://goharbor.io + ### Data Preparation Following the EDA phase, the project team would map out and work on data processing and preparation pipelines. These pipelines would __first be -developed with manual invocation__ in mind but a team can __strive towards -automating the processes__ where the pipelines can be triggered by the -CI/CD workflows that they would have defined. +developed with manual invocation__ in mind but a team can __strive +towards automating the processes__ where the pipelines can be triggered +by the CI/CD workflows that they would have defined. As the quality of data to be used for training the models is important, -components like __data preparation can be prefaced with -data validation__, where checks are done to __examine the data’s -adherence to conventions and standards__ set by the stakeholders of the -project. +components like __data preparation can be prefaced with data +validation__, where checks are done to __examine the data’s adherence +to conventions and standards__ set by the stakeholders of the project. ### Model Training & Evaluation @@ -230,25 +314,26 @@ Minimum Viable Model (MVM), a lot of experimentations would have to be done as part of the model training process. Part of such __experiments includes parameter tuning__ where a search space is iterated through to find the best set of configurations that optimises the model’s -performance or objectives. Tools like -[Optuna](https://optuna.org) can greatly assist in facilitating such -workflows. +performance or objectives. Tools like [Optuna][optuna] can greatly +assist in facilitating such workflows. 
+ +[optuna]: https://optuna.org ### Experiment & Pipeline Tracking As there would be a myriad of experiments to be carried out, __there is -a need for the configurations, results, artefacts, and any other -relevant metadata of every experiment to be logged and persisted__. +a need for the configurations, results, artefacts, and any other +relevant metadata of every experiment to be logged and persisted__. Purpose of tracking such information would __allow for easy comparison__ -of models’ performances and if there is a need to -__reproduce experiments__, relevant information can be referred back. -__With the right information__, metadata and utilisation of containers -for reproducible workflows, __pipelines can be tracked as well__. -Carrying these out would provide a team with a __model -registry__ of sorts where experiments with tagged models can be referred -to when they are to be deployed and served. +of models’ performances and if there is a need to __reproduce +experiments__, relevant information can be referred back. __With the +right information__, metadata and utilisation of containers for +reproducible workflows, __pipelines can be tracked as well__. Carrying +these out would provide a team with a __model registry__ of sorts where +experiments with tagged models can be referred to when they are to be +deployed and served. -A tool with relevant features would be [MLflow](https://mlflow.org/). +A tool with relevant features would be [MLflow][mlflow]. @@ -256,32 +341,34 @@ A tool with relevant features would be [MLflow](https://mlflow.org/). 
- [Databricks Blog - Introducing MLflow: an Open Source Machine Learning Platform](https://www.databricks.com/blog/2018/06/05/introducing-mlflow-an-open-source-machine-learning-platform.html) +[mlflow]: https://mlflow.org/ + ### Model Serving -With the models that have been trained, applications that allow for -end-users to interact with the model can be deployed on test +With the models that have been trained, applications that allow for +end-users to interact with the model can be deployed on test environments. __Deployment of models__ can be and are conventionally done by using __API frameworks__. However, not all problem statements require such frameworks and scripts for executing __batch inference might suffice in some cases__. -One of the popular Python frameworks for building APIs is -[FastAPI](https://fastapi.tiangolo.com). It is easy to pick up and has -many useful out-of-the-box features. +One of the popular Python frameworks for building APIs is [FastAPI][fapi]. +It is easy to pick up and has many useful out-of-the-box features. ??? info "Reference Link(s)" - [Ubuntu Blog - A guide to ML model serving](https://ubuntu.com/blog/guide-to-ml-model-serving) +[fapi]: https://fastapi.tiangolo.com + ## GitLab We at AI Singapore host our own GitLab server: > https://gitlab.aisingapore.net -You should -be provided with a set of credentials during onboarding for access to -the server. +You should be provided with a set of credentials during onboarding for +access to the server. In order to interact with remote Git repositories situated on AI Singapore's GitLab instance (clone, push, fetch, etc.) @@ -290,8 +377,8 @@ outside of NUS' network or GCP (regions `asia-southeast1` and ### Push & Pull with HTTPS VS SSH -The usage of either the HTTPS or SSH protocol for communicating with -the GitLab server depends on the environment in question. 
If an +The usage of either the HTTPS or SSH protocol for communicating with +the GitLab server depends on the environment in question. If an environment is made accessible by multiple developers, then HTTPS-based access where passwords are prompted for would be better fitting. SSH-based access would be more fitting for clients that are more @@ -312,15 +399,15 @@ Host gitlab.aisingapore.net - [GitLab Docs - Use SSH keys to communicate with GitLab](https://docs.gitlab.com/ee/user/ssh.html) -## Run:ai +{%- if cookiecutter.orchestrator == 'runai' -%} +## Run:AI -Run:ai is an enterprise orchestration and cluster management platform +Run:AI is an enterprise orchestration and cluster management platform that works as an abstraction layer on top of AI Singapore's hybrid infrastructure to maximise the usage of such resources. The platform -utilises [Kubernetes](https://kubernetes.io) in the backend. -Orchestration platforms such as Run:ai allows end-users to easily spin -up workloads, execute jobs, set up services or carry out any -interaction with relevant resources. +utilises [Kubernetes][k8s] in the backend. Orchestration platforms such +as Run:AI allows end-users to easily spin up workloads, execute jobs, +set up services or carry out any interaction with relevant resources. The video below provides a quick and high-level overview of that the platform's unique selling point. @@ -334,34 +421,34 @@ login page at the following link: The link above will bring you to the login page: -![AISG's Run:ai Login Page](assets/screenshots/runai-login-page.png) +![AISG's Run:AI Login Page](assets/screenshots/runai-login-page.png) To login, click on `CONTINUE WITH SSO`. You will be redirected to login -with your Azure account. After a successful login, you will -be brought to the platform's home (`Overview`) page. +with your Azure account. After a successful login, you will be brought +to the platform's home (`Overview`) page. 
-![AISG's Run:ai Home Page](assets/screenshots/runai-home-page.png) +![AISG's Run:AI Home Page](assets/screenshots/runai-home-page.png) ### Authentication While one can make use of the platform's front-end UI to interact with the Kubernetes cluster in the backend, one might be inclined towards the -programmatic approach where a CLI is to be relied on. Run:ai -provides a CLI that can be used to interact with the platform's API. +programmatic approach where a CLI is to be relied on. Run:AI provides a +CLI that can be used to interact with the platform's API. To use the CLI, you need to be authenticated. For that, you need the following: - A Kubernetes configuration file a.k.a `kubeconfig`. This is provided by the MLOps team. -- Run:ai CLI to be installed on your local machine (or any client). +- Run:AI CLI to be installed on your local machine (or any client). #### `kubeconfig` A client that intends to communicate with a Kubernetes cluster would have to rely on a configuration file called `kubeconfig`. The YAML-formatted `kubeconfig` would contain information such as cluster -endpoints, authentication details, as well as any other access +endpoints, authentication details, as well as any other access parameters. `kubeconfig` files are relied on by the `kubectl` CLI tool for information and credentials to access Kubernetes clusters. @@ -409,9 +496,9 @@ to the reference document below. - [Kubernetes Docs - Organizing Cluster Access Using kubeconfig Files](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig) -#### Run:ai CLI +#### Run:AI CLI -With the aforementioned `kubeconfig` file, we can now use the Run:ai CLI +With the aforementioned `kubeconfig` file, we can now use the Run:AI CLI for authentication. We first have to download the CLI. === "Windows" @@ -501,7 +588,11 @@ verification code and paste it into the terminal. 
the necessary authentication details, specifically the `id-token` and `refresh-token` fields, which are then used by the `kubectl` CLI tool to communicate with the Run:ai cluster. +{%- elif cookiecutter.orchestrator == 'polyaxon' -%} +{%- elif cookiecutter.orchestrator == 'noorch' -%} +{% endif %} +{%- if cookiecutter.platform == 'onprem' -%} ## Harbor AI Singapore uses a self-hosted Harbor as the on-premise container image @@ -511,10 +602,9 @@ registry. ![AI Singapore's Harbor Registry - Login Page](assets/screenshots/harbor-login-page.png) -To login, use your Azure account username without the domain -(if your username is `user@aisingapore.org`, your username in this -context will just be `user`) and the same password as your Azure -account. +To login, use your Azure account username without the domain (if your +username is `user@aisingapore.org`, your username in this context will +just be `user`) and the same password as your Azure account. On a successful login, you should be able to see a list of Harbor projects that you have access to. @@ -608,8 +698,7 @@ of the end-users. AI Singapore's ECS makes use of the S3 protocol and so we can make use of the AWS CLI's S3 commands to interact with the storage system. -Instructions for installing the AWS CLI (v2) can be found -[here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html). +Instructions for installing the AWS CLI (v2) can be found [here][awscli]. Following installation of the CLI, you would need to configure the settings to be used. The settings can be populated within separate @@ -620,8 +709,7 @@ profiles would look like the following: !!! note inline end The `aws_access_key_id` and `aws_secret_access_key` are provided by - the DataOps team. The team is reachable at - `dataops@aisingapore.org`. + the DataOps team. The team is reachable at `dataops@aisingapore.org`. 
```config [profile-1] @@ -646,34 +734,60 @@ YYYY-MM-DD hh:mm:ss bucket-2 ``` The `--endpoint-url` flag is required for the AWS CLI to know where to -send the requests to. In this case, we are sending requests to -AI Singapore's ECS server. +send the requests to. In this case, we are sending requests to AI +Singapore's ECS server. !!! note - Some buckets may be hidden when listing buckets. This is due - various access permissions that might have been set by - administrators. For some buckets, while you may not be able to list - them, you may still view the objects that are contained within them. + Some buckets may be hidden when listing buckets. This is due various + access permissions that might have been set by administrators. For + some buckets, while you may not be able to list them, you may still + view the objects that are contained within them. ??? info "Reference Link(s)" - [AWS Docs - AWS CLI Configuration and credential file settings](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html) - [AWS CLI Command Reference - s3](https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3/index.html) +[awscli]: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html + ### ECS Robot/Service Accounts Project teams may also make use of robot/service accounts to interact with ECS. Robot/service accounts are essentially IAM users that are -created by administrators. These accounts are usually created for +created by administrators. These accounts are usually created for automated workflows that require access to ECS. Configuring them for the CLI works the same as configuring a regular user account. +{%- elif cookiecutter.platform == 'gcp' -%} +## Google Container Registry + +AI Singapore's emphases on reproducibility and portability of +workflows and accompanying environments translates to heavy usage of +containerisation. 
Throughout this guide, we will be building Docker +images necessary for setting up development environments, jobs for +the various pipelines and deployment of the predictive model. + +Within the context of GCP, the [GCP Artifact Registry][gcp-ar] will be +used to store and version our Docker images. Following authorisation to +`gcloud`, you can view the image repositories of your project's registry +like so: +```bash +$ gcloud container images list --repository={{cookiecutter.registry_project_path}} +``` +You will be pushing the Docker images to the aforementioned repository. + +??? info "Reference Link(s)" + + - [`gcloud` Reference - `gcloud container images list`](https://cloud.google.com/sdk/gcloud/reference/container/images/list) + - [Artifact Registry Guide - Pushing & Pulling Images](https://cloud.google.com/artifact-registry/docs/docker/pushing-and-pulling) +{% endif %} + ## MLflow For model experimentation and tracking needs, AI Singapore mainly relies -on [MLflow](https://mlflow.org). MLflow is an open-source platform for -the machine learning lifecycle. It has several components but we will -mainly be using the Tracking server component. +on [MLflow][mlflow]. MLflow is an open-source platform for the machine +learning lifecycle. It has several components but we will mainly be +using the Tracking server component. ### Accessing Tracking Server Dashboard @@ -685,7 +799,11 @@ to make use of the MLflow Tracking server: - MLflow Tracking server URL(s) - Your own username and password for the same server(s) +{%- if cookiecutter.platform == 'onprem' -%} - _(Optional)_ ECS credentials for artifact storage +{%- elif cookiecutter.platform == 'gcp' -%} +- _(Optional)_ GCS credentials for artifact storage +{% endif %} One would be prompted for a username and password when accessing an MLflow Tracking server for the first time: @@ -721,9 +839,6 @@ MLflow Tracking server. 
$ pip install boto3==1.28.2 mlflow $ export MLFLOW_TRACKING_USERNAME= $ export MLFLOW_TRACKING_PASSWORD= - $ export MLFLOW_S3_ENDPOINT_URL="https://necs.nus.edu.sg" - $ export AWS_ACCESS_KEY_ID= - $ export AWS_SECRET_ACCESS_KEY= $ python src/mlflow_test.py ``` @@ -735,9 +850,6 @@ MLflow Tracking server. $ pip install boto3==1.28.2 mlflow $ $MLFLOW_TRACKING_USERNAME= $ $MLFLOW_TRACKING_PASSWORD= - $ $MLFLOW_S3_ENDPOINT_URL="https://necs.nus.edu.sg" - $ $AWS_ACCESS_KEY_ID= - $ $AWS_SECRET_ACCESS_KEY= $ python src/mlflow_test.py ``` diff --git a/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/04-dev-wksp.md b/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/04-dev-wksp.md index a4a61e4..1d15c07 100644 --- a/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/04-dev-wksp.md +++ b/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/04-dev-wksp.md @@ -82,7 +82,7 @@ following command: === "Linux/macOS" ```bash - $ runai exec -vscode -p {{cookiecutter.runai_proj_name}} -- cat /home/coder/.config/code-server/config.yaml + $ runai exec -vscode -p {{cookiecutter.proj_name}} -- cat /home/coder/.config/code-server/config.yaml bind-addr: 127.0.0.1:8080 auth: password password: xxxxxxxxxxxxxxxxxxxxxxxx @@ -92,7 +92,7 @@ following command: === "Windows PowerShell" ```powershell - $ runai exec -vscode -p {{cookiecutter.runai_proj_name}} -- cat /home/coder/.config/code-server/config.yaml + $ runai exec -vscode -p {{cookiecutter.proj_name}} -- cat /home/coder/.config/code-server/config.yaml bind-addr: 127.0.0.1:8080 auth: password password: xxxxxxxxxxxxxxxxxxxxxxxx @@ -139,7 +139,7 @@ persisted. 
=== "VSCode Server Terminal" ```bash - $ cd /{{cookiecutter.runai_proj_name}}_pvc/workspaces + $ cd /{{cookiecutter.proj_name}}_pvc/workspaces $ mkdir ``` @@ -160,7 +160,7 @@ Now, let's clone your repository from the remote: === "VSCode Server Terminal" ```bash - $ cd /{{cookiecutter.runai_proj_name}}/workspaces/ + $ cd /{{cookiecutter.proj_name}}/workspaces/ $ git clone $ cd {{cookiecutter.repo_name}} ``` @@ -208,20 +208,20 @@ custom image: ```bash $ docker build \ - -t {{cookiecutter.harbor_registry_project_path}}/vscode-server-custom:0.1.0 \ + -t {{cookiecutter.registry_project_path}}/vscode-server-custom:0.1.0 \ -f docker/vscode-server/vscode-server.Dockerfile \ --platform linux/amd64 . - $ docker push {{cookiecutter.harbor_registry_project_path}}/vscode-server-custom:0.1.0 + $ docker push {{cookiecutter.registry_project_path}}/vscode-server-custom:0.1.0 ``` === "Windows PowerShell" ```powershell $ docker build ` - -t {{cookiecutter.harbor_registry_project_path}}/vscode-server-custom:0.1.0 ` + -t {{cookiecutter.registry_project_path}}/vscode-server-custom:0.1.0 ` -f docker/vscode-server/vscode-server.Dockerfile ` --platform linux/amd64 . 
- $ docker push {{cookiecutter.harbor_registry_project_path}}/vscode-server-custom:0.1.0 + $ docker push {{cookiecutter.registry_project_path}}/vscode-server-custom:0.1.0 ``` ## JupyterLab @@ -278,7 +278,7 @@ following command: === "Linux/macOS" ```bash - $ runai logs -jupyterlab -p {{cookiecutter.runai_proj_name}} | grep "lab?token" + $ runai logs -jupyterlab -p {{cookiecutter.proj_name}} | grep "lab?token" [I YYYY-MM-DD hh:mm:ss ServerApp] http://-X-X:8888/lab?token=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX [I YYYY-MM-DD hh:mm:ss ServerApp] http://127.0.0.1:8888/lab?token=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX http://-X-X:8888/lab?token=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX @@ -288,7 +288,7 @@ following command: === "Windows PowerShell" ```powershell - $ runai logs -jupyterlab -p {{cookiecutter.runai_proj_name}} | Where-Object{$_ -match "lab?token"} + $ runai logs -jupyterlab -p {{cookiecutter.proj_name}} | Where-Object{$_ -match "lab?token"} [I YYYY-MM-DD hh:mm:ss ServerApp] http://-X-X:8888/lab?token=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX [I YYYY-MM-DD hh:mm:ss ServerApp] http://127.0.0.1:8888/lab?token=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX http://-X-X:8888/lab?token=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX @@ -321,20 +321,20 @@ server as well as any associated files can be found under ```bash $ docker build \ - -t {{cookiecutter.harbor_registry_project_path}}/jupyterlab-server-custom:0.1.0 \ + -t {{cookiecutter.registry_project_path}}/jupyterlab-server-custom:0.1.0 \ -f docker/jupyterlab-server/jupyterlab-server.Dockerfile \ --platform linux/amd64 . 
- $ docker push {{cookiecutter.harbor_registry_project_path}}/jupyterlab-server-custom:0.1.0 + $ docker push {{cookiecutter.registry_project_path}}/jupyterlab-server-custom:0.1.0 ``` === "Windows PowerShell" ```powershell $ docker build ` - -t {{cookiecutter.harbor_registry_project_path}}/jupyterlab-server-custom:0.1.0 ` + -t {{cookiecutter.registry_project_path}}/jupyterlab-server-custom:0.1.0 ` -f docker/jupyterlab-server/jupyterlab-server.Dockerfile ` --platform linux/amd64 . - $ docker push {{cookiecutter.harbor_registry_project_path}}/jupyterlab-server-custom:0.1.0 + $ docker push {{cookiecutter.registry_project_path}}/jupyterlab-server-custom:0.1.0 ``` ## Using Docker within Kubernetes diff --git a/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/07-job-orchestration.md b/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/07-job-orchestration.md index eaa9089..2347f0a 100644 --- a/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/07-job-orchestration.md +++ b/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/07-job-orchestration.md @@ -47,20 +47,20 @@ provided in this template: ```bash $ docker build \ - -t {{cookiecutter.harbor_registry_project_path}}/data-prep:0.1.0 \ + -t {{cookiecutter.registry_project_path}}/data-prep:0.1.0 \ -f docker/{{cookiecutter.repo_name}}-data-prep.Dockerfile \ --platform linux/amd64 . - $ docker push {{cookiecutter.harbor_registry_project_path}}/data-prep:0.1.0 + $ docker push {{cookiecutter.registry_project_path}}/data-prep:0.1.0 ``` === "Windows PowerShell" ```powershell $ docker build ` - -t {{cookiecutter.harbor_registry_project_path}}/data-prep:0.1.0 ` + -t {{cookiecutter.registry_project_path}}/data-prep:0.1.0 ` -f docker/{{cookiecutter.repo_name}}-data-prep.Dockerfile ` --platform linux/amd64 . 
- $ docker push {{cookiecutter.harbor_registry_project_path}}/data-prep:0.1.0 + $ docker push {{cookiecutter.registry_project_path}}/data-prep:0.1.0 ``` Now that we have the Docker image pushed to the registry, we can submit @@ -71,7 +71,7 @@ a job using that image to Run:ai\: ```bash $ runai submit \ --job-name-prefix -data-prep \ - -i {{cookiecutter.harbor_registry_project_path}}/data-prep:0.1.0 \ + -i {{cookiecutter.registry_project_path}}/data-prep:0.1.0 \ --working-dir //workspaces//{{cookiecutter.repo_name}} \ --pvc :/ \ --cpu 2 \ @@ -84,7 +84,7 @@ a job using that image to Run:ai\: ```powershell $ runai submit ` --job-name-prefix -data-prep ` - -i {{cookiecutter.harbor_registry_project_path}}/data-prep:0.1.0 ` + -i {{cookiecutter.registry_project_path}}/data-prep:0.1.0 ` --working-dir //workspaces//{{cookiecutter.repo_name}} ` --pvc :/ ` --cpu 2 ` @@ -143,20 +143,20 @@ we need to build the Docker image to be used for it: ```bash $ docker build \ - -t {{cookiecutter.harbor_registry_project_path}}/model-training:0.1.0 \ + -t {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ -f docker/{{cookiecutter.repo_name}}-model-training.Dockerfile \ --platform linux/amd64 . - $ docker push {{cookiecutter.harbor_registry_project_path}}/model-training:0.1.0 + $ docker push {{cookiecutter.registry_project_path}}/model-training:0.1.0 ``` === "Windows PowerShell" ```powershell $ docker build ` - -t {{cookiecutter.harbor_registry_project_path}}/model-training:0.1.0 ` + -t {{cookiecutter.registry_project_path}}/model-training:0.1.0 ` -f docker/{{cookiecutter.repo_name}}-model-training.Dockerfile ` --platform linux/amd64 . 
- $ docker push {{cookiecutter.harbor_registry_project_path}}/model-training:0.1.0 + $ docker push {{cookiecutter.registry_project_path}}/model-training:0.1.0 ``` Now that we have the Docker image pushed to the registry, @@ -167,7 +167,7 @@ we can run a job using it: ```bash $ runai submit \ --job-name-prefix -train \ - -i {{cookiecutter.harbor_registry_project_path}}/model-training:0.1.0 \ + -i {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ --working-dir /home/aisg/{{cookiecutter.repo_name}} \ --pvc :/ \ --cpu 2 \ @@ -185,7 +185,7 @@ we can run a job using it: ```powershell $ runai submit ` --job-name-prefix -train ` - -i {{cookiecutter.harbor_registry_project_path}}/model-training:0.1.0 ` + -i {{cookiecutter.registry_project_path}}/model-training:0.1.0 ` --working-dir /home/aisg/{{cookiecutter.repo_name}} ` --pvc :/ ` --cpu 2 ` @@ -333,7 +333,7 @@ by default. ```bash $ runai submit \ --job-name-prefix -train \ - -i {{cookiecutter.harbor_registry_project_path}}/model-training:0.1.0 \ + -i {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ --working-dir /home/aisg/{{cookiecutter.repo_name}} \ --pvc :/ \ --cpu 2 \ @@ -352,7 +352,7 @@ by default. ```powershell $ runai submit ` --job-name-prefix -train ` - -i {{cookiecutter.harbor_registry_project_path}}/model-training:0.1.0 ` + -i {{cookiecutter.registry_project_path}}/model-training:0.1.0 ` --working-dir /home/aisg/{{cookiecutter.repo_name}} ` --pvc :/ ` --cpu 2 ` diff --git a/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/10-cicd.md b/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/10-cicd.md index 191a10a..826ab46 100644 --- a/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/10-cicd.md +++ b/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/10-cicd.md @@ -225,8 +225,8 @@ __Reference(s):__ The template has thus far introduced a couple of Docker images relevant for the team. 
The tags for all the Docker images are listed below: -- `{{cookiecutter.harbor_registry_project_path}}/data-prep` -- `{{cookiecutter.harbor_registry_project_path}}/model-training` +- `{{cookiecutter.registry_project_path}}/data-prep` +- `{{cookiecutter.registry_project_path}}/model-training` The `build` stage aims at automating the building of these Docker images in a parallel manner. Let's look at a snippet for a single job @@ -248,7 +248,7 @@ that builds a Docker image: /kaniko/executor --context "${CI_PROJECT_DIR}" --dockerfile "${CI_PROJECT_DIR}/docker/{{cookiecutter.repo_name}}-data-prep.Dockerfile" - --destination "{{cookiecutter.harbor_registry_project_path}}/data-prep:${CI_COMMIT_SHORT_SHA}" + --destination "{{cookiecutter.registry_project_path}}/data-prep:${CI_COMMIT_SHORT_SHA}" rules: - if: $CI_MERGE_REQUEST_IID changes: @@ -325,8 +325,8 @@ to the default branch before this. script: - cat $HARBOR_ROBOT_CREDS_JSON > /root/.docker/config.json - crane auth login registry.aisingapore.net - - crane tag {{cookiecutter.harbor_registry_project_path}}/data-prep:${CI_COMMIT_SHORT_SHA} ${$CI_COMMIT_TAG} - - crane tag {{cookiecutter.harbor_registry_project_path}}/model-training:${CI_COMMIT_SHORT_SHA} ${$CI_COMMIT_TAG} + - crane tag {{cookiecutter.registry_project_path}}/data-prep:${CI_COMMIT_SHORT_SHA} ${CI_COMMIT_TAG} + - crane tag {{cookiecutter.registry_project_path}}/model-training:${CI_COMMIT_SHORT_SHA} ${CI_COMMIT_TAG} rules: - if: $CI_COMMIT_TAG && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH ... 
diff --git a/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/assets/screenshots/gcp-artifact-registry.png b/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/assets/screenshots/gcp-artifact-registry.png new file mode 100644 index 0000000..bce3a9e Binary files /dev/null and b/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/guide-for-user/assets/screenshots/gcp-artifact-registry.png differ diff --git a/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/index.md b/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/index.md index 37cdc5c..1ed9587 100644 --- a/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/index.md +++ b/{{cookiecutter.repo_name}}/aisg-context/guide-site/docs/index.md @@ -1,50 +1,51 @@ -# End-to-end Project Template (On-premise Run:ai) +# {{cookiecutter.project_name}} -![AI Singapore's Kapitan Hull EPTG Onprem Run:ai Banner](./assets/images/kapitan-hull-eptg-onprem-runai-banner.png) +![AI Singapore's Kapitan Hull EPTG Onprem Run:ai Banner](./kapitan-hull-eptg-onprem-runai-banner.png) -__Customised for `{{cookiecutter.project_name}}`__. +_{{cookiecutter.description}}_ -__Project Description:__ {{cookiecutter.description}} +__A project generated using AI Singapore's Kapitan Hull, an end-to-end +ML project template.__ This template that is also accompanied with an end-to-end guide was -generated and customised using the -following -[`cookiecutter`](https://cookiecutter.readthedocs.io/en/stable/) +generated and customised using the following [`cookiecutter`][ccutter] template: > https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai -This `mkdocs` site is for serving the contents of the end-to-end -guide in a more readable manner, as opposed to plain -Markdown views. 
The contents of this guide have been customised -according to the inputs provided upon generation of this repository -through the usage of the `cookiecutter` CLI, -following instructions detailed -[here](https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/README.md) -. +This `mkdocs` site is for serving the contents of the end-to-end guide +in a more readable manner, as opposed to plain Markdown views. The +contents of this guide have been customised according to the inputs +provided upon generation of this repository through the usage of the +`cookiecutter` CLI, following instructions detailed [here][kh-readme]. -Inputs provided to `cookiecutter` for the generation of this -template: +Inputs provided to `cookiecutter` for the generation of this template: - __`project_name`:__ {{cookiecutter.project_name}} - __`description`:__ {{cookiecutter.description}} - __`repo_name`:__ {{cookiecutter.repo_name}} - __`src_package_name`:__ {{cookiecutter.src_package_name}} - __`src_package_name_short`:__ {{cookiecutter.src_package_name_short}} -- __`harbor_registry_project_path`:__ {{cookiecutter.harbor_registry_project_path}} +- __`platform`:__ {{cookiecutter.platform}} +- __`orchestrator`:__ {{cookiecutter.orchestrator}} +- __`proj_name`:__ {{cookiecutter.proj_name}} +- __`registry_project_path`:__ {{cookiecutter.registry_project_path}} - __`author_name`:__ {{cookiecutter.author_name}} +[ccutter]: https://cookiecutter.readthedocs.io/en/stable/ +[kh-readme]: https://github.com/aisingapore/ml-project-cookiecutter-onprem-runai/blob/main/README.md + ## Overview For User Guide -1. [Prerequisites](./guide-for-user/01-prerequisites.md) -2. [Preface](./guide-for-user/02-preface.md) -3. [MLOps Components & Platform](./guide-for-user/03-mlops-components-platform.md) -4. [Developer Workspace](guide-for-user/04-dev-wksp.md) -5. [Virtual Environment](./guide-for-user/05-virtual-env.md) -6. 
[Data Storage & Versioning](./guide-for-user/06-data-storage-versioning.md) -7. [Job Orchestration](./guide-for-user/07-job-orchestration.md) -8. [Deployment](./guide-for-user/08-deployment.md) -9. [Batch Inferencing](./guide-for-user/09-batch-inferencing.md) +1. [Prerequisites](./guide-for-user/01-prerequisites.md) +2. [Preface](./guide-for-user/02-preface.md) +3. [MLOps Components & Platform](./guide-for-user/03-mlops-components-platform.md) +4. [Developer Workspace](guide-for-user/04-dev-wksp.md) +5. [Virtual Environment](./guide-for-user/05-virtual-env.md) +6. [Data Storage & Versioning](./guide-for-user/06-data-storage-versioning.md) +7. [Job Orchestration](./guide-for-user/07-job-orchestration.md) +8. [Deployment](./guide-for-user/08-deployment.md) +9. [Batch Inferencing](./guide-for-user/09-batch-inferencing.md) 10. [Continuous Integration & Deployment](./guide-for-user/10-cicd.md) 11. [Documentation](./guide-for-user/11-documentation.md) @@ -56,7 +57,12 @@ template: │ │ for works within the context of AISG's │ │ development environments. │ └── guide-site <- Files relevant for spinning up the `mkdocs` -│ site to view the end-to-end guide. +│ │ site to view the end-to-end guide. +{% if cookiecutter.orchestrator == 'runai' -%} +│ └── runai <- RunAI YAML scripts. +{% elif cookiecutter.orchestrator == 'polyaxon' -%} +{% elif cookiecutter.orchestrator == 'noorch' -%} +{% endif -%} ├── conf <- Configuration files associated with the │ various pipelines as well as for logging. 
├── data <- Folder to contain any data for the various diff --git a/{{cookiecutter.repo_name}}/aisg-context/runai/00-pvc.yaml b/{{cookiecutter.repo_name}}/aisg-context/runai/00-pvc.yaml new file mode 100644 index 0000000..7b7a91a --- /dev/null +++ b/{{cookiecutter.repo_name}}/aisg-context/runai/00-pvc.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{cookiecutter.proj_name.replace('_', '-')}}-pvc + namespace: runai-{{cookiecutter.proj_name.replace('_', '-')}} +spec: +{%- if cookiecutter.platform == 'onprem' %} + storageClassName: nfs-client +{%- elif cookiecutter.platform == 'gcp' %} + storageClassName: filestore-standard +{% endif %} + accessModes: + - ReadWriteMany + resources: + requests: + storage: 1Ti \ No newline at end of file diff --git a/{{cookiecutter.repo_name}}/aisg-context/runai/01-workspace-prep.yml b/{{cookiecutter.repo_name}}/aisg-context/runai/01-workspace-prep.yml new file mode 100644 index 0000000..03ab135 --- /dev/null +++ b/{{cookiecutter.repo_name}}/aisg-context/runai/01-workspace-prep.yml @@ -0,0 +1,42 @@ +apiVersion: run.ai/v2alpha1 +kind: TrainingWorkload +metadata: + name: {{cookiecutter.author_name.replace('_', '-')}}-workspace-prep + namespace: runai-{{cookiecutter.proj_name.replace('_', '-')}} +spec: + environment: + items: + WORKSPACE_NAME: + value: {{cookiecutter.author_name.replace('_', '-')}} + name: + value: {{cookiecutter.author_name.replace('_', '-')}}-workspace-prep + image: + value: alpine + imagePullPolicy: + value: IfNotPresent + command: + value: >- + /bin/sh -c "mkdir -p workspaces/$WORKSPACE_NAME && + cd workspaces/$WORKSPACE_NAME && + chown -R 2222:2222 .;" + workingDir: + value: /{{cookiecutter.proj_name.replace('_', '-')}}-pvc + cpu: + value: '0.5' + cpuLimit: + value: '0.5' + memory: + value: 1G + memoryLimit: + value: 1G + pvcs: + items: + pvc-{{cookiecutter.proj_name.replace('_', '-')}}-pvc: + value: + claimName: {{cookiecutter.proj_name.replace('_', '-')}}-pvc + existingPvc: 
true + path: /{{cookiecutter.proj_name.replace('_', '-')}}-pvc + readOnly: false + readWriteMany: true + username: + value: {{cookiecutter.author_name}} diff --git a/{{cookiecutter.repo_name}}/aisg-context/runai/02-vscode.yml b/{{cookiecutter.repo_name}}/aisg-context/runai/02-vscode.yml new file mode 100644 index 0000000..b1a431e --- /dev/null +++ b/{{cookiecutter.repo_name}}/aisg-context/runai/02-vscode.yml @@ -0,0 +1,59 @@ +apiVersion: run.ai/v2alpha1 +kind: InteractiveWorkload +metadata: + name: {{cookiecutter.author_name.replace('_', '-')}}-vscode + namespace: runai-{{cookiecutter.proj_name.replace('_', '-')}} +spec: + name: + value: {{cookiecutter.author_name.replace('_', '-')}}-vscode + image: +{%- if cookiecutter.platform == 'onprem' %} + value: registry.aisingapore.net/mlops-pub/kapitan-hull/code-server:0.2.0 +{%- elif cookiecutter.platform == 'gcp' %} + value: asia-southeast1-docker.pkg.dev/machine-learning-ops/pub-images/code-server:0.2.0 +{% endif %} + imagePullPolicy: + value: Always + command: + value: >- + /bin/sh -c " + mkdir -p /{{cookiecutter.proj_name.replace('_', '-')}}-pvc/workspaces/{{cookiecutter.author_name.replace('_', '-')}}/vscode/local && + mkdir -p /{{cookiecutter.proj_name.replace('_', '-')}}-pvc/workspaces/{{cookiecutter.author_name.replace('_', '-')}}/vscode/config && + rm -rf ~/.local ~/.config && + ln -sf /{{cookiecutter.proj_name.replace('_', '-')}}-pvc/workspaces/{{cookiecutter.author_name.replace('_', '-')}}/vscode/local ~/.local && + ln -sf /{{cookiecutter.proj_name.replace('_', '-')}}-pvc/workspaces/{{cookiecutter.author_name.replace('_', '-')}}/vscode/config ~/.config && + ln -sf /{{cookiecutter.proj_name.replace('_', '-')}}-pvc/workspaces/{{cookiecutter.author_name.replace('_', '-')}} ~/workspace && + code-server --bind-addr 0.0.0.0:8080 --disable-telemetry ." 
+ cpu: + value: '2' + cpuLimit: + value: '4' + memory: + value: 4G + memoryLimit: + value: 8G + gpu: + value: '0' + environment: + items: + RUNAI_JOB_NAME: + value: ${RUNAI_JOB_NAME} + RUNAI_PROJECT: + value: ${RUNAI_PROJECT} + exposedUrls: + items: + url-0: + value: + containerPort: 8080 + customUrl: false + pvcs: + items: + pvc-{{cookiecutter.proj_name.replace('_', '-')}}-pvc: + value: + claimName: {{cookiecutter.proj_name.replace('_', '-')}}-pvc + existingPvc: true + path: /{{cookiecutter.proj_name.replace('_', '-')}}-pvc + readOnly: false + readWriteMany: true + username: + value: {{cookiecutter.author_name}} \ No newline at end of file diff --git a/{{cookiecutter.repo_name}}/aisg-context/runai/jupyterlab.yml b/{{cookiecutter.repo_name}}/aisg-context/runai/02b-jupyterlab.yml similarity index 60% rename from {{cookiecutter.repo_name}}/aisg-context/runai/jupyterlab.yml rename to {{cookiecutter.repo_name}}/aisg-context/runai/02b-jupyterlab.yml index 8d8aa7e..b30ea4e 100644 --- a/{{cookiecutter.repo_name}}/aisg-context/runai/jupyterlab.yml +++ b/{{cookiecutter.repo_name}}/aisg-context/runai/02b-jupyterlab.yml @@ -2,12 +2,16 @@ apiVersion: run.ai/v2alpha1 kind: InteractiveWorkload metadata: name: {{cookiecutter.author_name.replace('_', '-')}}-jupyterlab - namespace: runai-{{cookiecutter.runai_proj_name.replace('_', '-')}} + namespace: runai-{{cookiecutter.proj_name.replace('_', '-')}} spec: name: value: {{cookiecutter.author_name.replace('_', '-')}}-jupyterlab image: - value: registry.aisingapore.net/runai/workspaces/jupyterlab-server:0.1.0 +{%- if cookiecutter.platform == 'onprem' %} + value: registry.aisingapore.net/mlops-pub/kapitan-hull/jupyterlab-server:0.2.0 +{%- elif cookiecutter.platform == 'gcp' %} + value: asia-southeast1-docker.pkg.dev/machine-learning-ops/pub-images/jupyterlab-server:0.2.0 +{% endif %} imagePullPolicy: value: Always command: @@ -38,14 +42,12 @@ spec: customUrl: false pvcs: items: - pvc-{{cookiecutter.runai_proj_name}}-pvc: + 
pvc-{{cookiecutter.proj_name.replace('_', '-')}}-pvc: value: - claimName: {{cookiecutter.runai_proj_name}}-pvc - existingPvc: false - path: /{{cookiecutter.runai_proj_name}}-pvc + claimName: {{cookiecutter.proj_name.replace('_', '-')}}-pvc + existingPvc: true + path: /{{cookiecutter.proj_name.replace('_', '-')}}-pvc readOnly: false readWriteMany: true - size: 1000G - storageClass: nfs-client username: value: {{cookiecutter.author_name}} \ No newline at end of file diff --git a/{{cookiecutter.repo_name}}/aisg-context/runai/03-repo-download.yml b/{{cookiecutter.repo_name}}/aisg-context/runai/03-repo-download.yml new file mode 100644 index 0000000..7e42f23 --- /dev/null +++ b/{{cookiecutter.repo_name}}/aisg-context/runai/03-repo-download.yml @@ -0,0 +1,50 @@ +apiVersion: run.ai/v2alpha1 +kind: TrainingWorkload +metadata: + name: {{cookiecutter.author_name.replace('_', '-')}}-repo-download + namespace: runai-{{cookiecutter.proj_name.replace('_', '-')}} +spec: + environment: + items: + GITLAB_USERNAME: + value: SECRET:gitlab-ro-credentials,username + GITLAB_PASSWORD: + value: SECRET:gitlab-ro-credentials,password + GITLAB_URL: + value: SECRET:gitlab-ro-credentials,url + name: + value: {{cookiecutter.author_name.replace('_', '-')}}-repo-download + image: + value: alpine/git + imagePullPolicy: + value: IfNotPresent + command: + value: >- + /bin/sh -c + "git clone https://$GITLAB_USERNAME:$GITLAB_PASSWORD@$GITLAB_URL.git && + chown -R 2222:2222 {{cookiecutter.repo_name}}" + runAsUid: + value: 2222 + runAsGid: + value: 2222 + workingDir: + value: /{{cookiecutter.proj_name.replace('_', '-')}}-pvc/workspaces/{{cookiecutter.author_name.replace('_', '-')}} + cpu: + value: '0.5' + cpuLimit: + value: '0.5' + memory: + value: 1G + memoryLimit: + value: 1G + pvcs: + items: + pvc-{{cookiecutter.proj_name.replace('_', '-')}}-pvc: + value: + claimName: {{cookiecutter.proj_name.replace('_', '-')}}-pvc + existingPvc: true + path: /{{cookiecutter.proj_name.replace('_', '-')}}-pvc + 
readOnly: false + readWriteMany: true + username: + value: {{cookiecutter.author_name}} \ No newline at end of file diff --git a/{{cookiecutter.repo_name}}/aisg-context/runai/03b-data-download.yml b/{{cookiecutter.repo_name}}/aisg-context/runai/03b-data-download.yml new file mode 100644 index 0000000..6d69e0f --- /dev/null +++ b/{{cookiecutter.repo_name}}/aisg-context/runai/03b-data-download.yml @@ -0,0 +1,50 @@ +apiVersion: run.ai/v2alpha1 +kind: TrainingWorkload +metadata: + name: {{cookiecutter.author_name.replace('_', '-')}}-data-download + namespace: runai-{{cookiecutter.proj_name.replace('_', '-')}} +spec: + environment: + items: + WORKSPACE_NAME: + value: {{cookiecutter.author_name.replace('_', '-')}} + name: + value: {{cookiecutter.author_name.replace('_', '-')}}-data-download + image: + value: alpine + imagePullPolicy: + value: IfNotPresent + command: + # Change your values here to download other data sources according + # to your project needs. + value: >- + /bin/sh -c " + mkdir -p workspaces/$WORKSPACE_NAME/data && + cd workspaces/$WORKSPACE_NAME/data && + echo Moved to $(pwd) && + wget https://storage.googleapis.com/aisg-mlops-pub-data/kapitan-hull/mnist-pngs-data-aisg.zip && + unzip mnist-pngs-data-aisg.zip && + chown -R 2222:2222 ." 
+ workingDir: + value: /{{cookiecutter.proj_name.replace('_', '-')}}-pvc + cpu: + value: '0.5' + cpuLimit: + value: '0.5' + memory: + value: 1G + memoryLimit: + value: 1G + pvcs: + items: + pvc-{{cookiecutter.proj_name.replace('_', '-')}}-pvc: + value: + claimName: {{cookiecutter.proj_name.replace('_', '-')}}-pvc + existingPvc: true + path: /{{cookiecutter.proj_name.replace('_', '-')}}-pvc + readOnly: false + readWriteMany: true + username: + value: {{cookiecutter.author_name}} \ No newline at end of file diff --git a/{{cookiecutter.repo_name}}/aisg-context/runai/04-docker-build-dataprep.yml b/{{cookiecutter.repo_name}}/aisg-context/runai/04-docker-build-dataprep.yml new file mode 100644 index 0000000..95c95c6 --- /dev/null +++ b/{{cookiecutter.repo_name}}/aisg-context/runai/04-docker-build-dataprep.yml @@ -0,0 +1,53 @@ +apiVersion: run.ai/v2alpha1 +kind: TrainingWorkload +metadata: + name: {{cookiecutter.author_name.replace('_', '-')}}-build-dataprep + namespace: runai-{{cookiecutter.proj_name.replace('_', '-')}} +spec: + environment: + items: +{%- if cookiecutter.platform == 'onprem' %} + HARBOR_ROBOT_CREDS_JSON: + value: SECRET:harbor-credentials,.dockerconfigjson +{%- elif cookiecutter.platform == 'gcp' %} + SA_CREDENTIALS: + value: SECRET:gcp-sa-credentials,gcp-service-account.json + GOOGLE_APPLICATION_CREDENTIALS: + value: /kaniko/config.json +{% endif %} + name: + value: {{cookiecutter.author_name.replace('_', '-')}}-build-dataprep + image: + value: gcr.io/kaniko-project/executor:debug + imagePullPolicy: + value: IfNotPresent + command: + value: >- + /bin/sh -c + "mkdir -p /kaniko/.docker; +{%- if cookiecutter.platform == 'onprem' %} + echo $HARBOR_ROBOT_CREDS_JSON > /kaniko/.docker/config.json; +{%- elif cookiecutter.platform == 'gcp' %} + echo $SA_CREDENTIALS > $GOOGLE_APPLICATION_CREDENTIALS; +{% endif %} + /kaniko/executor + --context . 
+ --dockerfile ./docker/{{cookiecutter.repo_name}}-data-prep.Dockerfile + --destination {{cookiecutter.registry_project_path}}/data-prep:runai-yaml-build" + workingDir: + value: /{{cookiecutter.proj_name.replace('_', '-')}}-pvc/workspaces/{{cookiecutter.author_name.replace('_', '-')}}/{{cookiecutter.repo_name}} + cpu: + value: '4' + memory: + value: 6G + pvcs: + items: + pvc-{{cookiecutter.proj_name.replace('_', '-')}}-pvc: + value: + claimName: {{cookiecutter.proj_name.replace('_', '-')}}-pvc + existingPvc: true + path: /{{cookiecutter.proj_name.replace('_', '-')}}-pvc + readOnly: false + readWriteMany: true + username: + value: {{cookiecutter.author_name}} \ No newline at end of file diff --git a/{{cookiecutter.repo_name}}/aisg-context/runai/05-dataprep.yml b/{{cookiecutter.repo_name}}/aisg-context/runai/05-dataprep.yml new file mode 100644 index 0000000..a50906f --- /dev/null +++ b/{{cookiecutter.repo_name}}/aisg-context/runai/05-dataprep.yml @@ -0,0 +1,42 @@ +apiVersion: run.ai/v2alpha1 +kind: TrainingWorkload +metadata: + name: {{cookiecutter.author_name.replace('_', '-')}}-data-prep + namespace: runai-{{cookiecutter.proj_name.replace('_', '-')}} +spec: + environment: + items: + WORKSPACE_NAME: + value: {{cookiecutter.author_name.replace('_', '-')}} + name: + value: {{cookiecutter.author_name.replace('_', '-')}}-data-prep + image: + value: {{cookiecutter.registry_project_path}}/data-prep:runai-yaml-build + imagePullPolicy: + value: Always + command: + value: >- + /bin/bash -c "source activate {{cookiecutter.repo_name}} && python src/process_data.py + process_data.raw_data_dir_path=../data/mnist-pngs-data-aisg + process_data.processed_data_dir_path=../data/processed/mnist-pngs-data-aisg-processed" + workingDir: + value: /{{cookiecutter.proj_name.replace('_', '-')}}-pvc/workspaces/{{cookiecutter.author_name.replace('_', '-')}}/{{cookiecutter.repo_name}} + cpu: + value: '2' + cpuLimit: + value: '2' + memory: + value: 4G + memoryLimit: + value: 4G + pvcs: + items: 
+ pvc-{{cookiecutter.proj_name.replace('_', '-')}}-pvc: + value: + claimName: {{cookiecutter.proj_name.replace('_', '-')}}-pvc + existingPvc: true + path: /{{cookiecutter.proj_name.replace('_', '-')}}-pvc + readOnly: false + readWriteMany: true + username: + value: {{cookiecutter.author_name}} diff --git a/{{cookiecutter.repo_name}}/aisg-context/runai/06-docker-build-modeltraining.yml b/{{cookiecutter.repo_name}}/aisg-context/runai/06-docker-build-modeltraining.yml new file mode 100644 index 0000000..69c6a11 --- /dev/null +++ b/{{cookiecutter.repo_name}}/aisg-context/runai/06-docker-build-modeltraining.yml @@ -0,0 +1,53 @@ +apiVersion: run.ai/v2alpha1 +kind: TrainingWorkload +metadata: + name: {{cookiecutter.author_name.replace('_', '-')}}-build-modeltraining + namespace: runai-{{cookiecutter.proj_name.replace('_', '-')}} +spec: + environment: + items: +{%- if cookiecutter.platform == 'onprem' %} + HARBOR_ROBOT_CREDS_JSON: + value: SECRET:harbor-credentials,.dockerconfigjson +{%- elif cookiecutter.platform == 'gcp' %} + SA_CREDENTIALS: + value: SECRET:gcp-sa-credentials,gcp-service-account.json + GOOGLE_APPLICATION_CREDENTIALS: + value: /kaniko/config.json +{% endif %} + name: + value: {{cookiecutter.author_name.replace('_', '-')}}-build-modeltraining + image: + value: gcr.io/kaniko-project/executor:debug + imagePullPolicy: + value: IfNotPresent + command: + value: >- + /bin/sh -c + "mkdir -p /kaniko/.docker; +{%- if cookiecutter.platform == 'onprem' %} + echo $HARBOR_ROBOT_CREDS_JSON > /kaniko/.docker/config.json; +{%- elif cookiecutter.platform == 'gcp' %} + echo $SA_CREDENTIALS > $GOOGLE_APPLICATION_CREDENTIALS; +{% endif %} + /kaniko/executor + --context . 
+ --dockerfile ./docker/{{cookiecutter.repo_name}}-model-training.Dockerfile + --destination {{cookiecutter.registry_project_path}}/model-training:runai-yaml-build" + workingDir: + value: /{{cookiecutter.proj_name.replace('_', '-')}}-pvc/workspaces/{{cookiecutter.author_name.replace('_', '-')}}/{{cookiecutter.repo_name}} + cpu: + value: '4' + memory: + value: 6G + pvcs: + items: + pvc-{{cookiecutter.proj_name.replace('_', '-')}}-pvc: + value: + claimName: {{cookiecutter.proj_name.replace('_', '-')}}-pvc + existingPvc: true + path: /{{cookiecutter.proj_name.replace('_', '-')}}-pvc + readOnly: false + readWriteMany: true + username: + value: {{cookiecutter.author_name}} \ No newline at end of file diff --git a/{{cookiecutter.repo_name}}/aisg-context/runai/07-modeltraining.yml b/{{cookiecutter.repo_name}}/aisg-context/runai/07-modeltraining.yml new file mode 100644 index 0000000..3704d48 --- /dev/null +++ b/{{cookiecutter.repo_name}}/aisg-context/runai/07-modeltraining.yml @@ -0,0 +1,53 @@ +apiVersion: run.ai/v2alpha1 +kind: TrainingWorkload +metadata: + name: {{cookiecutter.author_name.replace('_', '-')}}-train + namespace: runai-{{cookiecutter.proj_name.replace('_', '-')}} +spec: + environment: + items: + WORKSPACE_NAME: + value: {{cookiecutter.author_name.replace('_', '-')}} + MLFLOW_EXP_NAME: + value: {{cookiecutter.proj_name.replace('_', '-')}} + MLFLOW_TRACKING_URI: + value: SECRET:mlflow-credentials,url + MLFLOW_TRACKING_USERNAME: + value: SECRET:mlflow-credentials,username + MLFLOW_TRACKING_PASSWORD: + value: SECRET:mlflow-credentials,password + name: + value: {{cookiecutter.author_name.replace('_', '-')}}-train + image: + value: {{cookiecutter.registry_project_path}}/model-training:runai-yaml-build + imagePullPolicy: + value: Always + command: + value: >- + /bin/bash -c "source activate {{cookiecutter.repo_name}} && python src/train_model.py + train_model.data_dir_path=../data/processed/mnist-pngs-data-aisg-processed + 
train_model.model_checkpoint_dir_path=./models + train_model.setup_mlflow=true train_model.epochs=3 + train_model.mlflow_tracking_uri=$MLFLOW_TRACKING_URI + train_model.mlflow_exp_name=$MLFLOW_EXP_NAME" + workingDir: + value: /{{cookiecutter.proj_name.replace('_', '-')}}-pvc/workspaces/{{cookiecutter.author_name.replace('_', '-')}}/{{cookiecutter.repo_name}} + cpu: + value: '2' + cpuLimit: + value: '2' + memory: + value: 4G + memoryLimit: + value: 4G + pvcs: + items: + pvc-{{cookiecutter.proj_name.replace('_', '-')}}-pvc: + value: + claimName: {{cookiecutter.proj_name.replace('_', '-')}}-pvc + existingPvc: true + path: /{{cookiecutter.proj_name.replace('_', '-')}}-pvc + readOnly: false + readWriteMany: true + username: + value: {{cookiecutter.author_name}} \ No newline at end of file diff --git a/{{cookiecutter.repo_name}}/aisg-context/runai/modeltraining-hp.yml b/{{cookiecutter.repo_name}}/aisg-context/runai/08-modeltraining-hp.yml similarity index 53% rename from {{cookiecutter.repo_name}}/aisg-context/runai/modeltraining-hp.yml rename to {{cookiecutter.repo_name}}/aisg-context/runai/08-modeltraining-hp.yml index 3d49755..d038ca6 100644 --- a/{{cookiecutter.repo_name}}/aisg-context/runai/modeltraining-hp.yml +++ b/{{cookiecutter.repo_name}}/aisg-context/runai/08-modeltraining-hp.yml @@ -2,22 +2,16 @@ apiVersion: run.ai/v2alpha1 kind: TrainingWorkload metadata: name: {{cookiecutter.author_name.replace('_', '-')}}-hp-train - namespace: runai-{{cookiecutter.runai_proj_name.replace('_', '-')}} + namespace: runai-{{cookiecutter.proj_name.replace('_', '-')}} spec: environment: items: WORKSPACE_NAME: - value: - MLFLOW_TRACKING_URI: - value: + value: {{cookiecutter.author_name.replace('_', '-')}} MLFLOW_EXP_NAME: value: {{cookiecutter.author_name}}-{{cookiecutter.repo_name}} - AWS_ACCESS_KEY_ID: - value: SECRET:s3-credentials,accessKeyId - AWS_SECRET_ACCESS_KEY: - value: SECRET:s3-credentials,secretAccessKey - MLFLOW_S3_ENDPOINT_URL: - value: https://necs.nus.edu.sg + MLFLOW_TRACKING_URI: 
+ value: SECRET:mlflow-credentials,url MLFLOW_TRACKING_USERNAME: value: SECRET:mlflow-admin-credentials,username MLFLOW_TRACKING_PASSWORD: @@ -25,19 +19,19 @@ spec: name: value: {{cookiecutter.author_name.replace('_', '-')}}-hp-train image: - value: {{cookiecutter.harbor_registry_project_path}}/model-training:0.1.0 + value: {{cookiecutter.registry_project_path}}/model-training:runai-yaml-build imagePullPolicy: value: Always command: value: >- /bin/bash -c "export MLFLOW_HPTUNING_TAG=$(date +%s) && source activate {{cookiecutter.repo_name}} && python src/train_model.py --multirun - train_model.data_dir_path=/{{cookiecutter.runai_proj_name}}-pvc/workspaces/$WORKSPACE_NAME/data/processed/mnist-pngs-data-aisg-processed - train_model.model_checkpoint_dir_path=/{{cookiecutter.runai_proj_name}}-pvc/workspaces/$WORKSPACE_NAME/{{cookiecutter.repo_name}}/models + train_model.data_dir_path=../data/processed/mnist-pngs-data-aisg-processed + train_model.model_checkpoint_dir_path=./models train_model.setup_mlflow=true train_model.epochs=3 train_model.mlflow_tracking_uri=$MLFLOW_TRACKING_URI train_model.mlflow_exp_name=$MLFLOW_EXP_NAME" workingDir: - value: /{{cookiecutter.runai_proj_name}}-pvc/workspaces/$WORKSPACE_NAME/{{cookiecutter.repo_name}} + value: /{{cookiecutter.proj_name}}-pvc/workspaces/{{cookiecutter.author_name.replace('_', '-')}}/{{cookiecutter.repo_name}} cpu: value: '2' cpuLimit: @@ -48,14 +42,12 @@ spec: value: 4G pvcs: items: - pvc-0: + pvc-{{cookiecutter.proj_name.replace('_', '-')}}-pvc: value: - claimName: {{cookiecutter.runai_proj_name}}-pvc - existingPvc: false - path: /{{cookiecutter.runai_proj_name}}-pvc + claimName: {{cookiecutter.proj_name.replace('_', '-')}}-pvc + existingPvc: true + path: /{{cookiecutter.proj_name.replace('_', '-')}}-pvc readOnly: false readWriteMany: true - size: 1000G - storageClass: nfs-client username: value: {{cookiecutter.author_name}} \ No newline at end of file diff --git 
a/{{cookiecutter.repo_name}}/aisg-context/runai/dataprep.yml b/{{cookiecutter.repo_name}}/aisg-context/runai/dataprep.yml deleted file mode 100644 index a26faff..0000000 --- a/{{cookiecutter.repo_name}}/aisg-context/runai/dataprep.yml +++ /dev/null @@ -1,44 +0,0 @@ -apiVersion: run.ai/v2alpha1 -kind: TrainingWorkload -metadata: - name: {{cookiecutter.author_name.replace('_', '-')}}-data-prep - namespace: runai-{{cookiecutter.runai_proj_name.replace('_', '-')}} -spec: - environment: - items: - WORKSPACE_NAME: - value: - name: - value: {{cookiecutter.author_name.replace('_', '-')}}-data-prep - image: - value: {{cookiecutter.harbor_registry_project_path}}/data-prep:0.1.0 - imagePullPolicy: - value: Always - command: - value: >- - /bin/bash -c "source activate {{cookiecutter.repo_name}} && python src/process_data.py - process_data.raw_data_dir_path=/{{cookiecutter.runai_proj_name}}-pvc/workspaces/$WORKSPACE_NAME/data/mnist-pngs-data-aisg - process_data.processed_data_dir_path=/{{cookiecutter.runai_proj_name}}-pvc/workspaces/$WORKSPACE_NAME/data/processed/mnist-pngs-data-aisg-processed" - workingDir: - value: /{{cookiecutter.runai_proj_name}}-pvc/workspaces/$WORKSPACE_NAME/{{cookiecutter.repo_name}} - cpu: - value: '2' - cpuLimit: - value: '2' - memory: - value: 4G - memoryLimit: - value: 4G - pvcs: - items: - pvc-{{cookiecutter.runai_proj_name}}-pvc: - value: - claimName: {{cookiecutter.runai_proj_name}}-pvc - existingPvc: false - path: /{{cookiecutter.runai_proj_name}}-pvc - readOnly: false - readWriteMany: true - size: 1000G - storageClass: nfs-client - username: - value: {{cookiecutter.author_name}} diff --git a/{{cookiecutter.repo_name}}/aisg-context/runai/modeltraining.yml b/{{cookiecutter.repo_name}}/aisg-context/runai/modeltraining.yml deleted file mode 100644 index e69cf0a..0000000 --- a/{{cookiecutter.repo_name}}/aisg-context/runai/modeltraining.yml +++ /dev/null @@ -1,61 +0,0 @@ -apiVersion: run.ai/v2alpha1 -kind: TrainingWorkload -metadata: - name: 
{{cookiecutter.author_name.replace('_', '-')}}-train - namespace: runai-{{cookiecutter.runai_proj_name.replace('_', '-')}} -spec: - environment: - items: - WORKSPACE_NAME: - value: - MLFLOW_TRACKING_URI: - value: - MLFLOW_EXP_NAME: - value: {{cookiecutter.author_name}}-{{cookiecutter.repo_name}} - AWS_ACCESS_KEY_ID: - value: SECRET:s3-credentials,accessKeyId - AWS_SECRET_ACCESS_KEY: - value: SECRET:s3-credentials,secretAccessKey - MLFLOW_S3_ENDPOINT_URL: - value: https://necs.nus.edu.sg - MLFLOW_TRACKING_USERNAME: - value: SECRET:mlflow-admin-credentials,username - MLFLOW_TRACKING_PASSWORD: - value: SECRET:mlflow-admin-credentials,password - name: - value: {{cookiecutter.author_name.replace('_', '-')}}-train - image: - value: {{cookiecutter.harbor_registry_project_path}}/model-training:0.1.0 - imagePullPolicy: - value: Always - command: - value: >- - /bin/bash -c "source activate {{cookiecutter.repo_name}} && python src/train_model.py - train_model.data_dir_path=/{{cookiecutter.runai_proj_name}}-pvc/workspaces/$WORKSPACE_NAME/data/processed/mnist-pngs-data-aisg-processed - train_model.model_checkpoint_dir_path=/{{cookiecutter.runai_proj_name}}-pvc/workspaces/$WORKSPACE_NAME/{{cookiecutter.repo_name}}/models - train_model.setup_mlflow=true train_model.epochs=3 - train_model.mlflow_tracking_uri=$MLFLOW_TRACKING_URI - train_model.mlflow_exp_name=$MLFLOW_EXP_NAME" - workingDir: - value: /{{cookiecutter.runai_proj_name}}-pvc/workspaces/$WORKSPACE_NAME/{{cookiecutter.repo_name}} - cpu: - value: '2' - cpuLimit: - value: '2' - memory: - value: 4G - memoryLimit: - value: 4G - pvcs: - items: - pvc-0: - value: - claimName: {{cookiecutter.runai_proj_name}}-pvc - existingPvc: false - path: /{{cookiecutter.runai_proj_name}}-pvc - readOnly: false - readWriteMany: true - size: 1000G - storageClass: nfs-client - username: - value: {{cookiecutter.author_name}} \ No newline at end of file diff --git a/{{cookiecutter.repo_name}}/aisg-context/runai/vscode.yml 
b/{{cookiecutter.repo_name}}/aisg-context/runai/vscode.yml deleted file mode 100644 index f6e9217..0000000 --- a/{{cookiecutter.repo_name}}/aisg-context/runai/vscode.yml +++ /dev/null @@ -1,49 +0,0 @@ -apiVersion: run.ai/v2alpha1 -kind: InteractiveWorkload -metadata: - name: {{cookiecutter.author_name.replace('_', '-')}}-vscode - namespace: runai-{{cookiecutter.runai_proj_name.replace('_', '-')}} -spec: - name: - value: {{cookiecutter.author_name.replace('_', '-')}}-vscode - image: - value: registry.aisingapore.net/runai/workspaces/code-server:v4.16.1-0.1.0 - imagePullPolicy: - value: Always - arguments: - value: '--bind-addr 0.0.0.0:8080 --auth none --disable-telemetry .' - cpu: - value: '2' - cpuLimit: - value: '4' - memory: - value: 4G - memoryLimit: - value: 8G - gpu: - value: '0' - environment: - items: - RUNAI_JOB_NAME: - value: ${RUNAI_JOB_NAME} - RUNAI_PROJECT: - value: ${RUNAI_PROJECT} - exposedUrls: - items: - url-0: - value: - containerPort: 8080 - customUrl: false - pvcs: - items: - pvc-{{cookiecutter.runai_proj_name}}-pvc: - value: - claimName: {{cookiecutter.runai_proj_name}}-pvc - existingPvc: false - path: /{{cookiecutter.runai_proj_name}}-pvc - readOnly: false - readWriteMany: true - size: 1000G - storageClass: nfs-client - username: - value: {{cookiecutter.author_name}} \ No newline at end of file