From 93ed74ef8f2dafd157bcf6dc1d120bf4e60eb4c0 Mon Sep 17 00:00:00 2001 From: Syakyr Surani Date: Mon, 24 Jun 2024 18:01:01 +0800 Subject: [PATCH] Reorganise job orchestration page --- .../guide-for-user/07-job-orchestration.md | 948 +++++++++--------- 1 file changed, 478 insertions(+), 470 deletions(-) diff --git a/{{cookiecutter.repo_name}}/problem-templates/cv/aisg-context/guide-site/docs/guide-for-user/07-job-orchestration.md b/{{cookiecutter.repo_name}}/problem-templates/cv/aisg-context/guide-site/docs/guide-for-user/07-job-orchestration.md index fdc71ea..b7da56f 100644 --- a/{{cookiecutter.repo_name}}/problem-templates/cv/aisg-context/guide-site/docs/guide-for-user/07-job-orchestration.md +++ b/{{cookiecutter.repo_name}}/problem-templates/cv/aisg-context/guide-site/docs/guide-for-user/07-job-orchestration.md @@ -38,169 +38,170 @@ values can be overridden through the CLI. ## Data Preparation & Preprocessing -### Local +To process the sample raw data, there are 3 main ways to do so: -To process the sample raw data, there are many ways to do so. One way -is to run it locally. Ensure that you have activated your Conda -environment before running the script. More information on this can be -found [here][venv]. You can also update your configuration variables at -`conf/process_data.yaml`, specifically this section: +=== "Local" -```yaml -raw_data_dir_path: "./data/mnist-pngs-data-aisg" -processed_data_dir_path: "./data/processed/mnist-pngs-data-aisg-processed" -``` - -After that, run the script: - -=== "Linux/macOS" - - ```bash - # Add no_cuda=False at the end to enable GPU use. - # Make sure you have installed CUDA/RoCM before using. - # Check that LD_LIBRARY_PATH has been set. - # Also set HIP_VISIBLE_DEVICES=0 if RoCM is used. - python src/process_data.py - ``` - -=== "Windows PowerShell" - - ```powershell - python src\process_data.py - ``` - -### Docker - -We can also run through a Docker container. 
This requires the Docker -image to be built from a Dockerfile -(`docker/{{cookiecutter.src_package_name}}-cpu.Dockerfile`) -provided in this template: - -=== "Linux/macOS" - - ```bash - docker build \ - -t {{cookiecutter.registry_project_path}}/data-prep:0.1.0 \ - -f docker/{{cookiecutter.repo_name}}-cpu.Dockerfile \ - --platform linux/amd64 . - ``` - -=== "Windows PowerShell" - - ```powershell - docker build ` - -t {{cookiecutter.registry_project_path}}/data-prep:0.1.0 ` - -f docker/{{cookiecutter.repo_name}}-cpu.Dockerfile ` - --platform linux/amd64 . - ``` - -=== "VSCode Server Terminal" - - ```bash - # Run `runai login` and `runai config project {{cookiecutter.proj_name}}` first if needed - # Run this in the base of your project repository, and change accordingly - khull kaniko --context $(pwd) \ - --dockerfile $(pwd)/docker/{{cookiecutter.repo_name}}-cpu.Dockerfile \ - --destination {{cookiecutter.registry_project_path}}/data-prep:0.1.0 \ -{%- if cookiecutter.platform == 'gcp' %} - --gcp \ -{%- endif %} - --cred-file /path/to/docker/config.json \ - -v :/path/to/pvc/mount - ``` - -After building the image, you can run the script through Docker: - -=== "Linux/macOS" - - ```bash - sudo chown 2222:2222 ./data - docker run --rm \ - -v ./data:/home/aisg/{{cookiecutter.repo_name}}/data \ - -w /home/aisg/{{cookiecutter.repo_name}} \ - {{cookiecutter.registry_project_path}}/data-prep:0.1.0 \ - bash -c "python -u src/process_data.py" - ``` + Ensure that you have activated your Conda + environment before running the script. More information on this can be + found [here][venv]. 
You can also update your configuration variables at + `conf/process_data.yaml`, specifically this section: -=== "Windows PowerShell" - - ```powershell - docker run --rm ` - -v .\data:/home/aisg/{{cookiecutter.repo_name}}/data ` - -w /home/aisg/{{cookiecutter.repo_name}} ` - {{cookiecutter.registry_project_path}}/data-prep:0.1.0 ` - bash -c "python -u src/process_data.py" - ``` - -Once you are satisfied with the Docker image, you can push it to the -Docker registry: - -!!! warning "Attention" - - If you're following the "VSCode Server Terminal" method, you can - skip this as you have already pushed to the Docker registry. - -=== "Linux/macOS" - - ```bash - docker push {{cookiecutter.registry_project_path}}/data-prep:0.1.0 - ``` - -=== "Windows PowerShell" - - ```powershell - docker push {{cookiecutter.registry_project_path}}/data-prep:0.1.0 + ```yaml + raw_data_dir_path: "./data/mnist-pngs-data-aisg" + processed_data_dir_path: "./data/processed/mnist-pngs-data-aisg-processed" ``` -### Run:ai + After that, run the script: -Now that we have the Docker image pushed to the registry, we can submit -a job using that image to Run:ai\: + === "Linux/macOS" -=== "Linux/macOS" - - ```bash - # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC - runai submit \ - --job-name-prefix -data-prep \ - -i {{cookiecutter.registry_project_path}}/data-prep:0.1.0 \ - --working-dir /home/aisg/{{cookiecutter.repo_name}} \ - --existing-pvc claimname=,path=/ \ - --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 \ - --command -- '/bin/bash -c "python -u src/process_data.py raw_data_dir_path=//workspaces//data/mnist-pngs-data-aisg processed_data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed"' - ``` + ```bash + # Add no_cuda=False at the end to enable GPU use. + # Make sure you have installed CUDA/ROCm before using. + # Check that LD_LIBRARY_PATH has been set. + # Also set HIP_VISIBLE_DEVICES=0 if ROCm is used. 
+ python src/process_data.py + ``` -=== "Windows PowerShell" - - ```powershell - # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC - runai submit ` - --job-name-prefix -data-prep ` - -i {{cookiecutter.registry_project_path}}/data-prep:0.1.0 ` - --working-dir /home/aisg/{{cookiecutter.repo_name}} ` - --existing-pvc claimname=,path=/ ` - --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 ` - --command -- '/bin/bash -c "python -u src/process_data.py raw_data_dir_path=//workspaces//data/mnist-pngs-data-aisg processed_data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed"' - ``` + === "Windows PowerShell" -=== "VSCode Server Terminal" + ```powershell + python src\process_data.py + ``` + +=== "Docker" - ```bash - # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC - runai submit \ - --job-name-prefix -data-prep \ - -i {{cookiecutter.registry_project_path}}/data-prep:0.1.0 \ - --working-dir /home/aisg/{{cookiecutter.repo_name}} \ - --existing-pvc claimname=,path=/ \ - --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 \ - --command -- '/bin/bash -c "python -u src/process_data.py raw_data_dir_path=//workspaces//data/mnist-pngs-data-aisg processed_data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed"' - ``` + This requires the Docker + image to be built from a Dockerfile + (`docker/{{cookiecutter.repo_name}}-cpu.Dockerfile`) + provided in this template: -After some time, the data processing job should conclude and we can -proceed with training the predictive model. -The processed data is exported to the directory -`//workspaces//data/processed/mnist-pngs-data-aisg-processed`. -We will be passing this path to the model training workflows. 
+ === "Linux/macOS" + + ```bash + docker build \ + -t {{cookiecutter.registry_project_path}}/data-prep:0.1.0 \ + -f docker/{{cookiecutter.repo_name}}-cpu.Dockerfile \ + --platform linux/amd64 . + ``` + + === "Windows PowerShell" + + ```powershell + docker build ` + -t {{cookiecutter.registry_project_path}}/data-prep:0.1.0 ` + -f docker/{{cookiecutter.repo_name}}-cpu.Dockerfile ` + --platform linux/amd64 . + ``` + + After building the image, you can run the script through Docker: + + === "Linux/macOS" + + ```bash + sudo chown 2222:2222 ./data + docker run --rm \ + -v ./data:/home/aisg/{{cookiecutter.repo_name}}/data \ + -w /home/aisg/{{cookiecutter.repo_name}} \ + {{cookiecutter.registry_project_path}}/data-prep:0.1.0 \ + bash -c "python -u src/process_data.py" + ``` + + === "Windows PowerShell" + + ```powershell + docker run --rm ` + -v .\data:/home/aisg/{{cookiecutter.repo_name}}/data ` + -w /home/aisg/{{cookiecutter.repo_name}} ` + {{cookiecutter.registry_project_path}}/data-prep:0.1.0 ` + bash -c "python -u src/process_data.py" + ``` + + Once you are satisfied with the Docker image, you can push it to the + Docker registry: + + === "Linux/macOS" + + ```bash + docker push {{cookiecutter.registry_project_path}}/data-prep:0.1.0 + ``` + + === "Windows PowerShell" + + ```powershell + docker push {{cookiecutter.registry_project_path}}/data-prep:0.1.0 + ``` + +=== "Run:ai" + + This requires the Docker + image to be built from a Dockerfile + (`docker/{{cookiecutter.repo_name}}-cpu.Dockerfile`) + provided in this template: + + === "VSCode Server Terminal" + + ```bash + # Run `runai login` and `runai config project {{cookiecutter.proj_name}}` first if needed + # Run this in the base of your project repository, and change accordingly + khull kaniko --context $(pwd) \ + --dockerfile $(pwd)/docker/{{cookiecutter.repo_name}}-cpu.Dockerfile \ + --destination {{cookiecutter.registry_project_path}}/data-prep:0.1.0 \ + {%- if cookiecutter.platform == 'gcp' %} + --gcp \ + 
{%- endif %} + --cred-file /path/to/docker/config.json \ + -v :/path/to/pvc/mount + ``` + + Now that we have the Docker image built and pushed to the registry, we can submit + a job using that image to Run:ai\: + + === "Linux/macOS" + + ```bash + # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC + runai submit \ + --job-name-prefix -data-prep \ + -i {{cookiecutter.registry_project_path}}/data-prep:0.1.0 \ + --working-dir /home/aisg/{{cookiecutter.repo_name}} \ + --existing-pvc claimname=,path=/ \ + --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 \ + --command -- '/bin/bash -c "python -u src/process_data.py raw_data_dir_path=//workspaces//data/mnist-pngs-data-aisg processed_data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed"' + ``` + + === "Windows PowerShell" + + ```powershell + # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC + runai submit ` + --job-name-prefix -data-prep ` + -i {{cookiecutter.registry_project_path}}/data-prep:0.1.0 ` + --working-dir /home/aisg/{{cookiecutter.repo_name}} ` + --existing-pvc claimname=,path=/ ` + --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 ` + --command -- '/bin/bash -c "python -u src/process_data.py raw_data_dir_path=//workspaces//data/mnist-pngs-data-aisg processed_data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed"' + ``` + + === "VSCode Server Terminal" + + ```bash + # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC + runai submit \ + --job-name-prefix -data-prep \ + -i {{cookiecutter.registry_project_path}}/data-prep:0.1.0 \ + --working-dir /home/aisg/{{cookiecutter.repo_name}} \ + --existing-pvc claimname=,path=/ \ + --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 \ + --command -- '/bin/bash -c "python -u src/process_data.py raw_data_dir_path=//workspaces//data/mnist-pngs-data-aisg 
processed_data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed"' + ``` + + After some time, the data processing job should conclude and we can + proceed with training the predictive model. + The processed data is exported to the directory + `//workspaces//data/processed/mnist-pngs-data-aisg-processed`. + We will be passing this path to the model training workflows. [venv]: ./05-virtual-env.md#local-virtual-environments @@ -247,269 +248,276 @@ artifacts without explicitly knowing the {{objstg}} credentials. - [MLflow Docs - Tracking](https://www.mlflow.org/docs/latest/tracking.html#) - [MLflow Docs - Tracking (Artifact Stores)](https://www.mlflow.org/docs/latest/tracking.html#artifact-stores) -### Local - -The beauty of MLFlow is that it can run locally, within a Docker -container or connecting to a remote MLFlow server. In this case, it is -assumed that you are spinning up an MLFlow instance locally whenever it -is needed. - -To run the model training script locally, you should have your Conda -environment activated from the data preparation stage, and update your -configuration variables at `conf/train_model.yaml`, especially this -section: - -```yaml -setup_mlflow: true -mlflow_autolog: false -mlflow_tracking_uri: "./mlruns" -mlflow_exp_name: "{{cookiecutter.src_package_name_short}}" -mlflow_run_name: "train-model" -data_dir_path: "./data/processed/mnist-pngs-data-aisg-processed" -no_cuda: true -no_mps: true -train_bs: 64 -test_bs: 1000 -lr: 1.0 -gamma: 0.7 -seed: 1111 -epochs: 3 -log_interval: 100 -dry_run: false -model_checkpoint_interval: 2 -model_checkpoint_dir_path: "./models/checkpoint" -``` - -After that, run the script: - -=== "Linux/macOS" - - ```bash - python src/train_model.py - ``` - -=== "Windows PowerShell" +=== "Local" - ```powershell - python src\train_model.py - ``` - -This will generate the MLFlow logs and artifacts locally, of which you -can parse it with the MLFlow UI with: - -```bash -conda activate mlflow-test -mlflow 
server -``` - -and connect to http://localhost:5000. - -### Docker - -We shall build the Docker image from the Docker file -`docker/{{cookiecutter.repo_name}}-gpu.Dockerfile`: - -!!! warning "Attention" - - If you're only using CPUs for training, then you can just use - `docker/{{cookiecutter.repo_name}}-cpu.Dockerfile` instead for - smaller image size. - If you're using AMD GPUs for training, you can copy the components - from the [`rocm`][rocm] folder in the Kapitan Hull repository. - -[rocm]: https://github.com/aisingapore/kapitan-hull/tree/main/extras/rocm - -=== "Linux/macOS" - - ```bash - docker build \ - -t {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ - -f docker/{{cookiecutter.repo_name}}-gpu.Dockerfile \ - --platform linux/amd64 . - ``` + The beauty of MLFlow is that it can run locally, within a Docker + container or connecting to a remote MLFlow server. In this case, it is + assumed that you are spinning up an MLFlow instance locally whenever it + is needed. -=== "Windows PowerShell" + To run the model training script locally, you should have your Conda + environment activated from the data preparation stage, and update your + configuration variables at `conf/train_model.yaml`, especially this + section: - ```powershell - docker build ` - -t {{cookiecutter.registry_project_path}}/model-training:0.1.0 ` - -f docker/{{cookiecutter.repo_name}}-gpu.Dockerfile ` - --platform linux/amd64 . 
+ ```yaml + setup_mlflow: true + mlflow_autolog: false + mlflow_tracking_uri: "./mlruns" + mlflow_exp_name: "{{cookiecutter.src_package_name_short}}" + mlflow_run_name: "train-model" + data_dir_path: "./data/processed/mnist-pngs-data-aisg-processed" + no_cuda: true + no_mps: true + train_bs: 64 + test_bs: 1000 + lr: 1.0 + gamma: 0.7 + seed: 1111 + epochs: 3 + log_interval: 100 + dry_run: false + model_checkpoint_interval: 2 + model_checkpoint_dir_path: "./models/checkpoint" ``` -=== "VSCode Server Terminal" + After that, run the script: - ```bash - # Run `runai login` and `runai config project {{cookiecutter.proj_name}}` first if needed - # Run this in the base of your project repository, and change accordingly - khull kaniko --context $(pwd) \ - --dockerfile $(pwd)/docker/{{cookiecutter.repo_name}}-gpu.Dockerfile \ - --destination {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ -{%- if cookiecutter.platform == 'gcp' %} - --gcp \ -{%- endif %} - --cred-file /path/to/docker/config.json \ - -v :/path/to/pvc/mount - ``` - -After building the image, you can run the script through Docker: - -=== "Linux/macOS" - - ```bash - sudo chown 2222:2222 ./mlruns ./models - # Add --gpus=all for Nvidia GPUs in front of the image name - # Add --device=/dev/kfd --device=/dev/dri --group-add video for AMD GPUs in front of the image name - # Add no_cuda=false to use GPUs behind the image name - docker run --rm \ - -v ./data:/home/aisg/{{cookiecutter.repo_name}}/data \ - -v ./mlruns:/home/aisg/{{cookiecutter.repo_name}}/mlruns \ - -v ./models:/home/aisg/{{cookiecutter.repo_name}}/models \ - -w /home/aisg/{{cookiecutter.repo_name}} \ - {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ - bash -c "python -u src/train_model.py" - ``` + === "Linux/macOS" -=== "Windows PowerShell" + ```bash + python src/train_model.py + ``` - ```powershell - docker run --rm \ - -v .\data:/home/aisg/{{cookiecutter.repo_name}}/data ` - -v 
.\mlruns:/home/aisg/{{cookiecutter.repo_name}}/mlruns ` - -v .\models:/home/aisg/{{cookiecutter.repo_name}}/models ` - -w /home/aisg/{{cookiecutter.repo_name}} ` - {{cookiecutter.registry_project_path}}/model-training:0.1.0 ` - bash -c "python -u src/train_model.py" - ``` + === "Windows PowerShell" -You can run MLFlow in Docker as well with the following command: + ```powershell + python src\train_model.py + ``` -=== "Linux/macOS" + This will generate the MLFlow logs and artifacts locally, which you + can browse in the MLFlow UI by running: ```bash - docker run --rm -d \ - -p 5000:5000 - -v ./mlruns:/mlruns \ - ghcr.io/mlflow/mlflow:v2.9.2 \ - mlflow server -h 0.0.0.0 - ``` - -=== "Windows PowerShell" - - ```powershell - docker run --rm -d ` - -p 5000:5000 ` - -v .\mlruns:/mlruns ` - ghcr.io/mlflow/mlflow:v2.9.2 ` - mlflow server -h 0.0.0.0 + conda activate mlflow-test + mlflow server ``` -and connect to http://localhost:5000. + and connect to http://localhost:5000. -Once you are satisfied with the Docker image, you can push it to the -Docker registry: +=== "Docker" -!!! warning "Attention" + The beauty of MLFlow is that it can run locally, within a Docker + container or connecting to a remote MLFlow server. In this case, + we shall build the Docker image from the Dockerfile + `docker/{{cookiecutter.repo_name}}-gpu.Dockerfile`: - If you're following the "VSCode Server Terminal" method, you can - skip this as you have already pushed to the Docker registry. - -=== "Linux/macOS" - - ```bash - docker push {{cookiecutter.registry_project_path}}/model-training:0.1.0 - ``` - -=== "Windows PowerShell" + !!! warning "Attention" - ```powershell - docker push {{cookiecutter.registry_project_path}}/model-training:0.1.0 - ``` + If you're only using CPUs for training, then you can just use + `docker/{{cookiecutter.repo_name}}-cpu.Dockerfile` instead for + smaller image size. 
+ If you're using AMD GPUs for training, you can copy the components + from the [`rocm`][rocm] folder in the Kapitan Hull repository. -### Run:ai + [rocm]: https://github.com/aisingapore/kapitan-hull/tree/main/extras/rocm -Now that we have the Docker image pushed to the registry, we can run a -job using it: + === "Linux/macOS" -=== "Linux/macOS" + ```bash + docker build \ + -t {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ + -f docker/{{cookiecutter.repo_name}}-gpu.Dockerfile \ + --platform linux/amd64 . + ``` - ```bash - # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC - runai submit \ - --job-name-prefix -train \ - -i {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ - --working-dir /home/aisg/{{cookiecutter.repo_name}} \ - --existing-pvc claimname=,path=/ \ - --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 \ - -e MLFLOW_TRACKING_USERNAME= \ - -e MLFLOW_TRACKING_PASSWORD= \ - -e OMP_NUM_THREADS=2 \ - --command -- '/bin/bash -c "python -u src/train_model.py data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed setup_mlflow=true mlflow_tracking_uri= mlflow_exp_name= model_checkpoint_dir_path=//workspaces//{{cookiecutter.repo_name}}/models epochs=3"' - ``` + === "Windows PowerShell" -=== "Windows PowerShell" - - ```powershell - # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC - runai submit ` - --job-name-prefix -train ` - -i {{cookiecutter.registry_project_path}}/model-training:0.1.0 ` - --working-dir /home/aisg/{{cookiecutter.repo_name}} ` - --existing-pvc claimname=,path=/ ` - --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 ` - -e MLFLOW_TRACKING_USERNAME= ` - -e MLFLOW_TRACKING_PASSWORD= ` - -e OMP_NUM_THREADS=2 ` - --command -- '/bin/bash -c "python -u src/train_model.py data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed setup_mlflow=true mlflow_tracking_uri= 
mlflow_exp_name= model_checkpoint_dir_path=//workspaces//{{cookiecutter.repo_name}}/models epochs=3"' - ``` + ```powershell + docker build ` + -t {{cookiecutter.registry_project_path}}/model-training:0.1.0 ` + -f docker/{{cookiecutter.repo_name}}-gpu.Dockerfile ` + --platform linux/amd64 . + ``` -=== "VSCode Server Terminal" + After building the image, you can run the script through Docker: - ```bash - # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC - $ runai submit \ - --job-name-prefix -train \ - -i {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ - --working-dir /home/aisg/{{cookiecutter.repo_name}} \ - --existing-pvc claimname=,path=/ \ - --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 \ - -e MLFLOW_TRACKING_USERNAME= \ - -e MLFLOW_TRACKING_PASSWORD= \ - -e OMP_NUM_THREADS=2 \ - --command -- '/bin/bash -c "python -u src/train_model.py data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed setup_mlflow=true mlflow_tracking_uri= mlflow_exp_name= model_checkpoint_dir_path=//workspaces//{{cookiecutter.repo_name}}/models epochs=3"' - ``` + === "Linux/macOS" -Once you have successfully run an experiment, you may inspect the run -on the MLflow Tracking server. Through the MLflow Tracking server -interface, you can view the metrics and parameters logged for the run, -as well as download the artifacts that have been uploaded to the ECS -bucket. You can also compare runs with each other. - -![MLflow Tracking Server - Inspecting Runs](https://storage.googleapis.com/aisg-mlops-pub-data/images/mlflow-tracking-server-inspect.gif) - -!!! tip - Every job submitted with `runai submit` is assigned a unique ID, - and a unique job name if the `--job-name-prefix` is used. The - `mlflow_init` function within the `general_utils.py` module tags - every experiment name with the job's name and UUID as provided by - Run:ai, with the tags `job_uuid` and `job_name`. 
This allows you to - easily identify the MLflow experiment runs that are associated with - each Run:ai job. You can filter for MLflow experiment runs - associated with a specific Run:ai job by using MLflow's search - filter expressions and API. - - ??? info "Reference Link(s)" - - - [Run:ai Docs - Environment Variables inside a Run:ai Workload](https://docs.run.ai/latest/Researcher/best-practices/env-variables/) - - [MLflow Docs - Search Runs](https://mlflow.org/docs/latest/search-runs.html) - -!!! info - If your project has GPU quotas assigned to it, you can make use of - it by specifying the `--gpu` flag in the `runai submit` command. As - part of Run:ai's unique selling point, you can also specify - fractional values, which would allow you to utilise a fraction of a - GPU. This is useful for projects that require a GPU for training, - but do not require the full capacity of a GPU. + ```bash + sudo chown 2222:2222 ./mlruns ./models + # Add --gpus=all for Nvidia GPUs in front of the image name + # Add --device=/dev/kfd --device=/dev/dri --group-add video for AMD GPUs in front of the image name + # Add no_cuda=false to use GPUs behind the image name + docker run --rm \ + -v ./data:/home/aisg/{{cookiecutter.repo_name}}/data \ + -v ./mlruns:/home/aisg/{{cookiecutter.repo_name}}/mlruns \ + -v ./models:/home/aisg/{{cookiecutter.repo_name}}/models \ + -w /home/aisg/{{cookiecutter.repo_name}} \ + {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ + bash -c "python -u src/train_model.py" + ``` + + === "Windows PowerShell" + + ```powershell + docker run --rm ` + -v .\data:/home/aisg/{{cookiecutter.repo_name}}/data ` + -v .\mlruns:/home/aisg/{{cookiecutter.repo_name}}/mlruns ` + -v .\models:/home/aisg/{{cookiecutter.repo_name}}/models ` + -w /home/aisg/{{cookiecutter.repo_name}} ` + {{cookiecutter.registry_project_path}}/model-training:0.1.0 ` + bash -c "python -u src/train_model.py" + ``` + + You can run MLFlow in Docker as well with the following command: + + 
=== "Linux/macOS" + + ```bash + docker run --rm -d \ + -p 5000:5000 \ + -v ./mlruns:/mlruns \ + ghcr.io/mlflow/mlflow:v2.9.2 \ + mlflow server -h 0.0.0.0 + ``` + + === "Windows PowerShell" + + ```powershell + docker run --rm -d ` + -p 5000:5000 ` + -v .\mlruns:/mlruns ` + ghcr.io/mlflow/mlflow:v2.9.2 ` + mlflow server -h 0.0.0.0 + ``` + + and connect to http://localhost:5000. + + Once you are satisfied with the Docker image, you can push it to the + Docker registry: + + !!! warning "Attention" + + If you're following the "VSCode Server Terminal" method, you can + skip this as you have already pushed to the Docker registry. + + === "Linux/macOS" + + ```bash + docker push {{cookiecutter.registry_project_path}}/model-training:0.1.0 + ``` + + === "Windows PowerShell" + + ```powershell + docker push {{cookiecutter.registry_project_path}}/model-training:0.1.0 + ``` + +=== "Run:ai" + + The beauty of MLFlow is that it can run locally, within a Docker + container or connecting to a remote MLFlow server. 
In this case, + we shall build the Docker image from the Dockerfile + `docker/{{cookiecutter.repo_name}}-gpu.Dockerfile`: + + === "VSCode Server Terminal" + + ```bash + # Run `runai login` and `runai config project {{cookiecutter.proj_name}}` first if needed + # Run this in the base of your project repository, and change accordingly + khull kaniko --context $(pwd) \ + --dockerfile $(pwd)/docker/{{cookiecutter.repo_name}}-gpu.Dockerfile \ + --destination {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ + {%- if cookiecutter.platform == 'gcp' %} + --gcp \ + {%- endif %} + --cred-file /path/to/docker/config.json \ + -v :/path/to/pvc/mount + ``` + + Now that we have the Docker image built and pushed to the registry, + we can run a job using it: + + === "Linux/macOS" + + ```bash + # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC + runai submit \ + --job-name-prefix -train \ + -i {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ + --working-dir /home/aisg/{{cookiecutter.repo_name}} \ + --existing-pvc claimname=,path=/ \ + --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 \ + -e MLFLOW_TRACKING_USERNAME= \ + -e MLFLOW_TRACKING_PASSWORD= \ + -e OMP_NUM_THREADS=2 \ + --command -- '/bin/bash -c "python -u src/train_model.py data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed setup_mlflow=true mlflow_tracking_uri= mlflow_exp_name= model_checkpoint_dir_path=//workspaces//{{cookiecutter.repo_name}}/models epochs=3"' + ``` + + === "Windows PowerShell" + + ```powershell + # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC + runai submit ` + --job-name-prefix -train ` + -i {{cookiecutter.registry_project_path}}/model-training:0.1.0 ` + --working-dir /home/aisg/{{cookiecutter.repo_name}} ` + --existing-pvc claimname=,path=/ ` + --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 ` + -e MLFLOW_TRACKING_USERNAME= ` + 
-e MLFLOW_TRACKING_PASSWORD= ` + -e OMP_NUM_THREADS=2 ` + --command -- '/bin/bash -c "python -u src/train_model.py data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed setup_mlflow=true mlflow_tracking_uri= mlflow_exp_name= model_checkpoint_dir_path=//workspaces//{{cookiecutter.repo_name}}/models epochs=3"' + ``` + + === "VSCode Server Terminal" + + ```bash + # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC + runai submit \ + --job-name-prefix -train \ + -i {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ + --working-dir /home/aisg/{{cookiecutter.repo_name}} \ + --existing-pvc claimname=,path=/ \ + --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 \ + -e MLFLOW_TRACKING_USERNAME= \ + -e MLFLOW_TRACKING_PASSWORD= \ + -e OMP_NUM_THREADS=2 \ + --command -- '/bin/bash -c "python -u src/train_model.py data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed setup_mlflow=true mlflow_tracking_uri= mlflow_exp_name= model_checkpoint_dir_path=//workspaces//{{cookiecutter.repo_name}}/models epochs=3"' + ``` + + Once you have successfully run an experiment, you may inspect the run + on the MLflow Tracking server. Through the MLflow Tracking server + interface, you can view the metrics and parameters logged for the run, + as well as download the artifacts that have been uploaded to the ECS + bucket. You can also compare runs with each other. + + ![MLflow Tracking Server - Inspecting Runs](https://storage.googleapis.com/aisg-mlops-pub-data/images/mlflow-tracking-server-inspect.gif) + + !!! tip + Every job submitted with `runai submit` is assigned a unique ID, + and a unique job name if the `--job-name-prefix` is used. The + `mlflow_init` function within the `general_utils.py` module tags + every experiment name with the job's name and UUID as provided by + Run:ai, with the tags `job_uuid` and `job_name`. 
This allows you to + easily identify the MLflow experiment runs that are associated with + each Run:ai job. You can filter for MLflow experiment runs + associated with a specific Run:ai job by using MLflow's search + filter expressions and API. + + ??? info "Reference Link(s)" + + - [Run:ai Docs - Environment Variables inside a Run:ai Workload](https://docs.run.ai/latest/Researcher/best-practices/env-variables/) + - [MLflow Docs - Search Runs](https://mlflow.org/docs/latest/search-runs.html) + + !!! info + If your project has GPU quotas assigned to it, you can make use of + it by specifying the `--gpu` flag in the `runai submit` command. As + part of Run:ai's unique selling point, you can also specify + fractional values, which would allow you to utilise a fraction of a + GPU. This is useful for projects that require a GPU for training, + but do not require the full capacity of a GPU. ### Hyperparameter Tuning @@ -607,98 +615,98 @@ executing the model training job out of the Run:ai platform, as the `JOB_NAME` and `JOB_UUID` environment variables would not be available by default. 
-#### Local +=== "Local" -=== "Linux/macOS" + === "Linux/macOS" - ```bash - python src/train_model.py --multirun - ``` + ```bash + python src/train_model.py --multirun + ``` -=== "Windows PowerShell" + === "Windows PowerShell" - ```powershell - python src\train_model.py --multirun - ``` + ```powershell + python src\train_model.py --multirun + ``` -#### Docker +=== "Docker" -=== "Linux/macOS" + === "Linux/macOS" - ```bash - docker run --rm \ - -v ./data:/home/aisg/{{cookiecutter.repo_name}}/data \ - -v ./mlruns:/home/aisg/{{cookiecutter.repo_name}}/mlruns \ - -v ./models:/home/aisg/{{cookiecutter.repo_name}}/models \ - -w /home/aisg/{{cookiecutter.repo_name}} \ - {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ - python -u src/train_model.py --multirun - ``` + ```bash + docker run --rm \ + -v ./data:/home/aisg/{{cookiecutter.repo_name}}/data \ + -v ./mlruns:/home/aisg/{{cookiecutter.repo_name}}/mlruns \ + -v ./models:/home/aisg/{{cookiecutter.repo_name}}/models \ + -w /home/aisg/{{cookiecutter.repo_name}} \ + {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ + python -u src/train_model.py --multirun + ``` -=== "Windows PowerShell" + === "Windows PowerShell" - ```powershell - docker run --rm \ - -v .\data:/home/aisg/{{cookiecutter.repo_name}}/data ` - -v .\mlruns:/home/aisg/{{cookiecutter.repo_name}}/mlruns ` - -v .\models:/home/aisg/{{cookiecutter.repo_name}}/models ` - -w /home/aisg/{{cookiecutter.repo_name}} ` - {{cookiecutter.registry_project_path}}/model-training:0.1.0 ` - python -u src/train_model.py --multirun - ``` + ```powershell + docker run --rm ` + -v .\data:/home/aisg/{{cookiecutter.repo_name}}/data ` + -v .\mlruns:/home/aisg/{{cookiecutter.repo_name}}/mlruns ` + -v .\models:/home/aisg/{{cookiecutter.repo_name}}/models ` + -w /home/aisg/{{cookiecutter.repo_name}} ` + {{cookiecutter.registry_project_path}}/model-training:0.1.0 ` + python -u src/train_model.py --multirun + ``` -#### Run:ai +=== "Run:ai" -=== "Linux/macOS" - - 
```bash - # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC - runai submit \ - --job-name-prefix -train-hp \ - -i {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ - --working-dir /home/aisg/{{cookiecutter.repo_name}} \ - --existing-pvc claimname=,path=/ \ - --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 \ - -e MLFLOW_TRACKING_USERNAME= \ - -e MLFLOW_TRACKING_PASSWORD= \ - -e MLFLOW_HPTUNING_TAG=$(date +%s) \ - -e OMP_NUM_THREADS=2 \ - --command -- '/bin/bash -c "python -u src/train_model.py --multirun data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed setup_mlflow=true mlflow_tracking_uri= mlflow_exp_name= model_checkpoint_dir_path=//workspaces//{{cookiecutter.repo_name}}/models epochs=3"' - ``` - -=== "Windows PowerShell" - - ```powershell - # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC - runai submit ` - --job-name-prefix -train ` - -i {{cookiecutter.registry_project_path}}/model-training:0.1.0 ` - --working-dir /home/aisg/{{cookiecutter.repo_name}} ` - --existing-pvc claimname=,path=/ \ - --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 \ - -e MLFLOW_TRACKING_USERNAME= ` - -e MLFLOW_TRACKING_PASSWORD= ` - -e MLFLOW_HPTUNING_TAG=$(Get-Date -UFormat %s -Millisecond 0) ` - -e OMP_NUM_THREADS=2 ` - --command -- '/bin/bash -c "python -u src/train_model.py --multirun data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed setup_mlflow=true mlflow_tracking_uri= mlflow_exp_name= model_checkpoint_dir_path=//workspaces//{{cookiecutter.repo_name}}/models epochs=3"' - ``` - -=== "VSCode Server Terminal" - - ```bash - # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC - runai submit \ - --job-name-prefix -train-hp \ - -i {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ - --working-dir /home/aisg/{{cookiecutter.repo_name}} \ - 
--existing-pvc claimname=,path=/ \ - --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 \ - -e MLFLOW_TRACKING_USERNAME= \ - -e MLFLOW_TRACKING_PASSWORD= \ - -e MLFLOW_HPTUNING_TAG=$(date +%s) \ - -e OMP_NUM_THREADS=2 \ - --command -- '/bin/bash -c "python -u src/train_model.py --multirun data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed setup_mlflow=true mlflow_tracking_uri= mlflow_exp_name= model_checkpoint_dir_path=//workspaces//{{cookiecutter.repo_name}}/models epochs=3"' - ``` + === "Linux/macOS" + + ```bash + # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC + runai submit \ + --job-name-prefix -train-hp \ + -i {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ + --working-dir /home/aisg/{{cookiecutter.repo_name}} \ + --existing-pvc claimname=,path=/ \ + --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 \ + -e MLFLOW_TRACKING_USERNAME= \ + -e MLFLOW_TRACKING_PASSWORD= \ + -e MLFLOW_HPTUNING_TAG=$(date +%s) \ + -e OMP_NUM_THREADS=2 \ + --command -- '/bin/bash -c "python -u src/train_model.py --multirun data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed setup_mlflow=true mlflow_tracking_uri= mlflow_exp_name= model_checkpoint_dir_path=//workspaces//{{cookiecutter.repo_name}}/models epochs=3"' + ``` + + === "Windows PowerShell" + + ```powershell + # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC + runai submit ` + --job-name-prefix -train-hp ` + -i {{cookiecutter.registry_project_path}}/model-training:0.1.0 ` + --working-dir /home/aisg/{{cookiecutter.repo_name}} ` + --existing-pvc claimname=,path=/ ` + --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 ` + -e MLFLOW_TRACKING_USERNAME= ` + -e MLFLOW_TRACKING_PASSWORD= ` + -e MLFLOW_HPTUNING_TAG=$(Get-Date -UFormat %s -Millisecond 0) ` + -e OMP_NUM_THREADS=2 ` + --command -- '/bin/bash -c "python -u src/train_model.py 
--multirun data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed setup_mlflow=true mlflow_tracking_uri= mlflow_exp_name= model_checkpoint_dir_path=//workspaces//{{cookiecutter.repo_name}}/models epochs=3"' + ``` + + === "VSCode Server Terminal" + + ```bash + # Switch working-dir to //workspaces//{{cookiecutter.repo_name}} to use the repo in the PVC + runai submit \ + --job-name-prefix -train-hp \ + -i {{cookiecutter.registry_project_path}}/model-training:0.1.0 \ + --working-dir /home/aisg/{{cookiecutter.repo_name}} \ + --existing-pvc claimname=,path=/ \ + --cpu 2 --cpu-limit 2 --memory 4G --memory-limit 4G --backoff-limit 1 \ + -e MLFLOW_TRACKING_USERNAME= \ + -e MLFLOW_TRACKING_PASSWORD= \ + -e MLFLOW_HPTUNING_TAG=$(date +%s) \ + -e OMP_NUM_THREADS=2 \ + --command -- '/bin/bash -c "python -u src/train_model.py --multirun data_dir_path=//workspaces//data/processed/mnist-pngs-data-aisg-processed setup_mlflow=true mlflow_tracking_uri= mlflow_exp_name= model_checkpoint_dir_path=//workspaces//{{cookiecutter.repo_name}}/models epochs=3"' + ``` ![MLflow Tracking Server - Hyperparameter Tuning Runs](assets/screenshots/mlflow-tracking-hptuning-runs.png)