From 1612e2b919e66037ea9756d2984494dec8c8298c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Antonio=20Jes=C3=BAs?=
Date: Thu, 18 Jan 2024 14:28:53 +0100
Subject: [PATCH] feat(k8s/GPU): replace the Aliyun GPU memory management
 plugin with the official NVIDIA device plugin

---
 backend/automl/views.py                      | 68 +++++++++++---------
 kustomize/README.md                          | 19 +++---
 kustomize/v1.1-gpu-nvidia/kustomization.yaml | 17 +++++
 3 files changed, 65 insertions(+), 39 deletions(-)
 create mode 100644 kustomize/v1.1-gpu-nvidia/kustomization.yaml

diff --git a/backend/automl/views.py b/backend/automl/views.py
index 1a4d107..6469a16 100644
--- a/backend/automl/views.py
+++ b/backend/automl/views.py
@@ -548,10 +548,11 @@ def post(self, request, format=None):
                         {'name': 'NVIDIA_VISIBLE_DEVICES', 'value': "all"}, ## (Sharing GPU)
                         {'name': 'CASE', 'value': str(case)}
                     ],
-                    'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
+                    'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
                 }],
-                'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
-                'restartPolicy': 'OnFailure'
+                'imagePullPolicy': 'Always',
+                'restartPolicy': 'OnFailure',
+                'runtimeClassName': 'nvidia'
             }
         }
     }
@@ -593,10 +594,11 @@ def post(self, request, format=None):
                         {'name': 'MODEL_LOGGER_TOPIC', 'value': str(settings.MODEL_LOGGER_TOPIC)},
                         {'name': 'FEDERATED_STRING_ID', 'value': str(federated_string_id)}
                     ],
-                    'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
+                    'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
                 }],
-                'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
-                'restartPolicy': 'OnFailure'
+                'imagePullPolicy': 'Always',
+                'restartPolicy': 'OnFailure',
+                'runtimeClassName': 'nvidia'
             }
         }
     }
@@ -634,10 +636,11 @@ def post(self, request, format=None):
                         {'name': 'CHANGE', 'value': deployment.change},
                         {'name': 'IMPROVEMENT', 'value': str(deployment.improvement)}
                     ],
-                    'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
+                    'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
                 }],
-                'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
-                'restartPolicy': 'OnFailure'
+                'imagePullPolicy': 'Always',
+                'restartPolicy': 'OnFailure',
+                'runtimeClassName': 'nvidia'
             }
         }
     }
@@ -684,10 +687,11 @@ def post(self, request, format=None):
                         {'name': 'MODEL_LOGGER_TOPIC', 'value': str(settings.MODEL_LOGGER_TOPIC)},
                         {'name': 'FEDERATED_STRING_ID', 'value': str(federated_string_id)}
                     ],
-                    'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
+                    'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
                 }],
-                'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
-                'restartPolicy': 'OnFailure'
+                'imagePullPolicy': 'Always',
+                'restartPolicy': 'OnFailure',
+                'runtimeClassName': 'nvidia'
             }
         }
     }
@@ -746,10 +750,11 @@ def post(self, request, format=None):
                         {'name': 'NVIDIA_VISIBLE_DEVICES', 'value': "all"}, ## (Sharing GPU)
                         {'name': 'CASE', 'value': str(case)}
                     ],
-                    'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
+                    'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
                 }],
-                'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
-                'restartPolicy': 'OnFailure'
+                'imagePullPolicy': 'Always',
+                'restartPolicy': 'OnFailure',
+                'runtimeClassName': 'nvidia'
             }
         }
     }
@@ -796,10 +801,11 @@ def post(self, request, format=None):
                         {'name': 'MODEL_LOGGER_TOPIC', 'value': str(settings.MODEL_LOGGER_TOPIC)},
                         {'name': 'FEDERATED_STRING_ID', 'value': str(federated_string_id)}
                     ],
-                    'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
+                    'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
                 }],
-                'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
-                'restartPolicy': 'OnFailure'
+                'imagePullPolicy': 'Always',
+                'restartPolicy': 'OnFailure',
+                'runtimeClassName': 'nvidia'
             }
         }
     }
@@ -839,10 +845,11 @@ def post(self, request, format=None):
                         {'name': 'STREAM_TIMEOUT', 'value': str(deployment.stream_timeout) if not deployment.indefinite else str(-1)},
                         {'name': 'IMPROVEMENT', 'value': str(deployment.improvement)}
                     ],
-                    'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
+                    'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
                 }],
-                'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
-                'restartPolicy': 'OnFailure'
+                'imagePullPolicy': 'Always',
+                'restartPolicy': 'OnFailure',
+                'runtimeClassName': 'nvidia'
             }
         }
     }
@@ -891,10 +898,11 @@ def post(self, request, format=None):
                         {'name': 'MODEL_LOGGER_TOPIC', 'value': str(settings.MODEL_LOGGER_TOPIC)},
                         {'name': 'FEDERATED_STRING_ID', 'value': str(federated_string_id)}
                     ],
-                    'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
+                    'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
                 }],
-                'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
-                'restartPolicy': 'OnFailure'
+                'imagePullPolicy': 'Always',
+                'restartPolicy': 'OnFailure',
+                'runtimeClassName': 'nvidia'
             }
         }
     }
@@ -1540,10 +1548,10 @@ def post(self, request, pk, format=None):
                         {'name': 'GROUP_ID', 'value': 'inf'+str(result.id)},
                         {'name': 'NVIDIA_VISIBLE_DEVICES', 'value': "all"} ## (Sharing GPU)
                     ],
-                    'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
-                    #'resources': {'limits':{'nvidia.com/gpu': 1}} ## (Greedy GPU)
+                    'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
                 }],
-                'imagePullPolicy': 'IfNotPresent' # TODO: Remove this when the image is in DockerHub
+                'imagePullPolicy': 'Always',
+                'runtimeClassName': 'nvidia'
             }
         }
     }
@@ -1593,10 +1601,10 @@ def post(self, request, pk, format=None):
                         {'name': 'LIMIT', 'value': str(inference.limit)},
                         {'name': 'NVIDIA_VISIBLE_DEVICES', 'value': "all"} ## (Sharing GPU)
                     ],
-                    'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
-                    #'resources': {'limits':{'nvidia.com/gpu': 1}} ## (Greedy GPU)
+                    'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
                 }],
-                'imagePullPolicy': 'IfNotPresent' # TODO: Remove this when the image is in DockerHub
+                'imagePullPolicy': 'Always',
+                'runtimeClassName': 'nvidia'
             }
         }
     }

diff --git a/kustomize/README.md b/kustomize/README.md
index 1926913..a905872 100644
--- a/kustomize/README.md
+++ b/kustomize/README.md
@@ -3,15 +3,16 @@
 This folder contains multiple Kustomize files to ease the deployment on Kubernetes.
 
 Notably the following versions are available:
 
-| Version      | Resource URL                                               |
-| ------------ | ---------------------------------------------------------- |
-| `master`     | `github.com/ertis-research/kafka-ml/kustomize/master`      |
-| `master-gpu` | `github.com/ertis-research/kafka-ml/kustomize/master-gpu`  |
-| `v1.0`       | `github.com/ertis-research/kafka-ml/kustomize/v1.0`        |
-| `v1.0-gpu`   | `github.com/ertis-research/kafka-ml/kustomize/v1.0-gpu`    |
-| `v1.1`       | `github.com/ertis-research/kafka-ml/kustomize/v1.1`        |
-| `v1.1-gpu`   | `github.com/ertis-research/kafka-ml/kustomize/v1.1-gpu`    |
-| `local`      | `github.com/ertis-research/kafka-ml/kustomize/local`       |
+| Version           | Resource URL                                                   |
+| ----------------- | -------------------------------------------------------------- |
+| `master`          | `github.com/ertis-research/kafka-ml/kustomize/master`          |
+| `master-gpu`      | `github.com/ertis-research/kafka-ml/kustomize/master-gpu`      |
+| `v1.0`            | `github.com/ertis-research/kafka-ml/kustomize/v1.0`            |
+| `v1.0-gpu`        | `github.com/ertis-research/kafka-ml/kustomize/v1.0-gpu`        |
+| `v1.1`            | `github.com/ertis-research/kafka-ml/kustomize/v1.1`            |
+| `v1.1-gpu`        | `github.com/ertis-research/kafka-ml/kustomize/v1.1-gpu`        |
+| `v1.1-gpu-nvidia` | `github.com/ertis-research/kafka-ml/kustomize/v1.1-gpu-nvidia` |
+| `local`           | `github.com/ertis-research/kafka-ml/kustomize/local`           |
 
 These versions should work with any Kubernetes compatible cluster, such as K8s and K3s.

diff --git a/kustomize/v1.1-gpu-nvidia/kustomization.yaml b/kustomize/v1.1-gpu-nvidia/kustomization.yaml
new file mode 100644
index 0000000..624333c
--- /dev/null
+++ b/kustomize/v1.1-gpu-nvidia/kustomization.yaml
@@ -0,0 +1,17 @@
+resources:
+  - "../v1.1"
+
+configMapGenerator:
+  - name: kafkaml-configmap
+    behavior: merge
+    literals:
+      - tensorflow.training.image=ertis/kafka-ml-tensorflow_model_training-gpu:v1.1
+      - tensorflow.inference.image=ertis/kafka-ml-tensorflow_model_inference-gpu:v1.1
+      - pytorch.training.image=ertis/kafka-ml-pytorch_model_training-gpu:v1.1
+      - pytorch.inference.image=ertis/kafka-ml-pytorch_model_inference-gpu:v1.1
+
+images:
+  - name: ertis/kafka-ml-pthexecutor
+    newName: ertis/kafka-ml-pthexecutor-gpu
+  - name: ertis/kafka-ml-tfexecutor
+    newName: ertis/kafka-ml-tfexecutor-gpu
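
Deployment note (outside the diff): unlike aliyun.com/gpu-mem, which requested
a slice of GPU memory, nvidia.com/gpu requests whole devices as an integer
count, so gpu_mem_to_allocate is now interpreted as a number of GPUs;
fractional sharing requires time-slicing or MIG to be configured in the NVIDIA
device plugin. The new 'runtimeClassName': 'nvidia' also assumes a
RuntimeClass named "nvidia" exists in the cluster. The NVIDIA GPU Operator
normally creates it; on a manual NVIDIA Container Toolkit install it can be
created by hand, roughly as in this sketch (the handler value assumes the
toolkit registered a runtime named "nvidia" with containerd/CRI-O):

    # Minimal RuntimeClass sketch backing 'runtimeClassName': 'nvidia'.
    # Assumes the NVIDIA Container Toolkit has registered a runtime handler
    # named "nvidia" with the node's container runtime; adjust "handler"
    # if your setup differs.
    apiVersion: node.k8s.io/v1
    kind: RuntimeClass
    metadata:
      name: nvidia
    handler: nvidia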
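
Usage note (outside the diff): the new overlay is consumed like the others
listed in kustomize/README.md. A minimal sketch of a downstream kustomization
referencing it remotely (the file location is illustrative):

    # kustomization.yaml in a downstream cluster repository; pulls the
    # GPU-enabled v1.1 overlay directly from GitHub.
    resources:
      - github.com/ertis-research/kafka-ml/kustomize/v1.1-gpu-nvidia

Equivalently, it can be applied in one step with
kubectl apply -k github.com/ertis-research/kafka-ml/kustomize/v1.1-gpu-nvidia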