From 0f15c7cb3f0d7d7f8fdcd449045bff190f9fb1b5 Mon Sep 17 00:00:00 2001 From: Blagoj Atanasovski Date: Fri, 16 Aug 2024 09:22:15 +0200 Subject: [PATCH] Configure DCGM depending on k8s provider EKS and GKE work slightly differently in allowing DCGM exporter to attach workload labels to the metrics it collects. The Helm chart is updated to dynamically apply the required properties based on the provider, and some unneeded values are removed. --- .../templates/_helpers.tpl | 2 +- .../templates/daemonset.yaml | 123 +++++++++--------- charts/gpu-metrics-exporter/values-gke.yaml | 7 - charts/gpu-metrics-exporter/values.yaml | 28 ++-- 4 files changed, 78 insertions(+), 82 deletions(-) diff --git a/charts/gpu-metrics-exporter/templates/_helpers.tpl b/charts/gpu-metrics-exporter/templates/_helpers.tpl index bff57cc..2d69dae 100644 --- a/charts/gpu-metrics-exporter/templates/_helpers.tpl +++ b/charts/gpu-metrics-exporter/templates/_helpers.tpl @@ -50,7 +50,7 @@ app.kubernetes.io/instance: {{ .Release.Name }} {{- end }} {{/* -Common labels +GPU metrics exporter labels */}} {{- define "gpu-metrics-exporter.labels" -}} helm.sh/chart: {{ include "gpu-metrics-exporter.chart" . }} diff --git a/charts/gpu-metrics-exporter/templates/daemonset.yaml b/charts/gpu-metrics-exporter/templates/daemonset.yaml index 9a609ae..74a7100 100644 --- a/charts/gpu-metrics-exporter/templates/daemonset.yaml +++ b/charts/gpu-metrics-exporter/templates/daemonset.yaml @@ -5,68 +5,61 @@ metadata: namespace: {{ .Release.Namespace }} labels: {{- include "gpu-metrics-exporter.labels" . | nindent 4 }} - annotations: - {{- if .Values.dcgmExporter.enabled }} - ignore-check.kube-linter.io/privileged-container: "This daemon set needs to run DCGM Exporter as privileged to access the GPU metrics." - ignore-check.kube-linter.io/run-as-non-root: "This daemon set needs to run DCGM Exporter as root to access the GPU metrics." 
- ignore-check.kube-linter.io/privilege-escalation-container: "This daemon set needs escalate privileges for DCGM Exporter to access the GPU metrics." - ignore-check.kube-linter.io/no-read-only-root-fs: "This daemon set needs to run DCGM Exporter with read-only root filesystem to access the GPU utilization metrics." - {{- end }} spec: + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 + maxSurge: 0 selector: matchLabels: {{- include "gpu-metrics-exporter.selectorLabels" . | nindent 6 }} template: metadata: - {{- with .Values.podAnnotations }} - annotations: - {{- toYaml . | nindent 8 }} - {{- end }} labels: - {{- include "gpu-metrics-exporter.labels" . | nindent 8 }} - {{- with .Values.podLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} + {{- include "gpu-metrics-exporter.selectorLabels" . | nindent 8 }} spec: {{- with .Values.imagePullSecrets }} imagePullSecrets: {{- toYaml . | nindent 8 }} {{- end }} - securityContext: - {{- toYaml .Values.podSecurityContext | nindent 8 }} + {{- if eq (required ".Values.provider is required (gke|eks|aks)" .Values.provider) "eks" }} + priorityClassName: system-node-critical + {{- end }} + serviceAccountName: {{ include "gpu-metrics-exporter.serviceAccountName" . }} + {{- if .Values.dcgmExporter.enabled }} volumes: - {{- if .Values.dcgmExporter.enabled }} + - name: "pod-gpu-resources" + hostPath: + path: /var/lib/kubelet/pod-resources - name: {{- include "dcgm-exporter.config-map" . | indent 1 }} configMap: name: {{- include "dcgm-exporter.config-map" . | indent 1 }} - - name: nvidia-install-dir-host + {{- if eq .Values.provider "gke" }} + - name: "nvidia-install-dir-host" hostPath: path: /home/kubernetes/bin/nvidia - - name: pod-resources - hostPath: - path: /var/lib/kubelet/pod-resources {{- end }} - {{- with .Values.additionalVolumes }} - {{- toYaml . | nindent 8 }} {{- end }} {{- with .Values.nodeSelector }} nodeSelector: {{- toYaml . 
| nindent 8 }} {{- end }} - serviceAccountName: {{ include "gpu-metrics-exporter.serviceAccountName" . }} - {{- with .Values.affinity }} + {{- if eq .Values.provider "gke"}} + {{- with .Values.gke.affinity }} affinity: {{- toYaml . | nindent 8 }} {{- end }} + {{- end}} containers: - - name: {{ .Chart.Name }} + - name: castai-gpu-metrics-exporter securityContext: - {{- toYaml .Values.gpuMetricsExporter.securityContext | nindent 12 }} + {{- toYaml .Values.gpuMetricsExporter.securityContext | nindent 12 }} image: "{{ .Values.gpuMetricsExporter.image.repository }}:{{ .Values.gpuMetricsExporter.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.gpuMetricsExporter.image.pullPolicy }} ports: - name: http - containerPort: {{ .Values.service.port }} + containerPort: {{ .Values.gpuMetricsExporter.port }} protocol: TCP livenessProbe: httpGet: @@ -79,61 +72,73 @@ spec: envFrom: - configMapRef: name: {{- include "gpu-metrics-exporter.config-map" . | indent 1}} - {{- if .Values.dcgmExporter.enabled}} + {{- if .Values.dcgmExporter.enabled }} env: - name: "DCGM_HOST" value: "localhost" {{- end }} resources: {{- toYaml .Values.gpuMetricsExporter.resources | nindent 12 }} - {{- with .Values.gpuMetricsExporter.volumeMounts }} - volumeMounts: - {{- toYaml . 
| nindent 12 }} - {{- end }} {{- if .Values.dcgmExporter.enabled }} - name: dcgm-exporter - image: "{{ .Values.dcgmExporter.image.repository}}:{{ .Values.dcgmExporter.image.tag }}" - imagePullPolicy: {{ .Values.dcgmExporter.image.pullPolicy }} + securityContext: + capabilities: + add: + - SYS_ADMIN + runAsNonRoot: false + runAsUser: 0 + {{- if eq .Values.provider "gke"}} + privileged: true + {{- end }} + image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.7-3.5.0-ubuntu22.04" + imagePullPolicy: "IfNotPresent" command: [ "/bin/bash", "-c" ] args: - {{- if .Values.dcgmExporter.useExternalHostEngine }} - - hostname $NODE_NAME; for ((;;)) { r=$(dcgm-exporter --remote-hostengine-info $(NODE_IP) --collectors /etc/dcgm-exporter/counters.csv); echo "dcgm-exporter could not run"; sleep 30 ; } + {{- if eq .Values.provider "gke"}} + {{- if .Values.dcgmExporter.useExternalHostEngine }} + - hostname $NODE_NAME; dcgm-exporter --remote-hostengine-info $(NODE_IP) -f /etc/dcgm-exporter/counters.csv + {{- else }} + - hostname $NODE_NAME; dcgm-exporter -f /etc/dcgm-exporter/counters.csv + {{- end }} {{- else }} - - hostname $NODE_NAME; for ((;;)) { r=$(dcgm-exporter --collectors /etc/dcgm-exporter/counters.csv); echo "dcgm-exporter could not run"; sleep 30 ; } + {{- if .Values.dcgmExporter.useExternalHostEngine }} + - hostname $NODE_NAME; for ((;;)) { r=$(dcgm-exporter --remote-hostengine-info $(NODE_IP) -f /etc/dcgm-exporter/counters.csv); echo "dcgm-exporter could not run"; sleep 60 ; } + {{- else }} + - hostname $NODE_NAME; for ((;;)) { r=$(dcgm-exporter -f /etc/dcgm-exporter/counters.csv); echo "dcgm-exporter could not run"; sleep 60 ; } + {{- end }} {{- end }} ports: - - name: metrics + - name: "metrics" containerPort: 9400 - securityContext: - privileged: true env: - - name: NODE_NAME + - name: "DCGM_EXPORTER_KUBERNETES" + value: "true" + - name: "DCGM_EXPORTER_LISTEN" + value: ":9400" + - name: "DCGM_EXPORTER_INTERVAL" + value: "5000" + - name: "NODE_NAME" valueFrom: fieldRef: 
fieldPath: spec.nodeName - - name: NODE_IP + {{- if eq .Values.provider "gke" }} + - name: "NODE_IP" valueFrom: fieldRef: fieldPath: status.hostIP + - name: "LD_LIBRARY_PATH" + value: "/usr/local/nvidia/lib64" - name: "DCGM_EXPORTER_KUBERNETES_GPU_ID_TYPE" value: "device-name" - - name: LD_LIBRARY_PATH - value: /usr/local/nvidia/lib64 - - name: DCGM_EXPORTER_KUBERNETES - value: 'true' - - name: DCGM_EXPORTER_LISTEN - value: ':9400' - - name: DCGM_EXPORTER_INTERVAL - value: '5000' - {{- with .Values.dcgmExporter.additionalEnv }} - {{- toYaml . | nindent 12 }} {{- end }} volumeMounts: - - name: {{- include "dcgm-exporter.config-map" . | indent 1 }} - mountPath: "/etc/dcgm-exporter" + - name: "pod-gpu-resources" readOnly: true - - name: nvidia-install-dir-host + mountPath: "/var/lib/kubelet/pod-resources" + {{- if eq .Values.provider "gke" }} + - name: "nvidia-install-dir-host" mountPath: /usr/local/nvidia - - name: pod-resources - mountPath: /var/lib/kubelet/pod-resources + {{- end }} + - name: {{- include "dcgm-exporter.config-map" . 
| indent 1 }} + mountPath: "/etc/dcgm-exporter" {{- end }} diff --git a/charts/gpu-metrics-exporter/values-gke.yaml b/charts/gpu-metrics-exporter/values-gke.yaml index da99913..e69de29 100644 --- a/charts/gpu-metrics-exporter/values-gke.yaml +++ b/charts/gpu-metrics-exporter/values-gke.yaml @@ -1,7 +0,0 @@ -affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: cloud.google.com/gke-accelerator - operator: Exists diff --git a/charts/gpu-metrics-exporter/values.yaml b/charts/gpu-metrics-exporter/values.yaml index a15a61e..5ac9ace 100644 --- a/charts/gpu-metrics-exporter/values.yaml +++ b/charts/gpu-metrics-exporter/values.yaml @@ -1,20 +1,11 @@ +provider: "" # gke | eks | aks imagePullSecrets: [] -podAnnotations: {} -podLabels: {} - -podSecurityContext: {} - -service: - type: ClusterIP - port: 6061 - -additionalVolumes: [] - serviceAccount: create: true automount: true annotations: {} + gpuMetricsExporter: image: repository: ghcr.io/castai/gpu-metrics-exporter/gpu-metrics-exporter @@ -31,14 +22,21 @@ gpuMetricsExporter: securityContext: readOnlyRootFilesystem: true runAsNonRoot: true - volumeMounts: [] + port: 6061 dcgmExporter: enabled: true - arguments: ["-f", "/etc/dcgm-exporter/dcp-metrics-included.csv"] image: repository: nvcr.io/nvidia/k8s/dcgm-exporter pullPolicy: IfNotPresent - tag: 3.3.6-3.4.2-ubuntu22.04 + tag: 3.3.7-3.5.0-ubuntu22.04 useExternalHostEngine: false - additionalEnv: [] + +gke: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-accelerator + operator: Exists