Skip to content

Commit

Permalink
Configure DCGM depending on k8s provider (#29)
Browse files Browse the repository at this point in the history
* Configure DCGM depending on k8s provider

EKS and GKE differ slightly in how they
allow the DCGM exporter to attach workload labels
to the metrics it collects.

The Helm chart is updated to dynamically
apply the required properties based on the provider,
and some unneeded values are removed.
  • Loading branch information
atanasovskib authored Aug 22, 2024
1 parent 8ee5cac commit 7bd71a0
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 79 deletions.
2 changes: 1 addition & 1 deletion charts/gpu-metrics-exporter/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

{{/*
Common labels
GPU metrics exporter labels
*/}}
{{- define "gpu-metrics-exporter.labels" -}}
helm.sh/chart: {{ include "gpu-metrics-exporter.chart" . }}
Expand Down
121 changes: 65 additions & 56 deletions charts/gpu-metrics-exporter/templates/daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,68 +5,63 @@ metadata:
namespace: {{ .Release.Namespace }}
labels:
{{- include "gpu-metrics-exporter.labels" . | nindent 4 }}
{{- if .Values.dcgmExporter.enabled }}
annotations:
{{- if .Values.dcgmExporter.enabled }}
ignore-check.kube-linter.io/privileged-container: "This daemon set needs to run DCGM Exporter as privileged to access the GPU metrics."
ignore-check.kube-linter.io/run-as-non-root: "This daemon set needs to run DCGM Exporter as root to access the GPU metrics."
ignore-check.kube-linter.io/privilege-escalation-container: "This daemon set needs escalate privileges for DCGM Exporter to access the GPU metrics."
ignore-check.kube-linter.io/no-read-only-root-fs: "This daemon set needs to run DCGM Exporter with read-only root filesystem to access the GPU utilization metrics."
{{- end }}
ignore-check.kube-linter.io/privilege-escalation-container: "This daemon set needs escalate privileges for DCGM Exporter."
ignore-check.kube-linter.io/no-read-only-root-fs: "This daemon set needs to run DCGM Exporter with read-only root filesystem."
{{- end }}
spec:
selector:
matchLabels:
{{- include "gpu-metrics-exporter.selectorLabels" . | nindent 6 }}
template:
metadata:
{{- with .Values.podAnnotations }}
annotations:
{{- toYaml . | nindent 8 }}
{{- end }}
labels:
{{- include "gpu-metrics-exporter.labels" . | nindent 8 }}
{{- with .Values.podLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- include "gpu-metrics-exporter.selectorLabels" . | nindent 8 }}
spec:
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
{{- if eq (required ".Values.provider is required (gke|eks|aks)" .Values.provider) "eks" }}
priorityClassName: system-node-critical
{{- end }}
serviceAccountName: {{ include "gpu-metrics-exporter.serviceAccountName" . }}
{{- if .Values.dcgmExporter.enabled }}
volumes:
{{- if .Values.dcgmExporter.enabled }}
- name: "pod-gpu-resources"
hostPath:
path: /var/lib/kubelet/pod-resources
- name: {{- include "dcgm-exporter.config-map" . | indent 1 }}
configMap:
name: {{- include "dcgm-exporter.config-map" . | indent 1 }}
- name: nvidia-install-dir-host
{{- if eq .Values.provider "gke" }}
- name: "nvidia-install-dir-host"
hostPath:
path: /home/kubernetes/bin/nvidia
- name: pod-resources
hostPath:
path: /var/lib/kubelet/pod-resources
{{- end }}
{{- with .Values.additionalVolumes }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "gpu-metrics-exporter.serviceAccountName" . }}
{{- with .Values.affinity }}
{{- if eq .Values.provider "gke"}}
{{- with .Values.gke.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end}}
containers:
- name: {{ .Chart.Name }}
- name: castai-gpu-metrics-exporter
securityContext:
{{- toYaml .Values.gpuMetricsExporter.securityContext | nindent 12 }}
{{- toYaml .Values.gpuMetricsExporter.securityContext | nindent 12 }}
image: "{{ .Values.gpuMetricsExporter.image.repository }}:{{ .Values.gpuMetricsExporter.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.gpuMetricsExporter.image.pullPolicy }}
ports:
- name: http
containerPort: {{ .Values.service.port }}
containerPort: {{ .Values.gpuMetricsExporter.port }}
protocol: TCP
livenessProbe:
httpGet:
Expand All @@ -79,61 +74,75 @@ spec:
envFrom:
- configMapRef:
name: {{- include "gpu-metrics-exporter.config-map" . | indent 1}}
{{- if .Values.dcgmExporter.enabled}}
{{- if .Values.dcgmExporter.enabled }}
env:
- name: "DCGM_HOST"
value: "localhost"
{{- end }}
resources:
{{- toYaml .Values.gpuMetricsExporter.resources | nindent 12 }}
{{- with .Values.gpuMetricsExporter.volumeMounts }}
volumeMounts:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- if .Values.dcgmExporter.enabled }}
- name: dcgm-exporter
image: "{{ .Values.dcgmExporter.image.repository}}:{{ .Values.dcgmExporter.image.tag }}"
imagePullPolicy: {{ .Values.dcgmExporter.image.pullPolicy }}
securityContext:
capabilities:
add:
- SYS_ADMIN
drop:
- NET_RAW
runAsNonRoot: false
runAsUser: 0
{{- if eq .Values.provider "gke"}}
privileged: true
{{- end }}
image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.7-3.5.0-ubuntu22.04"
imagePullPolicy: "IfNotPresent"
command: [ "/bin/bash", "-c" ]
args:
{{- if .Values.dcgmExporter.useExternalHostEngine }}
- hostname $NODE_NAME; for ((;;)) { r=$(dcgm-exporter --remote-hostengine-info $(NODE_IP) --collectors /etc/dcgm-exporter/counters.csv); echo "dcgm-exporter could not run"; sleep 30 ; }
{{- if eq .Values.provider "gke"}}
{{- if .Values.dcgmExporter.useExternalHostEngine }}
- hostname $NODE_NAME; dcgm-exporter --remote-hostengine-info $(NODE_IP) -f /etc/dcgm-exporter/counters.csv
{{- else }}
- hostname $NODE_NAME; dcgm-exporter -f /etc/dcgm-exporter/counters.csv
{{- end }}
{{- else }}
- hostname $NODE_NAME; for ((;;)) { r=$(dcgm-exporter --collectors /etc/dcgm-exporter/counters.csv); echo "dcgm-exporter could not run"; sleep 30 ; }
{{- if .Values.dcgmExporter.useExternalHostEngine }}
- hostname $NODE_NAME; for ((;;)) { r=$(dcgm-exporter --remote-hostengine-info $(NODE_IP) -f /etc/dcgm-exporter/counters.csv); echo "dcgm-exporter could not run"; sleep 60 ; }
{{- else }}
- hostname $NODE_NAME; for ((;;)) { r=$(dcgm-exporter -f /etc/dcgm-exporter/counters.csv); echo "dcgm-exporter could not run"; sleep 60 ; }
{{- end }}
{{- end }}
ports:
- name: metrics
- name: "metrics"
containerPort: 9400
securityContext:
privileged: true
env:
- name: NODE_NAME
- name: "DCGM_EXPORTER_KUBERNETES"
value: "true"
- name: "DCGM_EXPORTER_LISTEN"
value: ":9400"
- name: "DCGM_EXPORTER_INTERVAL"
value: "5000"
- name: "NODE_NAME"
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: NODE_IP
{{- if eq .Values.provider "gke" }}
- name: "NODE_IP"
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: "LD_LIBRARY_PATH"
value: "/usr/local/nvidia/lib64"
- name: "DCGM_EXPORTER_KUBERNETES_GPU_ID_TYPE"
value: "device-name"
- name: LD_LIBRARY_PATH
value: /usr/local/nvidia/lib64
- name: DCGM_EXPORTER_KUBERNETES
value: 'true'
- name: DCGM_EXPORTER_LISTEN
value: ':9400'
- name: DCGM_EXPORTER_INTERVAL
value: '5000'
{{- with .Values.dcgmExporter.additionalEnv }}
{{- toYaml . | nindent 12 }}
{{- end }}
volumeMounts:
- name: {{- include "dcgm-exporter.config-map" . | indent 1 }}
mountPath: "/etc/dcgm-exporter"
- name: "pod-gpu-resources"
readOnly: true
- name: nvidia-install-dir-host
mountPath: "/var/lib/kubelet/pod-resources"
{{- if eq .Values.provider "gke" }}
- name: "nvidia-install-dir-host"
mountPath: /usr/local/nvidia
- name: pod-resources
mountPath: /var/lib/kubelet/pod-resources
{{- end }}
- name: {{- include "dcgm-exporter.config-map" . | indent 1 }}
mountPath: "/etc/dcgm-exporter"
{{- end }}
7 changes: 0 additions & 7 deletions charts/gpu-metrics-exporter/values-gke.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +0,0 @@
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-accelerator
operator: Exists
28 changes: 13 additions & 15 deletions charts/gpu-metrics-exporter/values.yaml
Original file line number Diff line number Diff line change
@@ -1,20 +1,11 @@
provider: "" # gke | eks | aks
imagePullSecrets: []

podAnnotations: {}
podLabels: {}

podSecurityContext: {}

service:
type: ClusterIP
port: 6061

additionalVolumes: []

serviceAccount:
create: true
automount: true
annotations: {}

gpuMetricsExporter:
image:
repository: ghcr.io/castai/gpu-metrics-exporter/gpu-metrics-exporter
Expand All @@ -31,14 +22,21 @@ gpuMetricsExporter:
securityContext:
readOnlyRootFilesystem: true
runAsNonRoot: true
volumeMounts: []
port: 6061

dcgmExporter:
enabled: true
arguments: ["-f", "/etc/dcgm-exporter/dcp-metrics-included.csv"]
image:
repository: nvcr.io/nvidia/k8s/dcgm-exporter
pullPolicy: IfNotPresent
tag: 3.3.6-3.4.2-ubuntu22.04
tag: 3.3.7-3.5.0-ubuntu22.04
useExternalHostEngine: false
additionalEnv: []

gke:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-accelerator
operator: Exists

0 comments on commit 7bd71a0

Please sign in to comment.