Skip to content

Commit

Permalink
Add deployment and don't send metrics if no url to scrape, or no metrics scraped
Browse files Browse the repository at this point in the history
  • Loading branch information
Ivaka committed Mar 19, 2024
1 parent 0e167c6 commit 596ee6e
Show file tree
Hide file tree
Showing 13 changed files with 282 additions and 147 deletions.
6 changes: 6 additions & 0 deletions charts/gpu-metrics-exporter/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Helm chart metadata for the gpu-metrics-exporter chart.
apiVersion: v2
name: gpu-metrics-exporter
# NOTE(review): generic description - consider stating what the exporter does.
description: A Helm chart for Kubernetes
type: application
# Chart version; the chart helper embeds it in the helm.sh/chart label.
version: 0.1.0
# Used as the default image tag when image.tag is empty, and as the
# app.kubernetes.io/version label.
# NOTE(review): "1.16.0" looks like a scaffold placeholder - confirm that an
# image with this tag is actually published.
appVersion: "1.16.0"
62 changes: 62 additions & 0 deletions charts/gpu-metrics-exporter/templates/_helpers.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
{{/*
Expand the name of the chart.
Uses .Values.nameOverride when it is set, otherwise .Chart.Name. The result is
truncated to 63 characters and a trailing "-" is stripped.
*/}}
{{- define "gpu-metrics-exporter.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Create chart name and version as used by the chart label.
Produces "<chart name>-<chart version>" with every "+" replaced by "_",
truncated to 63 characters and with a trailing "-" stripped.
*/}}
{{- define "gpu-metrics-exporter.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Create the name of the service account to use.
When .Values.serviceAccount.create is true: .Values.serviceAccount.name if set,
otherwise the chart's fully qualified name.
When it is false: .Values.serviceAccount.name if set, otherwise "default".
*/}}
{{- define "gpu-metrics-exporter.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "gpu-metrics-exporter.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
Resolution order:
  1. .Values.fullnameOverride, when set.
  2. The release name alone, when it already contains the chart name
     (or .Values.nameOverride, when that is set).
  3. "<release name>-<chart name or nameOverride>" otherwise.
Every result is truncated to 63 characters and has a trailing "-" stripped.
*/}}
{{- define "gpu-metrics-exporter.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}

{{/*
Selector labels.
The name/instance pair used by the Deployment's matchLabels and included in
the common labels: the chart name (or nameOverride) and the release name.
*/}}
{{- define "gpu-metrics-exporter.selectorLabels" -}}
app.kubernetes.io/name: {{ include "gpu-metrics-exporter.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

{{/*
Common labels.
Combines the helm.sh/chart label, the selector labels, the app version
(only when .Chart.AppVersion is set) and the managed-by label taken from
.Release.Service.
*/}}
{{- define "gpu-metrics-exporter.labels" -}}
helm.sh/chart: {{ include "gpu-metrics-exporter.chart" . }}
{{ include "gpu-metrics-exporter.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
8 changes: 8 additions & 0 deletions charts/gpu-metrics-exporter/templates/configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# ConfigMap holding the exporter's settings. The Deployment injects every key
# as an environment variable through envFrom, so the name below must stay in
# sync with the configMapRef in the deployment template.
apiVersion: v1
kind: ConfigMap
metadata:
  name: gpu-metrics-exporter-config
  labels:
    {{- include "gpu-metrics-exporter.labels" . | nindent 4 }}
data:
  {{- /* ConfigMap data values must be strings, so each value is quoted
         explicitly; numbers and booleans from values.yaml would otherwise be
         rendered as non-string YAML scalars and rejected by the API server. */}}
  {{- range $key, $value := .Values.config }}
  {{ $key }}: {{ $value | quote }}
  {{- end }}
72 changes: 72 additions & 0 deletions charts/gpu-metrics-exporter/templates/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Deployment that runs the gpu-metrics-exporter container. Its name and labels
# come from the chart helpers.
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "gpu-metrics-exporter.fullname" . }}
labels:
{{- include "gpu-metrics-exporter.labels" . | nindent 4 }}
spec:
# No replicas field is set here, so the Kubernetes default applies.
selector:
matchLabels:
{{- include "gpu-metrics-exporter.selectorLabels" . | nindent 6 }}
template:
metadata:
{{- with .Values.podAnnotations }}
annotations:
{{- toYaml . | nindent 8 }}
{{- end }}
# Pod labels: the common chart labels plus any extra podLabels from values.
labels:
{{- include "gpu-metrics-exporter.labels" . | nindent 8 }}
{{- with .Values.podLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
spec:
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "gpu-metrics-exporter.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
containers:
- name: {{ .Chart.Name }}
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
# The image tag falls back to the chart's appVersion when image.tag is empty.
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
# NOTE(review): the container port comes from service.port, while the process's
# listen port is configured separately (HTTP_LISTEN_PORT, default 6061) - keep
# the two in sync.
ports:
- name: http
containerPort: {{ .Values.service.port }}
protocol: TCP
# Liveness and readiness both issue GET /healthz against the named http port.
livenessProbe:
httpGet:
path: /healthz
port: http
readinessProbe:
httpGet:
path: /healthz
port: http
# Every key of the chart's ConfigMap is injected as an environment variable.
envFrom:
- configMapRef:
name: gpu-metrics-exporter-config
resources:
{{- toYaml .Values.resources | nindent 12 }}
# The sections below are rendered only when the matching value is set.
{{- with .Values.volumeMounts }}
volumeMounts:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- with .Values.volumes }}
volumes:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
44 changes: 44 additions & 0 deletions charts/gpu-metrics-exporter/templates/rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{{- if .Values.serviceAccount.create -}}
# ServiceAccount the exporter pod runs as, plus cluster-wide read access to
# pods. Everything in this file is rendered only when serviceAccount.create
# is true.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "gpu-metrics-exporter.serviceAccountName" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "gpu-metrics-exporter.labels" . | nindent 4 }}
  {{- with .Values.serviceAccount.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
# Controlled by serviceAccount.automount in values.yaml.
automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
---
# Grants get/list on pods in every namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "gpu-metrics-exporter.fullname" . }}
  labels:
    {{- include "gpu-metrics-exporter.labels" . | nindent 4 }}
rules:
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - get
      - list
---
# Binds the ClusterRole above to the chart's ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ include "gpu-metrics-exporter.fullname" . }}
  labels:
    {{- include "gpu-metrics-exporter.labels" . | nindent 4 }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ include "gpu-metrics-exporter.fullname" . }}
subjects:
  - kind: ServiceAccount
    name: {{ include "gpu-metrics-exporter.serviceAccountName" . }}
    namespace: {{ .Release.Namespace }}
{{- end }}
32 changes: 32 additions & 0 deletions charts/gpu-metrics-exporter/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Default values for the gpu-metrics-exporter chart.

# Container image. An empty tag makes the deployment template fall back to the
# chart's appVersion.
image:
repository: ghcr.io/castai/gpu-metrics-exporter/gpu-metrics-exporter
pullPolicy: IfNotPresent
tag: ""

# Rendered into the pod spec only when non-empty.
imagePullSecrets: []

# create: gates the ServiceAccount as well as the ClusterRole and
# ClusterRoleBinding in templates/rbac.yaml.
# automount: presumably meant to control automountServiceAccountToken -
# TODO confirm the ServiceAccount template consumes it.
# annotations: added to the ServiceAccount when non-empty.
serviceAccount:
create: true
automount: true
annotations: {}

# Extra annotations and labels for the pod template.
podAnnotations: {}
podLabels: {}

# Pod-level securityContext, rendered as-is.
podSecurityContext: {}

# Container-level securityContext, rendered as-is.
securityContext: {}

# port is used as the container port in the deployment template.
service:
type: ClusterIP
port: 6061

# Resource requests and limits for the exporter container.
resources:
limits:
cpu: 100m
memory: 128Mi
requests:
cpu: 100m
memory: 128Mi

# Key/value pairs rendered into the gpu-metrics-exporter-config ConfigMap and
# exposed to the container as environment variables.
# NOTE(review): anything set here, including an API_KEY, is stored in plain
# text in a ConfigMap - consider a Secret for credentials.
config: {}
4 changes: 2 additions & 2 deletions internal/castai/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,9 @@ func (c client) UploadBatch(ctx context.Context, batch *pb.MetricsBatch) error {
case statusCode >= 200 && statusCode < 300:
return true, nil
case statusCode >= 400 && statusCode < 500:
return true, fmt.Errorf("client error: %d %s", statusCode, resp.Status())
return true, fmt.Errorf("status code: %d, status: %s", statusCode, resp.Status())
default:
c.log.Errorf("server error or unexpected response code: %d %s", statusCode, resp.Status())
c.log.Errorf("server error or unexpected status code: %d, status: %s", statusCode, resp.Status())
return false, nil
}
})
Expand Down
5 changes: 2 additions & 3 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,12 @@ import (
// Config holds the exporter's settings. Each field carries an `envconfig` tag
// naming an environment variable and, where present, a `default` tag.
//
// NOTE(review): this listing comes from a diff view that shows removed and
// added lines together, so KubeConfigPath and DCGMLabels each appear twice
// (presumably the old line followed by its replacement) - confirm against the
// real file.
type Config struct {
HTTPListenPort int `envconfig:"HTTP_LISTEN_PORT" default:"6061"`
LogLevel string `envconfig:"LOG_LEVEL" default:"info"`
KubeConfigPath string `envconfig:"KUBE_CONFIG_PATH"`
DCGMLabels map[string]string `envconfig:"DCGM_LABELS" default:"app.kubernetes.io/component:dcgm-exporter"`
KubeConfigPath string `envconfig:"KUBE_CONFIG_PATH" default:""`
DCGMLabels map[string]string `envconfig:"DCGM_LABELS" default:"app.kubernetes.io/name:dcgm-exporter"`
DCGMPort int `envconfig:"DCGM_PORT" default:"9400"`
DCGMMetricsEndpoint string `envconfig:"DCGM_METRICS_ENDPOINT" default:"/metrics"`
ExportInterval time.Duration `envconfig:"EXPORT_INTERVAL" default:"15s"`
CastAPI string `envconfig:"CAST_API" default:"https://api.cast.ai"`
APIToken string `envconfig:"API_TOKEN"`
ClusterID string `envconfig:"CLUSTER_ID"`
APIKey string `envconfig:"API_KEY"`
}
Expand Down
10 changes: 9 additions & 1 deletion internal/exporter/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,15 +107,23 @@ func (e *exporter) export(ctx context.Context) error {
dcgmExporter := dcgmExporterList.Items[i]
urls[i] = fmt.Sprintf("http://%s:%d%s", dcgmExporter.Status.PodIP, e.cfg.DCGMExporterPort, e.cfg.DCGMExporterPath)
}
if len(urls) == 0 {
e.log.Info("no dcgm-exporter instances to scrape")
return nil
}

metricFamilies, err := e.scraper.Scrape(ctx, urls)
if err != nil {
return fmt.Errorf("couldn't scrape DCGM exporters %w", err)
}
if len(metricFamilies) == 0 {
e.log.Warnf("no metrics collected from %d dcgm-exporters", len(urls))
return nil
}

batch := e.mapper.Map(metricFamilies, time.Now())
if err := e.client.UploadBatch(ctx, batch); err != nil {
return fmt.Errorf("error whlie sending metrics %d to castai %w", len(batch.Metrics), err)
return fmt.Errorf("error whlie sending %d metrics to castai %w", len(batch.Metrics), err)
}

e.log.Infof("successfully exported %d metrics", len(batch.Metrics))
Expand Down
10 changes: 2 additions & 8 deletions internal/exporter/mapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@ package exporter
import (
"time"

"google.golang.org/protobuf/types/known/timestamppb"

"github.com/castai/gpu-metrics-exporter/pb"
)

Expand Down Expand Up @@ -37,12 +35,9 @@ func (p metricMapper) Map(metricFamilyMaps []MetricFamilyMap, ts time.Time) *pb.
}
t := family.Type.String()
for _, m := range family.Metric {
labels := []*pb.Metric_Label{}
labels := make(map[string]string)
for _, l := range m.Label {
labels = append(labels, &pb.Metric_Label{
Name: *l.Name,
Value: *l.Value,
})
labels[*l.Name] = *l.Value
}
var newValue float64
switch t {
Expand All @@ -54,7 +49,6 @@ func (p metricMapper) Map(metricFamilyMaps []MetricFamilyMap, ts time.Time) *pb.

metric.Measurements = append(metric.Measurements, &pb.Metric_Measurement{
Value: newValue,
Ts: timestamppb.New(ts),
Labels: labels,
})
}
Expand Down
6 changes: 2 additions & 4 deletions internal/exporter/mapper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (

dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/require"
"google.golang.org/protobuf/types/known/timestamppb"

"github.com/castai/gpu-metrics-exporter/internal/exporter"
"github.com/castai/gpu-metrics-exporter/pb"
Expand Down Expand Up @@ -101,9 +100,8 @@ func TestMetricMapper_Map(t *testing.T) {
Measurements: []*pb.Metric_Measurement{
{
Value: 1.0,
Ts: timestamppb.New(ts),
Labels: []*pb.Metric_Label{
{Name: "label1", Value: "value1"},
Labels: map[string]string{
"label1": "value1",
},
},
},
Expand Down
Loading

0 comments on commit 596ee6e

Please sign in to comment.