Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add deployment and don't send metrics if no url to scrape #4

Merged
merged 4 commits into from
Mar 22, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions charts/gpu-metrics-exporter/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Chart metadata for the gpu-metrics-exporter Helm chart.
apiVersion: v2
name: gpu-metrics-exporter
description: A Helm chart for Kubernetes
type: application
# Version of the chart itself; it is part of the helm.sh/chart label value.
version: 0.1.0
# Version of the deployed application. The Deployment template falls back to
# this value as the image tag when .Values.image.tag is empty, and it is
# published as the app.kubernetes.io/version label.
# NOTE(review): confirm that "1.16.0" matches a published exporter image tag.
appVersion: "1.16.0"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this the real version of our app?

62 changes: 62 additions & 0 deletions charts/gpu-metrics-exporter/templates/_helpers.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
{{/*
Short name of the chart: .Values.nameOverride when it is set, otherwise the
chart's own name. The result is cut to 63 characters and a trailing "-" is
removed.
*/}}
{{- define "gpu-metrics-exporter.name" -}}
{{- .Values.nameOverride | default .Chart.Name | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Value of the helm.sh/chart label: "<chart name>-<chart version>" with every
"+" replaced by "_", cut to 63 characters and stripped of a trailing "-".
*/}}
{{- define "gpu-metrics-exporter.chart" -}}
{{- $nameAndVersion := printf "%s-%s" .Chart.Name .Chart.Version }}
{{- $nameAndVersion | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Name of the service account the workload runs as.
An explicit .Values.serviceAccount.name always wins. Without one, the chart's
full name is used when the chart creates the account, and "default" otherwise.
*/}}
{{- define "gpu-metrics-exporter.serviceAccountName" -}}
{{- $fallback := "default" }}
{{- if .Values.serviceAccount.create }}
{{- $fallback = include "gpu-metrics-exporter.fullname" . }}
{{- end }}
{{- default $fallback .Values.serviceAccount.name }}
{{- end }}

{{/*
Fully qualified name used for the chart's resources.
Resolution order:
  1. .Values.fullnameOverride, when set;
  2. the release name alone, when it already contains the chart name
     (or .Values.nameOverride);
  3. "<release name>-<chart name>" otherwise.
Whatever is chosen is cut to 63 characters, because some Kubernetes name
fields are limited to that length by the DNS naming spec, and a trailing "-"
is removed.
*/}}
{{- define "gpu-metrics-exporter.fullname" -}}
{{- $full := .Values.fullnameOverride }}
{{- if not $full }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- $full = .Release.Name }}
{{- if not (contains $name .Release.Name) }}
{{- $full = printf "%s-%s" .Release.Name $name }}
{{- end }}
{{- end }}
{{- $full | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Labels that identify the pods of this release: the chart's short name and the
release name. The Deployment uses them as its pod selector.
*/}}
{{- define "gpu-metrics-exporter.selectorLabels" -}}
{{- $appName := include "gpu-metrics-exporter.name" . -}}
app.kubernetes.io/name: {{ $appName }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

{{/*
Labels applied to the chart's resources: the chart label, the selector
labels, the application version (only when the chart declares one) and the
managing service.
*/}}
{{- define "gpu-metrics-exporter.labels" -}}
helm.sh/chart: {{ include "gpu-metrics-exporter.chart" . }}
{{ include "gpu-metrics-exporter.selectorLabels" . }}
{{- with .Chart.AppVersion }}
app.kubernetes.io/version: {{ quote . }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
8 changes: 8 additions & 0 deletions charts/gpu-metrics-exporter/templates/configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{{- /*
ConfigMap carrying the exporter's configuration. Every key under
.Values.config becomes one entry; the Deployment loads all entries as
environment variables through envFrom.

Each value is quoted explicitly: ConfigMap data must map strings to strings,
so a numeric or boolean value (for example a port number or "true") would be
rejected by the API server if it were emitted unquoted.

The name is a fixed literal because the Deployment refers to this ConfigMap
by the same literal name.
*/ -}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: gpu-metrics-exporter-config
  labels:
    {{- include "gpu-metrics-exporter.labels" . | nindent 4 }}
data:
  {{- range $key, $value := .Values.config }}
  {{ $key }}: {{ $value | quote }}
  {{- end }}
72 changes: 72 additions & 0 deletions charts/gpu-metrics-exporter/templates/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
{{- /*
Deployment running the gpu-metrics-exporter container.

The container receives its configuration as environment variables from the
"gpu-metrics-exporter-config" ConfigMap. Environment variables are read only
when a container starts, so the pod template carries a checksum of the
rendered ConfigMap: any configuration change alters the annotation and makes
the Deployment roll its pods.

The container port and both probes use .Values.service.port.
NOTE(review): the port the process actually listens on is presumably set via
the ConfigMap; confirm the two stay in sync when either is overridden.
*/ -}}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "gpu-metrics-exporter.fullname" . }}
  labels:
    {{- include "gpu-metrics-exporter.labels" . | nindent 4 }}
spec:
  selector:
    matchLabels:
      {{- include "gpu-metrics-exporter.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      annotations:
        {{- /* Changes whenever the rendered ConfigMap changes, forcing a rollout. */}}
        checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
        {{- with .Values.podAnnotations }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      labels:
        {{- include "gpu-metrics-exporter.labels" . | nindent 8 }}
        {{- with .Values.podLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    spec:
      {{- with .Values.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      serviceAccountName: {{ include "gpu-metrics-exporter.serviceAccountName" . }}
      securityContext:
        {{- toYaml .Values.podSecurityContext | nindent 8 }}
      containers:
        - name: {{ .Chart.Name }}
          securityContext:
            {{- toYaml .Values.securityContext | nindent 12 }}
          {{- /* An empty image tag falls back to the chart's appVersion. */}}
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          ports:
            - name: http
              containerPort: {{ .Values.service.port }}
              protocol: TCP
          livenessProbe:
            httpGet:
              path: /healthz
              port: http
          readinessProbe:
            httpGet:
              path: /healthz
              port: http
          envFrom:
            - configMapRef:
                name: gpu-metrics-exporter-config
          resources:
            {{- toYaml .Values.resources | nindent 12 }}
          {{- with .Values.volumeMounts }}
          volumeMounts:
            {{- toYaml . | nindent 12 }}
          {{- end }}
      {{- with .Values.volumes }}
      volumes:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
44 changes: 44 additions & 0 deletions charts/gpu-metrics-exporter/templates/rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{{- /*
ServiceAccount plus the cluster-wide read access the exporter is granted:
a ClusterRole allowing "get" and "list" on pods, bound to the ServiceAccount
through a ClusterRoleBinding.

All three resources are rendered only when .Values.serviceAccount.create is
true.
NOTE(review): with create=false and an externally managed service account, no
ClusterRole or ClusterRoleBinding is rendered either, so pod read access must
then be granted outside this chart. Confirm this is intended.
*/ -}}
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "gpu-metrics-exporter.serviceAccountName" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "gpu-metrics-exporter.labels" . | nindent 4 }}
  {{- with .Values.serviceAccount.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
{{- /* Honour the serviceAccount.automount value declared in values.yaml. */}}
automountServiceAccountToken: {{ .Values.serviceAccount.automount }}

---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "gpu-metrics-exporter.fullname" . }}
  labels:
    {{- include "gpu-metrics-exporter.labels" . | nindent 4 }}
rules:
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - get
      - list
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ include "gpu-metrics-exporter.fullname" . }}
  labels:
    {{- include "gpu-metrics-exporter.labels" . | nindent 4 }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ include "gpu-metrics-exporter.fullname" . }}
subjects:
  - kind: ServiceAccount
    name: {{ include "gpu-metrics-exporter.serviceAccountName" . }}
    namespace: {{ .Release.Namespace }}
{{- end }}
32 changes: 32 additions & 0 deletions charts/gpu-metrics-exporter/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Default values for the gpu-metrics-exporter chart.

# Container image. An empty tag makes the Deployment fall back to the chart's
# appVersion.
image:
  repository: ghcr.io/castai/gpu-metrics-exporter/gpu-metrics-exporter
  pullPolicy: IfNotPresent
  tag: ""

imagePullSecrets: []

# Overrides for the chart's short name and for the fully qualified resource
# name. Empty means "derive from the chart and release names".
nameOverride: ""
fullnameOverride: ""

serviceAccount:
  # Whether the chart creates the service account (and its RBAC resources).
  create: true
  automount: true
  annotations: {}
  # Service account name. When empty, the chart's full name is used if
  # create is true, and "default" otherwise.
  name: ""

podAnnotations: {}
podLabels: {}

podSecurityContext: {}

securityContext: {}

service:
  type: ClusterIP
  # Also used as the container port that the liveness and readiness probes hit.
  port: 6061

resources:
  limits:
    cpu: 100m
    memory: 128Mi
  requests:
    cpu: 100m
    memory: 128Mi

# Extra volumes for the pod and volume mounts for the exporter container.
volumes: []
volumeMounts: []

# Pod scheduling constraints.
nodeSelector: {}
tolerations: []
affinity: {}

# Key/value pairs rendered into the exporter's ConfigMap and loaded into the
# container as environment variables.
config: {}
4 changes: 2 additions & 2 deletions internal/castai/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,9 @@ func (c client) UploadBatch(ctx context.Context, batch *pb.MetricsBatch) error {
case statusCode >= 200 && statusCode < 300:
return true, nil
case statusCode >= 400 && statusCode < 500:
return true, fmt.Errorf("client error: %d %s", statusCode, resp.Status())
return true, fmt.Errorf("status code: %d, status: %s", statusCode, resp.Status())
default:
c.log.Errorf("server error or unexpected response code: %d %s", statusCode, resp.Status())
c.log.Errorf("server error or unexpected status code: %d, status: %s", statusCode, resp.Status())
return false, nil
}
})
Expand Down
5 changes: 2 additions & 3 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,12 @@ import (
type Config struct {
HTTPListenPort int `envconfig:"HTTP_LISTEN_PORT" default:"6061"`
LogLevel string `envconfig:"LOG_LEVEL" default:"info"`
KubeConfigPath string `envconfig:"KUBE_CONFIG_PATH"`
DCGMLabels map[string]string `envconfig:"DCGM_LABELS" default:"app.kubernetes.io/component:dcgm-exporter"`
KubeConfigPath string `envconfig:"KUBE_CONFIG_PATH" default:""`
DCGMLabels map[string]string `envconfig:"DCGM_LABELS" default:"app.kubernetes.io/name:dcgm-exporter"`
DCGMPort int `envconfig:"DCGM_PORT" default:"9400"`
DCGMMetricsEndpoint string `envconfig:"DCGM_METRICS_ENDPOINT" default:"/metrics"`
ExportInterval time.Duration `envconfig:"EXPORT_INTERVAL" default:"15s"`
CastAPI string `envconfig:"CAST_API" default:"https://api.cast.ai"`
APIToken string `envconfig:"API_TOKEN"`
ClusterID string `envconfig:"CLUSTER_ID"`
APIKey string `envconfig:"API_KEY"`
}
Expand Down
10 changes: 9 additions & 1 deletion internal/exporter/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,15 +107,23 @@ func (e *exporter) export(ctx context.Context) error {
dcgmExporter := dcgmExporterList.Items[i]
urls[i] = fmt.Sprintf("http://%s:%d%s", dcgmExporter.Status.PodIP, e.cfg.DCGMExporterPort, e.cfg.DCGMExporterPath)
}
if len(urls) == 0 {
e.log.Info("no dcgm-exporter instances to scrape")
return nil
}

metricFamilies, err := e.scraper.Scrape(ctx, urls)
if err != nil {
return fmt.Errorf("couldn't scrape DCGM exporters %w", err)
}
if len(metricFamilies) == 0 {
e.log.Warnf("no metrics collected from %d dcgm-exporters", len(urls))
return nil
}

batch := e.mapper.Map(metricFamilies, time.Now())
if err := e.client.UploadBatch(ctx, batch); err != nil {
return fmt.Errorf("error whlie sending metrics %d to castai %w", len(batch.Metrics), err)
return fmt.Errorf("error whlie sending %d metrics to castai %w", len(batch.Metrics), err)
}

e.log.Infof("successfully exported %d metrics", len(batch.Metrics))
Expand Down
15 changes: 6 additions & 9 deletions internal/exporter/mapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@ package exporter
import (
"time"

"google.golang.org/protobuf/types/known/timestamppb"

"github.com/castai/gpu-metrics-exporter/pb"
)

Expand Down Expand Up @@ -37,12 +35,12 @@ func (p metricMapper) Map(metricFamilyMaps []MetricFamilyMap, ts time.Time) *pb.
}
t := family.Type.String()
for _, m := range family.Metric {
labels := []*pb.Metric_Label{}
for _, l := range m.Label {
labels = append(labels, &pb.Metric_Label{
Name: *l.Name,
Value: *l.Value,
})
labels := make([]*pb.Metric_Label, len(m.Label))
for i, label := range m.Label {
labels[i] = &pb.Metric_Label{
Name: *label.Name,
Value: *label.Value,
}
}
var newValue float64
switch t {
Expand All @@ -54,7 +52,6 @@ func (p metricMapper) Map(metricFamilyMaps []MetricFamilyMap, ts time.Time) *pb.

metric.Measurements = append(metric.Measurements, &pb.Metric_Measurement{
Value: newValue,
Ts: timestamppb.New(ts),
Ivaka marked this conversation as resolved.
Show resolved Hide resolved
Labels: labels,
})
}
Expand Down
2 changes: 0 additions & 2 deletions internal/exporter/mapper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (

dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/require"
"google.golang.org/protobuf/types/known/timestamppb"

"github.com/castai/gpu-metrics-exporter/internal/exporter"
"github.com/castai/gpu-metrics-exporter/pb"
Expand Down Expand Up @@ -101,7 +100,6 @@ func TestMetricMapper_Map(t *testing.T) {
Measurements: []*pb.Metric_Measurement{
{
Value: 1.0,
Ts: timestamppb.New(ts),
Labels: []*pb.Metric_Label{
{Name: "label1", Value: "value1"},
},
Expand Down
Loading
Loading