From dd3001a700452c0340bdc1942eb600fd59acc296 Mon Sep 17 00:00:00 2001 From: Rohit Arora <49132604+rohit-arora-dev@users.noreply.github.com> Date: Mon, 20 May 2024 15:58:40 -0400 Subject: [PATCH] DCGM Exporter Release 3.3.6-3.4.2 (#325) Signed-off-by: Rohit Arora Co-authored-by: Vadym Fedorov --- .gitignore | 1 + Makefile | 43 ++--- README.md | 94 +++++++---- dcgm-exporter.yaml | 12 +- deployment/Chart.yaml | 4 +- deployment/values.yaml | 2 +- docker/Dockerfile.ubi9 | 65 ++++---- docker/Dockerfile.ubuntu22.04 | 63 ++++--- go.mod | 1 + go.sum | 2 + hack/VERSION | 6 +- hack/header.txt | 13 ++ internal/mocks/pkg/os/dir_entry.go | 111 +++++++++++++ internal/mocks/pkg/os/file_info.go | 139 ++++++++++++++++ internal/mocks/pkg/os/os.go | 215 ++++++++++++++++++++++++ internal/pkg/os/README.md | 5 + internal/pkg/os/os.go | 82 +++++++++ pkg/cmd/app.go | 10 +- pkg/dcgmexporter/config.go | 1 + pkg/dcgmexporter/expcollector.go | 55 +++--- pkg/dcgmexporter/gpu_collector.go | 25 +-- pkg/dcgmexporter/gpu_collector_test.go | 31 +++- pkg/dcgmexporter/hpc.go | 158 ++++++++++++++++++ pkg/dcgmexporter/hpc_test.go | 222 +++++++++++++++++++++++++ pkg/dcgmexporter/kubernetes.go | 2 - pkg/dcgmexporter/kubernetes_test.go | 1 - pkg/dcgmexporter/os.go | 21 +++ pkg/dcgmexporter/parser.go | 3 +- pkg/dcgmexporter/parser_test.go | 2 - pkg/dcgmexporter/pipeline.go | 6 +- pkg/dcgmexporter/pipeline_test.go | 5 +- pkg/dcgmexporter/types.go | 2 + pkg/dcgmexporter/utils.go | 28 ++++ pkg/dcgmexporter/utils_test.go | 15 ++ pkg/dcgmexporter/xid_collector_test.go | 87 +++++++--- service-monitor.yaml | 4 +- tests/e2e/Makefile | 2 +- 37 files changed, 1337 insertions(+), 201 deletions(-) create mode 100644 hack/header.txt create mode 100644 internal/mocks/pkg/os/dir_entry.go create mode 100644 internal/mocks/pkg/os/file_info.go create mode 100644 internal/mocks/pkg/os/os.go create mode 100644 internal/pkg/os/README.md create mode 100644 internal/pkg/os/os.go create mode 100644 pkg/dcgmexporter/hpc.go create 
mode 100644 pkg/dcgmexporter/hpc_test.go create mode 100644 pkg/dcgmexporter/os.go diff --git a/.gitignore b/.gitignore index e365ca8a..19c0ff61 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ vendor/ tests.cov test_results.json .scannerwork +dist/ ############################################################################### # JetBrains diff --git a/Makefile b/Makefile index 34342c1e..a1146b61 100644 --- a/Makefile +++ b/Makefile @@ -14,30 +14,32 @@ include hack/VERSION -MKDIR ?= mkdir -REGISTRY ?= nvidia +REGISTRY ?= nvidia +GO ?= go +MKDIR ?= mkdir GOLANGCILINT_TIMEOUT ?= 10m DCGM_VERSION := $(NEW_DCGM_VERSION) GOLANG_VERSION := 1.21.5 VERSION := $(NEW_EXPORTER_VERSION) FULL_VERSION := $(DCGM_VERSION)-$(VERSION) -OUTPUT := type=oci,dest=/tmp/dcgm-exporter.tar +OUTPUT := type=oci,dest=/dev/null PLATFORMS := linux/amd64,linux/arm64 DOCKERCMD := docker buildx build MODULE := github.com/NVIDIA/dcgm-exporter + .PHONY: all binary install check-format local all: update-version ubuntu22.04 ubi9 -binary: update-version - cd cmd/dcgm-exporter; go build -ldflags "-X main.BuildVersion=${DCGM_VERSION}-${VERSION}" +binary: generate update-version + cd cmd/dcgm-exporter; $(GO) build -ldflags "-X main.BuildVersion=${DCGM_VERSION}-${VERSION}" test-main: - go test ./... -short + $(GO) test ./... 
-short install: binary - install -m 755 cmd/dcgm-exporter/dcgm-exporter /usr/bin/dcgm-exporter + install -m 755 $(DIST_DIR)/dcgm-exporter /usr/bin/dcgm-exporter install -m 644 -D ./etc/default-counters.csv /etc/dcgm-exporter/default-counters.csv install -m 644 -D ./etc/dcp-metrics-included.csv /etc/dcgm-exporter/dcp-metrics-included.csv @@ -56,24 +58,20 @@ else $(MAKE) PLATFORMS=linux/amd64 OUTPUT=type=docker DOCKERCMD='docker build' endif -ubuntu22.04: - $(DOCKERCMD) --pull \ - --output $(OUTPUT) \ - --platform $(PLATFORMS) \ - --build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \ - --build-arg "DCGM_VERSION=$(DCGM_VERSION)" \ - --tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu22.04" \ - --file docker/Dockerfile.ubuntu22.04 . +TARGETS = ubuntu22.04 ubi9 -ubi9: +DOCKERFILE.ubuntu22.04 = docker/Dockerfile.ubuntu22.04 +DOCKERFILE.ubi9 = docker/Dockerfile.ubi9 + +$(TARGETS): $(DOCKERCMD) --pull \ --output $(OUTPUT) \ --platform $(PLATFORMS) \ --build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \ --build-arg "DCGM_VERSION=$(DCGM_VERSION)" \ - --build-arg "VERSION=$(FULL_VERSION)" \ - --tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubi9" \ - --file docker/Dockerfile.ubi9 . + --build-arg "VERSION=$(VERSION)" \ + --tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-$@" \ + --file $(DOCKERFILE.$@) . .PHONY: integration test-integration: @@ -84,7 +82,7 @@ test-coverage: .PHONY: lint lint: - golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --verbose + golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --fix .PHONY: validate-modules validate-modules: @@ -132,3 +130,8 @@ update-version: # Update DCGM and DCGM Exporter versions update-versions: update-version + +.PHONY: generate +# Generate code (Mocks) +generate: + go generate ./... 
diff --git a/README.md b/README.md index b366ef42..ea6060ea 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,10 @@ Official documentation for DCGM-Exporter can be found on [docs.nvidia.com](https ### Quickstart To gather metrics on a GPU node, simply start the `dcgm-exporter` container: -``` -$ docker run -d --gpus all --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-3.4.1-ubuntu22.04 -$ curl localhost:9400/metrics + +```shell +docker run -d --gpus all --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:3.3.6-3.4.2-ubuntu22.04 +curl localhost:9400/metrics # HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). # TYPE DCGM_FI_DEV_SM_CLOCK gauge # HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). @@ -32,33 +33,38 @@ Note: Consider using the [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-ope Ensure you have already setup your cluster with the [default runtime as NVIDIA](https://github.com/NVIDIA/nvidia-container-runtime#docker-engine-setup). The recommended way to install DCGM-Exporter is to use the Helm chart: -``` -$ helm repo add gpu-helm-charts \ + +```shell +helm repo add gpu-helm-charts \ https://nvidia.github.io/dcgm-exporter/helm-charts ``` + Update the repo: + +```shell +helm repo update ``` -$ helm repo update -``` + And install the chart: -``` -$ helm install \ + +```shell +helm install \ --generate-name \ gpu-helm-charts/dcgm-exporter ``` Once the `dcgm-exporter` pod is deployed, you can use port forwarding to obtain metrics quickly: - -``` -$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/dcgm-exporter/master/dcgm-exporter.yaml +```shell +kubectl create -f https://raw.githubusercontent.com/NVIDIA/dcgm-exporter/master/dcgm-exporter.yaml # Let's get the output of a random pod: -$ NAME=$(kubectl get pods -l "app.kubernetes.io/name=dcgm-exporter" \ +NAME=$(kubectl get pods -l "app.kubernetes.io/name=dcgm-exporter" \ -o "jsonpath={ .items[0].metadata.name}") -$ kubectl port-forward $NAME 8080:9400 & -$ curl -sL 
http://127.0.0.1:8080/metrics +kubectl port-forward $NAME 8080:9400 & + +curl -sL http://127.0.0.1:8080/metrics # HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). # TYPE DCGM_FI_DEV_SM_CLOCK gauge # HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). @@ -72,6 +78,7 @@ DCGM_FI_DEV_MEMORY_TEMP{gpu="0", UUID="GPU-604ac76c-d9cf-fef3-62e9-d92044ab6e52" ... ``` + To integrate DCGM-Exporter with Prometheus and Grafana, see the full instructions in the [user guide](https://docs.nvidia.com/datacenter/cloud-native/gpu-telemetry/latest/). `dcgm-exporter` is deployed as part of the GPU Operator. To get started with integrating with Prometheus, check the Operator [user guide](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html#gpu-telemetry). @@ -79,26 +86,42 @@ To integrate DCGM-Exporter with Prometheus and Grafana, see the full instruction Exporter supports TLS and basic auth using [exporter-toolkit](https://github.com/prometheus/exporter-toolkit). To use TLS and/or basic auth, users need to use `--web-config-file` CLI flag as follows -``` +```shell dcgm-exporter --web-config-file=web-config.yaml ``` A sample `web-config.yaml` file can be fetched from [exporter-toolkit repository](https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-config.yml). The reference of the `web-config.yaml` file can be consulted in the [docs](https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md). +### How to include HPC jobs in metric labels + +The DCGM-exporter can include High-Performance Computing (HPC) job information into its metric labels. To achieve this, HPC environment administrators must configure their HPC environment to generate files that map GPUs to HPC jobs. + +#### File Conventions + +These mapping files follow a specific format: + +* Each file is named after a unique GPU ID (e.g., 0, 1, 2, etc.). +* Each line in the file contains JOB IDs that run on the corresponding GPU. 
+ +#### Enabling HPC Job Mapping on DCGM-Exporter + +To enable GPU-to-job mapping on the DCGM-exporter side, users must run the DCGM-exporter with the --hpc-job-mapping-dir command-line parameter, pointing to a directory where the HPC cluster creates job mapping files. Or, users can set the environment variable DCGM_HPC_JOB_MAPPING_DIR to achieve the same result. + ### Building from Source In order to build dcgm-exporter ensure you have the following: -- [Golang >= 1.21 installed](https://golang.org/) -- [DCGM installed](https://developer.nvidia.com/dcgm) -``` -$ git clone https://github.com/NVIDIA/dcgm-exporter.git -$ cd dcgm-exporter -$ make binary -$ sudo make install +* [Golang >= 1.21 installed](https://golang.org/) +* [DCGM installed](https://developer.nvidia.com/dcgm) + +```shell +git clone https://github.com/NVIDIA/dcgm-exporter.git +cd dcgm-exporter +make binary +sudo make install ... -$ dcgm-exporter & -$ curl localhost:9400/metrics +dcgm-exporter & +curl localhost:9400/metrics # HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). # TYPE DCGM_FI_DEV_SM_CLOCK gauge # HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). @@ -118,6 +141,7 @@ With `dcgm-exporter` you can configure which fields are collected by specifying You will find the default CSV file under `etc/default-counters.csv` in the repository, which is copied on your system or container to `/etc/dcgm-exporter/default-counters.csv` The layout and format of this file is as follows: + ``` # Format # If line starts with a '#' it is considered a comment @@ -129,39 +153,43 @@ DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). 
``` A custom csv file can be specified using the `-f` option or `--collectors` as follows: -``` -$ dcgm-exporter -f /tmp/custom-collectors.csv + +```shell +dcgm-exporter -f /tmp/custom-collectors.csv ``` Notes: -- Always make sure your entries have 2 commas (',') -- The complete list of counters that can be collected can be found on the DCGM API reference manual: https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-field-ids.html + +* Always make sure your entries have 2 commas (',') +* The complete list of counters that can be collected can be found on the DCGM API reference manual: <https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-field-ids.html> ### What about a Grafana Dashboard? -You can find the official NVIDIA DCGM-Exporter dashboard here: https://grafana.com/grafana/dashboards/12239 +You can find the official NVIDIA DCGM-Exporter dashboard here: <https://grafana.com/grafana/dashboards/12239> You will also find the `json` file on this repo under `grafana/dcgm-exporter-dashboard.json` Pull requests are accepted! - ### Building the containers This project uses [docker buildx](https://docs.docker.com/buildx/working-with-buildx/) for multi-arch image creation. Follow the instructions on that page to get a working builder instance for creating these containers. Some other useful build options follow.
Builds local images based on the machine architecture and makes them available in 'docker images' + ``` make local ``` Build the ubuntu image and export to 'docker images' -``` + +```shell make ubuntu22.04 PLATFORMS=linux/amd64 OUTPUT=type=docker ``` Build and push the images to some other 'private_registry' -``` + +```shell make REGISTRY= push ``` diff --git a/dcgm-exporter.yaml b/dcgm-exporter.yaml index 821b392a..cad69704 100644 --- a/dcgm-exporter.yaml +++ b/dcgm-exporter.yaml @@ -18,23 +18,23 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.4.1" + app.kubernetes.io/version: "3.4.2" spec: updateStrategy: type: RollingUpdate selector: matchLabels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.4.1" + app.kubernetes.io/version: "3.4.2" template: metadata: labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.4.1" + app.kubernetes.io/version: "3.4.2" name: "dcgm-exporter" spec: containers: - - image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-3.4.1-ubuntu22.04" + - image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.6-3.4.2-ubuntu22.04" env: - name: "DCGM_EXPORTER_LISTEN" value: ":9400" @@ -64,11 +64,11 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.4.1" + app.kubernetes.io/version: "3.4.2" spec: selector: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.4.1" + app.kubernetes.io/version: "3.4.2" ports: - name: "metrics" port: 9400 diff --git a/deployment/Chart.yaml b/deployment/Chart.yaml index 28b152ef..c4271839 100644 --- a/deployment/Chart.yaml +++ b/deployment/Chart.yaml @@ -1,9 +1,9 @@ apiVersion: v2 name: dcgm-exporter description: A Helm chart for DCGM exporter -version: "3.4.1" +version: "3.4.2" kubeVersion: ">= 1.19.0-0" -appVersion: "3.4.1" +appVersion: "3.4.2" sources: - https://github.com/nvidia/dcgm-exporter home: https://github.com/nvidia/dcgm-exporter/ 
diff --git a/deployment/values.yaml b/deployment/values.yaml index 92842f5b..c2cffade 100644 --- a/deployment/values.yaml +++ b/deployment/values.yaml @@ -17,7 +17,7 @@ image: pullPolicy: IfNotPresent # Image tag defaults to AppVersion, but you can use the tag key # for the image tag, e.g: - tag: 3.3.5-3.4.1-ubuntu22.04 + tag: 3.3.6-3.4.2-ubuntu22.04 # Change the following reference to "/etc/dcgm-exporter/default-counters.csv" # to stop profiling metrics from DCGM diff --git a/docker/Dockerfile.ubi9 b/docker/Dockerfile.ubi9 index feeafb31..66ef969f 100644 --- a/docker/Dockerfile.ubi9 +++ b/docker/Dockerfile.ubi9 @@ -1,4 +1,4 @@ -FROM nvcr.io/nvidia/cuda:12.3.2-base-ubi9 AS builder +FROM nvcr.io/nvidia/cuda:12.4.1-base-ubi9 AS builder ARG GOLANG_VERSION WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter RUN set -eux; \ @@ -8,28 +8,28 @@ RUN dnf clean all && rm -rf /usr/bin/go # Install Go official release RUN set -eux; \ - url=; \ - arch=$(uname -m) && if [ "${arch}" = "x86_64" ]; then arch=amd64; fi && if [ "${arch}" = "aarch64" ]; then arch=arm64; fi; \ + url=; \ + arch=$(uname -m) && if [ "${arch}" = "x86_64" ]; then arch=amd64; fi && if [ "${arch}" = "aarch64" ]; then arch=arm64; fi; \ case "$arch" in \ - 'amd64') \ - url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz"; \ - ;; \ - 'arm64') \ - url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz"; \ - ;; \ - *) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \ + 'amd64') \ + url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz"; \ + ;; \ + 'arm64') \ + url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz"; \ + ;; \ + *) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \ esac; \ build=; \ if [ -z "$url" ]; then \ -# https://github.com/golang/go/issues/38536#issuecomment-616897960 - build=1; \ - url="https://dl.google.com/go/go${GOLANG_VERSION}.src.tar.gz"; \ - 
echo >&2; \ - echo >&2 "warning: current architecture ($arch) does not have a compatible Go binary release; will be building from source"; \ - echo >&2; \ + # https://github.com/golang/go/issues/38536#issuecomment-616897960 + build=1; \ + url="https://dl.google.com/go/go${GOLANG_VERSION}.src.tar.gz"; \ + echo >&2; \ + echo >&2 "warning: current architecture ($arch) does not have a compatible Go binary release; will be building from source"; \ + echo >&2; \ fi; \ - wget -O go.tgz "$url" --progress=dot:giga; \ - tar -C /usr/local -xzf go.tgz; \ + wget -O go.tgz "$url" --progress=dot:giga; \ + tar -C /usr/local -xzf go.tgz; \ rm go.tgz; ENV GOTOOLCHAIN=local ENV GOPATH /go @@ -40,17 +40,26 @@ COPY . . RUN make binary check-format -FROM nvcr.io/nvidia/cuda:12.3.2-base-ubi9 -LABEL io.k8s.display-name="NVIDIA DCGM Exporter" - +FROM nvcr.io/nvidia/cuda:12.4.1-base-ubi9 ARG DCGM_VERSION +ARG VERSION +ARG DIST_DIR + +LABEL io.k8s.display-name="NVIDIA DCGM Exporter" +LABEL name="NVIDIA DCGM Exporter" +LABEL vendor="NVIDIA" +LABEL version="${VERSION}" +LABEL release="N/A" +LABEL summary="Exports GPU Metrics to Prometheus" +LABEL description="See summary" RUN dnf update --disablerepo=* --enablerepo=ubi-9-appstream-rpms --enablerepo=ubi-9-baseos-rpms -y && rm -rf /var/cache/yum \ && dnf clean expire-cache && dnf install -y datacenter-gpu-manager-${DCGM_VERSION} libcap \ && rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite +COPY ./LICENSE ./licenses/LICENSE COPY --from=builder /go/src/github.com/NVIDIA/dcgm-exporter/cmd/dcgm-exporter/dcgm-exporter /usr/bin/ -COPY etc /etc/dcgm-exporter +COPY ./etc /etc/dcgm-exporter ENV NVIDIA_VISIBLE_DEVICES=all # disable all constraints on the configurations required by NVIDIA container toolkit @@ -58,18 +67,6 @@ ENV NVIDIA_DISABLE_REQUIRE="true" # Required for DCP metrics ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32 -ARG VERSION - -LABEL io.k8s.display-name="NVIDIA DCGM Eporter" -LABEL 
name="NVIDIA DCGM Exporter" -LABEL vendor="NVIDIA" -LABEL version="${VERSION}" -LABEL release="N/A" -LABEL summary="Exports GPU Metrics to Prometheus" -LABEL description="See summary" - -COPY ./LICENSE ./licenses/LICENSE - ENV NO_SETCAP= COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh diff --git a/docker/Dockerfile.ubuntu22.04 b/docker/Dockerfile.ubuntu22.04 index 8547d3df..24191666 100644 --- a/docker/Dockerfile.ubuntu22.04 +++ b/docker/Dockerfile.ubuntu22.04 @@ -1,59 +1,70 @@ -FROM nvcr.io/nvidia/cuda:12.3.2-base-ubuntu22.04 AS builder +FROM nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04 AS builder ARG GOLANG_VERSION=1.21.5 WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter RUN set -eux; \ apt-get update; \ apt-get install -y --no-install-recommends \ - g++ \ - gcc \ - libc6-dev \ - make \ - pkg-config \ - wget \ + g++ \ + gcc \ + libc6-dev \ + make \ + pkg-config \ + wget \ ; \ rm -rf /var/lib/apt/lists/* RUN set -eux; \ arch="$(dpkg --print-architecture)"; arch="${arch##*-}"; \ url=; \ case "$arch" in \ - 'amd64') \ - url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz"; \ - ;; \ - 'arm64') \ - url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz"; \ - ;; \ - *) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \ + 'amd64') \ + url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz"; \ + ;; \ + 'arm64') \ + url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz"; \ + ;; \ + *) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \ esac; \ build=; \ if [ -z "$url" ]; then \ -# https://github.com/golang/go/issues/38536#issuecomment-616897960 - build=1; \ - url="https://dl.google.com/go/go${GOLANG_VERSION}.src.tar.gz"; \ - echo >&2; \ - echo >&2 "warning: current architecture ($arch) does not have a compatible Go binary release; will 
be building from source"; \ - echo >&2; \ + # https://github.com/golang/go/issues/38536#issuecomment-616897960 + build=1; \ + url="https://dl.google.com/go/go${GOLANG_VERSION}.src.tar.gz"; \ + echo >&2; \ + echo >&2 "warning: current architecture ($arch) does not have a compatible Go binary release; will be building from source"; \ + echo >&2; \ fi; \ - wget -O go.tgz "$url" --progress=dot:giga; \ - tar -C /usr/local -xzf go.tgz; \ + wget -O go.tgz "$url" --progress=dot:giga; \ + tar -C /usr/local -xzf go.tgz; \ rm go.tgz; ENV GOTOOLCHAIN=local ENV GOPATH /go ENV PATH $GOPATH/bin:$PATH RUN mkdir -p "$GOPATH/src" "$GOPATH/bin" && chmod -R 1777 "$GOPATH" -ENV PATH $PATH:/usr/local/go/bin +ENV PATH $PATH:/usr/local/go/bin COPY . . RUN make binary check-format -FROM nvcr.io/nvidia/cuda:12.3.2-base-ubuntu22.04 +FROM nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04 + +ARG VERSION +ARG DCGM_VERSION +ARG DIST_DIR + LABEL io.k8s.display-name="NVIDIA DCGM Exporter" +LABEL name="NVIDIA DCGM Exporter" +LABEL vendor="NVIDIA" +LABEL version="${VERSION}" +LABEL release="N/A" +LABEL summary="Exports GPU Metrics to Prometheus" +LABEL description="See summary" +COPY ./LICENSE ./licenses/LICENSE COPY --from=builder /go/src/github.com/NVIDIA/dcgm-exporter/cmd/dcgm-exporter/dcgm-exporter /usr/bin/ COPY etc /etc/dcgm-exporter -ARG DCGM_VERSION RUN apt-get update && apt-get install -y --no-install-recommends \ - datacenter-gpu-manager=1:${DCGM_VERSION} libcap2-bin && apt-get purge --autoremove -y openssl \ + datacenter-gpu-manager=1:${DCGM_VERSION} libcap2-bin && apt-get purge --autoremove -y openssl \ && apt-get autoremove -y \ && rm -rf /var/lib/apt/lists/* \ && rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite diff --git a/go.mod b/go.mod index 0bacee30..00f3dc03 100644 --- a/go.mod +++ b/go.mod @@ -140,6 +140,7 @@ require ( go.opentelemetry.io/otel/metric v1.21.0 // indirect go.opentelemetry.io/otel/trace v1.21.0 // indirect go.starlark.net 
v0.0.0-20231121155337-90ade8b19d09 // indirect + go.uber.org/mock v0.4.0 // indirect golang.org/x/crypto v0.18.0 // indirect golang.org/x/exp v0.0.0-20240103183307-be819d1f06fc // indirect golang.org/x/net v0.20.0 // indirect diff --git a/go.sum b/go.sum index 92d476fa..7ccb8941 100644 --- a/go.sum +++ b/go.sum @@ -402,6 +402,8 @@ go.uber.org/automaxprocs v1.5.3 h1:kWazyxZUrS3Gs4qUpbwo5kEIMGe/DAvi5Z4tl2NW4j8= go.uber.org/automaxprocs v1.5.3/go.mod h1:eRbA25aqJrxAbsLO0xy5jVwPt7FQnRgjW+efnwa1WM0= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/mock v0.4.0 h1:VcM4ZOtdbR4f6VXfiOpwpVJDL6lCReaZ6mw31wqh7KU= +go.uber.org/mock v0.4.0/go.mod h1:a6FSlNadKUHUa9IP5Vyt1zh4fC7uAwxMutEAscFbkZc= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= diff --git a/hack/VERSION b/hack/VERSION index ec68b71f..273f147f 100644 --- a/hack/VERSION +++ b/hack/VERSION @@ -1,4 +1,4 @@ OLD_DCGM_VERSION=3.3.5 -OLD_EXPORTER_VERSION=3.4.0 -NEW_DCGM_VERSION=3.3.5 -NEW_EXPORTER_VERSION=3.4.1 +OLD_EXPORTER_VERSION=3.4.1 +NEW_DCGM_VERSION=3.3.6 +NEW_EXPORTER_VERSION=3.4.2 diff --git a/hack/header.txt b/hack/header.txt new file mode 100644 index 00000000..08262ec1 --- /dev/null +++ b/hack/header.txt @@ -0,0 +1,13 @@ +Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/internal/mocks/pkg/os/dir_entry.go b/internal/mocks/pkg/os/dir_entry.go new file mode 100644 index 00000000..d6271885 --- /dev/null +++ b/internal/mocks/pkg/os/dir_entry.go @@ -0,0 +1,111 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by MockGen. DO NOT EDIT. +// Source: os (interfaces: DirEntry) +// +// Generated by this command: +// +// mockgen -destination=../../mocks/pkg/os/dir_entry.go -package=os -copyright_file=../../../hack/header.txt os DirEntry +// + +// Package os is a generated GoMock package. +package os + +import ( + fs "io/fs" + reflect "reflect" + + gomock "go.uber.org/mock/gomock" +) + +// MockDirEntry is a mock of DirEntry interface. +type MockDirEntry struct { + ctrl *gomock.Controller + recorder *MockDirEntryMockRecorder +} + +// MockDirEntryMockRecorder is the mock recorder for MockDirEntry. +type MockDirEntryMockRecorder struct { + mock *MockDirEntry +} + +// NewMockDirEntry creates a new mock instance. 
+func NewMockDirEntry(ctrl *gomock.Controller) *MockDirEntry { + mock := &MockDirEntry{ctrl: ctrl} + mock.recorder = &MockDirEntryMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockDirEntry) EXPECT() *MockDirEntryMockRecorder { + return m.recorder +} + +// Info mocks base method. +func (m *MockDirEntry) Info() (fs.FileInfo, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Info") + ret0, _ := ret[0].(fs.FileInfo) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Info indicates an expected call of Info. +func (mr *MockDirEntryMockRecorder) Info() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Info", reflect.TypeOf((*MockDirEntry)(nil).Info)) +} + +// IsDir mocks base method. +func (m *MockDirEntry) IsDir() bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "IsDir") + ret0, _ := ret[0].(bool) + return ret0 +} + +// IsDir indicates an expected call of IsDir. +func (mr *MockDirEntryMockRecorder) IsDir() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsDir", reflect.TypeOf((*MockDirEntry)(nil).IsDir)) +} + +// Name mocks base method. +func (m *MockDirEntry) Name() string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Name") + ret0, _ := ret[0].(string) + return ret0 +} + +// Name indicates an expected call of Name. +func (mr *MockDirEntryMockRecorder) Name() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Name", reflect.TypeOf((*MockDirEntry)(nil).Name)) +} + +// Type mocks base method. +func (m *MockDirEntry) Type() fs.FileMode { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Type") + ret0, _ := ret[0].(fs.FileMode) + return ret0 +} + +// Type indicates an expected call of Type. 
+func (mr *MockDirEntryMockRecorder) Type() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Type", reflect.TypeOf((*MockDirEntry)(nil).Type)) +} diff --git a/internal/mocks/pkg/os/file_info.go b/internal/mocks/pkg/os/file_info.go new file mode 100644 index 00000000..b2ef5282 --- /dev/null +++ b/internal/mocks/pkg/os/file_info.go @@ -0,0 +1,139 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by MockGen. DO NOT EDIT. +// Source: io/fs (interfaces: FileInfo) +// +// Generated by this command: +// +// mockgen -destination=../../mocks/pkg/os/file_info.go -package=os -copyright_file=../../../hack/header.txt io/fs FileInfo +// + +// Package os is a generated GoMock package. +package os + +import ( + fs "io/fs" + reflect "reflect" + time "time" + + gomock "go.uber.org/mock/gomock" +) + +// MockFileInfo is a mock of FileInfo interface. +type MockFileInfo struct { + ctrl *gomock.Controller + recorder *MockFileInfoMockRecorder +} + +// MockFileInfoMockRecorder is the mock recorder for MockFileInfo. +type MockFileInfoMockRecorder struct { + mock *MockFileInfo +} + +// NewMockFileInfo creates a new mock instance. 
+func NewMockFileInfo(ctrl *gomock.Controller) *MockFileInfo { + mock := &MockFileInfo{ctrl: ctrl} + mock.recorder = &MockFileInfoMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockFileInfo) EXPECT() *MockFileInfoMockRecorder { + return m.recorder +} + +// IsDir mocks base method. +func (m *MockFileInfo) IsDir() bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "IsDir") + ret0, _ := ret[0].(bool) + return ret0 +} + +// IsDir indicates an expected call of IsDir. +func (mr *MockFileInfoMockRecorder) IsDir() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsDir", reflect.TypeOf((*MockFileInfo)(nil).IsDir)) +} + +// ModTime mocks base method. +func (m *MockFileInfo) ModTime() time.Time { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ModTime") + ret0, _ := ret[0].(time.Time) + return ret0 +} + +// ModTime indicates an expected call of ModTime. +func (mr *MockFileInfoMockRecorder) ModTime() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ModTime", reflect.TypeOf((*MockFileInfo)(nil).ModTime)) +} + +// Mode mocks base method. +func (m *MockFileInfo) Mode() fs.FileMode { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Mode") + ret0, _ := ret[0].(fs.FileMode) + return ret0 +} + +// Mode indicates an expected call of Mode. +func (mr *MockFileInfoMockRecorder) Mode() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Mode", reflect.TypeOf((*MockFileInfo)(nil).Mode)) +} + +// Name mocks base method. +func (m *MockFileInfo) Name() string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Name") + ret0, _ := ret[0].(string) + return ret0 +} + +// Name indicates an expected call of Name. 
+func (mr *MockFileInfoMockRecorder) Name() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Name", reflect.TypeOf((*MockFileInfo)(nil).Name)) +} + +// Size mocks base method. +func (m *MockFileInfo) Size() int64 { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Size") + ret0, _ := ret[0].(int64) + return ret0 +} + +// Size indicates an expected call of Size. +func (mr *MockFileInfoMockRecorder) Size() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Size", reflect.TypeOf((*MockFileInfo)(nil).Size)) +} + +// Sys mocks base method. +func (m *MockFileInfo) Sys() any { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Sys") + ret0, _ := ret[0].(any) + return ret0 +} + +// Sys indicates an expected call of Sys. +func (mr *MockFileInfoMockRecorder) Sys() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Sys", reflect.TypeOf((*MockFileInfo)(nil).Sys)) +} diff --git a/internal/mocks/pkg/os/os.go b/internal/mocks/pkg/os/os.go new file mode 100644 index 00000000..f3b77f42 --- /dev/null +++ b/internal/mocks/pkg/os/os.go @@ -0,0 +1,215 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by MockGen. DO NOT EDIT. 
+// Source: github.com/NVIDIA/dcgm-exporter/internal/pkg/os (interfaces: OS) +// +// Generated by this command: +// +// mockgen -destination=../../mocks/pkg/os/os.go -package=os -copyright_file=../../../hack/header.txt . OS +// + +// Package os is a generated GoMock package. +package os + +import ( + fs "io/fs" + os "os" + reflect "reflect" + + gomock "go.uber.org/mock/gomock" +) + +// MockOS is a mock of OS interface. +type MockOS struct { + ctrl *gomock.Controller + recorder *MockOSMockRecorder +} + +// MockOSMockRecorder is the mock recorder for MockOS. +type MockOSMockRecorder struct { + mock *MockOS +} + +// NewMockOS creates a new mock instance. +func NewMockOS(ctrl *gomock.Controller) *MockOS { + mock := &MockOS{ctrl: ctrl} + mock.recorder = &MockOSMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockOS) EXPECT() *MockOSMockRecorder { + return m.recorder +} + +// CreateTemp mocks base method. +func (m *MockOS) CreateTemp(arg0, arg1 string) (*os.File, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CreateTemp", arg0, arg1) + ret0, _ := ret[0].(*os.File) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// CreateTemp indicates an expected call of CreateTemp. +func (mr *MockOSMockRecorder) CreateTemp(arg0, arg1 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateTemp", reflect.TypeOf((*MockOS)(nil).CreateTemp), arg0, arg1) +} + +// Getenv mocks base method. +func (m *MockOS) Getenv(arg0 string) string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Getenv", arg0) + ret0, _ := ret[0].(string) + return ret0 +} + +// Getenv indicates an expected call of Getenv. +func (mr *MockOSMockRecorder) Getenv(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Getenv", reflect.TypeOf((*MockOS)(nil).Getenv), arg0) +} + +// Hostname mocks base method. 
+func (m *MockOS) Hostname() (string, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Hostname") + ret0, _ := ret[0].(string) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Hostname indicates an expected call of Hostname. +func (mr *MockOSMockRecorder) Hostname() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Hostname", reflect.TypeOf((*MockOS)(nil).Hostname)) +} + +// IsNotExist mocks base method. +func (m *MockOS) IsNotExist(arg0 error) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "IsNotExist", arg0) + ret0, _ := ret[0].(bool) + return ret0 +} + +// IsNotExist indicates an expected call of IsNotExist. +func (mr *MockOSMockRecorder) IsNotExist(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsNotExist", reflect.TypeOf((*MockOS)(nil).IsNotExist), arg0) +} + +// MkdirTemp mocks base method. +func (m *MockOS) MkdirTemp(arg0, arg1 string) (string, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "MkdirTemp", arg0, arg1) + ret0, _ := ret[0].(string) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// MkdirTemp indicates an expected call of MkdirTemp. +func (mr *MockOSMockRecorder) MkdirTemp(arg0, arg1 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MkdirTemp", reflect.TypeOf((*MockOS)(nil).MkdirTemp), arg0, arg1) +} + +// Open mocks base method. +func (m *MockOS) Open(arg0 string) (*os.File, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Open", arg0) + ret0, _ := ret[0].(*os.File) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Open indicates an expected call of Open. +func (mr *MockOSMockRecorder) Open(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Open", reflect.TypeOf((*MockOS)(nil).Open), arg0) +} + +// ReadDir mocks base method. 
+func (m *MockOS) ReadDir(arg0 string) ([]fs.DirEntry, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ReadDir", arg0) + ret0, _ := ret[0].([]fs.DirEntry) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// ReadDir indicates an expected call of ReadDir. +func (mr *MockOSMockRecorder) ReadDir(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReadDir", reflect.TypeOf((*MockOS)(nil).ReadDir), arg0) +} + +// Remove mocks base method. +func (m *MockOS) Remove(arg0 string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Remove", arg0) + ret0, _ := ret[0].(error) + return ret0 +} + +// Remove indicates an expected call of Remove. +func (mr *MockOSMockRecorder) Remove(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Remove", reflect.TypeOf((*MockOS)(nil).Remove), arg0) +} + +// RemoveAll mocks base method. +func (m *MockOS) RemoveAll(arg0 string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "RemoveAll", arg0) + ret0, _ := ret[0].(error) + return ret0 +} + +// RemoveAll indicates an expected call of RemoveAll. +func (mr *MockOSMockRecorder) RemoveAll(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemoveAll", reflect.TypeOf((*MockOS)(nil).RemoveAll), arg0) +} + +// Stat mocks base method. +func (m *MockOS) Stat(arg0 string) (fs.FileInfo, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Stat", arg0) + ret0, _ := ret[0].(fs.FileInfo) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Stat indicates an expected call of Stat. +func (mr *MockOSMockRecorder) Stat(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Stat", reflect.TypeOf((*MockOS)(nil).Stat), arg0) +} + +// TempDir mocks base method. 
+func (m *MockOS) TempDir() string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "TempDir") + ret0, _ := ret[0].(string) + return ret0 +} + +// TempDir indicates an expected call of TempDir. +func (mr *MockOSMockRecorder) TempDir() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TempDir", reflect.TypeOf((*MockOS)(nil).TempDir)) +} diff --git a/internal/pkg/os/README.md b/internal/pkg/os/README.md new file mode 100644 index 00000000..23631eb5 --- /dev/null +++ b/internal/pkg/os/README.md @@ -0,0 +1,5 @@ +# OS - wrapper package for the system os package + +This package wraps functions of the standard os package behind an interface so that they can be mocked in tests. + + diff --git a/internal/pkg/os/os.go b/internal/pkg/os/os.go new file mode 100644 index 00000000..891d3893 --- /dev/null +++ b/internal/pkg/os/os.go @@ -0,0 +1,82 @@ +/* +* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. + */ + +package os + +import "os" + +//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/os/os.go -package=os -copyright_file=../../../hack/header.txt . 
OS
+//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/os/dir_entry.go -package=os -copyright_file=../../../hack/header.txt os DirEntry
+//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/os/file_info.go -package=os -copyright_file=../../../hack/header.txt io/fs FileInfo
+type OS interface {
+	CreateTemp(dir, pattern string) (*os.File, error)
+	Getenv(key string) string
+	Hostname() (string, error)
+	IsNotExist(err error) bool
+	MkdirTemp(dir, pattern string) (string, error)
+	Open(name string) (*os.File, error)
+	ReadDir(name string) ([]os.DirEntry, error)
+	Remove(name string) error
+	RemoveAll(path string) error
+	Stat(name string) (os.FileInfo, error)
+	TempDir() string
+}
+
+type RealOS struct{}
+
+func (RealOS) CreateTemp(dir, pattern string) (*os.File, error) {
+	return os.CreateTemp(dir, pattern)
+}
+
+func (RealOS) Getenv(key string) string {
+	return os.Getenv(key)
+}
+
+func (RealOS) Hostname() (string, error) {
+	return os.Hostname()
+}
+
+func (RealOS) IsNotExist(err error) bool {
+	return os.IsNotExist(err)
+}
+
+func (RealOS) MkdirTemp(dir, pattern string) (string, error) {
+	return os.MkdirTemp(dir, pattern)
+}
+
+func (RealOS) Open(name string) (*os.File, error) {
+	return os.Open(name)
+}
+
+func (RealOS) ReadDir(name string) ([]os.DirEntry, error) {
+	return os.ReadDir(name)
+}
+
+func (RealOS) Remove(name string) error {
+	return os.Remove(name)
+}
+
+func (RealOS) RemoveAll(path string) error {
+	return os.RemoveAll(path)
+}
+
+func (RealOS) Stat(name string) (os.FileInfo, error) {
+	return os.Stat(name)
+}
+
+func (RealOS) TempDir() string {
+	return os.TempDir()
+}
diff --git a/pkg/cmd/app.go b/pkg/cmd/app.go index 9bdc6014..7bbd44c8 100644 --- a/pkg/cmd/app.go +++ b/pkg/cmd/app.go @@ -73,6 +73,7 @@ const ( CLIEnableDCGMLog = "enable-dcgm-log" CLIDCGMLogLevel = "dcgm-log-level" CLIPodResourcesKubeletSocket = "pod-resources-kubelet-socket" + CLIHPCJobMappingDir = "hpc-job-mapping-dir" ) 
func NewApp(buildVersion ...string) *cli.App { @@ -227,9 +228,15 @@ func NewApp(buildVersion ...string) *cli.App { &cli.StringFlag{ Name: CLIPodResourcesKubeletSocket, Value: "/var/lib/kubelet/pod-resources/kubelet.sock", - Usage: "Path to the kubelet pod-resources socket file", + Usage: "Path to the kubelet pod-resources socket file.", EnvVars: []string{"DCGM_POD_RESOURCES_KUBELET_SOCKET"}, }, + &cli.StringFlag{ + Name: CLIHPCJobMappingDir, + Value: "", + Usage: "Path to HPC job mapping file directory used for mapping GPUs to jobs.", + EnvVars: []string{"DCGM_HPC_JOB_MAPPING_DIR"}, + }, } if runtime.GOOS == "linux" { @@ -623,5 +630,6 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) { EnableDCGMLog: c.Bool(CLIEnableDCGMLog), DCGMLogLevel: dcgmLogLevel, PodResourcesKubeletSocket: c.String(CLIPodResourcesKubeletSocket), + HPCJobMappingDir: c.String(CLIHPCJobMappingDir), }, nil } diff --git a/pkg/dcgmexporter/config.go b/pkg/dcgmexporter/config.go index adaaeb6f..a9405111 100644 --- a/pkg/dcgmexporter/config.go +++ b/pkg/dcgmexporter/config.go @@ -57,4 +57,5 @@ type Config struct { EnableDCGMLog bool DCGMLogLevel string PodResourcesKubeletSocket string + HPCJobMappingDir string } diff --git a/pkg/dcgmexporter/expcollector.go b/pkg/dcgmexporter/expcollector.go index 93d4d179..1ddcea87 100644 --- a/pkg/dcgmexporter/expcollector.go +++ b/pkg/dcgmexporter/expcollector.go @@ -56,7 +56,6 @@ type Collector interface { var getExpMetricTemplate = sync.OnceValue(func() *template.Template { return template.Must(template.New("expMetrics").Parse(expMetricsFormat)) - }) func encodeExpMetrics(w io.Writer, metrics MetricsByCounter) error { @@ -82,7 +81,6 @@ type expCollector struct { } func (c *expCollector) getMetrics() (MetricsByCounter, error) { - fieldGroupIdx := expCollectorFieldGroupIdx.Add(1) fieldGroupName := fmt.Sprintf("expCollectorFieldGroupName%d", fieldGroupIdx) @@ -144,31 +142,14 @@ func (c *expCollector) getMetrics() (MetricsByCounter, error) { 
metricValueLabels := maps.Clone(labels) c.labelFiller(metricValueLabels, entityValue) - gpuModel := getGPUModel(mi.DeviceInfo, c.config.ReplaceBlanksInModelName) - - m := Metric{ - Counter: c.counter, - Value: fmt.Sprint(val), - UUID: uuid, - GPU: fmt.Sprintf("%d", mi.DeviceInfo.GPU), - GPUUUID: mi.DeviceInfo.UUID, - GPUDevice: fmt.Sprintf("nvidia%d", mi.DeviceInfo.GPU), - GPUModelName: gpuModel, - Hostname: c.hostname, - - Labels: metricValueLabels, - Attributes: map[string]string{}, - } - if mi.InstanceInfo != nil { - m.MigProfile = mi.InstanceInfo.ProfileName - m.GPUInstanceID = fmt.Sprintf("%d", mi.InstanceInfo.Info.NvmlInstanceId) - } else { - m.MigProfile = "" - m.GPUInstanceID = "" - } + m := c.createMetric(metricValueLabels, mi, uuid, val) metrics[c.counter] = append(metrics[c.counter], m) } + } else { + // Create metric with Zero value if group (mapEntityIDToValues) is empty + m := c.createMetric(labels, mi, uuid, 0) + metrics[c.counter] = append(metrics[c.counter], m) } } @@ -182,6 +163,32 @@ func (c *expCollector) getMetrics() (MetricsByCounter, error) { return metrics, nil } +func (c *expCollector) createMetric(labels map[string]string, mi MonitoringInfo, uuid string, val int) Metric { + gpuModel := getGPUModel(mi.DeviceInfo, c.config.ReplaceBlanksInModelName) + + m := Metric{ + Counter: c.counter, + Value: fmt.Sprint(val), + UUID: uuid, + GPU: fmt.Sprintf("%d", mi.DeviceInfo.GPU), + GPUUUID: mi.DeviceInfo.UUID, + GPUDevice: fmt.Sprintf("nvidia%d", mi.DeviceInfo.GPU), + GPUModelName: gpuModel, + Hostname: c.hostname, + + Labels: labels, + Attributes: map[string]string{}, + } + if mi.InstanceInfo != nil { + m.MigProfile = mi.InstanceInfo.ProfileName + m.GPUInstanceID = fmt.Sprintf("%d", mi.InstanceInfo.Info.NvmlInstanceId) + } else { + m.MigProfile = "" + m.GPUInstanceID = "" + } + return m +} + func (c *expCollector) getLabelsFromCounters(mi MonitoringInfo, labels map[string]string) error { latestValues, err := 
dcgm.EntityGetLatestValues(mi.Entity.EntityGroupId, mi.Entity.EntityId, c.labelDeviceFields) if err != nil { diff --git a/pkg/dcgmexporter/gpu_collector.go b/pkg/dcgmexporter/gpu_collector.go index c63f484e..dcd640ca 100644 --- a/pkg/dcgmexporter/gpu_collector.go +++ b/pkg/dcgmexporter/gpu_collector.go @@ -19,7 +19,6 @@ package dcgmexporter import ( "errors" "fmt" - "os" "strconv" "strings" @@ -27,9 +26,13 @@ import ( "github.com/sirupsen/logrus" ) -type DCGMCollectorConstructor func([]Counter, string, *Config, FieldEntityGroupTypeSystemInfoItem) (*DCGMCollector, func(), error) +const unknownErr = "Unknown Error" -func NewDCGMCollector(c []Counter, +type DCGMCollectorConstructor func([]Counter, string, *Config, FieldEntityGroupTypeSystemInfoItem) (*DCGMCollector, + func(), error) + +func NewDCGMCollector( + c []Counter, hostname string, config *Config, fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem, @@ -153,17 +156,18 @@ func ShouldMonitorDeviceType(fields []dcgm.Short, entityType dcgm.Field_Entity_G return true } -func FindCounterField(c []Counter, fieldId uint) (Counter, error) { +func FindCounterField(c []Counter, fieldID uint) (Counter, error) { for i := 0; i < len(c); i++ { - if uint(c[i].FieldID) == fieldId { + if uint(c[i].FieldID) == fieldID { return c[i], nil } } - return c[0], fmt.Errorf("could not find counter corresponding to field ID '%d'", fieldId) + return Counter{}, fmt.Errorf("could not find counter corresponding to field ID '%d'", fieldID) } -func ToSwitchMetric(metrics MetricsByCounter, +func ToSwitchMetric( + metrics MetricsByCounter, values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string, ) { labels := map[string]string{} @@ -207,7 +211,8 @@ func ToSwitchMetric(metrics MetricsByCounter, } } -func ToCPUMetric(metrics MetricsByCounter, +func ToCPUMetric( + metrics MetricsByCounter, values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string, ) { labels := 
map[string]string{} @@ -290,10 +295,10 @@ func ToMetric( if counter.FieldID == dcgm.DCGM_FI_DEV_XID_ERRORS { errCode := int(val.Int64()) attrs["err_code"] = strconv.Itoa(errCode) - if 0 < errCode && errCode < len(xidErrCodeToText) { + if 0 <= errCode && errCode < len(xidErrCodeToText) { attrs["err_msg"] = xidErrCodeToText[errCode] } else { - attrs["err_msg"] = "Unknown Error" + attrs["err_msg"] = unknownErr } } diff --git a/pkg/dcgmexporter/gpu_collector_test.go b/pkg/dcgmexporter/gpu_collector_test.go index 23a92929..e38b2673 100644 --- a/pkg/dcgmexporter/gpu_collector_test.go +++ b/pkg/dcgmexporter/gpu_collector_test.go @@ -32,8 +32,18 @@ var sampleCounters = []Counter{ {dcgm.DCGM_FI_DEV_POWER_USAGE, "DCGM_FI_DEV_POWER_USAGE", "gauge", "Power help info"}, {dcgm.DCGM_FI_DRIVER_VERSION, "DCGM_FI_DRIVER_VERSION", "label", "Driver version"}, /* test that switch and link metrics are filtered out automatically when devices are not detected */ - {dcgm.DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT, "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT", "gauge", "switch temperature"}, - {dcgm.DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS, "DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS", "gauge", "per-link flit errors"}, + { + dcgm.DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT, + "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT", + "gauge", + "switch temperature", + }, + { + dcgm.DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS, + "DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS", + "gauge", + "per-link flit errors", + }, /* test that vgpu metrics are not filtered out */ {dcgm.DCGM_FI_DEV_VGPU_LICENSE_STATUS, "DCGM_FI_DEV_VGPU_LICENSE_STATUS", "gauge", "vgpu license status"}, /* test that cpu and cpu core metrics are filtered out automatically when devices are not detected */ @@ -97,7 +107,9 @@ func testDCGMGPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun return hierarchy, nil } - dcgmAddEntityToGroup = func(groupId dcgm.GroupHandle, entityGroupId dcgm.Field_Entity_Group, entityId uint) (err error) { + 
dcgmAddEntityToGroup = func( + groupId dcgm.GroupHandle, entityGroupId dcgm.Field_Entity_Group, entityId uint, + ) (err error) { return nil } @@ -197,7 +209,9 @@ func testDCGMCPUCollector(t *testing.T, counters []Counter) (*DCGMCollector, fun return hierarchy, nil } - dcgmAddEntityToGroup = func(groupId dcgm.GroupHandle, entityGroupId dcgm.Field_Entity_Group, entityId uint) (err error) { + dcgmAddEntityToGroup = func( + groupId dcgm.GroupHandle, entityGroupId dcgm.Field_Entity_Group, entityId uint, + ) (err error) { return nil } @@ -340,15 +354,20 @@ func TestToMetricWhenDCGM_FI_DEV_XID_ERRORSField(t *testing.T) { } testCases := []testCase{ + { + name: "when DCGM_FI_DEV_XID_ERRORS has no error", + fieldValue: 0, + expectedErr: xidErrCodeToText[0], + }, { name: "when DCGM_FI_DEV_XID_ERRORS has known value", fieldValue: 42, - expectedErr: "Video processor exception", + expectedErr: xidErrCodeToText[42], }, { name: "when DCGM_FI_DEV_XID_ERRORS has unknown value", fieldValue: 255, - expectedErr: "Unknown Error", + expectedErr: unknownErr, }, } diff --git a/pkg/dcgmexporter/hpc.go b/pkg/dcgmexporter/hpc.go new file mode 100644 index 00000000..e360b096 --- /dev/null +++ b/pkg/dcgmexporter/hpc.go
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package dcgmexporter
+
+import (
+	"bufio"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"github.com/sirupsen/logrus"
+)
+
+// hpcMapper is a metrics transformation that attaches HPC job IDs to GPU
+// metrics, using the per-GPU mapping files found in Config.HPCJobMappingDir.
+type hpcMapper struct {
+	Config *Config
+}
+
+// newHPCMapper creates an hpcMapper that reads job mappings from
+// c.HPCJobMappingDir.
+func newHPCMapper(c *Config) *hpcMapper {
+	logrus.Infof("HPC job mapping is enabled and watch for the %q directory", c.HPCJobMappingDir)
+	return &hpcMapper{
+		Config: c,
+	}
+}
+
+// Name returns the name of the transformation.
+func (p *hpcMapper) Name() string {
+	return "hpcMapper"
+}
+
+// Process adds the hpc_job attribute to the metrics of every GPU that has a
+// job mapping file, emitting one copy of the metric per job listed in the file
+// named after the GPU ID. Metrics of GPUs without a mapping file, or with an
+// empty one, are kept unchanged. If the mapping directory cannot be accessed,
+// the metrics are left untouched and nil is returned; errors from listing or
+// reading the mapping files are returned to the caller.
+func (p *hpcMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error {
+	if _, err := os.Stat(p.Config.HPCJobMappingDir); err != nil {
+		logrus.WithError(err).Warnf("Unable to access HPC job mapping file directory '%s' - directory not found. Ignoring.", p.Config.HPCJobMappingDir)
+		return nil
+	}
+
+	gpuFiles, err := getGPUFiles(p.Config.HPCJobMappingDir)
+	if err != nil {
+		return err
+	}
+
+	logrus.Debugf("HPC job mapping files: %#v", gpuFiles)
+
+	gpuToJobMap := make(map[string][]string, len(gpuFiles))
+	for _, gpuFileName := range gpuFiles {
+		jobs, err := readFile(filepath.Join(p.Config.HPCJobMappingDir, gpuFileName))
+		if err != nil {
+			return err
+		}
+		gpuToJobMap[gpuFileName] = jobs
+	}
+
+	logrus.Debugf("GPU to job mapping: %+v", gpuToJobMap)
+
+	for counter, counterMetrics := range metrics {
+		modifiedMetrics := make([]Metric, 0, len(counterMetrics))
+		for _, metric := range counterMetrics {
+			jobs := gpuToJobMap[metric.GPU]
+			if len(jobs) == 0 {
+				// No mapping file, or a file without jobs: keep the metric as
+				// is instead of silently dropping it.
+				modifiedMetrics = append(modifiedMetrics, metric)
+				continue
+			}
+
+			for _, job := range jobs {
+				modifiedMetric, err := deepCopy(metric)
+				if err != nil {
+					logrus.WithError(err).Errorf("Can not create deepCopy for the value: %v", metric)
+					continue
+				}
+				if modifiedMetric.Attributes == nil {
+					// A metric may carry no attributes; writing to a nil map panics.
+					modifiedMetric.Attributes = make(map[string]string, 1)
+				}
+				modifiedMetric.Attributes[hpcJobAttribute] = job
+				modifiedMetrics = append(modifiedMetrics, modifiedMetric)
+			}
+		}
+		metrics[counter] = modifiedMetrics
+	}
+
+	return nil
+}
+
+// readFile returns the job IDs listed in the mapping file at path, one per
+// line. Surrounding whitespace is trimmed and blank lines are skipped, so that
+// an empty line never becomes an empty job ID.
+func readFile(path string) ([]string, error) {
+	file, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer func() {
+		if err := file.Close(); err != nil {
+			logrus.WithError(err).Errorf("Failed to close the file: %s", file.Name())
+		}
+	}()
+
+	// Example of the expected file format:
+	// job1
+	// job2
+	// job3
+	var jobs []string
+	scanner := bufio.NewScanner(file)
+	for scanner.Scan() {
+		if job := strings.TrimSpace(scanner.Text()); job != "" {
+			jobs = append(jobs, job)
+		}
+	}
+
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+
+	return jobs, nil
+}
+
+// getGPUFiles returns the names of the entries in dirPath that follow the GPU
+// ID naming convention, i.e. whose name parses as an integer. Directories and
+// entries whose file info cannot be read are skipped.
+func getGPUFiles(dirPath string) ([]string, error) {
+	files, err := os.ReadDir(dirPath)
+	if err != nil {
+		return nil, err
+	}
+
+	logrus.Debugf("hpc mapper: %d files in the %q found", len(files), dirPath)
+
+	var mappingFiles []string
+
+	for _, file := range files {
+		finfo, err := file.Info()
+		if err != nil {
+			logrus.WithError(err).Warnf("HPC mapper: can not get file info for the %s file.", file.Name())
+			continue // Skip files that we can't read
+		}
+
+		if finfo.IsDir() {
+			logrus.Debugf("HPC mapper: the %q file is directory", file.Name())
+			continue // Skip directories
+		}
+
+		if _, err := strconv.Atoi(file.Name()); err != nil {
+			logrus.Debugf("HPC mapper: file %q name doesn't match with GPU ID convention", file.Name())
+			continue
+		}
+		mappingFiles = append(mappingFiles, file.Name())
+	}
+
+	return mappingFiles, nil
+}
diff --git a/pkg/dcgmexporter/hpc_test.go b/pkg/dcgmexporter/hpc_test.go new file mode 100644 index 00000000..8b834955 --- /dev/null +++ b/pkg/dcgmexporter/hpc_test.go @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package dcgmexporter + +import ( + "cmp" + "errors" + "fmt" + "io/fs" + "reflect" + "slices" + "testing" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" + + osmock "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/os" + osinterface "github.com/NVIDIA/dcgm-exporter/internal/pkg/os" +) + +func TestHPCProcess(t *testing.T) { + realOS := osinterface.RealOS{} + + tests := []struct { + name string + config *Config + fsState func() func() + assertion func(*testing.T, MetricsByCounter) + wantErr assert.ErrorAssertionFunc + }{ + { + name: "When all GPU have job files", + config: &Config{HPCJobMappingDir: "/var/run/nvidia/slurm"}, + fsState: func() func() { + ctrl := gomock.NewController(t) + mOS := osmock.NewMockOS(ctrl) + mFileInfoGPU0 := osmock.NewMockFileInfo(ctrl) + mFileInfoGPU0.EXPECT().IsDir().Return(false).AnyTimes() + + mDirEntryGPU0 := osmock.NewMockDirEntry(ctrl) + mDirEntryGPU0.EXPECT().Info().Return(mFileInfoGPU0, nil).AnyTimes() + 
mDirEntryGPU0.EXPECT().Name().Return("0").AnyTimes() + + mFileInfoGPU1 := osmock.NewMockFileInfo(ctrl) + mFileInfoGPU1.EXPECT().IsDir().Return(false).AnyTimes() + + mDirEntryGPU1 := osmock.NewMockDirEntry(ctrl) + mDirEntryGPU1.EXPECT().Info().Return(mFileInfoGPU1, nil).AnyTimes() + mDirEntryGPU1.EXPECT().Name().Return("1").AnyTimes() + + mFileInfoDir := osmock.NewMockFileInfo(ctrl) + mFileInfoDir.EXPECT().IsDir().Return(true).AnyTimes() + + mDirEntryDir := osmock.NewMockDirEntry(ctrl) + mDirEntryDir.EXPECT().Info().Return(mFileInfoDir, nil).AnyTimes() + mDirEntryDir.EXPECT().Name().Return("iamdir").AnyTimes() + + mDirEntryDamagedFile := osmock.NewMockDirEntry(ctrl) + mDirEntryDamagedFile.EXPECT().Info().Return(nil, errors.New("boom")).AnyTimes() + mDirEntryDamagedFile.EXPECT().Name().Return("iamerror").AnyTimes() + + mOS.EXPECT().Stat(gomock.Eq("/var/run/nvidia/slurm")) + mOS.EXPECT().ReadDir(gomock.Eq("/var/run/nvidia/slurm")). + Return([]fs.DirEntry{ + mDirEntryGPU0, + mDirEntryGPU1, + mDirEntryDir, + mDirEntryDamagedFile, + }, nil).AnyTimes() + + slurm0, err := realOS.CreateTemp("", "slurm0") + require.NoError(t, err) + _, _ = slurm0.WriteString("job1-0\n") + slurm0.Close() + + slurm1, err := realOS.CreateTemp("", "slurm1") + require.NoError(t, err) + _, _ = slurm1.WriteString("job1-1\n") + _, _ = slurm1.WriteString("job2-1\n") + slurm1.Close() + + mOS.EXPECT().Open(gomock.Eq("/var/run/nvidia/slurm/0")).Return(realOS.Open(slurm0.Name())) + mOS.EXPECT().Open(gomock.Eq("/var/run/nvidia/slurm/1")).Return(realOS.Open(slurm1.Name())) + + os = mOS + return func() { + os = osinterface.RealOS{} + slurm0.Close() + _ = realOS.Remove(slurm0.Name()) + slurm1.Close() + _ = realOS.Remove(slurm1.Name()) + } + }, + assertion: func(t *testing.T, mbc MetricsByCounter) { + require.Len(t, mbc, 1, "metrics are expected for a single counter only.") + // We get metric value with 0 index + metricValues := mbc[reflect.ValueOf(mbc).MapKeys()[0].Interface().(Counter)] + require.Len(t, 
metricValues, 4, "received unexpected number of metric values.") + // Sort metrics by GPU ID + slices.SortFunc(metricValues, func(a, b Metric) int { + return cmp.Compare(a.GPU, b.GPU) + }) + assert.Equal(t, "0", metricValues[0].GPU) + assert.Equal(t, "42", metricValues[0].Value) + assert.Equal(t, "job1-0", metricValues[0].Attributes[hpcJobAttribute]) + + assert.Equal(t, "1", metricValues[1].GPU) + assert.Equal(t, "451", metricValues[1].Value) + assert.Equal(t, "job1-1", metricValues[1].Attributes[hpcJobAttribute]) + + assert.Equal(t, "1", metricValues[2].GPU) + assert.Equal(t, "451", metricValues[2].Value) + assert.Equal(t, "job2-1", metricValues[2].Attributes[hpcJobAttribute]) + + assert.Equal(t, "2", metricValues[3].GPU) + assert.Equal(t, "1984", metricValues[3].Value) + assert.NotContains(t, metricValues[3].Attributes, hpcJobAttribute) + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.fsState != nil { + cleanup := tt.fsState() + defer cleanup() + } + + metrics := MetricsByCounter{} + counter := Counter{ + FieldID: 155, + FieldName: "DCGM_FI_DEV_POWER_USAGE", + PromType: "gauge", + } + + metrics[counter] = append(metrics[counter], Metric{ + GPU: "0", + GPUUUID: uuid.New().String(), + GPUDevice: "nvidia0", + GPUInstanceID: "", + Value: "42", + Counter: Counter{ + FieldID: 155, + FieldName: "DCGM_FI_DEV_POWER_USAGE", + PromType: "gauge", + }, + Attributes: map[string]string{}, + }) + + metrics[counter] = append(metrics[counter], Metric{ + GPU: "1", + GPUUUID: uuid.New().String(), + GPUDevice: "nvidia1", + GPUInstanceID: "1", + Value: "451", + Counter: Counter{ + FieldID: 155, + FieldName: "DCGM_FI_DEV_POWER_USAGE", + PromType: "gauge", + }, + Attributes: map[string]string{}, + }) + + metrics[counter] = append(metrics[counter], Metric{ + GPU: "2", + GPUUUID: uuid.New().String(), + GPUDevice: "nvidia3", + GPUInstanceID: "2", + Value: "1984", + Counter: Counter{ + FieldID: 155, + FieldName: "DCGM_FI_DEV_POWER_USAGE", + PromType: 
"gauge", + }, + Attributes: map[string]string{}, + }) + + sysInfo := SystemInfo{ + GPUCount: 2, + GPUs: [dcgm.MAX_NUM_DEVICES]GPUInfo{ + { + DeviceInfo: dcgm.Device{ + UUID: "00000000-0000-0000-0000-000000000000", + GPU: 0, + }, + }, + { + DeviceInfo: dcgm.Device{ + UUID: "00000000-0000-0000-0000-000000000001", + GPU: 1, + }, + }, + }, + } + mapper := newHPCMapper(tt.config) + err := mapper.Process(metrics, sysInfo) + if tt.wantErr != nil && !tt.wantErr(t, err, fmt.Sprintf("hpcMapper.Process(%v,%v)", metrics, sysInfo)) { + return + } + tt.assertion(t, metrics) + }) + } +} + +func TestHPCName(t *testing.T) { + assert.Equal(t, "hpcMapper", newHPCMapper(&Config{}).Name()) +} diff --git a/pkg/dcgmexporter/kubernetes.go b/pkg/dcgmexporter/kubernetes.go index 615cfeab..1a04245b 100644 --- a/pkg/dcgmexporter/kubernetes.go +++ b/pkg/dcgmexporter/kubernetes.go @@ -20,7 +20,6 @@ import ( "context" "fmt" "net" - "os" "regexp" "strings" "time" @@ -117,7 +116,6 @@ func connectToServer(socket string) (*grpc.ClientConn, func(), error) { return d.DialContext(ctx, "unix", addr) }), ) - if err != nil { return nil, func() {}, fmt.Errorf("failure connecting to '%s'; err: %w", socket, err) } diff --git a/pkg/dcgmexporter/kubernetes_test.go b/pkg/dcgmexporter/kubernetes_test.go index cdb35ff7..7a9b2b86 100644 --- a/pkg/dcgmexporter/kubernetes_test.go +++ b/pkg/dcgmexporter/kubernetes_test.go @@ -20,7 +20,6 @@ import ( "context" "fmt" "net" - "os" "reflect" "testing" "time" diff --git a/pkg/dcgmexporter/os.go b/pkg/dcgmexporter/os.go new file mode 100644 index 00000000..da351ca4 --- /dev/null +++ b/pkg/dcgmexporter/os.go @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgmexporter + +import osinterface "github.com/NVIDIA/dcgm-exporter/internal/pkg/os" + +var os osinterface.OS = osinterface.RealOS{} diff --git a/pkg/dcgmexporter/parser.go b/pkg/dcgmexporter/parser.go index 08ce36ba..f25036ab 100644 --- a/pkg/dcgmexporter/parser.go +++ b/pkg/dcgmexporter/parser.go @@ -20,7 +20,6 @@ import ( "context" "encoding/csv" "fmt" - "os" "strings" "github.com/NVIDIA/go-dcgm/pkg/dcgm" @@ -96,7 +95,7 @@ func extractCounters(records [][]string, c *Config) (*CounterSet, error) { res := CounterSet{} for i, record := range records { - var useOld = false + useOld := false if len(record) == 0 { continue } diff --git a/pkg/dcgmexporter/parser_test.go b/pkg/dcgmexporter/parser_test.go index ede589b3..0f00e25b 100644 --- a/pkg/dcgmexporter/parser_test.go +++ b/pkg/dcgmexporter/parser_test.go @@ -1,7 +1,6 @@ package dcgmexporter import ( - "os" "testing" "github.com/stretchr/testify/assert" @@ -122,7 +121,6 @@ func TestExtractCounters(t *testing.T) { extractCountersHelper(t, tt.field, tt.valid) }) } - } func extractCountersHelper(t *testing.T, input string, valid bool) { diff --git a/pkg/dcgmexporter/pipeline.go b/pkg/dcgmexporter/pipeline.go index 0e524669..69312403 100644 --- a/pkg/dcgmexporter/pipeline.go +++ b/pkg/dcgmexporter/pipeline.go @@ -33,7 +33,6 @@ func NewMetricsPipeline(config *Config, newDCGMCollector DCGMCollectorConstructor, fieldEntityGroupTypeSystemInfo *FieldEntityGroupTypeSystemInfo, ) (*MetricsPipeline, func(), error) { - logrus.WithField(LoggerDumpKey, fmt.Sprintf("%+v", counters)).Debug("Counters 
are initialized") cleanups := []func(){} @@ -128,6 +127,11 @@ func getTransformations(c *Config) []Transform { } } + if c.HPCJobMappingDir != "" { + hpcMapper := newHPCMapper(c) + transformations = append(transformations, hpcMapper) + } + return transformations } diff --git a/pkg/dcgmexporter/pipeline_test.go b/pkg/dcgmexporter/pipeline_test.go index 4e8f0b3b..f9385eda 100644 --- a/pkg/dcgmexporter/pipeline_test.go +++ b/pkg/dcgmexporter/pipeline_test.go @@ -18,7 +18,6 @@ package dcgmexporter import ( "errors" - "os" "testing" "github.com/sirupsen/logrus" @@ -58,7 +57,8 @@ func testNewDCGMCollector(t *testing.T, return func(c []Counter, hostname string, config *Config, - fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem) (*DCGMCollector, func(), error) { + fieldEntityGroupTypeSystemInfo FieldEntityGroupTypeSystemInfoItem, + ) (*DCGMCollector, func(), error) { // should always create GPU Collector if fieldEntityGroupTypeSystemInfo.SystemInfo.InfoType != dcgm.FE_GPU { if _, ok := enabledCollector[fieldEntityGroupTypeSystemInfo.SystemInfo.InfoType]; !ok { @@ -134,7 +134,6 @@ func TestCountPipelineCleanup(t *testing.T) { dcgm.FE_CPU_CORE: {}, }, }} { - t.Run(c.name, func(t *testing.T) { cleanupCounter := 0 diff --git a/pkg/dcgmexporter/types.go b/pkg/dcgmexporter/types.go index 0dd91468..fc4ba0f4 100644 --- a/pkg/dcgmexporter/types.go +++ b/pkg/dcgmexporter/types.go @@ -39,6 +39,8 @@ var ( namespaceAttribute = "namespace" containerAttribute = "container" + hpcJobAttribute = "hpc_job" + oldPodAttribute = "pod_name" oldNamespaceAttribute = "pod_namespace" oldContainerAttribute = "container_name" diff --git a/pkg/dcgmexporter/utils.go b/pkg/dcgmexporter/utils.go index d4791e15..6f5391bc 100644 --- a/pkg/dcgmexporter/utils.go +++ b/pkg/dcgmexporter/utils.go @@ -17,6 +17,8 @@ package dcgmexporter import ( + "bytes" + "encoding/gob" "fmt" "sync" "time" @@ -35,3 +37,29 @@ func WaitWithTimeout(wg *sync.WaitGroup, timeout time.Duration) error { return 
fmt.Errorf("timeout waiting for WaitGroup") } } + +func deepCopy[T any](src T) (dst T, err error) { + var buf bytes.Buffer + + defer func() { + if r := recover(); r != nil { + // If there was a panic, return the zero value of T and the error. + dst = *new(T) + err = fmt.Errorf("panic occurred: %v", r) + } + }() + + // Create an encoder and send a value. + err = gob.NewEncoder(&buf).Encode(src) + if err != nil { + return *new(T), err + } + + // Create a new instance of the type T and decode into that. + err = gob.NewDecoder(&buf).Decode(&dst) + if err != nil { + return *new(T), err + } + + return dst, nil +} diff --git a/pkg/dcgmexporter/utils_test.go b/pkg/dcgmexporter/utils_test.go index b1a7dc94..c36e1e9f 100644 --- a/pkg/dcgmexporter/utils_test.go +++ b/pkg/dcgmexporter/utils_test.go @@ -45,3 +45,18 @@ func TestWaitWithTimeout(t *testing.T) { require.NoError(t, err) }) } + +func TestDeepCopy(t *testing.T) { + t.Run("Return error when pointer value is nil", func(t *testing.T) { + got, err := deepCopy[*struct{}](nil) + assert.Nil(t, got) + assert.Error(t, err) + }) + + t.Run("Return error when src is unsupported type", func(t *testing.T) { + ch := make(chan int) + got, err := deepCopy(ch) + assert.Nil(t, got) + assert.Error(t, err) + }) +} diff --git a/pkg/dcgmexporter/xid_collector_test.go b/pkg/dcgmexporter/xid_collector_test.go index a60d033c..96b3f9b9 100644 --- a/pkg/dcgmexporter/xid_collector_test.go +++ b/pkg/dcgmexporter/xid_collector_test.go @@ -20,6 +20,7 @@ import ( "bytes" "fmt" "reflect" + "slices" "testing" "time" @@ -28,6 +29,7 @@ import ( "github.com/prometheus/common/expfmt" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "k8s.io/utils/ptr" ) func TestXIDCollector_Gather_Encode(t *testing.T) { @@ -61,11 +63,11 @@ func TestXIDCollector_Gather_Encode(t *testing.T) { } } - // Create fake GPU - numGPUs, err := dcgm.GetAllDeviceCount() + // Get a number of hardware GPUs + hardwareGPUs, err := dcgm.GetAllDeviceCount() 
require.NoError(t, err) - if numGPUs+1 > dcgm.MAX_NUM_DEVICES { + if hardwareGPUs+1 > dcgm.MAX_NUM_DEVICES { t.Skipf("Unable to add fake GPU with more than %d gpus", dcgm.MAX_NUM_DEVICES) } @@ -75,11 +77,12 @@ func TestXIDCollector_Gather_Encode(t *testing.T) { {Entity: dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU}}, } - gpuIDs, err := dcgm.CreateFakeEntities(entityList) + // Create fake GPU + fakeGPUIDs, err := dcgm.CreateFakeEntities(entityList) require.NoError(t, err) - require.NotEmpty(t, gpuIDs) + require.NotEmpty(t, fakeGPUIDs) - for i, gpuID := range gpuIDs { + for i, gpuID := range fakeGPUIDs { err = dcgm.InjectFieldValue(gpuID, dcgm.DCGM_FI_DEV_XID_ERRORS, dcgm.DCGM_FT_INT64, @@ -136,15 +139,28 @@ func TestXIDCollector_Gather_Encode(t *testing.T) { require.Len(t, metrics, 1) // We get metric value with 0 index metricValues := metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] - // We expect 6 records, because we have 3 fake GPU and each GPU experienced 2 XID errors: 42 and 46 - require.Len(t, metricValues, 6) + + fakeGPUIDMap := map[string]struct{}{} + for _, fakeGPUID := range fakeGPUIDs { + fakeGPUIDMap[fmt.Sprint(fakeGPUID)] = struct{}{} + } + + conditionFakeGPUOnly := func(m Metric) bool { + _, exists := fakeGPUIDMap[m.GPU] + return exists + } + + // We want to filter out physical GPU and keep fake only + metricValues = filterMetrics(metricValues, conditionFakeGPUOnly) + + require.Len(t, metricValues, len(fakeGPUIDs)*2) for _, val := range metricValues { require.Contains(t, val.Labels, "window_size_in_ms") require.Equal(t, fmt.Sprint(config.XIDCountWindowSize), val.Labels["window_size_in_ms"]) } // We inject new error - err = dcgm.InjectFieldValue(gpuIDs[0], + err = dcgm.InjectFieldValue(fakeGPUIDs[0], dcgm.DCGM_FI_DEV_XID_ERRORS, dcgm.DCGM_FT_INT64, 0, @@ -162,9 +178,16 @@ func TestXIDCollector_Gather_Encode(t *testing.T) { // We expect 1 metric: DCGM_EXP_XID_ERRORS_COUNT require.Len(t, metrics, 1) + // We get metric value with 
the last index metricValues = metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] - require.Len(t, metricValues, 6+1) + // We want to filter out physical GPU and keep fake only + metricValues = filterMetrics(metricValues, conditionFakeGPUOnly) + // We update metrics with slice, that doesn't contain physical GPU + metrics[reflect.ValueOf(metrics).MapKeys()[0].Interface().(Counter)] = metricValues + + // We have 3 fake GPU and each GPU experienced 3 XID errors: 42, 46, 19 to GPU0 + require.Len(t, metricValues, 1+(len(fakeGPUIDs)*2)) for _, val := range metricValues { require.Contains(t, val.Labels, "window_size_in_ms") require.Equal(t, fmt.Sprint(config.XIDCountWindowSize), val.Labels["window_size_in_ms"]) @@ -186,17 +209,39 @@ func TestXIDCollector_Gather_Encode(t *testing.T) { assert.Equal(t, "DCGM_EXP_XID_ERRORS_COUNT", *metricFamily.Name) assert.Equal(t, "Count of XID Errors within user-specified time window (see xid-count-window-size param).", *metricFamily.Help) assert.Equal(t, io_prometheus_client.MetricType_GAUGE, *metricFamily.Type) - require.Len(t, metricFamily.Metric, 6+1) - assert.Len(t, metricFamily.Metric[0].Label, 8) - assert.Equal(t, "gpu", *metricFamily.Metric[0].Label[0].Name) - assert.Equal(t, "UUID", *metricFamily.Metric[0].Label[1].Name) - assert.Equal(t, "device", *metricFamily.Metric[0].Label[2].Name) - assert.Equal(t, "modelName", *metricFamily.Metric[0].Label[3].Name) - assert.Equal(t, "Hostname", *metricFamily.Metric[0].Label[4].Name) - assert.Equal(t, "DCGM_FI_DRIVER_VERSION", *metricFamily.Metric[0].Label[5].Name) - assert.Equal(t, "window_size_in_ms", *metricFamily.Metric[0].Label[6].Name) - assert.Equal(t, "xid", *metricFamily.Metric[0].Label[7].Name) - assert.NotEmpty(t, *metricFamily.Metric[0].Label[7].Value) + // We have 3 fake GPU and each GPU, except the one experienced XID errors: 42, 46, 19 + require.Len(t, metricFamily.Metric, 1+(len(fakeGPUIDs)*2)) + for _, mv := range metricFamily.Metric { + require.NotNil(t, 
mv.Gauge.Value) + if *(mv.Gauge.Value) == 0 { + // We don't inject XID errors into the hardware GPU, so we do not expect XID label + assert.Len(t, mv.Label, 7) + assert.False(t, slices.ContainsFunc(mv.Label, func(lp *io_prometheus_client.LabelPair) bool { + return ptr.Deref(lp.Name, "") == "xid" + })) + continue + } + assert.Len(t, mv.Label, 8) + assert.Equal(t, "gpu", *mv.Label[0].Name) + assert.Equal(t, "UUID", *mv.Label[1].Name) + assert.Equal(t, "device", *mv.Label[2].Name) + assert.Equal(t, "modelName", *mv.Label[3].Name) + assert.Equal(t, "Hostname", *mv.Label[4].Name) + assert.Equal(t, "DCGM_FI_DRIVER_VERSION", *mv.Label[5].Name) + assert.Equal(t, "window_size_in_ms", *mv.Label[6].Name) + assert.Equal(t, "xid", *mv.Label[7].Name) + assert.NotEmpty(t, *mv.Label[7].Value) + } +} + +func filterMetrics(metricValues []Metric, condition func(Metric) bool) []Metric { + var result []Metric + for _, metricValue := range metricValues { + if condition(metricValue) { + result = append(result, metricValue) + } + } + return result } func TestXIDCollector_NewXIDCollector(t *testing.T) { diff --git a/service-monitor.yaml b/service-monitor.yaml index b00afda2..7e89b3ee 100644 --- a/service-monitor.yaml +++ b/service-monitor.yaml @@ -18,12 +18,12 @@ metadata: name: "dcgm-exporter" labels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.4.1" + app.kubernetes.io/version: "3.4.2" spec: selector: matchLabels: app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.4.1" + app.kubernetes.io/version: "3.4.2" endpoints: - port: "metrics" path: "/metrics" diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile index 9cc13f65..23214afb 100644 --- a/tests/e2e/Makefile +++ b/tests/e2e/Makefile @@ -16,7 +16,7 @@ GO_CMD ?= go NAMESPACE ?= "dcgm-exporter" CHART ?= "./../../deployment/" IMAGE_REPOSITORY ?= "nvcr.io/nvidia/k8s/dcgm-exporter" -IMAGE_TAG ?= "3.3.5-3.4.1-ubuntu22.04" +IMAGE_TAG ?= "3.3.6-3.4.2-ubuntu22.04" KUBECONFIG ?= "~/.kube/config" 
define TEST_CMD