Skip to content

Commit

Permalink
fix(nvidia): skip clock events NVML check if not supported by old dri…
Browse files Browse the repository at this point in the history
…vers

Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho committed Sep 5, 2024
1 parent 5c67697 commit 65e00fd
Show file tree
Hide file tree
Showing 6 changed files with 224 additions and 34 deletions.
8 changes: 5 additions & 3 deletions components/accelerator/nvidia/clock/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@ import (
)

func ToOutput(i *nvidia_query.Output) *Output {
clockEvents := make([]nvidia_query_nvml.ClockEvents, len(i.NVML.DeviceInfos))
for idx, devInfo := range i.NVML.DeviceInfos {
clockEvents[idx] = devInfo.ClockEvents
var clockEvents []nvidia_query_nvml.ClockEvents = nil
for _, devInfo := range i.NVML.DeviceInfos {
if devInfo.ClockEvents != nil {
clockEvents = append(clockEvents, *devInfo.ClockEvents)
}
}
return &Output{
HWSlowdownSMI: HWSlowdownSMI{
Expand Down
49 changes: 49 additions & 0 deletions components/accelerator/nvidia/query/nvml/clock_events.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,57 @@ import (
"encoding/json"
"fmt"

"github.com/leptonai/gpud/log"

"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
"github.com/NVIDIA/go-nvml/pkg/nvml"
"sigs.k8s.io/yaml"
)

// ClockEventsSupported reports whether every device returned by the device
// library supports clock events.
//
// It initializes NVML, enumerates the devices, and probes each one with
// ClockEventsSupportedByDevice. The first device that does not support clock
// events makes the result false; the first probe error is returned as-is.
// With no devices, the result is true.
//
// ref. undefined symbol: nvmlDeviceGetCurrentClocksEventReasons for older nvidia drivers
func ClockEventsSupported() (bool, error) {
	lib := nvml.New()
	ret := lib.Init()
	if ret != nvml.SUCCESS {
		return false, fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret))
	}
	log.Logger.Debugw("successfully initialized NVML")

	devs, err := device.New(lib).GetDevices()
	if err != nil {
		return false, err
	}

	for i := range devs {
		ok, err := ClockEventsSupportedByDevice(devs[i])
		switch {
		case err != nil:
			return false, err
		case !ok:
			// A single unsupported device is enough to answer "no".
			return false, nil
		}
	}
	return true, nil
}

// ClockEventsSupportedByDevice reports whether the given device supports
// querying the current clock event reasons.
//
// It probes the device with GetCurrentClocksEventReasons and maps the NVML
// return code as follows:
//   - nvml.SUCCESS: (true, nil)
//   - nvml.ERROR_NOT_SUPPORTED: (false, nil)
//   - anything else: (false, error describing the NVML return code)
//
// NOTE: clock events are supported in driver versions 535 and above;
// on older drivers the CGO call just exits with
// "undefined symbol: nvmlDeviceGetCurrentClocksEventReasons",
// so callers should gate this probe on the driver version first.
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7e505374454a0d4fc7339b6c885656d6
func ClockEventsSupportedByDevice(dev device.Device) (bool, error) {
	_, ret := dev.GetCurrentClocksEventReasons()
	switch ret {
	case nvml.SUCCESS:
		return true, nil
	case nvml.ERROR_NOT_SUPPORTED:
		// "Not supported" is an expected answer, not a failure.
		return false, nil
	default:
		return false, fmt.Errorf("could not get current clock events: %v", nvml.ErrorString(ret))
	}
}

// ClockEvents represents the current clock events from the nvmlDeviceGetCurrentClocksEventReasons API.
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7e505374454a0d4fc7339b6c885656d6
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1ga115e41a14b747cb334a0e7b49ae1941
Expand Down Expand Up @@ -50,6 +96,9 @@ func GetClockEvents(uuid string, dev device.Device) (ClockEvents, error) {
UUID: uuid,
}

// clock events are supported in versions 535 and above
// otherwise, CGO call just exits with
// undefined symbol: nvmlDeviceGetCurrentClocksEventReasons
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7e505374454a0d4fc7339b6c885656d6
reasons, ret := dev.GetCurrentClocksEventReasons()
if ret != nvml.SUCCESS {
Expand Down
84 changes: 70 additions & 14 deletions components/accelerator/nvidia/query/nvml/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ var _ Instance = (*instance)(nil)
type instance struct {
mu sync.RWMutex

driverVersion string
clockEventsSupported bool

rootCtx context.Context
rootCancel context.CancelFunc

Expand Down Expand Up @@ -91,19 +94,53 @@ type DeviceInfo struct {
// Set true if the device supports GPM metrics.
GPMMetricsSupported bool `json:"gpm_metrics_supported"`

ClockEvents ClockEvents `json:"clock_events"`
ClockSpeed ClockSpeed `json:"clock_speed"`
Memory Memory `json:"memory"`
NVLink NVLink `json:"nvlink"`
Power Power `json:"power"`
Temperature Temperature `json:"temperature"`
Utilization Utilization `json:"utilization"`
Processes Processes `json:"processes"`
ECCErrors ECCErrors `json:"ecc_errors"`
ClockEvents *ClockEvents `json:"clock_events,omitempty"`
ClockSpeed ClockSpeed `json:"clock_speed"`
Memory Memory `json:"memory"`
NVLink NVLink `json:"nvlink"`
Power Power `json:"power"`
Temperature Temperature `json:"temperature"`
Utilization Utilization `json:"utilization"`
Processes Processes `json:"processes"`
ECCErrors ECCErrors `json:"ecc_errors"`

device device.Device `json:"-"`
}

// GetDriverVersion initializes NVML and returns the driver version string
// reported by SystemGetDriverVersion.
//
// Example values:
//
//	525.85.12  == does not support clock events
//	535.161.08 == supports clock events
//
// An error is returned when NVML cannot be initialized or the version query
// does not succeed.
func GetDriverVersion() (string, error) {
	lib := nvml.New()

	ret := lib.Init()
	if ret != nvml.SUCCESS {
		return "", fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret))
	}

	version, ret := lib.SystemGetDriverVersion()
	if ret == nvml.SUCCESS {
		return version, nil
	}
	return "", fmt.Errorf("failed to get driver version: %v", nvml.ErrorString(ret))
}

// ParseDriverVersion splits a dotted driver version string into its numeric
// major, minor, and patch components.
//
// Both three-component ("535.161.08") and two-component ("390.157") versions
// are accepted; a two-component version yields patch == 0. Leading zeros are
// read as decimal, so "08" parses as 8.
//
// An error is returned when the string has fewer than two numeric,
// dot-separated components.
func ParseDriverVersion(version string) (major, minor, patch int, err error) {
	var three [3]int
	_, err = fmt.Sscanf(version, "%d.%d.%d", &three[0], &three[1], &three[2])
	if err == nil {
		return three[0], three[1], three[2], nil
	}

	// Some driver releases carry only "major.minor"; treat the patch as 0
	// rather than failing the whole parse.
	var two [2]int
	if _, twoErr := fmt.Sscanf(version, "%d.%d", &two[0], &two[1]); twoErr == nil {
		return two[0], two[1], 0, nil
	}

	return 0, 0, 0, fmt.Errorf("failed to parse driver version: %w", err)
}

// ClockEventsSupportedVersion reports whether a driver with the given version
// is new enough for clock events. Only the major version is consulted; minor
// and patch are accepted for call-site convenience and are ignored.
//
// Clock events are supported in versions 535 and above; otherwise, the CGO
// call just exits with
// "undefined symbol: nvmlDeviceGetCurrentClocksEventReasons".
func ClockEventsSupportedVersion(major, minor, patch int) bool {
	const firstSupportedMajor = 535

	if major < firstSupportedMajor {
		return false
	}
	return true
}

func NewInstance(ctx context.Context, opts ...OpOption) (Instance, error) {
op := &Op{}
if err := op.applyOpts(opts); err != nil {
Expand All @@ -118,7 +155,20 @@ func NewInstance(ctx context.Context, opts ...OpOption) (Instance, error) {
if ret := nvmlLib.Init(); ret != nvml.SUCCESS {
return nil, fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret))
}
log.Logger.Debugw("successfully initialized NVML")
driverVersion, err := GetDriverVersion()
if err != nil {
return nil, err
}
major, minor, patch, err := ParseDriverVersion(driverVersion)
if err != nil {
return nil, err
}
clockEventsSupported := ClockEventsSupportedVersion(major, minor, patch)
if !clockEventsSupported {
log.Logger.Warnw("old nvidia driver -- skipping clock events, see https://github.com/NVIDIA/go-nvml/pull/123", "version", driverVersion)
}

log.Logger.Debugw("successfully initialized NVML", "driverVersion", driverVersion)

deviceLib := device.New(nvmlLib)
infoLib := nvinfo.New(
Expand All @@ -141,6 +191,9 @@ func NewInstance(ctx context.Context, opts ...OpOption) (Instance, error) {
rootCtx: rootCtx,
rootCancel: rootCancel,

driverVersion: driverVersion,
clockEventsSupported: clockEventsSupported,

nvmlLib: nvmlLib,
deviceLib: deviceLib,
infoLib: infoLib,
Expand Down Expand Up @@ -345,12 +398,15 @@ func (inst *instance) Get() (*Output, error) {
}
st.DeviceInfos = append(st.DeviceInfos, latestInfo)

var err error
latestInfo.ClockEvents, err = GetClockEvents(devInfo.UUID, devInfo.device)
if err != nil {
return st, err
if inst.clockEventsSupported {
clockEvents, err := GetClockEvents(devInfo.UUID, devInfo.device)
if err != nil {
return st, err
}
latestInfo.ClockEvents = &clockEvents
}

var err error
latestInfo.ClockSpeed, err = GetClockSpeed(devInfo.UUID, devInfo.device)
if err != nil {
return st, err
Expand Down
55 changes: 55 additions & 0 deletions components/accelerator/nvidia/query/nvml/nvml_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package nvml

import "testing"

// TestParseDriverVersion checks ParseDriverVersion against a table of version
// strings: two well-formed three-component versions and one malformed input
// that must produce an error.
func TestParseDriverVersion(t *testing.T) {
	type testCase struct {
		version    string
		wantMajor  int
		wantMinor  int
		wantPatch  int
		wantErrNil bool
	}

	cases := []testCase{
		{version: "525.85.12", wantMajor: 525, wantMinor: 85, wantPatch: 12, wantErrNil: true},
		{version: "535.161.08", wantMajor: 535, wantMinor: 161, wantPatch: 8, wantErrNil: true},
		{version: "invalid.version", wantErrNil: false},
	}

	for _, c := range cases {
		t.Run(c.version, func(t *testing.T) {
			gotMajor, gotMinor, gotPatch, err := ParseDriverVersion(c.version)

			gotErrNil := err == nil
			if gotErrNil != c.wantErrNil {
				t.Errorf("ParseDriverVersion(%q) error = %v, wantErrNil %v", c.version, err, c.wantErrNil)
				return
			}

			// Components are only meaningful when parsing succeeded.
			if !gotErrNil {
				return
			}

			if gotMajor != c.wantMajor {
				t.Errorf("ParseDriverVersion(%q) major = %d, want %d", c.version, gotMajor, c.wantMajor)
			}
			if gotMinor != c.wantMinor {
				t.Errorf("ParseDriverVersion(%q) minor = %d, want %d", c.version, gotMinor, c.wantMinor)
			}
			if gotPatch != c.wantPatch {
				t.Errorf("ParseDriverVersion(%q) patch = %d, want %d", c.version, gotPatch, c.wantPatch)
			}
		})
	}
}
36 changes: 20 additions & 16 deletions components/accelerator/nvidia/query/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,14 +140,16 @@ func Get(ctx context.Context) (output any, err error) {
for _, dev := range o.NVML.DeviceInfos {
log.Logger.Debugw("setting metrics for device", "uuid", dev.UUID, "bus", dev.Bus, "device", dev.Device, "minorNumber", dev.MinorNumber)

if err := metrics_clock.SetHWSlowdown(ctx, dev.UUID, dev.ClockEvents.HWSlowdown, now); err != nil {
return nil, err
}
if err := metrics_clock.SetHWSlowdownThermal(ctx, dev.UUID, dev.ClockEvents.HWSlowdownThermal, now); err != nil {
return nil, err
}
if err := metrics_clock.SetHWSlowdownPowerBrake(ctx, dev.UUID, dev.ClockEvents.HWSlowdownPowerBrake, now); err != nil {
return nil, err
if dev.ClockEvents != nil {
if err := metrics_clock.SetHWSlowdown(ctx, dev.UUID, dev.ClockEvents.HWSlowdown, now); err != nil {
return nil, err
}
if err := metrics_clock.SetHWSlowdownThermal(ctx, dev.UUID, dev.ClockEvents.HWSlowdownThermal, now); err != nil {
return nil, err
}
if err := metrics_clock.SetHWSlowdownPowerBrake(ctx, dev.UUID, dev.ClockEvents.HWSlowdownPowerBrake, now); err != nil {
return nil, err
}
}

if err := metrics_clockspeed.SetGraphicsMHz(ctx, dev.UUID, dev.ClockSpeed.GraphicsMHz, now); err != nil {
Expand Down Expand Up @@ -368,16 +370,18 @@ func (o *Output) PrintInfo(debug bool) {
for _, dev := range o.NVML.DeviceInfos {
fmt.Printf("\n\n##################\nNVML scan results for %s\n\n", dev.UUID)

if dev.ClockEvents.HWSlowdown || dev.ClockEvents.HWSlowdownThermal || dev.ClockEvents.HWSlowdownPowerBrake {
fmt.Printf("%s NVML found hw slowdown error(s)\n", warningSign)
yb, err := dev.ClockEvents.YAML()
if err != nil {
log.Logger.Warnw("failed to marshal clock events", "error", err)
if dev.ClockEvents != nil {
if dev.ClockEvents.HWSlowdown || dev.ClockEvents.HWSlowdownThermal || dev.ClockEvents.HWSlowdownPowerBrake {
fmt.Printf("%s NVML found hw slowdown error(s)\n", warningSign)
yb, err := dev.ClockEvents.YAML()
if err != nil {
log.Logger.Warnw("failed to marshal clock events", "error", err)
} else {
fmt.Printf("clock events:\n%s\n\n", string(yb))
}
} else {
fmt.Printf("clock events:\n%s\n\n", string(yb))
fmt.Printf("%s NVML found no hw slowdown error\n", checkMark)
}
} else {
fmt.Printf("%s NVML found no hw slowdown error\n", checkMark)
}

uncorrectedErrs := dev.ECCErrors.Volatile.FindUncorrectedErrs()
Expand Down
26 changes: 25 additions & 1 deletion config/default.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,9 +217,33 @@ func DefaultConfig(ctx context.Context) (*Config, error) {

if runtime.GOOS == "linux" {
if nvidia_query.SMIExists() {
driverVersion, err := nvidia_query_nvml.GetDriverVersion()
if err != nil {
return nil, err
}
major, minor, patch, err := nvidia_query_nvml.ParseDriverVersion(driverVersion)
if err != nil {
return nil, err
}

log.Logger.Debugw("auto-detected nvidia -- configuring nvidia components")

cfg.Components[nvidia_clock.Name] = nil
if nvidia_query_nvml.ClockEventsSupportedVersion(major, minor, patch) {
clockEventsSupported, err := nvidia_query_nvml.ClockEventsSupported()
if err == nil {
if clockEventsSupported {
log.Logger.Infow("auto-detected clock events supported")
cfg.Components[nvidia_clock.Name] = nil
} else {
log.Logger.Infow("auto-detected clock events not supported -- skipping", "error", err)
}
} else {
log.Logger.Warnw("failed to check clock events supported or not", "error", err)
}
} else {
log.Logger.Warnw("old nvidia driver -- skipping clock events in the default config, see https://github.com/NVIDIA/go-nvml/pull/123", "version", driverVersion)
}

cfg.Components[nvidia_ecc.Name] = nil
cfg.Components[nvidia_error.Name] = nil
if _, ok := cfg.Components[dmesg.Name]; ok {
Expand Down

0 comments on commit 65e00fd

Please sign in to comment.