Skip to content

Commit

Permalink
check version
Browse files Browse the repository at this point in the history
Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho committed Sep 4, 2024
1 parent bbf6aba commit 629b5a3
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 7 deletions.
6 changes: 6 additions & 0 deletions components/accelerator/nvidia/query/nvml/clock_events.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ func ClockEventsSupported() (bool, error) {

// ClockEventsSupportedByDevice returns true if clock events are supported by this device.
func ClockEventsSupportedByDevice(dev device.Device) (bool, error) {
// clock events are supported in versions 535 and above
// otherwise, CGO call just exits with
// undefined symbol: nvmlDeviceGetCurrentClocksEventReasons
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7e505374454a0d4fc7339b6c885656d6
_, ret := dev.GetCurrentClocksEventReasons()
if ret != nvml.ERROR_NOT_SUPPORTED {
Expand Down Expand Up @@ -93,6 +96,9 @@ func GetClockEvents(uuid string, dev device.Device) (ClockEvents, error) {
UUID: uuid,
}

// clock events are supported in versions 535 and above
// otherwise, CGO call just exits with
// undefined symbol: nvmlDeviceGetCurrentClocksEventReasons
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7e505374454a0d4fc7339b6c885656d6
reasons, ret := dev.GetCurrentClocksEventReasons()
if ret != nvml.SUCCESS {
Expand Down
35 changes: 35 additions & 0 deletions components/accelerator/nvidia/query/nvml/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,41 @@ type DeviceInfo struct {
device device.Device `json:"-"`
}

// GetDriverVersion initializes NVML, queries the driver version string via
// SystemGetDriverVersion, and returns it unmodified.
//
// e.g.,
// 525.85.12 == does not support clock events
// 535.161.08 == supports clock events
//
// It returns an error if NVML cannot be initialized or the version query
// does not return nvml.SUCCESS.
func GetDriverVersion() (string, error) {
	nvmlLib := nvml.New()
	if ret := nvmlLib.Init(); ret != nvml.SUCCESS {
		return "", fmt.Errorf("failed to initialize NVML: %v", nvml.ErrorString(ret))
	}
	// Pair the successful Init with a Shutdown so this helper does not leave
	// an NVML initialization behind. The return code is deliberately ignored:
	// the version (or the query error) is the only result callers act on.
	defer func() {
		_ = nvmlLib.Shutdown()
	}()

	ver, ret := nvmlLib.SystemGetDriverVersion()
	if ret != nvml.SUCCESS {
		return "", fmt.Errorf("failed to get driver version: %v", nvml.ErrorString(ret))
	}

	return ver, nil
}

// ParseDriverVersion splits a dotted driver version string into its numeric
// components.
//
// Both "major.minor.patch" (e.g., "535.161.08") and the shorter "major.minor"
// form (e.g., "440.100") are accepted; for the short form patch is reported
// as 0. Leading zeros are parsed as decimal, so "08" yields 8.
//
// On failure all three components are 0 and the returned error wraps the
// underlying scan error.
func ParseDriverVersion(version string) (major, minor, patch int, err error) {
	var full [3]int
	_, err = fmt.Sscanf(version, "%d.%d.%d", &full[0], &full[1], &full[2])
	if err == nil {
		return full[0], full[1], full[2], nil
	}

	// Fall back to the two-component form. Separate variables are used so a
	// partially filled result from the first attempt cannot leak through.
	var short [2]int
	if _, shortErr := fmt.Sscanf(version, "%d.%d", &short[0], &short[1]); shortErr == nil {
		return short[0], short[1], 0, nil
	}

	// Report the error from the three-component attempt, since that is the
	// primary expected format.
	return 0, 0, 0, fmt.Errorf("failed to parse driver version %q: %w", version, err)
}

// ClockEventsSupportedVersion reports whether a driver with the given version
// is recent enough for clock events. Only the major component is consulted;
// minor and patch are accepted but do not affect the result.
func ClockEventsSupportedVersion(major, minor, patch int) bool {
	// Clock events need driver major version 535 or newer. On older drivers
	// the CGO call just exits with
	// undefined symbol: nvmlDeviceGetCurrentClocksEventReasons
	const minSupportedMajor = 535

	return major >= minSupportedMajor
}

func NewInstance(ctx context.Context, opts ...OpOption) (Instance, error) {
op := &Op{}
if err := op.applyOpts(opts); err != nil {
Expand Down
55 changes: 55 additions & 0 deletions components/accelerator/nvidia/query/nvml/nvml_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package nvml

import "testing"

// TestParseDriverVersion runs ParseDriverVersion against a table of version
// strings, checking first whether an error was expected and then, for
// successful parses, each returned component.
func TestParseDriverVersion(t *testing.T) {
	type testCase struct {
		version    string
		wantMajor  int
		wantMinor  int
		wantPatch  int
		wantErrNil bool
	}

	testCases := []testCase{
		{version: "525.85.12", wantMajor: 525, wantMinor: 85, wantPatch: 12, wantErrNil: true},
		// the zero-padded patch component "08" is expected to parse as 8
		{version: "535.161.08", wantMajor: 535, wantMinor: 161, wantPatch: 8, wantErrNil: true},
		{version: "invalid.version", wantErrNil: false},
	}

	for _, tc := range testCases {
		// each case runs as a subtest named after its input string
		t.Run(tc.version, func(t *testing.T) {
			major, minor, patch, err := ParseDriverVersion(tc.version)

			gotErrNil := err == nil
			if gotErrNil != tc.wantErrNil {
				t.Errorf("ParseDriverVersion(%q) error = %v, wantErrNil %v", tc.version, err, tc.wantErrNil)
				return
			}
			if !gotErrNil {
				// expected failure: the components carry no meaning
				return
			}

			if major != tc.wantMajor {
				t.Errorf("ParseDriverVersion(%q) major = %d, want %d", tc.version, major, tc.wantMajor)
			}
			if minor != tc.wantMinor {
				t.Errorf("ParseDriverVersion(%q) minor = %d, want %d", tc.version, minor, tc.wantMinor)
			}
			if patch != tc.wantPatch {
				t.Errorf("ParseDriverVersion(%q) patch = %d, want %d", tc.version, patch, tc.wantPatch)
			}
		})
	}
}
27 changes: 20 additions & 7 deletions config/default.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,18 +217,31 @@ func DefaultConfig(ctx context.Context) (*Config, error) {

if runtime.GOOS == "linux" {
if nvidia_query.SMIExists() {
driverVersion, err := nvidia_query_nvml.GetDriverVersion()
if err != nil {
return nil, err
}
major, minor, patch, err := nvidia_query_nvml.ParseDriverVersion(driverVersion)
if err != nil {
return nil, err
}

log.Logger.Debugw("auto-detected nvidia -- configuring nvidia components")

clockEventsSupported, err := nvidia_query_nvml.ClockEventsSupported()
if err == nil {
if clockEventsSupported {
log.Logger.Infow("auto-detected clock events supported")
cfg.Components[nvidia_clock.Name] = nil
if nvidia_query_nvml.ClockEventsSupportedVersion(major, minor, patch) {
clockEventsSupported, err := nvidia_query_nvml.ClockEventsSupported()
if err == nil {
if clockEventsSupported {
log.Logger.Infow("auto-detected clock events supported")
cfg.Components[nvidia_clock.Name] = nil
} else {
log.Logger.Infow("auto-detected clock events not supported -- skipping", "error", err)
}
} else {
log.Logger.Infow("auto-detected clock events not supported -- skipping", "error", err)
log.Logger.Warnw("failed to check clock events supported or not", "error", err)
}
} else {
log.Logger.Warnw("failed to check clock events supported or not", "error", err)
log.Logger.Warnw("old nvidia driver -- skipping clock events, see https://github.com/NVIDIA/go-nvml/pull/123", "version", driverVersion)
}

cfg.Components[nvidia_ecc.Name] = nil
Expand Down

0 comments on commit 629b5a3

Please sign in to comment.