From 9aa08cf64ff6b128d50d82b8cae52c5fbd8bb39a Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Tue, 20 Aug 2024 14:18:52 +0800 Subject: [PATCH 1/2] fix(scan): require sudo access for dmesg scanning Signed-off-by: Gyuho Lee --- cmd/gpud/command/diagnose.go | 2 +- components/accelerator/nvidia/query/nvidia_peermem.go | 2 +- components/diagnose/scan.go | 8 +++++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cmd/gpud/command/diagnose.go b/cmd/gpud/command/diagnose.go index 71d79da3..c1a1dbd3 100644 --- a/cmd/gpud/command/diagnose.go +++ b/cmd/gpud/command/diagnose.go @@ -13,7 +13,7 @@ import ( func cmdDiagnose(cliContext *cli.Context) error { if os.Geteuid() != 0 { - return errors.New("diagnose requires root") + return errors.New("requires sudo/root access to diagnose GPU issues") } ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) diff --git a/components/accelerator/nvidia/query/nvidia_peermem.go b/components/accelerator/nvidia/query/nvidia_peermem.go index 0984e66c..4b7bc9ca 100644 --- a/components/accelerator/nvidia/query/nvidia_peermem.go +++ b/components/accelerator/nvidia/query/nvidia_peermem.go @@ -12,7 +12,7 @@ const peerMemModule = "nvidia_peermem" func CheckLsmodPeermemModule(ctx context.Context) (*LsmodPeermemModuleOutput, error) { if os.Geteuid() != 0 { - return nil, errors.New("nvidia_peermem check requires root") + return nil, errors.New("requires sudo/root access to check if ib_core is using nvidia_peermem") } b, err := exec.CommandContext(ctx, "sudo", "lsmod").CombinedOutput() diff --git a/components/diagnose/scan.go b/components/diagnose/scan.go index a014b648..9bcb0ee5 100644 --- a/components/diagnose/scan.go +++ b/components/diagnose/scan.go @@ -2,7 +2,9 @@ package diagnose import ( "context" + "errors" "fmt" + "os" "time" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" @@ -23,6 +25,10 @@ const ( // Runs the scan operations. 
func Scan(ctx context.Context, lines int, debug bool) error { + if os.Geteuid() != 0 { + return errors.New("requires sudo/root access in order to scan dmesg errors") + } + fmt.Printf("\n\n%s scanning the host\n\n", inProgress) if nvidia_query.SMIExists() { @@ -45,7 +51,7 @@ func Scan(ctx context.Context, lines int, debug bool) error { } else { output.PrintInfo(debug) - fmt.Printf("%s checking nvidia xid errors\n", inProgress) + fmt.Printf("\n%s checking nvidia xid errors\n", inProgress) select { case <-ctx.Done(): From 61ae5294b9d1d40f47b9552ccdca1cdd75e2fe51 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Tue, 20 Aug 2024 14:56:40 +0800 Subject: [PATCH 2/2] feat(query): print GPU product name in scan output Signed-off-by: Gyuho Lee --- components/accelerator/nvidia/query/query.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/components/accelerator/nvidia/query/query.go b/components/accelerator/nvidia/query/query.go index 7536a51e..503e8fe9 100644 --- a/components/accelerator/nvidia/query/query.go +++ b/components/accelerator/nvidia/query/query.go @@ -298,6 +298,10 @@ func (o *Output) PrintInfo(debug bool) { } if o.SMI != nil { + if len(o.SMI.GPUs) > 0 { + fmt.Printf("%s product name: %s (nvidia-smi)\n", checkMark, o.SMI.GPUs[0].ProductName) + } + if errs := o.SMI.FindGPUErrs(); len(errs) > 0 { fmt.Printf("%s scanned nvidia-smi -- found %d error(s)\n", warningSign, len(errs)) for _, err := range errs { @@ -356,6 +360,10 @@ func (o *Output) PrintInfo(debug bool) { } if o.NVML != nil { + if len(o.NVML.DeviceInfos) > 0 { + fmt.Printf("%s name: %s (NVML)\n", checkMark, o.NVML.DeviceInfos[0].Name) + } + for _, dev := range o.NVML.DeviceInfos { fmt.Printf("\n\n##################\nNVML scan results for %s\n\n", dev.UUID)