Skip to content

Commit

Permalink
fix(scan): require sudo access for dmesg scanning (#13)
Browse files: browse the repository at this point in the history
* fix(scan): require sudo access for dmesg scanning

Signed-off-by: Gyuho Lee <[email protected]>

* update

Signed-off-by: Gyuho Lee <[email protected]>

---------

Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho authored Aug 20, 2024
1 parent 251c654 commit 1ddef0a
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 3 deletions.
2 changes: 1 addition & 1 deletion cmd/gpud/command/diagnose.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import (

func cmdDiagnose(cliContext *cli.Context) error {
if os.Geteuid() != 0 {
return errors.New("diagnose requires root")
return errors.New("requires sudo/root access to diagnose GPU issues")
}

ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
Expand Down
2 changes: 1 addition & 1 deletion components/accelerator/nvidia/query/nvidia_peermem.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ const peerMemModule = "nvidia_peermem"

func CheckLsmodPeermemModule(ctx context.Context) (*LsmodPeermemModuleOutput, error) {
if os.Geteuid() != 0 {
return nil, errors.New("nvidia_peermem check requires root")
return nil, errors.New("requires sudo/root access to check if ib_core is using nvidia_peermem")
}

b, err := exec.CommandContext(ctx, "sudo", "lsmod").CombinedOutput()
Expand Down
8 changes: 8 additions & 0 deletions components/accelerator/nvidia/query/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,10 @@ func (o *Output) PrintInfo(debug bool) {
}

if o.SMI != nil {
if len(o.SMI.GPUs) > 0 {
fmt.Printf("%s product name: %s (nvidia-smi)\n", checkMark, o.SMI.GPUs[0].ProductName)
}

if errs := o.SMI.FindGPUErrs(); len(errs) > 0 {
fmt.Printf("%s scanned nvidia-smi -- found %d error(s)\n", warningSign, len(errs))
for _, err := range errs {
Expand Down Expand Up @@ -356,6 +360,10 @@ func (o *Output) PrintInfo(debug bool) {
}

if o.NVML != nil {
if len(o.NVML.DeviceInfos) > 0 {
fmt.Printf("%s name: %s (NVML)\n", checkMark, o.NVML.DeviceInfos[0].Name)
}

for _, dev := range o.NVML.DeviceInfos {
fmt.Printf("\n\n##################\nNVML scan results for %s\n\n", dev.UUID)

Expand Down
8 changes: 7 additions & 1 deletion components/diagnose/scan.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ package diagnose

import (
"context"
"errors"
"fmt"
"os"
"time"

nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
Expand All @@ -23,6 +25,10 @@ const (

// Runs the scan operations.
func Scan(ctx context.Context, lines int, debug bool) error {
if os.Geteuid() != 0 {
return errors.New("requires sudo/root access in order to scan dmesg errors")
}

fmt.Printf("\n\n%s scanning the host\n\n", inProgress)

if nvidia_query.SMIExists() {
Expand All @@ -45,7 +51,7 @@ func Scan(ctx context.Context, lines int, debug bool) error {
} else {
output.PrintInfo(debug)

fmt.Printf("%s checking nvidia xid errors\n", inProgress)
fmt.Printf("\n%s checking nvidia xid errors\n", inProgress)

select {
case <-ctx.Done():
Expand Down

0 comments on commit 1ddef0a

Please sign in to comment.