Skip to content

Commit

Permalink
fix(nvidia/nvml): remove xid event polling gaps, log when event happe…
Browse files Browse the repository at this point in the history
…ns (#49)

* fix(nvidia/nvml): remove xid event polling gaps, log when event happens

Signed-off-by: Gyuho Lee <[email protected]>

* explain more

Signed-off-by: Gyuho Lee <[email protected]>

---------

Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho authored Sep 5, 2024
1 parent a50ec24 commit 5c67697
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 10 deletions.
4 changes: 0 additions & 4 deletions components/accelerator/nvidia/query/nvml/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,6 @@ type instance struct {
// maps from uuid to device info
devices map[string]*DeviceInfo

xidPollInterval time.Duration

xidErrorSupported bool
xidEventMask uint64
xidEventSet nvml.EventSet
Expand Down Expand Up @@ -150,8 +148,6 @@ func NewInstance(ctx context.Context, opts ...OpOption) (Instance, error) {
nvmlExists: nvmlExists,
nvmlExistsMsg: nvmlExistsMsg,

xidPollInterval: time.Minute,

xidErrorSupported: false,
xidEventSet: xidEventSet,
xidEventMask: defaultXidEventMask,
Expand Down
22 changes: 16 additions & 6 deletions components/accelerator/nvidia/query/nvml/xid.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,21 +60,28 @@ const defaultXidEventMask = uint64(nvml.EventTypeXidCriticalError | nvml.EventTy
func (inst *instance) pollXidEvents() {
log.Logger.Debugw("polling xid events")

ticker := time.NewTicker(1)
defer ticker.Stop()

for {
select {
case <-inst.rootCtx.Done():
return
case <-ticker.C:
ticker.Reset(inst.xidPollInterval)
default:
}

// It is OK to retry in an infinite for-loop here without an extra sleep:
// the Wait call below already blocks for up to 5 seconds on each iteration,
// and polling back-to-back means we do not miss events between retries.
// An event is only sent to the "xidEventCh" channel if it is an Xid event,
// so it is safe to retry in this for-loop.

// waits 5 seconds
// ref. https://docs.nvidia.com/deploy/nvml-api/group__nvmlEvents.html#group__nvmlEvents
e, ret := inst.xidEventSet.Wait(5000)

if ret == nvml.ERROR_NOT_SUPPORTED {
log.Logger.Warnw("xid events not supported -- skipping", "error", nvml.ErrorString(ret))
return
}

if ret == nvml.ERROR_TIMEOUT {
log.Logger.Debugw("no event found in wait (timeout) -- retrying...", "error", nvml.ErrorString(ret))
continue
Expand Down Expand Up @@ -135,12 +142,15 @@ func (inst *instance) pollXidEvents() {

Error: deviceUUIDErr,
}

log.Logger.Warnw("detected xid event", "event", event)
select {
case <-inst.rootCtx.Done():
return
case inst.xidEventCh <- event:
log.Logger.Warnw("notified xid event", "event", event)
default:
log.Logger.Debugw("xid event channel is full, skipping event")
log.Logger.Warnw("xid event channel is full, skipping event")
}
}
}

0 comments on commit 5c67697

Please sign in to comment.