Skip to content

Commit

Permalink
fix(xid/sxid): rely on last reboot first
Browse files Browse the repository at this point in the history
Signed-off-by: cardyok <[email protected]>
  • Loading branch information
cardyok committed Feb 7, 2025
1 parent ab3785a commit 22cf68f
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 14 deletions.
37 changes: 30 additions & 7 deletions components/accelerator/nvidia/error/sxid/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
os_id "github.com/leptonai/gpud/components/os/id"
"github.com/leptonai/gpud/log"
pkg_dmesg "github.com/leptonai/gpud/pkg/dmesg"
"github.com/leptonai/gpud/pkg/reboot"
)

const (
Expand All @@ -32,6 +33,7 @@ const (

DefaultRetentionPeriod = 3 * 24 * time.Hour
DefaultStateUpdatePeriod = 30 * time.Second
DefaultRebootCounts = 3
)

type SXIDComponent struct {
Expand Down Expand Up @@ -201,25 +203,46 @@ func (c *SXIDComponent) SetHealthy() error {
}

// updateCurrentState recomputes c.currState. It gathers reboot events, loads
// the events held in c.store, merges the two lists with mergeEvents, and
// assigns the result of EvolveHealthyState to c.currState while holding c.mu.
// A wrapped error is returned when any of the lookups fails.
//
// NOTE(review): this span comes from a rendered diff, so the pre-change lines
// (direct os component lookup) and their replacements (getRebootEvents) both
// appear below.
func (c *SXIDComponent) updateCurrentState() error {
osComponent, err := components.GetComponent(os_id.Name)
if err != nil {
return fmt.Errorf("failed to get os component: %w", err)
}
osEvents, err := osComponent.Events(c.rootCtx, time.Now().Add(-DefaultRetentionPeriod))
rebootEvents, err := getRebootEvents(c.rootCtx)
if err != nil {
return fmt.Errorf("failed to get os events: %w", err)
return fmt.Errorf("failed to get reboot events: %w", err)
}
// A zero time.Time is passed as the lower bound when reading the store.
localEvents, err := c.store.Get(c.rootCtx, time.Time{})
if err != nil {
return fmt.Errorf("failed to get all events: %w", err)
}
events := mergeEvents(osEvents, localEvents)
events := mergeEvents(rebootEvents, localEvents)
// Only the assignment to currState is done under the mutex.
c.mu.Lock()
c.currState = EvolveHealthyState(events)
c.mu.Unlock()
return nil
}

// getRebootEvents returns reboot events for the health-state evaluation.
//
// The preferred source is reboot.LastReboot, asked for the most recent
// DefaultRebootCounts boots; every returned time becomes an event named
// "reboot". If that call fails, the failure is logged and the events of the
// os component within the last DefaultRetentionPeriod are returned instead.
func getRebootEvents(ctx context.Context) ([]components.Event, error) {
	bootTimes, lastErr := reboot.LastReboot(DefaultRebootCounts)
	if lastErr != nil {
		// Fallback path: ask the os component for its recorded events.
		log.Logger.Infow("failed to get reboot events, falling back to os component", "error", lastErr)
		osComp, err := components.GetComponent(os_id.Name)
		if err != nil {
			return nil, fmt.Errorf("failed to get os component: %w", err)
		}
		fallback, err := osComp.Events(ctx, time.Now().Add(-DefaultRetentionPeriod))
		if err != nil {
			return nil, fmt.Errorf("failed to get os events: %w", err)
		}
		return fallback, nil
	}

	var events []components.Event
	for i := range bootTimes {
		events = append(events, components.Event{
			Time: metav1.Time{Time: bootTimes[i]},
			Name: "reboot",
		})
	}
	return events, nil
}

// mergeEvents merges two event slices and returns a new slice sorted by time in descending order
func mergeEvents(a, b []components.Event) []components.Event {
totalLen := len(a) + len(b)
Expand Down
37 changes: 30 additions & 7 deletions components/accelerator/nvidia/error/xid/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
os_id "github.com/leptonai/gpud/components/os/id"
"github.com/leptonai/gpud/log"
pkg_dmesg "github.com/leptonai/gpud/pkg/dmesg"
"github.com/leptonai/gpud/pkg/reboot"
)

const (
Expand All @@ -33,6 +34,7 @@ const (

DefaultRetentionPeriod = 3 * 24 * time.Hour
DefaultStateUpdatePeriod = 30 * time.Second
DefaultRebootCounts = 3
)

type XIDComponent struct {
Expand Down Expand Up @@ -203,25 +205,46 @@ func (c *XIDComponent) SetHealthy() error {
}

// updateCurrentState recomputes c.currState. It gathers reboot events, loads
// the events held in c.store, merges the two lists with mergeEvents, and
// assigns the result of EvolveHealthyState to c.currState while holding c.mu.
// A wrapped error is returned when any of the lookups fails.
//
// NOTE(review): this span comes from a rendered diff, so the pre-change lines
// (direct os component lookup) and their replacements (getRebootEvents) both
// appear below.
func (c *XIDComponent) updateCurrentState() error {
osComponent, err := components.GetComponent(os_id.Name)
if err != nil {
return fmt.Errorf("failed to get os component: %w", err)
}
osEvents, err := osComponent.Events(c.rootCtx, time.Now().Add(-DefaultRetentionPeriod))
rebootEvents, err := getRebootEvents(c.rootCtx)
if err != nil {
return fmt.Errorf("failed to get os events: %w", err)
return fmt.Errorf("failed to get reboot events: %w", err)
}
// A zero time.Time is passed as the lower bound when reading the store.
localEvents, err := c.store.Get(c.rootCtx, time.Time{})
if err != nil {
return fmt.Errorf("failed to get all events: %w", err)
}
events := mergeEvents(osEvents, localEvents)
events := mergeEvents(rebootEvents, localEvents)
// Only the assignment to currState is done under the mutex.
c.mu.Lock()
c.currState = EvolveHealthyState(events)
c.mu.Unlock()
return nil
}

// getRebootEvents returns reboot events for the health-state evaluation.
//
// The preferred source is reboot.LastReboot, asked for the most recent
// DefaultRebootCounts boots; every returned time becomes an event named
// "reboot". If that call fails, the failure is logged and the events of the
// os component within the last DefaultRetentionPeriod are returned instead.
func getRebootEvents(ctx context.Context) ([]components.Event, error) {
	bootTimes, lastErr := reboot.LastReboot(DefaultRebootCounts)
	if lastErr != nil {
		// Fallback path: ask the os component for its recorded events.
		log.Logger.Infow("failed to get reboot events, falling back to os component", "error", lastErr)
		osComp, err := components.GetComponent(os_id.Name)
		if err != nil {
			return nil, fmt.Errorf("failed to get os component: %w", err)
		}
		fallback, err := osComp.Events(ctx, time.Now().Add(-DefaultRetentionPeriod))
		if err != nil {
			return nil, fmt.Errorf("failed to get os events: %w", err)
		}
		return fallback, nil
	}

	var events []components.Event
	for i := range bootTimes {
		events = append(events, components.Event{
			Time: metav1.Time{Time: bootTimes[i]},
			Name: "reboot",
		})
	}
	return events, nil
}

// mergeEvents merges two event slices and returns a new slice sorted by time in descending order
func mergeEvents(a, b []components.Event) []components.Event {
totalLen := len(a) + len(b)
Expand Down
37 changes: 37 additions & 0 deletions pkg/reboot/reboot.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,15 @@
package reboot

import (
"bytes"
"context"
"errors"
"fmt"
stdos "os"
"os/exec"
"regexp"
"strconv"
"strings"
"time"

"github.com/leptonai/gpud/log"
Expand Down Expand Up @@ -124,3 +129,35 @@ func Reboot(ctx context.Context, opts ...OpOption) error {
)
return nil
}

// LastReboot runs `last reboot -n <count> -F` and returns the timestamps that
// LastRebootHelper extracts from the command's standard output.
//
// count is passed to the `-n` flag unchanged. When the command cannot be
// started or exits with a failure, the returned error wraps the underlying
// error and carries whatever the command wrote to standard error, so callers
// that log it can see why `last` failed rather than only an exit status.
func LastReboot(count int) ([]time.Time, error) {
	var stdout, stderr bytes.Buffer
	cmd := exec.Command("last", "reboot", "-n", strconv.Itoa(count), "-F")
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		return nil, fmt.Errorf("running last reboot: %w (stderr: %q)", err, stderr.String())
	}
	return LastRebootHelper(bytes.Split(stdout.Bytes(), []byte{'\n'}))
}

// lastRebootTimestampRE matches a "<weekday> <month> <day> <hh:mm:ss> <year>"
// style timestamp. It is compiled once at package scope instead of once per
// input line.
var lastRebootTimestampRE = regexp.MustCompile(`(\w+\s+\w+\s+\d+\s+\d+:\d+:\d+\s+\d+)`)

// lastRebootTimeLayout is the time layout used to parse a matched timestamp.
const lastRebootTimeLayout = "Mon Jan 2 15:04:05 2006"

// LastRebootHelper extracts reboot timestamps from lines of text.
//
// Only lines containing the substring "reboot" are considered. For each such
// line, the first timestamp matched by lastRebootTimestampRE is parsed; lines
// without a match, or whose match does not parse, are skipped silently. The
// results keep the order of the input lines. The returned error is always nil.
//
// NOTE(review): time.Parse yields UTC when the input carries no zone. If the
// producer of these lines prints local wall-clock time, the results are
// shifted by the local UTC offset — confirm before comparing against
// timestamps from other sources.
func LastRebootHelper(lines [][]byte) ([]time.Time, error) {
	var rebootTimes []time.Time
	for _, raw := range lines {
		line := string(raw)
		if !strings.Contains(line, "reboot") {
			continue
		}
		// Only the first timestamp on the line is used.
		match := lastRebootTimestampRE.FindString(line)
		if match == "" {
			continue
		}
		t, err := time.Parse(lastRebootTimeLayout, match)
		if err != nil {
			// Best effort: an unparseable timestamp is ignored rather than
			// failing the whole batch.
			continue
		}
		rebootTimes = append(rebootTimes, t)
	}
	return rebootTimes, nil
}
28 changes: 28 additions & 0 deletions pkg/reboot/reboot_test.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package reboot

import (
"bytes"
"context"
"testing"
"time"
)

func TestReboot(t *testing.T) {
Expand All @@ -14,3 +16,29 @@ func TestReboot(t *testing.T) {
t.Errorf("Reboot() expected error %v, got %v", ErrNotRoot, err)
}
}

// TestLastReboot feeds LastRebootHelper three sample reboot lines and checks
// that each timestamp comes back, in input order, when rendered as RFC 3339.
func TestLastReboot(t *testing.T) {
	const sample = `reboot system boot 5.4.0-42-generic Mon Jan 2 15:04:05 2023
reboot system boot 5.4.0-42-generic Sun Jan 1 14:00:00 2023
reboot system boot 5.4.0-42-generic Sat Dec 31 13:00:00 2022
`
	want := []string{
		"2023-01-02T15:04:05Z",
		"2023-01-01T14:00:00Z",
		"2022-12-31T13:00:00Z",
	}

	got, err := LastRebootHelper(bytes.Split([]byte(sample), []byte{'\n'}))
	if err != nil {
		t.Fatalf("Expected no error, got %v", err)
	}
	if len(got) != 3 {
		t.Fatalf("Expected 3 reboot times, got %d", len(got))
	}

	for i := range want {
		if actual := got[i].Format(time.RFC3339); actual != want[i] {
			t.Errorf("Expected %s, got %s", want[i], actual)
		}
	}
}

0 comments on commit 22cf68f

Please sign in to comment.