diff --git a/components/accelerator/nvidia/query/nvml/options.go b/components/accelerator/nvidia/query/nvml/options.go index 58e5a8e0..f598e8ec 100644 --- a/components/accelerator/nvidia/query/nvml/options.go +++ b/components/accelerator/nvidia/query/nvml/options.go @@ -4,13 +4,17 @@ import ( "database/sql" "github.com/NVIDIA/go-nvml/pkg/nvml" + + events_db "github.com/leptonai/gpud/components/db" "github.com/leptonai/gpud/pkg/sqlite" ) type Op struct { - dbRW *sql.DB - dbRO *sql.DB - gpmMetricsIDs map[nvml.GpmMetricId]struct{} + dbRW *sql.DB + dbRO *sql.DB + xidEventsStore events_db.Store + hwslowdownEventsStore events_db.Store + gpmMetricsIDs map[nvml.GpmMetricId]struct{} } type OpOption func(*Op) @@ -53,6 +57,18 @@ func WithDBRO(db *sql.DB) OpOption { } } +func WithXidEventsStore(store events_db.Store) OpOption { + return func(op *Op) { + op.xidEventsStore = store + } +} + +func WithHWSlowdownEventsStore(store events_db.Store) OpOption { + return func(op *Op) { + op.hwslowdownEventsStore = store + } +} + func WithGPMMetricsID(ids ...nvml.GpmMetricId) OpOption { return func(op *Op) { if op.gpmMetricsIDs == nil { diff --git a/components/accelerator/nvidia/query/options.go b/components/accelerator/nvidia/query/options.go index 3f5d4edf..a4087362 100644 --- a/components/accelerator/nvidia/query/options.go +++ b/components/accelerator/nvidia/query/options.go @@ -1,10 +1,16 @@ package query -import "database/sql" +import ( + "database/sql" + + events_db "github.com/leptonai/gpud/components/db" +) type Op struct { dbRW *sql.DB dbRO *sql.DB + xidEventsStore events_db.Store + hwslowdownEventsStore events_db.Store nvidiaSMICommand string nvidiaSMIQueryCommand string ibstatCommand string @@ -47,6 +53,18 @@ func WithDBRO(db *sql.DB) OpOption { } } +func WithXidEventsStore(store events_db.Store) OpOption { + return func(op *Op) { + op.xidEventsStore = store + } +} + +func WithHWSlowdownEventsStore(store events_db.Store) OpOption { + return func(op *Op) { + op.hwslowdownEventsStore = store + } +} + // Specifies the nvidia-smi binary path to overwrite the default path. func WithNvidiaSMICommand(p string) OpOption { return func(op *Op) { diff --git a/components/accelerator/nvidia/query/query.go b/components/accelerator/nvidia/query/query.go index bfcae04a..119defd0 100644 --- a/components/accelerator/nvidia/query/query.go +++ b/components/accelerator/nvidia/query/query.go @@ -92,8 +92,10 @@ func Get(ctx context.Context, opts ...OpOption) (output any, err error) { if err := nvml.StartDefaultInstance( ctx, - nvml.WithDBRW(op.dbRW), - nvml.WithDBRO(op.dbRO), + nvml.WithDBRW(op.dbRW), // to deprecate in favor of events store + nvml.WithDBRO(op.dbRO), // to deprecate in favor of events store + nvml.WithXidEventsStore(op.xidEventsStore), + nvml.WithHWSlowdownEventsStore(op.hwslowdownEventsStore), nvml.WithGPMMetricsID( go_nvml.GPM_METRIC_SM_OCCUPANCY, go_nvml.GPM_METRIC_INTEGER_UTIL, diff --git a/components/diagnose/scan.go b/components/diagnose/scan.go index c11ed70b..561c7805 100644 --- a/components/diagnose/scan.go +++ b/components/diagnose/scan.go @@ -8,13 +8,15 @@ import ( "runtime" "time" - "github.com/dustin/go-humanize" + nvidia_component_error_xid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid/id" + nvidia_hw_slowdown_id "github.com/leptonai/gpud/components/accelerator/nvidia/hw-slowdown/id" nvidia_hw_slowdown_state "github.com/leptonai/gpud/components/accelerator/nvidia/hw-slowdown/state" "github.com/leptonai/gpud/components/accelerator/nvidia/query" nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" nvidia_query_nvml "github.com/leptonai/gpud/components/accelerator/nvidia/query/nvml" nvidia_query_sxid "github.com/leptonai/gpud/components/accelerator/nvidia/query/sxid" nvidia_query_xid "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid" + events_db "github.com/leptonai/gpud/components/db" "github.com/leptonai/gpud/components/dmesg" query_log_common "github.com/leptonai/gpud/components/query/log/common" query_log_tail "github.com/leptonai/gpud/components/query/log/tail" @@ -28,6 +30,7 @@ import ( "github.com/leptonai/gpud/pkg/process" "github.com/leptonai/gpud/pkg/sqlite" + "github.com/dustin/go-humanize" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -125,6 +128,26 @@ func Scan(ctx context.Context, opts ...OpOption) error { } defer db.Close() + eventsStoreNvidiaErrorXid, err := events_db.NewStore( + db, + db, + events_db.CreateDefaultTableName(nvidia_component_error_xid_id.Name), + 3*24*time.Hour, + ) + if err != nil { + log.Logger.Fatalw("failed to create events store", "error", err) + } + + eventsStoreNvidiaHWSlowdown, err := events_db.NewStore( + db, + db, + events_db.CreateDefaultTableName(nvidia_hw_slowdown_id.Name), + 3*24*time.Hour, + ) + if err != nil { + log.Logger.Fatalw("failed to create events store", "error", err) + } + // "nvidia_query.Get" assumes that the "clock-events-state" table exists // pre-create since this is a one-off operation // TODO: move these into a single place @@ -134,8 +157,10 @@ func Scan(ctx context.Context, opts ...OpOption) error { outputRaw, err := nvidia_query.Get( ctx, - nvidia_query.WithDBRW(db), - nvidia_query.WithDBRO(db), + nvidia_query.WithDBRW(db), // to deprecate in favor of events store + nvidia_query.WithDBRO(db), // to deprecate in favor of events store + nvidia_query.WithXidEventsStore(eventsStoreNvidiaErrorXid), + nvidia_query.WithHWSlowdownEventsStore(eventsStoreNvidiaHWSlowdown), nvidia_query.WithNvidiaSMICommand(op.nvidiaSMICommand), nvidia_query.WithNvidiaSMIQueryCommand(op.nvidiaSMIQueryCommand), nvidia_query.WithIbstatCommand(op.ibstatCommand), diff --git a/internal/server/server.go b/internal/server/server.go index a08cf111..e425b2aa 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -78,6 +78,7 @@ import ( containerd_pod_id "github.com/leptonai/gpud/components/containerd/pod/id" "github.com/leptonai/gpud/components/cpu" cpu_id "github.com/leptonai/gpud/components/cpu/id" + events_db "github.com/leptonai/gpud/components/db" "github.com/leptonai/gpud/components/disk" disk_id "github.com/leptonai/gpud/components/disk/id" "github.com/leptonai/gpud/components/dmesg" @@ -196,10 +197,32 @@ func New(ctx context.Context, config *lepconfig.Config, endpoint string, cliUID if err != nil { return nil, err } + var eventsStoreNvidiaErrorXid events_db.Store + var eventsStoreNvidiaHWSlowdown events_db.Store if runtime.GOOS == "linux" && nvidiaInstalled { + eventsStoreNvidiaErrorXid, err = events_db.NewStore( + dbRW, + dbRO, + events_db.CreateDefaultTableName(nvidia_component_error_xid_id.Name), + 3*24*time.Hour, + ) + if err != nil { + return nil, err + } + eventsStoreNvidiaHWSlowdown, err = events_db.NewStore( + dbRW, + dbRO, + events_db.CreateDefaultTableName(nvidia_hw_slowdown_id.Name), + 3*24*time.Hour, + ) + if err != nil { + return nil, err + } nvidia_query.SetDefaultPoller( nvidia_query.WithDBRW(dbRW), // to deprecate in favor of events store nvidia_query.WithDBRO(dbRO), // to deprecate in favor of events store + nvidia_query.WithXidEventsStore(eventsStoreNvidiaErrorXid), + nvidia_query.WithHWSlowdownEventsStore(eventsStoreNvidiaHWSlowdown), nvidia_query.WithNvidiaSMICommand(options.NvidiaSMICommand), nvidia_query.WithNvidiaSMIQueryCommand(options.NvidiaSMIQueryCommand), nvidia_query.WithIbstatCommand(options.IbstatCommand),