Skip to content

Commit

Permalink
fix(infiniband): simplify ibstat existence when evaluating healthy (#124)
Browse files Browse the repository at this point in the history

* fix(infiniband): check ibstat existence when evaluating healthy

Signed-off-by: Gyuho Lee <[email protected]>

* include gpu product name

Signed-off-by: Gyuho Lee <[email protected]>

* fix

Signed-off-by: Gyuho Lee <[email protected]>

---------

Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho authored Oct 25, 2024
1 parent 76e14aa commit ea752da
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 36 deletions.
32 changes: 1 addition & 31 deletions components/accelerator/nvidia/infiniband/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,10 @@ package infiniband
import (
"context"
"fmt"
"strings"
"time"

"github.com/leptonai/gpud/components"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
"github.com/leptonai/gpud/components/common"
"github.com/leptonai/gpud/components/query"
"github.com/leptonai/gpud/log"
)
Expand Down Expand Up @@ -78,35 +76,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
if !ok {
return nil, fmt.Errorf("invalid output type: %T", last.Output)
}
if !allOutput.IbstatExists {
return []components.State{
{
Name: Name,
Healthy: true,
Reason: "ibstat does not exist",
},
}, nil
}
if allOutput.IbstatExists && len(allOutput.Ibstat.Errors) > 0 {
return []components.State{
{
Name: Name,
Healthy: false,
Reason: "ibstat query found errors " + strings.Join(allOutput.Ibstat.Errors, ", "),
ExtraInfo: map[string]string{
nvidia_query.StateKeyIbstatExists: fmt.Sprintf("%v", allOutput.IbstatExists),
},
SuggestedActions: &common.SuggestedActions{
RepairActions: []common.RepairActionType{
common.RepairActionTypeRepairHardware,
},
Descriptions: []string{
"potential infiniband switch/hardware issue needs immediate attention",
},
},
},
}, nil
}

output := ToOutput(allOutput)
return output.States()
}
Expand Down
36 changes: 31 additions & 5 deletions components/accelerator/nvidia/infiniband/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (

"github.com/leptonai/gpud/components"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
"github.com/leptonai/gpud/components/common"
)

// ToOutput converts nvidia_query.Output to Output.
Expand All @@ -18,6 +19,7 @@ func ToOutput(i *nvidia_query.Output) *Output {
}

o := &Output{
GPUProductName: i.GPUProductName(),
InfinibandClassExists: i.InfinibandClassExists,
IbstatExists: i.IbstatExists,
}
Expand All @@ -29,6 +31,10 @@ func ToOutput(i *nvidia_query.Output) *Output {
}

type Output struct {
// GPUProductName is the product name of the GPU.
// Useful to ignore infiniband states for GPUs that do not support infiniband (e.g., RTX 4090).
GPUProductName string `json:"gpu_product_name"`

InfinibandClassExists bool `json:"infiniband_class_exists"`
IbstatExists bool `json:"ibstat_exists"`
Ibstat nvidia_query.IbstatOutput `json:"ibstat"`
Expand Down Expand Up @@ -82,26 +88,46 @@ func ParseStatesToOutput(states ...components.State) (*Output, error) {

// Evaluate returns the output evaluation reason and its healthy-ness.
//
// When ibstat errors are present, the reason lists them joined by ", " and the
// output is reported as unhealthy; otherwise the output is reported as healthy.
// Every return statement visible here returns a nil error.
//
// NOTE(review): this is a rendered diff hunk, so the condition line and the
// healthy return line each appear twice (removed line followed by its
// replacement). The replacement condition additionally requires o.IbstatExists.
func (o *Output) Evaluate() (string, bool, error) {
if len(o.Ibstat.Errors) > 0 {
if o.IbstatExists && len(o.Ibstat.Errors) > 0 {
return fmt.Sprintf("ibstat errors found: %s", strings.Join(o.Ibstat.Errors, ", ")), false, nil
}
return "no ibstat error found", true, nil
return "no ibstat exists or no ibstat error found", true, nil
}

// States evaluates the output and reports the result as a single
// components.State named StateNameIbstat.
//
// The state's Healthy and Reason fields come from Evaluate, and an error from
// Evaluate is returned unchanged. ExtraInfo carries the JSON-encoded output
// together with its encoding marker.
func (o *Output) States() ([]components.State, error) {
outputReasons, healthy, err := o.Evaluate()
if err != nil {
return nil, err
}

// NOTE(review): the error from o.JSON() is discarded; presumably this is
// best-effort so that a state is still reported — confirm.
b, _ := o.JSON()

// Repair suggestions are attached only when the evaluation is unhealthy;
// healthy states keep a nil SuggestedActions.
var suggestedActions *common.SuggestedActions = nil
if !healthy {
suggestedActions = &common.SuggestedActions{
RepairActions: []common.RepairActionType{
common.RepairActionTypeRepairHardware,
},
Descriptions: []string{
"potential infiniband switch/hardware issue needs immediate attention",
},
}
}

state := components.State{
Name: StateNameIbstat,
Name: StateNameIbstat,

Healthy: healthy,
Reason: outputReasons,

ExtraInfo: map[string]string{
StateKeyIbstatData: string(b),
StateKeyIbstatEncoding: StateValueIbstatEncodingJSON,
nvidia_query.StateKeyGPUProductName: o.GPUProductName,
nvidia_query.StateKeyIbstatExists: fmt.Sprintf("%v", o.IbstatExists),
StateKeyIbstatData: string(b),
StateKeyIbstatEncoding: StateValueIbstatEncodingJSON,
},

SuggestedActions: suggestedActions,
}
return []components.State{state}, nil
}
96 changes: 96 additions & 0 deletions components/accelerator/nvidia/infiniband/component_output_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
package infiniband

import (
"testing"

nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
"github.com/leptonai/gpud/components/common"
)

// TestOutputStates checks that Output.States reports the expected health,
// reason, ibstat-existence flag, and suggested actions for each scenario.
func TestOutputStates(t *testing.T) {
	type testCase struct {
		name        string
		out         *Output
		wantHealthy bool
		wantReason  string
	}

	cases := []testCase{
		{
			name: "Healthy state",
			out: &Output{
				IbstatExists: true,
				Ibstat:       nvidia_query.IbstatOutput{},
			},
			wantHealthy: true,
			wantReason:  "no ibstat exists or no ibstat error found",
		},
		{
			name: "Unhealthy state",
			out: &Output{
				IbstatExists: true,
				Ibstat: nvidia_query.IbstatOutput{
					Errors: []string{"Error 1", "Error 2"},
				},
			},
			wantHealthy: false,
			wantReason:  "ibstat errors found: Error 1, Error 2",
		},
		{
			name: "No ibstat state",
			out: &Output{
				IbstatExists: false,
			},
			wantHealthy: true,
			wantReason:  "no ibstat exists or no ibstat error found",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := tc.out.States()
			if err != nil {
				t.Fatalf("unexpected error: %v", err)
			}
			if n := len(got); n != 1 {
				t.Fatalf("expected 1 state, got %d", n)
			}

			st := got[0]
			if st.Healthy != tc.wantHealthy {
				t.Errorf("expected Healthy to be %v, got %v", tc.wantHealthy, st.Healthy)
			}
			if st.Reason != tc.wantReason {
				t.Errorf("expected Reason to be %s, got %s", tc.wantReason, st.Reason)
			}

			// The ibstat-existence flag must be mirrored into ExtraInfo as a string.
			existsInfo := st.ExtraInfo[nvidia_query.StateKeyIbstatExists]
			switch {
			case !tc.out.IbstatExists && existsInfo != "false":
				t.Errorf("expected IbstatExists to be false, got %s", existsInfo)
			case tc.out.IbstatExists && existsInfo != "true":
				t.Errorf("expected IbstatExists to be true, got %s", existsInfo)
			}

			// Healthy states carry no suggested actions; unhealthy states carry
			// exactly one hardware-repair action with a fixed description.
			actions := st.SuggestedActions
			if tc.wantHealthy {
				if actions != nil {
					t.Error("expected SuggestedActions to be nil for healthy state")
				}
				return
			}
			if actions == nil {
				t.Error("expected SuggestedActions to be non-nil for unhealthy state")
				return
			}
			if len(actions.RepairActions) != 1 || actions.RepairActions[0] != common.RepairActionTypeRepairHardware {
				t.Errorf("expected RepairActions to be [RepairHardware], got %v", actions.RepairActions)
			}
			if len(actions.Descriptions) != 1 || actions.Descriptions[0] != "potential infiniband switch/hardware issue needs immediate attention" {
				t.Errorf("unexpected SuggestedActions description: %v", actions.Descriptions)
			}
		})
	}
}
1 change: 1 addition & 0 deletions components/accelerator/nvidia/query/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,7 @@ func Get(ctx context.Context) (output any, err error) {
}

const (
StateKeyGPUProductName = "gpu_product_name"
StateKeySMIExists = "smi_exists"
StateKeyFabricManagerExists = "fabric_manager_exists"
StateKeyIbstatExists = "ibstat_exists"
Expand Down

0 comments on commit ea752da

Please sign in to comment.