Skip to content

Commit

Permalink
fix(components): scan panic error, docker-container error handling by converting state.error to string
Browse files Browse the repository at this point in the history

Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho committed Aug 19, 2024
1 parent 27f861b commit 15e303d
Show file tree
Hide file tree
Showing 31 changed files with 55 additions and 62 deletions.
2 changes: 1 addition & 1 deletion cmd/gpud/command/up.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ func cmdUp(cliContext *cli.Context) (retErr error) {
retErr = err
}
} else {
fmt.Printf("visit https://localhost:15132 to view the dashboard")
fmt.Printf("\nvisit https://localhost:15132 to view the dashboard\n\n")
}
}()

Expand Down
5 changes: 2 additions & 3 deletions components/accelerator/nvidia/clock-speed/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ package clockspeed
import (
"context"
"database/sql"
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -56,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return []components.State{
{
Healthy: false,
Error: last.Error,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
Expand All @@ -80,7 +79,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
cs = append(cs, components.State{
Name: Name,
Healthy: false,
Error: errors.New(e),
Error: e,
Reason: "nvidia-smi query failed with " + e,
ExtraInfo: map[string]string{
nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists),
Expand Down
5 changes: 2 additions & 3 deletions components/accelerator/nvidia/clock/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ package clock
import (
"context"
"database/sql"
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -56,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return []components.State{
{
Healthy: false,
Error: last.Error,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
Expand All @@ -80,7 +79,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
cs = append(cs, components.State{
Name: Name,
Healthy: false,
Error: errors.New(e),
Error: e,
Reason: "nvidia-smi query failed with " + e,
ExtraInfo: map[string]string{
nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists),
Expand Down
1 change: 0 additions & 1 deletion components/accelerator/nvidia/clock/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,6 @@ func (o *Output) States() ([]components.State, error) {
{
Name: StateNameHWSlowdown,
Healthy: true,
Error: nil,
Reason: rm,
ExtraInfo: map[string]string{
StateKeyHWSlowdownData: string(b),
Expand Down
5 changes: 2 additions & 3 deletions components/accelerator/nvidia/ecc/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ package ecc
import (
"context"
"database/sql"
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -56,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return []components.State{
{
Healthy: false,
Error: last.Error,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
Expand All @@ -80,7 +79,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
cs = append(cs, components.State{
Name: Name,
Healthy: false,
Error: errors.New(e),
Error: e,
Reason: "nvidia-smi query failed with " + e,
ExtraInfo: map[string]string{
nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists),
Expand Down
5 changes: 2 additions & 3 deletions components/accelerator/nvidia/error/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package error

import (
"context"
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -51,7 +50,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return []components.State{
{
Healthy: false,
Error: last.Error,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
Expand All @@ -75,7 +74,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
cs = append(cs, components.State{
Name: Name,
Healthy: false,
Error: errors.New(e),
Error: e,
Reason: "nvidia-smi query failed with " + e,
ExtraInfo: map[string]string{
nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists),
Expand Down
5 changes: 2 additions & 3 deletions components/accelerator/nvidia/fabric-manager/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ package fabricmanager

import (
"context"
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -68,7 +67,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return []components.State{
{
Healthy: false,
Error: last.Error,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
Expand Down Expand Up @@ -101,7 +100,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
cs = append(cs, components.State{
Name: Name,
Healthy: false,
Error: errors.New(e),
Error: e,
Reason: "fabric manager query failed with " + e,
ExtraInfo: map[string]string{
nvidia_query.StateKeyFabricManagerExists: fmt.Sprintf("%v", allOutput.FabricManagerExists),
Expand Down
2 changes: 1 addition & 1 deletion components/accelerator/nvidia/infiniband/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return []components.State{
{
Healthy: false,
Error: last.Error,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
Expand Down
5 changes: 2 additions & 3 deletions components/accelerator/nvidia/info/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package info

import (
"context"
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -51,7 +50,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return []components.State{
{
Healthy: false,
Error: last.Error,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
Expand All @@ -75,7 +74,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
cs = append(cs, components.State{
Name: Name,
Healthy: false,
Error: errors.New(e),
Error: e,
Reason: "nvidia-smi query failed with " + e,
ExtraInfo: map[string]string{
nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists),
Expand Down
5 changes: 2 additions & 3 deletions components/accelerator/nvidia/memory/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ package memory
import (
"context"
"database/sql"
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -56,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return []components.State{
{
Healthy: false,
Error: last.Error,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
Expand All @@ -80,7 +79,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
cs = append(cs, components.State{
Name: Name,
Healthy: false,
Error: errors.New(e),
Error: e,
Reason: "nvidia-smi query failed with " + e,
ExtraInfo: map[string]string{
nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists),
Expand Down
2 changes: 1 addition & 1 deletion components/accelerator/nvidia/nvlink/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return []components.State{
{
Healthy: false,
Error: last.Error,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
Expand Down
5 changes: 2 additions & 3 deletions components/accelerator/nvidia/peermem/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ package peermem

import (
"context"
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -52,7 +51,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return []components.State{
{
Healthy: false,
Error: last.Error,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
Expand All @@ -76,7 +75,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
cs = append(cs, components.State{
Name: Name,
Healthy: false,
Error: errors.New(e),
Error: e,
Reason: "lsmod peermem query failed with " + e,
})
}
Expand Down
5 changes: 2 additions & 3 deletions components/accelerator/nvidia/power/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ package power
import (
"context"
"database/sql"
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -56,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return []components.State{
{
Healthy: false,
Error: last.Error,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
Expand All @@ -80,7 +79,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
cs = append(cs, components.State{
Name: Name,
Healthy: false,
Error: errors.New(e),
Error: e,
Reason: "nvidia-smi query failed with " + e,
ExtraInfo: map[string]string{
nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists),
Expand Down
5 changes: 2 additions & 3 deletions components/accelerator/nvidia/processes/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ package processes
import (
"context"
"database/sql"
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -56,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return []components.State{
{
Healthy: false,
Error: last.Error,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
Expand All @@ -80,7 +79,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
cs = append(cs, components.State{
Name: Name,
Healthy: false,
Error: errors.New(e),
Error: e,
Reason: "nvidia-smi query failed with " + e,
ExtraInfo: map[string]string{
nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists),
Expand Down
14 changes: 8 additions & 6 deletions components/accelerator/nvidia/query/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -326,13 +326,15 @@ func (o *Output) PrintInfo(debug bool) {
fmt.Printf("%s successfully checked fabric manager\n", checkMark)
}

if len(o.Ibstat.Errors) > 0 {
fmt.Printf("%s ibstat check failed with %d error(s)\n", warningSign, len(o.Ibstat.Errors))
for _, err := range o.Ibstat.Errors {
fmt.Println(err)
if o.IbstatExists {
if o.Ibstat != nil && len(o.Ibstat.Errors) > 0 {
fmt.Printf("%s ibstat check failed with %d error(s)\n", warningSign, len(o.Ibstat.Errors))
for _, err := range o.Ibstat.Errors {
fmt.Println(err)
}
} else {
fmt.Printf("%s successfully checked ibstat\n", checkMark)
}
} else {
fmt.Printf("%s successfully checked ibstat\n", checkMark)
}

if len(o.LsmodPeermemErrors) > 0 {
Expand Down
5 changes: 2 additions & 3 deletions components/accelerator/nvidia/temperature/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ package temperature
import (
"context"
"database/sql"
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -56,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return []components.State{
{
Healthy: false,
Error: last.Error,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
Expand All @@ -80,7 +79,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
cs = append(cs, components.State{
Name: Name,
Healthy: false,
Error: errors.New(e),
Error: e,
Reason: "nvidia-smi query failed with " + e,
ExtraInfo: map[string]string{
nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists),
Expand Down
5 changes: 2 additions & 3 deletions components/accelerator/nvidia/utilization/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ package utilization
import (
"context"
"database/sql"
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -56,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
return []components.State{
{
Healthy: false,
Error: last.Error,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
Expand All @@ -80,7 +79,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
cs = append(cs, components.State{
Name: Name,
Healthy: false,
Error: errors.New(e),
Error: e,
Reason: "nvidia-smi query failed with " + e,
ExtraInfo: map[string]string{
nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists),
Expand Down
2 changes: 1 addition & 1 deletion components/components.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ type State struct {
Name string `json:"name,omitempty"`
Healthy bool `json:"healthy,omitempty"`
Reason string `json:"reason,omitempty"` // a detailed and processed reason on why the component is not healthy
Error error `json:"error,omitempty"` // the unprocessed error returned from the component
Error string `json:"error,omitempty"` // the unprocessed error returned from the component
ExtraInfo map[string]string `json:"extra_info,omitempty"` // any extra information the component may want to expose
}

Expand Down
2 changes: 1 addition & 1 deletion components/containerd/pod/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
{
Name: Name,
Healthy: false,
Error: last.Error,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
Expand Down
2 changes: 1 addition & 1 deletion components/cpu/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) {
{
Name: Name,
Healthy: false,
Error: last.Error,
Error: last.Error.Error(),
Reason: "last query failed",
},
}, nil
Expand Down
Loading

0 comments on commit 15e303d

Please sign in to comment.