From 15e303d374d265edc30a80e912515d69b701f7b1 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Mon, 19 Aug 2024 13:42:28 +0800 Subject: [PATCH] fix(components): scan panic error, docker-container error handling by converting state.error to string Signed-off-by: Gyuho Lee --- cmd/gpud/command/up.go | 2 +- .../accelerator/nvidia/clock-speed/component.go | 5 ++--- components/accelerator/nvidia/clock/component.go | 5 ++--- .../accelerator/nvidia/clock/component_output.go | 1 - components/accelerator/nvidia/ecc/component.go | 5 ++--- components/accelerator/nvidia/error/component.go | 5 ++--- .../accelerator/nvidia/fabric-manager/component.go | 5 ++--- .../accelerator/nvidia/infiniband/component.go | 2 +- components/accelerator/nvidia/info/component.go | 5 ++--- components/accelerator/nvidia/memory/component.go | 5 ++--- components/accelerator/nvidia/nvlink/component.go | 2 +- components/accelerator/nvidia/peermem/component.go | 5 ++--- components/accelerator/nvidia/power/component.go | 5 ++--- .../accelerator/nvidia/processes/component.go | 5 ++--- components/accelerator/nvidia/query/query.go | 14 ++++++++------ .../accelerator/nvidia/temperature/component.go | 5 ++--- .../accelerator/nvidia/utilization/component.go | 5 ++--- components/components.go | 2 +- components/containerd/pod/component.go | 2 +- components/cpu/component.go | 2 +- components/disk/component.go | 2 +- components/docker/container/component.go | 2 +- components/fd/component.go | 2 +- components/k8s/pod/component.go | 2 +- components/memory/component.go | 2 +- components/network/latency/component.go | 2 +- components/os/component.go | 2 +- components/power-supply/component.go | 2 +- components/query/poller.go | 10 +++++++--- components/systemd/component.go | 2 +- components/tailscale/component.go | 2 +- 31 files changed, 55 insertions(+), 62 deletions(-) diff --git a/cmd/gpud/command/up.go b/cmd/gpud/command/up.go index 10c546eb..ce4db765 100644 --- a/cmd/gpud/command/up.go +++ b/cmd/gpud/command/up.go @@ -20,7 +20,7 @@ func cmdUp(cliContext *cli.Context) (retErr error) { retErr = err } } else { - fmt.Printf("visit https://localhost:15132 to view the dashboard") + fmt.Printf("\nvisit https://localhost:15132 to view the dashboard\n\n") } }() diff --git a/components/accelerator/nvidia/clock-speed/component.go b/components/accelerator/nvidia/clock-speed/component.go index a284cd95..d0fd01e7 100644 --- a/components/accelerator/nvidia/clock-speed/component.go +++ b/components/accelerator/nvidia/clock-speed/component.go @@ -4,7 +4,6 @@ package clockspeed import ( "context" "database/sql" - "errors" "fmt" "time" @@ -56,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { return []components.State{ { Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil @@ -80,7 +79,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { cs = append(cs, components.State{ Name: Name, Healthy: false, - Error: errors.New(e), + Error: e, Reason: "nvidia-smi query failed with " + e, ExtraInfo: map[string]string{ nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists), diff --git a/components/accelerator/nvidia/clock/component.go b/components/accelerator/nvidia/clock/component.go index f6a9da17..18bf2ae0 100644 --- a/components/accelerator/nvidia/clock/component.go +++ b/components/accelerator/nvidia/clock/component.go @@ -4,7 +4,6 @@ package clock import ( "context" "database/sql" - "errors" "fmt" "time" @@ -56,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { return []components.State{ { Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil @@ -80,7 +79,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { cs = append(cs, components.State{ Name: Name, Healthy: false, - Error: errors.New(e), + Error: e, Reason: "nvidia-smi query failed with " + e, ExtraInfo: map[string]string{ nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists), diff --git a/components/accelerator/nvidia/clock/component_output.go b/components/accelerator/nvidia/clock/component_output.go index 3a363da4..6f0ab784 100644 --- a/components/accelerator/nvidia/clock/component_output.go +++ b/components/accelerator/nvidia/clock/component_output.go @@ -115,7 +115,6 @@ func (o *Output) States() ([]components.State, error) { { Name: StateNameHWSlowdown, Healthy: true, - Error: nil, Reason: rm, ExtraInfo: map[string]string{ StateKeyHWSlowdownData: string(b), diff --git a/components/accelerator/nvidia/ecc/component.go b/components/accelerator/nvidia/ecc/component.go index 1d72b115..19dd4967 100644 --- a/components/accelerator/nvidia/ecc/component.go +++ b/components/accelerator/nvidia/ecc/component.go @@ -4,7 +4,6 @@ package ecc import ( "context" "database/sql" - "errors" "fmt" "time" @@ -56,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { return []components.State{ { Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil @@ -80,7 +79,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { cs = append(cs, components.State{ Name: Name, Healthy: false, - Error: errors.New(e), + Error: e, Reason: "nvidia-smi query failed with " + e, ExtraInfo: map[string]string{ nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists), diff --git a/components/accelerator/nvidia/error/component.go b/components/accelerator/nvidia/error/component.go index 8fafeeb2..c6dcc7db 100644 --- a/components/accelerator/nvidia/error/component.go +++ b/components/accelerator/nvidia/error/component.go @@ -3,7 +3,6 @@ package error import ( "context" - "errors" "fmt" "time" @@ -51,7 +50,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { return []components.State{ { Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil @@ -75,7 +74,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { cs = append(cs, components.State{ Name: Name, Healthy: false, - Error: errors.New(e), + Error: e, Reason: "nvidia-smi query failed with " + e, ExtraInfo: map[string]string{ nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists), diff --git a/components/accelerator/nvidia/fabric-manager/component.go b/components/accelerator/nvidia/fabric-manager/component.go index 50387ede..80153204 100644 --- a/components/accelerator/nvidia/fabric-manager/component.go +++ b/components/accelerator/nvidia/fabric-manager/component.go @@ -4,7 +4,6 @@ package fabricmanager import ( "context" - "errors" "fmt" "time" @@ -68,7 +67,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { return []components.State{ { Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil @@ -101,7 +100,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { cs = append(cs, components.State{ Name: Name, Healthy: false, - Error: errors.New(e), + Error: e, Reason: "fabric manager query failed with " + e, ExtraInfo: map[string]string{ nvidia_query.StateKeyFabricManagerExists: fmt.Sprintf("%v", allOutput.FabricManagerExists), diff --git a/components/accelerator/nvidia/infiniband/component.go b/components/accelerator/nvidia/infiniband/component.go index 9387a185..3d727db9 100644 --- a/components/accelerator/nvidia/infiniband/component.go +++ b/components/accelerator/nvidia/infiniband/component.go @@ -52,7 +52,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { return []components.State{ { Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil diff --git a/components/accelerator/nvidia/info/component.go b/components/accelerator/nvidia/info/component.go index 18113de0..a2f947f9 100644 --- a/components/accelerator/nvidia/info/component.go +++ b/components/accelerator/nvidia/info/component.go @@ -3,7 +3,6 @@ package info import ( "context" - "errors" "fmt" "time" @@ -51,7 +50,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { return []components.State{ { Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil @@ -75,7 +74,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { cs = append(cs, components.State{ Name: Name, Healthy: false, - Error: errors.New(e), + Error: e, Reason: "nvidia-smi query failed with " + e, ExtraInfo: map[string]string{ nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists), diff --git a/components/accelerator/nvidia/memory/component.go b/components/accelerator/nvidia/memory/component.go index d99f6cff..a7eba042 100644 --- a/components/accelerator/nvidia/memory/component.go +++ b/components/accelerator/nvidia/memory/component.go @@ -4,7 +4,6 @@ package memory import ( "context" "database/sql" - "errors" "fmt" "time" @@ -56,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { return []components.State{ { Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil @@ -80,7 +79,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { cs = append(cs, components.State{ Name: Name, Healthy: false, - Error: errors.New(e), + Error: e, Reason: "nvidia-smi query failed with " + e, ExtraInfo: map[string]string{ nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists), diff --git a/components/accelerator/nvidia/nvlink/component.go b/components/accelerator/nvidia/nvlink/component.go index 68b27e0b..f1d2cc84 100644 --- a/components/accelerator/nvidia/nvlink/component.go +++ b/components/accelerator/nvidia/nvlink/component.go @@ -55,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { return []components.State{ { Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil diff --git a/components/accelerator/nvidia/peermem/component.go b/components/accelerator/nvidia/peermem/component.go index 76ce84cd..9b8998d1 100644 --- a/components/accelerator/nvidia/peermem/component.go +++ b/components/accelerator/nvidia/peermem/component.go @@ -4,7 +4,6 @@ package peermem import ( "context" - "errors" "fmt" "time" @@ -52,7 +51,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { return []components.State{ { Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil @@ -76,7 +75,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { cs = append(cs, components.State{ Name: Name, Healthy: false, - Error: errors.New(e), + Error: e, Reason: "lsmod peermem query failed with " + e, }) } diff --git a/components/accelerator/nvidia/power/component.go b/components/accelerator/nvidia/power/component.go index 95c41799..cb05d16c 100644 --- a/components/accelerator/nvidia/power/component.go +++ b/components/accelerator/nvidia/power/component.go @@ -4,7 +4,6 @@ package power import ( "context" "database/sql" - "errors" "fmt" "time" @@ -56,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { return []components.State{ { Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil @@ -80,7 +79,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { cs = append(cs, components.State{ Name: Name, Healthy: false, - Error: errors.New(e), + Error: e, Reason: "nvidia-smi query failed with " + e, ExtraInfo: map[string]string{ nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists), diff --git a/components/accelerator/nvidia/processes/component.go b/components/accelerator/nvidia/processes/component.go index 864e6a32..88b2630b 100644 --- a/components/accelerator/nvidia/processes/component.go +++ b/components/accelerator/nvidia/processes/component.go @@ -4,7 +4,6 @@ package processes import ( "context" "database/sql" - "errors" "fmt" "time" @@ -56,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { return []components.State{ { Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil @@ -80,7 +79,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { cs = append(cs, components.State{ Name: Name, Healthy: false, - Error: errors.New(e), + Error: e, Reason: "nvidia-smi query failed with " + e, ExtraInfo: map[string]string{ nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists), diff --git a/components/accelerator/nvidia/query/query.go b/components/accelerator/nvidia/query/query.go index e6d048e8..7536a51e 100644 --- a/components/accelerator/nvidia/query/query.go +++ b/components/accelerator/nvidia/query/query.go @@ -326,13 +326,15 @@ func (o *Output) PrintInfo(debug bool) { fmt.Printf("%s successfully checked fabric manager\n", checkMark) } - if len(o.Ibstat.Errors) > 0 { - fmt.Printf("%s ibstat check failed with %d error(s)\n", warningSign, len(o.Ibstat.Errors)) - for _, err := range o.Ibstat.Errors { - fmt.Println(err) + if o.IbstatExists { + if o.Ibstat != nil && len(o.Ibstat.Errors) > 0 { + fmt.Printf("%s ibstat check failed with %d error(s)\n", warningSign, len(o.Ibstat.Errors)) + for _, err := range o.Ibstat.Errors { + fmt.Println(err) + } + } else { + fmt.Printf("%s successfully checked ibstat\n", checkMark) } - } else { - fmt.Printf("%s successfully checked ibstat\n", checkMark) } if len(o.LsmodPeermemErrors) > 0 { diff --git a/components/accelerator/nvidia/temperature/component.go b/components/accelerator/nvidia/temperature/component.go index 4c5361c8..c9db65d2 100644 --- a/components/accelerator/nvidia/temperature/component.go +++ b/components/accelerator/nvidia/temperature/component.go @@ -4,7 +4,6 @@ package temperature import ( "context" "database/sql" - "errors" "fmt" "time" @@ -56,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { return []components.State{ { Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil @@ -80,7 +79,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { cs = append(cs, components.State{ Name: Name, Healthy: false, - Error: errors.New(e), + Error: e, Reason: "nvidia-smi query failed with " + e, ExtraInfo: map[string]string{ nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists), diff --git a/components/accelerator/nvidia/utilization/component.go b/components/accelerator/nvidia/utilization/component.go index 7e3086f6..89586e15 100644 --- a/components/accelerator/nvidia/utilization/component.go +++ b/components/accelerator/nvidia/utilization/component.go @@ -4,7 +4,6 @@ package utilization import ( "context" "database/sql" - "errors" "fmt" "time" @@ -56,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { return []components.State{ { Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil @@ -80,7 +79,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { cs = append(cs, components.State{ Name: Name, Healthy: false, - Error: errors.New(e), + Error: e, Reason: "nvidia-smi query failed with " + e, ExtraInfo: map[string]string{ nvidia_query.StateKeySMIExists: fmt.Sprintf("%v", allOutput.SMIExists), diff --git a/components/components.go b/components/components.go index 66ed1c4e..57f42108 100644 --- a/components/components.go +++ b/components/components.go @@ -68,7 +68,7 @@ type State struct { Name string `json:"name,omitempty"` Healthy bool `json:"healthy,omitempty"` Reason string `json:"reason,omitempty"` // a detailed and processed reason on why the component is not healthy - Error error `json:"error,omitempty"` // the unprocessed error returned from the component + Error string `json:"error,omitempty"` // the unprocessed error returned from the component ExtraInfo map[string]string `json:"extra_info,omitempty"` // any extra information the component may want to expose } diff --git a/components/containerd/pod/component.go b/components/containerd/pod/component.go index 2e590f26..8ce771c4 100644 --- a/components/containerd/pod/component.go +++ b/components/containerd/pod/component.go @@ -51,7 +51,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { { Name: Name, Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil diff --git a/components/cpu/component.go b/components/cpu/component.go index fd27773f..38204e4f 100644 --- a/components/cpu/component.go +++ b/components/cpu/component.go @@ -56,7 +56,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { { Name: Name, Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil diff --git a/components/disk/component.go b/components/disk/component.go index 897e04c2..af0ccb38 100644 --- a/components/disk/component.go +++ b/components/disk/component.go @@ -56,7 +56,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { { Name: Name, Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil diff --git a/components/docker/container/component.go b/components/docker/container/component.go index e295f826..f3f6c06b 100644 --- a/components/docker/container/component.go +++ b/components/docker/container/component.go @@ -51,7 +51,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { { Name: Name, Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil diff --git a/components/fd/component.go b/components/fd/component.go index 48fdd81f..3e8422a9 100644 --- a/components/fd/component.go +++ b/components/fd/component.go @@ -56,7 +56,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { { Name: Name, Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil diff --git a/components/k8s/pod/component.go b/components/k8s/pod/component.go index 950e29cd..47684bcb 100644 --- a/components/k8s/pod/component.go +++ b/components/k8s/pod/component.go @@ -54,7 +54,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { { Name: Name, Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil diff --git a/components/memory/component.go b/components/memory/component.go index 17aafdc5..cf692b55 100644 --- a/components/memory/component.go +++ b/components/memory/component.go @@ -56,7 +56,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { { Name: Name, Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil diff --git a/components/network/latency/component.go b/components/network/latency/component.go index 38c5676f..159bba22 100644 --- a/components/network/latency/component.go +++ b/components/network/latency/component.go @@ -47,7 +47,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { { Name: Name, Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil diff --git a/components/os/component.go b/components/os/component.go index 377b4829..dcd2111b 100644 --- a/components/os/component.go +++ b/components/os/component.go @@ -51,7 +51,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { { Name: Name, Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil diff --git a/components/power-supply/component.go b/components/power-supply/component.go index fc97379f..72796114 100644 --- a/components/power-supply/component.go +++ b/components/power-supply/component.go @@ -51,7 +51,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { { Name: Name, Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil diff --git a/components/query/poller.go b/components/query/poller.go index d127e7c0..7608b78d 100644 --- a/components/query/poller.go +++ b/components/query/poller.go @@ -110,7 +110,8 @@ func pollLoops(ctx context.Context, id string, ch chan<- Item, interval time.Dur Time: metav1.Time{Time: time.Now().UTC()}, Error: ctx.Err(), }: - default: // channel is full, skip this result and continue + default: + log.Logger.Debugw("channel is full, skip this result and continue") } return @@ -122,6 +123,7 @@ func pollLoops(ctx context.Context, id string, ch chan<- Item, interval time.Dur output, err := get(ctx) if err != nil { + log.Logger.Debugw("polling error", "id", id, "error", err) select { case <-ctx.Done(): return @@ -129,7 +131,8 @@ func pollLoops(ctx context.Context, id string, ch chan<- Item, interval time.Dur Time: metav1.Time{Time: time.Now().UTC()}, Error: err, }: - default: // channel is full, skip this result and continue + default: + log.Logger.Debugw("channel is full, skip this result and continue") } continue } @@ -146,7 +149,8 @@ func pollLoops(ctx context.Context, id string, ch chan<- Item, interval time.Dur Time: metav1.Time{Time: time.Now().UTC()}, Output: output, }: - default: // channel is full, skip this result and continue + default: + log.Logger.Debugw("channel is full, skip this result and continue") } } } diff --git a/components/systemd/component.go b/components/systemd/component.go index 676f569a..baf04b95 100644 --- a/components/systemd/component.go +++ b/components/systemd/component.go @@ -55,7 +55,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { { Name: Name, Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil diff --git a/components/tailscale/component.go b/components/tailscale/component.go index 95b894e8..59165d5b 100644 --- a/components/tailscale/component.go +++ b/components/tailscale/component.go @@ -51,7 +51,7 @@ func (c *component) States(ctx context.Context) ([]components.State, error) { { Name: Name, Healthy: false, - Error: last.Error, + Error: last.Error.Error(), Reason: "last query failed", }, }, nil