-
Notifications
You must be signed in to change notification settings - Fork 442
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Re-introduce use of
px/agent_status_diagnostics
in px cli to detect…
… missing kernel headers (#2091) Summary: Re-introduce use of `px/agent_status_diagnostics` in px cli to detect missing kernel headers The original version of this was reverted since it resulted in hung `px deploy` and `px collect-logs` commands on larger clusters. This PR reintroduces the change with the fixes necessary to prevent the previous issue and is best reviewed commit by commit as outlined below: Commit 1: Cherry-pick of #2065 Commit 2: Fix for goroutine deadlock Commit 3: Add bundle flag to the `px collect-logs` command Commit 4: Introduce `PX_LOG_FILE` env var for redirecting `px` log output to a file -- useful for debugging the cli since its terminal spinners complicate logging to stdout Commits 1, 3 and 4 should be self explanatory. As for Commit 2, the goroutine deadlock occurred from the `streamCh` channel consumer. The previous version read a single value from the `streamCh` channel, parsed the result and [terminated](2ec63c8#diff-4da8f48b4c664d330cff34e70f907d6015289797c832587b0b14004875ef0831R363) its goroutine. Thus future sends to the `streamCh` channel could block and prevent the pipe receiving the pxl script results to be fully consumed. Since the stream adapter writes to the pipe, it couldn't flush all of its results and the deadlock occurred. The original testing was performed on clusters with 1 and 2 nodes -- max of 2 PEMs and 2 results from `px/agent_status`. This deadlock issue didn't surface in those situations because `streamCh` was a buffered channel with capacity of 1 and the consumer would read a single record before terminating. This meant that the pipe reader would hit EOF before it would initiate a channel send that would deadlock as outlined below: 2 Node cluster situation: 1. `px` cli executes `px/agent_status` as `px/agent_status_diagnostics` is not in the canonical bundle yet 2. streamCh producer sends 1st PEMs result -- streamCh at capacity 3. streamCh consumer reads the value and exits -- streamCh ready to accept 1 value 4. 
streamCh producer sends 2nd and final PEM result -- streamCh at capacity and future sends would block! 5. Program exits since pxl script is complete Relevant Issues: #2051 Type of change: /kind feature Test Plan: Verified that the deadlock no longer occurs on clusters with 3-6 nodes - [x] Used the [following](https://github.com/user-attachments/files/18457105/deadlocked-goroutines.txt) pprof goroutine stack dump to understand the deadlock described above -- see blocked goroutine on `streamCh` channel send on `script.go:337` - [x] Re-tested all of the scenarios from #2065 Changelog Message: Re-introduce enhanced diagnostics for `px deploy` and `px collect-logs` commands used to detect common sources of environment incompatibilities --------- Signed-off-by: Dom Del Nano <[email protected]>
- Loading branch information
Showing
10 changed files
with
358 additions
and
148 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
/* | ||
* Copyright 2018- The Pixie Authors. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package vizier | ||
|
||
import ( | ||
"archive/zip" | ||
"context" | ||
"errors" | ||
"os" | ||
"strings" | ||
|
||
log "github.com/sirupsen/logrus" | ||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
"k8s.io/client-go/kubernetes" | ||
"k8s.io/client-go/rest" | ||
|
||
"px.dev/pixie/src/utils/script" | ||
"px.dev/pixie/src/utils/shared/k8s" | ||
) | ||
|
||
// LogCollector collects logs for Pixie and cluster setup information.
type LogCollector struct {
	k8sConfig    *rest.Config
	k8sClientSet *kubernetes.Clientset
	cloudAddr    string
	br           *script.BundleManager
	// Embedded k8s.LogCollector supplies the pod/kubectl logging helpers
	// used below (LogPodInfoToZipFile, WritePodDescription, LogKubeCmd,
	// LogOutputToZipFile).
	k8s.LogCollector
}
|
||
// NewLogCollector creates a new log collector. | ||
func NewLogCollector(br *script.BundleManager, cloudAddr string) *LogCollector { | ||
cfg := k8s.GetConfig() | ||
cs := k8s.GetClientset(cfg) | ||
return &LogCollector{ | ||
cfg, | ||
cs, | ||
cloudAddr, | ||
br, | ||
*k8s.NewLogCollector(), | ||
} | ||
} | ||
|
||
// CollectPixieLogs collects logs for all Pixie pods and write them to the zip file fName. | ||
func (c *LogCollector) CollectPixieLogs(fName string) error { | ||
if !strings.HasSuffix(fName, ".zip") { | ||
return errors.New("fname must have .zip suffix") | ||
} | ||
f, err := os.Create(fName) | ||
if err != nil { | ||
return err | ||
} | ||
defer f.Close() | ||
|
||
zf := zip.NewWriter(f) | ||
defer zf.Close() | ||
|
||
vls := k8s.VizierLabelSelector() | ||
vizierLabelSelector := metav1.FormatLabelSelector(&vls) | ||
|
||
// We check across all namespaces for the matching pixie pods. | ||
vizierPodList, err := c.k8sClientSet.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{LabelSelector: vizierLabelSelector}) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
// We also need to get the logs the operator logs. | ||
// As the LabelSelectors are ANDed, we need to make a new query and merge | ||
// the results. | ||
ols := k8s.OperatorLabelSelector() | ||
operatorLabelSelector := metav1.FormatLabelSelector(&ols) | ||
|
||
operatorPodList, err := c.k8sClientSet.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{LabelSelector: operatorLabelSelector}) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
// Merge the two pod lists | ||
pods := append(vizierPodList.Items, operatorPodList.Items...) | ||
|
||
for _, pod := range pods { | ||
for _, containerStatus := range pod.Status.ContainerStatuses { | ||
// Ignore prev logs, they might not exist. | ||
_ = c.LogPodInfoToZipFile(zf, pod, containerStatus.Name, true) | ||
|
||
err := c.LogPodInfoToZipFile(zf, pod, containerStatus.Name, false) | ||
if err != nil { | ||
log.WithError(err).Warnf("Failed to log pod: %s", pod.Name) | ||
} | ||
} | ||
err = c.WritePodDescription(zf, pod) | ||
if err != nil { | ||
log.WithError(err).Warnf("failed to write pod description") | ||
} | ||
} | ||
|
||
err = c.LogKubeCmd(zf, "nodes.log", "describe", "node") | ||
if err != nil { | ||
log.WithError(err).Warn("failed to log node info") | ||
} | ||
|
||
err = c.LogKubeCmd(zf, "services.log", "describe", "services", "--all-namespaces", "-l", vizierLabelSelector) | ||
if err != nil { | ||
log.WithError(err).Warnf("failed to log services") | ||
} | ||
|
||
// Describe vizier and write it to vizier.log | ||
err = c.LogKubeCmd(zf, "vizier.log", "describe", "vizier", "--all-namespaces") | ||
if err != nil { | ||
log.WithError(err).Warnf("failed to log vizier crd") | ||
} | ||
|
||
clusterID, err := GetCurrentVizier(c.cloudAddr) | ||
if err != nil { | ||
log.WithError(err).Warnf("failed to get cluster ID") | ||
} | ||
outputCh, err := RunSimpleHealthCheckScript(c.br, c.cloudAddr, clusterID) | ||
|
||
if err != nil { | ||
entry := log.WithError(err) | ||
if _, ok := err.(*HealthCheckWarning); ok { | ||
entry.Warn("healthcheck script detected the following warnings:") | ||
} else { | ||
entry.Warn("failed to run healthcheck script") | ||
} | ||
} | ||
|
||
return c.LogOutputToZipFile(zf, "px_agent_diagnostics.txt", <-outputCh) | ||
} |
Oops, something went wrong.