add a monitor test for installer pod timeline
tkashem committed Dec 17, 2024
1 parent b7c61a7 commit f6f032f
Showing 5 changed files with 300 additions and 0 deletions.
11 changes: 11 additions & 0 deletions e2echart/e2e-chart-template.html
@@ -142,6 +142,10 @@ <h5 class="modal-title">Resource</h5>
return (eventInterval.source === "APIUnreachableFromClient")
}

function isInstallerPodActivity(eventInterval) {
return (eventInterval.source === "InstallerPodMonitor")
}

function isEndpointConnectivity(eventInterval) {
if (eventInterval.message.reason !== "DisruptionBegan" && eventInterval.message.reason !== "DisruptionSamplerOutageBegan") {
return false
@@ -272,6 +276,10 @@ <h5 class="modal-title">Resource</h5>
return [buildLocatorDisplayString(item.locator), "", "APIUnreachableFromClientMetrics"]
}

function isInstallerPodValue(item) {
return [buildLocatorDisplayString(item.locator), "", item.message.reason]
}

function disruptionValue(item) {
// We classify these disruption samples with this message if the sampler thinks
// the problem is in the CI cluster running the tests, not the cluster under test.
@@ -487,6 +495,9 @@ <h5 class="modal-title">Resource</h5>
timelineGroups.push({group: "api-unreachable", data: []})
createTimelineData(isAPIUnreachableFromClientValue, timelineGroups[timelineGroups.length - 1].data, eventIntervals, isAPIUnreachableFromClientActivity, regex)

timelineGroups.push({group: "installer-pod", data: []})
createTimelineData(isInstallerPodValue, timelineGroups[timelineGroups.length - 1].data, eventIntervals, isInstallerPodActivity, regex)

timelineGroups.push({ group: "etcd-leaders", data: [] })
createTimelineData(etcdLeadershipLogsValue, timelineGroups[timelineGroups.length - 1].data, eventIntervals, isEtcdLeadershipAndNotEmpty, regex)

2 changes: 2 additions & 0 deletions pkg/defaultmonitortests/types.go
@@ -20,6 +20,7 @@ import (
"github.com/openshift/origin/pkg/monitortests/kubeapiserver/disruptionnewapiserver"
"github.com/openshift/origin/pkg/monitortests/kubeapiserver/faultyloadbalancer"
"github.com/openshift/origin/pkg/monitortests/kubeapiserver/generationanalyzer"
"github.com/openshift/origin/pkg/monitortests/kubeapiserver/installerpod"
"github.com/openshift/origin/pkg/monitortests/kubeapiserver/legacykubeapiservermonitortests"
"github.com/openshift/origin/pkg/monitortests/machines/watchmachines"
"github.com/openshift/origin/pkg/monitortests/monitoring/disruptionmetricsapi"
@@ -133,6 +134,7 @@ func newDefaultMonitorTests(info monitortestframework.MonitorTestInitializationI
monitorTestRegistry.AddMonitorTestOrDie("metrics-api-availability", "Monitoring", disruptionmetricsapi.NewAvailabilityInvariant())
monitorTestRegistry.AddMonitorTestOrDie(apiunreachablefromclientmetrics.MonitorName, "kube-apiserver", apiunreachablefromclientmetrics.NewMonitorTest())
monitorTestRegistry.AddMonitorTestOrDie(faultyloadbalancer.MonitorName, "kube-apiserver", faultyloadbalancer.NewMonitorTest())
monitorTestRegistry.AddMonitorTestOrDie(installerpod.MonitorName, "kube-apiserver", installerpod.NewInstallerPodMonitorTest())

return monitorTestRegistry
}
2 changes: 2 additions & 0 deletions pkg/monitor/monitorapi/types.go
@@ -354,6 +354,8 @@ const (
SourceMachine IntervalSource = "MachineMonitor"

SourceGenerationMonitor IntervalSource = "GenerationMonitor"

SourceInstallerPodMonitor IntervalSource = "InstallerPodMonitor"
)

type Interval struct {
274 changes: 274 additions & 0 deletions pkg/monitortests/kubeapiserver/installerpod/monitortest.go
@@ -0,0 +1,274 @@
package installerpod

import (
"context"
"fmt"
"strings"
"time"

"github.com/openshift/origin/pkg/monitor/monitorapi"
"github.com/openshift/origin/pkg/monitortestframework"
"github.com/openshift/origin/pkg/test/ginkgo/junitapi"

"k8s.io/client-go/rest"
"k8s.io/kubernetes/test/e2e/framework"
)

const (
MonitorName = "installer-pod-monitor"
)

func NewInstallerPodMonitorTest() monitortestframework.MonitorTest {
return &monitorTest{
monitor: &installerPodMonitor{
pods: map[string]*podInfo{},
},
filter: func(interval monitorapi.Interval) bool {
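// only pod events from openshift-etcd with the lifecycle reasons below are of interest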
if ns, ok := interval.Locator.Keys[monitorapi.LocatorNamespaceKey]; !ok || ns != "openshift-etcd" {
return false
}

switch interval.Message.Reason {
case "Created", "Started", "Killing", "StaticPodInstallerCompleted":
return true
default:
return false
}
},
}
}

type monitorTest struct {
monitor *installerPodMonitor
filter func(interval monitorapi.Interval) bool
}

func (mt *monitorTest) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
return nil
}

func (mt *monitorTest) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
return nil, nil, nil
}

func (mt *monitorTest) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
computed := mt.monitor.process(startingIntervals, mt.filter)
return computed, nil
}

func (mt *monitorTest) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
junitTest := &junitTest{
name: "[sig-apimachinery] installer Pods should not run concurrently on two or more nodes",
concurrentPods: mt.monitor.concurrentPods,
}

framework.Logf("monitor[%s]: found %d occurrences of installer pods running concurrently on two or more nodes", MonitorName, len(junitTest.concurrentPods))

// the following constraints define pass/fail for this test:
// a) if we don't find any installer pod activity, this test is
//    a noop, so we mark it as skipped
// b) if we find installer pod activity but no two nodes ran these
//    pods concurrently, the test passes
// c) if we find at least one incident of two or more nodes running
//    these pods concurrently, the test flakes/fails
if len(mt.monitor.interested) == 0 {
// a) no installer pod activity observed, mark the test as skipped
return junitTest.Skip(), nil
}
return junitTest.Result(), nil // b or c
}

func (*monitorTest) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
return nil
}

func (*monitorTest) Cleanup(ctx context.Context) error {
// TODO wire up the start to a context we can kill here
return nil
}

type podInfo struct {
node string
name string
namespace string
reason string
startedAt, endedAt time.Time
concurrent bool
}

func (pi *podInfo) String() string {
return fmt.Sprintf("node(%s) name(%s) namespace(%s) reason(%s) started(%s) duration: %s",
pi.node, pi.name, pi.namespace, pi.reason, pi.startedAt.Format(time.RFC3339), pi.endedAt.Sub(pi.startedAt))
}

type concurrent struct {
this, that *podInfo
}

type installerPodMonitor struct {
interested monitorapi.Intervals
pods map[string]*podInfo
concurrentPods []concurrent
}

func (m *installerPodMonitor) process(intervals monitorapi.Intervals, filter func(interval monitorapi.Interval) bool) monitorapi.Intervals {
m.interested = make(monitorapi.Intervals, 0)
for _, interval := range intervals {
if filter(interval) {
m.interested = append(m.interested, interval)
}
}

framework.Logf("monitor[%s]: processing %d events", len(m.interested))
for _, interval := range m.interested {
m.processOne(interval)
}

computed := monitorapi.Intervals{}
for podName, info := range m.pods {
level := monitorapi.Info
endedAt := info.endedAt
if endedAt.IsZero() {
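// no end event (Killing or StaticPodInstallerCompleted) was seen for this pod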
endedAt = info.startedAt
level = monitorapi.Error
}
if info.reason == "Killing" || info.concurrent {
level = monitorapi.Error
}

concurrentMsg := ""
if info.concurrent {
concurrentMsg = "installer Pods may be running concurrently on at least two nodes"
}
computed = append(computed,
monitorapi.NewInterval(monitorapi.SourceInstallerPodMonitor, level).
Locator(monitorapi.NewLocator().NodeFromName(info.node)).
Message(monitorapi.NewMessage().
HumanMessage(fmt.Sprintf("%s %s", podName, concurrentMsg)).
Reason(monitorapi.IntervalReason(info.reason)),
).
Display().
Build(info.startedAt, endedAt),
)
}

return computed
}

func (m *installerPodMonitor) processOne(interval monitorapi.Interval) {
hostname := host(interval)
if len(hostname) == 0 {
framework.Logf("monitor[%s]: no host name for interval: %+v", MonitorName, interval)
return
}

thisPodName := interval.Locator.Keys[monitorapi.LocatorKey("pod")]
switch interval.Message.Reason {
case "Started":
if _, ok := m.pods[thisPodName]; ok {
framework.Logf("monitor[%s]: unexpected, seeing Started twice for the same event: %+v", MonitorName, interval)
return
}
thisPodInfo := &podInfo{
node: hostname,
name: thisPodName,
namespace: interval.Locator.Keys[monitorapi.LocatorNamespaceKey],
startedAt: interval.From,
}
m.pods[thisPodName] = thisPodInfo
// TODO: are any other installer pods active on a different node?
for _, otherPodInfo := range m.pods {
if otherPodInfo.node == thisPodInfo.node {
continue
}
// otherPodInfo is on a different node; if its pod is still active
// (started but not yet ended), the two installer pods overlap
if !otherPodInfo.startedAt.IsZero() && otherPodInfo.endedAt.IsZero() {
thisPodInfo.concurrent, otherPodInfo.concurrent = true, true
m.concurrentPods = append(m.concurrentPods, concurrent{this: otherPodInfo, that: thisPodInfo})
}
}

// these events denote the end of an installer pod; in my investigation of
// a failed run, I see one or the other, never both, for a given installer pod.
case "Killing", "StaticPodInstallerCompleted":
info, ok := m.pods[thisPodName]
if !ok {
framework.Logf("monitor[%s]: unexpected, not seen Started before - event: %+v", MonitorName, interval)
return
}
info.reason = string(interval.Message.Reason)
info.endedAt = interval.From
}
}

func host(interval monitorapi.Interval) string {
// kubelet events
if host, ok := interval.Locator.Keys[monitorapi.LocatorNodeKey]; ok && len(host) > 0 {
return host
}

// StaticPodInstallerCompleted is reported by the installer pod, and it
// does not contain any source information
name := interval.Locator.Keys[monitorapi.LocatorKey("pod")]
if len(name) == 0 {
return ""
}

// installer pod name has the following format:
// - installer-5-retry-1-ci-op-cn7ykf7p-b9a0c-bxxcm-master-2
// - installer-7-ci-op-cn7ykf7p-b9a0c-bxxcm-master-2
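// for either format, the node name is everything after the prefix; both
// examples above yield: ci-op-cn7ykf7p-b9a0c-bxxcm-master-2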
_, after, found := strings.Cut(name, "-retry-")
if found {
if split := strings.SplitN(after, "-", 2); len(split) == 2 {
return split[1]
}
return ""
}
// no "-retry-" infix: strip the leading "installer-<revision>-" from the name
if split := strings.SplitN(name, "-", 3); len(split) == 3 {
return split[2]
}
return ""
}

type junitTest struct {
name string
concurrentPods []concurrent
}

func (jut *junitTest) Skip() []*junitapi.JUnitTestCase {
skipped := &junitapi.JUnitTestCase{
Name: jut.name,
SkipMessage: &junitapi.SkipMessage{
Message: "No installer pod activity found",
},
}
return []*junitapi.JUnitTestCase{skipped}
}

func (jut *junitTest) Result() []*junitapi.JUnitTestCase {
passed := &junitapi.JUnitTestCase{
Name: jut.name,
SystemOut: "",
}
if len(jut.concurrentPods) == 0 {
// passed
return []*junitapi.JUnitTestCase{passed}
}

failed := &junitapi.JUnitTestCase{
Name: jut.name,
SystemOut: "installer pods running concurrently on two or more nodes",
FailureOutput: &junitapi.FailureOutput{},
}
for _, concurrent := range jut.concurrentPods {
a, b := concurrent.this, concurrent.that
msg := fmt.Sprintf("A(%s -> %s) B(%s -> %s):\nA: %s\nB: %s\n", a.startedAt.Format(time.RFC3339),
a.endedAt.Format(time.RFC3339), b.startedAt.Format(time.RFC3339), b.endedAt.Format(time.RFC3339), a, b)
failed.FailureOutput.Output = fmt.Sprintf("%s\n%s", failed.FailureOutput.Output, msg)
}

// TODO: for now, we flake the test (emitting both a failed and a passed case
// with the same name marks it as a flake). Once we know it's fully passing,
// we can remove the extra passed case.
return []*junitapi.JUnitTestCase{failed, passed}
}
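
The concurrency check in processOne boils down to an overlap rule: when a Started event arrives for a pod, any installer pod on a different node that has started but not yet ended overlaps with it. A minimal standalone sketch of that rule (simplified types and hypothetical values, illustration only):

package main

import (
	"fmt"
	"time"
)

// pod is a simplified stand-in for the monitor's podInfo bookkeeping.
type pod struct {
	node      string
	startedAt time.Time
	endedAt   time.Time // zero while the pod is still active
}

// overlaps reports whether other was still active on a different node
// when p started, mirroring the check in processOne.
func overlaps(p, other pod) bool {
	return other.node != p.node &&
		!other.startedAt.IsZero() && other.endedAt.IsZero()
}

func main() {
	t0 := time.Date(2024, 12, 17, 10, 0, 0, 0, time.UTC)
	a := pod{node: "master-0", startedAt: t0} // started, not yet ended
	b := pod{node: "master-1", startedAt: t0.Add(30 * time.Second)}
	fmt.Println(overlaps(b, a)) // true: installer pods active on two nodes at once
}
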
11 changes: 11 additions & 0 deletions test/extended/testdata/bindata.go

Some generated files are not rendered by default.
