add a monitor test for installer pod timeline
tkashem committed Dec 17, 2024
1 parent b7c61a7 commit f6f032f
Showing 5 changed files with 300 additions and 0 deletions.
11 changes: 11 additions & 0 deletions e2echart/e2e-chart-template.html
@@ -142,6 +142,10 @@ <h5 class="modal-title">Resource</h5>
return (eventInterval.source === "APIUnreachableFromClient")
}

function isInstallerPodActivity(eventInterval) {
return (eventInterval.source === "InstallerPodMonitor")
}

function isEndpointConnectivity(eventInterval) {
if (eventInterval.message.reason !== "DisruptionBegan" && eventInterval.message.reason !== "DisruptionSamplerOutageBegan") {
return false
@@ -272,6 +276,10 @@ <h5 class="modal-title">Resource</h5>
return [buildLocatorDisplayString(item.locator), "", "APIUnreachableFromClientMetrics"]
}

function isInstallerPodValue(item) {
return [buildLocatorDisplayString(item.locator), "", item.message.reason]
}

function disruptionValue(item) {
// We classify these disruption samples with this message if the sampler thinks
// the problem is in the CI cluster running the tests, not the cluster under test.
@@ -487,6 +495,9 @@ <h5 class="modal-title">Resource</h5>
timelineGroups.push({group: "api-unreachable", data: []})
createTimelineData(isAPIUnreachableFromClientValue, timelineGroups[timelineGroups.length - 1].data, eventIntervals, isAPIUnreachableFromClientActivity, regex)

timelineGroups.push({group: "installer-pod", data: []})
createTimelineData(isInstallerPodValue, timelineGroups[timelineGroups.length - 1].data, eventIntervals, isInstallerPodActivity, regex)

timelineGroups.push({ group: "etcd-leaders", data: [] })
createTimelineData(etcdLeadershipLogsValue, timelineGroups[timelineGroups.length - 1].data, eventIntervals, isEtcdLeadershipAndNotEmpty, regex)

2 changes: 2 additions & 0 deletions pkg/defaultmonitortests/types.go
@@ -20,6 +20,7 @@ import (
"github.com/openshift/origin/pkg/monitortests/kubeapiserver/disruptionnewapiserver"
"github.com/openshift/origin/pkg/monitortests/kubeapiserver/faultyloadbalancer"
"github.com/openshift/origin/pkg/monitortests/kubeapiserver/generationanalyzer"
"github.com/openshift/origin/pkg/monitortests/kubeapiserver/installerpod"
"github.com/openshift/origin/pkg/monitortests/kubeapiserver/legacykubeapiservermonitortests"
"github.com/openshift/origin/pkg/monitortests/machines/watchmachines"
"github.com/openshift/origin/pkg/monitortests/monitoring/disruptionmetricsapi"
@@ -133,6 +134,7 @@ func newDefaultMonitorTests(info monitortestframework.MonitorTestInitializationI
monitorTestRegistry.AddMonitorTestOrDie("metrics-api-availability", "Monitoring", disruptionmetricsapi.NewAvailabilityInvariant())
monitorTestRegistry.AddMonitorTestOrDie(apiunreachablefromclientmetrics.MonitorName, "kube-apiserver", apiunreachablefromclientmetrics.NewMonitorTest())
monitorTestRegistry.AddMonitorTestOrDie(faultyloadbalancer.MonitorName, "kube-apiserver", faultyloadbalancer.NewMonitorTest())
monitorTestRegistry.AddMonitorTestOrDie(installerpod.MonitorName, "kube-apiserver", installerpod.NewInstallerPodMonitorTest())

return monitorTestRegistry
}
2 changes: 2 additions & 0 deletions pkg/monitor/monitorapi/types.go
@@ -354,6 +354,8 @@ const (
SourceMachine IntervalSource = "MachineMonitor"

SourceGenerationMonitor IntervalSource = "GenerationMonitor"

SourceInstallerPodMonitor IntervalSource = "InstallerPodMonitor"
)

type Interval struct {
274 changes: 274 additions & 0 deletions pkg/monitortests/kubeapiserver/installerpod/monitortest.go
@@ -0,0 +1,274 @@
package installerpod

import (
"context"
"fmt"
"strings"
"time"

"github.com/openshift/origin/pkg/monitor/monitorapi"
"github.com/openshift/origin/pkg/monitortestframework"
"github.com/openshift/origin/pkg/test/ginkgo/junitapi"

"k8s.io/client-go/rest"
"k8s.io/kubernetes/test/e2e/framework"
)

const (
MonitorName = "installer-pod-monitor"
)

func NewInstallerPodMonitorTest() monitortestframework.MonitorTest {
return &monitorTest{
monitor: &installerPodMonitor{
pods: map[string]*podInfo{},
},
filter: func(interval monitorapi.Interval) bool {
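// only pod events from openshift-etcd with the lifecycle reasons below are of interest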
if ns, ok := interval.Locator.Keys[monitorapi.LocatorNamespaceKey]; !ok || ns != "openshift-etcd" {
return false
}

switch interval.Message.Reason {
case "Created", "Started", "Killing", "StaticPodInstallerCompleted":
return true
default:
return false
}
},
}
}

type monitorTest struct {
monitor *installerPodMonitor
filter func(interval monitorapi.Interval) bool
}

func (mt *monitorTest) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
return nil
}

func (mt *monitorTest) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
return nil, nil, nil
}

func (mt *monitorTest) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
computed := mt.monitor.process(startingIntervals, mt.filter)
return computed, nil
}

func (mt *monitorTest) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
junitTest := &junitTest{
name: "[sig-apimachinery] installer Pods should not run concurrently on two or more nodes",
concurrentPods: mt.monitor.concurrentPods,
}

framework.Logf("monitor[%s]: found %d occurrences of installer pods running concurrently on two or more nodes", MonitorName, len(junitTest.concurrentPods))

// the following constraints define pass/fail for this test:
// a) if we don't find any installer pod activity, this test is
//    a noop, so we mark it as skipped
// b) if we find installer pod activity but no two nodes ran these
//    pods concurrently, the test passes
// c) if we find at least one incident of two or more nodes running
//    these pods concurrently, the test flakes/fails
if len(mt.monitor.interested) == 0 {
// a) no installer pod activity observed, mark the test as skipped
return junitTest.Skip(), nil
}
return junitTest.Result(), nil // b or c
}

func (*monitorTest) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
return nil
}

func (*monitorTest) Cleanup(ctx context.Context) error {
// TODO wire up the start to a context we can kill here
return nil
}

type podInfo struct {
node string
name string
namespace string
reason string
startedAt, endedAt time.Time
concurrent bool
}

func (pi *podInfo) String() string {
return fmt.Sprintf("node(%s) name(%s) namespace(%s) reason(%s) started(%s) duration: %s",
pi.node, pi.name, pi.namespace, pi.reason, pi.startedAt.Format(time.RFC3339), pi.endedAt.Sub(pi.startedAt))
}

type concurrent struct {
this, that *podInfo
}

type installerPodMonitor struct {
interested monitorapi.Intervals
pods map[string]*podInfo
concurrentPods []concurrent
}

func (m *installerPodMonitor) process(intervals monitorapi.Intervals, filter func(interval monitorapi.Interval) bool) monitorapi.Intervals {
m.interested = make(monitorapi.Intervals, 0)
for _, interval := range intervals {
if filter(interval) {
m.interested = append(m.interested, interval)
}
}

framework.Logf("monitor[%s]: processing %d events", len(m.interested))
for _, interval := range m.interested {
m.processOne(interval)
}

computed := monitorapi.Intervals{}
for podName, info := range m.pods {
level := monitorapi.Info
endedAt := info.endedAt
if endedAt.IsZero() {
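// no end event (Killing or StaticPodInstallerCompleted) was seen for this pod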
endedAt = info.startedAt
level = monitorapi.Error
}
if info.reason == "Killing" || info.concurrent {
level = monitorapi.Error
}

concurrentMsg := ""
if info.concurrent {
concurrentMsg = "installer Pods may be running concurrently on at least two nodes"
}
computed = append(computed,
monitorapi.NewInterval(monitorapi.SourceInstallerPodMonitor, level).
Locator(monitorapi.NewLocator().NodeFromName(info.node)).
Message(monitorapi.NewMessage().
HumanMessage(fmt.Sprintf("%s %s", podName, concurrentMsg)).
Reason(monitorapi.IntervalReason(info.reason)),
).
Display().
Build(info.startedAt, endedAt),
)
}

return computed
}

func (m *installerPodMonitor) processOne(interval monitorapi.Interval) {
hostname := host(interval)
if len(hostname) == 0 {
framework.Logf("monitor[%s]: no host name for interval: %+v", MonitorName, interval)
return
}

thisPodName := interval.Locator.Keys[monitorapi.LocatorKey("pod")]
switch interval.Message.Reason {
case "Started":
if _, ok := m.pods[thisPodName]; ok {
framework.Logf("monitor[%s]: unexpected, seeing Started twice for the same event: %+v", MonitorName, interval)
return
}
thisPodInfo := &podInfo{
node: hostname,
name: thisPodName,
namespace: interval.Locator.Keys[monitorapi.LocatorNamespaceKey],
startedAt: interval.From,
}
m.pods[thisPodName] = thisPodInfo
// TODO: are any other installer pods active on a different node?
for _, otherPodInfo := range m.pods {
if otherPodInfo.node == thisPodInfo.node {
continue
}
// otherPodInfo is on a different node; if its pod is still active
// (started but not yet ended), the two installer pods overlap
if !otherPodInfo.startedAt.IsZero() && otherPodInfo.endedAt.IsZero() {
thisPodInfo.concurrent, otherPodInfo.concurrent = true, true
m.concurrentPods = append(m.concurrentPods, concurrent{this: otherPodInfo, that: thisPodInfo})
}
}

// these events denote the end of an installer pod; in my investigation of
// a failed run, I see one or the other, never both, for a given installer pod.
case "Killing", "StaticPodInstallerCompleted":
info, ok := m.pods[thisPodName]
if !ok {
framework.Logf("monitor[%s]: unexpected, not seen Started before - event: %+v", MonitorName, interval)
return
}
info.reason = string(interval.Message.Reason)
info.endedAt = interval.From
}
}

func host(interval monitorapi.Interval) string {
// kubelet events
if host, ok := interval.Locator.Keys[monitorapi.LocatorNodeKey]; ok && len(host) > 0 {
return host
}

// StaticPodInstallerCompleted is reported by the installer pod, and it
// does not contain any source information
name := interval.Locator.Keys[monitorapi.LocatorKey("pod")]
if len(name) == 0 {
return ""
}

// installer pod name has the following format:
// - installer-5-retry-1-ci-op-cn7ykf7p-b9a0c-bxxcm-master-2
// - installer-7-ci-op-cn7ykf7p-b9a0c-bxxcm-master-2
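// for either format, the node name is everything after the prefix; both
// examples above yield: ci-op-cn7ykf7p-b9a0c-bxxcm-master-2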
_, after, found := strings.Cut(name, "-retry-")
if found {
if split := strings.SplitN(after, "-", 2); len(split) == 2 {
return split[1]
}
return ""
}
// no "-retry-" infix: strip the leading "installer-<revision>-" from the name
if split := strings.SplitN(name, "-", 3); len(split) == 3 {
return split[2]
}
return ""
}

type junitTest struct {
name string
concurrentPods []concurrent
}

func (jut *junitTest) Skip() []*junitapi.JUnitTestCase {
skipped := &junitapi.JUnitTestCase{
Name: jut.name,
SkipMessage: &junitapi.SkipMessage{
Message: "No installer pod activity found",
},
}
return []*junitapi.JUnitTestCase{skipped}
}

func (jut *junitTest) Result() []*junitapi.JUnitTestCase {
passed := &junitapi.JUnitTestCase{
Name: jut.name,
SystemOut: "",
}
if len(jut.concurrentPods) == 0 {
// passed
return []*junitapi.JUnitTestCase{passed}
}

failed := &junitapi.JUnitTestCase{
Name: jut.name,
SystemOut: "installer pods running concurrently on two or more nodes",
FailureOutput: &junitapi.FailureOutput{},
}
for _, concurrent := range jut.concurrentPods {
a, b := concurrent.this, concurrent.that
msg := fmt.Sprintf("A(%s -> %s) B(%s -> %s):\nA: %s\nB: %s\n", a.startedAt.Format(time.RFC3339),
a.endedAt.Format(time.RFC3339), b.startedAt.Format(time.RFC3339), b.endedAt.Format(time.RFC3339), a, b)
failed.FailureOutput.Output = fmt.Sprintf("%s\n%s", failed.FailureOutput.Output, msg)
}

// TODO: for now, we flake the test (emitting both a failed and a passed case
// with the same name marks it as a flake). Once we know it's fully passing,
// we can remove the extra passed case.
return []*junitapi.JUnitTestCase{failed, passed}
}
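
The concurrency check in processOne boils down to an overlap rule: when a Started event arrives for a pod, any installer pod on a different node that has started but not yet ended overlaps with it. A minimal standalone sketch of that rule (simplified types and hypothetical values, illustration only):

package main

import (
	"fmt"
	"time"
)

// pod is a simplified stand-in for the monitor's podInfo bookkeeping.
type pod struct {
	node      string
	startedAt time.Time
	endedAt   time.Time // zero while the pod is still active
}

// overlaps reports whether other was still active on a different node
// when p started, mirroring the check in processOne.
func overlaps(p, other pod) bool {
	return other.node != p.node &&
		!other.startedAt.IsZero() && other.endedAt.IsZero()
}

func main() {
	t0 := time.Date(2024, 12, 17, 10, 0, 0, 0, time.UTC)
	a := pod{node: "master-0", startedAt: t0} // started, not yet ended
	b := pod{node: "master-1", startedAt: t0.Add(30 * time.Second)}
	fmt.Println(overlaps(b, a)) // true: installer pods active on two nodes at once
}
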
11 changes: 11 additions & 0 deletions test/extended/testdata/bindata.go

Some generated files are not rendered by default.
