diff --git a/test/e2e/common/common.go b/test/e2e/common/common.go index 9772320685..2aa29f05dd 100644 --- a/test/e2e/common/common.go +++ b/test/e2e/common/common.go @@ -22,6 +22,7 @@ const ( KubeSystemNamespace = "kube-system" TestPodNamespace = "kube-system-test" AzureAppInsightsKeyEnv = "AZURE_APP_INSIGHTS_KEY" + OutputFilePathEnv = "OUTPUT_FILEPATH" ) var ( diff --git a/test/e2e/framework/kubernetes/check-pod-status.go b/test/e2e/framework/kubernetes/check-pod-status.go index 27405031bb..b10cb21f04 100644 --- a/test/e2e/framework/kubernetes/check-pod-status.go +++ b/test/e2e/framework/kubernetes/check-pod-status.go @@ -14,8 +14,9 @@ import ( ) const ( - RetryTimeoutPodsReady = 5 * time.Minute - RetryIntervalPodsReady = 5 * time.Second + RetryTimeoutPodsReady = 5 * time.Minute + RetryIntervalPodsReady = 5 * time.Second + timeoutWaitForPodsSeconds = 1200 printInterval = 5 // print to stdout every 5 iterations ) @@ -48,7 +49,7 @@ func (w *WaitPodsReady) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), timeoutWaitForPodsSeconds*time.Second) defer cancel() return WaitForPodReady(ctx, clientset, w.Namespace, w.LabelSelector) @@ -60,7 +61,6 @@ func (w *WaitPodsReady) Stop() error { } func WaitForPodReady(ctx context.Context, clientset *kubernetes.Clientset, namespace, labelSelector string) error { - podReadyMap := make(map[string]bool) printIterator := 0 conditionFunc := wait.ConditionWithContextFunc(func(context.Context) (bool, error) { @@ -78,34 +78,25 @@ func WaitForPodReady(ctx context.Context, clientset *kubernetes.Clientset, names return false, nil } - // check each indviidual pod to see if it's in Running state + // check each individual pod to see if it's in Running state for i := range podList.Items { - var pod *corev1.Pod - pod, err = clientset.CoreV1().Pods(namespace).Get(ctx, podList.Items[i].Name, metav1.GetOptions{}) - if err != nil { - return false, fmt.Errorf("error getting Pod: %w", err) - } // Check the Pod phase - if pod.Status.Phase != corev1.PodRunning { + if podList.Items[i].Status.Phase != corev1.PodRunning { if printIterator%printInterval == 0 { - log.Printf("pod \"%s\" is not in Running state yet. Waiting...\n", pod.Name) + log.Printf("pod \"%s\" is not in Running state yet. Waiting...\n", podList.Items[i].Name) } return false, nil } // Check all container status. - for _, containerStatus := range pod.Status.ContainerStatuses { + for _, containerStatus := range podList.Items[i].Status.ContainerStatuses { if !containerStatus.Ready { - log.Printf("container \"%s\" in pod \"%s\" is not ready yet. Waiting...\n", containerStatus.Name, pod.Name) + log.Printf("container \"%s\" in pod \"%s\" is not ready yet. Waiting...\n", containerStatus.Name, podList.Items[i].Name) return false, nil } } - if !podReadyMap[pod.Name] { - log.Printf("pod \"%s\" is in Running state\n", pod.Name) - podReadyMap[pod.Name] = true - } } log.Printf("all pods in namespace \"%s\" with label \"%s\" are in Running state\n", namespace, labelSelector) return true, nil diff --git a/test/e2e/framework/kubernetes/create-kapinger-deployment.go b/test/e2e/framework/kubernetes/create-kapinger-deployment.go index 06862e1c09..35040fda17 100644 --- a/test/e2e/framework/kubernetes/create-kapinger-deployment.go +++ b/test/e2e/framework/kubernetes/create-kapinger-deployment.go @@ -138,7 +138,7 @@ func (c *CreateKapingerDeployment) GetKapingerDeployment() *appsv1.Deployment { "memory": resource.MustParse("20Mi"), }, Limits: v1.ResourceList{ - "memory": resource.MustParse("20Mi"), + "memory": resource.MustParse("40Mi"), }, }, Ports: []v1.ContainerPort{ diff --git a/test/e2e/framework/kubernetes/delete-namespace.go b/test/e2e/framework/kubernetes/delete-namespace.go index c5fa3dbc66..b33cc02b93 100644 --- a/test/e2e/framework/kubernetes/delete-namespace.go +++ b/test/e2e/framework/kubernetes/delete-namespace.go @@ -14,6 +14,10 @@ import ( "k8s.io/client-go/util/retry" ) +const ( + deleteNamespaceTimeoutSeconds = 1200 +) + type DeleteNamespace struct { Namespace string KubeConfigFilePath string @@ -30,7 +34,7 @@ func (d *DeleteNamespace) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), deleteNamespaceTimeoutSeconds*time.Second) defer cancel() err = clientset.CoreV1().Namespaces().Delete(ctx, d.Namespace, metaV1.DeleteOptions{}) @@ -40,8 +44,10 @@ func (d *DeleteNamespace) Run() error { } } + numberOfSteps := 9 + backoff := wait.Backoff{ - Steps: 6, + Steps: numberOfSteps, Duration: 10 * time.Second, Factor: 2.0, // Jitter: 0.1, diff --git a/test/e2e/framework/kubernetes/install-retina-helm.go b/test/e2e/framework/kubernetes/install-retina-helm.go index 7f1828f17c..ba74d64eac 100644 --- a/test/e2e/framework/kubernetes/install-retina-helm.go +++ b/test/e2e/framework/kubernetes/install-retina-helm.go @@ -91,6 +91,7 @@ func (i *InstallHelmChart) Run() error { chart.Values["image"].(map[string]interface{})["repository"] = imageRegistry + "/" + imageNamespace + "/retina-agent" chart.Values["image"].(map[string]interface{})["initRepository"] = imageRegistry + "/" + imageNamespace + "/retina-init" chart.Values["operator"].(map[string]interface{})["repository"] = imageRegistry + "/" + imageNamespace + "/retina-operator" + chart.Values["operator"].(map[string]interface{})["enabled"] = true getclient := action.NewGet(actionConfig) release, err := getclient.Run(i.ReleaseName) diff --git a/test/e2e/framework/scaletest/add-shared-labels.go b/test/e2e/framework/scaletest/add-shared-labels.go index d76139c0be..c1ea93f828 100644 --- a/test/e2e/framework/scaletest/add-shared-labels.go +++ b/test/e2e/framework/scaletest/add-shared-labels.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "fmt" + "log" "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -12,6 +13,10 @@ import ( "k8s.io/client-go/tools/clientcmd" ) +const ( + timeoutToLabelAllPodsMinutes = 120 +) + type patchStringValue struct { Op string `json:"op"` Path string `json:"path"` @@ -50,32 +55,21 @@ func (a *AddSharedLabelsToAllPods) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + ctx, cancel := contextToLabelAllPods() defer cancel() resources, err := clientset.CoreV1().Pods(a.Namespace).List(ctx, metav1.ListOptions{}) - patch := []patchStringValue{} - - for i := 0; i < a.NumSharedLabelsPerPod; i++ { - patch = append(patch, patchStringValue{ - Op: "add", - Path: "/metadata/labels/shared-lab-" + fmt.Sprintf("%05d", i), - Value: "val", - }) - } - - patchBytes, err := json.Marshal(patch) + patchBytes, err := getSharedLabelsPatch(a.NumSharedLabelsPerPod) if err != nil { - return fmt.Errorf("error marshalling patch: %w", err) + return fmt.Errorf("error getting label patch: %w", err) } for _, resource := range resources.Items { - clientset.CoreV1().Pods(a.Namespace).Patch(ctx, resource.Name, - types.JSONPatchType, - patchBytes, - metav1.PatchOptions{}, - ) + err = patchLabel(ctx, clientset, a.Namespace, resource.Name, patchBytes) + if err != nil { + log.Printf("Error adding shared labels to pod %s: %s\n", resource.Name, err) + } } return nil @@ -85,3 +79,30 @@ func (a *AddSharedLabelsToAllPods) Run() error { func (a *AddSharedLabelsToAllPods) Stop() error { return nil } + +func patchLabel(ctx context.Context, clientset *kubernetes.Clientset, namespace, podName string, patchBytes []byte) error { + log.Println("Labeling Pod", podName) + _, err := clientset.CoreV1().Pods(namespace).Patch(ctx, podName, + types.JSONPatchType, + patchBytes, + metav1.PatchOptions{}, + ) + return fmt.Errorf("error patching pod: %w", err) +} + +func getSharedLabelsPatch(numLabels int) ([]byte, error) { + patch := []patchStringValue{} + for i := 0; i < numLabels; i++ { + patch = append(patch, patchStringValue{ + Op: "add", + Path: "/metadata/labels/shared-lab-" + fmt.Sprintf("%05d", i), + Value: "val", + }) + } + b, err := json.Marshal(patch) + return b, fmt.Errorf("error marshalling patch: %w", err) +} + +func contextToLabelAllPods() (context.Context, context.CancelFunc) { + return context.WithTimeout(context.Background(), timeoutToLabelAllPodsMinutes*time.Minute) +} diff --git a/test/e2e/framework/scaletest/add-unique-labels.go b/test/e2e/framework/scaletest/add-unique-labels.go index cfdd458c82..ccd74886c7 100644 --- a/test/e2e/framework/scaletest/add-unique-labels.go +++ b/test/e2e/framework/scaletest/add-unique-labels.go @@ -1,13 +1,10 @@ package scaletest import ( - "context" "encoding/json" "fmt" - "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/clientcmd" ) @@ -44,7 +41,7 @@ func (a *AddUniqueLabelsToAllPods) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + ctx, cancel := contextToLabelAllPods() defer cancel() resources, err := clientset.CoreV1().Pods(a.Namespace).List(ctx, metav1.ListOptions{}) @@ -52,27 +49,15 @@ func (a *AddUniqueLabelsToAllPods) Run() error { count := 0 for _, resource := range resources.Items { - patch := []patchStringValue{} - - for i := 0; i < a.NumUniqueLabelsPerPod; i++ { - patch = append(patch, patchStringValue{ - Op: "add", - Path: "/metadata/labels/uni-lab-" + fmt.Sprintf("%05d", count), - Value: "val", - }) - count++ + patchBytes, err := getUniqueLabelsPatch(a.NumUniqueLabelsPerPod, &count) + if err != nil { + return fmt.Errorf("error getting label patch: %w", err) } - patchBytes, err := json.Marshal(patch) + err = patchLabel(ctx, clientset, a.Namespace, resource.Name, patchBytes) if err != nil { - return fmt.Errorf("error marshalling patch: %w", err) + return fmt.Errorf("error adding unique label to pod: %w", err) } - - clientset.CoreV1().Pods(a.Namespace).Patch(ctx, resource.Name, - types.JSONPatchType, - patchBytes, - metav1.PatchOptions{}, - ) } return nil @@ -82,3 +67,19 @@ func (a *AddUniqueLabelsToAllPods) Run() error { func (a *AddUniqueLabelsToAllPods) Stop() error { return nil } + +func getUniqueLabelsPatch(numLabels int, counter *int) ([]byte, error) { + patch := []patchStringValue{} + + for i := 0; i < numLabels; i++ { + patch = append(patch, patchStringValue{ + Op: "add", + Path: "/metadata/labels/uni-lab-" + fmt.Sprintf("%05d", *counter), + Value: "val", + }) + (*counter)++ + } + + b, err := json.Marshal(patch) + return b, fmt.Errorf("error marshalling patch: %w", err) +} diff --git a/test/e2e/framework/scaletest/create-resources.go b/test/e2e/framework/scaletest/create-resources.go index 688ab57747..d7a592efe5 100644 --- a/test/e2e/framework/scaletest/create-resources.go +++ b/test/e2e/framework/scaletest/create-resources.go @@ -7,11 +7,16 @@ import ( "time" e2ekubernetes "github.com/microsoft/retina/test/e2e/framework/kubernetes" + "github.com/microsoft/retina/test/retry" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/clientcmd" ) +const ( + timeoutCreateResourcesSeconds = 1200 +) + type CreateResources struct { Namespace string KubeConfigFilePath string @@ -48,11 +53,18 @@ func (c *CreateResources) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), timeoutCreateResourcesSeconds*time.Second) defer cancel() + retrier := retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay} + for _, resource := range resources { - e2ekubernetes.CreateResource(ctx, resource, clientset) + err := retrier.Do(ctx, func() error { + return e2ekubernetes.CreateResource(ctx, resource, clientset) + }) + if err != nil { + return fmt.Errorf("error creating resource: %w", err) + } } return nil @@ -71,12 +83,6 @@ func (c *CreateResources) getResources() []runtime.Object { // kwokDeployments := c.generateDeployments(c.NumKwokDeployments, c.NumKwokReplicas, "kwok") // objs = append(objs, kwokDeployments...) - realDeployments := c.generateDeployments() - objs = append(objs, realDeployments...) - - services := c.generateServices("real") - objs = append(objs, services...) - kapinger := e2ekubernetes.CreateKapingerDeployment{ KapingerNamespace: c.Namespace, KubeConfigFilePath: c.KubeConfigFilePath, @@ -88,6 +94,13 @@ func (c *CreateResources) getResources() []runtime.Object { kapingerSA := kapinger.GetKapingerServiceAccount() objs = append(objs, kapingerClusterRole, kapingerClusterRoleBinding, kapingerSA) + + realDeployments := c.generateDeployments() + objs = append(objs, realDeployments...) + + services := c.generateServices() + objs = append(objs, services...) + // c.generateKwokNodes() log.Println("Finished generating YAMLs") return objs @@ -118,6 +131,8 @@ func (c *CreateResources) generateDeployments() []runtime.Object { labelPrefix := fmt.Sprintf("%s-dep-lab", name) deployment.Name = name + deployment.Labels["name"] = name + deployment.Spec.Template.Labels["name"] = name r := int32(c.NumRealReplicas) deployment.Spec.Replicas = &r @@ -135,7 +150,7 @@ func (c *CreateResources) generateDeployments() []runtime.Object { return objs } -func (c *CreateResources) generateServices(svcKind string) []runtime.Object { +func (c *CreateResources) generateServices() []runtime.Object { objs := []runtime.Object{} kapingerSvc := e2ekubernetes.CreateKapingerDeployment{ @@ -146,10 +161,10 @@ func (c *CreateResources) generateServices(svcKind string) []runtime.Object { for i := 0; i < c.NumRealServices; i++ { template := kapingerSvc.GetKapingerService() - name := fmt.Sprintf("%s-svc-%05d", svcKind, i) + name := fmt.Sprintf("%s-svc-%05d", c.RealPodType, i) template.Name = name - template.Spec.Selector["name"] = fmt.Sprintf("%s-%s-dep-%05d", svcKind, c.RealPodType, i) + template.Spec.Selector["name"] = fmt.Sprintf("%s-dep-%05d", c.RealPodType, i) objs = append(objs, template) } diff --git a/test/e2e/framework/scaletest/delete-and-re-add-labels.go b/test/e2e/framework/scaletest/delete-and-re-add-labels.go index 5897b4d766..3403ea2488 100644 --- a/test/e2e/framework/scaletest/delete-and-re-add-labels.go +++ b/test/e2e/framework/scaletest/delete-and-re-add-labels.go @@ -48,7 +48,7 @@ func (d *DeleteAndReAddLabels) Run() error { return fmt.Errorf("error creating Kubernetes client: %w", err) } - ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + ctx, cancel := contextToLabelAllPods() defer cancel() labelsToDelete := `"shared-lab-00000": null, "shared-lab-00001": null, "shared-lab-00002": null` @@ -91,6 +91,7 @@ func (d *DeleteAndReAddLabels) Run() error { func (d *DeleteAndReAddLabels) addLabels(ctx context.Context, clientset *kubernetes.Clientset, pods *corev1.PodList, patch string) error { for _, pod := range pods.Items { + log.Println("Labeling Pod", pod.Name) _, err := clientset.CoreV1().Pods(d.Namespace).Patch(ctx, pod.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}) if err != nil { return fmt.Errorf("error patching pod: %w", err) @@ -103,6 +104,7 @@ func (d *DeleteAndReAddLabels) addLabels(ctx context.Context, clientset *kuberne func (d *DeleteAndReAddLabels) deleteLabels(ctx context.Context, clientset *kubernetes.Clientset, pods *corev1.PodList, patch string) error { for _, pod := range pods.Items { + log.Println("Deleting label from Pod", pod.Name) _, err := clientset.CoreV1().Pods(d.Namespace).Patch(ctx, pod.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}) if err != nil { return fmt.Errorf("error patching pod: %w", err) diff --git a/test/e2e/framework/scaletest/get-publish-metrics.go b/test/e2e/framework/scaletest/get-publish-metrics.go index 3495addf33..be403490ea 100644 --- a/test/e2e/framework/scaletest/get-publish-metrics.go +++ b/test/e2e/framework/scaletest/get-publish-metrics.go @@ -11,24 +11,34 @@ import ( "github.com/microsoft/retina/pkg/telemetry" "github.com/microsoft/retina/test/e2e/common" + "github.com/microsoft/retina/test/retry" "github.com/pkg/errors" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/clientcmd" + v1beta1 "k8s.io/metrics/pkg/apis/metrics/v1beta1" metrics "k8s.io/metrics/pkg/client/clientset/versioned" ) +const ( + defaultRetryAttempts = 10 + defaultRetryDelay = 500 * time.Millisecond +) + type GetAndPublishMetrics struct { KubeConfigFilePath string AdditionalTelemetryProperty map[string]string Labels map[string]string - OutputFilePath string + outputFilePath string stop chan struct{} wg sync.WaitGroup telemetryClient *telemetry.TelemetryClient appInsightsKey string + k8sClient *kubernetes.Clientset + metricsClient *metrics.Clientset } func (g *GetAndPublishMetrics) Run() error { @@ -43,6 +53,23 @@ func (g *GetAndPublishMetrics) Run() error { g.telemetryClient = telemetryClient } + config, err := clientcmd.BuildConfigFromFlags("", g.KubeConfigFilePath) + if err != nil { + return fmt.Errorf("error building kubeconfig: %w", err) + } + + k8sClient, err := kubernetes.NewForConfig(config) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + g.k8sClient = k8sClient + + metricsClient, err := metrics.NewForConfig(config) + if err != nil { + return fmt.Errorf("error creating metrics client: %w", err) + } + g.metricsClient = metricsClient + g.stop = make(chan struct{}) g.wg.Add(1) @@ -50,6 +77,13 @@ func (g *GetAndPublishMetrics) Run() error { t := time.NewTicker(5 * time.Minute) + // First execution + err := g.getAndPublishMetrics() + if err != nil { + log.Fatalf("error getting and publishing number of restarts: %v", err) + return + } + for { select { @@ -88,34 +122,38 @@ func (g *GetAndPublishMetrics) Prevalidate() error { if _, ok := g.AdditionalTelemetryProperty["retinaVersion"]; !ok { return fmt.Errorf("retinaVersion is required in AdditionalTelemetryProperty") } + + if os.Getenv(common.OutputFilePathEnv) == "" { + log.Println("Output file path not provided. Metrics will not be written to file") + return nil + } + g.outputFilePath = os.Getenv(common.OutputFilePathEnv) + + log.Println("Output file path provided: ", g.outputFilePath) return nil } func (g *GetAndPublishMetrics) getAndPublishMetrics() error { - config, err := clientcmd.BuildConfigFromFlags("", g.KubeConfigFilePath) - if err != nil { - return fmt.Errorf("error building kubeconfig: %w", err) - } + ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) + defer cancel() - clientset, err := kubernetes.NewForConfig(config) - if err != nil { - return fmt.Errorf("error creating Kubernetes client: %w", err) - } + labelSelector := labels.Set(g.Labels).String() - mc, err := metrics.NewForConfig(config) + agentsMetrics, err := g.getPodsMetrics(ctx, labelSelector) if err != nil { - return fmt.Errorf("error creating metrics client: %w", err) + log.Println("Error getting agents' metrics, will try again later:", err) + return nil } - ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second) - defer cancel() - - metrics, err := g.getMetrics(ctx, clientset, mc) + operatorMetrics, err := g.getPodsMetrics(ctx, "app=retina-operator") if err != nil { - return fmt.Errorf("error getting metrics: %w", err) + log.Println("Error getting operator's metrics, will try again later:", err) + return nil } + metrics := append(agentsMetrics, operatorMetrics...) + // Publish metrics if g.telemetryClient != nil { log.Println("Publishing metrics to AppInsights") @@ -126,9 +164,9 @@ func (g *GetAndPublishMetrics) getAndPublishMetrics() error { } // Write metrics to file - if g.OutputFilePath != "" { - log.Println("Writing metrics to file ", g.OutputFilePath) - file, err := os.OpenFile(g.OutputFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if g.outputFilePath != "" { + log.Println("Writing metrics to file ", g.outputFilePath) + file, err := os.OpenFile(g.outputFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644) if err != nil { return fmt.Errorf("error writing to csv file: %w", err) } @@ -150,45 +188,74 @@ func (g *GetAndPublishMetrics) getAndPublishMetrics() error { type metric map[string]string -func (g *GetAndPublishMetrics) getMetrics(ctx context.Context, k8sClient *kubernetes.Clientset, metricsClient *metrics.Clientset) ([]metric, error) { +func (g *GetAndPublishMetrics) getPodsMetrics(ctx context.Context, labelSelector string) ([]metric, error) { - labelSelector := labels.Set(g.Labels).String() + var pods *v1.PodList + + retrier := retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay} + + err := retrier.Do(ctx, func() error { + var err error + pods, err = g.k8sClient.CoreV1().Pods(common.KubeSystemNamespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector}) + return fmt.Errorf("error listing pods: %w", err) + }) + if err != nil { + return nil, errors.Wrap(err, "error getting pods") + } - pods, err := k8sClient.CoreV1().Pods(common.KubeSystemNamespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector}) + var nodeMetricsList *v1beta1.NodeMetricsList + err = retrier.Do(ctx, func() error { + nodeMetricsList, err = g.metricsClient.MetricsV1beta1().NodeMetricses().List(ctx, metav1.ListOptions{}) + return fmt.Errorf("error listing node metrics: %w", err) + }) if err != nil { - return nil, errors.Wrap(err, "error getting nodes") + log.Println("Error getting node metrics:", err) } - nodesMetricsInt := metricsClient.MetricsV1beta1().NodeMetricses() - podMetricsInt := metricsClient.MetricsV1beta1().PodMetricses(common.KubeSystemNamespace) + var podMetricsList *v1beta1.PodMetricsList + err = retrier.Do(ctx, func() error { + podMetricsList, err = g.metricsClient.MetricsV1beta1().PodMetricses(common.KubeSystemNamespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector}) + return fmt.Errorf("error listing pod metrics: %w", err) + }) + if err != nil { + log.Println("Error getting pod metrics:", err) + } var allPodsHealth []metric timestamp := time.Now().UTC().Format(time.RFC3339) + // List -> map for lookup + podMetrics := make(map[string]*v1beta1.PodMetrics) + for i := range podMetricsList.Items { + podMetrics[podMetricsList.Items[i].Name] = podMetricsList.Items[i].DeepCopy() + } + + // List -> map for lookup + nodeMetrics := make(map[string]*v1beta1.NodeMetrics) + for i := range nodeMetricsList.Items { + nodeMetrics[nodeMetricsList.Items[i].Name] = nodeMetricsList.Items[i].DeepCopy() + } + for _, pod := range pods.Items { var podHealth metric = make(map[string]string) - podMetrics, err := podMetricsInt.Get(ctx, pod.Name, metav1.GetOptions{}) - if err != nil { - return nil, errors.Wrap(err, "error getting pod metrics") - } - podMem := resource.MustParse("0") podCpu := resource.MustParse("0") - for _, cm := range podMetrics.Containers { - podMem.Add(cm.Usage["memory"]) - podCpu.Add(cm.Usage["cpu"]) + if podMetrics[pod.Name] != nil { + for _, cm := range podMetrics[pod.Name].Containers { + podMem.Add(cm.Usage["memory"]) + podCpu.Add(cm.Usage["cpu"]) + } } - nodeMetrics, err := nodesMetricsInt.Get(ctx, pod.Spec.NodeName, metav1.GetOptions{}) - if err != nil { - return nil, errors.Wrap(err, "error getting node metrics") + nodeMem := resource.MustParse("0") + nodeCPU := resource.MustParse("0") + if nodeMetrics[pod.Spec.NodeName] != nil { + nodeMem = nodeMetrics[pod.Spec.NodeName].Usage["memory"] + nodeCPU = nodeMetrics[pod.Spec.NodeName].Usage["cpu"] } - nodeMem := nodeMetrics.Usage["memory"] - nodeCpu := nodeMetrics.Usage["cpu"] - restarts := 0 for _, containerStatus := range pod.Status.ContainerStatuses { @@ -197,11 +264,12 @@ func (g *GetAndPublishMetrics) getMetrics(ctx context.Context, k8sClient *kubern podHealth["timestamp"] = timestamp podHealth["pod"] = pod.Name + podHealth["podStatus"] = string(pod.Status.Phase) podHealth["podCpuInMilliCore"] = fmt.Sprintf("%d", podCpu.MilliValue()) podHealth["podMemoryInMB"] = fmt.Sprintf("%d", podMem.Value()/(1048576)) podHealth["podRestarts"] = fmt.Sprintf("%d", restarts) podHealth["node"] = pod.Spec.NodeName - podHealth["nodeCpuInMilliCore"] = fmt.Sprintf("%d", nodeCpu.MilliValue()) + podHealth["nodeCpuInMilliCore"] = fmt.Sprintf("%d", nodeCPU.MilliValue()) podHealth["nodeMemoryInMB"] = fmt.Sprintf("%d", nodeMem.Value()/(1048576)) allPodsHealth = append(allPodsHealth, podHealth) diff --git a/test/e2e/framework/scaletest/options.go b/test/e2e/framework/scaletest/options.go index 6b5284422b..a7d27683b6 100644 --- a/test/e2e/framework/scaletest/options.go +++ b/test/e2e/framework/scaletest/options.go @@ -37,4 +37,5 @@ type Options struct { numRealPods int LabelsToGetMetrics map[string]string AdditionalTelemetryProperty map[string]string + CleanUp bool } diff --git a/test/e2e/jobs/scale.go b/test/e2e/jobs/scale.go index 89215785c1..360450f792 100644 --- a/test/e2e/jobs/scale.go +++ b/test/e2e/jobs/scale.go @@ -1,7 +1,6 @@ package retina import ( - "os" "time" "github.com/microsoft/retina/test/e2e/framework/kubernetes" @@ -32,7 +31,7 @@ func DefaultScaleTestOptions() scaletest.Options { DeletePodsInterval: 60 * time.Second, DeleteRealPods: false, DeletePodsTimes: 1, - DeleteLabels: false, + DeleteLabels: true, DeleteLabelsInterval: 60 * time.Second, DeleteLabelsTimes: 1, DeleteNetworkPolicies: false, @@ -40,6 +39,7 @@ func DefaultScaleTestOptions() scaletest.Options { DeleteNetworkPoliciesTimes: 1, LabelsToGetMetrics: map[string]string{}, AdditionalTelemetryProperty: map[string]string{}, + CleanUp: true, } } @@ -66,7 +66,6 @@ func ScaleTest(opt *scaletest.Options) *types.Job { job.AddStep(&scaletest.GetAndPublishMetrics{ Labels: opt.LabelsToGetMetrics, AdditionalTelemetryProperty: opt.AdditionalTelemetryProperty, - OutputFilePath: os.Getenv("OUTPUT_FILEPATH"), }, &types.StepOptions{ SkipSavingParametersToJob: true, RunInBackgroundWithID: "get-metrics", @@ -111,7 +110,9 @@ func ScaleTest(opt *scaletest.Options) *types.Job { BackgroundID: "get-metrics", }, nil) - job.AddStep(&kubernetes.DeleteNamespace{}, nil) + if opt.CleanUp { + job.AddStep(&kubernetes.DeleteNamespace{}, nil) + } return job } diff --git a/test/e2e/scale_test.go b/test/e2e/scale_test.go index 6769dccc09..7ee7749335 100644 --- a/test/e2e/scale_test.go +++ b/test/e2e/scale_test.go @@ -75,7 +75,7 @@ func TestE2ERetina_Scale(t *testing.T) { require.NoError(t, err) } if CleanUp != "" { - opt.DeleteLabels, err = strconv.ParseBool(CleanUp) + opt.CleanUp, err = strconv.ParseBool(CleanUp) require.NoError(t, err) }