fix(test): configuration changes and fixes needed to scale-test (#1085)
# Description

Changes:
* move the env var name `OUTPUT_FILEPATH` to `common/common.go`
* adjust timeouts for the different steps
* add retries to Kube API requests, making the test more robust to transient failures in responses
* enable the operator in the retina legacy helm chart installation
* add `podStatus` to the exported metrics
* add the retina operator as a source of metrics
* other small fixes and improvements

Signed-off-by: Alex Castilio dos Santos <[email protected]>
alexcastilio authored Jan 20, 2025
1 parent a3088c8 commit b3cd0ec
Showing 14 changed files with 264 additions and 137 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/scale-test.yaml
@@ -96,11 +96,12 @@ jobs:
NUM_REPLICAS: ${{ inputs.num_replicas }}
NUM_NETPOLS: ${{ inputs.num_netpol }}
CLEANUP: ${{ inputs.cleanup }}
- IMAGE_REGISTRY: ${{ inputs.image_namespace == '' && vars.ACR_NAME || inputs.image_namespace }}
+ IMAGE_REGISTRY: ${{ vars.ACR_NAME }}
IMAGE_NAMESPACE: ${{ github.repository }}
TAG: ${{ inputs.image_tag }}
AZURE_APP_INSIGHTS_KEY: ${{ secrets.AZURE_APP_INSIGHTS_KEY }}
shell: bash
run: |
set -euo pipefail
- go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -image-tag=$( [[ $TAG == "" ]] && make version || echo $TAG ) -create-infra=false -delete-infra=false
+ [[ $TAG == "" ]] && TAG=$(make version)
+ go test -v ./test/e2e/. -timeout 300m -tags=scale -count=1 -args -create-infra=false -delete-infra=false
1 change: 1 addition & 0 deletions test/e2e/common/common.go
@@ -22,6 +22,7 @@ const (
KubeSystemNamespace = "kube-system"
TestPodNamespace = "kube-system-test"
AzureAppInsightsKeyEnv = "AZURE_APP_INSIGHTS_KEY"
+ OutputFilePathEnv = "OUTPUT_FILEPATH"
)

var (
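With the name centralized in `common`, test steps can resolve the output path through the constant rather than a repeated string literal. A minimal sketch of a consumer (the helper and its fallback default are illustrative assumptions, not code from this PR):

```go
package main

import (
	"fmt"
	"os"
)

// Mirrors the OutputFilePathEnv constant added to test/e2e/common/common.go.
const OutputFilePathEnv = "OUTPUT_FILEPATH"

func main() {
	// Resolve the results path from the environment; the fallback is an
	// assumed default, purely for illustration.
	path := os.Getenv(OutputFilePathEnv)
	if path == "" {
		path = "results.json"
	}
	fmt.Println("writing scale-test output to", path)
}
```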
29 changes: 10 additions & 19 deletions test/e2e/framework/kubernetes/check-pod-status.go
@@ -14,8 +14,9 @@ import (
)

const (
- RetryTimeoutPodsReady  = 5 * time.Minute
- RetryIntervalPodsReady = 5 * time.Second
+ RetryTimeoutPodsReady     = 5 * time.Minute
+ RetryIntervalPodsReady    = 5 * time.Second
+ timeoutWaitForPodsSeconds = 1200

printInterval = 5 // print to stdout every 5 iterations
)
@@ -48,7 +49,7 @@ func (w *WaitPodsReady) Run() error {
return fmt.Errorf("error creating Kubernetes client: %w", err)
}

- ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second)
+ ctx, cancel := context.WithTimeout(context.Background(), timeoutWaitForPodsSeconds*time.Second)
defer cancel()

return WaitForPodReady(ctx, clientset, w.Namespace, w.LabelSelector)
@@ -60,7 +61,6 @@ func (w *WaitPodsReady) Stop() error {
}

func WaitForPodReady(ctx context.Context, clientset *kubernetes.Clientset, namespace, labelSelector string) error {
- podReadyMap := make(map[string]bool)

printIterator := 0
conditionFunc := wait.ConditionWithContextFunc(func(context.Context) (bool, error) {
@@ -78,34 +78,25 @@ func WaitForPodReady(ctx context.Context, clientset *kubernetes.Clientset, namespace, labelSelector string) error {
return false, nil
}

- // check each indviidual pod to see if it's in Running state
+ // check each individual pod to see if it's in Running state
for i := range podList.Items {
- var pod *corev1.Pod
- pod, err = clientset.CoreV1().Pods(namespace).Get(ctx, podList.Items[i].Name, metav1.GetOptions{})
- if err != nil {
- 	return false, fmt.Errorf("error getting Pod: %w", err)
- }

// Check the Pod phase
- if pod.Status.Phase != corev1.PodRunning {
+ if podList.Items[i].Status.Phase != corev1.PodRunning {
if printIterator%printInterval == 0 {
log.Printf("pod \"%s\" is not in Running state yet. Waiting...\n", pod.Name)
log.Printf("pod \"%s\" is not in Running state yet. Waiting...\n", podList.Items[i].Name)
}
return false, nil
}

// Check all container status.
- for _, containerStatus := range pod.Status.ContainerStatuses {
- 	if !containerStatus.Ready {
- 		log.Printf("container \"%s\" in pod \"%s\" is not ready yet. Waiting...\n", containerStatus.Name, pod.Name)
+ for j := range podList.Items[i].Status.ContainerStatuses {
+ 	if !podList.Items[i].Status.ContainerStatuses[j].Ready {
+ 		log.Printf("container \"%s\" in pod \"%s\" is not ready yet. Waiting...\n", podList.Items[i].Status.ContainerStatuses[j].Name, podList.Items[i].Name)
return false, nil
}
}

- if !podReadyMap[pod.Name] {
- 	log.Printf("pod \"%s\" is in Running state\n", pod.Name)
- 	podReadyMap[pod.Name] = true
- }
}
log.Printf("all pods in namespace \"%s\" with label \"%s\" are in Running state\n", namespace, labelSelector)
return true, nil
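The tail of `WaitForPodReady` is truncated above. A condition function like this is normally handed to one of apimachinery's pollers; here is a self-contained sketch of that pattern, reusing the 5s/5min cadence of `RetryIntervalPodsReady`/`RetryTimeoutPodsReady` (this is an assumption about the surrounding code, not the file's actual driver):

```go
package main

import (
	"context"
	"log"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 1200*time.Second)
	defer cancel()

	// Stand-in for the readiness check above: the real condition lists the
	// pods and returns true only once every container reports Ready.
	condition := wait.ConditionWithContextFunc(func(ctx context.Context) (bool, error) {
		return true, nil
	})

	// Poll every 5s until the condition passes or the 5-minute budget is spent.
	if err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 5*time.Minute, true, condition); err != nil {
		log.Fatalf("pods never became ready: %v", err)
	}
}
```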
@@ -138,7 +138,7 @@ func (c *CreateKapingerDeployment) GetKapingerDeployment() *appsv1.Deployment {
"memory": resource.MustParse("20Mi"),
},
Limits: v1.ResourceList{
"memory": resource.MustParse("20Mi"),
"memory": resource.MustParse("100Mi"),
},
},
Ports: []v1.ContainerPort{
4 changes: 2 additions & 2 deletions test/e2e/framework/kubernetes/delete-namespace.go
@@ -30,7 +30,7 @@ func (d *DeleteNamespace) Run() error {
return fmt.Errorf("error creating Kubernetes client: %w", err)
}

- ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second)
+ ctx, cancel := context.WithTimeout(context.Background(), 1200*time.Second)
defer cancel()

err = clientset.CoreV1().Namespaces().Delete(ctx, d.Namespace, metaV1.DeleteOptions{})
@@ -41,7 +41,7 @@
}

backoff := wait.Backoff{
- Steps: 6,
+ Steps: 9,
Duration: 10 * time.Second,
Factor: 2.0,
// Jitter: 0.1,
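Raising `Steps` from 6 to 9 with a 10s base and factor 2.0 stretches the cumulative sleep between attempts from roughly 310s to roughly 2550s (~42 minutes), assuming the usual `wait.ExponentialBackoff` semantics. A sketch of how such a backoff drives a condition (the condition body is a placeholder for the namespace-gone check):

```go
package main

import (
	"log"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	backoff := wait.Backoff{
		Steps:    9,
		Duration: 10 * time.Second,
		Factor:   2.0,
	}

	// Placeholder condition; returning false keeps retrying with doubling
	// sleeps (10s, 20s, 40s, ...) until the steps are exhausted.
	attempt := 0
	err := wait.ExponentialBackoff(backoff, func() (bool, error) {
		attempt++
		log.Printf("attempt %d: namespace still terminating", attempt)
		return false, nil
	})
	if err != nil {
		log.Println("gave up waiting for namespace deletion:", err)
	}
}
```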
1 change: 1 addition & 0 deletions test/e2e/framework/kubernetes/install-retina-helm.go
@@ -91,6 +91,7 @@ func (i *InstallHelmChart) Run() error {
chart.Values["image"].(map[string]interface{})["repository"] = imageRegistry + "/" + imageNamespace + "/retina-agent"
chart.Values["image"].(map[string]interface{})["initRepository"] = imageRegistry + "/" + imageNamespace + "/retina-init"
chart.Values["operator"].(map[string]interface{})["repository"] = imageRegistry + "/" + imageNamespace + "/retina-operator"
chart.Values["operator"].(map[string]interface{})["enabled"] = true

getclient := action.NewGet(actionConfig)
release, err := getclient.Run(i.ReleaseName)
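For reference, setting this value programmatically has the same effect as a manual `helm install` of the legacy chart with `--set operator.enabled=true`, assuming the chart gates the operator on the conventional `operator.enabled` flag.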
61 changes: 43 additions & 18 deletions test/e2e/framework/scaletest/add-shared-labels.go
@@ -4,6 +4,7 @@ import (
"context"
"encoding/json"
"fmt"
"log"
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -50,32 +51,21 @@ func (a *AddSharedLabelsToAllPods) Run() error {
return fmt.Errorf("error creating Kubernetes client: %w", err)
}

- ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second)
+ ctx, cancel := contextToLabelAllPods()
defer cancel()

resources, err := clientset.CoreV1().Pods(a.Namespace).List(ctx, metav1.ListOptions{})

- patch := []patchStringValue{}
-
- for i := 0; i < a.NumSharedLabelsPerPod; i++ {
- 	patch = append(patch, patchStringValue{
- 		Op: "add",
- 		Path: "/metadata/labels/shared-lab-" + fmt.Sprintf("%05d", i),
- 		Value: "val",
- 	})
- }

- patchBytes, err := json.Marshal(patch)
+ patchBytes, err := getSharedLabelsPatch(a.NumSharedLabelsPerPod)
if err != nil {
return fmt.Errorf("error marshalling patch: %w", err)
return fmt.Errorf("error getting label patch: %w", err)
}

for _, resource := range resources.Items {
- 	clientset.CoreV1().Pods(a.Namespace).Patch(ctx, resource.Name,
- 		types.JSONPatchType,
- 		patchBytes,
- 		metav1.PatchOptions{},
- 	)
+ 	err = patchLabel(ctx, clientset, a.Namespace, resource.Name, patchBytes)
+ 	if err != nil {
+ 		log.Printf("Error adding shared labels to pod %s: %s\n", resource.Name, err)
+ 	}
}

return nil
@@ -85,3 +75,38 @@ func (a *AddSharedLabelsToAllPods) Run() error {
func (a *AddSharedLabelsToAllPods) Stop() error {
return nil
}

+ func patchLabel(ctx context.Context, clientset *kubernetes.Clientset, namespace, podName string, patchBytes []byte) error {
+ 	log.Println("Labeling Pod", podName)
+ 	_, err := clientset.CoreV1().Pods(namespace).Patch(ctx, podName,
+ 		types.JSONPatchType,
+ 		patchBytes,
+ 		metav1.PatchOptions{},
+ 	)
+ 	if err != nil {
+ 		return fmt.Errorf("failed to patch pod: %w", err)
+ 	}
+
+ 	return nil
+ }
+
+ func getSharedLabelsPatch(numLabels int) ([]byte, error) {
+ 	patch := []patchStringValue{}
+ 	for i := 0; i < numLabels; i++ {
+ 		patch = append(patch, patchStringValue{
+ 			Op: "add",
+ 			Path: "/metadata/labels/shared-lab-" + fmt.Sprintf("%05d", i),
+ 			Value: "val",
+ 		})
+ 	}
+ 	b, err := json.Marshal(patch)
+ 	if err != nil {
+ 		return nil, fmt.Errorf("error marshalling patch: %w", err)
+ 	}
+
+ 	return b, nil
+ }
+
+ func contextToLabelAllPods() (context.Context, context.CancelFunc) {
+ 	return context.WithTimeout(context.Background(), 120*time.Minute)
+ }
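For context, the JSON Patch that `getSharedLabelsPatch` builds renders as a list of `add` operations. The sketch below reproduces it standalone (the `patchStringValue` JSON tags are assumed to be the conventional `op`/`path`/`value`; the count of 3 is an arbitrary example):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Assumed shape of the test's patchStringValue helper.
type patchStringValue struct {
	Op    string `json:"op"`
	Path  string `json:"path"`
	Value string `json:"value"`
}

func main() {
	patch := []patchStringValue{}
	for i := 0; i < 3; i++ { // 3 shared labels, as an example
		patch = append(patch, patchStringValue{
			Op:    "add",
			Path:  "/metadata/labels/shared-lab-" + fmt.Sprintf("%05d", i),
			Value: "val",
		})
	}
	b, _ := json.Marshal(patch)
	fmt.Println(string(b))
	// [{"op":"add","path":"/metadata/labels/shared-lab-00000","value":"val"},...]
}
```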
17 changes: 6 additions & 11 deletions test/e2e/framework/scaletest/add-unique-labels.go
@@ -1,13 +1,10 @@
package scaletest

import (
"context"
"encoding/json"
"fmt"
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"
)
@@ -44,7 +41,7 @@ func (a *AddUniqueLabelsToAllPods) Run() error {
return fmt.Errorf("error creating Kubernetes client: %w", err)
}

- ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second)
+ ctx, cancel := contextToLabelAllPods()
defer cancel()

resources, err := clientset.CoreV1().Pods(a.Namespace).List(ctx, metav1.ListOptions{})
@@ -53,7 +50,6 @@ func (a *AddUniqueLabelsToAllPods) Run() error {

for _, resource := range resources.Items {
patch := []patchStringValue{}

for i := 0; i < a.NumUniqueLabelsPerPod; i++ {
patch = append(patch, patchStringValue{
Op: "add",
@@ -65,14 +61,13 @@ func (a *AddUniqueLabelsToAllPods) Run() error {

patchBytes, err := json.Marshal(patch)
if err != nil {
return fmt.Errorf("error marshalling patch: %w", err)
return fmt.Errorf("failed to marshal patch: %w", err)
}

- 	clientset.CoreV1().Pods(a.Namespace).Patch(ctx, resource.Name,
- 		types.JSONPatchType,
- 		patchBytes,
- 		metav1.PatchOptions{},
- 	)
+ 	err = patchLabel(ctx, clientset, a.Namespace, resource.Name, patchBytes)
+ 	if err != nil {
+ 		return fmt.Errorf("error adding unique label to pod: %w", err)
+ 	}
}

return nil
33 changes: 22 additions & 11 deletions test/e2e/framework/scaletest/create-resources.go
@@ -7,6 +7,7 @@ import (
"time"

e2ekubernetes "github.com/microsoft/retina/test/e2e/framework/kubernetes"
"github.com/microsoft/retina/test/retry"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"
@@ -48,11 +49,18 @@ func (c *CreateResources) Run() error {
return fmt.Errorf("error creating Kubernetes client: %w", err)
}

- ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second)
+ ctx, cancel := context.WithTimeout(context.Background(), 1200*time.Second)
defer cancel()

+ retrier := retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay}

for _, resource := range resources {
- 	e2ekubernetes.CreateResource(ctx, resource, clientset)
+ 	err := retrier.Do(ctx, func() error {
+ 		return e2ekubernetes.CreateResource(ctx, resource, clientset)
+ 	})
+ 	if err != nil {
+ 		return fmt.Errorf("error creating resource: %w", err)
+ 	}
}

return nil
@@ -71,12 +79,6 @@ func (c *CreateResources) getResources() []runtime.Object {
// kwokDeployments := c.generateDeployments(c.NumKwokDeployments, c.NumKwokReplicas, "kwok")
// objs = append(objs, kwokDeployments...)

- realDeployments := c.generateDeployments()
- objs = append(objs, realDeployments...)
-
- services := c.generateServices("real")
- objs = append(objs, services...)

kapinger := e2ekubernetes.CreateKapingerDeployment{
KapingerNamespace: c.Namespace,
KubeConfigFilePath: c.KubeConfigFilePath,
@@ -88,6 +90,13 @@ func (c *CreateResources) getResources() []runtime.Object {
kapingerSA := kapinger.GetKapingerServiceAccount()

objs = append(objs, kapingerClusterRole, kapingerClusterRoleBinding, kapingerSA)

+ realDeployments := c.generateDeployments()
+ objs = append(objs, realDeployments...)
+
+ services := c.generateServices()
+ objs = append(objs, services...)

// c.generateKwokNodes()
log.Println("Finished generating YAMLs")
return objs
@@ -118,6 +127,8 @@ func (c *CreateResources) generateDeployments() []runtime.Object {
labelPrefix := fmt.Sprintf("%s-dep-lab", name)

deployment.Name = name
deployment.Labels["name"] = name
deployment.Spec.Template.Labels["name"] = name

r := int32(c.NumRealReplicas)
deployment.Spec.Replicas = &r
@@ -135,7 +146,7 @@
return objs
}

- func (c *CreateResources) generateServices(svcKind string) []runtime.Object {
+ func (c *CreateResources) generateServices() []runtime.Object {
objs := []runtime.Object{}

kapingerSvc := e2ekubernetes.CreateKapingerDeployment{
@@ -146,10 +157,10 @@
for i := 0; i < c.NumRealServices; i++ {
template := kapingerSvc.GetKapingerService()

- 	name := fmt.Sprintf("%s-svc-%05d", svcKind, i)
+ 	name := fmt.Sprintf("%s-svc-%05d", c.RealPodType, i)
template.Name = name

- 	template.Spec.Selector["name"] = fmt.Sprintf("%s-%s-dep-%05d", svcKind, c.RealPodType, i)
+ 	template.Spec.Selector["name"] = fmt.Sprintf("%s-dep-%05d", c.RealPodType, i)

objs = append(objs, template)
}
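The `retry.Retrier` used in `Run` lives in `test/retry`, outside this diff. Below is a minimal sketch of the shape the call site relies on; the field names match the usage above, but the body is an assumption, not the package's actual implementation:

```go
package retry

import (
	"context"
	"time"
)

// Retrier re-runs f up to Attempts times, sleeping Delay between failures.
type Retrier struct {
	Attempts int
	Delay    time.Duration
}

// Do returns nil on the first success, the last error once attempts are
// exhausted, or the context error if ctx is cancelled while waiting.
func (r Retrier) Do(ctx context.Context, f func() error) error {
	var err error
	for i := 0; i < r.Attempts; i++ {
		if err = f(); err == nil {
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(r.Delay):
		}
	}
	return err
}
```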
4 changes: 3 additions & 1 deletion test/e2e/framework/scaletest/delete-and-re-add-labels.go
@@ -48,7 +48,7 @@ func (d *DeleteAndReAddLabels) Run() error {
return fmt.Errorf("error creating Kubernetes client: %w", err)
}

- ctx, cancel := context.WithTimeout(context.Background(), defaultTimeoutSeconds*time.Second)
+ ctx, cancel := contextToLabelAllPods()
defer cancel()

labelsToDelete := `"shared-lab-00000": null, "shared-lab-00001": null, "shared-lab-00002": null`
@@ -91,6 +91,7 @@ func (d *DeleteAndReAddLabels) Run() error {
func (d *DeleteAndReAddLabels) addLabels(ctx context.Context, clientset *kubernetes.Clientset, pods *corev1.PodList, patch string) error {

for _, pod := range pods.Items {
log.Println("Labeling Pod", pod.Name)
_, err := clientset.CoreV1().Pods(d.Namespace).Patch(ctx, pod.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{})
if err != nil {
return fmt.Errorf("error patching pod: %w", err)
@@ -103,6 +104,7 @@ func (d *DeleteAndReAddLabels) deleteLabels(ctx context.Context, clientset *kubernetes.Clientset, pods *corev1.PodList, patch string) error {
func (d *DeleteAndReAddLabels) deleteLabels(ctx context.Context, clientset *kubernetes.Clientset, pods *corev1.PodList, patch string) error {

for _, pod := range pods.Items {
log.Println("Deleting label from Pod", pod.Name)
_, err := clientset.CoreV1().Pods(d.Namespace).Patch(ctx, pod.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{})
if err != nil {
return fmt.Errorf("error patching pod: %w", err)
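Unlike the JSON Patch used when adding labels, the delete/re-add step sends a strategic merge patch in which a `null` value removes a label. The fragment shown in `Run` is presumably wrapped in a metadata envelope along these lines (the envelope is inferred; only the fragment appears in the visible hunk):

```go
package main

import "fmt"

func main() {
	// Fragment from DeleteAndReAddLabels.Run; a null value in a strategic
	// merge patch deletes the corresponding label on the server.
	labelsToDelete := `"shared-lab-00000": null, "shared-lab-00001": null, "shared-lab-00002": null`

	// Assumed envelope sent with types.StrategicMergePatchType.
	patch := fmt.Sprintf(`{"metadata": {"labels": {%s}}}`, labelsToDelete)
	fmt.Println(patch)
}
```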