From febfc9c6d11bd78e9d5b46b45a38f304f02cabcb Mon Sep 17 00:00:00 2001 From: Sergey Smolnikov Date: Thu, 25 Jul 2024 13:19:20 +0200 Subject: [PATCH] run-job-scheduler-on-arm (#1149) * Always run job-scheduler on arm platform * Fixed unit-tests * Set arm and cache to workflows * Fire build * Cleanup * Cleanup --- .github/workflows/build-push.yml | 7 +++ .github/workflows/pr.yml | 16 +++++- charts/radix-operator/Chart.yaml | 4 +- pkg/apis/deployment/deployment_test.go | 59 +++++++++++++------- pkg/apis/deployment/jobschedulercomponent.go | 2 +- 5 files changed, 63 insertions(+), 25 deletions(-) diff --git a/.github/workflows/build-push.yml b/.github/workflows/build-push.yml index 259bce4fd..574893879 100644 --- a/.github/workflows/build-push.yml +++ b/.github/workflows/build-push.yml @@ -15,6 +15,7 @@ jobs: strategy: fail-fast: false matrix: + arch: [arm64] target: - name: "dev" ref: "refs/heads/master" @@ -133,6 +134,8 @@ jobs: ts=$(date +%s) echo "radix-operator-tag=${GITHUB_REF_NAME}-${sha}-${ts}" >> $GITHUB_OUTPUT echo "pipeline-runner-tag=${GITHUB_REF_NAME}-latest" >> $GITHUB_OUTPUT + echo "cache-radix-operator-tag=cache-radix-operator-${GITHUB_REF_NAME}" >> $GITHUB_OUTPUT + echo "cache-pipeline-runner-tag=cache-pipeline-runner-${GITHUB_REF_NAME}" >> $GITHUB_OUTPUT - name: Extract labels from metadata for Docker if: matrix.target.ref == github.ref @@ -153,6 +156,8 @@ jobs: linux/arm64 tags: "${{ steps.build-image-names.outputs.radix-operator-image-name }}:${{ steps.build-tags.outputs.radix-operator-tag }}" labels: ${{ steps.radix-operator-meta.outputs.labels }} + cache-from: "type=registry,ref=${{ steps.build-image-names.outputs.radix-operator-image-name }}:${{ steps.build-tags.outputs.cache-radix-operator-tag }}" + cache-to: "type=registry,ref=${{ steps.build-image-names.outputs.radix-operator-image-name }}:${{ steps.build-tags.outputs.cache-radix-operator-tag }},mode=max" - name: Build and push pipeline-runner docker image if: matrix.target.ref == github.ref @@ -166,6 +171,8 @@ jobs: linux/arm64 tags: "${{ steps.build-image-names.outputs.pipeline-runner-image-name }}:${{ steps.build-tags.outputs.pipeline-runner-tag }}" labels: ${{ steps.pipeline-runner-meta.outputs.labels }} + cache-from: "type=registry,ref=${{ steps.build-image-names.outputs.pipeline-runner-image-name }}:${{ steps.build-tags.outputs.cache-pipeline-runner-tag }}" + cache-to: "type=registry,ref=${{ steps.build-image-names.outputs.pipeline-runner-image-name }}:${{ steps.build-tags.outputs.cache-pipeline-runner-tag }},mode=max" - name: Revoke GitHub IP on ACR if: ${{ matrix.target.ref == github.ref && steps.update_firewall.outcome == 'success' && !cancelled()}} # Always run this step even if previous step failed diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index e249e5925..fe0b9be71 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -3,11 +3,13 @@ on: pull_request: branches: - master - jobs: build: name: pull-request-check runs-on: ubuntu-latest + strategy: + matrix: + arch: [arm64] steps: - uses: actions/checkout@v4 - name: Set up Docker Buildx @@ -34,6 +36,9 @@ jobs: radix-operator-test: name: Pipeline-runner unit tests runs-on: ubuntu-latest + strategy: + matrix: + arch: [arm64] steps: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 @@ -51,6 +56,9 @@ jobs: pipeline-runner-test: name: Pipeline-runner unit tests runs-on: ubuntu-latest + strategy: + matrix: + arch: [arm64] steps: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 @@ -68,6 +76,9 @@ jobs: radix-operator-lint: name: Lint runs-on: ubuntu-latest + strategy: + matrix: + arch: [arm64] steps: - uses: actions/checkout@v4 with: @@ -84,6 +95,9 @@ jobs: verify-code-generation: name: Verify Code Generation runs-on: ubuntu-latest + strategy: + matrix: + arch: [arm64] steps: - uses: actions/checkout@v4 - uses: actions/setup-go@v5 diff --git a/charts/radix-operator/Chart.yaml b/charts/radix-operator/Chart.yaml index a019075d1..c3034802d 100644 --- a/charts/radix-operator/Chart.yaml +++ b/charts/radix-operator/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 name: radix-operator -version: 1.37.6 -appVersion: 1.57.5 +version: 1.37.7 +appVersion: 1.57.6 kubeVersion: ">=1.24.0" description: Radix Operator keywords: diff --git a/pkg/apis/deployment/deployment_test.go b/pkg/apis/deployment/deployment_test.go index 035044b78..e75a30fe7 100644 --- a/pkg/apis/deployment/deployment_test.go +++ b/pkg/apis/deployment/deployment_test.go @@ -682,7 +682,7 @@ func TestObjectSynced_MultiJob_ContainsAllElements(t *testing.T) { expectedAffinity := &corev1.Affinity{ NodeAffinity: &corev1.NodeAffinity{RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{NodeSelectorTerms: []corev1.NodeSelectorTerm{{MatchExpressions: []corev1.NodeSelectorRequirement{ {Key: corev1.LabelOSStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorOS}}, - {Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorArchitecture}}, + {Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{string(radixv1.RuntimeArchitectureArm64)}}, }}}}}, PodAntiAffinity: &corev1.PodAntiAffinity{PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{{Weight: 1, PodAffinityTerm: corev1.PodAffinityTerm{TopologyKey: corev1.LabelHostname, LabelSelector: &metav1.LabelSelector{MatchExpressions: []metav1.LabelSelectorRequirement{ {Key: kube.RadixAppLabel, Operator: metav1.LabelSelectorOpIn, Values: []string{appName}}, @@ -2977,7 +2977,7 @@ func TestUseGpuNodeOnDeploy(t *testing.T) { expectedAffinity := &corev1.Affinity{ NodeAffinity: &corev1.NodeAffinity{RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{NodeSelectorTerms: []corev1.NodeSelectorTerm{{MatchExpressions: []corev1.NodeSelectorRequirement{ {Key: corev1.LabelOSStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorOS}}, - {Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorArchitecture}}, + {Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{string(radixv1.RuntimeArchitectureArm64)}}, }}}}}, PodAntiAffinity: &corev1.PodAntiAffinity{PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{{Weight: 1, PodAffinityTerm: corev1.PodAffinityTerm{TopologyKey: corev1.LabelHostname, LabelSelector: &metav1.LabelSelector{MatchExpressions: []metav1.LabelSelectorRequirement{ {Key: kube.RadixAppLabel, Operator: metav1.LabelSelectorOpIn, Values: []string{anyAppName}}, @@ -3153,22 +3153,10 @@ func TestUseGpuNodeCountOnDeployment(t *testing.T) { WithNodeGpuCount(nodeGpuCount10))) require.NoError(t, err) - defaultAffinityBuilder := func(componentName string) *corev1.Affinity { - return &corev1.Affinity{ - NodeAffinity: &corev1.NodeAffinity{RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{NodeSelectorTerms: []corev1.NodeSelectorTerm{{MatchExpressions: []corev1.NodeSelectorRequirement{ - {Key: corev1.LabelOSStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorOS}}, - {Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorArchitecture}}, - }}}}}, - PodAntiAffinity: &corev1.PodAntiAffinity{PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{{Weight: 1, PodAffinityTerm: corev1.PodAffinityTerm{TopologyKey: corev1.LabelHostname, LabelSelector: &metav1.LabelSelector{MatchExpressions: []metav1.LabelSelectorRequirement{ - {Key: kube.RadixAppLabel, Operator: metav1.LabelSelectorOpIn, Values: []string{anyAppName}}, - {Key: kube.RadixComponentLabel, Operator: metav1.LabelSelectorOpIn, Values: []string{componentName}}, - }}}}}}, - } - } t.Run("missing node.gpu", func(t *testing.T) { t.Parallel() deployment, _ := client.AppsV1().Deployments(envNamespace).Get(context.Background(), componentName1, metav1.GetOptions{}) - assert.Equal(t, defaultAffinityBuilder(componentName1), deployment.Spec.Template.Spec.Affinity) + assert.Equal(t, getDefaultComponentAffinityBuilder(componentName1, anyAppName), deployment.Spec.Template.Spec.Affinity) tolerations := deployment.Spec.Template.Spec.Tolerations assert.Len(t, tolerations, 0) // missing node.gpu }) @@ -3199,35 +3187,35 @@ func TestUseGpuNodeCountOnDeployment(t *testing.T) { t.Run("has node with gpu-count 0", func(t *testing.T) { t.Parallel() deployment, _ := client.AppsV1().Deployments(envNamespace).Get(context.Background(), componentName3, metav1.GetOptions{}) - assert.Equal(t, defaultAffinityBuilder(componentName3), deployment.Spec.Template.Spec.Affinity) + assert.Equal(t, getDefaultComponentAffinityBuilder(componentName3, anyAppName), deployment.Spec.Template.Spec.Affinity) tolerations := deployment.Spec.Template.Spec.Tolerations assert.Len(t, tolerations, 0) }) t.Run("has node with gpu-count -1", func(t *testing.T) { t.Parallel() deployment, _ := client.AppsV1().Deployments(envNamespace).Get(context.Background(), componentName4, metav1.GetOptions{}) - assert.Equal(t, defaultAffinityBuilder(componentName4), deployment.Spec.Template.Spec.Affinity) + assert.Equal(t, getDefaultComponentAffinityBuilder(componentName4, anyAppName), deployment.Spec.Template.Spec.Affinity) tolerations := deployment.Spec.Template.Spec.Tolerations assert.Len(t, tolerations, 0) }) t.Run("has node with invalid value of gpu-count", func(t *testing.T) { t.Parallel() deployment, _ := client.AppsV1().Deployments(envNamespace).Get(context.Background(), componentName5, metav1.GetOptions{}) - assert.Equal(t, defaultAffinityBuilder(componentName5), deployment.Spec.Template.Spec.Affinity) + assert.Equal(t, getDefaultComponentAffinityBuilder(componentName5, anyAppName), deployment.Spec.Template.Spec.Affinity) tolerations := deployment.Spec.Template.Spec.Tolerations assert.Len(t, tolerations, 0) }) t.Run("has node with no gpu-count", func(t *testing.T) { t.Parallel() deployment, _ := client.AppsV1().Deployments(envNamespace).Get(context.Background(), componentName6, metav1.GetOptions{}) - assert.Equal(t, defaultAffinityBuilder(componentName6), deployment.Spec.Template.Spec.Affinity) + assert.Equal(t, getDefaultComponentAffinityBuilder(componentName6, anyAppName), deployment.Spec.Template.Spec.Affinity) tolerations := deployment.Spec.Template.Spec.Tolerations assert.Len(t, tolerations, 0) }) t.Run("job has node, but pod template of Job Scheduler does not have it", func(t *testing.T) { t.Parallel() deployment, _ := client.AppsV1().Deployments(envNamespace).Get(context.Background(), jobComponentName, metav1.GetOptions{}) - assert.Equal(t, defaultAffinityBuilder(jobComponentName), deployment.Spec.Template.Spec.Affinity) + assert.Equal(t, getDefaultJobComponentAffinityBuilder(anyAppName, jobComponentName), deployment.Spec.Template.Spec.Affinity) tolerations := deployment.Spec.Template.Spec.Tolerations assert.Len(t, tolerations, 0) }) @@ -3290,7 +3278,7 @@ func TestUseGpuNodeWithGpuCountOnDeployment(t *testing.T) { expectedAffinity := &corev1.Affinity{ NodeAffinity: &corev1.NodeAffinity{RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{NodeSelectorTerms: []corev1.NodeSelectorTerm{{MatchExpressions: []corev1.NodeSelectorRequirement{ {Key: corev1.LabelOSStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorOS}}, - {Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorArchitecture}}, + {Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{string(radixv1.RuntimeArchitectureArm64)}}, }}}}}, PodAntiAffinity: &corev1.PodAntiAffinity{PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{{Weight: 1, PodAffinityTerm: corev1.PodAffinityTerm{TopologyKey: corev1.LabelHostname, LabelSelector: &metav1.LabelSelector{MatchExpressions: []metav1.LabelSelectorRequirement{ {Key: kube.RadixAppLabel, Operator: metav1.LabelSelectorOpIn, Values: []string{anyAppName}}, @@ -4569,3 +4557,32 @@ func getPortByName(name string, ports []corev1.ContainerPort) *corev1.ContainerP } return nil } + +func getDefaultComponentAffinityBuilder(componentName string, appName string) *corev1.Affinity { + return &corev1.Affinity{NodeAffinity: getLinuxAmd64NodeAffinity(), PodAntiAffinity: getComponentPodAntiAffinity(appName, componentName)} +} + +func getDefaultJobComponentAffinityBuilder(appName string, jobComponentName string) *corev1.Affinity { + return &corev1.Affinity{NodeAffinity: getLinuxArm64NodeAffinity(), PodAntiAffinity: getComponentPodAntiAffinity(appName, jobComponentName)} +} + +func getComponentPodAntiAffinity(anyAppName string, componentName string) *corev1.PodAntiAffinity { + return &corev1.PodAntiAffinity{PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{{Weight: 1, PodAffinityTerm: corev1.PodAffinityTerm{TopologyKey: corev1.LabelHostname, LabelSelector: &metav1.LabelSelector{MatchExpressions: []metav1.LabelSelectorRequirement{ + {Key: kube.RadixAppLabel, Operator: metav1.LabelSelectorOpIn, Values: []string{anyAppName}}, + {Key: kube.RadixComponentLabel, Operator: metav1.LabelSelectorOpIn, Values: []string{componentName}}, + }}}}}} +} + +func getLinuxArm64NodeAffinity() *corev1.NodeAffinity { + return &corev1.NodeAffinity{RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{NodeSelectorTerms: []corev1.NodeSelectorTerm{{MatchExpressions: []corev1.NodeSelectorRequirement{ + {Key: corev1.LabelOSStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorOS}}, + {Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{string(radixv1.RuntimeArchitectureArm64)}}, + }}}}} +} + +func getLinuxAmd64NodeAffinity() *corev1.NodeAffinity { + return &corev1.NodeAffinity{RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{NodeSelectorTerms: []corev1.NodeSelectorTerm{{MatchExpressions: []corev1.NodeSelectorRequirement{ + {Key: corev1.LabelOSStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorOS}}, + {Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{defaults.DefaultNodeSelectorArchitecture}}, + }}}}} +} diff --git a/pkg/apis/deployment/jobschedulercomponent.go b/pkg/apis/deployment/jobschedulercomponent.go index bd705c50c..e2454df5f 100644 --- a/pkg/apis/deployment/jobschedulercomponent.go +++ b/pkg/apis/deployment/jobschedulercomponent.go @@ -86,7 +86,7 @@ func (js *jobSchedulerComponent) GetNode() *radixv1.RadixNode { } func (js *jobSchedulerComponent) GetRuntime() *radixv1.Runtime { - return &radixv1.Runtime{Architecture: radixv1.RuntimeArchitectureAmd64} + return &radixv1.Runtime{Architecture: radixv1.RuntimeArchitectureArm64} } func isDeployComponentJobSchedulerDeployment(deployComponent radixv1.RadixCommonDeployComponent) bool {