diff --git a/.github/container/build-jax.sh b/.github/container/build-jax.sh
index 95cf5246b..b179e9456 100755
--- a/.github/container/build-jax.sh
+++ b/.github/container/build-jax.sh
@@ -185,6 +185,8 @@ case "${CPU_ARCH}" in
         ;;
     "arm64")
         export CC_OPT_FLAGS="-march=armv8-a"
+        # ARM ACL build issue introduced in PR#23225
+        BUILD_PARAM="${BUILD_PARAM} --disable_mkl_dnn"
         ;;
 esac
 
diff --git a/.github/container/test-maxtext.sh b/.github/container/test-maxtext.sh
index 90e7c2488..ebb2afcdc 100755
--- a/.github/container/test-maxtext.sh
+++ b/.github/container/test-maxtext.sh
@@ -174,6 +174,9 @@ if [ $DTYPE == "fp8" ]; then
 fi
 
 GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU')
+if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then
+    GPUS_PER_NODE=$(python -c 'import os; x=os.environ.get("CUDA_VISIBLE_DEVICES", ""); print(len(x.split(",")))')
+fi
 NGPUS=$((GPUS_PER_NODE * NODES))
 
 # Heuristic to figure out ici and dcn of DP
diff --git a/.github/eks-workflow-files/job.yml b/.github/eks-workflow-files/job.yml
index 463f0ee31..4d33ea840 100644
--- a/.github/eks-workflow-files/job.yml
+++ b/.github/eks-workflow-files/job.yml
@@ -11,6 +11,8 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: PLACEHOLDER
+  labels:
+    kueue.x-k8s.io/queue-name: p5-queue
 spec:
   completions: 2 # number of nodes
   parallelism: 2 # number of nodes
diff --git a/.github/eks-workflow-files/mpi-nccl-test.yml b/.github/eks-workflow-files/mpi-nccl-test.yml
index 1f0503214..b3207cc1f 100644
--- a/.github/eks-workflow-files/mpi-nccl-test.yml
+++ b/.github/eks-workflow-files/mpi-nccl-test.yml
@@ -2,16 +2,15 @@ apiVersion: kubeflow.org/v2beta1
 kind: MPIJob
 metadata:
   name: PLACEHOLDER
+  labels:
+    kueue.x-k8s.io/queue-name: p5-queue
 spec:
-  # Without this then the first few attempts to run the launcher will result in errors
-  # due to failed DNS resolution of the worker names. It works eventually, given a big
-  # enough backoffLimit, but it makes it harder to handle log-streaming and identifying
-  # the "real" exit code of the job.
-  launcherCreationPolicy: WaitForWorkersReady
   runPolicy:
     cleanPodPolicy: Running
-    # surface errors direct to GitHub Actions without internal retries
+    # surface errors directly to GitHub Actions without Kubernetes-internal retries
     backoffLimit: 0
+    # start suspended; Kueue un-suspends the job once resources are available
+    suspend: true
   # 1 MPI rank per GPU
   slotsPerWorker: 8
   mpiReplicaSpecs:
@@ -27,25 +26,31 @@ spec:
               imagePullPolicy: IfNotPresent
               name: PLACEHOLDER
               command:
-                - mpirun
-                - --allow-run-as-root
-                - -np
-                - "16"
-                - -N
-                - "8"
-                - PLACEHOLDER
-                - -b
-                - "8"
-                - -e
-                - "16G"
-                - -f
-                - "2"
-                - -g
-                - "1"
+                - bash
                 - -c
-                - "1"
-                - -n
-                - "100"
+                - |
+                  # kueue breaks the WaitForWorkersReady policy that mpi-operator
+                  # nominally supports, so manually wait a while for a basic mpirun to
+                  # start working (i.e. for the workers to be ready) before doing
+                  # anything interesting, instead of relying on mpi-operator not to
+                  # start the launcher before it is expected to succeed. This issue
+                  # seems related: https://github.com/kubeflow/mpi-operator/pull/617
+                  limit=5m
+                  if ! timeout ${limit} sh -c "while ! mpirun --allow-run-as-root -N 1 hostname; do sleep 5; done"; then
+                    echo "Workers were still not reachable after ${limit}, exiting"
+                    exit 1
+                  fi
+                  mpirun --allow-run-as-root -np 16 -N 8 $0 \
+                    -b 8 \
+                    -e 16G \
+                    -f 2 \
+                    -g 1 \
+                    -c 1 \
+                    -n 100
+                - PLACEHOLDER
+              resources:
+                limits:
+                  cpu: 1
           imagePullSecrets:
             - name: PLACEHOLDER
     Worker:
diff --git a/.github/eks-workflow-files/post-process-job.yml b/.github/eks-workflow-files/post-process-job.yml
index 989ddebe2..a2b4e491f 100644
--- a/.github/eks-workflow-files/post-process-job.yml
+++ b/.github/eks-workflow-files/post-process-job.yml
@@ -32,10 +32,6 @@ spec:
             - pipefail
            - -c
             - nsys-jax-combine -o /opt/output/combined.zip /opt/output/*.zip --analysis communication
-          # FIXME: GPU not actually needed, but the test cluster doesn't have appropriate non-GPU nodes
-          resources:
-            limits:
-              nvidia.com/gpu: 1
           volumeMounts:
             - mountPath: /opt/output
               name: output
diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
index 0848a6e11..b4f3b8143 100644
--- a/.github/workflows/_ci.yaml
+++ b/.github/workflows/_ci.yaml
@@ -452,12 +452,6 @@ jobs:
     steps:
       - name: Check out the repository
         uses: actions/checkout@v4
-      - name: Install yq
-        run: |
-          mkdir local_bin/
-          curl -L -o ./local_bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture)
-          chmod 777 ./local_bin/yq
-          echo "${PWD}/local_bin" >> "${GITHUB_PATH}"
       - name: Login to GitHub Container Registry
         uses: docker/login-action@v3
         with:
diff --git a/.github/workflows/nccl-k8s.yaml b/.github/workflows/nccl-k8s.yaml
index b660fb44b..c9c688a1d 100644
--- a/.github/workflows/nccl-k8s.yaml
+++ b/.github/workflows/nccl-k8s.yaml
@@ -53,12 +53,6 @@ jobs:
     steps:
       - name: Check out the repository
         uses: actions/checkout@v4
-      - name: Install yq
-        run: |
-          mkdir local_bin/
-          curl -L -o ./local_bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture)
-          chmod 777 ./local_bin/yq
-          echo "${PWD}/local_bin" >> "${GITHUB_PATH}"
       - name: Login to GitHub Container Registry
         uses: docker/login-action@v3
         with:
@@ -86,7 +80,7 @@ jobs:
           | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
           | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
           | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
-          | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[6] = strenv(TEST_NAME)
+          | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
           | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
           | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
           | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
@@ -97,19 +91,21 @@ jobs:
       - name: Wait for Kubernetes job to start
         # Note that this is *not* using JOB_NAME
         run: |
-          # Launcher job is only created once the workers are ready; wait for its
-          # creation. This is where we block if the cluster is busy executing other jobs,
-          # but it might be better to impose more of a parallelism limit at the GitHub
-          # Actions level to keep the Kubernetes queue length modest
-          kubectl wait --for=create job/${LAUNCHER_NAME} --timeout=3600s
+          # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
+          # resources are available, but that is where there can be a long wait if the
+          # cluster is busy executing other jobs.
+          kubectl wait --for=create job/${LAUNCHER_NAME}
+          kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=3600s
+      - name: Stream Kubernetes job output
+        # Note that this is *not* JOB_NAME
+        run: |
           # Streaming logs will fail if the container/pod is still pending
           while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
             sleep 1
           done
-      - name: Stream Kubernetes job output
-        # Note that this is *not* JOB_NAME
-        # TODO: --all-containers=true --all-pods=true could make sense here
-        run: kubectl logs --follow job/${LAUNCHER_NAME}
+          # TODO: --all-containers=true --all-pods=true could make sense here, but it
+          # prefixes lines with a rather verbose tag
+          kubectl logs --follow job/${LAUNCHER_NAME}
       - name: Retrieve Kubernetes job status
         shell: bash -exo pipefail {0}
         run: |
@@ -135,7 +131,7 @@ jobs:
         run: |
           # Provide better debug in case of launch failures that will not produce log output
           pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
-          if [[ -n "${powd}" ]]; then
+          if [[ -n "${pods}" ]]; then
             kubectl describe ${pods}
           fi
           # Clean up in case of errors as well as success
diff --git a/README.md b/README.md
index 78835517b..4438f7efc 100644
--- a/README.md
+++ b/README.md
@@ -351,6 +351,7 @@ For a list of previously used XLA flags that are no longer needed, please also r
 
 | First nightly with new base container | Base container |
 | ------------------------------------- | -------------- |
+| 2024-12-07                            | nvidia/cuda:12.6.3-devel-ubuntu22.04 |
 | 2024-11-06                            | nvidia/cuda:12.6.2-devel-ubuntu22.04 |
 | 2024-09-25                            | nvidia/cuda:12.6.1-devel-ubuntu22.04 |
 | 2024-07-24                            | nvidia/cuda:12.5.0-devel-ubuntu22.04 |
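
Note on the command[6] -> command[3] index change in nccl-k8s.yaml: after this patch the launcher command array is [bash, -c, <script>, PLACEHOLDER], so the element that yq overwrites with TEST_NAME sits at index 3. bash -c binds the first argument after the script to $0, which is how the inner "mpirun ... $0" receives the NCCL test binary. A minimal sketch of the mechanism (the binary name all_reduce_perf is illustrative only, not taken from this patch):

    # command array after this change: [bash, -c, <script>, PLACEHOLDER]
    # yq indices:                       0     1   2         3
    bash -c 'echo "test binary via \$0: $0"' all_reduce_perf
    # prints: test binary via $0: all_reduce_perf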
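
Note on the GPUS_PER_NODE override in test-maxtext.sh: the python one-liner simply counts comma-separated entries in CUDA_VISIBLE_DEVICES and does not validate them against the nvidia-smi inventory; the surrounding -n guard keeps the empty-string case (which would otherwise count as 1) on the nvidia-smi path. A pure-shell equivalent, shown only for illustration and not part of the patch:

    CUDA_VISIBLE_DEVICES=0,1,2,3
    IFS=, read -ra devices <<< "${CUDA_VISIBLE_DEVICES}"
    echo "${#devices[@]}"   # prints 4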