Skip to content

Commit

Permalink
Merge branch 'main' into mace
Browse files Browse the repository at this point in the history
  • Loading branch information
mariogeiger committed Dec 18, 2024
2 parents 48e0a90 + 5a74526 commit cbb3b60
Show file tree
Hide file tree
Showing 8 changed files with 50 additions and 51 deletions.
2 changes: 2 additions & 0 deletions .github/container/build-jax.sh
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ case "${CPU_ARCH}" in
;;
"arm64")
export CC_OPT_FLAGS="-march=armv8-a"
# ARM ACL build issue introduced in PR#23225
BUILD_PARAM="${BUILD_PARAM} --disable_mkl_dnn"
;;
esac

Expand Down
3 changes: 3 additions & 0 deletions .github/container/test-maxtext.sh
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,9 @@ if [ $DTYPE == "fp8" ]; then
fi

GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU')
if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then
GPUS_PER_NODE=`python -c 'import os; x=os.environ.get("CUDA_VISIBLE_DEVICES", ""); print(len(x.split(",")))'`
fi
NGPUS=$((GPUS_PER_NODE * NODES))

# Heuristic to figure out ici and dcn of DP
Expand Down
2 changes: 2 additions & 0 deletions .github/eks-workflow-files/job.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ apiVersion: batch/v1
kind: Job
metadata:
name: PLACEHOLDER
labels:
kueue.x-k8s.io/queue-name: p5-queue
spec:
completions: 2 # number of nodes
parallelism: 2 # number of nodes
Expand Down
53 changes: 29 additions & 24 deletions .github/eks-workflow-files/mpi-nccl-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,15 @@ apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
name: PLACEHOLDER
labels:
kueue.x-k8s.io/queue-name: p5-queue
spec:
# Without this then the first few attempts to run the launcher will result in errors
# due to failed DNS resolution of the worker names. It works eventually, given a big
# enough backoffLimit, but it makes it harder to handle log-streaming and identifying
# the "real" exit code of the job.
launcherCreationPolicy: WaitForWorkersReady
runPolicy:
cleanPodPolicy: Running
# surface errors direct to GitHub Actions without internal retries
# surface errors direct to GitHub Actions without Kubernetes-internal retries
backoffLimit: 0
# start suspended, let kueue unblock
suspend: true
# 1 MPI rank per GPU
slotsPerWorker: 8
mpiReplicaSpecs:
Expand All @@ -27,25 +26,31 @@ spec:
imagePullPolicy: IfNotPresent
name: PLACEHOLDER
command:
- mpirun
- --allow-run-as-root
- -np
- "16"
- -N
- "8"
- PLACEHOLDER
- -b
- "8"
- -e
- "16G"
- -f
- "2"
- -g
- "1"
- bash
- -c
- "1"
- -n
- "100"
- |
# kueue breaks the WaitForWorkersReady policy that mpi-operator
# nominally supports, so manually wait a while for a basic mpirun to
# start working (i.e. for the workers to be ready) before doing
# anything interesting, instead of relying on mpi-operator not to
# start the launcher before it is expected to succeed. This issue
# seems related: https://github.com/kubeflow/mpi-operator/pull/617
limit=5m
if ! timeout ${limit} sh -c "while ! mpirun --allow-run-as-root -N 1 hostname; do sleep 5; done"; then
echo "Workers were still not reachable after ${limit}, exiting"
exit 1
fi
mpirun --allow-run-as-root -np 16 -N 8 $0 \
-b 8 \
-e 16G \
-f 2 \
-g 1 \
-c 1 \
-n 100
- PLACEHOLDER
resources:
limits:
cpu: 1
imagePullSecrets:
- name: PLACEHOLDER
Worker:
Expand Down
4 changes: 0 additions & 4 deletions .github/eks-workflow-files/post-process-job.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,6 @@ spec:
- pipefail
- -c
- nsys-jax-combine -o /opt/output/combined.zip /opt/output/*.zip --analysis communication
# FIXME: GPU not actually needed, but the test cluster doesn't have appropriate non-GPU nodes
resources:
limits:
nvidia.com/gpu: 1
volumeMounts:
- mountPath: /opt/output
name: output
Expand Down
6 changes: 0 additions & 6 deletions .github/workflows/_ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -452,12 +452,6 @@ jobs:
steps:
- name: Check out the repository
uses: actions/checkout@v4
- name: Install yq
run: |
mkdir local_bin/
curl -L -o ./local_bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture)
chmod 777 ./local_bin/yq
echo "${PWD}/local_bin" >> "${GITHUB_PATH}"
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
Expand Down
30 changes: 13 additions & 17 deletions .github/workflows/nccl-k8s.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,6 @@ jobs:
steps:
- name: Check out the repository
uses: actions/checkout@v4
- name: Install yq
run: |
mkdir local_bin/
curl -L -o ./local_bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture)
chmod 777 ./local_bin/yq
echo "${PWD}/local_bin" >> "${GITHUB_PATH}"
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
Expand Down Expand Up @@ -86,7 +80,7 @@ jobs:
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
| .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[6] = strenv(TEST_NAME)
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
| .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
| .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
| .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
Expand All @@ -97,19 +91,21 @@ jobs:
- name: Wait for Kubernetes job to start
# Note that this is *not* using JOB_NAME
run: |
# Launcher job is only created once the workers are ready; wait for its
# creation. This is where we block if the cluster is busy executing other jobs,
# but it might be better to impose more of a parallelism limit at the GitHub
# Actions level to keep the Kubernetes queue length modest
kubectl wait --for=create job/${LAUNCHER_NAME} --timeout=3600s
# Launcher job is created eagerly, but suspended. Kueue un-suspends it when
# resources are available, but that is where there can be a long wait if the
# cluster is busy executing other jobs.
kubectl wait --for=create job/${LAUNCHER_NAME}
kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=3600s
- name: Stream Kubernetes job output
# Note that this is *not* JOB_NAME
run: |
# Streaming logs will fail if the container/pod is still pending
while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
sleep 1
done
- name: Stream Kubernetes job output
# Note that this is *not* JOB_NAME
# TODO: --all-containers=true --all-pods=true could make sense here
run: kubectl logs --follow job/${LAUNCHER_NAME}
# TODO: --all-containers=true --all-pods=true could make sense here, but it
# prefixes lines with a rather verbose tag
kubectl logs --follow job/${LAUNCHER_NAME}
- name: Retrieve Kubernetes job status
shell: bash -exo pipefail {0}
run: |
Expand All @@ -135,7 +131,7 @@ jobs:
run: |
# Provide better debug in case of launch failures that will not produce log output
pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
if [[ -n "${powd}" ]]; then
if [[ -n "${pods}" ]]; then
kubectl describe ${pods}
fi
# Clean up in case of errors as well as success
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,7 @@ For a list of previously used XLA flags that are no longer needed, please also r

| First nightly with new base container | Base container |
| ------------------------------------- | -------------- |
| 2024-12-07 | nvidia/cuda:12.6.3-devel-ubuntu22.04 |
| 2024-11-06 | nvidia/cuda:12.6.2-devel-ubuntu22.04 |
| 2024-09-25 | nvidia/cuda:12.6.1-devel-ubuntu22.04 |
| 2024-07-24 | nvidia/cuda:12.5.0-devel-ubuntu22.04 |
Expand Down

0 comments on commit cbb3b60

Please sign in to comment.