From a2f1a246e9e0934c71303a76af5ace68728bf8ac Mon Sep 17 00:00:00 2001
From: Will Constable
Date: Fri, 17 May 2024 16:50:48 -0700
Subject: [PATCH] Update

[ghstack-poisoned]
---
 .github/workflows/unit_test_4gpu.yaml |  2 +-
 .github/workflows/unit_test_8gpu.yaml | 39 +++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/unit_test_8gpu.yaml

diff --git a/.github/workflows/unit_test_4gpu.yaml b/.github/workflows/unit_test_4gpu.yaml
index 5759349d..e59dff34 100644
--- a/.github/workflows/unit_test_4gpu.yaml
+++ b/.github/workflows/unit_test_4gpu.yaml
@@ -32,4 +32,4 @@ jobs:
 
         python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
         mkdir artifacts-to-be-uploaded
-        python ./test_runner.py artifacts-to-be-uploaded
+        python ./test_runner.py artifacts-to-be-uploaded --ngpu 4
diff --git a/.github/workflows/unit_test_8gpu.yaml b/.github/workflows/unit_test_8gpu.yaml
new file mode 100644
index 00000000..a22459f3
--- /dev/null
+++ b/.github/workflows/unit_test_8gpu.yaml
@@ -0,0 +1,39 @@
+# Runs ./test_runner.py with --ngpu 8 for pushes to main and for pull requests.
+# `name` feeds the concurrency group below via github.workflow, so it must be
+# unique per workflow: a name shared with another workflow (e.g. the 4 GPU one)
+# would put both in one group and cancel-in-progress would cancel one of them.
+name: 8 GPU Unit Test
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+
+concurrency:
+  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.g5.48xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      # This image is faster to clone than the default, but it lacks CC needed by triton
+      # (1m25s vs 2m37s).
+      docker-image: torchtitan-ubuntu-20.04-clang12
+      repository: pytorch/torchtitan
+      upload-artifact: outputs
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        pip config --user set global.progress_bar off
+
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
+        mkdir artifacts-to-be-uploaded
+        python ./test_runner.py artifacts-to-be-uploaded --ngpu 8