From c2f28ab00f8e46a82dce0012cb89c56b5ab03b9c Mon Sep 17 00:00:00 2001 From: Will Constable Date: Fri, 23 Feb 2024 16:34:16 -0800 Subject: [PATCH 1/5] Add 4GPU unit test --- .github/workflows/unit_test_4gpu.yaml | 43 +++++++++++++++++++ .../{unit_test.yaml => unit_test_cpu.yaml} | 4 +- 2 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/unit_test_4gpu.yaml rename .github/workflows/{unit_test.yaml => unit_test_cpu.yaml} (97%) diff --git a/.github/workflows/unit_test_4gpu.yaml b/.github/workflows/unit_test_4gpu.yaml new file mode 100644 index 00000000..51f6a964 --- /dev/null +++ b/.github/workflows/unit_test_4gpu.yaml @@ -0,0 +1,43 @@ +name: 4 GPU Unit Test + +on: + push: + branches: [ main ] + pull_request: + +concurrency: + group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: bash -l -eo pipefail {0} + +jobs: + unit_tests_4gpu: + runs-on: linux.g5.12xlarge.nvidia.gpu + strategy: + matrix: + python-version: ['3.10'] + steps: + - name: Check out repo + uses: actions/checkout@v3 + - name: Setup conda env + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + miniconda-version: "latest" + activate-environment: test + python-version: ${{ matrix.python-version }} + - name: Update pip + run: python -m pip install --upgrade pip + - name: Install dependencies + run: | + pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 + python -m pip install -r requirements.txt + python -m pip install -r dev-requirements.txt + python -m pip install -e . + - name: Run unit tests with coverage + run: pytest test --cov=. --cov-report=xml --durations=20 -vv + - name: Upload Coverage to Codecov + uses: codecov/codecov-action@v3 diff --git a/.github/workflows/unit_test.yaml b/.github/workflows/unit_test_cpu.yaml similarity index 97% rename from .github/workflows/unit_test.yaml rename to .github/workflows/unit_test_cpu.yaml index bb71b892..354182ac 100644 --- a/.github/workflows/unit_test.yaml +++ b/.github/workflows/unit_test_cpu.yaml @@ -1,4 +1,4 @@ -name: Unit Test +name: CPU Unit Test on: push: @@ -14,7 +14,7 @@ defaults: shell: bash -l -eo pipefail {0} jobs: - unit_tests: + cpu_unit_tests: runs-on: ubuntu-latest strategy: matrix: From 25b2cb67ae820a7e6ddf0e6d340bd1f6958502a9 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Fri, 23 Feb 2024 16:42:51 -0800 Subject: [PATCH 2/5] try to run llama train script --- .github/workflows/unit_test_4gpu.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit_test_4gpu.yaml b/.github/workflows/unit_test_4gpu.yaml index 51f6a964..88c0e103 100644 --- a/.github/workflows/unit_test_4gpu.yaml +++ b/.github/workflows/unit_test_4gpu.yaml @@ -38,6 +38,7 @@ jobs: python -m pip install -r dev-requirements.txt python -m pip install -e . - name: Run unit tests with coverage - run: pytest test --cov=. --cov-report=xml --durations=20 -vv + #run: pytest test --cov=. --cov-report=xml --durations=20 -vv + run: NGPU=4 ./run_llama_train.sh - name: Upload Coverage to Codecov uses: codecov/codecov-action@v3 From a3bb5dc1031914c6535247f1725b069a90406574 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Fri, 23 Feb 2024 16:58:05 -0800 Subject: [PATCH 3/5] make names better --- .github/workflows/unit_test_4gpu.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/unit_test_4gpu.yaml b/.github/workflows/unit_test_4gpu.yaml index 88c0e103..5e53c08c 100644 --- a/.github/workflows/unit_test_4gpu.yaml +++ b/.github/workflows/unit_test_4gpu.yaml @@ -37,8 +37,7 @@ jobs: python -m pip install -r requirements.txt python -m pip install -r dev-requirements.txt python -m pip install -e . - - name: Run unit tests with coverage - #run: pytest test --cov=. --cov-report=xml --durations=20 -vv + - name: Run NGPU=4 ./run_llama_train.sh run: NGPU=4 ./run_llama_train.sh - name: Upload Coverage to Codecov uses: codecov/codecov-action@v3 From 38267d5a5d49e4e93306250552ca58648f510c07 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Fri, 23 Feb 2024 16:58:21 -0800 Subject: [PATCH 4/5] Break train.py to see if CI notices --- train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/train.py b/train.py index f3f3389e..30d55205 100644 --- a/train.py +++ b/train.py @@ -256,6 +256,8 @@ def main(job_config: JobConfig): ) scheduler.step() + raise RuntimeError("Testing that CI notices breakage") + checkpoint.save( train_state.step, force=(train_state.step == job_config.training.steps) ) From cd45ca5890307be8c73bfa1a3dcb1c8ffffefe70 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Fri, 23 Feb 2024 17:05:08 -0800 Subject: [PATCH 5/5] Revert "Break train.py to see if CI notices" This reverts commit 38267d5a5d49e4e93306250552ca58648f510c07. --- train.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/train.py b/train.py index 30d55205..f3f3389e 100644 --- a/train.py +++ b/train.py @@ -256,8 +256,6 @@ def main(job_config: JobConfig): ) scheduler.step() - raise RuntimeError("Testing that CI notices breakage") - checkpoint.save( train_state.step, force=(train_state.step == job_config.training.steps) )