Patch summary: both unit-test workflows (4-GPU and CPU) drop their inline
conda-based job definition and the workflow-level `defaults.run.shell`
setting, and instead call the reusable workflow
pytorch/test-infra/.github/workflows/linux_job.yml@main, passing the install
and test commands through its `script` input. The Codecov upload step is
removed; a commented-out `upload-coverage` stub is left in its place.

NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been flattened. Line counts match the hunk headers, but the
exact leading whitespace of the YAML lines is reconstructed -- confirm
against the target files before applying (or apply with
`git apply --ignore-whitespace`).

diff --git a/.github/workflows/unit_test_4gpu.yaml b/.github/workflows/unit_test_4gpu.yaml
index 0088bb3e..c9247688 100644
--- a/.github/workflows/unit_test_4gpu.yaml
+++ b/.github/workflows/unit_test_4gpu.yaml
@@ -9,34 +9,24 @@ concurrency:
   group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
   cancel-in-progress: true
 
-defaults:
-  run:
-    shell: bash -l -eo pipefail {0}
-
 jobs:
-  unit_tests_4gpu:
-    runs-on: linux.g5.12xlarge.nvidia.gpu
-    strategy:
-      matrix:
-        python-version: ['3.10']
-    steps:
-      - name: Check out repo
-        uses: actions/checkout@v3
-      - name: Setup conda env
-        uses: conda-incubator/setup-miniconda@v2
-        with:
-          auto-update-conda: true
-          miniconda-version: "latest"
-          activate-environment: test
-          python-version: ${{ matrix.python-version }}
-      - name: Update pip
-        run: python -m pip install --upgrade pip
-      - name: Install dependencies
-        run: |
-          pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
-          python -m pip install -r requirements.txt
-          python -m pip install -r dev-requirements.txt
-      - name: Run test_runner.py
-        run: python ./test_runner.py
-      - name: Upload Coverage to Codecov
-        uses: codecov/codecov-action@v3
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.g5.12xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      # This image is faster to clone than the default, but it lacks CC needed by triton
+      # docker-image: "pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime"
+      repository: "pytorch/torchtitan"
+      upload-artifact: "outputs"
+      script: |
+        pip config --user set global.progress_bar off
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
+        python -m pip install -r requirements.txt
+        python -m pip install -r dev-requirements.txt
+        python ./test_runner.py
+        mv outputs artifacts-to-be-uploaded
+    # upload-coverage:
+    #   - name: Upload Coverage to Codecov
+    #     uses: codecov/codecov-action@v3
diff --git a/.github/workflows/unit_test_cpu.yaml b/.github/workflows/unit_test_cpu.yaml
index d2b4814b..11461360 100644
--- a/.github/workflows/unit_test_cpu.yaml
+++ b/.github/workflows/unit_test_cpu.yaml
@@ -9,34 +9,17 @@ concurrency:
   group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
   cancel-in-progress: true
 
-defaults:
-  run:
-    shell: bash -l -eo pipefail {0}
-
 jobs:
-  cpu_unit_tests:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ['3.10']
-    steps:
-      - name: Check out repo
-        uses: actions/checkout@v3
-      - name: Setup conda env
-        uses: conda-incubator/setup-miniconda@v2
-        with:
-          auto-update-conda: true
-          miniconda-version: "latest"
-          activate-environment: test
-          python-version: ${{ matrix.python-version }}
-      - name: Update pip
-        run: python -m pip install --upgrade pip
-      - name: Install dependencies
-        run: |
-          pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
-          python -m pip install -r requirements.txt
-          python -m pip install -r dev-requirements.txt
-      - name: Run unit tests with coverage
-        run: pytest test --cov=. --cov-report=xml --durations=20 -vv
-      - name: Upload Coverage to Codecov
-        uses: codecov/codecov-action@v3
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      docker-image: "pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime"
+      repository: "pytorch/torchtitan"
+      script: |
+        pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
+        python -m pip install -r requirements.txt
+        python -m pip install -r dev-requirements.txt
+        pytest test --cov=. --cov-report=xml --durations=20 -vv
+    # upload-coverage:
+    #   - name: Upload Coverage to Codecov
+    #     uses: codecov/codecov-action@v3