diff --git a/.github/workflows/unit_test_4gpu.yaml b/.github/workflows/unit_test_4gpu.yaml
index 0088bb3e..bef4238f 100644
--- a/.github/workflows/unit_test_4gpu.yaml
+++ b/.github/workflows/unit_test_4gpu.yaml
@@ -9,34 +9,26 @@ concurrency:
   group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
   cancel-in-progress: true
 
-defaults:
-  run:
-    shell: bash -l -eo pipefail {0}
-
 jobs:
-  unit_tests_4gpu:
-    runs-on: linux.g5.12xlarge.nvidia.gpu
-    strategy:
-      matrix:
-        python-version: ['3.10']
-    steps:
-      - name: Check out repo
-        uses: actions/checkout@v3
-      - name: Setup conda env
-        uses: conda-incubator/setup-miniconda@v2
-        with:
-          auto-update-conda: true
-          miniconda-version: "latest"
-          activate-environment: test
-          python-version: ${{ matrix.python-version }}
-      - name: Update pip
-        run: python -m pip install --upgrade pip
-      - name: Install dependencies
-        run: |
-          pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
-          python -m pip install -r requirements.txt
-          python -m pip install -r dev-requirements.txt
-      - name: Run test_runner.py
-        run: python ./test_runner.py
-      - name: Upload Coverage to Codecov
-        uses: codecov/codecov-action@v3
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.g5.12xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      # This image is faster to clone than the default, but it lacks CC needed by triton
+      # (1m25s vs 2m37s).
+      docker-image: "pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime"
+      repository: "pytorch/torchtitan"
+      upload-artifact: "outputs"
+      script: |
+        conda install -y -q git clang clangxx
+        export CC=clang
+        export CXX=clangxx
+        pip config --user set global.progress_bar off
+        pip install --upgrade pip
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
+        python -m pip install -r requirements.txt
+        python -m pip install -r dev-requirements.txt
+        mkdir artifacts-to-be-uploaded
+        python ./test_runner.py artifacts-to-be-uploaded
diff --git a/.github/workflows/unit_test_cpu.yaml b/.github/workflows/unit_test_cpu.yaml
index d2b4814b..ccb706ce 100644
--- a/.github/workflows/unit_test_cpu.yaml
+++ b/.github/workflows/unit_test_cpu.yaml
@@ -9,34 +9,14 @@ concurrency:
   group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
   cancel-in-progress: true
 
-defaults:
-  run:
-    shell: bash -l -eo pipefail {0}
-
 jobs:
-  cpu_unit_tests:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ['3.10']
-    steps:
-      - name: Check out repo
-        uses: actions/checkout@v3
-      - name: Setup conda env
-        uses: conda-incubator/setup-miniconda@v2
-        with:
-          auto-update-conda: true
-          miniconda-version: "latest"
-          activate-environment: test
-          python-version: ${{ matrix.python-version }}
-      - name: Update pip
-        run: python -m pip install --upgrade pip
-      - name: Install dependencies
-        run: |
-          pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
-          python -m pip install -r requirements.txt
-          python -m pip install -r dev-requirements.txt
-      - name: Run unit tests with coverage
-        run: pytest test --cov=. --cov-report=xml --durations=20 -vv
-      - name: Upload Coverage to Codecov
-        uses: codecov/codecov-action@v3
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      docker-image: "pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime"
+      repository: "pytorch/torchtitan"
+      script: |
+        pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
+        python -m pip install -r requirements.txt
+        python -m pip install -r dev-requirements.txt
+        pytest test --cov=. --cov-report=xml --durations=20 -vv
diff --git a/test_runner.py b/test_runner.py
index 80d75ad8..e6b03f51 100755
--- a/test_runner.py
+++ b/test_runner.py
@@ -4,9 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import glob
+import argparse
 import os
-import shutil
 import subprocess
 from collections import defaultdict
 from dataclasses import dataclass
@@ -18,6 +17,11 @@
     import tomli as tomllib
 
 
+parser = argparse.ArgumentParser()
+parser.add_argument("output_dir")
+args = parser.parse_args()
+
+
 @dataclass
 class OverrideDefinitions:
     """
@@ -29,7 +33,6 @@ class OverrideDefinitions:
 
 
 CONFIG_DIR = "./train_configs"
-test_checkpoint_dir = "./test_runner_checkpoint"
 
 """
 key is the config file name and value is a list of OverrideDefinitions
@@ -40,13 +43,27 @@ class OverrideDefinitions:
 integration_tests_flavors["debug_model.toml"] = [
     OverrideDefinitions(
         [
-            ["--training.compile"],
+            [
+                f"--job.dump_folder {args.output_dir}/default/",
+            ],
+        ],
+        "Default",
+    ),
+    OverrideDefinitions(
+        [
+            [
+                "--training.compile",
+                f"--job.dump_folder {args.output_dir}/1d_compile/",
+            ],
         ],
         "1D compile",
     ),
     OverrideDefinitions(
         [
-            ["--training.tensor_parallel_degree 2 --model.norm_type=rmsnorm"],
+            [
+                "--training.tensor_parallel_degree 2 --model.norm_type=rmsnorm",
+                f"--job.dump_folder {args.output_dir}/eager_2d/",
+            ],
         ],
         "Eager mode 2DParallel",
     ),
@@ -54,11 +71,11 @@ class OverrideDefinitions:
         [
             [
                 "--checkpoint.enable_checkpoint",
-                f"--checkpoint.folder {test_checkpoint_dir}_full_checkpoint",
+                f"--job.dump_folder {args.output_dir}/full_checkpoint/",
             ],
             [
                 "--checkpoint.enable_checkpoint",
-                f"--checkpoint.folder {test_checkpoint_dir}_full_checkpoint",
+                f"--job.dump_folder {args.output_dir}/full_checkpoint/",
                 "--training.steps 20",
             ],
         ],
@@ -68,7 +85,7 @@ class OverrideDefinitions:
         [
             [
                 "--checkpoint.enable_checkpoint",
-                f"--checkpoint.folder {test_checkpoint_dir}_model_weights_only_fp32",
+                f"--job.dump_folder {args.output_dir}/model_weights_only_fp32/",
                 "--checkpoint.model_weights_only",
             ],
         ],
@@ -78,7 +95,7 @@ class OverrideDefinitions:
         [
             [
                 "--checkpoint.enable_checkpoint",
-                f"--checkpoint.folder {test_checkpoint_dir}_model_weights_only_bf16",
+                f"--job.dump_folder {args.output_dir}/model_weights_only_bf16/",
                 "--checkpoint.model_weights_only",
                 "--checkpoint.export_dtype bfloat16",
             ],
@@ -118,15 +135,5 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str):
             config = tomllib.load(f)
             is_integration_test = config["job"].get("use_for_integration_test", False)
             if is_integration_test:
-                test_flavors = [OverrideDefinitions()] + integration_tests_flavors[
-                    config_file
-                ]
-
-                for test_flavor in test_flavors:
+                for test_flavor in integration_tests_flavors[config_file]:
                     run_test(test_flavor, full_path)
-
-                # Deleting checkpoint folder from test
-                dir_list = glob.iglob(f"{test_checkpoint_dir}_*")
-                for path in dir_list:
-                    if os.path.exists(path):
-                        shutil.rmtree(path)
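
With this change test_runner.py takes one required positional argument, the output directory, and each test flavor writes its --job.dump_folder into a subdirectory of it, so the old ./test_runner_checkpoint_* cleanup is no longer needed. A minimal local invocation mirroring the CI script above (the output directory name is arbitrary for local use; artifacts-to-be-uploaded is simply what the 4-GPU job uses):

    mkdir artifacts-to-be-uploaded
    python ./test_runner.py artifacts-to-be-uploaded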