Skip to content

Commit

Permalink
Use torch generic workflow for CI
Browse files Browse the repository at this point in the history
ghstack-source-id: b1fa8d8c1778ecb532ed71792ead9f4dbb067cf4
Pull Request resolved: #325
  • Loading branch information
wconstab authored and tianyu-l committed May 28, 2024
1 parent f144fe3 commit e467662
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 80 deletions.
52 changes: 22 additions & 30 deletions .github/workflows/unit_test_4gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,26 @@ concurrency:
group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
cancel-in-progress: true

defaults:
run:
shell: bash -l -eo pipefail {0}

jobs:
unit_tests_4gpu:
runs-on: linux.g5.12xlarge.nvidia.gpu
strategy:
matrix:
python-version: ['3.10']
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Setup conda env
uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
miniconda-version: "latest"
activate-environment: test
python-version: ${{ matrix.python-version }}
- name: Update pip
run: python -m pip install --upgrade pip
- name: Install dependencies
run: |
pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
python -m pip install -r requirements.txt
python -m pip install -r dev-requirements.txt
- name: Run test_runner.py
run: python ./test_runner.py
- name: Upload Coverage to Codecov
uses: codecov/codecov-action@v3
build-test:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
runner: linux.g5.12xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: "12.1"
# This image is faster to clone than the default (1m25s vs 2m37s), but it lacks
# the C compiler (CC) that triton needs, so clang is installed in the script below.
docker-image: "pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime"
repository: "pytorch/torchtitan"
upload-artifact: "outputs"
script: |
conda install -y -q git clang clangxx
export CC=clang
export CXX=clangxx
pip config --user set global.progress_bar off
pip install --upgrade pip
python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
python -m pip install -r requirements.txt
python -m pip install -r dev-requirements.txt
mkdir artifacts-to-be-uploaded
python ./test_runner.py artifacts-to-be-uploaded
40 changes: 10 additions & 30 deletions .github/workflows/unit_test_cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,14 @@ concurrency:
group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
cancel-in-progress: true

defaults:
run:
shell: bash -l -eo pipefail {0}

jobs:
cpu_unit_tests:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.10']
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Setup conda env
uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
miniconda-version: "latest"
activate-environment: test
python-version: ${{ matrix.python-version }}
- name: Update pip
run: python -m pip install --upgrade pip
- name: Install dependencies
run: |
pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
python -m pip install -r requirements.txt
python -m pip install -r dev-requirements.txt
- name: Run unit tests with coverage
run: pytest test --cov=. --cov-report=xml --durations=20 -vv
- name: Upload Coverage to Codecov
uses: codecov/codecov-action@v3
build-test:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
docker-image: "pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime"
repository: "pytorch/torchtitan"
script: |
pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
python -m pip install -r requirements.txt
python -m pip install -r dev-requirements.txt
pytest test --cov=. --cov-report=xml --durations=20 -vv
47 changes: 27 additions & 20 deletions test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,8 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import glob
import argparse
import os
import shutil
import subprocess
from collections import defaultdict
from dataclasses import dataclass
Expand All @@ -18,6 +17,11 @@
import tomli as tomllib


# Command-line interface: one required positional argument, `output_dir`.
# Its value is interpolated into the `--job.dump_folder` override of the test
# flavors defined below.
# NOTE: parsing happens at module level, so sys.argv is read as soon as this
# module is executed.
parser = argparse.ArgumentParser()
parser.add_argument("output_dir")
args = parser.parse_args()


@dataclass
class OverrideDefinitions:
"""
Expand All @@ -29,7 +33,6 @@ class OverrideDefinitions:


CONFIG_DIR = "./train_configs"
test_checkpoint_dir = "./test_runner_checkpoint"

"""
key is the config file name and value is a list of OverrideDefinitions
Expand All @@ -40,25 +43,39 @@ class OverrideDefinitions:
integration_tests_flavors["debug_model.toml"] = [
OverrideDefinitions(
[
["--training.compile"],
[
f"--job.dump_folder {args.output_dir}/default/",
],
],
"Default",
),
OverrideDefinitions(
[
[
"--training.compile",
f"--job.dump_folder {args.output_dir}/1d_compile/",
],
],
"1D compile",
),
OverrideDefinitions(
[
["--training.tensor_parallel_degree 2 --model.norm_type=rmsnorm"],
[
"--training.tensor_parallel_degree 2 --model.norm_type=rmsnorm",
f"--job.dump_folder {args.output_dir}/eager_2d/",
],
],
"Eager mode 2DParallel",
),
OverrideDefinitions(
[
[
"--checkpoint.enable_checkpoint",
f"--checkpoint.folder {test_checkpoint_dir}_full_checkpoint",
f"--job.dump_folder {args.output_dir}/full_checkpoint/",
],
[
"--checkpoint.enable_checkpoint",
f"--checkpoint.folder {test_checkpoint_dir}_full_checkpoint",
f"--job.dump_folder {args.output_dir}/full_checkpoint/",
"--training.steps 20",
],
],
Expand All @@ -68,7 +85,7 @@ class OverrideDefinitions:
[
[
"--checkpoint.enable_checkpoint",
f"--checkpoint.folder {test_checkpoint_dir}_model_weights_only_fp32",
f"--job.dump_folder {args.output_dir}/model_weights_only_fp32/",
"--checkpoint.model_weights_only",
],
],
Expand All @@ -78,7 +95,7 @@ class OverrideDefinitions:
[
[
"--checkpoint.enable_checkpoint",
f"--checkpoint.folder {test_checkpoint_dir}_model_weights_only_bf16",
f"--job.dump_folder {args.output_dir}/model_weights_only_bf16/",
"--checkpoint.model_weights_only",
"--checkpoint.export_dtype bfloat16",
],
Expand Down Expand Up @@ -118,15 +135,5 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str):
config = tomllib.load(f)
is_integration_test = config["job"].get("use_for_integration_test", False)
if is_integration_test:
test_flavors = [OverrideDefinitions()] + integration_tests_flavors[
config_file
]

for test_flavor in test_flavors:
for test_flavor in integration_tests_flavors[config_file]:
run_test(test_flavor, full_path)

# Deleting checkpoint folder from test
dir_list = glob.iglob(f"{test_checkpoint_dir}_*")
for path in dir_list:
if os.path.exists(path):
shutil.rmtree(path)

0 comments on commit e467662

Please sign in to comment.