Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use torch generic workflow for CI, add ssh, artifacts #325

Merged
merged 15 commits into from
May 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 22 additions & 30 deletions .github/workflows/unit_test_4gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,26 @@ concurrency:
group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
cancel-in-progress: true

defaults:
run:
shell: bash -l -eo pipefail {0}

jobs:
unit_tests_4gpu:
runs-on: linux.g5.12xlarge.nvidia.gpu
strategy:
matrix:
python-version: ['3.10']
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Setup conda env
uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
miniconda-version: "latest"
activate-environment: test
python-version: ${{ matrix.python-version }}
- name: Update pip
run: python -m pip install --upgrade pip
- name: Install dependencies
run: |
pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
python -m pip install -r requirements.txt
python -m pip install -r dev-requirements.txt
- name: Run test_runner.py
run: python ./test_runner.py
- name: Upload Coverage to Codecov
uses: codecov/codecov-action@v3
build-test:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
runner: linux.g5.12xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: "12.1"
# This image is faster to clone than the default, but it lacks CC needed by triton
# (1m25s vs 2m37s).
docker-image: "pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime"
repository: "pytorch/torchtitan"
upload-artifact: "outputs"
script: |
conda install -y -q git clang clangxx
export CC=clang
export CXX=clangxx
pip config --user set global.progress_bar off
pip install --upgrade pip
python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
python -m pip install -r requirements.txt
python -m pip install -r dev-requirements.txt
mkdir artifacts-to-be-uploaded
python ./test_runner.py artifacts-to-be-uploaded
40 changes: 10 additions & 30 deletions .github/workflows/unit_test_cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,14 @@ concurrency:
group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
cancel-in-progress: true

defaults:
run:
shell: bash -l -eo pipefail {0}

jobs:
cpu_unit_tests:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.10']
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Setup conda env
uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
miniconda-version: "latest"
activate-environment: test
python-version: ${{ matrix.python-version }}
- name: Update pip
run: python -m pip install --upgrade pip
- name: Install dependencies
run: |
pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
python -m pip install -r requirements.txt
python -m pip install -r dev-requirements.txt
- name: Run unit tests with coverage
run: pytest test --cov=. --cov-report=xml --durations=20 -vv
- name: Upload Coverage to Codecov
uses: codecov/codecov-action@v3
build-test:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
docker-image: "pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime"
repository: "pytorch/torchtitan"
script: |
pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
python -m pip install -r requirements.txt
python -m pip install -r dev-requirements.txt
pytest test --cov=. --cov-report=xml --durations=20 -vv
47 changes: 27 additions & 20 deletions test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,8 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import glob
import argparse
import os
import shutil
import subprocess
from collections import defaultdict
from dataclasses import dataclass
Expand All @@ -18,6 +17,11 @@
import tomli as tomllib


parser = argparse.ArgumentParser()
parser.add_argument("output_dir")
args = parser.parse_args()


@dataclass
class OverrideDefinitions:
"""
Expand All @@ -29,7 +33,6 @@ class OverrideDefinitions:


CONFIG_DIR = "./train_configs"
test_checkpoint_dir = "./test_runner_checkpoint"

"""
key is the config file name and value is a list of OverrideDefinitions
Expand All @@ -40,25 +43,39 @@ class OverrideDefinitions:
integration_tests_flavors["debug_model.toml"] = [
OverrideDefinitions(
[
["--training.compile"],
[
f"--job.dump_folder {args.output_dir}/default/",
],
],
"Default",
),
OverrideDefinitions(
[
[
"--training.compile",
f"--job.dump_folder {args.output_dir}/1d_compile/",
],
],
"1D compile",
),
OverrideDefinitions(
[
["--training.tensor_parallel_degree 2 --model.norm_type=rmsnorm"],
[
"--training.tensor_parallel_degree 2 --model.norm_type=rmsnorm",
f"--job.dump_folder {args.output_dir}/eager_2d/",
],
],
"Eager mode 2DParallel",
),
OverrideDefinitions(
[
[
"--checkpoint.enable_checkpoint",
f"--checkpoint.folder {test_checkpoint_dir}_full_checkpoint",
f"--job.dump_folder {args.output_dir}/full_checkpoint/",
],
[
"--checkpoint.enable_checkpoint",
f"--checkpoint.folder {test_checkpoint_dir}_full_checkpoint",
f"--job.dump_folder {args.output_dir}/full_checkpoint/",
"--training.steps 20",
],
],
Expand All @@ -68,7 +85,7 @@ class OverrideDefinitions:
[
[
"--checkpoint.enable_checkpoint",
f"--checkpoint.folder {test_checkpoint_dir}_model_weights_only_fp32",
f"--job.dump_folder {args.output_dir}/model_weights_only_fp32/",
"--checkpoint.model_weights_only",
],
],
Expand All @@ -78,7 +95,7 @@ class OverrideDefinitions:
[
[
"--checkpoint.enable_checkpoint",
f"--checkpoint.folder {test_checkpoint_dir}_model_weights_only_bf16",
f"--job.dump_folder {args.output_dir}/model_weights_only_bf16/",
"--checkpoint.model_weights_only",
"--checkpoint.export_dtype bfloat16",
],
Expand Down Expand Up @@ -118,15 +135,5 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str):
config = tomllib.load(f)
is_integration_test = config["job"].get("use_for_integration_test", False)
if is_integration_test:
test_flavors = [OverrideDefinitions()] + integration_tests_flavors[
config_file
]

for test_flavor in test_flavors:
for test_flavor in integration_tests_flavors[config_file]:
run_test(test_flavor, full_path)

# Deleting checkpoint folder from test
dir_list = glob.iglob(f"{test_checkpoint_dir}_*")
for path in dir_list:
if os.path.exists(path):
shutil.rmtree(path)
Loading