Skip to content

Commit

Permalink
Update
Browse files: browse the repository at this point in the history
[ghstack-poisoned]
  • Loading branch information
wconstab committed May 15, 2024
1 parent 76216c5 commit c1b4c08
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 22 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/unit_test_4gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ jobs:
python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
python -m pip install -r requirements.txt
python -m pip install -r dev-requirements.txt
python ./test_runner.py
mv outputs artifacts-to-be-uploaded
mkdir artifacts-to-be-uploaded
python ./test_runner.py artifacts-to-be-uploaded
# upload-coverage:
# - name: Upload Coverage to Codecov
# uses: codecov/codecov-action@v3
47 changes: 27 additions & 20 deletions test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,8 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import glob
import argparse
import os
import shutil
import subprocess
from collections import defaultdict
from dataclasses import dataclass
Expand All @@ -18,6 +17,11 @@
import tomli as tomllib


parser = argparse.ArgumentParser()
parser.add_argument("output_dir")
args = parser.parse_args()


@dataclass
class OverrideDefinitions:
"""
Expand All @@ -29,7 +33,6 @@ class OverrideDefinitions:


CONFIG_DIR = "./train_configs"
test_checkpoint_dir = "./test_runner_checkpoint"

"""
key is the config file name and value is a list of OverrideDefinitions
Expand All @@ -40,25 +43,39 @@ class OverrideDefinitions:
integration_tests_flavors["debug_model.toml"] = [
OverrideDefinitions(
[
["--training.compile"],
[
f"--job.dump_folder {args.output_dir}/default/",
],
],
"Default",
),
OverrideDefinitions(
[
[
"--training.compile",
f"--job.dump_folder {args.output_dir}/1d_compile/",
],
],
"1D compile",
),
OverrideDefinitions(
[
["--training.tensor_parallel_degree 2 --model.norm_type=rmsnorm"],
[
"--training.tensor_parallel_degree 2 --model.norm_type=rmsnorm",
f"--job.dump_folder {args.output_dir}/eager_2d/",
],
],
"Eager mode 2DParallel",
),
OverrideDefinitions(
[
[
"--checkpoint.enable_checkpoint",
f"--checkpoint.folder {test_checkpoint_dir}_full_checkpoint",
f"--job.dump_folder {args.output_dir}/full_checkpoint/",
],
[
"--checkpoint.enable_checkpoint",
f"--checkpoint.folder {test_checkpoint_dir}_full_checkpoint",
f"--job.dump_folder {args.output_dir}/full_checkpoint/",
"--training.steps 20",
],
],
Expand All @@ -68,7 +85,7 @@ class OverrideDefinitions:
[
[
"--checkpoint.enable_checkpoint",
f"--checkpoint.folder {test_checkpoint_dir}_model_weights_only_fp32",
f"--job.dump_folder {args.output_dir}/model_weights_only_fp32/",
"--checkpoint.model_weights_only",
],
],
Expand All @@ -78,7 +95,7 @@ class OverrideDefinitions:
[
[
"--checkpoint.enable_checkpoint",
f"--checkpoint.folder {test_checkpoint_dir}_model_weights_only_bf16",
f"--job.dump_folder {args.output_dir}/model_weights_only_bf16/",
"--checkpoint.model_weights_only",
"--checkpoint.export_dtype bfloat16",
],
Expand Down Expand Up @@ -118,15 +135,5 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str):
config = tomllib.load(f)
is_integration_test = config["job"].get("use_for_integration_test", False)
if is_integration_test:
test_flavors = [OverrideDefinitions()] + integration_tests_flavors[
config_file
]

for test_flavor in test_flavors:
for test_flavor in integration_tests_flavors[config_file]:
run_test(test_flavor, full_path)

# Deleting checkpoint folder from test
dir_list = glob.iglob(f"{test_checkpoint_dir}_*")
for path in dir_list:
if os.path.exists(path):
shutil.rmtree(path)

0 comments on commit c1b4c08

Please sign in to comment.