Skip to content

Commit

Permalink
test cpu offload with pp
Browse files Browse the repository at this point in the history
  • Loading branch information
mori360 committed Oct 28, 2024
1 parent ab1e258 commit 2ca9882
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 5 deletions.
7 changes: 4 additions & 3 deletions test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,12 +354,13 @@ def build_test_list():
         OverrideDefinitions(
             [
                 [
+                    "--experimental.pipeline_parallel_degree 2",
                     "--training.enable_cpu_offload True",
                 ],
             ],
-            "Enable CPU Offload",
-            "enable_cpu_offload",
-            ngpu=2,
+            "Enable CPU Offload with PP",
+            "enable_cpu_offload+PP",
+            ngpu=4,
         ),
     ]
     return integration_tests_flavors
Expand Down
6 changes: 4 additions & 2 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,10 @@ def loss_fn(pred, labels):
         for m in model_parts:
             # apply SPMD-style PT-D techniques
             models_parallelize_fns[model_name](m, world_mesh, parallel_dims, job_config)
-            m.to_empty(device="cuda")
-            m.init_weights()
+            init_device = "cpu" if job_config.training.enable_cpu_offload else "cuda"
+            m.to_empty(device=init_device)
+            buffer_device = "cuda" if job_config.training.enable_cpu_offload else None
+            m.init_weights(buffer_device=buffer_device)
             m.train()
     else:
         # apply PT-D Tensor Parallel, activation checkpointing, torch.compile, Data Parallel
Expand Down

0 comments on commit 2ca9882

Please sign in to comment.