parallelism config
Your Name committed May 24, 2024
1 parent fc76734 commit 78de92a
Showing 2 changed files with 20 additions and 16 deletions.
2 changes: 1 addition & 1 deletion composer/trainer/trainer.py
@@ -1216,7 +1216,7 @@ def __init__(
raise ValueError(
'Both deepspeed_config and parallelism_config are specified but incompatible. Please specify only one.',
)
-assert parallelism_config is None or parallelism_config['fsdp'] is not None, parallelism_config
+assert parallelism_config is None, parallelism_config
if deepspeed_config is not None or parallelism_config is not None or dist.get_world_size() > 1:
# Deepspeed and FSDP both require torch.distributed to be initialized, even if the world size is 1
# And torch.distributed is always required for multi-rank training
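For reference, a minimal caller-side sketch of the layout this commit moves toward: FSDP settings live under the 'fsdp' key of a parallelism_config dict passed to Composer's Trainer, mutually exclusive with deepspeed_config. This is not runnable as-is; the model and dataloader names are placeholders, and the FSDP keys are copied from the test changes below.

from composer import Trainer

# Sketch only: FSDP options are nested under 'fsdp' inside parallelism_config
# instead of being passed as a standalone fsdp_config dict.
parallelism_config = {
    'fsdp': {
        'sharding_strategy': 'FULL_SHARD',
        'mixed_precision': 'PURE',
    },
}

trainer = Trainer(
    model=composer_model,               # placeholder: a ComposerModel instance
    train_dataloader=train_dataloader,  # placeholder dataloader
    max_duration='1ep',
    parallelism_config=parallelism_config,
    # deepspeed_config is left unset; supplying both raises the ValueError shown above.
)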
34 changes: 19 additions & 15 deletions tests/models/test_hf_model.py
@@ -503,7 +503,7 @@ def get_lm_trainer(
load_path: Optional[str] = None,
is_conditional_generation: bool = False,
do_eval: bool = False,
-fsdp_config: Optional[Dict[str, Any]] = None,
+parallelism_config: Optional[Dict[str, Any]] = None,
mlm: bool = True,
add_padding: bool = False,
device_train_microbatch_size: Optional[int] = None,
@@ -594,7 +594,7 @@ def get_lm_trainer(
save_interval='1ep',
save_filename='hf-checkpoint.pt',
load_path=load_path,
-parallelism_config={'fsdp': fsdp_config},
+parallelism_config=parallelism_config,
loggers=in_memory_logger,
device_train_microbatch_size=batch_size
if device_train_microbatch_size is None else device_train_microbatch_size,
@@ -1028,17 +1028,19 @@ def test_hf_fsdp(tiny_bert_config, tiny_bert_tokenizer):

tiny_bert_model = transformers.AutoModelForMaskedLM.from_config(tiny_bert_config)

-fsdp_config = {
-'sharding_strategy': 'FULL_SHARD',
-'cpu_offload': False,
-'mixed_precision': 'PURE',
-'backward_prefetch': 'BACKWARD_PRE',
-'activation_checkpointing': False,
-'activation_cpu_offload': False,
-'verbose': False,
+parallelism_config = {
+'fsdp': {
+'sharding_strategy': 'FULL_SHARD',
+'cpu_offload': False,
+'mixed_precision': 'PURE',
+'backward_prefetch': 'BACKWARD_PRE',
+'activation_checkpointing': False,
+'activation_cpu_offload': False,
+'verbose': False,
+},
}

-trainer = get_lm_trainer(tiny_bert_model, tiny_bert_tokenizer, None, fsdp_config=fsdp_config)
+trainer = get_lm_trainer(tiny_bert_model, tiny_bert_tokenizer, None, parallelism_config=parallelism_config)

assert is_model_fsdp(trainer.state.model)
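The migration in these tests is mechanical: the old flat fsdp_config dict becomes the value of the 'fsdp' key inside parallelism_config. A hypothetical helper (not part of this commit) that performs the wrapping for legacy call sites, shown with the config used in test_peft_fsdp_trains below:

from typing import Any, Dict, Optional

def wrap_legacy_fsdp_config(fsdp_config: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    # Hypothetical helper: wrap an old flat fsdp_config dict in the new
    # parallelism_config layout expected by the updated get_lm_trainer.
    if fsdp_config is None:
        return None
    return {'fsdp': fsdp_config}

assert wrap_legacy_fsdp_config({'sharding_strategy': 'FULL_SHARD'}) == {
    'fsdp': {'sharding_strategy': 'FULL_SHARD'},
}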

@@ -1486,8 +1488,10 @@ def test_peft_fsdp_trains(
):
pytest.importorskip('peft')

-fsdp_config = {
-'sharding_strategy': 'FULL_SHARD',
+parallelism_config = {
+'fsdp': {
+'sharding_strategy': 'FULL_SHARD',
+},
}

stashed_model = copy.deepcopy(tiny_gpt2_model)
@@ -1499,7 +1503,7 @@ def test_peft_fsdp_trains(
peft_config=gpt2_peft_config,
device_train_microbatch_size=1,
mlm=False,
-fsdp_config=fsdp_config,
+parallelism_config=parallelism_config,
should_save_peft_only=should_save_peft_only,
)

@@ -1520,7 +1524,7 @@ def test_peft_fsdp_trains(
device_train_microbatch_size=1,
mlm=False,
load_path=str(tmp_path / 'trainer1' / 'hf-checkpoint.pt'),
-fsdp_config=fsdp_config,
+parallelism_config=parallelism_config,
should_save_peft_only=should_save_peft_only,
)

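A hedged way to exercise the touched tests locally, assuming the repository's usual pytest setup (the FSDP cases generally also need the appropriate GPU and world-size fixtures enabled):

import pytest

# Sketch: select only the FSDP-related tests in the modified file by keyword.
pytest.main(['tests/models/test_hf_model.py', '-k', 'fsdp'])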
