From 25898a8a671e5f3883b37195b65f811d8d04fbc4 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 29 Nov 2024 16:04:37 -0500 Subject: [PATCH 01/58] smaller stochastic depth --- tests/algorithms/algorithm_settings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/algorithms/algorithm_settings.py b/tests/algorithms/algorithm_settings.py index c8b11cdcef..7cf0304f67 100644 --- a/tests/algorithms/algorithm_settings.py +++ b/tests/algorithms/algorithm_settings.py @@ -207,19 +207,19 @@ 'model': ( composer_resnet, { - 'model_name': 'resnet50', + 'model_name': 'resnet18', 'num_classes': 2, }, ), 'dataset': ( RandomImageDataset, { - 'shape': (3, 224, 224), + 'shape': (3, 32, 32), }, ), 'kwargs': { 'stochastic_method': 'block', - 'target_layer_name': 'ResNetBottleneck', + 'target_layer_name': 'BasicBlock', 'drop_rate': 0.2, 'drop_distribution': 'linear', 'drop_warmup': '0.0dur', From 7f5f3cd2d81196e1edabdf3381a0b045b70960c7 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 29 Nov 2024 16:17:41 -0500 Subject: [PATCH 02/58] pre-commit --- tests/algorithms/algorithm_settings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/algorithms/algorithm_settings.py b/tests/algorithms/algorithm_settings.py index 7cf0304f67..6c406be75c 100644 --- a/tests/algorithms/algorithm_settings.py +++ b/tests/algorithms/algorithm_settings.py @@ -214,12 +214,12 @@ 'dataset': ( RandomImageDataset, { - 'shape': (3, 32, 32), + 'shape': (3, 32, 32), }, ), 'kwargs': { 'stochastic_method': 'block', - 'target_layer_name': 'BasicBlock', + 'target_layer_name': 'BasicBlock', 'drop_rate': 0.2, 'drop_distribution': 'linear', 'drop_warmup': '0.0dur', From eafe3472a5f45f60055c252b1f176c726df05a7e Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 29 Nov 2024 16:42:18 -0500 Subject: [PATCH 03/58] lessen batches, fix block --- tests/algorithms/algorithm_settings.py | 2 +- tests/algorithms/test_algorithms_train.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/algorithms/algorithm_settings.py b/tests/algorithms/algorithm_settings.py index 6c406be75c..3b8c241ce5 100644 --- a/tests/algorithms/algorithm_settings.py +++ b/tests/algorithms/algorithm_settings.py @@ -218,7 +218,7 @@ }, ), 'kwargs': { - 'stochastic_method': 'block', + 'stochastic_method': 'residual', 'target_layer_name': 'BasicBlock', 'drop_rate': 0.2, 'drop_distribution': 'linear', diff --git a/tests/algorithms/test_algorithms_train.py b/tests/algorithms/test_algorithms_train.py index a73a649b70..83acd37ea0 100644 --- a/tests/algorithms/test_algorithms_train.py +++ b/tests/algorithms/test_algorithms_train.py @@ -18,7 +18,7 @@ def test_algorithm_trains(alg_cls: type[Algorithm]): trainer = Trainer( model=model, train_dataloader=dataloader, - max_duration='2ep', + max_duration='1ep', algorithms=alg_cls(**alg_kwargs), ) trainer.fit() @@ -35,4 +35,4 @@ def test_algorithm_trains(alg_cls: type[Algorithm]): ) # fit again for another epoch - trainer.fit(duration='1ep') + trainer.fit(duration='1ba') From dd39ec337acbd3ffd914b07ad3eca9dc985cefc8 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 29 Nov 2024 17:00:44 -0500 Subject: [PATCH 04/58] epochs to batches, also revert trying to make custom resnet --- tests/algorithms/algorithm_settings.py | 8 ++++---- tests/algorithms/test_algorithms_train.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/algorithms/algorithm_settings.py b/tests/algorithms/algorithm_settings.py index 3b8c241ce5..c8b11cdcef 100644 --- 
a/tests/algorithms/algorithm_settings.py +++ b/tests/algorithms/algorithm_settings.py @@ -207,19 +207,19 @@ 'model': ( composer_resnet, { - 'model_name': 'resnet18', + 'model_name': 'resnet50', 'num_classes': 2, }, ), 'dataset': ( RandomImageDataset, { - 'shape': (3, 32, 32), + 'shape': (3, 224, 224), }, ), 'kwargs': { - 'stochastic_method': 'residual', - 'target_layer_name': 'BasicBlock', + 'stochastic_method': 'block', + 'target_layer_name': 'ResNetBottleneck', 'drop_rate': 0.2, 'drop_distribution': 'linear', 'drop_warmup': '0.0dur', diff --git a/tests/algorithms/test_algorithms_train.py b/tests/algorithms/test_algorithms_train.py index 83acd37ea0..53d871265c 100644 --- a/tests/algorithms/test_algorithms_train.py +++ b/tests/algorithms/test_algorithms_train.py @@ -18,7 +18,7 @@ def test_algorithm_trains(alg_cls: type[Algorithm]): trainer = Trainer( model=model, train_dataloader=dataloader, - max_duration='1ep', + max_duration='1ba', algorithms=alg_cls(**alg_kwargs), ) trainer.fit() From 8d1bd92a0972676ac1158843b6c21731cbb4555f Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 29 Nov 2024 17:57:10 -0500 Subject: [PATCH 05/58] test --- tests/algorithms/test_algorithms_train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/algorithms/test_algorithms_train.py b/tests/algorithms/test_algorithms_train.py index 53d871265c..0f9cb32de2 100644 --- a/tests/algorithms/test_algorithms_train.py +++ b/tests/algorithms/test_algorithms_train.py @@ -18,7 +18,7 @@ def test_algorithm_trains(alg_cls: type[Algorithm]): trainer = Trainer( model=model, train_dataloader=dataloader, - max_duration='1ba', + max_duration='2ba', algorithms=alg_cls(**alg_kwargs), ) trainer.fit() @@ -34,5 +34,5 @@ def test_algorithm_trains(alg_cls: type[Algorithm]): 'GyroDropout is implemented to be applied on Event.FIT_START, so is not compatible with multiple calls to fit.', ) - # fit again for another epoch + # fit again for another batch trainer.fit(duration='1ba') From a386263e6a789eba3dd6472ff1d6f63468c704b4 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 29 Nov 2024 19:14:26 -0500 Subject: [PATCH 06/58] nlp test reduction --- tests/test_full_nlp.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index 14380b38fe..a486239f67 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -35,7 +35,7 @@ def pretraining_test_helper(tokenizer, model, algorithms, tmp_path, device): pretraining_model_copy = copy.deepcopy(model) pretraining_train_dataset = RandomTextLMDataset( - size=8, + size=16, vocab_size=tokenizer.vocab_size, sequence_length=4, use_keys=True, @@ -44,13 +44,13 @@ def pretraining_test_helper(tokenizer, model, algorithms, tmp_path, device): collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15) pretraining_train_dataloader = DataLoader( pretraining_train_dataset, - batch_size=4, + batch_size=8, sampler=dist.get_sampler(pretraining_train_dataset), collate_fn=collator, ) pretraining_eval_dataloader = DataLoader( pretraining_train_dataset, - batch_size=4, + batch_size=8, sampler=dist.get_sampler(pretraining_train_dataset), collate_fn=collator, ) @@ -59,7 +59,7 @@ def pretraining_test_helper(tokenizer, model, algorithms, tmp_path, device): model=pretraining_model_copy, train_dataloader=pretraining_train_dataloader, save_folder=str(tmp_path / 'pretraining_checkpoints'), - max_duration='1ep', + max_duration='2ba', seed=17, algorithms=algorithms, 
device=device, @@ -91,7 +91,7 @@ def finetuning_test_helper(tokenizer, model, algorithms, checkpoint_path, pretra finetuning_model_copy = copy.deepcopy(model) finetuning_train_dataset = RandomTextClassificationDataset( - size=8, + size=16, vocab_size=tokenizer.vocab_size, sequence_length=4, num_classes=3, @@ -99,12 +99,12 @@ def finetuning_test_helper(tokenizer, model, algorithms, checkpoint_path, pretra ) finetuning_train_dataloader = DataLoader( finetuning_train_dataset, - batch_size=4, + batch_size=8, sampler=dist.get_sampler(finetuning_train_dataset), ) finetuning_eval_dataloader = DataLoader( finetuning_train_dataset, - batch_size=4, + batch_size=8, sampler=dist.get_sampler(finetuning_train_dataset), ) @@ -137,7 +137,7 @@ def finetuning_test_helper(tokenizer, model, algorithms, checkpoint_path, pretra load_weights_only=True, load_strict_model_weights=False, loggers=[rud], - max_duration='1ep', + max_duration='2ba', seed=17, algorithms=algorithms, device=device, @@ -229,7 +229,6 @@ def inference_test_helper( @device('cpu', 'gpu') -# Note: the specificity of these settings are due to incompatibilities (e.g. the simpletransformer model is not traceable) @pytest.mark.parametrize( 'model_type,algorithms,save_format', [ @@ -267,9 +266,6 @@ def test_full_nlp_pipeline( if model_type == 'tinybert_hf': tiny_bert_model = request.getfixturevalue('tiny_bert_model') - # pretraining - if model_type == 'tinybert_hf': - assert tiny_bert_model is not None pretraining_metrics = [LanguageCrossEntropy(ignore_index=-100), MaskedAccuracy(ignore_index=-100)] pretraining_model = HuggingFaceModel( tiny_bert_model, From cff11cf0568797a27e675ca6cc45a2b3c902b090 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 29 Nov 2024 19:30:25 -0500 Subject: [PATCH 07/58] save copy --- tests/test_full_nlp.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index a486239f67..3c44e05163 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -262,23 +262,24 @@ def test_full_nlp_pipeline( device = get_device(device) - tiny_bert_model = None if model_type == 'tinybert_hf': tiny_bert_model = request.getfixturevalue('tiny_bert_model') + tokenizer = tiny_bert_tokenizer pretraining_metrics = [LanguageCrossEntropy(ignore_index=-100), MaskedAccuracy(ignore_index=-100)] pretraining_model = HuggingFaceModel( tiny_bert_model, - tiny_bert_tokenizer, + tokenizer, use_logits=True, metrics=pretraining_metrics, ) elif model_type == 'simpletransformer': - pretraining_model = SimpleTransformerMaskedLM(vocab_size=tiny_bert_tokenizer.vocab_size) + pretraining_model = SimpleTransformerMaskedLM(vocab_size=tokenizer.vocab_size) + tokenizer = None else: raise ValueError('Unsupported model type') pretraining_output_path = pretraining_test_helper( - tiny_bert_tokenizer, + tokenizer, pretraining_model, algorithms, tmp_path, @@ -295,18 +296,17 @@ def test_full_nlp_pipeline( ) finetuning_model = HuggingFaceModel( model=hf_finetuning_model, - tokenizer=tiny_bert_tokenizer, + tokenizer=tokenizer, use_logits=True, metrics=[finetuning_metric], ) elif model_type == 'simpletransformer': - finetuning_model = SimpleTransformerClassifier(vocab_size=tiny_bert_tokenizer.vocab_size, num_classes=3) + finetuning_model = SimpleTransformerClassifier(vocab_size=tokenizer.vocab_size, num_classes=3) else: raise ValueError('Unsupported model type.') - finetuning_model_copy = copy.deepcopy(finetuning_model) finetuning_trainer, finetuning_dataloader, rud, finetuning_output_path 
= finetuning_test_helper( - tiny_bert_tokenizer, + tokenizer, finetuning_model, algorithms, pretraining_output_path, @@ -323,7 +323,7 @@ def test_full_nlp_pipeline( inference_test_helper( finetuning_output_path, rud, - finetuning_model_copy, + finetuning_model, algorithms, batch, original_output, From 8e8454456531e4f16c039a943111670572afab46 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 29 Nov 2024 21:45:59 -0500 Subject: [PATCH 08/58] dataset --- tests/test_full_nlp.py | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index 3c44e05163..a65cad129f 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -35,22 +35,22 @@ def pretraining_test_helper(tokenizer, model, algorithms, tmp_path, device): pretraining_model_copy = copy.deepcopy(model) pretraining_train_dataset = RandomTextLMDataset( - size=16, + size=4, vocab_size=tokenizer.vocab_size, - sequence_length=4, + sequence_length=2, use_keys=True, ) collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15) pretraining_train_dataloader = DataLoader( pretraining_train_dataset, - batch_size=8, + batch_size=2, sampler=dist.get_sampler(pretraining_train_dataset), collate_fn=collator, ) pretraining_eval_dataloader = DataLoader( pretraining_train_dataset, - batch_size=8, + batch_size=2, sampler=dist.get_sampler(pretraining_train_dataset), collate_fn=collator, ) @@ -59,7 +59,7 @@ def pretraining_test_helper(tokenizer, model, algorithms, tmp_path, device): model=pretraining_model_copy, train_dataloader=pretraining_train_dataloader, save_folder=str(tmp_path / 'pretraining_checkpoints'), - max_duration='2ba', + max_duration='1ba', seed=17, algorithms=algorithms, device=device, @@ -91,20 +91,20 @@ def finetuning_test_helper(tokenizer, model, algorithms, checkpoint_path, pretra finetuning_model_copy = copy.deepcopy(model) finetuning_train_dataset = RandomTextClassificationDataset( - size=16, + size=4, vocab_size=tokenizer.vocab_size, - sequence_length=4, + sequence_length=2, num_classes=3, use_keys=isinstance(model, HuggingFaceModel), ) finetuning_train_dataloader = DataLoader( finetuning_train_dataset, - batch_size=8, + batch_size=2, sampler=dist.get_sampler(finetuning_train_dataset), ) finetuning_eval_dataloader = DataLoader( finetuning_train_dataset, - batch_size=8, + batch_size=2, sampler=dist.get_sampler(finetuning_train_dataset), ) @@ -137,7 +137,7 @@ def finetuning_test_helper(tokenizer, model, algorithms, checkpoint_path, pretra load_weights_only=True, load_strict_model_weights=False, loggers=[rud], - max_duration='2ba', + max_duration='1ba', seed=17, algorithms=algorithms, device=device, @@ -229,6 +229,7 @@ def inference_test_helper( @device('cpu', 'gpu') +# Note: the specificity of these settings are due to incompatibilities (e.g. 
the simpletransformer model is not traceable) @pytest.mark.parametrize( 'model_type,algorithms,save_format', [ @@ -262,24 +263,26 @@ def test_full_nlp_pipeline( device = get_device(device) + tiny_bert_model = None if model_type == 'tinybert_hf': tiny_bert_model = request.getfixturevalue('tiny_bert_model') - tokenizer = tiny_bert_tokenizer + # pretraining + if model_type == 'tinybert_hf': + assert tiny_bert_model is not None pretraining_metrics = [LanguageCrossEntropy(ignore_index=-100), MaskedAccuracy(ignore_index=-100)] pretraining_model = HuggingFaceModel( tiny_bert_model, - tokenizer, + tiny_bert_tokenizer, use_logits=True, metrics=pretraining_metrics, ) elif model_type == 'simpletransformer': - pretraining_model = SimpleTransformerMaskedLM(vocab_size=tokenizer.vocab_size) - tokenizer = None + pretraining_model = SimpleTransformerMaskedLM(vocab_size=tiny_bert_tokenizer.vocab_size) else: raise ValueError('Unsupported model type') pretraining_output_path = pretraining_test_helper( - tokenizer, + tiny_bert_tokenizer, pretraining_model, algorithms, tmp_path, @@ -296,17 +299,18 @@ def test_full_nlp_pipeline( ) finetuning_model = HuggingFaceModel( model=hf_finetuning_model, - tokenizer=tokenizer, + tokenizer=tiny_bert_tokenizer, use_logits=True, metrics=[finetuning_metric], ) elif model_type == 'simpletransformer': - finetuning_model = SimpleTransformerClassifier(vocab_size=tokenizer.vocab_size, num_classes=3) + finetuning_model = SimpleTransformerClassifier(vocab_size=tiny_bert_tokenizer.vocab_size, num_classes=3) else: raise ValueError('Unsupported model type.') + finetuning_model_copy = copy.deepcopy(finetuning_model) finetuning_trainer, finetuning_dataloader, rud, finetuning_output_path = finetuning_test_helper( - tokenizer, + tiny_bert_tokenizer, finetuning_model, algorithms, pretraining_output_path, @@ -323,7 +327,7 @@ def test_full_nlp_pipeline( inference_test_helper( finetuning_output_path, rud, - finetuning_model, + finetuning_model_copy, algorithms, batch, original_output, From 0d700049ea0bf7e9025d14b74d6a3a5b9526ef92 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 29 Nov 2024 22:11:34 -0500 Subject: [PATCH 09/58] smaller models --- tests/test_full_nlp.py | 55 +++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index a65cad129f..fa842c7ad0 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -9,6 +9,7 @@ from packaging import version from torch.utils.data import DataLoader from torchmetrics.classification import MulticlassAccuracy +from transformers import BertConfig, BertForMaskedLM, BertForSequenceClassification, BertTokenizerFast from composer.algorithms import GatedLinearUnits from composer.loggers import RemoteUploaderDownloader @@ -229,7 +230,6 @@ def inference_test_helper( @device('cpu', 'gpu') -# Note: the specificity of these settings are due to incompatibilities (e.g. the simpletransformer model is not traceable) @pytest.mark.parametrize( 'model_type,algorithms,save_format', [ @@ -242,10 +242,8 @@ def test_full_nlp_pipeline( model_type, algorithms, save_format, - tiny_bert_tokenizer, onnx_opset_version, tmp_path, - request, device, ): """This test is intended to exercise our full pipeline for NLP. 
@@ -262,27 +260,38 @@ def test_full_nlp_pipeline( algorithms = [algorithm() for algorithm in algorithms] device = get_device(device) - - tiny_bert_model = None - if model_type == 'tinybert_hf': - tiny_bert_model = request.getfixturevalue('tiny_bert_model') - - # pretraining if model_type == 'tinybert_hf': - assert tiny_bert_model is not None + # Updated minimal BERT configuration + config = BertConfig( + vocab_size=30522, + hidden_size=64, + num_hidden_layers=2, + num_attention_heads=2, + intermediate_size=256, + ) + tiny_bert_model = BertForMaskedLM(config) + tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=128) + pretraining_metrics = [LanguageCrossEntropy(ignore_index=-100), MaskedAccuracy(ignore_index=-100)] pretraining_model = HuggingFaceModel( tiny_bert_model, - tiny_bert_tokenizer, + tokenizer, use_logits=True, metrics=pretraining_metrics, ) elif model_type == 'simpletransformer': - pretraining_model = SimpleTransformerMaskedLM(vocab_size=tiny_bert_tokenizer.vocab_size) + pretraining_model = SimpleTransformerMaskedLM( + vocab_size=30522, + num_layers=2, + d_model=64, + num_heads=2, + dim_feedforward=256, + ) + tokenizer = None else: raise ValueError('Unsupported model type') pretraining_output_path = pretraining_test_helper( - tiny_bert_tokenizer, + tokenizer, pretraining_model, algorithms, tmp_path, @@ -292,25 +301,27 @@ def test_full_nlp_pipeline( # finetuning if model_type == 'tinybert_hf': finetuning_metric = MulticlassAccuracy(num_classes=3, average='micro') - hf_finetuning_model, _ = HuggingFaceModel.hf_from_composer_checkpoint( - pretraining_output_path, - model_instantiation_class='transformers.AutoModelForSequenceClassification', - model_config_kwargs={'num_labels': 3}, - ) finetuning_model = HuggingFaceModel( - model=hf_finetuning_model, - tokenizer=tiny_bert_tokenizer, + model=BertForSequenceClassification(config), + tokenizer=tokenizer, use_logits=True, metrics=[finetuning_metric], ) elif model_type == 'simpletransformer': - finetuning_model = SimpleTransformerClassifier(vocab_size=tiny_bert_tokenizer.vocab_size, num_classes=3) + finetuning_model = SimpleTransformerClassifier( + vocab_size=30522, + num_classes=3, + num_layers=2, + d_model=64, + num_heads=2, + dim_feedforward=256, + ) else: raise ValueError('Unsupported model type.') finetuning_model_copy = copy.deepcopy(finetuning_model) finetuning_trainer, finetuning_dataloader, rud, finetuning_output_path = finetuning_test_helper( - tiny_bert_tokenizer, + tokenizer, finetuning_model, algorithms, pretraining_output_path, From 0952cc6ca3d2321814c9983a51b05dbf52375c49 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 29 Nov 2024 22:18:43 -0500 Subject: [PATCH 10/58] precommit --- tests/test_full_nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index fa842c7ad0..6995f938ce 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -271,7 +271,7 @@ def test_full_nlp_pipeline( ) tiny_bert_model = BertForMaskedLM(config) tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=128) - + pretraining_metrics = [LanguageCrossEntropy(ignore_index=-100), MaskedAccuracy(ignore_index=-100)] pretraining_model = HuggingFaceModel( tiny_bert_model, From 5b0f0a0040673e1ad4c33a4fc250b20f92fe5f25 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 29 Nov 2024 22:32:27 -0500 Subject: [PATCH 11/58] make model bigger --- tests/test_full_nlp.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 
deletions(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index 6995f938ce..eb360e9772 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -244,6 +244,7 @@ def test_full_nlp_pipeline( save_format, onnx_opset_version, tmp_path, + request, device, ): """This test is intended to exercise our full pipeline for NLP. @@ -271,7 +272,7 @@ def test_full_nlp_pipeline( ) tiny_bert_model = BertForMaskedLM(config) tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=128) - + pretraining_metrics = [LanguageCrossEntropy(ignore_index=-100), MaskedAccuracy(ignore_index=-100)] pretraining_model = HuggingFaceModel( tiny_bert_model, @@ -283,9 +284,9 @@ def test_full_nlp_pipeline( pretraining_model = SimpleTransformerMaskedLM( vocab_size=30522, num_layers=2, - d_model=64, + d_model=128, num_heads=2, - dim_feedforward=256, + dim_feedforward=512, ) tokenizer = None else: From a46bcc5deeaa46e79c64e3f4b0d19989c3bd9f3d Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 29 Nov 2024 22:42:07 -0500 Subject: [PATCH 12/58] rm gated --- tests/test_full_nlp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index eb360e9772..a5fbf2c469 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -11,7 +11,7 @@ from torchmetrics.classification import MulticlassAccuracy from transformers import BertConfig, BertForMaskedLM, BertForSequenceClassification, BertTokenizerFast -from composer.algorithms import GatedLinearUnits +from composer.algorithms import LabelSmoothing from composer.loggers import RemoteUploaderDownloader from composer.metrics.nlp import LanguageCrossEntropy, MaskedAccuracy from composer.models import HuggingFaceModel @@ -233,7 +233,7 @@ def inference_test_helper( @pytest.mark.parametrize( 'model_type,algorithms,save_format', [ - ('tinybert_hf', [GatedLinearUnits], 'onnx'), + ('tinybert_hf', [LabelSmoothing], 'onnx'), ('simpletransformer', [], 'torchscript'), ], ) @@ -272,7 +272,7 @@ def test_full_nlp_pipeline( ) tiny_bert_model = BertForMaskedLM(config) tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=128) - + pretraining_metrics = [LanguageCrossEntropy(ignore_index=-100), MaskedAccuracy(ignore_index=-100)] pretraining_model = HuggingFaceModel( tiny_bert_model, From 53da8a851ff27905f7027b58b8d9f382fd20a373 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 29 Nov 2024 23:02:40 -0500 Subject: [PATCH 13/58] layerfreezing --- tests/test_full_nlp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index a5fbf2c469..8a4145a99f 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -11,7 +11,7 @@ from torchmetrics.classification import MulticlassAccuracy from transformers import BertConfig, BertForMaskedLM, BertForSequenceClassification, BertTokenizerFast -from composer.algorithms import LabelSmoothing +from composer.algorithms import LayerFreezing from composer.loggers import RemoteUploaderDownloader from composer.metrics.nlp import LanguageCrossEntropy, MaskedAccuracy from composer.models import HuggingFaceModel @@ -233,7 +233,7 @@ def inference_test_helper( @pytest.mark.parametrize( 'model_type,algorithms,save_format', [ - ('tinybert_hf', [LabelSmoothing], 'onnx'), + ('tinybert_hf', [LayerFreezing], 'onnx'), ('simpletransformer', [], 'torchscript'), ], ) From d87a68737c11ebf45ce96046e6371f11d9c76c2e Mon Sep 17 00:00:00 2001 From: v-chen_data Date: 
Fri, 29 Nov 2024 23:16:41 -0500 Subject: [PATCH 14/58] new algorithm --- tests/test_full_nlp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index 8a4145a99f..5785ebb096 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -11,7 +11,7 @@ from torchmetrics.classification import MulticlassAccuracy from transformers import BertConfig, BertForMaskedLM, BertForSequenceClassification, BertTokenizerFast -from composer.algorithms import LayerFreezing +from composer.algorithms import GradientClipping from composer.loggers import RemoteUploaderDownloader from composer.metrics.nlp import LanguageCrossEntropy, MaskedAccuracy from composer.models import HuggingFaceModel @@ -233,7 +233,7 @@ def inference_test_helper( @pytest.mark.parametrize( 'model_type,algorithms,save_format', [ - ('tinybert_hf', [LayerFreezing], 'onnx'), + ('tinybert_hf', [GradientClipping], 'onnx'), ('simpletransformer', [], 'torchscript'), ], ) From eb7ffd496062dbfe513f8de3a04cd6073e2f9652 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 29 Nov 2024 23:28:04 -0500 Subject: [PATCH 15/58] precommit --- tests/test_full_nlp.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index 5785ebb096..7f98dc3b2d 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -283,10 +283,6 @@ def test_full_nlp_pipeline( elif model_type == 'simpletransformer': pretraining_model = SimpleTransformerMaskedLM( vocab_size=30522, - num_layers=2, - d_model=128, - num_heads=2, - dim_feedforward=512, ) tokenizer = None else: @@ -312,10 +308,6 @@ def test_full_nlp_pipeline( finetuning_model = SimpleTransformerClassifier( vocab_size=30522, num_classes=3, - num_layers=2, - d_model=64, - num_heads=2, - dim_feedforward=256, ) else: raise ValueError('Unsupported model type.') From f150c14c93a2917f2fe25157c4b52b4dcff76521 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 29 Nov 2024 23:33:02 -0500 Subject: [PATCH 16/58] precommit --- tests/test_full_nlp.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index 7f98dc3b2d..a5689e623c 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -281,9 +281,7 @@ def test_full_nlp_pipeline( metrics=pretraining_metrics, ) elif model_type == 'simpletransformer': - pretraining_model = SimpleTransformerMaskedLM( - vocab_size=30522, - ) + pretraining_model = SimpleTransformerMaskedLM(vocab_size=30522,) tokenizer = None else: raise ValueError('Unsupported model type') From 8862b540f839a22db766432e65c11fa0bef299d0 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 29 Nov 2024 23:38:44 -0500 Subject: [PATCH 17/58] precommit --- tests/test_full_nlp.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index a5689e623c..81eb00f592 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -244,7 +244,6 @@ def test_full_nlp_pipeline( save_format, onnx_opset_version, tmp_path, - request, device, ): """This test is intended to exercise our full pipeline for NLP. 
@@ -281,7 +280,7 @@ def test_full_nlp_pipeline( metrics=pretraining_metrics, ) elif model_type == 'simpletransformer': - pretraining_model = SimpleTransformerMaskedLM(vocab_size=30522,) + pretraining_model = SimpleTransformerMaskedLM(vocab_size=30522) tokenizer = None else: raise ValueError('Unsupported model type') From 86373d1bac205c36be19cb9092c68c038b2d72a6 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Fri, 29 Nov 2024 23:45:04 -0500 Subject: [PATCH 18/58] precommit --- tests/test_full_nlp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index 81eb00f592..8bf1fa2b6b 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -260,6 +260,7 @@ def test_full_nlp_pipeline( algorithms = [algorithm() for algorithm in algorithms] device = get_device(device) + config = None if model_type == 'tinybert_hf': # Updated minimal BERT configuration config = BertConfig( From 57e6b271af90a2a268212f686d68faf55fe579cb Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 00:00:42 -0500 Subject: [PATCH 19/58] precommit --- tests/test_full_nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index 8bf1fa2b6b..d97202421b 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -233,7 +233,7 @@ def inference_test_helper( @pytest.mark.parametrize( 'model_type,algorithms,save_format', [ - ('tinybert_hf', [GradientClipping], 'onnx'), + ('tinybert_hf', [GradientClipping(clipping_type='norm', clipping_threshold=1.0)], 'onnx'), ('simpletransformer', [], 'torchscript'), ], ) From b20e7fda09ef5ad2445a8501f3dc0b022421ef1a Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 00:16:29 -0500 Subject: [PATCH 20/58] algo --- tests/test_full_nlp.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index d97202421b..9b3737d4ca 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -257,10 +257,9 @@ def test_full_nlp_pipeline( if onnx_opset_version == None and version.parse(torch.__version__) < version.parse('1.13'): pytest.skip("Don't test prior PyTorch version's default Opset version.") - algorithms = [algorithm() for algorithm in algorithms] - device = get_device(device) config = None + tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=128) if model_type == 'tinybert_hf': # Updated minimal BERT configuration config = BertConfig( @@ -271,7 +270,6 @@ def test_full_nlp_pipeline( intermediate_size=256, ) tiny_bert_model = BertForMaskedLM(config) - tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=128) pretraining_metrics = [LanguageCrossEntropy(ignore_index=-100), MaskedAccuracy(ignore_index=-100)] pretraining_model = HuggingFaceModel( From e2f9cc53db88f539be76e28819d6fcee7f13fc1a Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 00:22:04 -0500 Subject: [PATCH 21/58] algo --- tests/test_full_nlp.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index 9b3737d4ca..022924f2a3 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -259,7 +259,6 @@ def test_full_nlp_pipeline( device = get_device(device) config = None - tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=128) if model_type == 'tinybert_hf': # Updated minimal BERT configuration config = BertConfig( @@ -270,7 +269,7 @@ def 
test_full_nlp_pipeline( intermediate_size=256, ) tiny_bert_model = BertForMaskedLM(config) - + tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=128) pretraining_metrics = [LanguageCrossEntropy(ignore_index=-100), MaskedAccuracy(ignore_index=-100)] pretraining_model = HuggingFaceModel( tiny_bert_model, From 568596baa6683327ac5e294d788a941a3c8e0601 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 00:45:00 -0500 Subject: [PATCH 22/58] algo --- tests/test_full_nlp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index 022924f2a3..c006d251bc 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -259,6 +259,7 @@ def test_full_nlp_pipeline( device = get_device(device) config = None + tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=128) if model_type == 'tinybert_hf': # Updated minimal BERT configuration config = BertConfig( @@ -267,9 +268,9 @@ def test_full_nlp_pipeline( num_hidden_layers=2, num_attention_heads=2, intermediate_size=256, + num_labels=3, ) tiny_bert_model = BertForMaskedLM(config) - tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=128) pretraining_metrics = [LanguageCrossEntropy(ignore_index=-100), MaskedAccuracy(ignore_index=-100)] pretraining_model = HuggingFaceModel( tiny_bert_model, From 7772f96eba535d32dacf4e740f1cb9057b16f9ba Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 01:04:42 -0500 Subject: [PATCH 23/58] algo --- tests/test_full_nlp.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index c006d251bc..ed08e83e96 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -11,7 +11,7 @@ from torchmetrics.classification import MulticlassAccuracy from transformers import BertConfig, BertForMaskedLM, BertForSequenceClassification, BertTokenizerFast -from composer.algorithms import GradientClipping +from composer.algorithms import GatedLinearUnits from composer.loggers import RemoteUploaderDownloader from composer.metrics.nlp import LanguageCrossEntropy, MaskedAccuracy from composer.models import HuggingFaceModel @@ -233,7 +233,7 @@ def inference_test_helper( @pytest.mark.parametrize( 'model_type,algorithms,save_format', [ - ('tinybert_hf', [GradientClipping(clipping_type='norm', clipping_threshold=1.0)], 'onnx'), + ('tinybert_hf', [GatedLinearUnits], 'onnx'), ('simpletransformer', [], 'torchscript'), ], ) @@ -257,6 +257,7 @@ def test_full_nlp_pipeline( if onnx_opset_version == None and version.parse(torch.__version__) < version.parse('1.13'): pytest.skip("Don't test prior PyTorch version's default Opset version.") + algorithms = [algorithm() for algorithm in algorithms] device = get_device(device) config = None tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=128) From 3cfcb4cdb04b52e08ac1b2aef05e3184d88b8d66 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 01:23:49 -0500 Subject: [PATCH 24/58] algo --- tests/test_full_nlp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index ed08e83e96..0de1559fa8 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -281,7 +281,6 @@ def test_full_nlp_pipeline( ) elif model_type == 'simpletransformer': pretraining_model = SimpleTransformerMaskedLM(vocab_size=30522) - tokenizer = None else: raise ValueError('Unsupported model 
type') pretraining_output_path = pretraining_test_helper( From 0736c580dda117782d52f8277c2502779ff3fa5c Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 01:25:19 -0500 Subject: [PATCH 25/58] smaller --- tests/test_full_nlp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index 0de1559fa8..a3f342500b 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -265,10 +265,10 @@ def test_full_nlp_pipeline( # Updated minimal BERT configuration config = BertConfig( vocab_size=30522, - hidden_size=64, + hidden_size=16, num_hidden_layers=2, num_attention_heads=2, - intermediate_size=256, + intermediate_size=64, num_labels=3, ) tiny_bert_model = BertForMaskedLM(config) From f2317cedb9a075ed79fd85193f5aac45214ffdf4 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 02:01:19 -0500 Subject: [PATCH 26/58] smol vocab size --- tests/algorithms/test_algorithm_resumption.py | 36 ++++++++----------- tests/test_full_nlp.py | 7 ++-- 2 files changed, 19 insertions(+), 24 deletions(-) diff --git a/tests/algorithms/test_algorithm_resumption.py b/tests/algorithms/test_algorithm_resumption.py index e0b14ec198..47326a8b7c 100644 --- a/tests/algorithms/test_algorithm_resumption.py +++ b/tests/algorithms/test_algorithm_resumption.py @@ -20,8 +20,8 @@ @pytest.mark.parametrize('alg_cls', get_algs_with_marks()) @pytest.mark.filterwarnings( 'ignore:Detected call of `lr_scheduler.step()', -) # optimizer.step() sometimes skipped when NaN/inf on low batch size -@pytest.mark.filterwarnings(r'ignore:.*Plan failed with a cudnnException.*:UserWarning') # Torch 2.3 regression +) +@pytest.mark.filterwarnings(r'ignore:.*Plan failed with a cudnnException.*:UserWarning') @world_size(1, 2) def test_algorithm_resumption( tmp_path: pathlib.Path, @@ -54,14 +54,14 @@ def test_algorithm_resumption( scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5) shared_config = { - 'max_duration': '2ep', - 'save_filename': 'ep{epoch}-rank{rank}', - 'save_interval': '1ep', - 'train_subset_num_batches': 2, + 'max_duration': '2ba', + 'save_filename': 'ba{batch}-rank{rank}', + 'save_interval': '1ba', + 'train_subset_num_batches': 1, 'precision': 'amp_bf16', } train_dataloader = get_alg_dataloader(alg_cls) if world_size == 1 else get_alg_dataloader(alg_cls, multigpu=True) - # train model once, saving checkpoints every epoch + # train model once, saving checkpoints every batch trainer1 = Trainer( model=model, train_dataloader=train_dataloader, @@ -73,16 +73,11 @@ def test_algorithm_resumption( ) trainer1.fit() - # create second trainer, load an intermediate checkpoint - # and continue training - + # create second trainer, load from the first batch checkpoint, and continue training optimizer = torch.optim.Adam(copied_model.parameters(), lr=0.01) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5) alg = alg_cls(**alg_kwargs) - # SeqLengthWarmup has a call to ._activate_model() that happens on the first call to the algorithm - # in order to get complete matching of the rng state, we have to cause that extra call to be skipped - # when reloading. 
if alg_cls is SeqLengthWarmup: alg._activated = True # type: ignore @@ -90,7 +85,7 @@ def test_algorithm_resumption( trainer2 = Trainer( model=copied_model, train_dataloader=train_dataloader, - load_path=os.path.join(folder1, 'ep1-rank{rank}'), + load_path=os.path.join(folder1, 'ba1-rank{rank}'), load_weights_only=False, load_strict_model_weights=False, optimizers=optimizer, @@ -100,20 +95,19 @@ def test_algorithm_resumption( **shared_config, ) trainer2.fit() - # check that the checkpoints are equal + # check that the checkpoints after the second batch are equal if world_size == 1 or dist.get_global_rank() == 0: _assert_checkpoints_equal( - file1=os.path.join(folder1, 'ep2-rank0'), - file2=os.path.join(folder2, 'ep2-rank0'), + file1=os.path.join(folder1, 'ba2-rank0'), + file2=os.path.join(folder2, 'ba2-rank0'), ) - # check that different epoch checkpoints are _not_ equal - # this ensures that the model weights are being updated. + # ensure that the first and second batch checkpoints are not equal if world_size == 1 or dist.get_global_rank() == 0: with pytest.raises(AssertionError): _assert_model_weights_equal( - file1=os.path.join(folder1, 'ep1-rank0'), - file2=os.path.join(folder1, 'ep2-rank0'), + file1=os.path.join(folder1, 'ba1-rank0'), + file2=os.path.join(folder1, 'ba2-rank0'), ) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index a3f342500b..1974fcfed2 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -260,11 +260,12 @@ def test_full_nlp_pipeline( algorithms = [algorithm() for algorithm in algorithms] device = get_device(device) config = None + small_vocab_size = 1024 tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=128) if model_type == 'tinybert_hf': # Updated minimal BERT configuration config = BertConfig( - vocab_size=30522, + vocab_size=small_vocab_size, hidden_size=16, num_hidden_layers=2, num_attention_heads=2, @@ -280,7 +281,7 @@ def test_full_nlp_pipeline( metrics=pretraining_metrics, ) elif model_type == 'simpletransformer': - pretraining_model = SimpleTransformerMaskedLM(vocab_size=30522) + pretraining_model = SimpleTransformerMaskedLM(vocab_size=small_vocab_size) else: raise ValueError('Unsupported model type') pretraining_output_path = pretraining_test_helper( @@ -302,7 +303,7 @@ def test_full_nlp_pipeline( ) elif model_type == 'simpletransformer': finetuning_model = SimpleTransformerClassifier( - vocab_size=30522, + vocab_size=small_vocab_size, num_classes=3, ) else: From f1fbfbf0eb0f77d5af885c97ed5840932d7c9553 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 02:06:56 -0500 Subject: [PATCH 27/58] smol vocab size --- tests/test_full_nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index 1974fcfed2..63b7f310fa 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -260,7 +260,7 @@ def test_full_nlp_pipeline( algorithms = [algorithm() for algorithm in algorithms] device = get_device(device) config = None - small_vocab_size = 1024 + small_vocab_size = 2048 tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=128) if model_type == 'tinybert_hf': # Updated minimal BERT configuration From 4b5c590e8d6055ba3cf7bbc9b91b2fa4b4aaf7bf Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 02:23:39 -0500 Subject: [PATCH 28/58] tiniest model --- tests/test_full_nlp.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git 
a/tests/test_full_nlp.py b/tests/test_full_nlp.py index 63b7f310fa..fd86b4f8ae 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -9,7 +9,12 @@ from packaging import version from torch.utils.data import DataLoader from torchmetrics.classification import MulticlassAccuracy -from transformers import BertConfig, BertForMaskedLM, BertForSequenceClassification, BertTokenizerFast +from transformers import ( + AutoConfig, + AutoModelForMaskedLM, + AutoModelForSequenceClassification, + AutoTokenizer, +) from composer.algorithms import GatedLinearUnits from composer.loggers import RemoteUploaderDownloader @@ -24,7 +29,7 @@ def get_model_embeddings(model): if isinstance(model, HuggingFaceModel): - return model.model.bert.embeddings.word_embeddings.weight + return model.model.get_input_embeddings().weight elif isinstance(model, SimpleTransformerClassifier) or isinstance(model, SimpleTransformerMaskedLM): return model.transformer_base.embedding.weight else: @@ -259,23 +264,15 @@ def test_full_nlp_pipeline( algorithms = [algorithm() for algorithm in algorithms] device = get_device(device) - config = None - small_vocab_size = 2048 - tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=128) + model_name = 'hf-internal-testing/tiny-random-bert' + tokenizer = AutoTokenizer.from_pretrained(model_name) + small_vocab_size = tokenizer.vocab_size + if model_type == 'tinybert_hf': - # Updated minimal BERT configuration - config = BertConfig( - vocab_size=small_vocab_size, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=2, - intermediate_size=64, - num_labels=3, - ) - tiny_bert_model = BertForMaskedLM(config) + tiny_model = AutoModelForMaskedLM.from_pretrained(model_name) pretraining_metrics = [LanguageCrossEntropy(ignore_index=-100), MaskedAccuracy(ignore_index=-100)] pretraining_model = HuggingFaceModel( - tiny_bert_model, + tiny_model, tokenizer, use_logits=True, metrics=pretraining_metrics, @@ -294,9 +291,10 @@ def test_full_nlp_pipeline( # finetuning if model_type == 'tinybert_hf': + tiny_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3) finetuning_metric = MulticlassAccuracy(num_classes=3, average='micro') finetuning_model = HuggingFaceModel( - model=BertForSequenceClassification(config), + model=tiny_model, tokenizer=tokenizer, use_logits=True, metrics=[finetuning_metric], From 3277cba12d40eb1b90b1a077fec454129a9bed72 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 02:39:17 -0500 Subject: [PATCH 29/58] tiniest model --- tests/algorithms/test_algorithm_resumption.py | 4 ++-- tests/test_full_nlp.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/algorithms/test_algorithm_resumption.py b/tests/algorithms/test_algorithm_resumption.py index 47326a8b7c..557f921387 100644 --- a/tests/algorithms/test_algorithm_resumption.py +++ b/tests/algorithms/test_algorithm_resumption.py @@ -20,8 +20,8 @@ @pytest.mark.parametrize('alg_cls', get_algs_with_marks()) @pytest.mark.filterwarnings( 'ignore:Detected call of `lr_scheduler.step()', -) -@pytest.mark.filterwarnings(r'ignore:.*Plan failed with a cudnnException.*:UserWarning') +) # optimizer.step() sometimes skipped when NaN/inf on low batch size +@pytest.mark.filterwarnings(r'ignore:.*Plan failed with a cudnnException.*:UserWarning') # Torch 2.3 regression @world_size(1, 2) def test_algorithm_resumption( tmp_path: pathlib.Path, diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index fd86b4f8ae..2a83e714a2 100644 --- 
a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -10,7 +10,6 @@ from torch.utils.data import DataLoader from torchmetrics.classification import MulticlassAccuracy from transformers import ( - AutoConfig, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoTokenizer, @@ -291,7 +290,11 @@ def test_full_nlp_pipeline( # finetuning if model_type == 'tinybert_hf': - tiny_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3) + tiny_model = AutoModelForSequenceClassification.from_pretrained( + model_name, + num_labels=3, + ignore_mismatched_sizes=True, + ) finetuning_metric = MulticlassAccuracy(num_classes=3, average='micro') finetuning_model = HuggingFaceModel( model=tiny_model, From e638994eb07d12d8de6ab0ab06bde484acf2dbec Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 03:01:56 -0500 Subject: [PATCH 30/58] revert --- tests/algorithms/test_algorithm_resumption.py | 2 +- tests/test_full_nlp.py | 38 +++++++++---------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/tests/algorithms/test_algorithm_resumption.py b/tests/algorithms/test_algorithm_resumption.py index 557f921387..c00fdf2ade 100644 --- a/tests/algorithms/test_algorithm_resumption.py +++ b/tests/algorithms/test_algorithm_resumption.py @@ -57,7 +57,7 @@ def test_algorithm_resumption( 'max_duration': '2ba', 'save_filename': 'ba{batch}-rank{rank}', 'save_interval': '1ba', - 'train_subset_num_batches': 1, + 'train_subset_num_batches': 2, 'precision': 'amp_bf16', } train_dataloader = get_alg_dataloader(alg_cls) if world_size == 1 else get_alg_dataloader(alg_cls, multigpu=True) diff --git a/tests/test_full_nlp.py b/tests/test_full_nlp.py index 2a83e714a2..a3f342500b 100644 --- a/tests/test_full_nlp.py +++ b/tests/test_full_nlp.py @@ -9,11 +9,7 @@ from packaging import version from torch.utils.data import DataLoader from torchmetrics.classification import MulticlassAccuracy -from transformers import ( - AutoModelForMaskedLM, - AutoModelForSequenceClassification, - AutoTokenizer, -) +from transformers import BertConfig, BertForMaskedLM, BertForSequenceClassification, BertTokenizerFast from composer.algorithms import GatedLinearUnits from composer.loggers import RemoteUploaderDownloader @@ -28,7 +24,7 @@ def get_model_embeddings(model): if isinstance(model, HuggingFaceModel): - return model.model.get_input_embeddings().weight + return model.model.bert.embeddings.word_embeddings.weight elif isinstance(model, SimpleTransformerClassifier) or isinstance(model, SimpleTransformerMaskedLM): return model.transformer_base.embedding.weight else: @@ -263,21 +259,28 @@ def test_full_nlp_pipeline( algorithms = [algorithm() for algorithm in algorithms] device = get_device(device) - model_name = 'hf-internal-testing/tiny-random-bert' - tokenizer = AutoTokenizer.from_pretrained(model_name) - small_vocab_size = tokenizer.vocab_size - + config = None + tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', model_max_length=128) if model_type == 'tinybert_hf': - tiny_model = AutoModelForMaskedLM.from_pretrained(model_name) + # Updated minimal BERT configuration + config = BertConfig( + vocab_size=30522, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=2, + intermediate_size=64, + num_labels=3, + ) + tiny_bert_model = BertForMaskedLM(config) pretraining_metrics = [LanguageCrossEntropy(ignore_index=-100), MaskedAccuracy(ignore_index=-100)] pretraining_model = HuggingFaceModel( - tiny_model, + tiny_bert_model, tokenizer, use_logits=True, 
metrics=pretraining_metrics, ) elif model_type == 'simpletransformer': - pretraining_model = SimpleTransformerMaskedLM(vocab_size=small_vocab_size) + pretraining_model = SimpleTransformerMaskedLM(vocab_size=30522) else: raise ValueError('Unsupported model type') pretraining_output_path = pretraining_test_helper( @@ -290,21 +293,16 @@ def test_full_nlp_pipeline( # finetuning if model_type == 'tinybert_hf': - tiny_model = AutoModelForSequenceClassification.from_pretrained( - model_name, - num_labels=3, - ignore_mismatched_sizes=True, - ) finetuning_metric = MulticlassAccuracy(num_classes=3, average='micro') finetuning_model = HuggingFaceModel( - model=tiny_model, + model=BertForSequenceClassification(config), tokenizer=tokenizer, use_logits=True, metrics=[finetuning_metric], ) elif model_type == 'simpletransformer': finetuning_model = SimpleTransformerClassifier( - vocab_size=small_vocab_size, + vocab_size=30522, num_classes=3, ) else: From e5170d44818db1f0627bcc1aac9138b0b6a6e857 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 15:31:22 -0500 Subject: [PATCH 31/58] speedy resume --- tests/algorithms/test_algorithm_resumption.py | 167 ++++++++++-------- 1 file changed, 89 insertions(+), 78 deletions(-) diff --git a/tests/algorithms/test_algorithm_resumption.py b/tests/algorithms/test_algorithm_resumption.py index c00fdf2ade..f1c464e662 100644 --- a/tests/algorithms/test_algorithm_resumption.py +++ b/tests/algorithms/test_algorithm_resumption.py @@ -28,88 +28,99 @@ def test_algorithm_resumption( alg_cls: type[Algorithm], world_size, ): - folder1 = os.path.join(tmp_path, 'folder1') - folder2 = os.path.join(tmp_path, 'folder2') - os.makedirs(folder1, exist_ok=True) - os.makedirs(folder2, exist_ok=True) - - model = get_alg_model(alg_cls) - alg_kwargs = get_alg_kwargs(alg_cls) - - copied_model = copy.deepcopy(model) # copy the model so the params will start from the same point - - if alg_cls is LayerFreezing: - pytest.xfail('Known issues') - - if alg_cls in (SAM, StochasticDepth): - pytest.xfail('Mismatch in weights when resuming from a checkpoint.') - - if alg_cls is GyroDropout: - pytest.xfail('GyroDropoutLayer is not implemented in a way that allows correct resumption.') - - if alg_cls is SWA and world_size > 1: - pytest.xfail('SWA is not implemented in a way that is compatible correct resumption on multiple devices.') - - optimizer = torch.optim.Adam(model.parameters(), lr=0.01) - scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5) - - shared_config = { - 'max_duration': '2ba', - 'save_filename': 'ba{batch}-rank{rank}', - 'save_interval': '1ba', - 'train_subset_num_batches': 2, - 'precision': 'amp_bf16', - } - train_dataloader = get_alg_dataloader(alg_cls) if world_size == 1 else get_alg_dataloader(alg_cls, multigpu=True) - # train model once, saving checkpoints every batch - trainer1 = Trainer( - model=model, - train_dataloader=train_dataloader, - optimizers=optimizer, - schedulers=scheduler, - save_folder=folder1, - algorithms=alg_cls(**alg_kwargs), - **shared_config, - ) - trainer1.fit() - - # create second trainer, load from the first batch checkpoint, and continue training - optimizer = torch.optim.Adam(copied_model.parameters(), lr=0.01) - scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5) - - alg = alg_cls(**alg_kwargs) - if alg_cls is SeqLengthWarmup: - alg._activated = True # type: ignore - - train_dataloader = get_alg_dataloader(alg_cls) if world_size == 1 else get_alg_dataloader(alg_cls, multigpu=True) - trainer2 = 
Trainer( - model=copied_model, - train_dataloader=train_dataloader, - load_path=os.path.join(folder1, 'ba1-rank{rank}'), - load_weights_only=False, - load_strict_model_weights=False, - optimizers=optimizer, - schedulers=scheduler, - save_folder=folder2, - algorithms=alg, - **shared_config, - ) - trainer2.fit() - # check that the checkpoints after the second batch are equal - if world_size == 1 or dist.get_global_rank() == 0: - _assert_checkpoints_equal( - file1=os.path.join(folder1, 'ba2-rank0'), - file2=os.path.join(folder2, 'ba2-rank0'), + # Use RAM-based tmp directory instead of disk + from tempfile import TemporaryDirectory + with TemporaryDirectory() as tmpdir: + folder1 = os.path.join(tmpdir, 'folder1') + folder2 = os.path.join(tmpdir, 'folder2') + os.makedirs(folder1, exist_ok=True) + os.makedirs(folder2, exist_ok=True) + + if alg_cls is LayerFreezing: + pytest.xfail('Known issues') + + if alg_cls in (SAM, StochasticDepth): + pytest.xfail('Mismatch in weights when resuming from a checkpoint.') + + if alg_cls is GyroDropout: + pytest.xfail('GyroDropoutLayer is not implemented in a way that allows correct resumption.') + + if alg_cls is SWA and world_size > 1: + pytest.xfail('SWA is not implemented in a way that is compatible correct resumption on multiple devices.') + + model = get_alg_model(alg_cls) + alg_kwargs = get_alg_kwargs(alg_cls) + + copied_model = copy.deepcopy(model) # copy the model so the params will start from the same point + + optimizer = torch.optim.SGD(model.parameters(), lr=0.1) + scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + + # Reduce training duration and data + shared_config = { + 'max_duration': '2ba', + 'save_filename': 'ba{batch}-rank{rank}', + 'save_interval': '1ba', + 'train_subset_num_batches': 2, + 'precision': 'fp32', + } + train_dataloader = get_alg_dataloader(alg_cls) if world_size == 1 else get_alg_dataloader(alg_cls, multigpu=True) + # train model once, saving checkpoints every epoch + trainer1 = Trainer( + model=model, + train_dataloader=train_dataloader, + optimizers=optimizer, + schedulers=scheduler, + save_folder=folder1, + algorithms=alg_cls(**alg_kwargs), + **shared_config, ) + trainer1.fit() + + # create second trainer, load an intermediate checkpoint + # and continue training + + optimizer = torch.optim.SGD(copied_model.parameters(), lr=0.1) + scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + + alg = alg_cls(**alg_kwargs) + # SeqLengthWarmup has a call to ._activate_model() that happens on the first call to the algorithm + # in order to get complete matching of the rng state, we have to cause that extra call to be skipped + # when reloading. 
+ if alg_cls is SeqLengthWarmup: + alg._activated = True # type: ignore + train_dataloader = get_alg_dataloader(alg_cls) if world_size == 1 else get_alg_dataloader(alg_cls, multigpu=True) + + trainer2 = Trainer( + model=copied_model, + train_dataloader=train_dataloader, + load_path=os.path.join(folder1, 'ep1-rank{rank}'), + load_weights_only=False, + load_strict_model_weights=False, + optimizers=optimizer, + schedulers=scheduler, + save_folder=folder2, + algorithms=alg, + **shared_config, + ) + trainer2.fit() - # ensure that the first and second batch checkpoints are not equal - if world_size == 1 or dist.get_global_rank() == 0: - with pytest.raises(AssertionError): - _assert_model_weights_equal( - file1=os.path.join(folder1, 'ba1-rank0'), - file2=os.path.join(folder1, 'ba2-rank0'), + # check that the checkpoints are equal + if world_size == 1 or dist.get_global_rank() == 0: + _assert_checkpoints_equal( + file1=os.path.join(folder1, 'ep2-rank0'), + file2=os.path.join(folder2, 'ep2-rank0'), ) + # check that different epoch checkpoints are _not_ equal + # this ensures that the model weights are being updated. + if world_size == 1 or dist.get_global_rank() == 0: + with pytest.raises(AssertionError): + _assert_model_weights_equal( + file1=os.path.join(folder1, 'ep1-rank0'), + file2=os.path.join(folder1, 'ep2-rank0'), + ) + def _assert_checkpoints_equal(file1, file2): # TODO: consider merging with _assert_checkpoints_equivalent From e3b34da819d74f6f5a03dc4efb0b5364e2a4cebf Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 15:46:48 -0500 Subject: [PATCH 32/58] fix pathing --- tests/algorithms/test_algorithm_resumption.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tests/algorithms/test_algorithm_resumption.py b/tests/algorithms/test_algorithm_resumption.py index f1c464e662..520a7ae5a1 100644 --- a/tests/algorithms/test_algorithm_resumption.py +++ b/tests/algorithms/test_algorithm_resumption.py @@ -15,7 +15,6 @@ from tests.common import deep_compare from tests.common.markers import world_size - @pytest.mark.gpu @pytest.mark.parametrize('alg_cls', get_algs_with_marks()) @pytest.mark.filterwarnings( @@ -59,7 +58,7 @@ def test_algorithm_resumption( # Reduce training duration and data shared_config = { 'max_duration': '2ba', - 'save_filename': 'ba{batch}-rank{rank}', + 'save_filename': 'checkpoint_ba{batch}-rank{rank}', 'save_interval': '1ba', 'train_subset_num_batches': 2, 'precision': 'fp32', @@ -94,7 +93,7 @@ def test_algorithm_resumption( trainer2 = Trainer( model=copied_model, train_dataloader=train_dataloader, - load_path=os.path.join(folder1, 'ep1-rank{rank}'), + load_path=os.path.join(folder1, 'checkpoint_ba1-rank{rank}'), load_weights_only=False, load_strict_model_weights=False, optimizers=optimizer, @@ -108,17 +107,16 @@ def test_algorithm_resumption( # check that the checkpoints are equal if world_size == 1 or dist.get_global_rank() == 0: _assert_checkpoints_equal( - file1=os.path.join(folder1, 'ep2-rank0'), - file2=os.path.join(folder2, 'ep2-rank0'), + os.path.join(folder1, 'checkpoint_ba2-rank0'), + os.path.join(folder2, 'checkpoint_ba2-rank0'), ) - # check that different epoch checkpoints are _not_ equal - # this ensures that the model weights are being updated. - if world_size == 1 or dist.get_global_rank() == 0: + # check that different epoch checkpoints are _not_ equal + # this ensures that the model weights are being updated. 
with pytest.raises(AssertionError): _assert_model_weights_equal( - file1=os.path.join(folder1, 'ep1-rank0'), - file2=os.path.join(folder1, 'ep2-rank0'), + os.path.join(folder1, 'checkpoint_ba1-rank0'), + os.path.join(folder1, 'checkpoint_ba2-rank0'), ) From 324224ebb12c4016709044efeee0bc10191e537c Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 16:01:42 -0500 Subject: [PATCH 33/58] bf16 --- tests/algorithms/test_algorithm_resumption.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/algorithms/test_algorithm_resumption.py b/tests/algorithms/test_algorithm_resumption.py index 520a7ae5a1..ec6db80204 100644 --- a/tests/algorithms/test_algorithm_resumption.py +++ b/tests/algorithms/test_algorithm_resumption.py @@ -61,7 +61,7 @@ def test_algorithm_resumption( 'save_filename': 'checkpoint_ba{batch}-rank{rank}', 'save_interval': '1ba', 'train_subset_num_batches': 2, - 'precision': 'fp32', + 'precision': 'amp_bf16', } train_dataloader = get_alg_dataloader(alg_cls) if world_size == 1 else get_alg_dataloader(alg_cls, multigpu=True) # train model once, saving checkpoints every epoch From 57b9110507128f490398e4adfdcc211a6e638587 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 16:24:38 -0500 Subject: [PATCH 34/58] faster precision --- tests/algorithms/test_algorithm_resumption.py | 17 +++++++++++------ tests/test_precision.py | 14 +++++++------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/tests/algorithms/test_algorithm_resumption.py b/tests/algorithms/test_algorithm_resumption.py index ec6db80204..6d6145cf62 100644 --- a/tests/algorithms/test_algorithm_resumption.py +++ b/tests/algorithms/test_algorithm_resumption.py @@ -15,6 +15,7 @@ from tests.common import deep_compare from tests.common.markers import world_size + @pytest.mark.gpu @pytest.mark.parametrize('alg_cls', get_algs_with_marks()) @pytest.mark.filterwarnings( @@ -63,7 +64,9 @@ def test_algorithm_resumption( 'train_subset_num_batches': 2, 'precision': 'amp_bf16', } - train_dataloader = get_alg_dataloader(alg_cls) if world_size == 1 else get_alg_dataloader(alg_cls, multigpu=True) + train_dataloader = get_alg_dataloader( + alg_cls, + ) if world_size == 1 else get_alg_dataloader(alg_cls, multigpu=True) # train model once, saving checkpoints every epoch trainer1 = Trainer( model=model, @@ -78,7 +81,7 @@ def test_algorithm_resumption( # create second trainer, load an intermediate checkpoint # and continue training - + optimizer = torch.optim.SGD(copied_model.parameters(), lr=0.1) scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) @@ -88,8 +91,10 @@ def test_algorithm_resumption( # when reloading. 
if alg_cls is SeqLengthWarmup: alg._activated = True # type: ignore - train_dataloader = get_alg_dataloader(alg_cls) if world_size == 1 else get_alg_dataloader(alg_cls, multigpu=True) - + train_dataloader = get_alg_dataloader( + alg_cls, + ) if world_size == 1 else get_alg_dataloader(alg_cls, multigpu=True) + trainer2 = Trainer( model=copied_model, train_dataloader=train_dataloader, @@ -107,8 +112,8 @@ def test_algorithm_resumption( # check that the checkpoints are equal if world_size == 1 or dist.get_global_rank() == 0: _assert_checkpoints_equal( - os.path.join(folder1, 'checkpoint_ba2-rank0'), - os.path.join(folder2, 'checkpoint_ba2-rank0'), + os.path.join(folder1, 'checkpoint_ba2-rank0'), + os.path.join(folder2, 'checkpoint_ba2-rank0'), ) # check that different epoch checkpoints are _not_ equal diff --git a/tests/test_precision.py b/tests/test_precision.py index a23ab5f11b..da3f949886 100644 --- a/tests/test_precision.py +++ b/tests/test_precision.py @@ -23,22 +23,22 @@ def get_trainer(precision: Precision, precision_config: Optional[dict[str, Any]] return Trainer( model=composer_resnet('resnet18'), train_dataloader=DataLoader( - dataset=RandomImageDataset(size=1024), - batch_size=512, + dataset=RandomImageDataset(size=128), + batch_size=128, persistent_workers=False, num_workers=0, ), eval_dataloader=DataLoader( - dataset=RandomImageDataset(size=1024), - batch_size=512, + dataset=RandomImageDataset(size=128), + batch_size=128, persistent_workers=False, num_workers=0, ), precision=precision, precision_config=precision_config, - max_duration='1ep', - eval_interval='1ep', - train_subset_num_batches=1, + max_duration='2ba', + eval_interval='2ba', + train_subset_num_batches=2, ) From 0b13105b7f52c2b260e17f310ca01c4191183cae Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 16:26:48 -0500 Subject: [PATCH 35/58] 1 batch --- tests/test_precision.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_precision.py b/tests/test_precision.py index da3f949886..e9f3e32756 100644 --- a/tests/test_precision.py +++ b/tests/test_precision.py @@ -36,9 +36,9 @@ def get_trainer(precision: Precision, precision_config: Optional[dict[str, Any]] ), precision=precision, precision_config=precision_config, - max_duration='2ba', - eval_interval='2ba', - train_subset_num_batches=2, + max_duration='1ba', + eval_interval='1ba', + train_subset_num_batches=1, ) From 3fd229ab23b99f2905bb2097d458105693ba580e Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 16:50:15 -0500 Subject: [PATCH 36/58] epoch --- tests/test_precision.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_precision.py b/tests/test_precision.py index e9f3e32756..0a2f40a559 100644 --- a/tests/test_precision.py +++ b/tests/test_precision.py @@ -36,8 +36,8 @@ def get_trainer(precision: Precision, precision_config: Optional[dict[str, Any]] ), precision=precision, precision_config=precision_config, - max_duration='1ba', - eval_interval='1ba', + max_duration='1ep', + eval_interval='1ep', train_subset_num_batches=1, ) From d43f6a081d5a38545575cbddb73f47955a7d4056 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 17:11:14 -0500 Subject: [PATCH 37/58] smaller gradient clipping --- tests/algorithms/test_gradient_clipping.py | 8 +++----- tests/test_precision.py | 8 ++++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/tests/algorithms/test_gradient_clipping.py b/tests/algorithms/test_gradient_clipping.py index d749362801..eb95c1359a
100644 --- a/tests/algorithms/test_gradient_clipping.py +++ b/tests/algorithms/test_gradient_clipping.py @@ -20,7 +20,7 @@ def simple_model_with_grads(): # Set up small NN with one linear layer with no bias + softmax, so only # one set of params and get some gradients. - N, hin, num_classes = 8, 4, 3 + N, hin, num_classes = 4, 2, 2 x = torch.rand((N, hin)) y = torch.randint(high=num_classes - 1, size=(N,)) model = nn.Sequential(nn.Linear(hin, num_classes, bias=False), nn.Softmax(dim=1)) @@ -47,8 +47,6 @@ def __init__(self, n_ch, num_fmaps, h, num_classes, filter_size): self.mlp = nn.Sequential( nn.Linear(num_fmaps, h), nn.ReLU(), - nn.Linear(h, h), - nn.ReLU(), nn.Linear(h, num_classes), nn.Softmax(dim=1), ) @@ -60,8 +58,8 @@ def forward(self, x): return out # Generate some gradients. - N, n_ch, num_fmaps, h, num_classes, filter_size = 8, 3, 4, 4, 3, 3 - x = torch.rand((N, n_ch, 16, 16)) + N, n_ch, num_fmaps, h, num_classes, filter_size = 4, 1, 2, 2, 2, 2 + x = torch.rand((N, n_ch, 8, 8)) y = torch.randint(high=num_classes - 1, size=(N,)) model = myNN(n_ch, num_fmaps, h, num_classes, filter_size) diff --git a/tests/test_precision.py b/tests/test_precision.py index 0a2f40a559..a23ab5f11b 100644 --- a/tests/test_precision.py +++ b/tests/test_precision.py @@ -23,14 +23,14 @@ def get_trainer(precision: Precision, precision_config: Optional[dict[str, Any]] return Trainer( model=composer_resnet('resnet18'), train_dataloader=DataLoader( - dataset=RandomImageDataset(size=128), - batch_size=128, + dataset=RandomImageDataset(size=1024), + batch_size=512, persistent_workers=False, num_workers=0, ), eval_dataloader=DataLoader( - dataset=RandomImageDataset(size=128), - batch_size=128, + dataset=RandomImageDataset(size=1024), + batch_size=512, persistent_workers=False, num_workers=0, ), From 4ca7bbbd4c1c68c5dc8c3be531535862db9acd4f Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 17:26:53 -0500 Subject: [PATCH 38/58] speed up fsdp --- tests/trainer/test_fsdp_checkpoint.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py index 332ed5b7b7..75e1412f0b 100644 --- a/tests/trainer/test_fsdp_checkpoint.py +++ b/tests/trainer/test_fsdp_checkpoint.py @@ -83,8 +83,8 @@ def get_trainer( save_folder: Optional[str] = None, save_filename: str = 'ba{batch}-rank{rank}.pt', save_overwrite: bool = False, - num_features: int = 4, - num_classes: int = 2, + num_features: int = 4, # Reduced from default + num_classes: int = 2, # Reduced from default load_path: Optional[str] = None, autoresume: bool = False, run_name: Optional[str] = None, @@ -111,11 +111,11 @@ def get_trainer( val_metrics=val_metrics, ) model.module.to(model_init_device) - dataset = RandomClassificationDataset(shape=(num_features,), num_classes=num_classes, size=128) + dataset = RandomClassificationDataset(shape=(num_features,), num_classes=num_classes, size=32) dataloader = DataLoader( dataset, sampler=dist.get_sampler(dataset), - batch_size=8, + batch_size=2, ) if optimizer == 'adam': optim = torch.optim.Adam(params=model.parameters()) From 17cbd20f6d48c82d7c5b273c8c9891b4eee41476 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 17:42:19 -0500 Subject: [PATCH 39/58] reduce batchsize --- tests/test_events.py | 8 ++++---- tests/trainer/test_fsdp_checkpoint.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_events.py b/tests/test_events.py index 235d0941f1..5a451f6a92 100644 --- 
a/tests/test_events.py +++ b/tests/test_events.py @@ -31,17 +31,17 @@ def get_trainer(self, precision='fp32', **kwargs): train_dataset = RandomClassificationDataset() eval_dataset = RandomClassificationDataset() - train_batch_size = 4 + train_batch_size = 2 evaluator1 = DataLoader( dataset=eval_dataset, - batch_size=8, + batch_size=2, sampler=dist.get_sampler(eval_dataset), ) evaluator2 = DataLoader( dataset=eval_dataset, - batch_size=4, + batch_size=2, sampler=dist.get_sampler(eval_dataset), ) @@ -57,7 +57,7 @@ def get_trainer(self, precision='fp32', **kwargs): precision=precision, train_subset_num_batches=self.train_subset_num_batches, eval_subset_num_batches=self.eval_subset_num_batches, - max_duration='2ep', + max_duration='1ep1ba', optimizers=optimizer, callbacks=[EventCounterCallback()], **kwargs, diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py index 75e1412f0b..b0890d26a9 100644 --- a/tests/trainer/test_fsdp_checkpoint.py +++ b/tests/trainer/test_fsdp_checkpoint.py @@ -83,8 +83,8 @@ def get_trainer( save_folder: Optional[str] = None, save_filename: str = 'ba{batch}-rank{rank}.pt', save_overwrite: bool = False, - num_features: int = 4, # Reduced from default - num_classes: int = 2, # Reduced from default + num_features: int = 4, + num_classes: int = 2, load_path: Optional[str] = None, autoresume: bool = False, run_name: Optional[str] = None, @@ -111,7 +111,7 @@ def get_trainer( val_metrics=val_metrics, ) model.module.to(model_init_device) - dataset = RandomClassificationDataset(shape=(num_features,), num_classes=num_classes, size=32) + dataset = RandomClassificationDataset(shape=(num_features,), num_classes=num_classes, size=8) dataloader = DataLoader( dataset, sampler=dist.get_sampler(dataset), From 3c826e6534aef5ce59f2c1683d0c2676ab400fdd Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 17:55:09 -0500 Subject: [PATCH 40/58] typo --- tests/test_events.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_events.py b/tests/test_events.py index 5a451f6a92..601448e2a2 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -57,7 +57,7 @@ def get_trainer(self, precision='fp32', **kwargs): precision=precision, train_subset_num_batches=self.train_subset_num_batches, eval_subset_num_batches=self.eval_subset_num_batches, - max_duration='1ep1ba', + max_duration='1ep', optimizers=optimizer, callbacks=[EventCounterCallback()], **kwargs, From 62f0bea8d9f85d4becb52631f29dd0563d2cd72a Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 18:00:06 -0500 Subject: [PATCH 41/58] revert --- tests/test_events.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_events.py b/tests/test_events.py index 601448e2a2..f15dcb1fcb 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -57,7 +57,7 @@ def get_trainer(self, precision='fp32', **kwargs): precision=precision, train_subset_num_batches=self.train_subset_num_batches, eval_subset_num_batches=self.eval_subset_num_batches, - max_duration='1ep', + max_duration='2ep', optimizers=optimizer, callbacks=[EventCounterCallback()], **kwargs, From 69f13bad7a5ced5c1ffba04e808cfa7429cfc2a2 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 18:14:37 -0500 Subject: [PATCH 42/58] update num epochs --- tests/test_events.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_events.py b/tests/test_events.py index f15dcb1fcb..4fba3eaa27 100644 --- a/tests/test_events.py +++ 
b/tests/test_events.py @@ -57,7 +57,7 @@ def get_trainer(self, precision='fp32', **kwargs): precision=precision, train_subset_num_batches=self.train_subset_num_batches, eval_subset_num_batches=self.eval_subset_num_batches, - max_duration='2ep', + max_duration='1ep', optimizers=optimizer, callbacks=[EventCounterCallback()], **kwargs, @@ -127,7 +127,7 @@ def test_event_calls(self, world_size, device, deepspeed_zero_stage, use_fsdp, p ) trainer.fit() - self._assert_expected_event_calls(trainer, save_interval, num_epochs=2) + self._assert_expected_event_calls(trainer, save_interval, num_epochs=1) def _assert_expected_event_calls(self, trainer: Trainer, eval_interval: Time, num_epochs: int): state = trainer.state From 7b69fc32fcd31b022268b113c69b04283d580250 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 18:28:19 -0500 Subject: [PATCH 43/58] one epoch only --- tests/test_events.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/test_events.py b/tests/test_events.py index 4fba3eaa27..b84c90327f 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -22,16 +22,16 @@ def test_event_values(event: Event): class TestEventCalls: - eval_subset_num_batches = 2 - train_subset_num_batches = 2 + eval_subset_num_batches = 1 + train_subset_num_batches = 1 - def get_trainer(self, precision='fp32', **kwargs): + def get_trainer(self, precision='fp32', max_duration='1ep', **kwargs): model = SimpleModel() optimizer = torch.optim.Adam(model.parameters()) train_dataset = RandomClassificationDataset() eval_dataset = RandomClassificationDataset() - train_batch_size = 2 + train_batch_size = 2 evaluator1 = DataLoader( dataset=eval_dataset, batch_size=8, sampler=dist.get_sampler(eval_dataset), ) @@ -57,7 +57,7 @@ def get_trainer(self, precision='fp32', **kwargs): precision=precision, train_subset_num_batches=self.train_subset_num_batches, eval_subset_num_batches=self.eval_subset_num_batches, - max_duration='1ep', + max_duration=max_duration, optimizers=optimizer, callbacks=[EventCounterCallback()], **kwargs, @@ -99,7 +99,7 @@ def get_trainer(self, precision='fp32', **kwargs): ), ], ) - @pytest.mark.parametrize('save_interval', ['1ep']) + @pytest.mark.parametrize('save_interval', ['1ep', '1ba']) def test_event_calls(self, world_size, device, deepspeed_zero_stage, use_fsdp, precision, save_interval): save_interval = Time.from_timestring(save_interval) @@ -124,6 +124,7 @@ def test_event_calls(self, world_size, device, deepspeed_zero_stage, use_fsdp, p parallelism_config=parallelism_config, save_interval=save_interval, eval_interval=save_interval, + max_duration='1ep', ) trainer.fit() From 868a71b54733a39f8b0c7a73e75512ba516efac8 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 18:45:05 -0500 Subject: [PATCH 44/58] revert --- tests/trainer/test_fsdp_checkpoint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py index b0890d26a9..332ed5b7b7 100644 --- a/tests/trainer/test_fsdp_checkpoint.py +++ b/tests/trainer/test_fsdp_checkpoint.py @@ -111,11 +111,11 @@ def get_trainer( val_metrics=val_metrics, ) model.module.to(model_init_device) - dataset = RandomClassificationDataset(shape=(num_features,), num_classes=num_classes, size=8) + dataset = RandomClassificationDataset(shape=(num_features,), num_classes=num_classes, size=128) dataloader = DataLoader( dataset, sampler=dist.get_sampler(dataset), - batch_size=2, + batch_size=8, ) if optimizer == 'adam': optim = torch.optim.Adam(params=model.parameters())
From ceb6591cb811a704d119874a2dc591b77144eaef Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 23:41:04 -0500 Subject: [PATCH 45/58] precommit --- tests/test_events.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_events.py b/tests/test_events.py index b84c90327f..396e5c2fa9 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -31,7 +31,7 @@ def get_trainer(self, precision='fp32', max_duration='1ep', **kwargs): train_dataset = RandomClassificationDataset() eval_dataset = RandomClassificationDataset() - train_batch_size = 2 + train_batch_size = 2 evaluator1 = DataLoader( dataset=eval_dataset, @@ -99,7 +99,7 @@ def get_trainer(self, precision='fp32', max_duration='1ep', **kwargs): ), ], ) - @pytest.mark.parametrize('save_interval', ['1ep']) + @pytest.mark.parametrize('save_interval', ['1ep', '1ba']) def test_event_calls(self, world_size, device, deepspeed_zero_stage, use_fsdp, precision, save_interval): save_interval = Time.from_timestring(save_interval) From 6785988a86e0c9f95f17759065276b219e0f5a24 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sat, 30 Nov 2024 23:42:41 -0500 Subject: [PATCH 46/58] clean --- tests/test_events.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_events.py b/tests/test_events.py index 396e5c2fa9..8cee8d22e9 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -25,7 +25,7 @@ class TestEventCalls: eval_subset_num_batches = 1 train_subset_num_batches = 1 - def get_trainer(self, precision='fp32', max_duration='1ep', **kwargs): + def get_trainer(self, precision='fp32', **kwargs): model = SimpleModel() optimizer = torch.optim.Adam(model.parameters()) @@ -57,7 +57,7 @@ def get_trainer(self, precision='fp32', max_duration='1ep', **kwargs): precision=precision, train_subset_num_batches=self.train_subset_num_batches, eval_subset_num_batches=self.eval_subset_num_batches, - max_duration=max_duration, + max_duration='1ep', optimizers=optimizer, callbacks=[EventCounterCallback()], **kwargs, From ce2362b982bc4a42c51c932ff84495c09d9cb10f Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sun, 1 Dec 2024 00:02:37 -0500 Subject: [PATCH 47/58] clean --- tests/test_events.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_events.py b/tests/test_events.py index 8cee8d22e9..5df351b178 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -124,7 +124,6 @@ def test_event_calls(self, world_size, device, deepspeed_zero_stage, use_fsdp, p parallelism_config=parallelism_config, save_interval=save_interval, eval_interval=save_interval, - max_duration='1ep', ) trainer.fit() From a4afdc030b16f3a0ac06227130193ff1d843c5fd Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sun, 1 Dec 2024 00:27:52 -0500 Subject: [PATCH 48/58] clean --- tests/test_events.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_events.py b/tests/test_events.py index 5df351b178..9fe73f0944 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -29,19 +29,19 @@ def get_trainer(self, precision='fp32', **kwargs): model = SimpleModel() optimizer = torch.optim.Adam(model.parameters()) - train_dataset = RandomClassificationDataset() - eval_dataset = RandomClassificationDataset() - train_batch_size = 2 + train_dataset = RandomClassificationDataset(size=16) + eval_dataset = RandomClassificationDataset(size=16) + train_batch_size = 4 evaluator1 = DataLoader( dataset=eval_dataset, - batch_size=2, + batch_size=8, sampler=dist.get_sampler(eval_dataset), ) 
evaluator2 = DataLoader( dataset=eval_dataset, - batch_size=2, + batch_size=4, sampler=dist.get_sampler(eval_dataset), ) From e595c529983e77e92e92e4278a82f84015947936 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sun, 1 Dec 2024 00:49:23 -0500 Subject: [PATCH 49/58] mock --- tests/test_events.py | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/tests/test_events.py b/tests/test_events.py index 9fe73f0944..cc33f4db5c 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -1,7 +1,8 @@ -# Copyright 2022 MosaicML Composer authors +# Copyright ... # SPDX-License-Identifier: Apache-2.0 import math +from unittest.mock import patch import pytest import torch @@ -25,24 +26,27 @@ class TestEventCalls: eval_subset_num_batches = 1 train_subset_num_batches = 1 - def get_trainer(self, precision='fp32', **kwargs): + def get_trainer(self, precision='fp32', max_duration='1ep', save_interval='1ep', **kwargs): model = SimpleModel() optimizer = torch.optim.Adam(model.parameters()) - train_dataset = RandomClassificationDataset(size=16) - eval_dataset = RandomClassificationDataset(size=16) - train_batch_size = 4 + # Minimal dataset size to reduce batches + train_dataset = RandomClassificationDataset(size=4) + eval_dataset = RandomClassificationDataset(size=4) + train_batch_size = 4 evaluator1 = DataLoader( dataset=eval_dataset, - batch_size=8, + batch_size=4, sampler=dist.get_sampler(eval_dataset), + num_workers=0, ) evaluator2 = DataLoader( dataset=eval_dataset, batch_size=4, sampler=dist.get_sampler(eval_dataset), + num_workers=0, ) return Trainer( @@ -51,13 +55,15 @@ def get_trainer(self, precision='fp32', **kwargs): dataset=train_dataset, batch_size=train_batch_size, sampler=dist.get_sampler(train_dataset), + num_workers=0, ), eval_dataloader=(evaluator1, evaluator2), - device_train_microbatch_size=train_batch_size // 2, + device_train_microbatch_size=train_batch_size, precision=precision, train_subset_num_batches=self.train_subset_num_batches, eval_subset_num_batches=self.eval_subset_num_batches, - max_duration='1ep', + max_duration=max_duration, + save_interval=save_interval, optimizers=optimizer, callbacks=[EventCounterCallback()], **kwargs, @@ -101,6 +107,16 @@ def get_trainer(self, precision='fp32', **kwargs): ) @pytest.mark.parametrize('save_interval', ['1ep', '1ba']) def test_event_calls(self, world_size, device, deepspeed_zero_stage, use_fsdp, precision, save_interval): + # handle 1ba save interval separately to optimize speed + if save_interval == '1ba': + # mock the save_checkpoint method to speed up batch saves + with patch('composer.trainer.trainer.Trainer.save_checkpoint') as mock_save: + mock_save.return_value = None + self._run_event_calls_test(world_size, device, deepspeed_zero_stage, use_fsdp, precision, save_interval, num_epochs=1) + else: + self._run_event_calls_test(world_size, device, deepspeed_zero_stage, use_fsdp, precision, save_interval, num_epochs=1) + + def _run_event_calls_test(self, world_size, device, deepspeed_zero_stage, use_fsdp, precision, save_interval, num_epochs): save_interval = Time.from_timestring(save_interval) deepspeed_config = None @@ -127,7 +143,7 @@ def test_event_calls(self, world_size, device, deepspeed_zero_stage, use_fsdp, p ) trainer.fit() - self._assert_expected_event_calls(trainer, save_interval, num_epochs=1) + self._assert_expected_event_calls(trainer, save_interval, num_epochs=num_epochs) def _assert_expected_event_calls(self, trainer: Trainer, eval_interval: Time, num_epochs: int): state 
= trainer.state From b72ad75a9ade7952d636a7edf62dad78e053b194 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sun, 1 Dec 2024 00:54:45 -0500 Subject: [PATCH 50/58] precommit --- tests/test_events.py | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/tests/test_events.py b/tests/test_events.py index cc33f4db5c..bbb0b96261 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -1,4 +1,4 @@ -# Copyright ... +# Copyright 2024 MosaicML Composer authors # SPDX-License-Identifier: Apache-2.0 import math @@ -33,7 +33,7 @@ def get_trainer(self, precision='fp32', max_duration='1ep', save_interval='1ep', # Minimal dataset size to reduce batches train_dataset = RandomClassificationDataset(size=4) eval_dataset = RandomClassificationDataset(size=4) - train_batch_size = 4 + train_batch_size = 4 evaluator1 = DataLoader( dataset=eval_dataset, @@ -112,11 +112,36 @@ def test_event_calls(self, world_size, device, deepspeed_zero_stage, use_fsdp, p # mock the save_checkpoint method to speed up batch saves with patch('composer.trainer.trainer.Trainer.save_checkpoint') as mock_save: mock_save.return_value = None - self._run_event_calls_test(world_size, device, deepspeed_zero_stage, use_fsdp, precision, save_interval, num_epochs=1) + self._run_event_calls_test( + world_size, + device, + deepspeed_zero_stage, + use_fsdp, + precision, + save_interval, + num_epochs=1, + ) else: - self._run_event_calls_test(world_size, device, deepspeed_zero_stage, use_fsdp, precision, save_interval, num_epochs=1) - - def _run_event_calls_test(self, world_size, device, deepspeed_zero_stage, use_fsdp, precision, save_interval, num_epochs): + self._run_event_calls_test( + world_size, + device, + deepspeed_zero_stage, + use_fsdp, + precision, + save_interval, + num_epochs=1, + ) + + def _run_event_calls_test( + self, + world_size, + device, + deepspeed_zero_stage, + use_fsdp, + precision, + save_interval, + num_epochs, + ): save_interval = Time.from_timestring(save_interval) deepspeed_config = None From 477b59784be4ed8ab18d5ebc7cf1cc21dc6dcb27 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sun, 1 Dec 2024 01:03:45 -0500 Subject: [PATCH 51/58] precommit --- tests/test_events.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_events.py b/tests/test_events.py index bbb0b96261..6942fa4205 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -142,8 +142,6 @@ def _run_event_calls_test( save_interval, num_epochs, ): - save_interval = Time.from_timestring(save_interval) - deepspeed_config = None if deepspeed_zero_stage: deepspeed_config = {'zero_optimization': {'stage': deepspeed_zero_stage}} @@ -164,11 +162,11 @@ def _run_event_calls_test( deepspeed_config=deepspeed_config, parallelism_config=parallelism_config, save_interval=save_interval, - eval_interval=save_interval, + eval_interval=Time.from_timestring(save_interval), ) trainer.fit() - self._assert_expected_event_calls(trainer, save_interval, num_epochs=num_epochs) + self._assert_expected_event_calls(trainer, Time.from_timestring(save_interval), num_epochs=num_epochs) def _assert_expected_event_calls(self, trainer: Trainer, eval_interval: Time, num_epochs: int): state = trainer.state From b527187f2fadf3adab09041ffa4d93187fe3c815 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sun, 1 Dec 2024 01:30:50 -0500 Subject: [PATCH 52/58] precommit --- tests/test_events.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_events.py 
b/tests/test_events.py index 6942fa4205..bd6b1acd67 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -31,15 +31,16 @@ def get_trainer(self, precision='fp32', max_duration='1ep', save_interval='1ep', optimizer = torch.optim.Adam(model.parameters()) # Minimal dataset size to reduce batches - train_dataset = RandomClassificationDataset(size=4) - eval_dataset = RandomClassificationDataset(size=4) + train_dataset = RandomClassificationDataset(size=16) + eval_dataset = RandomClassificationDataset(size=16) train_batch_size = 4 evaluator1 = DataLoader( dataset=eval_dataset, - batch_size=4, + batch_size=8, sampler=dist.get_sampler(eval_dataset), num_workers=0, + drop_last=True, ) evaluator2 = DataLoader( @@ -47,6 +48,7 @@ def get_trainer(self, precision='fp32', max_duration='1ep', save_interval='1ep', batch_size=4, sampler=dist.get_sampler(eval_dataset), num_workers=0, + drop_last=True, ) return Trainer( @@ -58,7 +60,7 @@ def get_trainer(self, precision='fp32', max_duration='1ep', save_interval='1ep', num_workers=0, ), eval_dataloader=(evaluator1, evaluator2), - device_train_microbatch_size=train_batch_size, + device_train_microbatch_size=train_batch_size // 2, precision=precision, train_subset_num_batches=self.train_subset_num_batches, eval_subset_num_batches=self.eval_subset_num_batches, From 12955d110240b078a94cfea99d02d7e63e58a0fe Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sun, 1 Dec 2024 02:20:36 -0500 Subject: [PATCH 53/58] mock and reuse --- tests/test_events.py | 448 +++++++++++++++++++++++-------------------- 1 file changed, 245 insertions(+), 203 deletions(-) diff --git a/tests/test_events.py b/tests/test_events.py index bd6b1acd67..80e929fa37 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -16,220 +16,262 @@ from tests.common.events import EventCounterCallback -@pytest.mark.parametrize('event', list(Event)) -def test_event_values(event: Event): - assert event.name.lower() == event.value +@pytest.fixture(scope='session') +def train_dataset(): + return RandomClassificationDataset(size=16) -class TestEventCalls: +@pytest.fixture(scope='session') +def eval_dataset(): + return RandomClassificationDataset(size=16) - eval_subset_num_batches = 1 - train_subset_num_batches = 1 - def get_trainer(self, precision='fp32', max_duration='1ep', save_interval='1ep', **kwargs): - model = SimpleModel() - optimizer = torch.optim.Adam(model.parameters()) +@pytest.fixture(scope='session') +def model(): + return SimpleModel() - # Minimal dataset size to reduce batches - train_dataset = RandomClassificationDataset(size=16) - eval_dataset = RandomClassificationDataset(size=16) - train_batch_size = 4 - evaluator1 = DataLoader( - dataset=eval_dataset, - batch_size=8, - sampler=dist.get_sampler(eval_dataset), - num_workers=0, - drop_last=True, - ) +@pytest.fixture(scope='session') +def optimizer(model): + return torch.optim.Adam(model.parameters()) + + +@pytest.fixture(scope='session') +def evaluator1(eval_dataset): + return DataLoader( + dataset=eval_dataset, + batch_size=8, + sampler=dist.get_sampler(eval_dataset), + num_workers=0, + drop_last=True, + ) + + +@pytest.fixture(scope='session') +def evaluator2(eval_dataset): + return DataLoader( + dataset=eval_dataset, + batch_size=4, + sampler=dist.get_sampler(eval_dataset), + num_workers=0, + drop_last=True, + ) + + +@pytest.fixture +def event_counter_callback(): + return EventCounterCallback() + + +@pytest.fixture +def trainer( + model, + optimizer, + train_dataset, + evaluator1, + evaluator2, + event_counter_callback, + 
request, +): + # extract parameters from the test function + params = request.param + precision = params.get('precision', 'fp32') + max_duration = params.get('max_duration', '1ep') + save_interval = params.get('save_interval', '1ep') + device = params.get('device', 'cpu') + deepspeed_zero_stage = params.get('deepspeed_zero_stage', None) + use_fsdp = params.get('use_fsdp', False) + + deepspeed_config = None + if deepspeed_zero_stage: + deepspeed_config = {'zero_optimization': {'stage': deepspeed_zero_stage}} + + parallelism_config = None + if use_fsdp: + parallelism_config = { + 'fsdp': { + 'sharding_strategy': 'FULL_SHARD', + 'mixed_precision': 'PURE', + 'backward_prefetch': 'BACKWARD_PRE', + }, + } - evaluator2 = DataLoader( - dataset=eval_dataset, + return Trainer( + model=model, + train_dataloader=DataLoader( + dataset=train_dataset, batch_size=4, - sampler=dist.get_sampler(eval_dataset), + sampler=dist.get_sampler(train_dataset), num_workers=0, - drop_last=True, - ) - - return Trainer( - model=model, - train_dataloader=DataLoader( - dataset=train_dataset, - batch_size=train_batch_size, - sampler=dist.get_sampler(train_dataset), - num_workers=0, - ), - eval_dataloader=(evaluator1, evaluator2), - device_train_microbatch_size=train_batch_size // 2, - precision=precision, - train_subset_num_batches=self.train_subset_num_batches, - eval_subset_num_batches=self.eval_subset_num_batches, - max_duration=max_duration, - save_interval=save_interval, - optimizers=optimizer, - callbacks=[EventCounterCallback()], - **kwargs, - ) - - @pytest.mark.parametrize( - 'world_size', - [ - pytest.param(1), - pytest.param(2, marks=pytest.mark.world_size(2)), - ], + ), + eval_dataloader=(evaluator1, evaluator2), + device_train_microbatch_size=2, + precision=precision, + train_subset_num_batches=1, + eval_subset_num_batches=1, + max_duration=max_duration, + save_interval=save_interval, + optimizers=optimizer, + callbacks=[event_counter_callback], + device=device, + deepspeed_config=deepspeed_config, + parallelism_config=parallelism_config, ) - @pytest.mark.parametrize( - 'device,deepspeed_zero_stage,use_fsdp,precision', - [ - pytest.param('cpu', None, False, 'fp32', id='cpu-ddp'), - # TODO: Remove filterwarnings after FSDP remove deprecated code - pytest.param( - 'gpu', - True, - False, - 'fp32', - id='gpu-ddp', - marks=[ - pytest.mark.gpu, - pytest.mark.filterwarnings('ignore::UserWarning'), - ], - ), - pytest.param( - 'gpu', - None, - True, - 'amp_fp16', - id='gpu-fsdp', - marks=[ - pytest.mark.gpu, - pytest.mark.filterwarnings('ignore::UserWarning'), - ], - ), - ], - ) - @pytest.mark.parametrize('save_interval', ['1ep', '1ba']) - def test_event_calls(self, world_size, device, deepspeed_zero_stage, use_fsdp, precision, save_interval): - # handle 1ba save interval separately to optimize speed - if save_interval == '1ba': - # mock the save_checkpoint method to speed up batch saves - with patch('composer.trainer.trainer.Trainer.save_checkpoint') as mock_save: - mock_save.return_value = None - self._run_event_calls_test( - world_size, - device, - deepspeed_zero_stage, - use_fsdp, - precision, - save_interval, - num_epochs=1, - ) - else: - self._run_event_calls_test( - world_size, - device, - deepspeed_zero_stage, - use_fsdp, - precision, - save_interval, - num_epochs=1, + + +@pytest.mark.parametrize('event', list(Event)) +def test_event_values(event: Event): + assert event.name.lower() == event.value + + +@pytest.mark.parametrize( + 'world_size', + [ + pytest.param(1), + pytest.param(2, 
marks=pytest.mark.world_size(2)), + ], +) +@pytest.mark.parametrize( + 'device,deepspeed_zero_stage,use_fsdp,precision', + [ + pytest.param('cpu', None, False, 'fp32', id='cpu-ddp'), + # TODO: Remove filterwarnings after FSDP remove deprecated code + pytest.param( + 'gpu', + True, + False, + 'fp32', + id='gpu-ddp', + marks=[ + pytest.mark.gpu, + pytest.mark.filterwarnings('ignore::UserWarning'), + ], + ), + pytest.param( + 'gpu', + None, + True, + 'amp_fp16', + id='gpu-fsdp', + marks=[ + pytest.mark.gpu, + pytest.mark.filterwarnings('ignore::UserWarning'), + ], + ), + ], +) +@pytest.mark.parametrize('save_interval', ['1ep', '1ba']) +def test_event_calls( + world_size, + device, + deepspeed_zero_stage, + use_fsdp, + precision, + save_interval, + train_dataset, + eval_dataset, + model, + optimizer, + evaluator1, + evaluator2, + event_counter_callback, +): + with patch.object(Trainer, 'save_checkpoint', return_value=None): + # mock forward and backward to speed up + with patch.object(model, 'forward', return_value=torch.tensor(0.0)) as mock_forward, \ + patch.object(model, 'backward', return_value=None) as mock_backward: + + trainer_instance = Trainer( + model=model, + train_dataloader=DataLoader( + dataset=train_dataset, + batch_size=4, + sampler=dist.get_sampler(train_dataset), + num_workers=0, + ), + eval_dataloader=(evaluator1, evaluator2), + device_train_microbatch_size=2, + precision=precision, + train_subset_num_batches=1, + eval_subset_num_batches=1, + max_duration='1ep', + save_interval=save_interval, + optimizers=optimizer, + callbacks=[event_counter_callback], + device=device, + deepspeed_config={'zero_optimization': { + 'stage': deepspeed_zero_stage, + }} if deepspeed_zero_stage else None, + parallelism_config={ + 'fsdp': { + 'sharding_strategy': 'FULL_SHARD', + 'mixed_precision': 'PURE', + 'backward_prefetch': 'BACKWARD_PRE', + }, + } if use_fsdp else None, ) - def _run_event_calls_test( - self, - world_size, - device, - deepspeed_zero_stage, - use_fsdp, - precision, - save_interval, - num_epochs, - ): - deepspeed_config = None - if deepspeed_zero_stage: - deepspeed_config = {'zero_optimization': {'stage': deepspeed_zero_stage}} - - parallelism_config = None - if use_fsdp: - parallelism_config = { - 'fsdp': { - 'sharding_strategy': 'FULL_SHARD', - 'mixed_precision': 'PURE', - 'backward_prefetch': 'BACKWARD_PRE', - }, - } + trainer_instance.fit() - trainer = self.get_trainer( - precision=precision, - device=device, - deepspeed_config=deepspeed_config, - parallelism_config=parallelism_config, - save_interval=save_interval, - eval_interval=Time.from_timestring(save_interval), - ) - trainer.fit() - - self._assert_expected_event_calls(trainer, Time.from_timestring(save_interval), num_epochs=num_epochs) - - def _assert_expected_event_calls(self, trainer: Trainer, eval_interval: Time, num_epochs: int): - state = trainer.state - - assert state.dataloader_len is not None - total_steps = num_epochs * int(state.dataloader_len) - batch_size = state.train_dataloader.batch_size # type: ignore - assert batch_size is not None - assert state.device_train_microbatch_size is not None - total_microbatches = total_steps * math.ceil(batch_size / state.device_train_microbatch_size) - - if eval_interval.unit == TimeUnit.BATCH: - total_evals = total_steps // int(eval_interval) - elif eval_interval.unit == TimeUnit.EPOCH: - total_evals = num_epochs // int(eval_interval) - else: - total_evals = 0 - - if trainer.state.evaluators: - steps_per_eval = self.eval_subset_num_batches - total_evals_start = 
total_evals * len(trainer.state.evaluators) - total_eval_steps = total_evals * steps_per_eval * len(trainer.state.evaluators) - else: - total_eval_steps = 0 - total_evals_start = 0 - - expected_num_calls = { - Event.INIT: 1, - Event.BEFORE_LOAD: 1, - Event.AFTER_LOAD: 1, - Event.ITERATION_START: 1, - Event.EPOCH_START: num_epochs, - Event.BATCH_START: total_steps, - Event.BEFORE_DATALOADER: total_steps + num_epochs, # extra call per epoch when dataloader is exhausted - Event.AFTER_DATALOADER: total_steps, - Event.BEFORE_FORWARD: total_microbatches, - Event.AFTER_FORWARD: total_microbatches, - Event.BEFORE_LOSS: total_microbatches, - Event.AFTER_LOSS: total_microbatches, - Event.BEFORE_BACKWARD: total_microbatches, - Event.AFTER_BACKWARD: total_microbatches, - Event.BEFORE_TRAIN_BATCH: total_steps, - Event.AFTER_TRAIN_BATCH: total_steps, - Event.BATCH_END: total_steps, - Event.BATCH_CHECKPOINT: total_steps, - Event.EPOCH_END: num_epochs, - Event.EPOCH_CHECKPOINT: num_epochs, - Event.ITERATION_END: 0, - Event.ITERATION_CHECKPOINT: 0, - Event.EVAL_BEFORE_ALL: total_evals, - Event.EVAL_START: total_evals_start, - Event.EVAL_BATCH_START: total_eval_steps, - Event.EVAL_BEFORE_FORWARD: total_eval_steps, - Event.EVAL_AFTER_FORWARD: total_eval_steps, - Event.EVAL_BATCH_END: total_eval_steps, - Event.EVAL_END: total_evals_start, - Event.EVAL_AFTER_ALL: total_evals, - } + # Assertions + state = trainer_instance.state + + assert state.dataloader_len is not None + total_steps = 1 * int(state.dataloader_len) + batch_size = state.train_dataloader.batch_size # type: ignore + assert batch_size is not None + assert state.device_train_microbatch_size is not None + total_microbatches = total_steps * math.ceil(batch_size / state.device_train_microbatch_size) + + eval_interval = Time.from_timestring(save_interval) + if eval_interval.unit == TimeUnit.BATCH: + total_evals = total_steps // int(eval_interval) + elif eval_interval.unit == TimeUnit.EPOCH: + total_evals = 1 // int(eval_interval) + else: + total_evals = 0 + + if trainer_instance.state.evaluators: + steps_per_eval = 1 + total_evals_start = total_evals * len(trainer_instance.state.evaluators) + total_eval_steps = total_evals * steps_per_eval * len(trainer_instance.state.evaluators) + else: + total_eval_steps = 0 + total_evals_start = 0 + + expected_num_calls = { + Event.INIT: 1, + Event.BEFORE_LOAD: 1, + Event.AFTER_LOAD: 1, + Event.ITERATION_START: 1, + Event.EPOCH_START: 1, + Event.BATCH_START: total_steps, + Event.BEFORE_DATALOADER: total_steps + 1, # extra call per epoch when dataloader is exhausted + Event.AFTER_DATALOADER: total_steps, + Event.BEFORE_FORWARD: total_microbatches, + Event.AFTER_FORWARD: total_microbatches, + Event.BEFORE_LOSS: total_microbatches, + Event.AFTER_LOSS: total_microbatches, + Event.BEFORE_BACKWARD: total_microbatches, + Event.AFTER_BACKWARD: total_microbatches, + Event.BEFORE_TRAIN_BATCH: total_steps, + Event.AFTER_TRAIN_BATCH: total_steps, + Event.BATCH_END: total_steps, + Event.BATCH_CHECKPOINT: total_steps, + Event.EPOCH_END: 1, + Event.EPOCH_CHECKPOINT: 1, + Event.ITERATION_END: 0, + Event.ITERATION_CHECKPOINT: 0, + Event.EVAL_BEFORE_ALL: total_evals, + Event.EVAL_START: total_evals_start, + Event.EVAL_BATCH_START: total_eval_steps, + Event.EVAL_BEFORE_FORWARD: total_eval_steps, + Event.EVAL_AFTER_FORWARD: total_eval_steps, + Event.EVAL_BATCH_END: total_eval_steps, + Event.EVAL_END: total_evals_start, + Event.EVAL_AFTER_ALL: total_evals, + } - counter_callback = (cb for cb in trainer.state.callbacks if isinstance(cb, 
EventCounterCallback)) - counter_callback = next(counter_callback) - for event, expected in expected_num_calls.items(): - actual = counter_callback.event_to_num_calls[event] - assert expected == actual, f'{event} call mismatch: {expected} != {actual}' + for event, expected in expected_num_calls.items(): + actual = event_counter_callback.event_to_num_calls.get(event, 0) + assert expected == actual, f'{event} call mismatch: {expected} != {actual}' From e0a87afbf9fa33955f11e148dd09e0876533cbf3 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sun, 1 Dec 2024 02:35:55 -0500 Subject: [PATCH 54/58] mock and reuse --- tests/test_events.py | 87 +++++++++----------------------------------- 1 file changed, 18 insertions(+), 69 deletions(-) diff --git a/tests/test_events.py b/tests/test_events.py index 80e929fa37..52a76c2e21 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -63,62 +63,6 @@ def event_counter_callback(): return EventCounterCallback() -@pytest.fixture -def trainer( - model, - optimizer, - train_dataset, - evaluator1, - evaluator2, - event_counter_callback, - request, -): - # extract parameters from the test function - params = request.param - precision = params.get('precision', 'fp32') - max_duration = params.get('max_duration', '1ep') - save_interval = params.get('save_interval', '1ep') - device = params.get('device', 'cpu') - deepspeed_zero_stage = params.get('deepspeed_zero_stage', None) - use_fsdp = params.get('use_fsdp', False) - - deepspeed_config = None - if deepspeed_zero_stage: - deepspeed_config = {'zero_optimization': {'stage': deepspeed_zero_stage}} - - parallelism_config = None - if use_fsdp: - parallelism_config = { - 'fsdp': { - 'sharding_strategy': 'FULL_SHARD', - 'mixed_precision': 'PURE', - 'backward_prefetch': 'BACKWARD_PRE', - }, - } - - return Trainer( - model=model, - train_dataloader=DataLoader( - dataset=train_dataset, - batch_size=4, - sampler=dist.get_sampler(train_dataset), - num_workers=0, - ), - eval_dataloader=(evaluator1, evaluator2), - device_train_microbatch_size=2, - precision=precision, - train_subset_num_batches=1, - eval_subset_num_batches=1, - max_duration=max_duration, - save_interval=save_interval, - optimizers=optimizer, - callbacks=[event_counter_callback], - device=device, - deepspeed_config=deepspeed_config, - parallelism_config=parallelism_config, - ) - - @pytest.mark.parametrize('event', list(Event)) def test_event_values(event: Event): assert event.name.lower() == event.value @@ -177,9 +121,22 @@ def test_event_calls( event_counter_callback, ): with patch.object(Trainer, 'save_checkpoint', return_value=None): - # mock forward and backward to speed up - with patch.object(model, 'forward', return_value=torch.tensor(0.0)) as mock_forward, \ - patch.object(model, 'backward', return_value=None) as mock_backward: + # mock forward method + with patch.object(model, 'forward', return_value=torch.tensor(0.0)): + # initialize the Trainer with the current parameters + deepspeed_config = None + if deepspeed_zero_stage: + deepspeed_config = {'zero_optimization': {'stage': deepspeed_zero_stage}} + + parallelism_config = None + if use_fsdp: + parallelism_config = { + 'fsdp': { + 'sharding_strategy': 'FULL_SHARD', + 'mixed_precision': 'PURE', + 'backward_prefetch': 'BACKWARD_PRE', + }, + } trainer_instance = Trainer( model=model, @@ -199,16 +156,8 @@ def test_event_calls( optimizers=optimizer, callbacks=[event_counter_callback], device=device, - deepspeed_config={'zero_optimization': { - 'stage': deepspeed_zero_stage, - }} if 
deepspeed_zero_stage else None, - parallelism_config={ - 'fsdp': { - 'sharding_strategy': 'FULL_SHARD', - 'mixed_precision': 'PURE', - 'backward_prefetch': 'BACKWARD_PRE', - }, - } if use_fsdp else None, + deepspeed_config=deepspeed_config, + parallelism_config=parallelism_config, ) trainer_instance.fit() From 8910f2533395cab7d11e2e08ef0f13ba22ef1f9d Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sun, 1 Dec 2024 02:51:42 -0500 Subject: [PATCH 55/58] mock and reuse --- tests/test_events.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/tests/test_events.py b/tests/test_events.py index 52a76c2e21..91a9380459 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -16,27 +16,31 @@ from tests.common.events import EventCounterCallback -@pytest.fixture(scope='session') +@pytest.fixture def train_dataset(): return RandomClassificationDataset(size=16) -@pytest.fixture(scope='session') +@pytest.fixture def eval_dataset(): return RandomClassificationDataset(size=16) -@pytest.fixture(scope='session') +@pytest.fixture def model(): return SimpleModel() -@pytest.fixture(scope='session') -def optimizer(model): - return torch.optim.Adam(model.parameters()) +@pytest.fixture +def optimizer(): + + def _create_optimizer(model): + return torch.optim.Adam(model.parameters()) + + return _create_optimizer -@pytest.fixture(scope='session') +@pytest.fixture def evaluator1(eval_dataset): return DataLoader( dataset=eval_dataset, @@ -47,7 +51,7 @@ def evaluator1(eval_dataset): ) -@pytest.fixture(scope='session') +@pytest.fixture def evaluator2(eval_dataset): return DataLoader( dataset=eval_dataset, @@ -120,9 +124,14 @@ def test_event_calls( evaluator2, event_counter_callback, ): + + def mock_forward(*args, **kwargs): + input_tensor = args[0] + batch_size = input_tensor.size(0) + return torch.zeros(batch_size, 2) + with patch.object(Trainer, 'save_checkpoint', return_value=None): - # mock forward method - with patch.object(model, 'forward', return_value=torch.tensor(0.0)): + with patch.object(model, 'forward', side_effect=mock_forward): # initialize the Trainer with the current parameters deepspeed_config = None if deepspeed_zero_stage: @@ -153,7 +162,7 @@ def test_event_calls( eval_subset_num_batches=1, max_duration='1ep', save_interval=save_interval, - optimizers=optimizer, + optimizers=optimizer(model), # Create optimizer with the wrapped model callbacks=[event_counter_callback], device=device, deepspeed_config=deepspeed_config, @@ -166,7 +175,7 @@ def test_event_calls( state = trainer_instance.state assert state.dataloader_len is not None - total_steps = 1 * int(state.dataloader_len) + total_steps = 1 * int(state.dataloader_len) # 1 epoch batch_size = state.train_dataloader.batch_size # type: ignore assert batch_size is not None assert state.device_train_microbatch_size is not None From 234b55f77d029b55d5875595689b0a60372c3753 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sun, 1 Dec 2024 03:07:44 -0500 Subject: [PATCH 56/58] mock and reuse --- tests/test_events.py | 54 ++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/tests/test_events.py b/tests/test_events.py index 91a9380459..33b63e6da3 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -10,12 +10,37 @@ from composer import Trainer from composer.core import Event, Time +from composer.core.callback import Callback from composer.core.time import TimeUnit from composer.utils import dist from tests.common import 
RandomClassificationDataset, SimpleModel from tests.common.events import EventCounterCallback +class OptimizerInitializerCallback(Callback): + + def __init__(self, optimizer_class, **optimizer_kwargs): + self.optimizer_class = optimizer_class + self.optimizer_kwargs = optimizer_kwargs + + def on_run_start(self, state, logger): + optimizer = self.optimizer_class(state.model.parameters(), **self.optimizer_kwargs) + state.optimizers = optimizer + + +def mock_forward(*args, **kwargs): + if len(args) == 1 and isinstance(args[0], tuple): + input_tensor = args[0][0] + else: + input_tensor = args[0] + + if not isinstance(input_tensor, torch.Tensor): + raise TypeError(f"Expected torch.Tensor as input, got {type(input_tensor)}") + + batch_size = input_tensor.size(0) + return torch.zeros(batch_size, 2) + + @pytest.fixture def train_dataset(): return RandomClassificationDataset(size=16) @@ -31,15 +56,6 @@ def model(): return SimpleModel() -@pytest.fixture -def optimizer(): - - def _create_optimizer(model): - return torch.optim.Adam(model.parameters()) - - return _create_optimizer - - @pytest.fixture def evaluator1(eval_dataset): return DataLoader( @@ -67,6 +83,12 @@ def event_counter_callback(): return EventCounterCallback() +@pytest.fixture +def optimizer_initializer_callback(): + return OptimizerInitializerCallback(torch.optim.Adam, lr=0.001) + + +# Test to verify event values @pytest.mark.parametrize('event', list(Event)) def test_event_values(event: Event): assert event.name.lower() == event.value @@ -119,20 +141,14 @@ def test_event_calls( train_dataset, eval_dataset, model, - optimizer, evaluator1, evaluator2, event_counter_callback, + optimizer_initializer_callback, ): - - def mock_forward(*args, **kwargs): - input_tensor = args[0] - batch_size = input_tensor.size(0) - return torch.zeros(batch_size, 2) - with patch.object(Trainer, 'save_checkpoint', return_value=None): with patch.object(model, 'forward', side_effect=mock_forward): - # initialize the Trainer with the current parameters + # initialize the Trainer with the current parameters and optimizer callback deepspeed_config = None if deepspeed_zero_stage: deepspeed_config = {'zero_optimization': {'stage': deepspeed_zero_stage}} @@ -162,8 +178,8 @@ def mock_forward(*args, **kwargs): eval_subset_num_batches=1, max_duration='1ep', save_interval=save_interval, - optimizers=optimizer(model), # Create optimizer with the wrapped model - callbacks=[event_counter_callback], + optimizers=None, + callbacks=[event_counter_callback, optimizer_initializer_callback], device=device, deepspeed_config=deepspeed_config, parallelism_config=parallelism_config, From a255e90e5d70d0bfde7f66626bb51ad536856692 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sun, 1 Dec 2024 03:39:30 -0500 Subject: [PATCH 57/58] revert --- tests/test_events.py | 432 ++++++++++++++++++++----------------------- 1 file changed, 205 insertions(+), 227 deletions(-) diff --git a/tests/test_events.py b/tests/test_events.py index 33b63e6da3..05cf26364e 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -10,242 +10,220 @@ from composer import Trainer from composer.core import Event, Time -from composer.core.callback import Callback from composer.core.time import TimeUnit from composer.utils import dist from tests.common import RandomClassificationDataset, SimpleModel from tests.common.events import EventCounterCallback -class OptimizerInitializerCallback(Callback): - - def __init__(self, optimizer_class, **optimizer_kwargs): - self.optimizer_class = optimizer_class - 
self.optimizer_kwargs = optimizer_kwargs - - def on_run_start(self, state, logger): - optimizer = self.optimizer_class(state.model.parameters(), **self.optimizer_kwargs) - state.optimizers = optimizer - - -def mock_forward(*args, **kwargs): - if len(args) == 1 and isinstance(args[0], tuple): - input_tensor = args[0][0] - else: - input_tensor = args[0] - - if not isinstance(input_tensor, torch.Tensor): - raise TypeError(f"Expected torch.Tensor as input, got {type(input_tensor)}") - - batch_size = input_tensor.size(0) - return torch.zeros(batch_size, 2) - - -@pytest.fixture -def train_dataset(): - return RandomClassificationDataset(size=16) - - -@pytest.fixture -def eval_dataset(): - return RandomClassificationDataset(size=16) - - -@pytest.fixture -def model(): - return SimpleModel() - - -@pytest.fixture -def evaluator1(eval_dataset): - return DataLoader( - dataset=eval_dataset, - batch_size=8, - sampler=dist.get_sampler(eval_dataset), - num_workers=0, - drop_last=True, +class TestEventCalls: + + eval_subset_num_batches = 1 + train_subset_num_batches = 1 + + def get_trainer(self, precision='fp32', max_duration='1ep', save_interval='1ep', **kwargs): + model = SimpleModel() + optimizer = torch.optim.Adam(model.parameters()) + + train_dataset = RandomClassificationDataset(size=16) + eval_dataset = RandomClassificationDataset(size=16) + train_batch_size = 4 + + evaluator1 = DataLoader( + dataset=eval_dataset, + batch_size=8, + sampler=dist.get_sampler(eval_dataset), + num_workers=0, + drop_last=True, + ) + + evaluator2 = DataLoader( + dataset=eval_dataset, + batch_size=4, + sampler=dist.get_sampler(eval_dataset), + num_workers=0, + drop_last=True, + ) + + return Trainer( + model=model, + train_dataloader=DataLoader( + dataset=train_dataset, + batch_size=train_batch_size, + sampler=dist.get_sampler(train_dataset), + num_workers=0, + ), + eval_dataloader=(evaluator1, evaluator2), + device_train_microbatch_size=train_batch_size // 2, + precision=precision, + train_subset_num_batches=self.train_subset_num_batches, + eval_subset_num_batches=self.eval_subset_num_batches, + max_duration=max_duration, + save_interval=save_interval, + optimizers=optimizer, + callbacks=[EventCounterCallback()], + **kwargs, + ) + + @pytest.mark.parametrize( + 'world_size', + [ + pytest.param(1), + pytest.param(2, marks=pytest.mark.world_size(2)), + ], ) - - -@pytest.fixture -def evaluator2(eval_dataset): - return DataLoader( - dataset=eval_dataset, - batch_size=4, - sampler=dist.get_sampler(eval_dataset), - num_workers=0, - drop_last=True, + @pytest.mark.parametrize( + 'device,deepspeed_zero_stage,use_fsdp,precision', + [ + pytest.param('cpu', None, False, 'fp32', id='cpu-ddp'), + # TODO: Remove filterwarnings after FSDP remove deprecated code + pytest.param( + 'gpu', + True, + False, + 'fp32', + id='gpu-ddp', + marks=[ + pytest.mark.gpu, + pytest.mark.filterwarnings('ignore::UserWarning'), + ], + ), + pytest.param( + 'gpu', + None, + True, + 'amp_fp16', + id='gpu-fsdp', + marks=[ + pytest.mark.gpu, + pytest.mark.filterwarnings('ignore::UserWarning'), + ], + ), + ], ) - - -@pytest.fixture -def event_counter_callback(): - return EventCounterCallback() - - -@pytest.fixture -def optimizer_initializer_callback(): - return OptimizerInitializerCallback(torch.optim.Adam, lr=0.001) - - -# Test to verify event values -@pytest.mark.parametrize('event', list(Event)) -def test_event_values(event: Event): - assert event.name.lower() == event.value - - -@pytest.mark.parametrize( - 'world_size', - [ - pytest.param(1), - pytest.param(2, 
marks=pytest.mark.world_size(2)), - ], -) -@pytest.mark.parametrize( - 'device,deepspeed_zero_stage,use_fsdp,precision', - [ - pytest.param('cpu', None, False, 'fp32', id='cpu-ddp'), - # TODO: Remove filterwarnings after FSDP remove deprecated code - pytest.param( - 'gpu', - True, - False, - 'fp32', - id='gpu-ddp', - marks=[ - pytest.mark.gpu, - pytest.mark.filterwarnings('ignore::UserWarning'), - ], - ), - pytest.param( - 'gpu', - None, - True, - 'amp_fp16', - id='gpu-fsdp', - marks=[ - pytest.mark.gpu, - pytest.mark.filterwarnings('ignore::UserWarning'), - ], - ), - ], -) -@pytest.mark.parametrize('save_interval', ['1ep', '1ba']) -def test_event_calls( - world_size, - device, - deepspeed_zero_stage, - use_fsdp, - precision, - save_interval, - train_dataset, - eval_dataset, - model, - evaluator1, - evaluator2, - event_counter_callback, - optimizer_initializer_callback, -): - with patch.object(Trainer, 'save_checkpoint', return_value=None): - with patch.object(model, 'forward', side_effect=mock_forward): - # initialize the Trainer with the current parameters and optimizer callback - deepspeed_config = None - if deepspeed_zero_stage: - deepspeed_config = {'zero_optimization': {'stage': deepspeed_zero_stage}} - - parallelism_config = None - if use_fsdp: - parallelism_config = { - 'fsdp': { - 'sharding_strategy': 'FULL_SHARD', - 'mixed_precision': 'PURE', - 'backward_prefetch': 'BACKWARD_PRE', - }, - } - - trainer_instance = Trainer( - model=model, - train_dataloader=DataLoader( - dataset=train_dataset, - batch_size=4, - sampler=dist.get_sampler(train_dataset), - num_workers=0, - ), - eval_dataloader=(evaluator1, evaluator2), - device_train_microbatch_size=2, - precision=precision, - train_subset_num_batches=1, - eval_subset_num_batches=1, - max_duration='1ep', - save_interval=save_interval, - optimizers=None, - callbacks=[event_counter_callback, optimizer_initializer_callback], - device=device, - deepspeed_config=deepspeed_config, - parallelism_config=parallelism_config, + @pytest.mark.parametrize('save_interval', ['1ep', '1ba']) + def test_event_calls(self, world_size, device, deepspeed_zero_stage, use_fsdp, precision, save_interval): + # Handle '1ba' save interval separately to optimize speed + if save_interval == '1ba': + # Mock the save_checkpoint method to speed up batch saves + with patch('composer.trainer.trainer.Trainer.save_checkpoint') as mock_save: + mock_save.return_value = None + self._run_event_calls_test( + world_size, + device, + deepspeed_zero_stage, + use_fsdp, + precision, + save_interval, + num_epochs=1, + ) + else: + self._run_event_calls_test( + world_size, + device, + deepspeed_zero_stage, + use_fsdp, + precision, + save_interval, + num_epochs=1, ) - trainer_instance.fit() - - # Assertions - state = trainer_instance.state - - assert state.dataloader_len is not None - total_steps = 1 * int(state.dataloader_len) # 1 epoch - batch_size = state.train_dataloader.batch_size # type: ignore - assert batch_size is not None - assert state.device_train_microbatch_size is not None - total_microbatches = total_steps * math.ceil(batch_size / state.device_train_microbatch_size) - - eval_interval = Time.from_timestring(save_interval) - if eval_interval.unit == TimeUnit.BATCH: - total_evals = total_steps // int(eval_interval) - elif eval_interval.unit == TimeUnit.EPOCH: - total_evals = 1 // int(eval_interval) - else: - total_evals = 0 - - if trainer_instance.state.evaluators: - steps_per_eval = 1 - total_evals_start = total_evals * len(trainer_instance.state.evaluators) - 
total_eval_steps = total_evals * steps_per_eval * len(trainer_instance.state.evaluators) - else: - total_eval_steps = 0 - total_evals_start = 0 - - expected_num_calls = { - Event.INIT: 1, - Event.BEFORE_LOAD: 1, - Event.AFTER_LOAD: 1, - Event.ITERATION_START: 1, - Event.EPOCH_START: 1, - Event.BATCH_START: total_steps, - Event.BEFORE_DATALOADER: total_steps + 1, # extra call per epoch when dataloader is exhausted - Event.AFTER_DATALOADER: total_steps, - Event.BEFORE_FORWARD: total_microbatches, - Event.AFTER_FORWARD: total_microbatches, - Event.BEFORE_LOSS: total_microbatches, - Event.AFTER_LOSS: total_microbatches, - Event.BEFORE_BACKWARD: total_microbatches, - Event.AFTER_BACKWARD: total_microbatches, - Event.BEFORE_TRAIN_BATCH: total_steps, - Event.AFTER_TRAIN_BATCH: total_steps, - Event.BATCH_END: total_steps, - Event.BATCH_CHECKPOINT: total_steps, - Event.EPOCH_END: 1, - Event.EPOCH_CHECKPOINT: 1, - Event.ITERATION_END: 0, - Event.ITERATION_CHECKPOINT: 0, - Event.EVAL_BEFORE_ALL: total_evals, - Event.EVAL_START: total_evals_start, - Event.EVAL_BATCH_START: total_eval_steps, - Event.EVAL_BEFORE_FORWARD: total_eval_steps, - Event.EVAL_AFTER_FORWARD: total_eval_steps, - Event.EVAL_BATCH_END: total_eval_steps, - Event.EVAL_END: total_evals_start, - Event.EVAL_AFTER_ALL: total_evals, + def _run_event_calls_test( + self, + world_size, + device, + deepspeed_zero_stage, + use_fsdp, + precision, + save_interval, + num_epochs, + ): + deepspeed_config = None + if deepspeed_zero_stage: + deepspeed_config = {'zero_optimization': {'stage': deepspeed_zero_stage}} + + parallelism_config = None + if use_fsdp: + parallelism_config = { + 'fsdp': { + 'sharding_strategy': 'FULL_SHARD', + 'mixed_precision': 'PURE', + 'backward_prefetch': 'BACKWARD_PRE', + }, } - for event, expected in expected_num_calls.items(): - actual = event_counter_callback.event_to_num_calls.get(event, 0) - assert expected == actual, f'{event} call mismatch: {expected} != {actual}' + trainer = self.get_trainer( + precision=precision, + device=device, + deepspeed_config=deepspeed_config, + parallelism_config=parallelism_config, + save_interval=save_interval, + eval_interval=Time.from_timestring(save_interval), + ) + + trainer.fit() + self._assert_expected_event_calls(trainer, Time.from_timestring(save_interval), num_epochs=num_epochs) + + def _assert_expected_event_calls(self, trainer: Trainer, eval_interval: Time, num_epochs: int): + state = trainer.state + + assert state.dataloader_len is not None + total_steps = num_epochs * int(state.dataloader_len) + batch_size = state.train_dataloader.batch_size # type: ignore + assert batch_size is not None + assert state.device_train_microbatch_size is not None + total_microbatches = total_steps * math.ceil(batch_size / state.device_train_microbatch_size) + + if eval_interval.unit == TimeUnit.BATCH: + total_evals = total_steps // int(eval_interval) + elif eval_interval.unit == TimeUnit.EPOCH: + total_evals = num_epochs // int(eval_interval) + else: + total_evals = 0 + + if trainer.state.evaluators: + steps_per_eval = self.eval_subset_num_batches + total_evals_start = total_evals * len(trainer.state.evaluators) + total_eval_steps = total_evals * steps_per_eval * len(trainer.state.evaluators) + else: + total_eval_steps = 0 + total_evals_start = 0 + + expected_num_calls = { + Event.INIT: 1, + Event.BEFORE_LOAD: 1, + Event.AFTER_LOAD: 1, + Event.ITERATION_START: 1, + Event.EPOCH_START: num_epochs, + Event.BATCH_START: total_steps, + Event.BEFORE_DATALOADER: total_steps + num_epochs, # extra call 
per epoch when dataloader is exhausted + Event.AFTER_DATALOADER: total_steps, + Event.BEFORE_FORWARD: total_microbatches, + Event.AFTER_FORWARD: total_microbatches, + Event.BEFORE_LOSS: total_microbatches, + Event.AFTER_LOSS: total_microbatches, + Event.BEFORE_BACKWARD: total_microbatches, + Event.AFTER_BACKWARD: total_microbatches, + Event.BEFORE_TRAIN_BATCH: total_steps, + Event.AFTER_TRAIN_BATCH: total_steps, + Event.BATCH_END: total_steps, + Event.BATCH_CHECKPOINT: total_steps, + Event.EPOCH_END: num_epochs, + Event.EPOCH_CHECKPOINT: num_epochs, + Event.ITERATION_END: 0, + Event.ITERATION_CHECKPOINT: 0, + Event.EVAL_BEFORE_ALL: total_evals, + Event.EVAL_START: total_evals_start, + Event.EVAL_BATCH_START: total_eval_steps, + Event.EVAL_BEFORE_FORWARD: total_eval_steps, + Event.EVAL_AFTER_FORWARD: total_eval_steps, + Event.EVAL_BATCH_END: total_eval_steps, + Event.EVAL_END: total_evals_start, + Event.EVAL_AFTER_ALL: total_evals, + } + + counter_callback = (cb for cb in trainer.state.callbacks if isinstance(cb, EventCounterCallback)) + counter_callback = next(counter_callback) + for event, expected in expected_num_calls.items(): + actual = counter_callback.event_to_num_calls[event] + assert expected == actual, f'{event} call mismatch: {expected} != {actual}' From 91bbd25ee1c2886841f948e85bc27484020d55d0 Mon Sep 17 00:00:00 2001 From: v-chen_data Date: Sun, 1 Dec 2024 03:43:00 -0500 Subject: [PATCH 58/58] revert --- tests/test_events.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/test_events.py b/tests/test_events.py index 05cf26364e..fe7dd71141 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -16,6 +16,11 @@ from tests.common.events import EventCounterCallback +@pytest.mark.parametrize('event', list(Event)) +def test_event_values(event: Event): + assert event.name.lower() == event.value + + class TestEventCalls: eval_subset_num_batches = 1 @@ -103,9 +108,9 @@ def get_trainer(self, precision='fp32', max_duration='1ep', save_interval='1ep', ) @pytest.mark.parametrize('save_interval', ['1ep', '1ba']) def test_event_calls(self, world_size, device, deepspeed_zero_stage, use_fsdp, precision, save_interval): - # Handle '1ba' save interval separately to optimize speed + # handle 1ba save interval separately to optimize speed if save_interval == '1ba': - # Mock the save_checkpoint method to speed up batch saves + # mock the save_checkpoint method to speed up batch saves with patch('composer.trainer.trainer.Trainer.save_checkpoint') as mock_save: mock_save.return_value = None self._run_event_calls_test( @@ -160,8 +165,8 @@ def _run_event_calls_test( save_interval=save_interval, eval_interval=Time.from_timestring(save_interval), ) - trainer.fit() + self._assert_expected_event_calls(trainer, Time.from_timestring(save_interval), num_epochs=num_epochs) def _assert_expected_event_calls(self, trainer: Trainer, eval_interval: Time, num_epochs: int):
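# --- A minimal sketch (not a diff line) of the arithmetic _assert_expected_event_calls
# performs, assuming the defaults shown above: train_subset_num_batches=1,
# eval_subset_num_batches=1, train batch size 4, microbatch size 2, two evaluators,
# one epoch, and a '1ep' eval/save interval. All names below are local to this sketch.
import math

num_epochs = 1
dataloader_len = 1        # train_subset_num_batches caps each epoch at one batch
batch_size = 4
microbatch_size = 2       # device_train_microbatch_size = train_batch_size // 2
num_evaluators = 2
eval_subset_num_batches = 1

total_steps = num_epochs * dataloader_len                                    # 1 BATCH_START / BATCH_END call
total_microbatches = total_steps * math.ceil(batch_size / microbatch_size)   # 2 forward/loss/backward event calls
total_evals = num_epochs // 1                                                # '1ep' interval -> 1 eval round
total_evals_start = total_evals * num_evaluators                             # 2 EVAL_START / EVAL_END calls
total_eval_steps = total_evals * eval_subset_num_batches * num_evaluators    # 2 per-batch eval event calls
before_dataloader = total_steps + num_epochs                                 # 2: one extra call when the loader is exhausted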