From dc7e96695a760d69369cf82b77fb65d03df1f2dd Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Tue, 5 Dec 2023 11:42:29 -0800 Subject: [PATCH 01/15] compile loss fn for all workloads --- submission_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/submission_runner.py b/submission_runner.py index 12494cd6e..74697afe5 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -242,6 +242,7 @@ def train_once( else: logging.info('Performing `torch.compile`.') model_params = torch.compile(model_params) + workload.loss_fn = torch.compile(workload.loss_fn) logging.info('Initializing optimizer.') with profiler.profile('Initializing optimizer'): optimizer_state = init_optimizer_state(workload, From fa411fdc415634ddf4c42adc44d9e12bf747b04d Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Tue, 5 Dec 2023 21:04:25 -0800 Subject: [PATCH 02/15] reduce eval_batch_size by half for criteo1tb_pytorch --- .../workloads/criteo1tb/criteo1tb_pytorch/workload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py b/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py index 55b68fb2f..1db3b7173 100644 --- a/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py +++ b/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py @@ -22,7 +22,7 @@ class Criteo1TbDlrmSmallWorkload(BaseCriteo1TbDlrmSmallWorkload): @property def eval_batch_size(self) -> int: - return 32_768 + return 16_384 def _per_example_sigmoid_binary_cross_entropy( self, logits: spec.Tensor, targets: spec.Tensor) -> spec.Tensor: From dfb8ceb0845aaa82cbaeb6798f98d17609d97451 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Tue, 5 Dec 2023 22:03:35 -0800 Subject: [PATCH 03/15] further reduce eval_batch_size by half for criteo1tb_pytorch --- .../workloads/criteo1tb/criteo1tb_pytorch/workload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py b/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py index 1db3b7173..e8207bedc 100644 --- a/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py +++ b/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py @@ -22,7 +22,7 @@ class Criteo1TbDlrmSmallWorkload(BaseCriteo1TbDlrmSmallWorkload): @property def eval_batch_size(self) -> int: - return 16_384 + return 8_192 def _per_example_sigmoid_binary_cross_entropy( self, logits: spec.Tensor, targets: spec.Tensor) -> spec.Tensor: From 30062f8f71677a3ac1fb0c4d04b3574bc99a592c Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Tue, 5 Dec 2023 22:39:51 -0800 Subject: [PATCH 04/15] reduce to 1/4 --- .../workloads/criteo1tb/criteo1tb_pytorch/workload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py b/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py index e8207bedc..0ce603476 100644 --- a/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py +++ b/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py @@ -22,7 +22,7 @@ class Criteo1TbDlrmSmallWorkload(BaseCriteo1TbDlrmSmallWorkload): @property def eval_batch_size(self) -> int: - return 8_192 + return 2_048 def _per_example_sigmoid_binary_cross_entropy( self, logits: spec.Tensor, targets: spec.Tensor) -> spec.Tensor: From fa3dddd41111c05eccaa3b86fa86248cfa26ef71 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Wed, 6 Dec 2023 13:03:32 -0800 Subject: [PATCH 05/15] reduce eval_batch_size for criteo1tb --- .../workloads/criteo1tb/criteo1tb_pytorch/workload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py b/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py index 0ce603476..be3ed26cf 
100644 --- a/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py +++ b/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py @@ -22,7 +22,7 @@ class Criteo1TbDlrmSmallWorkload(BaseCriteo1TbDlrmSmallWorkload): @property def eval_batch_size(self) -> int: - return 2_048 + return 512 def _per_example_sigmoid_binary_cross_entropy( self, logits: spec.Tensor, targets: spec.Tensor) -> spec.Tensor: From 1ddfa098f01356adfb966f8faf6930f844f7e627 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Wed, 6 Dec 2023 15:17:51 -0800 Subject: [PATCH 06/15] recover eval_batch_size for criteo1tb --- .../workloads/criteo1tb/criteo1tb_pytorch/workload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py b/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py index be3ed26cf..55b68fb2f 100644 --- a/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py +++ b/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py @@ -22,7 +22,7 @@ class Criteo1TbDlrmSmallWorkload(BaseCriteo1TbDlrmSmallWorkload): @property def eval_batch_size(self) -> int: - return 512 + return 32_768 def _per_example_sigmoid_binary_cross_entropy( self, logits: spec.Tensor, targets: spec.Tensor) -> spec.Tensor: From bfe74ce598406e3da76cd80e0d72e99fb7543f92 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Wed, 6 Dec 2023 15:19:10 -0800 Subject: [PATCH 07/15] temporary commenting out workload.eval_model for criteo1tb --- submission_runner.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/submission_runner.py b/submission_runner.py index 74697afe5..293004b3b 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -361,25 +361,25 @@ def train_once( try: eval_start_time = get_time() - latest_eval_result = workload.eval_model(global_eval_batch_size, - model_params, - model_state, - eval_rng, 
- data_dir, - imagenet_v2_data_dir, - global_step) - # Check if targets reached. - train_state['validation_goal_reached'] = ( - workload.has_reached_validation_target(latest_eval_result) or - train_state['validation_goal_reached']) - train_state['test_goal_reached'] = ( - workload.has_reached_test_target(latest_eval_result) or - train_state['test_goal_reached']) + # latest_eval_result = workload.eval_model(global_eval_batch_size, + # model_params, + # model_state, + # eval_rng, + # data_dir, + # imagenet_v2_data_dir, + # global_step) + # # Check if targets reached. + # train_state['validation_goal_reached'] = ( + # workload.has_reached_validation_target(latest_eval_result) or + # train_state['validation_goal_reached']) + # train_state['test_goal_reached'] = ( + # workload.has_reached_test_target(latest_eval_result) or + # train_state['test_goal_reached']) # Save last eval time. eval_end_time = get_time() train_state['last_eval_time'] = eval_end_time - + latest_eval_result = {} # Accumulate eval time. 
train_state[ 'accumulated_eval_time'] += eval_end_time - eval_start_time From 7a112f5ce118e230b8a628d609bbbf7c7276a2aa Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Wed, 6 Dec 2023 16:05:19 -0800 Subject: [PATCH 08/15] disable torch.compile for criteo1tb --- submission_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submission_runner.py b/submission_runner.py index 293004b3b..d8493ae73 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -242,7 +242,7 @@ def train_once( else: logging.info('Performing `torch.compile`.') model_params = torch.compile(model_params) - workload.loss_fn = torch.compile(workload.loss_fn) + # workload.loss_fn = torch.compile(workload.loss_fn) logging.info('Initializing optimizer.') with profiler.profile('Initializing optimizer'): optimizer_state = init_optimizer_state(workload, From 4b53433ee79eba1747f45f256f8ee9b2db71c86b Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Wed, 6 Dec 2023 16:33:54 -0800 Subject: [PATCH 09/15] recover evaluation --- submission_runner.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/submission_runner.py b/submission_runner.py index d8493ae73..c85279002 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -242,7 +242,7 @@ def train_once( else: logging.info('Performing `torch.compile`.') model_params = torch.compile(model_params) - # workload.loss_fn = torch.compile(workload.loss_fn) + workload.loss_fn = torch.compile(workload.loss_fn) logging.info('Initializing optimizer.') with profiler.profile('Initializing optimizer'): optimizer_state = init_optimizer_state(workload, @@ -361,25 +361,23 @@ def train_once( try: eval_start_time = get_time() - # latest_eval_result = workload.eval_model(global_eval_batch_size, - # model_params, - # model_state, - # eval_rng, - # data_dir, - # imagenet_v2_data_dir, - # global_step) - # # Check if targets reached. 
- # train_state['validation_goal_reached'] = ( - # workload.has_reached_validation_target(latest_eval_result) or - # train_state['validation_goal_reached']) - # train_state['test_goal_reached'] = ( - # workload.has_reached_test_target(latest_eval_result) or - # train_state['test_goal_reached']) - + latest_eval_result = workload.eval_model(global_eval_batch_size, + model_params, + model_state, + eval_rng, + data_dir, + imagenet_v2_data_dir, + global_step) + # Check if targets reached. + train_state['validation_goal_reached'] = ( + workload.has_reached_validation_target(latest_eval_result) or + train_state['validation_goal_reached']) + train_state['test_goal_reached'] = ( + workload.has_reached_test_target(latest_eval_result) or + train_state['test_goal_reached']) # Save last eval time. eval_end_time = get_time() train_state['last_eval_time'] = eval_end_time - latest_eval_result = {} # Accumulate eval time. train_state[ 'accumulated_eval_time'] += eval_end_time - eval_start_time From c4029ac564974f5abb7dbc6605434e74319817ea Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Wed, 6 Dec 2023 17:29:34 -0800 Subject: [PATCH 10/15] comment out eval to avoid tokenizer for librispeech --- submission_runner.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/submission_runner.py b/submission_runner.py index c85279002..8c78e170e 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -361,26 +361,27 @@ def train_once( try: eval_start_time = get_time() - latest_eval_result = workload.eval_model(global_eval_batch_size, - model_params, - model_state, - eval_rng, - data_dir, - imagenet_v2_data_dir, - global_step) - # Check if targets reached. 
- train_state['validation_goal_reached'] = ( - workload.has_reached_validation_target(latest_eval_result) or - train_state['validation_goal_reached']) - train_state['test_goal_reached'] = ( - workload.has_reached_test_target(latest_eval_result) or - train_state['test_goal_reached']) + # latest_eval_result = workload.eval_model(global_eval_batch_size, + # model_params, + # model_state, + # eval_rng, + # data_dir, + # imagenet_v2_data_dir, + # global_step) + # # Check if targets reached. + # train_state['validation_goal_reached'] = ( + # workload.has_reached_validation_target(latest_eval_result) or + # train_state['validation_goal_reached']) + # train_state['test_goal_reached'] = ( + # workload.has_reached_test_target(latest_eval_result) or + # train_state['test_goal_reached']) # Save last eval time. eval_end_time = get_time() train_state['last_eval_time'] = eval_end_time # Accumulate eval time. train_state[ 'accumulated_eval_time'] += eval_end_time - eval_start_time + latest_eval_result = {} # Add times to eval results for logging. 
latest_eval_result['score'] = ( From ea7282761ca840b2339e7eb1d32741ac30346664 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Wed, 6 Dec 2023 18:31:37 -0800 Subject: [PATCH 11/15] recover eval_model --- submission_runner.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/submission_runner.py b/submission_runner.py index 8c78e170e..259daa936 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -225,6 +225,7 @@ def train_once( ] eager_backend_workloads = ['librispeech_deepspeech'] aot_eager_backend_workloads = [] + skip_loss_compilation_workloads = ['criteo1tb'] if FLAGS.workload in compile_error_workloads: logging.warning( 'These workloads cannot be fully compiled under current ' @@ -242,7 +243,8 @@ def train_once( else: logging.info('Performing `torch.compile`.') model_params = torch.compile(model_params) - workload.loss_fn = torch.compile(workload.loss_fn) + if FLAGS.workload not in skip_loss_compilation_workloads: + workload.loss_fn = torch.compile(workload.loss_fn) logging.info('Initializing optimizer.') with profiler.profile('Initializing optimizer'): optimizer_state = init_optimizer_state(workload, @@ -361,28 +363,26 @@ def train_once( try: eval_start_time = get_time() - # latest_eval_result = workload.eval_model(global_eval_batch_size, - # model_params, - # model_state, - # eval_rng, - # data_dir, - # imagenet_v2_data_dir, - # global_step) - # # Check if targets reached. - # train_state['validation_goal_reached'] = ( - # workload.has_reached_validation_target(latest_eval_result) or - # train_state['validation_goal_reached']) - # train_state['test_goal_reached'] = ( - # workload.has_reached_test_target(latest_eval_result) or - # train_state['test_goal_reached']) + latest_eval_result = workload.eval_model(global_eval_batch_size, + model_params, + model_state, + eval_rng, + data_dir, + imagenet_v2_data_dir, + global_step) + # Check if targets reached. 
+ train_state['validation_goal_reached'] = ( + workload.has_reached_validation_target(latest_eval_result) or + train_state['validation_goal_reached']) + train_state['test_goal_reached'] = ( + workload.has_reached_test_target(latest_eval_result) or + train_state['test_goal_reached']) # Save last eval time. eval_end_time = get_time() train_state['last_eval_time'] = eval_end_time # Accumulate eval time. train_state[ 'accumulated_eval_time'] += eval_end_time - eval_start_time - latest_eval_result = {} - # Add times to eval results for logging. latest_eval_result['score'] = ( train_state['accumulated_submission_time']) From a57759bb512cbe9937e31b1cb7afa24465a7c22f Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Wed, 6 Dec 2023 18:44:28 -0800 Subject: [PATCH 12/15] comment out eval --- submission_runner.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/submission_runner.py b/submission_runner.py index 259daa936..3a09263b3 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -363,20 +363,21 @@ def train_once( try: eval_start_time = get_time() - latest_eval_result = workload.eval_model(global_eval_batch_size, - model_params, - model_state, - eval_rng, - data_dir, - imagenet_v2_data_dir, - global_step) - # Check if targets reached. - train_state['validation_goal_reached'] = ( - workload.has_reached_validation_target(latest_eval_result) or - train_state['validation_goal_reached']) - train_state['test_goal_reached'] = ( - workload.has_reached_test_target(latest_eval_result) or - train_state['test_goal_reached']) + # latest_eval_result = workload.eval_model(global_eval_batch_size, + # model_params, + # model_state, + # eval_rng, + # data_dir, + # imagenet_v2_data_dir, + # global_step) + # # Check if targets reached. 
+ # train_state['validation_goal_reached'] = ( + # workload.has_reached_validation_target(latest_eval_result) or + # train_state['validation_goal_reached']) + # train_state['test_goal_reached'] = ( + # workload.has_reached_test_target(latest_eval_result) or + # train_state['test_goal_reached']) + latest_eval_result = {} # Save last eval time. eval_end_time = get_time() train_state['last_eval_time'] = eval_end_time From 7132021a439361a22a15e1d768173093d7edb959 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Wed, 6 Dec 2023 19:50:13 -0800 Subject: [PATCH 13/15] specify workloads that benefit from compiling loss_fn --- submission_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/submission_runner.py b/submission_runner.py index 3a09263b3..f4439e7e0 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -225,7 +225,7 @@ def train_once( ] eager_backend_workloads = ['librispeech_deepspeech'] aot_eager_backend_workloads = [] - skip_loss_compilation_workloads = ['criteo1tb'] + loss_compilation_workloads = ['fastmri', 'librispeech_deepspeech', 'ogbg', 'wmt'] if FLAGS.workload in compile_error_workloads: logging.warning( 'These workloads cannot be fully compiled under current ' @@ -243,7 +243,7 @@ def train_once( else: logging.info('Performing `torch.compile`.') model_params = torch.compile(model_params) - if FLAGS.workload not in skip_loss_compilation_workloads: + if FLAGS.workload in loss_compilation_workloads: workload.loss_fn = torch.compile(workload.loss_fn) logging.info('Initializing optimizer.') with profiler.profile('Initializing optimizer'): From bcf1ca0701c951fd23ee23282b43cdf5e3493717 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Thu, 7 Dec 2023 14:44:33 -0800 Subject: [PATCH 14/15] enable evaluation --- submission_runner.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/submission_runner.py b/submission_runner.py index f4439e7e0..df7be7a86 100644 --- 
a/submission_runner.py +++ b/submission_runner.py @@ -363,21 +363,20 @@ def train_once( try: eval_start_time = get_time() - # latest_eval_result = workload.eval_model(global_eval_batch_size, - # model_params, - # model_state, - # eval_rng, - # data_dir, - # imagenet_v2_data_dir, - # global_step) - # # Check if targets reached. - # train_state['validation_goal_reached'] = ( - # workload.has_reached_validation_target(latest_eval_result) or - # train_state['validation_goal_reached']) - # train_state['test_goal_reached'] = ( - # workload.has_reached_test_target(latest_eval_result) or - # train_state['test_goal_reached']) - latest_eval_result = {} + latest_eval_result = workload.eval_model(global_eval_batch_size, + model_params, + model_state, + eval_rng, + data_dir, + imagenet_v2_data_dir, + global_step) + # Check if targets reached. + train_state['validation_goal_reached'] = ( + workload.has_reached_validation_target(latest_eval_result) or + train_state['validation_goal_reached']) + train_state['test_goal_reached'] = ( + workload.has_reached_test_target(latest_eval_result) or + train_state['test_goal_reached']) # Save last eval time. eval_end_time = get_time() train_state['last_eval_time'] = eval_end_time From 33ecf8e375900e2f22d9e7a92e835c2d2b5f2f99 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Thu, 7 Dec 2023 14:46:00 -0800 Subject: [PATCH 15/15] nit --- submission_runner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/submission_runner.py b/submission_runner.py index df7be7a86..31453c736 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -377,12 +377,15 @@ def train_once( train_state['test_goal_reached'] = ( workload.has_reached_test_target(latest_eval_result) or train_state['test_goal_reached']) + # Save last eval time. eval_end_time = get_time() train_state['last_eval_time'] = eval_end_time + # Accumulate eval time. train_state[ 'accumulated_eval_time'] += eval_end_time - eval_start_time + # Add times to eval results for logging. 
latest_eval_result['score'] = ( train_state['accumulated_submission_time'])