From eddce12658359f94e530763f8d25f9026cc7502a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 23 Jan 2025 23:26:05 +0100 Subject: [PATCH] Only init the gloo process group when necessary (#798) A small startup optimization --- torchtitan/checkpoint.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torchtitan/checkpoint.py b/torchtitan/checkpoint.py index db54ccd9..68479aad 100644 --- a/torchtitan/checkpoint.py +++ b/torchtitan/checkpoint.py @@ -197,13 +197,14 @@ def __init__( self.begin_time = 0 self.time_sync_work = None self.time_sync_result = None - self.pg = dist.new_group(backend="gloo") + async_mode = ckpt_config.async_mode.lower() + if async_mode == AsyncMode.ASYNC or self.interval_type == IntervalType.SECONDS: + self.pg = dist.new_group(backend="gloo") self.model_weights_only = ckpt_config.model_weights_only self.export_dtype = TORCH_DTYPE_MAP[ckpt_config.export_dtype] self.mp = None - async_mode = ckpt_config.async_mode.lower() if async_mode == AsyncMode.DISABLED: self.async_mode = AsyncMode.DISABLED elif async_mode == AsyncMode.ASYNC: