Skip to content

Commit

Permalink
Convert perf plugin env vars to strings (#10947)
Browse files (browse the repository at this point in the history)
Signed-off-by: Hemil Desai <[email protected]>
  • Loading branch information
hemildesai authored Oct 21, 2024
1 parent 7a3dd6b commit b77c743
Showing 1 changed file with 5 additions and 5 deletions.
10 changes: 5 additions & 5 deletions nemo/lightning/run/plugins.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -287,16 +287,16 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor):
tp_size = task.trainer.strategy.tensor_model_parallel_size
cp_size = task.trainer.strategy.context_parallel_size
if tp_size > 1 and cp_size > 1:
- executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = 1
+ executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"

# Set LayerNorm SM margin to support the overlap with LayerNorm kernel
if self.enable_layernorm_sm_margin:
- executor.env_vars["NVTE_FWD_LAYERNORM_SM_MARGIN"] = self.layernorm_sm_margin
- executor.env_vars["NVTE_BWD_LAYERNORM_SM_MARGIN"] = self.layernorm_sm_margin
+ executor.env_vars["NVTE_FWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin)
+ executor.env_vars["NVTE_BWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin)

# Force Transformer Engine to use cuDNN attention over HazyResearch's Flash Attention
- executor.env_vars["NVTE_FLASH_ATTN"] = 0
- executor.env_vars["NVTE_FUSED_ATTN"] = 1
+ executor.env_vars["NVTE_FLASH_ATTN"] = "0"
+ executor.env_vars["NVTE_FUSED_ATTN"] = "1"

# Improve perf by steering power to tensor cores, may not work on all systems
if self.enable_vboost and isinstance(executor, run.SlurmExecutor):
Expand Down

0 comments on commit b77c743

Please sign in to comment.