From 68f8792379f7d76ce3135154938dad1b39ecda6d Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Tue, 17 Dec 2024 11:51:21 -0800 Subject: [PATCH] fix Signed-off-by: Hemil Desai --- src/nemo_run/core/execution/slurm.py | 6 +++++- src/nemo_run/run/torchx_backend/schedulers/slurm.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/nemo_run/core/execution/slurm.py b/src/nemo_run/core/execution/slurm.py index 1b2d5ca..cbd5ad3 100644 --- a/src/nemo_run/core/execution/slurm.py +++ b/src/nemo_run/core/execution/slurm.py @@ -543,7 +543,7 @@ def package_configs(self, *cfgs: tuple[str, str]) -> list[str]: return filenames def package(self, packager: Packager, job_name: str): - if job_name in self.tunnel.packaging_jobs: + if job_name in self.tunnel.packaging_jobs and not packager.symlink_from_remote_dir: logger.info( f"Packaging for job {job_name} in tunnel {self.tunnel} already done. Skipping subsequent packagings.\n" "This may cause issues if you have multiple tasks with the same name but different packagers, as only the first packager will be used." @@ -570,6 +570,10 @@ def package(self, packager: Packager, job_name: str): if base_remote_mount not in self.container_mounts: self.container_mounts.append(f"{base_remote_dir}:{base_remote_dir}") + for req in self.resource_group: + if base_remote_mount not in req.container_mounts: + req.container_mounts.append(base_remote_mount) + return assert self.experiment_id, "Executor not assigned to an experiment." diff --git a/src/nemo_run/run/torchx_backend/schedulers/slurm.py b/src/nemo_run/run/torchx_backend/schedulers/slurm.py index 75aaf41..c0eafb5 100644 --- a/src/nemo_run/run/torchx_backend/schedulers/slurm.py +++ b/src/nemo_run/run/torchx_backend/schedulers/slurm.py @@ -96,6 +96,8 @@ def _submit_dryrun(self, app: AppDef, cfg: Executor) -> AppDryRunInfo[Any]: # t partition = executor.partition assert partition is None or isinstance(partition, str), "partition must be str" + executor.package(packager=executor.packager, job_name=Path(job_dir).name) + srun_cmds: list[list[str]] = [] jobs = [] envs = {} @@ -137,8 +139,6 @@ def _submit_dryrun(self, app: AppDef, cfg: Executor) -> AppDryRunInfo[Any]: # t with open(path, "w") as f: f.write(script) - executor.package(packager=executor.packager, job_name=Path(job_dir).name) - return AppDryRunInfo(req, repr) def schedule(self, dryrun_info: AppDryRunInfo[SlurmBatchRequest]) -> str: # type: ignore