Skip to content

Commit

Permalink
fix gpus per node env var
Browse files — browse the repository at this point in the history
  • Loading branch information
matbun committed Nov 7, 2024
1 parent 75e0ff7 commit f1d2b2b
Showing 1 changed file with 4 additions and 1 deletion.
5 changes: 4 additions & 1 deletion ci/src/main/__init__.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -99,6 +99,7 @@ async def test_remote(self, kubeconfig_str: str)->str:
# pre_exec_cmd = (
# "ls /pippo"
# )
gpus_per_node = 4
pre_exec_cmd = (
"export CONTAINER_PATH=itwinai_dist_test.sif "
f"&& singularity pull --force $CONTAINER_PATH docker://{self.full_name} "
Expand All @@ -113,11 +114,13 @@ async def test_remote(self, kubeconfig_str: str)->str:
"&& export DIST_MODE=ddp "
"&& export RUN_NAME=ddp-itwinai "
"&& export COMMAND='pytest -v -m torch_dist' "
# Quick fix
f"&& export SLURM_GPUS_PER_NODE={gpus_per_node} "
# Launch code in SLURM job
"&& source slurm.vega.sh "
)
annotations = {
"slurm-job.vk.io/flags": "-p gpu --gres=gpu:4 --ntasks-per-node=1 --nodes=1 --time=00:10:00",
"slurm-job.vk.io/flags": f"-p gpu --gres=gpu:{gpus_per_node} --ntasks-per-node=1 --nodes=1 --time=00:10:00",
"slurm-job.vk.io/pre-exec": f" {pre_exec_cmd} || export SINGULARITYENV_PRE_EXEC_RETURN_CODE=1"
}
image_path = "/ceph/hpc/data/st2301-itwin-users/cern/hello-world-image.sif"
Expand Down

0 comments on commit f1d2b2b

Please sign in to comment.