Skip to content

Commit

Permalink
Make worker spawning in Slurm allocations more robust
Browse files Browse the repository at this point in the history
  • Loading branch information
Kobzol committed Aug 17, 2024
1 parent 91ddb55 commit a49ef4a
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 2 deletions.
12 changes: 11 additions & 1 deletion crates/hyperqueue/src/server/autoalloc/queue/slurm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,17 @@ fn build_slurm_submit_script(
writeln!(script, "#SBATCH {sbatch_args}").unwrap();
}

let prefix = if nodes > 1 { "srun --overlap " } else { "" };
// Some Slurm clusters have a default that does not play well with simply running
// `srun`. For example, they can configure `--ntasks-per-node X` as a default option.
// We should make sure that we execute exactly the number of workers that we want, on exactly
// the number of nodes that we want. Therefore, we use `--ntasks` and `--nodes`.
// The `--overlap` parameter is then used to make sure that nested invocations within the HQ
// worker will still be able to consume Slurm resources.
let prefix = if nodes > 1 {
format!("srun --overlap --ntasks={nodes} --nodes={nodes} ")
} else {
"".to_string()
};
write!(script, "\n{prefix}{worker_cmd}").unwrap();
script
}
2 changes: 1 addition & 1 deletion tests/autoalloc/test_autoalloc.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ def test_slurm_multinode_allocation(hq_env: HqEnv):
with open(sbatch_script_path) as f:
commands = normalize_output(hq_env, "slurm", extract_script_commands(f.read()))
assert commands == snapshot(
'srun --overlap RUST_LOG=tako=trace,hyperqueue=trace <hq-binary> worker start --idle-timeout "5m"'
'srun --overlap --ntasks=2 --nodes=2 RUST_LOG=tako=trace,hyperqueue=trace <hq-binary> worker start --idle-timeout "5m"'
' --manager "<manager>" --server-dir "<server-dir>/001" --on-server-lost "finish-running" --time-limit'
' "1h"'
)
Expand Down

0 comments on commit a49ef4a

Please sign in to comment.