From 3f1b658e730a20b7e7bbb2bc211304b25b7ace13 Mon Sep 17 00:00:00 2001 From: Brandon Butler Date: Mon, 12 Dec 2022 16:03:36 -0500 Subject: [PATCH 1/2] fix: Multi-node GPU submissions for greatlakes and picotte. Fixes logic where the --ntasks-per-node would not normalize based on number of nodes for GPU submissions where the number of tasks is often the number of GPUs. --- flow/templates/drexel-picotte.sh | 2 +- flow/templates/umich-greatlakes.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flow/templates/drexel-picotte.sh b/flow/templates/drexel-picotte.sh index 2a1003c4d..2a9a6e63f 100644 --- a/flow/templates/drexel-picotte.sh +++ b/flow/templates/drexel-picotte.sh @@ -12,7 +12,7 @@ {% set nn = nn|default((nn_cpu, nn_gpu)|max, true) %} {% if partition == 'gpu' %} #SBATCH --nodes={{ nn|default(1, true) }} -#SBATCH --ntasks-per-node={{ (gpu_tasks, cpu_tasks)|max }} +#SBATCH --ntasks-per-node={{ ((gpu_tasks, cpu_tasks)|max / nn)|int }} #SBATCH --gres=gpu:{{ gpu_tasks }} {% else %}{# def partition #} #SBATCH --nodes={{ nn }} diff --git a/flow/templates/umich-greatlakes.sh b/flow/templates/umich-greatlakes.sh index 673b90df0..09e1aa2da 100644 --- a/flow/templates/umich-greatlakes.sh +++ b/flow/templates/umich-greatlakes.sh @@ -12,7 +12,7 @@ {% set nn = nn|default((nn_cpu, nn_gpu)|max, true) %} {% if partition == 'gpu' %} #SBATCH --nodes={{ nn|default(1, true) }} -#SBATCH --ntasks-per-node={{ (gpu_tasks, cpu_tasks)|max }} +#SBATCH --ntasks-per-node={{ ((gpu_tasks, cpu_tasks)|max / nn)|int }} #SBATCH --gpus={{ gpu_tasks }} {% else %}{# standard compute partition #} #SBATCH --nodes={{ nn }} From 4c4fa7b5c7a4c31205aa9b032e11692c3c5c2bdc Mon Sep 17 00:00:00 2001 From: Brandon Butler Date: Wed, 11 Jan 2023 12:34:20 -0500 Subject: [PATCH 2/2] fix: Template handling when ntasks % n_nodes != 0. 
--- flow/templates/drexel-picotte.sh | 12 +++++++++++- flow/templates/umich-greatlakes.sh | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/flow/templates/drexel-picotte.sh b/flow/templates/drexel-picotte.sh index 2a9a6e63f..bcb7336d7 100644 --- a/flow/templates/drexel-picotte.sh +++ b/flow/templates/drexel-picotte.sh @@ -12,11 +12,21 @@ {% set nn = nn|default((nn_cpu, nn_gpu)|max, true) %} {% if partition == 'gpu' %} #SBATCH --nodes={{ nn|default(1, true) }} + {# Check to make sure requested tasks is a multiple of number of nodes. #} + {% if (gpu_tasks, cpu_tasks)|max % nn == 0 %} #SBATCH --ntasks-per-node={{ ((gpu_tasks, cpu_tasks)|max / nn)|int }} + {% else %} +#SBATCH --ntasks={{ (gpu_tasks, cpu_tasks)|max }} + {% endif %} #SBATCH --gres=gpu:{{ gpu_tasks }} {% else %}{# def partition #} #SBATCH --nodes={{ nn }} -#SBATCH --ntasks-per-node={{ (48, cpu_tasks)|min }} + {# Check to make sure requested tasks is a multiple of number of nodes. #} + {% if cpu_tasks % nn == 0 %} +#SBATCH --ntasks-per-node={{ (cpu_tasks / nn)|int }} + {% else %} +#SBATCH --ntasks={{ cpu_tasks }} + {% endif %} {% endif %} {% endblock tasks %} {% block header %} diff --git a/flow/templates/umich-greatlakes.sh b/flow/templates/umich-greatlakes.sh index 09e1aa2da..b8eb466b2 100644 --- a/flow/templates/umich-greatlakes.sh +++ b/flow/templates/umich-greatlakes.sh @@ -12,11 +12,21 @@ {% set nn = nn|default((nn_cpu, nn_gpu)|max, true) %} {% if partition == 'gpu' %} #SBATCH --nodes={{ nn|default(1, true) }} + {# Check to make sure requested tasks is a multiple of number of nodes. 
#} + {% if (gpu_tasks, cpu_tasks)|max % nn == 0 %} #SBATCH --ntasks-per-node={{ ((gpu_tasks, cpu_tasks)|max / nn)|int }} + {% else %} +#SBATCH --ntasks={{ (gpu_tasks, cpu_tasks)|max }} + {% endif %} #SBATCH --gpus={{ gpu_tasks }} {% else %}{# standard compute partition #} #SBATCH --nodes={{ nn }} -#SBATCH --ntasks-per-node={{ (36, cpu_tasks)|min }} + {# Check to make sure requested tasks is a multiple of number of nodes. #} + {% if cpu_tasks % nn == 0 %} +#SBATCH --ntasks-per-node={{ (cpu_tasks / nn)|int }} + {% else %} +#SBATCH --ntasks={{ cpu_tasks }} + {% endif %} {% endif %} {% endblock tasks %} {% block header %}