diff --git a/flow/environment.py b/flow/environment.py
index db73231f1..605bed25c 100644
--- a/flow/environment.py
+++ b/flow/environment.py
@@ -34,6 +34,7 @@
 from .scheduling.pbs import PBSScheduler
 from .scheduling.simple_scheduler import SimpleScheduler
 from .scheduling.slurm import SlurmScheduler
+from .util.template_filters import calc_num_nodes, calc_tasks

 logger = logging.getLogger(__name__)

@@ -108,6 +109,10 @@ class ComputeEnvironment(metaclass=_ComputeEnvironmentType):
     template = "base_script.sh"
     mpi_cmd = "mpiexec"

+    _cpus_per_node = {"default": -1}
+    _gpus_per_node = {"default": -1}
+    _shared_partitions = set()
+
     @classmethod
     def is_present(cls):
         """Determine whether this specific compute environment is present.
@@ -283,6 +288,67 @@ def _get_default_directives(cls):
             )
         )

+    @classmethod
+    def _get_scheduler_values(cls, context):
+        """Return a dictionary of computed quantities regarding submission.
+
+        Warning
+        -------
+        Must be called after the rest of the template context has been gathered.
+        """
+        partition = context.get("partition", None)
+        force = context.get("force", False)
+        if force or partition in cls._shared_partitions:
+            threshold = 0.0
+        else:
+            threshold = 0.9
+        cpu_tasks_total = calc_tasks(
+            context["operations"],
+            "np",
+            context.get("parallel", False),
+            context.get("force", False),
+        )
+        gpu_tasks_total = calc_tasks(
+            context["operations"],
+            "ngpu",
+            context.get("parallel", False),
+            context.get("force", False),
+        )
+
+        if gpu_tasks_total > 0:
+            num_nodes_gpu = cls._calc_num_nodes(
+                gpu_tasks_total, cls._get_gpus_per_node(partition), threshold
+            )
+            num_nodes_cpu = cls._calc_num_nodes(
+                cpu_tasks_total, cls._get_cpus_per_node(partition), 0
+            )
+        else:
+            num_nodes_gpu = 0
+            num_nodes_cpu = cls._calc_num_nodes(
+                cpu_tasks_total, cls._get_cpus_per_node(partition), threshold
+            )
+        num_nodes = max(num_nodes_cpu, num_nodes_gpu, 1)
+        return {
+            "ncpu_tasks": cpu_tasks_total,
+            "ngpu_tasks": gpu_tasks_total,
+            "num_nodes": num_nodes,
+        }
+
+    @classmethod
+    def _get_cpus_per_node(cls, partition):
+        return cls._cpus_per_node.get(partition, cls._cpus_per_node["default"])
+
+    @classmethod
+    def _get_gpus_per_node(cls, partition):
+        return cls._gpus_per_node.get(partition, cls._gpus_per_node["default"])
+
+    @classmethod
+    def _calc_num_nodes(cls, tasks, processors, threshold):
+        """Call calc_num_nodes but handle the -1 sentinel value."""
+        if processors == -1:
+            return 1
+        return calc_num_nodes(tasks, processors, threshold)
+

 class StandardEnvironment(ComputeEnvironment):
     """Default environment which is always present."""
diff --git a/flow/environments/incite.py b/flow/environments/incite.py
index 8225b18b7..5b47e1482 100644
--- a/flow/environments/incite.py
+++ b/flow/environments/incite.py
@@ -36,8 +36,8 @@ def my_operation(job):
     hostname_pattern = r".*\.summit\.olcf\.ornl\.gov"
     template = "summit.sh"
     mpi_cmd = "jsrun"
-    cores_per_node = 42
-    gpus_per_node = 6
+    _cpus_per_node = {"default": 42}
+    _gpus_per_node = {"default": 6}

     @template_filter
     def calc_num_nodes(cls, resource_sets, parallel=False):
@@ -187,7 +187,8 @@ class AndesEnvironment(DefaultSlurmEnvironment):
     hostname_pattern = r"andes-.*\.olcf\.ornl\.gov"
     template = "andes.sh"
     mpi_cmd = "srun"
-    cores_per_node = 32
+    _cpus_per_node = {"default": 32, "gpu": 28}
+    _gpus_per_node = {"default": 0, "gpu": 2}

     @classmethod
     def add_args(cls, parser):
@@ -216,8 +217,9 @@ class CrusherEnvironment(DefaultSlurmEnvironment):

     hostname_pattern = r".*\.crusher\.olcf\.ornl\.gov"
     template = "crusher.sh"
-    cores_per_node = 56
-    gpus_per_node = 8
+    _cpus_per_node = {"default": 56}
+    _gpus_per_node = {"default": 8}
+    mpi_cmd = "srun"

     @template_filter
@@ -267,25 +269,10 @@ class FrontierEnvironment(DefaultSlurmEnvironment):

     hostname_pattern = r".*\.frontier\.olcf\.ornl\.gov"
     template = "frontier.sh"
-    cores_per_node = 56
-    gpus_per_node = 8
+    _cpus_per_node = {"default": 56}
+    _gpus_per_node = {"default": 8}
     mpi_cmd = "srun"

-    @template_filter
-    def calc_num_nodes(cls, ngpus, ncpus, threshold):
-        """Compute the number of nodes needed to meet the resource request.
-
-        Also raise an error when the requested resource do not come close to saturating the asked
-        for nodes.
-        """
-        nodes_gpu = max(1, int(ceil(ngpus / cls.gpus_per_node)))
-        nodes_cpu = max(1, int(ceil(ncpus / cls.cores_per_node)))
-        if nodes_gpu >= nodes_cpu:
-            check_utilization(nodes_gpu, ngpus, cls.gpus_per_node, threshold, "compute")
-            return nodes_gpu
-        check_utilization(nodes_cpu, ncpus, cls.cores_per_node, threshold, "compute")
-        return nodes_cpu
-
     @classmethod
     def _get_mpi_prefix(cls, operation, parallel):
         """Get the correct srun command for the job.
diff --git a/flow/environments/umich.py b/flow/environments/umich.py
index d68b611a0..fed0ecb95 100644
--- a/flow/environments/umich.py
+++ b/flow/environments/umich.py
@@ -13,7 +13,11 @@ class GreatLakesEnvironment(DefaultSlurmEnvironment):

     hostname_pattern = r"gl(-login)?[0-9]+\.arc-ts\.umich\.edu"
     template = "umich-greatlakes.sh"
-    cores_per_node = 1
+    _cpus_per_node = {"default": 36, "gpu": 40}
+    _gpus_per_node = {"default": 2}
+    _shared_partitions = {"standard", "gpu"}
+
+    mpi_cmd = "srun"

     @classmethod
     def add_args(cls, parser):
diff --git a/flow/environments/xsede.py b/flow/environments/xsede.py
index ab5bf1351..b8923bb22 100644
--- a/flow/environments/xsede.py
+++ b/flow/environments/xsede.py
@@ -25,6 +25,14 @@ class Stampede2Environment(DefaultSlurmEnvironment):
     mpi_cmd = "ibrun"
     offset_counter = 0
     base_offset = _STAMPEDE_OFFSET
+    _cpus_per_node = {
+        "default": 48,
+        "skx-dev": 68,
+        "skx-normal": 68,
+        "skx-large": 68,
+        "icx-normal": 80,
+    }
+    _gpus_per_node = {"default": 0.0}

     @template_filter
     def return_and_increment(cls, increment):
@@ -138,8 +146,10 @@ class Bridges2Environment(DefaultSlurmEnvironment):

     hostname_pattern = r".*\.bridges2\.psc\.edu$"
     template = "bridges2.sh"
-    cores_per_node = 128
     mpi_cmd = "mpirun"
+    _cpus_per_node = {"default": 128, "EM": 96, "GPU": 40, "GPU-shared": 40}
+    _gpus_per_node = {"default": 8}
+    _shared_partitions = {"RM-shared", "GPU-shared"}

     @classmethod
     def add_args(cls, parser):
@@ -175,8 +185,9 @@ class ExpanseEnvironment(DefaultSlurmEnvironment):

     hostname_pattern = r".*\.expanse\.sdsc\.edu$"
     template = "expanse.sh"
-    cores_per_node = 128
-    gpus_per_node = 4
+    _cpus_per_node = {"default": 128, "GPU": 40}
+    _gpus_per_node = {"default": 4}
+    _shared_partitions = {"shared", "gpu-shared"}

     @classmethod
     def add_args(cls, parser):
@@ -218,7 +229,15 @@ class DeltaEnvironment(DefaultSlurmEnvironment):
     # be safer given the parts listed are less likely to change.
     hostname_pattern = r"(gpua|dt|cn)(-login)?[0-9]+\.delta.*\.ncsa.*\.edu"
     template = "delta.sh"
-    cores_per_node = 128
+    _cpus_per_node = {
+        "default": 128,
+        "gpuA40x4": 64,
+        "gpuA100x4": 64,
+        "gpuA100x8": 128,
+        "gpuMI100x8": 128,
+    }
+    _gpus_per_node = {"default": 4, "gpuA100x8": 8, "gpuMI100x8": 8}
+    _shared_partitions = {"cpu", "gpuA100x4", "gpuA40x4", "gpuA100x8", "gpuMI100x8"}

     @classmethod
     def add_args(cls, parser):
diff --git a/flow/project.py b/flow/project.py
index 0bf3b93cd..47a17c0f0 100644
--- a/flow/project.py
+++ b/flow/project.py
@@ -663,11 +663,6 @@ def __init__(
         self._project = project
         self.submit_options = submit_options
         self.run_options = run_options
-        # We register aggregators associated with operation functions in
-        # `_register_groups` and we do not set the aggregator explicitly.
-        # We delay setting the aggregator because we do not restrict the
-        # decorator placement in terms of `@FlowGroupEntry`, `@aggregator`, or
-        # `@operation`.
         self.group_aggregator = group_aggregator

     def __call__(self, func=None, /, *, directives=None):
@@ -1534,11 +1529,6 @@ def _internal_call(

         # Append the name and function to the class registry
         self._parent_class._OPERATION_FUNCTIONS.append((name, func))
-        # We register aggregators associated with operation functions in
-        # `_register_groups` and we do not set the aggregator explicitly. We
-        # delay setting the aggregator because we do not restrict the decorator
-        # placement in terms of `@FlowGroupEntry`, `@aggregator`, or
-        # `@operation`.
         self._parent_class._GROUPS.append(
             FlowGroupEntry(name=name, project=self._parent_class)
         )
@@ -4004,6 +3994,7 @@ def _generate_submit_script(
         context["id"] = _id
         context["operations"] = list(operations)
         context.update(kwargs)
+        context["resources"] = self._environment._get_scheduler_values(context)
         if show_template_help:
             self._show_template_help_and_exit(template_environment, context)
         return template.render(**context)
diff --git a/flow/templates/andes.sh b/flow/templates/andes.sh
index 27a0172e4..5fd55ff6e 100644
--- a/flow/templates/andes.sh
+++ b/flow/templates/andes.sh
@@ -1,37 +1,24 @@
 {# Templated in accordance with: https://docs.olcf.ornl.gov/systems/andes_user_guide.html #}
 {% extends "slurm.sh" %}
 {% block tasks %}
-    {% set threshold = 0 if force else 0.9 %}
-    {% set cpu_tasks = operations|calc_tasks('np', parallel, force) %}
-    {% set gpu_tasks = operations|calc_tasks('ngpu', parallel, force) %}
-    {% if gpu_tasks %}
+    {% if resources.ngpu_tasks %}
         {% if not ('GPU' in partition or force) %}
             {% raise "GPU operations require a GPU partition!" %}
         {% endif %}
-        {# GPU nodes have 2 NVIDIA K80s #}
-        {% set nn_gpu = gpu_tasks|calc_num_nodes(2) %}
-        {% set nn = nn_gpu %}
     {% else %}
         {% if 'gpu' in partition and not force %}
             {% raise "Requesting gpu partition, but no GPUs requested!" %}
         {% endif %}
-        {% set nn = nn|default(cpu_tasks|calc_num_nodes(32), true) %}
     {% endif %}
     {% if 'gpu' in partition %}
-        {% set gpus_per_node = (gpu_tasks / nn)|round(0, 'ceil')|int %}
-        {% set cpus_per_node = (cpu_tasks / nn)|round(0, 'ceil')|int %}
-        {% if cpus_per_node > gpus_per_node * 14 and not force %}
+        {% if resources.ncpu_tasks > resources.ngpu_tasks * 14 and not force %}
            {% raise "Cannot request more than 14 CPUs per GPU." %}
        {% endif %}
     {% endif %}
+#SBATCH -N {{ resources.num_nodes }}
+#SBATCH --ntasks={{ resources.ncpu_tasks }}
     {% if partition == 'gpu' %}
-#SBATCH -N {{ nn|check_utilization(gpu_tasks, 2, threshold, 'GPU') }}
-#SBATCH --ntasks-per-node={{ cpus_per_node }}
-#SBATCH --gpus={{ gpu_tasks }}
-    {% else %}
-    {# This should cover batch #}
-#SBATCH -N {{ nn|check_utilization(cpu_tasks, 32, threshold, 'CPU') }}
-#SBATCH --ntasks-per-node={{ (32, cpu_tasks)|min }}
+#SBATCH --gpus={{ resources.ngpu_tasks }}
     {% endif %}
 {% endblock tasks %}
 {% block header %}
diff --git a/flow/templates/bridges2.sh b/flow/templates/bridges2.sh
index f04ede0be..6606021e1 100644
--- a/flow/templates/bridges2.sh
+++ b/flow/templates/bridges2.sh
@@ -1,48 +1,27 @@
 {# Templated in accordance with: https://www.psc.edu/resources/bridges-2/user-guide #}
 {% extends "slurm.sh" %}
 {% block tasks %}
-    {% set threshold = 0 if force else 0.9 %}
-    {% set cpu_tasks = operations|calc_tasks('np', parallel, force) %}
-    {% set gpu_tasks = operations|calc_tasks('ngpu', parallel, force) %}
-    {% if gpu_tasks %}
+    {% if resources.ngpu_tasks %}
         {% if not ('GPU' in partition or force) %}
             {% raise "GPU operations require a GPU partition!" %}
         {% endif %}
-        {#- GPU nodes have 8 NVIDIA V100-32GB SXM2 #}
-        {% set nn_gpu = gpu_tasks|calc_num_nodes(8) %}
-        {% set nn = nn_gpu %}
+        {% if partition == "GPU-shared" and resources.ngpu_tasks > 4 %}
+            {% raise "Cannot request GPU-shared with more than 4 GPUs." %}
+        {% endif %}
     {% else %}
         {% if 'GPU' in partition and not force %}
             {% raise "Requesting GPU partition, but no GPUs requested!" %}
         {% endif %}
-        {% set nn = nn|default(cpu_tasks|calc_num_nodes(128), true) %}
     {% endif %}
-    {% if 'GPU' in partition %}
-        {% set gpus_per_node = (gpu_tasks / nn)|round(0, 'ceil')|int %}
-        {% set cpus_per_node = (cpu_tasks / nn)|round(0, 'ceil')|int %}
-        {% if cpus_per_node > gpus_per_node * 5 and not force %}
-            {% raise "Cannot request more than 5 CPUs per GPU." %}
-        {% endif %}
+    {% if partition == 'RM-shared' and resources.ncpu_tasks > 64 %}
+        {% raise "Cannot request RM-shared with more than 64 tasks or multiple nodes." %}
     {% endif %}
-    {% if partition == 'GPU' %}
-#SBATCH -N {{ nn|check_utilization(gpu_tasks, 8, threshold, 'GPU') }}
-#SBATCH --gpus={{ gpu_tasks }}
-    {% elif partition == 'GPU-shared' %}
-#SBATCH -N {{ nn|check_utilization(gpu_tasks, 1, threshold, 'GPU') }}
-#SBATCH --gpus={{ gpu_tasks }}
-    {% elif partition == 'EM' %}
-#SBATCH -N {{ nn|check_utilization(cpu_tasks, 96, threshold, 'CPU') }}
-#SBATCH --ntasks-per-node={{ (96, cpu_tasks)|min }}
-    {% elif partition == 'RM-shared' %}
-        {% if nn|default(1, true) > 1 or cpu_tasks > 64 %}
-            {% raise "Cannot request RM-shared with more than 64 tasks or multiple nodes." %}
-        {% endif %}
-#SBATCH -N {{ nn|default(1, true) }}
-#SBATCH --ntasks={{ cpu_tasks }}
-    {% else %}
-{#- This should cover RM, RM-512, and possibly RM-small (not documented) #}
-#SBATCH -N {{ nn|check_utilization(cpu_tasks, 128, threshold, 'CPU') }}
-#SBATCH --ntasks-per-node={{ (128, cpu_tasks)|min }}
+    {% if resources.num_nodes > 1 or resources.ncpu_tasks >= 128 or resources.ngpu_tasks >= 8 %}
+#SBATCH -N {{ resources.num_nodes }}
+    {% endif %}
+#SBATCH --ntasks={{ resources.ncpu_tasks }}
+    {% if 'GPU' in partition %}
+#SBATCH --gpus={{ resources.ngpu_tasks }}
     {% endif %}
 {% endblock tasks %}
 {% block header %}
diff --git a/flow/templates/crusher.sh b/flow/templates/crusher.sh
index 430be4c9d..842d10bb5 100644
--- a/flow/templates/crusher.sh
+++ b/flow/templates/crusher.sh
@@ -1,11 +1,7 @@
 {# Templated in accordance with: https://docs.olcf.ornl.gov/systems/crusher_quick_start_guide.html #}
 {% extends "slurm.sh" %}
 {% block tasks %}
-    {% set threshold = 0 if force else 0.9 %}
-    {% set cpu_tasks = operations|calc_tasks('np', parallel, force) %}
-    {% set gpu_tasks = operations|calc_tasks('ngpu', parallel, force) %}
-    {% set nn = gpu_tasks|calc_num_nodes(cpu_tasks, threshold) %}
-#SBATCH --nodes={{ nn }}
+#SBATCH --nodes={{ resources.num_nodes }}
 {% endblock tasks %}
 {% block header %}
     {{- super() -}}
diff --git a/flow/templates/delta.sh b/flow/templates/delta.sh
index 8e173ff20..4785da06a 100644
--- a/flow/templates/delta.sh
+++ b/flow/templates/delta.sh
@@ -1,56 +1,26 @@
 {# Templated in accordance with: https://https://wiki.ncsa.illinois.edu/display/DSC/Delta+User+Guide #}
 {% extends "slurm.sh" %}
 {% block tasks %}
-    {% set threshold = 0 if force else 0.9 %}
-    {% set cpu_tasks = operations|calc_tasks('np', parallel, force) %}
-    {% set gpu_tasks = operations|calc_tasks('ngpu', parallel, force) %}
     {% if partition in ["gpuA100x8", "gpuMI100x8"] %}
-        {% raise "Cannot use given partition in default Delta template." %}
+        {% raise "This partition is not supported as it has few nodes,
+                  increased charges and is expected to be suitable for a
+                  minority of use cases." %}
     {% endif %}
-    {% if gpu_tasks %}
-        {% if partition == "gpuA40x4" %}
-            {% set nn_gpu = gpu_tasks|calc_num_nodes(4) %}
-        {% elif partition == "gpuA100x4" %}
-            {% set nn_gpu = gpu_tasks|calc_num_nodes(4) %}
-        {# We do not allow submission to the partitions below as they have few #}
-        {# nodes, should be rarely needed, and have increased charges for use. #}
-        {% elif partition in ["gpuA100x8", "gpuMI100x8"] %}
-            {% raise "This partition is not supported as it has few nodes,
-                      increased charges and is expected to be suitable for a
-                      minority of use cases." %}
-        {% else %}
+    {% if resources.ngpu_tasks %}
+        {% if not ("gpu" in partition or force) %}
             {% raise "GPU operations require a GPU partition!" %}
         {% endif %}
-        {% set nn = nn_gpu %}
     {% else %}
         {% if 'gpu' in partition and not force %}
             {% raise "Requesting GPU partition, but no GPUs requested!" %}
         {% endif %}
-        {% set nn = nn|default(cpu_tasks|calc_num_nodes(128), true) %}
     {% endif %}
-    {% if 'gpu' in partition %}
-        {% set gpus_per_node = (gpu_tasks / nn)|round(0, 'ceil')|int %}
-        {% set cpus_per_node = (cpu_tasks / nn)|round(0, 'ceil')|int %}
-        {% if cpus_per_node > gpus_per_node * 16 and not force %}
-            {% raise "Cannot request more than 16 CPUs per GPU." %}
-        {% endif %}
+    {% if resources.num_nodes > 1 %}
+#SBATCH -N {{ resources.num_nodes }}
     {% endif %}
+#SBATCH --ntasks={{ resources.ncpu_tasks }}
     {% if "gpu" in partition %}
-        {% if nn == 1 %}
-#SBATCH -N {{ nn }}
-        {% else %}
-#SBATCH -N {{ nn|check_utilization(gpu_tasks, 4, threshold, 'GPU') }}
-        {% endif %}
-#SBATCH --ntasks-per-node={{ cpus_per_node }}
-#SBATCH --gpus-per-node={{ gpus_per_node }}
-    {% else %}
-        {% if nn == 1 %}
-#SBATCH -N {{ nn }}
-#SBATCH --ntasks-per-node={{ (128, cpu_tasks)|min }}
-        {% else %}
-#SBATCH -N {{ nn|check_utilization(cpu_tasks, 128, threshold, 'CPU') }}
-#SBATCH --ntasks-per-node={{ (128, cpu_tasks)|min }}
-        {% endif %}
+#SBATCH --gpus={{ resources.ngpu_tasks }}
     {% endif %}
 {% endblock tasks %}
 {% block header %}
diff --git a/flow/templates/expanse.sh b/flow/templates/expanse.sh
index 9ec2778e9..a052726cc 100644
--- a/flow/templates/expanse.sh
+++ b/flow/templates/expanse.sh
@@ -1,50 +1,24 @@
 {# Templated in accordance with: https://www.sdsc.edu/support/user_guides/expanse.html #}
 {% extends "slurm.sh" %}
 {% block tasks %}
-    {% set threshold = 0 if force else 0.9 %}
-    {% set cpu_tasks = operations|calc_tasks('np', parallel, force) %}
-    {% set gpu_tasks = operations|calc_tasks('ngpu', parallel, force) %}
-    {% if gpu_tasks %}
+    {% if resources.ngpu_tasks %}
         {% if not ('gpu' in partition or force) %}
             {% raise "GPU operations require a GPU partition!" %}
         {% endif %}
-        {# GPU nodes have 4 NVIDIA V100-32GB SMX2 #}
-        {% set nn_gpu = gpu_tasks|calc_num_nodes(4) %}
-        {% set nn = nn_gpu %}
     {% else %}
         {% if 'gpu' in partition and not force %}
             {% raise "Requesting GPU partition, but no GPUs requested!" %}
         {% endif %}
-        {% set nn = nn|default(cpu_tasks|calc_num_nodes(128), true) %}
     {% endif %}
-    {% if 'gpu' in partition %}
-        {% set gpus_per_node = (gpu_tasks / nn)|round(0, 'ceil')|int %}
-        {% set cpus_per_node = (cpu_tasks / nn)|round(0, 'ceil')|int %}
-        {% if cpus_per_node > gpus_per_node * 10 and not force %}
-            {% raise "Cannot request more than 10 CPUs per GPU." %}
-        {% endif %}
+    {% if "shared" in partition and resources.num_nodes > 1 %}
+        {% raise "Cannot request shared partition with resources spanning multiple nodes." %}
     {% endif %}
-    {% if partition == 'gpu' %}
-#SBATCH -N {{ nn|check_utilization(gpu_tasks, 4, threshold, 'GPU') }}
-#SBATCH --ntasks-per-node={{ cpus_per_node }}
-#SBATCH --gpus={{ gpu_tasks }}
-    {% elif partition == 'gpu-shared' %}
-        {% if nn|default(1, true) > 1 %}
-            {% raise "Cannot request shared partition with resources spanning multiple nodes." %}
-        {% endif %}
-#SBATCH -N {{ nn|check_utilization(gpu_tasks, 1, threshold, 'GPU') }}
-#SBATCH --ntasks-per-node={{ cpus_per_node }}
-#SBATCH --gpus={{ gpu_tasks }}
-    {% elif partition == 'shared' %}
-        {% if nn|default(1, true) > 1 %}
-            {% raise "Cannot request shared partition with resources spanning multiple nodes." %}
-        {% endif %}
-#SBATCH -N 1
-#SBATCH --ntasks={{ cpu_tasks }}
-    {% else %}
-{# This should cover compute and large-memory #}
-#SBATCH -N {{ nn|check_utilization(cpu_tasks, 128, threshold, 'CPU') }}
-#SBATCH --ntasks-per-node={{ (128, cpu_tasks)|min }}
+    {% if "shared" not in partition %}
+#SBATCH -N {{ resources.num_nodes }}
+    {% endif %}
+#SBATCH --ntasks={{ resources.ncpu_tasks }}
+    {% if 'gpu' in partition %}
+#SBATCH --gpus={{ resources.ngpu_tasks }}
     {% endif %}
 {% endblock tasks %}
 {% block header %}
diff --git a/flow/templates/frontier.sh b/flow/templates/frontier.sh
index c92d65902..4eb5553ac 100644
--- a/flow/templates/frontier.sh
+++ b/flow/templates/frontier.sh
@@ -1,14 +1,10 @@
 {# Templated in accordance with: https://docs.olcf.ornl.gov/systems/crusher_quick_start_guide.html #}
 {% extends "slurm.sh" %}
 {% block tasks %}
-    {% set threshold = 0 if force else 0.9 %}
-    {% set cpu_tasks = operations|calc_tasks('np', parallel, force) %}
-    {% set gpu_tasks = operations|calc_tasks('ngpu', parallel, force) %}
-    {% set nn = gpu_tasks|calc_num_nodes(cpu_tasks, threshold) %}
-    {% if gpu_tasks < 1 and not force %}
+    {% if not resources.ngpu_tasks and not force %}
         {% raise "Must request GPUs to use Frontier." %}
     {% endif %}
-#SBATCH --nodes={{ nn }}
+#SBATCH --nodes={{ resources.num_nodes }}
 {% endblock tasks %}
 {% block header %}
     {{- super() -}}
diff --git a/flow/templates/lsf.sh b/flow/templates/lsf.sh
index 1f47127f9..4c9672112 100644
--- a/flow/templates/lsf.sh
+++ b/flow/templates/lsf.sh
@@ -17,6 +17,6 @@
 #BSUB -eo {{ job_output }}
     {% endif %}
 {% block tasks %}
-#BSUB -n {{ operations|calc_tasks('np', parallel, force) }}
+#BSUB -n {{ resources.ncpu_tasks }}
 {% endblock tasks %}
 {% endblock header %}
diff --git a/flow/templates/pbs.sh b/flow/templates/pbs.sh
index 24211d510..08c28f9f1 100644
--- a/flow/templates/pbs.sh
+++ b/flow/templates/pbs.sh
@@ -20,14 +20,12 @@
 {% endblock preamble %}
 {% block tasks %}
     {% set threshold = 0 if force else 0.9 %}
-    {% set cpu_tasks = operations|calc_tasks('np', parallel, force) %}
-    {% set gpu_tasks = operations|calc_tasks('ngpu', parallel, force) %}
-    {% set s_gpu = ':gpus=1' if gpu_tasks else '' %}
+    {% set s_gpu = ':gpus=1' if resources.ngpu_tasks else '' %}
     {% set ppn = ppn|default(operations|calc_tasks('omp_num_threads', parallel, force), true) %}
     {% if ppn %}
-#PBS -l nodes={{ nn|default(cpu_tasks|calc_num_nodes(ppn, threshold, 'CPU'), true) }}:ppn={{ ppn }}{{ s_gpu }}
+#PBS -l nodes={{ resources.num_nodes }}:ppn={{ ppn }}{{ s_gpu }}
     {% else %}
-#PBS -l procs={{ cpu_tasks }}{{ s_gpu }}
+#PBS -l procs={{ resources.ncpu_tasks }}{{ s_gpu }}
     {% endif %}
 {% endblock tasks %}
 {% endblock header %}
diff --git a/flow/templates/slurm.sh b/flow/templates/slurm.sh
index 6ab494032..88f4c2747 100644
--- a/flow/templates/slurm.sh
+++ b/flow/templates/slurm.sh
@@ -20,6 +20,6 @@
     {% endif %}
 {% endblock preamble %}
 {% block tasks %}
-#SBATCH --ntasks={{ operations|calc_tasks('np', parallel, force) }}
+#SBATCH --ntasks={{ resources.ncpu_tasks }}
 {% endblock tasks %}
 {% endblock header %}
diff --git a/flow/templates/summit.sh b/flow/templates/summit.sh
index e3ac08ba1..910383fdb 100644
--- a/flow/templates/summit.sh
+++ b/flow/templates/summit.sh
@@ -1,9 +1,7 @@
 {# Templated in accordance with: https://www.olcf.ornl.gov/for-users/system-user-guides/summit/running-jobs/ #}
 {% extends "lsf.sh" %}
 {% block tasks %}
-    {% set threshold = 0 if force else 0.9 %}
-    {% set nn = operations|map('guess_resource_sets')|calc_num_nodes(parallel) %}
-#BSUB -nnodes {{ nn }}
+#BSUB -nnodes {{ resources.num_nodes }}
 {% endblock tasks %}
 {% block header %}
     {{- super() -}}
diff --git a/flow/templates/umich-greatlakes.sh b/flow/templates/umich-greatlakes.sh
index 673b90df0..abb640b75 100644
--- a/flow/templates/umich-greatlakes.sh
+++ b/flow/templates/umich-greatlakes.sh
@@ -1,22 +1,16 @@
 {% extends "slurm.sh" %}
 {% set partition = partition|default('standard', true) %}
 {% block tasks %}
-    {% set threshold = 0 if force else 0.9 %}
-    {% set cpu_tasks = operations|calc_tasks('np', parallel, force) %}
-    {% set gpu_tasks = operations|calc_tasks('ngpu', parallel, force) %}
-    {% if gpu_tasks and 'gpu' not in partition and not force %}
+    {% if resources.ngpu_tasks and 'gpu' not in partition and not force %}
         {% raise "Requesting GPUs requires a gpu partition!" %}
     {% endif %}
-    {% set nn_cpu = cpu_tasks|calc_num_nodes(36) if 'gpu' not in partition else cpu_tasks|calc_num_nodes(40) %}
-    {% set nn_gpu = gpu_tasks|calc_num_nodes(2) if 'gpu' in partition else 0 %}
-    {% set nn = nn|default((nn_cpu, nn_gpu)|max, true) %}
+    {% if 'gpu' in partition and resources.ngpu_tasks == 0 and not force %}
+        {% raise "Requesting gpu partition without GPUs!" %}
+    {% endif %}
+#SBATCH --nodes={{ resources.num_nodes }}
+#SBATCH --ntasks={{ resources.ncpu_tasks }}
     {% if partition == 'gpu' %}
-#SBATCH --nodes={{ nn|default(1, true) }}
-#SBATCH --ntasks-per-node={{ (gpu_tasks, cpu_tasks)|max }}
-#SBATCH --gpus={{ gpu_tasks }}
-    {% else %}{# standard compute partition #}
-#SBATCH --nodes={{ nn }}
-#SBATCH --ntasks-per-node={{ (36, cpu_tasks)|min }}
+#SBATCH --gpus={{ resources.ngpu_tasks }}
     {% endif %}
 {% endblock tasks %}
 {% block header %}
diff --git a/tests/template_reference_data.tar.gz b/tests/template_reference_data.tar.gz
index 0614d133b..96cdbcb95 100644
Binary files a/tests/template_reference_data.tar.gz and b/tests/template_reference_data.tar.gz differ
diff --git a/tests/test_project.py b/tests/test_project.py
index 93b769ca7..ef2bb52b3 100644
--- a/tests/test_project.py
+++ b/tests/test_project.py
@@ -1149,7 +1149,9 @@ def test_submit_operations(self):
         assert len(list(MockScheduler.jobs())) == 0
         cluster_job_id = project._store_bundled(operations)
         with redirect_stderr(StringIO()):
-            project._submit_operations(_id=cluster_job_id, operations=operations)
+            project._submit_operations(
+                _id=cluster_job_id, operations=operations, force=True
+            )
         assert len(list(MockScheduler.jobs())) == 1

     def test_submit(self):
@@ -1208,13 +1210,15 @@ def test_resubmit(self):
     def test_bundles(self):
         project = self.mock_project()
         assert len(list(MockScheduler.jobs())) == 0
+        # Cannot use GPU operations since this will lead to mismatched resources
+        op_names = ["op1", "op2", "op4"]
         with redirect_stderr(StringIO()):
-            project.submit(bundle_size=2, num=2)
+            project.submit(bundle_size=2, num=2, names=op_names)
         assert len(list(MockScheduler.jobs())) == 1
-        project.submit(bundle_size=2, num=4)
+        project.submit(bundle_size=2, num=4, names=op_names)
         assert len(list(MockScheduler.jobs())) == 3
         MockScheduler.reset()
-        project.submit(bundle_size=0)
+        project.submit(bundle_size=0, names=op_names)
         assert len(list(MockScheduler.jobs())) == 1

     def test_submit_status(self):
@@ -1265,7 +1269,9 @@ def test_submit_operations_bad_directive(self):
         project = self.mock_project()
         operations = []
         for job in project:
-            operations.extend(project._next_operations([(job,)]))
+            operations.extend(
+                project._next_operations([(job,)], operation_names=["op1"])
+            )
         assert len(list(MockScheduler.jobs())) == 0
         cluster_job_id = project._store_bundled(operations)
         stderr = StringIO()
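
Note (not part of the patch): the snippet below is a minimal, self-contained sketch of how the partition-keyed _cpus_per_node lookup and the -1 "unknown node size" sentinel introduced in flow/environment.py are expected to behave. The _SketchEnvironment class is illustrative only; its values are copied from the Bridges2Environment hunk above, it does not import flow, and the utilization-threshold check performed by the real calc_num_nodes template filter is omitted.

# sketch_scheduler_values.py -- illustration only, not part of the library API
from math import ceil


class _SketchEnvironment:
    # Values copied from the Bridges2Environment hunk above.
    _cpus_per_node = {"default": 128, "EM": 96, "GPU": 40, "GPU-shared": 40}
    _gpus_per_node = {"default": 8}

    @classmethod
    def _get_cpus_per_node(cls, partition):
        # Unknown partitions fall back to the "default" entry.
        return cls._cpus_per_node.get(partition, cls._cpus_per_node["default"])

    @classmethod
    def _calc_num_nodes(cls, tasks, processors, threshold):
        # -1 means "node size unknown"; assume everything fits on one node.
        if processors == -1:
            return 1
        # Stand-in for flow.util.template_filters.calc_num_nodes; the real
        # filter also raises if node utilization falls below `threshold`.
        return max(1, int(ceil(tasks / processors)))


assert _SketchEnvironment._get_cpus_per_node("EM") == 96
assert _SketchEnvironment._get_cpus_per_node("RM") == 128      # unknown -> default
assert _SketchEnvironment._calc_num_nodes(200, 128, 0.9) == 2  # 200 CPU tasks -> 2 nodes
assert _SketchEnvironment._calc_num_nodes(200, -1, 0.9) == 1   # sentinel -> single node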