From da72e5db3a31ffa279b47b7dbc19fcd8fb9cb604 Mon Sep 17 00:00:00 2001 From: Henry LE BERRE Date: Sat, 6 Jan 2024 01:26:10 -0500 Subject: [PATCH] Batch files per computer (#240 & 287) --- toolchain/mfc/args.py | 8 ++- toolchain/mfc/common.py | 1 + toolchain/mfc/run/engines.py | 20 +++--- toolchain/mfc/run/queues.py | 30 +++++--- toolchain/templates/computer/summit.sh | 50 +++++++++++++ toolchain/templates/generic/lsf.sh | 54 ++++++++++++++ toolchain/templates/generic/pbs.sh | 65 +++++++++++++++++ toolchain/templates/generic/slurm.sh | 76 ++++++++++++++++++++ toolchain/templates/lsf.sh | 88 ----------------------- toolchain/templates/pbs.sh | 90 ----------------------- toolchain/templates/slurm.sh | 99 -------------------------- 11 files changed, 287 insertions(+), 294 deletions(-) create mode 100644 toolchain/templates/computer/summit.sh create mode 100644 toolchain/templates/generic/lsf.sh create mode 100644 toolchain/templates/generic/pbs.sh create mode 100644 toolchain/templates/generic/slurm.sh delete mode 100644 toolchain/templates/lsf.sh delete mode 100644 toolchain/templates/pbs.sh delete mode 100644 toolchain/templates/slurm.sh diff --git a/toolchain/mfc/args.py b/toolchain/mfc/args.py index 9fe355b9c0..449bb80cc1 100644 --- a/toolchain/mfc/args.py +++ b/toolchain/mfc/args.py @@ -6,7 +6,7 @@ from .run.engines import ENGINES from .run.mpi_bins import BINARIES -# pylint: disable=too-many-locals, too-many-statements +# pylint: disable=too-many-locals, too-many-branches, too-many-statements def parse(config): parser = argparse.ArgumentParser( prog="./mfc.sh", @@ -123,6 +123,7 @@ def add_common_arguments(p, mask = None): run.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.") run.add_argument("--wait", action="store_true", default=False, help="(Batch) Wait for the job to finish.") run.add_argument("-f", "--flags", metavar="FLAGS", dest="--", nargs=argparse.REMAINDER, type=str, default=[], help="(Interactive) 
Arguments to forward to the MPI invocation.") + run.add_argument("-c", "--computer", metavar="COMPUTER", type=str, default=None, help="(Batch) Computer to run on or path to a template batch submission file.") # === BENCH === add_common_arguments(bench, "t") @@ -157,6 +158,11 @@ def add_common_arguments(p, mask = None): if args["command"] == "build": if (args["input"] is not None) ^ args["case_optimization"] : raise MFCException("./mfc.sh build's --case-optimization requires --input") + if args["command"] == "run" and args["engine"] == "batch": + if args["computer"] is None: + raise MFCException("./mfc.sh run's --computer is required when --engine=batch") + if args["binary"] is not None: + raise MFCException("./mfc.sh run's --binary is not allowed when --engine=batch") # Input files to absolute paths for e in ["input", "input1", "input2"]: diff --git a/toolchain/mfc/common.py b/toolchain/mfc/common.py index 3701a4e89f..47fa516c5f 100644 --- a/toolchain/mfc/common.py +++ b/toolchain/mfc/common.py @@ -8,6 +8,7 @@ MFC_ROOTDIR = normpath(f"{dirname(realpath(__file__))}/../..") MFC_TESTDIR = abspath(f"{MFC_ROOTDIR}/tests") MFC_SUBDIR = abspath(f"{MFC_ROOTDIR}/build") +MFC_TEMPLATEDIR = abspath(f"{MFC_ROOTDIR}/toolchain/templates") MFC_LOCK_FILEPATH = abspath(f"{MFC_SUBDIR}/lock.yaml") MFC_BENCH_FILEPATH = abspath(f"{MFC_ROOTDIR}/toolchain/bench.yaml") diff --git a/toolchain/mfc/run/engines.py b/toolchain/mfc/run/engines.py index 988363c926..6f0815d76d 100644 --- a/toolchain/mfc/run/engines.py +++ b/toolchain/mfc/run/engines.py @@ -215,17 +215,22 @@ def __get_batch_filepath(self): self.__get_batch_filename() ])) - def __generate_prologue(self, qsystem: queues.QueueSystem) -> str: - modules = f"" - + def __generate_module_load(self) -> str: if does_system_use_modules(): - modules = f"""\ -printf ":) Loading modules...\\n" + return f"""\ +printf ":) Loading modules...\\n" module purge module load {' '.join(get_loaded_modules())} """ + return f"""\ +printf ":) Loading 
modules...\\n" + +# No modules to load. +""" + + def __generate_prologue(self, qsystem: queues.QueueSystem) -> str: return f"""\ TABLE_FORMAT_LINE="| - %-14s %-35s - %-14s %-35s |\\n" TABLE_HEADER="+-----------------------------------------------------------------------------------------------------------+ \\n" @@ -245,8 +250,6 @@ def __generate_prologue(self, qsystem: queues.QueueSystem) -> str: printf "$TABLE_CONTENT\\n" printf "$TABLE_FOOTER\\n" -{modules} - cd "{self.input.case_dirpath}" t_start=$(date +%s) @@ -282,6 +285,7 @@ def __batch_evaluate(self, s: str, qsystem: queues.QueueSystem, targets: typing. ("{MFC::PROLOGUE}", self.__generate_prologue(qsystem)), ("{MFC::PROFILER}", ' '.join(profiler_prepend())), ("{MFC::EPILOGUE}", self.__generate_epilogue()), + ("{MFC::MODULES}", self.__generate_module_load()), ("{MFC::BINARIES}", ' '.join([f"'{target.get_install_binpath()}'" for target in targets])), ] @@ -310,7 +314,7 @@ def __create_batch_file(self, qsystem: queues.QueueSystem, targets: typing.List[ cons.print("> Generating batch file...") filepath = self.__get_batch_filepath() cons.print("> Evaluating template file...") - content = self.__batch_evaluate(qsystem.template, qsystem, targets) + content = self.__batch_evaluate(qsystem.get_template(), qsystem, targets) cons.print("> Writing batch file...") file_write(filepath, content) diff --git a/toolchain/mfc/run/queues.py b/toolchain/mfc/run/queues.py index acb833888c..33eeb88533 100644 --- a/toolchain/mfc/run/queues.py +++ b/toolchain/mfc/run/queues.py @@ -2,15 +2,29 @@ from mfc import common from ..state import ARG +from ..common import MFC_TEMPLATEDIR @dataclasses.dataclass class QueueSystem: - name: str - template: str + name: str - def __init__(self, name: str, filename: str) -> None: - self.name = name - self.template = common.file_read(os.sep.join(["toolchain", "templates", filename])) + def __init__(self, name: str) -> None: + self.name = name + + def get_template(self) -> str: + candidates = [ + 
ARG("computer"), + os.path.join(MFC_TEMPLATEDIR, "computer", ARG("computer")), + os.path.join(MFC_TEMPLATEDIR, "generic", ARG("computer")), + ] + + for candidate in candidates: + for ext in ["", ".sh"]: + filepath = f"{candidate}{ext}" + if os.path.isfile(filepath): + return common.file_read(filepath) + + raise common.MFCException(f"QueueSystem: Failed to find computer/template file for {ARG('computer')}.") def is_active(self) -> bool: raise common.MFCException("QueueSystem::is_active: not implemented.") @@ -21,7 +35,7 @@ def gen_submit_cmd(self, filepath: str) -> typing.List[str]: class PBSSystem(QueueSystem): def __init__(self) -> None: - super().__init__("PBS", "pbs.sh") + super().__init__("PBS") def is_active(self) -> bool: return common.does_command_exist("qsub") @@ -35,7 +49,7 @@ def gen_submit_cmd(self, filepath: str) -> typing.List[str]: class LSFSystem(QueueSystem): def __init__(self) -> None: - super().__init__("LSF", "lsf.sh") + super().__init__("LSF") def is_active(self) -> bool: return common.does_command_exist("bsub") and common.does_command_exist("bqueues") @@ -51,7 +65,7 @@ def gen_submit_cmd(self, filepath: str) -> None: class SLURMSystem(QueueSystem): def __init__(self) -> None: - super().__init__("SLURM", "slurm.sh") + super().__init__("SLURM") def is_active(self) -> bool: return common.does_command_exist("sbatch") diff --git a/toolchain/templates/computer/summit.sh b/toolchain/templates/computer/summit.sh new file mode 100644 index 0000000000..444e075a19 --- /dev/null +++ b/toolchain/templates/computer/summit.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +#> +#> This file is part of the ./mfc.sh run subsystem. Expressions enclosed in +#> curly braces are expanded and evaluated using Python's eval() function and +#> data from ./mfc.sh run. The resulting file is submitted to the queue system. +#> +#BSUB -J {name} +#BSUB -nnodes {nodes} +#BSUB -N +#BSUB -P {account} +#BSUB -W {walltime[:-3]} + + +. 
./mfc.sh load -c s -m {'g' if gpu else 'c'} + + +#> +#> The MFC prologue sets up the environment required to run MFC prior to +#> execution and starts the timer. +#> +{MFC::PROLOGUE} + + +#> +#> Iterate over all MFC binaries (as specified through --targets) and execute +#> them, one by one, with profiling enabled if requested. +#> +for binpath in {MFC::BINARIES}; do + + echo -e ":) Running $binpath:" + echo "" + + jsrun \ + {'--smpiargs="-gpu"' if gpu else ''} \ + --nrs {tasks_per_node*nodes} \ + --cpu_per_rs 1 \ + --gpu_per_rs {1 if gpu else 0} \ + --tasks_per_rs 1 \ + {MFC::PROFILER} "$binpath" + + echo "" + +done + + +#> +#> The MFC epilogue stops the timer and prints the execution summary. It also +#> performs some cleanup and housekeeping tasks before exiting. +#> +{MFC::EPILOGUE} diff --git a/toolchain/templates/generic/lsf.sh b/toolchain/templates/generic/lsf.sh new file mode 100644 index 0000000000..feed1b911a --- /dev/null +++ b/toolchain/templates/generic/lsf.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +#> +#> This file is part of the ./mfc.sh run subsystem. Expressions enclosed in +#> curly braces are expanded and evaluated using Python's eval() function and +#> data from ./mfc.sh run. The resulting file is submitted to the queue system. +#> +#BSUB -J {name} +#BSUB -nnodes {nodes} +#BSUB -N +#BSUB -P {account} +#BSUB -W {walltime[:-3]} + + +#> +#> Load the same modules as the ones currently loaded in the login shell. These +#> are usually the ones used to compile MFC. +#> +{MFC::MODULES} + + +#> +#> The MFC prologue sets up the environment required to run MFC prior to +#> execution and starts the timer. +#> +{MFC::PROLOGUE} + + +#> +#> Iterate over all MFC binaries (as specified through --targets) and execute +#> them, one by one, with profiling enabled if requested. 
+#> +for binpath in {MFC::BINARIES}; do + + echo -e ":) Running $binpath:" + echo "" + + jsrun \ + {'--smpiargs="-gpu"' if gpu else ''} \ + --nrs {tasks_per_node*nodes} \ + --cpu_per_rs 1 \ + --gpu_per_rs {1 if gpu else 0} \ + --tasks_per_rs 1 \ + {MFC::PROFILER} "$binpath" + + echo "" + +done + + +#> +#> The MFC epilogue stops the timer and prints the execution summary. It also +#> performs some cleanup and housekeeping tasks before exiting. +#> +{MFC::EPILOGUE} diff --git a/toolchain/templates/generic/pbs.sh b/toolchain/templates/generic/pbs.sh new file mode 100644 index 0000000000..215358c226 --- /dev/null +++ b/toolchain/templates/generic/pbs.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +#> +#> This file is part of the ./mfc.sh run subsystem. Expressions enclosed in +#> curly braces are expanded and evaluated using Python's eval() function and +#> data from ./mfc.sh run. The resulting file is submitted to the queue system. +#> +#PBS -N {name} +#PBS -l nodes={nodes}:ppn={tasks_per_node} +#PBS -A {account} +#PBS -l walltime={walltime} +#PBS -q {partition} +#PBS -M {email} +#> +#> Note: The following options aren't enabled by default. +#> They serve as a guide to users that wish to pass +#> more options to the batch system. +#> + + +#> +#> Load the same modules as the ones currently loaded in the login shell. These +#> are usually the ones used to compile MFC. +#> +{MFC::MODULES} + + +#> +#> The MFC prologue sets up the environment required to run MFC prior to +#> execution and starts the timer. +#> +{MFC::PROLOGUE} + + +#> +#> Iterate over all MFC binaries (as specified through --targets) and execute +#> them, one by one, with profiling enabled if requested. 
+#> +for binpath in {MFC::BINARIES}; do + + echo -e ":) Running $binpath:" + + if command -v srun > /dev/null 2>&1; then + srun \ + --nodes {nodes} \ + --ntasks-per-node {tasks_per_node} \ + {MFC::PROFILER} "$binpath" + + #> + #> srun --mpi=pmix \ + #> {MFC::PROFILER} "$binpath" + else + mpirun \ + -np {tasks_per_node*nodes} \ + {MFC::PROFILER} "$binpath" + + fi + +done + + +#> +#> The MFC epilogue stops the timer and prints the execution summary. It also +#> performs some cleanup and housekeeping tasks before exiting. +#> +{MFC::EPILOGUE} diff --git a/toolchain/templates/generic/slurm.sh b/toolchain/templates/generic/slurm.sh new file mode 100644 index 0000000000..7e19068b3c --- /dev/null +++ b/toolchain/templates/generic/slurm.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash +#> +#> This file is part of the ./mfc.sh run subsystem. Expressions enclosed in +#> curly braces are expanded and evaluated using Python's eval() function and +#> data from ./mfc.sh run. The resulting file is submitted to the queue system. +#> +#SBATCH --job-name="{name}" +#SBATCH --nodes={nodes} +#SBATCH --ntasks-per-node={tasks_per_node} +#SBATCH --cpus-per-task=1 +#SBATCH --gpu-bind=verbose,closest +#SBATCH --gpus=v100-16:{(1 if gpu else 0)*tasks_per_node*nodes} +#SBATCH --time={walltime} +#SBATCH --partition="{partition}" +#SBATCH --output="{name}.out" +#SBATCH --account="{account}" +#SBATCH --error="{name}.err" +#SBATCH --mail-user="{email}" +#SBATCH --export=ALL +#SBATCH --mail-type="BEGIN, END, FAIL" +#> +#> Note: The following options aren't enabled by default. +#> They serve as a guide to users that wish to pass +#> more options to the batch system. +#> +#> #SBATCH --mem=... +#> #SBATCH --constraint="lustre" +#> #SBATCH --gpus-per-task={1 if gpu else 0} + + +#> +#> Load the same modules as the ones currently loaded in the login shell. These +#> are usually the ones used to compile MFC. 
+#> +{MFC::MODULES} + + +#> +#> The MFC prologue sets up the environment required to run MFC prior to +#> execution and starts the timer. +#> +{MFC::PROLOGUE} + + +#> +#> Iterate over all MFC binaries (as specified through --targets) and execute +#> them, one by one, with profiling enabled if requested. +#> +for binpath in {MFC::BINARIES}; do + + echo -e ":) Running $binpath:" + + if command -v srun > /dev/null 2>&1; then + srun \ + --nodes {nodes} \ + --ntasks-per-node {tasks_per_node} \ + {MFC::PROFILER} "$binpath" + + #> + #> srun --mpi=pmix \ + #> {MFC::PROFILER} "$binpath" + #> + else + mpirun \ + -np {nodes*tasks_per_node} \ + {MFC::PROFILER} "$binpath" + fi + +done + + +#> +#> The MFC epilogue stops the timer and prints the execution summary. It also +#> performs some cleanup and housekeeping tasks before exiting. +#> +{MFC::EPILOGUE} diff --git a/toolchain/templates/lsf.sh b/toolchain/templates/lsf.sh deleted file mode 100644 index 1db260b0ba..0000000000 --- a/toolchain/templates/lsf.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env bash -#> -#> - LSF Batch File Template - -#> -#> This file is part of the ./mfc.sh run subsystem. -#> For more information, please consult the README. -#> -#> - You are invited to modify this file to suit your -#> needs, in order to get MFC running properly on -#> your system. -#> -#> - Lines that begin with "#>" are ignored and won't -#> figure in the final batch script, not even as a -#> comment. -#> -#> - Statements of the form `${expression}` are string- -#> -replaced by mfc.sh run to provide runtime parameters, -#> most notably execution options. They reference the -#> variables in the same format as those under the "run" -#> section of [mfc.user.yaml](mfc.user.yaml), replacing -#> "-" for "_". You can perform therein any Python operation -#> recognized by the built-in `expr()` function. 
-#> -#> - Statements of the form {MFC::expression} tell MFC -#> where to place the common code, across all batch -#> files that is required to run MFC. They are not -#> intended to be modified by users. -#> -#BSUB -J {name} -#BSUB -nnodes {nodes} -#BSUB -N -#BSUB -P {account} -#BSUB -W {walltime[:-3]} -#> -#> Note: The above expression for the walltime converts -#> the expression "hh:mm:ss" to the appropriate -#> format for the batch system ("hh:mm"). It is -#> a python expression evaluated at runtime. -#> -#> -#> Note: The following options aren't enabled by default. -#> They serve as a guide to users that wish to pass -#> more options to the batch system. -#> - - - -#> -#> Note: If your system requires you to load environment -#> modules inside of your batch script, please load -#> them bellow. -#> - - - -#> -#> Note: The MFC prologue sets up the environment required -#> prior to execution. -#> -{MFC::PROLOGUE} - -#> -#> Note: This MPI executable might not be well supported -#> on your system - if at all. {MFC::BIN} refers to -#> the path the MFC executable. -#> - -for binpath in {MFC::BINARIES}; do - - echo -e ":) Running $binpath:" - echo "" - - jsrun \ - {'--smpiargs="-gpu"' if gpu else ''} \ - --nrs {tasks_per_node*nodes} \ - --cpu_per_rs 1 \ - --gpu_per_rs {1 if gpu else 0} \ - --tasks_per_rs 1 \ - {MFC::PROFILER} "$binpath" - - echo "" - -done - -{MFC::EPILOGUE} -#> -#> Note: Lines after the MFC Epilogue will not be executed. -#> diff --git a/toolchain/templates/pbs.sh b/toolchain/templates/pbs.sh deleted file mode 100644 index 839b9fe3cf..0000000000 --- a/toolchain/templates/pbs.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env bash -#> -#> - PBS Batch File Template - -#> -#> This file is part of the ./mfc.sh run subsystem. -#> For more information, please consult the README. -#> -#> - You are invited to modify this file to suit your -#> needs, in order to get MFC running properly on -#> your system. 
-#> -#> - Lines that begin with "#>" are ignored and won't -#> figure in the final batch script, not even as a -#> comment. -#> -#> - Statements of the form `${expression}` are string- -#> -replaced by mfc.sh run to provide runtime parameters, -#> most notably execution options. They reference the -#> variables in the same format as those under the "run" -#> section of [mfc.user.yaml](mfc.user.yaml), replacing -#> "-" for "_". You can perform therein any Python operation -#> recognized by the built-in `expr()` function. -#> -#> - Statements of the form {MFC::expression} tell MFC -#> where to place the common code, across all batch -#> files that is required to run MFC. They are not -#> intended to be modified by users. -#> -#PBS -N {name} -#PBS -l nodes={nodes}:ppn={tasks_per_node} -#PBS -A {account} -#PBS -l walltime={walltime} -#PBS -q {partition} -#PBS -M {email} -#> -#> Note: The following options aren't enabled by default. -#> They serve as a guide to users that wish to pass -#> more options to the batch system. -#> - - - - -#> -#> Note: If your system requires you to load environment -#> modules inside of your batch script, please load -#> them bellow. -#> - - - -#> -#> Note: The MFC prologue sets up the environment required -#> prior to execution. -#> -{MFC::PROLOGUE} - -#> -#> Note: This MPI executable might not be well supported -#> on your system - if at all. {MFC::BIN} refers to -#> the path the MFC executable. -#> - -for binpath in {MFC::BINARIES}; do - - echo -e ":) Running $binpath:" - - if command -v srun > /dev/null 2>&1; then - srun \ - --nodes {nodes} \ - --ntasks-per-node {tasks_per_node} \ - {MFC::PROFILER} "$binpath" - - #> - #> srun --mpi=pmix \ - #> {MFC::PROFILER} "$binpath" - else - mpirun \ - -np {tasks_per_node*nodes} \ - {MFC::PROFILER} "$binpath" - - fi - -done - -{MFC::EPILOGUE} -#> -#> Note: Lines after the MFC Epilogue will not be executed. 
-#> - diff --git a/toolchain/templates/slurm.sh b/toolchain/templates/slurm.sh deleted file mode 100644 index 11982cb408..0000000000 --- a/toolchain/templates/slurm.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env bash -#> -#> - SLURM Batch File Template - -#> -#> This file is part of the ./mfc.sh run subsystem. -#> For more information, please consult the README. -#> -#> - You are invited to modify this file to suit your -#> needs, in order to get MFC running properly on -#> your system. -#> -#> - Lines that begin with "#>" are ignored and won't -#> figure in the final batch script, not even as a -#> comment. -#> -#> - Statements of the form `${expression}` are string- -#> -replaced by mfc.sh run to provide runtime parameters, -#> most notably execution options. They reference the -#> variables in the same format as those under the "run" -#> section of [mfc.user.yaml](mfc.user.yaml), replacing -#> "-" for "_". You can perform therein any Python operation -#> recognized by the built-in `expr()` function. -#> -#> - Statements of the form {MFC::expression} tell MFC -#> where to place the common code, across all batch -#> files that is required to run MFC. They are not -#> intended to be modified by users. -#> -#SBATCH --job-name="{name}" -#SBATCH --nodes={nodes} -#SBATCH --ntasks-per-node={tasks_per_node} -#SBATCH --cpus-per-task=1 -#SBATCH --gpu-bind=verbose,closest -#SBATCH --gpus=v100-16:{(1 if gpu else 0)*tasks_per_node*nodes} -#SBATCH --time={walltime} -#SBATCH --partition="{partition}" -#SBATCH --output="{name}.out" -#SBATCH --account="{account}" -#SBATCH --error="{name}.err" -#SBATCH --mail-user="{email}" -#SBATCH --export=ALL -#SBATCH --mail-type="BEGIN, END, FAIL" -#> -#> Note: The following options aren't enabled by default. -#> They serve as a guide to users that wish to pass -#> more options to the batch system. -#> -#> #SBATCH --mem=... 
-#> #SBATCH --constraint="lustre" -#> #SBATCH --gpus-per-task={1 if gpu else 0} - - -#> -#> Note: If your system requires you to load environment -#> modules inside of your batch script, please load -#> them bellow. -#> - - -#> -#> Note: The MFC prologue sets up the environment required -#> prior to execution. -#> -{MFC::PROLOGUE} - - -#> -#> Note: This MPI executable might not be well supported -#> on your system - if at all. {MFC::BIN} refers to -#> the path the MFC executable. -#> - -for binpath in {MFC::BINARIES}; do - - echo -e ":) Running $binpath:" - - if command -v srun > /dev/null 2>&1; then - srun \ - --nodes {nodes} \ - --ntasks-per-node {tasks_per_node} \ - {MFC::PROFILER} "$binpath" - - #> - #> srun --mpi=pmix \ - #> {MFC::PROFILER} "$binpath" - #> - else - mpirun \ - -np {nodes*tasks_per_node} \ - {MFC::PROFILER} "$binpath" - fi - -done - -{MFC::EPILOGUE} - -#> -#> Note: Lines after the MFC Epilogue will not be executed. -#>