From 2e1ff0fc206df6cb381b69cdabc986be490c19a7 Mon Sep 17 00:00:00 2001 From: Henry LE BERRE Date: Sat, 6 Jan 2024 01:26:10 -0500 Subject: [PATCH] Batch files per computer (#240 & #287) --- toolchain/mfc/args.py | 59 ++-- toolchain/mfc/common.py | 11 +- toolchain/mfc/run/engines.py | 347 ---------------------- toolchain/mfc/run/mpi_bins.py | 120 -------- toolchain/mfc/run/queues.py | 39 ++- toolchain/mfc/run/run.py | 117 ++++++-- toolchain/requirements.txt | 1 + toolchain/templates/bridges2.mako | 0 toolchain/templates/default.mako | 53 ++++ toolchain/templates/include/epilogue.mako | 16 + toolchain/templates/include/modules.mako | 3 + toolchain/templates/include/prologue.mako | 32 ++ toolchain/templates/lsf.sh | 88 ------ toolchain/templates/pbs.sh | 90 ------ toolchain/templates/phoenix.mako | 56 ++++ toolchain/templates/slurm.sh | 99 ------ toolchain/templates/summit.mako | 46 +++ 17 files changed, 356 insertions(+), 821 deletions(-) delete mode 100644 toolchain/mfc/run/engines.py delete mode 100644 toolchain/mfc/run/mpi_bins.py create mode 100644 toolchain/templates/bridges2.mako create mode 100644 toolchain/templates/default.mako create mode 100644 toolchain/templates/include/epilogue.mako create mode 100644 toolchain/templates/include/modules.mako create mode 100644 toolchain/templates/include/prologue.mako delete mode 100644 toolchain/templates/lsf.sh delete mode 100644 toolchain/templates/pbs.sh create mode 100644 toolchain/templates/phoenix.mako delete mode 100644 toolchain/templates/slurm.sh create mode 100644 toolchain/templates/summit.mako diff --git a/toolchain/mfc/args.py b/toolchain/mfc/args.py index eeff424654..1855d01057 100644 --- a/toolchain/mfc/args.py +++ b/toolchain/mfc/args.py @@ -1,12 +1,11 @@ import re, os.path, argparse, dataclasses -from .build import TARGETS, DEFAULT_TARGETS, DEPENDENCY_TARGETS -from .common import MFCException, format_list_to_string -from .test.cases import generate_cases -from .run.engines import ENGINES -from 
.run.mpi_bins import BINARIES +from .run.run import get_baked_templates +from .build import TARGETS, DEFAULT_TARGETS, DEPENDENCY_TARGETS +from .common import MFCException, format_list_to_string +from .test.cases import generate_cases -# pylint: disable=too-many-locals, too-many-statements +# pylint: disable=too-many-locals, too-many-branches, too-many-statements def parse(config): parser = argparse.ArgumentParser( prog="./mfc.sh", @@ -75,8 +74,6 @@ def add_common_arguments(p, mask = None): # === CLEAN === add_common_arguments(clean, "jg") - binaries = [ b.bin for b in BINARIES ] - # === TEST === test_cases = generate_cases() @@ -100,29 +97,28 @@ def add_common_arguments(p, mask = None): test.add_argument(metavar="FORWARDED", default=[], dest="--", nargs="*", help="Arguments to forward to the ./mfc.sh run invocations.") # === RUN === - engines = [ e.slug for e in ENGINES ] - add_common_arguments(run) - run.add_argument("input", metavar="INPUT", type=str, help="Input file to run.") - run.add_argument("arguments", metavar="ARGUMENTS", nargs="*", type=str, default=[], help="Additional arguments to pass to the case file.") - run.add_argument("-e", "--engine", choices=engines, type=str, default=engines[0], help="Job execution/submission engine choice.") + run.add_argument("input", metavar="INPUT", type=str, help="Input file to run.") + run.add_argument("arguments", metavar="ARGUMENTS", nargs="*", type=str, default=[], help="Additional positional arguments to pass to the case file.") + run.add_argument("-e", "--engine", choices=["interactive", "batch"], type=str, default="interactive", help="Job execution/submission engine choice.") run.add_argument("--output-summary", type=str, default=None, help="(Interactive) Output a YAML summary file.") - run.add_argument("-p", "--partition", metavar="PARTITION", type=str, default="", help="(Batch) Partition for job submission.") - run.add_argument("-N", "--nodes", metavar="NODES", type=int, default=1, help="(Batch) Number of 
nodes.") - run.add_argument("-n", "--tasks-per-node", metavar="TASKS", type=int, default=1, help="Number of tasks per node.") - run.add_argument("-w", "--walltime", metavar="WALLTIME", type=str, default="01:00:00", help="(Batch) Walltime.") - run.add_argument("-a", "--account", metavar="ACCOUNT", type=str, default="", help="(Batch) Account to charge.") - run.add_argument("-@", "--email", metavar="EMAIL", type=str, default="", help="(Batch) Email for job notification.") - run.add_argument("-#", "--name", metavar="NAME", type=str, default="MFC", help="(Batch) Job name.") - run.add_argument("-b", "--binary", choices=binaries, type=str, default=None, help="(Interactive) Override MPI execution binary") - run.add_argument("-s", "--scratch", action="store_true", default=False, help="Build from scratch.") - run.add_argument("--ncu", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Compute.") - run.add_argument("--nsys", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Systems.") - run.add_argument( "--dry-run", action="store_true", default=False, help="(Batch) Run without submitting batch file.") - run.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.") - run.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.") - run.add_argument("--wait", action="store_true", default=False, help="(Batch) Wait for the job to finish.") - run.add_argument("-f", "--flags", metavar="FLAGS", dest="--", nargs=argparse.REMAINDER, type=str, default=[], help="(Interactive) Arguments to forward to the MPI invocation.") + run.add_argument("-p", "--partition", metavar="PARTITION", type=str, default="", help="(Batch) Partition for job submission.") + run.add_argument("-q", "--quality_of_service", metavar="QOS", type=str, default="", help="(Batch) Quality of Service for job submission.") + run.add_argument("-N", 
"--nodes", metavar="NODES", type=int, default=1, help="(Batch) Number of nodes.") + run.add_argument("-n", "--tasks-per-node", metavar="TASKS", type=int, default=1, help="Number of tasks per node.") + run.add_argument("-w", "--walltime", metavar="WALLTIME", type=str, default="01:00:00", help="(Batch) Walltime.") + run.add_argument("-a", "--account", metavar="ACCOUNT", type=str, default="", help="(Batch) Account to charge.") + run.add_argument("-@", "--email", metavar="EMAIL", type=str, default="", help="(Batch) Email for job notification.") + run.add_argument("-#", "--name", metavar="NAME", type=str, default="MFC", help="(Batch) Job name.") + run.add_argument("-s", "--scratch", action="store_true", default=False, help="Build from scratch.") + run.add_argument("--ncu", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Compute.") + run.add_argument("--nsys", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Systems.") + run.add_argument( "--dry-run", action="store_true", default=False, help="(Batch) Run without submitting batch file.") + run.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.") + run.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.") + run.add_argument("--wait", action="store_true", default=False, help="(Batch) Wait for the job to finish.") + run.add_argument("-f", "--flags", metavar="FLAGS", dest="--", nargs=argparse.REMAINDER, type=str, default=[], help="(Interactive) Arguments to forward to the MPI invocation.") + run.add_argument("-c", "--computer", metavar="COMPUTER", type=str, default="default", help=f"(Batch) Path to a custom submission file template or one of {format_list_to_string(list(get_baked_templates().keys()))}.") # === BENCH === add_common_arguments(bench, "t") @@ -153,10 +149,11 @@ def add_common_arguments(p, mask = None): # "Slugify" the name 
of the job args["name"] = re.sub(r'[\W_]+', '-', args["name"]) - # build's --case-optimization and --input depend on each other + # We need to check for some invalid combinations of arguments because of + # the limitations of argparse. if args["command"] == "build": if (args["input"] is not None) ^ args["case_optimization"] : - raise MFCException("./mfc.sh build's --case-optimization requires --input") + raise MFCException("./mfc.sh build's --case-optimization and --input must be used together.") # Input files to absolute paths for e in ["input", "input1", "input2"]: diff --git a/toolchain/mfc/common.py b/toolchain/mfc/common.py index 2e3a4d7cae..b51f7d5416 100644 --- a/toolchain/mfc/common.py +++ b/toolchain/mfc/common.py @@ -5,9 +5,10 @@ from .printer import cons -MFC_ROOTDIR = normpath(f"{dirname(realpath(__file__))}/../..") +MFC_ROOTDIR = abspath(normpath(f"{dirname(realpath(__file__))}/../..")) MFC_TESTDIR = abspath(f"{MFC_ROOTDIR}/tests") MFC_SUBDIR = abspath(f"{MFC_ROOTDIR}/build") +MFC_TEMPLATEDIR = abspath(f"{MFC_ROOTDIR}/toolchain/templates") MFC_LOCK_FILEPATH = abspath(f"{MFC_SUBDIR}/lock.yaml") MFC_BENCH_FILEPATH = abspath(f"{MFC_ROOTDIR}/toolchain/bench.yaml") @@ -179,14 +180,6 @@ def does_system_use_modules() -> bool: return does_command_exist("module") -def get_loaded_modules() -> typing.List[str]: - """ - Returns a list of loaded modules. 
- """ - - return [ l for l in subprocess.getoutput("module -t list").splitlines() if ' ' not in l ] - - def is_number(x: str) -> bool: if x is None: return False diff --git a/toolchain/mfc/run/engines.py b/toolchain/mfc/run/engines.py deleted file mode 100644 index dc8f431a48..0000000000 --- a/toolchain/mfc/run/engines.py +++ /dev/null @@ -1,347 +0,0 @@ -import re, os, time, copy, typing, datetime, subprocess, dataclasses, multiprocessing - -from ..state import ARG, ARGS -from ..printer import cons -from ..build import MFCTarget, SYSCHECK, get_targets -from ..common import MFCException, does_command_exist, isspace, system -from ..common import format_list_to_string, does_system_use_modules -from ..common import get_loaded_modules, file_write -from ..run import queues, mpi_bins -from ..run.input import MFCInputFile - - -def profiler_prepend(): - if ARG("ncu") is not None: - if not does_command_exist("ncu"): - raise MFCException("Failed to locate [bold green]NVIDIA Nsight Compute[/bold green] (ncu).") - - return ["ncu", "--nvtx", "--mode=launch-and-attach", - "--cache-control=none", "--clock-control=none"] + ARG("ncu") - - if ARG("nsys") is not None: - if not does_command_exist("nsys"): - raise MFCException("Failed to locate [bold green]NVIDIA Nsight Systems[/bold green] (nsys).") - - return ["nsys", "profile", "--stats=true", "--trace=mpi,nvtx,openacc"] + ARG("nsys") - - return [] - - -@dataclasses.dataclass(init=False) -class Engine: - name: str - slug: str - input: MFCInputFile - - def __init__(self, name: str, slug: str) -> None: - self.name = name - self.slug = slug - - def init(self, _input: MFCInputFile) -> None: - self.input = _input - - self._init() - - def _init(self) -> None: - pass - - def get_args(self) -> typing.List[str]: - raise MFCException(f"MFCEngine::get_args: not implemented for {self.name}.") - - def run(self, targets: typing.List[MFCTarget]) -> None: - raise MFCException(f"MFCEngine::run: not implemented for {self.name}.") - - -def 
_interactive_working_worker(cmd: typing.List[str], q: multiprocessing.Queue): - """ Runs a command and puts the result in a queue. """ - cmd = [ str(_) for _ in cmd ] - cons.print(f"$ {' '.join(cmd)}") - result = subprocess.run( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False) - q.put(result) - -class InteractiveEngine(Engine): - def __init__(self) -> None: - super().__init__("Interactive", "interactive") - - # pylint: disable=attribute-defined-outside-init - def _init(self) -> None: - self.mpibin = mpi_bins.get_binary() - - # If using MPI, we don't know yet whether this engine works - self.bKnowWorks = not ARG("mpi") - - def get_args(self) -> str: - return f"""\ -Nodes (-N) {ARG('nodes')} -Tasks (/node) (-n) {ARG('tasks_per_node')} -MPI Binary (-b) {self.mpibin.bin}\ -""" - - - def __get_exec_cmd(self, target: MFCTarget) -> typing.List[str]: - cmd = [] - if ARG("mpi"): - cmd += [self.mpibin.bin] + self.mpibin.gen_params() + ARG("--")[:] - - cmd += profiler_prepend() - - cmd.append(target.get_install_binpath()) - - return cmd - - def run(self, targets) -> None: - targets = get_targets(targets) - - if not self.bKnowWorks: - # Fix MFlowCode/MFC#21: Check whether attempting to run a job will hang - # forever. This can happen when using the wrong queue system. - - work_timeout = 30 - - cons.print(f"Ensuring the [bold magenta]Interactive Engine[/bold magenta] works ({work_timeout}s timeout) via [bold magenta]syscheck[/bold magenta]:") - cons.print() - cons.indent() - - q = multiprocessing.Queue() - p = multiprocessing.Process( - target=_interactive_working_worker, - args=( - [self.mpibin.bin] + self.mpibin.gen_params() + [os.sep.join([SYSCHECK.get_install_dirpath(), "bin", "syscheck"])], - q, - )) - - p.start() - p.join(work_timeout) - - if p.is_alive(): - raise MFCException("""\ -The [bold magenta]Interactive Engine[/bold magenta] appears to hang. -This may indicate that the wrong MPI binary is being used to launch parallel jobs. 
You can specify the correct one for your system -using the <-b,--binary> option. For example: -* ./mfc.sh run -b mpirun -* ./mfc.sh run -b srun -""") - - result = q.get(block=False) - self.bKnowWorks = result.returncode == 0 - - if not self.bKnowWorks: - error_txt = """\ -MFC's [bold magenta]syscheck[/bold magenta] (system check) failed to run successfully. -Please review the output bellow and ensure that your system is configured correctly: - -""" - - if result is not None: - error_txt += f"""\ -STDOUT: -{result.stdout} - -STDERR: -{result.stderr} -""" - else: - error_txt += f"Evaluation timed out after {work_timeout}s." - - raise MFCException(error_txt) - - cons.print() - cons.unindent() - - for target in targets: - cons.print(f"[bold]Running [magenta]{target.name}[/magenta][/bold]:") - cons.indent() - - if not ARG("dry_run"): - start_time = time.monotonic() - env = os.environ.copy() - if ARG('gpus') is not None: - env['CUDA_VISIBLE_DEVICES'] = ','.join([str(_) for _ in ARG('gpus')]) - - system( - self.__get_exec_cmd(target), cwd=self.input.case_dirpath, - env=env - ) - end_time = time.monotonic() - cons.print(no_indent=True) - - cons.print(f"[bold green]Done[/bold green] (in {datetime.timedelta(seconds=end_time - start_time)})") - - cons.print() - cons.unindent() - - -class BatchEngine(Engine): - def __init__(self) -> None: - super().__init__("Batch", "batch") - - def get_args(self) -> str: - return f"""\ -Nodes (-N) {ARG('nodes')} -Tasks (/node) (-n) {ARG('tasks_per_node')} -Walltime (-w) {ARG("walltime")} -Partition (-p) {ARG("partition")} -Account (-a) {ARG("account")} -Email (-@) {ARG("email")} -""" - - def run(self, targets) -> None: - qsystem = queues.get_system() - cons.print(f"Detected the [bold magenta]{qsystem.name}[/bold magenta] queue system.") - - targets = get_targets([SYSCHECK] + targets) - - cons.print(f"Running {format_list_to_string([_.name for _ in targets], 'bold magenta')}:") - cons.indent() - - self.__create_batch_file(qsystem, targets) - - 
if not ARG("dry_run"): - self.__execute_batch_file(qsystem) - - cons.print("[bold yellow]INFO:[/bold yellow] Batch file submitted! Please check your queue system for the job status.") - cons.print("[bold yellow]INFO:[/bold yellow] If an error occurs, please check the generated batch file and error logs for more information.") - cons.print("[bold yellow]INFO:[/bold yellow] You can modify the template batch file to your needs.") - - cons.unindent() - - def __get_batch_dirpath(self) -> str: - return copy.copy(self.input.case_dirpath) - - def __get_batch_filename(self) -> str: - return f"{ARG('name')}.sh" - - def __get_batch_filepath(self): - return os.path.abspath(os.sep.join([ - self.__get_batch_dirpath(), - self.__get_batch_filename() - ])) - - def __generate_prologue(self, qsystem: queues.QueueSystem) -> str: - modules = f"" - - if does_system_use_modules(): - modules = f"""\ -printf ":) Loading modules...\\n" - -module purge -module load {' '.join(get_loaded_modules())} -""" - - return f"""\ -TABLE_FORMAT_LINE="| - %-14s %-35s - %-14s %-35s |\\n" -TABLE_HEADER="+-----------------------------------------------------------------------------------------------------------+ \\n" -TABLE_FOOTER="+-----------------------------------------------------------------------------------------------------------+ \\n" -TABLE_TITLE_FORMAT="| %8s %-96s |\\n" -TABLE_CONTENT=$(cat <<-END -$(printf "$TABLE_FORMAT_LINE" "Start-time:" "$(date +%T)" "Start-date:" "$(date +%T)") -$(printf "$TABLE_FORMAT_LINE" "Partition:" "{ARG("partition")}" "Walltime:" "{ARG("walltime")}") -$(printf "$TABLE_FORMAT_LINE" "Account:" "{ARG("account")}" "Nodes:" "{ARG("nodes")}") -$(printf "$TABLE_FORMAT_LINE" "Job Name:" "{ARG("name")}" "Engine" "{ARG("engine")}") -$(printf "$TABLE_FORMAT_LINE" "Queue System:" "{qsystem.name}" "Email:" "{ARG("email")}") -END -) - -printf "$TABLE_HEADER" -printf "$TABLE_TITLE_FORMAT" "Starting" "{ARG("name")} from {ARG("input")}:" -printf "$TABLE_CONTENT\\n" -printf 
"$TABLE_FOOTER\\n" - -{modules} - -cd "{self.input.case_dirpath}" - -t_start=$(date +%s) -""" - - def __generate_epilogue(self) -> str: - return f"""\ -code=$? - -t_stop="$(date +%s)" - -printf "\\n$TABLE_HEADER" -printf "$TABLE_TITLE_FORMAT" "Finished" "{ARG("name")}:" -printf "$TABLE_FORMAT_LINE" "Total-time:" "$(expr $t_stop - $t_start)s" "Exit Code:" "$code" -printf "$TABLE_FORMAT_LINE" "End-time:" "$(date +%T)" "End-date:" "$(date +%T)" -printf "$TABLE_FOOTER" - -exit $code -""" - - def __evaluate_expression(self, expr: str) -> str: - # See if it computable - try: - # We assume eval is safe because we control the expression. - # pylint: disable=eval-used - r = str(eval(expr, ARGS())) - return r if not isspace(r) else None - except Exception as exc: - raise MFCException(f"BatchEngine: '{expr}' is not a valid expression in the template file. Please check your spelling.") from exc - - def __batch_evaluate(self, s: str, qsystem: queues.QueueSystem, targets): - targets = get_targets(targets) - - replace_list = [ - ("{MFC::PROLOGUE}", self.__generate_prologue(qsystem)), - ("{MFC::PROFILER}", ' '.join(profiler_prepend())), - ("{MFC::EPILOGUE}", self.__generate_epilogue()), - ("{MFC::BINARIES}", ' '.join([f"'{target.get_install_binpath()}'" for target in targets])), - ] - - for (key, value) in replace_list: - s = s.replace(key, value) - - # Remove "#>" comments & redundant newlines - s = re.sub(r"^#>.*\n", "", s, flags=re.MULTILINE) - s = re.sub(r"^\n{2,}", "\n", s, flags=re.MULTILINE) - - # Evaluate expressions of the form "{expression}" - for match in re.findall(r"{[^\{]+}", s, flags=re.MULTILINE): - repl = self.__evaluate_expression(match[1:-1]) - - if repl is not None: - s = s.replace(match, repl) - else: - # If not specified, then remove the line it appears on - s = re.sub(rf"^.*{match}.*$\n", "", s, flags=re.MULTILINE) - - cons.print(f"> > [bold yellow]Warning:[/bold yellow] [magenta]{match[1:-1]}[/magenta] was not specified. 
Thus, any line it appears on will be discarded.") - - return s - - def __create_batch_file(self, qsystem: queues.QueueSystem, targets: typing.List[MFCTarget]): - cons.print("> Generating batch file...") - filepath = self.__get_batch_filepath() - cons.print("> Evaluating template file...") - content = self.__batch_evaluate(qsystem.template, qsystem, targets) - - cons.print("> Writing batch file...") - file_write(filepath, content) - - def __execute_batch_file(self, qsystem: queues.QueueSystem): - # We CD to the case directory before executing the batch file so that - # any files the queue system generates (like .err and .out) are created - # in the correct directory. - cmd = qsystem.gen_submit_cmd(self.__get_batch_filename()) - - if system(cmd, cwd=self.__get_batch_dirpath()) != 0: - raise MFCException(f"Submitting batch file for {qsystem.name} failed. It can be found here: {self.__get_batch_filepath()}. Please check the file for errors.") - - -ENGINES = [ InteractiveEngine(), BatchEngine() ] - -def get_engine(slug: str) -> Engine: - engine: Engine = None - for candidate in ENGINES: - candidate: Engine - - if candidate.slug == slug: - engine = candidate - break - - if engine is None: - raise MFCException(f"Unsupported engine {slug}.") - - return engine diff --git a/toolchain/mfc/run/mpi_bins.py b/toolchain/mfc/run/mpi_bins.py deleted file mode 100644 index 2d39a4fd0f..0000000000 --- a/toolchain/mfc/run/mpi_bins.py +++ /dev/null @@ -1,120 +0,0 @@ -import typing, dataclasses - -from .. import common -from ..state import ARG - -# Note: This file is now only used when running -# in -e interactive mode. 
- -@dataclasses.dataclass -class MPIBinary: - name: str - bin: str - - def is_present(self) -> bool: - return common.does_command_exist(self.bin) - - def gen_params(self) -> typing.List[str]: - raise common.MFCException(f"MPIBinary::gen_params <{self.name}> not implemented.") - - -class NOMPIBIN(MPIBinary): - def __init__(self): - super().__init__("N/A", "N/A") - - def is_present(self) -> bool: - return not ARG("mpi") - - def gen_params(self) -> typing.List[str]: - return [] - - -class JSRUN(MPIBinary): - def __init__(self): - super().__init__("IBM's JSRUN", "jsrun") - - def gen_params(self) -> typing.List[str]: - # ORNL Summit: https://docs.olcf.ornl.gov/systems/summit_user_guide.html?highlight=lsf#launching-a-job-with-jsrun - # We create one resource-set per CPU(Core)/GPU pair. - nrs=ARG("tasks_per_node")*ARG("nodes") - cores_per_rs=1 - gpus_per_rs=min(ARG("tasks_per_node"), 1) - tasks_per_rs=1 - - arguments=[ - '--nrs', nrs, - '--cpu_per_rs', cores_per_rs, - '--gpu_per_rs', gpus_per_rs, - '--tasks_per_rs', tasks_per_rs - ] - - if gpus_per_rs >= 1: - arguments.append('--smpiargs=-gpu') - - return arguments - - -class SRUN(MPIBinary): - def __init__(self): - super().__init__("SLURM's SRUN", "srun") - - def gen_params(self) -> typing.List[str]: - params = ['--ntasks-per-node', ARG("tasks_per_node")] - - if ARG("nodes") != 1: - params += ['-N', ARG("nodes")] - - # MFC binds its GPUs on its own, as long as they have been allocated - # by the system's scheduler, or are present on your local machine, - # if running in serial mode. 
- - if not common.isspace(ARG("account")): - params += ['-A', ARG("account")] - - if not common.isspace(ARG("partition")): - params += ['-p', ARG("partition")] - - return params - - -class MPIEXEC(MPIBinary): - def __init__(self): - super().__init__("MPIEXEC", "mpiexec") - - def gen_params(self) -> str: - return ["-np", ARG("tasks_per_node")*ARG("nodes")] - - -class MPIRUN(MPIBinary): - def __init__(self): - super().__init__("MPIRUN", "mpirun") - - def gen_params(self) -> str: - return ["-np", ARG("tasks_per_node")*ARG("nodes")] - - -# In descending order of priority (if no override present) -BINARIES: list = [ JSRUN(), SRUN(), MPIRUN(), MPIEXEC(), NOMPIBIN() ] - -def get_binary(exclude: typing.List[str] = None) -> MPIBinary: - if exclude is None: - exclude = [] - - binaries = [ - b for b in BINARIES if b.is_present() and b.bin not in exclude - ] - - if len(binaries) == 0: - raise common.MFCException("No MPI binary found.") - - # Handle user override - if ARG("binary") is not None: - for binary in binaries: - binary: MPIBinary - - if binary.bin == ARG("binary"): - return binary - - raise common.MFCException(f"MPI Binary <{ARG('binary')}> not found.") - - return binaries[0] diff --git a/toolchain/mfc/run/queues.py b/toolchain/mfc/run/queues.py index acb833888c..bb8a17b453 100644 --- a/toolchain/mfc/run/queues.py +++ b/toolchain/mfc/run/queues.py @@ -1,16 +1,15 @@ -import os, typing, dataclasses +import typing, dataclasses -from mfc import common +from mfc import common from ..state import ARG + @dataclasses.dataclass class QueueSystem: - name: str - template: str + name: str - def __init__(self, name: str, filename: str) -> None: - self.name = name - self.template = common.file_read(os.sep.join(["toolchain", "templates", filename])) + def __init__(self, name: str) -> None: + self.name = name def is_active(self) -> bool: raise common.MFCException("QueueSystem::is_active: not implemented.") @@ -19,9 +18,20 @@ def gen_submit_cmd(self, filepath: str) -> 
typing.List[str]: raise common.MFCException("QueueSystem::gen_submit_cmd: not implemented.") +class InteractiveSystem(QueueSystem): + def __init__(self) -> None: + super().__init__("Interactive") + + def is_active(self) -> bool: + return True + + def gen_submit_cmd(self, filepath: str) -> typing.List[str]: + return ["/bin/bash", filepath] + + class PBSSystem(QueueSystem): def __init__(self) -> None: - super().__init__("PBS", "pbs.sh") + super().__init__("PBS") def is_active(self) -> bool: return common.does_command_exist("qsub") @@ -35,7 +45,7 @@ def gen_submit_cmd(self, filepath: str) -> typing.List[str]: class LSFSystem(QueueSystem): def __init__(self) -> None: - super().__init__("LSF", "lsf.sh") + super().__init__("LSF") def is_active(self) -> bool: return common.does_command_exist("bsub") and common.does_command_exist("bqueues") @@ -51,7 +61,7 @@ def gen_submit_cmd(self, filepath: str) -> None: class SLURMSystem(QueueSystem): def __init__(self) -> None: - super().__init__("SLURM", "slurm.sh") + super().__init__("SLURM") def is_active(self) -> bool: return common.does_command_exist("sbatch") @@ -65,11 +75,14 @@ def gen_submit_cmd(self, filepath: str) -> None: return cmd + [filepath] -QUEUE_SYSTEMS = [ LSFSystem(), SLURMSystem(), PBSSystem() ] +BATCH_SYSTEMS = [ LSFSystem(), SLURMSystem(), PBSSystem() ] def get_system() -> QueueSystem: - for system in QUEUE_SYSTEMS: + if ARG("engine") == "interactive": + return InteractiveSystem() + + for system in BATCH_SYSTEMS: if system.is_active(): return system - raise common.MFCException("Failed to detect a queue system.") + raise common.MFCException(f"Failed to detect a queue system for engine [magenta]{ARG('engine')}[/magenta].") diff --git a/toolchain/mfc/run/run.py b/toolchain/mfc/run/run.py index 2fd7ec8b48..965367976f 100644 --- a/toolchain/mfc/run/run.py +++ b/toolchain/mfc/run/run.py @@ -1,14 +1,21 @@ -import re +import re, os, typing + +from glob import glob + +from mako.lookup import TemplateLookup +from 
mako.template import Template from ..build import get_targets, build from ..printer import cons -from ..state import ARG -from ..common import MFCException, isspace +from ..state import ARG, ARGS +from ..common import MFCException, isspace, file_read, does_command_exist +from ..common import MFC_TEMPLATEDIR, file_write, system, MFC_ROOTDIR +from ..common import format_list_to_string -from . import engines, input +from . import queues, input -def validate_job_options() -> None: +def __validate_job_options() -> None: if not ARG("mpi") and any({ARG("nodes") > 1, ARG("tasks_per_node") > 1}): raise MFCException("RUN: Cannot run on more than one rank with --no-mpi.") @@ -24,30 +31,67 @@ def validate_job_options() -> None: raise MFCException(f'RUN: {ARG("email")} is not a valid e-mail address.') -def run(targets = None): - targets = get_targets(targets or ARG("targets")) +def __profiler_prepend() -> typing.List[str]: + if ARG("ncu") is not None: + if not does_command_exist("ncu"): + raise MFCException("Failed to locate [bold green]NVIDIA Nsight Compute[/bold green] (ncu).") - build(targets) + return ["ncu", "--nvtx", "--mode=launch-and-attach", + "--cache-control=none", "--clock-control=none"] + ARG("ncu") - cons.print("[bold]Run[/bold]") - cons.indent() + if ARG("nsys") is not None: + if not does_command_exist("nsys"): + raise MFCException("Failed to locate [bold green]NVIDIA Nsight Systems[/bold green] (nsys).") - input_file = input.load() + return ["nsys", "profile", "--stats=true", "--trace=mpi,nvtx,openacc"] + ARG("nsys") - engine = engines.get_engine(ARG("engine")) - engine.init(input_file) + return [] + + +def get_baked_templates() -> dict: + return { + os.path.splitext(os.path.basename(f))[0] : file_read(f) + for f in glob(os.path.join(MFC_TEMPLATEDIR, "*.mako")) + } + + +def __job_script_filepath() -> str: + return os.path.abspath(os.sep.join([ + os.path.dirname(ARG("input")), + f"{ARG('name')}.sh" + ])) + + +def __get_template() -> Template: + computer = 
ARG("computer") + lookup = TemplateLookup(directories=[MFC_TEMPLATEDIR, os.path.join(MFC_TEMPLATEDIR, "include")]) + baked = get_baked_templates() + + if (content := baked.get(computer)) is not None: + cons.print(f"Using baked-in template for [magenta]{computer}[/magenta].") + return Template(content, lookup=lookup) + + if os.path.isfile(computer): + cons.print(f"Using template from [magenta]{computer}[/magenta].") + return Template(file_read(computer), lookup=lookup) + + raise MFCException(f"Failed to find a template for --computer '{computer}'. Baked-in templates are: {format_list_to_string(list(baked.keys()), 'magenta')}.") - cons.print(f"Configuration:") - cons.indent() - cons.print(f"""\ -Input {ARG('input')} -Job Name (-#) {ARG('name')} -Engine (-e) {ARG('engine')} -{engine.get_args()}\ -""") - cons.unindent() - validate_job_options() +def __generate_job_script(targets): + content = __get_template().render( + **ARGS(), + ARG=ARG, + rootdir=MFC_ROOTDIR, + binpaths=[target.get_install_binpath() for target in targets], + profiler=__profiler_prepend(), + ) + + file_write(__job_script_filepath(), content) + + +def __generate_input_files(targets): + input_file = input.load() for target in targets: cons.print(f"Generating input files for [magenta]{target.name}[/magenta]...") @@ -57,4 +101,29 @@ def run(targets = None): cons.print() cons.unindent() - engine.run(targets) + +def __execute_job_script(qsystem: queues.QueueSystem): + # We CD to the case directory before executing the batch file so that + # any files the queue system generates (like .err and .out) are created + # in the correct directory. + cmd = qsystem.gen_submit_cmd(__job_script_filepath()) + + if system(cmd, cwd=os.path.dirname(ARG("input"))).returncode != 0: + raise MFCException(f"Submitting batch file for {qsystem.name} failed. It can be found here: {__job_script_filepath()}. 
Please check the file for errors.") + + +def run(targets = None): + targets = get_targets(targets or ARG("targets")) + + build(targets) + + cons.print("[bold]Run[/bold]") + cons.indent() + + qsystem = queues.get_system() + cons.print(f"Using queue system [magenta]{qsystem.name}[/magenta].") + + __generate_job_script(targets) + __validate_job_options() + __generate_input_files(targets) + __execute_job_script(qsystem) diff --git a/toolchain/requirements.txt b/toolchain/requirements.txt index b3495c7d58..fd2377a6df 100644 --- a/toolchain/requirements.txt +++ b/toolchain/requirements.txt @@ -1,5 +1,6 @@ rich fypp +mako wheel typing PyYAML diff --git a/toolchain/templates/bridges2.mako b/toolchain/templates/bridges2.mako new file mode 100644 index 0000000000..e69de29bb2 diff --git a/toolchain/templates/default.mako b/toolchain/templates/default.mako new file mode 100644 index 0000000000..32cf69f3ef --- /dev/null +++ b/toolchain/templates/default.mako @@ -0,0 +1,53 @@ +#!/usr/bin/env bash + +. "${rootdir}/toolchain/util.sh" + +% if engine == 'batch': + error "The$MAGENTA default$COLOR_RESET template does not support batch jobs. Please use a different template via the $MAGENTA--computer$COLOR_RESET option.\n" + exit 1 +% endif + +<%include file="prologue.mako"/> + +warn "This is the$MAGENTA default$COLOR_RESET template." +warn "It is not intended to support all systems and execution engines." +warn "Please use a different template via the $MAGENTA--computer$COLOR_RESET option." 
+echo + +% for binpath in binpaths: + echo -e ":) Running $MAGENTA${binpath}$COLOR_RESET:\n" + + % if not mpi: + ${' '.join([f"'{x}'" for x in profiler ])} "${binpath}" + % else: + if command -v jsrun > /dev/null; then + jsrun --nrs ${tasks_per_node*nodes} \ + --cpu_per_rs 1 \ + --gpu_per_rs ${1 if gpu else 0} \ + --tasks_per_rs 1 \ + ${' '.join([f"'{x}'" for x in profiler ])} \ + "${binpath}" + elif command -v srun > /dev/null; then + srun --ntasks-per-node ${tasks_per_node} \ + ${' '.join([f"'{x}'" for x in profiler ])} \ + "${binpath}" + elif command -v mpirun > /dev/null; then + mpirun -np ${nodes*tasks_per_node} \ + ${' '.join([f"'{x}'" for x in profiler ])} \ + "${binpath}" + else + echo -e "\n:( Could not find a suitable MPI launcher.\n" + exit 1 + fi + % endif + + code=$? + if [ $code -ne 0 ]; then + echo -e "\n:( $MAGENTA${binpath}$COLOR_RESET failed with exit code $MAGENTA$code$COLOR_RESET.\n" + exit 1 + fi + + echo +% endfor + +<%include file="epilogue.mako"/> diff --git a/toolchain/templates/include/epilogue.mako b/toolchain/templates/include/epilogue.mako new file mode 100644 index 0000000000..183b6b8f54 --- /dev/null +++ b/toolchain/templates/include/epilogue.mako @@ -0,0 +1,16 @@ +#> +#> The MFC epilogue stops the timer and prints the execution summary. It also +#> performs some cleanup and housekeeping tasks before exiting. +#> + +code=$? 
+ +t_stop="$(date +%s)" + +printf "$TABLE_HEADER" +printf "$TABLE_TITLE_FORMAT" "Finished" "$MAGENTA${name}$COLOR_RESET:" +printf "$TABLE_FORMAT_LINE" "Total-time:" "$(expr $t_stop - $t_start)s" "Exit Code:" "$code" +printf "$TABLE_FORMAT_LINE" "End-time:" "$(date +%T)" "End-date:" "$(date +%D)" +printf "$TABLE_FOOTER" + +exit $code \ No newline at end of file diff --git a/toolchain/templates/include/modules.mako b/toolchain/templates/include/modules.mako new file mode 100644 index 0000000000..6aed7915f1 --- /dev/null +++ b/toolchain/templates/include/modules.mako @@ -0,0 +1,3 @@ +<%def name="modules(x)"> + this is myfunc, x is ${x} +</%def> \ No newline at end of file diff --git a/toolchain/templates/include/prologue.mako b/toolchain/templates/include/prologue.mako new file mode 100644 index 0000000000..d825cc3d50 --- /dev/null +++ b/toolchain/templates/include/prologue.mako @@ -0,0 +1,32 @@ +#> +#> The MFC prologue prints a summary of the running job and starts a timer. +#> + +<%! +import os +%>\ + +. 
"${rootdir}/toolchain/util.sh" + +TABLE_FORMAT_LINE="| * %-14s $MAGENTA%-35s$COLOR_RESET * %-14s $MAGENTA%-35s$COLOR_RESET |\\n" +TABLE_HEADER="+-----------------------------------------------------------------------------------------------------------+ \\n" +TABLE_FOOTER="+-----------------------------------------------------------------------------------------------------------+ \\n" +TABLE_TITLE_FORMAT="| %-105s |\\n" +TABLE_CONTENT=$(cat <<-END +$(printf "$TABLE_FORMAT_LINE" "Start-time" "$(date +%T)" "Start-date" "$(date +%D)") +$(printf "$TABLE_FORMAT_LINE" "Partition" "${partition}" "Walltime" "${walltime}") +$(printf "$TABLE_FORMAT_LINE" "Account" "${account}" "Nodes" "${nodes}") +$(printf "$TABLE_FORMAT_LINE" "Job Name" "${name}" "Engine" "${engine}") +$(printf "$TABLE_FORMAT_LINE" "Queue System" "${qsystem.name}" "Email" "${email}") +END +) + +printf "$TABLE_HEADER" +printf "$TABLE_TITLE_FORMAT" "MFC case # ${name} @ ${input}:" +printf "$TABLE_HEADER" +printf "$TABLE_CONTENT\\n" +printf "$TABLE_FOOTER\\n" + +cd "${os.path.dirname(input)}" + +t_start=$(date +%s) \ No newline at end of file diff --git a/toolchain/templates/lsf.sh b/toolchain/templates/lsf.sh deleted file mode 100644 index 1db260b0ba..0000000000 --- a/toolchain/templates/lsf.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env bash -#> -#> - LSF Batch File Template - -#> -#> This file is part of the ./mfc.sh run subsystem. -#> For more information, please consult the README. -#> -#> - You are invited to modify this file to suit your -#> needs, in order to get MFC running properly on -#> your system. -#> -#> - Lines that begin with "#>" are ignored and won't -#> figure in the final batch script, not even as a -#> comment. -#> -#> - Statements of the form `${expression}` are string- -#> -replaced by mfc.sh run to provide runtime parameters, -#> most notably execution options. 
They reference the -#> variables in the same format as those under the "run" -#> section of [mfc.user.yaml](mfc.user.yaml), replacing -#> "-" for "_". You can perform therein any Python operation -#> recognized by the built-in `expr()` function. -#> -#> - Statements of the form {MFC::expression} tell MFC -#> where to place the common code, across all batch -#> files that is required to run MFC. They are not -#> intended to be modified by users. -#> -#BSUB -J {name} -#BSUB -nnodes {nodes} -#BSUB -N -#BSUB -P {account} -#BSUB -W {walltime[:-3]} -#> -#> Note: The above expression for the walltime converts -#> the expression "hh:mm:ss" to the appropriate -#> format for the batch system ("hh:mm"). It is -#> a python expression evaluated at runtime. -#> -#> -#> Note: The following options aren't enabled by default. -#> They serve as a guide to users that wish to pass -#> more options to the batch system. -#> - - - -#> -#> Note: If your system requires you to load environment -#> modules inside of your batch script, please load -#> them bellow. -#> - - - -#> -#> Note: The MFC prologue sets up the environment required -#> prior to execution. -#> -{MFC::PROLOGUE} - -#> -#> Note: This MPI executable might not be well supported -#> on your system - if at all. {MFC::BIN} refers to -#> the path the MFC executable. -#> - -for binpath in {MFC::BINARIES}; do - - echo -e ":) Running $binpath:" - echo "" - - jsrun \ - {'--smpiargs="-gpu"' if gpu else ''} \ - --nrs {tasks_per_node*nodes} \ - --cpu_per_rs 1 \ - --gpu_per_rs {1 if gpu else 0} \ - --tasks_per_rs 1 \ - {MFC::PROFILER} "$binpath" - - echo "" - -done - -{MFC::EPILOGUE} -#> -#> Note: Lines after the MFC Epilogue will not be executed. 
-#> diff --git a/toolchain/templates/pbs.sh b/toolchain/templates/pbs.sh deleted file mode 100644 index 839b9fe3cf..0000000000 --- a/toolchain/templates/pbs.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env bash -#> -#> - PBS Batch File Template - -#> -#> This file is part of the ./mfc.sh run subsystem. -#> For more information, please consult the README. -#> -#> - You are invited to modify this file to suit your -#> needs, in order to get MFC running properly on -#> your system. -#> -#> - Lines that begin with "#>" are ignored and won't -#> figure in the final batch script, not even as a -#> comment. -#> -#> - Statements of the form `${expression}` are string- -#> -replaced by mfc.sh run to provide runtime parameters, -#> most notably execution options. They reference the -#> variables in the same format as those under the "run" -#> section of [mfc.user.yaml](mfc.user.yaml), replacing -#> "-" for "_". You can perform therein any Python operation -#> recognized by the built-in `expr()` function. -#> -#> - Statements of the form {MFC::expression} tell MFC -#> where to place the common code, across all batch -#> files that is required to run MFC. They are not -#> intended to be modified by users. -#> -#PBS -N {name} -#PBS -l nodes={nodes}:ppn={tasks_per_node} -#PBS -A {account} -#PBS -l walltime={walltime} -#PBS -q {partition} -#PBS -M {email} -#> -#> Note: The following options aren't enabled by default. -#> They serve as a guide to users that wish to pass -#> more options to the batch system. -#> - - - - -#> -#> Note: If your system requires you to load environment -#> modules inside of your batch script, please load -#> them bellow. -#> - - - -#> -#> Note: The MFC prologue sets up the environment required -#> prior to execution. -#> -{MFC::PROLOGUE} - -#> -#> Note: This MPI executable might not be well supported -#> on your system - if at all. {MFC::BIN} refers to -#> the path the MFC executable. 
-#> - -for binpath in {MFC::BINARIES}; do - - echo -e ":) Running $binpath:" - - if command -v srun > /dev/null 2>&1; then - srun \ - --nodes {nodes} \ - --ntasks-per-node {tasks_per_node} \ - {MFC::PROFILER} "$binpath" - - #> - #> srun --mpi=pmix \ - #> {MFC::PROFILER} "$binpath" - else - mpirun \ - -np {tasks_per_node*nodes} \ - {MFC::PROFILER} "$binpath" - - fi - -done - -{MFC::EPILOGUE} -#> -#> Note: Lines after the MFC Epilogue will not be executed. -#> - diff --git a/toolchain/templates/phoenix.mako b/toolchain/templates/phoenix.mako new file mode 100644 index 0000000000..cdc8cde7e5 --- /dev/null +++ b/toolchain/templates/phoenix.mako @@ -0,0 +1,56 @@ +#!/usr/bin/env bash + +% if engine == 'batch': +#SBATCH --nodes=${nodes} +#SBATCH --ntasks-per-node=${tasks_per_node} +#SBATCH --job-name="${name}" +#SBATCH --output="${name}.out" +#SBATCH --time=${walltime} +% if account: +#SBATCH --account=${account} +% endif +% if partition: +#SBATCH --partition=${partition} +% endif +% if quality_of_service: +#SBATCH --qos=${quality_of_service} +% endif +% if gpu: +#SBATCH --gres=gpu:V100:${tasks_per_node} +#SBATCH --mem-per-gpu=16G +% endif +% if email: +#SBATCH --mail-user=${email} +#SBATCH --mail-type="BEGIN, END, FAIL" +% endif +% endif + +echo -e ":) Loading modules:\n" +cd "${rootdir}" && . ./mfc.sh load -c p -m ${'g' if gpu else 'c'} +echo + +<%include file="prologue.mako"/> + +% for binpath in binpaths: + echo -e ":) Running ${binpath.split('/')[-1]}:\n" + + % if not mpi: + ${' '.join([f"'{x}'" for x in profiler ])} "${binpath}" + % else: + mpirun -np ${nodes*tasks_per_node} \ + ${' '.join([f"'{x}'" for x in profiler ])} \ + "${binpath}" + % endif + + % if engine == 'interactive': + code=$? 
+ if [ $code -ne 0 ]; then + echo -e "\n:( $MAGENTA${binpath}$COLOR_RESET failed with exit code $MAGENTA$code$COLOR_RESET.\n" + exit 1 + fi + % endif + + echo +% endfor + +<%include file="epilogue.mako"/> diff --git a/toolchain/templates/slurm.sh b/toolchain/templates/slurm.sh deleted file mode 100644 index 11982cb408..0000000000 --- a/toolchain/templates/slurm.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env bash -#> -#> - SLURM Batch File Template - -#> -#> This file is part of the ./mfc.sh run subsystem. -#> For more information, please consult the README. -#> -#> - You are invited to modify this file to suit your -#> needs, in order to get MFC running properly on -#> your system. -#> -#> - Lines that begin with "#>" are ignored and won't -#> figure in the final batch script, not even as a -#> comment. -#> -#> - Statements of the form `${expression}` are string- -#> -replaced by mfc.sh run to provide runtime parameters, -#> most notably execution options. They reference the -#> variables in the same format as those under the "run" -#> section of [mfc.user.yaml](mfc.user.yaml), replacing -#> "-" for "_". You can perform therein any Python operation -#> recognized by the built-in `expr()` function. -#> -#> - Statements of the form {MFC::expression} tell MFC -#> where to place the common code, across all batch -#> files that is required to run MFC. They are not -#> intended to be modified by users. -#> -#SBATCH --job-name="{name}" -#SBATCH --nodes={nodes} -#SBATCH --ntasks-per-node={tasks_per_node} -#SBATCH --cpus-per-task=1 -#SBATCH --gpu-bind=verbose,closest -#SBATCH --gpus=v100-16:{(1 if gpu else 0)*tasks_per_node*nodes} -#SBATCH --time={walltime} -#SBATCH --partition="{partition}" -#SBATCH --output="{name}.out" -#SBATCH --account="{account}" -#SBATCH --error="{name}.err" -#SBATCH --mail-user="{email}" -#SBATCH --export=ALL -#SBATCH --mail-type="BEGIN, END, FAIL" -#> -#> Note: The following options aren't enabled by default. 
-#> They serve as a guide to users that wish to pass -#> more options to the batch system. -#> -#> #SBATCH --mem=... -#> #SBATCH --constraint="lustre" -#> #SBATCH --gpus-per-task={1 if gpu else 0} - - -#> -#> Note: If your system requires you to load environment -#> modules inside of your batch script, please load -#> them bellow. -#> - - -#> -#> Note: The MFC prologue sets up the environment required -#> prior to execution. -#> -{MFC::PROLOGUE} - - -#> -#> Note: This MPI executable might not be well supported -#> on your system - if at all. {MFC::BIN} refers to -#> the path the MFC executable. -#> - -for binpath in {MFC::BINARIES}; do - - echo -e ":) Running $binpath:" - - if command -v srun > /dev/null 2>&1; then - srun \ - --nodes {nodes} \ - --ntasks-per-node {tasks_per_node} \ - {MFC::PROFILER} "$binpath" - - #> - #> srun --mpi=pmix \ - #> {MFC::PROFILER} "$binpath" - #> - else - mpirun \ - -np {nodes*tasks_per_node} \ - {MFC::PROFILER} "$binpath" - fi - -done - -{MFC::EPILOGUE} - -#> -#> Note: Lines after the MFC Epilogue will not be executed. -#> diff --git a/toolchain/templates/summit.mako b/toolchain/templates/summit.mako new file mode 100644 index 0000000000..b8ce8fd2d1 --- /dev/null +++ b/toolchain/templates/summit.mako @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +% if engine == 'batch': +#BSUB -J ${name} +#BSUB -nnodes ${nodes} +#BSUB -W ${walltime[:-3]} +#BSUB -N +% if account: +#BSUB -P ${account} +% endif +% endif + +echo -e ":) Loading modules:\n" +. 
./mfc.sh load -c s -m ${'g' if gpu else 'c'} +echo + +<%include file="prologue.mako"/> + +% for binpath in binpaths: + echo -e ":) Running ${binpath}:\n" + + % if not mpi: + ${' '.join([f"'{x}'" for x in profiler ])} "${binpath}" + % else: + jsrun \ + ${'--smpiargs="-gpu"' if gpu else ''} \ + --nrs ${tasks_per_node*nodes} \ + --cpu_per_rs 1 \ + --gpu_per_rs ${1 if gpu else 0} \ + --tasks_per_rs 1 \ + ${' '.join([f"'{x}'" for x in profiler ])} \ + "${binpath}" + % endif + + % if engine == 'interactive': + code=$? + if [ $code -ne 0 ]; then + echo -e "\n:( $MAGENTA${binpath}$COLOR_RESET failed with exit code $MAGENTA$code$COLOR_RESET.\n" + exit 1 + fi + % endif + + echo +% endfor + +<%include file="epilogue.mako"/>