Skip to content

Commit

Permalink
Batch files per computer (MFlowCode#240 & MFlowCode#287)
Browse files Browse the repository at this point in the history
  • Loading branch information
henryleberre committed Jan 7, 2024
1 parent bc43155 commit c370c32
Show file tree
Hide file tree
Showing 12 changed files with 288 additions and 324 deletions.
8 changes: 7 additions & 1 deletion toolchain/mfc/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from .run.engines import ENGINES
from .run.mpi_bins import BINARIES

# pylint: disable=too-many-locals, too-many-statements
# pylint: disable=too-many-locals, too-many-branches, too-many-statements
def parse(config):
parser = argparse.ArgumentParser(
prog="./mfc.sh",
Expand Down Expand Up @@ -123,6 +123,7 @@ def add_common_arguments(p, mask = None):
run.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.")
run.add_argument("--wait", action="store_true", default=False, help="(Batch) Wait for the job to finish.")
run.add_argument("-f", "--flags", metavar="FLAGS", dest="--", nargs=argparse.REMAINDER, type=str, default=[], help="(Interactive) Arguments to forward to the MPI invocation.")
run.add_argument("-c", "--computer", metavar="COMPUTER", type=str, default=None, help="(Batch) Computer to run on or path to a template batch submission file.")

# === BENCH ===
add_common_arguments(bench, "t")
Expand Down Expand Up @@ -157,6 +158,11 @@ def add_common_arguments(p, mask = None):
if args["command"] == "build":
if (args["input"] is not None) ^ args["case_optimization"] :
raise MFCException("./mfc.sh build's --case-optimization requires --input")
if args["command"] == "run" and args["engine"] == "batch":
if args["computer"] is None:
raise MFCException("./mfc.sh run's --computer is required when --engine=batch")
if args["binary"] is not None:
raise MFCException("./mfc.sh run's --binary is not allowed when --engine=batch")

# Input files to absolute paths
for e in ["input", "input1", "input2"]:
Expand Down
1 change: 1 addition & 0 deletions toolchain/mfc/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
MFC_ROOTDIR = normpath(f"{dirname(realpath(__file__))}/../..")
MFC_TESTDIR = abspath(f"{MFC_ROOTDIR}/tests")
MFC_SUBDIR = abspath(f"{MFC_ROOTDIR}/build")
MFC_TEMPLATEDIR = abspath(f"{MFC_ROOTDIR}/toolchain/templates")
MFC_LOCK_FILEPATH = abspath(f"{MFC_SUBDIR}/lock.yaml")
MFC_BENCH_FILEPATH = abspath(f"{MFC_ROOTDIR}/toolchain/bench.yaml")

Expand Down
40 changes: 12 additions & 28 deletions toolchain/mfc/run/engines.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,6 @@ def init(self, _input: MFCInputFile) -> None:
def _init(self) -> None:
pass

def get_args(self) -> typing.List[str]:
raise MFCException(f"MFCEngine::get_args: not implemented for {self.name}.")

def run(self, targets: typing.List[MFCTarget]) -> None:
raise MFCException(f"MFCEngine::run: not implemented for {self.name}.")

Expand All @@ -71,13 +68,6 @@ def _init(self) -> None:
# If using MPI, we don't know yet whether this engine works
self.bKnowWorks = not ARG("mpi")

def get_args(self) -> str:
return f"""\
Nodes (-N) {ARG('nodes')}
Tasks (/node) (-n) {ARG('tasks_per_node')}
MPI Binary (-b) {self.mpibin.bin}\
"""

def get_exec_cmd(self, target: MFCTarget) -> typing.List[str]:
cmd = []
if ARG("mpi"):
Expand Down Expand Up @@ -173,16 +163,6 @@ class BatchEngine(Engine):
def __init__(self) -> None:
super().__init__("Batch", "batch")

def get_args(self) -> str:
return f"""\
Nodes (-N) {ARG('nodes')}
Tasks (/node) (-n) {ARG('tasks_per_node')}
Walltime (-w) {ARG("walltime")}
Partition (-p) {ARG("partition")}
Account (-a) {ARG("account")}
Email (-@) {ARG("email")}
"""

def run(self, targets: typing.List[MFCTarget]) -> None:
qsystem = queues.get_system()
cons.print(f"Detected the [bold magenta]{qsystem.name}[/bold magenta] queue system.")
Expand Down Expand Up @@ -215,17 +195,22 @@ def __get_batch_filepath(self):
self.__get_batch_filename()
]))

def __generate_prologue(self, qsystem: queues.QueueSystem) -> str:
modules = f""

def __generate_module_load(self) -> str:
    """Return the shell snippet substituted for the {MFC::MODULES} placeholder.

    When the system uses environment modules, the snippet purges the
    current modules and re-loads the ones reported by
    get_loaded_modules(). Otherwise it only emits a comment stating that
    there is nothing to load. In both cases a progress message is printed
    first.

    Returns:
        The shell code, terminated by a newline, ready to be pasted into a
        batch template.
    """
    if does_system_use_modules():
        # The printf string must be closed on its own line: an unterminated
        # double quote would swallow the `module` commands that follow into
        # the printed string and corrupt the quoting of the whole script.
        return f"""\
printf ":) Loading modules...\\n"
module purge
module load {' '.join(get_loaded_modules())}
"""

    # No interpolation is needed here, so a plain (non-f) string is used.
    return """\
printf ":) Loading modules...\\n"
# No modules to load.
"""

def __generate_prologue(self, qsystem: queues.QueueSystem) -> str:
return f"""\
TABLE_FORMAT_LINE="| - %-14s %-35s - %-14s %-35s |\\n"
TABLE_HEADER="+-----------------------------------------------------------------------------------------------------------+ \\n"
Expand All @@ -245,8 +230,6 @@ def __generate_prologue(self, qsystem: queues.QueueSystem) -> str:
printf "$TABLE_CONTENT\\n"
printf "$TABLE_FOOTER\\n"
{modules}
cd "{self.input.case_dirpath}"
t_start=$(date +%s)
Expand Down Expand Up @@ -282,6 +265,7 @@ def __batch_evaluate(self, s: str, qsystem: queues.QueueSystem, targets: typing.
("{MFC::PROLOGUE}", self.__generate_prologue(qsystem)),
("{MFC::PROFILER}", ' '.join(profiler_prepend())),
("{MFC::EPILOGUE}", self.__generate_epilogue()),
("{MFC::MODULES}", self.__generate_module_load()),
("{MFC::BINARIES}", ' '.join([f"'{target.get_install_binpath()}'" for target in targets])),
]

Expand Down Expand Up @@ -310,7 +294,7 @@ def __create_batch_file(self, qsystem: queues.QueueSystem, targets: typing.List[
cons.print("> Generating batch file...")
filepath = self.__get_batch_filepath()
cons.print("> Evaluating template file...")
content = self.__batch_evaluate(qsystem.template, qsystem, targets)
content = self.__batch_evaluate(qsystem.get_template(), qsystem, targets)

cons.print("> Writing batch file...")
file_write(filepath, content)
Expand Down
31 changes: 23 additions & 8 deletions toolchain/mfc/run/queues.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,30 @@

from mfc import common
from ..state import ARG
from ..common import MFC_TEMPLATEDIR

@dataclasses.dataclass
class QueueSystem:
name: str
template: str
name: str

def __init__(self, name: str, filename: str) -> None:
self.name = name
self.template = common.file_read(os.sep.join(["toolchain", "templates", filename]))
def __init__(self, name: str) -> None:
self.name = name

def get_template(self) -> str:
    """Return the contents of the batch template to use for this run.

    The value of --computer is tried, in order, as:
      1. a path to a template file,
      2. the name of a file in <templates>/computer,
      3. the name of a file in <templates>/generic.
    If none of these exist, the generic template named after this queue
    system is used as a fallback. Every candidate is tried both as-is and
    with a ".sh" extension appended.

    Returns:
        The text of the first candidate that is an existing file.

    Raises:
        common.MFCException: if no candidate file exists.
    """
    computer = ARG("computer")

    candidates = []
    # Only build computer-based candidates when a value was supplied;
    # os.path.join() would raise TypeError on None.
    if computer is not None:
        candidates += [
            computer,
            os.path.join(MFC_TEMPLATEDIR, "computer", computer),
            os.path.join(MFC_TEMPLATEDIR, "generic", computer),
        ]

    # Queue system names are upper-case ("PBS", "LSF", "SLURM") whereas the
    # generic templates are lower-case files ("pbs.sh", "lsf.sh"), so the
    # lower-cased name is required for this fallback to match on
    # case-sensitive filesystems. The original spelling is kept first for
    # backward compatibility.
    candidates += [
        os.path.join(MFC_TEMPLATEDIR, "generic", self.name),
        os.path.join(MFC_TEMPLATEDIR, "generic", self.name.lower()),
    ]

    for candidate in candidates:
        for ext in ("", ".sh"):
            filepath = f"{candidate}{ext}"
            if os.path.isfile(filepath):
                return common.file_read(filepath)

    raise common.MFCException(f"QueueSystem: Failed to find computer/template file for {computer}.")

def is_active(self) -> bool:
raise common.MFCException("QueueSystem::is_active: not implemented.")
Expand All @@ -21,7 +36,7 @@ def gen_submit_cmd(self, filepath: str) -> typing.List[str]:

class PBSSystem(QueueSystem):
def __init__(self) -> None:
super().__init__("PBS", "pbs.sh")
super().__init__("PBS")

def is_active(self) -> bool:
return common.does_command_exist("qsub")
Expand All @@ -35,7 +50,7 @@ def gen_submit_cmd(self, filepath: str) -> typing.List[str]:

class LSFSystem(QueueSystem):
def __init__(self) -> None:
super().__init__("LSF", "lsf.sh")
super().__init__("LSF")

def is_active(self) -> bool:
return common.does_command_exist("bsub") and common.does_command_exist("bqueues")
Expand All @@ -51,7 +66,7 @@ def gen_submit_cmd(self, filepath: str) -> None:

class SLURMSystem(QueueSystem):
def __init__(self) -> None:
super().__init__("SLURM", "slurm.sh")
super().__init__("SLURM")

def is_active(self) -> bool:
return common.does_command_exist("sbatch")
Expand Down
10 changes: 0 additions & 10 deletions toolchain/mfc/run/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,6 @@ def run_targets(targets: typing.List[MFCTarget]):
engine = engines.get_engine(ARG("engine"))
engine.init(input_file)

cons.print(f"Configuration:")
cons.indent()
cons.print(f"""\
Input {ARG('input')}
Job Name (-#) {ARG('name')}
Engine (-e) {ARG('engine')}
{engine.get_args()}\
""")
cons.unindent()

validate_job_options()

for target in targets:
Expand Down
50 changes: 50 additions & 0 deletions toolchain/templates/computer/summit.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env bash
#>
#> This file is part of the ./mfc.sh run subsystem. Expressions enclosed in
#> curly braces are expanded and evaluated using Python's eval() function and
#> data from ./mfc.sh run. The resulting file is submitted to the queue system.
#>
#> Because curly braces are evaluated, do not use them in comments.
#>
#> The #BSUB directives below set the job name, the node count, the project
#> (account) and the wall time limit.
#> NOTE(review): the -W value is the wall time with its last three characters
#>               dropped, presumably turning HH:MM:SS into HH:MM - confirm the
#>               format of the wall time option.
#>
#BSUB -J {name}
#BSUB -nnodes {nodes}
#BSUB -N
#BSUB -P {account}
#BSUB -W {walltime[:-3]}


#>
#> Source MFC's module loader for computer "s", in GPU mode ("g") when gpu is
#> set and in CPU mode ("c") otherwise.
#> NOTE(review): the relative ./mfc.sh path assumes the job starts in the MFC
#>               root directory - confirm.
#>
. ./mfc.sh load -c s -m {'g' if gpu else 'c'}


#>
#> The MFC prologue sets up the environment required to run MFC prior to
#> execution and starts the timer.
#>
{MFC::PROLOGUE}


#>
#> Iterate over all MFC binaries (as specified through --targets) and execute
#> them, one by one, with profiling enabled if requested.
#>
#> jsrun is asked for tasks_per_node*nodes resource sets, each with one CPU,
#> one task and, only when gpu is set, one GPU.
#>
for binpath in {MFC::BINARIES}; do

echo -e ":) Running $binpath:"
echo ""

jsrun \
{'--smpiargs="-gpu"' if gpu else ''} \
--nrs {tasks_per_node*nodes} \
--cpu_per_rs 1 \
--gpu_per_rs {1 if gpu else 0} \
--tasks_per_rs 1 \
{MFC::PROFILER} "$binpath"

echo ""

done


#>
#> The MFC epilogue stops the timer and prints the execution summary. It also
#> performs some cleanup and housekeeping tasks before exiting.
#>
{MFC::EPILOGUE}
54 changes: 54 additions & 0 deletions toolchain/templates/generic/lsf.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env bash
#>
#> This file is part of the ./mfc.sh run subsystem. Expressions enclosed in
#> curly braces are expanded and evaluated using Python's eval() function and
#> data from ./mfc.sh run. The resulting file is submitted to the queue system.
#>
#> Because curly braces are evaluated, do not use them in comments.
#>
#> The #BSUB directives below set the job name, the node count, the project
#> (account) and the wall time limit.
#> NOTE(review): the -W value is the wall time with its last three characters
#>               dropped, presumably turning HH:MM:SS into HH:MM - confirm the
#>               format of the wall time option.
#>
#BSUB -J {name}
#BSUB -nnodes {nodes}
#BSUB -N
#BSUB -P {account}
#BSUB -W {walltime[:-3]}


#>
#> Load the same modules as the ones currently loaded in the login shell. These
#> are usually the ones used to compile MFC.
#>
{MFC::MODULES}


#>
#> The MFC prologue sets up the environment required to run MFC prior to
#> execution and starts the timer.
#>
{MFC::PROLOGUE}


#>
#> Iterate over all MFC binaries (as specified through --targets) and execute
#> them, one by one, with profiling enabled if requested.
#>
#> jsrun is asked for tasks_per_node*nodes resource sets, each with one CPU,
#> one task and, only when gpu is set, one GPU.
#>
for binpath in {MFC::BINARIES}; do

echo -e ":) Running $binpath:"
echo ""

jsrun \
{'--smpiargs="-gpu"' if gpu else ''} \
--nrs {tasks_per_node*nodes} \
--cpu_per_rs 1 \
--gpu_per_rs {1 if gpu else 0} \
--tasks_per_rs 1 \
{MFC::PROFILER} "$binpath"

echo ""

done


#>
#> The MFC epilogue stops the timer and prints the execution summary. It also
#> performs some cleanup and housekeeping tasks before exiting.
#>
{MFC::EPILOGUE}
65 changes: 65 additions & 0 deletions toolchain/templates/generic/pbs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env bash
#>
#> This file is part of the ./mfc.sh run subsystem. Expressions enclosed in
#> curly braces are expanded and evaluated using Python's eval() function and
#> data from ./mfc.sh run. The resulting file is submitted to the queue system.
#>
#> Because curly braces are evaluated, do not use them in comments.
#>
#> The #PBS directives below set the job name, the nodes and processes per
#> node, the account, the wall time limit, the queue (partition) and the
#> notification e-mail address.
#>
#PBS -N {name}
#PBS -l nodes={nodes}:ppn={tasks_per_node}
#PBS -A {account}
#PBS -l walltime={walltime}
#PBS -q {partition}
#PBS -M {email}
#>
#> Note: Additional, optional directives can be added below this note to pass
#>       more options to the batch system. None are listed at present.
#>


#>
#> Load the same modules as the ones currently loaded in the login shell. These
#> are usually the ones used to compile MFC.
#>
{MFC::MODULES}


#>
#> The MFC prologue sets up the environment required to run MFC prior to
#> execution and starts the timer.
#>
{MFC::PROLOGUE}


#>
#> Iterate over all MFC binaries (as specified through --targets) and execute
#> them, one by one, with profiling enabled if requested.
#>
#> srun is used when it is found on the PATH; otherwise the binary is launched
#> through mpirun with tasks_per_node*nodes processes.
#>
for binpath in {MFC::BINARIES}; do

echo -e ":) Running $binpath:"

if command -v srun > /dev/null 2>&1; then
srun \
--nodes {nodes} \
--ntasks-per-node {tasks_per_node} \
{MFC::PROFILER} "$binpath"

#>
#> Alternative srun invocation, left disabled:
#>
#> srun --mpi=pmix \
#> {MFC::PROFILER} "$binpath"
else
mpirun \
-np {tasks_per_node*nodes} \
{MFC::PROFILER} "$binpath"

fi

done


#>
#> The MFC epilogue stops the timer and prints the execution summary. It also
#> performs some cleanup and housekeeping tasks before exiting.
#>
{MFC::EPILOGUE}
Loading

0 comments on commit c370c32

Please sign in to comment.