Skip to content

Commit

Permalink
Batch files per computer (MFlowCode#240 & MFlowCode#287)
Browse files Browse the repository at this point in the history
  • Loading branch information
henryleberre committed Jan 8, 2024
1 parent 15bd177 commit 8544f36
Show file tree
Hide file tree
Showing 13 changed files with 357 additions and 331 deletions.
20 changes: 14 additions & 6 deletions toolchain/mfc/args.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import re, os.path, argparse, dataclasses

from .build import TARGETS, DEFAULT_TARGETS, DEPENDENCY_TARGETS
from .common import MFCException, format_list_to_string
from .test.cases import generate_cases
from .run.queues import get_baked_templates
from .build import TARGETS, DEFAULT_TARGETS, DEPENDENCY_TARGETS
from .common import MFCException, format_list_to_string
from .test.cases import generate_cases
from .run.engines import ENGINES
from .run.mpi_bins import BINARIES

# pylint: disable=too-many-locals, too-many-statements
# pylint: disable=too-many-locals, too-many-branches, too-many-statements
def parse(config):
parser = argparse.ArgumentParser(
prog="./mfc.sh",
Expand Down Expand Up @@ -123,6 +124,7 @@ def add_common_arguments(p, mask = None):
run.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.")
run.add_argument("--wait", action="store_true", default=False, help="(Batch) Wait for the job to finish.")
run.add_argument("-f", "--flags", metavar="FLAGS", dest="--", nargs=argparse.REMAINDER, type=str, default=[], help="(Interactive) Arguments to forward to the MPI invocation.")
run.add_argument("-c", "--computer", metavar="COMPUTER", type=str, default=None, help=f"(Batch) Path to a custom submission file template or one of {format_list_to_string(list(get_baked_templates().keys()))}.")

# === BENCH ===
add_common_arguments(bench, "t")
Expand Down Expand Up @@ -153,10 +155,16 @@ def add_common_arguments(p, mask = None):
# "Slugify" the name of the job
args["name"] = re.sub(r'[\W_]+', '-', args["name"])

# build's --case-optimization and --input depend on each other
# We need to check for some invalid combinations of arguments because of
# the limitations of argparse.
if args["command"] == "build":
if (args["input"] is not None) ^ args["case_optimization"] :
raise MFCException("./mfc.sh build's --case-optimization requires --input")
raise MFCException("./mfc.sh build's --case-optimization and --input must be used together.")
if args["command"] == "run" and args["engine"] == "batch":
if args["computer"] is None:
raise MFCException("./mfc.sh run's --computer is required when --engine=batch")
if args["binary"] is not None:
raise MFCException("./mfc.sh run's --binary is not allowed when --engine=batch")

# Input files to absolute paths
for e in ["input", "input1", "input2"]:
Expand Down
1 change: 1 addition & 0 deletions toolchain/mfc/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# Repository root: two directory levels above the directory holding this file.
MFC_ROOTDIR = normpath(f"{dirname(realpath(__file__))}/../..")
# Absolute paths derived from the repository root.
MFC_TESTDIR = abspath(f"{MFC_ROOTDIR}/tests")
MFC_SUBDIR = abspath(f"{MFC_ROOTDIR}/build")
# Directory that holds the batch-submission template files.
MFC_TEMPLATEDIR = abspath(f"{MFC_ROOTDIR}/toolchain/templates")
# The lock file lives inside the build directory (MFC_SUBDIR).
MFC_LOCK_FILEPATH = abspath(f"{MFC_SUBDIR}/lock.yaml")
MFC_BENCH_FILEPATH = abspath(f"{MFC_ROOTDIR}/toolchain/bench.yaml")

Expand Down
41 changes: 12 additions & 29 deletions toolchain/mfc/run/engines.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,6 @@ def init(self, _input: MFCInputFile) -> None:
def _init(self) -> None:
    """Engine-specific initialisation hook; this implementation does nothing.

    NOTE(review): presumably invoked from ``init`` and overridden by
    concrete engines -- confirm against the full class.
    """
    pass

def get_args(self) -> typing.List[str]:
raise MFCException(f"MFCEngine::get_args: not implemented for {self.name}.")

def run(self, targets: typing.List[MFCTarget]) -> None:
    """Placeholder implementation that always raises.

    Args:
        targets: The targets to run; unused by this implementation.

    Raises:
        MFCException: Unconditionally, with a message naming this engine
            through ``self.name``.
    """
    raise MFCException(f"MFCEngine::run: not implemented for {self.name}.")

Expand All @@ -71,14 +68,6 @@ def _init(self) -> None:
# If using MPI, we don't know yet whether this engine works
self.bKnowWorks = not ARG("mpi")

def get_args(self) -> str:
return f"""\
Nodes (-N) {ARG('nodes')}
Tasks (/node) (-n) {ARG('tasks_per_node')}
MPI Binary (-b) {self.mpibin.bin}\
"""


def __get_exec_cmd(self, target: MFCTarget) -> typing.List[str]:
cmd = []
if ARG("mpi"):
Expand Down Expand Up @@ -176,16 +165,6 @@ class BatchEngine(Engine):
def __init__(self) -> None:
super().__init__("Batch", "batch")

def get_args(self) -> str:
return f"""\
Nodes (-N) {ARG('nodes')}
Tasks (/node) (-n) {ARG('tasks_per_node')}
Walltime (-w) {ARG("walltime")}
Partition (-p) {ARG("partition")}
Account (-a) {ARG("account")}
Email (-@) {ARG("email")}
"""

def run(self, targets) -> None:
qsystem = queues.get_system()
cons.print(f"Detected the [bold magenta]{qsystem.name}[/bold magenta] queue system.")
Expand Down Expand Up @@ -218,17 +197,22 @@ def __get_batch_filepath(self):
self.__get_batch_filename()
]))

def __generate_prologue(self, qsystem: queues.QueueSystem) -> str:
modules = f""

def __generate_module_load(self) -> str:
    """Return the shell snippet substituted for ``{MFC::MODULES}``.

    When the system uses environment modules, the snippet purges the
    module environment and reloads the modules reported by
    ``get_loaded_modules()``. Otherwise it only prints the banner and
    leaves a comment in the generated script.

    Returns:
        A newline-terminated block of shell code.
    """
    # The printf argument must be a *closed* double-quoted string: an
    # unterminated quote would make the shell treat the following lines
    # (including the module commands) as part of the string.
    if does_system_use_modules():
        return f"""\
printf ":) Loading modules...\\n"
module purge
module load {' '.join(get_loaded_modules())}
"""

    return f"""\
printf ":) Loading modules...\\n"
# No modules to load.
"""

def __generate_prologue(self, qsystem: queues.QueueSystem) -> str:
return f"""\
TABLE_FORMAT_LINE="| - %-14s %-35s - %-14s %-35s |\\n"
TABLE_HEADER="+-----------------------------------------------------------------------------------------------------------+ \\n"
Expand All @@ -248,8 +232,6 @@ def __generate_prologue(self, qsystem: queues.QueueSystem) -> str:
printf "$TABLE_CONTENT\\n"
printf "$TABLE_FOOTER\\n"
{modules}
cd "{self.input.case_dirpath}"
t_start=$(date +%s)
Expand Down Expand Up @@ -287,6 +269,7 @@ def __batch_evaluate(self, s: str, qsystem: queues.QueueSystem, targets):
("{MFC::PROLOGUE}", self.__generate_prologue(qsystem)),
("{MFC::PROFILER}", ' '.join(profiler_prepend())),
("{MFC::EPILOGUE}", self.__generate_epilogue()),
("{MFC::MODULES}", self.__generate_module_load()),
("{MFC::BINARIES}", ' '.join([f"'{target.get_install_binpath()}'" for target in targets])),
]

Expand Down Expand Up @@ -315,7 +298,7 @@ def __create_batch_file(self, qsystem: queues.QueueSystem, targets: typing.List[
cons.print("> Generating batch file...")
filepath = self.__get_batch_filepath()
cons.print("> Evaluating template file...")
content = self.__batch_evaluate(qsystem.template, qsystem, targets)
content = self.__batch_evaluate(qsystem.get_template(), qsystem, targets)

cons.print("> Writing batch file...")
file_write(filepath, content)
Expand Down
41 changes: 32 additions & 9 deletions toolchain/mfc/run/queues.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,39 @@
import os, typing, dataclasses
import os, glob, typing, dataclasses

from mfc import common
from ..printer import cons
from ..state import ARG
from ..common import MFC_TEMPLATEDIR


def get_baked_templates() -> dict:
    """Collect the ``*.sh`` templates found directly in MFC_TEMPLATEDIR.

    Returns:
        A dict mapping each template's file name, without directory or
        extension, to the text of that file.
    """
    templates = {}
    pattern = os.path.join(MFC_TEMPLATEDIR, "*.sh")

    for path in glob.glob(pattern):
        stem, _extension = os.path.splitext(os.path.basename(path))
        templates[stem] = common.file_read(path)

    return templates


@dataclasses.dataclass
class QueueSystem:
name: str
template: str
name: str

def __init__(self, name: str) -> None:
    """Record the queue system's name, which is used in console and error messages."""
    self.name = name

def get_template(self) -> str:
    """Return the text of the batch template selected by ``--computer``.

    Resolution order:
      1. a baked-in template whose name equals the ``--computer`` value;
      2. a template file located at the ``--computer`` path;
      3. a baked-in template named after this queue system (lower-cased),
         kept as a fallback.

    Returns:
        The contents of the selected template.

    Raises:
        common.MFCException: If ``--computer`` was not given, or if no
            template could be found for it.
    """
    # Plain assignment on purpose: `computer := ARG(...) is None` would bind
    # the boolean result of the comparison instead of the argument's value.
    computer = ARG("computer")
    if computer is None:
        raise common.MFCException(f"{self.name}: --computer is required.")

    baked = get_baked_templates()

    # An explicit choice by the user (name or path) takes precedence.
    if content := baked.get(computer):
        cons.print(f"Using baked-in template for [magenta]{computer}[/magenta].")
        return content

    if os.path.isfile(computer):
        cons.print(f"Using template from [magenta]{computer}[/magenta].")
        return common.file_read(computer)

    # Fall back to a template named after the queue system itself.
    if content := baked.get(self.name.lower()):
        cons.print(f"Using baked-in template for [magenta]{self.name}[/magenta].")
        return content

    raise common.MFCException(f"{self.name}: Failed to find a template for --computer '{computer}'.")

def is_active(self) -> bool:
    """Placeholder that always raises.

    The PBS, LSF and SLURM subclasses provide their own version that checks
    whether the scheduler's submission command exists.

    Raises:
        common.MFCException: Unconditionally.
    """
    raise common.MFCException("QueueSystem::is_active: not implemented.")
Expand All @@ -21,7 +44,7 @@ def gen_submit_cmd(self, filepath: str) -> typing.List[str]:

class PBSSystem(QueueSystem):
def __init__(self) -> None:
super().__init__("PBS", "pbs.sh")
super().__init__("PBS")

def is_active(self) -> bool:
return common.does_command_exist("qsub")
Expand All @@ -35,7 +58,7 @@ def gen_submit_cmd(self, filepath: str) -> typing.List[str]:

class LSFSystem(QueueSystem):
def __init__(self) -> None:
super().__init__("LSF", "lsf.sh")
super().__init__("LSF")

def is_active(self) -> bool:
return common.does_command_exist("bsub") and common.does_command_exist("bqueues")
Expand All @@ -51,7 +74,7 @@ def gen_submit_cmd(self, filepath: str) -> None:

class SLURMSystem(QueueSystem):
def __init__(self) -> None:
super().__init__("SLURM", "slurm.sh")
super().__init__("SLURM")

def is_active(self) -> bool:
return common.does_command_exist("sbatch")
Expand Down
10 changes: 0 additions & 10 deletions toolchain/mfc/run/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,6 @@ def run(targets = None):
engine = engines.get_engine(ARG("engine"))
engine.init(input_file)

cons.print(f"Configuration:")
cons.indent()
cons.print(f"""\
Input {ARG('input')}
Job Name (-#) {ARG('name')}
Engine (-e) {ARG('engine')}
{engine.get_args()}\
""")
cons.unindent()

validate_job_options()

for target in targets:
Expand Down
53 changes: 53 additions & 0 deletions toolchain/templates/computer/phoenix.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/usr/bin/env bash
#>
#> This file is part of the ./mfc.sh run subsystem. Expressions enclosed in
#> curly braces are expanded and evaluated using Python's eval() function and
#> data from ./mfc.sh run. The resulting file is submitted to the queue system.
#>

#>
#> SLURM job directives. The --gres line requests tasks_per_node V100 GPUs
#> when gpu is truthy and 0 otherwise. Standard output is written to a file
#> named after the job with an ".out" suffix.
#>
#SBATCH --job-name="{name}"
#SBATCH --account={account}
#SBATCH --partition={partition}
#SBATCH --nodes={nodes}
#SBATCH --ntasks-per-node={tasks_per_node}
#SBATCH --gres=gpu:V100:{tasks_per_node if gpu else 0}
#SBATCH --mem-per-gpu=16G
#SBATCH --output="{name}.out"
#SBATCH --time={walltime}
#SBATCH --mail-user={email}
#SBATCH --mail-type="BEGIN, END, FAIL"


#>
#> Source the load command of mfc.sh with "-c p" and a mode of "g" when gpu
#> is truthy, "c" otherwise. NOTE(review): presumably "p" selects this
#> machine's module set and g/c its GPU/CPU variant -- confirm against mfc.sh.
#>
. ./mfc.sh load -c p -m {'g' if gpu else 'c'}


#>
#> The MFC prologue sets up the environment required to run MFC prior to
#> execution and starts the timer.
#>
{MFC::PROLOGUE}


#>
#> Iterate over all MFC binaries (as specified through --targets) and execute
#> them, one by one, with profiling enabled if requested.
#>
for binpath in {MFC::BINARIES}; do

echo -e ":) Running $binpath:"
echo ""

#> Launch nodes times tasks_per_node processes in total.
mpirun \
-np {nodes*tasks_per_node} \
{MFC::PROFILER} "$binpath"

echo ""

done


#>
#> The MFC epilogue stops the timer and prints the execution summary. It also
#> performs some cleanup and housekeeping tasks before exiting.
#>
{MFC::EPILOGUE}
50 changes: 50 additions & 0 deletions toolchain/templates/computer/summit.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env bash
#>
#> This file is part of the ./mfc.sh run subsystem. Expressions enclosed in
#> curly braces are expanded and evaluated using Python's eval() function and
#> data from ./mfc.sh run. The resulting file is submitted to the queue system.
#>
#> LSF job directives follow. The -W value is walltime with its last three
#> characters removed. NOTE(review): presumably this strips a trailing ":SS"
#> seconds field -- confirm the walltime format.
#>
#BSUB -J {name}
#BSUB -nnodes {nodes}
#BSUB -N
#BSUB -P {account}
#BSUB -W {walltime[:-3]}


#>
#> Source the load command of mfc.sh with "-c s" and a mode of "g" when gpu
#> is truthy, "c" otherwise. NOTE(review): presumably "s" selects this
#> machine's module set and g/c its GPU/CPU variant -- confirm against mfc.sh.
#>
. ./mfc.sh load -c s -m {'g' if gpu else 'c'}


#>
#> The MFC prologue sets up the environment required to run MFC prior to
#> execution and starts the timer.
#>
{MFC::PROLOGUE}


#>
#> Iterate over all MFC binaries (as specified through --targets) and execute
#> them, one by one, with profiling enabled if requested.
#>
for binpath in {MFC::BINARIES}; do

echo -e ":) Running $binpath:"
echo ""

#> Request tasks_per_node times nodes resource sets, each with one CPU and
#> one task, plus one GPU (and the "-gpu" smpiargs) only when gpu is truthy.
jsrun \
{'--smpiargs="-gpu"' if gpu else ''} \
--nrs {tasks_per_node*nodes} \
--cpu_per_rs 1 \
--gpu_per_rs {1 if gpu else 0} \
--tasks_per_rs 1 \
{MFC::PROFILER} "$binpath"

echo ""

done


#>
#> The MFC epilogue stops the timer and prints the execution summary. It also
#> performs some cleanup and housekeeping tasks before exiting.
#>
{MFC::EPILOGUE}
54 changes: 54 additions & 0 deletions toolchain/templates/generic/lsf.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env bash
#>
#> This file is part of the ./mfc.sh run subsystem. Expressions enclosed in
#> curly braces are expanded and evaluated using Python's eval() function and
#> data from ./mfc.sh run. The resulting file is submitted to the queue system.
#>
#> LSF job directives follow. The -W value is walltime with its last three
#> characters removed. NOTE(review): presumably this strips a trailing ":SS"
#> seconds field -- confirm the walltime format.
#>
#BSUB -J {name}
#BSUB -nnodes {nodes}
#BSUB -N
#BSUB -P {account}
#BSUB -W {walltime[:-3]}


#>
#> Load the same modules as the ones currently loaded in the login shell. These
#> are usually the ones used to compile MFC.
#>
{MFC::MODULES}


#>
#> The MFC prologue sets up the environment required to run MFC prior to
#> execution and starts the timer.
#>
{MFC::PROLOGUE}


#>
#> Iterate over all MFC binaries (as specified through --targets) and execute
#> them, one by one, with profiling enabled if requested.
#>
for binpath in {MFC::BINARIES}; do

echo -e ":) Running $binpath:"
echo ""

#> Request tasks_per_node times nodes resource sets, each with one CPU and
#> one task, plus one GPU (and the "-gpu" smpiargs) only when gpu is truthy.
jsrun \
{'--smpiargs="-gpu"' if gpu else ''} \
--nrs {tasks_per_node*nodes} \
--cpu_per_rs 1 \
--gpu_per_rs {1 if gpu else 0} \
--tasks_per_rs 1 \
{MFC::PROFILER} "$binpath"

echo ""

done


#>
#> The MFC epilogue stops the timer and prints the execution summary. It also
#> performs some cleanup and housekeeping tasks before exiting.
#>
{MFC::EPILOGUE}
Loading

0 comments on commit 8544f36

Please sign in to comment.