Skip to content

Commit

Permalink
Batch files per computer (MFlowCode#240 & MFlowCode#287)
Browse files Browse the repository at this point in the history
  • Loading branch information
henryleberre committed Jan 9, 2024
1 parent 309ab97 commit 43ec371
Show file tree
Hide file tree
Showing 10 changed files with 194 additions and 377 deletions.
59 changes: 34 additions & 25 deletions toolchain/mfc/args.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import re, os.path, argparse, dataclasses

from .build import TARGETS, DEFAULT_TARGETS, DEPENDENCY_TARGETS
from .common import MFCException, format_list_to_string
from .test.cases import generate_cases
from .run.queues import get_baked_templates
from .build import TARGETS, DEFAULT_TARGETS, DEPENDENCY_TARGETS
from .common import MFCException, format_list_to_string
from .test.cases import generate_cases
from .run.engines import ENGINES
from .run.mpi_bins import BINARIES

# pylint: disable=too-many-locals, too-many-statements
# pylint: disable=too-many-locals, too-many-branches, too-many-statements
def parse(config):
parser = argparse.ArgumentParser(
prog="./mfc.sh",
Expand Down Expand Up @@ -103,26 +104,28 @@ def add_common_arguments(p, mask = None):
engines = [ e.slug for e in ENGINES ]

add_common_arguments(run)
run.add_argument("input", metavar="INPUT", type=str, help="Input file to run.")
run.add_argument("arguments", metavar="ARGUMENTS", nargs="*", type=str, default=[], help="Additional arguments to pass to the case file.")
run.add_argument("-e", "--engine", choices=engines, type=str, default=engines[0], help="Job execution/submission engine choice.")
run.add_argument("input", metavar="INPUT", type=str, help="Input file to run.")
run.add_argument("arguments", metavar="ARGUMENTS", nargs="*", type=str, default=[], help="Additional arguments to pass to the case file.")
run.add_argument("-e", "--engine", choices=engines, type=str, default=engines[0], help="Job execution/submission engine choice.")
run.add_argument("--output-summary", type=str, default=None, help="(Interactive) Output a YAML summary file.")
run.add_argument("-p", "--partition", metavar="PARTITION", type=str, default="", help="(Batch) Partition for job submission.")
run.add_argument("-N", "--nodes", metavar="NODES", type=int, default=1, help="(Batch) Number of nodes.")
run.add_argument("-n", "--tasks-per-node", metavar="TASKS", type=int, default=1, help="Number of tasks per node.")
run.add_argument("-w", "--walltime", metavar="WALLTIME", type=str, default="01:00:00", help="(Batch) Walltime.")
run.add_argument("-a", "--account", metavar="ACCOUNT", type=str, default="", help="(Batch) Account to charge.")
run.add_argument("-@", "--email", metavar="EMAIL", type=str, default="", help="(Batch) Email for job notification.")
run.add_argument("-#", "--name", metavar="NAME", type=str, default="MFC", help="(Batch) Job name.")
run.add_argument("-b", "--binary", choices=binaries, type=str, default=None, help="(Interactive) Override MPI execution binary")
run.add_argument("-s", "--scratch", action="store_true", default=False, help="Build from scratch.")
run.add_argument("--ncu", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Compute.")
run.add_argument("--nsys", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Systems.")
run.add_argument( "--dry-run", action="store_true", default=False, help="(Batch) Run without submitting batch file.")
run.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
run.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.")
run.add_argument("--wait", action="store_true", default=False, help="(Batch) Wait for the job to finish.")
run.add_argument("-f", "--flags", metavar="FLAGS", dest="--", nargs=argparse.REMAINDER, type=str, default=[], help="(Interactive) Arguments to forward to the MPI invocation.")
run.add_argument("-p", "--partition", metavar="PARTITION", type=str, default="", help="(Batch) Partition for job submission.")
run.add_argument("-q", "--quality_of_service", metavar="QOS", type=str, default="", help="(Batch) Quality of Service for job submission.")
run.add_argument("-N", "--nodes", metavar="NODES", type=int, default=1, help="(Batch) Number of nodes.")
run.add_argument("-n", "--tasks-per-node", metavar="TASKS", type=int, default=1, help="Number of tasks per node.")
run.add_argument("-w", "--walltime", metavar="WALLTIME", type=str, default="01:00:00", help="(Batch) Walltime.")
run.add_argument("-a", "--account", metavar="ACCOUNT", type=str, default="", help="(Batch) Account to charge.")
run.add_argument("-@", "--email", metavar="EMAIL", type=str, default="", help="(Batch) Email for job notification.")
run.add_argument("-#", "--name", metavar="NAME", type=str, default="MFC", help="(Batch) Job name.")
run.add_argument("-b", "--binary", choices=binaries, type=str, default=None, help="(Interactive) Override MPI execution binary")
run.add_argument("-s", "--scratch", action="store_true", default=False, help="Build from scratch.")
run.add_argument("--ncu", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Compute.")
run.add_argument("--nsys", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Systems.")
run.add_argument( "--dry-run", action="store_true", default=False, help="(Batch) Run without submitting batch file.")
run.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
run.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.")
run.add_argument("--wait", action="store_true", default=False, help="(Batch) Wait for the job to finish.")
run.add_argument("-f", "--flags", metavar="FLAGS", dest="--", nargs=argparse.REMAINDER, type=str, default=[], help="(Interactive) Arguments to forward to the MPI invocation.")
run.add_argument("-c", "--computer", metavar="COMPUTER", type=str, default=None, help=f"(Batch) Path to a custom submission file template or one of {format_list_to_string(list(get_baked_templates().keys()))}.")

# === BENCH ===
add_common_arguments(bench, "t")
Expand Down Expand Up @@ -153,10 +156,16 @@ def add_common_arguments(p, mask = None):
# "Slugify" the name of the job
args["name"] = re.sub(r'[\W_]+', '-', args["name"])

# build's --case-optimization and --input depend on each other
# We need to check for some invalid combinations of arguments because of
# the limitations of argparse.
if args["command"] == "build":
if (args["input"] is not None) ^ args["case_optimization"] :
raise MFCException("./mfc.sh build's --case-optimization requires --input")
raise MFCException("./mfc.sh build's --case-optimization and --input must be used together.")
if args["command"] == "run" and args["engine"] == "batch":
if args["computer"] is None:
raise MFCException("./mfc.sh run's --computer is required when --engine=batch")
if args["binary"] is not None:
raise MFCException("./mfc.sh run's --binary is not allowed when --engine=batch")

# Input files to absolute paths
for e in ["input", "input1", "input2"]:
Expand Down
9 changes: 1 addition & 8 deletions toolchain/mfc/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
MFC_ROOTDIR = normpath(f"{dirname(realpath(__file__))}/../..")
MFC_TESTDIR = abspath(f"{MFC_ROOTDIR}/tests")
MFC_SUBDIR = abspath(f"{MFC_ROOTDIR}/build")
MFC_TEMPLATEDIR = abspath(f"{MFC_ROOTDIR}/toolchain/templates")
MFC_LOCK_FILEPATH = abspath(f"{MFC_SUBDIR}/lock.yaml")
MFC_BENCH_FILEPATH = abspath(f"{MFC_ROOTDIR}/toolchain/bench.yaml")

Expand Down Expand Up @@ -179,14 +180,6 @@ def does_system_use_modules() -> bool:
return does_command_exist("module")


def get_loaded_modules() -> typing.List[str]:
    """
    Return the names of the loaded modules.

    Runs ``module -t list`` through the shell and keeps every line of its
    output that does not contain a space character.
    """

    listing = subprocess.getoutput("module -t list")

    return [entry for entry in listing.splitlines() if ' ' not in entry]


def is_number(x: str) -> bool:
if x is None:
return False
Expand Down
63 changes: 15 additions & 48 deletions toolchain/mfc/run/engines.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from ..printer import cons
from ..build import MFCTarget, SYSCHECK, get_targets
from ..common import MFCException, does_command_exist, isspace, system
from ..common import format_list_to_string, does_system_use_modules
from ..common import get_loaded_modules, file_write
from ..common import format_list_to_string
from ..common import file_write
from ..run import queues, mpi_bins
from ..run.input import MFCInputFile

Expand Down Expand Up @@ -45,9 +45,6 @@ def init(self, _input: MFCInputFile) -> None:
def _init(self) -> None:
pass

def get_args(self) -> typing.List[str]:
raise MFCException(f"MFCEngine::get_args: not implemented for {self.name}.")

def run(self, targets: typing.List[MFCTarget]) -> None:
raise MFCException(f"MFCEngine::run: not implemented for {self.name}.")

Expand All @@ -71,14 +68,6 @@ def _init(self) -> None:
# If using MPI, we don't know yet whether this engine works
self.bKnowWorks = not ARG("mpi")

def get_args(self) -> str:
return f"""\
Nodes (-N) {ARG('nodes')}
Tasks (/node) (-n) {ARG('tasks_per_node')}
MPI Binary (-b) {self.mpibin.bin}\
"""


def __get_exec_cmd(self, target: MFCTarget) -> typing.List[str]:
cmd = []
if ARG("mpi"):
Expand Down Expand Up @@ -176,16 +165,6 @@ class BatchEngine(Engine):
def __init__(self) -> None:
super().__init__("Batch", "batch")

def get_args(self) -> str:
return f"""\
Nodes (-N) {ARG('nodes')}
Tasks (/node) (-n) {ARG('tasks_per_node')}
Walltime (-w) {ARG("walltime")}
Partition (-p) {ARG("partition")}
Account (-a) {ARG("account")}
Email (-@) {ARG("email")}
"""

def run(self, targets) -> None:
qsystem = queues.get_system()
cons.print(f"Detected the [bold magenta]{qsystem.name}[/bold magenta] queue system.")
Expand Down Expand Up @@ -219,16 +198,6 @@ def __get_batch_filepath(self):
]))

def __generate_prologue(self, qsystem: queues.QueueSystem) -> str:
modules = f""

if does_system_use_modules():
modules = f"""\
printf ":) Loading modules...\\n"
module purge
module load {' '.join(get_loaded_modules())}
"""

return f"""\
TABLE_FORMAT_LINE="| - %-14s %-35s - %-14s %-35s |\\n"
TABLE_HEADER="+-----------------------------------------------------------------------------------------------------------+ \\n"
Expand All @@ -248,8 +217,6 @@ def __generate_prologue(self, qsystem: queues.QueueSystem) -> str:
printf "$TABLE_CONTENT\\n"
printf "$TABLE_FOOTER\\n"
{modules}
cd "{self.input.case_dirpath}"
t_start=$(date +%s)
Expand Down Expand Up @@ -283,29 +250,29 @@ def __evaluate_expression(self, expr: str) -> str:
def __batch_evaluate(self, s: str, qsystem: queues.QueueSystem, targets):
targets = get_targets(targets)

replace_list = [
("{MFC::PROLOGUE}", self.__generate_prologue(qsystem)),
("{MFC::PROFILER}", ' '.join(profiler_prepend())),
("{MFC::EPILOGUE}", self.__generate_epilogue()),
("{MFC::BINARIES}", ' '.join([f"'{target.get_install_binpath()}'" for target in targets])),
]
replace_list = {
"PROLOGUE": self.__generate_prologue(qsystem),
"PROFILER": ' '.join(profiler_prepend()),
"EPILOGUE": self.__generate_epilogue(),
"BINARIES": ' '.join([f"'{target.get_install_binpath()}'" for target in targets]),
}

for (key, value) in replace_list:
s = s.replace(key, value)
for (key, value) in replace_list.items():
s = s.replace("{{{" + key + "}}}", value)

# Remove "#>" comments & redundant newlines
s = re.sub(r"^#>.*\n", "", s, flags=re.MULTILINE)
s = re.sub(r"^\n{2,}", "\n", s, flags=re.MULTILINE)

# Evaluate expressions of the form "{expression}"
for match in re.findall(r"{[^\{]+}", s, flags=re.MULTILINE):
repl = self.__evaluate_expression(match[1:-1])
# Evaluate expressions of the form "{{{expression}}}"
for match in re.findall(r"{{{[\s\S]+?}}}", s, flags=re.MULTILINE):
repl = self.__evaluate_expression(match[3:-3])

if repl is not None:
s = s.replace(match, repl)
else:
# If not specified, then remove the line it appears on
s = re.sub(rf"^.*{match}.*$\n", "", s, flags=re.MULTILINE)
s = re.sub(rf"^.*{re.escape(match)}[\s\S]*?$\n", "", s, flags=re.MULTILINE)

cons.print(f"> > [bold yellow]Warning:[/bold yellow] [magenta]{match[1:-1]}[/magenta] was not specified. Thus, any line it appears on will be discarded.")

Expand All @@ -315,7 +282,7 @@ def __create_batch_file(self, qsystem: queues.QueueSystem, targets: typing.List[
cons.print("> Generating batch file...")
filepath = self.__get_batch_filepath()
cons.print("> Evaluating template file...")
content = self.__batch_evaluate(qsystem.template, qsystem, targets)
content = self.__batch_evaluate(qsystem.get_template(), qsystem, targets)

cons.print("> Writing batch file...")
file_write(filepath, content)
Expand Down
41 changes: 32 additions & 9 deletions toolchain/mfc/run/queues.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,39 @@
import os, typing, dataclasses
import os, glob, typing, dataclasses

from mfc import common
from ..printer import cons
from ..state import ARG
from ..common import MFC_TEMPLATEDIR


def get_baked_templates() -> dict:
    """
    Load every batch-file template found directly inside ``MFC_TEMPLATEDIR``.

    Returns:
        A dictionary mapping each ``*.sh`` file's base name (extension
        removed) to the contents returned by ``common.file_read``. Entries
        are inserted in sorted path order.
    """

    # glob.glob() yields paths in arbitrary, filesystem-dependent order; sort
    # them so the key order (and anything that lists the keys) is reproducible.
    template_paths = sorted(glob.glob(os.path.join(MFC_TEMPLATEDIR, "*.sh")))

    return {
        os.path.splitext(os.path.basename(path))[0]: common.file_read(path)
        for path in template_paths
    }


@dataclasses.dataclass
class QueueSystem:
name: str
template: str
name: str

def __init__(self, name: str) -> None:
self.name = name

def get_template(self) -> str:
if (computer := ARG("computer")) is None:
raise common.MFCException(f"{self.name}: --computer is required.")

baked = get_baked_templates()
if (content := baked.get(computer)) is not None:
cons.print(f"Using baked-in template for [magenta]{self.name}[/magenta].")
return content

if os.path.isfile(computer):
cons.print(f"Using template from [magenta]{computer}[/magenta].")
return common.file_read(computer)

def __init__(self, name: str, filename: str) -> None:
self.name = name
self.template = common.file_read(os.sep.join(["toolchain", "templates", filename]))
raise common.MFCException(f"{self.name}: Failed to find a template for --computer '{computer}'.")

def is_active(self) -> bool:
raise common.MFCException("QueueSystem::is_active: not implemented.")
Expand All @@ -21,7 +44,7 @@ def gen_submit_cmd(self, filepath: str) -> typing.List[str]:

class PBSSystem(QueueSystem):
def __init__(self) -> None:
super().__init__("PBS", "pbs.sh")
super().__init__("PBS")

def is_active(self) -> bool:
return common.does_command_exist("qsub")
Expand All @@ -35,7 +58,7 @@ def gen_submit_cmd(self, filepath: str) -> typing.List[str]:

class LSFSystem(QueueSystem):
def __init__(self) -> None:
super().__init__("LSF", "lsf.sh")
super().__init__("LSF")

def is_active(self) -> bool:
return common.does_command_exist("bsub") and common.does_command_exist("bqueues")
Expand All @@ -51,7 +74,7 @@ def gen_submit_cmd(self, filepath: str) -> None:

class SLURMSystem(QueueSystem):
def __init__(self) -> None:
super().__init__("SLURM", "slurm.sh")
super().__init__("SLURM")

def is_active(self) -> bool:
return common.does_command_exist("sbatch")
Expand Down
10 changes: 0 additions & 10 deletions toolchain/mfc/run/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,6 @@ def run(targets = None):
engine = engines.get_engine(ARG("engine"))
engine.init(input_file)

cons.print(f"Configuration:")
cons.indent()
cons.print(f"""\
Input {ARG('input')}
Job Name (-#) {ARG('name')}
Engine (-e) {ARG('engine')}
{engine.get_args()}\
""")
cons.unindent()

validate_job_options()

for target in targets:
Expand Down
Loading

0 comments on commit 43ec371

Please sign in to comment.