Skip to content

Commit

Permalink
Batch files per computer (MFlowCode#240 & MFlowCode#287)
Browse files: browse the repository at this point in the history
  • Loading branch information
henryleberre committed Jan 9, 2024
1 parent 15bd177 commit 5083b0a
Show file tree
Hide file tree
Showing 13 changed files with 350 additions and 369 deletions.
140 changes: 140 additions & 0 deletions diff
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
diff --git a/toolchain/mfc/args.py b/toolchain/mfc/args.py
index 8b2d48e..06add8b 100644
--- a/toolchain/mfc/args.py
+++ b/toolchain/mfc/args.py
@@ -104,27 +104,28 @@ started, run ./mfc.sh build -h.""",
engines = [ e.slug for e in ENGINES ]

add_common_arguments(run)
- run.add_argument("input", metavar="INPUT", type=str, help="Input file to run.")
- run.add_argument("arguments", metavar="ARGUMENTS", nargs="*", type=str, default=[], help="Additional arguments to pass to the case file.")
- run.add_argument("-e", "--engine", choices=engines, type=str, default=engines[0], help="Job execution/submission engine choice.")
+ run.add_argument("input", metavar="INPUT", type=str, help="Input file to run.")
+ run.add_argument("arguments", metavar="ARGUMENTS", nargs="*", type=str, default=[], help="Additional arguments to pass to the case file.")
+ run.add_argument("-e", "--engine", choices=engines, type=str, default=engines[0], help="Job execution/submission engine choice.")
run.add_argument("--output-summary", type=str, default=None, help="(Interactive) Output a YAML summary file.")
- run.add_argument("-p", "--partition", metavar="PARTITION", type=str, default="", help="(Batch) Partition for job submission.")
- run.add_argument("-N", "--nodes", metavar="NODES", type=int, default=1, help="(Batch) Number of nodes.")
- run.add_argument("-n", "--tasks-per-node", metavar="TASKS", type=int, default=1, help="Number of tasks per node.")
- run.add_argument("-w", "--walltime", metavar="WALLTIME", type=str, default="01:00:00", help="(Batch) Walltime.")
- run.add_argument("-a", "--account", metavar="ACCOUNT", type=str, default="", help="(Batch) Account to charge.")
- run.add_argument("-@", "--email", metavar="EMAIL", type=str, default="", help="(Batch) Email for job notification.")
- run.add_argument("-#", "--name", metavar="NAME", type=str, default="MFC", help="(Batch) Job name.")
- run.add_argument("-b", "--binary", choices=binaries, type=str, default=None, help="(Interactive) Override MPI execution binary")
- run.add_argument("-s", "--scratch", action="store_true", default=False, help="Build from scratch.")
- run.add_argument("--ncu", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Compute.")
- run.add_argument("--nsys", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Systems.")
- run.add_argument( "--dry-run", action="store_true", default=False, help="(Batch) Run without submitting batch file.")
- run.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
- run.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.")
- run.add_argument("--wait", action="store_true", default=False, help="(Batch) Wait for the job to finish.")
- run.add_argument("-f", "--flags", metavar="FLAGS", dest="--", nargs=argparse.REMAINDER, type=str, default=[], help="(Interactive) Arguments to forward to the MPI invocation.")
- run.add_argument("-c", "--computer", metavar="COMPUTER", type=str, default=None, help=f"(Batch) Path to a custom submission file template or one of {format_list_to_string(list(get_baked_templates().keys()))}.")
+ run.add_argument("-p", "--partition", metavar="PARTITION", type=str, default="", help="(Batch) Partition for job submission.")
+ run.add_argument("-q", "--quality_of_service", metavar="QOS", type=str, default="", help="(Batch) Quality of Service for job submission.")
+ run.add_argument("-N", "--nodes", metavar="NODES", type=int, default=1, help="(Batch) Number of nodes.")
+ run.add_argument("-n", "--tasks-per-node", metavar="TASKS", type=int, default=1, help="Number of tasks per node.")
+ run.add_argument("-w", "--walltime", metavar="WALLTIME", type=str, default="01:00:00", help="(Batch) Walltime.")
+ run.add_argument("-a", "--account", metavar="ACCOUNT", type=str, default="", help="(Batch) Account to charge.")
+ run.add_argument("-@", "--email", metavar="EMAIL", type=str, default="", help="(Batch) Email for job notification.")
+ run.add_argument("-#", "--name", metavar="NAME", type=str, default="MFC", help="(Batch) Job name.")
+ run.add_argument("-b", "--binary", choices=binaries, type=str, default=None, help="(Interactive) Override MPI execution binary")
+ run.add_argument("-s", "--scratch", action="store_true", default=False, help="Build from scratch.")
+ run.add_argument("--ncu", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Compute.")
+ run.add_argument("--nsys", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Systems.")
+ run.add_argument( "--dry-run", action="store_true", default=False, help="(Batch) Run without submitting batch file.")
+ run.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
+ run.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.")
+ run.add_argument("--wait", action="store_true", default=False, help="(Batch) Wait for the job to finish.")
+ run.add_argument("-f", "--flags", metavar="FLAGS", dest="--", nargs=argparse.REMAINDER, type=str, default=[], help="(Interactive) Arguments to forward to the MPI invocation.")
+ run.add_argument("-c", "--computer", metavar="COMPUTER", type=str, default=None, help=f"(Batch) Path to a custom submission file template or one of {format_list_to_string(list(get_baked_templates().keys()))}.")

# === BENCH ===
add_common_arguments(bench, "t")
diff --git a/toolchain/mfc/run/engines.py b/toolchain/mfc/run/engines.py
index c095e9b..b9830fb 100644
--- a/toolchain/mfc/run/engines.py
+++ b/toolchain/mfc/run/engines.py
@@ -264,15 +264,15 @@ exit $code
s = re.sub(r"^#>.*\n", "", s, flags=re.MULTILINE)
s = re.sub(r"^\n{2,}", "\n", s, flags=re.MULTILINE)

- # Evaluate expressions of the form "{expression}"
- for match in re.findall(r"{[^\{]+}", s, flags=re.MULTILINE):
- repl = self.__evaluate_expression(match[1:-1])
+ # Evaluate expressions of the form "{{{expression}}}"
+ for match in re.findall(r"{{{[\s\S]+?}}}", s, flags=re.MULTILINE):
+ repl = self.__evaluate_expression(match[3:-3])

if repl is not None:
s = s.replace(match, repl)
else:
# If not specified, then remove the line it appears on
- s = re.sub(rf"^.*{match}.*$\n", "", s, flags=re.MULTILINE)
+ s = re.sub(rf"^.*{re.escape(match)}[\s\S]*?$\n", "", s, flags=re.MULTILINE)

cons.print(f"> > [bold yellow]Warning:[/bold yellow] [magenta]{match[1:-1]}[/magenta] was not specified. Thus, any line it appears on will be discarded.")

diff --git a/toolchain/mfc/run/queues.py b/toolchain/mfc/run/queues.py
index 2248945..124716a 100644
--- a/toolchain/mfc/run/queues.py
+++ b/toolchain/mfc/run/queues.py
@@ -21,11 +21,11 @@ class QueueSystem:
self.name = name

def get_template(self) -> str:
- if computer := ARG("computer") is None:
+ if (computer := ARG("computer")) is None:
raise common.MFCException(f"{self.name}: --computer is required.")

baked = get_baked_templates()
- if content := baked.get(self.name.lower()):
+ if (content := baked.get(computer)) is not None:
cons.print(f"Using baked-in template for [magenta]{self.name}[/magenta].")
return content

diff --git a/toolchain/templates/phoenix.sh b/toolchain/templates/phoenix.sh
index b77d4d9..c180ab4 100644
--- a/toolchain/templates/phoenix.sh
+++ b/toolchain/templates/phoenix.sh
@@ -5,16 +5,19 @@
#> data from ./mfc.sh run. The resulting file is submitted to the queue system.
#>

-#SBATCH --job-name="{name}"
-#SBATCH --account={account}
-#SBATCH --partition={partition}
-#SBATCH --nodes={nodes}
-#SBATCH --ntasks-per-node={tasks_per_node}
-#SBATCH --gres=gpu:V100:{tasks_per_node if gpu else 0}
-#SBATCH --mem-per-gpu=16G
-#SBATCH --output="{name}.out"
-#SBATCH --time={walltime}
-#SBATCH --mail-user={email}
+#SBATCH --job-name="{{{name}}}"
+#SBATCH --account={{{account}}}
+#SBATCH --partition={{{partition}}}
+#SBATCH --qos={{{quality_of_service}}}
+#SBATCH --nodes={{{nodes}}}
+#SBATCH --ntasks-per-node={{{tasks_per_node}}}
+{{{f'''\
+#SBATCH --gres=gpu:V100:{tasks_per_node}
+#SBATCH --mem-per-gpu=16G\
+''' if gpu else ''}}}
+#SBATCH --output="{{{name}}}.out"
+#SBATCH --time={{{walltime}}}
+#SBATCH --mail-user={{{email}}}
#SBATCH --mail-type="BEGIN, END, FAIL"


@@ -40,8 +43,8 @@ for binpath in {MFC::BINARIES}; do

echo -e ":) Running $binpath:\n"

- mpirun \
- -np {nodes*tasks_per_node} \
+ mpirun \
+ -np {{{nodes*tasks_per_node}}} \
{MFC::PROFILER} "$binpath"

echo
18 changes: 18 additions & 0 deletions mauro_patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
diff --git a/toolchain/mfc/run/engines.py b/toolchain/mfc/run/engines.py
index b8c45f7..53f9f9e 100644
--- a/toolchain/mfc/run/engines.py
+++ b/toolchain/mfc/run/engines.py
@@ -307,11 +307,11 @@ exit $code
cons.print("> Writing batch file...")
file_write(filepath, content)

- def __execute_batch_file(self, system: queues.QueueSystem):
+ def __execute_batch_file(self, queue: queues.QueueSystem):
# We CD to the case directory before executing the batch file so that
# any files the queue system generates (like .err and .out) are created
# in the correct directory.
- cmd = system.gen_submit_cmd(self.__get_batch_filename())
+ cmd = queue.gen_submit_cmd(self.__get_batch_filename())

if system(cmd, cwd=self.__get_batch_dirpath()) != 0:
raise MFCException(f"Submitting batch file for {system.name} failed. It can be found here: {self.__get_batch_filepath()}. Please check the file for errors.")
6 changes: 6 additions & 0 deletions notes
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

2D_shockbubble

4522792 cpu-small hberre3- hberre3 PD 0:00 1 (AssocGrpBillingMinutes)
4522791 gpu-v100 hberre3- hberre3 PD 0:00 1 (AssocGrpBillingMinutes)

59 changes: 34 additions & 25 deletions toolchain/mfc/args.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import re, os.path, argparse, dataclasses

from .build import TARGETS, DEFAULT_TARGETS, DEPENDENCY_TARGETS
from .common import MFCException, format_list_to_string
from .test.cases import generate_cases
from .run.queues import get_baked_templates
from .build import TARGETS, DEFAULT_TARGETS, DEPENDENCY_TARGETS
from .common import MFCException, format_list_to_string
from .test.cases import generate_cases
from .run.engines import ENGINES
from .run.mpi_bins import BINARIES

# pylint: disable=too-many-locals, too-many-statements
# pylint: disable=too-many-locals, too-many-branches, too-many-statements
def parse(config):
parser = argparse.ArgumentParser(
prog="./mfc.sh",
Expand Down Expand Up @@ -103,26 +104,28 @@ def add_common_arguments(p, mask = None):
engines = [ e.slug for e in ENGINES ]

add_common_arguments(run)
run.add_argument("input", metavar="INPUT", type=str, help="Input file to run.")
run.add_argument("arguments", metavar="ARGUMENTS", nargs="*", type=str, default=[], help="Additional arguments to pass to the case file.")
run.add_argument("-e", "--engine", choices=engines, type=str, default=engines[0], help="Job execution/submission engine choice.")
run.add_argument("input", metavar="INPUT", type=str, help="Input file to run.")
run.add_argument("arguments", metavar="ARGUMENTS", nargs="*", type=str, default=[], help="Additional arguments to pass to the case file.")
run.add_argument("-e", "--engine", choices=engines, type=str, default=engines[0], help="Job execution/submission engine choice.")
run.add_argument("--output-summary", type=str, default=None, help="(Interactive) Output a YAML summary file.")
run.add_argument("-p", "--partition", metavar="PARTITION", type=str, default="", help="(Batch) Partition for job submission.")
run.add_argument("-N", "--nodes", metavar="NODES", type=int, default=1, help="(Batch) Number of nodes.")
run.add_argument("-n", "--tasks-per-node", metavar="TASKS", type=int, default=1, help="Number of tasks per node.")
run.add_argument("-w", "--walltime", metavar="WALLTIME", type=str, default="01:00:00", help="(Batch) Walltime.")
run.add_argument("-a", "--account", metavar="ACCOUNT", type=str, default="", help="(Batch) Account to charge.")
run.add_argument("-@", "--email", metavar="EMAIL", type=str, default="", help="(Batch) Email for job notification.")
run.add_argument("-#", "--name", metavar="NAME", type=str, default="MFC", help="(Batch) Job name.")
run.add_argument("-b", "--binary", choices=binaries, type=str, default=None, help="(Interactive) Override MPI execution binary")
run.add_argument("-s", "--scratch", action="store_true", default=False, help="Build from scratch.")
run.add_argument("--ncu", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Compute.")
run.add_argument("--nsys", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Systems.")
run.add_argument( "--dry-run", action="store_true", default=False, help="(Batch) Run without submitting batch file.")
run.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
run.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.")
run.add_argument("--wait", action="store_true", default=False, help="(Batch) Wait for the job to finish.")
run.add_argument("-f", "--flags", metavar="FLAGS", dest="--", nargs=argparse.REMAINDER, type=str, default=[], help="(Interactive) Arguments to forward to the MPI invocation.")
run.add_argument("-p", "--partition", metavar="PARTITION", type=str, default="", help="(Batch) Partition for job submission.")
run.add_argument("-q", "--quality_of_service", metavar="QOS", type=str, default="", help="(Batch) Quality of Service for job submission.")
run.add_argument("-N", "--nodes", metavar="NODES", type=int, default=1, help="(Batch) Number of nodes.")
run.add_argument("-n", "--tasks-per-node", metavar="TASKS", type=int, default=1, help="Number of tasks per node.")
run.add_argument("-w", "--walltime", metavar="WALLTIME", type=str, default="01:00:00", help="(Batch) Walltime.")
run.add_argument("-a", "--account", metavar="ACCOUNT", type=str, default="", help="(Batch) Account to charge.")
run.add_argument("-@", "--email", metavar="EMAIL", type=str, default="", help="(Batch) Email for job notification.")
run.add_argument("-#", "--name", metavar="NAME", type=str, default="MFC", help="(Batch) Job name.")
run.add_argument("-b", "--binary", choices=binaries, type=str, default=None, help="(Interactive) Override MPI execution binary")
run.add_argument("-s", "--scratch", action="store_true", default=False, help="Build from scratch.")
run.add_argument("--ncu", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Compute.")
run.add_argument("--nsys", nargs=argparse.REMAINDER, type=str, help="Profile with NVIDIA Nsight Systems.")
run.add_argument( "--dry-run", action="store_true", default=False, help="(Batch) Run without submitting batch file.")
run.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
run.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.")
run.add_argument("--wait", action="store_true", default=False, help="(Batch) Wait for the job to finish.")
run.add_argument("-f", "--flags", metavar="FLAGS", dest="--", nargs=argparse.REMAINDER, type=str, default=[], help="(Interactive) Arguments to forward to the MPI invocation.")
run.add_argument("-c", "--computer", metavar="COMPUTER", type=str, default=None, help=f"(Batch) Path to a custom submission file template or one of {format_list_to_string(list(get_baked_templates().keys()))}.")

# === BENCH ===
add_common_arguments(bench, "t")
Expand Down Expand Up @@ -153,10 +156,16 @@ def add_common_arguments(p, mask = None):
# "Slugify" the name of the job
args["name"] = re.sub(r'[\W_]+', '-', args["name"])

# build's --case-optimization and --input depend on each other
# We need to check for some invalid combinations of arguments because of
# the limitations of argparse.
if args["command"] == "build":
if (args["input"] is not None) ^ args["case_optimization"] :
raise MFCException("./mfc.sh build's --case-optimization requires --input")
raise MFCException("./mfc.sh build's --case-optimization and --input must be used together.")
if args["command"] == "run" and args["engine"] == "batch":
if args["computer"] is None:
raise MFCException("./mfc.sh run's --computer is required when --engine=batch")
if args["binary"] is not None:
raise MFCException("./mfc.sh run's --binary is not allowed when --engine=batch")

# Input files to absolute paths
for e in ["input", "input1", "input2"]:
Expand Down
Loading

0 comments on commit 5083b0a

Please sign in to comment.