diff --git a/README.md b/README.md
index 972be9a..9fe7c07 100644
--- a/README.md
+++ b/README.md
@@ -25,10 +25,12 @@ properties in this file are:
 |-------------------|----------------------------------------------------------------------------------------------------------------|
 |partition | Which Slurm partition should the job run in? |
 |account | What account name to run under |
+| environment | List of additional environment variables to add to the job |
 | gpus_per_node | On GPU partitions how many GPUs to allocate per node |
 | gres | SLURM Generic RESources requests |
 | mem | Amount of memory to allocate to CPU jobs |
 | modules | List of modules to load before starting job |
+| nodes | Number of nodes to request from SLURM |
 | time | Max CPU time job may run |
 | sbatch-script-file | Name of batch file to be produced. Leave blank to have service generate a script file name based on the run ID |
diff --git a/mlflow_slurm/templates/sbatch_template.sh b/mlflow_slurm/templates/sbatch_template.sh
index 87df098..f4171ed 100644
--- a/mlflow_slurm/templates/sbatch_template.sh
+++ b/mlflow_slurm/templates/sbatch_template.sh
@@ -19,6 +19,9 @@
 {% if config.time %}
 #SBATCH --time={{ config.time }}
 {% endif %}
+{% if config.nodes %}
+#SBATCH --nodes={{ config.nodes }}
+{% endif %}
 module reset # drop modules and explicitly load the ones needed
              # (good job metadata and reproducibility)
 # $WORK and $SCRATCH are now set
@@ -26,6 +29,15 @@ module reset # drop modules and explicitly load the ones needed
 module load {{ module }}
 {% endfor %}
 module list # job documentation and metadata
+
+{% for env in config.environment %}
+export {{ env }}
+{% endfor %}
+
 export MLFLOW_RUN_ID={{ run_id }}
 echo "job is starting on `hostname`"
+{% if config.nodes %}
+srun --export=ALL /bin/bash -c '{{ command }}'
+{% else %}
 {{ command }}
+{% endif %}
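
For context, here is a minimal sketch of a backend-config file that exercises the two new properties (`environment` and `nodes`) alongside the existing ones. The property names come from the README table above; the file name `slurm_config.json` and all values are illustrative assumptions, not part of this change.

```json
{
  "partition": "gpu",
  "account": "my-account",
  "gpus_per_node": 4,
  "mem": "32g",
  "modules": ["anaconda3_gpu"],
  "time": "01:00:00",
  "environment": ["NCCL_DEBUG=INFO", "OMP_NUM_THREADS=8"],
  "nodes": 2
}
```

With a config like this, the template renders one `export` line per `environment` entry ahead of `MLFLOW_RUN_ID`, and because `nodes` is set it emits `#SBATCH --nodes=2` and launches the project command via `srun --export=ALL /bin/bash -c '...'`; without `nodes`, the command runs directly on the batch node as before.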