Add slurm support #26

Merged
merged 5 commits into from
Dec 15, 2023
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
*.jl.*.cov
*.jl.cov
*.jl.mem
/Manifest.toml
Manifest.toml
/docs/build/
.vscode/
19 changes: 13 additions & 6 deletions Project.toml
@@ -7,6 +7,7 @@ version = "0.1.3"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
PackageExtensionCompat = "65ce6f38-6b18-4e1d-a461-8949797d7930"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -15,17 +16,23 @@ SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

[weakdeps]
ClusterManagers = "34f1f09b-3a8b-5176-ab39-66d58a4d544e"

[extensions]
SlurmExt = ["ClusterManagers"]

[compat]
DataFrames = "1"
Distributed = "^1.6"
Logging = "^1.6"
Pkg = "^1.6"
ProgressBars = "1"
Random = "^1.6"
SQLite = "1"
SafeTestsets = "0.0"
Serialization = "1.6"
Pkg = "1.6"
Distributed = "1.6"
Logging = "1.6"
Random = "1.6"
julia = "1.6"
Serialization = "^1.6"
julia = "^1.6"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
1 change: 1 addition & 0 deletions docs/make.jl
@@ -19,6 +19,7 @@ makedocs(;
"Distributed Execution" => "distributed.md",
"Data Store" => "store.md",
"Custom Snapshots" => "snapshots.md",
"Cluster Support" => "clusters.md",
"Public API" => "api.md"
],
)
106 changes: 106 additions & 0 deletions docs/src/clusters.md
@@ -0,0 +1,106 @@
# Clusters

This package provides basic support for running an experiment on an HPC cluster, using `ClusterManagers.jl` under the hood.

At the moment, only SLURM clusters are supported, but PRs adding support for other schedulers are welcome.

## SLURM

Normally when running on SLURM, one creates a bash script that tells the scheduler the resource requirements for a job. The following is an example:
```bash
#!/bin/bash

#SBATCH --nodes=2
#SBATCH --ntasks=2
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=1024
#SBATCH --time=00:30:00
#SBATCH -o hpc/output/test_job_%j.out
```

The function [`Experimenter.Cluster.create_slurm_template`](@ref) provides an easy way to create one of these bash scripts with everything you need to run.
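The same template can also be generated from within a Julia session; this short sketch passes the `job_logs_dir` keyword explicitly, which (per the function's docstring) defaults to `"hpc/logs"` and is shown here only for illustration:
```julia
using Experimenter

# Write a template SLURM script next to the current directory and
# create the log directory it references if it does not yet exist.
Experimenter.Cluster.create_slurm_template("myrun.sh"; job_logs_dir="hpc/logs")
```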

### Example

Let us take the following end-to-end example. Say that we have an experiment script at `my_experiment.jl` (contents below), which now initialises the cluster:
```julia
using Experimenter

config = Dict{Symbol,Any}(
    :N => IterableVariable([Int(1e6), Int(2e6), Int(3e6)]),
    :seed => IterableVariable([1234, 4321, 3467, 134234, 121]),
    :sigma => 0.0001)
experiment = Experiment(
    name="Test Experiment",
    include_file="run.jl",
    function_name="run_trial",
    configuration=deepcopy(config)
)

db = open_db("experiments.db")

# Init the cluster
Experimenter.Cluster.init()

@execute experiment db DistributedMode
```
Additionally, we have the file `run.jl` containing:
```julia
using Random
using Distributed
function run_trial(config::Dict{Symbol,Any}, trial_id)
    results = Dict{Symbol, Any}()
    sigma = config[:sigma]
    N = config[:N]
    seed = config[:seed]
    rng = Random.Xoshiro(seed)
    # Perform some calculation
    results[:distance] = sum(rand(rng) * sigma for _ in 1:N)
    results[:num_threads] = Threads.nthreads()
    results[:hostname] = gethostname()
    results[:pid] = Distributed.myid()
    # Must return a Dict{Symbol, Any}, with the data we want to save
    return results
end
```
We can now create a bash script to run our experiment. We create a template by running the following in the terminal (or adapt it for use in the REPL):
```bash
julia --project -e 'using Experimenter; Experimenter.Cluster.create_slurm_template("myrun.sh")'
```
We then modify the created `myrun.sh` file to the following:
```bash
#!/bin/bash

#SBATCH --ntasks=4
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=1024
#SBATCH --time=00:30:00
#SBATCH -o hpc/logs/job_%j.out

julia --project --threads=1 my_experiment.jl

# Optional: Remove the files created by ClusterManagers.jl
rm -fr julia-*.out

```

Once written, we execute this on the cluster via
```bash
sbatch myrun.sh
```

We can then open a Julia REPL (once the job has finished) to see the results:
```julia
using Experimenter
db = open_db("experiments.db")
trials = get_trials_by_name(db, "Test Experiment")

for (i, t) in enumerate(trials)
    hostname = t.results[:hostname]
    id = t.results[:pid]
    println("Trial $i ran on $hostname on worker $id")
end
```

Support for running on SLURM is based on [this gist](https://gist.github.com/JamieMair/0b1ffbd4ee424c173e6b42fe756e877a) available on GitHub. This gist also provides information on how to adjust the SLURM script to allow for one GPU to be allocated to each worker.
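As a rough illustration of the kind of change the gist describes, a GPU request can be added to the job script; note that the exact directives are an assumption here and depend on your SLURM version and site policy, so consult the gist and your cluster's documentation:
```bash
#!/bin/bash

#SBATCH --ntasks=4
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=1024
#SBATCH --time=00:30:00
#SBATCH -o hpc/logs/job_%j.out
# Hypothetical GPU request: one GPU per task (flag availability varies by SLURM version)
#SBATCH --gpus-per-task=1

julia --project --threads=1 my_experiment.jl
```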

2 changes: 2 additions & 0 deletions examples/slurm/.gitignore
@@ -0,0 +1,2 @@
experiments/
*.out
4 changes: 4 additions & 0 deletions examples/slurm/Project.toml
@@ -0,0 +1,4 @@
[deps]
ClusterManagers = "34f1f09b-3a8b-5176-ab39-66d58a4d544e"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
Experimenter = "6aee034a-9508-47b1-8e11-813cc29af79f"
9 changes: 9 additions & 0 deletions examples/slurm/check_results.jl
@@ -0,0 +1,9 @@
using Experimenter
db = open_db("experiments.db")
trials = get_trials_by_name(db, "Test Experiment")

for (i, t) in enumerate(trials)
    hostname = t.results[:hostname]
    id = t.results[:pid]
    println("Trial $i ran on $hostname on worker $id")
end
1 change: 1 addition & 0 deletions examples/slurm/hpc/logs/log.txt
@@ -0,0 +1 @@
This is a file to make sure this directory exists.
19 changes: 19 additions & 0 deletions examples/slurm/my_experiment.jl
@@ -0,0 +1,19 @@
using Experimenter

config = Dict{Symbol,Any}(
    :N => IterableVariable([Int(1e6), Int(2e6), Int(3e6)]),
    :seed => IterableVariable([1234, 4321, 3467, 134234, 121]),
    :sigma => 0.0001)
experiment = Experiment(
    name="Test Experiment",
    include_file="run.jl",
    function_name="run_trial",
    configuration=deepcopy(config)
)

db = open_db("experiments.db")

# Init the cluster
Experimenter.Cluster.init()

@execute experiment db DistributedMode
15 changes: 15 additions & 0 deletions examples/slurm/myrun.sh
@@ -0,0 +1,15 @@
#!/bin/bash

#SBATCH --ntasks=4
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=1024
#SBATCH --time=00:30:00
#SBATCH -o hpc/logs/job_%j.out

module purge
module load julia/1.9.4

julia --project --threads=1 my_experiment.jl

# Optional: Remove the files created by ClusterManagers.jl
rm -fr julia-*.out
16 changes: 16 additions & 0 deletions examples/slurm/run.jl
@@ -0,0 +1,16 @@
using Random
using Distributed
function run_trial(config::Dict{Symbol,Any}, trial_id)
    results = Dict{Symbol, Any}()
    sigma = config[:sigma]
    N = config[:N]
    seed = config[:seed]
    rng = Random.Xoshiro(seed)
    # Perform some calculation
    results[:distance] = sum(rand(rng) * sigma for _ in 1:N)
    results[:num_threads] = Threads.nthreads()
    results[:hostname] = gethostname()
    results[:pid] = Distributed.myid()
    # Must return a Dict{Symbol, Any}, with the data we want to save
    return results
end
44 changes: 44 additions & 0 deletions ext/SlurmExt/SlurmExt.jl
@@ -0,0 +1,44 @@
module SlurmExt

############ Module dependencies ############
if isdefined(Base, :get_extension)
    using Experimenter
    using Distributed
    using ClusterManagers
else
    using ..Experimenter
    using ..Distributed
    using ..ClusterManagers
end


############ Module Code ############
function Experimenter.Cluster.init_slurm(; sysimage_path::Union{String, Nothing}=nothing)
    @info "Setting up SLURM"
    # Setup SLURM
    num_tasks = parse(Int, ENV["SLURM_NTASKS"])
    cpus_per_task = parse(Int, ENV["SLURM_CPUS_PER_TASK"])
    @info "Using $cpus_per_task threads on each worker"
    exeflags = ["--project", "-t$cpus_per_task"]
    if !isnothing(sysimage_path)
        @info "Using the sysimage: $sysimage_path"
        push!(exeflags, "--sysimage")
        push!(exeflags, "\"$sysimage_path\"")
    end
    addprocs(SlurmManager(num_tasks); exeflags=exeflags, topology=:master_worker)

    @info "SLURM workers launched: $(length(workers()))"
end

# @doc """
# init_slurm(; sysimage_path=nothing)

# Spins up all the processes as indicated by the SLURM environment variables.

# # Arguments

# - `sysimage_path`: A path to the sysimage that the workers should use to avoid unneccessary precompilation
# """ Experimenter.Cluster.init_slurm


end
92 changes: 92 additions & 0 deletions src/Experimenter.jl
@@ -8,6 +8,98 @@ include("heterogeneous_mapper.jl")
include("runner.jl")


module Cluster

function init_cluster_support()
    @eval Main using ClusterManagers
    if isdefined(Base, :get_extension)
        @eval Main Base.retry_load_extensions()
    end
end

function install_cluster_support()
    @eval Main import Pkg
    @eval Main Pkg.add(["ClusterManagers"])
end

"""
    init(; kwargs...)

Checks the environment variables to see if a script is running on a cluster
and then launches the processes as determined by the environment variables.

# Arguments

The keyword arguments are forwarded to the init function for each cluster
management system. Check the `ext` folder for extensions to see which
keywords are supported.
"""
function init(; kwargs...)
    if haskey(ENV, "SLURM_JOB_NAME")
        @eval Main Experimenter.Cluster.init_cluster_support()
        @eval Main Experimenter.Cluster.init_slurm(; $(kwargs)...)
    else
        @info "Cluster not detected, doing nothing."
    end
end

"""
    create_slurm_template(file_loc; job_logs_dir="hpc/logs")

Creates a template bash script at the supplied file location and
creates the log directory used for the outputs. You should modify
this script to adjust the resources required.
"""
function create_slurm_template(file_loc::AbstractString;
                               job_logs_dir::AbstractString="hpc/logs")

    log_dir = joinpath(dirname(file_loc), job_logs_dir)
    if !isdir(log_dir)
        @info "Creating directory at $log_dir to store the log files"
        mkpath(log_dir)
    end


file_contents = """#!/bin/bash

#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=1024
#SBATCH --time=00:30:00
#SBATCH -o $log_dir/job_%j.out
#SBATCH --partition=compute

# Change below to load version of Julia used
module load julia

# Change directory if needed
# cd "experiments"

julia --project --threads=1 myscript.jl

# Optional: Remove the files created by ClusterManagers.jl
# rm -fr julia-*.out
"""

    open(file_loc, "w") do io
        print(io, file_contents)
    end

    @info "Wrote template file to $(abspath(file_loc))"

    nothing
end
function init_slurm end

export init, install_cluster_support, init_cluster_support
end

using PackageExtensionCompat
function __init__()
@require_extensions
end



## API

### Database
Expand Down