Skip to content

Commit

Permalink
Merge pull request #26 from JamieMair/add-slurm-support
Browse files Browse the repository at this point in the history
Add slurm support
  • Loading branch information
JamieMair authored Dec 15, 2023
2 parents 844206d + dd9b79b commit 18472e3
Show file tree
Hide file tree
Showing 14 changed files with 330 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
*.jl.*.cov
*.jl.cov
*.jl.mem
/Manifest.toml
Manifest.toml
/docs/build/
.vscode/
19 changes: 13 additions & 6 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ version = "0.1.3"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
PackageExtensionCompat = "65ce6f38-6b18-4e1d-a461-8949797d7930"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Expand All @@ -15,17 +16,23 @@ SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

[weakdeps]
ClusterManagers = "34f1f09b-3a8b-5176-ab39-66d58a4d544e"

[extensions]
SlurmExt = ["ClusterManagers"]

[compat]
DataFrames = "1"
Distributed = "^1.6"
Logging = "^1.6"
Pkg = "^1.6"
ProgressBars = "1"
Random = "^1.6"
SQLite = "1"
SafeTestsets = "0.0"
Serialization = "1.6"
Pkg = "1.6"
Distributed = "1.6"
Logging = "1.6"
Random = "1.6"
julia = "1.6"
Serialization = "^1.6"
julia = "^1.6"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand Down
1 change: 1 addition & 0 deletions docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ makedocs(;
"Distributed Execution" => "distributed.md",
"Data Store" => "store.md",
"Custom Snapshots" => "snapshots.md",
"Cluster Support" => "clusters.md",
"Public API" => "api.md"
],
)
Expand Down
106 changes: 106 additions & 0 deletions docs/src/clusters.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Clusters

This package provides some basic support for running an experiment on an HPC. This uses `ClusterManagers.jl` under the hood.

At the moment, we only support running on a SLURM cluster, but any PRs to support other clusters are welcome.

## SLURM

Normally when running on SLURM, one creates a bash script to tell the scheduler about the resource requirements for a job. The following is an example:
```bash
#!/bin/bash

#SBATCH --nodes=2
#SBATCH --ntasks=2
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=1024
#SBATCH --time=00:30:00
#SBATCH -o hpc/output/test_job_%j.out
```

The function [`Experimenter.Cluster.create_slurm_template`](@ref) provides an easy way to create one of these bash scripts with everything you need to run.

### Example

Let us take the following end-to-end example. Say that we have an experiment script at `my_experiment.jl` (contents below), which now initialises the cluster:
```julia
using Experimenter

config = Dict{Symbol,Any}(
:N => IterableVariable([Int(1e6), Int(2e6), Int(3e6)]),
:seed => IterableVariable([1234, 4321, 3467, 134234, 121]),
:sigma => 0.0001)
experiment = Experiment(
name="Test Experiment",
include_file="run.jl",
function_name="run_trial",
configuration=deepcopy(config)
)

db = open_db("experiments.db")

# Init the cluster
Experimenter.Cluster.init()

@execute experiment db DistributedMode
```
Additionally, we have the file `run.jl` containing:
```julia
using Random
using Distributed
function run_trial(config::Dict{Symbol,Any}, trial_id)
results = Dict{Symbol, Any}()
sigma = config[:sigma]
N = config[:N]
seed = config[:seed]
rng = Random.Xoshiro(seed)
# Perform some calculation
results[:distance] = sum(rand(rng) * sigma for _ in 1:N)
results[:num_threads] = Threads.nthreads()
results[:hostname] = gethostname()
results[:pid] = Distributed.myid()
# Must return a Dict{Symbol, Any}, with the data we want to save
return results
end
```
We can now create a bash script to run our experiment. We create a template by running the following in the terminal (or adjust it to run from the REPL):
```bash
julia --project -e 'using Experimenter; Experimenter.Cluster.create_slurm_template("myrun.sh")'
```
We then modify the created `myrun.sh` file to the following:
```bash
#!/bin/bash

#SBATCH --ntasks=4
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=1024
#SBATCH --time=00:30:00
#SBATCH -o hpc/logs/job_%j.out

julia --project my_experiment.jl --threads=1

# Optional: Remove the files created by ClusterManagers.jl
rm -fr julia-*.out

```

Once written, we execute this on the cluster via
```bash
sbatch myrun.sh
```

We can then open a Julia REPL (once the job has finished) to see the results:
```julia
using Experimenter
db = open_db("experiments.db")
trials = get_trials_by_name(db, "Test Experiment")

for (i, t) in enumerate(trials)
hostname = t.results[:hostname]
id = t.results[:pid]
println("Trial $i ran on $hostname on worker $id")
end
```

Support for running on SLURM is based on [this gist](https://gist.github.com/JamieMair/0b1ffbd4ee424c173e6b42fe756e877a) available on GitHub. This gist also provides information on how to adjust the SLURM script to allow for one GPU to be allocated to each worker.

2 changes: 2 additions & 0 deletions examples/slurm/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
experiments/
*.out
4 changes: 4 additions & 0 deletions examples/slurm/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[deps]
ClusterManagers = "34f1f09b-3a8b-5176-ab39-66d58a4d544e"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
Experimenter = "6aee034a-9508-47b1-8e11-813cc29af79f"
9 changes: 9 additions & 0 deletions examples/slurm/check_results.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Script: prints, for each trial of "Test Experiment" stored in the local
# database, which host and which Distributed worker id produced it.
using Experimenter
db = open_db("experiments.db")
trials = get_trials_by_name(db, "Test Experiment")

for (i, t) in enumerate(trials)
    # :hostname and :pid are written by run_trial in run.jl
    hostname = t.results[:hostname]
    id = t.results[:pid]
    println("Trial $i ran on $hostname on worker $id")
end
1 change: 1 addition & 0 deletions examples/slurm/hpc/logs/log.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is a file to make sure this directory exists.
19 changes: 19 additions & 0 deletions examples/slurm/my_experiment.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Script: defines and executes the example SLURM experiment. Each entry in
# `config` wrapped in IterableVariable is swept over, producing one trial per
# combination (3 * 5 = 15 trials here).
using Experimenter

config = Dict{Symbol,Any}(
    :N => IterableVariable([Int(1e6), Int(2e6), Int(3e6)]),
    :seed => IterableVariable([1234, 4321, 3467, 134234, 121]),
    :sigma => 0.0001)
experiment = Experiment(
    name="Test Experiment",
    include_file="run.jl",       # file containing the trial function
    function_name="run_trial",   # entry point executed for each trial
    configuration=deepcopy(config)
)

db = open_db("experiments.db")

# Init the cluster: detects SLURM via environment variables and spins up
# one worker per SLURM task; does nothing when run outside a cluster.
Experimenter.Cluster.init()

@execute experiment db DistributedMode
15 changes: 15 additions & 0 deletions examples/slurm/myrun.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash

# SLURM resource requests: 4 tasks (one Julia worker each), 2 CPUs per task,
# 1024 MiB per CPU, 30 minute wall-time, scheduler log under hpc/logs/.
#SBATCH --ntasks=4
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=1024
#SBATCH --time=00:30:00
#SBATCH -o hpc/logs/job_%j.out

# Load the Julia toolchain on the cluster (adjust the version to your site).
module purge
module load julia/1.9.4

julia --project my_experiment.jl --threads=1

# Optional: Remove the files created by ClusterManagers.jl
rm -fr julia-*.out
16 changes: 16 additions & 0 deletions examples/slurm/run.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
using Random
using Distributed

"""
    run_trial(config, trial_id)

Run a single trial: draw `N` uniform samples scaled by `sigma` using a
seeded RNG and return a `Dict{Symbol,Any}` of results to be saved
(distance, thread count, hostname and worker id).
"""
function run_trial(config::Dict{Symbol,Any}, trial_id)
    # Unpack the trial parameters from the configuration.
    sigma = config[:sigma]
    N = config[:N]
    rng = Random.Xoshiro(config[:seed])
    # The example "calculation": accumulate N scaled uniform draws.
    distance = sum(rand(rng) * sigma for _ in 1:N)
    # Record where and how this trial ran alongside the result.
    return Dict{Symbol,Any}(
        :distance => distance,
        :num_threads => Threads.nthreads(),
        :hostname => gethostname(),
        :pid => Distributed.myid(),
    )
end
44 changes: 44 additions & 0 deletions ext/SlurmExt/SlurmExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
module SlurmExt

############ Module dependencies ############
# Native package extensions exist on Julia >= 1.9; older versions load this
# module through PackageExtensionCompat, where the dependencies are reached
# relative to the parent module.
if isdefined(Base, :get_extension)
    using Experimenter
    using Distributed
    using ClusterManagers
else
    using ..Experimenter
    using ..Distributed
    using ..ClusterManagers
end


############ Module Code ############
"""
    Experimenter.Cluster.init_slurm(; sysimage_path=nothing)

Spins up one worker process per SLURM task, as described by the
`SLURM_NTASKS` and `SLURM_CPUS_PER_TASK` environment variables that the
scheduler sets for the job.

# Keywords

- `sysimage_path`: A path to the sysimage that the workers should use to
  avoid unnecessary precompilation.
"""
function Experimenter.Cluster.init_slurm(; sysimage_path::Union{String, Nothing}=nothing)
    @info "Setting up SLURM"
    # SLURM_NTASKS is always set inside a job; one worker per task.
    num_tasks = parse(Int, ENV["SLURM_NTASKS"])
    # SLURM only exports SLURM_CPUS_PER_TASK when --cpus-per-task was
    # requested; default to a single thread per worker otherwise.
    cpus_per_task = parse(Int, get(ENV, "SLURM_CPUS_PER_TASK", "1"))
    @info "Using $cpus_per_task threads on each worker"
    exeflags = ["--project", "-t$cpus_per_task"]
    if !isnothing(sysimage_path)
        @info "Using the sysimage: $sysimage_path"
        # exeflags become argv entries of the worker's julia command with no
        # shell in between, so the path must not carry literal quote
        # characters (quoting it would make the loader look for a file whose
        # name contains the quotes).
        push!(exeflags, "--sysimage")
        push!(exeflags, sysimage_path)
    end
    addprocs(SlurmManager(num_tasks); exeflags=exeflags, topology=:master_worker)

    @info "SLURM workers launched: $(length(workers()))"
end


end
92 changes: 92 additions & 0 deletions src/Experimenter.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,98 @@ include("heterogeneous_mapper.jl")
include("runner.jl")


module Cluster

"""
    init_cluster_support()

Loads `ClusterManagers` into `Main` and re-triggers extension loading so
that the cluster extensions (e.g. `SlurmExt`) become available.
"""
function init_cluster_support()
    @eval Main using ClusterManagers
    if isdefined(Base, :get_extension)
        @eval Main Base.retry_load_extensions()
    end
end

"""
    install_cluster_support()

Adds the weak dependencies needed for cluster support (`ClusterManagers`)
to the active environment.
"""
function install_cluster_support()
    @eval Main import Pkg
    @eval Main Pkg.add(["ClusterManagers"])
end

"""
    init(; kwargs...)
Checks the environment variables to see if a script is running on a cluster
and then launches the processes as determined by the environment variables.
# Arguments
The keyword arguments are forwarded to the init function for each cluster
management system. Check the `ext` folder for extensions to see which
keywords are supported.
"""
function init(; kwargs...)
    # SLURM_JOB_NAME is set by the scheduler inside every SLURM job.
    if haskey(ENV, "SLURM_JOB_NAME")
        @eval Main Experimenter.Cluster.init_cluster_support()
        @eval Main Experimenter.Cluster.init_slurm(; $(kwargs)...)
    else
        @info "Cluster not detected, doing nothing."
    end
end

"""
    create_slurm_template(file_loc; job_logs_dir="hpc/logs")
Creates a template bash script at the supplied file location and
creates the log directory used for the outputs. You should modify
this script to adjust the resources required.
"""
function create_slurm_template(file_loc::AbstractString;
    job_logs_dir::AbstractString="hpc/logs")

    log_dir = joinpath(dirname(file_loc), job_logs_dir)
    if !isdir(log_dir)
        @info "Creating directory at $log_dir to store the log files"
        # mkpath (not mkdir) so that nested defaults such as "hpc/logs" are
        # created even when the parent directory does not exist yet. The old
        # `isdirpath` guard also meant the directory was never created for
        # paths without a trailing separator.
        mkpath(log_dir)
    end


    file_contents = """#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=1024
#SBATCH --time=00:30:00
#SBATCH -o $log_dir/job_%j.out
#SBATCH --partition=compute
# Change below to load version of Julia used
module load julia
# Change directory if needed
# cd "experiments"
julia --project myscript.jl --threads=1
# Optional: Remove the files created by ClusterManagers.jl
# rm -fr julia-*.out
"""

    open(file_loc, "w") do io
        print(io, file_contents)
    end

    @info "Wrote template file to $(abspath(file_loc))"

    nothing
end

# Implemented by cluster extensions (see ext/SlurmExt).
function init_slurm end

export init, install_cluster_support, init_cluster_support
end

using PackageExtensionCompat
# Package init hook: on Julia versions without native package extensions,
# PackageExtensionCompat loads the `ext/` extensions here instead.
function __init__()
    @require_extensions
end



## API

### Database
Expand Down
Loading

0 comments on commit 18472e3

Please sign in to comment.