diff --git a/.gitignore b/.gitignore index b5ac9c8..8a36a1b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ *.jl.*.cov *.jl.cov *.jl.mem -/Manifest.toml +Manifest.toml /docs/build/ .vscode/ \ No newline at end of file diff --git a/Project.toml b/Project.toml index e16cda4..3f5f8ac 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ version = "0.1.3" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" +PackageExtensionCompat = "65ce6f38-6b18-4e1d-a461-8949797d7930" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -15,17 +16,23 @@ SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" +[weakdeps] +ClusterManagers = "34f1f09b-3a8b-5176-ab39-66d58a4d544e" + +[extensions] +SlurmExt = ["ClusterManagers"] + [compat] DataFrames = "1" +Distributed = "^1.6" +Logging = "^1.6" +Pkg = "^1.6" ProgressBars = "1" +Random = "^1.6" SQLite = "1" SafeTestsets = "0.0" -Serialization = "1.6" -Pkg = "1.6" -Distributed = "1.6" -Logging = "1.6" -Random = "1.6" -julia = "1.6" +Serialization = "^1.6" +julia = "^1.6" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/docs/make.jl b/docs/make.jl index 188bdd6..54d4585 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -19,6 +19,7 @@ makedocs(; "Distributed Execution" => "distributed.md", "Data Store" => "store.md", "Custom Snapshots" => "snapshots.md", + "Cluster Support" => "clusters.md", "Public API" => "api.md" ], ) diff --git a/docs/src/clusters.md b/docs/src/clusters.md new file mode 100644 index 0000000..9992ed1 --- /dev/null +++ b/docs/src/clusters.md @@ -0,0 +1,106 @@ +# Clusters + +This package provides some basic support for running an experiment on a HPC. This uses `ClusterManagers.jl` under the hood. 
+ +At the moment, we only support running on a SLURM cluster, but any PRs to support other clusters are welcome. + +## SLURM + +Normally when running on SLURM, one creates a bash script to tell the scheduler about the resource requirements for a job. The following is an example: +```bash +#!/bin/bash + +#SBATCH --nodes=2 +#SBATCH --ntasks=2 +#SBATCH --cpus-per-task=2 +#SBATCH --mem-per-cpu=1024 +#SBATCH --time=00:30:00 +#SBATCH -o hpc/output/test_job_%j.out +``` + +The function [`Experimenter.Cluster.create_slurm_template`](@ref) provides an easy way to create one of these bash scripts with everything you need to run. + +### Example + +Let us take the following end-to-end example. Say that we have an experiment script at `my_experiment.jl` (contents below), which now initialises the cluster: +```julia +using Experimenter + +config = Dict{Symbol,Any}( + :N => IterableVariable([Int(1e6), Int(2e6), Int(3e6)]), + :seed => IterableVariable([1234, 4321, 3467, 134234, 121]), + :sigma => 0.0001) +experiment = Experiment( + name="Test Experiment", + include_file="run.jl", + function_name="run_trial", + configuration=deepcopy(config) +) + +db = open_db("experiments.db") + +# Init the cluster +Experimenter.Cluster.init() + +@execute experiment db DistributedMode +``` +Additionally, we have the file `run.jl` containing: +```julia +using Random +using Distributed +function run_trial(config::Dict{Symbol,Any}, trial_id) + results = Dict{Symbol, Any}() + sigma = config[:sigma] + N = config[:N] + seed = config[:seed] + rng = Random.Xoshiro(seed) + # Perform some calculation + results[:distance] = sum(rand(rng) * sigma for _ in 1:N) + results[:num_threads] = Threads.nthreads() + results[:hostname] = gethostname() + results[:pid] = Distributed.myid() + # Must return a Dict{Symbol, Any}, with the data we want to save + return results +end +``` +We can now create a bash script to run our experiment. 
We create a template by running the following in the terminal (or adjust for the REPL) +```bash +julia --project -e 'using Experimenter; Experimenter.Cluster.create_slurm_template("myrun.sh")' +``` +We then modify the created `myrun.sh` file to the following: +```bash +#!/bin/bash + +#SBATCH --ntasks=4 +#SBATCH --cpus-per-task=2 +#SBATCH --mem-per-cpu=1024 +#SBATCH --time=00:30:00 +#SBATCH -o hpc/logs/job_%j.out + +julia --project my_experiment.jl --threads=1 + +# Optional: Remove the files created by ClusterManagers.jl +rm -fr julia-*.out + +``` + +Once written, we execute this on the cluster via +```bash +sbatch myrun.sh +``` + +We can then open a Julia REPL (once the job has finished) to see the results: +```julia +using Experimenter +db = open_db("experiments.db") +trials = get_trials_by_name(db, "Test Experiment") + +for (i, t) in enumerate(trials) + hostname = t.results[:hostname] + id = t.results[:pid] + println("Trial $i ran on $hostname on worker $id") +end +``` + +Support for running on SLURM is based on [this gist](https://gist.github.com/JamieMair/0b1ffbd4ee424c173e6b42fe756e877a) available on GitHub. This gist also provides information on how to adjust the SLURM script to allow for one GPU to be allocated to each worker. 
+ diff --git a/examples/slurm/.gitignore b/examples/slurm/.gitignore new file mode 100644 index 0000000..59cb419 --- /dev/null +++ b/examples/slurm/.gitignore @@ -0,0 +1,2 @@ +experiments/ +*.out \ No newline at end of file diff --git a/examples/slurm/Project.toml b/examples/slurm/Project.toml new file mode 100644 index 0000000..3c40287 --- /dev/null +++ b/examples/slurm/Project.toml @@ -0,0 +1,4 @@ +[deps] +ClusterManagers = "34f1f09b-3a8b-5176-ab39-66d58a4d544e" +Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +Experimenter = "6aee034a-9508-47b1-8e11-813cc29af79f" diff --git a/examples/slurm/check_results.jl b/examples/slurm/check_results.jl new file mode 100644 index 0000000..e0e22f1 --- /dev/null +++ b/examples/slurm/check_results.jl @@ -0,0 +1,9 @@ +using Experimenter +db = open_db("experiments.db") +trials = get_trials_by_name(db, "Test Experiment") + +for (i, t) in enumerate(trials) + hostname = t.results[:hostname] + id = t.results[:pid] + println("Trial $i ran on $hostname on worker $id") +end \ No newline at end of file diff --git a/examples/slurm/hpc/logs/log.txt b/examples/slurm/hpc/logs/log.txt new file mode 100644 index 0000000..8764e49 --- /dev/null +++ b/examples/slurm/hpc/logs/log.txt @@ -0,0 +1 @@ +This is a file to make sure this directory exists. 
\ No newline at end of file diff --git a/examples/slurm/my_experiment.jl b/examples/slurm/my_experiment.jl new file mode 100644 index 0000000..e728c98 --- /dev/null +++ b/examples/slurm/my_experiment.jl @@ -0,0 +1,19 @@ +using Experimenter + +config = Dict{Symbol,Any}( + :N => IterableVariable([Int(1e6), Int(2e6), Int(3e6)]), + :seed => IterableVariable([1234, 4321, 3467, 134234, 121]), + :sigma => 0.0001) +experiment = Experiment( + name="Test Experiment", + include_file="run.jl", + function_name="run_trial", + configuration=deepcopy(config) +) + +db = open_db("experiments.db") + +# Init the cluster +Experimenter.Cluster.init() + +@execute experiment db DistributedMode \ No newline at end of file diff --git a/examples/slurm/myrun.sh b/examples/slurm/myrun.sh new file mode 100644 index 0000000..93742f2 --- /dev/null +++ b/examples/slurm/myrun.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +#SBATCH --ntasks=4 +#SBATCH --cpus-per-task=2 +#SBATCH --mem-per-cpu=1024 +#SBATCH --time=00:30:00 +#SBATCH -o hpc/logs/job_%j.out + +module purge +module load julia/1.9.4 + +julia --project my_experiment.jl --threads=1 + +# Optional: Remove the files created by ClusterManagers.jl +rm -fr julia-*.out \ No newline at end of file diff --git a/examples/slurm/run.jl b/examples/slurm/run.jl new file mode 100644 index 0000000..8bce890 --- /dev/null +++ b/examples/slurm/run.jl @@ -0,0 +1,16 @@ +using Random +using Distributed +function run_trial(config::Dict{Symbol,Any}, trial_id) + results = Dict{Symbol, Any}() + sigma = config[:sigma] + N = config[:N] + seed = config[:seed] + rng = Random.Xoshiro(seed) + # Perform some calculation + results[:distance] = sum(rand(rng) * sigma for _ in 1:N) + results[:num_threads] = Threads.nthreads() + results[:hostname] = gethostname() + results[:pid] = Distributed.myid() + # Must return a Dict{Symbol, Any}, with the data we want to save + return results +end \ No newline at end of file diff --git a/ext/SlurmExt/SlurmExt.jl b/ext/SlurmExt/SlurmExt.jl new file 
mode 100644 index 0000000..d4f2502 --- /dev/null +++ b/ext/SlurmExt/SlurmExt.jl @@ -0,0 +1,44 @@ +module SlurmExt + +############ Module dependencies ############ +if isdefined(Base, :get_extension) + using Experimenter + using Distributed + using ClusterManagers +else + using ..Experimenter + using ..Distributed + using ..ClusterManagers +end + + +############ Module Code ############ +function Experimenter.Cluster.init_slurm(; sysimage_path::Union{String, Nothing}=nothing) + @info "Setting up SLURM" + # Setup SLURM + num_tasks = parse(Int, ENV["SLURM_NTASKS"]) + cpus_per_task = parse(Int, ENV["SLURM_CPUS_PER_TASK"]) + @info "Using $cpus_per_task threads on each worker" + exeflags = ["--project", "-t$cpus_per_task"] + if !isnothing(sysimage_path) + @info "Using the sysimage: $sysimage_path" + push!(exeflags, "--sysimage") + push!(exeflags, "\"$sysimage_path\"") + end + addprocs(SlurmManager(num_tasks); exeflags=exeflags, topology=:master_worker) + + @info "SLURM workers launched: $(length(workers()))" +end + +# @doc """ +# init_slurm(; sysimage_path=nothing) + +# Spins up all the processes as indicated by the SLURM environment variables. + +# # Arguments + +# - `sysimage_path`: A path to the sysimage that the workers should use to avoid unnecessary precompilation +# """ Experimenter.Cluster.init_slurm + + +end \ No newline at end of file diff --git a/src/Experimenter.jl index da315a7..e7512f9 100644 --- a/src/Experimenter.jl +++ b/src/Experimenter.jl @@ -8,6 +8,98 @@ include("heterogeneous_mapper.jl") include("runner.jl") +module Cluster + function init_cluster_support() + @eval Main using ClusterManagers + if isdefined(Base, :get_extension) + @eval Main Base.retry_load_extensions() + end + end + function install_cluster_support() + @eval Main import Pkg + @eval Main Pkg.add(["ClusterManagers"]) + end + + """ + init(; kwargs...) 
+ + Checks the environment variables to see if a script is running on a cluster + and then launches the processes as determined by the environment variables. + + # Arguments + + The keyword arguments are forwarded to the init function for each cluster + management system. Check the `ext` folder for extensions to see which + keywords are supported. + """ + function init(; kwargs...) + if haskey(ENV, "SLURM_JOB_NAME") + @eval Main Experimenter.Cluster.init_cluster_support() + @eval Main Experimenter.Cluster.init_slurm(; $(kwargs)...) + else + @info "Cluster not detected, doing nothing." + end + end + + """ + create_slurm_template(file_loc; job_logs_dir="hpc/logs") + + Creates a template bash script at the supplied file location and + creates the log directory used for the outputs. You should modify + this script to adjust the resources required. + """ + function create_slurm_template(file_loc::AbstractString; + job_logs_dir::AbstractString="hpc/logs") + + log_dir = joinpath(dirname(file_loc), job_logs_dir) + if !isdir(log_dir) && isdirpath(log_dir) + @info "Creating directory at $log_dir to store the log files" + mkdir(log_dir) + end + + + file_contents = """#!/bin/bash + + #SBATCH --nodes=1 + #SBATCH --ntasks=1 + #SBATCH --cpus-per-task=2 + #SBATCH --mem-per-cpu=1024 + #SBATCH --time=00:30:00 + #SBATCH -o $log_dir/job_%j.out + #SBATCH --partition=compute + + # Change below to load version of Julia used + module load julia + + # Change directory if needed + # cd "experiments" + + julia --project myscript.jl --threads=1 + + # Optional: Remove the files created by ClusterManagers.jl + # rm -fr julia-*.out + """ + + open(file_loc, "w") do io + print(io, file_contents) + end + + @info "Wrote template file to $(abspath(file_loc))" + + nothing + end + function init_slurm end + + export init, install_cluster_support, init_cluster_support +end + +using PackageExtensionCompat +function __init__() + @require_extensions +end + + + ## API ### Database diff --git 
a/src/database.jl b/src/database.jl index c571043..858de4e 100644 --- a/src/database.jl +++ b/src/database.jl @@ -208,7 +208,13 @@ function _deserialize_columns(df::DataFrame) col = df[!, colname] if eltype(col) <: Vector{UInt8} # raw binary df[!, colname] = map(col) do c - return Serialization.deserialize(c) + io = IOBuffer(c) + obj = Serialization.deserialize(io) + if typeof(obj) <: SQLite.Serialized + return obj.object + else + return obj + end end end end