Skip to content

Commit

Permalink
Merge pull request #781 from ocaisa/lmod_gpu
Browse files Browse the repository at this point in the history
Add accelerator detection to Lmod version of EESSI initialisation
  • Loading branch information
boegel authored Oct 17, 2024
2 parents 8232a60 + 04c2573 commit 901a944
Show file tree
Hide file tree
Showing 3 changed files with 241 additions and 32 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/modules/fake_module.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- Stub module used by the workflow tests.
-- It sets INSIDE_GITHUB_ACTIONS to "true" and prepends a directory to PATH;
-- the PATH change is there so that Lmod keeps a record of it.
local marker_name = "INSIDE_GITHUB_ACTIONS"
local extra_path_entry = "/snap/bin"

setenv(marker_name, "true")
-- Interfere with PATH so Lmod keeps a record
prepend_path("PATH", extra_path_entry)
147 changes: 134 additions & 13 deletions .github/workflows/tests_eessi_module.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ on:
permissions:
contents: read # to fetch code (actions/checkout)
jobs:
build:
basic_checks:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
EESSI_VERSION:
- 2023.06
- 2023.06
steps:
- name: Check out software-layer repository
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
Expand Down Expand Up @@ -45,10 +45,11 @@ jobs:
- name: Test for archdetect_cpu functionality with invalid path
run: |
. /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/compat/linux/$(uname -m)/usr/share/Lmod/init/bash # Initialise Lmod
# Initialise Lmod
. /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/compat/linux/$(uname -m)/usr/share/Lmod/init/bash
export MODULEPATH=init/modules
set +e # Do not exit immediately if a command exits with a non-zero status
export EESSI_ARCHDETECT_OPTIONS="dummy/cpu"
export EESSI_ARCHDETECT_OPTIONS_OVERRIDE="dummy/cpu"
outfile="outfile.txt"
module load EESSI/${{matrix.EESSI_VERSION}} > "${outfile}" 2>&1
cat "${outfile}"
Expand All @@ -58,29 +59,149 @@ jobs:
echo "Test for picking up invalid path on \${archdetect_cpu} FAILED" >&2
exit 1
fi
unset EESSI_ARCHDETECT_OPTIONS
unset EESSI_ARCHDETECT_OPTIONS_OVERRIDE
set -e # Re-enable exit on non-zero status
# Compares the environment produced by loading the EESSI Lmod module with the
# environment produced by sourcing ./init/bash, for each CPU/accelerator
# override combination in the matrix. The step fails if the two differ.
lmod_and_init_script_comparison:
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    matrix:
      EESSI_VERSION:
        # Quoted: an unquoted 2023.06 is read by YAML as a float, not a string
        - "2023.06"
      EESSI_SOFTWARE_SUBDIR_OVERRIDE:
        - x86_64/amd/zen3
        - x86_64/amd/zen4
      EESSI_ACCELERATOR_TARGET_OVERRIDE:
        - accel/nvidia/cc80
  steps:
    - name: Check out software-layer repository
      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11  # v4.1.1

    - name: Mount EESSI CernVM-FS pilot repository
      uses: cvmfs-contrib/github-action-cvmfs@55899ca74cf78ab874bdf47f5a804e47c198743c  # v4.0
      with:
        cvmfs_config_package: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi_latest_all.deb
        cvmfs_http_proxy: DIRECT
        cvmfs_repositories: software.eessi.io

    - name: Test that expected variables match between Lmod init script and original bash script
      run: |
        # Initialise Lmod
        . /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/compat/linux/$(uname -m)/usr/share/Lmod/init/bash
        # Set our path overrides according to our matrix
        export EESSI_SOFTWARE_SUBDIR_OVERRIDE=${{matrix.EESSI_SOFTWARE_SUBDIR_OVERRIDE}}
        export EESSI_ACCELERATOR_TARGET_OVERRIDE=${{matrix.EESSI_ACCELERATOR_TARGET_OVERRIDE}}
        moduleoutfile="moduleout.txt"
        sourceoutfile="sourceout.txt"
        # First do (and undo) the Lmod initialisation
        export MODULEPATH=init/modules
        # Turn on debug output in case we want to take a look
        export EESSI_DEBUG_INIT=true
        # Surround the detected CPU path with non-existent entries
        CPU_ARCH=$(./init/eessi_archdetect.sh -a cpupath)
        export EESSI_ARCHDETECT_OPTIONS_OVERRIDE="dummy/cpu:${CPU_ARCH}:dummy1/cpu1"
        module load EESSI/${{matrix.EESSI_VERSION}}
        # EESSI_ARCHDETECT_OPTIONS_OVERRIDE/EESSI_DEBUG_INIT only relevant for Lmod init
        unset EESSI_ARCHDETECT_OPTIONS_OVERRIDE
        unset EESSI_DEBUG_INIT
        # Store all relevant environment variables
        env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH)' | sort > "${moduleoutfile}"
        module unload EESSI/${{matrix.EESSI_VERSION}}
        # Now do the init script initialisation
        source ./init/bash
        # source script version sets environment variables to force archdetect, ignore these
        unset EESSI_USE_ARCHSPEC
        unset EESSI_USE_ARCHDETECT
        env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH)' | sort > "${sourceoutfile}"
        # Now compare the two results
        echo ""
        echo "Lmod initialisation:"
        cat "${moduleoutfile}"
        echo ""
        echo "Source script initialisation:"
        cat "${sourceoutfile}"
        echo ""
        echo ""
        if (diff "${moduleoutfile}" "${sourceoutfile}" > /dev/null); then
          echo "Test for checking env variables PASSED"
        else
          echo "Test for checking env variables FAILED" >&2
          diff --unified=0 "${moduleoutfile}" "${sourceoutfile}"
          exit 1
        fi
# Checks that loading and then unloading the EESSI module leaves the
# environment exactly as it was, with and without CPU/accelerator overrides
# ("none" in the matrix means the corresponding override is not exported).
make_sure_load_and_unload_work:
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    matrix:
      EESSI_VERSION:
        # Quoted: an unquoted 2023.06 is read by YAML as a float, not a string
        - "2023.06"
      EESSI_SOFTWARE_SUBDIR_OVERRIDE:
        - none
        - x86_64/amd/zen2
        - x86_64/amd/zen4
      EESSI_ACCELERATOR_TARGET_OVERRIDE:
        - none
        - accel/nvidia/cc80
  steps:
    - name: Check out software-layer repository
      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11  # v4.1.1

    - name: Mount EESSI CernVM-FS pilot repository
      uses: cvmfs-contrib/github-action-cvmfs@55899ca74cf78ab874bdf47f5a804e47c198743c  # v4.0
      with:
        cvmfs_config_package: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi_latest_all.deb
        cvmfs_http_proxy: DIRECT
        cvmfs_repositories: software.eessi.io

    - name: Test for identical environment after loading and unloading the EESSI module
      run: |
        # Initialise Lmod
        . /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/compat/linux/$(uname -m)/usr/share/Lmod/init/bash
        # Set our cpu path overrides according to our matrix
        if [[ "${{matrix.EESSI_SOFTWARE_SUBDIR_OVERRIDE}}" != "none" ]]; then
          export EESSI_SOFTWARE_SUBDIR_OVERRIDE=${{matrix.EESSI_SOFTWARE_SUBDIR_OVERRIDE}}
        fi
        # Set our accelerator path overrides according to our matrix
        if [[ "${{matrix.EESSI_ACCELERATOR_TARGET_OVERRIDE}}" != "none" ]]; then
          export EESSI_ACCELERATOR_TARGET_OVERRIDE=${{matrix.EESSI_ACCELERATOR_TARGET_OVERRIDE}}
        fi
        # Turn on debug output in case we want to take a look
        export EESSI_DEBUG_INIT=true
        initial_env_file="initial_env.txt"
        module_cycled_file="load_unload_cycle.txt"
        # Prepare Lmod, resetting it in a roundabout way, given we don't want defaults set
        export MODULEPATH=init/modules:.github/workflows/modules
        module load fake_module
        module purge
        module unuse .github/workflows/modules
        module avail
        # Store the initial environment (ignoring Lmod tables)
        env | grep -v _ModuleTable | sort > "${initial_env_file}"
        # Do (and undo) loading the EESSI module
        module load EESSI/${{matrix.EESSI_VERSION}}
        module unload EESSI/${{matrix.EESSI_VERSION}}
        env | grep -v _ModuleTable | sort > "${module_cycled_file}"
        # Now compare the two results (do not expose the files, as they contain the full environment!)
        if (diff "${initial_env_file}" "${module_cycled_file}" > /dev/null); then
          echo "Test for checking env variables PASSED"
        else
          echo "Test for checking env variables FAILED" >&2
          diff --unified=0 "${initial_env_file}" "${module_cycled_file}"
          exit 1
        fi
123 changes: 104 additions & 19 deletions init/modules/EESSI/2023.06.lua
Original file line number Diff line number Diff line change
Expand Up @@ -17,56 +17,141 @@ local eessi_os_type = "linux"
-- Export the version, CVMFS repository and OS type of this EESSI stack,
-- in the same order as before.
local base_variables = {
    { "EESSI_VERSION", eessi_version },
    { "EESSI_CVMFS_REPO", eessi_repo },
    { "EESSI_OS_TYPE", eessi_os_type },
}
for _, entry in ipairs(base_variables) do
    setenv(entry[1], entry[2])
end
-- Print `text` via LmodMessage, but only when the module is being loaded
-- and the EESSI_DEBUG_INIT environment variable is set.
function eessiDebug(text)
    if mode() ~= "load" then
        return
    end
    if not os.getenv("EESSI_DEBUG_INIT") then
        return
    end
    LmodMessage(text)
end
-- Determine the CPU software subdirectory to use, e.g. "x86_64/amd/zen3".
--
-- The candidate list is taken from EESSI_ARCHDETECT_OPTIONS. If that is unset,
-- EESSI_ARCHDETECT_OPTIONS_OVERRIDE is used; if the override is also unset, the
-- archdetect wrapper script is sourced (this needs Lmod >= 8.6 for source_sh).
--
-- Returns the first candidate for which a software directory exists under
-- eessi_prefix. Calls LmodError when the Lmod version is too old or when no
-- candidate matches.
function archdetect_cpu()
    local script = pathJoin(eessi_prefix, 'init', 'lmod_eessi_archdetect_wrapper.sh')
    -- make sure that we grab the value for architecture before the module unsets the environment variable (in unload mode)
    local archdetect_options = os.getenv("EESSI_ARCHDETECT_OPTIONS") or (os.getenv("EESSI_ARCHDETECT_OPTIONS_OVERRIDE") or "")
    if not os.getenv("EESSI_ARCHDETECT_OPTIONS_OVERRIDE") then
        if convertToCanonical(LmodVersion()) < convertToCanonical("8.6") then
            LmodError("Loading this modulefile requires using Lmod version >= 8.6, but you can export EESSI_ARCHDETECT_OPTIONS_OVERRIDE to the available cpu architecture in the form of: x86_64/intel/haswell:x86_64/generic or aarch64/neoverse_v1:aarch64/generic")
        end
        source_sh("bash", script)
    end
    -- EESSI_ARCHDETECT_OPTIONS is set by the script (_if_ it was called)
    archdetect_options = os.getenv("EESSI_ARCHDETECT_OPTIONS") or archdetect_options
    -- archdetect_options is always a string here (possibly empty), so no nil check is needed
    eessiDebug("Got archdetect CPU options: " .. archdetect_options)

    local zen4 = "x86_64/amd/zen4"
    -- archdetect_options is a colon-separated list of CPU architectures that are compatible with
    -- the host CPU and ordered from most specific to least specific, e.g.,
    -- x86_64/intel/skylake_avx512:x86_64/intel/haswell:x86_64/generic
    -- We loop over the list, and return the highest matching arch for which a directory exists for this EESSI version
    for archdetect_filter_cpu in string.gmatch(archdetect_options, "([^" .. ":" .. "]+)") do
        if isDir(pathJoin(eessi_prefix, "software", eessi_os_type, archdetect_filter_cpu, "software")) then
            -- use x86_64/amd/zen3 for now when AMD Genoa (Zen4) CPU is detected,
            -- since optimized software installations for Zen4 are a work-in-progress,
            -- see https://gitlab.com/eessi/support/-/issues/37
            -- An explicit EESSI_SOFTWARE_SUBDIR_OVERRIDE of zen4 is honoured.
            -- `~=` is required: `not a == b` parses as `(not a) == b`, which is always false.
            if archdetect_filter_cpu == zen4 and os.getenv("EESSI_SOFTWARE_SUBDIR_OVERRIDE") ~= zen4 then
                archdetect_filter_cpu = "x86_64/amd/zen3"
                if mode() == "load" then
                    LmodMessage("Sticking to " .. archdetect_filter_cpu .. " for now, since optimized installations for AMD Genoa (Zen4) are a work in progress.")
                end
            end
            eessiDebug("Selected archdetect CPU: " .. archdetect_filter_cpu)
            return archdetect_filter_cpu
        end
    end
    LmodError("Software directory check for the detected architecture failed")
end
-- Determine the accelerator subdirectory to use, e.g. "accel/nvidia/cc80".
--
-- The value is taken from EESSI_ACCEL_SUBDIR. If that is unset,
-- EESSI_ACCELERATOR_TARGET_OVERRIDE is used; if the override is also unset, the
-- accelerator archdetect wrapper script is sourced (this needs Lmod >= 8.6 for
-- source_sh).
--
-- Returns the accelerator subdirectory, or an empty string when none is found.
function archdetect_accel()
    local script = pathJoin(eessi_prefix, 'init', 'lmod_eessi_archdetect_wrapper_accel.sh')
    -- for unload mode, we need to grab the value before it is unset
    local archdetect_accel = os.getenv("EESSI_ACCEL_SUBDIR") or (os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE") or "")
    -- The variable name must not carry trailing whitespace, otherwise the lookup is
    -- always nil and the override can never bypass the Lmod version requirement.
    -- NOTE(review): with the override set the wrapper script is no longer sourced, so
    -- anything it exports (presumably EESSI_ACCEL_SUBDIR) is not set - confirm this is wanted.
    if not os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE") then
        if convertToCanonical(LmodVersion()) < convertToCanonical("8.6") then
            LmodError("Loading this modulefile requires using Lmod version >= 8.6, but you can export EESSI_ACCELERATOR_TARGET_OVERRIDE to the available accelerator architecture in the form of: accel/nvidia/cc80")
        end
        source_sh("bash", script)
    end
    -- EESSI_ACCEL_SUBDIR takes precedence over the value grabbed above when it is set
    archdetect_accel = os.getenv("EESSI_ACCEL_SUBDIR") or archdetect_accel
    eessiDebug("Got archdetect accel option: " .. archdetect_accel)
    return archdetect_accel
end
-- archdetect finds the best compatible architecture, e.g., x86_64/amd/zen3
local archdetect = archdetect_cpu()
-- archdetect_accel() attempts to identify an accelerator, e.g., accel/nvidia/cc80
local archdetect_accel = archdetect_accel()
-- eessi_cpu_family is derived from the archdetect match, e.g., x86_64
local eessi_cpu_family = archdetect:match("([^/]+)")
local eessi_software_subdir = archdetect
-- eessi_eprefix is the base location of the compat layer, e.g., /cvmfs/software.eessi.io/versions/2023.06/compat/linux/x86_64
local eessi_eprefix = pathJoin(eessi_prefix, "compat", eessi_os_type, eessi_cpu_family)
-- eessi_software_path is the location of the software installations, e.g.,
-- /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3
local eessi_software_path = pathJoin(eessi_prefix, "software", eessi_os_type, eessi_software_subdir)
local eessi_modules_subdir = pathJoin("modules", "all")
-- eessi_module_path is the location of the _CPU_ module files, e.g.,
-- /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3/modules/all
local eessi_module_path = pathJoin(eessi_software_path, eessi_modules_subdir)
local eessi_site_software_path = string.gsub(eessi_software_path, "versions", "host_injections")
-- Site module path is the same as the EESSI one, but with `versions` changed to `host_injections`, e.g.,
-- /cvmfs/software.eessi.io/host_injections/2023.06/software/linux/x86_64/amd/zen3/modules/all
local eessi_site_module_path = pathJoin(eessi_site_software_path, eessi_modules_subdir)
setenv("EPREFIX", eessi_eprefix)
eessiDebug("Setting EPREFIX to " .. eessi_eprefix)
setenv("EESSI_CPU_FAMILY", eessi_cpu_family)
eessiDebug("Setting EESSI_CPU_FAMILY to " .. eessi_cpu_family)
setenv("EESSI_SITE_SOFTWARE_PATH", eessi_site_software_path)
eessiDebug("Setting EESSI_SITE_SOFTWARE_PATH to " .. eessi_site_software_path)
setenv("EESSI_SITE_MODULEPATH", eessi_site_module_path)
eessiDebug("Setting EESSI_SITE_MODULEPATH to " .. eessi_site_module_path)
setenv("EESSI_SOFTWARE_SUBDIR", eessi_software_subdir)
eessiDebug("Setting EESSI_SOFTWARE_SUBDIR to " .. eessi_software_subdir)
setenv("EESSI_PREFIX", eessi_prefix)
eessiDebug("Setting EESSI_PREFIX to " .. eessi_prefix)
setenv("EESSI_EPREFIX", eessi_eprefix)
-- Report the variable that was actually set on the previous line
eessiDebug("Setting EESSI_EPREFIX to " .. eessi_eprefix)
prepend_path("PATH", pathJoin(eessi_eprefix, "bin"))
eessiDebug("Adding " .. pathJoin(eessi_eprefix, "bin") .. " to PATH")
prepend_path("PATH", pathJoin(eessi_eprefix, "usr", "bin"))
eessiDebug("Adding " .. pathJoin(eessi_eprefix, "usr", "bin") .. " to PATH")
setenv("EESSI_SOFTWARE_PATH", eessi_software_path)
eessiDebug("Setting EESSI_SOFTWARE_PATH to " .. eessi_software_path)
setenv("EESSI_MODULEPATH", eessi_module_path)
eessiDebug("Setting EESSI_MODULEPATH to " .. eessi_module_path)
-- We ship our spider cache, so this location does not need to be spider-ed
if ( mode() ~= "spider" ) then
    prepend_path("MODULEPATH", eessi_module_path)
    eessiDebug("Adding " .. eessi_module_path .. " to MODULEPATH")
end
prepend_path("LMOD_RC", pathJoin(eessi_software_path, ".lmod", "lmodrc.lua"))
eessiDebug("Adding " .. pathJoin(eessi_software_path, ".lmod", "lmodrc.lua") .. " to LMOD_RC")
-- Use pushenv for LMOD_PACKAGE_PATH as this may be set locally by the site
pushenv("LMOD_PACKAGE_PATH", pathJoin(eessi_software_path, ".lmod"))
eessiDebug("Setting LMOD_PACKAGE_PATH to " .. pathJoin(eessi_software_path, ".lmod"))

-- the accelerator may have an empty value and we need to give some flexibility
-- * construct the path we expect to find
-- * then check it exists
-- * then update the modulepath
if not (archdetect_accel == nil or archdetect_accel == '') then
    -- These helpers are only needed inside this branch, so keep them local
    -- rather than leaking them as globals.
    -- The CPU subdirectory of the accelerator installations is _usually_ the same as host CPU, but this can be overridden
    local eessi_accel_software_subdir = os.getenv("EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE") or eessi_software_subdir
    -- CPU location of the accelerator installations, e.g.,
    -- /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3
    local eessi_accel_software_path = pathJoin(eessi_prefix, "software", eessi_os_type, eessi_accel_software_subdir)
    -- location of the accelerator modules, e.g.,
    -- /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3/accel/nvidia/cc80/modules/all
    local eessi_module_path_accel = pathJoin(eessi_accel_software_path, archdetect_accel, eessi_modules_subdir)
    eessiDebug("Checking if " .. eessi_module_path_accel .. " exists")
    if isDir(eessi_module_path_accel) then
        setenv("EESSI_MODULEPATH_ACCEL", eessi_module_path_accel)
        prepend_path("MODULEPATH", eessi_module_path_accel)
        eessiDebug("Using accelerator modules at: " .. eessi_module_path_accel)
    end
end

-- prepend the site module path last so it has priority
prepend_path("MODULEPATH", eessi_site_module_path)
eessiDebug("Adding " .. eessi_site_module_path .. " to MODULEPATH")
if mode() == "load" then
    LmodMessage("EESSI/" .. eessi_version .. " loaded successfully")
end

0 comments on commit 901a944

Please sign in to comment.