Skip to content

Commit

Permalink
Try to drop consul for slurm.conf
Browse files Browse the repository at this point in the history
  • Loading branch information
cmd-ntrf committed Nov 14, 2024
1 parent ad7616f commit 54c13ed
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 51 deletions.
16 changes: 16 additions & 0 deletions site/profile/functions/gethostnamewithclass.pp
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# @summary Return the names of the instances on which a given class is applied.
#
# Three values are read from Hiera: 'terraform.instances',
# 'magic_castle::site::all' and 'magic_castle::site::tags'.
#
# - When $class_name is listed in 'magic_castle::site::all', every key of
#   'terraform.instances' is returned.
# - Otherwise, the tags of 'magic_castle::site::tags' whose value contains
#   $class_name are collected, and only the instances having at least one of
#   these tags in their 'tags' entry are returned.
#
# NOTE(review): presumably 'magic_castle::site::tags' maps a tag name to a list
# of class names and each instance carries a 'tags' array -- confirm against
# the Hiera data.
#
# @param class_name Name of the class to search for,
#   e.g. 'profile::slurm::controller'.
# @return Keys of 'terraform.instances' matching the class; empty when no
#   instance matches.
function profile::gethostnamewithclass($class_name) >> Array[String] {
  $instances = lookup('terraform.instances')
  $site_all = lookup('magic_castle::site::all')
  $site_tags = lookup('magic_castle::site::tags')

  # A class applied site-wide is present on every instance.
  if $class_name in $site_all {
    return keys($instances)
  }

  # Keep only the tags whose class list mentions the requested class.
  $tagged_classes = $site_tags.filter |$tag, $classes| {
    $class_name in $classes
  }
  $matching_tags = keys($tagged_classes)

  # An instance qualifies as soon as it shares one tag with the list above.
  $selected = $instances.filter |$hostname, $attributes| {
    !empty(intersection($matching_tags, $attributes['tags']))
  }
  return keys($selected)
}
1 change: 0 additions & 1 deletion site/profile/manifests/metrics.pp
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,6 @@
require => [
Package['prometheus-slurm-exporter'],
File['/etc/systemd/system/prometheus-slurm-exporter.service'],
Wait_for['slurmctldhost_set'],
],
}
}
51 changes: 2 additions & 49 deletions site/profile/manifests/slurm.pp
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@
'suspend_time' => $suspend_time,
'memlimit' => $os_reserved_memory,
'partitions' => $partitions,
'slurmctl' => profile::gethostnamewithclass('profile::slurm::controller'),
'slurmdb' => profile::gethostnamewithclass('profile::slurm::accounting'),
}),
group => 'slurm',
owner => 'slurm',
Expand All @@ -214,25 +216,6 @@
require => File['/etc/slurm'],
}

file { '/etc/slurm/slurm-consul.tpl':
ensure => 'present',
source => 'puppet:///modules/profile/slurm/slurm-consul.tpl',
notify => Service['consul-template'],
}

wait_for { 'slurmctldhost_set':
query => 'cat /etc/slurm/slurm-consul.conf',
regex => '^SlurmctldHost=',
polling_frequency => 10, # Wait up to 5 minutes (30 * 10 seconds).
max_retries => 30,
require => [
Service['consul-template'],
Class['consul::reload_service'],
],
refreshonly => true,
subscribe => File['/etc/slurm/slurm-consul.tpl'],
}

# SELinux policy required to allow confined users to submit job with Slurm 19, 20, 21.
# Slurm commands try to write to a socket in /var/run/munge.
# Confined users can neither stat this file nor write to it. The policy
Expand All @@ -256,32 +239,6 @@
}),
}

file { '/opt/software/slurm/bin/cond_restart_slurm_services':
require => Package['slurm'],
mode => '0755',
content => @("EOT"),
#!/bin/bash
{
/usr/bin/systemctl -q is-active slurmd && /usr/bin/systemctl restart slurmd || /usr/bin/true
/usr/bin/systemctl -q is-active slurmctld && /usr/bin/systemctl restart slurmctld || /usr/bin/true
} &> /var/log/slurm/cond_restart_slurm_services.log
|EOT
}


consul_template::watch { 'slurm-consul.conf':
require => [
File['/etc/slurm/slurm-consul.tpl'],
File['/opt/software/slurm/bin/cond_restart_slurm_services'],
],
config_hash => {
perms => '0644',
source => '/etc/slurm/slurm-consul.tpl',
destination => '/etc/slurm/slurm-consul.conf',
command => '/opt/software/slurm/bin/cond_restart_slurm_services',
}
}

}

# Slurm accounting. This is where the Slurm accounting database and daemon are run.
Expand Down Expand Up @@ -376,7 +333,6 @@
require => [
Service['slurmdbd'],
Wait_for['slurmdbd_started'],
Wait_for['slurmctldhost_set'],
],
before => [
Service['slurmctld']
Expand Down Expand Up @@ -515,7 +471,6 @@
port => 6817,
require => Tcp_conn_validator['consul'],
token => lookup('profile::consul::acl_api_token'),
before => Wait_for['slurmctldhost_set'],
}

package { 'slurm-slurmctld':
Expand All @@ -528,7 +483,6 @@
enable => true,
require => [
Package['slurm-slurmctld'],
Wait_for['slurmctldhost_set'],
],
subscribe => [
File['/etc/slurm/slurm.conf'],
Expand Down Expand Up @@ -747,7 +701,6 @@
],
require => [
Package['slurm-slurmd'],
Wait_for['slurmctldhost_set'],
],
}

Expand Down
17 changes: 16 additions & 1 deletion site/profile/templates/slurm/slurm.conf.epp
Original file line number Diff line number Diff line change
@@ -1,6 +1,21 @@
include /etc/slurm/slurm-consul.conf
include /etc/slurm/nodes.conf

<% if ! $slurmctl.empty { -%>
SlurmctldHost=<%= join($slurmctl, ',') %>
<% } -%>
SlurmctldPort=6817

## Accounting
<% if ! $slurmdb.empty { -%>
AccountingStorageHost=<%= join($slurmdb, ',') %>
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageTRES=gres/gpu,cpu,mem
AccountingStorageEnforce=associations
JobAcctGatherType=jobacct_gather/cgroup
JobAcctGatherFrequency=task=30
JobAcctGatherParams=NoOverMemoryKill,UsePSS
<% } -%>

# MANAGEMENT POLICIES
ClusterName=<%= $cluster_name %>
AuthType=auth/munge
Expand Down

0 comments on commit 54c13ed

Please sign in to comment.