diff --git a/site/profile/functions/gethostnamewithclass.pp b/site/profile/functions/gethostnamewithclass.pp new file mode 100644 index 000000000..d8e757e9e --- /dev/null +++ b/site/profile/functions/gethostnamewithclass.pp @@ -0,0 +1,24 @@ +# @summary List the hostnames of the instances that include a given class. +# +# Every hostname in `terraform.instances` is returned when the class is listed +# in `magic_castle::site::all`. Otherwise, an instance is returned when at +# least one of its tags maps to the class in `magic_castle::site::tags`. +# +# @param class_name Name of the class to look for, e.g. 'profile::slurm::controller'. +# @return Hostnames (keys of `terraform.instances`) of the matching instances. +function profile::gethostnamewithclass(String $class_name) >> Array[String] { + $instances = lookup('terraform.instances') + $site_all = lookup('magic_castle::site::all') + $site_tags = lookup('magic_castle::site::tags') + + if $class_name in $site_all { + return $instances.keys() + } else { + $tags = keys($site_tags).filter |$tag| { + $class_name in $site_tags[$tag] + } + return keys($instances).filter |$hostname| { + !intersection($tags, $instances[$hostname]['tags']).empty + } + } +} diff --git a/site/profile/manifests/metrics.pp b/site/profile/manifests/metrics.pp index 2ef2899b4..57680ace8 100644 --- a/site/profile/manifests/metrics.pp +++ b/site/profile/manifests/metrics.pp @@ -89,7 +89,6 @@ require => [ Package['prometheus-slurm-exporter'], File['/etc/systemd/system/prometheus-slurm-exporter.service'], - Wait_for['slurmctldhost_set'], ], } } diff --git a/site/profile/manifests/slurm.pp b/site/profile/manifests/slurm.pp index bbd021834..74de9f795 100644 --- a/site/profile/manifests/slurm.pp +++ b/site/profile/manifests/slurm.pp @@ -193,6 +193,8 @@ 'suspend_time' => $suspend_time, 'memlimit' => $os_reserved_memory, 'partitions' => $partitions, + 'slurmctl' => profile::gethostnamewithclass('profile::slurm::controller'), + 'slurmdb' => profile::gethostnamewithclass('profile::slurm::accounting'), }), group => 'slurm', owner => 'slurm', @@ -214,25 +216,6 @@ require => File['/etc/slurm'], } - file { '/etc/slurm/slurm-consul.tpl': - ensure => 'present', - source => 'puppet:///modules/profile/slurm/slurm-consul.tpl', - notify => Service['consul-template'], - } - - wait_for { 'slurmctldhost_set': - query => 'cat /etc/slurm/slurm-consul.conf', - regex => '^SlurmctldHost=', - polling_frequency => 10, # Wait up to 5 minutes (30 * 10 
seconds). - max_retries => 30, - require => [ - Service['consul-template'], - Class['consul::reload_service'], - ], - refreshonly => true, - subscribe => File['/etc/slurm/slurm-consul.tpl'], - } - # SELinux policy required to allow confined users to submit job with Slurm 19, 20, 21. # Slurm commands tries to write to a socket in /var/run/munge. # Confined users cannot stat this file, neither write to it. The policy @@ -256,32 +239,6 @@ }), } - file { '/opt/software/slurm/bin/cond_restart_slurm_services': - require => Package['slurm'], - mode => '0755', - content => @("EOT"), -#!/bin/bash -{ - /usr/bin/systemctl -q is-active slurmd && /usr/bin/systemctl restart slurmd || /usr/bin/true - /usr/bin/systemctl -q is-active slurmctld && /usr/bin/systemctl restart slurmctld || /usr/bin/true -} &> /var/log/slurm/cond_restart_slurm_services.log -|EOT - } - - - consul_template::watch { 'slurm-consul.conf': - require => [ - File['/etc/slurm/slurm-consul.tpl'], - File['/opt/software/slurm/bin/cond_restart_slurm_services'], - ], - config_hash => { - perms => '0644', - source => '/etc/slurm/slurm-consul.tpl', - destination => '/etc/slurm/slurm-consul.conf', - command => '/opt/software/slurm/bin/cond_restart_slurm_services', - } - } - } # Slurm accouting. This where is slurm accounting database and daemon is ran. 
@@ -376,7 +333,6 @@ require => [ Service['slurmdbd'], Wait_for['slurmdbd_started'], - Wait_for['slurmctldhost_set'], ], before => [ Service['slurmctld'] @@ -515,7 +471,6 @@ port => 6817, require => Tcp_conn_validator['consul'], token => lookup('profile::consul::acl_api_token'), - before => Wait_for['slurmctldhost_set'], } package { 'slurm-slurmctld': @@ -528,7 +483,6 @@ enable => true, require => [ Package['slurm-slurmctld'], - Wait_for['slurmctldhost_set'], ], subscribe => [ File['/etc/slurm/slurm.conf'], @@ -747,7 +701,6 @@ ], require => [ Package['slurm-slurmd'], - Wait_for['slurmctldhost_set'], ], } diff --git a/site/profile/templates/slurm/slurm.conf.epp b/site/profile/templates/slurm/slurm.conf.epp index 2ea90aa27..d84323c00 100644 --- a/site/profile/templates/slurm/slurm.conf.epp +++ b/site/profile/templates/slurm/slurm.conf.epp @@ -1,6 +1,27 @@ -include /etc/slurm/slurm-consul.conf include /etc/slurm/nodes.conf +<%# slurm.conf takes one SlurmctldHost entry per controller: the first entry -%> +<%# is the primary and later entries are backups, so emit one line per host. -%> +<% $slurmctl.each |$host| { -%> +SlurmctldHost=<%= $host %> +<% } -%> +SlurmctldPort=6817 + +## Accounting +<% if ! $slurmdb.empty { -%> +<%# First host is the primary slurmdbd, second (if any) the backup; others are ignored. -%> +AccountingStorageHost=<%= $slurmdb[0] %> +<% if $slurmdb[1] { -%> +AccountingStorageBackupHost=<%= $slurmdb[1] %> +<% } -%> +AccountingStorageType=accounting_storage/slurmdbd +AccountingStorageTRES=gres/gpu,cpu,mem +AccountingStorageEnforce=associations +JobAcctGatherType=jobacct_gather/cgroup +JobAcctGatherFrequency=task=30 +JobAcctGatherParams=NoOverMemoryKill,UsePSS +<% } -%> + # MANAGEMENT POLICIES ClusterName=<%= $cluster_name %> AuthType=auth/munge