diff --git a/accounts.go b/accounts.go index 99034c9..73ed2bf 100644 --- a/accounts.go +++ b/accounts.go @@ -17,47 +17,47 @@ along with this program. If not, see . */ package main import ( - "github.com/prometheus/client_golang/prometheus" - "regexp" - "strconv" - "strings" + "strings" + "strconv" + "regexp" + "github.com/prometheus/client_golang/prometheus" ) type JobMetrics struct { - pending float64 - running float64 - running_cpus float64 - suspended float64 + pending float64 + running float64 + running_cpus float64 + suspended float64 } -func ParseAccountsMetrics(squeueOutput []byte) map[string]*JobMetrics { - accounts := make(map[string]*JobMetrics) - lines := strings.Split(string(squeueOutput), "\n") - for _, line := range lines { - if strings.Contains(line, "|") { - account := strings.Split(line, "|")[1] - _, key := accounts[account] - if !key { - accounts[account] = &JobMetrics{0, 0, 0, 0} - } - state := strings.Split(line, "|")[2] - state = strings.ToLower(state) - cpus, _ := strconv.ParseFloat(strings.Split(line, "|")[3], 64) - pending := regexp.MustCompile(`^pending`) - running := regexp.MustCompile(`^running`) - suspended := regexp.MustCompile(`^suspended`) - switch { - case pending.MatchString(state) == true: - accounts[account].pending++ - case running.MatchString(state) == true: - accounts[account].running++ - accounts[account].running_cpus += cpus - case suspended.MatchString(state) == true: - accounts[account].suspended++ - } - } - } - return accounts +func ParseAccountsMetrics(input []byte) map[string]*JobMetrics { + accounts := make(map[string]*JobMetrics) + lines := strings.Split(string(input), "\n") + for _, line := range lines { + if strings.Contains(line,"|") { + account := strings.Split(line,"|")[1] + _,key := accounts[account] + if !key { + accounts[account] = &JobMetrics{0,0,0,0} + } + state := strings.Split(line,"|")[2] + state = strings.ToLower(state) + cpus,_ := strconv.ParseFloat(strings.Split(line,"|")[3],64) + pending := regexp.MustCompile(`^pending`) + running := regexp.MustCompile(`^running`) + suspended := regexp.MustCompile(`^suspended`) + switch { + case pending.MatchString(state) == true: + accounts[account].pending++ + case running.MatchString(state) == true: + accounts[account].running++ + accounts[account].running_cpus += cpus + case suspended.MatchString(state) == true: + accounts[account].suspended++ + } + } + } + return accounts } func GetAccountsMetrics() map[string]*JobMetrics { @@ -65,43 +65,43 @@ func GetAccountsMetrics() map[string]*JobMetrics { } type AccountsCollector struct { - pending *prometheus.Desc - running *prometheus.Desc - running_cpus *prometheus.Desc - suspended *prometheus.Desc + pending *prometheus.Desc + running *prometheus.Desc + running_cpus *prometheus.Desc + suspended *prometheus.Desc } func NewAccountsCollector() *AccountsCollector { - labels := []string{"account"} - return &AccountsCollector{ - pending: prometheus.NewDesc("slurm_account_jobs_pending", "Pending jobs for account", labels, nil), - running: prometheus.NewDesc("slurm_account_jobs_running", "Running jobs for account", labels, nil), - running_cpus: prometheus.NewDesc("slurm_account_cpus_running", "Running cpus for account", labels, nil), - suspended: prometheus.NewDesc("slurm_account_jobs_suspended", "Suspended jobs for account", labels, nil), - } + labels := []string{"account"} + return &AccountsCollector{ + pending: prometheus.NewDesc("slurm_account_jobs_pending", "Pending jobs for account", labels, nil), + running: prometheus.NewDesc("slurm_account_jobs_running", "Running jobs for account", labels, nil), + running_cpus: prometheus.NewDesc("slurm_account_cpus_running", "Running cpus for account", labels, nil), + suspended: prometheus.NewDesc("slurm_account_jobs_suspended", "Suspended jobs for account", labels, nil), + } } func (ac *AccountsCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- ac.pending - ch <- ac.running - ch <- ac.running_cpus - ch <- ac.suspended + ch <- ac.pending + ch <- ac.running + ch <- ac.running_cpus + ch <- ac.suspended } func (ac *AccountsCollector) Collect(ch chan<- prometheus.Metric) { - am := GetAccountsMetrics() - for a := range am { - if am[a].pending > 0 { - ch <- prometheus.MustNewConstMetric(ac.pending, prometheus.GaugeValue, am[a].pending, a) - } - if am[a].running > 0 { - ch <- prometheus.MustNewConstMetric(ac.running, prometheus.GaugeValue, am[a].running, a) - } - if am[a].running_cpus > 0 { - ch <- prometheus.MustNewConstMetric(ac.running_cpus, prometheus.GaugeValue, am[a].running_cpus, a) - } - if am[a].suspended > 0 { - ch <- prometheus.MustNewConstMetric(ac.suspended, prometheus.GaugeValue, am[a].suspended, a) - } - } + am := GetAccountsMetrics() + for a := range am { + if am[a].pending > 0 { + ch <- prometheus.MustNewConstMetric(ac.pending, prometheus.GaugeValue, am[a].pending, a) + } + if am[a].running > 0 { + ch <- prometheus.MustNewConstMetric(ac.running, prometheus.GaugeValue, am[a].running, a) + } + if am[a].running_cpus > 0 { + ch <- prometheus.MustNewConstMetric(ac.running_cpus, prometheus.GaugeValue, am[a].running_cpus, a) + } + if am[a].suspended > 0 { + ch <- prometheus.MustNewConstMetric(ac.suspended, prometheus.GaugeValue, am[a].suspended, a) + } + } } diff --git a/gpus.go b/gpus.go index 5ff27c8..c2d23b1 100644 --- a/gpus.go +++ b/gpus.go @@ -92,9 +92,9 @@ func GetGPUsMetrics() *GPUsMetrics { func NewGPUsCollector() *GPUsCollector { return &GPUsCollector{ - alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil), - idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil), - total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil), + alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil), + idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil), + total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil), utilization: prometheus.NewDesc("slurm_gpus_utilization", "Total GPU utilization", nil, nil), } } diff --git a/main.go b/main.go index 9980d69..f5613a0 100644 --- a/main.go +++ b/main.go @@ -27,9 +27,9 @@ func init() { // Metrics have to be registered to be exposed prometheus.MustRegister(NewAccountsCollector()) // from accounts.go prometheus.MustRegister(NewCPUsCollector()) // from cpus.go + prometheus.MustRegister(NewGPUsCollector()) // from gpus.go prometheus.MustRegister(NewNodesCollector()) // from nodes.go prometheus.MustRegister(NewNodeCollector()) // from node.go - prometheus.MustRegister(NewGPUsCollector()) // from gpus.go prometheus.MustRegister(NewPartitionsCollector()) // from partitions.go prometheus.MustRegister(NewQueueCollector()) // from queue.go prometheus.MustRegister(NewSchedulerCollector()) // from scheduler.go diff --git a/node_test.go b/node_test.go index 8dcfb3e..6933aa4 100644 --- a/node_test.go +++ b/node_test.go @@ -46,7 +46,6 @@ func TestNodeMetrics(t *testing.T) { t.Fatalf("Can not open test data: %v", err) } metrics := ParseNodeMetrics(data) - // t.Logf("%+v", metrics) assert.Contains(t, metrics, "b001") assert.Equal(t, uint64(327680), metrics["b001"].memAlloc) diff --git a/partitions.go b/partitions.go index 6b31b04..96ba715 100644 --- a/partitions.go +++ b/partitions.go @@ -23,46 +23,46 @@ import ( ) type PartitionMetrics struct { - allocated float64 - idle float64 - other float64 - pending float64 - total float64 + allocated float64 + idle float64 + other float64 + pending float64 + total float64 } func ParsePartitionsMetrics(sinfoOutput []byte, squeueOutput []byte) map[string]*PartitionMetrics { - partitions := make(map[string]*PartitionMetrics) - lines := strings.Split(string(sinfoOutput), "\n") - for _, line := range lines { - if strings.Contains(line, ",") { - // name of a partition - partition := strings.Split(line, ",")[0] - _, key := partitions[partition] - if !key { - partitions[partition] = &PartitionMetrics{0, 0, 0, 0, 0} - } - states := strings.Split(line, ",")[1] - allocated, _ := strconv.ParseFloat(strings.Split(states, "/")[0], 64) - idle, _ := strconv.ParseFloat(strings.Split(states, "/")[1], 64) - other, _ := strconv.ParseFloat(strings.Split(states, "/")[2], 64) - total, _ := strconv.ParseFloat(strings.Split(states, "/")[3], 64) - partitions[partition].allocated = allocated - partitions[partition].idle = idle - partitions[partition].other = other - partitions[partition].total = total - } - } - // get list of pending jobs by partition name - list := strings.Split(string(squeueOutput), "\n") - for _, partition := range list { - // accumulate the number of pending jobs - _, key := partitions[partition] - if key { - partitions[partition].pending += 1 - } - } + partitions := make(map[string]*PartitionMetrics) + lines := strings.Split(string(sinfoOutput),"\n") + for _, line := range lines { + if strings.Contains(line,",") { + // name of a partition + partition := strings.Split(line,",")[0] + _,key := partitions[partition] + if !key { + partitions[partition] = &PartitionMetrics{0,0,0,0,0} + } + states := strings.Split(line,",")[1] + allocated,_ := strconv.ParseFloat(strings.Split(states,"/")[0],64) + idle,_ := strconv.ParseFloat(strings.Split(states,"/")[1],64) + other,_ := strconv.ParseFloat(strings.Split(states,"/")[2],64) + total,_ := strconv.ParseFloat(strings.Split(states,"/")[3],64) + partitions[partition].allocated = allocated + partitions[partition].idle = idle + partitions[partition].other = other + partitions[partition].total = total + } + } + // get list of pending jobs by partition name + list := strings.Split(string(squeueOutput), "\n") + for _,partition := range list { + // accumulate the number of pending jobs + _, key := partitions[partition] + if key { + partitions[partition].pending += 1 + } + } - return partitions + return partitions } func GetPartitionsMetrics() map[string]*PartitionMetrics { @@ -73,49 +73,49 @@ func GetPartitionsMetrics() map[string]*PartitionMetrics { } type PartitionsCollector struct { - allocated *prometheus.Desc - idle *prometheus.Desc - other *prometheus.Desc - pending *prometheus.Desc - total *prometheus.Desc + allocated *prometheus.Desc + idle *prometheus.Desc + other *prometheus.Desc + pending *prometheus.Desc + total *prometheus.Desc } func NewPartitionsCollector() *PartitionsCollector { - labels := []string{"partition"} - return &PartitionsCollector{ - allocated: prometheus.NewDesc("slurm_partition_cpus_allocated", "Allocated CPUs for partition", labels, nil), - idle: prometheus.NewDesc("slurm_partition_cpus_idle", "Idle CPUs for partition", labels, nil), - other: prometheus.NewDesc("slurm_partition_cpus_other", "Other CPUs for partition", labels, nil), - pending: prometheus.NewDesc("slurm_partition_jobs_pending", "Pending jobs for partition", labels, nil), - total: prometheus.NewDesc("slurm_partition_cpus_total", "Total CPUs for partition", labels, nil), - } + labels := []string{"partition"} + return &PartitionsCollector{ + allocated: prometheus.NewDesc("slurm_partition_cpus_allocated", "Allocated CPUs for partition", labels,nil), + idle: prometheus.NewDesc("slurm_partition_cpus_idle", "Idle CPUs for partition", labels,nil), + other: prometheus.NewDesc("slurm_partition_cpus_other", "Other CPUs for partition", labels,nil), + pending: prometheus.NewDesc("slurm_partition_jobs_pending", "Pending jobs for partition", labels,nil), + total: prometheus.NewDesc("slurm_partition_cpus_total", "Total CPUs for partition", labels,nil), + } } func (pc *PartitionsCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- pc.allocated - ch <- pc.idle - ch <- pc.other - ch <- pc.pending - ch <- pc.total + ch <- pc.allocated + ch <- pc.idle + ch <- pc.other + ch <- pc.pending + ch <- pc.total } func (pc *PartitionsCollector) Collect(ch chan<- prometheus.Metric) { - pm := GetPartitionsMetrics() - for p := range pm { - if pm[p].allocated > 0 { - ch <- prometheus.MustNewConstMetric(pc.allocated, prometheus.GaugeValue, pm[p].allocated, p) - } - if pm[p].idle > 0 { - ch <- prometheus.MustNewConstMetric(pc.idle, prometheus.GaugeValue, pm[p].idle, p) - } - if pm[p].other > 0 { - ch <- prometheus.MustNewConstMetric(pc.other, prometheus.GaugeValue, pm[p].other, p) - } - if pm[p].pending > 0 { - ch <- prometheus.MustNewConstMetric(pc.pending, prometheus.GaugeValue, pm[p].pending, p) - } - if pm[p].total > 0 { - ch <- prometheus.MustNewConstMetric(pc.total, prometheus.GaugeValue, pm[p].total, p) - } - } + pm := GetPartitionsMetrics() + for p := range pm { + if pm[p].allocated > 0 { + ch <- prometheus.MustNewConstMetric(pc.allocated, prometheus.GaugeValue, pm[p].allocated, p) + } + if pm[p].idle > 0 { + ch <- prometheus.MustNewConstMetric(pc.idle, prometheus.GaugeValue, pm[p].idle, p) + } + if pm[p].other > 0 { + ch <- prometheus.MustNewConstMetric(pc.other, prometheus.GaugeValue, pm[p].other, p) + } + if pm[p].pending > 0 { + ch <- prometheus.MustNewConstMetric(pc.pending, prometheus.GaugeValue, pm[p].pending, p) + } + if pm[p].total > 0 { + ch <- prometheus.MustNewConstMetric(pc.total, prometheus.GaugeValue, pm[p].total, p) + } + } } diff --git a/sshare.go b/sshare.go index 3e76a09..7f808ea 100644 --- a/sshare.go +++ b/sshare.go @@ -23,26 +23,26 @@ import ( ) type FairShareMetrics struct { - fairshare float64 + fairshare float64 } func ParseFairShareMetrics(sshareOutput []byte) map[string]*FairShareMetrics { - accounts := make(map[string]*FairShareMetrics) - lines := strings.Split(string(sshareOutput), "\n") - for _, line := range lines { - if !strings.HasPrefix(line, " ") { - if strings.Contains(line, "|") { - account := strings.Trim(strings.Split(line, "|")[0], " ") - _, key := accounts[account] - if !key { - accounts[account] = &FairShareMetrics{0} - } - fairshare, _ := strconv.ParseFloat(strings.Split(line, "|")[1], 64) - accounts[account].fairshare = fairshare - } - } - } - return accounts + accounts := make(map[string]*FairShareMetrics) + lines := strings.Split(string(sshareOutput), "\n") + for _, line := range lines { + if ! strings.HasPrefix(line," ") { + if strings.Contains(line,"|") { + account := strings.Trim(strings.Split(line,"|")[0]," ") + _,key := accounts[account] + if !key { + accounts[account] = &FairShareMetrics{0} + } + fairshare,_ := strconv.ParseFloat(strings.Split(line,"|")[1],64) + accounts[account].fairshare = fairshare + } + } + } + return accounts } func GetFairShareMetrics() map[string]*FairShareMetrics { @@ -50,23 +50,23 @@ func GetFairShareMetrics() map[string]*FairShareMetrics { } type FairShareCollector struct { - fairshare *prometheus.Desc + fairshare *prometheus.Desc } func NewFairShareCollector() *FairShareCollector { - labels := []string{"account"} - return &FairShareCollector{ - fairshare: prometheus.NewDesc("slurm_account_fairshare", "FairShare for account", labels, nil), - } + labels := []string{"account"} + return &FairShareCollector{ + fairshare: prometheus.NewDesc("slurm_account_fairshare","FairShare for account" , labels,nil), + } } func (fsc *FairShareCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- fsc.fairshare + ch <- fsc.fairshare } func (fsc *FairShareCollector) Collect(ch chan<- prometheus.Metric) { - fsm := GetFairShareMetrics() - for f := range fsm { - ch <- prometheus.MustNewConstMetric(fsc.fairshare, prometheus.GaugeValue, fsm[f].fairshare, f) - } + fsm := GetFairShareMetrics() + for f := range fsm { + ch <- prometheus.MustNewConstMetric(fsc.fairshare, prometheus.GaugeValue, fsm[f].fairshare, f) + } } diff --git a/users.go b/users.go index abcc02d..a9a7dc1 100644 --- a/users.go +++ b/users.go @@ -17,91 +17,91 @@ along with this program. If not, see . */ package main import ( - "github.com/prometheus/client_golang/prometheus" - "regexp" - "strconv" - "strings" + "strings" + "strconv" + "regexp" + "github.com/prometheus/client_golang/prometheus" ) type UserJobMetrics struct { - pending float64 - running float64 - running_cpus float64 - suspended float64 + pending float64 + running float64 + running_cpus float64 + suspended float64 } func ParseUsersMetrics(squeueOutput []byte) map[string]*UserJobMetrics { - users := make(map[string]*UserJobMetrics) - lines := strings.Split(string(squeueOutput), "\n") - for _, line := range lines { - if strings.Contains(line, "|") { - user := strings.Split(line, "|")[1] - _, key := users[user] - if !key { - users[user] = &UserJobMetrics{0, 0, 0, 0} - } - state := strings.Split(line, "|")[2] - state = strings.ToLower(state) - cpus, _ := strconv.ParseFloat(strings.Split(line, "|")[3], 64) - pending := regexp.MustCompile(`^pending`) - running := regexp.MustCompile(`^running`) - suspended := regexp.MustCompile(`^suspended`) - switch { - case pending.MatchString(state) == true: - users[user].pending++ - case running.MatchString(state) == true: - users[user].running++ - users[user].running_cpus += cpus - case suspended.MatchString(state) == true: - users[user].suspended++ - } - } - } - return users + users := make(map[string]*UserJobMetrics) + lines := strings.Split(string(squeueOutput), "\n") + for _, line := range lines { + if strings.Contains(line,"|") { + user := strings.Split(line,"|")[1] + _,key := users[user] + if !key { + users[user] = &UserJobMetrics{0,0,0,0} + } + state := strings.Split(line,"|")[2] + state = strings.ToLower(state) + cpus,_ := strconv.ParseFloat(strings.Split(line,"|")[3],64) + pending := regexp.MustCompile(`^pending`) + running := regexp.MustCompile(`^running`) + suspended := regexp.MustCompile(`^suspended`) + switch { + case pending.MatchString(state) == true: + users[user].pending++ + case running.MatchString(state) == true: + users[user].running++ + users[user].running_cpus += cpus + case suspended.MatchString(state) == true: + users[user].suspended++ + } + } + } + return users } func GetUsersMetrics() map[string]*UserJobMetrics { - return ParseUsersMetrics(Subprocess("squeue", "-a", "-r", "-h", "-o %A|%u|%T|%C")) + return ParseUsersMetrics(Subprocess("squeue", "-a", "-r", "-h", "-o %A|%u|%T|%C")) } type UsersCollector struct { - pending *prometheus.Desc - running *prometheus.Desc - running_cpus *prometheus.Desc - suspended *prometheus.Desc + pending *prometheus.Desc + running *prometheus.Desc + running_cpus *prometheus.Desc + suspended *prometheus.Desc } func NewUsersCollector() *UsersCollector { - labels := []string{"user"} - return &UsersCollector{ - pending: prometheus.NewDesc("slurm_user_jobs_pending", "Pending jobs for user", labels, nil), - running: prometheus.NewDesc("slurm_user_jobs_running", "Running jobs for user", labels, nil), - running_cpus: prometheus.NewDesc("slurm_user_cpus_running", "Running cpus for user", labels, nil), - suspended: prometheus.NewDesc("slurm_user_jobs_suspended", "Suspended jobs for user", labels, nil), - } + labels := []string{"user"} + return &UsersCollector { + pending: prometheus.NewDesc("slurm_user_jobs_pending", "Pending jobs for user", labels, nil), + running: prometheus.NewDesc("slurm_user_jobs_running", "Running jobs for user", labels, nil), + running_cpus: prometheus.NewDesc("slurm_user_cpus_running", "Running cpus for user", labels, nil), + suspended: prometheus.NewDesc("slurm_user_jobs_suspended", "Suspended jobs for user", labels, nil), + } } func (uc *UsersCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- uc.pending - ch <- uc.running - ch <- uc.running_cpus - ch <- uc.suspended + ch <- uc.pending + ch <- uc.running + ch <- uc.running_cpus + ch <- uc.suspended } func (uc *UsersCollector) Collect(ch chan<- prometheus.Metric) { - um := GetUsersMetrics() - for u := range um { - if um[u].pending > 0 { - ch <- prometheus.MustNewConstMetric(uc.pending, prometheus.GaugeValue, um[u].pending, u) - } - if um[u].running > 0 { - ch <- prometheus.MustNewConstMetric(uc.running, prometheus.GaugeValue, um[u].running, u) - } - if um[u].running_cpus > 0 { - ch <- prometheus.MustNewConstMetric(uc.running_cpus, prometheus.GaugeValue, um[u].running_cpus, u) - } - if um[u].suspended > 0 { - ch <- prometheus.MustNewConstMetric(uc.suspended, prometheus.GaugeValue, um[u].suspended, u) - } - } + um := GetUsersMetrics() + for u := range um { + if um[u].pending > 0 { + ch <- prometheus.MustNewConstMetric(uc.pending, prometheus.GaugeValue, um[u].pending, u) + } + if um[u].running > 0 { + ch <- prometheus.MustNewConstMetric(uc.running, prometheus.GaugeValue, um[u].running, u) + } + if um[u].running_cpus > 0 { + ch <- prometheus.MustNewConstMetric(uc.running_cpus, prometheus.GaugeValue, um[u].running_cpus, u) + } + if um[u].suspended > 0 { + ch <- prometheus.MustNewConstMetric(uc.suspended, prometheus.GaugeValue, um[u].suspended, u) + } + } }