diff --git a/accounts.go b/accounts.go
index 99034c9..73ed2bf 100644
--- a/accounts.go
+++ b/accounts.go
@@ -17,47 +17,47 @@ along with this program. If not, see <https://www.gnu.org/licenses/>. */
package main
import (
- "github.com/prometheus/client_golang/prometheus"
- "regexp"
- "strconv"
- "strings"
+ "strings"
+ "strconv"
+ "regexp"
+ "github.com/prometheus/client_golang/prometheus"
)
type JobMetrics struct {
- pending float64
- running float64
- running_cpus float64
- suspended float64
+ pending float64
+ running float64
+ running_cpus float64
+ suspended float64
}
-func ParseAccountsMetrics(squeueOutput []byte) map[string]*JobMetrics {
- accounts := make(map[string]*JobMetrics)
- lines := strings.Split(string(squeueOutput), "\n")
- for _, line := range lines {
- if strings.Contains(line, "|") {
- account := strings.Split(line, "|")[1]
- _, key := accounts[account]
- if !key {
- accounts[account] = &JobMetrics{0, 0, 0, 0}
- }
- state := strings.Split(line, "|")[2]
- state = strings.ToLower(state)
- cpus, _ := strconv.ParseFloat(strings.Split(line, "|")[3], 64)
- pending := regexp.MustCompile(`^pending`)
- running := regexp.MustCompile(`^running`)
- suspended := regexp.MustCompile(`^suspended`)
- switch {
- case pending.MatchString(state) == true:
- accounts[account].pending++
- case running.MatchString(state) == true:
- accounts[account].running++
- accounts[account].running_cpus += cpus
- case suspended.MatchString(state) == true:
- accounts[account].suspended++
- }
- }
- }
- return accounts
+func ParseAccountsMetrics(input []byte) map[string]*JobMetrics {
+ accounts := make(map[string]*JobMetrics)
+ lines := strings.Split(string(input), "\n")
+ for _, line := range lines {
+ if strings.Contains(line,"|") {
+ account := strings.Split(line,"|")[1]
+ _,key := accounts[account]
+ if !key {
+ accounts[account] = &JobMetrics{0,0,0,0}
+ }
+ state := strings.Split(line,"|")[2]
+ state = strings.ToLower(state)
+ cpus,_ := strconv.ParseFloat(strings.Split(line,"|")[3],64)
+ pending := regexp.MustCompile(`^pending`)
+ running := regexp.MustCompile(`^running`)
+ suspended := regexp.MustCompile(`^suspended`)
+ switch {
+ case pending.MatchString(state) == true:
+ accounts[account].pending++
+ case running.MatchString(state) == true:
+ accounts[account].running++
+ accounts[account].running_cpus += cpus
+ case suspended.MatchString(state) == true:
+ accounts[account].suspended++
+ }
+ }
+ }
+ return accounts
}
func GetAccountsMetrics() map[string]*JobMetrics {
@@ -65,43 +65,43 @@ func GetAccountsMetrics() map[string]*JobMetrics {
}
type AccountsCollector struct {
- pending *prometheus.Desc
- running *prometheus.Desc
- running_cpus *prometheus.Desc
- suspended *prometheus.Desc
+ pending *prometheus.Desc
+ running *prometheus.Desc
+ running_cpus *prometheus.Desc
+ suspended *prometheus.Desc
}
func NewAccountsCollector() *AccountsCollector {
- labels := []string{"account"}
- return &AccountsCollector{
- pending: prometheus.NewDesc("slurm_account_jobs_pending", "Pending jobs for account", labels, nil),
- running: prometheus.NewDesc("slurm_account_jobs_running", "Running jobs for account", labels, nil),
- running_cpus: prometheus.NewDesc("slurm_account_cpus_running", "Running cpus for account", labels, nil),
- suspended: prometheus.NewDesc("slurm_account_jobs_suspended", "Suspended jobs for account", labels, nil),
- }
+ labels := []string{"account"}
+ return &AccountsCollector{
+ pending: prometheus.NewDesc("slurm_account_jobs_pending", "Pending jobs for account", labels, nil),
+ running: prometheus.NewDesc("slurm_account_jobs_running", "Running jobs for account", labels, nil),
+ running_cpus: prometheus.NewDesc("slurm_account_cpus_running", "Running cpus for account", labels, nil),
+ suspended: prometheus.NewDesc("slurm_account_jobs_suspended", "Suspended jobs for account", labels, nil),
+ }
}
func (ac *AccountsCollector) Describe(ch chan<- *prometheus.Desc) {
- ch <- ac.pending
- ch <- ac.running
- ch <- ac.running_cpus
- ch <- ac.suspended
+ ch <- ac.pending
+ ch <- ac.running
+ ch <- ac.running_cpus
+ ch <- ac.suspended
}
func (ac *AccountsCollector) Collect(ch chan<- prometheus.Metric) {
- am := GetAccountsMetrics()
- for a := range am {
- if am[a].pending > 0 {
- ch <- prometheus.MustNewConstMetric(ac.pending, prometheus.GaugeValue, am[a].pending, a)
- }
- if am[a].running > 0 {
- ch <- prometheus.MustNewConstMetric(ac.running, prometheus.GaugeValue, am[a].running, a)
- }
- if am[a].running_cpus > 0 {
- ch <- prometheus.MustNewConstMetric(ac.running_cpus, prometheus.GaugeValue, am[a].running_cpus, a)
- }
- if am[a].suspended > 0 {
- ch <- prometheus.MustNewConstMetric(ac.suspended, prometheus.GaugeValue, am[a].suspended, a)
- }
- }
+ am := GetAccountsMetrics()
+ for a := range am {
+ if am[a].pending > 0 {
+ ch <- prometheus.MustNewConstMetric(ac.pending, prometheus.GaugeValue, am[a].pending, a)
+ }
+ if am[a].running > 0 {
+ ch <- prometheus.MustNewConstMetric(ac.running, prometheus.GaugeValue, am[a].running, a)
+ }
+ if am[a].running_cpus > 0 {
+ ch <- prometheus.MustNewConstMetric(ac.running_cpus, prometheus.GaugeValue, am[a].running_cpus, a)
+ }
+ if am[a].suspended > 0 {
+ ch <- prometheus.MustNewConstMetric(ac.suspended, prometheus.GaugeValue, am[a].suspended, a)
+ }
+ }
}
diff --git a/gpus.go b/gpus.go
index 5ff27c8..c2d23b1 100644
--- a/gpus.go
+++ b/gpus.go
@@ -92,9 +92,9 @@ func GetGPUsMetrics() *GPUsMetrics {
func NewGPUsCollector() *GPUsCollector {
return &GPUsCollector{
- alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil),
- idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil),
- total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil),
+ alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil),
+ idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil),
+ total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil),
utilization: prometheus.NewDesc("slurm_gpus_utilization", "Total GPU utilization", nil, nil),
}
}
diff --git a/main.go b/main.go
index 9980d69..f5613a0 100644
--- a/main.go
+++ b/main.go
@@ -27,9 +27,9 @@ func init() {
// Metrics have to be registered to be exposed
prometheus.MustRegister(NewAccountsCollector()) // from accounts.go
prometheus.MustRegister(NewCPUsCollector()) // from cpus.go
+ prometheus.MustRegister(NewGPUsCollector()) // from gpus.go
prometheus.MustRegister(NewNodesCollector()) // from nodes.go
prometheus.MustRegister(NewNodeCollector()) // from node.go
- prometheus.MustRegister(NewGPUsCollector()) // from gpus.go
prometheus.MustRegister(NewPartitionsCollector()) // from partitions.go
prometheus.MustRegister(NewQueueCollector()) // from queue.go
prometheus.MustRegister(NewSchedulerCollector()) // from scheduler.go
diff --git a/node_test.go b/node_test.go
index 8dcfb3e..6933aa4 100644
--- a/node_test.go
+++ b/node_test.go
@@ -46,7 +46,6 @@ func TestNodeMetrics(t *testing.T) {
t.Fatalf("Can not open test data: %v", err)
}
metrics := ParseNodeMetrics(data)
- // t.Logf("%+v", metrics)
assert.Contains(t, metrics, "b001")
assert.Equal(t, uint64(327680), metrics["b001"].memAlloc)
diff --git a/partitions.go b/partitions.go
index 6b31b04..96ba715 100644
--- a/partitions.go
+++ b/partitions.go
@@ -23,46 +23,46 @@ import (
)
type PartitionMetrics struct {
- allocated float64
- idle float64
- other float64
- pending float64
- total float64
+ allocated float64
+ idle float64
+ other float64
+ pending float64
+ total float64
}
func ParsePartitionsMetrics(sinfoOutput []byte, squeueOutput []byte) map[string]*PartitionMetrics {
- partitions := make(map[string]*PartitionMetrics)
- lines := strings.Split(string(sinfoOutput), "\n")
- for _, line := range lines {
- if strings.Contains(line, ",") {
- // name of a partition
- partition := strings.Split(line, ",")[0]
- _, key := partitions[partition]
- if !key {
- partitions[partition] = &PartitionMetrics{0, 0, 0, 0, 0}
- }
- states := strings.Split(line, ",")[1]
- allocated, _ := strconv.ParseFloat(strings.Split(states, "/")[0], 64)
- idle, _ := strconv.ParseFloat(strings.Split(states, "/")[1], 64)
- other, _ := strconv.ParseFloat(strings.Split(states, "/")[2], 64)
- total, _ := strconv.ParseFloat(strings.Split(states, "/")[3], 64)
- partitions[partition].allocated = allocated
- partitions[partition].idle = idle
- partitions[partition].other = other
- partitions[partition].total = total
- }
- }
- // get list of pending jobs by partition name
- list := strings.Split(string(squeueOutput), "\n")
- for _, partition := range list {
- // accumulate the number of pending jobs
- _, key := partitions[partition]
- if key {
- partitions[partition].pending += 1
- }
- }
+ partitions := make(map[string]*PartitionMetrics)
+ lines := strings.Split(string(sinfoOutput),"\n")
+ for _, line := range lines {
+ if strings.Contains(line,",") {
+ // name of a partition
+ partition := strings.Split(line,",")[0]
+ _,key := partitions[partition]
+ if !key {
+ partitions[partition] = &PartitionMetrics{0,0,0,0,0}
+ }
+ states := strings.Split(line,",")[1]
+ allocated,_ := strconv.ParseFloat(strings.Split(states,"/")[0],64)
+ idle,_ := strconv.ParseFloat(strings.Split(states,"/")[1],64)
+ other,_ := strconv.ParseFloat(strings.Split(states,"/")[2],64)
+ total,_ := strconv.ParseFloat(strings.Split(states,"/")[3],64)
+ partitions[partition].allocated = allocated
+ partitions[partition].idle = idle
+ partitions[partition].other = other
+ partitions[partition].total = total
+ }
+ }
+ // get list of pending jobs by partition name
+ list := strings.Split(string(squeueOutput), "\n")
+ for _,partition := range list {
+ // accumulate the number of pending jobs
+ _, key := partitions[partition]
+ if key {
+ partitions[partition].pending += 1
+ }
+ }
- return partitions
+ return partitions
}
func GetPartitionsMetrics() map[string]*PartitionMetrics {
@@ -73,49 +73,49 @@ func GetPartitionsMetrics() map[string]*PartitionMetrics {
}
type PartitionsCollector struct {
- allocated *prometheus.Desc
- idle *prometheus.Desc
- other *prometheus.Desc
- pending *prometheus.Desc
- total *prometheus.Desc
+ allocated *prometheus.Desc
+ idle *prometheus.Desc
+ other *prometheus.Desc
+ pending *prometheus.Desc
+ total *prometheus.Desc
}
func NewPartitionsCollector() *PartitionsCollector {
- labels := []string{"partition"}
- return &PartitionsCollector{
- allocated: prometheus.NewDesc("slurm_partition_cpus_allocated", "Allocated CPUs for partition", labels, nil),
- idle: prometheus.NewDesc("slurm_partition_cpus_idle", "Idle CPUs for partition", labels, nil),
- other: prometheus.NewDesc("slurm_partition_cpus_other", "Other CPUs for partition", labels, nil),
- pending: prometheus.NewDesc("slurm_partition_jobs_pending", "Pending jobs for partition", labels, nil),
- total: prometheus.NewDesc("slurm_partition_cpus_total", "Total CPUs for partition", labels, nil),
- }
+ labels := []string{"partition"}
+ return &PartitionsCollector{
+ allocated: prometheus.NewDesc("slurm_partition_cpus_allocated", "Allocated CPUs for partition", labels,nil),
+ idle: prometheus.NewDesc("slurm_partition_cpus_idle", "Idle CPUs for partition", labels,nil),
+ other: prometheus.NewDesc("slurm_partition_cpus_other", "Other CPUs for partition", labels,nil),
+ pending: prometheus.NewDesc("slurm_partition_jobs_pending", "Pending jobs for partition", labels,nil),
+ total: prometheus.NewDesc("slurm_partition_cpus_total", "Total CPUs for partition", labels,nil),
+ }
}
func (pc *PartitionsCollector) Describe(ch chan<- *prometheus.Desc) {
- ch <- pc.allocated
- ch <- pc.idle
- ch <- pc.other
- ch <- pc.pending
- ch <- pc.total
+ ch <- pc.allocated
+ ch <- pc.idle
+ ch <- pc.other
+ ch <- pc.pending
+ ch <- pc.total
}
func (pc *PartitionsCollector) Collect(ch chan<- prometheus.Metric) {
- pm := GetPartitionsMetrics()
- for p := range pm {
- if pm[p].allocated > 0 {
- ch <- prometheus.MustNewConstMetric(pc.allocated, prometheus.GaugeValue, pm[p].allocated, p)
- }
- if pm[p].idle > 0 {
- ch <- prometheus.MustNewConstMetric(pc.idle, prometheus.GaugeValue, pm[p].idle, p)
- }
- if pm[p].other > 0 {
- ch <- prometheus.MustNewConstMetric(pc.other, prometheus.GaugeValue, pm[p].other, p)
- }
- if pm[p].pending > 0 {
- ch <- prometheus.MustNewConstMetric(pc.pending, prometheus.GaugeValue, pm[p].pending, p)
- }
- if pm[p].total > 0 {
- ch <- prometheus.MustNewConstMetric(pc.total, prometheus.GaugeValue, pm[p].total, p)
- }
- }
+ pm := GetPartitionsMetrics()
+ for p := range pm {
+ if pm[p].allocated > 0 {
+ ch <- prometheus.MustNewConstMetric(pc.allocated, prometheus.GaugeValue, pm[p].allocated, p)
+ }
+ if pm[p].idle > 0 {
+ ch <- prometheus.MustNewConstMetric(pc.idle, prometheus.GaugeValue, pm[p].idle, p)
+ }
+ if pm[p].other > 0 {
+ ch <- prometheus.MustNewConstMetric(pc.other, prometheus.GaugeValue, pm[p].other, p)
+ }
+ if pm[p].pending > 0 {
+ ch <- prometheus.MustNewConstMetric(pc.pending, prometheus.GaugeValue, pm[p].pending, p)
+ }
+ if pm[p].total > 0 {
+ ch <- prometheus.MustNewConstMetric(pc.total, prometheus.GaugeValue, pm[p].total, p)
+ }
+ }
}
diff --git a/sshare.go b/sshare.go
index 3e76a09..7f808ea 100644
--- a/sshare.go
+++ b/sshare.go
@@ -23,26 +23,26 @@ import (
)
type FairShareMetrics struct {
- fairshare float64
+ fairshare float64
}
func ParseFairShareMetrics(sshareOutput []byte) map[string]*FairShareMetrics {
- accounts := make(map[string]*FairShareMetrics)
- lines := strings.Split(string(sshareOutput), "\n")
- for _, line := range lines {
- if !strings.HasPrefix(line, " ") {
- if strings.Contains(line, "|") {
- account := strings.Trim(strings.Split(line, "|")[0], " ")
- _, key := accounts[account]
- if !key {
- accounts[account] = &FairShareMetrics{0}
- }
- fairshare, _ := strconv.ParseFloat(strings.Split(line, "|")[1], 64)
- accounts[account].fairshare = fairshare
- }
- }
- }
- return accounts
+ accounts := make(map[string]*FairShareMetrics)
+ lines := strings.Split(string(sshareOutput), "\n")
+ for _, line := range lines {
+ if ! strings.HasPrefix(line," ") {
+ if strings.Contains(line,"|") {
+ account := strings.Trim(strings.Split(line,"|")[0]," ")
+ _,key := accounts[account]
+ if !key {
+ accounts[account] = &FairShareMetrics{0}
+ }
+ fairshare,_ := strconv.ParseFloat(strings.Split(line,"|")[1],64)
+ accounts[account].fairshare = fairshare
+ }
+ }
+ }
+ return accounts
}
func GetFairShareMetrics() map[string]*FairShareMetrics {
@@ -50,23 +50,23 @@ func GetFairShareMetrics() map[string]*FairShareMetrics {
}
type FairShareCollector struct {
- fairshare *prometheus.Desc
+ fairshare *prometheus.Desc
}
func NewFairShareCollector() *FairShareCollector {
- labels := []string{"account"}
- return &FairShareCollector{
- fairshare: prometheus.NewDesc("slurm_account_fairshare", "FairShare for account", labels, nil),
- }
+ labels := []string{"account"}
+ return &FairShareCollector{
+ fairshare: prometheus.NewDesc("slurm_account_fairshare","FairShare for account" , labels,nil),
+ }
}
func (fsc *FairShareCollector) Describe(ch chan<- *prometheus.Desc) {
- ch <- fsc.fairshare
+ ch <- fsc.fairshare
}
func (fsc *FairShareCollector) Collect(ch chan<- prometheus.Metric) {
- fsm := GetFairShareMetrics()
- for f := range fsm {
- ch <- prometheus.MustNewConstMetric(fsc.fairshare, prometheus.GaugeValue, fsm[f].fairshare, f)
- }
+ fsm := GetFairShareMetrics()
+ for f := range fsm {
+ ch <- prometheus.MustNewConstMetric(fsc.fairshare, prometheus.GaugeValue, fsm[f].fairshare, f)
+ }
}
diff --git a/users.go b/users.go
index abcc02d..a9a7dc1 100644
--- a/users.go
+++ b/users.go
@@ -17,91 +17,91 @@ along with this program. If not, see <https://www.gnu.org/licenses/>. */
package main
import (
- "github.com/prometheus/client_golang/prometheus"
- "regexp"
- "strconv"
- "strings"
+ "strings"
+ "strconv"
+ "regexp"
+ "github.com/prometheus/client_golang/prometheus"
)
type UserJobMetrics struct {
- pending float64
- running float64
- running_cpus float64
- suspended float64
+ pending float64
+ running float64
+ running_cpus float64
+ suspended float64
}
func ParseUsersMetrics(squeueOutput []byte) map[string]*UserJobMetrics {
- users := make(map[string]*UserJobMetrics)
- lines := strings.Split(string(squeueOutput), "\n")
- for _, line := range lines {
- if strings.Contains(line, "|") {
- user := strings.Split(line, "|")[1]
- _, key := users[user]
- if !key {
- users[user] = &UserJobMetrics{0, 0, 0, 0}
- }
- state := strings.Split(line, "|")[2]
- state = strings.ToLower(state)
- cpus, _ := strconv.ParseFloat(strings.Split(line, "|")[3], 64)
- pending := regexp.MustCompile(`^pending`)
- running := regexp.MustCompile(`^running`)
- suspended := regexp.MustCompile(`^suspended`)
- switch {
- case pending.MatchString(state) == true:
- users[user].pending++
- case running.MatchString(state) == true:
- users[user].running++
- users[user].running_cpus += cpus
- case suspended.MatchString(state) == true:
- users[user].suspended++
- }
- }
- }
- return users
+ users := make(map[string]*UserJobMetrics)
+ lines := strings.Split(string(squeueOutput), "\n")
+ for _, line := range lines {
+ if strings.Contains(line,"|") {
+ user := strings.Split(line,"|")[1]
+ _,key := users[user]
+ if !key {
+ users[user] = &UserJobMetrics{0,0,0,0}
+ }
+ state := strings.Split(line,"|")[2]
+ state = strings.ToLower(state)
+ cpus,_ := strconv.ParseFloat(strings.Split(line,"|")[3],64)
+ pending := regexp.MustCompile(`^pending`)
+ running := regexp.MustCompile(`^running`)
+ suspended := regexp.MustCompile(`^suspended`)
+ switch {
+ case pending.MatchString(state) == true:
+ users[user].pending++
+ case running.MatchString(state) == true:
+ users[user].running++
+ users[user].running_cpus += cpus
+ case suspended.MatchString(state) == true:
+ users[user].suspended++
+ }
+ }
+ }
+ return users
}
func GetUsersMetrics() map[string]*UserJobMetrics {
- return ParseUsersMetrics(Subprocess("squeue", "-a", "-r", "-h", "-o %A|%u|%T|%C"))
+ return ParseUsersMetrics(Subprocess("squeue", "-a", "-r", "-h", "-o %A|%u|%T|%C"))
}
type UsersCollector struct {
- pending *prometheus.Desc
- running *prometheus.Desc
- running_cpus *prometheus.Desc
- suspended *prometheus.Desc
+ pending *prometheus.Desc
+ running *prometheus.Desc
+ running_cpus *prometheus.Desc
+ suspended *prometheus.Desc
}
func NewUsersCollector() *UsersCollector {
- labels := []string{"user"}
- return &UsersCollector{
- pending: prometheus.NewDesc("slurm_user_jobs_pending", "Pending jobs for user", labels, nil),
- running: prometheus.NewDesc("slurm_user_jobs_running", "Running jobs for user", labels, nil),
- running_cpus: prometheus.NewDesc("slurm_user_cpus_running", "Running cpus for user", labels, nil),
- suspended: prometheus.NewDesc("slurm_user_jobs_suspended", "Suspended jobs for user", labels, nil),
- }
+ labels := []string{"user"}
+ return &UsersCollector {
+ pending: prometheus.NewDesc("slurm_user_jobs_pending", "Pending jobs for user", labels, nil),
+ running: prometheus.NewDesc("slurm_user_jobs_running", "Running jobs for user", labels, nil),
+ running_cpus: prometheus.NewDesc("slurm_user_cpus_running", "Running cpus for user", labels, nil),
+ suspended: prometheus.NewDesc("slurm_user_jobs_suspended", "Suspended jobs for user", labels, nil),
+ }
}
func (uc *UsersCollector) Describe(ch chan<- *prometheus.Desc) {
- ch <- uc.pending
- ch <- uc.running
- ch <- uc.running_cpus
- ch <- uc.suspended
+ ch <- uc.pending
+ ch <- uc.running
+ ch <- uc.running_cpus
+ ch <- uc.suspended
}
func (uc *UsersCollector) Collect(ch chan<- prometheus.Metric) {
- um := GetUsersMetrics()
- for u := range um {
- if um[u].pending > 0 {
- ch <- prometheus.MustNewConstMetric(uc.pending, prometheus.GaugeValue, um[u].pending, u)
- }
- if um[u].running > 0 {
- ch <- prometheus.MustNewConstMetric(uc.running, prometheus.GaugeValue, um[u].running, u)
- }
- if um[u].running_cpus > 0 {
- ch <- prometheus.MustNewConstMetric(uc.running_cpus, prometheus.GaugeValue, um[u].running_cpus, u)
- }
- if um[u].suspended > 0 {
- ch <- prometheus.MustNewConstMetric(uc.suspended, prometheus.GaugeValue, um[u].suspended, u)
- }
- }
+ um := GetUsersMetrics()
+ for u := range um {
+ if um[u].pending > 0 {
+ ch <- prometheus.MustNewConstMetric(uc.pending, prometheus.GaugeValue, um[u].pending, u)
+ }
+ if um[u].running > 0 {
+ ch <- prometheus.MustNewConstMetric(uc.running, prometheus.GaugeValue, um[u].running, u)
+ }
+ if um[u].running_cpus > 0 {
+ ch <- prometheus.MustNewConstMetric(uc.running_cpus, prometheus.GaugeValue, um[u].running_cpus, u)
+ }
+ if um[u].suspended > 0 {
+ ch <- prometheus.MustNewConstMetric(uc.suspended, prometheus.GaugeValue, um[u].suspended, u)
+ }
+ }
}