Skip to content
This repository has been archived by the owner on Nov 13, 2024. It is now read-only.

Commit

Permalink
Issue-43 Add Node Health Prom Metrics per Chain (#44)
Browse files Browse the repository at this point in the history
  • Loading branch information
Maxitosh authored May 27, 2024
1 parent 93ffe84 commit fd302f2
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 3 deletions.
4 changes: 2 additions & 2 deletions internal/node_selector_service/models/qos_node.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ func NewQosNode(morseNode *models.Node, pocketSession *models.Session, appSigner
}

func (n *QosNode) IsHealthy() bool {
return !n.isInTimeout() && n.IsSynced()
return !n.IsInTimeout() && n.IsSynced()
}

func (n *QosNode) IsSynced() bool {
Expand All @@ -80,7 +80,7 @@ func (n *QosNode) SetSynced(synced bool) {
n.synced = synced
}

func (n *QosNode) isInTimeout() bool {
func (n *QosNode) IsInTimeout() bool {
return !n.timeoutUntil.IsZero() && time.Now().Before(n.timeoutUntil)
}

Expand Down
64 changes: 63 additions & 1 deletion internal/session_registry/cached_session_registry_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,17 @@ import (
var (
counterSessionRequest *prometheus.CounterVec
histogramSessionRequestLatency *prometheus.HistogramVec
healthyNodesPerChainGauge *prometheus.GaugeVec
syncedNodesPerChainGauge *prometheus.GaugeVec
timeoutNodesPerChainGauge *prometheus.GaugeVec
ErrRecentlyFailed = errors.New("dispatch recently failed, returning early")
)

const (
blocksPerSession = 4
sessionPrimerInterval = time.Second * 5
ttlCacheCleanerInterval = time.Second * 15
nodeMetricsExporterInterval = time.Second * 20
reasonSessionSuccessCached = "session_cached"
reasonSessionSuccessColdHit = "session_cold_hit"
reasonSessionFailedBackoff = "session_failed_backoff"
Expand All @@ -53,7 +57,28 @@ func init() {
[]string{"cached"},
)

prometheus.MustRegister(counterSessionRequest, histogramSessionRequestLatency)
healthyNodesPerChainGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cached_client_session_healthy_nodes",
Help: "Number of healthy (synced + not in timeout) nodes per chain",
},
[]string{"chain_id"},
)
syncedNodesPerChainGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cached_client_session_synced_nodes",
Help: "Number of synced nodes per chain",
},
[]string{"chain_id"},
)
timeoutNodesPerChainGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "cached_client_session_timeout_nodes",
Help: "Number of nodes in timeout per chain",
},
[]string{"chain_id"},
)
prometheus.MustRegister(counterSessionRequest, histogramSessionRequestLatency, healthyNodesPerChainGauge, syncedNodesPerChainGauge, timeoutNodesPerChainGauge)
}

type CachedSessionRegistryService struct {
Expand All @@ -77,6 +102,7 @@ func NewCachedSessionRegistryService(poktClient pokt_v0.PocketService, appRegist
go nodeCache.Start()
cachedRegistry.startTTLCacheCleaner()
cachedRegistry.startSessionUpdater()
cachedRegistry.startNodeMetricsExporter()
return cachedRegistry
}

Expand Down Expand Up @@ -282,3 +308,39 @@ func (c *CachedSessionRegistryService) shouldBackoffDispatchFailure() bool {
func getSessionCacheKey(req *models.GetSessionRequest) string {
return fmt.Sprintf("%s-%s-%d", req.AppPubKey, req.Chain, req.SessionHeight)
}

func (c *CachedSessionRegistryService) exportNodeMetrics() {
nodesMap := c.GetNodesMap()
for sessionKey, sessionItem := range nodesMap {
chainId := sessionKey.Chain
var healthyNodesCount, syncedNodesCount, timeoutNodesCount int

for _, node := range sessionItem.Value() {
if node.IsHealthy() {
healthyNodesCount++
}
if node.IsSynced() {
syncedNodesCount++
}
if node.IsInTimeout() {
timeoutNodesCount++
}
}

healthyNodesPerChainGauge.WithLabelValues(chainId).Set(float64(healthyNodesCount))
syncedNodesPerChainGauge.WithLabelValues(chainId).Set(float64(syncedNodesCount))
timeoutNodesPerChainGauge.WithLabelValues(chainId).Set(float64(timeoutNodesCount))
}
}

func (c *CachedSessionRegistryService) startNodeMetricsExporter() {
ticker := time.Tick(nodeMetricsExporterInterval)
go func() {
for {
select {
case <-ticker:
c.exportNodeMetrics()
}
}
}()
}

0 comments on commit fd302f2

Please sign in to comment.