diff --git a/gas/backend.go b/gas/backend.go
index 7aa46971..68d91a1b 100644
--- a/gas/backend.go
+++ b/gas/backend.go
@@ -21,8 +21,8 @@ import (
 )
 
 var (
-	metricNodeMinPrice  = promauto.NewGauge(prometheus.GaugeOpts{Name: "oasis_oasis_web3_gateway_gas_orcale_node_min_price", Help: "Min gas price periodically queried from the node."})
-	metricComputedPrice = promauto.NewGauge(prometheus.GaugeOpts{Name: "oasis_oasis_web3_gateway_gas_oracle_computed_price", Help: "Computed recommended gas price based on recent full blocks. -1 if none (no recent full blocks)."})
+	metricNodeMinPrice  = promauto.NewGauge(prometheus.GaugeOpts{Name: "oasis_web3_gateway_gas_oracle_node_min_price", Help: "Min gas price periodically queried from the node."})
+	metricComputedPrice = promauto.NewGauge(prometheus.GaugeOpts{Name: "oasis_web3_gateway_gas_oracle_computed_price", Help: "Computed recommended gas price based on recent full blocks. -1 if none (no recent full blocks)."})
 )
 
 // Backend is the gas price oracle backend.
diff --git a/indexer/backend_cache.go b/indexer/backend_cache.go index c9f546ba..36ed48c4 100644 --- a/indexer/backend_cache.go +++ b/indexer/backend_cache.go @@ -27,21 +27,21 @@ const periodicMetricsInterval = 60 * time.Second var ( metricCacheHits = promauto.NewGaugeVec( prometheus.GaugeOpts{ - Name: "oasis_oasis_web3_gateway_cache_hits", + Name: "oasis_web3_gateway_cache_hits", Help: "Number of cache hits.", }, []string{"cache"}, ) metricCacheMisses = promauto.NewGaugeVec( prometheus.GaugeOpts{ - Name: "oasis_oasis_web3_gateway_cache_misses", + Name: "oasis_web3_gateway_cache_misses", Help: "Number of cache misses.", }, []string{"cache"}, ) metricCacheHitRatio = promauto.NewGaugeVec( prometheus.GaugeOpts{ - Name: "oasis_oasis_web3_gateway_cache_hit_ratio", + Name: "oasis_web3_gateway_cache_hit_ratio", Help: "Percent of Hits over all accesses (Hits + Misses).", }, []string{"cache"}, diff --git a/indexer/indexer.go b/indexer/indexer.go index d91d369f..179068ef 100644 --- a/indexer/indexer.go +++ b/indexer/indexer.go @@ -32,9 +32,9 @@ const ( ) var ( - metricBlockIndexed = promauto.NewGauge(prometheus.GaugeOpts{Name: "oasis_oasis_web3_gateway_block_indexed", Help: "Indexed block heights."}) - metricBlockPruned = promauto.NewGauge(prometheus.GaugeOpts{Name: "oasis_oasis_web3_gateway_block_pruned", Help: "Pruned block heights."}) - metricHealthy = promauto.NewGauge(prometheus.GaugeOpts{Name: "oasis_oasis_web3_gateway_health", Help: "1 if gateway healthcheck is reporting as healthy, 0 otherwise."}) + metricBlockIndexed = promauto.NewGauge(prometheus.GaugeOpts{Name: "oasis_web3_gateway_block_indexed", Help: "Indexed block heights."}) + metricBlockPruned = promauto.NewGauge(prometheus.GaugeOpts{Name: "oasis_web3_gateway_block_pruned", Help: "Pruned block heights."}) + metricHealthy = promauto.NewGauge(prometheus.GaugeOpts{Name: "oasis_web3_gateway_indexer_health", Help: "1 if gateway indexer healthcheck is reporting as healthy, 0 otherwise."}) ) // ErrNotHealthy is the 
error returned if the gateway is unhealthy. diff --git a/main.go b/main.go index 45a1b8bf..6b19351f 100644 --- a/main.go +++ b/main.go @@ -271,8 +271,10 @@ func runRoot() error { } } - w3.RegisterAPIs(rpc.GetRPCAPIs(ctx, rc, archiveClient, backend, gasPriceOracle, cfg.Gateway, es)) - w3.RegisterHealthChecks([]server.HealthCheck{indx}) + apis, checks := rpc.GetRPCAPIs(ctx, rc, archiveClient, backend, gasPriceOracle, cfg.Gateway, es) + w3.RegisterAPIs(apis) + checks = append(checks, indx) + w3.RegisterHealthChecks(checks) svr := server.Server{ Config: cfg, diff --git a/rpc/apis.go b/rpc/apis.go index 7c6c5862..fa1f8b2a 100644 --- a/rpc/apis.go +++ b/rpc/apis.go @@ -20,74 +20,71 @@ import ( "github.com/oasisprotocol/oasis-web3-gateway/rpc/oasis" "github.com/oasisprotocol/oasis-web3-gateway/rpc/txpool" "github.com/oasisprotocol/oasis-web3-gateway/rpc/web3" + "github.com/oasisprotocol/oasis-web3-gateway/server" ) -// GetRPCAPIs returns the list of all APIs. +// GetRPCAPIs returns the list of enabled RPC APIs and the health checks that accompany them; ctx is passed to oasis.NewPublicAPI when the oasis_ APIs are exposed. func GetRPCAPIs( - _ context.Context, + ctx context.Context, client client.RuntimeClient, archiveClient *archive.Client, backend indexer.Backend, gasPriceOracle gas.Backend, config *conf.GatewayConfig, eventSystem *eventFilters.EventSystem, -) []ethRpc.API { +) ([]ethRpc.API, []server.HealthCheck) { var apis []ethRpc.API + var healthChecks []server.HealthCheck + // Web3 JSON-RPC Spec APIs - always enabled. 
web3Service := web3.NewPublicAPI() ethService := eth.NewPublicAPI(client, archiveClient, logging.GetLogger("eth_rpc"), config.ChainID, backend, gasPriceOracle, config.MethodLimits) netService := net.NewPublicAPI(config.ChainID) txpoolService := txpool.NewPublicAPI() filtersService := filters.NewPublicAPI(client, logging.GetLogger("eth_filters"), backend, eventSystem) - oasisService := oasis.NewPublicAPI(client, logging.GetLogger("oasis")) - if config.Monitoring.Enabled() { web3Service = web3.NewMetricsWrapper(web3Service) netService = net.NewMetricsWrapper(netService) ethService = ethmetrics.NewMetricsWrapper(ethService, logging.GetLogger("eth_rpc_metrics"), backend) txpoolService = txpool.NewMetricsWrapper(txpoolService) filtersService = filters.NewMetricsWrapper(filtersService) - oasisService = oasis.NewMetricsWrapper(oasisService) } - apis = append(apis, ethRpc.API{ Namespace: "web3", - Version: "1.0", Service: web3Service, - Public: true, }, ethRpc.API{ Namespace: "net", - Version: "1.0", Service: netService, - Public: true, }, ethRpc.API{ Namespace: "eth", - Version: "1.0", Service: ethService, - Public: true, }, ethRpc.API{ Namespace: "txpool", - Version: "1.0", Service: txpoolService, - Public: true, }, ethRpc.API{ Namespace: "eth", - Version: "1.0", Service: filtersService, - Public: true, }, - ethRpc.API{ + ) + + // Configure oasis_ APIs if enabled. 
+ if config.ExposeOasisRPCs { + oasisService, oasisHealth := oasis.NewPublicAPI(ctx, client, logging.GetLogger("oasis")) + if config.Monitoring.Enabled() { + oasisService = oasis.NewMetricsWrapper(oasisService) + } + + apis = append(apis, ethRpc.API{ Namespace: "oasis", - Version: "1.0", Service: oasisService, - Public: config.ExposeOasisRPCs, - }, - ) + }) + healthChecks = append(healthChecks, oasisHealth) + } - return apis + return apis, healthChecks } diff --git a/rpc/eth/filters/metrics.go b/rpc/eth/filters/metrics.go index 76c14c79..77375ee9 100644 --- a/rpc/eth/filters/metrics.go +++ b/rpc/eth/filters/metrics.go @@ -14,7 +14,7 @@ import ( var ( durations = promauto.NewHistogramVec( prometheus.HistogramOpts{ - Name: "oasis_oasis_web3_gateway_subscription_seconds", + Name: "oasis_web3_gateway_subscription_seconds", // Buckets ranging from 1 second to 24 hours. Buckets: []float64{1, 10, 30, 60, 600, 1800, 3600, 7200, 21600, 86400}, Help: "Histogram for the eth subscription API subscriptions duration.", @@ -23,7 +23,7 @@ var ( ) inflightSubs = promauto.NewGaugeVec( prometheus.GaugeOpts{ - Name: "oasis_oasis_web3_gateway_subscription_inflight", + Name: "oasis_web3_gateway_subscription_inflight", Help: "Number of concurrent eth inflight subscriptions.", }, []string{"method_name"}, diff --git a/rpc/eth/metrics/api.go b/rpc/eth/metrics/api.go index bd6d67fc..1d0930be 100644 --- a/rpc/eth/metrics/api.go +++ b/rpc/eth/metrics/api.go @@ -22,7 +22,7 @@ import ( var requestHeights = promauto.NewHistogramVec( prometheus.HistogramOpts{ - Name: "oasis_oasis_web3_gateway_api_request_heights", + Name: "oasis_web3_gateway_api_request_heights", Buckets: []float64{0, 1, 2, 3, 5, 10, 50, 100, 500, 1000}, Help: "Histogram of eth API request heights (difference from the latest height).", }, diff --git a/rpc/metrics/metrics.go b/rpc/metrics/metrics.go index 8b14f849..1e2a1dc5 100644 --- a/rpc/metrics/metrics.go +++ b/rpc/metrics/metrics.go @@ -6,11 +6,11 @@ import ( ) var ( - 
durations = promauto.NewHistogramVec(prometheus.HistogramOpts{Name: "oasis_oasis_web3_gateway_api_seconds", Buckets: []float64{0.00001, 0.0001, .001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10}, Help: "Histogram for the eth API requests duration."}, []string{"method_name"}) - requests = promauto.NewCounterVec(prometheus.CounterOpts{Name: "oasis_oasis_web3_gateway_api_request", Help: "Counter for API requests."}, []string{"method_name"}) - failures = promauto.NewCounterVec(prometheus.CounterOpts{Name: "oasis_oasis_web3_gateway_api_failure", Help: "Counter for API request failures."}, []string{"method_name"}) - successes = promauto.NewCounterVec(prometheus.CounterOpts{Name: "oasis_oasis_web3_gateway_api_success", Help: "Counter for API successful requests."}, []string{"method_name"}) - inflight = promauto.NewGaugeVec(prometheus.GaugeOpts{Name: "oasis_oasis_web3_gateway_api_inflight", Help: "Number of inflight API request."}, []string{"method_name"}) + durations = promauto.NewHistogramVec(prometheus.HistogramOpts{Name: "oasis_web3_gateway_api_seconds", Buckets: []float64{0.00001, 0.0001, .001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10}, Help: "Histogram for the eth API requests duration."}, []string{"method_name"}) + requests = promauto.NewCounterVec(prometheus.CounterOpts{Name: "oasis_web3_gateway_api_request", Help: "Counter for API requests."}, []string{"method_name"}) + failures = promauto.NewCounterVec(prometheus.CounterOpts{Name: "oasis_web3_gateway_api_failure", Help: "Counter for API request failures."}, []string{"method_name"}) + successes = promauto.NewCounterVec(prometheus.CounterOpts{Name: "oasis_web3_gateway_api_success", Help: "Counter for API successful requests."}, []string{"method_name"}) + inflight = promauto.NewGaugeVec(prometheus.GaugeOpts{Name: "oasis_web3_gateway_api_inflight", Help: "Number of inflight API request."}, []string{"method_name"}) ) // GetAPIMethodMetrics returns the method metrics for the specified API call. 
diff --git a/rpc/oasis/api.go b/rpc/oasis/api.go
index aff4f8b9..ef3d0df6 100644
--- a/rpc/oasis/api.go
+++ b/rpc/oasis/api.go
@@ -9,11 +9,12 @@ import (
 	"github.com/oasisprotocol/oasis-core/go/common/logging"
 	"github.com/oasisprotocol/oasis-sdk/client-sdk/go/client"
 	"github.com/oasisprotocol/oasis-sdk/client-sdk/go/modules/core"
+	"github.com/oasisprotocol/oasis-web3-gateway/server"
 )
 
 var ErrInternalError = errors.New("internal error")
 
-// API is the net_ prefixed set of APIs in the Web3 JSON-RPC spec.
+// API is the oasis_ prefixed set of APIs.
 type API interface {
 	// CallDataPublicKey returns the calldata public key for the runtime with the provided ID.
 	CallDataPublicKey(ctx context.Context) (*CallDataPublicKey, error)
@@ -35,19 +36,23 @@ type CallDataPublicKey struct {
 
 type publicAPI struct {
 	client client.RuntimeClient
-	Logger *logging.Logger
+	logger *logging.Logger
 }
 
-// NewPublicAPI creates an instance of the Web3 API.
+// NewPublicAPI creates an instance of the oasis_ API and starts its health checker, which runs until ctx is cancelled.
 func NewPublicAPI(
+	ctx context.Context,
 	client client.RuntimeClient,
 	logger *logging.Logger,
-) API {
-	return &publicAPI{client: client, Logger: logger}
+) (API, server.HealthCheck) {
+	health := &healthChecker{client: client, logger: logger}
+	go health.run(ctx)
+
+	return &publicAPI{client: client, logger: logger}, health
 }
 
 func (api *publicAPI) CallDataPublicKey(ctx context.Context) (*CallDataPublicKey, error) {
-	logger := api.Logger.With("method", "oasis_callDataPublicKey")
+	logger := api.logger.With("method", "oasis_callDataPublicKey")
 	res, err := core.NewV1(api.client).CallDataPublicKey(ctx)
 	if err != nil {
 		logger.Error("failed to fetch public key", "err", err)
diff --git a/rpc/oasis/health.go b/rpc/oasis/health.go
new file mode 100644
index 00000000..0ca07477
--- /dev/null
+++ b/rpc/oasis/health.go
@@ -0,0 +1,85 @@
+package oasis
+
+import (
+	"context"
+	"fmt"
+	"sync/atomic"
+	"time"
+
+	"github.com/oasisprotocol/oasis-core/go/common/logging"
+	"github.com/oasisprotocol/oasis-sdk/client-sdk/go/client"
+	"github.com/oasisprotocol/oasis-sdk/client-sdk/go/modules/core"
+)
+
+const (
+	// healthCheckInterval is the pause between two consecutive health probes.
+	healthCheckInterval = 30 * time.Second
+	// healthIterationTimeout bounds the duration of a single health probe.
+	healthIterationTimeout = 15 * time.Second
+)
+
+// healthChecker periodically probes the runtime client and caches the outcome
+// so that Health can answer without querying the node.
+type healthChecker struct {
+	client client.RuntimeClient
+	logger *logging.Logger
+
+	// health is 1 when the last probe succeeded and 0 otherwise; accessed atomically.
+	health uint32
+}
+
+// Health implements server.HealthCheck.
+func (h *healthChecker) Health() error {
+	if atomic.LoadUint32(&h.health) == 0 {
+		return fmt.Errorf("oasis API not healthy")
+	}
+	return nil
+}
+
+// updateHealth records the outcome of the most recent probe.
+func (h *healthChecker) updateHealth(healthy bool) {
+	if healthy {
+		atomic.StoreUint32(&h.health, 1)
+	} else {
+		atomic.StoreUint32(&h.health, 0)
+	}
+}
+
+// run probes the runtime once right away and then every healthCheckInterval
+// until ctx is cancelled, at which point the checker is marked unhealthy.
+func (h *healthChecker) run(ctx context.Context) {
+	ticker := time.NewTicker(healthCheckInterval)
+	defer ticker.Stop()
+
+	// Probe immediately: the health flag starts at zero, so waiting for the
+	// first tick would report the oasis_ API as unhealthy for a full interval.
+	h.check(ctx)
+
+	for {
+		select {
+		case <-ticker.C:
+			h.check(ctx)
+		case <-ctx.Done():
+			h.updateHealth(false)
+			h.logger.Debug("health checker stopping", "reason", ctx.Err())
+			return
+		}
+	}
+}
+
+// check performs a single probe bounded by healthIterationTimeout and records
+// its outcome.
+func (h *healthChecker) check(ctx context.Context) {
+	probeCtx, cancel := context.WithTimeout(ctx, healthIterationTimeout)
+	defer cancel()
+
+	// Query public keys.
+	if _, err := core.NewV1(h.client).CallDataPublicKey(probeCtx); err != nil {
+		h.logger.Error("failed to fetch public key", "err", err)
+		h.updateHealth(false)
+		return
+	}
+
+	h.logger.Debug("oasis_ RPC healthy")
+	h.updateHealth(true)
+}
diff --git a/server/json_rpc.go b/server/json_rpc.go
index 5075076f..5a407f02 100644
--- a/server/json_rpc.go
+++ b/server/json_rpc.go
@@ -11,11 +11,15 @@ import (
 
 	"github.com/ethereum/go-ethereum/rpc"
 	"github.com/gorilla/mux"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
 	"github.com/rs/cors"
 
 	"github.com/oasisprotocol/oasis-core/go/common/logging"
 )
 
+var metricHealthy = promauto.NewGauge(prometheus.GaugeOpts{Name: "oasis_web3_gateway_health", Help: "1 if gateway healthcheck is reporting as healthy, 0 otherwise."})
+
 // httpConfig is the JSON-RPC/HTTP configuration.
type httpConfig struct { Modules []string @@ -77,9 +81,11 @@ func healthCheckHandler(healthChecks []HealthCheck) func(w http.ResponseWriter, for _, h := range healthChecks { if err := h.Health(); err != nil { w.WriteHeader(http.StatusServiceUnavailable) + metricHealthy.Set(0) return } } + metricHealthy.Set(1) w.WriteHeader(http.StatusOK) } } diff --git a/storage/psql/metrics.go b/storage/psql/metrics.go index 72e90226..dd5867e6 100644 --- a/storage/psql/metrics.go +++ b/storage/psql/metrics.go @@ -13,7 +13,7 @@ import ( "github.com/oasisprotocol/oasis-web3-gateway/storage" ) -var durations = promauto.NewHistogramVec(prometheus.HistogramOpts{Name: "oasis_oasis_web3_gateway_psql_query_seconds", Buckets: []float64{0.001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10}, Help: "Histogram for the postgresql query duration."}, []string{"query"}) +var durations = promauto.NewHistogramVec(prometheus.HistogramOpts{Name: "oasis_web3_gateway_psql_query_seconds", Buckets: []float64{0.001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10}, Help: "Histogram for the postgresql query duration."}, []string{"query"}) func measureDuration(label string) func() { timer := prometheus.NewTimer(durations.WithLabelValues(label)) diff --git a/tests/rpc/health_test.go b/tests/rpc/health_test.go index 81de9287..75191a19 100644 --- a/tests/rpc/health_test.go +++ b/tests/rpc/health_test.go @@ -11,7 +11,7 @@ import ( func TestHealthCheck(t *testing.T) { // Ensure the initial health-check was done. 
- <-time.After(20 * time.Second) + <-time.After(50 * time.Second) ctx, cancel := context.WithTimeout(context.Background(), OasisBlockTimeout) defer cancel() diff --git a/tests/rpc/utils.go b/tests/rpc/utils.go index a2392f43..edc6542a 100644 --- a/tests/rpc/utils.go +++ b/tests/rpc/utils.go @@ -179,8 +179,10 @@ func Setup() error { return fmt.Errorf("setup: failed starting gas price oracle: %w", err) } - w3.RegisterAPIs(rpc.GetRPCAPIs(context.Background(), rc, nil, backend, gasPriceOracle, tests.TestsConfig.Gateway, es)) - w3.RegisterHealthChecks([]server.HealthCheck{indx}) + apis, checks := rpc.GetRPCAPIs(ctx, rc, nil, backend, gasPriceOracle, tests.TestsConfig.Gateway, es) + w3.RegisterAPIs(apis) + checks = append(checks, indx) + w3.RegisterHealthChecks(checks) if err = w3.Start(); err != nil { w3.Close()