-
Notifications
You must be signed in to change notification settings - Fork 90
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

- **New Features**
  - Added a metrics planner to track and monitor the performance of job scheduling and planning processes
  - Introduced detailed telemetry metrics using OpenTelemetry for tracking plan processing, executions, and events
- **Improvements**
  - Enhanced the state update process with more granular error tracking and performance monitoring
  - Added flexibility in metric recording with a new `DoneWithoutTotalDuration` method
- **Telemetry**
  - Implemented comprehensive metrics tracking for plan processing, including duration, counts, and event distributions

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
- Loading branch information
Showing
5 changed files
with
258 additions
and
38 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
package planner | ||
|
||
import ( | ||
"go.opentelemetry.io/otel" | ||
"go.opentelemetry.io/otel/metric" | ||
|
||
"github.com/bacalhau-project/bacalhau/pkg/telemetry" | ||
) | ||
|
||
var ( | ||
Meter = otel.GetMeterProvider().Meter("planner") | ||
|
||
// Processing metrics | ||
processDuration = telemetry.Must(Meter.Float64Histogram( | ||
"planner.process.duration", | ||
metric.WithDescription("Time taken to process a single plan"), | ||
metric.WithUnit("s"), | ||
metric.WithExplicitBucketBoundaries(telemetry.DurationMsBuckets...), | ||
)) | ||
|
||
processPartDuration = telemetry.Must(Meter.Float64Histogram( | ||
"planner.process.part.duration", | ||
metric.WithDescription("Time taken for sub-operations within a planner operation"), | ||
metric.WithUnit("s"), | ||
metric.WithExplicitBucketBoundaries(telemetry.DurationMsBuckets...), | ||
)) | ||
|
||
processCount = telemetry.Must(Meter.Int64Counter( | ||
"planner.process.count", | ||
metric.WithDescription("Number of plans processed"), | ||
metric.WithUnit("1"), | ||
)) | ||
|
||
// State update metrics | ||
executionsCreated = telemetry.Must(Meter.Float64Histogram( | ||
"planner.executions.created", | ||
metric.WithDescription("Distribution of executions created per plan"), | ||
metric.WithUnit("1"), | ||
)) | ||
|
||
executionsUpdated = telemetry.Must(Meter.Float64Histogram( | ||
"planner.executions.updated", | ||
metric.WithDescription("Distribution of executions updated per plan"), | ||
metric.WithUnit("1"), | ||
)) | ||
|
||
jobsUpdated = telemetry.Must(Meter.Int64Counter( | ||
"planner.jobs.updated", | ||
metric.WithDescription("Number of jobs with state updates"), | ||
metric.WithUnit("1"), | ||
)) | ||
|
||
evaluationsCreated = telemetry.Must(Meter.Float64Histogram( | ||
"planner.evaluations.created", | ||
metric.WithDescription("Distribution of evaluations created per plan"), | ||
metric.WithUnit("1"), | ||
)) | ||
|
||
// History event metrics | ||
jobEventsAdded = telemetry.Must(Meter.Float64Histogram( | ||
"planner.events.job", | ||
metric.WithDescription("Distribution of job events added per plan"), | ||
metric.WithUnit("1"), | ||
)) | ||
|
||
execEventsAdded = telemetry.Must(Meter.Float64Histogram( | ||
"planner.events.execution", | ||
metric.WithDescription("Distribution of execution events added per plan"), | ||
metric.WithUnit("1"), | ||
)) | ||
) | ||
|
||
// Common attribute keys and values attached to planner metrics.

// AttrPlannerType is the attribute key carrying the name of the planner that
// recorded a measurement.
const AttrPlannerType = "planner_type"

// Names of the individual parts of a planner operation.
// NOTE(review): presumably used to label sub-operation timings; the call
// sites are not visible in this file — confirm.
const (
	AttrOperationPartBeginTx    = "begin_transaction"
	AttrOperationPartCreateExec = "create_execution"
	AttrOperationPartUpdateExec = "update_execution"
	AttrOperationPartUpdateJob  = "update_job"
	AttrOperationPartCreateEval = "create_evaluation"
	AttrOperationPartAddEvents  = "add_events"
)

// Outcome attribute: AttrOutcomeKey is the key, and AttrOutcomeSuccess and
// AttrOutcomeFailure are the values defined for it.
const (
	AttrOutcomeKey     = "outcome"
	AttrOutcomeSuccess = "success"
	AttrOutcomeFailure = "failure"
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
package planner | ||
|
||
import ( | ||
"context" | ||
|
||
"go.opentelemetry.io/otel/attribute" | ||
|
||
"github.com/bacalhau-project/bacalhau/pkg/models" | ||
"github.com/bacalhau-project/bacalhau/pkg/orchestrator" | ||
"github.com/bacalhau-project/bacalhau/pkg/telemetry" | ||
) | ||
|
||
// MetricsPlanner is a stateless planner whose only job is to record metrics
// describing the content of each plan it is handed: how many executions,
// evaluations and events the plan carries. The plan itself is left untouched.
type MetricsPlanner struct{}

// NewMetricsPlanner returns a ready-to-use MetricsPlanner. The type has no
// fields, so no configuration is required.
func NewMetricsPlanner() *MetricsPlanner {
	return new(MetricsPlanner)
}
|
||
// Process records metrics about plan content including executions, jobs, evaluations and events. | ||
func (s *MetricsPlanner) Process(ctx context.Context, plan *models.Plan) error { | ||
metrics := telemetry.NewMetricRecorder( | ||
attribute.String(AttrPlannerType, "metrics_planner"), | ||
) | ||
defer func() { | ||
metrics.Count(ctx, processCount) | ||
metrics.DoneWithoutTotalDuration(ctx) | ||
}() | ||
|
||
if len(plan.NewExecutions) > 0 { | ||
metrics.Histogram(ctx, executionsCreated, float64(len(plan.NewExecutions))) | ||
} | ||
if len(plan.UpdatedExecutions) > 0 { | ||
metrics.Histogram(ctx, executionsUpdated, float64(len(plan.UpdatedExecutions))) | ||
} | ||
if !plan.DesiredJobState.IsUndefined() { | ||
metrics.Count(ctx, jobsUpdated) | ||
} | ||
if len(plan.NewEvaluations) > 0 { | ||
metrics.Histogram(ctx, evaluationsCreated, float64(len(plan.NewEvaluations))) | ||
} | ||
if len(plan.JobEvents) > 0 { | ||
metrics.Histogram(ctx, jobEventsAdded, float64(len(plan.JobEvents))) | ||
} | ||
if len(plan.ExecutionEvents) > 0 { | ||
var totalEvents int | ||
for _, events := range plan.ExecutionEvents { | ||
totalEvents += len(events) | ||
} | ||
metrics.Histogram(ctx, execEventsAdded, float64(totalEvents)) | ||
} | ||
|
||
return nil | ||
} | ||
|
||
var _ orchestrator.Planner = (*MetricsPlanner)(nil) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters