Skip to content

Commit

Permalink
Add Nexus failure_reason metric tag (#1671)
Browse files Browse the repository at this point in the history
* Add failure_reason tag

* Remove redundant WorkflowTaskFailureReason tag
  • Loading branch information
bergundy authored Oct 18, 2024
1 parent 959f581 commit 1a13bf3
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 26 deletions.
26 changes: 13 additions & 13 deletions internal/common/metrics/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,19 +92,19 @@ const (

// Metric tag keys
const (
NamespaceTagName = "namespace"
ClientTagName = "client_name"
PollerTypeTagName = "poller_type"
WorkerTypeTagName = "worker_type"
WorkflowTypeNameTagName = "workflow_type"
ActivityTypeNameTagName = "activity_type"
NexusServiceTagName = "nexus_service"
NexusOperationTagName = "nexus_operation"
TaskQueueTagName = "task_queue"
OperationTagName = "operation"
CauseTagName = "cause"
WorkflowTaskFailureReason = "failure_reason"
RequestFailureCode = "status_code"
NamespaceTagName = "namespace"
ClientTagName = "client_name"
PollerTypeTagName = "poller_type"
WorkerTypeTagName = "worker_type"
WorkflowTypeNameTagName = "workflow_type"
ActivityTypeNameTagName = "activity_type"
NexusServiceTagName = "nexus_service"
NexusOperationTagName = "nexus_operation"
FailureReasonTagName = "failure_reason"
TaskQueueTagName = "task_queue"
OperationTagName = "operation"
CauseTagName = "cause"
RequestFailureCode = "status_code"
)

// Metric tag values
Expand Down
9 changes: 8 additions & 1 deletion internal/common/metrics/tags.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,13 @@ func NexusTags(service, operation, taskQueueName string) map[string]string {
}
}

// NexusTaskFailureTags builds the metric tag set used when reporting a failed
// Nexus task. The returned map holds exactly one entry, keyed by
// FailureReasonTagName, whose value is the supplied reason.
func NexusTaskFailureTags(reason string) map[string]string {
	tags := make(map[string]string, 1)
	tags[FailureReasonTagName] = reason
	return tags
}

// TaskQueueTags returns a set of tags for a task queue.
func TaskQueueTags(taskQueue string) map[string]string {
return map[string]string{
Expand All @@ -106,7 +113,7 @@ func PollerTags(pollerType string) map[string]string {
// WorkflowTaskFailedTags returns a set of tags for a workflow task failure.
func WorkflowTaskFailedTags(reason string) map[string]string {
return map[string]string{
WorkflowTaskFailureReason: reason,
FailureReasonTagName: reason,
}
}

Expand Down
17 changes: 15 additions & 2 deletions internal/internal_nexus_task_poller.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,21 @@ func (ntp *nexusTaskPoller) ProcessTask(task interface{}) error {
// Internal error processing the task.
// Failure from user handler.
// Special case for the start response with operation error.
if err != nil || failure != nil || res.Response.GetStartOperation().GetOperationError() != nil {
metricsHandler.Counter(metrics.NexusTaskExecutionFailedCounter).Inc(1)
if err != nil {
metricsHandler.
WithTags(metrics.NexusTaskFailureTags("internal_sdk_error")).
Counter(metrics.NexusTaskExecutionFailedCounter).
Inc(1)
} else if failure != nil {
metricsHandler.
WithTags(metrics.NexusTaskFailureTags("handler_error_" + failure.GetError().GetErrorType())).
Counter(metrics.NexusTaskExecutionFailedCounter).
Inc(1)
} else if e := res.Response.GetStartOperation().GetOperationError(); e != nil {
metricsHandler.
WithTags(metrics.NexusTaskFailureTags("operation_" + e.GetOperationState())).
Counter(metrics.NexusTaskExecutionFailedCounter).
Inc(1)
}

// Let the poller machinery drop the task, nothing to report back.
Expand Down
21 changes: 11 additions & 10 deletions test/nexus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,11 +146,12 @@ func (tc *testContext) requireTimer(t *assert.CollectT, metric, service, operati
}))
}

func (tc *testContext) requireCounter(t *assert.CollectT, metric, service, operation string) {
func (tc *testContext) requireFailureCounter(t *assert.CollectT, service, operation, failureType string) {
assert.True(t, slices.ContainsFunc(tc.metricsHandler.Counters(), func(ct *metrics.CapturedCounter) bool {
return ct.Name == metric &&
return ct.Name == metrics.NexusTaskExecutionFailedCounter &&
ct.Tags[metrics.NexusServiceTagName] == service &&
ct.Tags[metrics.NexusOperationTagName] == operation
ct.Tags[metrics.NexusOperationTagName] == operation &&
ct.Tags[metrics.FailureReasonTagName] == failureType
}))
}

Expand Down Expand Up @@ -256,7 +257,7 @@ func TestNexusSyncOperation(t *testing.T) {
tc.requireTimer(t, metrics.NexusTaskEndToEndLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskScheduleToStartLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskExecutionLatency, service.Name, syncOp.Name())
tc.requireCounter(t, metrics.NexusTaskExecutionFailedCounter, service.Name, syncOp.Name())
tc.requireFailureCounter(t, service.Name, syncOp.Name(), "operation_failed")
}, time.Second*3, time.Millisecond*100)
})

Expand All @@ -271,7 +272,7 @@ func TestNexusSyncOperation(t *testing.T) {
tc.requireTimer(t, metrics.NexusTaskEndToEndLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskScheduleToStartLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskExecutionLatency, service.Name, syncOp.Name())
tc.requireCounter(t, metrics.NexusTaskExecutionFailedCounter, service.Name, syncOp.Name())
tc.requireFailureCounter(t, service.Name, syncOp.Name(), "handler_error_INTERNAL")
}, time.Second*3, time.Millisecond*100)
})

Expand All @@ -286,7 +287,7 @@ func TestNexusSyncOperation(t *testing.T) {
tc.requireTimer(t, metrics.NexusTaskEndToEndLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskScheduleToStartLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskExecutionLatency, service.Name, syncOp.Name())
tc.requireCounter(t, metrics.NexusTaskExecutionFailedCounter, service.Name, syncOp.Name())
tc.requireFailureCounter(t, service.Name, syncOp.Name(), "handler_error_BAD_REQUEST")
}, time.Second*3, time.Millisecond*100)
})

Expand All @@ -301,7 +302,7 @@ func TestNexusSyncOperation(t *testing.T) {
tc.requireTimer(t, metrics.NexusTaskEndToEndLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskScheduleToStartLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskExecutionLatency, service.Name, syncOp.Name())
tc.requireCounter(t, metrics.NexusTaskExecutionFailedCounter, service.Name, syncOp.Name())
tc.requireFailureCounter(t, service.Name, syncOp.Name(), "handler_error_BAD_REQUEST")
}, time.Second*3, time.Millisecond*100)
})

Expand All @@ -316,7 +317,7 @@ func TestNexusSyncOperation(t *testing.T) {
tc.requireTimer(t, metrics.NexusTaskEndToEndLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskScheduleToStartLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskExecutionLatency, service.Name, syncOp.Name())
tc.requireCounter(t, metrics.NexusTaskExecutionFailedCounter, service.Name, syncOp.Name())
tc.requireFailureCounter(t, service.Name, syncOp.Name(), "handler_error_INTERNAL")
}, time.Second*3, time.Millisecond*100)
})

Expand All @@ -331,7 +332,7 @@ func TestNexusSyncOperation(t *testing.T) {
tc.requireTimer(t, metrics.NexusTaskEndToEndLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskScheduleToStartLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskExecutionLatency, service.Name, syncOp.Name())
tc.requireCounter(t, metrics.NexusTaskExecutionFailedCounter, service.Name, syncOp.Name())
tc.requireFailureCounter(t, service.Name, syncOp.Name(), "handler_error_BAD_REQUEST")
}, time.Second*3, time.Millisecond*100)
})

Expand All @@ -346,7 +347,7 @@ func TestNexusSyncOperation(t *testing.T) {
tc.requireTimer(t, metrics.NexusTaskEndToEndLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskScheduleToStartLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskExecutionLatency, service.Name, syncOp.Name())
tc.requireCounter(t, metrics.NexusTaskExecutionFailedCounter, service.Name, syncOp.Name())
tc.requireFailureCounter(t, service.Name, syncOp.Name(), "handler_error_INTERNAL")
}, time.Second*3, time.Millisecond*100)
})
}
Expand Down

0 comments on commit 1a13bf3

Please sign in to comment.