Skip to content

Commit

Permalink
go/cometbft/apps/roothash: Process liveness statistics for workers only
Browse files Browse the repository at this point in the history
  • Loading branch information
peternose committed Oct 2, 2023
1 parent f06129e commit 4d18a37
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 56 deletions.
22 changes: 9 additions & 13 deletions go/consensus/cometbft/apps/roothash/finalization.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ import (
roothash "github.com/oasisprotocol/oasis-core/go/roothash/api"
"github.com/oasisprotocol/oasis-core/go/roothash/api/block"
"github.com/oasisprotocol/oasis-core/go/roothash/api/commitment"
scheduler "github.com/oasisprotocol/oasis-core/go/scheduler/api"
staking "github.com/oasisprotocol/oasis-core/go/staking/api"
)

Expand Down Expand Up @@ -181,23 +180,20 @@ func (app *rootHashApplication) tryFinalizeRoundInsideTx( //nolint: gocyclo
schedulerVote := sc.Commitment.ToVote()
for i, n := range rtState.Committee.Members {
vote, ok := sc.Votes[n.PublicKey]
// Make sure to not include nodes in multiple roles multiple times.
_, wasSeen := seen[n.PublicKey]
seen[n.PublicKey] = struct{}{}
switch {
case !ok && n.Role == scheduler.RoleBackupWorker && !pool.Discrepancy && !wasSeen:
// This is a backup worker only that did not submit a commitment and there was no
// discrepancy. Count the worker as live.
//
// Note that this skips the case where the node is both primary and backup and the
// primary did not commit as that should be treated as failure.
livenessStats.LiveRounds[i]++
case !ok:
continue
case !ok || vote == nil || wasSeen:
case vote == nil:
// Skip failures.
continue
default:
}

// Make sure to not include nodes in multiple roles multiple times.
if _, ok := seen[n.PublicKey]; ok {
continue
}
seen[n.PublicKey] = struct{}{}

// Resolve the entity owning the node.
var node *node.Node
node, err = regState.Node(ctx, n.PublicKey)
Expand Down
53 changes: 22 additions & 31 deletions go/consensus/cometbft/apps/roothash/liveness.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ import (
"math"

beacon "github.com/oasisprotocol/oasis-core/go/beacon/api"
"github.com/oasisprotocol/oasis-core/go/common/crypto/signature"
tmapi "github.com/oasisprotocol/oasis-core/go/consensus/cometbft/api"
registryState "github.com/oasisprotocol/oasis-core/go/consensus/cometbft/apps/registry/state"
registry "github.com/oasisprotocol/oasis-core/go/registry/api"
roothash "github.com/oasisprotocol/oasis-core/go/roothash/api"
"github.com/oasisprotocol/oasis-core/go/scheduler/api"
staking "github.com/oasisprotocol/oasis-core/go/staking/api"
)

Expand Down Expand Up @@ -42,51 +42,42 @@ func processLivenessStatistics(ctx *tmapi.Context, epoch beacon.EpochTime, rtSta
"slash_amount", slashParams.Amount,
)

// Collect per node liveness statistics as a single node can have multiple roles.
type Stats struct {
liveRounds uint64
finalizedProposals uint64
missedProposals uint64
}
statsPerNode := make(map[signature.PublicKey]*Stats)
for i, member := range rtState.Committee.Members {
stats, ok := statsPerNode[member.PublicKey]
if !ok {
stats = &Stats{}
statsPerNode[member.PublicKey] = stats
// Penalize worker nodes that were not live enough.
regState := registryState.NewMutableState(ctx.State())
for i, n := range rtState.Committee.Members {
if n.Role != api.RoleWorker {
// Workers are listed before backup workers.
break
}
stats.liveRounds += rtState.LivenessStatistics.LiveRounds[i]
stats.finalizedProposals += rtState.LivenessStatistics.FinalizedProposals[i]
stats.missedProposals += rtState.LivenessStatistics.MissedProposals[i]
}

// Penalize nodes that were not live enough.
regState := registryState.NewMutableState(ctx.State())
for nodeID, stats := range statsPerNode {
status, err := regState.NodeStatus(ctx, nodeID)
status, err := regState.NodeStatus(ctx, n.PublicKey)
if err != nil {
return fmt.Errorf("failed to retrieve status for node %s: %w", nodeID, err)
return fmt.Errorf("failed to retrieve status for node %s: %w", n.PublicKey, err)
}
if status.IsSuspended(rtState.Runtime.ID, epoch) {
continue
}

maxMissedProposals := ((stats.missedProposals + stats.finalizedProposals) * maxMissedProposalsPercent) / 100
liveRounds := rtState.LivenessStatistics.LiveRounds[i]
finalizedProposals := rtState.LivenessStatistics.FinalizedProposals[i]
missedProposals := rtState.LivenessStatistics.MissedProposals[i]

maxMissedProposals := ((missedProposals + finalizedProposals) * maxMissedProposalsPercent) / 100
if maxMissedProposalsPercent == 0 {
maxMissedProposals = math.MaxUint64
}

switch {
case stats.liveRounds >= minLiveRounds && stats.missedProposals <= maxMissedProposals:
case liveRounds >= minLiveRounds && missedProposals <= maxMissedProposals:
// Node is live.
status.RecordSuccess(rtState.Runtime.ID, epoch)
default:
// Node is faulty.
ctx.Logger().Debug("node deemed faulty",
"node_id", nodeID,
"live_rounds", stats.liveRounds,
"node_id", n.PublicKey,
"live_rounds", liveRounds,
"min_live_rounds", minLiveRounds,
"missed_proposals", stats.missedProposals,
"missed_proposals", missedProposals,
"max_missed_proposals", maxMissedProposals,
)

Expand All @@ -103,15 +94,15 @@ func processLivenessStatistics(ctx *tmapi.Context, epoch beacon.EpochTime, rtSta
}

// Slash if configured.
err = onRuntimeLivenessFailure(ctx, nodeID, &slashParams.Amount)
err = onRuntimeLivenessFailure(ctx, n.PublicKey, &slashParams.Amount)
if err != nil {
return fmt.Errorf("failed to slash node %s: %w", nodeID, err)
return fmt.Errorf("failed to slash node %s: %w", n.PublicKey, err)
}
}
}

if err = regState.SetNodeStatus(ctx, nodeID, status); err != nil {
return fmt.Errorf("failed to set node status for node %s: %w", nodeID, err)
if err = regState.SetNodeStatus(ctx, n.PublicKey, status); err != nil {
return fmt.Errorf("failed to set node status for node %s: %w", n.PublicKey, err)
}
}

Expand Down
16 changes: 15 additions & 1 deletion go/consensus/cometbft/apps/roothash/liveness_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,21 @@ func TestLivenessProcessing(t *testing.T) {
// Bump epoch so the node is no longer suspended.
epoch += 2

// When node is live again, fault counter should decrease.
// When node is a backup worker, fault counter should not change.
rtState.Committee.Members[0].Role = scheduler.RoleBackupWorker
rtState.LivenessStatistics.LiveRounds[0] = 91 // At least 90 required.
err = processLivenessStatistics(ctx, epoch, rtState)
require.NoError(err, "processLivenessStatistics")
status, err = registryState.NodeStatus(ctx, sk.Public())
require.NoError(err, "NodeStatus")
require.False(status.IsSuspended(runtime.ID, epoch), "node should not be suspended")
require.EqualValues(1, status.Faults[runtime.ID].Failures, "there should be one fault")

// Bump epoch so the node is no longer suspended.
epoch += 2

// When node is worker again, fault counter should decrease.
rtState.Committee.Members[0].Role = scheduler.RoleWorker
rtState.LivenessStatistics.LiveRounds[0] = 91 // At least 90 required.
err = processLivenessStatistics(ctx, epoch, rtState)
require.NoError(err, "processLivenessStatistics")
Expand Down
19 changes: 8 additions & 11 deletions go/roothash/tests/tester.go
Original file line number Diff line number Diff line change
Expand Up @@ -513,25 +513,22 @@ func (s *runtimeState) testSuccessfulRound(t *testing.T, backend api.Backend, co
// Check that the liveness statistics were computed correctly.
livenessStatistics := s.livenessStatisticsDiff(t, ctx, backend, parent.Height)

goodRoundsPerNode := make(map[signature.PublicKey]uint64)
for i, member := range s.executorCommittee.committee.Members {
goodRoundsPerNode[member.PublicKey] += livenessStatistics.LiveRounds[i]
}

for nodeID, v := range goodRoundsPerNode {
// Workers and backup workers should be considered live as everyone submitted
// commitments and there were no discrepancies.
require.EqualValues(1, v, "LiveRounds(%s)", nodeID)
}

liveRounds := make([]uint64, len(livenessStatistics.LiveRounds))
finalizedProposals := make([]uint64, len(livenessStatistics.FinalizedProposals))
missedProposals := make([]uint64, len(livenessStatistics.MissedProposals))

// All workers and none backup workers should be considered live as every worker submitted
// a commitment and there were no discrepancies.
for i := range s.executorCommittee.workers {
liveRounds[i] = 1
}

schedulerIdx, err := s.executorCommittee.committee.SchedulerIdx(parent.Block.Header.Round, 0)
require.NoError(err, "SchedulerIdx")
finalizedProposals[schedulerIdx]++

require.Equal(uint64(1), livenessStatistics.TotalRounds, "there should be one finalized round")
require.EqualValues(liveRounds, livenessStatistics.LiveRounds, "there should be no live members")
require.EqualValues(finalizedProposals, livenessStatistics.FinalizedProposals, "there should be one finalized proposal")
require.EqualValues(missedProposals, livenessStatistics.MissedProposals, "there should be no failed proposals")
}
Expand Down

0 comments on commit 4d18a37

Please sign in to comment.