Skip to content

Commit

Permalink
Record CPU usage timelines, take 2 (#7809)
Browse files Browse the repository at this point in the history
  • Loading branch information
bduffany authored Nov 7, 2024
1 parent 201e45e commit c88cfe9
Show file tree
Hide file tree
Showing 15 changed files with 155 additions and 172 deletions.
1 change: 0 additions & 1 deletion MODULE.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,6 @@ use_repo(
"com_github_bazelbuild_rules_webtesting",
"com_github_bduffany_godemon",
"com_github_bits_and_blooms_bloom_v3",
"com_github_bmkessler_streamvbyte",
"com_github_bojand_ghz",
"com_github_bradfitz_gomemcache",
"com_github_buildbuddy_io_tensorflow_proto",
Expand Down
6 changes: 0 additions & 6 deletions deps.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -561,12 +561,6 @@ def install_go_mod_dependencies(workspace_name = "buildbuddy"):
sum = "h1:DDGfHa7BWjL4YnC6+E63dPcxHo2sUxDIu8g3QgEJdRY=",
version = "v0.0.0-20160611221934-b7ed37b82869",
)
go_repository(
name = "com_github_bmkessler_streamvbyte",
importpath = "github.com/bmkessler/streamvbyte",
sum = "h1:QvzfNFkZD66P1f8s8FwMMVLxPd6Wa2/uqQh8gDhq3Ss=",
version = "v0.1.0",
)
go_repository(
name = "com_github_bojand_ghz",
build_directives = [
Expand Down
5 changes: 5 additions & 0 deletions enterprise/server/remote_execution/container/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@ go_library(
"//server/util/status",
"//server/util/tracing",
"//server/util/unixcred",
"@com_github_jonboulle_clockwork//:clockwork",
"@io_opentelemetry_go_otel//attribute",
"@io_opentelemetry_go_otel_trace//:trace",
"@org_golang_google_protobuf//types/known/timestamppb",
],
)

Expand All @@ -40,7 +42,10 @@ go_test(
"//server/testutil/testauth",
"//server/testutil/testenv",
"//server/util/status",
"//server/util/testing/flags",
"//server/util/timeseries",
"@com_github_google_go_cmp//cmp",
"@com_github_jonboulle_clockwork//:clockwork",
"@com_github_stretchr_testify//assert",
"@com_github_stretchr_testify//require",
"@org_golang_google_protobuf//testing/protocmp",
Expand Down
74 changes: 66 additions & 8 deletions enterprise/server/remote_execution/container/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,13 @@ import (
"github.com/buildbuddy-io/buildbuddy/server/util/status"
"github.com/buildbuddy-io/buildbuddy/server/util/tracing"
"github.com/buildbuddy-io/buildbuddy/server/util/unixcred"
"github.com/jonboulle/clockwork"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"

fcpb "github.com/buildbuddy-io/buildbuddy/proto/firecracker"
repb "github.com/buildbuddy-io/buildbuddy/proto/remote_execution"
tspb "google.golang.org/protobuf/types/known/timestamppb"
)

const (
Expand All @@ -46,6 +48,10 @@ const (

// How often to poll container stats.
statsPollInterval = 50 * time.Millisecond

// Max uncompressed size in bytes to retain for timeseries data. After this
// limit is reached, samples are dropped.
timeseriesSizeLimitBytes = 1_000_000
)

var (
Expand All @@ -57,8 +63,8 @@ var (
// operation fails due to the container already being removed.
ErrRemoved = status.UnavailableError("container has been removed")

debugUseLocalImagesOnly = flag.Bool("debug_use_local_images_only", false, "Do not pull OCI images and only used locally cached images. This can be set to test local image builds during development without needing to push to a container registry. Not intended for production use.")

recordCPUTimelines = flag.Bool("executor.record_cpu_timelines", false, "Capture CPU timeseries data in UsageStats for each task.")
debugUseLocalImagesOnly = flag.Bool("debug_use_local_images_only", false, "Do not pull OCI images and only used locally cached images. This can be set to test local image builds during development without needing to push to a container registry. Not intended for production use.")
DebugEnableAnonymousRecycling = flag.Bool("debug_enable_anonymous_runner_recycling", false, "Whether to enable runner recycling for unauthenticated requests. For debugging purposes only - do not use in production.")

slowPullWarnOnce sync.Once
Expand Down Expand Up @@ -198,6 +204,8 @@ func (m *ContainerMetrics) Unregister(c CommandContainer) {
// TODO: see whether its feasible to execute each task in its own cgroup
// so that we can avoid this bookkeeping and get stats without polling.
type UsageStats struct {
Clock clockwork.Clock

// last is the last stats update we observed.
last *repb.UsageStats
// taskStats is the usage stats relative to when Reset() was last called
Expand All @@ -215,22 +223,49 @@ type UsageStats struct {
// This is needed so that we can determine PSI stall totals when using
// a recycled runner.
baselineCPUPressure, baselineMemoryPressure, baselineIOPressure *repb.PSI

timeline *repb.UsageTimeline
timelineState timelineState
}

// Delta-encoding state for timelines. We store the delta-encoding directly so
// that TaskStats() can just return a view of the data rather than requiring a
// full re-encoding.
type timelineState struct {
lastTimestamp int64
lastCPUSample int64
// When adding new fields here, also update:
// - The size calculation in updateTimeline()
// - The test
}

func (s *UsageStats) clock() clockwork.Clock {
if s.Clock == nil {
s.Clock = clockwork.NewRealClock()
}
return s.Clock
}

// Reset resets resource usage counters in preparation for a new task, so that
// the new task's resource usage can be accounted for. It should be called
// at the beginning of Exec() in the container lifecycle.
// the new task's resource usage can be accounted for. It should be called at
// the beginning of Run() as well as at the beginning of Exec() in the container
// lifecycle.
func (s *UsageStats) Reset() {
if s.last == nil {
// No observations yet; nothing to do.
return
if s.last != nil {
s.last.MemoryBytes = 0
}
s.last.MemoryBytes = 0
s.baselineCPUNanos = s.last.GetCpuNanos()
s.baselineCPUPressure = s.last.GetCpuPressure()
s.baselineMemoryPressure = s.last.GetMemoryPressure()
s.baselineIOPressure = s.last.GetIoPressure()
s.peakMemoryUsageBytes = 0

now := s.clock().Now()
if *recordCPUTimelines {
s.timeline = &repb.UsageTimeline{StartTime: tspb.New(now)}
s.timelineState = timelineState{}
s.updateTimeline(now)
}
}

// TaskStats returns the usage stats for an executed task.
Expand Down Expand Up @@ -263,9 +298,29 @@ func (s *UsageStats) TaskStats() *repb.UsageStats {
taskStats.IoPressure.Full.Total -= s.baselineIOPressure.GetFull().GetTotal()
}

// Note: we don't clone the timeline because it's expensive.
taskStats.Timeline = s.timeline

return taskStats
}

func (s *UsageStats) updateTimeline(now time.Time) {
if 8*(len(s.timeline.GetCpuSamples())+len(s.timeline.GetTimestamps())) > timeseriesSizeLimitBytes {
return
}
// Update timestamps
ts := now.UnixMilli()
tsDelta := ts - s.timelineState.lastTimestamp
s.timeline.Timestamps = append(s.timeline.Timestamps, tsDelta)
s.timelineState.lastTimestamp = ts

// Update CPU samples with cumulative CPU milliseconds used.
cpu := (s.last.GetCpuNanos() - s.baselineCPUNanos) / 1e6
cpuDelta := cpu - s.timelineState.lastCPUSample
s.timeline.CpuSamples = append(s.timeline.CpuSamples, cpuDelta)
s.timelineState.lastCPUSample = cpu
}

// Update updates the usage for the current task, given a reading from the
// lifetime stats (e.g. cgroup created when the task container was initially
// created).
Expand All @@ -274,6 +329,9 @@ func (s *UsageStats) Update(lifetimeStats *repb.UsageStats) {
if lifetimeStats.GetMemoryBytes() > s.peakMemoryUsageBytes {
s.peakMemoryUsageBytes = lifetimeStats.GetMemoryBytes()
}
if *recordCPUTimelines {
s.updateTimeline(s.clock().Now())
}
}

// TrackStats starts a goroutine to monitor the container's resource usage. It
Expand Down
48 changes: 48 additions & 0 deletions enterprise/server/remote_execution/container/container_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ import (
"github.com/buildbuddy-io/buildbuddy/server/testutil/testauth"
"github.com/buildbuddy-io/buildbuddy/server/testutil/testenv"
"github.com/buildbuddy-io/buildbuddy/server/util/status"
"github.com/buildbuddy-io/buildbuddy/server/util/testing/flags"
"github.com/buildbuddy-io/buildbuddy/server/util/timeseries"
"github.com/google/go-cmp/cmp"
"github.com/jonboulle/clockwork"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/sync/errgroup"
Expand Down Expand Up @@ -357,6 +360,51 @@ func TestUsageStats(t *testing.T) {
}, s.TaskStats(), protocmp.Transform()))
}

func TestUsageStats_Timeseries(t *testing.T) {
flags.Set(t, "executor.record_cpu_timelines", true)

start := time.Unix(100, 0)
clock := clockwork.NewFakeClockAt(start)
lifetimeStats := &repb.UsageStats{}
stats := &container.UsageStats{Clock: clock}

stats.Reset()
clock.Advance(100 * time.Millisecond)
lifetimeStats.CpuNanos += 3e9
stats.Update(lifetimeStats)
timeline := stats.TaskStats().GetTimeline()

timestamps := timeseries.DeltaDecode(timeline.GetTimestamps())
cpuSamples := timeseries.DeltaDecode(timeline.GetCpuSamples())
assert.Equal(t, start.UnixNano(), timeline.GetStartTime().AsTime().UnixNano())
assert.Equal(t, []int64{
start.UnixMilli(),
start.UnixMilli() + 100,
}, timestamps, "timestamps")
assert.Equal(t, []int64{0, 3000}, cpuSamples, "cpu samples")

clock.Advance(250 * time.Millisecond)
start = clock.Now()
stats.Reset()
clock.Advance(500 * time.Millisecond)
lifetimeStats.CpuNanos += 7e9
stats.Update(lifetimeStats)
clock.Advance(500 * time.Millisecond)
lifetimeStats.CpuNanos += 2.5e9
stats.Update(lifetimeStats)
timeline = stats.TaskStats().GetTimeline()

timestamps = timeseries.DeltaDecode(timeline.GetTimestamps())
cpuSamples = timeseries.DeltaDecode(timeline.GetCpuSamples())
assert.Equal(t, start.UnixNano(), timeline.GetStartTime().AsTime().UnixNano())
assert.Equal(t, []int64{
start.UnixMilli(),
start.UnixMilli() + 500,
start.UnixMilli() + 1000,
}, timestamps, "timestamps")
assert.Equal(t, []int64{0, 7000, 9500}, cpuSamples, "cpu samples")
}

func makePSI(someTotal, fullTotal int64) *repb.PSI {
return &repb.PSI{
Some: &repb.PSI_Metrics{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -396,8 +396,6 @@ func (c *ociContainer) Create(ctx context.Context, workDir string) error {
}

func (c *ociContainer) Exec(ctx context.Context, cmd *repb.Command, stdio *interfaces.Stdio) *interfaces.CommandResult {
// Reset CPU usage and peak memory since we're starting a new task.
c.stats.Reset()
args := []string{"exec", "--cwd=" + execrootPath}
// Respect command env. Note, when setting any --env vars at all, it
// completely overrides the env from the bundle, rather than just adding
Expand Down Expand Up @@ -527,6 +525,7 @@ func (c *ociContainer) Stats(ctx context.Context) (*repb.UsageStats, error) {
// metrics are updated while the function is being executed, and that the
// resource usage results are populated in the returned CommandResult.
func (c *ociContainer) doWithStatsTracking(ctx context.Context, invokeRuntimeFn func(ctx context.Context) *interfaces.CommandResult) *interfaces.CommandResult {
c.stats.Reset()
stop, statsCh := container.TrackStats(ctx, c)
res := invokeRuntimeFn(ctx)
stop()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,7 @@ func (c *podmanCommandContainer) Run(ctx context.Context, command *repb.Command,
// metrics are updated while the function is executing, and that the UsageStats
// field is populated after execution.
func (c *podmanCommandContainer) doWithStatsTracking(ctx context.Context, runPodmanFn func(ctx context.Context) *interfaces.CommandResult) *interfaces.CommandResult {
c.stats.Reset()
stop, statsCh := container.TrackStats(ctx, c)
res := runPodmanFn(ctx)
stop()
Expand Down Expand Up @@ -483,10 +484,6 @@ func (c *podmanCommandContainer) Create(ctx context.Context, workDir string) err
}

func (c *podmanCommandContainer) Exec(ctx context.Context, cmd *repb.Command, stdio *interfaces.Stdio) *interfaces.CommandResult {
// Reset usage stats since we're running a new task. Note: This throws away
// any resource usage between the initial "Create" call and now, but that's
// probably fine for our needs right now.
c.stats.Reset()
podmanRunArgs := make([]string, 0, 2*len(cmd.GetEnvironmentVariables())+len(cmd.Arguments)+1)
for _, envVar := range cmd.GetEnvironmentVariables() {
podmanRunArgs = append(podmanRunArgs, "--env", fmt.Sprintf("%s=%s", envVar.GetName(), envVar.GetValue()))
Expand Down
1 change: 0 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ require (
github.com/bazelbuild/rules_webtesting v0.0.0-20210910170740-6b2ef24cfe95
github.com/bduffany/godemon v0.0.0-20221115232931-09721d48e30e
github.com/bits-and-blooms/bloom/v3 v3.7.0
github.com/bmkessler/streamvbyte v0.1.0
github.com/bojand/ghz v0.120.0
github.com/bradfitz/gomemcache v0.0.0-20230611145640-acc696258285
github.com/buildbuddy-io/tensorflow-proto v0.0.0-20220908151343-929b41ab4dc6
Expand Down
2 changes: 0 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -290,8 +290,6 @@ github.com/blang/semver v3.5.1+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnweb
github.com/bmatcuk/doublestar/v4 v4.6.1 h1:FH9SifrbvJhnlQpztAx++wlkk70QBf0iBWDwNy7PA4I=
github.com/bmatcuk/doublestar/v4 v4.6.1/go.mod h1:xBQ8jztBU6kakFMg+8WGxn0c6z1fTSPVIjEY1Wr7jzc=
github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869/go.mod h1:Ekp36dRnpXw/yCqJaO+ZrUyxD+3VXMFFr56k5XYrpB4=
github.com/bmkessler/streamvbyte v0.1.0 h1:QvzfNFkZD66P1f8s8FwMMVLxPd6Wa2/uqQh8gDhq3Ss=
github.com/bmkessler/streamvbyte v0.1.0/go.mod h1:0SZTdGk1trQB2LOWZC7lNfF6SDN05cfv371DkrcUWeA=
github.com/bojand/ghz v0.120.0 h1:6F4wsmZVwFg5UnD+/R+IABWk6sKE/0OKIBdUQUZnOdo=
github.com/bojand/ghz v0.120.0/go.mod h1:HfECuBZj1v02XObGnRuoZgyB1PR24/25dIYiJIMjJnE=
github.com/bradfitz/gomemcache v0.0.0-20230611145640-acc696258285 h1:Dr+ezPI5ivhMn/3WOoB86XzMhie146DNaBbhaQWZHMY=
Expand Down
15 changes: 0 additions & 15 deletions proto/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -174,11 +174,6 @@ proto_library(
],
)

proto_library(
name = "timeseries_proto",
srcs = ["timeseries.proto"],
)

proto_library(
name = "failure_details_proto",
srcs = [
Expand Down Expand Up @@ -963,16 +958,6 @@ go_proto_library(
],
)

go_proto_library(
name = "timeseries_go_proto",
compilers = [
"@io_bazel_rules_go//proto:go_proto",
"//proto:vtprotobuf_compiler",
],
importpath = "github.com/buildbuddy-io/buildbuddy/proto/timeseries",
proto = ":timeseries_proto",
)

go_proto_library(
name = "failure_details_go_proto",
compilers = [
Expand Down
27 changes: 10 additions & 17 deletions proto/remote_execution.proto
Original file line number Diff line number Diff line change
Expand Up @@ -2204,10 +2204,6 @@ message UsageStats {
// Total number of CPU-nanoseconds consumed by the task.
int64 cpu_nanos = 2;

// CPU usage timeline. Each sample represents the approximate CPU usage
// accumulated over each period.
Timeseries cpu_timeline = 8;

// Most recently recorded total memory usage of the task. This field is only
// used for real-time metrics and shouldn't be used as a "summary" metric for
// the task (peak_memory_bytes is a more useful summary metric).
Expand Down Expand Up @@ -2249,22 +2245,19 @@ message UsageStats {
// IO PSI metrics.
PSI io_pressure = 7;

message Timeseries {
// Time when recording started.
google.protobuf.Timestamp start = 1;
// Timeseries data.
UsageTimeline timeline = 8;
}

// Time when the last sample was taken.
google.protobuf.Timestamp end = 2;
message UsageTimeline {
google.protobuf.Timestamp start_time = 1;

// Duration between each sample.
google.protobuf.Duration period = 3;

// TODO: compress the samples to save space
// Delta-encoded timestamps (milliseconds since Unix epoch).
repeated int64 timestamps = 2;

// Observed samples. The interpretation of these samples may depend on the
// metric being reported.
repeated int64 samples = 4;
}
// Delta-encoded CPU-millis samples. The decoded samples represent the
// cumulative CPU-millis used since the start time.
repeated int64 cpu_samples = 3;
}

// Pressure Stall Information, commonly known as PSI.
Expand Down
19 changes: 0 additions & 19 deletions proto/timeseries.proto

This file was deleted.

Loading

0 comments on commit c88cfe9

Please sign in to comment.