From 1e7a6bcdd6fe959e1f12bcdc10a08beffb6b7af5 Mon Sep 17 00:00:00 2001 From: Brandon Duffany Date: Mon, 18 Nov 2024 20:36:26 -0500 Subject: [PATCH] Apply cgroup settings to firecracker and ociruntime --- .../remote_execution/container/container.go | 9 +++ .../containers/firecracker/BUILD | 4 + .../containers/firecracker/containeropts.go | 17 ++++- .../containers/firecracker/firecracker.go | 54 +++++++++---- .../containers/ociruntime/BUILD | 1 + .../containers/ociruntime/ociruntime.go | 75 ++++++++++++++----- .../containers/ociruntime/ociruntime_test.go | 58 ++++++++++++++ .../execution_server/execution_server.go | 5 ++ .../server/remote_execution/runner/runner.go | 3 + 9 files changed, 194 insertions(+), 32 deletions(-) diff --git a/enterprise/server/remote_execution/container/container.go b/enterprise/server/remote_execution/container/container.go index 57f086a71de..5faa3318e3f 100644 --- a/enterprise/server/remote_execution/container/container.go +++ b/enterprise/server/remote_execution/container/container.go @@ -111,6 +111,15 @@ type Init struct { // BlockDevice is the block device where the build root dir is located. BlockDevice *block_io.Device + // CgroupParent is the path relative to the cgroupfs root under which the + // container's cgroup should be created. For example, if the cgroupfs is + // rooted at "/sys/fs/cgroup" and all container cgroups are placed under + // "/sys/fs/cgroup/buildbuddy.executor.containers" then this will be + // "buildbuddy.executor.containers". The container implementation is + // responsible for creating the cgroup, applying the CgroupSettings + // from `Task.SchedulingMetadata.CgroupSettings`. + CgroupParent string + // Publisher can be used to send fine-grained execution progress updates. Publisher *operation.Publisher } diff --git a/enterprise/server/remote_execution/containers/firecracker/BUILD b/enterprise/server/remote_execution/containers/firecracker/BUILD index 3a78dcd3105..c31d1cdfb49 100644 --- a/enterprise/server/remote_execution/containers/firecracker/BUILD +++ b/enterprise/server/remote_execution/containers/firecracker/BUILD @@ -22,6 +22,8 @@ go_library( "vmlinuxRunfilePath": "$(rlocationpath //enterprise/vmsupport/bin:vmlinux)", }, deps = [ + "//enterprise/server/remote_execution/block_io", + "//enterprise/server/remote_execution/cgroup", "//enterprise/server/remote_execution/commandutil", "//enterprise/server/remote_execution/container", "//enterprise/server/remote_execution/copy_on_write", @@ -39,6 +41,7 @@ go_library( "//enterprise/vmsupport:bundle", "//proto:firecracker_go_proto", "//proto:remote_execution_go_proto", + "//proto:scheduler_go_proto", "//proto:vmexec_go_proto", "//proto:vmvfs_go_proto", "//server/environment", @@ -50,6 +53,7 @@ go_library( "//server/util/disk", "//server/util/log", "//server/util/networking", + "//server/util/proto", "//server/util/status", "//server/util/tracing", "@com_github_armon_circbuf//:circbuf", diff --git a/enterprise/server/remote_execution/containers/firecracker/containeropts.go b/enterprise/server/remote_execution/containers/firecracker/containeropts.go index 07026014b13..ee5d01152cf 100644 --- a/enterprise/server/remote_execution/containers/firecracker/containeropts.go +++ b/enterprise/server/remote_execution/containers/firecracker/containeropts.go @@ -1,8 +1,11 @@ package firecracker import ( - fcpb "github.com/buildbuddy-io/buildbuddy/proto/firecracker" + "github.com/buildbuddy-io/buildbuddy/enterprise/server/remote_execution/block_io" + dockerclient "github.com/docker/docker/client" + fcpb "github.com/buildbuddy-io/buildbuddy/proto/firecracker" + scpb "github.com/buildbuddy-io/buildbuddy/proto/scheduler" ) type ContainerOpts struct { @@ -45,6 +48,18 @@ type ContainerOpts struct { // CPU-millis. This is set to the task size. CPUWeightMillis int64 + // CgroupParent is the parent cgroup path relative to the cgroup root. + CgroupParent string + + // CgroupSettings are settings applied to cgroup in which jailer executes. + CgroupSettings *scpb.CgroupSettings + + // BlockDevice sets the block device for restricting IO via cgroup. Note + // that the firecracker cgroup does not affect IO for virtual block devices + // (VBD) or memory snapshots (UFFD) since the server for these devices runs + // in the executor process. + BlockDevice *block_io.Device + // Optional flags -- these will default to sane values. // They are here primarily for debugging and running // VMs outside of the normal action-execution framework. diff --git a/enterprise/server/remote_execution/containers/firecracker/firecracker.go b/enterprise/server/remote_execution/containers/firecracker/firecracker.go index 16674c354d2..670a2c09863 100644 --- a/enterprise/server/remote_execution/containers/firecracker/firecracker.go +++ b/enterprise/server/remote_execution/containers/firecracker/firecracker.go @@ -27,6 +27,8 @@ import ( "github.com/armon/circbuf" "github.com/bazelbuild/rules_go/go/runfiles" + "github.com/buildbuddy-io/buildbuddy/enterprise/server/remote_execution/block_io" + "github.com/buildbuddy-io/buildbuddy/enterprise/server/remote_execution/cgroup" "github.com/buildbuddy-io/buildbuddy/enterprise/server/remote_execution/commandutil" "github.com/buildbuddy-io/buildbuddy/enterprise/server/remote_execution/container" "github.com/buildbuddy-io/buildbuddy/enterprise/server/remote_execution/copy_on_write" @@ -50,6 +52,7 @@ import ( "github.com/buildbuddy-io/buildbuddy/server/util/disk" "github.com/buildbuddy-io/buildbuddy/server/util/log" "github.com/buildbuddy-io/buildbuddy/server/util/networking" + "github.com/buildbuddy-io/buildbuddy/server/util/proto" "github.com/buildbuddy-io/buildbuddy/server/util/status" "github.com/buildbuddy-io/buildbuddy/server/util/tracing" "github.com/firecracker-microvm/firecracker-go-sdk/client/operations" @@ -64,6 +67,7 @@ import ( vmsupport_bundle "github.com/buildbuddy-io/buildbuddy/enterprise/vmsupport" fcpb "github.com/buildbuddy-io/buildbuddy/proto/firecracker" repb "github.com/buildbuddy-io/buildbuddy/proto/remote_execution" + scpb "github.com/buildbuddy-io/buildbuddy/proto/scheduler" vmxpb "github.com/buildbuddy-io/buildbuddy/proto/vmexec" vmfspb "github.com/buildbuddy-io/buildbuddy/proto/vmvfs" fcclient "github.com/firecracker-microvm/firecracker-go-sdk" @@ -471,6 +475,9 @@ func (p *Provider) New(ctx context.Context, args *container.Init) (container.Com ContainerImage: args.Props.ContainerImage, User: args.Props.DockerUser, ActionWorkingDirectory: args.WorkDir, + CgroupParent: args.CgroupParent, + CgroupSettings: args.Task.GetSchedulingMetadata().GetCgroupSettings(), + BlockDevice: args.BlockDevice, ExecutorConfig: p.executorConfig, CPUWeightMillis: sizeEstimate.GetEstimatedMilliCpu(), } @@ -538,10 +545,13 @@ type FirecrackerContainer struct { uffdHandler *uffd.Handler memoryStore *copy_on_write.COWStore - jailerRoot string // the root dir the jailer will work in - numaNode int // NUMA node for CPU scheduling - cpuWeightMillis int64 // milliCPU for cgroup CPU weight - machine *fcclient.Machine // the firecracker machine object. + jailerRoot string // the root dir the jailer will work in + numaNode int // NUMA node for CPU scheduling + cpuWeightMillis int64 // milliCPU for cgroup CPU weight + cgroupParent string // parent cgroup path (root-relative) + cgroupSettings *scpb.CgroupSettings // jailer cgroup settings + blockDevice *block_io.Device // block device for cgroup IO settings + machine *fcclient.Machine // the firecracker machine object. vmLog *VMLog env environment.Env mountWorkspaceFile bool @@ -601,6 +611,9 @@ func NewContainer(ctx context.Context, env environment.Env, task *repb.Execution user: opts.User, actionWorkingDir: opts.ActionWorkingDirectory, cpuWeightMillis: opts.CPUWeightMillis, + cgroupParent: opts.CgroupParent, + cgroupSettings: opts.CgroupSettings, + blockDevice: opts.BlockDevice, numaNode: numaNode, env: env, task: task, @@ -976,7 +989,7 @@ func (c *FirecrackerContainer) LoadSnapshot(ctx context.Context) error { // We start firecracker with this reduced config because we will load a // snapshot that is already configured. - jailerCfg, err := c.getJailerConfig("" /*=kernelImagePath*/) + jailerCfg, err := c.getJailerConfig(ctx, "" /*=kernelImagePath*/) if err != nil { return status.WrapError(err, "get jailer config") } @@ -1418,7 +1431,7 @@ func (c *FirecrackerContainer) getConfig(ctx context.Context, rootFS, containerF netnsPath = c.network.NamespacePath() } bootArgs := getBootArgs(c.vmConfig) - jailerCfg, err := c.getJailerConfig(c.executorConfig.KernelImagePath) + jailerCfg, err := c.getJailerConfig(ctx, c.executorConfig.KernelImagePath) if err != nil { return nil, status.WrapError(err, "get jailer config") } @@ -1492,17 +1505,33 @@ func (c *FirecrackerContainer) getConfig(ctx context.Context, rootFS, containerF return cfg, nil } -func (c *FirecrackerContainer) getJailerConfig(kernelImagePath string) (*fcclient.JailerConfig, error) { +func (c *FirecrackerContainer) cgroupPath() string { + return filepath.Join("/sys/fs/cgroup", c.cgroupParent, c.id) +} + +func (c *FirecrackerContainer) getJailerConfig(ctx context.Context, kernelImagePath string) (*fcclient.JailerConfig, error) { cgroupVersion, err := getCgroupVersion() if err != nil { return nil, status.WrapError(err, "get cgroup version") } - var cgroupArgs []string - if *enableCPUWeight { + // Apply CPU weight only if the app hasn't set CPU weight via scheduling + // metadata. + cgroupSettings := c.cgroupSettings + if cgroupSettings == nil && *enableCPUWeight { // Use the same weight calculation used in ociruntime. cpuWeight := oci.CPUSharesToWeight(oci.CPUMillisToShares(c.cpuWeightMillis)) - cgroupArgs = append(cgroupArgs, fmt.Sprintf("cpu.weight=%d", cpuWeight)) + cgroupSettings = &scpb.CgroupSettings{ + CpuWeight: proto.Int64(cpuWeight), + } + } + if cgroupSettings != nil { + if err := os.MkdirAll(c.cgroupPath(), 0755); err != nil { + return nil, status.UnavailableErrorf("create cgroup: %s", err) + } + if err := cgroup.Setup(ctx, c.cgroupPath(), c.cgroupSettings, c.blockDevice); err != nil { + return nil, status.UnavailableErrorf("setup cgroup: %s", err) + } } return &fcclient.JailerConfig{ @@ -1517,12 +1546,11 @@ func (c *FirecrackerContainer) getJailerConfig(kernelImagePath string) (*fcclien Stdout: c.vmLogWriter(), Stderr: c.vmLogWriter(), CgroupVersion: cgroupVersion, - CgroupArgs: cgroupArgs, // Use the root cgroup as the cgroup parent rather than the default // "firecracker" subdir, so that VM cgroups are siblings of OCI // container cgroups. This is needed because CPU weight is applied to // sibling cgroups in a hierarchy. - ParentCgroup: fcclient.String(""), + ParentCgroup: fcclient.String(c.cgroupParent), }, nil } @@ -2475,7 +2503,7 @@ func (c *FirecrackerContainer) stopMachine(ctx context.Context) error { // Once the VM exits, delete the cgroup. // Jailer docs say that this cleanup must be handled by us: // https://github.com/firecracker-microvm/firecracker/blob/main/docs/jailer.md#observations - if err := os.Remove(filepath.Join("/sys/fs/cgroup", c.id)); err != nil && !os.IsNotExist(err) { + if err := os.Remove(c.cgroupPath()); err != nil && !os.IsNotExist(err) { log.CtxWarningf(ctx, "Failed to remove jailer cgroup: %s", err) } diff --git a/enterprise/server/remote_execution/containers/ociruntime/BUILD b/enterprise/server/remote_execution/containers/ociruntime/BUILD index 180d1d47250..e160453e01c 100644 --- a/enterprise/server/remote_execution/containers/ociruntime/BUILD +++ b/enterprise/server/remote_execution/containers/ociruntime/BUILD @@ -21,6 +21,7 @@ go_library( "//enterprise/server/remote_execution/container", "//enterprise/server/util/oci", "//proto:remote_execution_go_proto", + "//proto:scheduler_go_proto", "//server/environment", "//server/interfaces", "//server/util/disk", diff --git a/enterprise/server/remote_execution/containers/ociruntime/ociruntime.go b/enterprise/server/remote_execution/containers/ociruntime/ociruntime.go index b3720721c3b..f2a914f1f5a 100644 --- a/enterprise/server/remote_execution/containers/ociruntime/ociruntime.go +++ b/enterprise/server/remote_execution/containers/ociruntime/ociruntime.go @@ -40,6 +40,7 @@ import ( "golang.org/x/sys/unix" repb "github.com/buildbuddy-io/buildbuddy/proto/remote_execution" + scpb "github.com/buildbuddy-io/buildbuddy/proto/scheduler" ctr "github.com/google/go-containerregistry/pkg/v1" specs "github.com/opencontainers/runtime-spec/specs-go" ) @@ -193,6 +194,8 @@ func (p *provider) New(ctx context.Context, args *container.Init) (container.Com networkPool: p.networkPool, blockDevice: args.BlockDevice, + cgroupParent: args.CgroupParent, + cgroupSettings: args.Task.GetSchedulingMetadata().GetCgroupSettings(), imageRef: args.Props.ContainerImage, networkEnabled: args.Props.DockerNetwork != "off", user: args.Props.DockerUser, @@ -207,6 +210,8 @@ type ociContainer struct { runtime string cgroupPaths *cgroup.Paths + cgroupParent string + cgroupSettings *scpb.CgroupSettings blockDevice *block_io.Device containersRoot string imageCacheRoot string @@ -238,6 +243,14 @@ func (c *ociContainer) rootfsPath() string { return filepath.Join(c.bundlePath(), "rootfs") } +func (c *ociContainer) cgroupRootRelativePath() string { + return filepath.Join(c.cgroupParent, c.cid) +} + +func (c *ociContainer) cgroupPath() string { + return filepath.Join("/sys/fs/cgroup", c.cgroupRootRelativePath()) +} + // Returns the standard config.json path expected by crun. func (c *ociContainer) configPath() string { return filepath.Join(c.bundlePath(), "config.json") @@ -287,6 +300,10 @@ func (c *ociContainer) createBundle(ctx context.Context, cmd *repb.Command) erro if err := c.createRootfs(ctx); err != nil { return fmt.Errorf("create rootfs: %w", err) } + // Setup cgroup + if err := c.setupCgroup(ctx); err != nil { + return fmt.Errorf("setup cgroup: %w", err) + } // Create config.json from the image config and command image, ok := c.imageStore.CachedImage(c.imageRef) @@ -468,6 +485,10 @@ func (c *ociContainer) Remove(ctx context.Context) error { firstErr = status.UnavailableErrorf("remove bundle: %s", err) } + if err := os.Remove(c.cgroupPath()); err != nil && firstErr == nil && !os.IsNotExist(err) { + firstErr = status.UnavailableErrorf("remove cgroup: %s", err) + } + return firstErr } @@ -542,6 +563,17 @@ func (c *ociContainer) doWithStatsTracking(ctx context.Context, invokeRuntimeFn return res } +func (c *ociContainer) setupCgroup(ctx context.Context) error { + path := c.cgroupPath() + if err := os.MkdirAll(path, 0755); err != nil { + return fmt.Errorf("create cgroup: %w", err) + } + if err := cgroup.Setup(ctx, path, c.cgroupSettings, c.blockDevice); err != nil { + return fmt.Errorf("configure cgroup: %w", err) + } + return nil +} + func (c *ociContainer) createRootfs(ctx context.Context) error { if err := os.MkdirAll(c.rootfsPath(), 0755); err != nil { return fmt.Errorf("create rootfs dir: %w", err) @@ -689,25 +721,35 @@ func installBusybox(ctx context.Context, path string) error { func (c *ociContainer) createSpec(ctx context.Context, cmd *repb.Command) (*specs.Spec, error) { env := append(baseEnv, commandutil.EnvStringList(cmd)...) - var pids *specs.LinuxPids - if *pidsLimit >= 0 { - pids = &specs.LinuxPids{Limit: *pidsLimit} - } image, _ := c.imageStore.CachedImage(c.imageRef) user, err := getUser(ctx, image, c.rootfsPath(), c.user, c.forceRoot) if err != nil { return nil, fmt.Errorf("get container user: %w", err) } - cpuSpecs := &specs.LinuxCPU{} - if *cpuLimit != 0 { - period := 100 * time.Millisecond - cpuSpecs.Quota = pointer(int64(*cpuLimit) * period.Microseconds()) - cpuSpecs.Period = pointer(uint64(period.Microseconds())) - } - - if *cpuSharesEnabled { - cpuSpecs.Shares = pointer(uint64(oci.CPUMillisToShares(c.milliCPU))) + var resources *specs.LinuxResources + // If the app is controlling cgroup settings then those take precedence + // over the executor settings. + // TODO: should self-hosted users be allowed to override the app's + // settings? + if c.cgroupSettings == nil { + var pids *specs.LinuxPids + if *pidsLimit >= 0 { + pids = &specs.LinuxPids{Limit: *pidsLimit} + } + cpuSpecs := &specs.LinuxCPU{} + if *cpuLimit != 0 { + period := 100 * time.Millisecond + cpuSpecs.Quota = pointer(int64(*cpuLimit) * period.Microseconds()) + cpuSpecs.Period = pointer(uint64(period.Microseconds())) + } + if *cpuSharesEnabled { + cpuSpecs.Shares = pointer(uint64(oci.CPUMillisToShares(c.milliCPU))) + } + resources = &specs.LinuxResources{ + Pids: pids, + CPU: cpuSpecs, + } } spec := specs.Spec{ @@ -815,7 +857,7 @@ func (c *ociContainer) createSpec(ctx context.Context, cmd *repb.Command) (*spec }, Linux: &specs.Linux{ // TODO: set up cgroups - CgroupsPath: "", + CgroupsPath: filepath.Join(c.cgroupRootRelativePath()), Namespaces: []specs.LinuxNamespace{ {Type: specs.PIDNamespace}, {Type: specs.IPCNamespace}, @@ -832,10 +874,7 @@ func (c *ociContainer) createSpec(ctx context.Context, cmd *repb.Command) (*spec Sysctl: map[string]string{ "net.ipv4.ping_group_range": fmt.Sprintf("%d %d", user.GID, user.GID), }, - Resources: &specs.LinuxResources{ - Pids: pids, - CPU: cpuSpecs, - }, + Resources: resources, // TODO: grok MaskedPaths and ReadonlyPaths - just copied from podman. MaskedPaths: []string{ "/proc/acpi", diff --git a/enterprise/server/remote_execution/containers/ociruntime/ociruntime_test.go b/enterprise/server/remote_execution/containers/ociruntime/ociruntime_test.go index e77b29d7d8c..ce24078395e 100644 --- a/enterprise/server/remote_execution/containers/ociruntime/ociruntime_test.go +++ b/enterprise/server/remote_execution/containers/ociruntime/ociruntime_test.go @@ -206,6 +206,64 @@ func TestCPULimit(t *testing.T) { assert.Equal(t, 0, res.ExitCode) } +func TestCgroupSettings(t *testing.T) { + setupNetworking(t) + + image := manuallyProvisionedBusyboxImage(t) + + ctx := context.Background() + env := testenv.GetTestEnv(t) + + runtimeRoot := testfs.MakeTempDir(t) + flags.Set(t, "executor.oci.runtime_root", runtimeRoot) + + buildRoot := testfs.MakeTempDir(t) + cacheRoot := testfs.MakeTempDir(t) + + provider, err := ociruntime.NewProvider(env, buildRoot, cacheRoot) + require.NoError(t, err) + wd := testfs.MakeDirAll(t, buildRoot, "work") + + // Enable the cgroup controllers that we're going to test (this test is run + // in a firecracker VM which doesn't enable any controllers by default) + err = os.WriteFile("/sys/fs/cgroup/cgroup.subtree_control", []byte("+cpu +pids"), 0) + require.NoError(t, err) + + c, err := provider.New(ctx, &container.Init{ + Task: &repb.ScheduledTask{ + SchedulingMetadata: &scpb.SchedulingMetadata{ + CgroupSettings: &scpb.CgroupSettings{ + CpuQuotaLimitUsec: proto.Int64(300000), + CpuQuotaPeriodUsec: proto.Int64(100000), + PidsMax: proto.Int64(256), + }, + }, + }, + Props: &platform.Properties{ + ContainerImage: image, + }, + }) + require.NoError(t, err) + t.Cleanup(func() { + err := c.Remove(ctx) + require.NoError(t, err) + }) + + // Run + cmd := &repb.Command{ + Arguments: []string{"sh", "-c", ` + cat /sys/fs/cgroup/cpu.max + cat /sys/fs/cgroup/pids.max + `}, + } + + res := c.Run(ctx, cmd, wd, oci.Credentials{}) + require.NoError(t, res.Error) + assert.Equal(t, "300000 100000\n256\n", string(res.Stdout)) + assert.Empty(t, string(res.Stderr)) + assert.Equal(t, 0, res.ExitCode) +} + func TestRunUsageStats(t *testing.T) { setupNetworking(t) diff --git a/enterprise/server/remote_execution/execution_server/execution_server.go b/enterprise/server/remote_execution/execution_server/execution_server.go index 8a94ca65242..e9b6ba801f0 100644 --- a/enterprise/server/remote_execution/execution_server/execution_server.go +++ b/enterprise/server/remote_execution/execution_server/execution_server.go @@ -635,6 +635,11 @@ func (s *ExecutionServer) dispatch(ctx context.Context, req *repb.ExecuteRequest ExecutorGroupId: pool.GroupID, TaskGroupId: taskGroupID, Priority: req.GetExecutionPolicy().GetPriority(), + // XXX + CgroupSettings: &scpb.CgroupSettings{ + CpuQuotaLimitUsec: proto.Int64(300_000), + CpuQuotaPeriodUsec: proto.Int64(100_000), + }, } scheduleReq := &scpb.ScheduleTaskRequest{ TaskId: executionID, diff --git a/enterprise/server/remote_execution/runner/runner.go b/enterprise/server/remote_execution/runner/runner.go index 1cfd03e6e1d..1c4acb9e754 100644 --- a/enterprise/server/remote_execution/runner/runner.go +++ b/enterprise/server/remote_execution/runner/runner.go @@ -1058,6 +1058,9 @@ func (p *pool) newContainer(ctx context.Context, props *platform.Properties, tas Task: task, WorkDir: workingDir, BlockDevice: p.blockDevice, + // TODO(http://go/b/4140): Don't nest container cgroups directly under + // the root cgroup. + CgroupParent: "", } // Overriding in tests.