From 53dd0464414c265d59d5305ea3bec95e6bbec4ea Mon Sep 17 00:00:00 2001 From: Brandon Duffany Date: Thu, 7 Nov 2024 12:45:17 -0800 Subject: [PATCH] firecracker: set jailer cgroup CPU weight according to task size (#7858) --- MODULE.bazel | 5 +- ...er_microvm_firecracker_go_sdk_cgroup.patch | 114 ++++++++++++++++++ deps.bzl | 12 +- .../containers/firecracker/containeropts.go | 4 + .../containers/firecracker/firecracker.go | 90 ++++++++------ 5 files changed, 183 insertions(+), 42 deletions(-) create mode 100644 buildpatches/com_github_firecracker_microvm_firecracker_go_sdk_cgroup.patch diff --git a/MODULE.bazel b/MODULE.bazel index 75dbf7203b6..2a99e3a6d36 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -138,7 +138,10 @@ go_deps.module_override( ) go_deps.module_override( patch_strip = 1, - patches = ["@@//buildpatches:com_github_firecracker_microvm_firecracker_go_sdk_jailer.patch"], + patches = [ + "@@//buildpatches:com_github_firecracker_microvm_firecracker_go_sdk_jailer.patch", + "@@//buildpatches:com_github_firecracker_microvm_firecracker_go_sdk_cgroup.patch", + ], path = "github.com/firecracker-microvm/firecracker-go-sdk", ) go_deps.module_override( diff --git a/buildpatches/com_github_firecracker_microvm_firecracker_go_sdk_cgroup.patch b/buildpatches/com_github_firecracker_microvm_firecracker_go_sdk_cgroup.patch new file mode 100644 index 00000000000..8afb0cb7cd1 --- /dev/null +++ b/buildpatches/com_github_firecracker_microvm_firecracker_go_sdk_cgroup.patch @@ -0,0 +1,114 @@ +From 82d24f54a9a8cad65d8949816e3f6e85a134017d Mon Sep 17 00:00:00 2001 +From: Brandon Duffany +Date: Wed, 6 Nov 2024 17:01:44 -0500 +Subject: [PATCH] Support jailer cgroup args + +Signed-off-by: Brandon Duffany +--- + jailer.go | 29 ++++++++++++++++++++++++++++- + jailer_test.go | 4 ++++ + 2 files changed, 32 insertions(+), 1 deletion(-) + +diff --git a/jailer.go b/jailer.go +index 208de670..3246738d 100644 +--- a/jailer.go ++++ b/jailer.go +@@ -85,6 +85,10 @@ type JailerConfig struct { + // CgroupVersion is the version of the cgroup filesystem to use. + CgroupVersion string + ++ // CgroupArgs are cgroup settings applied by the jailer. Each arg must be ++ // formatted like =, like "cpu.shares=10" ++ CgroupArgs []string ++ + // Stdout specifies the IO writer for STDOUT to use when spawning the jailer. + Stdout io.Writer + // Stderr specifies the IO writer for STDERR to use when spawning the jailer. +@@ -109,6 +113,7 @@ type JailerCommandBuilder struct { + daemonize bool + firecrackerArgs []string + cgroupVersion string ++ cgroupArgs []string + + stdin io.Reader + stdout io.Writer +@@ -143,6 +148,10 @@ func (b JailerCommandBuilder) Args() []string { + args = append(args, "--cgroup", fmt.Sprintf("cpuset.cpus=%s", cpulist)) + } + ++ for _, cgroupArg := range b.cgroupArgs { ++ args = append(args, "--cgroup", cgroupArg) ++ } ++ + if len(b.cgroupVersion) > 0 { + args = append(args, "--cgroup-version", b.cgroupVersion) + } +@@ -204,13 +213,30 @@ func (b JailerCommandBuilder) WithExecFile(path string) JailerCommandBuilder { + return b + } + +-// WithNumaNode uses the specfied node for the jailer. This represents the numa ++// WithNumaNode uses the specified node for the jailer. This represents the numa + // node that the process will get assigned to. ++// Note: this is a convenience function that just sets the values of the cgroup ++// files "cpuset.mems" and "cpuset.cpus". ++// If those files are also configured using WithCgroupArgs, the values passed to ++// WithCgroupArgs will take precedence. + func (b JailerCommandBuilder) WithNumaNode(node int) JailerCommandBuilder { + b.node = node + return b + } + ++// WithCgroupArgs sets cgroup file values to be set by the jailer. ++// Each arg must be of the form =. ++// Each call to this function resets the cgroup arguments, rather than ++// appending. ++// ++// Example: ++// ++// b = b.WithCgroupArgs("cpu.shares=10") ++func (b JailerCommandBuilder) WithCgroupArgs(cgroupArgs ...string) JailerCommandBuilder { ++ b.cgroupArgs = cgroupArgs ++ return b ++} ++ + // WithChrootBaseDir will set the given path as the chroot base directory. This + // specifies where chroot jails are built and defaults to /srv/jailer. + func (b JailerCommandBuilder) WithChrootBaseDir(path string) JailerCommandBuilder { +@@ -348,6 +374,7 @@ func jail(ctx context.Context, m *Machine, cfg *Config) error { + WithChrootBaseDir(cfg.JailerCfg.ChrootBaseDir). + WithDaemonize(cfg.JailerCfg.Daemonize). + WithCgroupVersion(cfg.JailerCfg.CgroupVersion). ++ WithCgroupArgs(cfg.JailerCfg.CgroupArgs...). + WithFirecrackerArgs(fcArgs...). + WithStdout(stdout). + WithStderr(stderr) +diff --git a/jailer_test.go b/jailer_test.go +index 7c7017bc..6037cd27 100644 +--- a/jailer_test.go ++++ b/jailer_test.go +@@ -103,6 +103,7 @@ func TestJailerBuilder(t *testing.T) { + UID: Int(123), + GID: Int(100), + NumaNode: Int(0), ++ CgroupArgs: []string{"cpu.shares=10"}, + ChrootStrategy: NewNaiveChrootStrategy("kernel-image-path"), + ExecFile: "/path/to/firecracker", + ChrootBaseDir: "/tmp", +@@ -123,6 +124,8 @@ func TestJailerBuilder(t *testing.T) { + "cpuset.mems=0", + "--cgroup", + fmt.Sprintf("cpuset.cpus=%s", getNumaCpuset(0)), ++ "--cgroup", ++ "cpu.shares=10", + "--cgroup-version", + "2", + "--chroot-base-dir", +@@ -146,6 +149,7 @@ func TestJailerBuilder(t *testing.T) { + WithUID(IntValue(c.jailerCfg.UID)). + WithGID(IntValue(c.jailerCfg.GID)). + WithNumaNode(IntValue(c.jailerCfg.NumaNode)). ++ WithCgroupArgs(c.jailerCfg.CgroupArgs...). + WithCgroupVersion(c.jailerCfg.CgroupVersion). + WithExecFile(c.jailerCfg.ExecFile) + \ No newline at end of file diff --git a/deps.bzl b/deps.bzl index fb4d71a2df5..c910980faf7 100644 --- a/deps.bzl +++ b/deps.bzl @@ -1503,11 +1503,13 @@ def install_go_mod_dependencies(workspace_name = "buildbuddy"): name = "com_github_firecracker_microvm_firecracker_go_sdk", importpath = "github.com/firecracker-microvm/firecracker-go-sdk", patch_args = ["-p1"], - # TODO(bduffany): when - # https://github.com/firecracker-microvm/firecracker-go-sdk/pull/510 is - # merged, cherry-pick the final revision into our fork, and remove this - # patch. - patches = ["@{}//buildpatches:com_github_firecracker_microvm_firecracker_go_sdk_jailer.patch".format(workspace_name)], + # TODO(bduffany): remove these patches when the corresponding PRs are merged + patches = [ + # https://github.com/firecracker-microvm/firecracker-go-sdk/pull/510 + "@{}//buildpatches:com_github_firecracker_microvm_firecracker_go_sdk_jailer.patch".format(workspace_name), + # https://github.com/firecracker-microvm/firecracker-go-sdk/pull/600 + "@{}//buildpatches:com_github_firecracker_microvm_firecracker_go_sdk_cgroup.patch".format(workspace_name), + ], sum = "h1:n9Q3BKnUAW0x1D1x2I2QIj0T/5K6UYL6JKkPvwwARw0=", version = "v0.0.0-20241028184712-f74f43bb036d", ) diff --git a/enterprise/server/remote_execution/containers/firecracker/containeropts.go b/enterprise/server/remote_execution/containers/firecracker/containeropts.go index 2344330919a..07026014b13 100644 --- a/enterprise/server/remote_execution/containers/firecracker/containeropts.go +++ b/enterprise/server/remote_execution/containers/firecracker/containeropts.go @@ -41,6 +41,10 @@ type ContainerOpts struct { // The action directory with inputs / outputs. ActionWorkingDirectory string + // CPUWeightMillis is the CPU weight to assign to this VM, expressed as + // CPU-millis. This is set to the task size. + CPUWeightMillis int64 + // Optional flags -- these will default to sane values. // They are here primarily for debugging and running // VMs outside of the normal action-execution framework. diff --git a/enterprise/server/remote_execution/containers/firecracker/firecracker.go b/enterprise/server/remote_execution/containers/firecracker/firecracker.go index 3ff3044e76f..aa8004f56b4 100644 --- a/enterprise/server/remote_execution/containers/firecracker/firecracker.go +++ b/enterprise/server/remote_execution/containers/firecracker/firecracker.go @@ -86,6 +86,7 @@ var healthCheckInterval = flag.Duration("executor.firecracker_health_check_inter var healthCheckTimeout = flag.Duration("executor.firecracker_health_check_timeout", 30*time.Second, "Timeout for VM health check requests.") var overprovisionCPUs = flag.Int("executor.firecracker_overprovision_cpus", 3, "Number of CPUs to overprovision for VMs. This allows VMs to more effectively utilize CPU resources on the host machine. Set to -1 to allow all VMs to use max CPU.") var initOnAllocAndFree = flag.Bool("executor.firecracker_init_on_alloc_and_free", false, "Set init_on_alloc=1 and init_on_free=1 in firecracker vms") +var enableCPUWeight = flag.Bool("executor.firecracker_enable_cpu_weight", false, "Set cgroup CPU weight to match VM size") var forceRemoteSnapshotting = flag.Bool("debug_force_remote_snapshots", false, "When remote snapshotting is enabled, force remote snapshotting even for tasks which otherwise wouldn't support it.") var disableWorkspaceSync = flag.Bool("debug_disable_firecracker_workspace_sync", false, "Do not sync the action workspace to the guest, instead using the existing workspace from the VM snapshot.") @@ -211,6 +212,11 @@ const ( // Firecracker does not allow VMs over a certain size. // See MAX_SUPPORTED_VCPUS in firecracker repo. firecrackerMaxCPU = 32 + + // Min value for cgroup2 cpu.weight + cgroupMinCPUWeight = 1 + // Max value for cgroup2 cpu.weight + cgroupMaxCPUWeight = 10_000 ) var ( @@ -483,6 +489,7 @@ func (p *Provider) New(ctx context.Context, args *container.Init) (container.Com DockerClient: p.dockerClient, ActionWorkingDirectory: args.WorkDir, ExecutorConfig: p.executorConfig, + CPUWeightMillis: sizeEstimate.GetEstimatedMilliCpu(), } c, err := NewContainer(ctx, p.env, args.Task.GetExecutionTask(), opts) if err != nil { @@ -553,6 +560,7 @@ type FirecrackerContainer struct { jailerRoot string // the root dir the jailer will work in numaNode int // NUMA node for CPU scheduling + cpuWeightMillis int64 // milliCPU for cgroup CPU weight machine *fcclient.Machine // the firecracker machine object. vmLog *VMLog env environment.Env @@ -613,6 +621,7 @@ func NewContainer(ctx context.Context, env environment.Env, task *repb.Execution containerImage: opts.ContainerImage, user: opts.User, actionWorkingDir: opts.ActionWorkingDirectory, + cpuWeightMillis: opts.CPUWeightMillis, numaNode: numaNode, env: env, task: task, @@ -986,31 +995,18 @@ func (c *FirecrackerContainer) LoadSnapshot(ctx context.Context) error { netnsPath = c.network.NamespacePath() } - cgroupVersion, err := getCgroupVersion() - if err != nil { - return err - } - // We start firecracker with this reduced config because we will load a // snapshot that is already configured. + jailerCfg, err := c.getJailerConfig("" /*=kernelImagePath*/) + if err != nil { + return status.WrapError(err, "get jailer config") + } cfg := fcclient.Config{ SocketPath: firecrackerSocketPath, NetNS: netnsPath, Seccomp: fcclient.SeccompConfig{Enabled: true}, DisableValidation: true, - JailerCfg: &fcclient.JailerConfig{ - JailerBinary: c.executorConfig.JailerBinaryPath, - ChrootBaseDir: c.jailerRoot, - ID: c.id, - UID: fcclient.Int(unix.Geteuid()), - GID: fcclient.Int(unix.Getegid()), - NumaNode: fcclient.Int(c.numaNode), - ExecFile: c.executorConfig.FirecrackerBinaryPath, - ChrootStrategy: fcclient.NewNaiveChrootStrategy(""), - Stdout: c.vmLogWriter(), - Stderr: c.vmLogWriter(), - CgroupVersion: cgroupVersion, - }, + JailerCfg: jailerCfg, Snapshot: fcclient.SnapshotConfig{ EnableDiffSnapshots: true, ResumeVM: true, @@ -1442,11 +1438,11 @@ func (c *FirecrackerContainer) getConfig(ctx context.Context, rootFS, containerF if c.network != nil { netnsPath = c.network.NamespacePath() } - cgroupVersion, err := getCgroupVersion() + bootArgs := getBootArgs(c.vmConfig) + jailerCfg, err := c.getJailerConfig(c.executorConfig.KernelImagePath) if err != nil { - return nil, err + return nil, status.WrapError(err, "get jailer config") } - bootArgs := getBootArgs(c.vmConfig) cfg := &fcclient.Config{ VMID: c.id, SocketPath: firecrackerSocketPath, @@ -1462,19 +1458,7 @@ func (c *FirecrackerContainer) getConfig(ctx context.Context, rootFS, containerF VsockDevices: []fcclient.VsockDevice{ {Path: firecrackerVSockPath}, }, - JailerCfg: &fcclient.JailerConfig{ - JailerBinary: c.executorConfig.JailerBinaryPath, - ChrootBaseDir: c.jailerRoot, - ID: c.id, - UID: fcclient.Int(unix.Geteuid()), - GID: fcclient.Int(unix.Getegid()), - NumaNode: fcclient.Int(c.numaNode), - ExecFile: c.executorConfig.FirecrackerBinaryPath, - ChrootStrategy: fcclient.NewNaiveChrootStrategy(c.executorConfig.KernelImagePath), - Stdout: c.vmLogWriter(), - Stderr: c.vmLogWriter(), - CgroupVersion: cgroupVersion, - }, + JailerCfg: jailerCfg, MachineCfg: fcmodels.MachineConfiguration{ VcpuCount: fcclient.Int64(c.vmConfig.NumCpus), MemSizeMib: fcclient.Int64(c.vmConfig.MemSizeMb), @@ -1523,14 +1507,48 @@ func (c *FirecrackerContainer) getConfig(ctx context.Context, rootFS, containerF }, } } - cfg.JailerCfg.Stdout = c.vmLogWriter() - cfg.JailerCfg.Stderr = c.vmLogWriter() if *debugTerminal { cfg.JailerCfg.Stdin = os.Stdin } return cfg, nil } +func (c *FirecrackerContainer) getJailerConfig(kernelImagePath string) (*fcclient.JailerConfig, error) { + cgroupVersion, err := getCgroupVersion() + if err != nil { + return nil, status.WrapError(err, "get cgroup version") + } + + var cgroupArgs []string + if *enableCPUWeight { + // Divide millicpu by 100 to avoid exceeding the cgroup max value, while + // still allowing reasonably fine-grained CPU weights. e.g. 32000m CPU + // translates to weight 320 which is well under the max of 10000. + // + // Note: this logic assumes cgroup version 2. cgroup1 has different + // min/max values here, and uses "cpu.shares" instead of "cpu.weight" + weight := c.cpuWeightMillis / 100 + weight = max(weight, cgroupMinCPUWeight) + weight = min(weight, cgroupMaxCPUWeight) + cgroupArgs = append(cgroupArgs, fmt.Sprintf("cpu.weight=%d", weight)) + } + + return &fcclient.JailerConfig{ + JailerBinary: c.executorConfig.JailerBinaryPath, + ChrootBaseDir: c.jailerRoot, + ID: c.id, + UID: fcclient.Int(unix.Geteuid()), + GID: fcclient.Int(unix.Getegid()), + NumaNode: fcclient.Int(c.numaNode), + ExecFile: c.executorConfig.FirecrackerBinaryPath, + ChrootStrategy: fcclient.NewNaiveChrootStrategy(kernelImagePath), + Stdout: c.vmLogWriter(), + Stderr: c.vmLogWriter(), + CgroupVersion: cgroupVersion, + CgroupArgs: cgroupArgs, + }, nil +} + func (c *FirecrackerContainer) vmLogWriter() io.Writer { if *debugStreamVMLogs || *debugTerminal { return io.MultiWriter(c.vmLog, os.Stderr)