Skip to content

Commit

Permalink
firecracker: set jailer cgroup CPU weight according to task size (#7858)
Browse files Browse the repository at this point in the history
  • Loading branch information
bduffany authored Nov 7, 2024
1 parent c88cfe9 commit 53dd046
Show file tree
Hide file tree
Showing 5 changed files with 183 additions and 42 deletions.
5 changes: 4 additions & 1 deletion MODULE.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,10 @@ go_deps.module_override(
)
go_deps.module_override(
patch_strip = 1,
patches = ["@@//buildpatches:com_github_firecracker_microvm_firecracker_go_sdk_jailer.patch"],
patches = [
"@@//buildpatches:com_github_firecracker_microvm_firecracker_go_sdk_jailer.patch",
"@@//buildpatches:com_github_firecracker_microvm_firecracker_go_sdk_cgroup.patch",
],
path = "github.com/firecracker-microvm/firecracker-go-sdk",
)
go_deps.module_override(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
From 82d24f54a9a8cad65d8949816e3f6e85a134017d Mon Sep 17 00:00:00 2001
From: Brandon Duffany <[email protected]>
Date: Wed, 6 Nov 2024 17:01:44 -0500
Subject: [PATCH] Support jailer cgroup args

Signed-off-by: Brandon Duffany <[email protected]>
---
jailer.go | 29 ++++++++++++++++++++++++++++-
jailer_test.go | 4 ++++
2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/jailer.go b/jailer.go
index 208de670..3246738d 100644
--- a/jailer.go
+++ b/jailer.go
@@ -85,6 +85,10 @@ type JailerConfig struct {
// CgroupVersion is the version of the cgroup filesystem to use.
CgroupVersion string

+ // CgroupArgs are cgroup settings applied by the jailer. Each arg must be
+ // formatted like <cgroup_file>=<value>, like "cpu.shares=10"
+ CgroupArgs []string
+
// Stdout specifies the IO writer for STDOUT to use when spawning the jailer.
Stdout io.Writer
// Stderr specifies the IO writer for STDERR to use when spawning the jailer.
@@ -109,6 +113,7 @@ type JailerCommandBuilder struct {
daemonize bool
firecrackerArgs []string
cgroupVersion string
+ cgroupArgs []string

stdin io.Reader
stdout io.Writer
@@ -143,6 +148,10 @@ func (b JailerCommandBuilder) Args() []string {
args = append(args, "--cgroup", fmt.Sprintf("cpuset.cpus=%s", cpulist))
}

+ for _, cgroupArg := range b.cgroupArgs {
+ args = append(args, "--cgroup", cgroupArg)
+ }
+
if len(b.cgroupVersion) > 0 {
args = append(args, "--cgroup-version", b.cgroupVersion)
}
@@ -204,13 +213,30 @@ func (b JailerCommandBuilder) WithExecFile(path string) JailerCommandBuilder {
return b
}

-// WithNumaNode uses the specfied node for the jailer. This represents the numa
+// WithNumaNode uses the specified node for the jailer. This represents the numa
// node that the process will get assigned to.
+// Note: this is a convenience function that just sets the values of the cgroup
+// files "cpuset.mems" and "cpuset.cpus".
+// If those files are also configured using WithCgroupArgs, the values passed to
+// WithCgroupArgs will take precedence.
func (b JailerCommandBuilder) WithNumaNode(node int) JailerCommandBuilder {
b.node = node
return b
}

+// WithCgroupArgs sets cgroup file values to be set by the jailer.
+// Each arg must be of the form <cgroup_file>=<value>.
+// Each call to this function resets the cgroup arguments, rather than
+// appending.
+//
+// Example:
+//
+// b = b.WithCgroupArgs("cpu.shares=10")
+func (b JailerCommandBuilder) WithCgroupArgs(cgroupArgs ...string) JailerCommandBuilder {
+ b.cgroupArgs = cgroupArgs
+ return b
+}
+
// WithChrootBaseDir will set the given path as the chroot base directory. This
// specifies where chroot jails are built and defaults to /srv/jailer.
func (b JailerCommandBuilder) WithChrootBaseDir(path string) JailerCommandBuilder {
@@ -348,6 +374,7 @@ func jail(ctx context.Context, m *Machine, cfg *Config) error {
WithChrootBaseDir(cfg.JailerCfg.ChrootBaseDir).
WithDaemonize(cfg.JailerCfg.Daemonize).
WithCgroupVersion(cfg.JailerCfg.CgroupVersion).
+ WithCgroupArgs(cfg.JailerCfg.CgroupArgs...).
WithFirecrackerArgs(fcArgs...).
WithStdout(stdout).
WithStderr(stderr)
diff --git a/jailer_test.go b/jailer_test.go
index 7c7017bc..6037cd27 100644
--- a/jailer_test.go
+++ b/jailer_test.go
@@ -103,6 +103,7 @@ func TestJailerBuilder(t *testing.T) {
UID: Int(123),
GID: Int(100),
NumaNode: Int(0),
+ CgroupArgs: []string{"cpu.shares=10"},
ChrootStrategy: NewNaiveChrootStrategy("kernel-image-path"),
ExecFile: "/path/to/firecracker",
ChrootBaseDir: "/tmp",
@@ -123,6 +124,8 @@ func TestJailerBuilder(t *testing.T) {
"cpuset.mems=0",
"--cgroup",
fmt.Sprintf("cpuset.cpus=%s", getNumaCpuset(0)),
+ "--cgroup",
+ "cpu.shares=10",
"--cgroup-version",
"2",
"--chroot-base-dir",
@@ -146,6 +149,7 @@ func TestJailerBuilder(t *testing.T) {
WithUID(IntValue(c.jailerCfg.UID)).
WithGID(IntValue(c.jailerCfg.GID)).
WithNumaNode(IntValue(c.jailerCfg.NumaNode)).
+ WithCgroupArgs(c.jailerCfg.CgroupArgs...).
WithCgroupVersion(c.jailerCfg.CgroupVersion).
WithExecFile(c.jailerCfg.ExecFile)

12 changes: 7 additions & 5 deletions deps.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -1503,11 +1503,13 @@ def install_go_mod_dependencies(workspace_name = "buildbuddy"):
name = "com_github_firecracker_microvm_firecracker_go_sdk",
importpath = "github.com/firecracker-microvm/firecracker-go-sdk",
patch_args = ["-p1"],
# TODO(bduffany): when
# https://github.com/firecracker-microvm/firecracker-go-sdk/pull/510 is
# merged, cherry-pick the final revision into our fork, and remove this
# patch.
patches = ["@{}//buildpatches:com_github_firecracker_microvm_firecracker_go_sdk_jailer.patch".format(workspace_name)],
# TODO(bduffany): remove these patches when the corresponding PRs are merged
patches = [
# https://github.com/firecracker-microvm/firecracker-go-sdk/pull/510
"@{}//buildpatches:com_github_firecracker_microvm_firecracker_go_sdk_jailer.patch".format(workspace_name),
# https://github.com/firecracker-microvm/firecracker-go-sdk/pull/600
"@{}//buildpatches:com_github_firecracker_microvm_firecracker_go_sdk_cgroup.patch".format(workspace_name),
],
sum = "h1:n9Q3BKnUAW0x1D1x2I2QIj0T/5K6UYL6JKkPvwwARw0=",
version = "v0.0.0-20241028184712-f74f43bb036d",
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ type ContainerOpts struct {
// The action directory with inputs / outputs.
ActionWorkingDirectory string

// CPUWeightMillis is the CPU weight to assign to this VM, expressed as
// CPU-millis. This is set to the task size.
CPUWeightMillis int64

// Optional flags -- these will default to sane values.
// They are here primarily for debugging and running
// VMs outside of the normal action-execution framework.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ var healthCheckInterval = flag.Duration("executor.firecracker_health_check_inter
var healthCheckTimeout = flag.Duration("executor.firecracker_health_check_timeout", 30*time.Second, "Timeout for VM health check requests.")
var overprovisionCPUs = flag.Int("executor.firecracker_overprovision_cpus", 3, "Number of CPUs to overprovision for VMs. This allows VMs to more effectively utilize CPU resources on the host machine. Set to -1 to allow all VMs to use max CPU.")
var initOnAllocAndFree = flag.Bool("executor.firecracker_init_on_alloc_and_free", false, "Set init_on_alloc=1 and init_on_free=1 in firecracker vms")
var enableCPUWeight = flag.Bool("executor.firecracker_enable_cpu_weight", false, "Set cgroup CPU weight to match VM size")

var forceRemoteSnapshotting = flag.Bool("debug_force_remote_snapshots", false, "When remote snapshotting is enabled, force remote snapshotting even for tasks which otherwise wouldn't support it.")
var disableWorkspaceSync = flag.Bool("debug_disable_firecracker_workspace_sync", false, "Do not sync the action workspace to the guest, instead using the existing workspace from the VM snapshot.")
Expand Down Expand Up @@ -211,6 +212,11 @@ const (
// Firecracker does not allow VMs over a certain size.
// See MAX_SUPPORTED_VCPUS in firecracker repo.
firecrackerMaxCPU = 32

// Min value for cgroup2 cpu.weight
cgroupMinCPUWeight = 1
// Max value for cgroup2 cpu.weight
cgroupMaxCPUWeight = 10_000
)

var (
Expand Down Expand Up @@ -483,6 +489,7 @@ func (p *Provider) New(ctx context.Context, args *container.Init) (container.Com
DockerClient: p.dockerClient,
ActionWorkingDirectory: args.WorkDir,
ExecutorConfig: p.executorConfig,
CPUWeightMillis: sizeEstimate.GetEstimatedMilliCpu(),
}
c, err := NewContainer(ctx, p.env, args.Task.GetExecutionTask(), opts)
if err != nil {
Expand Down Expand Up @@ -553,6 +560,7 @@ type FirecrackerContainer struct {

jailerRoot string // the root dir the jailer will work in
numaNode int // NUMA node for CPU scheduling
cpuWeightMillis int64 // milliCPU for cgroup CPU weight
machine *fcclient.Machine // the firecracker machine object.
vmLog *VMLog
env environment.Env
Expand Down Expand Up @@ -613,6 +621,7 @@ func NewContainer(ctx context.Context, env environment.Env, task *repb.Execution
containerImage: opts.ContainerImage,
user: opts.User,
actionWorkingDir: opts.ActionWorkingDirectory,
cpuWeightMillis: opts.CPUWeightMillis,
numaNode: numaNode,
env: env,
task: task,
Expand Down Expand Up @@ -986,31 +995,18 @@ func (c *FirecrackerContainer) LoadSnapshot(ctx context.Context) error {
netnsPath = c.network.NamespacePath()
}

cgroupVersion, err := getCgroupVersion()
if err != nil {
return err
}

// We start firecracker with this reduced config because we will load a
// snapshot that is already configured.
jailerCfg, err := c.getJailerConfig("" /*=kernelImagePath*/)
if err != nil {
return status.WrapError(err, "get jailer config")
}
cfg := fcclient.Config{
SocketPath: firecrackerSocketPath,
NetNS: netnsPath,
Seccomp: fcclient.SeccompConfig{Enabled: true},
DisableValidation: true,
JailerCfg: &fcclient.JailerConfig{
JailerBinary: c.executorConfig.JailerBinaryPath,
ChrootBaseDir: c.jailerRoot,
ID: c.id,
UID: fcclient.Int(unix.Geteuid()),
GID: fcclient.Int(unix.Getegid()),
NumaNode: fcclient.Int(c.numaNode),
ExecFile: c.executorConfig.FirecrackerBinaryPath,
ChrootStrategy: fcclient.NewNaiveChrootStrategy(""),
Stdout: c.vmLogWriter(),
Stderr: c.vmLogWriter(),
CgroupVersion: cgroupVersion,
},
JailerCfg: jailerCfg,
Snapshot: fcclient.SnapshotConfig{
EnableDiffSnapshots: true,
ResumeVM: true,
Expand Down Expand Up @@ -1442,11 +1438,11 @@ func (c *FirecrackerContainer) getConfig(ctx context.Context, rootFS, containerF
if c.network != nil {
netnsPath = c.network.NamespacePath()
}
cgroupVersion, err := getCgroupVersion()
bootArgs := getBootArgs(c.vmConfig)
jailerCfg, err := c.getJailerConfig(c.executorConfig.KernelImagePath)
if err != nil {
return nil, err
return nil, status.WrapError(err, "get jailer config")
}
bootArgs := getBootArgs(c.vmConfig)
cfg := &fcclient.Config{
VMID: c.id,
SocketPath: firecrackerSocketPath,
Expand All @@ -1462,19 +1458,7 @@ func (c *FirecrackerContainer) getConfig(ctx context.Context, rootFS, containerF
VsockDevices: []fcclient.VsockDevice{
{Path: firecrackerVSockPath},
},
JailerCfg: &fcclient.JailerConfig{
JailerBinary: c.executorConfig.JailerBinaryPath,
ChrootBaseDir: c.jailerRoot,
ID: c.id,
UID: fcclient.Int(unix.Geteuid()),
GID: fcclient.Int(unix.Getegid()),
NumaNode: fcclient.Int(c.numaNode),
ExecFile: c.executorConfig.FirecrackerBinaryPath,
ChrootStrategy: fcclient.NewNaiveChrootStrategy(c.executorConfig.KernelImagePath),
Stdout: c.vmLogWriter(),
Stderr: c.vmLogWriter(),
CgroupVersion: cgroupVersion,
},
JailerCfg: jailerCfg,
MachineCfg: fcmodels.MachineConfiguration{
VcpuCount: fcclient.Int64(c.vmConfig.NumCpus),
MemSizeMib: fcclient.Int64(c.vmConfig.MemSizeMb),
Expand Down Expand Up @@ -1523,14 +1507,48 @@ func (c *FirecrackerContainer) getConfig(ctx context.Context, rootFS, containerF
},
}
}
cfg.JailerCfg.Stdout = c.vmLogWriter()
cfg.JailerCfg.Stderr = c.vmLogWriter()
if *debugTerminal {
cfg.JailerCfg.Stdin = os.Stdin
}
return cfg, nil
}

func (c *FirecrackerContainer) getJailerConfig(kernelImagePath string) (*fcclient.JailerConfig, error) {
cgroupVersion, err := getCgroupVersion()
if err != nil {
return nil, status.WrapError(err, "get cgroup version")
}

var cgroupArgs []string
if *enableCPUWeight {
// Divide millicpu by 100 to avoid exceeding the cgroup max value, while
// still allowing reasonably fine-grained CPU weights. e.g. 32000m CPU
// translates to weight 320 which is well under the max of 10000.
//
// Note: this logic assumes cgroup version 2. cgroup1 has different
// min/max values here, and uses "cpu.shares" instead of "cpu.weight"
weight := c.cpuWeightMillis / 100
weight = max(weight, cgroupMinCPUWeight)
weight = min(weight, cgroupMaxCPUWeight)
cgroupArgs = append(cgroupArgs, fmt.Sprintf("cpu.weight=%d", weight))
}

return &fcclient.JailerConfig{
JailerBinary: c.executorConfig.JailerBinaryPath,
ChrootBaseDir: c.jailerRoot,
ID: c.id,
UID: fcclient.Int(unix.Geteuid()),
GID: fcclient.Int(unix.Getegid()),
NumaNode: fcclient.Int(c.numaNode),
ExecFile: c.executorConfig.FirecrackerBinaryPath,
ChrootStrategy: fcclient.NewNaiveChrootStrategy(kernelImagePath),
Stdout: c.vmLogWriter(),
Stderr: c.vmLogWriter(),
CgroupVersion: cgroupVersion,
CgroupArgs: cgroupArgs,
}, nil
}

func (c *FirecrackerContainer) vmLogWriter() io.Writer {
if *debugStreamVMLogs || *debugTerminal {
return io.MultiWriter(c.vmLog, os.Stderr)
Expand Down

0 comments on commit 53dd046

Please sign in to comment.