Skip to content

Commit

Permalink
Apply cgroup settings to firecracker and ociruntime
Browse files Browse the repository at this point in the history
  • Loading branch information
bduffany committed Nov 19, 2024
1 parent 8712ac9 commit 1e7a6bc
Show file tree
Hide file tree
Showing 9 changed files with 194 additions and 32 deletions.
9 changes: 9 additions & 0 deletions enterprise/server/remote_execution/container/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,15 @@ type Init struct {
// BlockDevice is the block device where the build root dir is located.
BlockDevice *block_io.Device

// CgroupParent is the path relative to the cgroupfs root under which the
// container's cgroup should be created. For example, if the cgroupfs is
// rooted at "/sys/fs/cgroup" and all container cgroups are placed under
// "/sys/fs/cgroup/buildbuddy.executor.containers" then this will be
// "buildbuddy.executor.containers". The container implementation is
// responsible for creating the cgroup, applying the CgroupSettings
// from `Task.SchedulingMetadata.CgroupSettings`.
CgroupParent string

// Publisher can be used to send fine-grained execution progress updates.
Publisher *operation.Publisher
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ go_library(
"vmlinuxRunfilePath": "$(rlocationpath //enterprise/vmsupport/bin:vmlinux)",
},
deps = [
"//enterprise/server/remote_execution/block_io",
"//enterprise/server/remote_execution/cgroup",
"//enterprise/server/remote_execution/commandutil",
"//enterprise/server/remote_execution/container",
"//enterprise/server/remote_execution/copy_on_write",
Expand All @@ -39,6 +41,7 @@ go_library(
"//enterprise/vmsupport:bundle",
"//proto:firecracker_go_proto",
"//proto:remote_execution_go_proto",
"//proto:scheduler_go_proto",
"//proto:vmexec_go_proto",
"//proto:vmvfs_go_proto",
"//server/environment",
Expand All @@ -50,6 +53,7 @@ go_library(
"//server/util/disk",
"//server/util/log",
"//server/util/networking",
"//server/util/proto",
"//server/util/status",
"//server/util/tracing",
"@com_github_armon_circbuf//:circbuf",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
package firecracker

import (
fcpb "github.com/buildbuddy-io/buildbuddy/proto/firecracker"
"github.com/buildbuddy-io/buildbuddy/enterprise/server/remote_execution/block_io"

dockerclient "github.com/docker/docker/client"
fcpb "github.com/buildbuddy-io/buildbuddy/proto/firecracker"
scpb "github.com/buildbuddy-io/buildbuddy/proto/scheduler"
)

type ContainerOpts struct {
Expand Down Expand Up @@ -45,6 +48,18 @@ type ContainerOpts struct {
// CPU-millis. This is set to the task size.
CPUWeightMillis int64

// CgroupParent is the parent cgroup path relative to the cgroup root.
CgroupParent string

// CgroupSettings are settings applied to cgroup in which jailer executes.
CgroupSettings *scpb.CgroupSettings

// BlockDevice sets the block device for restricting IO via cgroup. Note
// that the firecracker cgroup does not affect IO for virtual block devices
// (VBD) or memory snapshots (UFFD) since the server for these devices runs
// in the executor process.
BlockDevice *block_io.Device

// Optional flags -- these will default to sane values.
// They are here primarily for debugging and running
// VMs outside of the normal action-execution framework.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ import (

"github.com/armon/circbuf"
"github.com/bazelbuild/rules_go/go/runfiles"
"github.com/buildbuddy-io/buildbuddy/enterprise/server/remote_execution/block_io"
"github.com/buildbuddy-io/buildbuddy/enterprise/server/remote_execution/cgroup"
"github.com/buildbuddy-io/buildbuddy/enterprise/server/remote_execution/commandutil"
"github.com/buildbuddy-io/buildbuddy/enterprise/server/remote_execution/container"
"github.com/buildbuddy-io/buildbuddy/enterprise/server/remote_execution/copy_on_write"
Expand All @@ -50,6 +52,7 @@ import (
"github.com/buildbuddy-io/buildbuddy/server/util/disk"
"github.com/buildbuddy-io/buildbuddy/server/util/log"
"github.com/buildbuddy-io/buildbuddy/server/util/networking"
"github.com/buildbuddy-io/buildbuddy/server/util/proto"
"github.com/buildbuddy-io/buildbuddy/server/util/status"
"github.com/buildbuddy-io/buildbuddy/server/util/tracing"
"github.com/firecracker-microvm/firecracker-go-sdk/client/operations"
Expand All @@ -64,6 +67,7 @@ import (
vmsupport_bundle "github.com/buildbuddy-io/buildbuddy/enterprise/vmsupport"
fcpb "github.com/buildbuddy-io/buildbuddy/proto/firecracker"
repb "github.com/buildbuddy-io/buildbuddy/proto/remote_execution"
scpb "github.com/buildbuddy-io/buildbuddy/proto/scheduler"
vmxpb "github.com/buildbuddy-io/buildbuddy/proto/vmexec"
vmfspb "github.com/buildbuddy-io/buildbuddy/proto/vmvfs"
fcclient "github.com/firecracker-microvm/firecracker-go-sdk"
Expand Down Expand Up @@ -471,6 +475,9 @@ func (p *Provider) New(ctx context.Context, args *container.Init) (container.Com
ContainerImage: args.Props.ContainerImage,
User: args.Props.DockerUser,
ActionWorkingDirectory: args.WorkDir,
CgroupParent: args.CgroupParent,
CgroupSettings: args.Task.GetSchedulingMetadata().GetCgroupSettings(),
BlockDevice: args.BlockDevice,
ExecutorConfig: p.executorConfig,
CPUWeightMillis: sizeEstimate.GetEstimatedMilliCpu(),
}
Expand Down Expand Up @@ -538,10 +545,13 @@ type FirecrackerContainer struct {
uffdHandler *uffd.Handler
memoryStore *copy_on_write.COWStore

jailerRoot string // the root dir the jailer will work in
numaNode int // NUMA node for CPU scheduling
cpuWeightMillis int64 // milliCPU for cgroup CPU weight
machine *fcclient.Machine // the firecracker machine object.
jailerRoot string // the root dir the jailer will work in
numaNode int // NUMA node for CPU scheduling
cpuWeightMillis int64 // milliCPU for cgroup CPU weight
cgroupParent string // parent cgroup path (root-relative)
cgroupSettings *scpb.CgroupSettings // jailer cgroup settings
blockDevice *block_io.Device // block device for cgroup IO settings
machine *fcclient.Machine // the firecracker machine object.
vmLog *VMLog
env environment.Env
mountWorkspaceFile bool
Expand Down Expand Up @@ -601,6 +611,9 @@ func NewContainer(ctx context.Context, env environment.Env, task *repb.Execution
user: opts.User,
actionWorkingDir: opts.ActionWorkingDirectory,
cpuWeightMillis: opts.CPUWeightMillis,
cgroupParent: opts.CgroupParent,
cgroupSettings: opts.CgroupSettings,
blockDevice: opts.BlockDevice,
numaNode: numaNode,
env: env,
task: task,
Expand Down Expand Up @@ -976,7 +989,7 @@ func (c *FirecrackerContainer) LoadSnapshot(ctx context.Context) error {

// We start firecracker with this reduced config because we will load a
// snapshot that is already configured.
jailerCfg, err := c.getJailerConfig("" /*=kernelImagePath*/)
jailerCfg, err := c.getJailerConfig(ctx, "" /*=kernelImagePath*/)
if err != nil {
return status.WrapError(err, "get jailer config")
}
Expand Down Expand Up @@ -1418,7 +1431,7 @@ func (c *FirecrackerContainer) getConfig(ctx context.Context, rootFS, containerF
netnsPath = c.network.NamespacePath()
}
bootArgs := getBootArgs(c.vmConfig)
jailerCfg, err := c.getJailerConfig(c.executorConfig.KernelImagePath)
jailerCfg, err := c.getJailerConfig(ctx, c.executorConfig.KernelImagePath)
if err != nil {
return nil, status.WrapError(err, "get jailer config")
}
Expand Down Expand Up @@ -1492,17 +1505,33 @@ func (c *FirecrackerContainer) getConfig(ctx context.Context, rootFS, containerF
return cfg, nil
}

func (c *FirecrackerContainer) getJailerConfig(kernelImagePath string) (*fcclient.JailerConfig, error) {
func (c *FirecrackerContainer) cgroupPath() string {
return filepath.Join("/sys/fs/cgroup", c.cgroupParent, c.id)
}

func (c *FirecrackerContainer) getJailerConfig(ctx context.Context, kernelImagePath string) (*fcclient.JailerConfig, error) {
cgroupVersion, err := getCgroupVersion()
if err != nil {
return nil, status.WrapError(err, "get cgroup version")
}

var cgroupArgs []string
if *enableCPUWeight {
// Apply CPU weight only if the app hasn't set CPU weight via scheduling
// metadata.
cgroupSettings := c.cgroupSettings
if cgroupSettings == nil && *enableCPUWeight {
// Use the same weight calculation used in ociruntime.
cpuWeight := oci.CPUSharesToWeight(oci.CPUMillisToShares(c.cpuWeightMillis))
cgroupArgs = append(cgroupArgs, fmt.Sprintf("cpu.weight=%d", cpuWeight))
cgroupSettings = &scpb.CgroupSettings{
CpuWeight: proto.Int64(cpuWeight),
}
}
if cgroupSettings != nil {
if err := os.MkdirAll(c.cgroupPath(), 0755); err != nil {
return nil, status.UnavailableErrorf("create cgroup: %s", err)
}
if err := cgroup.Setup(ctx, c.cgroupPath(), c.cgroupSettings, c.blockDevice); err != nil {
return nil, status.UnavailableErrorf("setup cgroup: %s", err)
}
}

return &fcclient.JailerConfig{
Expand All @@ -1517,12 +1546,11 @@ func (c *FirecrackerContainer) getJailerConfig(kernelImagePath string) (*fcclien
Stdout: c.vmLogWriter(),
Stderr: c.vmLogWriter(),
CgroupVersion: cgroupVersion,
CgroupArgs: cgroupArgs,
// Use the root cgroup as the cgroup parent rather than the default
// "firecracker" subdir, so that VM cgroups are siblings of OCI
// container cgroups. This is needed because CPU weight is applied to
// sibling cgroups in a hierarchy.
ParentCgroup: fcclient.String(""),
ParentCgroup: fcclient.String(c.cgroupParent),
}, nil
}

Expand Down Expand Up @@ -2475,7 +2503,7 @@ func (c *FirecrackerContainer) stopMachine(ctx context.Context) error {
// Once the VM exits, delete the cgroup.
// Jailer docs say that this cleanup must be handled by us:
// https://github.com/firecracker-microvm/firecracker/blob/main/docs/jailer.md#observations
if err := os.Remove(filepath.Join("/sys/fs/cgroup", c.id)); err != nil && !os.IsNotExist(err) {
if err := os.Remove(c.cgroupPath()); err != nil && !os.IsNotExist(err) {
log.CtxWarningf(ctx, "Failed to remove jailer cgroup: %s", err)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ go_library(
"//enterprise/server/remote_execution/container",
"//enterprise/server/util/oci",
"//proto:remote_execution_go_proto",
"//proto:scheduler_go_proto",
"//server/environment",
"//server/interfaces",
"//server/util/disk",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import (
"golang.org/x/sys/unix"

repb "github.com/buildbuddy-io/buildbuddy/proto/remote_execution"
scpb "github.com/buildbuddy-io/buildbuddy/proto/scheduler"
ctr "github.com/google/go-containerregistry/pkg/v1"
specs "github.com/opencontainers/runtime-spec/specs-go"
)
Expand Down Expand Up @@ -193,6 +194,8 @@ func (p *provider) New(ctx context.Context, args *container.Init) (container.Com
networkPool: p.networkPool,

blockDevice: args.BlockDevice,
cgroupParent: args.CgroupParent,
cgroupSettings: args.Task.GetSchedulingMetadata().GetCgroupSettings(),
imageRef: args.Props.ContainerImage,
networkEnabled: args.Props.DockerNetwork != "off",
user: args.Props.DockerUser,
Expand All @@ -207,6 +210,8 @@ type ociContainer struct {

runtime string
cgroupPaths *cgroup.Paths
cgroupParent string
cgroupSettings *scpb.CgroupSettings
blockDevice *block_io.Device
containersRoot string
imageCacheRoot string
Expand Down Expand Up @@ -238,6 +243,14 @@ func (c *ociContainer) rootfsPath() string {
return filepath.Join(c.bundlePath(), "rootfs")
}

func (c *ociContainer) cgroupRootRelativePath() string {
return filepath.Join(c.cgroupParent, c.cid)
}

func (c *ociContainer) cgroupPath() string {
return filepath.Join("/sys/fs/cgroup", c.cgroupRootRelativePath())
}

// Returns the standard config.json path expected by crun.
func (c *ociContainer) configPath() string {
return filepath.Join(c.bundlePath(), "config.json")
Expand Down Expand Up @@ -287,6 +300,10 @@ func (c *ociContainer) createBundle(ctx context.Context, cmd *repb.Command) erro
if err := c.createRootfs(ctx); err != nil {
return fmt.Errorf("create rootfs: %w", err)
}
// Setup cgroup
if err := c.setupCgroup(ctx); err != nil {
return fmt.Errorf("setup cgroup: %w", err)
}

// Create config.json from the image config and command
image, ok := c.imageStore.CachedImage(c.imageRef)
Expand Down Expand Up @@ -468,6 +485,10 @@ func (c *ociContainer) Remove(ctx context.Context) error {
firstErr = status.UnavailableErrorf("remove bundle: %s", err)
}

if err := os.Remove(c.cgroupPath()); err != nil && firstErr == nil && !os.IsNotExist(err) {
firstErr = status.UnavailableErrorf("remove cgroup: %s", err)
}

return firstErr
}

Expand Down Expand Up @@ -542,6 +563,17 @@ func (c *ociContainer) doWithStatsTracking(ctx context.Context, invokeRuntimeFn
return res
}

func (c *ociContainer) setupCgroup(ctx context.Context) error {
path := c.cgroupPath()
if err := os.MkdirAll(path, 0755); err != nil {
return fmt.Errorf("create cgroup: %w", err)
}
if err := cgroup.Setup(ctx, path, c.cgroupSettings, c.blockDevice); err != nil {
return fmt.Errorf("configure cgroup: %w", err)
}
return nil
}

func (c *ociContainer) createRootfs(ctx context.Context) error {
if err := os.MkdirAll(c.rootfsPath(), 0755); err != nil {
return fmt.Errorf("create rootfs dir: %w", err)
Expand Down Expand Up @@ -689,25 +721,35 @@ func installBusybox(ctx context.Context, path string) error {

func (c *ociContainer) createSpec(ctx context.Context, cmd *repb.Command) (*specs.Spec, error) {
env := append(baseEnv, commandutil.EnvStringList(cmd)...)
var pids *specs.LinuxPids
if *pidsLimit >= 0 {
pids = &specs.LinuxPids{Limit: *pidsLimit}
}
image, _ := c.imageStore.CachedImage(c.imageRef)
user, err := getUser(ctx, image, c.rootfsPath(), c.user, c.forceRoot)
if err != nil {
return nil, fmt.Errorf("get container user: %w", err)
}

cpuSpecs := &specs.LinuxCPU{}
if *cpuLimit != 0 {
period := 100 * time.Millisecond
cpuSpecs.Quota = pointer(int64(*cpuLimit) * period.Microseconds())
cpuSpecs.Period = pointer(uint64(period.Microseconds()))
}

if *cpuSharesEnabled {
cpuSpecs.Shares = pointer(uint64(oci.CPUMillisToShares(c.milliCPU)))
var resources *specs.LinuxResources
// If the app is controlling cgroup settings then those take precedence
// over the executor settings.
// TODO: should self-hosted users be allowed to override the app's
// settings?
if c.cgroupSettings == nil {
var pids *specs.LinuxPids
if *pidsLimit >= 0 {
pids = &specs.LinuxPids{Limit: *pidsLimit}
}
cpuSpecs := &specs.LinuxCPU{}
if *cpuLimit != 0 {
period := 100 * time.Millisecond
cpuSpecs.Quota = pointer(int64(*cpuLimit) * period.Microseconds())
cpuSpecs.Period = pointer(uint64(period.Microseconds()))
}
if *cpuSharesEnabled {
cpuSpecs.Shares = pointer(uint64(oci.CPUMillisToShares(c.milliCPU)))
}
resources = &specs.LinuxResources{
Pids: pids,
CPU: cpuSpecs,
}
}

spec := specs.Spec{
Expand Down Expand Up @@ -815,7 +857,7 @@ func (c *ociContainer) createSpec(ctx context.Context, cmd *repb.Command) (*spec
},
Linux: &specs.Linux{
// TODO: set up cgroups
CgroupsPath: "",
CgroupsPath: filepath.Join(c.cgroupRootRelativePath()),
Namespaces: []specs.LinuxNamespace{
{Type: specs.PIDNamespace},
{Type: specs.IPCNamespace},
Expand All @@ -832,10 +874,7 @@ func (c *ociContainer) createSpec(ctx context.Context, cmd *repb.Command) (*spec
Sysctl: map[string]string{
"net.ipv4.ping_group_range": fmt.Sprintf("%d %d", user.GID, user.GID),
},
Resources: &specs.LinuxResources{
Pids: pids,
CPU: cpuSpecs,
},
Resources: resources,
// TODO: grok MaskedPaths and ReadonlyPaths - just copied from podman.
MaskedPaths: []string{
"/proc/acpi",
Expand Down
Loading

0 comments on commit 1e7a6bc

Please sign in to comment.