From 618e149e4ae59ae46dd5fd404d61c4bbac5e1b5a Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 13 Mar 2024 16:12:51 +1100 Subject: [PATCH] [1.1] seccomp: patchbpf: always include native architecture in stub (This is a backport of ccc500c427da731554a181f2ea407adf99870423.) It turns out that on ppc64le (at least), Docker doesn't include any architectures in the list of allowed architectures. libseccomp interprets this as "just include the default architecture" but patchbpf would return a no-op ENOSYS stub, which would lead to the exact issues that commit 7a8d7162f9d7 ("seccomp: prepend -ENOSYS stub to all filters") fixed for other architectures. So, just always include the running architecture in the list. There's no real downside. Ref: https://bugzilla.suse.com/show_bug.cgi?id=1192051#c6 Fixes: 7a8d7162f9d7 ("seccomp: prepend -ENOSYS stub to all filters") Reported-by: Fabian Vogt Signed-off-by: Aleksa Sarai --- libcontainer/seccomp/patchbpf/enosys_linux.go | 22 +++++++-- .../seccomp/patchbpf/enosys_linux_test.go | 47 +++++++++++++++++-- 2 files changed, 61 insertions(+), 8 deletions(-) diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go index 1b67fda85c6..d459ba8792c 100644 --- a/libcontainer/seccomp/patchbpf/enosys_linux.go +++ b/libcontainer/seccomp/patchbpf/enosys_linux.go @@ -224,16 +224,30 @@ type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSy // representation, but SCMP_ARCH_X32 means we have to track cases where the // same architecture has different largest syscalls based on the mode. func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) { - lastSyscalls := make(lastSyscallMap) - // Only loop over architectures which are present in the filter. Any other - // architectures will get the libseccomp bad architecture action anyway. 
+ scmpArchs := make(map[libseccomp.ScmpArch]struct{}) for _, ociArch := range config.Architectures { arch, err := libseccomp.GetArchFromString(ociArch) if err != nil { return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err) } + scmpArchs[arch] = struct{}{} + } + // On architectures like ppc64le, Docker inexplicably doesn't include the + // native architecture in the architecture list which results in no + // architectures being present in the list at all (rendering the ENOSYS + // stub a no-op). So, always include the native architecture. + if nativeScmpArch, err := libseccomp.GetNativeArch(); err != nil { + return nil, fmt.Errorf("unable to get native arch: %w", err) + } else if _, ok := scmpArchs[nativeScmpArch]; !ok { + logrus.Debugf("seccomp: adding implied native architecture %v to config set", nativeScmpArch) + scmpArchs[nativeScmpArch] = struct{}{} + } + logrus.Debugf("seccomp: configured architecture set: %s", scmpArchs) - // Figure out native architecture representation of the architecture. + // Only loop over architectures which are present in the filter. Any other + // architectures will get the libseccomp bad architecture action anyway. 
+ lastSyscalls := make(lastSyscallMap) + for arch := range scmpArchs { auditArch, err := scmpArchToAuditArch(arch) if err != nil { return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err) diff --git a/libcontainer/seccomp/patchbpf/enosys_linux_test.go b/libcontainer/seccomp/patchbpf/enosys_linux_test.go index bdfeff68adb..3d442e1daa6 100644 --- a/libcontainer/seccomp/patchbpf/enosys_linux_test.go +++ b/libcontainer/seccomp/patchbpf/enosys_linux_test.go @@ -12,6 +12,7 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" libseccomp "github.com/seccomp/libseccomp-golang" + "github.com/sirupsen/logrus" "golang.org/x/net/bpf" ) @@ -105,6 +106,18 @@ var testArches = []string{ "ppc64le", "s390", "s390x", + // Dummy value to indicate a configuration with no architecture specified. + "native", +} + +var nativeArch string + +func init() { + scmpNativeArch, err := libseccomp.GetNativeArch() + if err != nil { + logrus.Panicf("get native arch: %v", err) + } + nativeArch = scmpNativeArch.String() } func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) { @@ -155,6 +168,9 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) expected uint32 } + if arch == "native" { + arch = nativeArch + } scmpArch, err := libseccomp.GetArchFromString(arch) if err != nil { t.Fatalf("unknown libseccomp architecture %q: %v", arch, err) @@ -228,8 +244,15 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) // Test syscalls in the explicit list. for _, test := range syscallTests { - // Override the expected value in the two special cases. - if !archSet[arch] || isAllowAction(defaultAction) { + // Override the expected value in the two special cases: + // 1. If the default action is allow, the filter won't have + // the stub prepended so we expect a fallthrough. + // 2. 
If the executing architecture is not in the architecture + // set, then the architecture is not handled by the stub -- + // *except* in the case of the native architecture (which + // is always included in the stub). + if isAllowAction(defaultAction) || + (!archSet[arch] && arch != nativeArch) { test.expected = retFallthrough } @@ -263,7 +286,14 @@ var testActions = map[string]configs.Action{ func TestEnosysStub_SingleArch(t *testing.T) { for _, arch := range testArches { - arches := []string{arch} + var arches []string + // "native" indicates a blank architecture field for seccomp, to test + // the case where the running architecture was not included in the + // architecture list. Docker doesn't always set the architecture for some + // reason (namely for ppc64le). + if arch != "native" { + arches = append(arches, arch) + } t.Run("arch="+arch, func(t *testing.T) { for name, action := range testActions { t.Run("action="+name, func(t *testing.T) { @@ -277,7 +307,16 @@ func TestEnosysStub_SingleArch(t *testing.T) { func TestEnosysStub_MultiArch(t *testing.T) { for end := 0; end < len(testArches); end++ { for start := 0; start < end; start++ { - arches := testArches[start:end] + var arches []string + for _, arch := range testArches[start:end] { + // "native" indicates a blank architecture field for seccomp, to test + // the case where the running architecture was not included in the + // architecture list. Docker doesn't always set the architecture for some + // reason (namely for ppc64le). + if arch != "native" { + arches = append(arches, arch) + } + } if len(arches) <= 1 { continue }