From 6223a65d5d6f5df283a1ebec10c3920f9250d2f3 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 14 Oct 2022 18:37:00 -0700 Subject: [PATCH 1/3] [1.1] libct/seccomp/patchbpf: rm duplicated code (This is a cherry-pick of 2cd05e44b662fb79c46d5ebfd6c71e9ebc98d40c.) In findLastSyscalls, we convert libseccomp.ArchNative to the real libseccomp architecture, but archToNative already does that, so this code is redundant. Remove the redundant code, and move its comment to archToNative. Fixes: 7a8d7162f9d7 ("seccomp: prepend -ENOSYS stub to all filters") Signed-off-by: Kir Kolyshkin Signed-off-by: Aleksa Sarai --- libcontainer/seccomp/patchbpf/enosys_linux.go | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go index efe6dca58b2..c9c1d4ccb68 100644 --- a/libcontainer/seccomp/patchbpf/enosys_linux.go +++ b/libcontainer/seccomp/patchbpf/enosys_linux.go @@ -233,16 +233,6 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) { return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err) } - // Map native architecture to a real architecture value to avoid - // doubling-up the lastSyscall mapping. - if arch == libseccomp.ArchNative { - nativeArch, err := libseccomp.GetNativeArch() - if err != nil { - return nil, fmt.Errorf("unable to get native architecture: %w", err) - } - arch = nativeArch - } - // Figure out native architecture representation of the architecture. nativeArch, err := archToNative(arch) if err != nil { From d85b58388f40b9b135fa7d71edd702894365d2fa Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 13 Mar 2024 13:40:16 +1100 Subject: [PATCH 2/3] [1.1] seccomp: patchbpf: rename nativeArch -> linuxAuditArch (This is a backport of b288abeaa58816aae1235dbd527b79cec5df644b.) Calling the Linux AUDIT_* architecture constants "native" leads to confusing code when we are getting the actual native architecture of the running system. Signed-off-by: Aleksa Sarai --- libcontainer/seccomp/patchbpf/enosys_linux.go | 81 ++++++++++--------- .../seccomp/patchbpf/enosys_linux_test.go | 16 ++-- 2 files changed, 49 insertions(+), 48 deletions(-) diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go index c9c1d4ccb68..1b67fda85c6 100644 --- a/libcontainer/seccomp/patchbpf/enosys_linux.go +++ b/libcontainer/seccomp/patchbpf/enosys_linux.go @@ -164,11 +164,11 @@ func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error) return program, nil } -type nativeArch uint32 +type linuxAuditArch uint32 -const invalidArch nativeArch = 0 +const invalidArch linuxAuditArch = 0 -func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) { +func scmpArchToAuditArch(arch libseccomp.ScmpArch) (linuxAuditArch, error) { switch arch { case libseccomp.ArchNative: // Convert to actual native architecture. @@ -176,48 +176,48 @@ func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) { if err != nil { return invalidArch, fmt.Errorf("unable to get native arch: %w", err) } - return archToNative(arch) + return scmpArchToAuditArch(arch) case libseccomp.ArchX86: - return nativeArch(C.C_AUDIT_ARCH_I386), nil + return linuxAuditArch(C.C_AUDIT_ARCH_I386), nil case libseccomp.ArchAMD64, libseccomp.ArchX32: // NOTE: x32 is treated like x86_64 except all x32 syscalls have the // 30th bit of the syscall number set to indicate that it's not a // normal x86_64 syscall. - return nativeArch(C.C_AUDIT_ARCH_X86_64), nil + return linuxAuditArch(C.C_AUDIT_ARCH_X86_64), nil case libseccomp.ArchARM: - return nativeArch(C.C_AUDIT_ARCH_ARM), nil + return linuxAuditArch(C.C_AUDIT_ARCH_ARM), nil case libseccomp.ArchARM64: - return nativeArch(C.C_AUDIT_ARCH_AARCH64), nil + return linuxAuditArch(C.C_AUDIT_ARCH_AARCH64), nil case libseccomp.ArchMIPS: - return nativeArch(C.C_AUDIT_ARCH_MIPS), nil + return linuxAuditArch(C.C_AUDIT_ARCH_MIPS), nil case libseccomp.ArchMIPS64: - return nativeArch(C.C_AUDIT_ARCH_MIPS64), nil + return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64), nil case libseccomp.ArchMIPS64N32: - return nativeArch(C.C_AUDIT_ARCH_MIPS64N32), nil + return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64N32), nil case libseccomp.ArchMIPSEL: - return nativeArch(C.C_AUDIT_ARCH_MIPSEL), nil + return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL), nil case libseccomp.ArchMIPSEL64: - return nativeArch(C.C_AUDIT_ARCH_MIPSEL64), nil + return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64), nil case libseccomp.ArchMIPSEL64N32: - return nativeArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil + return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil case libseccomp.ArchPPC: - return nativeArch(C.C_AUDIT_ARCH_PPC), nil + return linuxAuditArch(C.C_AUDIT_ARCH_PPC), nil case libseccomp.ArchPPC64: - return nativeArch(C.C_AUDIT_ARCH_PPC64), nil + return linuxAuditArch(C.C_AUDIT_ARCH_PPC64), nil case libseccomp.ArchPPC64LE: - return nativeArch(C.C_AUDIT_ARCH_PPC64LE), nil + return linuxAuditArch(C.C_AUDIT_ARCH_PPC64LE), nil case libseccomp.ArchS390: - return nativeArch(C.C_AUDIT_ARCH_S390), nil + return linuxAuditArch(C.C_AUDIT_ARCH_S390), nil case libseccomp.ArchS390X: - return nativeArch(C.C_AUDIT_ARCH_S390X), nil + return linuxAuditArch(C.C_AUDIT_ARCH_S390X), nil case libseccomp.ArchRISCV64: - return nativeArch(C.C_AUDIT_ARCH_RISCV64), nil + return linuxAuditArch(C.C_AUDIT_ARCH_RISCV64), nil default: return invalidArch, fmt.Errorf("unknown architecture: %v", arch) } } -type lastSyscallMap map[nativeArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall +type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall // Figure out largest syscall number referenced in the filter for each // architecture. We will be generating code based on the native architecture @@ -234,17 +234,17 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) { } // Figure out native architecture representation of the architecture. - nativeArch, err := archToNative(arch) + auditArch, err := scmpArchToAuditArch(arch) if err != nil { return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err) } - if _, ok := lastSyscalls[nativeArch]; !ok { - lastSyscalls[nativeArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{} + if _, ok := lastSyscalls[auditArch]; !ok { + lastSyscalls[auditArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{} } - if _, ok := lastSyscalls[nativeArch][arch]; ok { + if _, ok := lastSyscalls[auditArch][arch]; ok { // Because of ArchNative we may hit the same entry multiple times. - // Just skip it if we've seen this (nativeArch, ScmpArch) + // Just skip it if we've seen this (linuxAuditArch, ScmpArch) // combination before. continue } @@ -262,10 +262,11 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) { } } if largestSyscall != 0 { - lastSyscalls[nativeArch][arch] = largestSyscall + logrus.Debugf("seccomp: largest syscall number for arch %v is %v", arch, largestSyscall) + lastSyscalls[auditArch][arch] = largestSyscall } else { - logrus.Warnf("could not find any syscalls for arch %s", ociArch) - delete(lastSyscalls[nativeArch], arch) + logrus.Warnf("could not find any syscalls for arch %v", arch) + delete(lastSyscalls[auditArch], arch) } } return lastSyscalls, nil @@ -283,10 +284,10 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) { // close_range(2) which were added out-of-order in the syscall table between // kernel releases. func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) { - // A jump-table for each nativeArch used to generate the initial + // A jump-table for each linuxAuditArch used to generate the initial // conditional jumps -- measured from the *END* of the program so they // remain valid after prepending to the tail. - archJumpTable := map[nativeArch]uint32{} + archJumpTable := map[linuxAuditArch]uint32{} // Generate our own -ENOSYS rules for each architecture. They have to be // generated in reverse (prepended to the tail of the program) because the @@ -299,7 +300,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) } // Generate the syscall -ENOSYS rules. - for nativeArch, maxSyscalls := range lastSyscalls { + for auditArch, maxSyscalls := range lastSyscalls { // The number of instructions from the tail of this section which need // to be jumped in order to reach the -ENOSYS return. If the section // does not jump, it will fall through to the actual filter. @@ -380,7 +381,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) // If we're on x86 we need to add a check for x32 and if we're in // the wrong mode we jump over the section. - if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) { + if uint32(auditArch) == uint32(C.C_AUDIT_ARCH_X86_64) { // Generate a prefix to check the mode. switch scmpArch { case libseccomp.ArchAMD64: @@ -409,8 +410,8 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) section = append(section, sectionTail...) case 2: // x32 and x86_64 are a unique case, we can't handle any others. - if uint32(nativeArch) != uint32(C.C_AUDIT_ARCH_X86_64) { - return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", nativeArch) + if uint32(auditArch) != uint32(C.C_AUDIT_ARCH_X86_64) { + return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", auditArch) } x32sysno, ok := maxSyscalls[libseccomp.ArchX32] @@ -487,7 +488,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) programTail = append(section, programTail...) // Update jump table. - archJumpTable[nativeArch] = uint32(len(programTail)) + archJumpTable[auditArch] = uint32(len(programTail)) } // Add a dummy "jump to filter" for any architecture we might miss below. @@ -507,9 +508,9 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) // architectures based on how large the jumps are going to be, or // re-sort the candidate architectures each time to make sure that we // pick the largest jump which is going to be smaller than 255. - for nativeArch := range lastSyscalls { + for auditArch := range lastSyscalls { // We jump forwards but the jump table is calculated from the *END*. - jump := uint32(len(programTail)) - archJumpTable[nativeArch] + jump := uint32(len(programTail)) - archJumpTable[auditArch] // Same routine as above -- this is a basic jeq check, complicated // slightly if it turns out that we need to do a long jump. @@ -518,7 +519,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) // jeq [arch],[jump] bpf.JumpIf{ Cond: bpf.JumpEqual, - Val: uint32(nativeArch), + Val: uint32(auditArch), SkipTrue: uint8(jump), }, }, programTail...) @@ -527,7 +528,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) // jne [arch],1 bpf.JumpIf{ Cond: bpf.JumpNotEqual, - Val: uint32(nativeArch), + Val: uint32(auditArch), SkipTrue: 1, }, // ja [jump] diff --git a/libcontainer/seccomp/patchbpf/enosys_linux_test.go b/libcontainer/seccomp/patchbpf/enosys_linux_test.go index e2d363a43bd..bdfeff68adb 100644 --- a/libcontainer/seccomp/patchbpf/enosys_linux_test.go +++ b/libcontainer/seccomp/patchbpf/enosys_linux_test.go @@ -23,7 +23,7 @@ type seccompData struct { } // mockSyscallPayload creates a fake seccomp_data struct with the given data. -func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch nativeArch, args ...uint64) []byte { +func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch linuxAuditArch, args ...uint64) []byte { var buf bytes.Buffer data := seccompData{ @@ -150,8 +150,8 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) for _, arch := range testArches { type syscallTest struct { - syscall string sysno libseccomp.ScmpSyscall + syscall string expected uint32 } @@ -160,7 +160,7 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) t.Fatalf("unknown libseccomp architecture %q: %v", arch, err) } - nativeArch, err := archToNative(scmpArch) + auditArch, err := scmpArchToAuditArch(scmpArch) if err != nil { t.Fatalf("unknown audit architecture %q: %v", arch, err) } @@ -179,9 +179,9 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) t.Fatalf("unknown syscall %q on arch %q: %v", syscall, arch, err) } syscallTests = append(syscallTests, syscallTest{ - syscall, - sysno, - expected, + sysno: sysno, + syscall: syscall, + expected: expected, }) } @@ -233,7 +233,7 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) test.expected = retFallthrough } - payload := mockSyscallPayload(t, test.sysno, nativeArch, 0x1337, 0xF00BA5) + payload := mockSyscallPayload(t, test.sysno, auditArch, 0x1337, 0xF00BA5) // NOTE: golang.org/x/net/bpf returns int here rather // than uint32. rawRet, err := filter.Run(payload) @@ -247,7 +247,7 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) t.Logf(" [%4.1d] %s", idx, insn) } t.Logf("payload: %#v", payload) - t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, nativeArch, test.syscall, test.sysno, ret, test.expected) + t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, auditArch, test.syscall, test.sysno, ret, test.expected) } } } From 618e149e4ae59ae46dd5fd404d61c4bbac5e1b5a Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 13 Mar 2024 16:12:51 +1100 Subject: [PATCH 3/3] [1.1] seccomp: patchbpf: always include native architecture in stub (This is a backport of ccc500c427da731554a181f2ea407adf99870423.) It turns out that on ppc64le (at least), Docker doesn't include any architectures in the list of allowed architectures. libseccomp interprets this as "just include the default architecture" but patchbpf would return a no-op ENOSYS stub, which would lead to the exact issues that commit 7a8d7162f9d7 ("seccomp: prepend -ENOSYS stub to all filters") fixed for other architectures. So, just always include the running architecture in the list. There's no real downside. Ref: https://bugzilla.suse.com/show_bug.cgi?id=1192051#c6 Fixes: 7a8d7162f9d7 ("seccomp: prepend -ENOSYS stub to all filters") Reported-by: Fabian Vogt Signed-off-by: Aleksa Sarai --- libcontainer/seccomp/patchbpf/enosys_linux.go | 22 +++++++-- .../seccomp/patchbpf/enosys_linux_test.go | 47 +++++++++++++++++-- 2 files changed, 61 insertions(+), 8 deletions(-) diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go index 1b67fda85c6..d459ba8792c 100644 --- a/libcontainer/seccomp/patchbpf/enosys_linux.go +++ b/libcontainer/seccomp/patchbpf/enosys_linux.go @@ -224,16 +224,30 @@ type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSy // representation, but SCMP_ARCH_X32 means we have to track cases where the // same architecture has different largest syscalls based on the mode. func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) { - lastSyscalls := make(lastSyscallMap) - // Only loop over architectures which are present in the filter. Any other - // architectures will get the libseccomp bad architecture action anyway. + scmpArchs := make(map[libseccomp.ScmpArch]struct{}) for _, ociArch := range config.Architectures { arch, err := libseccomp.GetArchFromString(ociArch) if err != nil { return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err) } + scmpArchs[arch] = struct{}{} + } + // On architectures like ppc64le, Docker inexplicably doesn't include the + // native architecture in the architecture list which results in no + // architectures being present in the list at all (rendering the ENOSYS + // stub a no-op). So, always include the native architecture. + if nativeScmpArch, err := libseccomp.GetNativeArch(); err != nil { + return nil, fmt.Errorf("unable to get native arch: %w", err) + } else if _, ok := scmpArchs[nativeScmpArch]; !ok { + logrus.Debugf("seccomp: adding implied native architecture %v to config set", nativeScmpArch) + scmpArchs[nativeScmpArch] = struct{}{} + } + logrus.Debugf("seccomp: configured architecture set: %s", scmpArchs) - // Figure out native architecture representation of the architecture. + // Only loop over architectures which are present in the filter. Any other + // architectures will get the libseccomp bad architecture action anyway. + lastSyscalls := make(lastSyscallMap) + for arch := range scmpArchs { auditArch, err := scmpArchToAuditArch(arch) if err != nil { return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err) diff --git a/libcontainer/seccomp/patchbpf/enosys_linux_test.go b/libcontainer/seccomp/patchbpf/enosys_linux_test.go index bdfeff68adb..3d442e1daa6 100644 --- a/libcontainer/seccomp/patchbpf/enosys_linux_test.go +++ b/libcontainer/seccomp/patchbpf/enosys_linux_test.go @@ -12,6 +12,7 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" libseccomp "github.com/seccomp/libseccomp-golang" + "github.com/sirupsen/logrus" "golang.org/x/net/bpf" ) @@ -105,6 +106,18 @@ var testArches = []string{ "ppc64le", "s390", "s390x", + // Dummy value to indicate a configuration with no architecture specified. + "native", +} + +var nativeArch string + +func init() { + scmpNativeArch, err := libseccomp.GetNativeArch() + if err != nil { + logrus.Panicf("get native arch: %v", err) + } + nativeArch = scmpNativeArch.String() } func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) { @@ -155,6 +168,9 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) expected uint32 } + if arch == "native" { + arch = nativeArch + } scmpArch, err := libseccomp.GetArchFromString(arch) if err != nil { t.Fatalf("unknown libseccomp architecture %q: %v", arch, err) @@ -228,8 +244,15 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) // Test syscalls in the explicit list. for _, test := range syscallTests { - // Override the expected value in the two special cases. - if !archSet[arch] || isAllowAction(defaultAction) { + // Override the expected value in the two special cases: + // 1. If the default action is allow, the filter won't have + // the stub prepended so we expect a fallthrough. + // 2. If the executing architecture is not in the architecture + // set, then the architecture is not handled by the stub -- + // *except* in the case of the native architecture (which + // is always included in the stub). + if isAllowAction(defaultAction) || + (!archSet[arch] && arch != nativeArch) { test.expected = retFallthrough } @@ -263,7 +286,14 @@ var testActions = map[string]configs.Action{ func TestEnosysStub_SingleArch(t *testing.T) { for _, arch := range testArches { - arches := []string{arch} + var arches []string + // "native" indicates a blank architecture field for seccomp, to test + // the case where the running architecture was not included in the + // architecture. Docker doesn't always set the architecture for some + // reason (namely for ppc64le). + if arch != "native" { + arches = append(arches, arch) + } t.Run("arch="+arch, func(t *testing.T) { for name, action := range testActions { t.Run("action="+name, func(t *testing.T) { @@ -277,7 +307,16 @@ func TestEnosysStub_SingleArch(t *testing.T) { func TestEnosysStub_MultiArch(t *testing.T) { for end := 0; end < len(testArches); end++ { for start := 0; start < end; start++ { - arches := testArches[start:end] + var arches []string + for _, arch := range testArches[start:end] { + // "native" indicates a blank architecture field for seccomp, to test + // the case where the running architecture was not included in the + // architecture. Docker doesn't always set the architecture for some + // reason (namely for ppc64le). + if arch != "native" { + arches = append(arches, arch) + } + } if len(arches) <= 1 { continue }