diff --git a/Gopkg.lock b/Gopkg.lock index be64f4de6..0c3173d14 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -73,6 +73,20 @@ revision = "ba2c2ddd89069b46a7011d4106f6868f17ee1705" version = "v3.6.1" +[[projects]] + branch = "master" + digest = "1:d219d59fcf68d1e8484fb038717cf911c60f70abfe053ef1f56bdeb6c121df7b" + name = "github.com/cilium/ebpf" + packages = [ + ".", + "asm", + "internal", + "internal/btf", + "internal/unix", + ] + pruneopts = "NUT" + revision = "c8f8abaa9ece88e9be5b888e23e130e726d9afa4" + [[projects]] branch = "master" digest = "1:8ecb89af7dfe3ac401bdb0c9390b134ef96a97e85f732d2b0604fb7b3977839f" @@ -469,20 +483,23 @@ version = "v1.0.0-rc1" [[projects]] - digest = "1:e79493df9a0c200099f954b9976e2ad0ab50cbae0bff2fdbd5e837f40221999f" + digest = "1:a41704b63041a102a7435a253ee9498b1f3b2a5806c134e8c279734472e773b4" name = "github.com/opencontainers/runc" packages = [ "libcontainer/cgroups", + "libcontainer/cgroups/ebpf", + "libcontainer/cgroups/ebpf/devicefilter", "libcontainer/cgroups/fs", "libcontainer/cgroups/systemd", "libcontainer/configs", "libcontainer/seccomp", "libcontainer/specconv", "libcontainer/system", + "libcontainer/user", "libcontainer/utils", ] pruneopts = "NUT" - revision = "0351df1c5a66838d0c392b4ac4cf9450de844e2d" + revision = "2b52db75279ca687e18156de86d845876e9ef35d" [[projects]] digest = "1:7a58202c5cdf3d2c1eb0621fe369315561cea7f036ad10f0f0479ac36bcc95eb" diff --git a/Gopkg.toml b/Gopkg.toml index cc0925a98..3a55e652e 100644 --- a/Gopkg.toml +++ b/Gopkg.toml @@ -16,7 +16,7 @@ [[constraint]] name = "github.com/opencontainers/runc" - revision = "0351df1c5a66838d0c392b4ac4cf9450de844e2d" + revision = "2b52db75279ca687e18156de86d845876e9ef35d" [[constraint]] name = "github.com/opencontainers/runtime-spec" diff --git a/vendor/github.com/cilium/ebpf/LICENSE b/vendor/github.com/cilium/ebpf/LICENSE new file mode 100644 index 000000000..c637ae99c --- /dev/null +++ b/vendor/github.com/cilium/ebpf/LICENSE @@ -0,0 +1,23 @@ +MIT License + 
+Copyright (c) 2017 Nathan Sweet +Copyright (c) 2018, 2019 Cloudflare +Copyright (c) 2019 Authors of Cilium + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/cilium/ebpf/abi.go b/vendor/github.com/cilium/ebpf/abi.go new file mode 100644 index 000000000..77227ffda --- /dev/null +++ b/vendor/github.com/cilium/ebpf/abi.go @@ -0,0 +1,204 @@ +package ebpf + +import ( + "bufio" + "bytes" + "fmt" + "io" + "os" + "syscall" + + "github.com/cilium/ebpf/internal" + + "github.com/pkg/errors" +) + +// MapABI are the attributes of a Map which are available across all supported kernels. 
+type MapABI struct { + Type MapType + KeySize uint32 + ValueSize uint32 + MaxEntries uint32 + Flags uint32 +} + +func newMapABIFromSpec(spec *MapSpec) *MapABI { + return &MapABI{ + spec.Type, + spec.KeySize, + spec.ValueSize, + spec.MaxEntries, + spec.Flags, + } +} + +func newMapABIFromFd(fd *internal.FD) (string, *MapABI, error) { + info, err := bpfGetMapInfoByFD(fd) + if err != nil { + if errors.Cause(err) == syscall.EINVAL { + abi, err := newMapABIFromProc(fd) + return "", abi, err + } + return "", nil, err + } + + return "", &MapABI{ + MapType(info.mapType), + info.keySize, + info.valueSize, + info.maxEntries, + info.flags, + }, nil +} + +func newMapABIFromProc(fd *internal.FD) (*MapABI, error) { + var abi MapABI + err := scanFdInfo(fd, map[string]interface{}{ + "map_type": &abi.Type, + "key_size": &abi.KeySize, + "value_size": &abi.ValueSize, + "max_entries": &abi.MaxEntries, + "map_flags": &abi.Flags, + }) + if err != nil { + return nil, err + } + return &abi, nil +} + +// Equal returns true if two ABIs have the same values. +func (abi *MapABI) Equal(other *MapABI) bool { + switch { + case abi.Type != other.Type: + return false + case abi.KeySize != other.KeySize: + return false + case abi.ValueSize != other.ValueSize: + return false + case abi.MaxEntries != other.MaxEntries: + return false + case abi.Flags != other.Flags: + return false + default: + return true + } +} + +// ProgramABI are the attributes of a Program which are available across all supported kernels. 
+type ProgramABI struct { + Type ProgramType +} + +func newProgramABIFromSpec(spec *ProgramSpec) *ProgramABI { + return &ProgramABI{ + spec.Type, + } +} + +func newProgramABIFromFd(fd *internal.FD) (string, *ProgramABI, error) { + info, err := bpfGetProgInfoByFD(fd) + if err != nil { + if errors.Cause(err) == syscall.EINVAL { + return newProgramABIFromProc(fd) + } + + return "", nil, err + } + + var name string + if bpfName := internal.CString(info.name[:]); bpfName != "" { + name = bpfName + } else { + name = internal.CString(info.tag[:]) + } + + return name, &ProgramABI{ + Type: ProgramType(info.progType), + }, nil +} + +func newProgramABIFromProc(fd *internal.FD) (string, *ProgramABI, error) { + var ( + abi ProgramABI + name string + ) + + err := scanFdInfo(fd, map[string]interface{}{ + "prog_type": &abi.Type, + "prog_tag": &name, + }) + if errors.Cause(err) == errMissingFields { + return "", nil, &internal.UnsupportedFeatureError{ + Name: "reading ABI from /proc/self/fdinfo", + MinimumVersion: internal.Version{4, 11, 0}, + } + } + if err != nil { + return "", nil, err + } + + return name, &abi, nil +} + +func scanFdInfo(fd *internal.FD, fields map[string]interface{}) error { + raw, err := fd.Value() + if err != nil { + return err + } + + fh, err := os.Open(fmt.Sprintf("/proc/self/fdinfo/%d", raw)) + if err != nil { + return err + } + defer fh.Close() + + return errors.Wrap(scanFdInfoReader(fh, fields), fh.Name()) +} + +var errMissingFields = errors.New("missing fields") + +func scanFdInfoReader(r io.Reader, fields map[string]interface{}) error { + var ( + scanner = bufio.NewScanner(r) + scanned int + ) + + for scanner.Scan() { + parts := bytes.SplitN(scanner.Bytes(), []byte("\t"), 2) + if len(parts) != 2 { + continue + } + + name := bytes.TrimSuffix(parts[0], []byte(":")) + field, ok := fields[string(name)] + if !ok { + continue + } + + if n, err := fmt.Fscanln(bytes.NewReader(parts[1]), field); err != nil || n != 1 { + return errors.Wrapf(err, "can't parse 
field %s", name) + } + + scanned++ + } + + if err := scanner.Err(); err != nil { + return err + } + + if scanned != len(fields) { + return errMissingFields + } + + return nil +} + +// Equal returns true if two ABIs have the same values. +func (abi *ProgramABI) Equal(other *ProgramABI) bool { + switch { + case abi.Type != other.Type: + return false + default: + return true + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/alu.go b/vendor/github.com/cilium/ebpf/asm/alu.go new file mode 100644 index 000000000..70ccc4d15 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/alu.go @@ -0,0 +1,149 @@ +package asm + +//go:generate stringer -output alu_string.go -type=Source,Endianness,ALUOp + +// Source of ALU / ALU64 / Branch operations +// +// msb lsb +// +----+-+---+ +// |op |S|cls| +// +----+-+---+ +type Source uint8 + +const sourceMask OpCode = 0x08 + +// Source bitmask +const ( + // InvalidSource is returned by getters when invoked + // on non ALU / branch OpCodes. + InvalidSource Source = 0xff + // ImmSource src is from constant + ImmSource Source = 0x00 + // RegSource src is from register + RegSource Source = 0x08 +) + +// The Endianness of a byte swap instruction. 
+type Endianness uint8 + +const endianMask = sourceMask + +// Endian flags +const ( + InvalidEndian Endianness = 0xff + // Convert to little endian + LE Endianness = 0x00 + // Convert to big endian + BE Endianness = 0x08 +) + +// ALUOp are ALU / ALU64 operations +// +// msb lsb +// +----+-+---+ +// |OP |s|cls| +// +----+-+---+ +type ALUOp uint8 + +const aluMask OpCode = 0xf0 + +const ( + // InvalidALUOp is returned by getters when invoked + // on non ALU OpCodes + InvalidALUOp ALUOp = 0xff + // Add - addition + Add ALUOp = 0x00 + // Sub - subtraction + Sub ALUOp = 0x10 + // Mul - multiplication + Mul ALUOp = 0x20 + // Div - division + Div ALUOp = 0x30 + // Or - bitwise or + Or ALUOp = 0x40 + // And - bitwise and + And ALUOp = 0x50 + // LSh - bitwise shift left + LSh ALUOp = 0x60 + // RSh - bitwise shift right + RSh ALUOp = 0x70 + // Neg - sign/unsign signing bit + Neg ALUOp = 0x80 + // Mod - modulo + Mod ALUOp = 0x90 + // Xor - bitwise xor + Xor ALUOp = 0xa0 + // Mov - move value from one place to another + Mov ALUOp = 0xb0 + // ArSh - arithmatic shift + ArSh ALUOp = 0xc0 + // Swap - endian conversions + Swap ALUOp = 0xd0 +) + +// HostTo converts from host to another endianness. +func HostTo(endian Endianness, dst Register, size Size) Instruction { + var imm int64 + switch size { + case Half: + imm = 16 + case Word: + imm = 32 + case DWord: + imm = 64 + default: + return Instruction{OpCode: InvalidOpCode} + } + + return Instruction{ + OpCode: OpCode(ALUClass).SetALUOp(Swap).SetSource(Source(endian)), + Dst: dst, + Constant: imm, + } +} + +// Op returns the OpCode for an ALU operation with a given source. +func (op ALUOp) Op(source Source) OpCode { + return OpCode(ALU64Class).SetALUOp(op).SetSource(source) +} + +// Reg emits `dst (op) src`. +func (op ALUOp) Reg(dst, src Register) Instruction { + return Instruction{ + OpCode: op.Op(RegSource), + Dst: dst, + Src: src, + } +} + +// Imm emits `dst (op) value`. 
+func (op ALUOp) Imm(dst Register, value int32) Instruction { + return Instruction{ + OpCode: op.Op(ImmSource), + Dst: dst, + Constant: int64(value), + } +} + +// Op32 returns the OpCode for a 32-bit ALU operation with a given source. +func (op ALUOp) Op32(source Source) OpCode { + return OpCode(ALUClass).SetALUOp(op).SetSource(source) +} + +// Reg32 emits `dst (op) src`, zeroing the upper 32 bit of dst. +func (op ALUOp) Reg32(dst, src Register) Instruction { + return Instruction{ + OpCode: op.Op32(RegSource), + Dst: dst, + Src: src, + } +} + +// Imm32 emits `dst (op) value`, zeroing the upper 32 bit of dst. +func (op ALUOp) Imm32(dst Register, value int32) Instruction { + return Instruction{ + OpCode: op.Op32(ImmSource), + Dst: dst, + Constant: int64(value), + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/alu_string.go b/vendor/github.com/cilium/ebpf/asm/alu_string.go new file mode 100644 index 000000000..72d3fe629 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/alu_string.go @@ -0,0 +1,107 @@ +// Code generated by "stringer -output alu_string.go -type=Source,Endianness,ALUOp"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidSource-255] + _ = x[ImmSource-0] + _ = x[RegSource-8] +} + +const ( + _Source_name_0 = "ImmSource" + _Source_name_1 = "RegSource" + _Source_name_2 = "InvalidSource" +) + +func (i Source) String() string { + switch { + case i == 0: + return _Source_name_0 + case i == 8: + return _Source_name_1 + case i == 255: + return _Source_name_2 + default: + return "Source(" + strconv.FormatInt(int64(i), 10) + ")" + } +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
+ var x [1]struct{} + _ = x[InvalidEndian-255] + _ = x[LE-0] + _ = x[BE-8] +} + +const ( + _Endianness_name_0 = "LE" + _Endianness_name_1 = "BE" + _Endianness_name_2 = "InvalidEndian" +) + +func (i Endianness) String() string { + switch { + case i == 0: + return _Endianness_name_0 + case i == 8: + return _Endianness_name_1 + case i == 255: + return _Endianness_name_2 + default: + return "Endianness(" + strconv.FormatInt(int64(i), 10) + ")" + } +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidALUOp-255] + _ = x[Add-0] + _ = x[Sub-16] + _ = x[Mul-32] + _ = x[Div-48] + _ = x[Or-64] + _ = x[And-80] + _ = x[LSh-96] + _ = x[RSh-112] + _ = x[Neg-128] + _ = x[Mod-144] + _ = x[Xor-160] + _ = x[Mov-176] + _ = x[ArSh-192] + _ = x[Swap-208] +} + +const _ALUOp_name = "AddSubMulDivOrAndLShRShNegModXorMovArShSwapInvalidALUOp" + +var _ALUOp_map = map[ALUOp]string{ + 0: _ALUOp_name[0:3], + 16: _ALUOp_name[3:6], + 32: _ALUOp_name[6:9], + 48: _ALUOp_name[9:12], + 64: _ALUOp_name[12:14], + 80: _ALUOp_name[14:17], + 96: _ALUOp_name[17:20], + 112: _ALUOp_name[20:23], + 128: _ALUOp_name[23:26], + 144: _ALUOp_name[26:29], + 160: _ALUOp_name[29:32], + 176: _ALUOp_name[32:35], + 192: _ALUOp_name[35:39], + 208: _ALUOp_name[39:43], + 255: _ALUOp_name[43:55], +} + +func (i ALUOp) String() string { + if str, ok := _ALUOp_map[i]; ok { + return str + } + return "ALUOp(" + strconv.FormatInt(int64(i), 10) + ")" +} diff --git a/vendor/github.com/cilium/ebpf/asm/doc.go b/vendor/github.com/cilium/ebpf/asm/doc.go new file mode 100644 index 000000000..7031bdc27 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/doc.go @@ -0,0 +1,2 @@ +// Package asm is an assembler for eBPF bytecode. 
+package asm diff --git a/vendor/github.com/cilium/ebpf/asm/func.go b/vendor/github.com/cilium/ebpf/asm/func.go new file mode 100644 index 000000000..97f794cdb --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/func.go @@ -0,0 +1,143 @@ +package asm + +//go:generate stringer -output func_string.go -type=BuiltinFunc + +// BuiltinFunc is a built-in eBPF function. +type BuiltinFunc int32 + +// eBPF built-in functions +// +// You can renegerate this list using the following gawk script: +// +// /FN\(.+\),/ { +// match($1, /\((.+)\)/, r) +// split(r[1], p, "_") +// printf "Fn" +// for (i in p) { +// printf "%s%s", toupper(substr(p[i], 1, 1)), substr(p[i], 2) +// } +// print "" +// } +// +// The script expects include/uapi/linux/bpf.h as it's input. +const ( + FnUnspec BuiltinFunc = iota + FnMapLookupElem + FnMapUpdateElem + FnMapDeleteElem + FnProbeRead + FnKtimeGetNs + FnTracePrintk + FnGetPrandomU32 + FnGetSmpProcessorId + FnSkbStoreBytes + FnL3CsumReplace + FnL4CsumReplace + FnTailCall + FnCloneRedirect + FnGetCurrentPidTgid + FnGetCurrentUidGid + FnGetCurrentComm + FnGetCgroupClassid + FnSkbVlanPush + FnSkbVlanPop + FnSkbGetTunnelKey + FnSkbSetTunnelKey + FnPerfEventRead + FnRedirect + FnGetRouteRealm + FnPerfEventOutput + FnSkbLoadBytes + FnGetStackid + FnCsumDiff + FnSkbGetTunnelOpt + FnSkbSetTunnelOpt + FnSkbChangeProto + FnSkbChangeType + FnSkbUnderCgroup + FnGetHashRecalc + FnGetCurrentTask + FnProbeWriteUser + FnCurrentTaskUnderCgroup + FnSkbChangeTail + FnSkbPullData + FnCsumUpdate + FnSetHashInvalid + FnGetNumaNodeId + FnSkbChangeHead + FnXdpAdjustHead + FnProbeReadStr + FnGetSocketCookie + FnGetSocketUid + FnSetHash + FnSetsockopt + FnSkbAdjustRoom + FnRedirectMap + FnSkRedirectMap + FnSockMapUpdate + FnXdpAdjustMeta + FnPerfEventReadValue + FnPerfProgReadValue + FnGetsockopt + FnOverrideReturn + FnSockOpsCbFlagsSet + FnMsgRedirectMap + FnMsgApplyBytes + FnMsgCorkBytes + FnMsgPullData + FnBind + FnXdpAdjustTail + FnSkbGetXfrmState + FnGetStack + 
FnSkbLoadBytesRelative + FnFibLookup + FnSockHashUpdate + FnMsgRedirectHash + FnSkRedirectHash + FnLwtPushEncap + FnLwtSeg6StoreBytes + FnLwtSeg6AdjustSrh + FnLwtSeg6Action + FnRcRepeat + FnRcKeydown + FnSkbCgroupId + FnGetCurrentCgroupId + FnGetLocalStorage + FnSkSelectReuseport + FnSkbAncestorCgroupId + FnSkLookupTcp + FnSkLookupUdp + FnSkRelease + FnMapPushElem + FnMapPopElem + FnMapPeekElem + FnMsgPushData + FnMsgPopData + FnRcPointerRel + FnSpinLock + FnSpinUnlock + FnSkFullsock + FnTcpSock + FnSkbEcnSetCe + FnGetListenerSock + FnSkcLookupTcp + FnTcpCheckSyncookie + FnSysctlGetName + FnSysctlGetCurrentValue + FnSysctlGetNewValue + FnSysctlSetNewValue + FnStrtol + FnStrtoul + FnSkStorageGet + FnSkStorageDelete + FnSendSignal + FnTcpGenSyncookie +) + +// Call emits a function call. +func (fn BuiltinFunc) Call() Instruction { + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(Call), + Constant: int64(fn), + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/func_string.go b/vendor/github.com/cilium/ebpf/asm/func_string.go new file mode 100644 index 000000000..8860b9fdb --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/func_string.go @@ -0,0 +1,133 @@ +// Code generated by "stringer -output func_string.go -type=BuiltinFunc"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
+ var x [1]struct{} + _ = x[FnUnspec-0] + _ = x[FnMapLookupElem-1] + _ = x[FnMapUpdateElem-2] + _ = x[FnMapDeleteElem-3] + _ = x[FnProbeRead-4] + _ = x[FnKtimeGetNs-5] + _ = x[FnTracePrintk-6] + _ = x[FnGetPrandomU32-7] + _ = x[FnGetSmpProcessorId-8] + _ = x[FnSkbStoreBytes-9] + _ = x[FnL3CsumReplace-10] + _ = x[FnL4CsumReplace-11] + _ = x[FnTailCall-12] + _ = x[FnCloneRedirect-13] + _ = x[FnGetCurrentPidTgid-14] + _ = x[FnGetCurrentUidGid-15] + _ = x[FnGetCurrentComm-16] + _ = x[FnGetCgroupClassid-17] + _ = x[FnSkbVlanPush-18] + _ = x[FnSkbVlanPop-19] + _ = x[FnSkbGetTunnelKey-20] + _ = x[FnSkbSetTunnelKey-21] + _ = x[FnPerfEventRead-22] + _ = x[FnRedirect-23] + _ = x[FnGetRouteRealm-24] + _ = x[FnPerfEventOutput-25] + _ = x[FnSkbLoadBytes-26] + _ = x[FnGetStackid-27] + _ = x[FnCsumDiff-28] + _ = x[FnSkbGetTunnelOpt-29] + _ = x[FnSkbSetTunnelOpt-30] + _ = x[FnSkbChangeProto-31] + _ = x[FnSkbChangeType-32] + _ = x[FnSkbUnderCgroup-33] + _ = x[FnGetHashRecalc-34] + _ = x[FnGetCurrentTask-35] + _ = x[FnProbeWriteUser-36] + _ = x[FnCurrentTaskUnderCgroup-37] + _ = x[FnSkbChangeTail-38] + _ = x[FnSkbPullData-39] + _ = x[FnCsumUpdate-40] + _ = x[FnSetHashInvalid-41] + _ = x[FnGetNumaNodeId-42] + _ = x[FnSkbChangeHead-43] + _ = x[FnXdpAdjustHead-44] + _ = x[FnProbeReadStr-45] + _ = x[FnGetSocketCookie-46] + _ = x[FnGetSocketUid-47] + _ = x[FnSetHash-48] + _ = x[FnSetsockopt-49] + _ = x[FnSkbAdjustRoom-50] + _ = x[FnRedirectMap-51] + _ = x[FnSkRedirectMap-52] + _ = x[FnSockMapUpdate-53] + _ = x[FnXdpAdjustMeta-54] + _ = x[FnPerfEventReadValue-55] + _ = x[FnPerfProgReadValue-56] + _ = x[FnGetsockopt-57] + _ = x[FnOverrideReturn-58] + _ = x[FnSockOpsCbFlagsSet-59] + _ = x[FnMsgRedirectMap-60] + _ = x[FnMsgApplyBytes-61] + _ = x[FnMsgCorkBytes-62] + _ = x[FnMsgPullData-63] + _ = x[FnBind-64] + _ = x[FnXdpAdjustTail-65] + _ = x[FnSkbGetXfrmState-66] + _ = x[FnGetStack-67] + _ = x[FnSkbLoadBytesRelative-68] + _ = x[FnFibLookup-69] + _ = x[FnSockHashUpdate-70] + _ = 
x[FnMsgRedirectHash-71] + _ = x[FnSkRedirectHash-72] + _ = x[FnLwtPushEncap-73] + _ = x[FnLwtSeg6StoreBytes-74] + _ = x[FnLwtSeg6AdjustSrh-75] + _ = x[FnLwtSeg6Action-76] + _ = x[FnRcRepeat-77] + _ = x[FnRcKeydown-78] + _ = x[FnSkbCgroupId-79] + _ = x[FnGetCurrentCgroupId-80] + _ = x[FnGetLocalStorage-81] + _ = x[FnSkSelectReuseport-82] + _ = x[FnSkbAncestorCgroupId-83] + _ = x[FnSkLookupTcp-84] + _ = x[FnSkLookupUdp-85] + _ = x[FnSkRelease-86] + _ = x[FnMapPushElem-87] + _ = x[FnMapPopElem-88] + _ = x[FnMapPeekElem-89] + _ = x[FnMsgPushData-90] + _ = x[FnMsgPopData-91] + _ = x[FnRcPointerRel-92] + _ = x[FnSpinLock-93] + _ = x[FnSpinUnlock-94] + _ = x[FnSkFullsock-95] + _ = x[FnTcpSock-96] + _ = x[FnSkbEcnSetCe-97] + _ = x[FnGetListenerSock-98] + _ = x[FnSkcLookupTcp-99] + _ = x[FnTcpCheckSyncookie-100] + _ = x[FnSysctlGetName-101] + _ = x[FnSysctlGetCurrentValue-102] + _ = x[FnSysctlGetNewValue-103] + _ = x[FnSysctlSetNewValue-104] + _ = x[FnStrtol-105] + _ = x[FnStrtoul-106] + _ = x[FnSkStorageGet-107] + _ = x[FnSkStorageDelete-108] + _ = x[FnSendSignal-109] + _ = x[FnTcpGenSyncookie-110] +} + +const _BuiltinFunc_name = 
"FnUnspecFnMapLookupElemFnMapUpdateElemFnMapDeleteElemFnProbeReadFnKtimeGetNsFnTracePrintkFnGetPrandomU32FnGetSmpProcessorIdFnSkbStoreBytesFnL3CsumReplaceFnL4CsumReplaceFnTailCallFnCloneRedirectFnGetCurrentPidTgidFnGetCurrentUidGidFnGetCurrentCommFnGetCgroupClassidFnSkbVlanPushFnSkbVlanPopFnSkbGetTunnelKeyFnSkbSetTunnelKeyFnPerfEventReadFnRedirectFnGetRouteRealmFnPerfEventOutputFnSkbLoadBytesFnGetStackidFnCsumDiffFnSkbGetTunnelOptFnSkbSetTunnelOptFnSkbChangeProtoFnSkbChangeTypeFnSkbUnderCgroupFnGetHashRecalcFnGetCurrentTaskFnProbeWriteUserFnCurrentTaskUnderCgroupFnSkbChangeTailFnSkbPullDataFnCsumUpdateFnSetHashInvalidFnGetNumaNodeIdFnSkbChangeHeadFnXdpAdjustHeadFnProbeReadStrFnGetSocketCookieFnGetSocketUidFnSetHashFnSetsockoptFnSkbAdjustRoomFnRedirectMapFnSkRedirectMapFnSockMapUpdateFnXdpAdjustMetaFnPerfEventReadValueFnPerfProgReadValueFnGetsockoptFnOverrideReturnFnSockOpsCbFlagsSetFnMsgRedirectMapFnMsgApplyBytesFnMsgCorkBytesFnMsgPullDataFnBindFnXdpAdjustTailFnSkbGetXfrmStateFnGetStackFnSkbLoadBytesRelativeFnFibLookupFnSockHashUpdateFnMsgRedirectHashFnSkRedirectHashFnLwtPushEncapFnLwtSeg6StoreBytesFnLwtSeg6AdjustSrhFnLwtSeg6ActionFnRcRepeatFnRcKeydownFnSkbCgroupIdFnGetCurrentCgroupIdFnGetLocalStorageFnSkSelectReuseportFnSkbAncestorCgroupIdFnSkLookupTcpFnSkLookupUdpFnSkReleaseFnMapPushElemFnMapPopElemFnMapPeekElemFnMsgPushDataFnMsgPopDataFnRcPointerRelFnSpinLockFnSpinUnlockFnSkFullsockFnTcpSockFnSkbEcnSetCeFnGetListenerSockFnSkcLookupTcpFnTcpCheckSyncookieFnSysctlGetNameFnSysctlGetCurrentValueFnSysctlGetNewValueFnSysctlSetNewValueFnStrtolFnStrtoulFnSkStorageGetFnSkStorageDeleteFnSendSignalFnTcpGenSyncookie" + +var _BuiltinFunc_index = [...]uint16{0, 8, 23, 38, 53, 64, 76, 89, 104, 123, 138, 153, 168, 178, 193, 212, 230, 246, 264, 277, 289, 306, 323, 338, 348, 363, 380, 394, 406, 416, 433, 450, 466, 481, 497, 512, 528, 544, 568, 583, 596, 608, 624, 639, 654, 669, 683, 700, 714, 723, 735, 750, 763, 778, 793, 808, 828, 847, 859, 875, 894, 910, 925, 939, 952, 958, 973, 
990, 1000, 1022, 1033, 1049, 1066, 1082, 1096, 1115, 1133, 1148, 1158, 1169, 1182, 1202, 1219, 1238, 1259, 1272, 1285, 1296, 1309, 1321, 1334, 1347, 1359, 1373, 1383, 1395, 1407, 1416, 1429, 1446, 1460, 1479, 1494, 1517, 1536, 1555, 1563, 1572, 1586, 1603, 1615, 1632} + +func (i BuiltinFunc) String() string { + if i < 0 || i >= BuiltinFunc(len(_BuiltinFunc_index)-1) { + return "BuiltinFunc(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _BuiltinFunc_name[_BuiltinFunc_index[i]:_BuiltinFunc_index[i+1]] +} diff --git a/vendor/github.com/cilium/ebpf/asm/instruction.go b/vendor/github.com/cilium/ebpf/asm/instruction.go new file mode 100644 index 000000000..c8ed6cfb4 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/instruction.go @@ -0,0 +1,416 @@ +package asm + +import ( + "encoding/binary" + "fmt" + "io" + "math" + "strings" + + "github.com/pkg/errors" +) + +// InstructionSize is the size of a BPF instruction in bytes +const InstructionSize = 8 + +// Instruction is a single eBPF instruction. +type Instruction struct { + OpCode OpCode + Dst Register + Src Register + Offset int16 + Constant int64 + Reference string + Symbol string +} + +// Sym creates a symbol. +func (ins Instruction) Sym(name string) Instruction { + ins.Symbol = name + return ins +} + +// Unmarshal decodes a BPF instruction. 
+func (ins *Instruction) Unmarshal(r io.Reader, bo binary.ByteOrder) (uint64, error) { + var bi bpfInstruction + err := binary.Read(r, bo, &bi) + if err != nil { + return 0, err + } + + ins.OpCode = bi.OpCode + ins.Dst = bi.Registers.Dst() + ins.Src = bi.Registers.Src() + ins.Offset = bi.Offset + ins.Constant = int64(bi.Constant) + + if !bi.OpCode.isDWordLoad() { + return InstructionSize, nil + } + + var bi2 bpfInstruction + if err := binary.Read(r, bo, &bi2); err != nil { + // No Wrap, to avoid io.EOF clash + return 0, errors.New("64bit immediate is missing second half") + } + if bi2.OpCode != 0 || bi2.Offset != 0 || bi2.Registers != 0 { + return 0, errors.New("64bit immediate has non-zero fields") + } + ins.Constant = int64(uint64(uint32(bi2.Constant))<<32 | uint64(uint32(bi.Constant))) + + return 2 * InstructionSize, nil +} + +// Marshal encodes a BPF instruction. +func (ins Instruction) Marshal(w io.Writer, bo binary.ByteOrder) (uint64, error) { + if ins.OpCode == InvalidOpCode { + return 0, errors.New("invalid opcode") + } + + isDWordLoad := ins.OpCode.isDWordLoad() + + cons := int32(ins.Constant) + if isDWordLoad { + // Encode least significant 32bit first for 64bit operations. + cons = int32(uint32(ins.Constant)) + } + + bpfi := bpfInstruction{ + ins.OpCode, + newBPFRegisters(ins.Dst, ins.Src), + ins.Offset, + cons, + } + + if err := binary.Write(w, bo, &bpfi); err != nil { + return 0, err + } + + if !isDWordLoad { + return InstructionSize, nil + } + + bpfi = bpfInstruction{ + Constant: int32(ins.Constant >> 32), + } + + if err := binary.Write(w, bo, &bpfi); err != nil { + return 0, err + } + + return 2 * InstructionSize, nil +} + +// RewriteMapPtr changes an instruction to use a new map fd. +// +// Returns an error if the fd is invalid, or the instruction +// is incorrect. 
+func (ins *Instruction) RewriteMapPtr(fd int) error { + if !ins.OpCode.isDWordLoad() { + return errors.Errorf("%s is not a 64 bit load", ins.OpCode) + } + + if fd < 0 { + return errors.New("invalid fd") + } + + ins.Src = R1 + ins.Constant = int64(fd) + return nil +} + +// Format implements fmt.Formatter. +func (ins Instruction) Format(f fmt.State, c rune) { + if c != 'v' { + fmt.Fprintf(f, "{UNRECOGNIZED: %c}", c) + return + } + + op := ins.OpCode + + if op == InvalidOpCode { + fmt.Fprint(f, "INVALID") + return + } + + // Omit trailing space for Exit + if op.JumpOp() == Exit { + fmt.Fprint(f, op) + return + } + + fmt.Fprintf(f, "%v ", op) + switch cls := op.Class(); cls { + case LdClass, LdXClass, StClass, StXClass: + switch op.Mode() { + case ImmMode: + fmt.Fprintf(f, "dst: %s imm: %d", ins.Dst, ins.Constant) + case AbsMode: + fmt.Fprintf(f, "imm: %d", ins.Constant) + case IndMode: + fmt.Fprintf(f, "dst: %s src: %s imm: %d", ins.Dst, ins.Src, ins.Constant) + case MemMode: + fmt.Fprintf(f, "dst: %s src: %s off: %d imm: %d", ins.Dst, ins.Src, ins.Offset, ins.Constant) + case XAddMode: + fmt.Fprintf(f, "dst: %s src: %s", ins.Dst, ins.Src) + } + + case ALU64Class, ALUClass: + fmt.Fprintf(f, "dst: %s ", ins.Dst) + if op.ALUOp() == Swap || op.Source() == ImmSource { + fmt.Fprintf(f, "imm: %d", ins.Constant) + } else { + fmt.Fprintf(f, "src: %s", ins.Src) + } + + case JumpClass: + switch jop := op.JumpOp(); jop { + case Call: + if ins.Src == R1 { + // bpf-to-bpf call + fmt.Fprint(f, ins.Constant) + } else { + fmt.Fprint(f, BuiltinFunc(ins.Constant)) + } + + default: + fmt.Fprintf(f, "dst: %s off: %d ", ins.Dst, ins.Offset) + if op.Source() == ImmSource { + fmt.Fprintf(f, "imm: %d", ins.Constant) + } else { + fmt.Fprintf(f, "src: %s", ins.Src) + } + } + } + + if ins.Reference != "" { + fmt.Fprintf(f, " <%s>", ins.Reference) + } +} + +// Instructions is an eBPF program. 
+type Instructions []Instruction + +func (insns Instructions) String() string { + return fmt.Sprint(insns) +} + +// RewriteMapPtr rewrites all loads of a specific map pointer to a new fd. +// +// Returns an error if the symbol isn't used, see IsUnreferencedSymbol. +func (insns Instructions) RewriteMapPtr(symbol string, fd int) error { + if symbol == "" { + return errors.New("empty symbol") + } + + found := false + for i := range insns { + ins := &insns[i] + if ins.Reference != symbol { + continue + } + + if err := ins.RewriteMapPtr(fd); err != nil { + return err + } + + found = true + } + + if !found { + return &unreferencedSymbolError{symbol} + } + + return nil +} + +// SymbolOffsets returns the set of symbols and their offset in +// the instructions. +func (insns Instructions) SymbolOffsets() (map[string]int, error) { + offsets := make(map[string]int) + + for i, ins := range insns { + if ins.Symbol == "" { + continue + } + + if _, ok := offsets[ins.Symbol]; ok { + return nil, errors.Errorf("duplicate symbol %s", ins.Symbol) + } + + offsets[ins.Symbol] = i + } + + return offsets, nil +} + +// ReferenceOffsets returns the set of references and their offset in +// the instructions. +func (insns Instructions) ReferenceOffsets() map[string][]int { + offsets := make(map[string][]int) + + for i, ins := range insns { + if ins.Reference == "" { + continue + } + + offsets[ins.Reference] = append(offsets[ins.Reference], i) + } + + return offsets +} + +func (insns Instructions) marshalledOffsets() (map[string]int, error) { + symbols := make(map[string]int) + + marshalledPos := 0 + for _, ins := range insns { + currentPos := marshalledPos + marshalledPos += ins.OpCode.marshalledInstructions() + + if ins.Symbol == "" { + continue + } + + if _, ok := symbols[ins.Symbol]; ok { + return nil, errors.Errorf("duplicate symbol %s", ins.Symbol) + } + + symbols[ins.Symbol] = currentPos + } + + return symbols, nil +} + +// Format implements fmt.Formatter. 
+// +// You can control indentation of symbols by +// specifying a width. Setting a precision controls the indentation of +// instructions. +// The default character is a tab, which can be overriden by specifying +// the ' ' space flag. +func (insns Instructions) Format(f fmt.State, c rune) { + if c != 's' && c != 'v' { + fmt.Fprintf(f, "{UNKNOWN FORMAT '%c'}", c) + return + } + + // Precision is better in this case, because it allows + // specifying 0 padding easily. + padding, ok := f.Precision() + if !ok { + padding = 1 + } + + indent := strings.Repeat("\t", padding) + if f.Flag(' ') { + indent = strings.Repeat(" ", padding) + } + + symPadding, ok := f.Width() + if !ok { + symPadding = padding - 1 + } + if symPadding < 0 { + symPadding = 0 + } + + symIndent := strings.Repeat("\t", symPadding) + if f.Flag(' ') { + symIndent = strings.Repeat(" ", symPadding) + } + + // Figure out how many digits we need to represent the highest + // offset. + highestOffset := 0 + for _, ins := range insns { + highestOffset += ins.OpCode.marshalledInstructions() + } + offsetWidth := int(math.Ceil(math.Log10(float64(highestOffset)))) + + offset := 0 + for _, ins := range insns { + if ins.Symbol != "" { + fmt.Fprintf(f, "%s%s:\n", symIndent, ins.Symbol) + } + fmt.Fprintf(f, "%s%*d: %v\n", indent, offsetWidth, offset, ins) + offset += ins.OpCode.marshalledInstructions() + } + + return +} + +// Marshal encodes a BPF program into the kernel format. 
+func (insns Instructions) Marshal(w io.Writer, bo binary.ByteOrder) error { + absoluteOffsets, err := insns.marshalledOffsets() + if err != nil { + return err + } + + num := 0 + for i, ins := range insns { + switch { + case ins.OpCode.JumpOp() == Call && ins.Constant == -1: + // Rewrite bpf to bpf call + offset, ok := absoluteOffsets[ins.Reference] + if !ok { + return errors.Errorf("instruction %d: reference to missing symbol %s", i, ins.Reference) + } + + ins.Constant = int64(offset - num - 1) + + case ins.OpCode.Class() == JumpClass && ins.Offset == -1: + // Rewrite jump to label + offset, ok := absoluteOffsets[ins.Reference] + if !ok { + return errors.Errorf("instruction %d: reference to missing symbol %s", i, ins.Reference) + } + + ins.Offset = int16(offset - num - 1) + } + + n, err := ins.Marshal(w, bo) + if err != nil { + return errors.Wrapf(err, "instruction %d", i) + } + + num += int(n / InstructionSize) + } + return nil +} + +type bpfInstruction struct { + OpCode OpCode + Registers bpfRegisters + Offset int16 + Constant int32 +} + +type bpfRegisters uint8 + +func newBPFRegisters(dst, src Register) bpfRegisters { + return bpfRegisters((src << 4) | (dst & 0xF)) +} + +func (r bpfRegisters) Dst() Register { + return Register(r & 0xF) +} + +func (r bpfRegisters) Src() Register { + return Register(r >> 4) +} + +type unreferencedSymbolError struct { + symbol string +} + +func (use *unreferencedSymbolError) Error() string { + return fmt.Sprintf("unreferenced symbol %s", use.symbol) +} + +// IsUnreferencedSymbol returns true if err was caused by +// an unreferenced symbol. 
+func IsUnreferencedSymbol(err error) bool { + _, ok := err.(*unreferencedSymbolError) + return ok +} diff --git a/vendor/github.com/cilium/ebpf/asm/jump.go b/vendor/github.com/cilium/ebpf/asm/jump.go new file mode 100644 index 000000000..33c9b5656 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/jump.go @@ -0,0 +1,109 @@ +package asm + +//go:generate stringer -output jump_string.go -type=JumpOp + +// JumpOp affect control flow. +// +// msb lsb +// +----+-+---+ +// |OP |s|cls| +// +----+-+---+ +type JumpOp uint8 + +const jumpMask OpCode = aluMask + +const ( + // InvalidJumpOp is returned by getters when invoked + // on non branch OpCodes + InvalidJumpOp JumpOp = 0xff + // Ja jumps by offset unconditionally + Ja JumpOp = 0x00 + // JEq jumps by offset if r == imm + JEq JumpOp = 0x10 + // JGT jumps by offset if r > imm + JGT JumpOp = 0x20 + // JGE jumps by offset if r >= imm + JGE JumpOp = 0x30 + // JSet jumps by offset if r & imm + JSet JumpOp = 0x40 + // JNE jumps by offset if r != imm + JNE JumpOp = 0x50 + // JSGT jumps by offset if signed r > signed imm + JSGT JumpOp = 0x60 + // JSGE jumps by offset if signed r >= signed imm + JSGE JumpOp = 0x70 + // Call builtin or user defined function from imm + Call JumpOp = 0x80 + // Exit ends execution, with value in r0 + Exit JumpOp = 0x90 + // JLT jumps by offset if r < imm + JLT JumpOp = 0xa0 + // JLE jumps by offset if r <= imm + JLE JumpOp = 0xb0 + // JSLT jumps by offset if signed r < signed imm + JSLT JumpOp = 0xc0 + // JSLE jumps by offset if signed r <= signed imm + JSLE JumpOp = 0xd0 +) + +// Return emits an exit instruction. +// +// Requires a return value in R0. +func Return() Instruction { + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(Exit), + } +} + +// Op returns the OpCode for a given jump source. +func (op JumpOp) Op(source Source) OpCode { + return OpCode(JumpClass).SetJumpOp(op).SetSource(source) +} + +// Imm compares dst to value, and adjusts PC by offset if the condition is fulfilled. 
+func (op JumpOp) Imm(dst Register, value int32, label string) Instruction { + if op == Exit || op == Call || op == Ja { + return Instruction{OpCode: InvalidOpCode} + } + + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(op).SetSource(ImmSource), + Dst: dst, + Offset: -1, + Constant: int64(value), + Reference: label, + } +} + +// Reg compares dst to src, and adjusts PC by offset if the condition is fulfilled. +func (op JumpOp) Reg(dst, src Register, label string) Instruction { + if op == Exit || op == Call || op == Ja { + return Instruction{OpCode: InvalidOpCode} + } + + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(op).SetSource(RegSource), + Dst: dst, + Src: src, + Offset: -1, + Reference: label, + } +} + +// Label adjusts PC to the address of the label. +func (op JumpOp) Label(label string) Instruction { + if op == Call { + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(Call), + Src: R1, + Constant: -1, + Reference: label, + } + } + + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(op), + Offset: -1, + Reference: label, + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/jump_string.go b/vendor/github.com/cilium/ebpf/asm/jump_string.go new file mode 100644 index 000000000..85a4aaffa --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/jump_string.go @@ -0,0 +1,53 @@ +// Code generated by "stringer -output jump_string.go -type=JumpOp"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
+ var x [1]struct{} + _ = x[InvalidJumpOp-255] + _ = x[Ja-0] + _ = x[JEq-16] + _ = x[JGT-32] + _ = x[JGE-48] + _ = x[JSet-64] + _ = x[JNE-80] + _ = x[JSGT-96] + _ = x[JSGE-112] + _ = x[Call-128] + _ = x[Exit-144] + _ = x[JLT-160] + _ = x[JLE-176] + _ = x[JSLT-192] + _ = x[JSLE-208] +} + +const _JumpOp_name = "JaJEqJGTJGEJSetJNEJSGTJSGECallExitJLTJLEJSLTJSLEInvalidJumpOp" + +var _JumpOp_map = map[JumpOp]string{ + 0: _JumpOp_name[0:2], + 16: _JumpOp_name[2:5], + 32: _JumpOp_name[5:8], + 48: _JumpOp_name[8:11], + 64: _JumpOp_name[11:15], + 80: _JumpOp_name[15:18], + 96: _JumpOp_name[18:22], + 112: _JumpOp_name[22:26], + 128: _JumpOp_name[26:30], + 144: _JumpOp_name[30:34], + 160: _JumpOp_name[34:37], + 176: _JumpOp_name[37:40], + 192: _JumpOp_name[40:44], + 208: _JumpOp_name[44:48], + 255: _JumpOp_name[48:61], +} + +func (i JumpOp) String() string { + if str, ok := _JumpOp_map[i]; ok { + return str + } + return "JumpOp(" + strconv.FormatInt(int64(i), 10) + ")" +} diff --git a/vendor/github.com/cilium/ebpf/asm/load_store.go b/vendor/github.com/cilium/ebpf/asm/load_store.go new file mode 100644 index 000000000..ab0e92fc3 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/load_store.go @@ -0,0 +1,189 @@ +package asm + +//go:generate stringer -output load_store_string.go -type=Mode,Size + +// Mode for load and store operations +// +// msb lsb +// +---+--+---+ +// |MDE|sz|cls| +// +---+--+---+ +type Mode uint8 + +const modeMask OpCode = 0xe0 + +const ( + // InvalidMode is returned by getters when invoked + // on non load / store OpCodes + InvalidMode Mode = 0xff + // ImmMode - immediate value + ImmMode Mode = 0x00 + // AbsMode - immediate value + offset + AbsMode Mode = 0x20 + // IndMode - indirect (imm+src) + IndMode Mode = 0x40 + // MemMode - load from memory + MemMode Mode = 0x60 + // XAddMode - add atomically across processors. 
+ XAddMode Mode = 0xc0 +) + +// Size of load and store operations +// +// msb lsb +// +---+--+---+ +// |mde|SZ|cls| +// +---+--+---+ +type Size uint8 + +const sizeMask OpCode = 0x18 + +const ( + // InvalidSize is returned by getters when invoked + // on non load / store OpCodes + InvalidSize Size = 0xff + // DWord - double word; 64 bits + DWord Size = 0x18 + // Word - word; 32 bits + Word Size = 0x00 + // Half - half-word; 16 bits + Half Size = 0x08 + // Byte - byte; 8 bits + Byte Size = 0x10 +) + +// Sizeof returns the size in bytes. +func (s Size) Sizeof() int { + switch s { + case DWord: + return 8 + case Word: + return 4 + case Half: + return 2 + case Byte: + return 1 + default: + return -1 + } +} + +// LoadMemOp returns the OpCode to load a value of given size from memory. +func LoadMemOp(size Size) OpCode { + return OpCode(LdXClass).SetMode(MemMode).SetSize(size) +} + +// LoadMem emits `dst = *(size *)(src + offset)`. +func LoadMem(dst, src Register, offset int16, size Size) Instruction { + return Instruction{ + OpCode: LoadMemOp(size), + Dst: dst, + Src: src, + Offset: offset, + } +} + +// LoadImmOp returns the OpCode to load an immediate of given size. +// +// As of kernel 4.20, only DWord size is accepted. +func LoadImmOp(size Size) OpCode { + return OpCode(LdClass).SetMode(ImmMode).SetSize(size) +} + +// LoadImm emits `dst = (size)value`. +// +// As of kernel 4.20, only DWord size is accepted. +func LoadImm(dst Register, value int64, size Size) Instruction { + return Instruction{ + OpCode: LoadImmOp(size), + Dst: dst, + Constant: value, + } +} + +// LoadMapPtr stores a pointer to a map in dst. +func LoadMapPtr(dst Register, fd int) Instruction { + if fd < 0 { + return Instruction{OpCode: InvalidOpCode} + } + + return Instruction{ + OpCode: LoadImmOp(DWord), + Dst: dst, + Src: R1, + Constant: int64(fd), + } +} + +// LoadIndOp returns the OpCode for loading a value of given size from an sk_buff. 
+func LoadIndOp(size Size) OpCode { + return OpCode(LdClass).SetMode(IndMode).SetSize(size) +} + +// LoadInd emits `dst = ntoh(*(size *)(((sk_buff *)R6)->data + src + offset))`. +func LoadInd(dst, src Register, offset int32, size Size) Instruction { + return Instruction{ + OpCode: LoadIndOp(size), + Dst: dst, + Src: src, + Constant: int64(offset), + } +} + +// LoadAbsOp returns the OpCode for loading a value of given size from an sk_buff. +func LoadAbsOp(size Size) OpCode { + return OpCode(LdClass).SetMode(AbsMode).SetSize(size) +} + +// LoadAbs emits `r0 = ntoh(*(size *)(((sk_buff *)R6)->data + offset))`. +func LoadAbs(offset int32, size Size) Instruction { + return Instruction{ + OpCode: LoadAbsOp(size), + Dst: R0, + Constant: int64(offset), + } +} + +// StoreMemOp returns the OpCode for storing a register of given size in memory. +func StoreMemOp(size Size) OpCode { + return OpCode(StXClass).SetMode(MemMode).SetSize(size) +} + +// StoreMem emits `*(size *)(dst + offset) = src` +func StoreMem(dst Register, offset int16, src Register, size Size) Instruction { + return Instruction{ + OpCode: StoreMemOp(size), + Dst: dst, + Src: src, + Offset: offset, + } +} + +// StoreImmOp returns the OpCode for storing an immediate of given size in memory. +func StoreImmOp(size Size) OpCode { + return OpCode(StClass).SetMode(MemMode).SetSize(size) +} + +// StoreImm emits `*(size *)(dst + offset) = value`. +func StoreImm(dst Register, offset int16, value int64, size Size) Instruction { + return Instruction{ + OpCode: StoreImmOp(size), + Dst: dst, + Offset: offset, + Constant: value, + } +} + +// StoreXAddOp returns the OpCode to atomically add a register to a value in memory. +func StoreXAddOp(size Size) OpCode { + return OpCode(StXClass).SetMode(XAddMode).SetSize(size) +} + +// StoreXAdd atomically adds src to *dst. 
+func StoreXAdd(dst, src Register, size Size) Instruction { + return Instruction{ + OpCode: StoreXAddOp(size), + Dst: dst, + Src: src, + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/load_store_string.go b/vendor/github.com/cilium/ebpf/asm/load_store_string.go new file mode 100644 index 000000000..76d29a075 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/load_store_string.go @@ -0,0 +1,80 @@ +// Code generated by "stringer -output load_store_string.go -type=Mode,Size"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidMode-255] + _ = x[ImmMode-0] + _ = x[AbsMode-32] + _ = x[IndMode-64] + _ = x[MemMode-96] + _ = x[XAddMode-192] +} + +const ( + _Mode_name_0 = "ImmMode" + _Mode_name_1 = "AbsMode" + _Mode_name_2 = "IndMode" + _Mode_name_3 = "MemMode" + _Mode_name_4 = "XAddMode" + _Mode_name_5 = "InvalidMode" +) + +func (i Mode) String() string { + switch { + case i == 0: + return _Mode_name_0 + case i == 32: + return _Mode_name_1 + case i == 64: + return _Mode_name_2 + case i == 96: + return _Mode_name_3 + case i == 192: + return _Mode_name_4 + case i == 255: + return _Mode_name_5 + default: + return "Mode(" + strconv.FormatInt(int64(i), 10) + ")" + } +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
+ var x [1]struct{} + _ = x[InvalidSize-255] + _ = x[DWord-24] + _ = x[Word-0] + _ = x[Half-8] + _ = x[Byte-16] +} + +const ( + _Size_name_0 = "Word" + _Size_name_1 = "Half" + _Size_name_2 = "Byte" + _Size_name_3 = "DWord" + _Size_name_4 = "InvalidSize" +) + +func (i Size) String() string { + switch { + case i == 0: + return _Size_name_0 + case i == 8: + return _Size_name_1 + case i == 16: + return _Size_name_2 + case i == 24: + return _Size_name_3 + case i == 255: + return _Size_name_4 + default: + return "Size(" + strconv.FormatInt(int64(i), 10) + ")" + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/opcode.go b/vendor/github.com/cilium/ebpf/asm/opcode.go new file mode 100644 index 000000000..d796de3fe --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/opcode.go @@ -0,0 +1,237 @@ +package asm + +import ( + "fmt" + "strings" +) + +//go:generate stringer -output opcode_string.go -type=Class + +type encoding int + +const ( + unknownEncoding encoding = iota + loadOrStore + jumpOrALU +) + +// Class of operations +// +// msb lsb +// +---+--+---+ +// | ?? |CLS| +// +---+--+---+ +type Class uint8 + +const classMask OpCode = 0x07 + +const ( + // LdClass load memory + LdClass Class = 0x00 + // LdXClass load memory from constant + LdXClass Class = 0x01 + // StClass load register from memory + StClass Class = 0x02 + // StXClass load register from constant + StXClass Class = 0x03 + // ALUClass arithmetic operators + ALUClass Class = 0x04 + // JumpClass jump operators + JumpClass Class = 0x05 + // ALU64Class arithmetic in 64 bit mode + ALU64Class Class = 0x07 +) + +func (cls Class) encoding() encoding { + switch cls { + case LdClass, LdXClass, StClass, StXClass: + return loadOrStore + case ALU64Class, ALUClass, JumpClass: + return jumpOrALU + default: + return unknownEncoding + } +} + +// OpCode is a packed eBPF opcode. +// +// Its encoding is defined by a Class value: +// +// msb lsb +// +----+-+---+ +// | ???? 
|CLS| +// +----+-+---+ +type OpCode uint8 + +// InvalidOpCode is returned by setters on OpCode +const InvalidOpCode OpCode = 0xff + +// marshalledInstructions returns the number of BPF instructions required +// to encode this opcode. +func (op OpCode) marshalledInstructions() int { + if op == LoadImmOp(DWord) { + return 2 + } + return 1 +} + +func (op OpCode) isDWordLoad() bool { + return op == LoadImmOp(DWord) +} + +// Class returns the class of operation. +func (op OpCode) Class() Class { + return Class(op & classMask) +} + +// Mode returns the mode for load and store operations. +func (op OpCode) Mode() Mode { + if op.Class().encoding() != loadOrStore { + return InvalidMode + } + return Mode(op & modeMask) +} + +// Size returns the size for load and store operations. +func (op OpCode) Size() Size { + if op.Class().encoding() != loadOrStore { + return InvalidSize + } + return Size(op & sizeMask) +} + +// Source returns the source for branch and ALU operations. +func (op OpCode) Source() Source { + if op.Class().encoding() != jumpOrALU || op.ALUOp() == Swap { + return InvalidSource + } + return Source(op & sourceMask) +} + +// ALUOp returns the ALUOp. +func (op OpCode) ALUOp() ALUOp { + if op.Class().encoding() != jumpOrALU { + return InvalidALUOp + } + return ALUOp(op & aluMask) +} + +// Endianness returns the Endianness for a byte swap instruction. +func (op OpCode) Endianness() Endianness { + if op.ALUOp() != Swap { + return InvalidEndian + } + return Endianness(op & endianMask) +} + +// JumpOp returns the JumpOp. +func (op OpCode) JumpOp() JumpOp { + if op.Class().encoding() != jumpOrALU { + return InvalidJumpOp + } + return JumpOp(op & jumpMask) +} + +// SetMode sets the mode on load and store operations. +// +// Returns InvalidOpCode if op is of the wrong class. 
+func (op OpCode) SetMode(mode Mode) OpCode { + if op.Class().encoding() != loadOrStore || !valid(OpCode(mode), modeMask) { + return InvalidOpCode + } + return (op & ^modeMask) | OpCode(mode) +} + +// SetSize sets the size on load and store operations. +// +// Returns InvalidOpCode if op is of the wrong class. +func (op OpCode) SetSize(size Size) OpCode { + if op.Class().encoding() != loadOrStore || !valid(OpCode(size), sizeMask) { + return InvalidOpCode + } + return (op & ^sizeMask) | OpCode(size) +} + +// SetSource sets the source on jump and ALU operations. +// +// Returns InvalidOpCode if op is of the wrong class. +func (op OpCode) SetSource(source Source) OpCode { + if op.Class().encoding() != jumpOrALU || !valid(OpCode(source), sourceMask) { + return InvalidOpCode + } + return (op & ^sourceMask) | OpCode(source) +} + +// SetALUOp sets the ALUOp on ALU operations. +// +// Returns InvalidOpCode if op is of the wrong class. +func (op OpCode) SetALUOp(alu ALUOp) OpCode { + class := op.Class() + if (class != ALUClass && class != ALU64Class) || !valid(OpCode(alu), aluMask) { + return InvalidOpCode + } + return (op & ^aluMask) | OpCode(alu) +} + +// SetJumpOp sets the JumpOp on jump operations. +// +// Returns InvalidOpCode if op is of the wrong class. 
+func (op OpCode) SetJumpOp(jump JumpOp) OpCode { + if op.Class() != JumpClass || !valid(OpCode(jump), jumpMask) { + return InvalidOpCode + } + return (op & ^jumpMask) | OpCode(jump) +} + +func (op OpCode) String() string { + var f strings.Builder + + switch class := op.Class(); class { + case LdClass, LdXClass, StClass, StXClass: + f.WriteString(strings.TrimSuffix(class.String(), "Class")) + + mode := op.Mode() + f.WriteString(strings.TrimSuffix(mode.String(), "Mode")) + + switch op.Size() { + case DWord: + f.WriteString("DW") + case Word: + f.WriteString("W") + case Half: + f.WriteString("H") + case Byte: + f.WriteString("B") + } + + case ALU64Class, ALUClass: + f.WriteString(op.ALUOp().String()) + + if op.ALUOp() == Swap { + // Width for Endian is controlled by Constant + f.WriteString(op.Endianness().String()) + } else { + if class == ALUClass { + f.WriteString("32") + } + + f.WriteString(strings.TrimSuffix(op.Source().String(), "Source")) + } + + case JumpClass: + f.WriteString(op.JumpOp().String()) + if jop := op.JumpOp(); jop != Exit && jop != Call { + f.WriteString(strings.TrimSuffix(op.Source().String(), "Source")) + } + + default: + fmt.Fprintf(&f, "%#x", op) + } + + return f.String() +} + +// valid returns true if all bits in value are covered by mask. +func valid(value, mask OpCode) bool { + return value & ^mask == 0 +} diff --git a/vendor/github.com/cilium/ebpf/asm/opcode_string.go b/vendor/github.com/cilium/ebpf/asm/opcode_string.go new file mode 100644 index 000000000..079ce1db0 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/opcode_string.go @@ -0,0 +1,38 @@ +// Code generated by "stringer -output opcode_string.go -type=Class"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
+ var x [1]struct{} + _ = x[LdClass-0] + _ = x[LdXClass-1] + _ = x[StClass-2] + _ = x[StXClass-3] + _ = x[ALUClass-4] + _ = x[JumpClass-5] + _ = x[ALU64Class-7] +} + +const ( + _Class_name_0 = "LdClassLdXClassStClassStXClassALUClassJumpClass" + _Class_name_1 = "ALU64Class" +) + +var ( + _Class_index_0 = [...]uint8{0, 7, 15, 22, 30, 38, 47} +) + +func (i Class) String() string { + switch { + case 0 <= i && i <= 5: + return _Class_name_0[_Class_index_0[i]:_Class_index_0[i+1]] + case i == 7: + return _Class_name_1 + default: + return "Class(" + strconv.FormatInt(int64(i), 10) + ")" + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/register.go b/vendor/github.com/cilium/ebpf/asm/register.go new file mode 100644 index 000000000..4f284fbe7 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/register.go @@ -0,0 +1,42 @@ +package asm + +import ( + "fmt" +) + +// Register is the source or destination of most operations. +type Register uint8 + +// R0 contains return values. +const R0 Register = 0 + +// Registers for function arguments. +const ( + R1 Register = R0 + 1 + iota + R2 + R3 + R4 + R5 +) + +// Callee saved registers preserved by function calls. +const ( + R6 Register = R5 + 1 + iota + R7 + R8 + R9 +) + +// Read-only frame pointer to access stack. +const ( + R10 Register = R9 + 1 + RFP = R10 +) + +func (r Register) String() string { + v := uint8(r) + if v == 10 { + return "rfp" + } + return fmt.Sprintf("r%d", v) +} diff --git a/vendor/github.com/cilium/ebpf/collection.go b/vendor/github.com/cilium/ebpf/collection.go new file mode 100644 index 000000000..bf6a96d4a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/collection.go @@ -0,0 +1,201 @@ +package ebpf + +import ( + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/internal/btf" + "github.com/pkg/errors" +) + +// CollectionOptions control loading a collection into the kernel. +type CollectionOptions struct { + Programs ProgramOptions +} + +// CollectionSpec describes a collection. 
+type CollectionSpec struct { + Maps map[string]*MapSpec + Programs map[string]*ProgramSpec +} + +// Copy returns a recursive copy of the spec. +func (cs *CollectionSpec) Copy() *CollectionSpec { + if cs == nil { + return nil + } + + cpy := CollectionSpec{ + Maps: make(map[string]*MapSpec, len(cs.Maps)), + Programs: make(map[string]*ProgramSpec, len(cs.Programs)), + } + + for name, spec := range cs.Maps { + cpy.Maps[name] = spec.Copy() + } + + for name, spec := range cs.Programs { + cpy.Programs[name] = spec.Copy() + } + + return &cpy +} + +// Collection is a collection of Programs and Maps associated +// with their symbols +type Collection struct { + Programs map[string]*Program + Maps map[string]*Map +} + +// NewCollection creates a Collection from a specification. +// +// Only maps referenced by at least one of the programs are initialized. +func NewCollection(spec *CollectionSpec) (*Collection, error) { + return NewCollectionWithOptions(spec, CollectionOptions{}) +} + +// NewCollectionWithOptions creates a Collection from a specification. +// +// Only maps referenced by at least one of the programs are initialized. 
+func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (coll *Collection, err error) { + var ( + maps = make(map[string]*Map) + progs = make(map[string]*Program) + btfs = make(map[*btf.Spec]*btf.Handle) + ) + + defer func() { + for _, btf := range btfs { + btf.Close() + } + + if err == nil { + return + } + + for _, m := range maps { + m.Close() + } + + for _, p := range progs { + p.Close() + } + }() + + loadBTF := func(spec *btf.Spec) (*btf.Handle, error) { + if btfs[spec] != nil { + return btfs[spec], nil + } + + handle, err := btf.NewHandle(spec) + if err != nil { + return nil, err + } + + btfs[spec] = handle + return handle, nil + } + + for mapName, mapSpec := range spec.Maps { + var handle *btf.Handle + if mapSpec.BTF != nil { + handle, err = loadBTF(btf.MapSpec(mapSpec.BTF)) + if err != nil && !btf.IsNotSupported(err) { + return nil, err + } + } + + m, err := newMapWithBTF(mapSpec, handle) + if err != nil { + return nil, errors.Wrapf(err, "map %s", mapName) + } + maps[mapName] = m + } + + for progName, origProgSpec := range spec.Programs { + progSpec := origProgSpec.Copy() + + // Rewrite any reference to a valid map. 
+ for i := range progSpec.Instructions { + var ( + ins = &progSpec.Instructions[i] + m = maps[ins.Reference] + ) + + if ins.Reference == "" || m == nil { + continue + } + + if ins.Src == asm.R1 { + // Don't overwrite maps already rewritten, users can + // rewrite programs in the spec themselves + continue + } + + if err := ins.RewriteMapPtr(m.FD()); err != nil { + return nil, errors.Wrapf(err, "progam %s: map %s", progName, ins.Reference) + } + } + + var handle *btf.Handle + if progSpec.BTF != nil { + handle, err = loadBTF(btf.ProgramSpec(progSpec.BTF)) + if err != nil && !btf.IsNotSupported(err) { + return nil, err + } + } + + prog, err := newProgramWithBTF(progSpec, handle, opts.Programs) + if err != nil { + return nil, errors.Wrapf(err, "program %s", progName) + } + progs[progName] = prog + } + + return &Collection{ + progs, + maps, + }, nil +} + +// LoadCollection parses an object file and converts it to a collection. +func LoadCollection(file string) (*Collection, error) { + spec, err := LoadCollectionSpec(file) + if err != nil { + return nil, err + } + return NewCollection(spec) +} + +// Close frees all maps and programs associated with the collection. +// +// The collection mustn't be used afterwards. +func (coll *Collection) Close() { + for _, prog := range coll.Programs { + prog.Close() + } + for _, m := range coll.Maps { + m.Close() + } +} + +// DetachMap removes the named map from the Collection. +// +// This means that a later call to Close() will not affect this map. +// +// Returns nil if no map of that name exists. +func (coll *Collection) DetachMap(name string) *Map { + m := coll.Maps[name] + delete(coll.Maps, name) + return m +} + +// DetachProgram removes the named program from the Collection. +// +// This means that a later call to Close() will not affect this program. +// +// Returns nil if no program of that name exists. 
+func (coll *Collection) DetachProgram(name string) *Program { + p := coll.Programs[name] + delete(coll.Programs, name) + return p +} diff --git a/vendor/github.com/cilium/ebpf/doc.go b/vendor/github.com/cilium/ebpf/doc.go new file mode 100644 index 000000000..d96e6b1e6 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/doc.go @@ -0,0 +1,17 @@ +// Package ebpf is a toolkit for working with eBPF programs. +// +// eBPF programs are small snippets of code which are executed directly +// in a VM in the Linux kernel, which makes them very fast and flexible. +// Many Linux subsystems now accept eBPF programs. This makes it possible +// to implement highly application specific logic inside the kernel, +// without having to modify the actual kernel itself. +// +// This package is designed for long-running processes which +// want to use eBPF to implement part of their application logic. It has no +// run-time dependencies outside of the library and the Linux kernel itself. +// eBPF code should be compiled ahead of time using clang, and shipped with +// your application as any other resource. +// +// This package doesn't include code required to attach eBPF to Linux +// subsystems, since this varies per subsystem. +package ebpf diff --git a/vendor/github.com/cilium/ebpf/elf_reader.go b/vendor/github.com/cilium/ebpf/elf_reader.go new file mode 100644 index 000000000..c33b744f8 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/elf_reader.go @@ -0,0 +1,523 @@ +package ebpf + +import ( + "bytes" + "debug/elf" + "encoding/binary" + "io" + "os" + "strings" + + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/btf" + + "github.com/pkg/errors" +) + +type elfCode struct { + *elf.File + symbols []elf.Symbol + symbolsPerSection map[elf.SectionIndex]map[uint64]string + license string + version uint32 +} + +// LoadCollectionSpec parses an ELF file into a CollectionSpec. 
+func LoadCollectionSpec(file string) (*CollectionSpec, error) { + f, err := os.Open(file) + if err != nil { + return nil, err + } + defer f.Close() + + spec, err := LoadCollectionSpecFromReader(f) + return spec, errors.Wrapf(err, "file %s", file) +} + +// LoadCollectionSpecFromReader parses an ELF file into a CollectionSpec. +func LoadCollectionSpecFromReader(rd io.ReaderAt) (*CollectionSpec, error) { + f, err := elf.NewFile(rd) + if err != nil { + return nil, err + } + defer f.Close() + + symbols, err := f.Symbols() + if err != nil { + return nil, errors.Wrap(err, "load symbols") + } + + ec := &elfCode{f, symbols, symbolsPerSection(symbols), "", 0} + + var ( + licenseSection *elf.Section + versionSection *elf.Section + btfMaps = make(map[elf.SectionIndex]*elf.Section) + progSections = make(map[elf.SectionIndex]*elf.Section) + relSections = make(map[elf.SectionIndex]*elf.Section) + mapSections = make(map[elf.SectionIndex]*elf.Section) + ) + + for i, sec := range ec.Sections { + switch { + case strings.HasPrefix(sec.Name, "license"): + licenseSection = sec + case strings.HasPrefix(sec.Name, "version"): + versionSection = sec + case strings.HasPrefix(sec.Name, "maps"): + mapSections[elf.SectionIndex(i)] = sec + case sec.Name == ".maps": + btfMaps[elf.SectionIndex(i)] = sec + case sec.Type == elf.SHT_REL: + if int(sec.Info) >= len(ec.Sections) { + return nil, errors.Errorf("found relocation section %v for missing section %v", i, sec.Info) + } + + // Store relocations under the section index of the target + idx := elf.SectionIndex(sec.Info) + if relSections[idx] != nil { + return nil, errors.Errorf("section %d has multiple relocation sections", sec.Info) + } + relSections[idx] = sec + case sec.Type == elf.SHT_PROGBITS && (sec.Flags&elf.SHF_EXECINSTR) != 0 && sec.Size > 0: + progSections[elf.SectionIndex(i)] = sec + } + } + + ec.license, err = loadLicense(licenseSection) + if err != nil { + return nil, errors.Wrap(err, "load license") + } + + ec.version, err = 
loadVersion(versionSection, ec.ByteOrder) + if err != nil { + return nil, errors.Wrap(err, "load version") + } + + btf, err := btf.LoadSpecFromReader(rd) + if err != nil { + return nil, errors.Wrap(err, "load BTF") + } + + maps := make(map[string]*MapSpec) + + if err := ec.loadMaps(maps, mapSections); err != nil { + return nil, errors.Wrap(err, "load maps") + } + + if len(btfMaps) > 0 { + if err := ec.loadBTFMaps(maps, btfMaps, btf); err != nil { + return nil, errors.Wrap(err, "load BTF maps") + } + } + + progs, err := ec.loadPrograms(progSections, relSections, btf) + if err != nil { + return nil, errors.Wrap(err, "load programs") + } + + return &CollectionSpec{maps, progs}, nil +} + +func loadLicense(sec *elf.Section) (string, error) { + if sec == nil { + return "", errors.Errorf("missing license section") + } + data, err := sec.Data() + if err != nil { + return "", errors.Wrapf(err, "section %s", sec.Name) + } + return string(bytes.TrimRight(data, "\000")), nil +} + +func loadVersion(sec *elf.Section, bo binary.ByteOrder) (uint32, error) { + if sec == nil { + return 0, nil + } + + var version uint32 + err := binary.Read(sec.Open(), bo, &version) + return version, errors.Wrapf(err, "section %s", sec.Name) +} + +func (ec *elfCode) loadPrograms(progSections, relSections map[elf.SectionIndex]*elf.Section, btf *btf.Spec) (map[string]*ProgramSpec, error) { + var ( + progs []*ProgramSpec + libs []*ProgramSpec + ) + + for idx, prog := range progSections { + syms := ec.symbolsPerSection[idx] + if len(syms) == 0 { + return nil, errors.Errorf("section %v: missing symbols", prog.Name) + } + + funcSym := syms[0] + if funcSym == "" { + return nil, errors.Errorf("section %v: no label at start", prog.Name) + } + + rels, err := ec.loadRelocations(relSections[idx]) + if err != nil { + return nil, errors.Wrapf(err, "program %s: can't load relocations", funcSym) + } + + insns, length, err := ec.loadInstructions(prog, syms, rels) + if err != nil { + return nil, errors.Wrapf(err, 
"program %s: can't unmarshal instructions", funcSym) + } + + progType, attachType := getProgType(prog.Name) + + spec := &ProgramSpec{ + Name: funcSym, + Type: progType, + AttachType: attachType, + License: ec.license, + KernelVersion: ec.version, + Instructions: insns, + } + + if btf != nil { + spec.BTF, err = btf.Program(prog.Name, length) + if err != nil { + return nil, errors.Wrapf(err, "BTF for section %s (program %s)", prog.Name, funcSym) + } + } + + if spec.Type == UnspecifiedProgram { + // There is no single name we can use for "library" sections, + // since they may contain multiple functions. We'll decode the + // labels they contain later on, and then link sections that way. + libs = append(libs, spec) + } else { + progs = append(progs, spec) + } + } + + res := make(map[string]*ProgramSpec, len(progs)) + for _, prog := range progs { + err := link(prog, libs) + if err != nil { + return nil, errors.Wrapf(err, "program %s", prog.Name) + } + res[prog.Name] = prog + } + + return res, nil +} + +func (ec *elfCode) loadInstructions(section *elf.Section, symbols, relocations map[uint64]string) (asm.Instructions, uint64, error) { + var ( + r = section.Open() + insns asm.Instructions + ins asm.Instruction + offset uint64 + ) + for { + n, err := ins.Unmarshal(r, ec.ByteOrder) + if err == io.EOF { + return insns, offset, nil + } + if err != nil { + return nil, 0, errors.Wrapf(err, "offset %d", offset) + } + + ins.Symbol = symbols[offset] + ins.Reference = relocations[offset] + + insns = append(insns, ins) + offset += n + } +} + +func (ec *elfCode) loadMaps(maps map[string]*MapSpec, mapSections map[elf.SectionIndex]*elf.Section) error { + for idx, sec := range mapSections { + syms := ec.symbolsPerSection[idx] + if len(syms) == 0 { + return errors.Errorf("section %v: no symbols", sec.Name) + } + + if sec.Size%uint64(len(syms)) != 0 { + return errors.Errorf("section %v: map descriptors are not of equal size", sec.Name) + } + + var ( + r = sec.Open() + size = sec.Size / 
uint64(len(syms)) + ) + for i, offset := 0, uint64(0); i < len(syms); i, offset = i+1, offset+size { + mapSym := syms[offset] + if mapSym == "" { + return errors.Errorf("section %s: missing symbol for map at offset %d", sec.Name, offset) + } + + if maps[mapSym] != nil { + return errors.Errorf("section %v: map %v already exists", sec.Name, mapSym) + } + + lr := io.LimitReader(r, int64(size)) + + var spec MapSpec + switch { + case binary.Read(lr, ec.ByteOrder, &spec.Type) != nil: + return errors.Errorf("map %v: missing type", mapSym) + case binary.Read(lr, ec.ByteOrder, &spec.KeySize) != nil: + return errors.Errorf("map %v: missing key size", mapSym) + case binary.Read(lr, ec.ByteOrder, &spec.ValueSize) != nil: + return errors.Errorf("map %v: missing value size", mapSym) + case binary.Read(lr, ec.ByteOrder, &spec.MaxEntries) != nil: + return errors.Errorf("map %v: missing max entries", mapSym) + case binary.Read(lr, ec.ByteOrder, &spec.Flags) != nil: + return errors.Errorf("map %v: missing flags", mapSym) + } + + if _, err := io.Copy(internal.DiscardZeroes{}, lr); err != nil { + return errors.Errorf("map %v: unknown and non-zero fields in definition", mapSym) + } + + maps[mapSym] = &spec + } + } + + return nil +} + +func (ec *elfCode) loadBTFMaps(maps map[string]*MapSpec, mapSections map[elf.SectionIndex]*elf.Section, spec *btf.Spec) error { + + if spec == nil { + return errors.Errorf("missing BTF") + } + + for idx, sec := range mapSections { + syms := ec.symbolsPerSection[idx] + if len(syms) == 0 { + return errors.Errorf("section %v: no symbols", sec.Name) + } + + for _, sym := range syms { + if maps[sym] != nil { + return errors.Errorf("section %v: map %v already exists", sec.Name, sym) + } + + btfMap, err := spec.Map(sym) + if err != nil { + return errors.Wrapf(err, "map %v: can't get BTF", sym) + } + + spec, err := mapSpecFromBTF(btfMap) + if err != nil { + return errors.Wrapf(err, "map %v", sym) + } + + maps[sym] = spec + } + } + + return nil +} + +func 
mapSpecFromBTF(btfMap *btf.Map) (*MapSpec, error) { + var ( + mapType, flags, maxEntries uint32 + err error + ) + for _, member := range btf.MapType(btfMap).Members { + switch member.Name { + case "type": + mapType, err = uintFromBTF(member.Type) + if err != nil { + return nil, errors.Wrap(err, "can't get type") + } + + case "map_flags": + flags, err = uintFromBTF(member.Type) + if err != nil { + return nil, errors.Wrap(err, "can't get BTF map flags") + } + + case "max_entries": + maxEntries, err = uintFromBTF(member.Type) + if err != nil { + return nil, errors.Wrap(err, "can't get BTF map max entries") + } + + case "key": + case "value": + default: + return nil, errors.Errorf("unrecognized field %s in BTF map definition", member.Name) + } + } + + keySize, err := btf.Sizeof(btf.MapKey(btfMap)) + if err != nil { + return nil, errors.Wrap(err, "can't get size of BTF key") + } + + valueSize, err := btf.Sizeof(btf.MapValue(btfMap)) + if err != nil { + return nil, errors.Wrap(err, "can't get size of BTF value") + } + + return &MapSpec{ + Type: MapType(mapType), + KeySize: uint32(keySize), + ValueSize: uint32(valueSize), + MaxEntries: maxEntries, + Flags: flags, + BTF: btfMap, + }, nil +} + +// uintFromBTF resolves the __uint macro, which is a pointer to a sized +// array, e.g. for int (*foo)[10], this function will return 10. 
+func uintFromBTF(typ btf.Type) (uint32, error) { + ptr, ok := typ.(*btf.Pointer) + if !ok { + return 0, errors.Errorf("not a pointer: %v", typ) + } + + arr, ok := ptr.Target.(*btf.Array) + if !ok { + return 0, errors.Errorf("not a pointer to array: %v", typ) + } + + return arr.Nelems, nil +} + +func getProgType(v string) (ProgramType, AttachType) { + types := map[string]ProgramType{ + // From https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/bpf/libbpf.c#n3568 + "socket": SocketFilter, + "seccomp": SocketFilter, + "kprobe/": Kprobe, + "uprobe/": Kprobe, + "kretprobe/": Kprobe, + "uretprobe/": Kprobe, + "tracepoint/": TracePoint, + "raw_tracepoint/": RawTracepoint, + "xdp": XDP, + "perf_event": PerfEvent, + "lwt_in": LWTIn, + "lwt_out": LWTOut, + "lwt_xmit": LWTXmit, + "lwt_seg6local": LWTSeg6Local, + "sockops": SockOps, + "sk_skb": SkSKB, + "sk_msg": SkMsg, + "lirc_mode2": LircMode2, + "flow_dissector": FlowDissector, + + "cgroup_skb/": CGroupSKB, + "cgroup/dev": CGroupDevice, + "cgroup/skb": CGroupSKB, + "cgroup/sock": CGroupSock, + "cgroup/post_bind": CGroupSock, + "cgroup/bind": CGroupSockAddr, + "cgroup/connect": CGroupSockAddr, + "cgroup/sendmsg": CGroupSockAddr, + "cgroup/recvmsg": CGroupSockAddr, + "cgroup/sysctl": CGroupSysctl, + "cgroup/getsockopt": CGroupSockopt, + "cgroup/setsockopt": CGroupSockopt, + "classifier": SchedCLS, + "action": SchedACT, + } + attachTypes := map[string]AttachType{ + "cgroup_skb/ingress": AttachCGroupInetIngress, + "cgroup_skb/egress": AttachCGroupInetEgress, + "cgroup/sock": AttachCGroupInetSockCreate, + "cgroup/post_bind4": AttachCGroupInet4PostBind, + "cgroup/post_bind6": AttachCGroupInet6PostBind, + "cgroup/dev": AttachCGroupDevice, + "sockops": AttachCGroupSockOps, + "sk_skb/stream_parser": AttachSkSKBStreamParser, + "sk_skb/stream_verdict": AttachSkSKBStreamVerdict, + "sk_msg": AttachSkSKBStreamVerdict, + "lirc_mode2": AttachLircMode2, + "flow_dissector": AttachFlowDissector, + 
"cgroup/bind4": AttachCGroupInet4Bind, + "cgroup/bind6": AttachCGroupInet6Bind, + "cgroup/connect4": AttachCGroupInet4Connect, + "cgroup/connect6": AttachCGroupInet6Connect, + "cgroup/sendmsg4": AttachCGroupUDP4Sendmsg, + "cgroup/sendmsg6": AttachCGroupUDP6Sendmsg, + "cgroup/recvmsg4": AttachCGroupUDP4Recvmsg, + "cgroup/recvmsg6": AttachCGroupUDP6Recvmsg, + "cgroup/sysctl": AttachCGroupSysctl, + "cgroup/getsockopt": AttachCGroupGetsockopt, + "cgroup/setsockopt": AttachCGroupSetsockopt, + } + attachType := AttachNone + for k, t := range attachTypes { + if strings.HasPrefix(v, k) { + attachType = t + } + } + + for k, t := range types { + if strings.HasPrefix(v, k) { + return t, attachType + } + } + return UnspecifiedProgram, AttachNone +} + +func (ec *elfCode) loadRelocations(sec *elf.Section) (map[uint64]string, error) { + rels := make(map[uint64]string) + if sec == nil { + return rels, nil + } + + if sec.Entsize < 16 { + return nil, errors.New("rels are less than 16 bytes") + } + + r := sec.Open() + for off := uint64(0); off < sec.Size; off += sec.Entsize { + ent := io.LimitReader(r, int64(sec.Entsize)) + + var rel elf.Rel64 + if binary.Read(ent, ec.ByteOrder, &rel) != nil { + return nil, errors.Errorf("can't parse relocation at offset %v", off) + } + + symNo := int(elf.R_SYM64(rel.Info) - 1) + if symNo >= len(ec.symbols) { + return nil, errors.Errorf("relocation at offset %d: symbol %v doesnt exist", off, symNo) + } + + rels[rel.Off] = ec.symbols[symNo].Name + } + return rels, nil +} + +func symbolsPerSection(symbols []elf.Symbol) map[elf.SectionIndex]map[uint64]string { + result := make(map[elf.SectionIndex]map[uint64]string) + for i, sym := range symbols { + switch elf.ST_TYPE(sym.Info) { + case elf.STT_NOTYPE: + // Older versions of LLVM doesn't tag + // symbols correctly. 
+ break + case elf.STT_OBJECT: + break + case elf.STT_FUNC: + break + default: + continue + } + + if sym.Name == "" { + continue + } + + idx := sym.Section + if _, ok := result[idx]; !ok { + result[idx] = make(map[uint64]string) + } + result[idx][sym.Value] = symbols[i].Name + } + return result +} diff --git a/vendor/github.com/cilium/ebpf/internal/btf/btf.go b/vendor/github.com/cilium/ebpf/internal/btf/btf.go new file mode 100644 index 000000000..b2122f37e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/btf/btf.go @@ -0,0 +1,530 @@ +package btf + +import ( + "bytes" + "debug/elf" + "encoding/binary" + "io" + "io/ioutil" + "math" + "reflect" + "unsafe" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" + + "github.com/pkg/errors" +) + +const btfMagic = 0xeB9F + +// Spec represents decoded BTF. +type Spec struct { + rawTypes []rawType + strings stringTable + types map[string][]Type + funcInfos map[string]extInfo + lineInfos map[string]extInfo +} + +type btfHeader struct { + Magic uint16 + Version uint8 + Flags uint8 + HdrLen uint32 + + TypeOff uint32 + TypeLen uint32 + StringOff uint32 + StringLen uint32 +} + +// LoadSpecFromReader reads BTF sections from an ELF. +// +// Returns a nil Spec and no error if no BTF was present. 
+func LoadSpecFromReader(rd io.ReaderAt) (*Spec, error) { + file, err := elf.NewFile(rd) + if err != nil { + return nil, err + } + defer file.Close() + + var ( + btfSection *elf.Section + btfExtSection *elf.Section + ) + + for _, sec := range file.Sections { + switch sec.Name { + case ".BTF": + btfSection = sec + case ".BTF.ext": + btfExtSection = sec + } + } + + if btfSection == nil { + return nil, nil + } + + spec, err := parseBTF(btfSection.Open(), file.ByteOrder) + if err != nil { + return nil, err + } + + if btfExtSection != nil { + spec.funcInfos, spec.lineInfos, err = parseExtInfos(btfExtSection.Open(), file.ByteOrder, spec.strings) + if err != nil { + return nil, errors.Wrap(err, "can't read ext info") + } + } + + return spec, nil +} + +func parseBTF(btf io.ReadSeeker, bo binary.ByteOrder) (*Spec, error) { + rawBTF, err := ioutil.ReadAll(btf) + if err != nil { + return nil, errors.Wrap(err, "can't read BTF") + } + + rd := bytes.NewReader(rawBTF) + + var header btfHeader + if err := binary.Read(rd, bo, &header); err != nil { + return nil, errors.Wrap(err, "can't read header") + } + + if header.Magic != btfMagic { + return nil, errors.Errorf("incorrect magic value %v", header.Magic) + } + + if header.Version != 1 { + return nil, errors.Errorf("unexpected version %v", header.Version) + } + + if header.Flags != 0 { + return nil, errors.Errorf("unsupported flags %v", header.Flags) + } + + remainder := int64(header.HdrLen) - int64(binary.Size(&header)) + if remainder < 0 { + return nil, errors.New("header is too short") + } + + if _, err := io.CopyN(internal.DiscardZeroes{}, rd, remainder); err != nil { + return nil, errors.Wrap(err, "header padding") + } + + if _, err := rd.Seek(int64(header.HdrLen+header.StringOff), io.SeekStart); err != nil { + return nil, errors.Wrap(err, "can't seek to start of string section") + } + + strings, err := readStringTable(io.LimitReader(rd, int64(header.StringLen))) + if err != nil { + return nil, errors.Wrap(err, "can't read 
type names") + } + + if _, err := rd.Seek(int64(header.HdrLen+header.TypeOff), io.SeekStart); err != nil { + return nil, errors.Wrap(err, "can't seek to start of type section") + } + + rawTypes, err := readTypes(io.LimitReader(rd, int64(header.TypeLen)), bo) + if err != nil { + return nil, errors.Wrap(err, "can't read types") + } + + types, err := inflateRawTypes(rawTypes, strings) + if err != nil { + return nil, err + } + + return &Spec{ + rawTypes: rawTypes, + types: types, + strings: strings, + funcInfos: make(map[string]extInfo), + lineInfos: make(map[string]extInfo), + }, nil +} + +func (s *Spec) marshal(bo binary.ByteOrder) ([]byte, error) { + var ( + buf bytes.Buffer + header = new(btfHeader) + headerLen = binary.Size(header) + ) + + // Reserve space for the header. We have to write it last since + // we don't know the size of the type section yet. + _, _ = buf.Write(make([]byte, headerLen)) + + // Write type section, just after the header. + for _, typ := range s.rawTypes { + if typ.Kind() == kindDatasec { + // Datasec requires patching with information from the ELF + // file. We don't support this at the moment, so patch + // out any Datasec by turning it into a void*. + typ = rawType{} + typ.SetKind(kindPointer) + } + + if err := typ.Marshal(&buf, bo); err != nil { + return nil, errors.Wrap(err, "can't marshal BTF") + } + } + + typeLen := uint32(buf.Len() - headerLen) + + // Write string section after type section. + _, _ = buf.Write(s.strings) + + // Fill out the header, and write it out. 
+ header = &btfHeader{ + Magic: btfMagic, + Version: 1, + Flags: 0, + HdrLen: uint32(headerLen), + TypeOff: 0, + TypeLen: typeLen, + StringOff: typeLen, + StringLen: uint32(len(s.strings)), + } + + raw := buf.Bytes() + err := binary.Write(sliceWriter(raw[:headerLen]), bo, header) + if err != nil { + return nil, errors.Wrap(err, "can't write header") + } + + return raw, nil +} + +type sliceWriter []byte + +func (sw sliceWriter) Write(p []byte) (int, error) { + if len(p) != len(sw) { + return 0, errors.New("size doesn't match") + } + + return copy(sw, p), nil +} + +// Program finds the BTF for a specific section. +// +// Length is the number of bytes in the raw BPF instruction stream. +// +// Returns an error if there is no BTF. +func (s *Spec) Program(name string, length uint64) (*Program, error) { + if length == 0 { + return nil, errors.New("length musn't be zero") + } + + funcInfos, funcOK := s.funcInfos[name] + lineInfos, lineOK := s.lineInfos[name] + + if !funcOK && !lineOK { + return nil, errors.Errorf("no BTF for program %s", name) + } + + return &Program{s, length, funcInfos, lineInfos}, nil +} + +// Map finds the BTF for a map. +// +// Returns an error if there is no BTF for the given name. +func (s *Spec) Map(name string) (*Map, error) { + var mapVar Var + if err := s.FindType(name, &mapVar); err != nil { + return nil, err + } + + mapStruct, ok := mapVar.Type.(*Struct) + if !ok { + return nil, errors.Errorf("expected struct, have %s", mapVar.Type) + } + + var key, value Type + for _, member := range mapStruct.Members { + switch member.Name { + case "key": + key = member.Type + + case "value": + value = member.Type + } + } + + if key == nil { + return nil, errors.Errorf("map %s: missing 'key' in type", name) + } + + if value == nil { + return nil, errors.Errorf("map %s: missing 'value' in type", name) + } + + return &Map{mapStruct, s, key, value}, nil +} + +var errNotFound = errors.New("not found") + +// FindType searches for a type with a specific name. 
+// +// hint determines the type of the returned Type. +// +// Returns an error if there is no or multiple matches. +func (s *Spec) FindType(name string, typ Type) error { + var ( + wanted = reflect.TypeOf(typ) + candidate Type + ) + + for _, typ := range s.types[name] { + if reflect.TypeOf(typ) != wanted { + continue + } + + if candidate != nil { + return errors.Errorf("type %s: multiple candidates for %T", name, typ) + } + + candidate = typ + } + + if candidate == nil { + return errors.WithMessagef(errNotFound, "type %s", name) + } + + value := reflect.Indirect(reflect.ValueOf(copyType(candidate))) + reflect.Indirect(reflect.ValueOf(typ)).Set(value) + return nil +} + +// Handle is a reference to BTF loaded into the kernel. +type Handle struct { + fd *internal.FD +} + +// NewHandle loads BTF into the kernel. +// +// Returns an error if BTF is not supported, which can +// be checked by IsNotSupported. +func NewHandle(spec *Spec) (*Handle, error) { + if err := haveBTF(); err != nil { + return nil, err + } + + btf, err := spec.marshal(internal.NativeEndian) + if err != nil { + return nil, errors.Wrap(err, "can't marshal BTF") + } + + if uint64(len(btf)) > math.MaxUint32 { + return nil, errors.New("BTF exceeds the maximum size") + } + + attr := &bpfLoadBTFAttr{ + btf: internal.NewSlicePointer(btf), + btfSize: uint32(len(btf)), + } + + fd, err := bpfLoadBTF(attr) + if err != nil { + logBuf := make([]byte, 64*1024) + attr.logBuf = internal.NewSlicePointer(logBuf) + attr.btfLogSize = uint32(len(logBuf)) + attr.btfLogLevel = 1 + _, logErr := bpfLoadBTF(attr) + return nil, internal.ErrorWithLog(err, logBuf, logErr) + } + + return &Handle{fd}, nil +} + +// Close destroys the handle. +// +// Subsequent calls to FD will return an invalid value. +func (h *Handle) Close() error { + return h.fd.Close() +} + +// FD returns the file descriptor for the handle. 
+func (h *Handle) FD() int { + value, err := h.fd.Value() + if err != nil { + return -1 + } + + return int(value) +} + +// Map is the BTF for a map. +type Map struct { + definition *Struct + spec *Spec + key, value Type +} + +// MapSpec should be a method on Map, but is a free function +// to hide it from users of the ebpf package. +func MapSpec(m *Map) *Spec { + return m.spec +} + +// MapType should be a method on Map, but is a free function +// to hide it from users of the ebpf package. +func MapType(m *Map) *Struct { + return m.definition +} + +// MapKey should be a method on Map, but is a free function +// to hide it from users of the ebpf package. +func MapKey(m *Map) Type { + return m.key +} + +// MapValue should be a method on Map, but is a free function +// to hide it from users of the ebpf package. +func MapValue(m *Map) Type { + return m.value +} + +// Program is the BTF information for a stream of instructions. +type Program struct { + spec *Spec + length uint64 + funcInfos, lineInfos extInfo +} + +// ProgramSpec returns the Spec needed for loading function and line infos into the kernel. +// +// This is a free function instead of a method to hide it from users +// of package ebpf. +func ProgramSpec(s *Program) *Spec { + return s.spec +} + +// ProgramAppend the information from other to the Program. +// +// This is a free function instead of a method to hide it from users +// of package ebpf. +func ProgramAppend(s, other *Program) error { + funcInfos, err := s.funcInfos.append(other.funcInfos, s.length) + if err != nil { + return errors.Wrap(err, "func infos") + } + + lineInfos, err := s.lineInfos.append(other.lineInfos, s.length) + if err != nil { + return errors.Wrap(err, "line infos") + } + + s.length += other.length + s.funcInfos = funcInfos + s.lineInfos = lineInfos + return nil +} + +// ProgramFuncInfos returns the binary form of BTF function infos. +// +// This is a free function instead of a method to hide it from users +// of package ebpf. 
+func ProgramFuncInfos(s *Program) (recordSize uint32, bytes []byte, err error) { + bytes, err = s.funcInfos.MarshalBinary() + if err != nil { + return 0, nil, err + } + + return s.funcInfos.recordSize, bytes, nil +} + +// ProgramLineInfos returns the binary form of BTF line infos. +// +// This is a free function instead of a method to hide it from users +// of package ebpf. +func ProgramLineInfos(s *Program) (recordSize uint32, bytes []byte, err error) { + bytes, err = s.lineInfos.MarshalBinary() + if err != nil { + return 0, nil, err + } + + return s.lineInfos.recordSize, bytes, nil +} + +// IsNotSupported returns true if the error indicates that the kernel +// doesn't support BTF. +func IsNotSupported(err error) bool { + ufe, ok := errors.Cause(err).(*internal.UnsupportedFeatureError) + return ok && ufe.Name == "BTF" +} + +type bpfLoadBTFAttr struct { + btf internal.Pointer + logBuf internal.Pointer + btfSize uint32 + btfLogSize uint32 + btfLogLevel uint32 +} + +func bpfLoadBTF(attr *bpfLoadBTFAttr) (*internal.FD, error) { + const _BTFLoad = 18 + + fd, err := internal.BPF(_BTFLoad, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + + return internal.NewFD(uint32(fd)), nil +} + +func minimalBTF(bo binary.ByteOrder) []byte { + const minHeaderLength = 24 + + var ( + types struct { + Integer btfType + Var btfType + btfVar struct{ Linkage uint32 } + } + typLen = uint32(binary.Size(&types)) + strings = []byte{0, 'a', 0} + header = btfHeader{ + Magic: btfMagic, + Version: 1, + HdrLen: minHeaderLength, + TypeOff: 0, + TypeLen: typLen, + StringOff: typLen, + StringLen: uint32(len(strings)), + } + ) + + // We use a BTF_KIND_VAR here, to make sure that + // the kernel understands BTF at least as well as we + // do. BTF_KIND_VAR was introduced ~5.1. 
+ types.Integer.SetKind(kindPointer) + types.Var.NameOff = 1 + types.Var.SetKind(kindVar) + types.Var.SizeType = 1 + + buf := new(bytes.Buffer) + _ = binary.Write(buf, bo, &header) + _ = binary.Write(buf, bo, &types) + buf.Write(strings) + + return buf.Bytes() +} + +var haveBTF = internal.FeatureTest("BTF", "5.1", func() bool { + btf := minimalBTF(internal.NativeEndian) + fd, err := bpfLoadBTF(&bpfLoadBTFAttr{ + btf: internal.NewSlicePointer(btf), + btfSize: uint32(len(btf)), + }) + if err == nil { + fd.Close() + } + // Check for EINVAL specifically, rather than err != nil since we + // otherwise misdetect due to insufficient permissions. + return errors.Cause(err) != unix.EINVAL +}) diff --git a/vendor/github.com/cilium/ebpf/internal/btf/btf_types.go b/vendor/github.com/cilium/ebpf/internal/btf/btf_types.go new file mode 100644 index 000000000..6570fedff --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/btf/btf_types.go @@ -0,0 +1,190 @@ +package btf + +import ( + "encoding/binary" + "io" + + "github.com/pkg/errors" +) + +// btfKind describes a Type. +type btfKind uint8 + +// Equivalents of the BTF_KIND_* constants. +const ( + kindUnknown btfKind = iota + kindInt + kindPointer + kindArray + kindStruct + kindUnion + kindEnum + kindForward + kindTypedef + kindVolatile + kindConst + kindRestrict + // Added ~4.20 + kindFunc + kindFuncProto + // Added ~5.1 + kindVar + kindDatasec +) + +const ( + btfTypeKindShift = 24 + btfTypeKindLen = 4 + btfTypeVlenShift = 0 + btfTypeVlenMask = 16 +) + +// btfType is equivalent to struct btf_type in Documentation/bpf/btf.rst. +type btfType struct { + NameOff uint32 + /* "info" bits arrangement + * bits 0-15: vlen (e.g. # of struct's members) + * bits 16-23: unused + * bits 24-27: kind (e.g. int, ptr, array...etc) + * bits 28-30: unused + * bit 31: kind_flag, currently used by + * struct, union and fwd + */ + Info uint32 + /* "size" is used by INT, ENUM, STRUCT and UNION. 
+ * "size" tells the size of the type it is describing. + * + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, + * FUNC and FUNC_PROTO. + * "type" is a type_id referring to another type. + */ + SizeType uint32 +} + +func mask(len uint32) uint32 { + return (1 << len) - 1 +} + +func (bt *btfType) info(len, shift uint32) uint32 { + return (bt.Info >> shift) & mask(len) +} + +func (bt *btfType) setInfo(value, len, shift uint32) { + bt.Info &^= mask(len) << shift + bt.Info |= (value & mask(len)) << shift +} + +func (bt *btfType) Kind() btfKind { + return btfKind(bt.info(btfTypeKindLen, btfTypeKindShift)) +} + +func (bt *btfType) SetKind(kind btfKind) { + bt.setInfo(uint32(kind), btfTypeKindLen, btfTypeKindShift) +} + +func (bt *btfType) Vlen() int { + return int(bt.info(btfTypeVlenMask, btfTypeVlenShift)) +} + +func (bt *btfType) SetVlen(vlen int) { + bt.setInfo(uint32(vlen), btfTypeVlenMask, btfTypeVlenShift) +} + +func (bt *btfType) Type() TypeID { + // TODO: Panic here if wrong kind? + return TypeID(bt.SizeType) +} + +func (bt *btfType) Size() uint32 { + // TODO: Panic here if wrong kind? 
+ return bt.SizeType +} + +type rawType struct { + btfType + data interface{} +} + +func (rt *rawType) Marshal(w io.Writer, bo binary.ByteOrder) error { + if err := binary.Write(w, bo, &rt.btfType); err != nil { + return err + } + + if rt.data == nil { + return nil + } + + return binary.Write(w, bo, rt.data) +} + +type btfArray struct { + Type TypeID + IndexType TypeID + Nelems uint32 +} + +type btfMember struct { + NameOff uint32 + Type TypeID + Offset uint32 +} + +func readTypes(r io.Reader, bo binary.ByteOrder) ([]rawType, error) { + var ( + header btfType + types []rawType + ) + + for id := TypeID(1); ; id++ { + if err := binary.Read(r, bo, &header); err == io.EOF { + return types, nil + } else if err != nil { + return nil, errors.Wrapf(err, "can't read type info for id %v", id) + } + + var data interface{} + switch header.Kind() { + case kindInt: + // sizeof(uint32) + data = make([]byte, 4) + case kindPointer: + case kindArray: + data = new(btfArray) + case kindStruct: + fallthrough + case kindUnion: + data = make([]btfMember, header.Vlen()) + case kindEnum: + // sizeof(struct btf_enum) + data = make([]byte, header.Vlen()*4*2) + case kindForward: + case kindTypedef: + case kindVolatile: + case kindConst: + case kindRestrict: + case kindFunc: + case kindFuncProto: + // sizeof(struct btf_param) + data = make([]byte, header.Vlen()*4*2) + case kindVar: + // sizeof(struct btf_variable) + data = make([]byte, 4) + case kindDatasec: + // sizeof(struct btf_var_secinfo) + data = make([]byte, header.Vlen()*4*3) + default: + return nil, errors.Errorf("type id %v: unknown kind: %v", id, header.Kind()) + } + + if data == nil { + types = append(types, rawType{header, nil}) + continue + } + + if err := binary.Read(r, bo, data); err != nil { + return nil, errors.Wrapf(err, "type id %d: kind %v: can't read %T", id, header.Kind(), data) + } + + types = append(types, rawType{header, data}) + } +} diff --git a/vendor/github.com/cilium/ebpf/internal/btf/doc.go 
b/vendor/github.com/cilium/ebpf/internal/btf/doc.go new file mode 100644 index 000000000..ad2576cb2 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/btf/doc.go @@ -0,0 +1,8 @@ +// Package btf handles data encoded according to the BPF Type Format. +// +// The canonical documentation lives in the Linux kernel repository and is +// available at https://www.kernel.org/doc/html/latest/bpf/btf.html +// +// The API is very much unstable. You should only use this via the main +// ebpf library. +package btf diff --git a/vendor/github.com/cilium/ebpf/internal/btf/ext_info.go b/vendor/github.com/cilium/ebpf/internal/btf/ext_info.go new file mode 100644 index 000000000..ab019cac7 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/btf/ext_info.go @@ -0,0 +1,184 @@ +package btf + +import ( + "bytes" + "encoding/binary" + "io" + "io/ioutil" + + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/internal" + + "github.com/pkg/errors" +) + +type btfExtHeader struct { + Magic uint16 + Version uint8 + Flags uint8 + HdrLen uint32 + + FuncInfoOff uint32 + FuncInfoLen uint32 + LineInfoOff uint32 + LineInfoLen uint32 +} + +func parseExtInfos(r io.ReadSeeker, bo binary.ByteOrder, strings stringTable) (funcInfo, lineInfo map[string]extInfo, err error) { + const expectedMagic = 0xeB9F + + var header btfExtHeader + if err := binary.Read(r, bo, &header); err != nil { + return nil, nil, errors.Wrap(err, "can't read header") + } + + if header.Magic != expectedMagic { + return nil, nil, errors.Errorf("incorrect magic value %v", header.Magic) + } + + if header.Version != 1 { + return nil, nil, errors.Errorf("unexpected version %v", header.Version) + } + + if header.Flags != 0 { + return nil, nil, errors.Errorf("unsupported flags %v", header.Flags) + } + + remainder := int64(header.HdrLen) - int64(binary.Size(&header)) + if remainder < 0 { + return nil, nil, errors.New("header is too short") + } + + // Of course, the .BTF.ext header has different semantics than the + // .BTF 
ext header. We need to ignore non-null values. + _, err = io.CopyN(ioutil.Discard, r, remainder) + if err != nil { + return nil, nil, errors.Wrap(err, "header padding") + } + + if _, err := r.Seek(int64(header.HdrLen+header.FuncInfoOff), io.SeekStart); err != nil { + return nil, nil, errors.Wrap(err, "can't seek to function info section") + } + + funcInfo, err = parseExtInfo(io.LimitReader(r, int64(header.FuncInfoLen)), bo, strings) + if err != nil { + return nil, nil, errors.Wrap(err, "function info") + } + + if _, err := r.Seek(int64(header.HdrLen+header.LineInfoOff), io.SeekStart); err != nil { + return nil, nil, errors.Wrap(err, "can't seek to line info section") + } + + lineInfo, err = parseExtInfo(io.LimitReader(r, int64(header.LineInfoLen)), bo, strings) + if err != nil { + return nil, nil, errors.Wrap(err, "line info") + } + + return funcInfo, lineInfo, nil +} + +type btfExtInfoSec struct { + SecNameOff uint32 + NumInfo uint32 +} + +type extInfoRecord struct { + InsnOff uint64 + Opaque []byte +} + +type extInfo struct { + recordSize uint32 + records []extInfoRecord +} + +func (ei extInfo) append(other extInfo, offset uint64) (extInfo, error) { + if other.recordSize != ei.recordSize { + return extInfo{}, errors.Errorf("ext_info record size mismatch, want %d (got %d)", ei.recordSize, other.recordSize) + } + + records := make([]extInfoRecord, 0, len(ei.records)+len(other.records)) + records = append(records, ei.records...) + for _, info := range other.records { + records = append(records, extInfoRecord{ + InsnOff: info.InsnOff + offset, + Opaque: info.Opaque, + }) + } + return extInfo{ei.recordSize, records}, nil +} + +func (ei extInfo) MarshalBinary() ([]byte, error) { + if len(ei.records) == 0 { + return nil, nil + } + + buf := bytes.NewBuffer(make([]byte, 0, int(ei.recordSize)*len(ei.records))) + for _, info := range ei.records { + // The kernel expects offsets in number of raw bpf instructions, + // while the ELF tracks it in bytes. 
+ insnOff := uint32(info.InsnOff / asm.InstructionSize) + if err := binary.Write(buf, internal.NativeEndian, insnOff); err != nil { + return nil, errors.Wrap(err, "can't write instruction offset") + } + + buf.Write(info.Opaque) + } + + return buf.Bytes(), nil +} + +func parseExtInfo(r io.Reader, bo binary.ByteOrder, strings stringTable) (map[string]extInfo, error) { + var recordSize uint32 + if err := binary.Read(r, bo, &recordSize); err != nil { + return nil, errors.Wrap(err, "can't read record size") + } + + if recordSize < 4 { + // Need at least insnOff + return nil, errors.New("record size too short") + } + + result := make(map[string]extInfo) + for { + var infoHeader btfExtInfoSec + if err := binary.Read(r, bo, &infoHeader); err == io.EOF { + return result, nil + } else if err != nil { + return nil, errors.Wrap(err, "can't read ext info header") + } + + secName, err := strings.Lookup(infoHeader.SecNameOff) + if err != nil { + return nil, errors.Wrap(err, "can't get section name") + } + + if infoHeader.NumInfo == 0 { + return nil, errors.Errorf("section %s has invalid number of records", secName) + } + + var records []extInfoRecord + for i := uint32(0); i < infoHeader.NumInfo; i++ { + var byteOff uint32 + if err := binary.Read(r, bo, &byteOff); err != nil { + return nil, errors.Wrapf(err, "section %v: can't read extended info offset", secName) + } + + buf := make([]byte, int(recordSize-4)) + if _, err := io.ReadFull(r, buf); err != nil { + return nil, errors.Wrapf(err, "section %v: can't read record", secName) + } + + if byteOff%asm.InstructionSize != 0 { + return nil, errors.Errorf("section %v: offset %v is not aligned with instruction size", secName, byteOff) + } + + records = append(records, extInfoRecord{uint64(byteOff), buf}) + } + + result[secName] = extInfo{ + recordSize, + records, + } + } +} diff --git a/vendor/github.com/cilium/ebpf/internal/btf/strings.go b/vendor/github.com/cilium/ebpf/internal/btf/strings.go new file mode 100644 index 
000000000..c0337649e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/btf/strings.go @@ -0,0 +1,60 @@ +package btf + +import ( + "bytes" + "io" + "io/ioutil" + + "github.com/pkg/errors" +) + +type stringTable []byte + +func readStringTable(r io.Reader) (stringTable, error) { + contents, err := ioutil.ReadAll(r) + if err != nil { + return nil, errors.Wrap(err, "can't read string table") + } + + if len(contents) < 1 { + return nil, errors.New("string table is empty") + } + + if contents[0] != '\x00' { + return nil, errors.New("first item in string table is non-empty") + } + + if contents[len(contents)-1] != '\x00' { + return nil, errors.New("string table isn't null terminated") + } + + return stringTable(contents), nil +} + +func (st stringTable) Lookup(offset uint32) (string, error) { + if int64(offset) > int64(^uint(0)>>1) { + return "", errors.Errorf("offset %d overflows int", offset) + } + + pos := int(offset) + if pos >= len(st) { + return "", errors.Errorf("offset %d is out of bounds", offset) + } + + if pos > 0 && st[pos-1] != '\x00' { + return "", errors.Errorf("offset %d isn't start of a string", offset) + } + + str := st[pos:] + end := bytes.IndexByte(str, '\x00') + if end == -1 { + return "", errors.Errorf("offset %d isn't null terminated", offset) + } + + return string(str[:end]), nil +} + +func (st stringTable) LookupName(offset uint32) (Name, error) { + str, err := st.Lookup(offset) + return Name(str), err +} diff --git a/vendor/github.com/cilium/ebpf/internal/btf/types.go b/vendor/github.com/cilium/ebpf/internal/btf/types.go new file mode 100644 index 000000000..c49cb8621 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/btf/types.go @@ -0,0 +1,550 @@ +package btf + +import ( + "math" + + "github.com/pkg/errors" +) + +const maxTypeDepth = 32 + +// TypeID identifies a type in a BTF section. +type TypeID uint32 + +// ID implements part of the Type interface. 
+func (tid TypeID) ID() TypeID { + return tid +} + +// Type represents a type described by BTF. +type Type interface { + ID() TypeID + + // Make a copy of the type, without copying Type members. + copy() Type + + walk(*copyStack) +} + +// Name identifies a type. +// +// Anonymous types have an empty name. +type Name string + +func (n Name) name() string { + return string(n) +} + +// Void is the unit type of BTF. +type Void struct{} + +func (v Void) ID() TypeID { return 0 } +func (v Void) copy() Type { return Void{} } +func (v Void) walk(*copyStack) {} + +// Int is an integer of a given length. +type Int struct { + TypeID + Name + + // The size of the integer in bytes. + Size uint32 +} + +func (i *Int) size() uint32 { return i.Size } +func (i *Int) walk(*copyStack) {} +func (i *Int) copy() Type { + cpy := *i + return &cpy +} + +// Pointer is a pointer to another type. +type Pointer struct { + TypeID + Target Type +} + +func (p *Pointer) size() uint32 { return 8 } +func (p *Pointer) walk(cs *copyStack) { cs.push(&p.Target) } +func (p *Pointer) copy() Type { + cpy := *p + return &cpy +} + +// Array is an array with a fixed number of elements. +type Array struct { + TypeID + Type Type + Nelems uint32 +} + +func (arr *Array) walk(cs *copyStack) { cs.push(&arr.Type) } +func (arr *Array) copy() Type { + cpy := *arr + return &cpy +} + +// Struct is a compound type of consecutive members. +type Struct struct { + TypeID + Name + // The size of the struct including padding, in bytes + Size uint32 + Members []Member +} + +func (s *Struct) size() uint32 { return s.Size } + +func (s *Struct) walk(cs *copyStack) { + for i := range s.Members { + cs.push(&s.Members[i].Type) + } +} + +func (s *Struct) copy() Type { + cpy := *s + cpy.Members = copyMembers(cpy.Members) + return &cpy +} + +// Union is a compound type where members occupy the same memory. +type Union struct { + TypeID + Name + // The size of the union including padding, in bytes. 
+ Size uint32 + Members []Member +} + +func (u *Union) size() uint32 { return u.Size } + +func (u *Union) walk(cs *copyStack) { + for i := range u.Members { + cs.push(&u.Members[i].Type) + } +} + +func (u *Union) copy() Type { + cpy := *u + cpy.Members = copyMembers(cpy.Members) + return &cpy +} + +// Member is part of a Struct or Union. +// +// It is not a valid Type. +type Member struct { + Name + Type Type + Offset uint32 +} + +func copyMembers(in []Member) []Member { + cpy := make([]Member, 0, len(in)) + for _, member := range in { + cpy = append(cpy, member) + } + return cpy +} + +// Enum lists possible values. +type Enum struct { + TypeID + Name +} + +func (e *Enum) size() uint32 { return 4 } +func (e *Enum) walk(*copyStack) {} +func (e *Enum) copy() Type { + cpy := *e + return &cpy +} + +// Fwd is a forward declaration of a Type. +type Fwd struct { + TypeID + Name +} + +func (f *Fwd) walk(*copyStack) {} +func (f *Fwd) copy() Type { + cpy := *f + return &cpy +} + +// Typedef is an alias of a Type. +type Typedef struct { + TypeID + Name + Type Type +} + +func (td *Typedef) walk(cs *copyStack) { cs.push(&td.Type) } +func (td *Typedef) copy() Type { + cpy := *td + return &cpy +} + +// Volatile is a modifier. +type Volatile struct { + TypeID + Type Type +} + +func (v *Volatile) walk(cs *copyStack) { cs.push(&v.Type) } +func (v *Volatile) copy() Type { + cpy := *v + return &cpy +} + +// Const is a modifier. +type Const struct { + TypeID + Type Type +} + +func (c *Const) walk(cs *copyStack) { cs.push(&c.Type) } +func (c *Const) copy() Type { + cpy := *c + return &cpy +} + +// Restrict is a modifier. +type Restrict struct { + TypeID + Type Type +} + +func (r *Restrict) walk(cs *copyStack) { cs.push(&r.Type) } +func (r *Restrict) copy() Type { + cpy := *r + return &cpy +} + +// Func is a function definition. 
+type Func struct { + TypeID + Name + Type Type +} + +func (f *Func) walk(cs *copyStack) { cs.push(&f.Type) } +func (f *Func) copy() Type { + cpy := *f + return &cpy +} + +// FuncProto is a function declaration. +type FuncProto struct { + TypeID + Return Type + // Parameters not supported yet +} + +func (fp *FuncProto) walk(cs *copyStack) { cs.push(&fp.Return) } +func (fp *FuncProto) copy() Type { + cpy := *fp + return &cpy +} + +// Var is a global variable. +type Var struct { + TypeID + Name + Type Type +} + +func (v *Var) walk(cs *copyStack) { cs.push(&v.Type) } +func (v *Var) copy() Type { + cpy := *v + return &cpy +} + +// Datasec is a global program section containing data. +type Datasec struct { + TypeID + Name + Size uint32 +} + +func (ds *Datasec) size() uint32 { return ds.Size } +func (ds *Datasec) walk(*copyStack) {} +func (ds *Datasec) copy() Type { + cpy := *ds + return &cpy +} + +type sizer interface { + size() uint32 +} + +var ( + _ sizer = (*Int)(nil) + _ sizer = (*Pointer)(nil) + _ sizer = (*Struct)(nil) + _ sizer = (*Union)(nil) + _ sizer = (*Enum)(nil) + _ sizer = (*Datasec)(nil) +) + +// Sizeof returns the size of a type in bytes. +// +// Returns an error if the size can't be computed. +func Sizeof(typ Type) (int, error) { + var ( + n = int64(1) + elem int64 + ) + + for i := 0; i < maxTypeDepth; i++ { + switch v := typ.(type) { + case *Array: + if n > 0 && int64(v.Nelems) > math.MaxInt64/n { + return 0, errors.New("overflow") + } + + // Arrays may be of zero length, which allows + // n to be zero as well. 
+ n *= int64(v.Nelems) + typ = v.Type + continue + + case sizer: + elem = int64(v.size()) + + case *Typedef: + typ = v.Type + continue + case *Volatile: + typ = v.Type + continue + case *Const: + typ = v.Type + continue + case *Restrict: + typ = v.Type + continue + + default: + return 0, errors.Errorf("unrecognized type %T", typ) + } + + if n > 0 && elem > math.MaxInt64/n { + return 0, errors.New("overflow") + } + + size := n * elem + if int64(int(size)) != size { + return 0, errors.New("overflow") + } + + return int(size), nil + } + + return 0, errors.New("exceeded type depth") +} + +// copy a Type recursively. +// +// typ may form a cycle. +func copyType(typ Type) Type { + var ( + copies = make(map[Type]Type) + work copyStack + ) + + for t := &typ; t != nil; t = work.pop() { + // *t is the identity of the type. + if cpy := copies[*t]; cpy != nil { + *t = cpy + continue + } + + cpy := (*t).copy() + copies[*t] = cpy + *t = cpy + + // Mark any nested types for copying. + cpy.walk(&work) + } + + return typ +} + +// copyStack keeps track of pointers to types which still +// need to be copied. +type copyStack []*Type + +// push adds a type to the stack. +func (cs *copyStack) push(t *Type) { + *cs = append(*cs, t) +} + +// pop returns the topmost Type, or nil. +func (cs *copyStack) pop() *Type { + n := len(*cs) + if n == 0 { + return nil + } + + t := (*cs)[n-1] + *cs = (*cs)[:n-1] + return t +} + +type namer interface { + name() string +} + +var _ namer = Name("") + +// inflateRawTypes takes a list of raw btf types linked via type IDs, and turns +// it into a graph of Types connected via pointers. +// +// Returns a map of named types (so, where NameOff is non-zero). Since BTF ignores +// compilation units, multiple types may share the same name. A Type may form a +// cyclic graph by pointing at itself. 
+func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (namedTypes map[string][]Type, err error) { + type fixup struct { + id TypeID + typ *Type + } + + var fixups []fixup + convertMembers := func(raw []btfMember) ([]Member, error) { + // NB: The fixup below relies on pre-allocating this array to + // work, since otherwise append might re-allocate members. + members := make([]Member, 0, len(raw)) + for i, btfMember := range raw { + name, err := rawStrings.LookupName(btfMember.NameOff) + if err != nil { + return nil, errors.Wrapf(err, "can't get name for member %d", i) + } + members = append(members, Member{ + Name: name, + Offset: btfMember.Offset, + }) + } + for i := range members { + fixups = append(fixups, fixup{raw[i].Type, &members[i].Type}) + } + return members, nil + } + + types := make([]Type, 0, len(rawTypes)) + types = append(types, Void{}) + namedTypes = make(map[string][]Type) + + for i, raw := range rawTypes { + var ( + // Void is defined to always be type ID 0, and is thus + // omitted from BTF. + id = TypeID(i + 1) + typ Type + ) + + name, err := rawStrings.LookupName(raw.NameOff) + if err != nil { + return nil, errors.Wrapf(err, "can't get name for type id %d", id) + } + + switch raw.Kind() { + case kindInt: + typ = &Int{id, name, raw.Size()} + + case kindPointer: + ptr := &Pointer{id, nil} + fixups = append(fixups, fixup{raw.Type(), &ptr.Target}) + typ = ptr + + case kindArray: + btfArr := raw.data.(*btfArray) + + // IndexType is unused according to btf.rst. + // Don't make it available right now. 
+ arr := &Array{id, nil, btfArr.Nelems} + fixups = append(fixups, fixup{btfArr.Type, &arr.Type}) + typ = arr + + case kindStruct: + members, err := convertMembers(raw.data.([]btfMember)) + if err != nil { + return nil, errors.Wrapf(err, "struct %s (id %d)", name, id) + } + typ = &Struct{id, name, raw.Size(), members} + + case kindUnion: + members, err := convertMembers(raw.data.([]btfMember)) + if err != nil { + return nil, errors.Wrapf(err, "union %s (id %d)", name, id) + } + typ = &Union{id, name, raw.Size(), members} + + case kindEnum: + typ = &Enum{id, name} + + case kindForward: + typ = &Fwd{id, name} + + case kindTypedef: + typedef := &Typedef{id, name, nil} + fixups = append(fixups, fixup{raw.Type(), &typedef.Type}) + typ = typedef + + case kindVolatile: + volatile := &Volatile{id, nil} + fixups = append(fixups, fixup{raw.Type(), &volatile.Type}) + typ = volatile + + case kindConst: + cnst := &Const{id, nil} + fixups = append(fixups, fixup{raw.Type(), &cnst.Type}) + typ = cnst + + case kindRestrict: + restrict := &Restrict{id, nil} + fixups = append(fixups, fixup{raw.Type(), &restrict.Type}) + typ = restrict + + case kindFunc: + fn := &Func{id, name, nil} + fixups = append(fixups, fixup{raw.Type(), &fn.Type}) + typ = fn + + case kindFuncProto: + fp := &FuncProto{id, nil} + fixups = append(fixups, fixup{raw.Type(), &fp.Return}) + typ = fp + + case kindVar: + v := &Var{id, name, nil} + fixups = append(fixups, fixup{raw.Type(), &v.Type}) + typ = v + + case kindDatasec: + typ = &Datasec{id, name, raw.SizeType} + + default: + return nil, errors.Errorf("type id %d: unknown kind: %v", id, raw.Kind()) + } + + types = append(types, typ) + + if namer, ok := typ.(namer); ok { + if name := namer.name(); name != "" { + namedTypes[name] = append(namedTypes[name], typ) + } + } + } + + for _, fixup := range fixups { + i := int(fixup.id) + if i >= len(types) { + return nil, errors.Errorf("reference to invalid type id: %d", fixup.id) + } + + *fixup.typ = types[i] + } + + 
return namedTypes, nil +} diff --git a/vendor/github.com/cilium/ebpf/internal/cpu.go b/vendor/github.com/cilium/ebpf/internal/cpu.go new file mode 100644 index 000000000..ce3cab730 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/cpu.go @@ -0,0 +1,64 @@ +package internal + +import ( + "fmt" + "os" + "sync" + + "github.com/pkg/errors" +) + +var sysCPU struct { + once sync.Once + err error + num int +} + +// PossibleCPUs returns the max number of CPUs a system may possibly have +// Logical CPU numbers must be of the form 0-n +func PossibleCPUs() (int, error) { + sysCPU.once.Do(func() { + sysCPU.num, sysCPU.err = parseCPUs("/sys/devices/system/cpu/possible") + }) + + return sysCPU.num, sysCPU.err +} + +var onlineCPU struct { + once sync.Once + err error + num int +} + +// OnlineCPUs returns the number of currently online CPUs +// Logical CPU numbers must be of the form 0-n +func OnlineCPUs() (int, error) { + onlineCPU.once.Do(func() { + onlineCPU.num, onlineCPU.err = parseCPUs("/sys/devices/system/cpu/online") + }) + + return onlineCPU.num, onlineCPU.err +} + +// parseCPUs parses the number of cpus from sysfs, +// in the format of "/sys/devices/system/cpu/{possible,online,..}. 
+// Logical CPU numbers must be of the form 0-n +func parseCPUs(path string) (int, error) { + file, err := os.Open(path) + if err != nil { + return 0, err + } + defer file.Close() + + var low, high int + n, _ := fmt.Fscanf(file, "%d-%d", &low, &high) + if n < 1 || low != 0 { + return 0, errors.Wrapf(err, "%s has unknown format", path) + } + if n == 1 { + high = low + } + + // cpus is 0 indexed + return high + 1, nil +} diff --git a/vendor/github.com/cilium/ebpf/internal/endian.go b/vendor/github.com/cilium/ebpf/internal/endian.go new file mode 100644 index 000000000..ac8a94e51 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/endian.go @@ -0,0 +1,24 @@ +package internal + +import ( + "encoding/binary" + "unsafe" +) + +// NativeEndian is set to either binary.BigEndian or binary.LittleEndian, +// depending on the host's endianness. +var NativeEndian binary.ByteOrder + +func init() { + if isBigEndian() { + NativeEndian = binary.BigEndian + } else { + NativeEndian = binary.LittleEndian + } +} + +func isBigEndian() (ret bool) { + i := int(0x1) + bs := (*[int(unsafe.Sizeof(i))]byte)(unsafe.Pointer(&i)) + return bs[0] == 0 +} diff --git a/vendor/github.com/cilium/ebpf/internal/errors.go b/vendor/github.com/cilium/ebpf/internal/errors.go new file mode 100644 index 000000000..9590fe84e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/errors.go @@ -0,0 +1,50 @@ +package internal + +import ( + "bytes" + "fmt" + "strings" + + "github.com/cilium/ebpf/internal/unix" + "github.com/pkg/errors" +) + +// ErrorWithLog returns an error that includes logs from the +// kernel verifier. +// +// logErr should be the error returned by the syscall that generated +// the log. It is used to check for truncation of the output. 
// CString converts a NUL / zero terminated byte buffer into a string.
//
// The result holds the bytes preceding the first zero byte. A buffer that
// contains no zero byte at all yields the empty string.
func CString(in []byte) string {
	if end := bytes.IndexByte(in, 0); end >= 0 {
		return string(in[:end])
	}
	return ""
}
a/vendor/github.com/cilium/ebpf/internal/feature.go b/vendor/github.com/cilium/ebpf/internal/feature.go new file mode 100644 index 000000000..f7497d37f --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/feature.go @@ -0,0 +1,85 @@ +package internal + +import ( + "fmt" + "sync" + + "github.com/pkg/errors" +) + +// UnsupportedFeatureError is returned by FeatureTest() functions. +type UnsupportedFeatureError struct { + // The minimum Linux mainline version required for this feature. + // Used for the error string, and for sanity checking during testing. + MinimumVersion Version + + // The name of the feature that isn't supported. + Name string +} + +func (ufe *UnsupportedFeatureError) Error() string { + return fmt.Sprintf("%s not supported (requires >= %s)", ufe.Name, ufe.MinimumVersion) +} + +// FeatureTest wraps a function so that it is run at most once. +// +// name should identify the tested feature, while version must be in the +// form Major.Minor[.Patch]. +// +// Returns a descriptive UnsupportedFeatureError if the feature is not available. +func FeatureTest(name, version string, fn func() bool) func() error { + v, err := NewVersion(version) + if err != nil { + return func() error { return err } + } + + var ( + once sync.Once + result error + ) + + return func() error { + once.Do(func() { + if !fn() { + result = &UnsupportedFeatureError{ + MinimumVersion: v, + Name: name, + } + } + }) + return result + } +} + +// A Version in the form Major.Minor.Patch. +type Version [3]uint16 + +// NewVersion creates a version from a string like "Major.Minor.Patch". +// +// Patch is optional. 
+func NewVersion(ver string) (Version, error) { + var major, minor, patch uint16 + n, _ := fmt.Sscanf(ver, "%d.%d.%d", &major, &minor, &patch) + if n < 2 { + return Version{}, errors.Errorf("invalid version: %s", ver) + } + return Version{major, minor, patch}, nil +} + +func (v Version) String() string { + if v[2] == 0 { + return fmt.Sprintf("v%d.%d", v[0], v[1]) + } + return fmt.Sprintf("v%d.%d.%d", v[0], v[1], v[2]) +} + +// Less returns true if the version is less than another version. +func (v Version) Less(other Version) bool { + for i, a := range v { + if a == other[i] { + continue + } + return a < other[i] + } + return false +} diff --git a/vendor/github.com/cilium/ebpf/internal/io.go b/vendor/github.com/cilium/ebpf/internal/io.go new file mode 100644 index 000000000..756e86119 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/io.go @@ -0,0 +1,16 @@ +package internal + +import "github.com/pkg/errors" + +// DiscardZeroes makes sure that all written bytes are zero +// before discarding them. +type DiscardZeroes struct{} + +func (DiscardZeroes) Write(p []byte) (int, error) { + for _, b := range p { + if b != 0 { + return 0, errors.New("encountered non-zero byte") + } + } + return len(p), nil +} diff --git a/vendor/github.com/cilium/ebpf/internal/ptr.go b/vendor/github.com/cilium/ebpf/internal/ptr.go new file mode 100644 index 000000000..e79b5aa00 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/ptr.go @@ -0,0 +1,26 @@ +package internal + +import "unsafe" + +// NewPointer creates a 64-bit pointer from an unsafe Pointer. +func NewPointer(ptr unsafe.Pointer) Pointer { + return Pointer{ptr: ptr} +} + +// NewSlicePointer creates a 64-bit pointer from a byte slice. +func NewSlicePointer(buf []byte) Pointer { + if len(buf) == 0 { + return Pointer{} + } + + return Pointer{ptr: unsafe.Pointer(&buf[0])} +} + +// NewStringPointer creates a 64-bit pointer from a string. 
+func NewStringPointer(str string) Pointer { + if str == "" { + return Pointer{} + } + + return Pointer{ptr: unsafe.Pointer(&[]byte(str)[0])} +} diff --git a/vendor/github.com/cilium/ebpf/internal/ptr_32_be.go b/vendor/github.com/cilium/ebpf/internal/ptr_32_be.go new file mode 100644 index 000000000..a56fbcc8e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/ptr_32_be.go @@ -0,0 +1,14 @@ +// +build armbe mips mips64p32 + +package internal + +import ( + "unsafe" +) + +// Pointer wraps an unsafe.Pointer to be 64bit to +// conform to the syscall specification. +type Pointer struct { + pad uint32 + ptr unsafe.Pointer +} diff --git a/vendor/github.com/cilium/ebpf/internal/ptr_32_le.go b/vendor/github.com/cilium/ebpf/internal/ptr_32_le.go new file mode 100644 index 000000000..be2ecfca7 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/ptr_32_le.go @@ -0,0 +1,14 @@ +// +build 386 amd64p32 arm mipsle mips64p32le + +package internal + +import ( + "unsafe" +) + +// Pointer wraps an unsafe.Pointer to be 64bit to +// conform to the syscall specification. +type Pointer struct { + ptr unsafe.Pointer + pad uint32 +} diff --git a/vendor/github.com/cilium/ebpf/internal/ptr_64.go b/vendor/github.com/cilium/ebpf/internal/ptr_64.go new file mode 100644 index 000000000..69452dceb --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/ptr_64.go @@ -0,0 +1,14 @@ +// +build !386,!amd64p32,!arm,!mipsle,!mips64p32le +// +build !armbe,!mips,!mips64p32 + +package internal + +import ( + "unsafe" +) + +// Pointer wraps an unsafe.Pointer to be 64bit to +// conform to the syscall specification. 
+type Pointer struct { + ptr unsafe.Pointer +} diff --git a/vendor/github.com/cilium/ebpf/internal/syscall.go b/vendor/github.com/cilium/ebpf/internal/syscall.go new file mode 100644 index 000000000..b32cf3bce --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/syscall.go @@ -0,0 +1,23 @@ +package internal + +import ( + "runtime" + "unsafe" + + "github.com/cilium/ebpf/internal/unix" +) + +// BPF wraps SYS_BPF. +// +// Any pointers contained in attr must use the Pointer type from this package. +func BPF(cmd int, attr unsafe.Pointer, size uintptr) (uintptr, error) { + r1, _, errNo := unix.Syscall(unix.SYS_BPF, uintptr(cmd), uintptr(attr), size) + runtime.KeepAlive(attr) + + var err error + if errNo != 0 { + err = errNo + } + + return r1, err +} diff --git a/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go b/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go new file mode 100644 index 000000000..169309667 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go @@ -0,0 +1,127 @@ +// +build linux + +package unix + +import ( + "syscall" + + linux "golang.org/x/sys/unix" +) + +const ( + ENOENT = linux.ENOENT + EAGAIN = linux.EAGAIN + ENOSPC = linux.ENOSPC + EINVAL = linux.EINVAL + EPOLLIN = linux.EPOLLIN + BPF_OBJ_NAME_LEN = linux.BPF_OBJ_NAME_LEN + BPF_TAG_SIZE = linux.BPF_TAG_SIZE + SYS_BPF = linux.SYS_BPF + F_DUPFD_CLOEXEC = linux.F_DUPFD_CLOEXEC + EPOLL_CTL_ADD = linux.EPOLL_CTL_ADD + EPOLL_CLOEXEC = linux.EPOLL_CLOEXEC + O_CLOEXEC = linux.O_CLOEXEC + O_NONBLOCK = linux.O_NONBLOCK + PROT_READ = linux.PROT_READ + PROT_WRITE = linux.PROT_WRITE + MAP_SHARED = linux.MAP_SHARED + PERF_TYPE_SOFTWARE = linux.PERF_TYPE_SOFTWARE + PERF_COUNT_SW_BPF_OUTPUT = linux.PERF_COUNT_SW_BPF_OUTPUT + PerfBitWatermark = linux.PerfBitWatermark + PERF_SAMPLE_RAW = linux.PERF_SAMPLE_RAW + PERF_FLAG_FD_CLOEXEC = linux.PERF_FLAG_FD_CLOEXEC + RLIM_INFINITY = linux.RLIM_INFINITY +) + +// Statfs_t is a wrapper +type Statfs_t = linux.Statfs_t + +// 
Rlimit is a wrapper +type Rlimit = linux.Rlimit + +// Setrlimit is a wrapper +func Setrlimit(resource int, rlim *Rlimit) (err error) { + return linux.Setrlimit(resource, rlim) +} + +// Syscall is a wrapper +func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) { + return linux.Syscall(trap, a1, a2, a3) +} + +// FcntlInt is a wrapper +func FcntlInt(fd uintptr, cmd, arg int) (int, error) { + return linux.FcntlInt(fd, cmd, arg) +} + +// Statfs is a wrapper +func Statfs(path string, buf *Statfs_t) (err error) { + return linux.Statfs(path, buf) +} + +// Close is a wrapper +func Close(fd int) (err error) { + return linux.Close(fd) +} + +// EpollEvent is a wrapper +type EpollEvent = linux.EpollEvent + +// EpollWait is a wrapper +func EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) { + return linux.EpollWait(epfd, events, msec) +} + +// EpollCtl is a wrapper +func EpollCtl(epfd int, op int, fd int, event *EpollEvent) (err error) { + return linux.EpollCtl(epfd, op, fd, event) +} + +// Eventfd is a wrapper +func Eventfd(initval uint, flags int) (fd int, err error) { + return linux.Eventfd(initval, flags) +} + +// Write is a wrapper +func Write(fd int, p []byte) (n int, err error) { + return linux.Write(fd, p) +} + +// EpollCreate1 is a wrapper +func EpollCreate1(flag int) (fd int, err error) { + return linux.EpollCreate1(flag) +} + +// PerfEventMmapPage is a wrapper +type PerfEventMmapPage linux.PerfEventMmapPage + +// SetNonblock is a wrapper +func SetNonblock(fd int, nonblocking bool) (err error) { + return linux.SetNonblock(fd, nonblocking) +} + +// Mmap is a wrapper +func Mmap(fd int, offset int64, length int, prot int, flags int) (data []byte, err error) { + return linux.Mmap(fd, offset, length, prot, flags) +} + +// Munmap is a wrapper +func Munmap(b []byte) (err error) { + return linux.Munmap(b) +} + +// PerfEventAttr is a wrapper +type PerfEventAttr = linux.PerfEventAttr + +// PerfEventOpen is a wrapper +func 
// Constants for platforms other than Linux.
//
// The set of names has to match the constants exported by types_linux.go,
// otherwise code referring to them fails to compile on other platforms.
// The errno values come from the syscall package, the rest are fixed
// numeric stand-ins.
const (
	ENOENT                   = syscall.ENOENT
	EAGAIN                   = syscall.EAGAIN
	ENOSPC                   = syscall.ENOSPC
	EINVAL                   = syscall.EINVAL
	BPF_OBJ_NAME_LEN         = 0x10
	BPF_TAG_SIZE             = 0x8
	SYS_BPF                  = 321
	F_DUPFD_CLOEXEC          = 0x406
	EPOLLIN                  = 0x1
	EPOLL_CTL_ADD            = 0x1
	EPOLL_CLOEXEC            = 0x80000
	O_CLOEXEC                = 0x80000
	O_NONBLOCK               = 0x800
	PROT_READ                = 0x1
	PROT_WRITE               = 0x2
	MAP_SHARED               = 0x1
	PERF_TYPE_SOFTWARE       = 0x1
	PERF_COUNT_SW_BPF_OUTPUT = 0xa
	PerfBitWatermark         = 0x4000
	PERF_SAMPLE_RAW          = 0x400
	PERF_FLAG_FD_CLOEXEC     = 0x8
	// RLIM_INFINITY is exported by types_linux.go as well. The value is a
	// stand-in: Setrlimit always fails with errNonLinux on these platforms.
	RLIM_INFINITY = 0x7fffffffffffffff
)
string, buf *Statfs_t) error { + return errNonLinux +} + +// Close is a wrapper +func Close(fd int) (err error) { + return errNonLinux +} + +// EpollEvent is a wrapper +type EpollEvent struct { + Events uint32 + Fd int32 + Pad int32 +} + +// EpollWait is a wrapper +func EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) { + return 0, errNonLinux +} + +// EpollCtl is a wrapper +func EpollCtl(epfd int, op int, fd int, event *EpollEvent) (err error) { + return errNonLinux +} + +// Eventfd is a wrapper +func Eventfd(initval uint, flags int) (fd int, err error) { + return 0, errNonLinux +} + +// Write is a wrapper +func Write(fd int, p []byte) (n int, err error) { + return 0, errNonLinux +} + +// EpollCreate1 is a wrapper +func EpollCreate1(flag int) (fd int, err error) { + return 0, errNonLinux +} + +// PerfEventMmapPage is a wrapper +type PerfEventMmapPage struct { + Version uint32 + Compat_version uint32 + Lock uint32 + Index uint32 + Offset int64 + Time_enabled uint64 + Time_running uint64 + Capabilities uint64 + Pmc_width uint16 + Time_shift uint16 + Time_mult uint32 + Time_offset uint64 + Time_zero uint64 + Size uint32 + + Data_head uint64 + Data_tail uint64 + Data_offset uint64 + Data_size uint64 + Aux_head uint64 + Aux_tail uint64 + Aux_offset uint64 + Aux_size uint64 +} + +// SetNonblock is a wrapper +func SetNonblock(fd int, nonblocking bool) (err error) { + return errNonLinux +} + +// Mmap is a wrapper +func Mmap(fd int, offset int64, length int, prot int, flags int) (data []byte, err error) { + return []byte{}, errNonLinux +} + +// Munmap is a wrapper +func Munmap(b []byte) (err error) { + return errNonLinux +} + +// PerfEventAttr is a wrapper +type PerfEventAttr struct { + Type uint32 + Size uint32 + Config uint64 + Sample uint64 + Sample_type uint64 + Read_format uint64 + Bits uint64 + Wakeup uint32 + Bp_type uint32 + Ext1 uint64 + Ext2 uint64 + Branch_sample_type uint64 + Sample_regs_user uint64 + Sample_stack_user uint32 + Clockid 
int32 + Sample_regs_intr uint64 + Aux_watermark uint32 + Sample_max_stack uint16 +} + +// PerfEventOpen is a wrapper +func PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int) (fd int, err error) { + return 0, errNonLinux +} + +// Utsname is a wrapper +type Utsname struct { + Release [65]byte +} + +// Uname is a wrapper +func Uname(buf *Utsname) (err error) { + return errNonLinux +} diff --git a/vendor/github.com/cilium/ebpf/linker.go b/vendor/github.com/cilium/ebpf/linker.go new file mode 100644 index 000000000..25f2ab947 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/linker.go @@ -0,0 +1,70 @@ +package ebpf + +import ( + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/internal/btf" + "github.com/pkg/errors" +) + +// link resolves bpf-to-bpf calls. +// +// Each library may contain multiple functions / labels, and is only linked +// if the program being edited references one of these functions. +// +// Libraries must not require linking themselves. +func link(prog *ProgramSpec, libs []*ProgramSpec) error { + for _, lib := range libs { + insns, err := linkSection(prog.Instructions, lib.Instructions) + if err != nil { + return errors.Wrapf(err, "linking %s", lib.Name) + } + + if len(insns) == len(prog.Instructions) { + continue + } + + prog.Instructions = insns + if prog.BTF != nil && lib.BTF != nil { + if err := btf.ProgramAppend(prog.BTF, lib.BTF); err != nil { + return errors.Wrapf(err, "linking BTF of %s", lib.Name) + } + } + } + return nil +} + +func linkSection(insns, section asm.Instructions) (asm.Instructions, error) { + // A map of symbols to the libraries which contain them. + symbols, err := section.SymbolOffsets() + if err != nil { + return nil, err + } + + for _, ins := range insns { + if ins.Reference == "" { + continue + } + + if ins.OpCode.JumpOp() != asm.Call || ins.Src != asm.R1 { + continue + } + + if ins.Constant != -1 { + // This is already a valid call, no need to link again. 
+ continue + } + + if _, ok := symbols[ins.Reference]; !ok { + // Symbol isn't available in this section + continue + } + + // At this point we know that at least one function in the + // library is called from insns. Merge the two sections. + // The rewrite of ins.Constant happens in asm.Instruction.Marshal. + return append(insns, section...), nil + } + + // None of the functions in the section are called. Do nothing. + return insns, nil +} diff --git a/vendor/github.com/cilium/ebpf/map.go b/vendor/github.com/cilium/ebpf/map.go new file mode 100644 index 000000000..1c09418ee --- /dev/null +++ b/vendor/github.com/cilium/ebpf/map.go @@ -0,0 +1,627 @@ +package ebpf + +import ( + "fmt" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/btf" + "github.com/cilium/ebpf/internal/unix" + + "github.com/pkg/errors" +) + +// MapSpec defines a Map. +type MapSpec struct { + // Name is passed to the kernel as a debug aid. Must only contain + // alpha numeric and '_' characters. + Name string + Type MapType + KeySize uint32 + ValueSize uint32 + MaxEntries uint32 + Flags uint32 + + // InnerMap is used as a template for ArrayOfMaps and HashOfMaps + InnerMap *MapSpec + + // The BTF associated with this map. + BTF *btf.Map +} + +func (ms *MapSpec) String() string { + return fmt.Sprintf("%s(keySize=%d, valueSize=%d, maxEntries=%d, flags=%d)", ms.Type, ms.KeySize, ms.ValueSize, ms.MaxEntries, ms.Flags) +} + +// Copy returns a copy of the spec. +func (ms *MapSpec) Copy() *MapSpec { + if ms == nil { + return nil + } + + cpy := *ms + cpy.InnerMap = ms.InnerMap.Copy() + return &cpy +} + +// Map represents a Map file descriptor. +// +// It is not safe to close a map which is used by other goroutines. +// +// Methods which take interface{} arguments by default encode +// them using binary.Read/Write in the machine's native endianness. +// +// Implement encoding.BinaryMarshaler or encoding.BinaryUnmarshaler +// if you require custom encoding. 
type Map struct {
	name string
	fd   *internal.FD
	abi  MapABI
	// Per CPU maps return values larger than the size in the spec
	fullValueSize int
}

// NewMapFromFD creates a map from a raw fd.
//
// You should not use fd after calling this function.
//
// Negative descriptors are rejected. The map name and ABI are read back
// through newMapABIFromFd.
func NewMapFromFD(fd int) (*Map, error) {
	if fd < 0 {
		return nil, errors.New("invalid fd")
	}
	bpfFd := internal.NewFD(uint32(fd))

	name, abi, err := newMapABIFromFd(bpfFd)
	if err != nil {
		// NOTE(review): Forget presumably drops the wrapper's ownership
		// without closing the descriptor - confirm against internal.FD.
		bpfFd.Forget()
		return nil, err
	}
	return newMap(bpfFd, name, abi)
}

// NewMap creates a new Map.
//
// Creating a map for the first time will perform feature detection
// by creating small, temporary maps.
//
// If spec.BTF is set, a BTF handle is loaded first. A "not supported"
// error from that step is tolerated and the map is created anyway.
func NewMap(spec *MapSpec) (*Map, error) {
	if spec.BTF == nil {
		return newMapWithBTF(spec, nil)
	}

	handle, err := btf.NewHandle(btf.MapSpec(spec.BTF))
	if err != nil && !btf.IsNotSupported(err) {
		return nil, errors.Wrap(err, "can't load BTF")
	}

	return newMapWithBTF(spec, handle)
}

// newMapWithBTF creates the map described by spec, optionally with BTF.
//
// For ArrayOfMaps and HashOfMaps it first creates a temporary map from
// spec.InnerMap, passes its fd as the inner-map template, and closes
// the temporary map once the outer map has been created.
func newMapWithBTF(spec *MapSpec, handle *btf.Handle) (*Map, error) {
	if spec.Type != ArrayOfMaps && spec.Type != HashOfMaps {
		return createMap(spec, nil, handle)
	}

	if spec.InnerMap == nil {
		return nil, errors.Errorf("%s requires InnerMap", spec.Type)
	}

	template, err := createMap(spec.InnerMap, nil, handle)
	if err != nil {
		return nil, err
	}
	defer template.Close()

	return createMap(spec, template.fd, handle)
}

// createMap validates spec, fills in type-specific defaults and issues
// the map-create call.
//
// spec is copied first, so the caller's MapSpec is never modified.
// inner, if non-nil, supplies the inner map fd. handle, if non-nil and
// spec.BTF is set, supplies the BTF fd and key/value type IDs.
func createMap(spec *MapSpec, inner *internal.FD, handle *btf.Handle) (*Map, error) {
	spec = spec.Copy()

	switch spec.Type {
	case ArrayOfMaps:
		fallthrough
	case HashOfMaps:
		if err := haveNestedMaps(); err != nil {
			return nil, err
		}

		if spec.ValueSize != 0 && spec.ValueSize != 4 {
			return nil, errors.Errorf("ValueSize must be zero or four for map of map")
		}
		spec.ValueSize = 4

	case PerfEventArray:
		if spec.KeySize != 0 {
			return nil, errors.Errorf("KeySize must be zero for perf event array")
		}
		if spec.ValueSize != 0 {
			return nil, errors.Errorf("ValueSize must be zero for perf event array")
		}
		if spec.MaxEntries == 0 {
			// Default to one entry per online CPU.
			n, err := internal.OnlineCPUs()
			if err != nil {
				return nil, errors.Wrap(err, "perf event array")
			}
			spec.MaxEntries = uint32(n)
		}

		spec.KeySize = 4
		spec.ValueSize = 4
	}

	attr := bpfMapCreateAttr{
		mapType:    spec.Type,
		keySize:    spec.KeySize,
		valueSize:  spec.ValueSize,
		maxEntries: spec.MaxEntries,
		flags:      spec.Flags,
	}

	if inner != nil {
		var err error
		attr.innerMapFd, err = inner.Value()
		if err != nil {
			return nil, errors.Wrap(err, "map create")
		}
	}

	if handle != nil && spec.BTF != nil {
		attr.btfFd = uint32(handle.FD())
		attr.btfKeyTypeID = btf.MapKey(spec.BTF).ID()
		attr.btfValueTypeID = btf.MapValue(spec.BTF).ID()
	}

	// The name is validated even when it ends up not being sent.
	name, err := newBPFObjName(spec.Name)
	if err != nil {
		return nil, errors.Wrap(err, "map create")
	}

	// Only pass the name when the object-name feature probe succeeds.
	if haveObjName() == nil {
		attr.mapName = name
	}

	fd, err := bpfMapCreate(&attr)
	if err != nil {
		return nil, errors.Wrap(err, "map create")
	}

	return newMap(fd, spec.Name, newMapABIFromSpec(spec))
}

// newMap wraps fd in a Map using the given name and a copy of *abi.
//
// fullValueSize starts out as abi.ValueSize. For per-CPU map types it
// becomes the value size rounded up to a multiple of 8, multiplied by
// the number of possible CPUs. abi must not be nil.
func newMap(fd *internal.FD, name string, abi *MapABI) (*Map, error) {
	m := &Map{
		name,
		fd,
		*abi,
		int(abi.ValueSize),
	}

	if !abi.Type.hasPerCPUValue() {
		return m, nil
	}

	possibleCPUs, err := internal.PossibleCPUs()
	if err != nil {
		return nil, err
	}

	m.fullValueSize = align(int(abi.ValueSize), 8) * possibleCPUs
	return m, nil
}

// String formats the map as "Type(name)#fd", or "Type#fd" when the map
// has no name.
func (m *Map) String() string {
	if m.name != "" {
		return fmt.Sprintf("%s(%s)#%v", m.abi.Type, m.name, m.fd)
	}
	return fmt.Sprintf("%s#%v", m.abi.Type, m.fd)
}

// ABI gets the ABI of the Map
func (m *Map) ABI() MapABI {
	return m.abi
}

// Lookup retrieves a value from a Map.
//
// Calls Close() on valueOut if it is of type **Map or **Program,
// and *valueOut is not nil.
//
// Returns an error if the key doesn't exist, see IsNotExist.
+func (m *Map) Lookup(key, valueOut interface{}) error { + valuePtr, valueBytes := makeBuffer(valueOut, m.fullValueSize) + + if err := m.lookup(key, valuePtr); err != nil { + return err + } + + if valueBytes == nil { + return nil + } + + if m.abi.Type.hasPerCPUValue() { + return unmarshalPerCPUValue(valueOut, int(m.abi.ValueSize), valueBytes) + } + + switch value := valueOut.(type) { + case **Map: + m, err := unmarshalMap(valueBytes) + if err != nil { + return err + } + + (*value).Close() + *value = m + return nil + case *Map: + return errors.Errorf("can't unmarshal into %T, need %T", value, (**Map)(nil)) + case Map: + return errors.Errorf("can't unmarshal into %T, need %T", value, (**Map)(nil)) + + case **Program: + p, err := unmarshalProgram(valueBytes) + if err != nil { + return err + } + + (*value).Close() + *value = p + return nil + case *Program: + return errors.Errorf("can't unmarshal into %T, need %T", value, (**Program)(nil)) + case Program: + return errors.Errorf("can't unmarshal into %T, need %T", value, (**Program)(nil)) + + default: + return unmarshalBytes(valueOut, valueBytes) + } +} + +// LookupBytes gets a value from Map. +// +// Returns a nil value if a key doesn't exist. +func (m *Map) LookupBytes(key interface{}) ([]byte, error) { + valueBytes := make([]byte, m.fullValueSize) + valuePtr := internal.NewSlicePointer(valueBytes) + + err := m.lookup(key, valuePtr) + if IsNotExist(err) { + return nil, nil + } + + return valueBytes, err +} + +func (m *Map) lookup(key interface{}, valueOut internal.Pointer) error { + keyPtr, err := marshalPtr(key, int(m.abi.KeySize)) + if err != nil { + return errors.WithMessage(err, "can't marshal key") + } + + err = bpfMapLookupElem(m.fd, keyPtr, valueOut) + return errors.WithMessage(err, "lookup failed") +} + +// MapUpdateFlags controls the behaviour of the Map.Update call. +// +// The exact semantics depend on the specific MapType. 
+type MapUpdateFlags uint64 + +const ( + // UpdateAny creates a new element or update an existing one. + UpdateAny MapUpdateFlags = iota + // UpdateNoExist creates a new element. + UpdateNoExist MapUpdateFlags = 1 << (iota - 1) + // UpdateExist updates an existing element. + UpdateExist +) + +// Put replaces or creates a value in map. +// +// It is equivalent to calling Update with UpdateAny. +func (m *Map) Put(key, value interface{}) error { + return m.Update(key, value, UpdateAny) +} + +// Update changes the value of a key. +func (m *Map) Update(key, value interface{}, flags MapUpdateFlags) error { + keyPtr, err := marshalPtr(key, int(m.abi.KeySize)) + if err != nil { + return errors.WithMessage(err, "can't marshal key") + } + + var valuePtr internal.Pointer + if m.abi.Type.hasPerCPUValue() { + valuePtr, err = marshalPerCPUValue(value, int(m.abi.ValueSize)) + } else { + valuePtr, err = marshalPtr(value, int(m.abi.ValueSize)) + } + if err != nil { + return errors.WithMessage(err, "can't marshal value") + } + + return bpfMapUpdateElem(m.fd, keyPtr, valuePtr, uint64(flags)) +} + +// Delete removes a value. +// +// Returns an error if the key does not exist, see IsNotExist. +func (m *Map) Delete(key interface{}) error { + keyPtr, err := marshalPtr(key, int(m.abi.KeySize)) + if err != nil { + return errors.WithMessage(err, "can't marshal key") + } + + err = bpfMapDeleteElem(m.fd, keyPtr) + return errors.WithMessage(err, "can't delete key") +} + +// NextKey finds the key following an initial key. +// +// See NextKeyBytes for details. +func (m *Map) NextKey(key, nextKeyOut interface{}) error { + nextKeyPtr, nextKeyBytes := makeBuffer(nextKeyOut, int(m.abi.KeySize)) + + if err := m.nextKey(key, nextKeyPtr); err != nil { + return err + } + + if nextKeyBytes == nil { + return nil + } + + err := unmarshalBytes(nextKeyOut, nextKeyBytes) + return errors.WithMessage(err, "can't unmarshal next key") +} + +// NextKeyBytes returns the key following an initial key as a byte slice. 
+// +// Passing nil will return the first key. +// +// Use Iterate if you want to traverse all entries in the map. +func (m *Map) NextKeyBytes(key interface{}) ([]byte, error) { + nextKey := make([]byte, m.abi.KeySize) + nextKeyPtr := internal.NewSlicePointer(nextKey) + + err := m.nextKey(key, nextKeyPtr) + if IsNotExist(err) { + return nil, nil + } + + return nextKey, err +} + +func (m *Map) nextKey(key interface{}, nextKeyOut internal.Pointer) error { + var ( + keyPtr internal.Pointer + err error + ) + + if key != nil { + keyPtr, err = marshalPtr(key, int(m.abi.KeySize)) + if err != nil { + return errors.WithMessage(err, "can't marshal key") + } + } + + err = bpfMapGetNextKey(m.fd, keyPtr, nextKeyOut) + return errors.WithMessage(err, "can't get next key") +} + +// Iterate traverses a map. +// +// It's safe to create multiple iterators at the same time. +// +// It's not possible to guarantee that all keys in a map will be +// returned if there are concurrent modifications to the map. +func (m *Map) Iterate() *MapIterator { + return newMapIterator(m) +} + +// Close removes a Map +func (m *Map) Close() error { + if m == nil { + // This makes it easier to clean up when iterating maps + // of maps / programs. + return nil + } + + return m.fd.Close() +} + +// FD gets the file descriptor of the Map. +// +// Calling this function is invalid after Close has been called. +func (m *Map) FD() int { + fd, err := m.fd.Value() + if err != nil { + // Best effort: -1 is the number most likely to be an + // invalid file descriptor. + return -1 + } + + return int(fd) +} + +// Clone creates a duplicate of the Map. +// +// Closing the duplicate does not affect the original, and vice versa. +// Changes made to the map are reflected by both instances however. +// +// Cloning a nil Map returns nil. 
+func (m *Map) Clone() (*Map, error) { + if m == nil { + return nil, nil + } + + dup, err := m.fd.Dup() + if err != nil { + return nil, errors.Wrap(err, "can't clone map") + } + + return newMap(dup, m.name, &m.abi) +} + +// Pin persists the map past the lifetime of the process that created it. +// +// This requires bpffs to be mounted above fileName. See http://cilium.readthedocs.io/en/doc-1.0/kubernetes/install/#mounting-the-bpf-fs-optional +func (m *Map) Pin(fileName string) error { + return bpfPinObject(fileName, m.fd) +} + +// LoadPinnedMap load a Map from a BPF file. +// +// The function is not compatible with nested maps. +// Use LoadPinnedMapExplicit in these situations. +func LoadPinnedMap(fileName string) (*Map, error) { + fd, err := bpfGetObject(fileName) + if err != nil { + return nil, err + } + name, abi, err := newMapABIFromFd(fd) + if err != nil { + _ = fd.Close() + return nil, err + } + return newMap(fd, name, abi) +} + +// LoadPinnedMapExplicit loads a map with explicit parameters. +func LoadPinnedMapExplicit(fileName string, abi *MapABI) (*Map, error) { + fd, err := bpfGetObject(fileName) + if err != nil { + return nil, err + } + return newMap(fd, "", abi) +} + +func unmarshalMap(buf []byte) (*Map, error) { + if len(buf) != 4 { + return nil, errors.New("map id requires 4 byte value") + } + + // Looking up an entry in a nested map or prog array returns an id, + // not an fd. + id := internal.NativeEndian.Uint32(buf) + fd, err := bpfGetMapFDByID(id) + if err != nil { + return nil, err + } + + name, abi, err := newMapABIFromFd(fd) + if err != nil { + _ = fd.Close() + return nil, err + } + + return newMap(fd, name, abi) +} + +// MarshalBinary implements BinaryMarshaler. +func (m *Map) MarshalBinary() ([]byte, error) { + fd, err := m.fd.Value() + if err != nil { + return nil, err + } + + buf := make([]byte, 4) + internal.NativeEndian.PutUint32(buf, fd) + return buf, nil +} + +// MapIterator iterates a Map. +// +// See Map.Iterate. 
+type MapIterator struct { + target *Map + prevKey interface{} + prevBytes []byte + count, maxEntries uint32 + done bool + err error +} + +func newMapIterator(target *Map) *MapIterator { + return &MapIterator{ + target: target, + maxEntries: target.abi.MaxEntries, + prevBytes: make([]byte, int(target.abi.KeySize)), + } +} + +var errIterationAborted = errors.New("iteration aborted") + +// Next decodes the next key and value. +// +// Iterating a hash map from which keys are being deleted is not +// safe. You may see the same key multiple times. Iteration may +// also abort with an error, see IsIterationAborted. +// +// Returns false if there are no more entries. You must check +// the result of Err afterwards. +// +// See Map.Get for further caveats around valueOut. +func (mi *MapIterator) Next(keyOut, valueOut interface{}) bool { + if mi.err != nil || mi.done { + return false + } + + for ; mi.count < mi.maxEntries; mi.count++ { + var nextBytes []byte + nextBytes, mi.err = mi.target.NextKeyBytes(mi.prevKey) + if mi.err != nil { + return false + } + + if nextBytes == nil { + mi.done = true + return false + } + + // The user can get access to nextBytes since unmarshalBytes + // does not copy when unmarshaling into a []byte. + // Make a copy to prevent accidental corruption of + // iterator state. + copy(mi.prevBytes, nextBytes) + mi.prevKey = mi.prevBytes + + mi.err = mi.target.Lookup(nextBytes, valueOut) + if IsNotExist(mi.err) { + // Even though the key should be valid, we couldn't look up + // its value. If we're iterating a hash map this is probably + // because a concurrent delete removed the value before we + // could get it. This means that the next call to NextKeyBytes + // is very likely to restart iteration. + // If we're iterating one of the fd maps like + // ProgramArray it means that a given slot doesn't have + // a valid fd associated. It's OK to continue to the next slot. 
+ continue + } + if mi.err != nil { + return false + } + + mi.err = unmarshalBytes(keyOut, nextBytes) + return mi.err == nil + } + + mi.err = errIterationAborted + return false +} + +// Err returns any encountered error. +// +// The method must be called after Next returns nil. +func (mi *MapIterator) Err() error { + return mi.err +} + +// IsNotExist returns true if the error indicates that a +// key doesn't exist. +func IsNotExist(err error) bool { + return errors.Cause(err) == unix.ENOENT +} + +// IsIterationAborted returns true if the iteration was aborted. +// +// This occurs when keys are deleted from a hash map during iteration. +func IsIterationAborted(err error) bool { + return errors.Cause(err) == errIterationAborted +} diff --git a/vendor/github.com/cilium/ebpf/marshalers.go b/vendor/github.com/cilium/ebpf/marshalers.go new file mode 100644 index 000000000..197f1538e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/marshalers.go @@ -0,0 +1,192 @@ +package ebpf + +import ( + "bytes" + "encoding" + "encoding/binary" + "reflect" + "runtime" + "unsafe" + + "github.com/cilium/ebpf/internal" + + "github.com/pkg/errors" +) + +func marshalPtr(data interface{}, length int) (internal.Pointer, error) { + if ptr, ok := data.(unsafe.Pointer); ok { + return internal.NewPointer(ptr), nil + } + + buf, err := marshalBytes(data, length) + if err != nil { + return internal.Pointer{}, err + } + + return internal.NewSlicePointer(buf), nil +} + +func marshalBytes(data interface{}, length int) (buf []byte, err error) { + switch value := data.(type) { + case encoding.BinaryMarshaler: + buf, err = value.MarshalBinary() + case string: + buf = []byte(value) + case []byte: + buf = value + case unsafe.Pointer: + err = errors.New("can't marshal from unsafe.Pointer") + default: + var wr bytes.Buffer + err = binary.Write(&wr, internal.NativeEndian, value) + err = errors.Wrapf(err, "encoding %T", value) + buf = wr.Bytes() + } + if err != nil { + return nil, err + } + + if len(buf) != 
length { + return nil, errors.Errorf("%T doesn't marshal to %d bytes", data, length) + } + return buf, nil +} + +func makeBuffer(dst interface{}, length int) (internal.Pointer, []byte) { + if ptr, ok := dst.(unsafe.Pointer); ok { + return internal.NewPointer(ptr), nil + } + + buf := make([]byte, length) + return internal.NewSlicePointer(buf), buf +} + +func unmarshalBytes(data interface{}, buf []byte) error { + switch value := data.(type) { + case unsafe.Pointer: + sh := &reflect.SliceHeader{ + Data: uintptr(value), + Len: len(buf), + Cap: len(buf), + } + + dst := *(*[]byte)(unsafe.Pointer(sh)) + copy(dst, buf) + runtime.KeepAlive(value) + return nil + case encoding.BinaryUnmarshaler: + return value.UnmarshalBinary(buf) + case *string: + *value = string(buf) + return nil + case *[]byte: + *value = buf + return nil + case string: + return errors.New("require pointer to string") + case []byte: + return errors.New("require pointer to []byte") + default: + rd := bytes.NewReader(buf) + err := binary.Read(rd, internal.NativeEndian, value) + return errors.Wrapf(err, "decoding %T", value) + } +} + +// marshalPerCPUValue encodes a slice containing one value per +// possible CPU into a buffer of bytes. +// +// Values are initialized to zero if the slice has less elements than CPUs. +// +// slice must have a type like []elementType. 
+func marshalPerCPUValue(slice interface{}, elemLength int) (internal.Pointer, error) { + sliceType := reflect.TypeOf(slice) + if sliceType.Kind() != reflect.Slice { + return internal.Pointer{}, errors.New("per-CPU value requires slice") + } + + possibleCPUs, err := internal.PossibleCPUs() + if err != nil { + return internal.Pointer{}, err + } + + sliceValue := reflect.ValueOf(slice) + sliceLen := sliceValue.Len() + if sliceLen > possibleCPUs { + return internal.Pointer{}, errors.Errorf("per-CPU value exceeds number of CPUs") + } + + alignedElemLength := align(elemLength, 8) + buf := make([]byte, alignedElemLength*possibleCPUs) + + for i := 0; i < sliceLen; i++ { + elem := sliceValue.Index(i).Interface() + elemBytes, err := marshalBytes(elem, elemLength) + if err != nil { + return internal.Pointer{}, err + } + + offset := i * alignedElemLength + copy(buf[offset:offset+elemLength], elemBytes) + } + + return internal.NewSlicePointer(buf), nil +} + +// unmarshalPerCPUValue decodes a buffer into a slice containing one value per +// possible CPU. 
+// +// valueOut must have a type like *[]elementType +func unmarshalPerCPUValue(slicePtr interface{}, elemLength int, buf []byte) error { + slicePtrType := reflect.TypeOf(slicePtr) + if slicePtrType.Kind() != reflect.Ptr || slicePtrType.Elem().Kind() != reflect.Slice { + return errors.Errorf("per-cpu value requires pointer to slice") + } + + possibleCPUs, err := internal.PossibleCPUs() + if err != nil { + return err + } + + sliceType := slicePtrType.Elem() + slice := reflect.MakeSlice(sliceType, possibleCPUs, possibleCPUs) + + sliceElemType := sliceType.Elem() + sliceElemIsPointer := sliceElemType.Kind() == reflect.Ptr + if sliceElemIsPointer { + sliceElemType = sliceElemType.Elem() + } + + step := len(buf) / possibleCPUs + if step < elemLength { + return errors.Errorf("per-cpu element length is larger than available data") + } + for i := 0; i < possibleCPUs; i++ { + var elem interface{} + if sliceElemIsPointer { + newElem := reflect.New(sliceElemType) + slice.Index(i).Set(newElem) + elem = newElem.Interface() + } else { + elem = slice.Index(i).Addr().Interface() + } + + // Make a copy, since unmarshal can hold on to itemBytes + elemBytes := make([]byte, elemLength) + copy(elemBytes, buf[:elemLength]) + + err := unmarshalBytes(elem, elemBytes) + if err != nil { + return errors.Wrapf(err, "cpu %d", i) + } + + buf = buf[step:] + } + + reflect.ValueOf(slicePtr).Elem().Set(slice) + return nil +} + +func align(n, alignment int) int { + return (int(n) + alignment - 1) / alignment * alignment +} diff --git a/vendor/github.com/cilium/ebpf/prog.go b/vendor/github.com/cilium/ebpf/prog.go new file mode 100644 index 000000000..08ef4fa46 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/prog.go @@ -0,0 +1,520 @@ +package ebpf + +import ( + "bytes" + "fmt" + "math" + "strings" + "time" + "unsafe" + + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/btf" + "github.com/cilium/ebpf/internal/unix" + + "github.com/pkg/errors" +) + 
const (
	// Number of bytes to pad the output buffer for BPF_PROG_TEST_RUN.
	// This is currently the maximum of spare space allocated for SKB
	// and XDP programs, and equal to XDP_PACKET_HEADROOM + NET_IP_ALIGN.
	outputPad = 256 + 2
)

// DefaultVerifierLogSize is the default number of bytes allocated for the
// verifier log.
const DefaultVerifierLogSize = 64 * 1024

// ProgramOptions control loading a program into the kernel.
type ProgramOptions struct {
	// Controls the detail emitted by the kernel verifier. Set to non-zero
	// to enable logging.
	LogLevel uint32
	// Controls the output buffer size for the verifier. Defaults to
	// DefaultVerifierLogSize.
	LogSize int
}

// ProgramSpec defines a Program
type ProgramSpec struct {
	// Name is passed to the kernel as a debug aid. Must only contain
	// alpha numeric and '_' characters.
	Name          string
	Type          ProgramType
	AttachType    AttachType
	Instructions  asm.Instructions
	License       string
	KernelVersion uint32

	// The BTF associated with this program. Changing Instructions
	// will most likely invalidate the contained data, and may
	// result in errors when attempting to load it into the kernel.
	BTF *btf.Program
}

// Copy returns a copy of the spec.
//
// The Instructions slice is duplicated, so appending to or replacing
// entries in the copy does not touch the original. All other fields
// are copied by value, which means the BTF pointer is shared between
// the original and the copy. Copying a nil spec returns nil.
func (ps *ProgramSpec) Copy() *ProgramSpec {
	if ps == nil {
		return nil
	}

	cpy := *ps
	cpy.Instructions = make(asm.Instructions, len(ps.Instructions))
	copy(cpy.Instructions, ps.Instructions)
	return &cpy
}

// Program represents BPF program loaded into the kernel.
//
// It is not safe to close a Program which is used by other goroutines.
type Program struct {
	// Contains the output of the kernel verifier if enabled,
	// otherwise it is empty.
	VerifierLog string

	fd   *internal.FD
	name string
	abi  ProgramABI
}

// NewProgram creates a new Program.
//
// Loading a program for the first time will perform
// feature detection by loading small, temporary programs.
+func NewProgram(spec *ProgramSpec) (*Program, error) { + return NewProgramWithOptions(spec, ProgramOptions{}) +} + +// NewProgramWithOptions creates a new Program. +// +// Loading a program for the first time will perform +// feature detection by loading small, temporary programs. +func NewProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, error) { + if spec.BTF == nil { + return newProgramWithBTF(spec, nil, opts) + } + + handle, err := btf.NewHandle(btf.ProgramSpec(spec.BTF)) + if err != nil && !btf.IsNotSupported(err) { + return nil, errors.Wrap(err, "can't load BTF") + } + + return newProgramWithBTF(spec, handle, opts) +} + +func newProgramWithBTF(spec *ProgramSpec, btf *btf.Handle, opts ProgramOptions) (*Program, error) { + attr, err := convertProgramSpec(spec, btf) + if err != nil { + return nil, err + } + + logSize := DefaultVerifierLogSize + if opts.LogSize > 0 { + logSize = opts.LogSize + } + + var logBuf []byte + if opts.LogLevel > 0 { + logBuf = make([]byte, logSize) + attr.logLevel = opts.LogLevel + attr.logSize = uint32(len(logBuf)) + attr.logBuf = internal.NewSlicePointer(logBuf) + } + + fd, err := bpfProgLoad(attr) + if err == nil { + prog := newProgram(fd, spec.Name, &ProgramABI{spec.Type}) + prog.VerifierLog = internal.CString(logBuf) + return prog, nil + } + + if opts.LogLevel == 0 { + // Re-run with the verifier enabled to get better error messages. + logBuf = make([]byte, logSize) + attr.logLevel = 1 + attr.logSize = uint32(len(logBuf)) + attr.logBuf = internal.NewSlicePointer(logBuf) + + _, logErr := bpfProgLoad(attr) + err = internal.ErrorWithLog(err, logBuf, logErr) + } + + return nil, errors.Wrap(err, "can't load program") +} + +// NewProgramFromFD creates a program from a raw fd. +// +// You should not use fd after calling this function. +// +// Requires at least Linux 4.11. 
+func NewProgramFromFD(fd int) (*Program, error) { + if fd < 0 { + return nil, errors.New("invalid fd") + } + bpfFd := internal.NewFD(uint32(fd)) + + name, abi, err := newProgramABIFromFd(bpfFd) + if err != nil { + bpfFd.Forget() + return nil, err + } + + return newProgram(bpfFd, name, abi), nil +} + +func newProgram(fd *internal.FD, name string, abi *ProgramABI) *Program { + return &Program{ + name: name, + fd: fd, + abi: *abi, + } +} + +func convertProgramSpec(spec *ProgramSpec, handle *btf.Handle) (*bpfProgLoadAttr, error) { + if len(spec.Instructions) == 0 { + return nil, errors.New("Instructions cannot be empty") + } + + if len(spec.License) == 0 { + return nil, errors.New("License cannot be empty") + } + + buf := bytes.NewBuffer(make([]byte, 0, len(spec.Instructions)*asm.InstructionSize)) + err := spec.Instructions.Marshal(buf, internal.NativeEndian) + if err != nil { + return nil, err + } + + bytecode := buf.Bytes() + insCount := uint32(len(bytecode) / asm.InstructionSize) + attr := &bpfProgLoadAttr{ + progType: spec.Type, + expectedAttachType: spec.AttachType, + insCount: insCount, + instructions: internal.NewSlicePointer(bytecode), + license: internal.NewStringPointer(spec.License), + } + + name, err := newBPFObjName(spec.Name) + if err != nil { + return nil, err + } + + if haveObjName() == nil { + attr.progName = name + } + + if handle != nil && spec.BTF != nil { + attr.progBTFFd = uint32(handle.FD()) + + recSize, bytes, err := btf.ProgramLineInfos(spec.BTF) + if err != nil { + return nil, errors.Wrap(err, "can't get BTF line infos") + } + attr.lineInfoRecSize = recSize + attr.lineInfoCnt = uint32(uint64(len(bytes)) / uint64(recSize)) + attr.lineInfo = internal.NewSlicePointer(bytes) + + recSize, bytes, err = btf.ProgramFuncInfos(spec.BTF) + if err != nil { + return nil, errors.Wrap(err, "can't get BTF function infos") + } + attr.funcInfoRecSize = recSize + attr.funcInfoCnt = uint32(uint64(len(bytes)) / uint64(recSize)) + attr.funcInfo = 
internal.NewSlicePointer(bytes) + } + + return attr, nil +} + +func (p *Program) String() string { + if p.name != "" { + return fmt.Sprintf("%s(%s)#%v", p.abi.Type, p.name, p.fd) + } + return fmt.Sprintf("%s#%v", p.abi.Type, p.fd) +} + +// ABI gets the ABI of the Program +func (p *Program) ABI() ProgramABI { + return p.abi +} + +// FD gets the file descriptor of the Program. +// +// It is invalid to call this function after Close has been called. +func (p *Program) FD() int { + fd, err := p.fd.Value() + if err != nil { + // Best effort: -1 is the number most likely to be an + // invalid file descriptor. + return -1 + } + + return int(fd) +} + +// Clone creates a duplicate of the Program. +// +// Closing the duplicate does not affect the original, and vice versa. +// +// Cloning a nil Program returns nil. +func (p *Program) Clone() (*Program, error) { + if p == nil { + return nil, nil + } + + dup, err := p.fd.Dup() + if err != nil { + return nil, errors.Wrap(err, "can't clone program") + } + + return newProgram(dup, p.name, &p.abi), nil +} + +// Pin persists the Program past the lifetime of the process that created it +// +// This requires bpffs to be mounted above fileName. See http://cilium.readthedocs.io/en/doc-1.0/kubernetes/install/#mounting-the-bpf-fs-optional +func (p *Program) Pin(fileName string) error { + return errors.Wrap(bpfPinObject(fileName, p.fd), "can't pin program") +} + +// Close unloads the program from the kernel. +func (p *Program) Close() error { + if p == nil { + return nil + } + + return p.fd.Close() +} + +// Test runs the Program in the kernel with the given input and returns the +// value returned by the eBPF program. outLen may be zero. +// +// Note: the kernel expects at least 14 bytes input for an ethernet header for +// XDP and SKB programs. +// +// This function requires at least Linux 4.12. 
func (p *Program) Test(in []byte) (uint32, []byte, error) {
	// A single run. The duration reported by testRun is discarded.
	ret, out, _, err := p.testRun(in, 1)
	return ret, out, errors.Wrap(err, "can't test program")
}

// Benchmark runs the Program with the given input for a number of times
// and returns the time taken per iteration.
//
// The returned value is the return value of the last execution of
// the program.
//
// This function requires at least Linux 4.12.
func (p *Program) Benchmark(in []byte, repeat int) (uint32, time.Duration, error) {
	ret, _, total, err := p.testRun(in, repeat)
	return ret, total, errors.Wrap(err, "can't benchmark program")
}

// haveProgTestRun probes for BPF_PROG_TEST_RUN support.
//
// It loads a two-instruction socket filter and issues a test run with a
// 14 byte input. Failure to load the probe program counts as
// unsupported. Otherwise, any outcome except EINVAL counts as supported.
var haveProgTestRun = internal.FeatureTest("BPF_PROG_TEST_RUN", "4.12", func() bool {
	prog, err := NewProgram(&ProgramSpec{
		Type: SocketFilter,
		Instructions: asm.Instructions{
			asm.LoadImm(asm.R0, 0, asm.DWord),
			asm.Return(),
		},
		License: "MIT",
	})
	if err != nil {
		// This may be because we lack sufficient permissions, etc.
		return false
	}
	defer prog.Close()

	fd, err := prog.fd.Value()
	if err != nil {
		return false
	}

	// Programs require at least 14 bytes input
	in := make([]byte, 14)
	attr := bpfProgTestRunAttr{
		fd:         fd,
		dataSizeIn: uint32(len(in)),
		dataIn:     internal.NewSlicePointer(in),
	}

	_, err = internal.BPF(_ProgTestRun, unsafe.Pointer(&attr), unsafe.Sizeof(attr))

	// Check for EINVAL specifically, rather than err != nil since we
	// otherwise misdetect due to insufficient permissions.
	return errors.Cause(err) != unix.EINVAL
})

// testRun executes the program via the test-run command.
//
// in must be non-empty. repeat and len(in) must fit into 32 bits. It
// returns the program's return value, the output data trimmed to the
// size reported by the kernel, and the reported duration.
//
// It panics if the kernel reports more output than the buffer holds,
// because memory past the buffer has been overwritten by then.
func (p *Program) testRun(in []byte, repeat int) (uint32, []byte, time.Duration, error) {
	if uint(repeat) > math.MaxUint32 {
		return 0, nil, 0, fmt.Errorf("repeat is too high")
	}

	if len(in) == 0 {
		return 0, nil, 0, fmt.Errorf("missing input")
	}

	if uint(len(in)) > math.MaxUint32 {
		return 0, nil, 0, fmt.Errorf("input is too long")
	}

	if err := haveProgTestRun(); err != nil {
		return 0, nil, 0, err
	}

	// Older kernels ignore the dataSizeOut argument when copying to user space.
	// Combined with things like bpf_xdp_adjust_head() we don't really know what the final
	// size will be. Hence we allocate an output buffer which we hope will always be large
	// enough, and panic if the kernel wrote past the end of the allocation.
	// See https://patchwork.ozlabs.org/cover/1006822/
	out := make([]byte, len(in)+outputPad)

	fd, err := p.fd.Value()
	if err != nil {
		return 0, nil, 0, err
	}

	attr := bpfProgTestRunAttr{
		fd:          fd,
		dataSizeIn:  uint32(len(in)),
		dataSizeOut: uint32(len(out)),
		dataIn:      internal.NewSlicePointer(in),
		dataOut:     internal.NewSlicePointer(out),
		repeat:      uint32(repeat),
	}

	_, err = internal.BPF(_ProgTestRun, unsafe.Pointer(&attr), unsafe.Sizeof(attr))
	if err != nil {
		return 0, nil, 0, errors.Wrap(err, "can't run test")
	}

	if int(attr.dataSizeOut) > cap(out) {
		// Houston, we have a problem. The program created more data than we allocated,
		// and the kernel wrote past the end of our buffer.
		panic("kernel wrote past end of output buffer")
	}
	out = out[:int(attr.dataSizeOut)]

	// NOTE(review): attr.duration is taken as nanoseconds as reported by
	// the kernel. Benchmark documents it as per-iteration time while the
	// local is called total - confirm which one the kernel reports.
	total := time.Duration(attr.duration) * time.Nanosecond
	return attr.retval, out, total, nil
}

// unmarshalProgram turns the 4-byte value stored in a program array
// into a Program by resolving the contained program id to a new file
// descriptor. The fd is closed again if the ABI can't be read.
func unmarshalProgram(buf []byte) (*Program, error) {
	if len(buf) != 4 {
		return nil, errors.New("program id requires 4 byte value")
	}

	// Looking up an entry in a nested map or prog array returns an id,
	// not an fd.
	id := internal.NativeEndian.Uint32(buf)
	fd, err := bpfGetProgramFDByID(id)
	if err != nil {
		return nil, err
	}

	name, abi, err := newProgramABIFromFd(fd)
	if err != nil {
		_ = fd.Close()
		return nil, err
	}

	return newProgram(fd, name, abi), nil
}

// MarshalBinary implements BinaryMarshaler.
//
// The encoding is the program's file descriptor as 4 bytes in native
// endianness.
func (p *Program) MarshalBinary() ([]byte, error) {
	value, err := p.fd.Value()
	if err != nil {
		return nil, err
	}

	buf := make([]byte, 4)
	internal.NativeEndian.PutUint32(buf, value)
	return buf, nil
}

// Attach a Program to a container object fd
//
// fd is the target object and must not be negative. typ and flags are
// passed through to the attach command unchanged.
func (p *Program) Attach(fd int, typ AttachType, flags AttachFlags) error {
	if fd < 0 {
		return errors.New("invalid fd")
	}

	pfd, err := p.fd.Value()
	if err != nil {
		return err
	}

	attr := bpfProgAlterAttr{
		targetFd:    uint32(fd),
		attachBpfFd: pfd,
		attachType:  uint32(typ),
		attachFlags: uint32(flags),
	}

	return bpfProgAlter(_ProgAttach, &attr)
}

// Detach a Program from a container object fd
//
// The arguments mirror Attach. Only the issued command differs.
func (p *Program) Detach(fd int, typ AttachType, flags AttachFlags) error {
	if fd < 0 {
		return errors.New("invalid fd")
	}

	pfd, err := p.fd.Value()
	if err != nil {
		return err
	}

	attr := bpfProgAlterAttr{
		targetFd:    uint32(fd),
		attachBpfFd: pfd,
		attachType:  uint32(typ),
		attachFlags: uint32(flags),
	}

	return bpfProgAlter(_ProgDetach, &attr)
}

// LoadPinnedProgram loads a Program from a BPF file.
//
// Requires at least Linux 4.11.
func LoadPinnedProgram(fileName string) (*Program, error) {
	fd, err := bpfGetObject(fileName)
	if err != nil {
		return nil, err
	}

	name, abi, err := newProgramABIFromFd(fd)
	if err != nil {
		// The descriptor was opened by this call, so release it here.
		_ = fd.Close()
		return nil, errors.Wrapf(err, "can't get ABI for %s", fileName)
	}

	return newProgram(fd, name, abi), nil
}

// SanitizeName replaces all invalid characters in name.
//
// Use this to automatically generate valid names for maps and
// programs at run time.
+// +// Passing a negative value for replacement will delete characters +// instead of replacing them. +func SanitizeName(name string, replacement rune) string { + return strings.Map(func(char rune) rune { + if invalidBPFObjNameChar(char) { + return replacement + } + return char + }, name) +} + +// IsNotSupported returns true if an error occurred because +// the kernel does not have support for a specific feature. +func IsNotSupported(err error) bool { + _, notSupported := errors.Cause(err).(*internal.UnsupportedFeatureError) + return notSupported +} diff --git a/vendor/github.com/cilium/ebpf/syscalls.go b/vendor/github.com/cilium/ebpf/syscalls.go new file mode 100644 index 000000000..20ce05dd8 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/syscalls.go @@ -0,0 +1,379 @@ +package ebpf + +import ( + "path/filepath" + "strings" + "unsafe" + + "github.com/cilium/ebpf/internal/btf" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" + + "github.com/pkg/errors" +) + +// bpfObjName is a null-terminated string made up of +// 'A-Za-z0-9_' characters. +type bpfObjName [unix.BPF_OBJ_NAME_LEN]byte + +// newBPFObjName truncates the result if it is too long. 
// invalidBPFObjNameChar reports whether char may NOT be used in a BPF
// object name. Only the ASCII letters A-Z and a-z, the ASCII digits 0-9
// and the underscore are accepted; every other rune is invalid.
func invalidBPFObjNameChar(char rune) bool {
	isUpper := 'A' <= char && char <= 'Z'
	isLower := 'a' <= char && char <= 'z'
	isDigit := '0' <= char && char <= '9'

	return !(isUpper || isLower || isDigit || char == '_')
}
progType uint32 + id uint32 + tag [unix.BPF_TAG_SIZE]byte + jitedLen uint32 + xlatedLen uint32 + jited internal.Pointer + xlated internal.Pointer + loadTime uint64 // since 4.15 cb4d2b3f03d8 + createdByUID uint32 + nrMapIDs uint32 + mapIds internal.Pointer + name bpfObjName +} + +type bpfProgTestRunAttr struct { + fd uint32 + retval uint32 + dataSizeIn uint32 + dataSizeOut uint32 + dataIn internal.Pointer + dataOut internal.Pointer + repeat uint32 + duration uint32 +} + +type bpfProgAlterAttr struct { + targetFd uint32 + attachBpfFd uint32 + attachType uint32 + attachFlags uint32 +} + +type bpfObjGetInfoByFDAttr struct { + fd uint32 + infoLen uint32 + info internal.Pointer // May be either bpfMapInfo or bpfProgInfo +} + +type bpfGetFDByIDAttr struct { + id uint32 + next uint32 +} + +func bpfProgLoad(attr *bpfProgLoadAttr) (*internal.FD, error) { + for { + fd, err := internal.BPF(_ProgLoad, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + // As of ~4.20 the verifier can be interrupted by a signal, + // and returns EAGAIN in that case. 
+ if err == unix.EAGAIN { + continue + } + + if err != nil { + return nil, err + } + + return internal.NewFD(uint32(fd)), nil + } +} + +func bpfProgAlter(cmd int, attr *bpfProgAlterAttr) error { + _, err := internal.BPF(cmd, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +func bpfMapCreate(attr *bpfMapCreateAttr) (*internal.FD, error) { + fd, err := internal.BPF(_MapCreate, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + + return internal.NewFD(uint32(fd)), nil +} + +var haveNestedMaps = internal.FeatureTest("nested maps", "4.12", func() bool { + inner, err := bpfMapCreate(&bpfMapCreateAttr{ + mapType: Array, + keySize: 4, + valueSize: 4, + maxEntries: 1, + }) + if err != nil { + return false + } + defer inner.Close() + + innerFd, _ := inner.Value() + nested, err := bpfMapCreate(&bpfMapCreateAttr{ + mapType: ArrayOfMaps, + keySize: 4, + valueSize: 4, + maxEntries: 1, + innerMapFd: innerFd, + }) + if err != nil { + return false + } + + _ = nested.Close() + return true +}) + +func bpfMapLookupElem(m *internal.FD, key, valueOut internal.Pointer) error { + fd, err := m.Value() + if err != nil { + return err + } + + attr := bpfMapOpAttr{ + mapFd: fd, + key: key, + value: valueOut, + } + _, err = internal.BPF(_MapLookupElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return err +} + +func bpfMapUpdateElem(m *internal.FD, key, valueOut internal.Pointer, flags uint64) error { + fd, err := m.Value() + if err != nil { + return err + } + + attr := bpfMapOpAttr{ + mapFd: fd, + key: key, + value: valueOut, + flags: flags, + } + _, err = internal.BPF(_MapUpdateElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return err +} + +func bpfMapDeleteElem(m *internal.FD, key internal.Pointer) error { + fd, err := m.Value() + if err != nil { + return err + } + + attr := bpfMapOpAttr{ + mapFd: fd, + key: key, + } + _, err = internal.BPF(_MapDeleteElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return err +} + +func 
bpfMapGetNextKey(m *internal.FD, key, nextKeyOut internal.Pointer) error { + fd, err := m.Value() + if err != nil { + return err + } + + attr := bpfMapOpAttr{ + mapFd: fd, + key: key, + value: nextKeyOut, + } + _, err = internal.BPF(_MapGetNextKey, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return err +} + +const bpfFSType = 0xcafe4a11 + +func bpfPinObject(fileName string, fd *internal.FD) error { + dirName := filepath.Dir(fileName) + var statfs unix.Statfs_t + if err := unix.Statfs(dirName, &statfs); err != nil { + return err + } + if uint64(statfs.Type) != bpfFSType { + return errors.Errorf("%s is not on a bpf filesystem", fileName) + } + + value, err := fd.Value() + if err != nil { + return err + } + + _, err = internal.BPF(_ObjPin, unsafe.Pointer(&bpfPinObjAttr{ + fileName: internal.NewStringPointer(fileName), + fd: value, + }), 16) + return errors.Wrapf(err, "pin object %s", fileName) +} + +func bpfGetObject(fileName string) (*internal.FD, error) { + ptr, err := internal.BPF(_ObjGet, unsafe.Pointer(&bpfPinObjAttr{ + fileName: internal.NewStringPointer(fileName), + }), 16) + if err != nil { + return nil, errors.Wrapf(err, "get object %s", fileName) + } + return internal.NewFD(uint32(ptr)), nil +} + +func bpfGetObjectInfoByFD(fd *internal.FD, info unsafe.Pointer, size uintptr) error { + value, err := fd.Value() + if err != nil { + return err + } + + // available from 4.13 + attr := bpfObjGetInfoByFDAttr{ + fd: value, + infoLen: uint32(size), + info: internal.NewPointer(info), + } + _, err = internal.BPF(_ObjGetInfoByFD, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return errors.Wrapf(err, "fd %d", fd) +} + +func bpfGetProgInfoByFD(fd *internal.FD) (*bpfProgInfo, error) { + var info bpfProgInfo + err := bpfGetObjectInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info)) + return &info, errors.Wrap(err, "can't get program info") +} + +func bpfGetMapInfoByFD(fd *internal.FD) (*bpfMapInfo, error) { + var info bpfMapInfo + err := bpfGetObjectInfoByFD(fd, 
unsafe.Pointer(&info), unsafe.Sizeof(info)) + return &info, errors.Wrap(err, "can't get map info") +} + +var haveObjName = internal.FeatureTest("object names", "4.15", func() bool { + name, err := newBPFObjName("feature_test") + if err != nil { + // This really is a fatal error, but it should be caught + // by the unit tests not working. + return false + } + + attr := bpfMapCreateAttr{ + mapType: Array, + keySize: 4, + valueSize: 4, + maxEntries: 1, + mapName: name, + } + + fd, err := bpfMapCreate(&attr) + if err != nil { + return false + } + + _ = fd.Close() + return true +}) + +func bpfGetMapFDByID(id uint32) (*internal.FD, error) { + // available from 4.13 + attr := bpfGetFDByIDAttr{ + id: id, + } + ptr, err := internal.BPF(_MapGetFDByID, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + if err != nil { + return nil, errors.Wrapf(err, "can't get fd for map id %d", id) + } + return internal.NewFD(uint32(ptr)), nil +} + +func bpfGetProgramFDByID(id uint32) (*internal.FD, error) { + // available from 4.13 + attr := bpfGetFDByIDAttr{ + id: id, + } + ptr, err := internal.BPF(_ProgGetFDByID, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + if err != nil { + return nil, errors.Wrapf(err, "can't get fd for program id %d", id) + } + return internal.NewFD(uint32(ptr)), nil +} diff --git a/vendor/github.com/cilium/ebpf/types.go b/vendor/github.com/cilium/ebpf/types.go new file mode 100644 index 000000000..35f718360 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/types.go @@ -0,0 +1,201 @@ +package ebpf + +//go:generate stringer -output types_string.go -type=MapType,ProgramType + +// MapType indicates the type map structure +// that will be initialized in the kernel. 
type MapType uint32

// All the various map types that can be created.
//
// The numeric value of each constant is its position in this list (iota,
// starting at 0), so entries must never be reordered or removed.
const (
	// UnspecifiedMap is the zero value of MapType.
	UnspecifiedMap MapType = iota
	// Hash is a hash map
	Hash
	// Array is an array map
	Array
	// ProgramArray - A program array map is a special kind of array map whose map
	// values contain only file descriptors referring to other eBPF
	// programs. Thus, both the key_size and value_size must be
	// exactly four bytes. This map is used in conjunction with the
	// TailCall helper.
	ProgramArray
	// PerfEventArray - A perf event array is used in conjunction with PerfEventRead
	// and PerfEventOutput calls, to read the raw bpf_perf_data from the registers.
	PerfEventArray
	// PerCPUHash - This data structure is useful for people who have high performance
	// network needs and can reconcile adds at the end of some cycle, so that
	// hashes can be lock free without the use of XAdd, which can be costly.
	PerCPUHash
	// PerCPUArray - The array counterpart of PerCPUHash: useful for people who have
	// high performance network needs and can reconcile adds at the end of some cycle,
	// so that updates can be lock free without the use of XAdd, which can be costly.
	// Each CPU gets its own copy of the values, the contents of all of which can be
	// reconciled later.
	PerCPUArray
	// StackTrace - This holds whole user and kernel stack traces, it can be retrieved with
	// GetStackID
	StackTrace
	// CGroupArray - This is a very niche structure used to help SKBInCGroup determine
	// if an skb is from a socket belonging to a specific cgroup
	CGroupArray
	// LRUHash - This allows you to create a small hash structure that will purge the
	// least recently used items rather than throw an error when you run out of memory
	LRUHash
	// LRUCPUHash - This is NOT like PerCPUHash, this structure is shared among the CPUs,
	// it has more to do with including the CPU id with the LRU calculation so that if a
	// particular CPU is using a value over-and-over again, then it will be saved, but if
	// a value is being retrieved a lot but sparsely across CPUs it is not as important, basically
	// giving weight to CPU locality over overall usage.
	LRUCPUHash
	// LPMTrie - This is an implementation of Longest-Prefix-Match Trie structure. It is useful
	// for storing things like IP addresses which can be bit masked allowing for keys of differing
	// values to refer to the same reference based on their masks. See wikipedia for more details.
	LPMTrie
	// ArrayOfMaps - Each item in the array is another map. The inner map mustn't be a map of maps
	// itself.
	ArrayOfMaps
	// HashOfMaps - Each item in the hash map is another map. The inner map mustn't be a map of maps
	// itself.
	HashOfMaps
)
+func (mt MapType) hasPerCPUValue() bool { + if mt == PerCPUHash || mt == PerCPUArray { + return true + } + return false +} + +const ( + _MapCreate = iota + _MapLookupElem + _MapUpdateElem + _MapDeleteElem + _MapGetNextKey + _ProgLoad + _ObjPin + _ObjGet + _ProgAttach + _ProgDetach + _ProgTestRun + _ProgGetNextID + _MapGetNextID + _ProgGetFDByID + _MapGetFDByID + _ObjGetInfoByFD + _ProgQuery + _RawTracepointOpen + _BTFLoad + _BTFGetFDByID + _TaskFDQuery + _MapLookupAndDeleteElem + _MapFreeze +) + +const ( + _Any = iota + _NoExist + _Exist +) + +// ProgramType of the eBPF program +type ProgramType uint32 + +// eBPF program types +const ( + // Unrecognized program type + UnspecifiedProgram ProgramType = iota + // SocketFilter socket or seccomp filter + SocketFilter + // Kprobe program + Kprobe + // SchedCLS traffic control shaper + SchedCLS + // SchedACT routing control shaper + SchedACT + // TracePoint program + TracePoint + // XDP program + XDP + // PerfEvent program + PerfEvent + // CGroupSKB program + CGroupSKB + // CGroupSock program + CGroupSock + // LWTIn program + LWTIn + // LWTOut program + LWTOut + // LWTXmit program + LWTXmit + // SockOps program + SockOps + // SkSKB program + SkSKB + // CGroupDevice program + CGroupDevice + // SkMsg program + SkMsg + // RawTracepoint program + RawTracepoint + // CGroupSockAddr program + CGroupSockAddr + // LWTSeg6Local program + LWTSeg6Local + // LircMode2 program + LircMode2 + // SkReuseport program + SkReuseport + // FlowDissector program + FlowDissector + // CGroupSysctl program + CGroupSysctl + // RawTracepointWritable program + RawTracepointWritable + // CGroupSockopt program + CGroupSockopt + // Tracing program + Tracing +) + +// AttachType of the eBPF program, needed to differentiate allowed context accesses in +// some newer program types like CGroupSockAddr. Should be set to AttachNone if not required. +// Will cause invalid argument (EINVAL) at program load time if set incorrectly. 
type AttachType uint32

// AttachNone is an alias for AttachCGroupInetIngress for readability reasons
// (both have the value 0).
const AttachNone AttachType = 0

// Known attach types.
//
// The numeric value of each constant is its position in this list (iota,
// starting at 0), so entries must never be reordered or removed.
// NOTE(review): presumably this mirrors the kernel's attach type
// enumeration - confirm against the kernel headers before extending it.
const (
	AttachCGroupInetIngress AttachType = iota
	AttachCGroupInetEgress
	AttachCGroupInetSockCreate
	AttachCGroupSockOps
	AttachSkSKBStreamParser
	AttachSkSKBStreamVerdict
	AttachCGroupDevice
	AttachSkMsgVerdict
	AttachCGroupInet4Bind
	AttachCGroupInet6Bind
	AttachCGroupInet4Connect
	AttachCGroupInet6Connect
	AttachCGroupInet4PostBind
	AttachCGroupInet6PostBind
	AttachCGroupUDP4Sendmsg
	AttachCGroupUDP6Sendmsg
	AttachLircMode2
	AttachFlowDissector
	AttachCGroupSysctl
	AttachCGroupUDP4Recvmsg
	AttachCGroupUDP6Recvmsg
	AttachCGroupGetsockopt
	AttachCGroupSetsockopt
	AttachTraceRawTp
	AttachTraceFEntry
	AttachTraceFExit
)

// AttachFlags of the eBPF program used in BPF_PROG_ATTACH command
type AttachFlags uint32
+ var x [1]struct{} + _ = x[UnspecifiedMap-0] + _ = x[Hash-1] + _ = x[Array-2] + _ = x[ProgramArray-3] + _ = x[PerfEventArray-4] + _ = x[PerCPUHash-5] + _ = x[PerCPUArray-6] + _ = x[StackTrace-7] + _ = x[CGroupArray-8] + _ = x[LRUHash-9] + _ = x[LRUCPUHash-10] + _ = x[LPMTrie-11] + _ = x[ArrayOfMaps-12] + _ = x[HashOfMaps-13] +} + +const _MapType_name = "UnspecifiedMapHashArrayProgramArrayPerfEventArrayPerCPUHashPerCPUArrayStackTraceCGroupArrayLRUHashLRUCPUHashLPMTrieArrayOfMapsHashOfMaps" + +var _MapType_index = [...]uint8{0, 14, 18, 23, 35, 49, 59, 70, 80, 91, 98, 108, 115, 126, 136} + +func (i MapType) String() string { + if i >= MapType(len(_MapType_index)-1) { + return "MapType(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _MapType_name[_MapType_index[i]:_MapType_index[i+1]] +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[UnspecifiedProgram-0] + _ = x[SocketFilter-1] + _ = x[Kprobe-2] + _ = x[SchedCLS-3] + _ = x[SchedACT-4] + _ = x[TracePoint-5] + _ = x[XDP-6] + _ = x[PerfEvent-7] + _ = x[CGroupSKB-8] + _ = x[CGroupSock-9] + _ = x[LWTIn-10] + _ = x[LWTOut-11] + _ = x[LWTXmit-12] + _ = x[SockOps-13] + _ = x[SkSKB-14] + _ = x[CGroupDevice-15] + _ = x[SkMsg-16] + _ = x[RawTracepoint-17] + _ = x[CGroupSockAddr-18] + _ = x[LWTSeg6Local-19] + _ = x[LircMode2-20] + _ = x[SkReuseport-21] + _ = x[FlowDissector-22] + _ = x[CGroupSysctl-23] + _ = x[RawTracepointWritable-24] + _ = x[CGroupSockopt-25] +} + +const _ProgramType_name = "UnspecifiedProgramSocketFilterKprobeSchedCLSSchedACTTracePointXDPPerfEventCGroupSKBCGroupSockLWTInLWTOutLWTXmitSockOpsSkSKBCGroupDeviceSkMsgRawTracepointCGroupSockAddrLWTSeg6LocalLircMode2SkReuseportFlowDissectorCGroupSysctlRawTracepointWritableCGroupSockopt" + +var _ProgramType_index = [...]uint16{0, 18, 30, 36, 44, 52, 62, 65, 74, 83, 93, 98, 104, 111, 118, 123, 135, 140, 153, 167, 179, 
188, 199, 212, 224, 245, 258} + +func (i ProgramType) String() string { + if i >= ProgramType(len(_ProgramType_index)-1) { + return "ProgramType(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _ProgramType_name[_ProgramType_index[i]:_ProgramType_index[i+1]] +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go index 25ff51589..c0a965923 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go @@ -37,8 +37,18 @@ type Manager interface { // restore the object later. GetPaths() map[string]string + // GetUnifiedPath returns the unified path when running in unified mode. + // The value corresponds to the all values of GetPaths() map. + // + // GetUnifiedPath returns error when running in hybrid mode as well as + // in legacy mode. + GetUnifiedPath() (string, error) + // Sets the cgroup as configured. Set(container *configs.Config) error + + // Gets the cgroup as configured. 
+ GetCgroups() (*configs.Cgroup, error) } type NotFoundError struct { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go new file mode 100644 index 000000000..847ce8ef1 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go @@ -0,0 +1,180 @@ +// Package devicefilter containes eBPF device filter program +// +// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c +// +// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano) +// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397 +package devicefilter + +import ( + "fmt" + "math" + + "github.com/cilium/ebpf/asm" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +const ( + // license string format is same as kernel MODULE_LICENSE macro + license = "Apache" +) + +// DeviceFilter returns eBPF device filter program and its license string +func DeviceFilter(devices []*configs.Device) (asm.Instructions, string, error) { + p := &program{} + p.init() + for i := len(devices) - 1; i >= 0; i-- { + if err := p.appendDevice(devices[i]); err != nil { + return nil, "", err + } + } + insts, err := p.finalize() + return insts, license, err +} + +type program struct { + insts asm.Instructions + hasWildCard bool + blockID int +} + +func (p *program) init() { + // struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423 + /* + u32 access_type + u32 major + u32 minor + */ + // R2 <- type (lower 16 bit of u32 access_type at R1[0]) + p.insts = append(p.insts, + asm.LoadMem(asm.R2, asm.R1, 0, asm.Half)) + + // R3 <- access (upper 16 bit of u32 access_type at R1[0]) + 
p.insts = append(p.insts, + asm.LoadMem(asm.R3, asm.R1, 0, asm.Word), + // RSh: bitwise shift right + asm.RSh.Imm32(asm.R3, 16)) + + // R4 <- major (u32 major at R1[4]) + p.insts = append(p.insts, + asm.LoadMem(asm.R4, asm.R1, 4, asm.Word)) + + // R5 <- minor (u32 minor at R1[8]) + p.insts = append(p.insts, + asm.LoadMem(asm.R5, asm.R1, 8, asm.Word)) +} + +// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element. +func (p *program) appendDevice(dev *configs.Device) error { + if p.blockID < 0 { + return errors.New("the program is finalized") + } + if p.hasWildCard { + // All entries after wildcard entry are ignored + return nil + } + + bpfType := int32(-1) + hasType := true + switch dev.Type { + case 'c': + bpfType = int32(unix.BPF_DEVCG_DEV_CHAR) + case 'b': + bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK) + case 'a': + hasType = false + default: + // if not specified in OCI json, typ is set to DeviceTypeAll + return errors.Errorf("invalid DeviceType %q", string(dev.Type)) + } + if dev.Major > math.MaxUint32 { + return errors.Errorf("invalid major %d", dev.Major) + } + if dev.Minor > math.MaxUint32 { + return errors.Errorf("invalid minor %d", dev.Major) + } + hasMajor := dev.Major >= 0 // if not specified in OCI json, major is set to -1 + hasMinor := dev.Minor >= 0 + bpfAccess := int32(0) + for _, r := range dev.Permissions { + switch r { + case 'r': + bpfAccess |= unix.BPF_DEVCG_ACC_READ + case 'w': + bpfAccess |= unix.BPF_DEVCG_ACC_WRITE + case 'm': + bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD + default: + return errors.Errorf("unknown device access %v", r) + } + } + // If the access is rwm, skip the check. 
+ hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD) + + blockSym := fmt.Sprintf("block-%d", p.blockID) + nextBlockSym := fmt.Sprintf("block-%d", p.blockID+1) + prevBlockLastIdx := len(p.insts) - 1 + if hasType { + p.insts = append(p.insts, + // if (R2 != bpfType) goto next + asm.JNE.Imm(asm.R2, bpfType, nextBlockSym), + ) + } + if hasAccess { + p.insts = append(p.insts, + // if (R3 & bpfAccess == 0 /* use R1 as a temp var */) goto next + asm.Mov.Reg32(asm.R1, asm.R3), + asm.And.Imm32(asm.R1, bpfAccess), + asm.JEq.Imm(asm.R1, 0, nextBlockSym), + ) + } + if hasMajor { + p.insts = append(p.insts, + // if (R4 != major) goto next + asm.JNE.Imm(asm.R4, int32(dev.Major), nextBlockSym), + ) + } + if hasMinor { + p.insts = append(p.insts, + // if (R5 != minor) goto next + asm.JNE.Imm(asm.R5, int32(dev.Minor), nextBlockSym), + ) + } + if !hasType && !hasAccess && !hasMajor && !hasMinor { + p.hasWildCard = true + } + p.insts = append(p.insts, acceptBlock(dev.Allow)...) 
+ // set blockSym to the first instruction we added in this iteration + p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym) + p.blockID++ + return nil +} + +func (p *program) finalize() (asm.Instructions, error) { + if p.hasWildCard { + // acceptBlock with asm.Return() is already inserted + return p.insts, nil + } + blockSym := fmt.Sprintf("block-%d", p.blockID) + p.insts = append(p.insts, + // R0 <- 0 + asm.Mov.Imm32(asm.R0, 0).Sym(blockSym), + asm.Return(), + ) + p.blockID = -1 + return p.insts, nil +} + +func acceptBlock(accept bool) asm.Instructions { + v := int32(0) + if accept { + v = 1 + } + return []asm.Instruction{ + // R0 <- v + asm.Mov.Imm32(asm.R0, v), + asm.Return(), + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf.go new file mode 100644 index 000000000..4795e0aa3 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf.go @@ -0,0 +1,45 @@ +package ebpf + +import ( + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/asm" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/ directory. +// +// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 . +// +// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92 +func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD int) (func() error, error) { + nilCloser := func() error { + return nil + } + // Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167). + // This limit is not inherited into the container. 
+ memlockLimit := &unix.Rlimit{ + Cur: unix.RLIM_INFINITY, + Max: unix.RLIM_INFINITY, + } + _ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit) + spec := &ebpf.ProgramSpec{ + Type: ebpf.CGroupDevice, + Instructions: insts, + License: license, + } + prog, err := ebpf.NewProgram(spec) + if err != nil { + return nilCloser, err + } + if err := prog.Attach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil { + return nilCloser, errors.Wrap(err, "failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)") + } + closer := func() error { + if err := prog.Detach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil { + return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)") + } + return nil + } + return closer, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go index 22d82acb4..fc9a164f8 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go @@ -3,7 +3,6 @@ package fs import ( - "errors" "fmt" "io" "io/ioutil" @@ -14,10 +13,12 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/pkg/errors" + "golang.org/x/sys/unix" ) var ( - subsystems = subsystemSet{ + subsystemsLegacy = subsystemSet{ &CpusetGroup{}, &DevicesGroup{}, &MemoryGroup{}, @@ -32,10 +33,19 @@ var ( &FreezerGroup{}, &NameGroup{GroupName: "name=systemd", Join: true}, } + subsystemsUnified = subsystemSet{ + &CpusetGroupV2{}, + &FreezerGroupV2{}, + &CpuGroupV2{}, + &MemoryGroupV2{}, + &IOGroupV2{}, + &PidsGroupV2{}, + &DevicesGroupV2{}, + } HugePageSizes, _ = cgroups.GetHugePageSize() ) -var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does 
not exist") +var errSubsystemDoesNotExist = fmt.Errorf("cgroup: subsystem does not exist") type subsystemSet []subsystem @@ -62,9 +72,10 @@ type subsystem interface { } type Manager struct { - mu sync.Mutex - Cgroups *configs.Cgroup - Paths map[string]string + mu sync.Mutex + Cgroups *configs.Cgroup + Rootless bool // ignore permission-related errors + Paths map[string]string } // The absolute path to the root of the cgroup hierarchies. @@ -100,6 +111,40 @@ type cgroupData struct { pid int } +// isIgnorableError returns whether err is a permission error (in the loose +// sense of the word). This includes EROFS (which for an unprivileged user is +// basically a permission error) and EACCES (for similar reasons) as well as +// the normal EPERM. +func isIgnorableError(rootless bool, err error) bool { + // We do not ignore errors if we are root. + if !rootless { + return false + } + // Is it an ordinary EPERM? + if os.IsPermission(errors.Cause(err)) { + return true + } + + // Try to handle other errnos. 
+ var errno error + switch err := errors.Cause(err).(type) { + case *os.PathError: + errno = err.Err + case *os.LinkError: + errno = err.Err + case *os.SyscallError: + errno = err.Err + } + return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES +} + +func (m *Manager) getSubsystems() subsystemSet { + if cgroups.IsCgroup2UnifiedMode() { + return subsystemsUnified + } + return subsystemsLegacy +} + func (m *Manager) Apply(pid int) (err error) { if m.Cgroups == nil { return nil @@ -129,7 +174,7 @@ func (m *Manager) Apply(pid int) (err error) { return cgroups.EnterPid(m.Paths, pid) } - for _, sys := range subsystems { + for _, sys := range m.getSubsystems() { // TODO: Apply should, ideally, be reentrant or be broken up into a separate // create and join phase so that the cgroup hierarchy for a container can be // created then join consists of writing the process pids to cgroup.procs @@ -145,14 +190,23 @@ func (m *Manager) Apply(pid int) (err error) { m.Paths[sys.Name()] = p if err := sys.Apply(d); err != nil { + // In the case of rootless (including euid=0 in userns), where an explicit cgroup path hasn't + // been set, we don't bail on error in case of permission problems. + // Cases where limits have been set (and we couldn't create our own + // cgroup) are handled by Set. 
+ if isIgnorableError(m.Rootless, err) && m.Cgroups.Path == "" { + delete(m.Paths, sys.Name()) + continue + } return err } + } return nil } func (m *Manager) Destroy() error { - if m.Cgroups.Paths != nil { + if m.Cgroups == nil || m.Cgroups.Paths != nil { return nil } m.mu.Lock() @@ -171,12 +225,34 @@ func (m *Manager) GetPaths() map[string]string { return paths } +func (m *Manager) GetUnifiedPath() (string, error) { + if !cgroups.IsCgroup2UnifiedMode() { + return "", errors.New("unified path is only supported when running in unified mode") + } + unifiedPath := "" + m.mu.Lock() + defer m.mu.Unlock() + for k, v := range m.Paths { + if unifiedPath == "" { + unifiedPath = v + } else if v != unifiedPath { + return unifiedPath, + errors.Errorf("expected %q path to be unified path %q, got %q", k, unifiedPath, v) + } + } + if unifiedPath == "" { + // FIXME: unified path could be detected even when no controller is available + return unifiedPath, errors.New("cannot detect unified path") + } + return unifiedPath, nil +} + func (m *Manager) GetStats() (*cgroups.Stats, error) { m.mu.Lock() defer m.mu.Unlock() stats := cgroups.NewStats() for name, path := range m.Paths { - sys, err := subsystems.Get(name) + sys, err := m.getSubsystems().Get(name) if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { continue } @@ -188,16 +264,31 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { } func (m *Manager) Set(container *configs.Config) error { + if container.Cgroups == nil { + return nil + } + // If Paths are set, then we are just joining cgroups paths // and there is no need to set any values. 
- if m.Cgroups.Paths != nil { + if m.Cgroups != nil && m.Cgroups.Paths != nil { return nil } paths := m.GetPaths() - for _, sys := range subsystems { + for _, sys := range m.getSubsystems() { path := paths[sys.Name()] if err := sys.Set(path, container.Cgroups); err != nil { + if m.Rootless && sys.Name() == "devices" { + continue + } + // When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work. + // However, errors from other subsystems are not ignored. + // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if path == "" { + // We never created a path for this cgroup, so we cannot set + // limits for it (though we have already tried at this point). + return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) + } return err } } @@ -213,11 +304,15 @@ func (m *Manager) Set(container *configs.Config) error { // Freeze toggles the container's freezer cgroup depending on the state // provided func (m *Manager) Freeze(state configs.FreezerState) error { + if m.Cgroups == nil { + return errors.New("cannot toggle freezer: cgroups not configured for container") + } + paths := m.GetPaths() dir := paths["freezer"] prevState := m.Cgroups.Resources.Freezer m.Cgroups.Resources.Freezer = state - freezer, err := subsystems.Get("freezer") + freezer, err := m.getSubsystems().Get("freezer") if err != nil { return err } @@ -230,11 +325,25 @@ func (m *Manager) Freeze(state configs.FreezerState) error { } func (m *Manager) GetPids() ([]int, error) { + if cgroups.IsCgroup2UnifiedMode() { + path, err := m.GetUnifiedPath() + if err != nil { + return nil, err + } + return cgroups.GetPids(path) + } paths := m.GetPaths() return cgroups.GetPids(paths["devices"]) } func (m *Manager) GetAllPids() ([]int, error) { + if cgroups.IsCgroup2UnifiedMode() { + path, err := m.GetUnifiedPath() + if err != nil { + return nil, err + } + return 
cgroups.GetAllPids(path) + } paths := m.GetPaths() return cgroups.GetAllPids(paths["devices"]) } @@ -268,7 +377,7 @@ func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { } func (raw *cgroupData) path(subsystem string) (string, error) { - mnt, err := cgroups.FindCgroupMountpoint(subsystem) + mnt, err := cgroups.FindCgroupMountpoint(raw.root, subsystem) // If we didn't mount the subsystem, there is no point we make the path. if err != nil { return "", err @@ -358,3 +467,7 @@ func CheckCpushares(path string, c uint64) error { return nil } + +func (m *Manager) GetCgroups() (*configs.Cgroup, error) { + return m.Cgroups, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go index b712bd0b1..e240a8313 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go @@ -46,11 +46,7 @@ func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error } // because we are not using d.join we need to place the pid into the procs file // unlike the other subsystems - if err := cgroups.WriteCgroupProc(path, pid); err != nil { - return err - } - - return nil + return cgroups.WriteCgroupProc(path, pid) } func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error { @@ -83,11 +79,7 @@ func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error { return err } } - if err := s.SetRtSched(path, cgroup); err != nil { - return err - } - - return nil + return s.SetRtSched(path, cgroup) } func (s *CpuGroup) Remove(d *cgroupData) error { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu_v2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu_v2.go new file mode 100644 index 000000000..245071ae5 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu_v2.go @@ -0,0 +1,92 @@ 
+// +build linux + +package fs + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type CpuGroupV2 struct { +} + +func (s *CpuGroupV2) Name() string { + return "cpu" +} + +func (s *CpuGroupV2) Apply(d *cgroupData) error { + // We always want to join the cpu group, to allow fair cpu scheduling + // on a container basis + path, err := d.path("cpu") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return s.ApplyDir(path, d.config, d.pid) +} + +func (s *CpuGroupV2) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error { + // This might happen if we have no cpu cgroup mounted. + // Just do nothing and don't fail. + if path == "" { + return nil + } + if err := os.MkdirAll(path, 0755); err != nil { + return err + } + return cgroups.WriteCgroupProc(path, pid) +} + +func (s *CpuGroupV2) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.CpuWeight != 0 { + if err := writeFile(path, "cpu.weight", strconv.FormatUint(cgroup.Resources.CpuWeight, 10)); err != nil { + return err + } + } + + if cgroup.Resources.CpuMax != "" { + if err := writeFile(path, "cpu.max", cgroup.Resources.CpuMax); err != nil { + return err + } + } + + return nil +} + +func (s *CpuGroupV2) Remove(d *cgroupData) error { + return removePath(d.path("cpu")) +} + +func (s *CpuGroupV2) GetStats(path string, stats *cgroups.Stats) error { + f, err := os.Open(filepath.Join(path, "cpu.stat")) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + t, v, err := getCgroupParamKeyValue(sc.Text()) + if err != nil { + return err + } + switch t { + case "usage_usec": + stats.CpuStats.CpuUsage.TotalUsage = v * 1000 + + case "user_usec": + stats.CpuStats.CpuUsage.UsageInUsermode = v * 1000 + + case "system_usec": + stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000 + 
} + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go index 53afbaddf..032b76ecf 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go @@ -85,8 +85,8 @@ func getCpuUsageBreakdown(path string) (uint64, uint64, error) { return 0, 0, err } fields := strings.Fields(string(data)) - if len(fields) != 4 { - return 0, 0, fmt.Errorf("failure - %s is expected to have 4 fields", filepath.Join(path, cgroupCpuacctStat)) + if len(fields) < 4 { + return 0, 0, fmt.Errorf("failure - %s is expected to have at least 4 fields", filepath.Join(path, cgroupCpuacctStat)) } if fields[0] != userField { return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[0], cgroupCpuacctStat, userField) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go index 20c9eafac..5a1d152ea 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go @@ -77,18 +77,14 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro // The logic is, if user specified cpuset configs, use these // specified configs, otherwise, inherit from parent. This makes // cpuset configs work correctly with 'cpuset.cpu_exclusive', and - // keep backward compatbility. + // keep backward compatibility. 
if err := s.ensureCpusAndMems(dir, cgroup); err != nil { return err } // because we are not using d.join we need to place the pid into the procs file // unlike the other subsystems - if err := cgroups.WriteCgroupProc(dir, pid); err != nil { - return err - } - - return nil + return cgroups.WriteCgroupProc(dir, pid) } func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset_v2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset_v2.go new file mode 100644 index 000000000..35c194be1 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset_v2.go @@ -0,0 +1,162 @@ +// +build linux + +package fs + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "path/filepath" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" +) + +type CpusetGroupV2 struct { +} + +func (s *CpusetGroupV2) Name() string { + return "cpuset" +} + +func (s *CpusetGroupV2) Apply(d *cgroupData) error { + if d.config.Resources.CpusetCpus == "" && d.config.Resources.CpusetMems == "" { + return nil + } + dir, err := d.path("cpuset") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return s.applyDir(dir, d.config, d.pid) +} + +func (s *CpusetGroupV2) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.CpusetCpus != "" { + if err := writeFile(path, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil { + return err + } + } + if cgroup.Resources.CpusetMems != "" { + if err := writeFile(path, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil { + return err + } + } + return nil +} + +func (s *CpusetGroupV2) Remove(d *cgroupData) error { + return removePath(d.path("cpuset")) +} + +func (s *CpusetGroupV2) GetStats(path string, stats *cgroups.Stats) error { + return 
nil +} + +func (s *CpusetGroupV2) applyDir(dir string, cgroup *configs.Cgroup, pid int) error { + // This might happen if we have no cpuset cgroup mounted. + // Just do nothing and don't fail. + if dir == "" { + return nil + } + mountInfo, err := ioutil.ReadFile("/proc/self/mountinfo") + if err != nil { + return err + } + root := filepath.Dir(cgroups.GetClosestMountpointAncestor(dir, string(mountInfo))) + // 'ensureParent' start with parent because we don't want to + // explicitly inherit from parent, it could conflict with + // 'cpuset.cpu_exclusive'. + if err := s.ensureParent(filepath.Dir(dir), root); err != nil { + return err + } + if err := os.MkdirAll(dir, 0755); err != nil { + return err + } + // We didn't inherit cpuset configs from parent, but we have + // to ensure cpuset configs are set before moving task into the + // cgroup. + // The logic is, if user specified cpuset configs, use these + // specified configs, otherwise, inherit from parent. This makes + // cpuset configs work correctly with 'cpuset.cpu_exclusive', and + // keep backward compatibility. + if err := s.ensureCpusAndMems(dir, cgroup); err != nil { + return err + } + + // because we are not using d.join we need to place the pid into the procs file + // unlike the other subsystems + return cgroups.WriteCgroupProc(dir, pid) +} + +func (s *CpusetGroupV2) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) { + if cpus, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.cpus.effective")); err != nil { + return + } + if mems, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.mems.effective")); err != nil { + return + } + return cpus, mems, nil +} + +// ensureParent makes sure that the parent directory of current is created +// and populated with the proper cpus and mems files copied from +// it's parent. 
+func (s *CpusetGroupV2) ensureParent(current, root string) error { + parent := filepath.Dir(current) + if libcontainerUtils.CleanPath(parent) == root { + return nil + } + // Avoid infinite recursion. + if parent == current { + return fmt.Errorf("cpuset: cgroup parent path outside cgroup root") + } + if err := s.ensureParent(parent, root); err != nil { + return err + } + if err := os.MkdirAll(current, 0755); err != nil { + return err + } + return s.copyIfNeeded(current, parent) +} + +// copyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent +// directory to the current directory if the file's contents are 0 +func (s *CpusetGroupV2) copyIfNeeded(current, parent string) error { + var ( + err error + currentCpus, currentMems []byte + parentCpus, parentMems []byte + ) + + if currentCpus, currentMems, err = s.getSubsystemSettings(current); err != nil { + return err + } + if parentCpus, parentMems, err = s.getSubsystemSettings(parent); err != nil { + return err + } + + if s.isEmpty(currentCpus) { + if err := writeFile(current, "cpuset.cpus", string(parentCpus)); err != nil { + return err + } + } + if s.isEmpty(currentMems) { + if err := writeFile(current, "cpuset.mems", string(parentMems)); err != nil { + return err + } + } + return nil +} + +func (s *CpusetGroupV2) isEmpty(b []byte) bool { + return len(bytes.Trim(b, "\n")) == 0 +} + +func (s *CpusetGroupV2) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error { + if err := s.Set(path, cgroup); err != nil { + return err + } + return s.copyIfNeeded(path, filepath.Dir(path)) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices_v2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices_v2.go new file mode 100644 index 000000000..98512539e --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices_v2.go @@ -0,0 +1,85 @@ +// +build linux + +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + 
"github.com/opencontainers/runc/libcontainer/cgroups/ebpf" + "github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +type DevicesGroupV2 struct { +} + +func (s *DevicesGroupV2) Name() string { + return "devices" +} + +func (s *DevicesGroupV2) Apply(d *cgroupData) error { + return nil +} + +func isRWM(cgroupPermissions string) bool { + r := false + w := false + m := false + for _, rn := range cgroupPermissions { + switch rn { + case 'r': + r = true + case 'w': + w = true + case 'm': + m = true + } + } + return r && w && m +} + +// the logic is from crun +// https://github.com/containers/crun/blob/0.10.2/src/libcrun/cgroup.c#L1644-L1652 +func canSkipEBPFError(cgroup *configs.Cgroup) bool { + for _, dev := range cgroup.Resources.Devices { + if dev.Allow || !isRWM(dev.Permissions) { + return false + } + } + return true +} + +func (s *DevicesGroupV2) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.AllowAllDevices != nil { + // never set by OCI specconv + return errors.New("libcontainer AllowAllDevices is not supported, use Devices") + } + if len(cgroup.Resources.DeniedDevices) != 0 { + // never set by OCI specconv + return errors.New("libcontainer DeniedDevices is not supported, use Devices") + } + insts, license, err := devicefilter.DeviceFilter(cgroup.Devices) + if err != nil { + return err + } + dirFD, err := unix.Open(path, unix.O_DIRECTORY|unix.O_RDONLY, 0600) + if err != nil { + return errors.Errorf("cannot get dir FD for %s", path) + } + defer unix.Close(dirFD) + if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil { + if !canSkipEBPFError(cgroup) { + return err + } + } + return nil +} + +func (s *DevicesGroupV2) Remove(d *cgroupData) error { + return nil +} + +func (s *DevicesGroupV2) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git 
a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go index e70dfe3b9..4b19f8a97 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go @@ -29,11 +29,15 @@ func (s *FreezerGroup) Apply(d *cgroupData) error { func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error { switch cgroup.Resources.Freezer { case configs.Frozen, configs.Thawed: - if err := writeFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil { - return err - } - for { + // In case this loop does not exit because it doesn't get the expected + // state, let's write again this state, hoping it's going to be properly + // set this time. Otherwise, this loop could run infinitely, waiting for + // a state change that would never happen. + if err := writeFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil { + return err + } + state, err := readFile(path, "freezer.state") if err != nil { return err @@ -41,6 +45,7 @@ func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error { if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) { break } + time.Sleep(1 * time.Millisecond) } case configs.Undefined: diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer_v2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer_v2.go new file mode 100644 index 000000000..186de9ab4 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer_v2.go @@ -0,0 +1,74 @@ +// +build linux + +package fs + +import ( + "fmt" + "strings" + "time" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type FreezerGroupV2 struct { +} + +func (s *FreezerGroupV2) Name() string { + return "freezer" +} + +func (s *FreezerGroupV2) 
Apply(d *cgroupData) error { + _, err := d.join("freezer") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *FreezerGroupV2) Set(path string, cgroup *configs.Cgroup) error { + var desiredState string + filename := "cgroup.freeze" + if cgroup.Resources.Freezer == configs.Frozen { + desiredState = "1" + } else { + desiredState = "0" + } + + switch cgroup.Resources.Freezer { + case configs.Frozen, configs.Thawed: + for { + // In case this loop does not exit because it doesn't get the expected + // state, let's write again this state, hoping it's going to be properly + // set this time. Otherwise, this loop could run infinitely, waiting for + // a state change that would never happen. + if err := writeFile(path, filename, desiredState); err != nil { + return err + } + + state, err := readFile(path, filename) + if err != nil { + return err + } + if strings.TrimSpace(state) == desiredState { + break + } + + time.Sleep(1 * time.Millisecond) + } + case configs.Undefined: + return nil + default: + return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer)) + } + + return nil +} + +func (s *FreezerGroupV2) Remove(d *cgroupData) error { + return removePath(d.path("freezer")) +} + +func (s *FreezerGroupV2) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/io_v2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/io_v2.go new file mode 100644 index 000000000..477f7325b --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/io_v2.go @@ -0,0 +1,149 @@ +// +build linux + +package fs + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type IOGroupV2 struct { +} + +func (s *IOGroupV2) Name() string { + return "io" +} + +func (s *IOGroupV2) 
Apply(d *cgroupData) error { + _, err := d.join("io") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *IOGroupV2) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.BlkioWeight != 0 { + filename := "io.bfq.weight" + if err := writeFile(path, filename, strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil { + return err + } + } + + for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice { + if err := writeFile(path, "io.max", td.StringName("rbps")); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice { + if err := writeFile(path, "io.max", td.StringName("wbps")); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice { + if err := writeFile(path, "io.max", td.StringName("riops")); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice { + if err := writeFile(path, "io.max", td.StringName("wiops")); err != nil { + return err + } + } + + return nil +} + +func (s *IOGroupV2) Remove(d *cgroupData) error { + return removePath(d.path("io")) +} + +func readCgroup2MapFile(path string, name string) (map[string][]string, error) { + ret := map[string][]string{} + p := filepath.Join("/sys/fs/cgroup", path, name) + f, err := os.Open(p) + if err != nil { + if os.IsNotExist(err) { + return ret, nil + } + return nil, err + } + defer f.Close() + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + parts := strings.Fields(line) + if len(parts) < 2 { + continue + } + ret[parts[0]] = parts[1:] + } + if err := scanner.Err(); err != nil { + return nil, err + } + return ret, nil +} + +func (s *IOGroupV2) getCgroupV2Stats(path string, stats *cgroups.Stats) error { + // more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt + var ioServiceBytesRecursive []cgroups.BlkioStatEntry + values, err := 
readCgroup2MapFile(path, "io.stat") + if err != nil { + return err + } + for k, v := range values { + d := strings.Split(k, ":") + if len(d) != 2 { + continue + } + minor, err := strconv.ParseUint(d[0], 10, 0) + if err != nil { + return err + } + major, err := strconv.ParseUint(d[1], 10, 0) + if err != nil { + return err + } + + for _, item := range v { + d := strings.Split(item, "=") + if len(d) != 2 { + continue + } + op := d[0] + + // Accommodate the cgroup v1 naming + switch op { + case "rbytes": + op = "read" + case "wbytes": + op = "write" + } + + value, err := strconv.ParseUint(d[1], 10, 0) + if err != nil { + return err + } + + entry := cgroups.BlkioStatEntry{ + Op: op, + Major: major, + Minor: minor, + Value: value, + } + ioServiceBytesRecursive = append(ioServiceBytesRecursive, entry) + } + } + stats.BlkioStats = cgroups.BlkioStats{IoServiceBytesRecursive: ioServiceBytesRecursive} + return nil +} + +func (s *IOGroupV2) GetStats(path string, stats *cgroups.Stats) error { + return s.getCgroupV2Stats(path, stats) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem.go new file mode 100644 index 000000000..69b5a1946 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem.go @@ -0,0 +1,62 @@ +// +build linux,!nokmem + +package fs + +import ( + "errors" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "syscall" // for Errno type only + + "github.com/opencontainers/runc/libcontainer/cgroups" + "golang.org/x/sys/unix" +) + +const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes" + +func EnableKernelMemoryAccounting(path string) error { + // Ensure that kernel memory is available in this kernel build. If it + // isn't, we just ignore it because EnableKernelMemoryAccounting is + // automatically called for all memory limits. 
+ if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { + return nil + } + // We have to limit the kernel memory here as it won't be accounted at all + // until a limit is set on the cgroup and limit cannot be set once the + // cgroup has children, or if there are already tasks in the cgroup. + for _, i := range []int64{1, -1} { + if err := setKernelMemory(path, i); err != nil { + return err + } + } + return nil +} + +func setKernelMemory(path string, kernelMemoryLimit int64) error { + if path == "" { + return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit) + } + if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { + // We have specifically been asked to set a kmem limit. If the kernel + // doesn't support it we *must* error out. + return errors.New("kernel memory accounting not supported by this kernel") + } + if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil { + // Check if the error number returned by the syscall is "EBUSY" + // The EBUSY signal is returned on attempts to write to the + // memory.kmem.limit_in_bytes file if the cgroup has children or + // once tasks have been attached to the cgroup + if pathErr, ok := err.(*os.PathError); ok { + if errNo, ok := pathErr.Err.(syscall.Errno); ok { + if errNo == unix.EBUSY { + return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit) + } + } + } + return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err) + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem_disabled.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem_disabled.go new file mode 100644 index 000000000..ac290fd7a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem_disabled.go @@ -0,0 +1,15 @@ +// 
+build linux,nokmem + +package fs + +import ( + "errors" +) + +func EnableKernelMemoryAccounting(path string) error { + return nil +} + +func setKernelMemory(path string, kernelMemoryLimit int64) error { + return errors.New("kernel memory accounting disabled in this runc build") +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go index ad395a5d6..d5310d569 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go @@ -5,23 +5,18 @@ package fs import ( "bufio" "fmt" - "io/ioutil" "os" "path/filepath" "strconv" "strings" - "syscall" // only for Errno "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" - - "golang.org/x/sys/unix" ) const ( - cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes" - cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" - cgroupMemoryLimit = "memory.limit_in_bytes" + cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" + cgroupMemoryLimit = "memory.limit_in_bytes" ) type MemoryGroup struct { @@ -67,44 +62,6 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) { return nil } -func EnableKernelMemoryAccounting(path string) error { - // Check if kernel memory is enabled - // We have to limit the kernel memory here as it won't be accounted at all - // until a limit is set on the cgroup and limit cannot be set once the - // cgroup has children, or if there are already tasks in the cgroup. 
- for _, i := range []int64{1, -1} { - if err := setKernelMemory(path, i); err != nil { - return err - } - } - return nil -} - -func setKernelMemory(path string, kernelMemoryLimit int64) error { - if path == "" { - return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit) - } - if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { - // kernel memory is not enabled on the system so we should do nothing - return nil - } - if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil { - // Check if the error number returned by the syscall is "EBUSY" - // The EBUSY signal is returned on attempts to write to the - // memory.kmem.limit_in_bytes file if the cgroup has children or - // once tasks have been attached to the cgroup - if pathErr, ok := err.(*os.PathError); ok { - if errNo, ok := pathErr.Err.(syscall.Errno); ok { - if errNo == unix.EBUSY { - return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit) - } - } - } - return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err) - } - return nil -} - func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error { // If the memory update is set to -1 we should also // set swap to -1, it means unlimited memory. 
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory_v2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory_v2.go new file mode 100644 index 000000000..2ad997bcc --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory_v2.go @@ -0,0 +1,164 @@ +// +build linux + +package fs + +import ( + "bufio" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type MemoryGroupV2 struct { +} + +func (s *MemoryGroupV2) Name() string { + return "memory" +} + +func (s *MemoryGroupV2) Apply(d *cgroupData) (err error) { + path, err := d.path("memory") + if err != nil && !cgroups.IsNotFound(err) { + return err + } else if path == "" { + return nil + } + if memoryAssigned(d.config) { + if _, err := os.Stat(path); os.IsNotExist(err) { + if err := os.MkdirAll(path, 0755); err != nil { + return err + } + // Only enable kernel memory accouting when this cgroup + // is created by libcontainer, otherwise we might get + // error when people use `cgroupsPath` to join an existed + // cgroup whose kernel memory is not initialized. + if err := EnableKernelMemoryAccounting(path); err != nil { + return err + } + } + } + defer func() { + if err != nil { + os.RemoveAll(path) + } + }() + + // We need to join memory cgroup after set memory limits, because + // kmem.limit_in_bytes can only be set when the cgroup is empty. 
+ _, err = d.join("memory") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func setMemoryAndSwapCgroups(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.MemorySwap != 0 { + if err := writeFile(path, "memory.swap.max", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + return err + } + } + if cgroup.Resources.Memory != 0 { + if err := writeFile(path, "memory.max", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + return err + } + } + return nil +} + +func (s *MemoryGroupV2) Set(path string, cgroup *configs.Cgroup) error { + + if err := setMemoryAndSwapCgroups(path, cgroup); err != nil { + return err + } + + if cgroup.Resources.KernelMemory != 0 { + if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil { + return err + } + } + + if cgroup.Resources.MemoryReservation != 0 { + if err := writeFile(path, "memory.high", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil { + return err + } + } + + return nil +} + +func (s *MemoryGroupV2) Remove(d *cgroupData) error { + return removePath(d.path("memory")) +} + +func (s *MemoryGroupV2) GetStats(path string, stats *cgroups.Stats) error { + // Set stats from memory.stat. 
+ statsFile, err := os.Open(filepath.Join(path, "memory.stat")) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer statsFile.Close() + + sc := bufio.NewScanner(statsFile) + for sc.Scan() { + t, v, err := getCgroupParamKeyValue(sc.Text()) + if err != nil { + return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err) + } + stats.MemoryStats.Stats[t] = v + } + stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"] + + memoryUsage, err := getMemoryDataV2(path, "") + if err != nil { + return err + } + stats.MemoryStats.Usage = memoryUsage + swapUsage, err := getMemoryDataV2(path, "swap") + if err != nil { + return err + } + stats.MemoryStats.SwapUsage = swapUsage + + stats.MemoryStats.UseHierarchy = true + return nil +} + +func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) { + memoryData := cgroups.MemoryData{} + + moduleName := "memory" + if name != "" { + moduleName = strings.Join([]string{"memory", name}, ".") + } + usage := strings.Join([]string{moduleName, "current"}, ".") + limit := strings.Join([]string{moduleName, "max"}, ".") + + value, err := getCgroupParamUint(path, usage) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", usage, err) + } + memoryData.Usage = value + + value, err = getCgroupParamUint(path, limit) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err) + } + memoryData.Limit = value + + return memoryData, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids_v2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids_v2.go new file mode 100644 index 000000000..3413a2a0d --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids_v2.go @@ -0,0 
+1,107 @@ +// +build linux + +package fs + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" +) + +type PidsGroupV2 struct { +} + +func (s *PidsGroupV2) Name() string { + return "pids" +} + +func (s *PidsGroupV2) Apply(d *cgroupData) error { + _, err := d.join("pids") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *PidsGroupV2) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.PidsLimit != 0 { + // "max" is the fallback value. + limit := "max" + + if cgroup.Resources.PidsLimit > 0 { + limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10) + } + + if err := writeFile(path, "pids.max", limit); err != nil { + return err + } + } + + return nil +} + +func (s *PidsGroupV2) Remove(d *cgroupData) error { + return removePath(d.path("pids")) +} + +func isNOTSUP(err error) bool { + switch err := err.(type) { + case *os.PathError: + return err.Err == unix.ENOTSUP + default: + return false + } +} + +func (s *PidsGroupV2) GetStats(path string, stats *cgroups.Stats) error { + current, err := getCgroupParamUint(path, "pids.current") + if os.IsNotExist(err) { + // if the controller is not enabled, let's read the list + // PIDs (or threads if cgroup.threads is enabled) + contents, err := ioutil.ReadFile(filepath.Join(path, "cgroup.procs")) + if err != nil && isNOTSUP(err) { + contents, err = ioutil.ReadFile(filepath.Join(path, "cgroup.threads")) + } + if err != nil { + return err + } + pids := make(map[string]string) + for _, i := range strings.Split(string(contents), "\n") { + if i != "" { + pids[i] = i + } + } + stats.PidsStats.Current = uint64(len(pids)) + stats.PidsStats.Limit = 0 + return nil + + } + if err != nil { + return fmt.Errorf("failed to parse pids.current - %s", err) + } + + maxString, err := getCgroupParamString(path, "pids.max") + 
if err != nil { + return fmt.Errorf("failed to parse pids.max - %s", err) + } + + // Default if pids.max == "max" is 0 -- which represents "no limit". + var max uint64 + if maxString != "max" { + max, err = parseUint(maxString, 10, 64) + if err != nil { + return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max")) + } + } + + stats.PidsStats.Current = current + stats.PidsStats.Limit = max + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go index 5ff0a1615..30922777f 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/utils.go @@ -6,6 +6,7 @@ import ( "errors" "fmt" "io/ioutil" + "math" "path/filepath" "strconv" "strings" @@ -59,8 +60,12 @@ func getCgroupParamUint(cgroupPath, cgroupFile string) (uint64, error) { if err != nil { return 0, err } + trimmed := strings.TrimSpace(string(contents)) + if trimmed == "max" { + return math.MaxUint64, nil + } - res, err := parseUint(strings.TrimSpace(string(contents)), 10, 64) + res, err := parseUint(trimmed, 10, 64) if err != nil { return res, fmt.Errorf("unable to parse %q as a uint from Cgroup file %q", string(contents), fileName) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go index a65d8e443..ef0db5aeb 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go @@ -1,4 +1,4 @@ -// +build !linux static_build +// +build !linux package systemd @@ -18,6 +18,10 @@ func UseSystemd() bool { return false } +func NewSystemdCgroupsManager() (func(config *configs.Cgroup, 
paths map[string]string) cgroups.Manager, error) { + return nil, fmt.Errorf("Systemd not supported") +} + func (m *Manager) Apply(pid int) error { return fmt.Errorf("Systemd not supported") } @@ -38,6 +42,10 @@ func (m *Manager) GetPaths() map[string]string { return nil } +func (m *Manager) GetUnifiedPath() (string, error) { + return "", fmt.Errorf("Systemd not supported") +} + func (m *Manager) GetStats() (*cgroups.Stats, error) { return nil, fmt.Errorf("Systemd not supported") } @@ -53,3 +61,7 @@ func (m *Manager) Freeze(state configs.FreezerState) error { func Freeze(c *configs.Cgroup, state configs.FreezerState) error { return fmt.Errorf("Systemd not supported") } + +func (m *Manager) GetCgroups() (*configs.Cgroup, error) { + return nil, fmt.Errorf("Systemd not supported") +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go index d9d8302d6..c4b19b3e6 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go @@ -1,10 +1,12 @@ -// +build linux,!static_build +// +build linux package systemd import ( "errors" "fmt" + "io/ioutil" + "math" "os" "path/filepath" "strings" @@ -12,14 +14,14 @@ import ( "time" systemdDbus "github.com/coreos/go-systemd/dbus" - systemdUtil "github.com/coreos/go-systemd/util" "github.com/godbus/dbus" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" ) -type Manager struct { +type LegacyManager struct { mu sync.Mutex Cgroups *configs.Cgroup Paths map[string]string @@ -47,7 +49,7 @@ func (s subsystemSet) Get(name string) (subsystem, error) { return nil, errSubsystemDoesNotExist } -var subsystems = subsystemSet{ +var legacySubsystems = 
subsystemSet{ &fs.CpusetGroup{}, &fs.DevicesGroup{}, &fs.MemoryGroup{}, @@ -69,12 +71,8 @@ const ( ) var ( - connLock sync.Mutex - theConn *systemdDbus.Conn - hasStartTransientUnit bool - hasStartTransientSliceUnit bool - hasTransientDefaultDependencies bool - hasDelegate bool + connLock sync.Mutex + theConn *systemdDbus.Conn ) func newProp(name string, units interface{}) systemdDbus.Property { @@ -84,8 +82,23 @@ func newProp(name string, units interface{}) systemdDbus.Property { } } +// NOTE: This function comes from package github.com/coreos/go-systemd/util +// It was borrowed here to avoid a dependency on cgo. +// +// IsRunningSystemd checks whether the host was booted with systemd as its init +// system. This functions similarly to systemd's `sd_booted(3)`: internally, it +// checks whether /run/systemd/system/ exists and is a directory. +// http://www.freedesktop.org/software/systemd/man/sd_booted.html +func isRunningSystemd() bool { + fi, err := os.Lstat("/run/systemd/system") + if err != nil { + return false + } + return fi.IsDir() +} + func UseSystemd() bool { - if !systemdUtil.IsRunningSystemd() { + if !isRunningSystemd() { return false } @@ -98,102 +111,31 @@ func UseSystemd() bool { if err != nil { return false } - - // Assume we have StartTransientUnit - hasStartTransientUnit = true - - // But if we get UnknownMethod error we don't - if _, err := theConn.StartTransientUnit("test.scope", "invalid", nil, nil); err != nil { - if dbusError, ok := err.(dbus.Error); ok { - if dbusError.Name == "org.freedesktop.DBus.Error.UnknownMethod" { - hasStartTransientUnit = false - return hasStartTransientUnit - } - } - } - - // Ensure the scope name we use doesn't exist. Use the Pid to - // avoid collisions between multiple libcontainer users on a - // single host. 
- scope := fmt.Sprintf("libcontainer-%d-systemd-test-default-dependencies.scope", os.Getpid()) - testScopeExists := true - for i := 0; i <= testScopeWait; i++ { - if _, err := theConn.StopUnit(scope, "replace", nil); err != nil { - if dbusError, ok := err.(dbus.Error); ok { - if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") { - testScopeExists = false - break - } - } - } - time.Sleep(time.Millisecond) - } - - // Bail out if we can't kill this scope without testing for DefaultDependencies - if testScopeExists { - return hasStartTransientUnit - } - - // Assume StartTransientUnit on a scope allows DefaultDependencies - hasTransientDefaultDependencies = true - ddf := newProp("DefaultDependencies", false) - if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{ddf}, nil); err != nil { - if dbusError, ok := err.(dbus.Error); ok { - if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") { - hasTransientDefaultDependencies = false - } - } - } - - // Not critical because of the stop unit logic above. 
- theConn.StopUnit(scope, "replace", nil) - - // Assume StartTransientUnit on a scope allows Delegate - hasDelegate = true - dl := newProp("Delegate", true) - if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dl}, nil); err != nil { - if dbusError, ok := err.(dbus.Error); ok { - if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") { - hasDelegate = false - } - } - } - - // Assume we have the ability to start a transient unit as a slice - // This was broken until systemd v229, but has been back-ported on RHEL environments >= 219 - // For details, see: https://bugzilla.redhat.com/show_bug.cgi?id=1370299 - hasStartTransientSliceUnit = true - - // To ensure simple clean-up, we create a slice off the root with no hierarchy - slice := fmt.Sprintf("libcontainer_%d_systemd_test_default.slice", os.Getpid()) - if _, err := theConn.StartTransientUnit(slice, "replace", nil, nil); err != nil { - if _, ok := err.(dbus.Error); ok { - hasStartTransientSliceUnit = false - } - } - - for i := 0; i <= testSliceWait; i++ { - if _, err := theConn.StopUnit(slice, "replace", nil); err != nil { - if dbusError, ok := err.(dbus.Error); ok { - if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") { - hasStartTransientSliceUnit = false - break - } - } - } else { - break - } - time.Sleep(time.Millisecond) - } - - // Not critical because of the stop unit logic above. 
- theConn.StopUnit(scope, "replace", nil) - theConn.StopUnit(slice, "replace", nil) } - return hasStartTransientUnit + return true } -func (m *Manager) Apply(pid int) error { +func NewSystemdCgroupsManager() (func(config *configs.Cgroup, paths map[string]string) cgroups.Manager, error) { + if !isRunningSystemd() { + return nil, fmt.Errorf("systemd not running on this host, can't use systemd as a cgroups.Manager") + } + if cgroups.IsCgroup2UnifiedMode() { + return func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &UnifiedManager{ + Cgroups: config, + Paths: paths, + } + }, nil + } + return func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &LegacyManager{ + Cgroups: config, + Paths: paths, + } + }, nil +} + +func (m *LegacyManager) Apply(pid int) error { var ( c = m.Cgroups unitName = getUnitName(c) @@ -226,10 +168,6 @@ func (m *Manager) Apply(pid int) error { // if we create a slice, the parent is defined via a Wants= if strings.HasSuffix(unitName, ".slice") { - // This was broken until systemd v229, but has been back-ported on RHEL environments >= 219 - if !hasStartTransientSliceUnit { - return fmt.Errorf("systemd version does not support ability to start a slice as transient unit") - } properties = append(properties, systemdDbus.PropWants(slice)) } else { // otherwise, we use Slice= @@ -241,8 +179,9 @@ func (m *Manager) Apply(pid int) error { properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) } - if hasDelegate { - // This is only supported on systemd versions 218 and above. + // Check if we can delegate. This is only supported on systemd versions 218 and above. + if !strings.HasSuffix(unitName, ".slice") { + // Assume scopes always support delegation. 
properties = append(properties, newProp("Delegate", true)) } @@ -253,10 +192,9 @@ func (m *Manager) Apply(pid int) error { newProp("CPUAccounting", true), newProp("BlockIOAccounting", true)) - if hasTransientDefaultDependencies { - properties = append(properties, - newProp("DefaultDependencies", false)) - } + // Assume DefaultDependencies= will always work (the check for it was previously broken.) + properties = append(properties, + newProp("DefaultDependencies", false)) if c.Resources.Memory != 0 { properties = append(properties, @@ -270,7 +208,20 @@ func (m *Manager) Apply(pid int) error { // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd. if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 { - cpuQuotaPerSecUSec := uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod + // corresponds to USEC_INFINITY in systemd + // if USEC_INFINITY is provided, CPUQuota is left unbound by systemd + // always setting a property value ensures we can apply a quota and remove it later + cpuQuotaPerSecUSec := uint64(math.MaxUint64) + if c.Resources.CpuQuota > 0 { + // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota + // (integer percentage of CPU) internally. This means that if a fractional percent of + // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest + // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. 
+ cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod + if cpuQuotaPerSecUSec%10000 != 0 { + cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 + } + } properties = append(properties, newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) } @@ -280,6 +231,12 @@ func (m *Manager) Apply(pid int) error { newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) } + if c.Resources.PidsLimit > 0 { + properties = append(properties, + newProp("TasksAccounting", true), + newProp("TasksMax", uint64(c.Resources.PidsLimit))) + } + // We have to set kernel memory here, as we can't change it once // processes have been attached to the cgroup. if c.Resources.KernelMemory != 0 { @@ -288,7 +245,14 @@ func (m *Manager) Apply(pid int) error { } } - if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil && !isUnitExists(err) { + statusChan := make(chan string, 1) + if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil { + select { + case <-statusChan: + case <-time.After(time.Second): + logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. 
Continuing...", unitName) + } + } else if !isUnitExists(err) { return err } @@ -297,7 +261,7 @@ func (m *Manager) Apply(pid int) error { } paths := make(map[string]string) - for _, s := range subsystems { + for _, s := range legacySubsystems { subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name()) if err != nil { // Don't fail if a cgroup hierarchy was not found, just skip this subsystem @@ -312,7 +276,7 @@ func (m *Manager) Apply(pid int) error { return nil } -func (m *Manager) Destroy() error { +func (m *LegacyManager) Destroy() error { if m.Cgroups.Paths != nil { return nil } @@ -326,18 +290,23 @@ func (m *Manager) Destroy() error { return nil } -func (m *Manager) GetPaths() map[string]string { +func (m *LegacyManager) GetPaths() map[string]string { m.mu.Lock() paths := m.Paths m.mu.Unlock() return paths } +func (m *LegacyManager) GetUnifiedPath() (string, error) { + return "", errors.New("unified path is only supported when running in unified mode") +} + func join(c *configs.Cgroup, subsystem string, pid int) (string, error) { path, err := getSubsystemPath(c, subsystem) if err != nil { return "", err } + if err := os.MkdirAll(path, 0755); err != nil { return "", err } @@ -348,7 +317,7 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) { } func joinCgroups(c *configs.Cgroup, pid int) error { - for _, sys := range subsystems { + for _, sys := range legacySubsystems { name := sys.Name() switch name { case "name=systemd": @@ -385,7 +354,7 @@ func joinCgroups(c *configs.Cgroup, pid int) error { // systemd represents slice hierarchy using `-`, so we need to follow suit when // generating the path of slice. Essentially, test-a-b.slice becomes -// test.slice/test-a.slice/test-a-b.slice. +// /test.slice/test-a.slice/test-a-b.slice. func ExpandSlice(slice string) (string, error) { suffix := ".slice" // Name has to end with ".slice", but can't be just ".slice". 
@@ -411,15 +380,14 @@ func ExpandSlice(slice string) (string, error) { } // Append the component to the path and to the prefix. - path += prefix + component + suffix + "/" + path += "/" + prefix + component + suffix prefix += component + "-" } - return path, nil } func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { - mountpoint, err := cgroups.FindCgroupMountpoint(subsystem) + mountpoint, err := cgroups.FindCgroupMountpoint(c.Path, subsystem) if err != nil { return "", err } @@ -444,14 +412,14 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil } -func (m *Manager) Freeze(state configs.FreezerState) error { +func (m *LegacyManager) Freeze(state configs.FreezerState) error { path, err := getSubsystemPath(m.Cgroups, "freezer") if err != nil { return err } prevState := m.Cgroups.Resources.Freezer m.Cgroups.Resources.Freezer = state - freezer, err := subsystems.Get("freezer") + freezer, err := legacySubsystems.Get("freezer") if err != nil { return err } @@ -463,7 +431,7 @@ func (m *Manager) Freeze(state configs.FreezerState) error { return nil } -func (m *Manager) GetPids() ([]int, error) { +func (m *LegacyManager) GetPids() ([]int, error) { path, err := getSubsystemPath(m.Cgroups, "devices") if err != nil { return nil, err @@ -471,7 +439,7 @@ func (m *Manager) GetPids() ([]int, error) { return cgroups.GetPids(path) } -func (m *Manager) GetAllPids() ([]int, error) { +func (m *LegacyManager) GetAllPids() ([]int, error) { path, err := getSubsystemPath(m.Cgroups, "devices") if err != nil { return nil, err @@ -479,12 +447,12 @@ func (m *Manager) GetAllPids() ([]int, error) { return cgroups.GetAllPids(path) } -func (m *Manager) GetStats() (*cgroups.Stats, error) { +func (m *LegacyManager) GetStats() (*cgroups.Stats, error) { m.mu.Lock() defer m.mu.Unlock() stats := cgroups.NewStats() for name, path := range m.Paths { - sys, err := subsystems.Get(name) 
+ sys, err := legacySubsystems.Get(name) if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { continue } @@ -496,13 +464,13 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { return stats, nil } -func (m *Manager) Set(container *configs.Config) error { +func (m *LegacyManager) Set(container *configs.Config) error { // If Paths are set, then we are just joining cgroups paths // and there is no need to set any values. if m.Cgroups.Paths != nil { return nil } - for _, sys := range subsystems { + for _, sys := range legacySubsystems { // Get the subsystem path, but don't error out for not found cgroups. path, err := getSubsystemPath(container.Cgroups, sys.Name()) if err != nil && !cgroups.IsNotFound(err) { @@ -539,6 +507,15 @@ func setKernelMemory(c *configs.Cgroup) error { if err := os.MkdirAll(path, 0755); err != nil { return err } + // do not try to enable the kernel memory if we already have + // tasks in the cgroup. + content, err := ioutil.ReadFile(filepath.Join(path, "tasks")) + if err != nil { + return err + } + if len(content) > 0 { + return nil + } return fs.EnableKernelMemoryAccounting(path) } @@ -551,3 +528,7 @@ func isUnitExists(err error) bool { } return false } + +func (m *LegacyManager) GetCgroups() (*configs.Cgroup, error) { + return m.Cgroups, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/unified_hierarchy.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/unified_hierarchy.go new file mode 100644 index 000000000..6d1c70827 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/unified_hierarchy.go @@ -0,0 +1,352 @@ +// +build linux + +package systemd + +import ( + "fmt" + "io/ioutil" + "math" + "os" + "path/filepath" + "strings" + "sync" + "time" + + systemdDbus "github.com/coreos/go-systemd/dbus" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + 
"github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" +) + +type UnifiedManager struct { + mu sync.Mutex + Cgroups *configs.Cgroup + Paths map[string]string +} + +var unifiedSubsystems = subsystemSet{ + &fs.CpusetGroupV2{}, + &fs.FreezerGroupV2{}, + &fs.CpuGroupV2{}, + &fs.MemoryGroupV2{}, + &fs.IOGroupV2{}, + &fs.PidsGroupV2{}, + &fs.DevicesGroupV2{}, +} + +func (m *UnifiedManager) Apply(pid int) error { + var ( + c = m.Cgroups + unitName = getUnitName(c) + slice = "system.slice" + properties []systemdDbus.Property + ) + + if c.Paths != nil { + paths := make(map[string]string) + for name, path := range c.Paths { + _, err := getSubsystemPath(m.Cgroups, name) + if err != nil { + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + return err + } + paths[name] = path + } + m.Paths = paths + return cgroups.EnterPid(m.Paths, pid) + } + + if c.Parent != "" { + slice = c.Parent + } + + properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) + + // if we create a slice, the parent is defined via a Wants= + if strings.HasSuffix(unitName, ".slice") { + properties = append(properties, systemdDbus.PropWants(slice)) + } else { + // otherwise, we use Slice= + properties = append(properties, systemdDbus.PropSlice(slice)) + } + + // only add pid if its valid, -1 is used w/ general slice creation. + if pid != -1 { + properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) + } + + // Check if we can delegate. This is only supported on systemd versions 218 and above. + if !strings.HasSuffix(unitName, ".slice") { + // Assume scopes always support delegation. + properties = append(properties, newProp("Delegate", true)) + } + + // Always enable accounting, this gets us the same behaviour as the fs implementation, + // plus the kernel has some problems with joining the memory cgroup at a later time. 
+ properties = append(properties, + newProp("MemoryAccounting", true), + newProp("CPUAccounting", true), + newProp("BlockIOAccounting", true)) + + // Assume DefaultDependencies= will always work (the check for it was previously broken.) + properties = append(properties, + newProp("DefaultDependencies", false)) + + if c.Resources.Memory != 0 { + properties = append(properties, + newProp("MemoryLimit", uint64(c.Resources.Memory))) + } + + if c.Resources.CpuShares != 0 { + properties = append(properties, + newProp("CPUShares", c.Resources.CpuShares)) + } + + // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd. + if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 { + // corresponds to USEC_INFINITY in systemd + // if USEC_INFINITY is provided, CPUQuota is left unbound by systemd + // always setting a property value ensures we can apply a quota and remove it later + cpuQuotaPerSecUSec := uint64(math.MaxUint64) + if c.Resources.CpuQuota > 0 { + // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota + // (integer percentage of CPU) internally. This means that if a fractional percent of + // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest + // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. 
+ cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod + if cpuQuotaPerSecUSec%10000 != 0 { + cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 + } + } + properties = append(properties, + newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) + } + + if c.Resources.BlkioWeight != 0 { + properties = append(properties, + newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) + } + + if c.Resources.PidsLimit > 0 { + properties = append(properties, + newProp("TasksAccounting", true), + newProp("TasksMax", uint64(c.Resources.PidsLimit))) + } + + // We have to set kernel memory here, as we can't change it once + // processes have been attached to the cgroup. + if c.Resources.KernelMemory != 0 { + if err := setKernelMemory(c); err != nil { + return err + } + } + + statusChan := make(chan string, 1) + if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil { + select { + case <-statusChan: + case <-time.After(time.Second): + logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. 
Continuing...", unitName) + } + } else if !isUnitExists(err) { + return err + } + + if err := joinCgroupsV2(c, pid); err != nil { + return err + } + + paths := make(map[string]string) + for _, s := range unifiedSubsystems { + subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name()) + if err != nil { + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + return err + } + paths[s.Name()] = subsystemPath + } + m.Paths = paths + return nil +} + +func (m *UnifiedManager) Destroy() error { + if m.Cgroups.Paths != nil { + return nil + } + m.mu.Lock() + defer m.mu.Unlock() + theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil) + if err := cgroups.RemovePaths(m.Paths); err != nil { + return err + } + m.Paths = make(map[string]string) + return nil +} + +func (m *UnifiedManager) GetPaths() map[string]string { + m.mu.Lock() + paths := m.Paths + m.mu.Unlock() + return paths +} +func (m *UnifiedManager) GetUnifiedPath() (string, error) { + unifiedPath := "" + m.mu.Lock() + defer m.mu.Unlock() + for k, v := range m.Paths { + if unifiedPath == "" { + unifiedPath = v + } else if v != unifiedPath { + return unifiedPath, + errors.Errorf("expected %q path to be unified path %q, got %q", k, unifiedPath, v) + } + } + if unifiedPath == "" { + // FIXME: unified path could be detected even when no controller is available + return unifiedPath, errors.New("cannot detect unified path") + } + return unifiedPath, nil +} +func createCgroupsv2Path(path string) (Err error) { + content, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers") + if err != nil { + return err + } + if !filepath.HasPrefix(path, "/sys/fs/cgroup") { + return fmt.Errorf("invalid cgroup path %s", path) + } + + res := "" + for i, c := range strings.Split(strings.TrimSpace(string(content)), " ") { + if i == 0 { + res = fmt.Sprintf("+%s", c) + } else { + res = res + fmt.Sprintf(" +%s", c) + } + } + resByte := []byte(res) + + current := "/sys/fs" 
+ elements := strings.Split(path, "/") + for i, e := range elements[3:] { + current = filepath.Join(current, e) + if i > 0 { + if err := os.Mkdir(current, 0755); err != nil { + if !os.IsExist(err) { + return err + } + } else { + // If the directory was created, be sure it is not left around on errors. + defer func() { + if Err != nil { + os.Remove(current) + } + }() + } + } + if i < len(elements[3:])-1 { + if err := ioutil.WriteFile(filepath.Join(current, "cgroup.subtree_control"), resByte, 0755); err != nil { + return err + } + } + } + return nil +} + +func joinCgroupsV2(c *configs.Cgroup, pid int) error { + path, err := getSubsystemPath(c, "memory") + if err != nil { + return err + } + return createCgroupsv2Path(path) +} + +func (m *UnifiedManager) Freeze(state configs.FreezerState) error { + path, err := getSubsystemPath(m.Cgroups, "freezer") + if err != nil { + return err + } + prevState := m.Cgroups.Resources.Freezer + m.Cgroups.Resources.Freezer = state + freezer, err := unifiedSubsystems.Get("freezer") + if err != nil { + return err + } + err = freezer.Set(path, m.Cgroups) + if err != nil { + m.Cgroups.Resources.Freezer = prevState + return err + } + return nil +} + +func (m *UnifiedManager) GetPids() ([]int, error) { + path, err := m.GetUnifiedPath() + if err != nil { + return nil, err + } + return cgroups.GetPids(path) +} + +func (m *UnifiedManager) GetAllPids() ([]int, error) { + path, err := m.GetUnifiedPath() + if err != nil { + return nil, err + } + return cgroups.GetAllPids(path) +} + +func (m *UnifiedManager) GetStats() (*cgroups.Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for name, path := range m.Paths { + sys, err := unifiedSubsystems.Get(name) + if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + + return stats, nil +} + +func (m *UnifiedManager) Set(container *configs.Config) error { + // If Paths 
are set, then we are just joining cgroups paths + // and there is no need to set any values. + if m.Cgroups.Paths != nil { + return nil + } + for _, sys := range unifiedSubsystems { + // Get the subsystem path, but don't error out for not found cgroups. + path, err := getSubsystemPath(container.Cgroups, sys.Name()) + if err != nil && !cgroups.IsNotFound(err) { + return err + } + + if err := sys.Set(path, container.Cgroups); err != nil { + return err + } + } + + if m.Paths["cpu"] != "" { + if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil { + return err + } + } + return nil +} + +func (m *UnifiedManager) GetCgroups() (*configs.Cgroup, error) { + return m.Cgroups, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go index 7c995efee..dbcc58f5b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -11,42 +11,87 @@ import ( "path/filepath" "strconv" "strings" + "sync" + "syscall" "time" - "github.com/docker/go-units" + units "github.com/docker/go-units" + "golang.org/x/sys/unix" ) const ( - cgroupNamePrefix = "name=" - CgroupProcesses = "cgroup.procs" + CgroupNamePrefix = "name=" + CgroupProcesses = "cgroup.procs" + unifiedMountpoint = "/sys/fs/cgroup" ) +var ( + isUnifiedOnce sync.Once + isUnified bool +) + +// HugePageSizeUnitList is a list of the units used by the linux kernel when +// naming the HugePage control files. +// https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt +// TODO Since the kernel only use KB, MB and GB; TB and PB should be removed, +// depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393 +var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"} + +// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode. 
+func IsCgroup2UnifiedMode() bool { + isUnifiedOnce.Do(func() { + var st syscall.Statfs_t + if err := syscall.Statfs(unifiedMountpoint, &st); err != nil { + panic("cannot statfs cgroup root") + } + isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC + }) + return isUnified +} + // https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt -func FindCgroupMountpoint(subsystem string) (string, error) { - mnt, _, err := FindCgroupMountpointAndRoot(subsystem) +func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) { + if IsCgroup2UnifiedMode() { + return unifiedMountpoint, nil + } + mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem) return mnt, err } -func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) { +func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) { // We are not using mount.GetMounts() because it's super-inefficient, // parsing it directly sped up x10 times because of not using Sscanf. // It was one of two major performance drawbacks in container start. 
if !isSubsystemAvailable(subsystem) { return "", "", NewNotFoundError(subsystem) } + f, err := os.Open("/proc/self/mountinfo") if err != nil { return "", "", err } defer f.Close() - scanner := bufio.NewScanner(f) + if IsCgroup2UnifiedMode() { + subsystem = "" + } + + return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem) +} + +func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) { + scanner := bufio.NewScanner(reader) for scanner.Scan() { txt := scanner.Text() - fields := strings.Split(txt, " ") - for _, opt := range strings.Split(fields[len(fields)-1], ",") { - if opt == subsystem { - return fields[4], fields[3], nil + fields := strings.Fields(txt) + if len(fields) < 9 { + continue + } + if strings.HasPrefix(fields[4], cgroupPath) { + for _, opt := range strings.Split(fields[len(fields)-1], ",") { + if (subsystem == "" && fields[9] == "cgroup2") || opt == subsystem { + return fields[4], fields[3], nil + } } } } @@ -58,6 +103,19 @@ func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) { } func isSubsystemAvailable(subsystem string) bool { + if IsCgroup2UnifiedMode() { + controllers, err := GetAllSubsystems() + if err != nil { + return false + } + for _, c := range controllers { + if c == subsystem { + return true + } + } + return false + } + cgroups, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return false @@ -102,8 +160,8 @@ func FindCgroupMountpointDir() (string, error) { return "", fmt.Errorf("Found no fields post '-' in %q", text) } - if postSeparatorFields[0] == "cgroup" { - // Check that the mount is properly formated. + if postSeparatorFields[0] == "cgroup" || postSeparatorFields[0] == "cgroup2" { + // Check that the mount is properly formatted. 
if numPostFields < 3 { return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) } @@ -151,19 +209,20 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, Root: fields[3], } for _, opt := range strings.Split(fields[len(fields)-1], ",") { - if !ss[opt] { + seen, known := ss[opt] + if !known || (!all && seen) { continue } - if strings.HasPrefix(opt, cgroupNamePrefix) { - m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):]) - } else { - m.Subsystems = append(m.Subsystems, opt) - } - if !all { - numFound++ + ss[opt] = true + if strings.HasPrefix(opt, CgroupNamePrefix) { + opt = opt[len(CgroupNamePrefix):] } + m.Subsystems = append(m.Subsystems, opt) + numFound++ + } + if len(m.Subsystems) > 0 || all { + res = append(res, m) } - res = append(res, m) } if err := scanner.Err(); err != nil { return nil, err @@ -174,6 +233,19 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, // GetCgroupMounts returns the mounts for the cgroup subsystems. // all indicates whether to return just the first instance or all the mounts. 
func GetCgroupMounts(all bool) ([]Mount, error) { + if IsCgroup2UnifiedMode() { + availableControllers, err := GetAllSubsystems() + if err != nil { + return nil, err + } + m := Mount{ + Mountpoint: unifiedMountpoint, + Root: unifiedMountpoint, + Subsystems: availableControllers, + } + return []Mount{m}, nil + } + f, err := os.Open("/proc/self/mountinfo") if err != nil { return nil, err @@ -187,13 +259,28 @@ func GetCgroupMounts(all bool) ([]Mount, error) { allMap := make(map[string]bool) for s := range allSubsystems { - allMap[s] = true + allMap[s] = false } return getCgroupMountsHelper(allMap, f, all) } // GetAllSubsystems returns all the cgroup subsystems supported by the kernel func GetAllSubsystems() ([]string, error) { + // /proc/cgroups is meaningless for v2 + // https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features + if IsCgroup2UnifiedMode() { + // "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers. + // - devices: implemented in kernel 4.15 + // - freezer: implemented in kernel 5.2 + // We assume these are always available, as it is hard to detect availability. + pseudo := []string{"devices", "freezer"} + data, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers") + if err != nil { + return nil, err + } + subsystems := append(pseudo, strings.Fields(string(data))...) + return subsystems, nil + } f, err := os.Open("/proc/cgroups") if err != nil { return nil, err @@ -256,13 +343,13 @@ func GetInitCgroupPath(subsystem string) (string, error) { } func getCgroupPathHelper(subsystem, cgroup string) (string, error) { - mnt, root, err := FindCgroupMountpointAndRoot(subsystem) + mnt, root, err := FindCgroupMountpointAndRoot("", subsystem) if err != nil { return "", err } // This is needed for nested containers, because in /proc/self/cgroup we - // see pathes from host, which don't exist in container. + // see paths from host, which don't exist in container. 
relCgroup, err := filepath.Rel(root, cgroup) if err != nil { return "", err @@ -337,12 +424,15 @@ func parseCgroupFromReader(r io.Reader) (map[string]string, error) { } func getControllerPath(subsystem string, cgroups map[string]string) (string, error) { + if IsCgroup2UnifiedMode() { + return "/", nil + } if p, ok := cgroups[subsystem]; ok { return p, nil } - if p, ok := cgroups[cgroupNamePrefix+subsystem]; ok { + if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok { return p, nil } @@ -397,19 +487,26 @@ func RemovePaths(paths map[string]string) (err error) { } func GetHugePageSize() ([]string, error) { - var pageSizes []string - sizeList := []string{"B", "kB", "MB", "GB", "TB", "PB"} files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages") if err != nil { - return pageSizes, err + return []string{}, err } + var fileNames []string for _, st := range files { - nameArray := strings.Split(st.Name(), "-") + fileNames = append(fileNames, st.Name()) + } + return getHugePageSizeFromFilenames(fileNames) +} + +func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) { + var pageSizes []string + for _, fileName := range fileNames { + nameArray := strings.Split(fileName, "-") pageSize, err := units.RAMInBytes(nameArray[1]) if err != nil { return []string{}, err } - sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, sizeList) + sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, HugePageSizeUnitList) pageSizes = append(pageSizes, sizeString) } @@ -453,10 +550,39 @@ func WriteCgroupProc(dir string, pid int) error { } // Dont attach any pid to the cgroup if -1 is specified as a pid - if pid != -1 { - if err := ioutil.WriteFile(filepath.Join(dir, CgroupProcesses), []byte(strconv.Itoa(pid)), 0700); err != nil { - return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) - } + if pid == -1 { + return nil + } + + cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), 
os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700) + if err != nil { + return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) + } + defer cgroupProcessesFile.Close() + + for i := 0; i < 5; i++ { + _, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid)) + if err == nil { + return nil + } + + // EINVAL might mean that the task being added to cgroup.procs is in state + // TASK_NEW. We should attempt to do so again. + if isEINVAL(err) { + time.Sleep(30 * time.Millisecond) + continue + } + + return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) + } + return err +} + +func isEINVAL(err error) bool { + switch err := err.(type) { + case *os.PathError: + return err.Err == unix.EINVAL + default: + return false } - return nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go index e0f3ca165..fa195bf90 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go @@ -59,3 +59,8 @@ func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice { func (td *ThrottleDevice) String() string { return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate) } + +// StringName formats the struct to be writable to the cgroup specific file +func (td *ThrottleDevice) StringName(name string) string { + return fmt.Sprintf("%d:%d %s=%d", td.Major, td.Minor, name, td.Rate) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go index e15a662f5..58ed19c9e 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go @@ -119,4 +119,12 @@ type Resources struct { // Set class identifier for container's network 
packets NetClsClassid uint32 `json:"net_cls_classid_u"` + + // Used on cgroups v2: + + // CpuWeight sets a proportional bandwidth limit. + CpuWeight uint64 `json:"cpu_weight"` + + // CpuMax sets she maximum bandwidth limit (format: max period). + CpuMax string `json:"cpu_max"` } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go index 95e2830a4..c0c23d700 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go @@ -1,6 +1,8 @@ -// +build !windows,!linux,!freebsd +// +build !linux package configs +// TODO Windows: This can ultimately be entirely factored out on Windows as +// cgroups are a Unix-specific construct. type Cgroup struct { } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go deleted file mode 100644 index d74847b0d..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_windows.go +++ /dev/null @@ -1,6 +0,0 @@ -package configs - -// TODO Windows: This can ultimately be entirely factored out on Windows as -// cgroups are a Unix-specific construct. 
-type Cgroup struct { -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index 3cae4fd8d..24989e9f5 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -44,6 +44,7 @@ const ( Trap Allow Trace + Log ) // Operator is a comparison operator to be used when matching syscall arguments in Seccomp @@ -141,9 +142,10 @@ type Config struct { // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores // for a process. Valid values are between the range [-1000, '1000'], where processes with - // higher scores are preferred for being killed. + // higher scores are preferred for being killed. If it is unset then we don't touch the current + // value. // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ - OomScoreAdj int `json:"oom_score_adj"` + OomScoreAdj *int `json:"oom_score_adj,omitempty"` // UidMappings is an array of User ID mappings for User Namespaces UidMappings []IDMap `json:"uid_mappings"` @@ -185,12 +187,19 @@ type Config struct { // callers keyring in this case. NoNewKeyring bool `json:"no_new_keyring"` - // Rootless specifies whether the container is a rootless container. - Rootless bool `json:"rootless"` - - // IntelRdt specifies settings for Intel RDT/CAT group that the container is placed into - // to limit the resources (e.g., L3 cache) the container has available + // IntelRdt specifies settings for Intel RDT group that the container is placed into + // to limit the resources (e.g., L3 cache, memory bandwidth) the container has available IntelRdt *IntelRdt `json:"intel_rdt,omitempty"` + + // RootlessEUID is set when the runc was launched with non-zero EUID. + // Note that RootlessEUID is set to false when launched with EUID=0 in userns. 
+ // When RootlessEUID is set, runc creates a new userns for the container. + // (config.json needs to contain userns settings) + RootlessEUID bool `json:"rootless_euid,omitempty"` + + // RootlessCgroups is set when unlikely to have the full access to cgroups. + // When RootlessCgroups is set, cgroups errors are ignored. + RootlessCgroups bool `json:"rootless_cgroups,omitempty"` } type Hooks struct { @@ -264,26 +273,23 @@ func (hooks Hooks) MarshalJSON() ([]byte, error) { }) } -// HookState is the payload provided to a hook on execution. -type HookState specs.State - type Hook interface { // Run executes the hook with the provided state. - Run(HookState) error + Run(*specs.State) error } // NewFunctionHook will call the provided function when the hook is run. -func NewFunctionHook(f func(HookState) error) FuncHook { +func NewFunctionHook(f func(*specs.State) error) FuncHook { return FuncHook{ run: f, } } type FuncHook struct { - run func(HookState) error + run func(*specs.State) error } -func (f FuncHook) Run(s HookState) error { +func (f FuncHook) Run(s *specs.State) error { return f.run(s) } @@ -306,7 +312,7 @@ type CommandHook struct { Command } -func (c Command) Run(s HookState) error { +func (c Command) Run(s *specs.State) error { b, err := json.Marshal(s) if err != nil { return err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go index 4d348d217..e4f423c52 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go @@ -1,4 +1,4 @@ -// +build linux freebsd +// +build linux package configs diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go index 36bd5f96a..57e9f037d 100644 --- 
a/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go @@ -4,4 +4,10 @@ type IntelRdt struct { // The schema for L3 cache id and capacity bitmask (CBM) // Format: "L3:=;=;..." L3CacheSchema string `json:"l3_cache_schema,omitempty"` + + // The schema of memory bandwidth per L3 cache id + // Format: "MB:=bandwidth0;=bandwidth1;..." + // The unit of memory bandwidth is specified in "percentages" by + // default, and in "MBps" if MBA Software Controller is enabled. + MemBwSchema string `json:"memBwSchema,omitempty"` } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go index 5fc171a57..1bbaef9bd 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go @@ -7,12 +7,13 @@ import ( ) const ( - NEWNET NamespaceType = "NEWNET" - NEWPID NamespaceType = "NEWPID" - NEWNS NamespaceType = "NEWNS" - NEWUTS NamespaceType = "NEWUTS" - NEWIPC NamespaceType = "NEWIPC" - NEWUSER NamespaceType = "NEWUSER" + NEWNET NamespaceType = "NEWNET" + NEWPID NamespaceType = "NEWPID" + NEWNS NamespaceType = "NEWNS" + NEWUTS NamespaceType = "NEWUTS" + NEWIPC NamespaceType = "NEWIPC" + NEWUSER NamespaceType = "NEWUSER" + NEWCGROUP NamespaceType = "NEWCGROUP" ) var ( @@ -35,6 +36,8 @@ func NsName(ns NamespaceType) string { return "user" case NEWUTS: return "uts" + case NEWCGROUP: + return "cgroup" } return "" } @@ -68,6 +71,7 @@ func NamespaceTypes() []NamespaceType { NEWNET, NEWPID, NEWNS, + NEWCGROUP, } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go index 4ce6813d2..2dc7adfc9 100644 --- 
a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go @@ -9,12 +9,13 @@ func (n *Namespace) Syscall() int { } var namespaceInfo = map[NamespaceType]int{ - NEWNET: unix.CLONE_NEWNET, - NEWNS: unix.CLONE_NEWNS, - NEWUSER: unix.CLONE_NEWUSER, - NEWIPC: unix.CLONE_NEWIPC, - NEWUTS: unix.CLONE_NEWUTS, - NEWPID: unix.CLONE_NEWPID, + NEWNET: unix.CLONE_NEWNET, + NEWNS: unix.CLONE_NEWNS, + NEWUSER: unix.CLONE_NEWUSER, + NEWIPC: unix.CLONE_NEWIPC, + NEWUTS: unix.CLONE_NEWUTS, + NEWPID: unix.CLONE_NEWPID, + NEWCGROUP: unix.CLONE_NEWCGROUP, } // CloneFlags parses the container's Namespaces options to set the correct diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go index ded5a6bbc..c32122798 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go @@ -22,6 +22,7 @@ var actions = map[string]configs.Action{ "SCMP_ACT_TRAP": configs.Trap, "SCMP_ACT_ALLOW": configs.Allow, "SCMP_ACT_TRACE": configs.Trace, + "SCMP_ACT_LOG": configs.Log, } var archs = map[string]string{ diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go index 2523cbf99..1b7a07118 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go @@ -19,9 +19,15 @@ var ( actTrap = libseccomp.ActTrap actKill = libseccomp.ActKill actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM)) + actLog = libseccomp.ActLog actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM)) ) +const ( + // Linux system calls can have at most 6 arguments + syscallMaxArguments int = 6 +) + // 
Filters given syscalls in a container, preventing them from being used // Started in the container init process, and carried over to all child processes // Setns calls, however, require a separate invocation, as they are not children @@ -45,11 +51,11 @@ func InitSeccomp(config *configs.Seccomp) error { for _, arch := range config.Architectures { scmpArch, err := libseccomp.GetArchFromString(arch) if err != nil { - return err + return fmt.Errorf("error validating Seccomp architecture: %s", err) } if err := filter.AddArch(scmpArch); err != nil { - return err + return fmt.Errorf("error adding architecture to seccomp filter: %s", err) } } @@ -107,6 +113,8 @@ func getAction(act configs.Action) (libseccomp.ScmpAction, error) { return actAllow, nil case configs.Trace: return actTrace, nil + case configs.Log: + return actLog, nil default: return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule") } @@ -170,29 +178,55 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error { // Convert the call's action to the libseccomp equivalent callAct, err := getAction(call.Action) if err != nil { - return err + return fmt.Errorf("action in seccomp profile is invalid: %s", err) } // Unconditional match - just add the rule if len(call.Args) == 0 { if err = filter.AddRule(callNum, callAct); err != nil { - return err + return fmt.Errorf("error adding seccomp filter rule for syscall %s: %s", call.Name, err) } } else { - // Conditional match - convert the per-arg rules into library format + // If two or more arguments have the same condition, + // Revert to old behavior, adding each condition as a separate rule + argCounts := make([]uint, syscallMaxArguments) conditions := []libseccomp.ScmpCondition{} for _, cond := range call.Args { newCond, err := getCondition(cond) if err != nil { - return err + return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %s", call.Name, err) } + argCounts[cond.Index] += 1 + conditions = 
append(conditions, newCond) } - if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil { - return err + hasMultipleArgs := false + for _, count := range argCounts { + if count > 1 { + hasMultipleArgs = true + break + } + } + + if hasMultipleArgs { + // Revert to old behavior + // Add each condition attached to a separate rule + for _, cond := range conditions { + condArr := []libseccomp.ScmpCondition{cond} + + if err = filter.AddRuleConditional(callNum, callAct, condArr); err != nil { + return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err) + } + } + } else { + // No conditions share same argument + // Use new, proper behavior + if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil { + return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err) + } } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go b/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go index 0b1cd3b62..23e225c3f 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go @@ -110,15 +110,18 @@ func Example() *specs.Spec { }, Linux: &specs.Linux{ MaskedPaths: []string{ + "/proc/acpi", + "/proc/asound", "/proc/kcore", + "/proc/keys", "/proc/latency_stats", "/proc/timer_list", "/proc/timer_stats", "/proc/sched_debug", "/sys/firmware", + "/proc/scsi", }, ReadonlyPaths: []string{ - "/proc/asound", "/proc/bus", "/proc/fs", "/proc/irq", @@ -154,9 +157,9 @@ func Example() *specs.Spec { } } -// ExampleRootless returns an example spec file that works with rootless -// containers. It's essentially a modified version of the specfile from -// Example(). +// ToRootless converts the given spec file into one that should work with +// rootless containers (euid != 0), by removing incompatible options and adding others that +// are needed. 
func ToRootless(spec *specs.Spec) { var namespaces []specs.LinuxNamespace diff --git a/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go index 9a17d1e7a..d9e73c46b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go @@ -28,19 +28,23 @@ var namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{ specs.UserNamespace: configs.NEWUSER, specs.IPCNamespace: configs.NEWIPC, specs.UTSNamespace: configs.NEWUTS, + specs.CgroupNamespace: configs.NEWCGROUP, } var mountPropagationMapping = map[string]int{ - "rprivate": unix.MS_PRIVATE | unix.MS_REC, - "private": unix.MS_PRIVATE, - "rslave": unix.MS_SLAVE | unix.MS_REC, - "slave": unix.MS_SLAVE, - "rshared": unix.MS_SHARED | unix.MS_REC, - "shared": unix.MS_SHARED, - "": 0, + "rprivate": unix.MS_PRIVATE | unix.MS_REC, + "private": unix.MS_PRIVATE, + "rslave": unix.MS_SLAVE | unix.MS_REC, + "slave": unix.MS_SLAVE, + "rshared": unix.MS_SHARED | unix.MS_REC, + "shared": unix.MS_SHARED, + "runbindable": unix.MS_UNBINDABLE | unix.MS_REC, + "unbindable": unix.MS_UNBINDABLE, + "": 0, } -var allowedDevices = []*configs.Device{ +// AllowedDevices is exposed for devicefilter_test.go +var AllowedDevices = []*configs.Device{ // allow mknod for any device { Type: 'c', @@ -146,7 +150,8 @@ type CreateOpts struct { NoPivotRoot bool NoNewKeyring bool Spec *specs.Spec - Rootless bool + RootlessEUID bool + RootlessCgroups bool } // CreateLibcontainerConfig creates a new libcontainer configuration from a @@ -174,13 +179,14 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { labels = append(labels, fmt.Sprintf("%s=%s", k, v)) } config := &configs.Config{ - Rootfs: rootfsPath, - NoPivotRoot: opts.NoPivotRoot, - Readonlyfs: spec.Root.Readonly, - Hostname: spec.Hostname, - Labels: append(labels, 
fmt.Sprintf("bundle=%s", cwd)), - NoNewKeyring: opts.NoNewKeyring, - Rootless: opts.Rootless, + Rootfs: rootfsPath, + NoPivotRoot: opts.NoPivotRoot, + Readonlyfs: spec.Root.Readonly, + Hostname: spec.Hostname, + Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)), + NoNewKeyring: opts.NoNewKeyring, + RootlessEUID: opts.RootlessEUID, + RootlessCgroups: opts.RootlessCgroups, } exists := false @@ -190,10 +196,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { if err := createDevices(spec, config); err != nil { return nil, err } - if err := setupUserNamespace(spec, config); err != nil { - return nil, err - } - c, err := createCgroupConfig(opts) + c, err := CreateCgroupConfig(opts) if err != nil { return nil, err } @@ -203,6 +206,9 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists { return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation) } + if config.NoPivotRoot && (config.RootPropagation&unix.MS_PRIVATE != 0) { + return nil, fmt.Errorf("rootfsPropagation of [r]private is not safe without pivot_root") + } for _, ns := range spec.Linux.Namespaces { t, exists := namespaceMapping[ns.Type] @@ -214,61 +220,74 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { } config.Namespaces.Add(t, ns.Path) } - if config.Namespaces.Contains(configs.NEWNET) { + if config.Namespaces.Contains(configs.NEWNET) && config.Namespaces.PathOf(configs.NEWNET) == "" { config.Networks = []*configs.Network{ { Type: "loopback", }, } } + if config.Namespaces.Contains(configs.NEWUSER) { + if err := setupUserNamespace(spec, config); err != nil { + return nil, err + } + } config.MaskPaths = spec.Linux.MaskedPaths config.ReadonlyPaths = spec.Linux.ReadonlyPaths config.MountLabel = spec.Linux.MountLabel config.Sysctl = spec.Linux.Sysctl if spec.Linux.Seccomp != nil { - seccomp, err := 
setupSeccomp(spec.Linux.Seccomp) + seccomp, err := SetupSeccomp(spec.Linux.Seccomp) if err != nil { return nil, err } config.Seccomp = seccomp } + if spec.Linux.IntelRdt != nil { + config.IntelRdt = &configs.IntelRdt{} + if spec.Linux.IntelRdt.L3CacheSchema != "" { + config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema + } + if spec.Linux.IntelRdt.MemBwSchema != "" { + config.IntelRdt.MemBwSchema = spec.Linux.IntelRdt.MemBwSchema + } + } } - if spec.Process.SelinuxLabel != "" { - config.ProcessLabel = spec.Process.SelinuxLabel - } - if spec.Process != nil && spec.Process.OOMScoreAdj != nil { - config.OomScoreAdj = *spec.Process.OOMScoreAdj - } - if spec.Process.Capabilities != nil { - config.Capabilities = &configs.Capabilities{ - Bounding: spec.Process.Capabilities.Bounding, - Effective: spec.Process.Capabilities.Effective, - Permitted: spec.Process.Capabilities.Permitted, - Inheritable: spec.Process.Capabilities.Inheritable, - Ambient: spec.Process.Capabilities.Ambient, + if spec.Process != nil { + config.OomScoreAdj = spec.Process.OOMScoreAdj + if spec.Process.SelinuxLabel != "" { + config.ProcessLabel = spec.Process.SelinuxLabel + } + if spec.Process.Capabilities != nil { + config.Capabilities = &configs.Capabilities{ + Bounding: spec.Process.Capabilities.Bounding, + Effective: spec.Process.Capabilities.Effective, + Permitted: spec.Process.Capabilities.Permitted, + Inheritable: spec.Process.Capabilities.Inheritable, + Ambient: spec.Process.Capabilities.Ambient, + } } } createHooks(spec, config) config.Version = specs.Version - if spec.Linux.IntelRdt != nil { - config.IntelRdt = &configs.IntelRdt{} - if spec.Linux.IntelRdt.L3CacheSchema != "" { - config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema - } - } return config, nil } func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount { flags, pgflags, data, ext := parseMountOptions(m.Options) source := m.Source - if m.Type == "bind" { + device := m.Type + if 
flags&unix.MS_BIND != 0 { + // Any "type" the user specified is meaningless (and ignored) for + // bind-mounts -- so we set it to "bind" because rootfs_linux.go + // (incorrectly) relies on this for some checks. + device = "bind" if !filepath.IsAbs(source) { source = filepath.Join(cwd, m.Source) } } return &configs.Mount{ - Device: m.Type, + Device: device, Source: source, Destination: m.Destination, Data: data, @@ -278,7 +297,7 @@ func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount { } } -func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { +func CreateCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { var ( myCgroupPath string @@ -308,7 +327,7 @@ func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { // for e.g. "system.slice:docker:1234" parts := strings.Split(myCgroupPath, ":") if len(parts) != 3 { - return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups") + return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups, got %q instead", myCgroupPath) } c.Parent = parts[0] c.ScopePrefix = parts[1] @@ -321,12 +340,9 @@ func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { c.Path = myCgroupPath } - // In rootless containers, any attempt to make cgroup changes will fail. - // libcontainer will validate this and we shouldn't add any cgroup options - // the user didn't specify. - if !opts.Rootless { - c.Resources.AllowedDevices = allowedDevices - } + // In rootless containers, any attempt to make cgroup changes is likely to fail. + // libcontainer will validate this but ignores the error. 
+ c.Resources.AllowedDevices = AllowedDevices if spec.Linux != nil { r := spec.Linux.Resources if r == nil { @@ -479,10 +495,8 @@ func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { } } } - if !opts.Rootless { - // append the default allowed devices to the end of the list - c.Resources.Devices = append(c.Resources.Devices, allowedDevices...) - } + // append the default allowed devices to the end of the list + c.Resources.Devices = append(c.Resources.Devices, AllowedDevices...) return c, nil } @@ -615,9 +629,6 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { } } if spec.Linux != nil { - if len(spec.Linux.UIDMappings) == 0 { - return nil - } for _, m := range spec.Linux.UIDMappings { config.UidMappings = append(config.UidMappings, create(m)) } @@ -728,7 +739,7 @@ func parseMountOptions(options []string) (int, []int, string, int) { return flag, pgflag, strings.Join(data, ","), extFlags } -func setupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { +func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { if config == nil { return nil, nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go index 4837085a7..a4ae8901a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go @@ -3,13 +3,12 @@ package system import ( - "bufio" - "fmt" "os" "os/exec" "syscall" // only for exec "unsafe" + "github.com/opencontainers/runc/libcontainer/user" "golang.org/x/sys/unix" ) @@ -102,35 +101,55 @@ func Setctty() error { } // RunningInUserNS detects whether we are currently running in a user namespace. 
-// Copied from github.com/lxc/lxd/shared/util.go +// Originally copied from github.com/lxc/lxd/shared/util.go func RunningInUserNS() bool { - file, err := os.Open("/proc/self/uid_map") + uidmap, err := user.CurrentProcessUIDMap() if err != nil { // This kernel-provided file only exists if user namespaces are supported return false } - defer file.Close() + return UIDMapInUserNS(uidmap) +} - buf := bufio.NewReader(file) - l, _, err := buf.ReadLine() - if err != nil { - return false - } - - line := string(l) - var a, b, c int64 - fmt.Sscanf(line, "%d %d %d", &a, &b, &c) +func UIDMapInUserNS(uidmap []user.IDMap) bool { /* * We assume we are in the initial user namespace if we have a full * range - 4294967295 uids starting at uid 0. */ - if a == 0 && b == 0 && c == 4294967295 { + if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 { return false } return true } +// GetParentNSeuid returns the euid within the parent user namespace +func GetParentNSeuid() int64 { + euid := int64(os.Geteuid()) + uidmap, err := user.CurrentProcessUIDMap() + if err != nil { + // This kernel-provided file only exists if user namespaces are supported + return euid + } + for _, um := range uidmap { + if um.ID <= euid && euid <= um.ID+um.Count-1 { + return um.ParentID + euid - um.ID + } + } + return euid +} + // SetSubreaper sets the value i as the subreaper setting for the calling process func SetSubreaper(i int) error { return unix.Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0) } + +// GetSubreaper returns the subreaper setting for the calling process +func GetSubreaper() (int, error) { + var i uintptr + + if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil { + return -1, err + } + + return int(i), nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_386.go b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_32.go similarity index 
93% rename from vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_386.go rename to vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_32.go index 3f7235ed1..c5ca5d862 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_386.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_32.go @@ -1,4 +1,5 @@ -// +build linux,386 +// +build linux +// +build 386 arm package system diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go index d7891a2ff..e05e30adc 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_64.go @@ -1,4 +1,5 @@ -// +build linux,arm64 linux,amd64 linux,ppc linux,ppc64 linux,ppc64le linux,s390x +// +build linux +// +build arm64 amd64 mips mipsle mips64 mips64le ppc ppc64 ppc64le riscv64 s390x package system diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_arm.go b/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_arm.go deleted file mode 100644 index 31ff3deb1..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/syscall_linux_arm.go +++ /dev/null @@ -1,25 +0,0 @@ -// +build linux,arm - -package system - -import ( - "golang.org/x/sys/unix" -) - -// Setuid sets the uid of the calling thread to the specified uid. -func Setuid(uid int) (err error) { - _, _, e1 := unix.RawSyscall(unix.SYS_SETUID32, uintptr(uid), 0, 0) - if e1 != 0 { - err = e1 - } - return -} - -// Setgid sets the gid of the calling thread to the specified gid. 
-func Setgid(gid int) (err error) { - _, _, e1 := unix.RawSyscall(unix.SYS_SETGID32, uintptr(gid), 0, 0) - if e1 != 0 { - err = e1 - } - return -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/sysconfig.go b/vendor/github.com/opencontainers/runc/libcontainer/system/sysconfig.go index b3a07cba3..b8434f105 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/sysconfig.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/sysconfig.go @@ -1,4 +1,4 @@ -// +build cgo,linux cgo,freebsd +// +build cgo,linux package system diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go index e7cfd62b2..b94be74a6 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go @@ -2,8 +2,26 @@ package system +import ( + "os" + + "github.com/opencontainers/runc/libcontainer/user" +) + // RunningInUserNS is a stub for non-Linux systems // Always returns false func RunningInUserNS() bool { return false } + +// UIDMapInUserNS is a stub for non-Linux systems +// Always returns false +func UIDMapInUserNS(uidmap []user.IDMap) bool { + return false +} + +// GetParentNSeuid returns the euid within the parent user namespace +// Always returns os.Geteuid on non-linux +func GetParentNSeuid() int { + return os.Geteuid() +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go new file mode 100644 index 000000000..6fd8dd0d4 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go @@ -0,0 +1,41 @@ +package user + +import ( + "errors" +) + +var ( + // The current operating system does not provide the required data for user lookups. 
+ ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data") + // No matching entries found in file. + ErrNoPasswdEntries = errors.New("no matching entries in passwd file") + ErrNoGroupEntries = errors.New("no matching entries in group file") +) + +// LookupUser looks up a user by their username in /etc/passwd. If the user +// cannot be found (or there is no /etc/passwd file on the filesystem), then +// LookupUser returns an error. +func LookupUser(username string) (User, error) { + return lookupUser(username) +} + +// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot +// be found (or there is no /etc/passwd file on the filesystem), then LookupId +// returns an error. +func LookupUid(uid int) (User, error) { + return lookupUid(uid) +} + +// LookupGroup looks up a group by its name in /etc/group. If the group cannot +// be found (or there is no /etc/group file on the filesystem), then LookupGroup +// returns an error. +func LookupGroup(groupname string) (Group, error) { + return lookupGroup(groupname) +} + +// LookupGid looks up a group by its group id in /etc/group. If the group cannot +// be found (or there is no /etc/group file on the filesystem), then LookupGid +// returns an error. +func LookupGid(gid int) (Group, error) { + return lookupGid(gid) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go new file mode 100644 index 000000000..92b5ae8de --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go @@ -0,0 +1,144 @@ +// +build darwin dragonfly freebsd linux netbsd openbsd solaris + +package user + +import ( + "io" + "os" + "strconv" + + "golang.org/x/sys/unix" +) + +// Unix-specific path to the passwd and group formatted files. 
+const ( + unixPasswdPath = "/etc/passwd" + unixGroupPath = "/etc/group" +) + +func lookupUser(username string) (User, error) { + return lookupUserFunc(func(u User) bool { + return u.Name == username + }) +} + +func lookupUid(uid int) (User, error) { + return lookupUserFunc(func(u User) bool { + return u.Uid == uid + }) +} + +func lookupUserFunc(filter func(u User) bool) (User, error) { + // Get operating system-specific passwd reader-closer. + passwd, err := GetPasswd() + if err != nil { + return User{}, err + } + defer passwd.Close() + + // Get the users. + users, err := ParsePasswdFilter(passwd, filter) + if err != nil { + return User{}, err + } + + // No user entries found. + if len(users) == 0 { + return User{}, ErrNoPasswdEntries + } + + // Assume the first entry is the "correct" one. + return users[0], nil +} + +func lookupGroup(groupname string) (Group, error) { + return lookupGroupFunc(func(g Group) bool { + return g.Name == groupname + }) +} + +func lookupGid(gid int) (Group, error) { + return lookupGroupFunc(func(g Group) bool { + return g.Gid == gid + }) +} + +func lookupGroupFunc(filter func(g Group) bool) (Group, error) { + // Get operating system-specific group reader-closer. + group, err := GetGroup() + if err != nil { + return Group{}, err + } + defer group.Close() + + // Get the users. + groups, err := ParseGroupFilter(group, filter) + if err != nil { + return Group{}, err + } + + // No user entries found. + if len(groups) == 0 { + return Group{}, ErrNoGroupEntries + } + + // Assume the first entry is the "correct" one. + return groups[0], nil +} + +func GetPasswdPath() (string, error) { + return unixPasswdPath, nil +} + +func GetPasswd() (io.ReadCloser, error) { + return os.Open(unixPasswdPath) +} + +func GetGroupPath() (string, error) { + return unixGroupPath, nil +} + +func GetGroup() (io.ReadCloser, error) { + return os.Open(unixGroupPath) +} + +// CurrentUser looks up the current user by their user id in /etc/passwd. 
If the +// user cannot be found (or there is no /etc/passwd file on the filesystem), +// then CurrentUser returns an error. +func CurrentUser() (User, error) { + return LookupUid(unix.Getuid()) +} + +// CurrentGroup looks up the current user's group by their primary group id's +// entry in /etc/passwd. If the group cannot be found (or there is no +// /etc/group file on the filesystem), then CurrentGroup returns an error. +func CurrentGroup() (Group, error) { + return LookupGid(unix.Getgid()) +} + +func currentUserSubIDs(fileName string) ([]SubID, error) { + u, err := CurrentUser() + if err != nil { + return nil, err + } + filter := func(entry SubID) bool { + return entry.Name == u.Name || entry.Name == strconv.Itoa(u.Uid) + } + return ParseSubIDFileFilter(fileName, filter) +} + +func CurrentUserSubUIDs() ([]SubID, error) { + return currentUserSubIDs("/etc/subuid") +} + +func CurrentUserSubGIDs() ([]SubID, error) { + return currentUserSubIDs("/etc/subgid") +} + +func CurrentProcessUIDMap() ([]IDMap, error) { + return ParseIDMapFile("/proc/self/uid_map") +} + +func CurrentProcessGIDMap() ([]IDMap, error) { + return ParseIDMapFile("/proc/self/gid_map") +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_windows.go new file mode 100644 index 000000000..65cd40e92 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_windows.go @@ -0,0 +1,40 @@ +// +build windows + +package user + +import ( + "fmt" + "os/user" +) + +func lookupUser(username string) (User, error) { + u, err := user.Lookup(username) + if err != nil { + return User{}, err + } + return userFromOS(u) +} + +func lookupUid(uid int) (User, error) { + u, err := user.LookupId(fmt.Sprintf("%d", uid)) + if err != nil { + return User{}, err + } + return userFromOS(u) +} + +func lookupGroup(groupname string) (Group, error) { + g, err := user.LookupGroup(groupname) + if err != nil { + 
return Group{}, err + } + return groupFromOS(g) +} + +func lookupGid(gid int) (Group, error) { + g, err := user.LookupGroupId(fmt.Sprintf("%d", gid)) + if err != nil { + return Group{}, err + } + return groupFromOS(g) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go new file mode 100644 index 000000000..7b912bbf8 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go @@ -0,0 +1,608 @@ +package user + +import ( + "bufio" + "fmt" + "io" + "os" + "os/user" + "strconv" + "strings" +) + +const ( + minId = 0 + maxId = 1<<31 - 1 //for 32-bit systems compatibility +) + +var ( + ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId) +) + +type User struct { + Name string + Pass string + Uid int + Gid int + Gecos string + Home string + Shell string +} + +// userFromOS converts an os/user.(*User) to local User +// +// (This does not include Pass, Shell or Gecos) +func userFromOS(u *user.User) (User, error) { + newUser := User{ + Name: u.Username, + Home: u.HomeDir, + } + id, err := strconv.Atoi(u.Uid) + if err != nil { + return newUser, err + } + newUser.Uid = id + + id, err = strconv.Atoi(u.Gid) + if err != nil { + return newUser, err + } + newUser.Gid = id + return newUser, nil +} + +type Group struct { + Name string + Pass string + Gid int + List []string +} + +// groupFromOS converts an os/user.(*Group) to local Group +// +// (This does not include Pass, Shell or Gecos) +func groupFromOS(g *user.Group) (Group, error) { + newGroup := Group{ + Name: g.Name, + } + + id, err := strconv.Atoi(g.Gid) + if err != nil { + return newGroup, err + } + newGroup.Gid = id + + return newGroup, nil +} + +// SubID represents an entry in /etc/sub{u,g}id +type SubID struct { + Name string + SubID int64 + Count int64 +} + +// IDMap represents an entry in /proc/PID/{u,g}id_map +type IDMap struct { + ID int64 + ParentID int64 + Count int64 +} + +func 
parseLine(line string, v ...interface{}) { + parseParts(strings.Split(line, ":"), v...) +} + +func parseParts(parts []string, v ...interface{}) { + if len(parts) == 0 { + return + } + + for i, p := range parts { + // Ignore cases where we don't have enough fields to populate the arguments. + // Some configuration files like to misbehave. + if len(v) <= i { + break + } + + // Use the type of the argument to figure out how to parse it, scanf() style. + // This is legit. + switch e := v[i].(type) { + case *string: + *e = p + case *int: + // "numbers", with conversion errors ignored because of some misbehaving configuration files. + *e, _ = strconv.Atoi(p) + case *int64: + *e, _ = strconv.ParseInt(p, 10, 64) + case *[]string: + // Comma-separated lists. + if p != "" { + *e = strings.Split(p, ",") + } else { + *e = []string{} + } + default: + // Someone goof'd when writing code using this function. Scream so they can hear us. + panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! 
%#v is not a pointer!", e)) + } + } +} + +func ParsePasswdFile(path string) ([]User, error) { + passwd, err := os.Open(path) + if err != nil { + return nil, err + } + defer passwd.Close() + return ParsePasswd(passwd) +} + +func ParsePasswd(passwd io.Reader) ([]User, error) { + return ParsePasswdFilter(passwd, nil) +} + +func ParsePasswdFileFilter(path string, filter func(User) bool) ([]User, error) { + passwd, err := os.Open(path) + if err != nil { + return nil, err + } + defer passwd.Close() + return ParsePasswdFilter(passwd, filter) +} + +func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) { + if r == nil { + return nil, fmt.Errorf("nil source for passwd-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = []User{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + line := strings.TrimSpace(s.Text()) + if line == "" { + continue + } + + // see: man 5 passwd + // name:password:UID:GID:GECOS:directory:shell + // Name:Pass:Uid:Gid:Gecos:Home:Shell + // root:x:0:0:root:/root:/bin/bash + // adm:x:3:4:adm:/var/adm:/bin/false + p := User{} + parseLine(line, &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} + +func ParseGroupFile(path string) ([]Group, error) { + group, err := os.Open(path) + if err != nil { + return nil, err + } + + defer group.Close() + return ParseGroup(group) +} + +func ParseGroup(group io.Reader) ([]Group, error) { + return ParseGroupFilter(group, nil) +} + +func ParseGroupFileFilter(path string, filter func(Group) bool) ([]Group, error) { + group, err := os.Open(path) + if err != nil { + return nil, err + } + defer group.Close() + return ParseGroupFilter(group, filter) +} + +func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) { + if r == nil { + return nil, fmt.Errorf("nil source for group-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = 
[]Group{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + text := s.Text() + if text == "" { + continue + } + + // see: man 5 group + // group_name:password:GID:user_list + // Name:Pass:Gid:List + // root:x:0:root + // adm:x:4:root,adm,daemon + p := Group{} + parseLine(text, &p.Name, &p.Pass, &p.Gid, &p.List) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} + +type ExecUser struct { + Uid int + Gid int + Sgids []int + Home string +} + +// GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the +// given file paths and uses that data as the arguments to GetExecUser. If the +// files cannot be opened for any reason, the error is ignored and a nil +// io.Reader is passed instead. +func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath string) (*ExecUser, error) { + var passwd, group io.Reader + + if passwdFile, err := os.Open(passwdPath); err == nil { + passwd = passwdFile + defer passwdFile.Close() + } + + if groupFile, err := os.Open(groupPath); err == nil { + group = groupFile + defer groupFile.Close() + } + + return GetExecUser(userSpec, defaults, passwd, group) +} + +// GetExecUser parses a user specification string (using the passwd and group +// readers as sources for /etc/passwd and /etc/group data, respectively). In +// the case of blank fields or missing data from the sources, the values in +// defaults is used. +// +// GetExecUser will return an error if a user or group literal could not be +// found in any entry in passwd and group respectively. +// +// Examples of valid user specifications are: +// * "" +// * "user" +// * "uid" +// * "user:group" +// * "uid:gid +// * "user:gid" +// * "uid:group" +// +// It should be noted that if you specify a numeric user or group id, they will +// not be evaluated as usernames (only the metadata will be filled). 
So attempting +// to parse a user with user.Name = "1337" will produce the user with a UID of +// 1337. +func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (*ExecUser, error) { + if defaults == nil { + defaults = new(ExecUser) + } + + // Copy over defaults. + user := &ExecUser{ + Uid: defaults.Uid, + Gid: defaults.Gid, + Sgids: defaults.Sgids, + Home: defaults.Home, + } + + // Sgids slice *cannot* be nil. + if user.Sgids == nil { + user.Sgids = []int{} + } + + // Allow for userArg to have either "user" syntax, or optionally "user:group" syntax + var userArg, groupArg string + parseLine(userSpec, &userArg, &groupArg) + + // Convert userArg and groupArg to be numeric, so we don't have to execute + // Atoi *twice* for each iteration over lines. + uidArg, uidErr := strconv.Atoi(userArg) + gidArg, gidErr := strconv.Atoi(groupArg) + + // Find the matching user. + users, err := ParsePasswdFilter(passwd, func(u User) bool { + if userArg == "" { + // Default to current state of the user. + return u.Uid == user.Uid + } + + if uidErr == nil { + // If the userArg is numeric, always treat it as a UID. + return uidArg == u.Uid + } + + return u.Name == userArg + }) + + // If we can't find the user, we have to bail. + if err != nil && passwd != nil { + if userArg == "" { + userArg = strconv.Itoa(user.Uid) + } + return nil, fmt.Errorf("unable to find user %s: %v", userArg, err) + } + + var matchedUserName string + if len(users) > 0 { + // First match wins, even if there's more than one matching entry. + matchedUserName = users[0].Name + user.Uid = users[0].Uid + user.Gid = users[0].Gid + user.Home = users[0].Home + } else if userArg != "" { + // If we can't find a user with the given username, the only other valid + // option is if it's a numeric username with no associated entry in passwd. + + if uidErr != nil { + // Not numeric. 
+ return nil, fmt.Errorf("unable to find user %s: %v", userArg, ErrNoPasswdEntries) + } + user.Uid = uidArg + + // Must be inside valid uid range. + if user.Uid < minId || user.Uid > maxId { + return nil, ErrRange + } + + // Okay, so it's numeric. We can just roll with this. + } + + // On to the groups. If we matched a username, we need to do this because of + // the supplementary group IDs. + if groupArg != "" || matchedUserName != "" { + groups, err := ParseGroupFilter(group, func(g Group) bool { + // If the group argument isn't explicit, we'll just search for it. + if groupArg == "" { + // Check if user is a member of this group. + for _, u := range g.List { + if u == matchedUserName { + return true + } + } + return false + } + + if gidErr == nil { + // If the groupArg is numeric, always treat it as a GID. + return gidArg == g.Gid + } + + return g.Name == groupArg + }) + if err != nil && group != nil { + return nil, fmt.Errorf("unable to find groups for spec %v: %v", matchedUserName, err) + } + + // Only start modifying user.Gid if it is in explicit form. + if groupArg != "" { + if len(groups) > 0 { + // First match wins, even if there's more than one matching entry. + user.Gid = groups[0].Gid + } else { + // If we can't find a group with the given name, the only other valid + // option is if it's a numeric group name with no associated entry in group. + + if gidErr != nil { + // Not numeric. + return nil, fmt.Errorf("unable to find group %s: %v", groupArg, ErrNoGroupEntries) + } + user.Gid = gidArg + + // Must be inside valid gid range. + if user.Gid < minId || user.Gid > maxId { + return nil, ErrRange + } + + // Okay, so it's numeric. We can just roll with this. + } + } else if len(groups) > 0 { + // Supplementary group ids only make sense if in the implicit form. 
+ user.Sgids = make([]int, len(groups)) + for i, group := range groups { + user.Sgids[i] = group.Gid + } + } + } + + return user, nil +} + +// GetAdditionalGroups looks up a list of groups by name or group id +// against the given /etc/group formatted data. If a group name cannot +// be found, an error will be returned. If a group id cannot be found, +// or the given group data is nil, the id will be returned as-is +// provided it is in the legal range. +func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, error) { + var groups = []Group{} + if group != nil { + var err error + groups, err = ParseGroupFilter(group, func(g Group) bool { + for _, ag := range additionalGroups { + if g.Name == ag || strconv.Itoa(g.Gid) == ag { + return true + } + } + return false + }) + if err != nil { + return nil, fmt.Errorf("Unable to find additional groups %v: %v", additionalGroups, err) + } + } + + gidMap := make(map[int]struct{}) + for _, ag := range additionalGroups { + var found bool + for _, g := range groups { + // if we found a matched group either by name or gid, take the + // first matched as correct + if g.Name == ag || strconv.Itoa(g.Gid) == ag { + if _, ok := gidMap[g.Gid]; !ok { + gidMap[g.Gid] = struct{}{} + found = true + break + } + } + } + // we asked for a group but didn't find it. let's check to see + // if we wanted a numeric group + if !found { + gid, err := strconv.Atoi(ag) + if err != nil { + return nil, fmt.Errorf("Unable to find group %s", ag) + } + // Ensure gid is inside gid range. + if gid < minId || gid > maxId { + return nil, ErrRange + } + gidMap[gid] = struct{}{} + } + } + gids := []int{} + for gid := range gidMap { + gids = append(gids, gid) + } + return gids, nil +} + +// GetAdditionalGroupsPath is a wrapper around GetAdditionalGroups +// that opens the groupPath given and gives it as an argument to +// GetAdditionalGroups. 
+func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int, error) { + var group io.Reader + + if groupFile, err := os.Open(groupPath); err == nil { + group = groupFile + defer groupFile.Close() + } + return GetAdditionalGroups(additionalGroups, group) +} + +func ParseSubIDFile(path string) ([]SubID, error) { + subid, err := os.Open(path) + if err != nil { + return nil, err + } + defer subid.Close() + return ParseSubID(subid) +} + +func ParseSubID(subid io.Reader) ([]SubID, error) { + return ParseSubIDFilter(subid, nil) +} + +func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) { + subid, err := os.Open(path) + if err != nil { + return nil, err + } + defer subid.Close() + return ParseSubIDFilter(subid, filter) +} + +func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) { + if r == nil { + return nil, fmt.Errorf("nil source for subid-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = []SubID{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + line := strings.TrimSpace(s.Text()) + if line == "" { + continue + } + + // see: man 5 subuid + p := SubID{} + parseLine(line, &p.Name, &p.SubID, &p.Count) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} + +func ParseIDMapFile(path string) ([]IDMap, error) { + r, err := os.Open(path) + if err != nil { + return nil, err + } + defer r.Close() + return ParseIDMap(r) +} + +func ParseIDMap(r io.Reader) ([]IDMap, error) { + return ParseIDMapFilter(r, nil) +} + +func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) { + r, err := os.Open(path) + if err != nil { + return nil, err + } + defer r.Close() + return ParseIDMapFilter(r, filter) +} + +func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) { + if r == nil { + return nil, fmt.Errorf("nil source for idmap-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = 
[]IDMap{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + line := strings.TrimSpace(s.Text()) + if line == "" { + continue + } + + // see: man 7 user_namespaces + p := IDMap{} + parseParts(strings.Fields(line), &p.ID, &p.ParentID, &p.Count) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go index baa54c9ba..40ccfaa1a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -1,8 +1,6 @@ package utils import ( - "crypto/rand" - "encoding/hex" "encoding/json" "io" "os" @@ -17,19 +15,6 @@ const ( exitSignalOffset = 128 ) -// GenerateRandomName returns a new name joined with a prefix. This size -// specified is used to truncate the randomly generated value -func GenerateRandomName(prefix string, size int) (string, error) { - id := make([]byte, 32) - if _, err := io.ReadFull(rand.Reader, id); err != nil { - return "", err - } - if size > 64 { - size = 64 - } - return prefix + hex.EncodeToString(id)[:size], nil -} - // ResolveRootfs ensures that the current working directory is // not a symlink and returns the absolute path to the rootfs func ResolveRootfs(uncleanRootfs string) (string, error) { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go index c96088988..1576f2d4a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go @@ -3,33 +3,57 @@ package utils import ( - "io/ioutil" + "fmt" "os" "strconv" "golang.org/x/sys/unix" ) +// EnsureProcHandle returns whether or not the given file handle is on procfs. 
+func EnsureProcHandle(fh *os.File) error { + var buf unix.Statfs_t + if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil { + return fmt.Errorf("ensure %s is on procfs: %v", fh.Name(), err) + } + if buf.Type != unix.PROC_SUPER_MAGIC { + return fmt.Errorf("%s is not on procfs", fh.Name()) + } + return nil +} + +// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for +// the process (except for those below the given fd value). func CloseExecFrom(minFd int) error { - fdList, err := ioutil.ReadDir("/proc/self/fd") + fdDir, err := os.Open("/proc/self/fd") if err != nil { return err } - for _, fi := range fdList { - fd, err := strconv.Atoi(fi.Name()) + defer fdDir.Close() + + if err := EnsureProcHandle(fdDir); err != nil { + return err + } + + fdList, err := fdDir.Readdirnames(-1) + if err != nil { + return err + } + for _, fdStr := range fdList { + fd, err := strconv.Atoi(fdStr) + // Ignore non-numeric file names. if err != nil { - // ignore non-numeric file names continue } - + // Ignore descriptors lower than our specified minimum. if fd < minFd { - // ignore descriptors lower than our specified minimum continue } - - // intentionally ignore errors from unix.CloseOnExec + // Intentionally ignore errors from unix.CloseOnExec -- the cases where + // this might fail are basically file descriptors that have already + // been closed (including and especially the one that was created when + // ioutil.ReadDir did the "opendir" syscall). unix.CloseOnExec(fd) - // the cases where this might fail are basically file descriptors that have already been closed (including and especially the one that was created when ioutil.ReadDir did the "opendir" syscall) } return nil }