runtime: Move the resourcecontrol package one layer up

And try to reduce the number of virtcontainers packages, step by step.

Signed-off-by: Samuel Ortiz <s.ortiz@apple.com>
This commit is contained in:
Samuel Ortiz
2022-02-08 12:54:44 +00:00
committed by Samuel Ortiz
parent 823faee83a
commit 9fd4e5514f
10 changed files with 5 additions and 5 deletions

View File

@@ -0,0 +1,332 @@
// +build linux
//
// Copyright (c) 2021-2022 Apple Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
package resourcecontrol
import (
"fmt"
"os"
"path/filepath"
"sync"
"github.com/containerd/cgroups"
v1 "github.com/containerd/cgroups/stats/v1"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
)
// CgroupKataPrefix is the Kata-specific string that RenameCgroupPath
// prepends to the last element of an OCI cgroup path to form a different
// cgroup path. Because of the rename, cAdvisor cannot find the Kata
// Containers cgroup path on the host, which prevents it from grabbing
// the stats data.
const CgroupKataPrefix = "kata"
// RenameCgroupPath returns path with its last element prefixed by
// CgroupKataPrefix and an underscore, e.g. "/a/b" becomes "/a/kata_b".
// An empty path is treated as DefaultResourceControllerID.
// The returned error is always nil.
func RenameCgroupPath(path string) (string, error) {
	target := path
	if target == "" {
		target = DefaultResourceControllerID
	}

	renamedBase := fmt.Sprintf("%s_%s", CgroupKataPrefix, filepath.Base(target))

	return filepath.Join(filepath.Dir(target), renamedBase), nil
}
// LinuxCgroup is the ResourceController implementation returned by the
// constructors in this file. It wraps a containerd cgroups.Cgroup handle.
type LinuxCgroup struct {
// cgroup is the containerd cgroups handle that operations are forwarded to.
cgroup cgroups.Cgroup
// path is the cgroup path; ID returns it and Parent returns its directory.
path string
// cpusets holds the CPU resources that UpdateCpuSet modifies and applies.
cpusets *specs.LinuxCPU
// devices is the device rule list maintained by AddDevice and RemoveDevice.
devices []specs.LinuxDeviceCgroup
// The embedded mutex is taken by AddDevice, RemoveDevice and UpdateCpuSet.
sync.Mutex
}
// sandboxDevices builds the list of device cgroup rules that are added to
// every sandbox: one "rwm" rule per host device that can be resolved,
// followed by a fixed set of wildcard, pts and tuntap rules.
// Host devices that cannot be resolved are logged and skipped.
func sandboxDevices() []specs.LinuxDeviceCgroup {
	// Processes running in a device cgroup are constrained: they only have
	// access to the devices listed in the devices.list file. On top of the
	// default devices, hypervisors need access to some host character
	// devices in order to run virtual machines and create virtqueues.
	hostDevices := []string{
		// Default devices.
		"/dev/null",
		"/dev/random",
		"/dev/full",
		"/dev/tty",
		"/dev/zero",
		"/dev/urandom",
		"/dev/console",
		// Hypervisor devices.
		"/dev/kvm",         // To run virtual machines
		"/dev/vhost-net",   // To create virtqueues
		"/dev/vfio/vfio",   // To access VFIO devices
		"/dev/vhost-vsock", // To interact with vsock
	}

	devices := []specs.LinuxDeviceCgroup{}
	for _, hostPath := range hostDevices {
		rule, err := DeviceToLinuxDevice(hostPath)
		if err != nil {
			controllerLogger.WithField("source", "cgroups").Warnf("Could not add %s to the devices cgroup", hostPath)
			continue
		}
		devices = append(devices, rule)
	}

	var (
		wildcardMajor = int64(-1)
		wildcardMinor = int64(-1)
		ptsMajor      = int64(136)
		tunMajor      = int64(10)
		tunMinor      = int64(200)
	)

	return append(devices,
		// Allow mknod for any character device.
		specs.LinuxDeviceCgroup{
			Allow:  true,
			Type:   "c",
			Major:  &wildcardMajor,
			Minor:  &wildcardMinor,
			Access: "m",
		},
		// Allow mknod for any block device.
		specs.LinuxDeviceCgroup{
			Allow:  true,
			Type:   "b",
			Major:  &wildcardMajor,
			Minor:  &wildcardMinor,
			Access: "m",
		},
		// /dev/pts/ - pts namespaces are "coming soon".
		specs.LinuxDeviceCgroup{
			Allow:  true,
			Type:   "c",
			Major:  &ptsMajor,
			Minor:  &wildcardMinor,
			Access: "rwm",
		},
		// tuntap.
		specs.LinuxDeviceCgroup{
			Allow:  true,
			Type:   "c",
			Major:  &tunMajor,
			Minor:  &tunMinor,
			Access: "rwm",
		},
	)
}
// NewResourceController creates a cgroups v1 controller at the given path,
// after normalizing the path with ValidCgroupPath, and applies resources to
// it. The returned controller remembers the devices and CPU settings found
// in resources.
func NewResourceController(path string, resources *specs.LinuxResources) (ResourceController, error) {
	validPath, err := ValidCgroupPath(path, IsSystemdCgroup(path))
	if err != nil {
		return nil, err
	}

	cg, err := cgroups.New(cgroups.V1, cgroups.StaticPath(validPath), resources)
	if err != nil {
		return nil, err
	}

	controller := &LinuxCgroup{
		cgroup:  cg,
		path:    validPath,
		cpusets: resources.CPU,
		devices: resources.Devices,
	}

	return controller, nil
}
// NewSandboxResourceController creates the controller for a sandbox.
// The sandbox resources are a copy of resources extended with the rules
// returned by sandboxDevices. When path is a systemd cgroup path and
// sandboxCgroupOnly is set, the cgroup is created through systemd with the
// current process as a member; otherwise it falls back to
// NewResourceController.
func NewSandboxResourceController(path string, resources *specs.LinuxResources, sandboxCgroupOnly bool) (ResourceController, error) {
	sandboxResources := *resources
	sandboxResources.Devices = append(sandboxResources.Devices, sandboxDevices()...)

	// Currently we know how to handle a systemd cgroup path only when it is
	// the only cgroup (no overhead group). Hence, if sandboxCgroupOnly is
	// not true we treat it as a cgroupfs path as it used to be, although it
	// may be incorrect.
	useSystemd := IsSystemdCgroup(path) && sandboxCgroupOnly
	if !useSystemd {
		return NewResourceController(path, &sandboxResources)
	}

	slice, unit, err := getSliceAndUnit(path)
	if err != nil {
		return nil, err
	}

	// github.com/containerd/cgroups doesn't support creating a scope unit
	// with v1 cgroups against systemd, so the following interacts directly
	// with systemd to create the cgroup and then loads it using
	// containerd's API. Adding the runtime process here makes calling
	// setupCgroups redundant.
	if err := createCgroupsSystemd(slice, unit, uint32(os.Getpid())); err != nil {
		return nil, err
	}

	hierarchy, cgPath, err := cgroupHierarchy(path)
	if err != nil {
		return nil, err
	}

	// Load the created cgroup and update it with the sandbox resources.
	loaded, err := cgroups.Load(hierarchy, cgPath)
	if err != nil {
		return nil, err
	}
	if err := loaded.Update(&sandboxResources); err != nil {
		return nil, err
	}

	return &LinuxCgroup{
		cgroup:  loaded,
		path:    path,
		cpusets: sandboxResources.CPU,
		devices: sandboxResources.Devices,
	}, nil
}
// LoadResourceController loads the already existing cgroup identified by
// path and wraps it in a controller. The returned controller has no
// recorded devices or CPU settings.
func LoadResourceController(path string) (ResourceController, error) {
	hierarchy, cgPath, err := cgroupHierarchy(path)
	if err != nil {
		return nil, err
	}

	loaded, err := cgroups.Load(hierarchy, cgPath)
	if err != nil {
		return nil, err
	}

	controller := &LinuxCgroup{
		cgroup: loaded,
		path:   path,
	}

	return controller, nil
}
// Logger returns the package logger with its "source" field set to
// "cgroups".
func (c *LinuxCgroup) Logger() *logrus.Entry {
return controllerLogger.WithField("source", "cgroups")
}
// Delete deletes the underlying cgroup and returns the error reported by
// the cgroups library, if any.
func (c *LinuxCgroup) Delete() error {
return c.cgroup.Delete()
}
// Stat returns the metrics of the underlying cgroup. The call is made with
// cgroups.IgnoreNotExist installed as the error handler.
// NOTE(review): presumably this makes missing stat files non-fatal; confirm
// against the containerd/cgroups documentation.
func (c *LinuxCgroup) Stat() (*v1.Metrics, error) {
return c.cgroup.Stat(cgroups.ErrorHandler(cgroups.IgnoreNotExist))
}
// AddProcess adds the process identified by pid to the underlying cgroup.
// The subsystems argument is accepted but not used by this implementation.
func (c *LinuxCgroup) AddProcess(pid int, subsystems ...string) error {
return c.cgroup.Add(cgroups.Process{Pid: pid})
}
// AddThread adds the thread identified by pid to the underlying cgroup
// through the cgroup's AddTask call.
// The subsystems argument is accepted but not used by this implementation.
func (c *LinuxCgroup) AddThread(pid int, subsystems ...string) error {
return c.cgroup.AddTask(cgroups.Process{Pid: pid})
}
// Update applies the given OCI resources description to the underlying
// cgroup. It does not touch the devices and cpusets recorded on c.
func (c *LinuxCgroup) Update(resources *specs.LinuxResources) error {
return c.cgroup.Update(resources)
}
// MoveTo loads the cgroup identified by path, which must already exist,
// and moves the underlying cgroup of c to it.
func (c *LinuxCgroup) MoveTo(path string) error {
	hierarchy, destPath, err := cgroupHierarchy(path)
	if err != nil {
		return err
	}

	destination, err := cgroups.Load(hierarchy, destPath)
	if err != nil {
		return err
	}

	return c.cgroup.MoveTo(destination)
}
// AddDevice resolves deviceHostPath into a device cgroup rule, appends it
// to the recorded device list and applies the whole list to the cgroup.
// The rule stays in the recorded list even when applying it fails.
func (c *LinuxCgroup) AddDevice(deviceHostPath string) error {
	rule, err := DeviceToLinuxDevice(deviceHostPath)
	if err != nil {
		return err
	}

	c.Lock()
	defer c.Unlock()

	c.devices = append(c.devices, rule)
	update := specs.LinuxResources{Devices: c.devices}

	return c.cgroup.Update(&update)
}
// RemoveDevice resolves deviceHostPath into a device cgroup rule, drops
// every recorded rule with the same type, major and minor numbers, and
// applies the remaining device list to the cgroup.
//
// It returns the error from resolving the device or from updating the
// cgroup.
func (c *LinuxCgroup) RemoveDevice(deviceHostPath string) error {
	deviceResource, err := DeviceToLinuxDevice(deviceHostPath)
	if err != nil {
		return err
	}

	c.Lock()
	defer c.Unlock()

	// Major and Minor are *int64 in the OCI spec type. They must be compared
	// by value: comparing the pointers themselves never matches a freshly
	// resolved rule.
	sameNumber := func(a, b *int64) bool {
		if a == nil || b == nil {
			return a == b
		}
		return *a == *b
	}

	// Build a new list rather than deleting while ranging: shrinking the
	// slice being iterated can skip entries or slice out of range. A fresh
	// backing array also avoids writing into a slice shared with a caller.
	kept := make([]specs.LinuxDeviceCgroup, 0, len(c.devices))
	for _, d := range c.devices {
		if d.Type == deviceResource.Type &&
			sameNumber(d.Major, deviceResource.Major) &&
			sameNumber(d.Minor, deviceResource.Minor) {
			continue
		}
		kept = append(kept, d)
	}
	c.devices = kept

	return c.cgroup.Update(&specs.LinuxResources{
		Devices: c.devices,
	})
}
// UpdateCpuSet records the given CPU and memory node sets and applies the
// recorded CPU resources to the cgroup. An empty cpuset or memset leaves
// the corresponding recorded value untouched.
func (c *LinuxCgroup) UpdateCpuSet(cpuset, memset string) error {
	c.Lock()
	defer c.Unlock()

	hasCpus := len(cpuset) > 0
	hasMems := len(memset) > 0

	// If nothing was recorded so far, start from an empty description, but
	// only when there is something to store in it.
	if c.cpusets == nil && (hasCpus || hasMems) {
		c.cpusets = &specs.LinuxCPU{}
	}

	if hasCpus {
		c.cpusets.Cpus = cpuset
	}
	if hasMems {
		c.cpusets.Mems = memset
	}

	update := specs.LinuxResources{CPU: c.cpusets}

	return c.cgroup.Update(&update)
}
// Type returns LinuxCgroups, the controller type of this implementation.
func (c *LinuxCgroup) Type() ResourceControllerType {
return LinuxCgroups
}
// ID returns the cgroup path recorded when the controller was created.
func (c *LinuxCgroup) ID() string {
return c.path
}
// Parent returns the directory part of the recorded cgroup path, as
// computed by filepath.Dir.
func (c *LinuxCgroup) Parent() string {
return filepath.Dir(c.path)
}

View File

@@ -0,0 +1,82 @@
// Copyright (c) 2021 Apple Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
package resourcecontrol
import (
v1 "github.com/containerd/cgroups/stats/v1"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
)
// controllerLogger is the package-level logger. It can be replaced through
// SetLogger, which carries its fields over to the new logger.
var (
controllerLogger = logrus.WithField("source", "virtcontainers/pkg/resourcecontrol")
)
// SetLogger replaces the package logger with logger, carrying over the
// fields that were set on the previous package logger.
func SetLogger(logger *logrus.Entry) {
	controllerLogger = logger.WithFields(controllerLogger.Data)
}
// ResourceControllerType describes a resource controller implementation
// type.
type ResourceControllerType string

const (
	// LinuxCgroups is the resource controller type for Linux cgroups.
	LinuxCgroups ResourceControllerType = "cgroups"
)

// String converts a resource controller type to a string. Any value other
// than a known controller type yields "Unknown controller type".
func (rType *ResourceControllerType) String() string {
	if *rType == LinuxCgroups {
		return string(LinuxCgroups)
	}

	return "Unknown controller type"
}
// ResourceController represents a system resources controller.
// On Linux this interface is implemented through the cgroups API.
type ResourceController interface {
// Type returns the resource controller implementation type.
Type() ResourceControllerType
// ID returns the controller identifier, e.g. a Linux cgroups path.
ID() string
// Parent returns the parent controller identifier, for hierarchically
// defined resources (e.g. Linux cgroups).
Parent() string
// Delete deletes the controller.
Delete() error
// Stat returns the statistics for the controller.
Stat() (*v1.Metrics, error)
// AddProcess adds a process, identified by its PID, to a set of
// controllers.
AddProcess(int, ...string) error
// AddThread adds a process thread, identified by its ID, to a set of
// controllers.
AddThread(int, ...string) error
// Update updates the set of resources controlled, based on
// an OCI resources description.
Update(*specs.LinuxResources) error
// MoveTo moves a controller to another one, identified by its path.
MoveTo(string) error
// AddDevice adds a device resource, given as a host device path, to the
// controller.
AddDevice(string) error
// RemoveDevice removes a device resource, given as a host device path,
// from the controller.
RemoveDevice(string) error
// UpdateCpuSet updates the set of controlled CPUs and memory nodes.
UpdateCpuSet(string, string) error
}

View File

@@ -0,0 +1,59 @@
// Copyright (c) 2020 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
package resourcecontrol
import (
"fmt"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
)
// DeviceToCgroupDeviceRule stats the host path device and returns an
// allow rule with "rwm" permissions for its type, major and minor numbers.
// It returns an error when the path cannot be stat'ed or when it is neither
// a character nor a block device.
func DeviceToCgroupDeviceRule(device string) (*devices.Rule, error) {
	var st unix.Stat_t
	if err := unix.Stat(device, &st); err != nil {
		return nil, err
	}

	rule := &devices.Rule{
		Allow:       true,
		Permissions: "rwm",
		Major:       int64(unix.Major(st.Rdev)),
		Minor:       int64(unix.Minor(st.Rdev)),
	}

	// Only the file type bits of the mode are relevant here.
	switch devType := st.Mode & unix.S_IFMT; devType {
	case unix.S_IFCHR:
		rule.Type = 'c'
	case unix.S_IFBLK:
		rule.Type = 'b'
	default:
		return nil, fmt.Errorf("unsupported device type: %v", devType)
	}

	return rule, nil
}
// DeviceToLinuxDevice converts the host path device into an OCI device
// cgroup entry, using the rule built by DeviceToCgroupDeviceRule.
// On error the returned entry is the zero value.
func DeviceToLinuxDevice(device string) (specs.LinuxDeviceCgroup, error) {
	var linuxDevice specs.LinuxDeviceCgroup

	rule, err := DeviceToCgroupDeviceRule(device)
	if err != nil {
		return linuxDevice, err
	}

	linuxDevice.Allow = rule.Allow
	linuxDevice.Type = string(rule.Type)
	linuxDevice.Major = &rule.Major
	linuxDevice.Minor = &rule.Minor
	linuxDevice.Access = string(rule.Permissions)

	return linuxDevice, nil
}

View File

@@ -0,0 +1,131 @@
// Copyright (c) 2020 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
package resourcecontrol
import (
"context"
"fmt"
"path/filepath"
"strings"
"github.com/containerd/cgroups"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/godbus/dbus/v5"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
)
// DefaultResourceControllerID is the runtime-determined location in the
// cgroups hierarchy. ValidCgroupPath places relative paths under it.
const DefaultResourceControllerID = "/vc"
// ValidCgroupPath returns a valid cgroup path.
// see https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md#cgroups-path
//
// A path in systemd form is returned unchanged. When systemdCgroup is set
// and path is not in systemd form, an error is returned. Otherwise the path
// is cleaned and, if relative, placed under DefaultResourceControllerID.
func ValidCgroupPath(path string, systemdCgroup bool) (string, error) {
	switch {
	case IsSystemdCgroup(path):
		return path, nil

	case systemdCgroup:
		return "", fmt.Errorf("malformed systemd path '%v': expected to be of form 'slice:prefix:name'", path)

	case filepath.IsAbs(path):
		// In the case of an absolute path (starting with /), the runtime
		// MUST take the path to be relative to the cgroups mount point.
		return filepath.Clean(path), nil

	default:
		// In the case of a relative path (not starting with /), the runtime
		// MAY interpret the path relative to a runtime-determined location
		// in the cgroups hierarchy. Clean up the path and return a new path
		// relative to DefaultResourceControllerID.
		return filepath.Join(DefaultResourceControllerID, filepath.Clean("/"+path)), nil
	}
}
// IsSystemdCgroup reports whether cgroupPath is in the format expected when
// systemd manages the cgroups, i.e. slice:scopeprefix:name. A typical
// example would be:
//
//	system.slice:docker:6b4c4a4d0cc2a12c529dcb13a2b8e438dfb3b2a6af34d548d7d
//
// The path is split on the ':' delimiter; it must have exactly three
// sections and the first one must have .slice as a suffix.
func IsSystemdCgroup(cgroupPath string) bool {
	sections := strings.Split(cgroupPath, ":")

	return len(sections) == 3 && strings.HasSuffix(sections[0], ".slice")
}
// newProperty builds a systemd D-Bus property called name whose value is
// units wrapped in a D-Bus variant.
func newProperty(name string, units interface{}) systemdDbus.Property {
	var property systemdDbus.Property

	property.Name = name
	property.Value = dbus.MakeVariant(units)

	return property
}
// cgroupHierarchy returns the containerd cgroups hierarchy and path
// needed to load the cgroup identified by path.
//
// A path that is not in systemd form maps to the V1 hierarchy with a static
// path. A systemd path (slice:prefix:name) maps to the Systemd hierarchy
// with the expanded slice and the scope unit derived from the path.
// An error is returned when the systemd path cannot be split or its slice
// cannot be expanded.
func cgroupHierarchy(path string) (cgroups.Hierarchy, cgroups.Path, error) {
	if !IsSystemdCgroup(path) {
		return cgroups.V1, cgroups.StaticPath(path), nil
	}

	slice, unit, err := getSliceAndUnit(path)
	if err != nil {
		return nil, nil, err
	}

	// The expansion error must be captured here: discarding it would leave
	// the check below looking at the stale error from getSliceAndUnit.
	cgroupSlicePath, err := systemd.ExpandSlice(slice)
	if err != nil {
		return nil, nil, err
	}

	return cgroups.Systemd, cgroups.Slice(cgroupSlicePath, unit), nil
}
// createCgroupsSystemd asks systemd, over D-Bus, to start a transient scope
// unit named unit inside slice with the process pid as a member, and waits
// for the job result to be delivered. The job result itself is not
// inspected. Units whose name does not end in ".scope" are rejected.
func createCgroupsSystemd(slice string, unit string, pid uint32) error {
	ctx := context.TODO()

	conn, err := systemdDbus.NewWithContext(ctx)
	if err != nil {
		return err
	}
	defer conn.Close()

	// https://github.com/opencontainers/runc/blob/master/docs/systemd.md
	if !strings.HasSuffix(unit, ".scope") {
		return fmt.Errorf("Failed to create cgroups with systemd: unit %s is not a scope", unit)
	}

	properties := []systemdDbus.Property{
		systemdDbus.PropDescription("cgroup " + unit),
		newProperty("DefaultDependencies", false),
		newProperty("MemoryAccounting", true),
		newProperty("CPUAccounting", true),
		newProperty("IOAccounting", true),
		// It's a scope, which we put into a Slice=.
		systemdDbus.PropSlice(slice),
		newProperty("Delegate", true),
		systemdDbus.PropPids(pid),
	}

	jobDone := make(chan string)

	// https://www.freedesktop.org/wiki/Software/systemd/ControlGroupInterface/
	if _, err := conn.StartTransientUnitContext(ctx, unit, "replace", properties, jobDone); err != nil {
		return err
	}

	// Block until systemd reports the outcome of the job.
	<-jobDone

	return nil
}
// getSliceAndUnit splits a systemd cgroup path of the form
// slice:prefix:name into the slice and the scope unit name
// "prefix-name.scope". It returns an error when the path does not have
// exactly three ':'-separated sections or the first one does not end in
// ".slice".
func getSliceAndUnit(cgroupPath string) (string, string, error) {
	sections := strings.Split(cgroupPath, ":")
	if len(sections) != 3 || !strings.HasSuffix(sections[0], ".slice") {
		return "", "", fmt.Errorf("Path: %s is not valid systemd's cgroups path", cgroupPath)
	}

	slice, prefix, name := sections[0], sections[1], sections[2]

	return slice, fmt.Sprintf("%s-%s.scope", prefix, name), nil
}

View File

@@ -0,0 +1,160 @@
// Copyright (c) 2020 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
package resourcecontrol
import (
"os"
"path/filepath"
"strings"
"testing"
"github.com/stretchr/testify/assert"
)
// TestIsSystemdCgroup checks IsSystemdCgroup against a table of systemd
// style and non-systemd cgroup paths.
func TestIsSystemdCgroup(t *testing.T) {
	assert := assert.New(t)

	testCases := []struct {
		path     string
		expected bool
	}{
		{"foo.slice:kata:afhts2e5d4g5s", true},
		{"system.slice:kata:afhts2e5d4g5s", true},
		{"/kata/afhts2e5d4g5s", false},
		{"a:b:c:d", false},
		{":::", false},
		{"", false},
		{":", false},
		{"::", false},
		{":::", false},
		{"a:b", false},
		{"a:b:", false},
		{":a:b", false},
		{"@:@:@", false},
	}

	for _, tc := range testCases {
		actual := IsSystemdCgroup(tc.path)
		assert.Equal(tc.expected, actual, "invalid systemd cgroup path: %v", tc.path)
	}
}
// TestValidCgroupPath checks ValidCgroupPath against a table of paths:
// error cases must fail, absolute paths must keep their cleaned parent
// directory as a prefix, systemd paths must come back unchanged and
// relative paths must end up under DefaultResourceControllerID.
func TestValidCgroupPath(t *testing.T) {
	assert := assert.New(t)

	testCases := []struct {
		path          string
		systemdCgroup bool
		error         bool
	}{
		// empty paths
		{"../../../", false, false},
		{"../", false, false},
		{".", false, false},
		{"../../../", false, false},
		{"./../", false, false},

		// valid no-systemd paths
		{"../../../foo", false, false},
		{"/../hi", false, false},
		{"/../hi/foo", false, false},
		{"o / m /../ g", false, false},
		{"/overhead/foobar", false, false},
		{"/sys/fs/cgroup/cpu/sandbox/kata_foobar", false, false},

		// invalid systemd paths
		{"o / m /../ g", true, true},
		{"slice:kata", true, true},
		{"/kata/afhts2e5d4g5s", true, true},
		{"a:b:c:d", true, true},
		{":::", true, true},
		{"", true, true},
		{":", true, true},
		{"::", true, true},
		{":::", true, true},
		{"a:b", true, true},
		{"a:b:", true, true},
		{":a:b", true, true},
		{"@:@:@", true, true},

		// valid systemd paths
		{"x.slice:kata:55555", true, false},
		{"system.slice:kata:afhts2e5d4g5s", true, false},
	}

	for _, tc := range testCases {
		path, err := ValidCgroupPath(tc.path, tc.systemdCgroup)
		if tc.error {
			assert.Error(err)
			continue
		}
		assert.NoError(err)

		switch {
		case filepath.IsAbs(tc.path):
			cleanPath := filepath.Dir(filepath.Clean(tc.path))
			assert.True(strings.HasPrefix(path, cleanPath),
				"%v should have prefix %v", path, cleanPath)
		case tc.systemdCgroup:
			assert.Equal(tc.path, path)
		default:
			assert.True(
				strings.HasPrefix(path, DefaultResourceControllerID),
				"%v should have prefix /%v", path, DefaultResourceControllerID)
		}
	}
}
// TestDeviceToCgroupDeviceRule checks that DeviceToCgroupDeviceRule rejects
// a regular file and a missing file, and that it builds a character device
// rule for /dev/null when that device exists.
func TestDeviceToCgroupDeviceRule(t *testing.T) {
	assert := assert.New(t)

	tmpFile, err := os.CreateTemp("", "device")
	assert.NoError(err)
	tmpFile.Close()

	// fail: regular file to device
	rule, err := DeviceToCgroupDeviceRule(tmpFile.Name())
	assert.Error(err)
	assert.Nil(rule)

	// fail: no such file
	os.Remove(tmpFile.Name())
	rule, err = DeviceToCgroupDeviceRule(tmpFile.Name())
	assert.Error(err)
	assert.Nil(rule)

	const devPath = "/dev/null"
	if _, statErr := os.Stat(devPath); os.IsNotExist(statErr) {
		// Skipf stops the test, so nothing below runs in that case.
		t.Skipf("no such device: %v", devPath)
	}

	rule, err = DeviceToCgroupDeviceRule(devPath)
	assert.NoError(err)
	assert.NotNil(rule)
	assert.Equal(rune(rule.Type), 'c')
	assert.NotZero(rule.Major)
	assert.NotZero(rule.Minor)
	assert.NotEmpty(rule.Permissions)
	assert.True(rule.Allow)
}
// TestDeviceToLinuxDevice checks that DeviceToLinuxDevice builds a
// character device entry with non-zero numbers for /dev/null when that
// device exists.
func TestDeviceToLinuxDevice(t *testing.T) {
	assert := assert.New(t)

	const devPath = "/dev/null"
	if _, statErr := os.Stat(devPath); os.IsNotExist(statErr) {
		// Skipf stops the test, so nothing below runs in that case.
		t.Skipf("no such device: %v", devPath)
	}

	linuxDevice, err := DeviceToLinuxDevice(devPath)
	assert.NoError(err)
	assert.NotNil(linuxDevice)
	assert.Equal(linuxDevice.Type, "c")
	assert.NotNil(linuxDevice.Major)
	assert.NotZero(*linuxDevice.Major)
	assert.NotNil(linuxDevice.Minor)
	assert.NotZero(*linuxDevice.Minor)
	assert.NotEmpty(linuxDevice.Access)
	assert.True(linuxDevice.Allow)
}