runtime: Support for host cgroup v2

Support cgroup v2 on the host. Update vendor containerd/cgroups to add cgroup v2.

Fixes: #3073

Signed-off-by: yaoyinnan <yaoyinnan@foxmail.com>
This commit is contained in:
yaoyinnan
2022-07-14 15:37:22 +08:00
parent 4ab45e5c93
commit 5c3155f7e2
31 changed files with 7179 additions and 125 deletions

View File

@@ -15,16 +15,21 @@ import (
"sync"
"github.com/containerd/cgroups"
v1 "github.com/containerd/cgroups/stats/v1"
cgroupsv2 "github.com/containerd/cgroups/v2"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
)
// prepend a kata specific string to oci cgroup path to
// form a different cgroup path, thus cAdvisor couldn't
// find kata containers cgroup path on host to prevent it
// from grabbing the stats data.
const CgroupKataPrefix = "kata"
const (
// CgroupKataPrefix is a kata-specific string prepended to the last element
// of the OCI cgroup path to form a different cgroup path, so that cAdvisor
// cannot find the kata containers cgroup path on the host and is thereby
// prevented from grabbing the stats data.
CgroupKataPrefix = "kata"
// unifiedMountpoint is the cgroup v2 mount point.
unifiedMountpoint = "/sys/fs/cgroup"
)
func RenameCgroupPath(path string) (string, error) {
if path == "" {
@@ -34,11 +39,10 @@ func RenameCgroupPath(path string) (string, error) {
cgroupPathDir := filepath.Dir(path)
cgroupPathName := fmt.Sprintf("%s_%s", CgroupKataPrefix, filepath.Base(path))
return filepath.Join(cgroupPathDir, cgroupPathName), nil
}
type LinuxCgroup struct {
cgroup cgroups.Cgroup
cgroup interface{}
path string
cpusets *specs.LinuxCPU
devices []specs.LinuxDeviceCgroup
@@ -128,15 +132,29 @@ func sandboxDevices() []specs.LinuxDeviceCgroup {
func NewResourceController(path string, resources *specs.LinuxResources) (ResourceController, error) {
var err error
var cgroup interface{}
var cgroupPath string
cgroupPath, err := ValidCgroupPath(path, IsSystemdCgroup(path))
if err != nil {
return nil, err
}
cgroup, err := cgroups.New(cgroups.V1, cgroups.StaticPath(cgroupPath), resources)
if err != nil {
return nil, err
if cgroups.Mode() == cgroups.Legacy || cgroups.Mode() == cgroups.Hybrid {
cgroupPath, err = ValidCgroupPathV1(path, IsSystemdCgroup(path))
if err != nil {
return nil, err
}
cgroup, err = cgroups.New(cgroups.V1, cgroups.StaticPath(cgroupPath), resources)
if err != nil {
return nil, err
}
} else if cgroups.Mode() == cgroups.Unified {
cgroupPath, err = ValidCgroupPathV2(path, IsSystemdCgroup(path))
if err != nil {
return nil, err
}
cgroup, err = cgroupsv2.NewManager(unifiedMountpoint, cgroupPath, cgroupsv2.ToResources(resources))
if err != nil {
return nil, err
}
} else {
return nil, ErrCgroupMode
}
return &LinuxCgroup{
@@ -148,40 +166,56 @@ func NewResourceController(path string, resources *specs.LinuxResources) (Resour
}
func NewSandboxResourceController(path string, resources *specs.LinuxResources, sandboxCgroupOnly bool) (ResourceController, error) {
var cgroup cgroups.Cgroup
sandboxResources := *resources
sandboxResources.Devices = append(sandboxResources.Devices, sandboxDevices()...)
// Currently we know to handle systemd cgroup path only when it's the only cgroup (no overhead group), hence,
// if sandboxCgroupOnly is not true we treat it as cgroupfs path as it used to be, although it may be incorrect
// if sandboxCgroupOnly is not true we treat it as cgroupfs path as it used to be, although it may be incorrect.
if !IsSystemdCgroup(path) || !sandboxCgroupOnly {
return NewResourceController(path, &sandboxResources)
}
var cgroup interface{}
slice, unit, err := getSliceAndUnit(path)
if err != nil {
return nil, err
}
// github.com/containerd/cgroups doesn't support creating a scope unit with
// v1 cgroups against systemd, the following interacts directly with systemd
// to create the cgroup and then load it using containerd's api
err = createCgroupsSystemd(slice, unit, uint32(os.Getpid())) // adding runtime process, it makes calling setupCgroups redundant
if err != nil {
// github.com/containerd/cgroups doesn't support creating a scope unit with
// v1 and v2 cgroups against systemd, so the following interacts directly
// with systemd to create the cgroup, which is then loaded using containerd's
// API. The runtime process is added at creation time, which makes calling
// setupCgroups redundant.
//
// The result must be assigned to err: without the assignment the check
// would test the stale err from getSliceAndUnit and a systemd failure
// would go unnoticed.
if err = createCgroupsSystemd(slice, unit, os.Getpid()); err != nil {
	return nil, err
}
cgHierarchy, cgPath, err := cgroupHierarchy(path)
if err != nil {
return nil, err
}
// Create systemd cgroup
if cgroups.Mode() == cgroups.Legacy || cgroups.Mode() == cgroups.Hybrid {
cgHierarchy, cgPath, err := cgroupHierarchy(path)
if err != nil {
return nil, err
}
// load created cgroup and update with resources
if cgroup, err = cgroups.Load(cgHierarchy, cgPath); err == nil {
err = cgroup.Update(&sandboxResources)
}
if err != nil {
return nil, err
// Load the cgroup created through systemd, then apply the sandbox
// resources to it. Both steps must succeed: a failed load leaves no usable
// cgroup to update, and a failed update must be reported, not dropped.
cg, err := cgroups.Load(cgHierarchy, cgPath)
if err != nil {
	return nil, err
}
if err = cg.Update(&sandboxResources); err != nil {
	return nil, err
}
cgroup = cg
} else if cgroups.Mode() == cgroups.Unified {
// Load the systemd-created cgroup v2 unit, then apply the sandbox
// resources to it. Both steps must succeed: a failed load leaves no usable
// manager to update, and a failed update must be reported, not dropped.
cg, err := cgroupsv2.LoadSystemd(slice, unit)
if err != nil {
	return nil, err
}
if err = cg.Update(cgroupsv2.ToResources(&sandboxResources)); err != nil {
	return nil, err
}
cgroup = cg
} else {
return nil, ErrCgroupMode
}
return &LinuxCgroup{
@@ -193,14 +227,38 @@ func NewSandboxResourceController(path string, resources *specs.LinuxResources,
}
func LoadResourceController(path string) (ResourceController, error) {
cgHierarchy, cgPath, err := cgroupHierarchy(path)
if err != nil {
return nil, err
}
var err error
var cgroup interface{}
cgroup, err := cgroups.Load(cgHierarchy, cgPath)
if err != nil {
return nil, err
// load created cgroup and update with resources
if cgroups.Mode() == cgroups.Legacy || cgroups.Mode() == cgroups.Hybrid {
cgHierarchy, cgPath, err := cgroupHierarchy(path)
if err != nil {
return nil, err
}
cgroup, err = cgroups.Load(cgHierarchy, cgPath)
if err != nil {
return nil, err
}
} else if cgroups.Mode() == cgroups.Unified {
if IsSystemdCgroup(path) {
slice, unit, err := getSliceAndUnit(path)
if err != nil {
return nil, err
}
cgroup, err = cgroupsv2.LoadSystemd(slice, unit)
if err != nil {
return nil, err
}
} else {
cgroup, err = cgroupsv2.LoadManager(unifiedMountpoint, path)
if err != nil {
return nil, err
}
}
} else {
return nil, ErrCgroupMode
}
return &LinuxCgroup{
@@ -214,37 +272,86 @@ func (c *LinuxCgroup) Logger() *logrus.Entry {
}
// Delete deletes the underlying cgroup, dispatching on the concrete type of
// the stored handle.
//
// A cgroup v1 handle is deleted directly. For a cgroup v2 manager,
// DeleteSystemd is called first when the controller ID is a systemd cgroup
// path, and Delete is called afterwards in either case. Any other handle
// type yields ErrCgroupMode.
func (c *LinuxCgroup) Delete() error {
return c.cgroup.Delete()
switch cg := c.cgroup.(type) {
case cgroups.Cgroup:
return cg.Delete()
case *cgroupsv2.Manager:
if IsSystemdCgroup(c.ID()) {
if err := cg.DeleteSystemd(); err != nil {
return err
}
}
return cg.Delete()
default:
return ErrCgroupMode
}
}
// Stat returns the statistics reported by the underlying cgroup handle.
//
// The result is returned as interface{} because its concrete type depends on
// the handle: the v1 branch returns whatever cgroups.Cgroup.Stat yields
// (called with the IgnoreNotExist error handler), the v2 branch whatever
// Manager.Stat yields. Any other handle type yields ErrCgroupMode.
func (c *LinuxCgroup) Stat() (*v1.Metrics, error) {
return c.cgroup.Stat(cgroups.ErrorHandler(cgroups.IgnoreNotExist))
func (c *LinuxCgroup) Stat() (interface{}, error) {
switch cg := c.cgroup.(type) {
case cgroups.Cgroup:
return cg.Stat(cgroups.IgnoreNotExist)
case *cgroupsv2.Manager:
return cg.Stat()
default:
return nil, ErrCgroupMode
}
}
// AddProcess adds the process identified by pid to the underlying cgroup by
// calling AddProc on the v1 or v2 handle. The subsystems argument is not
// consulted by this implementation. Any other handle type yields
// ErrCgroupMode.
func (c *LinuxCgroup) AddProcess(pid int, subsystems ...string) error {
return c.cgroup.Add(cgroups.Process{Pid: pid})
switch cg := c.cgroup.(type) {
case cgroups.Cgroup:
return cg.AddProc(uint64(pid))
case *cgroupsv2.Manager:
return cg.AddProc(uint64(pid))
default:
return ErrCgroupMode
}
}
// AddThread adds the thread identified by pid to the underlying cgroup.
//
// On a v1 handle this goes through AddTask. On a v2 manager it goes through
// AddProc, the same call AddProcess uses.
// NOTE(review): confirm that process-level placement is the intended
// behaviour for threads on cgroup v2.
// The subsystems argument is not consulted by this implementation. Any other
// handle type yields ErrCgroupMode.
func (c *LinuxCgroup) AddThread(pid int, subsystems ...string) error {
return c.cgroup.AddTask(cgroups.Process{Pid: pid})
switch cg := c.cgroup.(type) {
case cgroups.Cgroup:
return cg.AddTask(cgroups.Process{Pid: pid})
case *cgroupsv2.Manager:
return cg.AddProc(uint64(pid))
default:
return ErrCgroupMode
}
}
// Update applies resources to the underlying cgroup. A v1 handle receives
// the OCI resources as-is; for a v2 manager they are first converted with
// cgroupsv2.ToResources. Any other handle type yields ErrCgroupMode.
func (c *LinuxCgroup) Update(resources *specs.LinuxResources) error {
return c.cgroup.Update(resources)
switch cg := c.cgroup.(type) {
case cgroups.Cgroup:
return cg.Update(resources)
case *cgroupsv2.Manager:
return cg.Update(cgroupsv2.ToResources(resources))
default:
return ErrCgroupMode
}
}
// MoveTo loads the cgroup located at path and calls MoveTo on the current
// handle with the loaded cgroup as destination.
//
// For a v1 handle the path is resolved through cgroupHierarchy and loaded
// with cgroups.Load. For a v2 manager the path is passed unchanged to
// cgroupsv2.LoadManager under unifiedMountpoint. Errors from resolving or
// loading the destination are returned as-is; any other handle type yields
// ErrCgroupMode.
func (c *LinuxCgroup) MoveTo(path string) error {
cgHierarchy, cgPath, err := cgroupHierarchy(path)
if err != nil {
return err
switch cg := c.cgroup.(type) {
case cgroups.Cgroup:
cgHierarchy, cgPath, err := cgroupHierarchy(path)
if err != nil {
return err
}
newCgroup, err := cgroups.Load(cgHierarchy, cgPath)
if err != nil {
return err
}
return cg.MoveTo(newCgroup)
case *cgroupsv2.Manager:
newCgroup, err := cgroupsv2.LoadManager(unifiedMountpoint, path)
if err != nil {
return err
}
return cg.MoveTo(newCgroup)
default:
return ErrCgroupMode
}
newCgroup, err := cgroups.Load(cgHierarchy, cgPath)
if err != nil {
return err
}
return c.cgroup.MoveTo(newCgroup)
}
func (c *LinuxCgroup) AddDevice(deviceHostPath string) error {
@@ -258,10 +365,21 @@ func (c *LinuxCgroup) AddDevice(deviceHostPath string) error {
c.devices = append(c.devices, deviceResource)
if err := c.cgroup.Update(&specs.LinuxResources{
Devices: c.devices,
}); err != nil {
return err
switch cg := c.cgroup.(type) {
case cgroups.Cgroup:
if err := cg.Update(&specs.LinuxResources{
Devices: c.devices,
}); err != nil {
return err
}
case *cgroupsv2.Manager:
if err := cg.Update(cgroupsv2.ToResources(&specs.LinuxResources{
Devices: c.devices,
})); err != nil {
return err
}
default:
return ErrCgroupMode
}
return nil
@@ -284,10 +402,21 @@ func (c *LinuxCgroup) RemoveDevice(deviceHostPath string) error {
}
}
if err := c.cgroup.Update(&specs.LinuxResources{
Devices: c.devices,
}); err != nil {
return err
switch cg := c.cgroup.(type) {
case cgroups.Cgroup:
if err := cg.Update(&specs.LinuxResources{
Devices: c.devices,
}); err != nil {
return err
}
case *cgroupsv2.Manager:
if err := cg.Update(cgroupsv2.ToResources(&specs.LinuxResources{
Devices: c.devices,
})); err != nil {
return err
}
default:
return ErrCgroupMode
}
return nil
@@ -315,9 +444,18 @@ func (c *LinuxCgroup) UpdateCpuSet(cpuset, memset string) error {
c.cpusets.Mems = memset
}
return c.cgroup.Update(&specs.LinuxResources{
CPU: c.cpusets,
})
switch cg := c.cgroup.(type) {
case cgroups.Cgroup:
return cg.Update(&specs.LinuxResources{
CPU: c.cpusets,
})
case *cgroupsv2.Manager:
return cg.Update(cgroupsv2.ToResources(&specs.LinuxResources{
CPU: c.cpusets,
}))
default:
return ErrCgroupMode
}
}
func (c *LinuxCgroup) Type() ResourceControllerType {

View File

@@ -6,7 +6,6 @@
package resourcecontrol
import (
v1 "github.com/containerd/cgroups/stats/v1"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
)
@@ -56,7 +55,7 @@ type ResourceController interface {
Delete() error
// Stat returns the statistics for the controller.
Stat() (*v1.Metrics, error)
Stat() (interface{}, error)
// AddProcess adds a process to a set of controllers.
AddProcess(int, ...string) error

View File

@@ -6,6 +6,7 @@
package resourcecontrol
import (
"errors"
"fmt"
"strings"
@@ -14,6 +15,10 @@ import (
"golang.org/x/sys/unix"
)
var (
// ErrCgroupMode is the sentinel error for an unsupported cgroup mode or
// an unexpected cgroup controller handle type.
ErrCgroupMode = errors.New("cgroup controller type error")
)
func DeviceToCgroupDeviceRule(device string) (*devices.Rule, error) {
var st unix.Stat_t
deviceRule := devices.Rule{

View File

@@ -20,9 +20,9 @@ import (
// DefaultResourceControllerID runtime-determined location in the cgroups hierarchy.
const DefaultResourceControllerID = "/vc"
// ValidCgroupPath returns a valid cgroup path.
// ValidCgroupPathV1 returns a valid cgroup path for cgroup v1.
// see https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md#cgroups-path
func ValidCgroupPath(path string, systemdCgroup bool) (string, error) {
func ValidCgroupPathV1(path string, systemdCgroup bool) (string, error) {
if IsSystemdCgroup(path) {
return path, nil
}
@@ -43,6 +43,30 @@ func ValidCgroupPath(path string, systemdCgroup bool) (string, error) {
return filepath.Join(DefaultResourceControllerID, filepath.Clean("/"+path)), nil
}
// ValidCgroupPathV2 returns a valid cgroup path for cgroup v2.
// In cgroup v2 the path must be a "clean" absolute path starting with "/".
// see https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md#cgroups-path
//
// The result is chosen by the first rule that matches:
//   - a systemd cgroup path is rooted at "/";
//   - a non-systemd path while systemdCgroup is set is rejected as malformed;
//   - an absolute path is cleaned and taken relative to the cgroups mount point;
//   - a relative path is cleaned and placed under DefaultResourceControllerID.
func ValidCgroupPathV2(path string, systemdCgroup bool) (string, error) {
	switch {
	case IsSystemdCgroup(path):
		return filepath.Join("/", path), nil

	case systemdCgroup:
		// The caller asked for systemd handling but the path does not have
		// the systemd shape.
		return "", fmt.Errorf("malformed systemd path '%v': expected to be of form 'slice:prefix:name'", path)

	case filepath.IsAbs(path):
		// In the case of an absolute path (starting with /), the runtime MUST
		// take the path to be relative to the cgroups mount point.
		return filepath.Clean(path), nil

	default:
		// In the case of a relative path (not starting with /), the runtime
		// MAY interpret the path relative to a runtime-determined location in
		// the cgroups hierarchy. Anchoring at "/" before cleaning keeps ".."
		// elements from escaping DefaultResourceControllerID.
		sanitized := filepath.Clean("/" + path)
		return filepath.Join(DefaultResourceControllerID, sanitized), nil
	}
}
func newProperty(name string, units interface{}) systemdDbus.Property {
return systemdDbus.Property{
Name: name,
@@ -68,7 +92,7 @@ func cgroupHierarchy(path string) (cgroups.Hierarchy, cgroups.Path, error) {
}
}
func createCgroupsSystemd(slice string, unit string, pid uint32) error {
func createCgroupsSystemd(slice string, unit string, pid int) error {
ctx := context.TODO()
conn, err := systemdDbus.NewWithContext(ctx)
if err != nil {
@@ -84,14 +108,19 @@ func createCgroupsSystemd(slice string, unit string, pid uint32) error {
newProperty("IOAccounting", true),
}
// https://github.com/opencontainers/runc/blob/master/docs/systemd.md
if strings.HasSuffix(unit, ".scope") {
// It's a scope, which we put into a Slice=.
properties = append(properties, systemdDbus.PropSlice(slice))
properties = append(properties, newProperty("Delegate", true))
properties = append(properties, systemdDbus.PropPids(pid))
if strings.HasSuffix(unit, ".slice") {
// If we create a slice, the parent is defined via a Wants=.
properties = append(properties, systemdDbus.PropWants(slice))
} else {
return fmt.Errorf("Failed to create cgroups with systemd: unit %s is not a scope", unit)
// Otherwise it's a scope, which we put into a Slice=.
properties = append(properties, systemdDbus.PropSlice(slice))
}
// Assume scopes always support delegation (supported since systemd v218).
properties = append(properties, newProperty("Delegate", true))
if pid != -1 {
properties = append(properties, systemdDbus.PropPids(uint32(pid)))
}
ch := make(chan string)

View File

@@ -41,7 +41,7 @@ func TestIsSystemdCgroup(t *testing.T) {
}
}
func TestValidCgroupPath(t *testing.T) {
func TestValidCgroupPathV1(t *testing.T) {
assert := assert.New(t)
for _, t := range []struct {
@@ -62,12 +62,14 @@ func TestValidCgroupPath(t *testing.T) {
{"/../hi/foo", false, false},
{"o / m /../ g", false, false},
{"/overhead/foobar", false, false},
{"/kata/afhts2e5d4g5s", false, false},
{"/kubepods/besteffort/podxxx-afhts2e5d4g5s/kata_afhts2e5d4g5s", false, false},
{"/sys/fs/cgroup/cpu/sandbox/kata_foobar", false, false},
{"kata_overhead/afhts2e5d4g5s", false, false},
// invalid systemd paths
{"o / m /../ g", true, true},
{"slice:kata", true, true},
{"/kata/afhts2e5d4g5s", true, true},
{"a:b:c:d", true, true},
{":::", true, true},
{"", true, true},
@@ -83,7 +85,7 @@ func TestValidCgroupPath(t *testing.T) {
{"x.slice:kata:55555", true, false},
{"system.slice:kata:afhts2e5d4g5s", true, false},
} {
path, err := ValidCgroupPath(t.path, t.systemdCgroup)
path, err := ValidCgroupPathV1(t.path, t.systemdCgroup)
if t.error {
assert.Error(err)
continue
@@ -106,6 +108,73 @@ func TestValidCgroupPath(t *testing.T) {
}
// TestValidCgroupPathV2 checks ValidCgroupPathV2 against empty, cgroupfs and
// systemd style paths. Each case states whether the path is to be treated as
// a systemd cgroup path and whether an error is expected. For successful
// cases the returned path must keep the cleaned parent directory of an
// absolute input, be rooted at "/" for a systemd input, and otherwise live
// under DefaultResourceControllerID.
func TestValidCgroupPathV2(t *testing.T) {
	assert := assert.New(t)

	testCases := []struct {
		path          string
		systemdCgroup bool
		error         bool
	}{
		// empty paths
		{"../../../", false, false},
		{"../", false, false},
		{".", false, false},
		{"../../../", false, false},
		{"./../", false, false},

		// valid no-systemd paths
		{"../../../foo", false, false},
		{"/../hi", false, false},
		{"/../hi/foo", false, false},
		{"o / m /../ g", false, false},
		{"/overhead/foobar", false, false},
		{"/kata/afhts2e5d4g5s", false, false},
		{"/kubepods/besteffort/podxxx-afhts2e5d4g5s/kata_afhts2e5d4g5s", false, false},
		{"/sys/fs/cgroup/cpu/sandbox/kata_foobar", false, false},
		{"kata_overhead/afhts2e5d4g5s", false, false},

		// invalid systemd paths
		{"o / m /../ g", true, true},
		{"slice:kata", true, true},
		{"a:b:c:d", true, true},
		{":::", true, true},
		{"", true, true},
		{":", true, true},
		{"::", true, true},
		{":::", true, true},
		{"a:b", true, true},
		{"a:b:", true, true},
		{":a:b", true, true},
		{"@:@:@", true, true},

		// valid systemd paths
		{"x.slice:kata:55555", true, false},
		{"system.slice:kata:afhts2e5d4g5s", true, false},
	}

	// The loop variable is named tc so that it does not shadow *testing.T.
	for _, tc := range testCases {
		path, err := ValidCgroupPathV2(tc.path, tc.systemdCgroup)
		if tc.error {
			assert.Error(err)
			continue
		}
		assert.NoError(err)

		switch {
		case filepath.IsAbs(tc.path):
			cleanPath := filepath.Dir(filepath.Clean(tc.path))
			assert.True(strings.HasPrefix(path, cleanPath),
				"%v should have prefix %v", path, cleanPath)
		case tc.systemdCgroup:
			assert.Equal(filepath.Join("/", tc.path), path)
		default:
			// DefaultResourceControllerID already starts with "/", so the
			// message must not add another one.
			assert.True(
				strings.HasPrefix(path, DefaultResourceControllerID),
				"%v should have prefix %v", path, DefaultResourceControllerID)
		}
	}
}
func TestDeviceToCgroupDeviceRule(t *testing.T) {
assert := assert.New(t)