kata-containers/virtcontainers/cgroups.go
Peng Tao 0075bf85ba hypervisor: allow to return a slice of pids
so that for qemu, we can save and export the virtiofsd pid,
and put it into the same cgroup as the qemu process.

Fixes: #1972
Signed-off-by: Peng Tao <bergwolf@hyper.sh>
2019-08-21 11:37:01 +08:00

// Copyright (c) 2018 Huawei Corporation
// Copyright (c) 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//

package virtcontainers

import (
	"bufio"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"strings"

	"github.com/containerd/cgroups"
	"github.com/kata-containers/runtime/virtcontainers/pkg/annotations"
	specs "github.com/opencontainers/runtime-spec/specs-go"
)
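
// cgroupPather is implemented by cgroup subsystems that can report the
// host filesystem path backing a given cgroup path.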
type cgroupPather interface {
	cgroups.Subsystem
	Path(path string) string
}

// Unconstrained cgroups are placed here,
// for example /sys/fs/cgroup/memory/kata/$CGPATH,
// where $CGPATH is defined by the container manager.
const cgroupKataPath = "/kata/"

// Prepend a kata specific string to the OCI cgroup path to
// form a different cgroup path, so that cAdvisor cannot find
// the kata containers cgroup path on the host and grab its
// stats data.
const cgroupKataPrefix = "kata"

var cgroupsLoadFunc = cgroups.Load
var cgroupsNewFunc = cgroups.New

// V1Constraints returns the cgroups that are compatible with the VC
// architecture and hypervisor; constraints can be applied to these cgroups.
func V1Constraints() ([]cgroups.Subsystem, error) {
	root, err := cgroupV1MountPoint()
	if err != nil {
		return nil, err
	}

	subsystems := []cgroups.Subsystem{
		cgroups.NewCputset(root),
		cgroups.NewCpu(root),
		cgroups.NewCpuacct(root),
	}

	return cgroupsSubsystems(subsystems)
}

// V1NoConstraints returns the cgroups that are *not* compatible with the VC
// architecture and hypervisor; constraints MUST NOT be applied to these cgroups.
func V1NoConstraints() ([]cgroups.Subsystem, error) {
	root, err := cgroupV1MountPoint()
	if err != nil {
		return nil, err
	}

	subsystems := []cgroups.Subsystem{
		// Some container managers, like k8s, take control of cgroups.
		// k8s: the memory cgroup for the dns containers is too small to
		// place a hypervisor there.
		cgroups.NewMemory(root),
	}

	return cgroupsSubsystems(subsystems)
}
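
// cgroupsSubsystems filters the given subsystems, keeping only those
// whose hierarchy actually exists on the host.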
func cgroupsSubsystems(subsystems []cgroups.Subsystem) ([]cgroups.Subsystem, error) {
	var enabled []cgroups.Subsystem
	for _, s := range cgroupPathers(subsystems) {
		// check and remove the default groups that do not exist
		if _, err := os.Lstat(s.Path("/")); err == nil {
			enabled = append(enabled, s)
		}
	}
	return enabled, nil
}
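
// cgroupPathers returns the subset of subsystems that also implement
// the cgroupPather interface, i.e. that can report their host path.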
func cgroupPathers(subsystems []cgroups.Subsystem) []cgroupPather {
	var out []cgroupPather
	for _, s := range subsystems {
		if p, ok := s.(cgroupPather); ok {
			out = append(out, p)
		}
	}
	return out
}

// cgroupV1MountPoint returns the mount point where the cgroup
// mountpoints are mounted in a single hierarchy
func cgroupV1MountPoint() (string, error) {
	f, err := os.Open("/proc/self/mountinfo")
	if err != nil {
		return "", err
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		if err := scanner.Err(); err != nil {
			return "", err
		}
		var (
			text   = scanner.Text()
			fields = strings.Split(text, " ")
			// safe as mountinfo encodes mountpoints with spaces as \040.
			index               = strings.Index(text, " - ")
			postSeparatorFields = strings.Fields(text[index+3:])
			numPostFields       = len(postSeparatorFields)
		)
		// this is an error as we can't detect if the mount is for "cgroup"
		if numPostFields == 0 {
			return "", fmt.Errorf("Found no fields post '-' in %q", text)
		}
		if postSeparatorFields[0] == "cgroup" {
			// check that the mount is properly formatted.
			if numPostFields < 3 {
				return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
			}
			return filepath.Dir(fields[4]), nil
		}
	}
	return "", cgroups.ErrMountPointNotExist
}
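
// cgroupNoConstraintsPath returns the path of the unconstrained kata
// cgroup, i.e. the given path placed under cgroupKataPath.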
func cgroupNoConstraintsPath(path string) string {
	return filepath.Join(cgroupKataPath, path)
}

// parentCgroup returns the parent cgroup for the given path
func parentCgroup(hierarchy cgroups.Hierarchy, path string) (cgroups.Cgroup, error) {
	// append '/' just in case CgroupsPath doesn't start with it
	parent := filepath.Dir("/" + path)

	parentCgroup, err := cgroupsLoadFunc(hierarchy,
		cgroups.StaticPath(parent))
	if err != nil {
		return nil, fmt.Errorf("Could not load parent cgroup %v: %v", parent, err)
	}

	return parentCgroup, nil
}
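
// updateCgroups constrains the hypervisor and, when the sandbox holds
// more than one container, updates the sandbox cgroup with the
// aggregated container resource constraints.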
func (s *Sandbox) updateCgroups() error {
	if s.state.CgroupPath == "" {
		s.Logger().Warn("sandbox's cgroup won't be updated: cgroup path is empty")
		return nil
	}

	cgroup, err := cgroupsLoadFunc(V1Constraints, cgroups.StaticPath(s.state.CgroupPath))
	if err != nil {
		return fmt.Errorf("Could not load cgroup %v: %v", s.state.CgroupPath, err)
	}

	if err := s.constrainHypervisor(cgroup); err != nil {
		return err
	}

	if len(s.containers) <= 1 {
		// nothing to update
		return nil
	}

	resources, err := s.resources()
	if err != nil {
		return err
	}

	if err := cgroup.Update(&resources); err != nil {
		return fmt.Errorf("Could not update cgroup %v: %v", s.state.CgroupPath, err)
	}

	return nil
}
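
// deleteCgroups removes the unconstrained kata cgroup, moving any
// processes it still contains back to the parent cgroup first.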
func (s *Sandbox) deleteCgroups() error {
	s.Logger().Debug("Deleting sandbox cgroup")

	path := cgroupNoConstraintsPath(s.state.CgroupPath)
	s.Logger().WithField("path", path).Debug("Deleting no constraints cgroup")
	noConstraintsCgroup, err := cgroupsLoadFunc(V1NoConstraints, cgroups.StaticPath(path))
	if err == cgroups.ErrCgroupDeleted {
		// cgroup already deleted
		return nil
	}

	if err != nil {
		return fmt.Errorf("Could not load cgroup without constraints %v: %v", path, err)
	}

	// move running processes to the parent, so that this cgroup can be removed
	parent, err := parentCgroup(V1NoConstraints, path)
	if err != nil {
		// parent cgroup doesn't exist, which means there are no processes
		// running and the no constraints cgroup was removed.
		s.Logger().WithError(err).Warn("Parent cgroup doesn't exist")
		return nil
	}

	if err := noConstraintsCgroup.MoveTo(parent); err != nil {
		// Don't fail, the cgroup can still be deleted
		s.Logger().WithError(err).Warn("Could not move processes from no constraints to parent cgroup")
	}

	return noConstraintsCgroup.Delete()
}
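
// constrainHypervisor adds the hypervisor and its helper processes (such
// as virtiofsd, see the commit message above) to the unconstrained kata
// cgroup, then places only its vcpu threads in the constrained sandbox
// cgroup.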
func (s *Sandbox) constrainHypervisor(cgroup cgroups.Cgroup) error {
	pids := s.hypervisor.getPids()
	if len(pids) == 0 || pids[0] == 0 {
		return fmt.Errorf("Invalid hypervisor PID: %+v", pids)
	}

	// Move the hypervisor into cgroups without constraints;
	// applying constraints to those cgroups is not yet supported.
	resources := &specs.LinuxResources{}
	path := cgroupNoConstraintsPath(s.state.CgroupPath)
	noConstraintsCgroup, err := cgroupsNewFunc(V1NoConstraints, cgroups.StaticPath(path), resources)
	if err != nil {
		return fmt.Errorf("Could not create cgroup %v: %v", path, err)
	}

	for _, pid := range pids {
		if pid <= 0 {
			s.Logger().Warnf("Invalid hypervisor pid: %d", pid)
			continue
		}

		if err := noConstraintsCgroup.Add(cgroups.Process{Pid: pid}); err != nil {
			return fmt.Errorf("Could not add hypervisor PID %d to cgroup %v: %v", pid, path, err)
		}
	}

	// When a new container joins, a new CPU could be hotplugged, so we
	// have to query fresh vcpu info from the hypervisor every time.
	tids, err := s.hypervisor.getThreadIDs()
	if err != nil {
		return fmt.Errorf("failed to get thread ids from hypervisor: %v", err)
	}

	if len(tids.vcpus) == 0 {
		// If there are no tids returned from the hypervisor, this is not
		// a bug. It simply means there is nothing to constrain, hence
		// let's return without any error from here.
		return nil
	}

	// We are about to move just the vcpus (threads) into cgroups with constraints.
	// Moving the whole hypervisor process would be easier, but the IO/network
	// performance would be impacted.
	for _, i := range tids.vcpus {
		// Unlike Add, which writes the pid to `cgroup.procs`, AddTask writes
		// the thread id to `tasks`. After this, vcpu threads are in the "vcpu"
		// sub-cgroup, while other threads in qemu are left in the parent
		// cgroup untouched.
		if err := cgroup.AddTask(cgroups.Process{
			Pid: i,
		}); err != nil {
			return err
		}
	}

	return nil
}
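
// resources returns the aggregated resource constraints of all the
// containers in the sandbox.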
func (s *Sandbox) resources() (specs.LinuxResources, error) {
	resources := specs.LinuxResources{
		CPU: s.cpuResources(),
	}

	return resources, nil
}
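
// cpuResources merges the CPU constraints of all containers in the
// sandbox: quota, realtime runtime and realtime period are summed;
// shares and period take the maximum across containers; cpusets and
// memsets are concatenated.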
func (s *Sandbox) cpuResources() *specs.LinuxCPU {
	// Use default period and quota if they are not specified.
	// Container will inherit the constraints from its parent.
	quota := int64(0)
	period := uint64(0)
	shares := uint64(0)
	realtimePeriod := uint64(0)
	realtimeRuntime := int64(0)

	cpu := &specs.LinuxCPU{
		Quota:           &quota,
		Period:          &period,
		Shares:          &shares,
		RealtimePeriod:  &realtimePeriod,
		RealtimeRuntime: &realtimeRuntime,
	}

	for _, c := range s.containers {
		ann := c.GetAnnotations()
		if ann[annotations.ContainerTypeKey] == string(PodSandbox) {
			// skip sandbox container
			continue
		}

		if c.config.Resources.CPU == nil {
			continue
		}

		if c.config.Resources.CPU.Shares != nil {
			shares = uint64(math.Max(float64(*c.config.Resources.CPU.Shares), float64(shares)))
		}

		if c.config.Resources.CPU.Quota != nil {
			quota += *c.config.Resources.CPU.Quota
		}

		if c.config.Resources.CPU.Period != nil {
			period = uint64(math.Max(float64(*c.config.Resources.CPU.Period), float64(period)))
		}

		if c.config.Resources.CPU.Cpus != "" {
			cpu.Cpus += c.config.Resources.CPU.Cpus + ","
		}

		if c.config.Resources.CPU.RealtimeRuntime != nil {
			realtimeRuntime += *c.config.Resources.CPU.RealtimeRuntime
		}

		if c.config.Resources.CPU.RealtimePeriod != nil {
			realtimePeriod += *c.config.Resources.CPU.RealtimePeriod
		}

		if c.config.Resources.CPU.Mems != "" {
			cpu.Mems += c.config.Resources.CPU.Mems + ","
		}
	}

	// trim the trailing separators left by the concatenations above
	cpu.Cpus = strings.Trim(cpu.Cpus, " \n\t,")
	cpu.Mems = strings.Trim(cpu.Mems, " \n\t,")

	return validCPUResources(cpu)
}

// validCPUResources checks CPU resources coherency
func validCPUResources(cpuSpec *specs.LinuxCPU) *specs.LinuxCPU {
	if cpuSpec == nil {
		return nil
	}

	cpu := *cpuSpec
	if cpu.Period != nil && *cpu.Period < 1 {
		cpu.Period = nil
	}

	if cpu.Quota != nil && *cpu.Quota < 1 {
		cpu.Quota = nil
	}

	if cpu.Shares != nil && *cpu.Shares < 1 {
		cpu.Shares = nil
	}

	if cpu.RealtimePeriod != nil && *cpu.RealtimePeriod < 1 {
		cpu.RealtimePeriod = nil
	}

	if cpu.RealtimeRuntime != nil && *cpu.RealtimeRuntime < 1 {
		cpu.RealtimeRuntime = nil
	}

	return &cpu
}
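
// renameCgroupPath adds the kata prefix to the last component of the
// given cgroup path, hiding it from host monitoring tools such as
// cAdvisor (see cgroupKataPrefix).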
func renameCgroupPath(path string) (string, error) {
	if path == "" {
		return "", fmt.Errorf("Cgroup path is empty")
	}

	cgroupPathDir := filepath.Dir(path)
	cgroupPathName := fmt.Sprintf("%s_%s", cgroupKataPrefix, filepath.Base(path))
	return filepath.Join(cgroupPathDir, cgroupPathName), nil
}