diff --git a/cli/create.go b/cli/create.go index 0bbc288ca..3cd868b02 100644 --- a/cli/create.go +++ b/cli/create.go @@ -10,9 +10,7 @@ import ( "context" "errors" "fmt" - "io/ioutil" "os" - "path/filepath" "strings" vc "github.com/kata-containers/runtime/virtcontainers" @@ -177,26 +175,6 @@ func create(ctx context.Context, containerID, bundlePath, console, pidFilePath s } } - // config.json provides a cgroups path that has to be used to create "tasks" - // and "cgroups.procs" files. Those files have to be filled with a PID, which - // is shim's in our case. This is mandatory to make sure there is no one - // else (like Docker) trying to create those files on our behalf. We want to - // know those files location so that we can remove them when delete is called. - cgroupsPathList, err := processCgroupsPath(ctx, ociSpec, containerType.IsSandbox()) - if err != nil { - return err - } - - // cgroupsDirPath is CgroupsPath fetch from OCI spec - var cgroupsDirPath string - if ociSpec.Linux != nil { - cgroupsDirPath = ociSpec.Linux.CgroupsPath - } - - if err := createCgroupsFiles(ctx, containerID, cgroupsDirPath, cgroupsPathList, process.Pid); err != nil { - return err - } - // Creation of PID file has to be the last thing done in the create // because containerd considers the create complete after this file // is created. @@ -379,52 +357,6 @@ func createContainer(ctx context.Context, ociSpec oci.CompatOCISpec, containerID return c.Process(), nil } -func createCgroupsFiles(ctx context.Context, containerID string, cgroupsDirPath string, cgroupsPathList []string, pid int) error { - span, _ := trace(ctx, "createCgroupsFiles") - defer span.Finish() - - if len(cgroupsPathList) == 0 { - kataLog.WithField("pid", pid).Info("Cgroups files not created because cgroupsPath was empty") - return nil - } - - for _, cgroupsPath := range cgroupsPathList { - if err := os.MkdirAll(cgroupsPath, cgroupsDirMode); err != nil { - return err - } - - if strings.Contains(cgroupsPath, "cpu") && cgroupsDirPath != "" { - parent := strings.TrimSuffix(cgroupsPath, cgroupsDirPath) - copyParentCPUSet(cgroupsPath, parent) - } - - tasksFilePath := filepath.Join(cgroupsPath, cgroupsTasksFile) - procsFilePath := filepath.Join(cgroupsPath, cgroupsProcsFile) - - pidStr := fmt.Sprintf("%d", pid) - - for _, path := range []string{tasksFilePath, procsFilePath} { - f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, cgroupsFileMode) - if err != nil { - return err - } - defer f.Close() - - n, err := f.WriteString(pidStr) - if err != nil { - return err - } - - if n < len(pidStr) { - return fmt.Errorf("Could not write pid to %q: only %d bytes written out of %d", - path, n, len(pidStr)) - } - } - } - - return nil -} - func createPIDFile(ctx context.Context, pidFilePath string, pid int) error { span, _ := trace(ctx, "createPIDFile") defer span.Finish() @@ -457,48 +389,3 @@ func createPIDFile(ctx context.Context, pidFilePath string, pid int) error { return nil } - -// copyParentCPUSet copies the cpuset.cpus and cpuset.mems from the parent -// directory to the current directory if the file's contents are 0 -func copyParentCPUSet(current, parent string) error { - currentCpus, currentMems, err := getCPUSet(current) - if err != nil { - return err - } - - parentCpus, parentMems, err := getCPUSet(parent) - if err != nil { - return err - } - - if len(parentCpus) < 1 || len(parentMems) < 1 { - return nil - } - - var cgroupsFileMode = os.FileMode(0600) - if isEmptyString(currentCpus) { - if err := writeFile(filepath.Join(current, "cpuset.cpus"), string(parentCpus), cgroupsFileMode); err != nil { - return err - } - } - - if isEmptyString(currentMems) { - if err := writeFile(filepath.Join(current, "cpuset.mems"), string(parentMems), cgroupsFileMode); err != nil { - return err - } - } - - return nil -} - -func getCPUSet(parent string) (cpus []byte, mems []byte, err error) { - if cpus, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.cpus")); err != nil { - return - } - - if mems, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.mems")); err != nil { - return - } - - return cpus, mems, nil -} diff --git a/cli/create_test.go b/cli/create_test.go index 362ec4820..02cd95ddf 100644 --- a/cli/create_test.go +++ b/cli/create_test.go @@ -36,22 +36,6 @@ const ( var testStrPID = fmt.Sprintf("%d", testPID) -func mockCPUSetContent(contents map[string]string) error { - for filePath, data := range contents { - if err := writeFile(filePath, data, testFileMode); err != nil { - return err - } - } - - return nil -} - -func testCreateCgroupsFilesSuccessful(t *testing.T, cgroupsDirPath string, cgroupsPathList []string, pid int) { - if err := createCgroupsFiles(context.Background(), "foo", cgroupsDirPath, cgroupsPathList, pid); err != nil { - t.Fatalf("This test should succeed (cgroupsPath %q, pid %d): %s", cgroupsPathList, pid, err) - } -} - // return the value of the *last* param with the specified key func findLastParam(key string, params []vc.Param) (string, error) { if key == "" { @@ -74,62 +58,6 @@ func findLastParam(key string, params []vc.Param) (string, error) { return "", fmt.Errorf("no param called %q found", name) } -func TestCgroupsFilesEmptyCgroupsPathSuccessful(t *testing.T) { - testCreateCgroupsFilesSuccessful(t, "", []string{}, testPID) -} - -func TestCreateCgroupsFilesFailToWriteFile(t *testing.T) { - if os.Geteuid() == 0 { - // The os.FileMode(0000) trick doesn't work for root. - t.Skip(testDisabledNeedNonRoot) - } - - assert := assert.New(t) - - tmpdir, err := ioutil.TempDir("", "") - assert.NoError(err) - defer os.RemoveAll(tmpdir) - - // create the file as a directory to force an error - file := filepath.Join(tmpdir, "cgroups-file") - err = os.MkdirAll(file, os.FileMode(0000)) - assert.NoError(err) - - files := []string{file} - - err = createCgroupsFiles(context.Background(), "foo", "cgroups-file", files, testPID) - assert.Error(err) -} - -func TestCgroupsFilesNonEmptyCgroupsPathSuccessful(t *testing.T) { - cgroupsPath, err := ioutil.TempDir(testDir, "cgroups-path-") - if err != nil { - t.Fatalf("Could not create temporary cgroups directory: %s", err) - } - - testCreateCgroupsFilesSuccessful(t, "cgroups-path-", []string{cgroupsPath}, testPID) - - defer os.RemoveAll(cgroupsPath) - - tasksPath := filepath.Join(cgroupsPath, cgroupsTasksFile) - procsPath := filepath.Join(cgroupsPath, cgroupsProcsFile) - - for _, path := range []string{tasksPath, procsPath} { - if _, err := os.Stat(path); err != nil { - t.Fatalf("Path %q should have been created: %s", path, err) - } - - fileBytes, err := ioutil.ReadFile(path) - if err != nil { - t.Fatalf("Could not read %q previously created: %s", path, err) - } - - if string(fileBytes) != testStrPID { - t.Fatalf("PID %s read from %q different from expected PID %s", string(fileBytes), path, testStrPID) - } - } -} - func TestCreatePIDFileSuccessful(t *testing.T) { pidDirPath, err := ioutil.TempDir(testDir, "pid-path-") if err != nil { @@ -1087,56 +1015,6 @@ func TestCreateCreateContainer(t *testing.T) { } } -func TestCopyParentCPUSetFail(t *testing.T) { - assert := assert.New(t) - - cgroupsPath, err := ioutil.TempDir(testDir, "cgroups-path-") - assert.NoError(err) - defer os.RemoveAll(cgroupsPath) - - err = copyParentCPUSet(cgroupsPath, testDir) - assert.Error(err) -} - -func TestCopyParentCPUSetSuccessful(t *testing.T) { - assert := assert.New(t) - - cgroupsPath, err := ioutil.TempDir(testDir, "cgroups-path-") - assert.NoError(err) - defer os.RemoveAll(cgroupsPath) - - cgroupsSrcPath := filepath.Join(cgroupsPath, "src") - err = os.Mkdir(cgroupsSrcPath, testDirMode) - assert.NoError(err) - - err = mockCPUSetContent(map[string]string{ - filepath.Join(cgroupsSrcPath, "cpuset.cpus"): "0-1", - filepath.Join(cgroupsSrcPath, "cpuset.mems"): "0-1", - }) - assert.NoError(err) - - cgroupsDstPath := filepath.Join(cgroupsPath, "dst") - err = os.Mkdir(cgroupsDstPath, testDirMode) - assert.NoError(err) - - fd, err := os.Create(filepath.Join(cgroupsDstPath, "cpuset.cpus")) - assert.NoError(err) - fd.Close() - - fd, err = os.Create(filepath.Join(cgroupsDstPath, "cpuset.mems")) - assert.NoError(err) - fd.Close() - - err = copyParentCPUSet(cgroupsDstPath, cgroupsSrcPath) - assert.NoError(err) - - currentCpus, currentMems, err := getCPUSet(cgroupsDstPath) - assert.NoError(err) - - assert.False(isEmptyString(currentCpus)) - assert.False(isEmptyString(currentMems)) -} - func TestSetKernelParams(t *testing.T) { assert := assert.New(t) diff --git a/cli/delete.go b/cli/delete.go index ff939f064..d7104f47d 100644 --- a/cli/delete.go +++ b/cli/delete.go @@ -123,19 +123,7 @@ func delete(ctx context.Context, containerID string, force bool) error { return err } - // In order to prevent any file descriptor leak related to cgroups files - // that have been previously created, we have to remove them before this - // function returns. - cgroupsPathList, err := processCgroupsPath(ctx, ociSpec, containerType.IsSandbox()) - if err != nil { - return err - } - - if err := delContainerIDMapping(ctx, containerID); err != nil { - return err - } - - return removeCgroupsPath(ctx, containerID, cgroupsPathList) + return delContainerIDMapping(ctx, containerID) } func deleteSandbox(ctx context.Context, sandboxID string) error { diff --git a/cli/oci.go b/cli/oci.go index 928ab3e6e..c54e6d60f 100644 --- a/cli/oci.go +++ b/cli/oci.go @@ -8,7 +8,6 @@ package main import ( "bufio" "context" - "errors" "fmt" "io/ioutil" "net" @@ -20,18 +19,11 @@ import ( "github.com/containernetworking/plugins/pkg/ns" vc "github.com/kata-containers/runtime/virtcontainers" - "github.com/kata-containers/runtime/virtcontainers/pkg/oci" "github.com/opencontainers/runc/libcontainer/utils" - specs "github.com/opencontainers/runtime-spec/specs-go" - "github.com/sirupsen/logrus" ) // Contants related to cgroup memory directory const ( - cgroupsTasksFile = "tasks" - cgroupsProcsFile = "cgroup.procs" - cgroupsDirMode = os.FileMode(0750) - cgroupsFileMode = os.FileMode(0640) ctrsMappingDirMode = os.FileMode(0750) // Filesystem type corresponding to CGROUP_SUPER_MAGIC as listed @@ -39,8 +31,6 @@ const ( cgroupFsType = 0x27e0eb ) -var errNeedLinuxResource = errors.New("Linux resource cannot be empty") - var cgroupsDirPath string var procMountInfo = "/proc/self/mountinfo" @@ -125,128 +115,6 @@ func validCreateParams(ctx context.Context, containerID, bundlePath string) (str return resolved, nil } -// processCgroupsPath process the cgroups path as expected from the -// OCI runtime specification. It returns a list of complete paths -// that should be created and used for every specified resource. -func processCgroupsPath(ctx context.Context, ociSpec oci.CompatOCISpec, isSandbox bool) ([]string, error) { - span, _ := trace(ctx, "processCgroupsPath") - defer span.Finish() - - var cgroupsPathList []string - - if ociSpec.Linux.CgroupsPath == "" { - return []string{}, nil - } - - if ociSpec.Linux.Resources == nil { - return []string{}, nil - } - - if ociSpec.Linux.Resources.Memory != nil { - memCgroupsPath, err := processCgroupsPathForResource(ociSpec, "memory", isSandbox) - if err != nil { - return []string{}, err - } - - if memCgroupsPath != "" { - cgroupsPathList = append(cgroupsPathList, memCgroupsPath) - } - } - - if ociSpec.Linux.Resources.CPU != nil { - cpuCgroupsPath, err := processCgroupsPathForResource(ociSpec, "cpu", isSandbox) - if err != nil { - return []string{}, err - } - - if cpuCgroupsPath != "" { - cgroupsPathList = append(cgroupsPathList, cpuCgroupsPath) - } - } - - if ociSpec.Linux.Resources.Pids != nil { - pidsCgroupsPath, err := processCgroupsPathForResource(ociSpec, "pids", isSandbox) - if err != nil { - return []string{}, err - } - - if pidsCgroupsPath != "" { - cgroupsPathList = append(cgroupsPathList, pidsCgroupsPath) - } - } - - if ociSpec.Linux.Resources.BlockIO != nil { - blkIOCgroupsPath, err := processCgroupsPathForResource(ociSpec, "blkio", isSandbox) - if err != nil { - return []string{}, err - } - - if blkIOCgroupsPath != "" { - cgroupsPathList = append(cgroupsPathList, blkIOCgroupsPath) - } - } - - return cgroupsPathList, nil -} - -func processCgroupsPathForResource(ociSpec oci.CompatOCISpec, resource string, isSandbox bool) (string, error) { - if resource == "" { - return "", errNeedLinuxResource - } - - var err error - cgroupsDirPath, err = getCgroupsDirPath(procMountInfo) - if err != nil { - return "", fmt.Errorf("get CgroupsDirPath error: %s", err) - } - - // Relative cgroups path provided. - if filepath.IsAbs(ociSpec.Linux.CgroupsPath) == false { - return filepath.Join(cgroupsDirPath, resource, ociSpec.Linux.CgroupsPath), nil - } - - // Absolute cgroups path provided. - var cgroupMount specs.Mount - cgroupMountFound := false - for _, mount := range ociSpec.Mounts { - if mount.Type == "cgroup" { - cgroupMount = mount - cgroupMountFound = true - break - } - } - - if !cgroupMountFound { - // According to the OCI spec, an absolute path should be - // interpreted as relative to the system cgroup mount point - // when there is no cgroup mount point. - return filepath.Join(cgroupsDirPath, resource, ociSpec.Linux.CgroupsPath), nil - } - - if cgroupMount.Destination == "" { - return "", fmt.Errorf("cgroupsPath is absolute, cgroup mount destination cannot be empty") - } - - cgroupPath := filepath.Join(cgroupMount.Destination, resource) - - // It is not an error to have this cgroup not mounted. It is usually - // due to an old kernel version with missing support for specific - // cgroups. - fields := logrus.Fields{ - "path": cgroupPath, - "type": "cgroup", - } - - if !isCgroupMounted(cgroupPath) { - kataLog.WithFields(fields).Info("path not mounted") - return "", nil - } - - kataLog.WithFields(fields).Info("path mounted") - - return filepath.Join(cgroupPath, ociSpec.Linux.CgroupsPath), nil -} - func isCgroupMounted(cgroupPath string) bool { var statFs syscall.Statfs_t diff --git a/cli/oci_test.go b/cli/oci_test.go index 7ab3c4940..1fd9428d0 100644 --- a/cli/oci_test.go +++ b/cli/oci_test.go @@ -13,17 +13,13 @@ import ( "net" "os" "path/filepath" - "reflect" - "syscall" "testing" "time" vc "github.com/kata-containers/runtime/virtcontainers" vcAnnotations "github.com/kata-containers/runtime/virtcontainers/pkg/annotations" - "github.com/kata-containers/runtime/virtcontainers/pkg/oci" "github.com/kata-containers/runtime/virtcontainers/pkg/vcmock" "github.com/opencontainers/runc/libcontainer/utils" - specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/stretchr/testify/assert" ) @@ -32,38 +28,6 @@ var ( consoleSocketPathTest = "console-socket-test" ) -type cgroupTestDataType struct { - resource string - linuxSpec *specs.LinuxResources -} - -var cgroupTestData = []cgroupTestDataType{ - { - "memory", - &specs.LinuxResources{ - Memory: &specs.LinuxMemory{}, - }, - }, - { - "cpu", - &specs.LinuxResources{ - CPU: &specs.LinuxCPU{}, - }, - }, - { - "pids", - &specs.LinuxResources{ - Pids: &specs.LinuxPids{}, - }, - }, - { - "blkio", - &specs.LinuxResources{ - BlockIO: &specs.LinuxBlockIO{}, - }, - }, -} - func TestGetContainerInfoContainerIDEmptyFailure(t *testing.T) { assert := assert.New(t) status, _, err := getContainerInfo(context.Background(), "") @@ -181,141 +145,6 @@ func TestValidCreateParamsBundleIsAFile(t *testing.T) { assert.False(vcmock.IsMockError(err)) } -func testProcessCgroupsPath(t *testing.T, ociSpec oci.CompatOCISpec, expected []string) { - assert := assert.New(t) - result, err := processCgroupsPath(context.Background(), ociSpec, true) - - assert.NoError(err) - - if reflect.DeepEqual(result, expected) == false { - assert.FailNow("DeepEqual failed", "Result path %q should match the expected one %q", result, expected) - } -} - -func TestProcessCgroupsPathEmptyPathSuccessful(t *testing.T) { - ociSpec := oci.CompatOCISpec{} - - ociSpec.Linux = &specs.Linux{ - CgroupsPath: "", - } - - testProcessCgroupsPath(t, ociSpec, []string{}) -} - -func TestProcessCgroupsPathEmptyResources(t *testing.T) { - ociSpec := oci.CompatOCISpec{} - - ociSpec.Linux = &specs.Linux{ - CgroupsPath: "foo", - } - - testProcessCgroupsPath(t, ociSpec, []string{}) -} - -func TestProcessCgroupsPathRelativePathSuccessful(t *testing.T) { - relativeCgroupsPath := "relative/cgroups/path" - cgroupsDirPath = "/foo/runtime/base" - - ociSpec := oci.CompatOCISpec{} - - ociSpec.Linux = &specs.Linux{ - CgroupsPath: relativeCgroupsPath, - } - - for _, d := range cgroupTestData { - ociSpec.Linux.Resources = d.linuxSpec - - p := filepath.Join(cgroupsDirPath, d.resource, relativeCgroupsPath) - - testProcessCgroupsPath(t, ociSpec, []string{p}) - } -} - -func TestProcessCgroupsPathAbsoluteNoCgroupMountSuccessful(t *testing.T) { - absoluteCgroupsPath := "/absolute/cgroups/path" - cgroupsDirPath = "/foo/runtime/base" - - ociSpec := oci.CompatOCISpec{} - - ociSpec.Linux = &specs.Linux{ - CgroupsPath: absoluteCgroupsPath, - } - - for _, d := range cgroupTestData { - ociSpec.Linux.Resources = d.linuxSpec - - p := filepath.Join(cgroupsDirPath, d.resource, absoluteCgroupsPath) - - testProcessCgroupsPath(t, ociSpec, []string{p}) - } -} - -func TestProcessCgroupsPathAbsoluteNoCgroupMountDestinationFailure(t *testing.T) { - assert := assert.New(t) - absoluteCgroupsPath := "/absolute/cgroups/path" - - ociSpec := oci.CompatOCISpec{} - - ociSpec.Mounts = []specs.Mount{ - { - Type: "cgroup", - }, - } - - ociSpec.Linux = &specs.Linux{ - CgroupsPath: absoluteCgroupsPath, - } - - for _, d := range cgroupTestData { - ociSpec.Linux.Resources = d.linuxSpec - for _, isSandbox := range []bool{true, false} { - _, err := processCgroupsPath(context.Background(), ociSpec, isSandbox) - assert.Error(err, "This test should fail because no cgroup mount destination provided") - } - } -} - -func TestProcessCgroupsPathAbsoluteSuccessful(t *testing.T) { - assert := assert.New(t) - - if os.Geteuid() != 0 { - t.Skip(testDisabledNeedRoot) - } - - memoryResource := "memory" - absoluteCgroupsPath := "/cgroup/mount/destination" - - cgroupMountDest, err := ioutil.TempDir("", "cgroup-memory-") - assert.NoError(err) - defer os.RemoveAll(cgroupMountDest) - - resourceMountPath := filepath.Join(cgroupMountDest, memoryResource) - err = os.MkdirAll(resourceMountPath, cgroupsDirMode) - assert.NoError(err) - - err = syscall.Mount("go-test", resourceMountPath, "cgroup", 0, memoryResource) - assert.NoError(err) - defer syscall.Unmount(resourceMountPath, 0) - - ociSpec := oci.CompatOCISpec{} - - ociSpec.Linux = &specs.Linux{ - Resources: &specs.LinuxResources{ - Memory: &specs.LinuxMemory{}, - }, - CgroupsPath: absoluteCgroupsPath, - } - - ociSpec.Mounts = []specs.Mount{ - { - Type: "cgroup", - Destination: cgroupMountDest, - }, - } - - testProcessCgroupsPath(t, ociSpec, []string{filepath.Join(resourceMountPath, absoluteCgroupsPath)}) -} - func TestSetupConsoleExistingConsolePathSuccessful(t *testing.T) { assert := assert.New(t) console, err := setupConsole(consolePathTest, "") @@ -436,31 +265,6 @@ func TestIsCgroupMounted(t *testing.T) { assert.True(isCgroupMounted(memoryCgroupPath), "%s is a cgroup", memoryCgroupPath) } -func TestProcessCgroupsPathForResource(t *testing.T) { - assert := assert.New(t) - - tmpdir, err := ioutil.TempDir("", "") - assert.NoError(err) - defer os.RemoveAll(tmpdir) - - bundlePath := filepath.Join(tmpdir, "bundle") - - err = makeOCIBundle(bundlePath) - assert.NoError(err) - - ociConfigFile := filepath.Join(bundlePath, specConfig) - assert.True(fileExists(ociConfigFile)) - - spec, err := readOCIConfigFile(ociConfigFile) - assert.NoError(err) - - for _, isSandbox := range []bool{true, false} { - _, err := processCgroupsPathForResource(spec, "", isSandbox) - assert.Error(err) - assert.False(vcmock.IsMockError(err)) - } -} - func TestGetCgroupsDirPath(t *testing.T) { assert := assert.New(t) diff --git a/cli/utils.go b/cli/utils.go index b544e0ba7..b13dff933 100644 --- a/cli/utils.go +++ b/cli/utils.go @@ -6,7 +6,6 @@ package main import ( - "bytes" "fmt" "io/ioutil" "os" @@ -227,11 +226,6 @@ func writeFile(filePath string, data string, fileMode os.FileMode) error { return nil } -// isEmptyString return if string is empty -func isEmptyString(b []byte) bool { - return len(bytes.Trim(b, "\n")) == 0 -} - // fileSize returns the number of bytes in the specified file func fileSize(file string) (int64, error) { st := syscall.Stat_t{} diff --git a/virtcontainers/api.go b/virtcontainers/api.go index 64e65f2c6..dce9e1f0c 100644 --- a/virtcontainers/api.go +++ b/virtcontainers/api.go @@ -126,6 +126,11 @@ func createSandboxFromConfig(ctx context.Context, sandboxConfig SandboxConfig, f return nil, err } + // Setup host cgroups + if err := s.setupCgroups(); err != nil { + return nil, err + } + return s, nil } diff --git a/virtcontainers/cgroups.go b/virtcontainers/cgroups.go new file mode 100644 index 000000000..9ad0e8e2f --- /dev/null +++ b/virtcontainers/cgroups.go @@ -0,0 +1,191 @@ +// Copyright (c) 2018 Huawei Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +package virtcontainers + +import ( + "encoding/json" + "fmt" + + "github.com/containerd/cgroups" + "github.com/kata-containers/runtime/virtcontainers/pkg/annotations" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +const ( + vcpuGroupName = "vcpu" + defaultCgroupParent = "/kata" +) + +type sandboxCgroups struct { + commonParent cgroups.Cgroup + sandboxSub cgroups.Cgroup + vcpuSub cgroups.Cgroup +} + +func (s *Sandbox) newCgroups() error { + // New will still succeed when cgroup exists + // create common parent for all kata-containers + // e.g. /sys/fs/cgroup/cpu/vc + parent, err := cgroups.New(cgroups.V1, + cgroups.StaticPath(defaultCgroupParent), &specs.LinuxResources{}) + if err != nil { + return fmt.Errorf("failed to create cgroup for %q", defaultCgroupParent) + } + + // create sub-cgroup for each sandbox + // e.g. /sys/fs/cgroup/cpu/vc/ + sandboxSub, err := parent.New(s.id, &specs.LinuxResources{}) + if err != nil { + return fmt.Errorf("failed to create cgroup for %s/%s", defaultCgroupParent, s.id) + } + + // create sub-cgroup for vcpu threads + vcpuSub, err := sandboxSub.New(vcpuGroupName, &specs.LinuxResources{}) + if err != nil { + return fmt.Errorf("failed to create cgroup for %s/%s/%s", defaultCgroupParent, s.id, vcpuGroupName) + } + + s.cgroup = &sandboxCgroups{ + commonParent: parent, + sandboxSub: sandboxSub, + vcpuSub: vcpuSub, + } + return nil +} + +func (s *Sandbox) destroyCgroups() error { + if s.cgroup == nil { + s.Logger().Warningf("cgroup is not initialized, no need to destroy") + return nil + } + + // first move all processes in subgroup to parent in case live process blocks + // deletion of cgroup + if err := s.cgroup.sandboxSub.MoveTo(s.cgroup.commonParent); err != nil { + return fmt.Errorf("failed to clear cgroup processes") + } + + return s.cgroup.sandboxSub.Delete() +} + +func (s *Sandbox) setupCgroups() error { + if s.cgroup == nil { + return fmt.Errorf("failed to setup uninitialized cgroup for sandbox") + } + + resource, err := s.mergeSpecResource() + if err != nil { + return err + } + + if err := s.applyCPUCgroup(resource); err != nil { + return err + } + return nil +} + +func (s *Sandbox) applyCPUCgroup(rc *specs.LinuxResources) error { + if s.cgroup == nil { + return fmt.Errorf("failed to setup uninitialized cgroup for sandbox") + } + + // apply cpu constraint to vcpu cgroup + if err := s.cgroup.vcpuSub.Update(rc); err != nil { + return err + } + + // when new container joins, new CPU could be hotplugged, so we + // have to query fresh vcpu info from hypervisor for every time. + tids, err := s.hypervisor.getThreadIDs() + if err != nil || tids == nil { + return fmt.Errorf("failed to get thread ids from hypervisor: %v", err) + } + + // use Add() to add vcpu thread to s.cgroup, it will write thread id to + // `cgroup.procs` which will move all threads in qemu process to this cgroup + // immediately as default behaviour. + if len(tids.vcpus) > 0 { + if err := s.cgroup.sandboxSub.Add(cgroups.Process{ + Pid: tids.vcpus[0], + }); err != nil { + return err + } + } + + for _, i := range tids.vcpus { + if i <= 0 { + continue + } + + // In contrast, AddTask will write thread id to `tasks` + // After this, vcpu threads are in "vcpu" sub-cgroup, other threads in + // qemu will be left in parent cgroup untouched. + if err := s.cgroup.vcpuSub.AddTask(cgroups.Process{ + Pid: i, + }); err != nil { + return err + } + } + + return nil +} + +func (s *Sandbox) mergeSpecResource() (*specs.LinuxResources, error) { + if s.config == nil { + return nil, fmt.Errorf("sandbox config is nil") + } + + resource := &specs.LinuxResources{ + CPU: &specs.LinuxCPU{}, + } + + for _, c := range s.config.Containers { + config, ok := c.Annotations[annotations.ConfigJSONKey] + if !ok { + s.Logger().WithField("container", c.ID).Warningf("failed to find config from container annotations") + continue + } + + var spec specs.Spec + if err := json.Unmarshal([]byte(config), &spec); err != nil { + return nil, err + } + + // TODO: how to handle empty/unlimited resource? + // maybe we should add a default CPU/Memory delta when no + // resource limit is given. -- @WeiZhang555 + if spec.Linux == nil || spec.Linux.Resources == nil { + continue + } + // calculate cpu quota and period + s.mergeCPUResource(resource, spec.Linux.Resources) + } + return resource, nil +} + +func (s *Sandbox) mergeCPUResource(orig, rc *specs.LinuxResources) { + if orig.CPU == nil { + orig.CPU = &specs.LinuxCPU{} + } + + if rc.CPU != nil && rc.CPU.Quota != nil && rc.CPU.Period != nil && + *rc.CPU.Quota > 0 && *rc.CPU.Period > 0 { + if orig.CPU.Period == nil { + orig.CPU.Period = rc.CPU.Period + orig.CPU.Quota = rc.CPU.Quota + } else { + // this is an example to show how it works: + // container A and `orig` has quota: 5000 and period 10000 + // here comes container B with quota 40 and period 100, + // so use previous period 10000 as a baseline, container B + // has proportional resource of quota 4000 and period 10000, calculated as + // delta := 40 / 100 * 10000 = 4000 + // and final `*orig.CPU.Quota` = 5000 + 4000 = 9000 + delta := float64(*rc.CPU.Quota) / float64(*rc.CPU.Period) * float64(*orig.CPU.Period) + *orig.CPU.Quota += int64(delta) + } + } +} diff --git a/virtcontainers/cgroups_test.go b/virtcontainers/cgroups_test.go new file mode 100644 index 000000000..1a89690fb --- /dev/null +++ b/virtcontainers/cgroups_test.go @@ -0,0 +1,211 @@ +// Copyright (c) 2018 Huawei Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +package virtcontainers + +import ( + "bufio" + "encoding/json" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "reflect" + "strings" + "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/stretchr/testify/assert" + + "github.com/kata-containers/runtime/virtcontainers/pkg/annotations" +) + +func getCgroupDestination(subsystem string) (string, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + defer f.Close() + s := bufio.NewScanner(f) + for s.Scan() { + if err := s.Err(); err != nil { + return "", err + } + fields := strings.Fields(s.Text()) + for _, opt := range strings.Split(fields[len(fields)-1], ",") { + if opt == subsystem { + return fields[4], nil + } + } + } + return "", fmt.Errorf("failed to find cgroup mountpoint for %q", subsystem) +} + +func TestMergeSpecResource(t *testing.T) { + s := &Sandbox{ + config: &SandboxConfig{ + Containers: []ContainerConfig{ + { + ID: "containerA", + Annotations: make(map[string]string), + }, + { + ID: "containerA", + Annotations: make(map[string]string), + }, + }, + }, + } + + contA := s.config.Containers[0] + contB := s.config.Containers[1] + + getIntP := func(x int64) *int64 { return &x } + getUintP := func(x uint64) *uint64 { return &x } + + type testData struct { + first *specs.LinuxResources + second *specs.LinuxResources + expected *specs.LinuxResources + } + + for _, testdata := range []testData{ + { + nil, + nil, + &specs.LinuxResources{CPU: &specs.LinuxCPU{}}, + }, + { + nil, + &specs.LinuxResources{}, + &specs.LinuxResources{CPU: &specs.LinuxCPU{}}, + }, + { + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(0), Period: getUintP(100000)}}, + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(20000), Period: getUintP(100000)}}, + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(20000), Period: getUintP(100000)}}, + }, + { + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(10000), Period: getUintP(0)}}, + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(20000), Period: getUintP(100000)}}, + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(20000), Period: getUintP(100000)}}, + }, + { + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(1000), Period: getUintP(2000)}}, + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(20000), Period: getUintP(100000)}}, + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(1400), Period: getUintP(2000)}}, + }, + } { + data, err := json.Marshal(&specs.Spec{ + Linux: &specs.Linux{ + Resources: testdata.first, + }, + }) + assert.Nil(t, err) + contA.Annotations[annotations.ConfigJSONKey] = string(data) + + data, err = json.Marshal(&specs.Spec{ + Linux: &specs.Linux{ + Resources: testdata.second, + }, + }) + assert.Nil(t, err) + contB.Annotations[annotations.ConfigJSONKey] = string(data) + + rc, err := s.mergeSpecResource() + assert.Nil(t, err) + assert.True(t, reflect.DeepEqual(rc, testdata.expected), "should be equal, got: %#v, expected: %#v", rc, testdata.expected) + } +} + +func TestSetupCgroups(t *testing.T) { + if os.Geteuid() != 0 { + t.Skip("Test disabled as requires root privileges") + } + + s := &Sandbox{ + id: "test-sandbox", + hypervisor: &mockHypervisor{}, + config: &SandboxConfig{ + Containers: []ContainerConfig{ + { + ID: "containerA", + Annotations: make(map[string]string), + }, + { + ID: "containerA", + Annotations: make(map[string]string), + }, + }, + }, + } + + contA := s.config.Containers[0] + contB := s.config.Containers[1] + + getIntP := func(x int64) *int64 { return &x } + getUintP := func(x uint64) *uint64 { return &x } + + data, err := json.Marshal(&specs.Spec{ + Linux: &specs.Linux{ + Resources: &specs.LinuxResources{ + CPU: &specs.LinuxCPU{ + Quota: getIntP(5000), + Period: getUintP(10000), + }, + }, + }, + }) + assert.Nil(t, err) + contA.Annotations[annotations.ConfigJSONKey] = string(data) + + data, err = json.Marshal(&specs.Spec{ + Linux: &specs.Linux{ + Resources: &specs.LinuxResources{ + CPU: &specs.LinuxCPU{ + Quota: getIntP(10000), + Period: getUintP(40000), + }, + }, + }, + }) + assert.Nil(t, err) + contB.Annotations[annotations.ConfigJSONKey] = string(data) + + err = s.newCgroups() + assert.Nil(t, err, "failed to create cgroups") + + defer s.destroyCgroups() + + // test if function works without error + err = s.setupCgroups() + assert.Nil(t, err, "setup host cgroup failed") + + // test if the quota and period value are written into cgroup files + cpu, err := getCgroupDestination("cpu") + assert.Nil(t, err, "failed to get cpu cgroup path") + assert.NotEqual(t, "", cpu, "cpu cgroup value can't be empty") + + parentDir := filepath.Join(cpu, defaultCgroupParent, "test-sandbox", "vcpu") + quotaFile := filepath.Join(parentDir, "cpu.cfs_quota_us") + periodFile := filepath.Join(parentDir, "cpu.cfs_period_us") + + expectedQuota := "7500\n" + expectedPeriod := "10000\n" + + fquota, err := os.Open(quotaFile) + assert.Nil(t, err, "open file %q failed", quotaFile) + defer fquota.Close() + data, err = ioutil.ReadAll(fquota) + assert.Nil(t, err) + assert.Equal(t, expectedQuota, string(data), "failed to get expected cfs_quota") + + fperiod, err := os.Open(periodFile) + assert.Nil(t, err, "open file %q failed", periodFile) + defer fperiod.Close() + data, err = ioutil.ReadAll(fperiod) + assert.Nil(t, err) + assert.Equal(t, expectedPeriod, string(data), "failed to get expected cfs_period") +} diff --git a/virtcontainers/hypervisor.go b/virtcontainers/hypervisor.go index d94591ff7..5e143fe46 100644 --- a/virtcontainers/hypervisor.go +++ b/virtcontainers/hypervisor.go @@ -252,6 +252,10 @@ type HypervisorConfig struct { DisableVhostNet bool } +type threadIDs struct { + vcpus []int +} + func (conf *HypervisorConfig) checkTemplateConfig() error { if conf.BootToBeTemplate && conf.BootFromTemplate { return fmt.Errorf("Cannot set both 'to be' and 'from' vm tempate") @@ -571,4 +575,5 @@ type hypervisor interface { disconnect() capabilities() capabilities hypervisorConfig() HypervisorConfig + getThreadIDs() (*threadIDs, error) } diff --git a/virtcontainers/mock_hypervisor.go b/virtcontainers/mock_hypervisor.go index ca9c29d9f..de492f003 100644 --- a/virtcontainers/mock_hypervisor.go +++ b/virtcontainers/mock_hypervisor.go @@ -5,7 +5,10 @@ package virtcontainers -import "context" +import ( + "context" + "os" +) type mockHypervisor struct { } @@ -86,3 +89,8 @@ func (m *mockHypervisor) getSandboxConsole(sandboxID string) (string, error) { func (m *mockHypervisor) disconnect() { } + +func (m *mockHypervisor) getThreadIDs() (*threadIDs, error) { + vcpus := []int{os.Getpid()} + return &threadIDs{vcpus}, nil +} diff --git a/virtcontainers/qemu.go b/virtcontainers/qemu.go index 3c93a4f64..76ac46733 100644 --- a/virtcontainers/qemu.go +++ b/virtcontainers/qemu.go @@ -1293,3 +1293,27 @@ func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8) govmmQemu return memory } + +func (q *qemu) getThreadIDs() (*threadIDs, error) { + span, _ := q.trace("getThreadIDs") + defer span.Finish() + + err := q.qmpSetup() + if err != nil { + return nil, err + } + + cpuInfos, err := q.qmpMonitorCh.qmp.ExecQueryCpus(q.qmpMonitorCh.ctx) + if err != nil { + q.Logger().WithError(err).Error("failed to query cpu infos") + return nil, err + } + + var tid threadIDs + for _, i := range cpuInfos { + if i.ThreadID > 0 { + tid.vcpus = append(tid.vcpus, i.ThreadID) + } + } + return &tid, nil +} diff --git a/virtcontainers/sandbox.go b/virtcontainers/sandbox.go index e72e3f102..7cffa51f6 100644 --- a/virtcontainers/sandbox.go +++ b/virtcontainers/sandbox.go @@ -30,9 +30,11 @@ import ( "github.com/vishvananda/netlink" ) -// vmStartTimeout represents the time in seconds a sandbox can wait before -// to consider the VM starting operation failed. -const vmStartTimeout = 10 +const ( + // vmStartTimeout represents the time in seconds a sandbox can wait before + // to consider the VM starting operation failed. + vmStartTimeout = 10 +) // stateString is a string representing a sandbox state. type stateString string @@ -491,6 +493,8 @@ type Sandbox struct { stateful bool ctx context.Context + + cgroup *sandboxCgroups } // ID returns the sandbox identifier string. @@ -861,6 +865,11 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor return nil, err } + // create new cgroup for sandbox + if err := s.newCgroups(); err != nil { + return nil, err + } + return s, nil } @@ -978,6 +987,12 @@ func (s *Sandbox) Delete() error { } } + // destroy sandbox cgroup + if err := s.destroyCgroups(); err != nil { + // continue the removal process even cgroup failed to destroy + s.Logger().WithError(err).Error("failed to destroy cgroup") + } + globalSandboxList.removeSandbox(s.id) if s.monitor != nil { @@ -1283,6 +1298,10 @@ func (s *Sandbox) CreateContainer(contConfig ContainerConfig) (VCContainer, erro return nil, err } + // Setup host cgroups for new container + if err := s.setupCgroups(); err != nil { + return nil, err + } return c, nil }