mirror of
https://github.com/aljazceru/kata-containers.git
synced 2026-01-30 11:44:26 +01:00
This change introduces the `disable_guest_empty_dir` config option, which allows the user to change whether a Kubernetes emptyDir volume is created on the guest (the default, for performance reasons), or the host (necessary if you want to pass data from the host to a guest via an emptyDir). Fixes #2053 Signed-off-by: Evan Foster <efoster@adobe.com>
372 lines
11 KiB
Go
372 lines
11 KiB
Go
// Copyright (c) 2014,2015,2016 Docker, Inc.
|
|
// Copyright (c) 2017 Intel Corporation
|
|
// Copyright (c) 2018 HyperHQ Inc.
|
|
// Copyright (c) 2021 Adobe Inc.
|
|
//
|
|
// SPDX-License-Identifier: Apache-2.0
|
|
//
|
|
|
|
package containerdshim
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"os/user"
|
|
"path"
|
|
"path/filepath"
|
|
"strconv"
|
|
"syscall"
|
|
|
|
containerd_types "github.com/containerd/containerd/api/types"
|
|
"github.com/containerd/containerd/mount"
|
|
taskAPI "github.com/containerd/containerd/runtime/v2/task"
|
|
"github.com/containerd/typeurl"
|
|
"github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
|
|
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless"
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
"github.com/pkg/errors"
|
|
|
|
// only register the proto type
|
|
crioption "github.com/containerd/containerd/pkg/runtimeoptions/v1"
|
|
_ "github.com/containerd/containerd/runtime/linux/runctypes"
|
|
_ "github.com/containerd/containerd/runtime/v2/runc/options"
|
|
oldcrioption "github.com/containerd/cri-containerd/pkg/api/runtimeoptions/v1"
|
|
|
|
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
|
|
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
|
|
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
|
|
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
|
|
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/compatoci"
|
|
)
|
|
|
|
type startManagementServerFunc func(s *service, ctx context.Context, ociSpec *specs.Spec)
|
|
|
|
var defaultStartManagementServerFunc startManagementServerFunc = func(s *service, ctx context.Context, ociSpec *specs.Spec) {
|
|
go s.startManagementServer(ctx, ociSpec)
|
|
shimLog.Info("management server started")
|
|
}
|
|
|
|
func create(ctx context.Context, s *service, r *taskAPI.CreateTaskRequest) (*container, error) {
|
|
rootFs := vc.RootFs{}
|
|
if len(r.Rootfs) == 1 {
|
|
m := r.Rootfs[0]
|
|
rootFs.Source = m.Source
|
|
rootFs.Type = m.Type
|
|
rootFs.Options = m.Options
|
|
}
|
|
|
|
detach := !r.Terminal
|
|
ociSpec, bundlePath, err := loadSpec(r)
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
containerType, err := oci.ContainerType(*ociSpec)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
disableOutput := noNeedForOutput(detach, ociSpec.Process.Terminal)
|
|
rootfs := filepath.Join(r.Bundle, "rootfs")
|
|
|
|
runtimeConfig, err := loadRuntimeConfig(s, r, ociSpec.Annotations)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
switch containerType {
|
|
case vc.PodSandbox, vc.SingleContainer:
|
|
if s.sandbox != nil {
|
|
return nil, fmt.Errorf("cannot create another sandbox in sandbox: %s", s.sandbox.ID())
|
|
}
|
|
|
|
s.config = runtimeConfig
|
|
|
|
// create tracer
|
|
// This is the earliest location we can create the tracer because we must wait
|
|
// until the runtime config is loaded
|
|
jaegerConfig := &katatrace.JaegerConfig{
|
|
JaegerEndpoint: s.config.JaegerEndpoint,
|
|
JaegerUser: s.config.JaegerUser,
|
|
JaegerPassword: s.config.JaegerPassword,
|
|
}
|
|
_, err = katatrace.CreateTracer("kata", jaegerConfig)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// create root span
|
|
rootSpan, newCtx := katatrace.Trace(s.ctx, shimLog, "root span", shimTracingTags)
|
|
s.rootCtx = newCtx
|
|
defer rootSpan.End()
|
|
|
|
// create span
|
|
span, newCtx := katatrace.Trace(s.rootCtx, shimLog, "create", shimTracingTags)
|
|
s.ctx = newCtx
|
|
defer span.End()
|
|
|
|
// Sandbox sizing information *may* be provided in two scenarios:
|
|
// 1. The upper layer runtime (ie, containerd or crio) provide sandbox sizing information as an annotation
|
|
// in the 'sandbox container's' spec. This would typically be a scenario where as part of a create sandbox
|
|
// request the upper layer runtime receives this information as part of a pod, and makes it available to us
|
|
// for sizing purposes.
|
|
// 2. If this is not a sandbox infrastructure container, but instead a standalone single container (analogous to "docker run..."),
|
|
// then the container spec itself will contain appropriate sizing information for the entire sandbox (since it is
|
|
// a single container.
|
|
if containerType == vc.PodSandbox {
|
|
s.config.SandboxCPUs, s.config.SandboxMemMB = oci.CalculateSandboxSizing(ociSpec)
|
|
} else {
|
|
s.config.SandboxCPUs, s.config.SandboxMemMB = oci.CalculateContainerSizing(ociSpec)
|
|
}
|
|
|
|
if rootFs.Mounted, err = checkAndMount(s, r); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
defer func() {
|
|
if err != nil && rootFs.Mounted {
|
|
if err2 := mount.UnmountAll(rootfs, 0); err2 != nil {
|
|
shimLog.WithField("container-type", containerType).WithError(err2).Warn("failed to cleanup rootfs mount")
|
|
}
|
|
}
|
|
}()
|
|
|
|
katautils.HandleFactory(ctx, vci, s.config)
|
|
rootless.SetRootless(s.config.HypervisorConfig.Rootless)
|
|
if rootless.IsRootless() {
|
|
if err := configureNonRootHypervisor(s.config); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
// Pass service's context instead of local ctx to CreateSandbox(), since local
|
|
// ctx will be canceled after this rpc service call, but the sandbox will live
|
|
// across multiple rpc service calls.
|
|
//
|
|
sandbox, _, err := katautils.CreateSandbox(s.ctx, vci, *ociSpec, *s.config, rootFs, r.ID, bundlePath, "", disableOutput, false)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
s.sandbox = sandbox
|
|
pid, err := s.sandbox.GetHypervisorPid()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
s.hpid = uint32(pid)
|
|
|
|
if defaultStartManagementServerFunc != nil {
|
|
defaultStartManagementServerFunc(s, ctx, ociSpec)
|
|
}
|
|
|
|
case vc.PodContainer:
|
|
span, ctx := katatrace.Trace(s.ctx, shimLog, "create", shimTracingTags)
|
|
defer span.End()
|
|
|
|
if s.sandbox == nil {
|
|
return nil, fmt.Errorf("BUG: Cannot start the container, since the sandbox hasn't been created")
|
|
}
|
|
|
|
if rootFs.Mounted, err = checkAndMount(s, r); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
defer func() {
|
|
if err != nil && rootFs.Mounted {
|
|
if err2 := mount.UnmountAll(rootfs, 0); err2 != nil {
|
|
shimLog.WithField("container-type", containerType).WithError(err2).Warn("failed to cleanup rootfs mount")
|
|
}
|
|
}
|
|
}()
|
|
|
|
_, err = katautils.CreateContainer(ctx, s.sandbox, *ociSpec, rootFs, r.ID, bundlePath, "", disableOutput, runtimeConfig.DisableGuestEmptyDir)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
container, err := newContainer(s, r, containerType, ociSpec, rootFs.Mounted)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return container, nil
|
|
}
|
|
|
|
func loadSpec(r *taskAPI.CreateTaskRequest) (*specs.Spec, string, error) {
|
|
// Checks the MUST and MUST NOT from OCI runtime specification
|
|
bundlePath, err := validBundle(r.ID, r.Bundle)
|
|
if err != nil {
|
|
return nil, "", err
|
|
}
|
|
|
|
ociSpec, err := compatoci.ParseConfigJSON(bundlePath)
|
|
if err != nil {
|
|
return nil, "", err
|
|
}
|
|
|
|
return &ociSpec, bundlePath, nil
|
|
}
|
|
|
|
// Config override ordering(high to low):
|
|
// 1. podsandbox annotation
|
|
// 2. shimv2 create task option
|
|
// 3. environment
|
|
func loadRuntimeConfig(s *service, r *taskAPI.CreateTaskRequest, anno map[string]string) (*oci.RuntimeConfig, error) {
|
|
if s.config != nil {
|
|
return s.config, nil
|
|
}
|
|
configPath := oci.GetSandboxConfigPath(anno)
|
|
if configPath == "" && r.Options != nil {
|
|
v, err := typeurl.UnmarshalAny(r.Options)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
option, ok := v.(*crioption.Options)
|
|
// cri default runtime handler will pass a linux runc options,
|
|
// and we'll ignore it.
|
|
if ok {
|
|
configPath = option.ConfigPath
|
|
} else {
|
|
// Some versions of containerd, such as 1.4.3, and 1.4.4
|
|
// still rely on the runtime options coming from
|
|
// github.com/containerd/cri-containerd/pkg/api/runtimeoptions/v1
|
|
// Knowing that, instead of breaking compatibility with such
|
|
// versions, let's work this around on our side
|
|
oldOption, ok := v.(*oldcrioption.Options)
|
|
if ok {
|
|
configPath = oldOption.ConfigPath
|
|
}
|
|
}
|
|
}
|
|
|
|
// Try to get the config file from the env KATA_CONF_FILE
|
|
if configPath == "" {
|
|
configPath = os.Getenv("KATA_CONF_FILE")
|
|
}
|
|
|
|
_, runtimeConfig, err := katautils.LoadConfiguration(configPath, false)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// For the unit test, the config will be predefined
|
|
if s.config == nil {
|
|
s.config = &runtimeConfig
|
|
}
|
|
|
|
return &runtimeConfig, nil
|
|
}
|
|
|
|
func checkAndMount(s *service, r *taskAPI.CreateTaskRequest) (bool, error) {
|
|
if len(r.Rootfs) == 1 {
|
|
m := r.Rootfs[0]
|
|
|
|
// Plug the block backed rootfs directly instead of mounting it.
|
|
if katautils.IsBlockDevice(m.Source) && !s.config.HypervisorConfig.DisableBlockDeviceUse {
|
|
return false, nil
|
|
}
|
|
if m.Type == vc.NydusRootFSType {
|
|
// if kata + nydus, do not mount
|
|
return false, nil
|
|
}
|
|
}
|
|
rootfs := filepath.Join(r.Bundle, "rootfs")
|
|
if err := doMount(r.Rootfs, rootfs); err != nil {
|
|
return false, err
|
|
}
|
|
return true, nil
|
|
}
|
|
|
|
func doMount(mounts []*containerd_types.Mount, rootfs string) error {
|
|
if len(mounts) == 0 {
|
|
return nil
|
|
}
|
|
|
|
if _, err := os.Stat(rootfs); os.IsNotExist(err) {
|
|
if err := os.Mkdir(rootfs, 0711); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
for _, rm := range mounts {
|
|
m := &mount.Mount{
|
|
Type: rm.Type,
|
|
Source: rm.Source,
|
|
Options: rm.Options,
|
|
}
|
|
if err := m.Mount(rootfs); err != nil {
|
|
return errors.Wrapf(err, "failed to mount rootfs component %v", m)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func configureNonRootHypervisor(runtimeConfig *oci.RuntimeConfig) error {
|
|
userName, err := utils.CreateVmmUser()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer func() {
|
|
if err != nil {
|
|
if err2 := utils.RemoveVmmUser(userName); err2 != nil {
|
|
shimLog.WithField("userName", userName).WithError(err).Warn("failed to remove user")
|
|
}
|
|
}
|
|
}()
|
|
|
|
u, err := user.Lookup(userName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
uid, err := strconv.Atoi(u.Uid)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
gid, err := strconv.Atoi(u.Gid)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
runtimeConfig.HypervisorConfig.Uid = uint32(uid)
|
|
runtimeConfig.HypervisorConfig.Gid = uint32(gid)
|
|
|
|
userTmpDir := path.Join("/run/user/", fmt.Sprint(uid))
|
|
_, err = os.Stat(userTmpDir)
|
|
// Clean up the directory created by the previous run
|
|
if !os.IsNotExist(err) {
|
|
if err = os.RemoveAll(userTmpDir); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
if err = os.Mkdir(userTmpDir, vc.DirMode); err != nil {
|
|
return err
|
|
}
|
|
defer func() {
|
|
if err != nil {
|
|
if err = os.RemoveAll(userTmpDir); err != nil {
|
|
shimLog.WithField("userTmpDir", userTmpDir).WithError(err).Warn("failed to remove userTmpDir")
|
|
}
|
|
}
|
|
}()
|
|
if err = syscall.Chown(userTmpDir, uid, gid); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := os.Setenv("XDG_RUNTIME_DIR", userTmpDir); err != nil {
|
|
return err
|
|
}
|
|
|
|
info, err := os.Stat("/dev/kvm")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if stat, ok := info.Sys().(*syscall.Stat_t); ok {
|
|
// Add the kvm group to the hypervisor supplemental group so that the hypervisor process can access /dev/kvm
|
|
runtimeConfig.HypervisorConfig.Groups = append(runtimeConfig.HypervisorConfig.Groups, stat.Gid)
|
|
return nil
|
|
}
|
|
return fmt.Errorf("failed to get the gid of /dev/kvm")
|
|
}
|