As github.com/containerd/cgroups doesn't support scope units, which are essential in some cases, let's create the cgroups manually and load them through the cgroups API. This is currently done only when there's a single sandbox cgroup (sandbox_cgroup_only=true); otherwise we set it as a static cgroup path as before (until a proper solution for the overhead cgroup under systemd is suggested). Backport-from: #2959 Fixes: #2868 Signed-off-by: Snir Sheriber <ssheribe@redhat.com>
2322 lines · 64 KiB · Go
// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2020 Adobe Inc.
//
// SPDX-License-Identifier: Apache-2.0
//

package virtcontainers

import (
	"bufio"
	"bytes"
	"context"
	"fmt"
	"io"
	"math"
	"net"
	"os"
	"os/exec"
	"path/filepath"
	"sync"
	"syscall"

	"github.com/containernetworking/plugins/pkg/ns"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"github.com/vishvananda/netlink"

	"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/device/api"
	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/device/config"
	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/device/drivers"
	deviceManager "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/device/manager"
	exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental"
	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/persist"
	persistapi "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/persist/api"
	pbTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols"
	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc"
	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cgroups"
	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/compatoci"
	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset"
	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless"
	vcTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/types"
	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
)

// sandboxTracingTags defines tags for the trace span
var sandboxTracingTags = map[string]string{
	"source":    "runtime",
	"package":   "virtcontainers",
	"subsystem": "sandbox",
}

const (
	// VmStartTimeout represents the time in seconds a sandbox can wait
	// before considering the VM start operation failed.
	VmStartTimeout = 10

	// DirMode is the permission bits used for creating a directory
	DirMode = os.FileMode(0750) | os.ModeDir

	mkswapPath = "/sbin/mkswap"
	rwm        = "rwm"

	// When the Kata overhead threads (I/O, VMM, etc) are not
	// placed in the sandbox cgroup, they are moved to a specific,
	// unconstrained cgroup hierarchy.
	// Assuming the cgroup mount point is at /sys/fs/cgroup/, on a
	// cgroup v1 system, the Kata overhead memory cgroup will be at
	// /sys/fs/cgroup/memory/kata_overhead/$CGPATH where $CGPATH is
	// defined by the orchestrator.
	cgroupKataOverheadPath = "/kata_overhead/"
)

var (
	errSandboxNotRunning = errors.New("Sandbox not running")
)
|
|
|
|
// SandboxStatus describes a sandbox status.
|
|
type SandboxStatus struct {
|
|
ContainersStatus []ContainerStatus
|
|
|
|
// Annotations allow clients to store arbitrary values,
|
|
// for example to add additional status values required
|
|
// to support particular specifications.
|
|
Annotations map[string]string
|
|
|
|
ID string
|
|
Hypervisor HypervisorType
|
|
State types.SandboxState
|
|
HypervisorConfig HypervisorConfig
|
|
}
|
|
|
|
// SandboxStats describes a sandbox's stats
|
|
type SandboxStats struct {
|
|
CgroupStats CgroupStats
|
|
Cpus int
|
|
}
|
|
|
|
// SandboxConfig is a Sandbox configuration.
|
|
type SandboxConfig struct {
|
|
// Volumes is a list of shared volumes between the host and the Sandbox.
|
|
Volumes []types.Volume
|
|
|
|
// Containers describe the list of containers within a Sandbox.
|
|
// This list can be empty and populated by adding containers
|
|
// to the Sandbox a posteriori.
|
|
//TODO: this should be a map to avoid duplicated containers
|
|
Containers []ContainerConfig
|
|
|
|
// SandboxBindMounts - list of paths to mount into guest
|
|
SandboxBindMounts []string
|
|
|
|
// Experimental features enabled
|
|
Experimental []exp.Feature
|
|
|
|
// Annotations keys must be unique strings and must be name-spaced
|
|
// with e.g. reverse domain notation (org.clearlinux.key).
|
|
Annotations map[string]string
|
|
|
|
ID string
|
|
|
|
Hostname string
|
|
|
|
HypervisorType HypervisorType
|
|
|
|
AgentConfig KataAgentConfig
|
|
|
|
NetworkConfig NetworkConfig
|
|
|
|
HypervisorConfig HypervisorConfig
|
|
|
|
ShmSize uint64
|
|
|
|
VfioMode config.VFIOModeType
|
|
|
|
// SharePidNs sets all containers to share the same sandbox level pid namespace.
|
|
SharePidNs bool
|
|
|
|
// SystemdCgroup enables systemd cgroup support
|
|
SystemdCgroup bool
|
|
|
|
// SandboxCgroupOnly enables cgroup only at podlevel in the host
|
|
SandboxCgroupOnly bool
|
|
|
|
DisableGuestSeccomp bool
|
|
}
|
|
|
|
// valid checks that the sandbox configuration is valid.
|
|
func (sandboxConfig *SandboxConfig) valid() bool {
|
|
if sandboxConfig.ID == "" {
|
|
return false
|
|
}
|
|
|
|
if _, err := NewHypervisor(sandboxConfig.HypervisorType); err != nil {
|
|
sandboxConfig.HypervisorType = QemuHypervisor
|
|
}
|
|
|
|
// validate experimental features
|
|
for _, f := range sandboxConfig.Experimental {
|
|
if exp.Get(f.Name) == nil {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// Sandbox is composed of a set of containers and a runtime environment.
|
|
// A Sandbox can be created, deleted, started, paused, stopped, listed, entered, and restored.
|
|
type Sandbox struct {
|
|
ctx context.Context
|
|
devManager api.DeviceManager
|
|
factory Factory
|
|
hypervisor Hypervisor
|
|
agent agent
|
|
store persistapi.PersistDriver
|
|
|
|
swapDevices []*config.BlockDrive
|
|
volumes []types.Volume
|
|
|
|
monitor *monitor
|
|
config *SandboxConfig
|
|
annotationsLock *sync.RWMutex
|
|
wg *sync.WaitGroup
|
|
sandboxCgroup *cgroups.Cgroup
|
|
overheadCgroup *cgroups.Cgroup
|
|
cw *consoleWatcher
|
|
|
|
containers map[string]*Container
|
|
|
|
id string
|
|
|
|
network Network
|
|
|
|
state types.SandboxState
|
|
|
|
networkNS NetworkNamespace
|
|
|
|
sync.Mutex
|
|
|
|
swapSizeBytes int64
|
|
shmSize uint64
|
|
swapDeviceNum uint
|
|
|
|
sharePidNs bool
|
|
seccompSupported bool
|
|
disableVMShutdown bool
|
|
}
|
|
|
|
// ID returns the sandbox identifier string.
|
|
func (s *Sandbox) ID() string {
|
|
return s.id
|
|
}
|
|
|
|
// Logger returns a logrus logger appropriate for logging Sandbox messages
|
|
func (s *Sandbox) Logger() *logrus.Entry {
|
|
return virtLog.WithFields(logrus.Fields{
|
|
"subsystem": "sandbox",
|
|
"sandbox": s.id,
|
|
})
|
|
}
|
|
|
|
// Annotations returns any annotation that a user could have stored through the sandbox.
|
|
func (s *Sandbox) Annotations(key string) (string, error) {
|
|
s.annotationsLock.RLock()
|
|
defer s.annotationsLock.RUnlock()
|
|
|
|
value, exist := s.config.Annotations[key]
|
|
if !exist {
|
|
return "", fmt.Errorf("Annotations key %s does not exist", key)
|
|
}
|
|
|
|
return value, nil
|
|
}
|
|
|
|
// SetAnnotations sets or adds annotations to the sandbox.
|
|
func (s *Sandbox) SetAnnotations(annotations map[string]string) error {
|
|
s.annotationsLock.Lock()
|
|
defer s.annotationsLock.Unlock()
|
|
|
|
for k, v := range annotations {
|
|
s.config.Annotations[k] = v
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// GetAnnotations returns sandbox's annotations
|
|
func (s *Sandbox) GetAnnotations() map[string]string {
|
|
s.annotationsLock.RLock()
|
|
defer s.annotationsLock.RUnlock()
|
|
|
|
return s.config.Annotations
|
|
}
|
|
|
|
// GetNetNs returns the network namespace of the current sandbox.
|
|
func (s *Sandbox) GetNetNs() string {
|
|
return s.networkNS.NetNsPath
|
|
}
|
|
|
|
// GetHypervisorPid returns the hypervisor's pid.
|
|
func (s *Sandbox) GetHypervisorPid() (int, error) {
|
|
pids := s.hypervisor.GetPids()
|
|
if len(pids) == 0 || pids[0] == 0 {
|
|
return -1, fmt.Errorf("Invalid hypervisor PID: %+v", pids)
|
|
}
|
|
|
|
return pids[0], nil
|
|
}
|
|
|
|
// GetAllContainers returns all containers.
|
|
func (s *Sandbox) GetAllContainers() []VCContainer {
|
|
ifa := make([]VCContainer, len(s.containers))
|
|
|
|
i := 0
|
|
for _, v := range s.containers {
|
|
ifa[i] = v
|
|
i++
|
|
}
|
|
|
|
return ifa
|
|
}
|
|
|
|
// GetContainer returns the container named by the containerID.
|
|
func (s *Sandbox) GetContainer(containerID string) VCContainer {
|
|
if c, ok := s.containers[containerID]; ok {
|
|
return c
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Release closes the agent connection.
|
|
func (s *Sandbox) Release(ctx context.Context) error {
|
|
s.Logger().Info("release sandbox")
|
|
if s.monitor != nil {
|
|
s.monitor.stop()
|
|
}
|
|
s.hypervisor.Disconnect(ctx)
|
|
return s.agent.disconnect(ctx)
|
|
}
|
|
|
|
// Status gets the status of the sandbox
|
|
func (s *Sandbox) Status() SandboxStatus {
|
|
var contStatusList []ContainerStatus
|
|
for _, c := range s.containers {
|
|
rootfs := c.config.RootFs.Source
|
|
if c.config.RootFs.Mounted {
|
|
rootfs = c.config.RootFs.Target
|
|
}
|
|
|
|
contStatusList = append(contStatusList, ContainerStatus{
|
|
ID: c.id,
|
|
State: c.state,
|
|
PID: c.process.Pid,
|
|
StartTime: c.process.StartTime,
|
|
RootFs: rootfs,
|
|
Annotations: c.config.Annotations,
|
|
})
|
|
}
|
|
|
|
return SandboxStatus{
|
|
ID: s.id,
|
|
State: s.state,
|
|
Hypervisor: s.config.HypervisorType,
|
|
HypervisorConfig: s.config.HypervisorConfig,
|
|
ContainersStatus: contStatusList,
|
|
Annotations: s.config.Annotations,
|
|
}
|
|
}
|
|
|
|
// Monitor returns an error channel for watchers to watch.
|
|
func (s *Sandbox) Monitor(ctx context.Context) (chan error, error) {
|
|
if s.state.State != types.StateRunning {
|
|
return nil, errSandboxNotRunning
|
|
}
|
|
|
|
s.Lock()
|
|
if s.monitor == nil {
|
|
s.monitor = newMonitor(s)
|
|
}
|
|
s.Unlock()
|
|
|
|
return s.monitor.newWatcher(ctx)
|
|
}
|
|
|
|
// WaitProcess waits on a container process and returns its exit code.
|
|
func (s *Sandbox) WaitProcess(ctx context.Context, containerID, processID string) (int32, error) {
|
|
if s.state.State != types.StateRunning {
|
|
return 0, errSandboxNotRunning
|
|
}
|
|
|
|
c, err := s.findContainer(containerID)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
return c.wait(ctx, processID)
|
|
}
|
|
|
|
// SignalProcess sends a signal to a process of a container when all is false.
|
|
// When all is true, it sends the signal to all processes of a container.
|
|
func (s *Sandbox) SignalProcess(ctx context.Context, containerID, processID string, signal syscall.Signal, all bool) error {
|
|
if s.state.State != types.StateRunning {
|
|
return errSandboxNotRunning
|
|
}
|
|
|
|
c, err := s.findContainer(containerID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return c.signalProcess(ctx, processID, signal, all)
|
|
}
|
|
|
|
// WinsizeProcess resizes the tty window of a process
|
|
func (s *Sandbox) WinsizeProcess(ctx context.Context, containerID, processID string, height, width uint32) error {
|
|
if s.state.State != types.StateRunning {
|
|
return errSandboxNotRunning
|
|
}
|
|
|
|
c, err := s.findContainer(containerID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return c.winsizeProcess(ctx, processID, height, width)
|
|
}
|
|
|
|
// IOStream returns stdin writer, stdout reader and stderr reader of a process
|
|
func (s *Sandbox) IOStream(containerID, processID string) (io.WriteCloser, io.Reader, io.Reader, error) {
|
|
if s.state.State != types.StateRunning {
|
|
return nil, nil, nil, errSandboxNotRunning
|
|
}
|
|
|
|
c, err := s.findContainer(containerID)
|
|
if err != nil {
|
|
return nil, nil, nil, err
|
|
}
|
|
|
|
return c.ioStream(processID)
|
|
}
|
|
|
|
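// createAssets adds the custom assets (kernel, image, initrd, etc.) referenced
// by the sandbox annotations to the hypervisor configuration, and checks that
// an image and an initrd are not both configured.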
func createAssets(ctx context.Context, sandboxConfig *SandboxConfig) error {
|
|
span, _ := katatrace.Trace(ctx, nil, "createAssets", sandboxTracingTags, map[string]string{"sandbox_id": sandboxConfig.ID})
|
|
defer span.End()
|
|
|
|
for _, name := range types.AssetTypes() {
|
|
a, err := types.NewAsset(sandboxConfig.Annotations, name)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := sandboxConfig.HypervisorConfig.AddCustomAsset(a); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
_, imageErr := sandboxConfig.HypervisorConfig.assetPath(types.ImageAsset)
|
|
_, initrdErr := sandboxConfig.HypervisorConfig.assetPath(types.InitrdAsset)
|
|
|
|
if imageErr != nil && initrdErr != nil {
|
|
return fmt.Errorf("%s and %s cannot be both set", types.ImageAsset, types.InitrdAsset)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
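// getAndStoreGuestDetails queries the agent for guest details (memory block
// size, memory hotplug probe support, seccomp support) and caches them in the
// sandbox state.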
func (s *Sandbox) getAndStoreGuestDetails(ctx context.Context) error {
|
|
guestDetailRes, err := s.agent.getGuestDetails(ctx, &grpc.GuestDetailsRequest{
|
|
MemBlockSize: true,
|
|
MemHotplugProbe: true,
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if guestDetailRes != nil {
|
|
s.state.GuestMemoryBlockSizeMB = uint32(guestDetailRes.MemBlockSizeBytes >> 20)
|
|
if guestDetailRes.AgentDetails != nil {
|
|
s.seccompSupported = guestDetailRes.AgentDetails.SupportsSeccomp
|
|
}
|
|
s.state.GuestMemoryHotplugProbe = guestDetailRes.SupportMemHotplugProbe
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// createSandbox creates a sandbox from a sandbox description, the containers list, the hypervisor
|
|
// and the agent passed through the Config structure.
|
|
// It will create and store the sandbox structure, and then ask the hypervisor
|
|
// to physically create that sandbox i.e. starts a VM for that sandbox to eventually
|
|
// be started.
|
|
func createSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factory) (*Sandbox, error) {
|
|
span, ctx := katatrace.Trace(ctx, nil, "createSandbox", sandboxTracingTags, map[string]string{"sandbox_id": sandboxConfig.ID})
|
|
defer span.End()
|
|
|
|
if err := createAssets(ctx, &sandboxConfig); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
s, err := newSandbox(ctx, sandboxConfig, factory)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if len(s.config.Experimental) != 0 {
|
|
s.Logger().WithField("features", s.config.Experimental).Infof("Enable experimental features")
|
|
}
|
|
|
|
// Sandbox state has been loaded from storage.
|
|
// If the State is not empty, this is a re-creation, i.e.
|
|
// we don't need to talk to the guest's agent, but only
|
|
// want to create the sandbox and its containers in memory.
|
|
if s.state.State != "" {
|
|
return s, nil
|
|
}
|
|
|
|
// Below code path is called only during create, because of earlier Check.
|
|
if err := s.agent.createSandbox(ctx, s); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Set sandbox state
|
|
if err := s.setSandboxState(types.StateReady); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return s, nil
|
|
}
|
|
|
|
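// newSandbox builds the Sandbox object, its device manager and cgroups, and
// asks the hypervisor to create (but not start) the VM.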
func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factory) (sb *Sandbox, retErr error) {
|
|
span, ctx := katatrace.Trace(ctx, nil, "newSandbox", sandboxTracingTags, map[string]string{"sandbox_id": sandboxConfig.ID})
|
|
defer span.End()
|
|
|
|
if !sandboxConfig.valid() {
|
|
return nil, fmt.Errorf("Invalid sandbox configuration")
|
|
}
|
|
|
|
// create agent instance
|
|
agent := getNewAgentFunc(ctx)()
|
|
|
|
hypervisor, err := NewHypervisor(sandboxConfig.HypervisorType)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
s := &Sandbox{
|
|
id: sandboxConfig.ID,
|
|
factory: factory,
|
|
hypervisor: hypervisor,
|
|
agent: agent,
|
|
config: &sandboxConfig,
|
|
volumes: sandboxConfig.Volumes,
|
|
containers: map[string]*Container{},
|
|
state: types.SandboxState{BlockIndexMap: make(map[int]struct{})},
|
|
annotationsLock: &sync.RWMutex{},
|
|
wg: &sync.WaitGroup{},
|
|
shmSize: sandboxConfig.ShmSize,
|
|
sharePidNs: sandboxConfig.SharePidNs,
|
|
networkNS: NetworkNamespace{NetNsPath: sandboxConfig.NetworkConfig.NetNSPath},
|
|
ctx: ctx,
|
|
swapDeviceNum: 0,
|
|
swapSizeBytes: 0,
|
|
swapDevices: []*config.BlockDrive{},
|
|
}
|
|
|
|
hypervisor.setSandbox(s)
|
|
|
|
if s.store, err = persist.GetDriver(); err != nil || s.store == nil {
|
|
return nil, fmt.Errorf("failed to get fs persist driver: %v", err)
|
|
}
|
|
|
|
defer func() {
|
|
if retErr != nil {
|
|
s.Logger().WithError(retErr).Error("Create new sandbox failed")
|
|
s.store.Destroy(s.id)
|
|
}
|
|
}()
|
|
|
|
spec := s.GetPatchedOCISpec()
|
|
if spec != nil && spec.Process.SelinuxLabel != "" {
|
|
sandboxConfig.HypervisorConfig.SELinuxProcessLabel = spec.Process.SelinuxLabel
|
|
}
|
|
|
|
s.devManager = deviceManager.NewDeviceManager(sandboxConfig.HypervisorConfig.BlockDeviceDriver,
|
|
sandboxConfig.HypervisorConfig.EnableVhostUserStore,
|
|
sandboxConfig.HypervisorConfig.VhostUserStorePath, nil)
|
|
|
|
// Create the sandbox cgroups
|
|
if err := s.createCgroups(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Ignore the error. Restore can fail for a new sandbox
|
|
if err := s.Restore(); err != nil {
|
|
s.Logger().WithError(err).Debug("restore sandbox failed")
|
|
}
|
|
|
|
// store doesn't require hypervisor to be stored immediately
|
|
if err = s.hypervisor.CreateVM(ctx, s.id, s.networkNS, &sandboxConfig.HypervisorConfig); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if s.disableVMShutdown, err = s.agent.init(ctx, s, sandboxConfig.AgentConfig); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return s, nil
|
|
}
|
|
|
|
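// createCgroups creates the sandbox cgroup and, unless SandboxCgroupOnly is
// set, an unconstrained overhead cgroup for the non-vCPU Kata threads.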
func (s *Sandbox) createCgroups() error {
|
|
var err error
|
|
cgroupPath := ""
|
|
|
|
// Do not change current cgroup configuration.
|
|
// Create a spec without constraints
|
|
resources := specs.LinuxResources{}
|
|
|
|
if s.config == nil {
|
|
return fmt.Errorf("Could not create cgroup manager: empty sandbox configuration")
|
|
}
|
|
|
|
spec := s.GetPatchedOCISpec()
|
|
if spec != nil && spec.Linux != nil {
|
|
cgroupPath = spec.Linux.CgroupsPath
|
|
|
|
// Kata relies on the cgroup parent created and configured by the container
|
|
// engine by default. The exception is for devices whitelist as well as sandbox-level
|
|
// CPUSet.
|
|
// For the sandbox cgroups we create and manage, rename the base of the cgroup path to
|
|
// include "kata_"
|
|
if !cgroups.IsSystemdCgroup(cgroupPath) { // don't add prefix when cgroups are managed by systemd
|
|
cgroupPath, err = cgroups.RenameCgroupPath(cgroupPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
if spec.Linux.Resources != nil {
|
|
resources.Devices = spec.Linux.Resources.Devices
|
|
|
|
intptr := func(i int64) *int64 { return &i }
// Determine if devices /dev/null and /dev/urandom exist, and add them if they don't.
// Compare the dereferenced major/minor numbers: comparing against the fresh
// pointers returned by intptr would never match.
nullDeviceExist := false
urandomDeviceExist := false
for _, device := range resources.Devices {
if device.Type == "c" && device.Major != nil && *device.Major == 1 && device.Minor != nil && *device.Minor == 3 {
nullDeviceExist = true
}

if device.Type == "c" && device.Major != nil && *device.Major == 1 && device.Minor != nil && *device.Minor == 9 {
urandomDeviceExist = true
}
}
|
|
|
|
if !nullDeviceExist {
|
|
// "/dev/null"
|
|
resources.Devices = append(resources.Devices, []specs.LinuxDeviceCgroup{
|
|
{Type: "c", Major: intptr(1), Minor: intptr(3), Access: rwm, Allow: true},
|
|
}...)
|
|
}
|
|
if !urandomDeviceExist {
|
|
// "/dev/urandom"
|
|
resources.Devices = append(resources.Devices, []specs.LinuxDeviceCgroup{
|
|
{Type: "c", Major: intptr(1), Minor: intptr(9), Access: rwm, Allow: true},
|
|
}...)
|
|
}
|
|
|
|
if spec.Linux.Resources.CPU != nil {
|
|
resources.CPU = &specs.LinuxCPU{
|
|
Cpus: spec.Linux.Resources.CPU.Cpus,
|
|
}
|
|
}
|
|
}
|
|
|
|
//TODO: in Docker or Podman use case, it is reasonable to set a constraint. Need to add a flag
|
|
// to allow users to configure Kata to constrain CPUs and Memory in this alternative
|
|
// scenario. See https://github.com/kata-containers/runtime/issues/2811
|
|
}
|
|
|
|
if s.devManager != nil {
|
|
for _, d := range s.devManager.GetAllDevices() {
|
|
dev, err := cgroups.DeviceToLinuxDevice(d.GetHostPath())
|
|
if err != nil {
|
|
s.Logger().WithError(err).WithField("device", d.GetHostPath()).Warn("Could not add device to sandbox resources")
|
|
continue
|
|
}
|
|
resources.Devices = append(resources.Devices, dev)
|
|
}
|
|
}
|
|
|
|
// Create the sandbox cgroup.
|
|
// Depending on the SandboxCgroupOnly value, this cgroup
|
|
// will either hold all the pod threads (SandboxCgroupOnly is true)
|
|
// or only the virtual CPU ones (SandboxCgroupOnly is false).
|
|
s.sandboxCgroup, err = cgroups.NewSandboxCgroup(cgroupPath, &resources, s.config.SandboxCgroupOnly)
|
|
if err != nil {
|
|
return fmt.Errorf("Could not create the sandbox cgroup %v", err)
|
|
}
|
|
|
|
// Now that the sandbox cgroup is created, we can set the state cgroup root paths.
|
|
s.state.SandboxCgroupPath = s.sandboxCgroup.Path()
|
|
s.state.OverheadCgroupPath = ""
|
|
|
|
if s.config.SandboxCgroupOnly {
|
|
s.overheadCgroup = nil
|
|
} else {
|
|
// The shim configuration is requesting that we do not put all threads
|
|
// into the sandbox cgroup.
|
|
// We're creating an overhead cgroup, with no constraints. Everything but
|
|
// the vCPU threads will eventually make it there.
|
|
overheadCgroup, err := cgroups.NewCgroup(fmt.Sprintf("/%s/%s", cgroupKataOverheadPath, s.id), &specs.LinuxResources{})
|
|
// TODO: support systemd cgroups overhead cgroup
|
|
// https://github.com/kata-containers/kata-containers/issues/2963
|
|
if err != nil {
|
|
return err
|
|
}
|
|
s.overheadCgroup = overheadCgroup
|
|
s.state.OverheadCgroupPath = s.overheadCgroup.Path()
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// storeSandbox stores a sandbox config.
|
|
func (s *Sandbox) storeSandbox(ctx context.Context) error {
|
|
span, _ := katatrace.Trace(ctx, s.Logger(), "storeSandbox", sandboxTracingTags, map[string]string{"sandbox_id": s.id})
|
|
defer span.End()
|
|
|
|
// flush data to storage
|
|
if err := s.Save(); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
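// rwLockSandbox takes an exclusive lock on the sandbox storage and returns
// the corresponding unlock function.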
func rwLockSandbox(sandboxID string) (func() error, error) {
|
|
store, err := persist.GetDriver()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get fs persist driver: %v", err)
|
|
}
|
|
|
|
return store.Lock(sandboxID, true)
|
|
}
|
|
|
|
// findContainer returns a container from the containers list held by the
|
|
// sandbox structure, based on a container ID.
|
|
func (s *Sandbox) findContainer(containerID string) (*Container, error) {
|
|
if s == nil {
|
|
return nil, vcTypes.ErrNeedSandbox
|
|
}
|
|
|
|
if containerID == "" {
|
|
return nil, vcTypes.ErrNeedContainerID
|
|
}
|
|
|
|
if c, ok := s.containers[containerID]; ok {
|
|
return c, nil
|
|
}
|
|
|
|
return nil, errors.Wrapf(vcTypes.ErrNoSuchContainer, "Could not find the container %q from the sandbox %q containers list",
|
|
containerID, s.id)
|
|
}
|
|
|
|
// removeContainer removes a container from the containers list held by the
|
|
// sandbox structure, based on a container ID.
|
|
func (s *Sandbox) removeContainer(containerID string) error {
|
|
if s == nil {
|
|
return vcTypes.ErrNeedSandbox
|
|
}
|
|
|
|
if containerID == "" {
|
|
return vcTypes.ErrNeedContainerID
|
|
}
|
|
|
|
if _, ok := s.containers[containerID]; !ok {
|
|
return errors.Wrapf(vcTypes.ErrNoSuchContainer, "Could not remove the container %q from the sandbox %q containers list",
|
|
containerID, s.id)
|
|
}
|
|
|
|
delete(s.containers, containerID)
|
|
|
|
return nil
|
|
}
|
|
|
|
// Delete deletes an already created sandbox.
|
|
// The VM in which the sandbox is running will be shut down.
|
|
func (s *Sandbox) Delete(ctx context.Context) error {
|
|
if s.state.State != types.StateReady &&
|
|
s.state.State != types.StatePaused &&
|
|
s.state.State != types.StateStopped {
|
|
return fmt.Errorf("Sandbox not ready, paused or stopped, impossible to delete")
|
|
}
|
|
|
|
for _, c := range s.containers {
|
|
if err := c.delete(ctx); err != nil {
|
|
s.Logger().WithError(err).WithField("container", c.id).Debug("failed to delete container")
|
|
}
|
|
}
|
|
|
|
if !rootless.IsRootless() {
|
|
if err := s.cgroupsDelete(); err != nil {
|
|
s.Logger().WithError(err).Error("failed to Cleanup cgroups")
|
|
}
|
|
}
|
|
|
|
if s.monitor != nil {
|
|
s.monitor.stop()
|
|
}
|
|
|
|
if err := s.hypervisor.Cleanup(ctx); err != nil {
|
|
s.Logger().WithError(err).Error("failed to Cleanup hypervisor")
|
|
}
|
|
|
|
s.agent.cleanup(ctx, s)
|
|
|
|
return s.store.Destroy(s.id)
|
|
}
|
|
|
|
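// startNetworkMonitor starts the netmon process inside the sandbox network
// namespace and records its PID.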
func (s *Sandbox) startNetworkMonitor(ctx context.Context) error {
|
|
span, ctx := katatrace.Trace(ctx, s.Logger(), "startNetworkMonitor", sandboxTracingTags, map[string]string{"sandbox_id": s.id})
|
|
defer span.End()
|
|
|
|
binPath, err := os.Executable()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
logLevel := "info"
|
|
if s.config.NetworkConfig.NetmonConfig.Debug {
|
|
logLevel = "debug"
|
|
}
|
|
|
|
params := netmonParams{
|
|
netmonPath: s.config.NetworkConfig.NetmonConfig.Path,
|
|
debug: s.config.NetworkConfig.NetmonConfig.Debug,
|
|
logLevel: logLevel,
|
|
runtime: binPath,
|
|
sandboxID: s.id,
|
|
}
|
|
|
|
return s.network.Run(ctx, s.networkNS.NetNsPath, func() error {
|
|
pid, err := startNetmon(params)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
s.networkNS.NetmonPID = pid
|
|
|
|
return nil
|
|
})
|
|
}
|
|
|
|
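// createNetwork adds the network endpoints to the sandbox network namespace.
// When a VM factory is used, this step is skipped and interfaces are
// hotplugged after the VM is started.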
func (s *Sandbox) createNetwork(ctx context.Context) error {
|
|
if s.config.NetworkConfig.DisableNewNetNs ||
|
|
s.config.NetworkConfig.NetNSPath == "" {
|
|
return nil
|
|
}
|
|
|
|
span, ctx := katatrace.Trace(ctx, s.Logger(), "createNetwork", sandboxTracingTags, map[string]string{"sandbox_id": s.id})
|
|
defer span.End()
|
|
|
|
s.networkNS = NetworkNamespace{
|
|
NetNsPath: s.config.NetworkConfig.NetNSPath,
|
|
NetNsCreated: s.config.NetworkConfig.NetNsCreated,
|
|
}
|
|
|
|
katatrace.AddTags(span, "networkNS", s.networkNS, "NetworkConfig", s.config.NetworkConfig)
|
|
|
|
// In case there is a factory, network interfaces are hotplugged
|
|
// after vm is started.
|
|
if s.factory == nil {
|
|
// Add the network
|
|
endpoints, err := s.network.Add(ctx, &s.config.NetworkConfig, s, false)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
s.networkNS.Endpoints = endpoints
|
|
|
|
if s.config.NetworkConfig.NetmonConfig.Enable {
|
|
if err := s.startNetworkMonitor(ctx); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
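// postCreatedNetwork runs the post-add network hooks once the network has
// been created.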
func (s *Sandbox) postCreatedNetwork(ctx context.Context) error {
|
|
|
|
return s.network.PostAdd(ctx, &s.networkNS, s.factory != nil)
|
|
}
|
|
|
|
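// removeNetwork stops the network monitor if it is enabled and removes the
// sandbox network.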
func (s *Sandbox) removeNetwork(ctx context.Context) error {
|
|
span, ctx := katatrace.Trace(ctx, s.Logger(), "removeNetwork", sandboxTracingTags, map[string]string{"sandbox_id": s.id})
|
|
defer span.End()
|
|
|
|
if s.config.NetworkConfig.NetmonConfig.Enable {
|
|
if err := stopNetmon(s.networkNS.NetmonPID); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return s.network.Remove(ctx, &s.networkNS, s.hypervisor)
|
|
}
|
|
|
|
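// generateNetInfo converts an agent interface definition into the NetworkInfo
// structure used by the network code.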
func (s *Sandbox) generateNetInfo(inf *pbTypes.Interface) (NetworkInfo, error) {
|
|
hw, err := net.ParseMAC(inf.HwAddr)
|
|
if err != nil {
|
|
return NetworkInfo{}, err
|
|
}
|
|
|
|
var addrs []netlink.Addr
|
|
for _, addr := range inf.IPAddresses {
|
|
netlinkAddrStr := fmt.Sprintf("%s/%s", addr.Address, addr.Mask)
|
|
netlinkAddr, err := netlink.ParseAddr(netlinkAddrStr)
|
|
if err != nil {
|
|
return NetworkInfo{}, fmt.Errorf("could not parse %q: %v", netlinkAddrStr, err)
|
|
}
|
|
|
|
addrs = append(addrs, *netlinkAddr)
|
|
}
|
|
|
|
return NetworkInfo{
|
|
Iface: NetlinkIface{
|
|
LinkAttrs: netlink.LinkAttrs{
|
|
Name: inf.Name,
|
|
HardwareAddr: hw,
|
|
MTU: int(inf.Mtu),
|
|
},
|
|
Type: inf.Type,
|
|
},
|
|
Addrs: addrs,
|
|
}, nil
|
|
}
|
|
|
|
// AddInterface adds a new NIC to the sandbox.
|
|
func (s *Sandbox) AddInterface(ctx context.Context, inf *pbTypes.Interface) (*pbTypes.Interface, error) {
|
|
netInfo, err := s.generateNetInfo(inf)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
endpoint, err := createEndpoint(netInfo, len(s.networkNS.Endpoints), s.config.NetworkConfig.InterworkingModel, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
endpoint.SetProperties(netInfo)
|
|
if err := doNetNS(s.networkNS.NetNsPath, func(_ ns.NetNS) error {
|
|
s.Logger().WithField("endpoint-type", endpoint.Type()).Info("Hot attaching endpoint")
|
|
return endpoint.HotAttach(ctx, s.hypervisor)
|
|
}); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Update the sandbox storage
|
|
s.networkNS.Endpoints = append(s.networkNS.Endpoints, endpoint)
|
|
if err := s.Save(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Add network for vm
|
|
inf.PciPath = endpoint.PciPath().String()
|
|
return s.agent.updateInterface(ctx, inf)
|
|
}
|
|
|
|
// RemoveInterface removes a NIC from the sandbox.
|
|
func (s *Sandbox) RemoveInterface(ctx context.Context, inf *pbTypes.Interface) (*pbTypes.Interface, error) {
|
|
for i, endpoint := range s.networkNS.Endpoints {
|
|
if endpoint.HardwareAddr() == inf.HwAddr {
|
|
s.Logger().WithField("endpoint-type", endpoint.Type()).Info("Hot detaching endpoint")
|
|
if err := endpoint.HotDetach(ctx, s.hypervisor, s.networkNS.NetNsCreated, s.networkNS.NetNsPath); err != nil {
|
|
return inf, err
|
|
}
|
|
s.networkNS.Endpoints = append(s.networkNS.Endpoints[:i], s.networkNS.Endpoints[i+1:]...)
|
|
|
|
if err := s.Save(); err != nil {
|
|
return inf, err
|
|
}
|
|
|
|
break
|
|
}
|
|
}
|
|
return nil, nil
|
|
}
|
|
|
|
// ListInterfaces lists all nics and their configurations in the sandbox.
|
|
func (s *Sandbox) ListInterfaces(ctx context.Context) ([]*pbTypes.Interface, error) {
|
|
return s.agent.listInterfaces(ctx)
|
|
}
|
|
|
|
// UpdateRoutes updates the sandbox route table (e.g. for portmapping support).
|
|
func (s *Sandbox) UpdateRoutes(ctx context.Context, routes []*pbTypes.Route) ([]*pbTypes.Route, error) {
|
|
return s.agent.updateRoutes(ctx, routes)
|
|
}
|
|
|
|
// ListRoutes lists all routes and their configurations in the sandbox.
|
|
func (s *Sandbox) ListRoutes(ctx context.Context) ([]*pbTypes.Route, error) {
|
|
return s.agent.listRoutes(ctx)
|
|
}
|
|
|
|
const (
|
|
// unix socket type of console
|
|
consoleProtoUnix = "unix"
|
|
|
|
// pty type of console.
|
|
consoleProtoPty = "pty"
|
|
)
|
|
|
|
// console watcher is designed to monitor guest console output.
|
|
type consoleWatcher struct {
|
|
conn net.Conn
|
|
ptyConsole *os.File
|
|
proto string
|
|
consoleURL string
|
|
}
|
|
|
|
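// newConsoleWatcher returns a console watcher for the VM console URL and
// protocol reported by the hypervisor.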
func newConsoleWatcher(ctx context.Context, s *Sandbox) (*consoleWatcher, error) {
|
|
var (
|
|
err error
|
|
cw consoleWatcher
|
|
)
|
|
|
|
cw.proto, cw.consoleURL, err = s.hypervisor.GetVMConsole(ctx, s.id)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &cw, nil
|
|
}
|
|
|
|
// start the console watcher
|
|
func (cw *consoleWatcher) start(s *Sandbox) (err error) {
|
|
if cw.consoleWatched() {
|
|
return fmt.Errorf("console watcher is already watching sandbox %s", s.id)
|
|
}
|
|
|
|
var scanner *bufio.Scanner
|
|
|
|
switch cw.proto {
|
|
case consoleProtoUnix:
|
|
cw.conn, err = net.Dial("unix", cw.consoleURL)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
scanner = bufio.NewScanner(cw.conn)
|
|
case consoleProtoPty:
|
|
// read-only
|
|
cw.ptyConsole, _ = os.Open(cw.consoleURL)
|
|
scanner = bufio.NewScanner(cw.ptyConsole)
|
|
default:
|
|
return fmt.Errorf("unknown console proto %s", cw.proto)
|
|
}
|
|
|
|
go func() {
|
|
for scanner.Scan() {
|
|
s.Logger().WithFields(logrus.Fields{
|
|
"console-protocol": cw.proto,
|
|
"console-url": cw.consoleURL,
|
|
"sandbox": s.id,
|
|
"vmconsole": scanner.Text(),
|
|
}).Debug("reading guest console")
|
|
}
|
|
|
|
if err := scanner.Err(); err != nil {
|
|
if err == io.EOF {
|
|
s.Logger().Info("console watcher quits")
|
|
} else {
|
|
s.Logger().WithError(err).WithFields(logrus.Fields{
|
|
"console-protocol": cw.proto,
|
|
"console-url": cw.consoleURL,
|
|
"sandbox": s.id,
|
|
}).Error("Failed to read guest console logs")
|
|
}
|
|
}
|
|
}()
|
|
|
|
return nil
|
|
}
|
|
|
|
// consoleWatched reports whether the console watcher is already watching the VM console.
|
|
func (cw *consoleWatcher) consoleWatched() bool {
|
|
return cw.conn != nil || cw.ptyConsole != nil
|
|
}
|
|
|
|
// stop the console watcher.
|
|
func (cw *consoleWatcher) stop() {
|
|
if cw.conn != nil {
|
|
cw.conn.Close()
|
|
cw.conn = nil
|
|
}
|
|
|
|
if cw.ptyConsole != nil {
|
|
cw.ptyConsole.Close()
|
|
cw.ptyConsole = nil
|
|
}
|
|
}
|
|
|
|
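// addSwap creates a swap file of at least the requested size, hotplugs it
// into the VM as a block device and asks the agent to enable it as guest swap.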
func (s *Sandbox) addSwap(ctx context.Context, swapID string, size int64) (*config.BlockDrive, error) {
|
|
swapFile := filepath.Join(getSandboxPath(s.id), swapID)
|
|
|
|
swapFD, err := os.OpenFile(swapFile, os.O_CREATE, 0600)
|
|
if err != nil {
|
|
err = fmt.Errorf("failed to create swapfile %s: %s", swapFile, err.Error())
|
|
s.Logger().WithError(err).Error("addSwap")
|
|
return nil, err
|
|
}
|
|
swapFD.Close()
|
|
defer func() {
|
|
if err != nil {
|
|
os.Remove(swapFile)
|
|
}
|
|
}()
|
|
|
|
// Check the size
|
|
pagesize := os.Getpagesize()
|
|
// mkswap refuses areas smaller than 10 pages.
|
|
size = int64(math.Max(float64(size), float64(pagesize*10)))
|
|
// The swapfile needs a page to store its metadata
|
|
size += int64(pagesize)
|
|
|
|
err = os.Truncate(swapFile, size)
|
|
if err != nil {
|
|
err = fmt.Errorf("failed to truncate swapfile %s: %s", swapFile, err.Error())
|
|
s.Logger().WithError(err).Error("addSwap")
|
|
return nil, err
|
|
}
|
|
|
|
var outbuf, errbuf bytes.Buffer
|
|
cmd := exec.CommandContext(ctx, mkswapPath, swapFile)
|
|
cmd.Stdout = &outbuf
|
|
cmd.Stderr = &errbuf
|
|
err = cmd.Run()
|
|
if err != nil {
|
|
err = fmt.Errorf("mkswap of swapfile %s failed: %s (stdout: %s, stderr: %s)", swapFile, err.Error(), outbuf.String(), errbuf.String())
|
|
s.Logger().WithError(err).Error("addSwap")
|
|
return nil, err
|
|
}
|
|
|
|
blockDevice := &config.BlockDrive{
|
|
File: swapFile,
|
|
Format: "raw",
|
|
ID: swapID,
|
|
Swap: true,
|
|
}
|
|
_, err = s.hypervisor.HotplugAddDevice(ctx, blockDevice, BlockDev)
|
|
if err != nil {
|
|
err = fmt.Errorf("failed to add swapfile %s device to the VM: %s", swapFile, err.Error())
|
|
s.Logger().WithError(err).Error("addSwap")
|
|
return nil, err
|
|
}
|
|
defer func() {
|
|
if err != nil {
|
|
_, e := s.hypervisor.HotplugRemoveDevice(ctx, blockDevice, BlockDev)
|
|
if e != nil {
|
|
s.Logger().Errorf("failed to remove swapfile %s from the VM: %s", swapFile, e.Error())
|
|
}
|
|
}
|
|
}()
|
|
|
|
err = s.agent.addSwap(ctx, blockDevice.PCIPath)
|
|
if err != nil {
|
|
err = fmt.Errorf("agent failed to add swapfile %s (PCIPath %+v) to the VM: %s", swapFile, blockDevice.PCIPath, err.Error())
|
|
s.Logger().WithError(err).Error("addSwap")
|
|
return nil, err
|
|
}
|
|
|
|
s.Logger().Infof("added swapfile %s (size %d, PCIPath %+v) to the VM", swapFile, size, blockDevice.PCIPath)
|
|
|
|
return blockDevice, nil
|
|
}
|
|
|
|
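// removeSwap deletes the swap file backing the given block device.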
func (s *Sandbox) removeSwap(ctx context.Context, blockDevice *config.BlockDrive) error {
|
|
err := os.Remove(blockDevice.File)
|
|
if err != nil {
|
|
err = fmt.Errorf("failed to remove swapfile %s: %s", blockDevice.File, err.Error())
|
|
s.Logger().WithError(err).Error("removeSwap")
|
|
} else {
|
|
s.Logger().Infof("removed swapfile %s", blockDevice.File)
|
|
}
|
|
return err
|
|
}
|
|
|
|
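// setupSwap grows the guest swap space to sizeBytes by adding another swap
// device when the requested size exceeds the current one.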
func (s *Sandbox) setupSwap(ctx context.Context, sizeBytes int64) error {
|
|
if sizeBytes > s.swapSizeBytes {
|
|
dev, err := s.addSwap(ctx, fmt.Sprintf("swap%d", s.swapDeviceNum), sizeBytes-s.swapSizeBytes)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
s.swapDeviceNum += 1
|
|
s.swapSizeBytes = sizeBytes
|
|
s.swapDevices = append(s.swapDevices, dev)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
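// cleanSwap removes all swap files created for the sandbox.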
func (s *Sandbox) cleanSwap(ctx context.Context) {
|
|
for _, dev := range s.swapDevices {
|
|
err := s.removeSwap(ctx, dev)
|
|
if err != nil {
|
|
s.Logger().Warnf("failed to remove swap device %+v: %s", dev, err)
|
|
}
|
|
}
|
|
}
|
|
|
|
// startVM starts the VM.
|
|
func (s *Sandbox) startVM(ctx context.Context) (err error) {
|
|
span, ctx := katatrace.Trace(ctx, s.Logger(), "startVM", sandboxTracingTags, map[string]string{"sandbox_id": s.id})
|
|
defer span.End()
|
|
|
|
s.Logger().Info("Starting VM")
|
|
|
|
if s.config.HypervisorConfig.Debug {
|
|
// create console watcher
|
|
consoleWatcher, err := newConsoleWatcher(ctx, s)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
s.cw = consoleWatcher
|
|
}
|
|
|
|
defer func() {
|
|
if err != nil {
|
|
s.hypervisor.StopVM(ctx, false)
|
|
}
|
|
}()
|
|
|
|
if err := s.network.Run(ctx, s.networkNS.NetNsPath, func() error {
|
|
if s.factory != nil {
|
|
vm, err := s.factory.GetVM(ctx, VMConfig{
|
|
HypervisorType: s.config.HypervisorType,
|
|
HypervisorConfig: s.config.HypervisorConfig,
|
|
AgentConfig: s.config.AgentConfig,
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return vm.assignSandbox(s)
|
|
}
|
|
|
|
return s.hypervisor.StartVM(ctx, VmStartTimeout)
|
|
}); err != nil {
|
|
return err
|
|
}
|
|
|
|
// In case of vm factory, network interfaces are hotplugged
|
|
// after vm is started.
|
|
if s.factory != nil {
|
|
endpoints, err := s.network.Add(ctx, &s.config.NetworkConfig, s, true)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
s.networkNS.Endpoints = endpoints
|
|
|
|
if s.config.NetworkConfig.NetmonConfig.Enable {
|
|
if err := s.startNetworkMonitor(ctx); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
|
|
s.Logger().Info("VM started")
|
|
|
|
if s.cw != nil {
|
|
s.Logger().Debug("console watcher starts")
|
|
if err := s.cw.start(s); err != nil {
|
|
s.cw.stop()
|
|
return err
|
|
}
|
|
}
|
|
|
|
// Once the hypervisor is done starting the sandbox,
|
|
// we want to guarantee that it is manageable.
|
|
// For that we need to ask the agent to start the
|
|
// sandbox inside the VM.
|
|
if err := s.agent.startSandbox(ctx, s); err != nil {
|
|
return err
|
|
}
|
|
|
|
s.Logger().Info("Agent started in the sandbox")
|
|
|
|
defer func() {
|
|
if err != nil {
|
|
if e := s.agent.stopSandbox(ctx, s); e != nil {
|
|
s.Logger().WithError(e).WithField("sandboxid", s.id).Warning("Agent did not stop sandbox")
|
|
}
|
|
}
|
|
}()
|
|
|
|
return nil
|
|
}
|
|
|
|
// stopVM: stop the sandbox's VM
|
|
func (s *Sandbox) stopVM(ctx context.Context) error {
|
|
span, ctx := katatrace.Trace(ctx, s.Logger(), "stopVM", sandboxTracingTags, map[string]string{"sandbox_id": s.id})
|
|
defer span.End()
|
|
|
|
s.Logger().Info("Stopping sandbox in the VM")
|
|
if err := s.agent.stopSandbox(ctx, s); err != nil {
|
|
s.Logger().WithError(err).WithField("sandboxid", s.id).Warning("Agent did not stop sandbox")
|
|
}
|
|
|
|
s.Logger().Info("Stopping VM")
|
|
|
|
return s.hypervisor.StopVM(ctx, s.disableVMShutdown)
|
|
}
|
|
|
|
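// addContainer adds the container to the in-memory containers map, failing if
// a container with the same ID already exists.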
func (s *Sandbox) addContainer(c *Container) error {
|
|
if _, ok := s.containers[c.id]; ok {
|
|
return fmt.Errorf("Duplicated container: %s", c.id)
|
|
}
|
|
s.containers[c.id] = c
|
|
|
|
return nil
|
|
}
|
|
|
|
// CreateContainer creates a new container in the sandbox
|
|
// This should be called only when the sandbox is already created.
|
|
// It will add new container config to sandbox.config.Containers
|
|
func (s *Sandbox) CreateContainer(ctx context.Context, contConfig ContainerConfig) (VCContainer, error) {
|
|
// Update sandbox config to include the new container's config
|
|
s.config.Containers = append(s.config.Containers, contConfig)
|
|
|
|
var err error
|
|
|
|
defer func() {
|
|
if err != nil {
|
|
if len(s.config.Containers) > 0 {
|
|
// delete container config
|
|
s.config.Containers = s.config.Containers[:len(s.config.Containers)-1]
|
|
}
|
|
}
|
|
}()
|
|
|
|
// Create the container object, add devices to the sandbox's device-manager:
|
|
c, err := newContainer(ctx, s, &s.config.Containers[len(s.config.Containers)-1])
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// create and start the container
|
|
if err = c.create(ctx); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Add the container to the containers list in the sandbox.
|
|
if err = s.addContainer(c); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
defer func() {
|
|
// Rollback if error happens.
|
|
if err != nil {
|
|
logger := s.Logger().WithFields(logrus.Fields{"container": c.id, "sandbox": s.id, "rollback": true})
|
|
logger.WithError(err).Error("Cleaning up partially created container")
|
|
|
|
if errStop := c.stop(ctx, true); errStop != nil {
|
|
logger.WithError(errStop).Error("Could not stop container")
|
|
}
|
|
|
|
logger.Debug("Removing stopped container from sandbox store")
|
|
s.removeContainer(c.id)
|
|
}
|
|
}()
|
|
|
|
// The sandbox is responsible for updating the VM resources needed by its containers.
// Update resources after having added containers to the sandbox, since
// container status is required to know if more resources should be added.
|
|
if err = s.updateResources(ctx); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err = s.cgroupsUpdate(ctx); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err = s.storeSandbox(ctx); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return c, nil
|
|
}
|
|
|
|
// StartContainer starts a container in the sandbox
|
|
func (s *Sandbox) StartContainer(ctx context.Context, containerID string) (VCContainer, error) {
|
|
// Fetch the container.
|
|
c, err := s.findContainer(containerID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Start it.
|
|
if err = c.start(ctx); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err = s.storeSandbox(ctx); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
s.Logger().WithField("container", containerID).Info("Container is started")
|
|
|
|
// Update sandbox resources in case a stopped container
|
|
// is started
|
|
if err = s.updateResources(ctx); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return c, nil
|
|
}
|
|
|
|
// StopContainer stops a container in the sandbox
|
|
func (s *Sandbox) StopContainer(ctx context.Context, containerID string, force bool) (VCContainer, error) {
|
|
// Fetch the container.
|
|
c, err := s.findContainer(containerID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Stop it.
|
|
if err := c.stop(ctx, force); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err = s.storeSandbox(ctx); err != nil {
|
|
return nil, err
|
|
}
|
|
return c, nil
|
|
}
|
|
|
|
// KillContainer signals a container in the sandbox
|
|
func (s *Sandbox) KillContainer(ctx context.Context, containerID string, signal syscall.Signal, all bool) error {
|
|
// Fetch the container.
|
|
c, err := s.findContainer(containerID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Send a signal to the process.
|
|
err = c.kill(ctx, signal, all)
|
|
|
|
// SIGKILL should never fail otherwise it is
|
|
// impossible to clean things up.
|
|
if signal == syscall.SIGKILL {
|
|
return nil
|
|
}
|
|
|
|
return err
|
|
}
|
|
|
|
// DeleteContainer deletes a container from the sandbox
|
|
func (s *Sandbox) DeleteContainer(ctx context.Context, containerID string) (VCContainer, error) {
|
|
if containerID == "" {
|
|
return nil, vcTypes.ErrNeedContainerID
|
|
}
|
|
|
|
// Fetch the container.
|
|
c, err := s.findContainer(containerID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Delete it.
|
|
if err = c.delete(ctx); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Update sandbox config
|
|
for idx, contConfig := range s.config.Containers {
|
|
if contConfig.ID == containerID {
|
|
s.config.Containers = append(s.config.Containers[:idx], s.config.Containers[idx+1:]...)
|
|
break
|
|
}
|
|
}
|
|
|
|
// update the sandbox cgroup
|
|
if err = s.cgroupsUpdate(ctx); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err = s.storeSandbox(ctx); err != nil {
|
|
return nil, err
|
|
}
|
|
return c, nil
|
|
}
|
|
|
|
// StatusContainer gets the status of a container
|
|
func (s *Sandbox) StatusContainer(containerID string) (ContainerStatus, error) {
|
|
if containerID == "" {
|
|
return ContainerStatus{}, vcTypes.ErrNeedContainerID
|
|
}
|
|
|
|
if c, ok := s.containers[containerID]; ok {
|
|
rootfs := c.config.RootFs.Source
|
|
if c.config.RootFs.Mounted {
|
|
rootfs = c.config.RootFs.Target
|
|
}
|
|
|
|
return ContainerStatus{
|
|
ID: c.id,
|
|
State: c.state,
|
|
PID: c.process.Pid,
|
|
StartTime: c.process.StartTime,
|
|
RootFs: rootfs,
|
|
Annotations: c.config.Annotations,
|
|
}, nil
|
|
}
|
|
|
|
return ContainerStatus{}, vcTypes.ErrNoSuchContainer
|
|
}
|
|
|
|
// EnterContainer is the virtcontainers container command execution entry point.
|
|
// EnterContainer enters an already running container and runs a given command.
|
|
func (s *Sandbox) EnterContainer(ctx context.Context, containerID string, cmd types.Cmd) (VCContainer, *Process, error) {
|
|
// Fetch the container.
|
|
c, err := s.findContainer(containerID)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
|
|
// Enter it.
|
|
process, err := c.enter(ctx, cmd)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
|
|
return c, process, nil
|
|
}
|
|
|
|
// UpdateContainer updates a running container.
|
|
func (s *Sandbox) UpdateContainer(ctx context.Context, containerID string, resources specs.LinuxResources) error {
|
|
// Fetch the container.
|
|
c, err := s.findContainer(containerID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err = c.update(ctx, resources); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := s.cgroupsUpdate(ctx); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err = s.storeSandbox(ctx); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// StatsContainer returns the stats of a running container.
|
|
func (s *Sandbox) StatsContainer(ctx context.Context, containerID string) (ContainerStats, error) {
|
|
// Fetch the container.
|
|
c, err := s.findContainer(containerID)
|
|
if err != nil {
|
|
return ContainerStats{}, err
|
|
}
|
|
|
|
stats, err := c.stats(ctx)
|
|
if err != nil {
|
|
return ContainerStats{}, err
|
|
}
|
|
return *stats, nil
|
|
}
|
|
|
|
// Stats returns the stats of a running sandbox
|
|
func (s *Sandbox) Stats(ctx context.Context) (SandboxStats, error) {
|
|
|
|
metrics, err := s.sandboxCgroup.Stat()
|
|
if err != nil {
|
|
return SandboxStats{}, err
|
|
}
|
|
|
|
stats := SandboxStats{}
|
|
|
|
// TODO Do we want to aggregate the overhead cgroup stats to the sandbox ones?
|
|
stats.CgroupStats.CPUStats.CPUUsage.TotalUsage = metrics.CPU.Usage.Total
|
|
stats.CgroupStats.MemoryStats.Usage.Usage = metrics.Memory.Usage.Usage
|
|
tids, err := s.hypervisor.GetThreadIDs(ctx)
|
|
if err != nil {
|
|
return stats, err
|
|
}
|
|
stats.Cpus = len(tids.vcpus)
|
|
|
|
return stats, nil
|
|
}
|
|
|
|
// PauseContainer pauses a running container.
|
|
func (s *Sandbox) PauseContainer(ctx context.Context, containerID string) error {
|
|
// Fetch the container.
|
|
c, err := s.findContainer(containerID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Pause the container.
|
|
if err := c.pause(ctx); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err = s.storeSandbox(ctx); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// ResumeContainer resumes a paused container.
|
|
func (s *Sandbox) ResumeContainer(ctx context.Context, containerID string) error {
|
|
// Fetch the container.
|
|
c, err := s.findContainer(containerID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Resume the container.
|
|
if err := c.resume(ctx); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err = s.storeSandbox(ctx); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// createContainers registers all containers, creates the
// containers in the guest and starts one shim per container.
|
|
func (s *Sandbox) createContainers(ctx context.Context) error {
|
|
span, ctx := katatrace.Trace(ctx, s.Logger(), "createContainers", sandboxTracingTags, map[string]string{"sandbox_id": s.id})
|
|
defer span.End()
|
|
|
|
for i := range s.config.Containers {
|
|
|
|
c, err := newContainer(ctx, s, &s.config.Containers[i])
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if err := c.create(ctx); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := s.addContainer(c); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// Update resources after having added containers to the sandbox, since
// container status is required to know if more resources should be added.
|
|
if err := s.updateResources(ctx); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := s.cgroupsUpdate(ctx); err != nil {
|
|
return err
|
|
}
|
|
if err := s.storeSandbox(ctx); err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Start starts a sandbox. The containers that make up the sandbox
// will be started.
|
|
func (s *Sandbox) Start(ctx context.Context) error {
|
|
if err := s.state.ValidTransition(s.state.State, types.StateRunning); err != nil {
|
|
return err
|
|
}
|
|
|
|
prevState := s.state.State
|
|
|
|
if err := s.setSandboxState(types.StateRunning); err != nil {
|
|
return err
|
|
}
|
|
|
|
var startErr error
|
|
defer func() {
|
|
if startErr != nil {
|
|
s.setSandboxState(prevState)
|
|
}
|
|
}()
|
|
for _, c := range s.containers {
|
|
if startErr = c.start(ctx); startErr != nil {
|
|
return startErr
|
|
}
|
|
}
|
|
|
|
if err := s.storeSandbox(ctx); err != nil {
|
|
return err
|
|
}
|
|
|
|
s.Logger().Info("Sandbox is started")
|
|
|
|
return nil
|
|
}
|
|
|
|
// Stop stops a sandbox. The containers that make up the sandbox
// will be destroyed.
// When force is true, ignore guest-related stop failures.
|
|
func (s *Sandbox) Stop(ctx context.Context, force bool) error {
|
|
span, ctx := katatrace.Trace(ctx, s.Logger(), "Stop", sandboxTracingTags, map[string]string{"sandbox_id": s.id})
|
|
defer span.End()
|
|
|
|
if s.state.State == types.StateStopped {
|
|
s.Logger().Info("sandbox already stopped")
|
|
return nil
|
|
}
|
|
|
|
if err := s.state.ValidTransition(s.state.State, types.StateStopped); err != nil {
|
|
return err
|
|
}
|
|
|
|
for _, c := range s.containers {
|
|
if err := c.stop(ctx, force); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
if err := s.stopVM(ctx); err != nil && !force {
|
|
return err
|
|
}
|
|
|
|
// shutdown console watcher if exists
|
|
if s.cw != nil {
|
|
s.Logger().Debug("stop the console watcher")
|
|
s.cw.stop()
|
|
}
|
|
|
|
if err := s.setSandboxState(types.StateStopped); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Remove the network.
|
|
if err := s.removeNetwork(ctx); err != nil && !force {
|
|
return err
|
|
}
|
|
|
|
if err := s.storeSandbox(ctx); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Stop communicating with the agent.
|
|
if err := s.agent.disconnect(ctx); err != nil && !force {
|
|
return err
|
|
}
|
|
|
|
s.cleanSwap(ctx)
|
|
|
|
return nil
|
|
}
|
|
|
|
// setSandboxState sets the in-memory state of the sandbox.
|
|
func (s *Sandbox) setSandboxState(state types.StateString) error {
|
|
if state == "" {
|
|
return vcTypes.ErrNeedState
|
|
}
|
|
|
|
// update in-memory state
|
|
s.state.State = state
|
|
|
|
return nil
|
|
}
|
|
|
|
const maxBlockIndex = 65535
|
|
|
|
// getAndSetSandboxBlockIndex retrieves an unused sandbox block index from
|
|
// the BlockIndexMap and marks it as used. This index is used to maintain the
|
|
// index at which a block device is assigned to a container in the sandbox.
|
|
func (s *Sandbox) getAndSetSandboxBlockIndex() (int, error) {
|
|
currentIndex := -1
|
|
for i := 0; i < maxBlockIndex; i++ {
|
|
if _, ok := s.state.BlockIndexMap[i]; !ok {
|
|
currentIndex = i
|
|
break
|
|
}
|
|
}
|
|
if currentIndex == -1 {
|
|
return -1, errors.New("no available block index")
|
|
}
|
|
s.state.BlockIndexMap[currentIndex] = struct{}{}
|
|
|
|
return currentIndex, nil
|
|
}
|
|
|
|
// unsetSandboxBlockIndex deletes the current sandbox block index from BlockIndexMap.
|
|
// This is used to recover from failure while adding a block device.
|
|
func (s *Sandbox) unsetSandboxBlockIndex(index int) error {
|
|
var err error
|
|
original := index
|
|
delete(s.state.BlockIndexMap, index)
|
|
defer func() {
|
|
if err != nil {
|
|
s.state.BlockIndexMap[original] = struct{}{}
|
|
}
|
|
}()
|
|
|
|
return nil
|
|
}
|
|
|
|
// HotplugAddDevice is used to add a device to the sandbox.
// Sandbox implements the DeviceReceiver interface from device/api/interface.go.
|
|
func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devType config.DeviceType) error {
|
|
span, ctx := katatrace.Trace(ctx, s.Logger(), "HotplugAddDevice", sandboxTracingTags, map[string]string{"sandbox_id": s.id})
|
|
defer span.End()
|
|
|
|
if err := s.sandboxCgroup.AddDevice(device.GetHostPath()); err != nil {
|
|
s.Logger().WithError(err).WithField("device", device).
|
|
Warn("Could not add device to cgroup")
|
|
}
|
|
|
|
switch devType {
|
|
case config.DeviceVFIO:
|
|
vfioDevices, ok := device.GetDeviceInfo().([]*config.VFIODev)
|
|
if !ok {
|
|
return fmt.Errorf("device type mismatch, expect device type to be %s", devType)
|
|
}
|
|
|
|
// adding a group of VFIO devices
|
|
for _, dev := range vfioDevices {
|
|
if _, err := s.hypervisor.HotplugAddDevice(ctx, dev, VfioDev); err != nil {
|
|
s.Logger().
|
|
WithFields(logrus.Fields{
|
|
"sandbox": s.id,
|
|
"vfio-device-ID": dev.ID,
|
|
"vfio-device-BDF": dev.BDF,
|
|
}).WithError(err).Error("failed to hotplug VFIO device")
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
case config.DeviceBlock:
|
|
blockDevice, ok := device.(*drivers.BlockDevice)
|
|
if !ok {
|
|
return fmt.Errorf("device type mismatch, expect device type to be %s", devType)
|
|
}
|
|
_, err := s.hypervisor.HotplugAddDevice(ctx, blockDevice.BlockDrive, BlockDev)
|
|
return err
|
|
case config.VhostUserBlk:
|
|
vhostUserBlkDevice, ok := device.(*drivers.VhostUserBlkDevice)
|
|
if !ok {
|
|
return fmt.Errorf("device type mismatch, expect device type to be %s", devType)
|
|
}
|
|
_, err := s.hypervisor.HotplugAddDevice(ctx, vhostUserBlkDevice.VhostUserDeviceAttrs, VhostuserDev)
|
|
return err
|
|
case config.DeviceGeneric:
|
|
// TODO: what?
|
|
return nil
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// HotplugRemoveDevice is used to remove a device from the sandbox.
// Sandbox implements the DeviceReceiver interface from device/api/interface.go.
|
|
func (s *Sandbox) HotplugRemoveDevice(ctx context.Context, device api.Device, devType config.DeviceType) error {
|
|
defer func() {
|
|
if err := s.sandboxCgroup.RemoveDevice(device.GetHostPath()); err != nil {
|
|
s.Logger().WithError(err).WithField("device", device).
|
|
Warn("Could not remove device from cgroup")
|
|
}
|
|
}()
|
|
|
|
switch devType {
|
|
case config.DeviceVFIO:
|
|
vfioDevices, ok := device.GetDeviceInfo().([]*config.VFIODev)
|
|
if !ok {
|
|
return fmt.Errorf("device type mismatch, expect device type to be %s", devType)
|
|
}
|
|
|
|
// remove a group of VFIO devices
|
|
for _, dev := range vfioDevices {
|
|
if _, err := s.hypervisor.HotplugRemoveDevice(ctx, dev, VfioDev); err != nil {
|
|
s.Logger().WithError(err).
|
|
WithFields(logrus.Fields{
|
|
"sandbox": s.id,
|
|
"vfio-device-ID": dev.ID,
|
|
"vfio-device-BDF": dev.BDF,
|
|
}).Error("failed to hot unplug VFIO device")
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
case config.DeviceBlock:
|
|
blockDrive, ok := device.GetDeviceInfo().(*config.BlockDrive)
|
|
if !ok {
|
|
return fmt.Errorf("device type mismatch, expect device type to be %s", devType)
|
|
}
|
|
// PMEM devices cannot be hot removed
|
|
if blockDrive.Pmem {
|
|
s.Logger().WithField("path", blockDrive.File).Infof("Skip device: cannot hot remove PMEM devices")
|
|
return nil
|
|
}
|
|
_, err := s.hypervisor.HotplugRemoveDevice(ctx, blockDrive, BlockDev)
|
|
return err
|
|
case config.VhostUserBlk:
|
|
vhostUserDeviceAttrs, ok := device.GetDeviceInfo().(*config.VhostUserDeviceAttrs)
|
|
if !ok {
|
|
return fmt.Errorf("device type mismatch, expect device type to be %s", devType)
|
|
}
|
|
_, err := s.hypervisor.HotplugRemoveDevice(ctx, vhostUserDeviceAttrs, VhostuserDev)
|
|
return err
|
|
case config.DeviceGeneric:
|
|
// TODO: what?
|
|
return nil
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// GetAndSetSandboxBlockIndex is used for getting and setting virtio-block indexes.
// Sandbox implements the DeviceReceiver interface from device/api/interface.go.
|
|
func (s *Sandbox) GetAndSetSandboxBlockIndex() (int, error) {
|
|
return s.getAndSetSandboxBlockIndex()
|
|
}
|
|
|
|
// UnsetSandboxBlockIndex unsets block indexes
|
|
// Sandbox implement DeviceReceiver interface from device/api/interface.go
|
|
func (s *Sandbox) UnsetSandboxBlockIndex(index int) error {
|
|
return s.unsetSandboxBlockIndex(index)
|
|
}

// AppendDevice appends a device to the sandbox without hotplugging it.
// Currently only vhost-user and VFIO devices are handled.
// Sandbox implements the DeviceReceiver interface from device/api/interface.go
func (s *Sandbox) AppendDevice(ctx context.Context, device api.Device) error {
	switch device.DeviceType() {
	case config.VhostUserSCSI, config.VhostUserNet, config.VhostUserBlk, config.VhostUserFS:
		return s.hypervisor.AddDevice(ctx, device.GetDeviceInfo().(*config.VhostUserDeviceAttrs), VhostuserDev)
	case config.DeviceVFIO:
		vfioDevs := device.GetDeviceInfo().([]*config.VFIODev)
		// append every device of the VFIO group, not just the first one
		for _, d := range vfioDevs {
			if err := s.hypervisor.AddDevice(ctx, *d, VfioDev); err != nil {
				return err
			}
		}
		return nil
	default:
		s.Logger().WithField("device-type", device.DeviceType()).
			Warn("Could not append device: unsupported device type")
	}

	return fmt.Errorf("unsupported device type")
}

// AddDevice will add a device to the sandbox
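// The device is created through the device manager and then attached to the
// sandbox; the deferred handlers below detach and remove it again if a later
// step fails, so a failed AddDevice leaves the sandbox unchanged.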
func (s *Sandbox) AddDevice(ctx context.Context, info config.DeviceInfo) (api.Device, error) {
	if s.devManager == nil {
		return nil, fmt.Errorf("device manager isn't initialized")
	}

	var err error
	b, err := s.devManager.NewDevice(info)
	if err != nil {
		return nil, err
	}
	defer func() {
		if err != nil {
			s.devManager.RemoveDevice(b.DeviceID())
		}
	}()

	if err = s.devManager.AttachDevice(ctx, b.DeviceID(), s); err != nil {
		return nil, err
	}
	defer func() {
		if err != nil {
			s.devManager.DetachDevice(ctx, b.DeviceID(), s)
		}
	}()

	return b, nil
}

// updateResources will:
// - calculate the resources required for the virtual machine, and adjust the virtual machine
// sizing accordingly. For a given sandbox, it will calculate the number of vCPUs required based
// on the sum of container requests, plus default CPUs for the VM. Similar is done for memory.
// If changes in memory or CPU are made, the VM will be updated and the agent will online the
// applicable CPU and memory.
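//
// As a rough illustration (actual numbers depend on the configured defaults):
// with default_vcpus=1 and default_memory=2048 MiB, a sandbox whose single
// container requests 2 CPUs worth of quota and a 1024 MiB memory limit is
// resized to 3 vCPUs and 3072 MiB before the agent onlines the added resources.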
func (s *Sandbox) updateResources(ctx context.Context) error {
	if s == nil {
		return errors.New("sandbox is nil")
	}

	if s.config == nil {
		return fmt.Errorf("sandbox config is nil")
	}

	sandboxVCPUs, err := s.calculateSandboxCPUs()
	if err != nil {
		return err
	}
	// Add default vcpus for sandbox
	sandboxVCPUs += s.hypervisor.HypervisorConfig().NumVCPUs

	sandboxMemoryByte, sandboxneedPodSwap, sandboxSwapByte := s.calculateSandboxMemory()
	// Add default / rsvd memory for sandbox.
	hypervisorMemoryByte := int64(s.hypervisor.HypervisorConfig().MemorySize) << utils.MibToBytesShift
	sandboxMemoryByte += hypervisorMemoryByte
	if sandboxneedPodSwap {
		sandboxSwapByte += hypervisorMemoryByte
	}
	s.Logger().WithField("sandboxMemoryByte", sandboxMemoryByte).WithField("sandboxneedPodSwap", sandboxneedPodSwap).WithField("sandboxSwapByte", sandboxSwapByte).Debugf("updateResources: after calculateSandboxMemory")

	// Setup the SWAP in the guest
	if sandboxSwapByte > 0 {
		err = s.setupSwap(ctx, sandboxSwapByte)
		if err != nil {
			return err
		}
	}

	// Update VCPUs
	s.Logger().WithField("cpus-sandbox", sandboxVCPUs).Debugf("Request to hypervisor to update vCPUs")
	oldCPUs, newCPUs, err := s.hypervisor.ResizeVCPUs(ctx, sandboxVCPUs)
	if err != nil {
		return err
	}

	s.Logger().Debugf("Request to hypervisor to update oldCPUs/newCPUs: %d/%d", oldCPUs, newCPUs)
	// If the CPUs were increased, ask agent to online them
	if oldCPUs < newCPUs {
		vcpusAdded := newCPUs - oldCPUs
		s.Logger().Debugf("Request to onlineCPUMem with %d CPUs", vcpusAdded)
		if err := s.agent.onlineCPUMem(ctx, vcpusAdded, true); err != nil {
			return err
		}
	}
	s.Logger().Debugf("Sandbox CPUs: %d", newCPUs)

	// Update Memory
	s.Logger().WithField("memory-sandbox-size-byte", sandboxMemoryByte).Debugf("Request to hypervisor to update memory")
	newMemory, updatedMemoryDevice, err := s.hypervisor.ResizeMemory(ctx, uint32(sandboxMemoryByte>>utils.MibToBytesShift), s.state.GuestMemoryBlockSizeMB, s.state.GuestMemoryHotplugProbe)
	if err != nil {
		if err == noGuestMemHotplugErr {
			s.Logger().Warnf("%s, memory specifications cannot be guaranteed", err)
		} else {
			return err
		}
	}
	s.Logger().Debugf("Sandbox memory size: %d MB", newMemory)
	if s.state.GuestMemoryHotplugProbe && updatedMemoryDevice.Addr != 0 {
		// notify the guest kernel about memory hot-add event, before onlining them
		s.Logger().Debugf("notify guest kernel memory hot-add event via probe interface, memory device located at 0x%x", updatedMemoryDevice.Addr)
		if err := s.agent.memHotplugByProbe(ctx, updatedMemoryDevice.Addr, uint32(updatedMemoryDevice.SizeMB), s.state.GuestMemoryBlockSizeMB); err != nil {
			return err
		}
	}
	if err := s.agent.onlineCPUMem(ctx, 0, false); err != nil {
		return err
	}

	return nil
}

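// calculateSandboxMemory sums the memory limits of all running containers and
// returns that total in bytes, together with whether a pod-level swap file is
// needed and how many bytes of guest swap to provision when guest swap is
// enabled.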
func (s *Sandbox) calculateSandboxMemory() (int64, bool, int64) {
	memorySandbox := int64(0)
	needPodSwap := false
	swapSandbox := int64(0)
	for _, c := range s.config.Containers {
		// Do not hot-add resources of non-running containers again
		if cont, ok := s.containers[c.ID]; ok && cont.state.State == types.StateStopped {
			s.Logger().WithField("container", c.ID).Debug("Not taking into account memory resources of non-running containers")
			continue
		}

		if m := c.Resources.Memory; m != nil {
			currentLimit := int64(0)
			if m.Limit != nil {
				currentLimit = *m.Limit
				memorySandbox += currentLimit
			}
			if s.config.HypervisorConfig.GuestSwap && m.Swappiness != nil && *m.Swappiness > 0 {
				currentSwap := int64(0)
				if m.Swap != nil {
					currentSwap = *m.Swap
				}
				if currentSwap == 0 {
					if currentLimit == 0 {
						needPodSwap = true
					} else {
						swapSandbox += currentLimit
					}
				} else if currentSwap > currentLimit {
					swapSandbox = currentSwap - currentLimit
				}
			}
		}
	}
	return memorySandbox, needPodSwap, swapSandbox
}

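// calculateSandboxCPUs derives the number of vCPUs to hot-add from the running
// containers' CPU constraints: the sum of quota/period expressed in milli-CPUs,
// or, when no quota is set at all, the size of the containers' cpusets.
// For example, a container with Quota=150000 and Period=100000 contributes
// 1500 mCPU, which rounds up to 2 vCPUs.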
func (s *Sandbox) calculateSandboxCPUs() (uint32, error) {
	mCPU := uint32(0)
	cpusetCount := int(0)

	for _, c := range s.config.Containers {
		// Do not hot-add resources of non-running containers again
		if cont, ok := s.containers[c.ID]; ok && cont.state.State == types.StateStopped {
			s.Logger().WithField("container", c.ID).Debug("Not taking into account CPU resources of non-running containers")
			continue
		}

		if cpu := c.Resources.CPU; cpu != nil {
			if cpu.Period != nil && cpu.Quota != nil {
				mCPU += utils.CalculateMilliCPUs(*cpu.Quota, *cpu.Period)
			}

			set, err := cpuset.Parse(cpu.Cpus)
			if err != nil {
				return 0, err
			}
			cpusetCount += set.Size()
		}
	}

	// If we aren't being constrained, then we could have two scenarios:
	// 1. BestEffort QoS: no proper support today in Kata.
	// 2. We could be constrained only by CPUSets. Check for this:
	if mCPU == 0 && cpusetCount > 0 {
		return uint32(cpusetCount), nil
	}

	return utils.CalculateVCpusFromMilliCpus(mCPU), nil
}

// GetHypervisorType returns the name of the hypervisor currently in use.
// Sandbox implements the DeviceReceiver interface from device/api/interface.go
func (s *Sandbox) GetHypervisorType() string {
	return string(s.config.HypervisorType)
}

// cgroupsUpdate updates the sandbox cpuset cgroup subsystem.
// Also, if the sandbox has an overhead cgroup, it updates the hypervisor
// constraints by moving the potentially new vCPU threads back to the sandbox
// cgroup.
func (s *Sandbox) cgroupsUpdate(ctx context.Context) error {
	cpuset, memset, err := s.getSandboxCPUSet()
	if err != nil {
		return err
	}

	// We update the sandbox cgroup with potentially new virtual CPUs.
	if err := s.sandboxCgroup.UpdateCpuSet(cpuset, memset); err != nil {
		return err
	}

	if s.overheadCgroup != nil {
		// If we have an overhead cgroup, new vCPU threads would start there,
		// as being children of the VMM PID.
		// We need to constrain them by moving them into the sandbox cgroup.
		if err := s.constrainHypervisor(ctx); err != nil {
			return err
		}
	}

	return nil
}

// cgroupsDelete will move the running processes in the sandbox cgroup
// to the parent and then delete the sandbox cgroup
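// If an overhead cgroup was created for this sandbox, its processes are moved
// to the parent as well and the cgroup is deleted.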
func (s *Sandbox) cgroupsDelete() error {
	s.Logger().Debug("Deleting sandbox cgroup")
	if s.state.SandboxCgroupPath == "" {
		s.Logger().Warnf("sandbox cgroup path is empty")
		return nil
	}

	sandboxCgroup, err := cgroups.Load(s.state.SandboxCgroupPath)
	if err != nil {
		return err
	}

	if err := sandboxCgroup.MoveToParent(); err != nil {
		return err
	}

	if err := sandboxCgroup.Delete(); err != nil {
		return err
	}

	if s.state.OverheadCgroupPath != "" {
		overheadCgroup, err := cgroups.Load(s.state.OverheadCgroupPath)
		if err != nil {
			return err
		}

		if err := overheadCgroup.MoveToParent(); err != nil {
			return err
		}

		if err := overheadCgroup.Delete(); err != nil {
			return err
		}
	}

	return nil
}

// constrainHypervisor moves the hypervisor vCPU threads into the sandbox
// cgroup, so that only the VMM overhead threads remain in the overhead cgroup.
func (s *Sandbox) constrainHypervisor(ctx context.Context) error {
	tids, err := s.hypervisor.GetThreadIDs(ctx)
	if err != nil {
		return fmt.Errorf("failed to get thread ids from hypervisor: %v", err)
	}

	// All vCPU threads move to the sandbox cgroup.
	for _, i := range tids.vcpus {
		if err := s.sandboxCgroup.AddTask(i); err != nil {
			return err
		}
	}

	return nil
}

// setupCgroups adds the runtime process to either the sandbox cgroup or the overhead one,
// depending on the sandbox_cgroup_only configuration setting.
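// With sandbox_cgroup_only=true there is no overhead cgroup, so the runtime (and
// thus the VMM it spawns) is charged directly to the sandbox cgroup; otherwise
// the VMM starts in the unconstrained overhead cgroup and only its vCPU threads
// are later moved into the sandbox cgroup by constrainHypervisor().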
func (s *Sandbox) setupCgroups() error {
	vmmCgroup := s.sandboxCgroup
	if s.overheadCgroup != nil {
		vmmCgroup = s.overheadCgroup
	}

	// By adding the runtime process to either the sandbox or overhead cgroup, we are making
	// sure that any child process of the runtime (i.e. *all* processes serving a Kata pod)
	// will initially live in this cgroup. Depending on the sandbox_cgroup settings, we will
	// then move the vCPU threads between cgroups.
	runtimePid := os.Getpid()
	// Add the runtime to the VMM sandbox cgroup
	if err := vmmCgroup.AddProcess(runtimePid); err != nil {
		return fmt.Errorf("Could not add runtime PID %d to sandbox cgroup: %v", runtimePid, err)
	}

	return nil
}

// GetPatchedOCISpec returns the sandbox's OCI specification
// This OCI specification was patched when the sandbox was created
// by containerCapabilities(), SetEphemeralStorageType() and others
// in order to support:
// * Capabilities
// * Ephemeral storage
// * k8s empty dir
// If you need the original (vanilla) OCI spec,
// use compatoci.GetContainerSpec() instead.
func (s *Sandbox) GetPatchedOCISpec() *specs.Spec {
	if s.config == nil {
		return nil
	}

	// get the container associated with the PodSandbox annotation. In Kubernetes, this
	// represents the pause container. In Docker, this is the container. We derive the
	// cgroup path from this container.
	for _, cConfig := range s.config.Containers {
		if cConfig.Annotations[annotations.ContainerTypeKey] == string(PodSandbox) {
			return cConfig.CustomSpec
		}
	}

	return nil
}

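// GetOOMEvent asks the agent for the next OOM event in the sandbox and returns
// the ID of the container the agent reports as affected.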
func (s *Sandbox) GetOOMEvent(ctx context.Context) (string, error) {
	return s.agent.getOOMEvent(ctx)
}

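// GetAgentURL returns the URL the runtime uses to reach the guest agent.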
func (s *Sandbox) GetAgentURL() (string, error) {
	return s.agent.getAgentURL()
}

// getSandboxCPUSet returns the union of all the sandbox's containers' cpuset
// cpus and mems values, as strings in the canonical Linux cpuset list format
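// For example, containers pinned to cpus "0-1" and "1,3" yield the union "0-1,3".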
func (s *Sandbox) getSandboxCPUSet() (string, string, error) {
	if s.config == nil {
		return "", "", nil
	}

	cpuResult := cpuset.NewCPUSet()
	memResult := cpuset.NewCPUSet()
	for _, ctr := range s.config.Containers {
		if ctr.Resources.CPU != nil {
			currCPUSet, err := cpuset.Parse(ctr.Resources.CPU.Cpus)
			if err != nil {
				return "", "", fmt.Errorf("unable to parse CPUset.cpus for container %s: %v", ctr.ID, err)
			}
			cpuResult = cpuResult.Union(currCPUSet)

			currMemSet, err := cpuset.Parse(ctr.Resources.CPU.Mems)
			if err != nil {
				return "", "", fmt.Errorf("unable to parse CPUset.mems for container %s: %v", ctr.ID, err)
			}
			memResult = memResult.Union(currMemSet)
		}
	}

	return cpuResult.String(), memResult.String(), nil
}

// fetchSandbox fetches a sandbox config from a sandbox ID and returns a sandbox.
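// The sandbox is rebuilt purely from the persisted configuration: no new VM or
// guest resources are created by this call.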
func fetchSandbox(ctx context.Context, sandboxID string) (sandbox *Sandbox, err error) {
	virtLog.Info("fetch sandbox")
	if sandboxID == "" {
		return nil, vcTypes.ErrNeedSandboxID
	}

	var config SandboxConfig

	// Load sandbox config from the store.
	c, err := loadSandboxConfig(sandboxID)
	if err != nil {
		virtLog.WithError(err).Warning("failed to get sandbox config from store")
		return nil, err
	}

	config = *c

	// fetchSandbox is not supposed to create a new sandbox VM.
	sandbox, err = createSandbox(ctx, config, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create sandbox with config %+v: %v", config, err)
	}

	// This sandbox already exists, we don't need to recreate the containers in the guest.
	// We only need to fetch the containers from storage and create the container structs.
	if err := sandbox.fetchContainers(ctx); err != nil {
		return nil, err
	}

	return sandbox, nil
}

// fetchContainers creates new container structures and
// adds them to the sandbox. It does not create the containers
// in the guest. This should only be used when fetching a
// sandbox that already exists.
func (s *Sandbox) fetchContainers(ctx context.Context) error {
	for i, contConfig := range s.config.Containers {
		// Add spec from bundle path
		spec, err := compatoci.GetContainerSpec(contConfig.Annotations)
		if err != nil {
			return err
		}
		contConfig.CustomSpec = &spec
		s.config.Containers[i] = contConfig

		c, err := newContainer(ctx, s, &s.config.Containers[i])
		if err != nil {
			return err
		}

		if err := s.addContainer(c); err != nil {
			return err
		}
	}

	return nil
}