kata-containers/src/runtime/virtcontainers/qemu.go
David Gibson ea1d799f79 qemu: Only one element of qemuPaths map is relevant
The qemuPaths field in qemuArchBase maps from machine type to the default
qemu path.  But, by the time we construct it, we already know the machine
type, so that entry ends up being the only one we care about.

So, collapse the map into a single path.  As a bonus, the qemuPath()
method can no longer fail.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2020-06-24 21:26:43 +10:00
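Below is a minimal before/after sketch of the change this commit describes. The type and field names (qemuArchBaseBefore, qemuArchBaseAfter, qemuBinPath) are illustrative assumptions, not the real definitions from the arch-specific code:

package main

import "fmt"

// Before: a map keyed by machine type, of which only one entry is ever used.
type qemuArchBaseBefore struct {
	machineType string
	qemuPaths   map[string]string // machine type -> default QEMU path
}

func (q *qemuArchBaseBefore) qemuPath() (string, error) {
	p, ok := q.qemuPaths[q.machineType]
	if !ok {
		return "", fmt.Errorf("no default QEMU path for machine type %q", q.machineType)
	}
	return p, nil
}

// After: the machine type is fixed when the arch is constructed, so a single
// path suffices and the lookup can no longer fail.
type qemuArchBaseAfter struct {
	machineType string
	qemuBinPath string
}

func (q *qemuArchBaseAfter) qemuPath() string {
	return q.qemuBinPath
}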


// Copyright (c) 2016 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
package virtcontainers
import (
"bufio"
"context"
"encoding/hex"
"encoding/json"
"fmt"
"io/ioutil"
"math"
"net"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"sync"
"syscall"
"time"
"unsafe"
govmmQemu "github.com/intel/govmm/qemu"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/opentracing/opentracing-go"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/device/config"
persistapi "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/persist/api"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/uuid"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
)
// romFile is the file name of the ROM that can be used for virtio-pci devices.
// If this file name is empty, we expect the firmware used by Qemu,
// such as SeaBIOS or OVMF, to handle this directly.
const romFile = ""
// disable-modern is an option to QEMU that makes it fall back to the 0.9
// version of virtio. Since moving to QEMU 4.0, we can start using the virtio
// 1.0 version. Default value is false.
const defaultDisableModern = false
type qmpChannel struct {
sync.Mutex
ctx context.Context
path string
qmp *govmmQemu.QMP
disconn chan struct{}
}
// CPUDevice represents a CPU device which was hot-added in a running VM
type CPUDevice struct {
// ID is used to identify this CPU in the hypervisor options.
ID string
}
// QemuState keeps Qemu's state
type QemuState struct {
Bridges []types.Bridge
// HotpluggedVCPUs is the list of vCPUs that were hot-added
HotpluggedVCPUs []CPUDevice
HotpluggedMemory int
UUID string
HotplugVFIOOnRootBus bool
VirtiofsdPid int
PCIeRootPort int
}
// qemu is a Hypervisor interface implementation for the Linux qemu hypervisor.
type qemu struct {
id string
config HypervisorConfig
qmpMonitorCh qmpChannel
qemuConfig govmmQemu.Config
state QemuState
arch qemuArch
// fds is a list of file descriptors inherited by the QEMU process;
// they'll be closed once the QEMU process is running
fds []*os.File
ctx context.Context
nvdimmCount int
stopped bool
store persistapi.PersistDriver
}
const (
consoleSocket = "console.sock"
qmpSocket = "qmp.sock"
vhostFSSocket = "vhost-fs.sock"
qmpCapErrMsg = "Failed to negotiate QMP capabilities"
qmpExecCatCmd = "exec:cat"
scsiControllerID = "scsi0"
rngID = "rng0"
vsockKernelOption = "agent.use_vsock"
fallbackFileBackedMemDir = "/dev/shm"
)
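// qemuMajorVersion and qemuMinorVersion cache the QEMU version reported over
// QMP; they are set in waitSandbox once the VM is up.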
var qemuMajorVersion int
var qemuMinorVersion int
// architecture-agnostic list of kernel parameters
var defaultKernelParameters = []Param{
{"panic", "1"},
}
type qmpLogger struct {
logger *logrus.Entry
}
func newQMPLogger() qmpLogger {
return qmpLogger{
logger: virtLog.WithField("subsystem", "qmp"),
}
}
func (l qmpLogger) V(level int32) bool {
return level != 0
}
func (l qmpLogger) Infof(format string, v ...interface{}) {
l.logger.Infof(format, v...)
}
func (l qmpLogger) Warningf(format string, v ...interface{}) {
l.logger.Warnf(format, v...)
}
func (l qmpLogger) Errorf(format string, v ...interface{}) {
l.logger.Errorf(format, v...)
}
// Logger returns a logrus logger appropriate for logging qemu messages
func (q *qemu) Logger() *logrus.Entry {
return virtLog.WithField("subsystem", "qemu")
}
func (q *qemu) kernelParameters() string {
// get a list of arch kernel parameters
params := q.arch.kernelParameters(q.config.Debug)
// use default parameters
params = append(params, defaultKernelParameters...)
// set the maximum number of vCPUs
params = append(params, Param{"nr_cpus", fmt.Sprintf("%d", q.config.DefaultMaxVCPUs)})
// Add a kernel param to indicate if vsock is being used.
// This will be consumed by the agent to determine if it needs to listen on
// a serial or vsock channel
params = append(params, Param{vsockKernelOption, strconv.FormatBool(q.config.UseVSock)})
// add the params specified by the provided config. As the kernel
// honours the last parameter value set and since the config-provided
// params are added here, they will take priority over the defaults.
params = append(params, q.config.KernelParams...)
paramsStr := SerializeParams(params, "=")
return strings.Join(paramsStr, " ")
}
// capabilities returns all capabilities supported by the qemu implementation of the hypervisor interface
func (q *qemu) capabilities() types.Capabilities {
span, _ := q.trace("capabilities")
defer span.Finish()
return q.arch.capabilities()
}
func (q *qemu) hypervisorConfig() HypervisorConfig {
return q.config
}
// get the QEMU binary path
func (q *qemu) qemuPath() (string, error) {
p, err := q.config.HypervisorAssetPath()
if err != nil {
return "", err
}
if p == "" {
p = q.arch.qemuPath()
}
if _, err = os.Stat(p); os.IsNotExist(err) {
return "", fmt.Errorf("QEMU path (%s) does not exist", p)
}
return p, nil
}
func (q *qemu) trace(name string) (opentracing.Span, context.Context) {
if q.ctx == nil {
q.Logger().WithField("type", "bug").Error("trace called before context set")
q.ctx = context.Background()
}
span, ctx := opentracing.StartSpanFromContext(q.ctx, name)
span.SetTag("subsystem", "hypervisor")
span.SetTag("type", "qemu")
return span, ctx
}
// setup sets the Qemu structure up.
func (q *qemu) setup(id string, hypervisorConfig *HypervisorConfig) error {
span, _ := q.trace("setup")
defer span.Finish()
err := hypervisorConfig.valid()
if err != nil {
return err
}
q.id = id
q.config = *hypervisorConfig
q.arch, err = newQemuArch(q.config)
if err != nil {
return err
}
initrdPath, err := q.config.InitrdAssetPath()
if err != nil {
return err
}
imagePath, err := q.config.ImageAssetPath()
if err != nil {
return err
}
if initrdPath == "" && imagePath != "" && !q.config.DisableImageNvdimm {
q.nvdimmCount = 1
} else {
q.nvdimmCount = 0
}
var create bool
if q.state.UUID == "" {
create = true
}
q.arch.setBridges(q.state.Bridges)
if create {
q.Logger().Debug("Creating bridges")
q.arch.bridges(q.config.DefaultBridges)
q.Logger().Debug("Creating UUID")
q.state.UUID = uuid.Generate().String()
q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus
q.state.PCIeRootPort = int(q.config.PCIeRootPort)
// The path might already exist, but in case of VM templating,
// we have to create it since the sandbox has not created it yet.
if err = os.MkdirAll(filepath.Join(q.store.RunStoragePath(), id), DirMode); err != nil {
return err
}
}
nested, err := RunningOnVMM(procCPUInfo)
if err != nil {
return err
}
if !q.config.DisableNestingChecks && nested {
q.arch.enableNestingChecks()
} else {
q.Logger().WithField("inside-vm", fmt.Sprintf("%t", nested)).Debug("Disable nesting environment checks")
q.arch.disableNestingChecks()
}
if !q.config.DisableVhostNet {
q.arch.enableVhostNet()
} else {
q.Logger().Debug("Disable vhost_net")
q.arch.disableVhostNet()
}
return nil
}
func (q *qemu) cpuTopology() govmmQemu.SMP {
return q.arch.cpuTopology(q.config.NumVCPUs, q.config.DefaultMaxVCPUs)
}
func (q *qemu) hostMemMB() (uint64, error) {
hostMemKb, err := getHostMemorySizeKb(procMemInfo)
if err != nil {
return 0, fmt.Errorf("Unable to read memory info: %s", err)
}
if hostMemKb == 0 {
return 0, fmt.Errorf("Error host memory size 0")
}
return hostMemKb / 1024, nil
}
func (q *qemu) memoryTopology() (govmmQemu.Memory, error) {
hostMemMb, err := q.hostMemMB()
if err != nil {
return govmmQemu.Memory{}, err
}
memMb := uint64(q.config.MemorySize)
return q.arch.memoryTopology(memMb, hostMemMb, uint8(q.config.MemSlots)), nil
}
func (q *qemu) qmpSocketPath(id string) (string, error) {
return utils.BuildSocketPath(q.store.RunVMStoragePath(), id, qmpSocket)
}
func (q *qemu) getQemuMachine() (govmmQemu.Machine, error) {
machine := q.arch.machine()
accelerators := q.config.MachineAccelerators
if accelerators != "" {
if !strings.HasPrefix(accelerators, ",") {
accelerators = fmt.Sprintf(",%s", accelerators)
}
machine.Options += accelerators
}
return machine, nil
}
func (q *qemu) appendImage(devices []govmmQemu.Device) ([]govmmQemu.Device, error) {
imagePath, err := q.config.ImageAssetPath()
if err != nil {
return nil, err
}
if imagePath != "" {
devices, err = q.arch.appendImage(devices, imagePath)
if err != nil {
return nil, err
}
}
return devices, nil
}
func (q *qemu) createQmpSocket() ([]govmmQemu.QMPSocket, error) {
monitorSockPath, err := q.qmpSocketPath(q.id)
if err != nil {
return nil, err
}
q.qmpMonitorCh = qmpChannel{
ctx: q.ctx,
path: monitorSockPath,
}
return []govmmQemu.QMPSocket{
{
Type: "unix",
Name: q.qmpMonitorCh.path,
Server: true,
NoWait: true,
},
}, nil
}
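// buildDevices assembles the static device list: bridges first (so they get
// the lowest PCI addresses), then the console, the image when booting without
// an initrd, the IOMMU if enabled and, for virtio-scsi, the SCSI controller
// together with an optional IO thread.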
func (q *qemu) buildDevices(initrdPath string) ([]govmmQemu.Device, *govmmQemu.IOThread, error) {
var devices []govmmQemu.Device
console, err := q.getSandboxConsole(q.id)
if err != nil {
return nil, nil, err
}
// Add bridges before any other devices. This way we make sure that
// a bridge gets the first available PCI address, i.e. bridgePCIStartAddr
devices = q.arch.appendBridges(devices)
devices, err = q.arch.appendConsole(devices, console)
if err != nil {
return nil, nil, err
}
if initrdPath == "" {
devices, err = q.appendImage(devices)
if err != nil {
return nil, nil, err
}
}
if q.config.IOMMU {
devices, err = q.arch.appendIOMMU(devices)
if err != nil {
return nil, nil, err
}
}
var ioThread *govmmQemu.IOThread
if q.config.BlockDeviceDriver == config.VirtioSCSI {
return q.arch.appendSCSIController(devices, q.config.EnableIOThreads)
}
return devices, ioThread, nil
}
func (q *qemu) setupTemplate(knobs *govmmQemu.Knobs, memory *govmmQemu.Memory) govmmQemu.Incoming {
incoming := govmmQemu.Incoming{}
if q.config.BootToBeTemplate || q.config.BootFromTemplate {
knobs.FileBackedMem = true
memory.Path = q.config.MemoryPath
if q.config.BootToBeTemplate {
knobs.MemShared = true
}
if q.config.BootFromTemplate {
incoming.MigrationType = govmmQemu.MigrationDefer
}
}
return incoming
}
func (q *qemu) setupFileBackedMem(knobs *govmmQemu.Knobs, memory *govmmQemu.Memory) {
var target string
if q.config.FileBackedMemRootDir != "" {
target = q.config.FileBackedMemRootDir
} else {
target = fallbackFileBackedMemDir
}
if _, err := os.Stat(target); err != nil {
q.Logger().WithError(err).Error("File backed memory location does not exist")
return
}
knobs.FileBackedMem = true
knobs.MemShared = true
memory.Path = target
}
// createSandbox is the Hypervisor sandbox creation implementation for govmmQemu.
func (q *qemu) createSandbox(ctx context.Context, id string, networkNS NetworkNamespace, hypervisorConfig *HypervisorConfig, stateful bool) error {
// Save the tracing context
q.ctx = ctx
span, _ := q.trace("createSandbox")
defer span.Finish()
if err := q.setup(id, hypervisorConfig); err != nil {
return err
}
machine, err := q.getQemuMachine()
if err != nil {
return err
}
smp := q.cpuTopology()
memory, err := q.memoryTopology()
if err != nil {
return err
}
knobs := govmmQemu.Knobs{
NoUserConfig: true,
NoDefaults: true,
NoGraphic: true,
Daemonize: true,
MemPrealloc: q.config.MemPrealloc,
HugePages: q.config.HugePages,
Realtime: q.config.Realtime,
Mlock: q.config.Mlock,
}
kernelPath, err := q.config.KernelAssetPath()
if err != nil {
return err
}
initrdPath, err := q.config.InitrdAssetPath()
if err != nil {
return err
}
kernel := govmmQemu.Kernel{
Path: kernelPath,
InitrdPath: initrdPath,
Params: q.kernelParameters(),
}
incoming := q.setupTemplate(&knobs, &memory)
// With the current implementations, VM templating will not work with file
// based memory (stand-alone) or virtiofs. This is because VM templating
// builds the first VM with file-backed memory and shared=on and the
// subsequent ones with shared=off. virtio-fs always requires shared=on for
// memory.
if q.config.SharedFS == config.VirtioFS || q.config.FileBackedMemRootDir != "" {
if !(q.config.BootToBeTemplate || q.config.BootFromTemplate) {
q.setupFileBackedMem(&knobs, &memory)
} else {
return errors.New("VM templating has been enabled with either virtio-fs or file backed memory and this configuration will not work")
}
if q.config.HugePages {
knobs.MemPrealloc = true
}
}
// A vhost-user-blk/scsi process that can improve performance, such as SPDK,
// requires shared hugepage memory to work with Qemu.
if q.config.EnableVhostUserStore {
if !q.config.HugePages {
return errors.New("Vhost-user-blk/scsi is enabled without HugePages. This configuration will not work")
}
knobs.MemShared = true
}
rtc := govmmQemu.RTC{
Base: "utc",
DriftFix: "slew",
}
if q.state.UUID == "" {
return fmt.Errorf("UUID should not be empty")
}
qmpSockets, err := q.createQmpSocket()
if err != nil {
return err
}
devices, ioThread, err := q.buildDevices(initrdPath)
if err != nil {
return err
}
cpuModel := q.arch.cpuModel()
firmwarePath, err := q.config.FirmwareAssetPath()
if err != nil {
return err
}
qemuPath, err := q.qemuPath()
if err != nil {
return err
}
qemuConfig := govmmQemu.Config{
Name: fmt.Sprintf("sandbox-%s", q.id),
UUID: q.state.UUID,
Path: qemuPath,
Ctx: q.qmpMonitorCh.ctx,
Machine: machine,
SMP: smp,
Memory: memory,
Devices: devices,
CPUModel: cpuModel,
Kernel: kernel,
RTC: rtc,
QMPSockets: qmpSockets,
Knobs: knobs,
Incoming: incoming,
VGA: "none",
GlobalParam: "kvm-pit.lost_tick_policy=discard",
Bios: firmwarePath,
PidFile: filepath.Join(q.store.RunVMStoragePath(), q.id, "pid"),
}
if ioThread != nil {
qemuConfig.IOThreads = []govmmQemu.IOThread{*ioThread}
}
// Add RNG device to hypervisor
rngDev := config.RNGDev{
ID: rngID,
Filename: q.config.EntropySource,
}
qemuConfig.Devices, err = q.arch.appendRNGDevice(qemuConfig.Devices, rngDev)
if err != nil {
return err
}
// Add PCIe Root Port devices to hypervisor
// The pcie.0 bus does not support hot-plug, but a PCIe device can be hot-plugged into a PCIe Root Port.
// For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt
if hypervisorConfig.PCIeRootPort > 0 {
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, hypervisorConfig.PCIeRootPort)
}
q.qemuConfig = qemuConfig
return nil
}
func (q *qemu) vhostFSSocketPath(id string) (string, error) {
return utils.BuildSocketPath(q.store.RunVMStoragePath(), id, vhostFSSocket)
}
func (q *qemu) virtiofsdArgs(fd uintptr) []string {
// The daemon will terminate when the vhost-user socket
// connection with QEMU closes. Therefore we do not keep track
// of this child process after returning from this function.
sourcePath := filepath.Join(getSharePath(q.id))
args := []string{
fmt.Sprintf("--fd=%v", fd),
"-o", "source=" + sourcePath,
"-o", "cache=" + q.config.VirtioFSCache,
"--syslog", "-o", "no_posix_lock"}
if q.config.Debug {
args = append(args, "-d")
} else {
args = append(args, "-f")
}
if len(q.config.VirtioFSExtraArgs) != 0 {
args = append(args, q.config.VirtioFSExtraArgs...)
}
return args
}
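// setupVirtiofsd creates the vhost-user socket, hands a duplicate of its fd
// to a newly started virtiofsd and forwards the daemon's stderr to the
// runtime log; the sandbox is stopped if virtiofsd exits.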
func (q *qemu) setupVirtiofsd() (err error) {
var listener *net.UnixListener
var fd *os.File
if _, err = os.Stat(q.config.VirtioFSDaemon); os.IsNotExist(err) {
return fmt.Errorf("virtiofsd path (%s) does not exist", q.config.VirtioFSDaemon)
}
sockPath, err := q.vhostFSSocketPath(q.id)
if err != nil {
return err
}
listener, err = net.ListenUnix("unix", &net.UnixAddr{
Name: sockPath,
Net: "unix",
})
if err != nil {
return err
}
listener.SetUnlinkOnClose(false)
fd, err = listener.File()
listener.Close() // no longer needed since fd is a dup
listener = nil
if err != nil {
return err
}
const sockFd = 3 // Cmd.ExtraFiles[] fds are numbered starting from 3
cmd := exec.Command(q.config.VirtioFSDaemon, q.virtiofsdArgs(sockFd)...)
cmd.ExtraFiles = append(cmd.ExtraFiles, fd)
stderr, err := cmd.StderrPipe()
if err != nil {
return err
}
err = cmd.Start()
if err == nil {
q.state.VirtiofsdPid = cmd.Process.Pid
}
fd.Close()
// Monitor virtiofsd's stderr and stop sandbox if virtiofsd quits
go func() {
scanner := bufio.NewScanner(stderr)
for scanner.Scan() {
q.Logger().WithField("source", "virtiofsd").Info(scanner.Text())
}
q.Logger().Info("virtiofsd quits")
// Wait to release resources of virtiofsd process
cmd.Process.Wait()
q.stopSandbox()
}()
return err
}
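// getMemArgs derives the memory backend arguments from the current
// configuration: whether the backing memory must be shared, the backing
// target path (empty for plain RAM) and the backend object type
// (memory-backend-ram or memory-backend-file).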
func (q *qemu) getMemArgs() (bool, string, string, error) {
share := false
target := ""
memoryBack := "memory-backend-ram"
if q.qemuConfig.Knobs.HugePages {
// we are setting all the bits that govmm sets when hugepages are enabled.
// https://github.com/intel/govmm/blob/master/qemu/qemu.go#L1677
target = "/dev/hugepages"
memoryBack = "memory-backend-file"
share = true
} else {
if q.config.EnableVhostUserStore {
// A vhost-user-blk/scsi process that can improve performance, such as SPDK,
// requires shared hugepage memory to work with Qemu.
return share, target, "", fmt.Errorf("Vhost-user-blk/scsi requires hugepage memory")
}
if q.config.SharedFS == config.VirtioFS || q.config.FileBackedMemRootDir != "" {
target = q.qemuConfig.Memory.Path
memoryBack = "memory-backend-file"
}
}
if q.qemuConfig.Knobs.MemShared {
share = true
}
return share, target, memoryBack, nil
}
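// setupVirtioMem hot-adds a virtio-mem-pci device backed by the configured
// memory backend, sized to the host memory not already assigned to the VM.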
func (q *qemu) setupVirtioMem() error {
maxMem, err := q.hostMemMB()
if err != nil {
return err
}
// 1024 is size for nvdimm
sizeMB := int(maxMem) - int(q.config.MemorySize)
share, target, memoryBack, err := q.getMemArgs()
if err != nil {
return err
}
err = q.qmpSetup()
if err != nil {
return err
}
err = q.qmpMonitorCh.qmp.ExecMemdevAdd(q.qmpMonitorCh.ctx, memoryBack, "virtiomem", target, sizeMB, share, "virtio-mem-pci", "virtiomem0")
if err == nil {
q.config.VirtioMem = true
q.Logger().Infof("Set up %dMB virtio-mem-pci successfully", sizeMB)
} else {
help := ""
if strings.Contains(err.Error(), "Cannot allocate memory") {
help = ". Please use the command \"echo 1 > /proc/sys/vm/overcommit_memory\" to handle it"
}
err = fmt.Errorf("Failed to add %dMB virtio-mem-pci: %s%s", sizeMB, err.Error(), help)
}
return err
}
// startSandbox will start the Sandbox's VM.
func (q *qemu) startSandbox(timeout int) error {
span, _ := q.trace("startSandbox")
defer span.Finish()
if q.config.Debug {
params := q.arch.kernelParameters(q.config.Debug)
strParams := SerializeParams(params, "=")
formatted := strings.Join(strParams, " ")
// The name of this field matches a similar one generated by
// the runtime and allows users to identify which parameters
// are set here, which come from the runtime and which are set
// by the user.
q.Logger().WithField("default-kernel-parameters", formatted).Debug()
}
defer func() {
for _, fd := range q.fds {
if err := fd.Close(); err != nil {
q.Logger().WithError(err).Error("After launching Qemu")
}
}
q.fds = []*os.File{}
}()
vmPath := filepath.Join(q.store.RunVMStoragePath(), q.id)
err := os.MkdirAll(vmPath, DirMode)
if err != nil {
return err
}
// append logfile only on debug
if q.config.Debug {
q.qemuConfig.LogFile = filepath.Join(vmPath, "qemu.log")
}
defer func() {
if err != nil {
if err := os.RemoveAll(vmPath); err != nil {
q.Logger().WithError(err).Error("Fail to clean up vm directory")
}
}
}()
// This needs to be done as late as possible, just before launching
// virtiofsd. Processes executed by kata-runtime after this call run with
// the SELinux label. If these processes require privileges, we do
// not want to run them under confinement.
if err := label.SetProcessLabel(q.config.SELinuxProcessLabel); err != nil {
return err
}
defer label.SetProcessLabel("")
if q.config.SharedFS == config.VirtioFS {
err = q.setupVirtiofsd()
if err != nil {
return err
}
}
var strErr string
strErr, err = govmmQemu.LaunchQemu(q.qemuConfig, newQMPLogger())
if err != nil {
if q.config.Debug && q.qemuConfig.LogFile != "" {
b, err := ioutil.ReadFile(q.qemuConfig.LogFile)
if err == nil {
strErr += string(b)
}
}
q.Logger().WithError(err).Errorf("failed to launch qemu: %s", strErr)
return fmt.Errorf("failed to launch qemu: %s, error messages from qemu log: %s", err, strErr)
}
err = q.waitSandbox(timeout)
if err != nil {
return err
}
if q.config.BootFromTemplate {
if err = q.bootFromTemplate(); err != nil {
return err
}
}
if q.config.VirtioMem {
err = q.setupVirtioMem()
}
return err
}
func (q *qemu) bootFromTemplate() error {
err := q.qmpSetup()
if err != nil {
return err
}
defer q.qmpShutdown()
err = q.arch.setIgnoreSharedMemoryMigrationCaps(q.qmpMonitorCh.ctx, q.qmpMonitorCh.qmp)
if err != nil {
q.Logger().WithError(err).Error("set migration ignore shared memory")
return err
}
uri := fmt.Sprintf("exec:cat %s", q.config.DevicesStatePath)
err = q.qmpMonitorCh.qmp.ExecuteMigrationIncoming(q.qmpMonitorCh.ctx, uri)
if err != nil {
return err
}
return q.waitMigration()
}
// waitSandbox will wait for the Sandbox's VM to be up and running.
func (q *qemu) waitSandbox(timeout int) error {
span, _ := q.trace("waitSandbox")
defer span.Finish()
if timeout < 0 {
return fmt.Errorf("Invalid timeout %ds", timeout)
}
cfg := govmmQemu.QMPConfig{Logger: newQMPLogger()}
var qmp *govmmQemu.QMP
var disconnectCh chan struct{}
var ver *govmmQemu.QMPVersion
var err error
// clear any possible old state before trying to connect again.
q.qmpShutdown()
timeStart := time.Now()
for {
disconnectCh = make(chan struct{})
qmp, ver, err = govmmQemu.QMPStart(q.qmpMonitorCh.ctx, q.qmpMonitorCh.path, cfg, disconnectCh)
if err == nil {
break
}
if int(time.Since(timeStart).Seconds()) > timeout {
return fmt.Errorf("Failed to connect to QEMU instance (timeout %ds): %v", timeout, err)
}
time.Sleep(time.Duration(50) * time.Millisecond)
}
q.qmpMonitorCh.qmp = qmp
q.qmpMonitorCh.disconn = disconnectCh
defer q.qmpShutdown()
qemuMajorVersion = ver.Major
qemuMinorVersion = ver.Minor
q.Logger().WithFields(logrus.Fields{
"qmp-major-version": ver.Major,
"qmp-minor-version": ver.Minor,
"qmp-micro-version": ver.Micro,
"qmp-capabilities": strings.Join(ver.Capabilities, ","),
}).Infof("QMP details")
if err = q.qmpMonitorCh.qmp.ExecuteQMPCapabilities(q.qmpMonitorCh.ctx); err != nil {
q.Logger().WithError(err).Error(qmpCapErrMsg)
return err
}
return nil
}
// stopSandbox will stop the Sandbox's VM.
func (q *qemu) stopSandbox() error {
span, _ := q.trace("stopSandbox")
defer span.Finish()
q.Logger().Info("Stopping Sandbox")
if q.stopped {
q.Logger().Info("Already stopped")
return nil
}
defer func() {
q.cleanupVM()
q.stopped = true
}()
if q.config.Debug && q.qemuConfig.LogFile != "" {
f, err := os.OpenFile(q.qemuConfig.LogFile, os.O_RDONLY, 0)
if err == nil {
scanner := bufio.NewScanner(f)
for scanner.Scan() {
q.Logger().Debug(scanner.Text())
}
if err := scanner.Err(); err != nil {
q.Logger().WithError(err).Debug("read qemu log failed")
}
}
}
err := q.qmpSetup()
if err != nil {
return err
}
err = q.qmpMonitorCh.qmp.ExecuteQuit(q.qmpMonitorCh.ctx)
if err != nil {
q.Logger().WithError(err).Error("Fail to execute qmp QUIT")
return err
}
return nil
}
func (q *qemu) cleanupVM() error {
// cleanup vm path
dir := filepath.Join(q.store.RunVMStoragePath(), q.id)
// If it's a symlink, remove both dir and the target.
// This can happen when vm template links a sandbox to a vm.
link, err := filepath.EvalSymlinks(dir)
if err != nil {
// Well, it's just cleanup failure. Let's ignore it.
q.Logger().WithError(err).WithField("dir", dir).Warn("failed to resolve vm path")
}
q.Logger().WithField("link", link).WithField("dir", dir).Infof("cleanup vm path")
if err := os.RemoveAll(dir); err != nil {
q.Logger().WithError(err).Warnf("failed to remove vm path %s", dir)
}
if link != dir && link != "" {
if err := os.RemoveAll(link); err != nil {
q.Logger().WithError(err).WithField("link", link).Warn("failed to remove resolved vm path")
}
}
if q.config.VMid != "" {
dir = filepath.Join(q.store.RunStoragePath(), q.config.VMid)
if err := os.RemoveAll(dir); err != nil {
q.Logger().WithError(err).WithField("path", dir).Warnf("failed to remove vm path")
}
}
return nil
}
func (q *qemu) togglePauseSandbox(pause bool) error {
span, _ := q.trace("togglePauseSandbox")
defer span.Finish()
err := q.qmpSetup()
if err != nil {
return err
}
if pause {
err = q.qmpMonitorCh.qmp.ExecuteStop(q.qmpMonitorCh.ctx)
} else {
err = q.qmpMonitorCh.qmp.ExecuteCont(q.qmpMonitorCh.ctx)
}
if err != nil {
return err
}
return nil
}
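// qmpSetup lazily connects to the QMP socket and negotiates capabilities;
// it is a no-op if a QMP connection is already established.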
func (q *qemu) qmpSetup() error {
q.qmpMonitorCh.Lock()
defer q.qmpMonitorCh.Unlock()
if q.qmpMonitorCh.qmp != nil {
return nil
}
cfg := govmmQemu.QMPConfig{Logger: newQMPLogger()}
// Auto-closed by QMPStart().
disconnectCh := make(chan struct{})
qmp, _, err := govmmQemu.QMPStart(q.qmpMonitorCh.ctx, q.qmpMonitorCh.path, cfg, disconnectCh)
if err != nil {
q.Logger().WithError(err).Error("Failed to connect to QEMU instance")
return err
}
err = qmp.ExecuteQMPCapabilities(q.qmpMonitorCh.ctx)
if err != nil {
qmp.Shutdown()
q.Logger().WithError(err).Error(qmpCapErrMsg)
return err
}
q.qmpMonitorCh.qmp = qmp
q.qmpMonitorCh.disconn = disconnectCh
return nil
}
func (q *qemu) qmpShutdown() {
q.qmpMonitorCh.Lock()
defer q.qmpMonitorCh.Unlock()
if q.qmpMonitorCh.qmp != nil {
q.qmpMonitorCh.qmp.Shutdown()
// wait on disconnected channel to be sure that the qmp channel has
// been closed cleanly.
<-q.qmpMonitorCh.disconn
q.qmpMonitorCh.qmp = nil
q.qmpMonitorCh.disconn = nil
}
}
func (q *qemu) hotplugAddBlockDevice(drive *config.BlockDrive, op operation, devID string) (err error) {
// drive can be a pmem device, in which case it's used as the backing file for an nvdimm device
if q.config.BlockDeviceDriver == config.Nvdimm || drive.Pmem {
var blocksize int64
file, err := os.Open(drive.File)
if err != nil {
return err
}
defer file.Close()
st, err := file.Stat()
if err != nil {
return fmt.Errorf("failed to get information from nvdimm device %v: %v", drive.File, err)
}
// regular files do not support the BLKGETSIZE64 ioctl
if st.Mode().IsRegular() {
blocksize = st.Size()
} else if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, file.Fd(), unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&blocksize))); err != 0 {
return err
}
if err = q.qmpMonitorCh.qmp.ExecuteNVDIMMDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, drive.File, blocksize, &drive.Pmem); err != nil {
q.Logger().WithError(err).Errorf("Failed to add NVDIMM device %s", drive.File)
return err
}
drive.NvdimmID = strconv.Itoa(q.nvdimmCount)
q.nvdimmCount++
return nil
}
if q.config.BlockDeviceCacheSet {
err = q.qmpMonitorCh.qmp.ExecuteBlockdevAddWithCache(q.qmpMonitorCh.ctx, drive.File, drive.ID, q.config.BlockDeviceCacheDirect, q.config.BlockDeviceCacheNoflush)
} else {
err = q.qmpMonitorCh.qmp.ExecuteBlockdevAdd(q.qmpMonitorCh.ctx, drive.File, drive.ID)
}
if err != nil {
return err
}
defer func() {
if err != nil {
q.qmpMonitorCh.qmp.ExecuteBlockdevDel(q.qmpMonitorCh.ctx, drive.ID)
}
}()
switch {
case q.config.BlockDeviceDriver == config.VirtioBlockCCW:
driver := "virtio-blk-ccw"
addr, bridge, err := q.arch.addDeviceToBridge(drive.ID, types.CCW)
if err != nil {
return err
}
var devNoHotplug string
devNoHotplug, err = bridge.AddressFormatCCW(addr)
if err != nil {
return err
}
drive.DevNo, err = bridge.AddressFormatCCWForVirtServer(addr)
if err != nil {
return err
}
if err = q.qmpMonitorCh.qmp.ExecuteDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, devNoHotplug, "", true, false); err != nil {
return err
}
case q.config.BlockDeviceDriver == config.VirtioBlock:
driver := "virtio-blk-pci"
addr, bridge, err := q.arch.addDeviceToBridge(drive.ID, types.PCI)
if err != nil {
return err
}
defer func() {
if err != nil {
q.arch.removeDeviceFromBridge(drive.ID)
}
}()
// PCI address is in the format bridge-addr/device-addr, e.g. "03/02"
drive.PCIAddr = fmt.Sprintf("%02x", bridge.Addr) + "/" + addr
if err = q.qmpMonitorCh.qmp.ExecutePCIDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, addr, bridge.ID, romFile, 0, true, defaultDisableModern); err != nil {
return err
}
case q.config.BlockDeviceDriver == config.VirtioSCSI:
driver := "scsi-hd"
// Bus exposed by the SCSI Controller
bus := scsiControllerID + ".0"
// Get SCSI-id and LUN based on the order of attaching drives.
scsiID, lun, err := utils.GetSCSIIdLun(drive.Index)
if err != nil {
return err
}
if err = q.qmpMonitorCh.qmp.ExecuteSCSIDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, bus, romFile, scsiID, lun, true, defaultDisableModern); err != nil {
return err
}
default:
return fmt.Errorf("Block device %s not recognized", q.config.BlockDeviceDriver)
}
return nil
}
func (q *qemu) hotplugAddVhostUserBlkDevice(vAttr *config.VhostUserDeviceAttrs, op operation, devID string) (err error) {
err = q.qmpMonitorCh.qmp.ExecuteCharDevUnixSocketAdd(q.qmpMonitorCh.ctx, vAttr.DevID, vAttr.SocketPath, false, false)
if err != nil {
return err
}
defer func() {
if err != nil {
q.qmpMonitorCh.qmp.ExecuteChardevDel(q.qmpMonitorCh.ctx, vAttr.DevID)
}
}()
driver := "vhost-user-blk-pci"
addr, bridge, err := q.arch.addDeviceToBridge(vAttr.DevID, types.PCI)
if err != nil {
return err
}
defer func() {
if err != nil {
q.arch.removeDeviceFromBridge(vAttr.DevID)
}
}()
// PCI address is in the format bridge-addr/device-addr, e.g. "03/02"
vAttr.PCIAddr = fmt.Sprintf("%02x", bridge.Addr) + "/" + addr
if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridge.ID); err != nil {
return err
}
return nil
}
func (q *qemu) hotplugBlockDevice(drive *config.BlockDrive, op operation) error {
err := q.qmpSetup()
if err != nil {
return err
}
devID := "virtio-" + drive.ID
if op == addDevice {
err = q.hotplugAddBlockDevice(drive, op, devID)
} else {
if q.config.BlockDeviceDriver == config.VirtioBlock {
if err := q.arch.removeDeviceFromBridge(drive.ID); err != nil {
return err
}
}
if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil {
return err
}
if err := q.qmpMonitorCh.qmp.ExecuteBlockdevDel(q.qmpMonitorCh.ctx, drive.ID); err != nil {
return err
}
}
return err
}
func (q *qemu) hotplugVhostUserDevice(vAttr *config.VhostUserDeviceAttrs, op operation) error {
err := q.qmpSetup()
if err != nil {
return err
}
devID := "virtio-" + vAttr.DevID
if op == addDevice {
switch vAttr.Type {
case config.VhostUserBlk:
return q.hotplugAddVhostUserBlkDevice(vAttr, op, devID)
default:
return fmt.Errorf("Incorrect vhost-user device type found")
}
} else {
if err := q.arch.removeDeviceFromBridge(vAttr.DevID); err != nil {
return err
}
if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil {
return err
}
if err := q.qmpMonitorCh.qmp.ExecuteChardevDel(q.qmpMonitorCh.ctx, vAttr.DevID); err != nil {
return err
}
}
return nil
}
func (q *qemu) hotplugVFIODevice(device *config.VFIODev, op operation) (err error) {
err = q.qmpSetup()
if err != nil {
return err
}
devID := device.ID
machineType := q.hypervisorConfig().HypervisorMachineType
if op == addDevice {
buf, _ := json.Marshal(device)
q.Logger().WithFields(logrus.Fields{
"machine-type": machineType,
"hotplug-vfio-on-root-bus": q.state.HotplugVFIOOnRootBus,
"pcie-root-port": q.state.PCIeRootPort,
"device-info": string(buf),
}).Info("Start hot-plug VFIO device")
// In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
// for the pc machine type instead of on a bridge. This is useful for devices that
// require a large PCI BAR, which is currently a limitation with PCI bridges.
if q.state.HotplugVFIOOnRootBus {
// In case MachineType is q35, a PCIe device is hotplugged on a PCIe Root Port.
switch machineType {
case QemuQ35:
if device.IsPCIe && q.state.PCIeRootPort <= 0 {
q.Logger().WithField("dev-id", device.ID).Warn("VFIO device is a PCIe device. It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35")
device.Bus = ""
}
default:
device.Bus = ""
}
switch device.Type {
case config.VFIODeviceNormalType:
return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, device.Bus, romFile)
case config.VFIODeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, "", device.Bus, romFile)
default:
return fmt.Errorf("Incorrect VFIO device type found")
}
}
addr, bridge, err := q.arch.addDeviceToBridge(devID, types.PCI)
if err != nil {
return err
}
defer func() {
if err != nil {
q.arch.removeDeviceFromBridge(devID)
}
}()
switch device.Type {
case config.VFIODeviceNormalType:
return q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, addr, bridge.ID, romFile)
case config.VFIODeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, addr, bridge.ID, romFile)
default:
return fmt.Errorf("Incorrect VFIO device type found")
}
} else {
q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device")
if !q.state.HotplugVFIOOnRootBus {
if err := q.arch.removeDeviceFromBridge(devID); err != nil {
return err
}
}
if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil {
return err
}
}
return nil
}
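// hotAddNetDevice passes the tap and vhost fds to QEMU via the QMP getfd
// command, then creates a tap netdev that refers to those fds by name.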
func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error {
var (
VMFdNames []string
VhostFdNames []string
)
for i, VMFd := range VMFds {
fdName := fmt.Sprintf("fd%d", i)
if err := q.qmpMonitorCh.qmp.ExecuteGetFD(q.qmpMonitorCh.ctx, fdName, VMFd); err != nil {
return err
}
VMFdNames = append(VMFdNames, fdName)
}
for i, VhostFd := range VhostFds {
fdName := fmt.Sprintf("vhostfd%d", i)
if err := q.qmpMonitorCh.qmp.ExecuteGetFD(q.qmpMonitorCh.ctx, fdName, VhostFd); err != nil {
return err
}
VhostFd.Close()
VhostFdNames = append(VhostFdNames, fdName)
}
return q.qmpMonitorCh.qmp.ExecuteNetdevAddByFds(q.qmpMonitorCh.ctx, "tap", name, VMFdNames, VhostFdNames)
}
func (q *qemu) hotplugNetDevice(endpoint Endpoint, op operation) (err error) {
err = q.qmpSetup()
if err != nil {
return err
}
var tap TapInterface
switch endpoint.Type() {
case VethEndpointType:
drive := endpoint.(*VethEndpoint)
tap = drive.NetPair.TapInterface
case TapEndpointType:
drive := endpoint.(*TapEndpoint)
tap = drive.TapInterface
default:
return fmt.Errorf("this endpoint is not supported")
}
devID := "virtio-" + tap.ID
if op == addDevice {
if err = q.hotAddNetDevice(tap.Name, endpoint.HardwareAddr(), tap.VMFds, tap.VhostFds); err != nil {
return err
}
defer func() {
if err != nil {
q.qmpMonitorCh.qmp.ExecuteNetdevDel(q.qmpMonitorCh.ctx, tap.Name)
}
}()
addr, bridge, err := q.arch.addDeviceToBridge(tap.ID, types.PCI)
if err != nil {
return err
}
defer func() {
if err != nil {
q.arch.removeDeviceFromBridge(tap.ID)
}
}()
pciAddr := fmt.Sprintf("%02x/%s", bridge.Addr, addr)
endpoint.SetPciAddr(pciAddr)
var machine govmmQemu.Machine
machine, err = q.getQemuMachine()
if err != nil {
return err
}
if machine.Type == QemuCCWVirtio {
devNoHotplug := fmt.Sprintf("fe.%x.%x", bridge.Addr, addr)
return q.qmpMonitorCh.qmp.ExecuteNetCCWDeviceAdd(q.qmpMonitorCh.ctx, tap.Name, devID, endpoint.HardwareAddr(), devNoHotplug, int(q.config.NumVCPUs))
}
return q.qmpMonitorCh.qmp.ExecuteNetPCIDeviceAdd(q.qmpMonitorCh.ctx, tap.Name, devID, endpoint.HardwareAddr(), addr, bridge.ID, romFile, int(q.config.NumVCPUs), defaultDisableModern)
}
if err := q.arch.removeDeviceFromBridge(tap.ID); err != nil {
return err
}
if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil {
return err
}
if err := q.qmpMonitorCh.qmp.ExecuteNetdevDel(q.qmpMonitorCh.ctx, tap.Name); err != nil {
return err
}
return nil
}
func (q *qemu) hotplugDevice(devInfo interface{}, devType deviceType, op operation) (interface{}, error) {
switch devType {
case blockDev:
drive := devInfo.(*config.BlockDrive)
return nil, q.hotplugBlockDevice(drive, op)
case cpuDev:
vcpus := devInfo.(uint32)
return q.hotplugCPUs(vcpus, op)
case vfioDev:
device := devInfo.(*config.VFIODev)
return nil, q.hotplugVFIODevice(device, op)
case memoryDev:
memdev := devInfo.(*memoryDevice)
return q.hotplugMemory(memdev, op)
case netDev:
device := devInfo.(Endpoint)
return nil, q.hotplugNetDevice(device, op)
case vhostuserDev:
vAttr := devInfo.(*config.VhostUserDeviceAttrs)
return nil, q.hotplugVhostUserDevice(vAttr, op)
default:
return nil, fmt.Errorf("cannot hotplug device: unsupported device type '%v'", devType)
}
}
func (q *qemu) hotplugAddDevice(devInfo interface{}, devType deviceType) (interface{}, error) {
span, _ := q.trace("hotplugAddDevice")
defer span.Finish()
data, err := q.hotplugDevice(devInfo, devType, addDevice)
if err != nil {
return data, err
}
return data, nil
}
func (q *qemu) hotplugRemoveDevice(devInfo interface{}, devType deviceType) (interface{}, error) {
span, _ := q.trace("hotplugRemoveDevice")
defer span.Finish()
data, err := q.hotplugDevice(devInfo, devType, removeDevice)
if err != nil {
return data, err
}
return data, nil
}
func (q *qemu) hotplugCPUs(vcpus uint32, op operation) (uint32, error) {
if vcpus == 0 {
q.Logger().Warnf("cannot hotplug 0 vCPUs")
return 0, nil
}
err := q.qmpSetup()
if err != nil {
return 0, err
}
if op == addDevice {
return q.hotplugAddCPUs(vcpus)
}
return q.hotplugRemoveCPUs(vcpus)
}
// try to hot add a number of vCPUs; returns the number of vCPUs added
func (q *qemu) hotplugAddCPUs(amount uint32) (uint32, error) {
currentVCPUs := q.qemuConfig.SMP.CPUs + uint32(len(q.state.HotpluggedVCPUs))
// Don't fail if the number of max vCPUs is exceeded: log a warning and hot add the vCPUs needed
// to reach the maximum number of vCPUs
if currentVCPUs+amount > q.config.DefaultMaxVCPUs {
q.Logger().Warnf("Cannot hotplug %d CPUs, currently this SB has %d CPUs and the maximum amount of CPUs is %d",
amount, currentVCPUs, q.config.DefaultMaxVCPUs)
amount = q.config.DefaultMaxVCPUs - currentVCPUs
}
if amount == 0 {
// Don't fail if no more vCPUs can be added, since cgroups still can be updated
q.Logger().Warnf("maximum number of vCPUs '%d' has been reached", q.config.DefaultMaxVCPUs)
return 0, nil
}
// get the list of hotpluggable CPUs
hotpluggableVCPUs, err := q.qmpMonitorCh.qmp.ExecuteQueryHotpluggableCPUs(q.qmpMonitorCh.ctx)
if err != nil {
return 0, fmt.Errorf("failed to query hotpluggable CPUs: %v", err)
}
machine := q.arch.machine()
var hotpluggedVCPUs uint32
for _, hc := range hotpluggableVCPUs {
// qom-path is the path to the CPU, non-empty means that this CPU is already in use
if hc.QOMPath != "" {
continue
}
// CPU type, e.g. host-x86_64-cpu
driver := hc.Type
cpuID := fmt.Sprintf("cpu-%d", len(q.state.HotpluggedVCPUs))
socketID := fmt.Sprintf("%d", hc.Properties.Socket)
dieID := fmt.Sprintf("%d", hc.Properties.Die)
coreID := fmt.Sprintf("%d", hc.Properties.Core)
threadID := fmt.Sprintf("%d", hc.Properties.Thread)
// If the CPU type is IBM pSeries or Z, we do not set socketID, threadID or dieID
if machine.Type == "pseries" || machine.Type == "s390-ccw-virtio" {
socketID = ""
threadID = ""
dieID = ""
}
if err := q.qmpMonitorCh.qmp.ExecuteCPUDeviceAdd(q.qmpMonitorCh.ctx, driver, cpuID, socketID, dieID, coreID, threadID, romFile); err != nil {
// don't fail, let's try with other CPU
continue
}
// a new vCPU was added, update list of hotplugged vCPUs and check if all vCPUs were added
q.state.HotpluggedVCPUs = append(q.state.HotpluggedVCPUs, CPUDevice{cpuID})
hotpluggedVCPUs++
if hotpluggedVCPUs == amount {
// All vCPUs were hotplugged
return amount, nil
}
}
return hotpluggedVCPUs, fmt.Errorf("failed to hot add vCPUs: only %d vCPUs of %d were added", hotpluggedVCPUs, amount)
}
// try to hot remove a number of vCPUs; returns the number of vCPUs removed
func (q *qemu) hotplugRemoveCPUs(amount uint32) (uint32, error) {
hotpluggedVCPUs := uint32(len(q.state.HotpluggedVCPUs))
// we can only remove hotplugged vCPUs
if amount > hotpluggedVCPUs {
return 0, fmt.Errorf("Unable to remove %d CPUs, currently there are only %d hotplugged CPUs", amount, hotpluggedVCPUs)
}
for i := uint32(0); i < amount; i++ {
// get the last vCPU and try to remove it
cpu := q.state.HotpluggedVCPUs[len(q.state.HotpluggedVCPUs)-1]
if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, cpu.ID); err != nil {
return i, fmt.Errorf("failed to hotunplug CPUs, only %d CPUs were hotunplugged: %v", i, err)
}
// remove the hot-unplugged vCPU from the list
q.state.HotpluggedVCPUs = q.state.HotpluggedVCPUs[:len(q.state.HotpluggedVCPUs)-1]
}
return amount, nil
}
func (q *qemu) hotplugMemory(memDev *memoryDevice, op operation) (int, error) {
if !q.arch.supportGuestMemoryHotplug() {
return 0, fmt.Errorf("guest memory hotplug not supported")
}
if memDev.sizeMB < 0 {
return 0, fmt.Errorf("cannot hotplug negative size (%d) memory", memDev.sizeMB)
}
memLog := q.Logger().WithField("hotplug", "memory")
memLog.WithField("hotplug-memory-mb", memDev.sizeMB).Debug("requested memory hotplug")
err := q.qmpSetup()
if err != nil {
return 0, err
}
currentMemory := int(q.config.MemorySize) + q.state.HotpluggedMemory
if memDev.sizeMB == 0 {
memLog.Debug("hotplug is not required")
return 0, nil
}
switch op {
case removeDevice:
memLog.WithField("operation", "remove").Debugf("Requested to remove memory: %d MB", memDev.sizeMB)
// Don't fail but warn that this is not supported.
memLog.Warn("hot-remove VM memory not supported")
return 0, nil
case addDevice:
memLog.WithField("operation", "add").Debugf("Requested to add memory: %d MB", memDev.sizeMB)
maxMem, err := q.hostMemMB()
if err != nil {
return 0, err
}
// Don't exceed the maximum amount of memory
if currentMemory+memDev.sizeMB > int(maxMem) {
// Fixme: return a typed error
return 0, fmt.Errorf("Unable to hotplug %d MiB memory, the SB has %d MiB and the maximum amount is %d MiB",
memDev.sizeMB, currentMemory, maxMem)
}
memoryAdded, err := q.hotplugAddMemory(memDev)
if err != nil {
return memoryAdded, err
}
return memoryAdded, nil
default:
return 0, fmt.Errorf("invalid operation %v", op)
}
}
func (q *qemu) hotplugAddMemory(memDev *memoryDevice) (int, error) {
memoryDevices, err := q.qmpMonitorCh.qmp.ExecQueryMemoryDevices(q.qmpMonitorCh.ctx)
if err != nil {
return 0, fmt.Errorf("failed to query memory devices: %v", err)
}
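// reuse the next DIMM slot after the highest one currently in use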
if len(memoryDevices) != 0 {
maxSlot := -1
for _, device := range memoryDevices {
if maxSlot < device.Data.Slot {
maxSlot = device.Data.Slot
}
}
memDev.slot = maxSlot + 1
}
share, target, memoryBack, err := q.getMemArgs()
if err != nil {
return 0, err
}
err = q.qmpMonitorCh.qmp.ExecHotplugMemory(q.qmpMonitorCh.ctx, memoryBack, "mem"+strconv.Itoa(memDev.slot), target, memDev.sizeMB, share)
if err != nil {
q.Logger().WithError(err).Error("hotplug memory")
return 0, err
}
// if the guest kernel only supports memory hotplug via the probe interface, we need to get the address of the hot-added memory device
if memDev.probe {
memoryDevices, err := q.qmpMonitorCh.qmp.ExecQueryMemoryDevices(q.qmpMonitorCh.ctx)
if err != nil {
return 0, fmt.Errorf("failed to query memory devices: %v", err)
}
if len(memoryDevices) != 0 {
q.Logger().WithField("addr", fmt.Sprintf("0x%x", memoryDevices[len(memoryDevices)-1].Data.Addr)).Debug("recently hot-added memory device")
memDev.addr = memoryDevices[len(memoryDevices)-1].Data.Addr
} else {
return 0, fmt.Errorf("failed to probe the address of the recently hot-added memory device: no device exists")
}
}
q.state.HotpluggedMemory += memDev.sizeMB
return memDev.sizeMB, nil
}
func (q *qemu) pauseSandbox() error {
span, _ := q.trace("pauseSandbox")
defer span.Finish()
return q.togglePauseSandbox(true)
}
func (q *qemu) resumeSandbox() error {
span, _ := q.trace("resumeSandbox")
defer span.Finish()
return q.togglePauseSandbox(false)
}
// addDevice will add extra devices to Qemu command line.
func (q *qemu) addDevice(devInfo interface{}, devType deviceType) error {
var err error
span, _ := q.trace("addDevice")
defer span.Finish()
switch v := devInfo.(type) {
case types.Volume:
if q.config.SharedFS == config.VirtioFS {
q.Logger().WithField("volume-type", "virtio-fs").Info("adding volume")
var randBytes []byte
randBytes, err = utils.GenerateRandomBytes(8)
if err != nil {
return err
}
id := hex.EncodeToString(randBytes)
var sockPath string
sockPath, err = q.vhostFSSocketPath(q.id)
if err != nil {
return err
}
vhostDev := config.VhostUserDeviceAttrs{
Tag: v.MountTag,
Type: config.VhostUserFS,
CacheSize: q.config.VirtioFSCacheSize,
Cache: q.config.VirtioFSCache,
}
vhostDev.SocketPath = sockPath
vhostDev.DevID = id
q.qemuConfig.Devices, err = q.arch.appendVhostUserDevice(q.qemuConfig.Devices, vhostDev)
} else {
q.Logger().WithField("volume-type", "virtio-9p").Info("adding volume")
q.qemuConfig.Devices, err = q.arch.append9PVolume(q.qemuConfig.Devices, v)
}
case types.Socket:
q.qemuConfig.Devices = q.arch.appendSocket(q.qemuConfig.Devices, v)
case types.VSock:
q.fds = append(q.fds, v.VhostFd)
q.qemuConfig.Devices, err = q.arch.appendVSock(q.qemuConfig.Devices, v)
case Endpoint:
q.qemuConfig.Devices, err = q.arch.appendNetwork(q.qemuConfig.Devices, v)
case config.BlockDrive:
q.qemuConfig.Devices, err = q.arch.appendBlockDevice(q.qemuConfig.Devices, v)
case config.VhostUserDeviceAttrs:
q.qemuConfig.Devices, err = q.arch.appendVhostUserDevice(q.qemuConfig.Devices, v)
case config.VFIODev:
q.qemuConfig.Devices = q.arch.appendVFIODevice(q.qemuConfig.Devices, v)
default:
break
}
return err
}
// getSandboxConsole builds the path of the console where we can read
// logs coming from the sandbox.
func (q *qemu) getSandboxConsole(id string) (string, error) {
span, _ := q.trace("getSandboxConsole")
defer span.Finish()
return utils.BuildSocketPath(q.store.RunVMStoragePath(), id, consoleSocket)
}
func (q *qemu) saveSandbox() error {
q.Logger().Info("save sandbox")
err := q.qmpSetup()
if err != nil {
return err
}
// BootToBeTemplate sets the VM to be a template that other VMs can clone from. We would want to
// bypass shared memory when saving the VM to a local file through migration exec.
if q.config.BootToBeTemplate {
err := q.arch.setIgnoreSharedMemoryMigrationCaps(q.qmpMonitorCh.ctx, q.qmpMonitorCh.qmp)
if err != nil {
q.Logger().WithError(err).Error("set migration ignore shared memory")
return err
}
}
err = q.qmpMonitorCh.qmp.ExecSetMigrateArguments(q.qmpMonitorCh.ctx, fmt.Sprintf("%s>%s", qmpExecCatCmd, q.config.DevicesStatePath))
if err != nil {
q.Logger().WithError(err).Error("exec migration")
return err
}
return q.waitMigration()
}
func (q *qemu) waitMigration() error {
t := time.NewTimer(qmpMigrationWaitTimeout)
defer t.Stop()
for {
status, err := q.qmpMonitorCh.qmp.ExecuteQueryMigration(q.qmpMonitorCh.ctx)
if err != nil {
q.Logger().WithError(err).Error("failed to query migration status")
return err
}
if status.Status == "completed" {
break
}
select {
case <-t.C:
q.Logger().WithField("migration-status", status).Error("timeout waiting for qemu migration")
return fmt.Errorf("timed out after %d seconds waiting for qemu migration", qmpMigrationWaitTimeout)
default:
// migration in progress
q.Logger().WithField("migration-status", status).Debug("migration in progress")
time.Sleep(100 * time.Millisecond)
}
}
return nil
}
func (q *qemu) disconnect() {
span, _ := q.trace("disconnect")
defer span.Finish()
q.qmpShutdown()
}
// resizeMemory receives a request to update the VM memory to reqMemMB.
// Memory updates are managed with two approaches:
// - To add memory to the VM, we hotplug memory.
// - To remove memory from the VM / return memory to the host, we resize
//   the VM memory balloon.
// Memory unplug can be slow and cannot be guaranteed. Additionally, the
// unplug granularity is not small: the memory to remove has to be at
// least the size of one slot. A longer term solution is to evaluate
// solutions like virtio-mem.
func (q *qemu) resizeMemory(reqMemMB uint32, memoryBlockSizeMB uint32, probe bool) (uint32, memoryDevice, error) {
currentMemory := q.config.MemorySize + uint32(q.state.HotpluggedMemory)
err := q.qmpSetup()
if err != nil {
return 0, memoryDevice{}, err
}
var addMemDevice memoryDevice
if q.config.VirtioMem && currentMemory != reqMemMB {
q.Logger().WithField("hotplug", "memory").Debugf("resize memory from %dMB to %dMB", currentMemory, reqMemMB)
sizeByte := (reqMemMB - q.config.MemorySize) * 1024 * 1024
err = q.qmpMonitorCh.qmp.ExecQomSet(q.qmpMonitorCh.ctx, "virtiomem0", "requested-size", uint64(sizeByte))
if err != nil {
return 0, memoryDevice{}, err
}
q.state.HotpluggedMemory = int(sizeByte / 1024 / 1024)
return reqMemMB, memoryDevice{}, nil
}
switch {
case currentMemory < reqMemMB:
//hotplug
addMemMB := reqMemMB - currentMemory
memHotplugMB, err := calcHotplugMemMiBSize(addMemMB, memoryBlockSizeMB)
if err != nil {
return currentMemory, memoryDevice{}, err
}
addMemDevice.sizeMB = int(memHotplugMB)
addMemDevice.probe = probe
data, err := q.hotplugAddDevice(&addMemDevice, memoryDev)
if err != nil {
return currentMemory, addMemDevice, err
}
memoryAdded, ok := data.(int)
if !ok {
return currentMemory, addMemDevice, fmt.Errorf("Could not get the memory added, got %+v", data)
}
currentMemory += uint32(memoryAdded)
case currentMemory > reqMemMB:
//hotunplug
addMemMB := currentMemory - reqMemMB
memHotunplugMB, err := calcHotplugMemMiBSize(addMemMB, memoryBlockSizeMB)
if err != nil {
return currentMemory, memoryDevice{}, err
}
addMemDevice.sizeMB = int(memHotunplugMB)
addMemDevice.probe = probe
data, err := q.hotplugRemoveDevice(&addMemDevice, memoryDev)
if err != nil {
return currentMemory, addMemDevice, err
}
memoryRemoved, ok := data.(int)
if !ok {
return currentMemory, addMemDevice, fmt.Errorf("Could not get the memory removed, got %+v", data)
}
//FIXME: This checks that hotplugRemoveDevice reported 0, as memory hot-remove is not supported.
// In the future, if this is implemented, this validation should be removed.
if memoryRemoved != 0 {
return currentMemory, addMemDevice, fmt.Errorf("memory hot unplug is not supported, something went wrong")
}
currentMemory -= uint32(memoryRemoved)
}
// currentMemory is the updated memory of the VM; return it to the caller to
// allow verifying the current VM memory state.
return currentMemory, addMemDevice, nil
}
// genericAppendBridges appends to devices the given bridges
// nolint: unused, deadcode
func genericAppendBridges(devices []govmmQemu.Device, bridges []types.Bridge, machineType string) []govmmQemu.Device {
bus := defaultPCBridgeBus
switch machineType {
case QemuQ35, QemuVirt:
bus = defaultBridgeBus
}
for idx, b := range bridges {
t := govmmQemu.PCIBridge
if b.Type == types.PCIE {
t = govmmQemu.PCIEBridge
}
if b.Type == types.CCW {
continue
}
bridges[idx].Addr = bridgePCIStartAddr + idx
devices = append(devices,
govmmQemu.BridgeDevice{
Type: t,
Bus: bus,
ID: b.ID,
// Each bridge is required to be assigned a unique chassis id > 0
Chassis: idx + 1,
SHPC: true,
Addr: strconv.FormatInt(int64(bridges[idx].Addr), 10),
},
)
}
return devices
}
func genericBridges(number uint32, machineType string) []types.Bridge {
var bridges []types.Bridge
var bt types.Type
switch machineType {
case QemuQ35:
// currently only pci bridges are supported
// qemu-2.10 will introduce pcie bridges
fallthrough
case QemuPC:
bt = types.PCI
case QemuVirt:
bt = types.PCIE
case QemuPseries:
bt = types.PCI
case QemuCCWVirtio:
bt = types.CCW
default:
return nil
}
for i := uint32(0); i < number; i++ {
bridges = append(bridges, types.NewBridge(bt, fmt.Sprintf("%s-bridge-%d", bt, i), make(map[uint32]string), 0))
}
return bridges
}
// nolint: unused, deadcode
func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOffset uint32) govmmQemu.Memory {
// the image NVDIMM device needs 1024MB of memory space
// See https://github.com/clearcontainers/runtime/issues/380
memoryOffset += 1024
memMax := fmt.Sprintf("%dM", hostMemoryMb+uint64(memoryOffset))
mem := fmt.Sprintf("%dM", memoryMb)
memory := govmmQemu.Memory{
Size: mem,
Slots: slots,
MaxMem: memMax,
}
return memory
}
// genericAppendPCIeRootPort appends to devices the given pcie-root-port
func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string) []govmmQemu.Device {
var (
bus string
chassis string
multiFunction bool
addr string
)
switch machineType {
case QemuQ35:
bus = defaultBridgeBus
chassis = "0"
multiFunction = false
addr = "0"
default:
return devices
}
for i := uint32(0); i < number; i++ {
devices = append(devices,
govmmQemu.PCIeRootPortDevice{
ID: fmt.Sprintf("%s%d", pcieRootPortPrefix, i),
Bus: bus,
Chassis: chassis,
Slot: strconv.FormatUint(uint64(i), 10),
Multifunction: multiFunction,
Addr: addr,
},
)
}
return devices
}
func (q *qemu) getThreadIDs() (vcpuThreadIDs, error) {
span, _ := q.trace("getThreadIDs")
defer span.Finish()
tid := vcpuThreadIDs{}
err := q.qmpSetup()
if err != nil {
return tid, err
}
cpuInfos, err := q.qmpMonitorCh.qmp.ExecQueryCpus(q.qmpMonitorCh.ctx)
if err != nil {
q.Logger().WithError(err).Error("failed to query cpu infos")
return tid, err
}
tid.vcpus = make(map[int]int, len(cpuInfos))
for _, i := range cpuInfos {
if i.ThreadID > 0 {
tid.vcpus[i.CPU] = i.ThreadID
}
}
return tid, nil
}
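// calcHotplugMemMiBSize rounds mem up to a multiple of the memory section
// size, since memory can only be hotplugged in whole sections; a zero section
// size leaves the requested amount unchanged.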
func calcHotplugMemMiBSize(mem uint32, memorySectionSizeMB uint32) (uint32, error) {
if memorySectionSizeMB == 0 {
return mem, nil
}
// TODO: hot-added memory should be properly aligned to the memory section size. See https://github.com/kata-containers/kata-containers/src/runtime/pull/624#issuecomment-419656853
return uint32(math.Ceil(float64(mem)/float64(memorySectionSizeMB))) * memorySectionSizeMB, nil
}
func (q *qemu) resizeVCPUs(reqVCPUs uint32) (currentVCPUs uint32, newVCPUs uint32, err error) {
currentVCPUs = q.config.NumVCPUs + uint32(len(q.state.HotpluggedVCPUs))
newVCPUs = currentVCPUs
switch {
case currentVCPUs < reqVCPUs:
//hotplug
addCPUs := reqVCPUs - currentVCPUs
data, err := q.hotplugAddDevice(addCPUs, cpuDev)
if err != nil {
return currentVCPUs, newVCPUs, err
}
vCPUsAdded, ok := data.(uint32)
if !ok {
return currentVCPUs, newVCPUs, fmt.Errorf("Could not get the vCPUs added, got %+v", data)
}
newVCPUs += vCPUsAdded
case currentVCPUs > reqVCPUs:
//hotunplug
removeCPUs := currentVCPUs - reqVCPUs
data, err := q.hotplugRemoveDevice(removeCPUs, cpuDev)
if err != nil {
return currentVCPUs, newVCPUs, err
}
vCPUsRemoved, ok := data.(uint32)
if !ok {
return currentVCPUs, newVCPUs, fmt.Errorf("Could not get the vCPUs removed, got %+v", data)
}
newVCPUs -= vCPUsRemoved
}
return currentVCPUs, newVCPUs, nil
}
func (q *qemu) cleanup() error {
span, _ := q.trace("cleanup")
defer span.Finish()
for _, fd := range q.fds {
if err := fd.Close(); err != nil {
q.Logger().WithError(err).Warn("failed closing fd")
}
}
q.fds = []*os.File{}
return nil
}
func (q *qemu) getPids() []int {
data, err := ioutil.ReadFile(q.qemuConfig.PidFile)
if err != nil {
q.Logger().WithError(err).Error("Could not read qemu pid file")
return []int{0}
}
pid, err := strconv.Atoi(strings.Trim(string(data), "\n\t "))
if err != nil {
q.Logger().WithError(err).Error("Could not convert string to int")
return []int{0}
}
var pids []int
pids = append(pids, pid)
if q.state.VirtiofsdPid != 0 {
pids = append(pids, q.state.VirtiofsdPid)
}
return pids
}
type qemuGrpc struct {
ID string
QmpChannelpath string
State QemuState
NvdimmCount int
// Most members of q.qemuConfig exist only to generate
// q.qemuConfig.qemuParams, which is used by LaunchQemu; the exception is
// q.qemuConfig.SMP.
// So just transport q.qemuConfig.SMP from the VM Cache server to the runtime.
QemuSMP govmmQemu.SMP
}
func (q *qemu) fromGrpc(ctx context.Context, hypervisorConfig *HypervisorConfig, j []byte) error {
var qp qemuGrpc
err := json.Unmarshal(j, &qp)
if err != nil {
return err
}
q.id = qp.ID
q.config = *hypervisorConfig
q.qmpMonitorCh.ctx = ctx
q.qmpMonitorCh.path = qp.QmpChannelpath
q.qemuConfig.Ctx = ctx
q.state = qp.State
q.arch, err = newQemuArch(q.config)
if err != nil {
return err
}
q.ctx = ctx
q.nvdimmCount = qp.NvdimmCount
q.qemuConfig.SMP = qp.QemuSMP
q.arch.setBridges(q.state.Bridges)
return nil
}
func (q *qemu) toGrpc() ([]byte, error) {
q.qmpShutdown()
q.cleanup()
qp := qemuGrpc{
ID: q.id,
QmpChannelpath: q.qmpMonitorCh.path,
State: q.state,
NvdimmCount: q.nvdimmCount,
QemuSMP: q.qemuConfig.SMP,
}
return json.Marshal(&qp)
}
func (q *qemu) save() (s persistapi.HypervisorState) {
pids := q.getPids()
if len(pids) != 0 {
s.Pid = pids[0]
}
s.VirtiofsdPid = q.state.VirtiofsdPid
s.Type = string(QemuHypervisor)
s.UUID = q.state.UUID
s.HotpluggedMemory = q.state.HotpluggedMemory
s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus
s.PCIeRootPort = q.state.PCIeRootPort
for _, bridge := range q.arch.getBridges() {
s.Bridges = append(s.Bridges, persistapi.Bridge{
DeviceAddr: bridge.Devices,
Type: string(bridge.Type),
ID: bridge.ID,
Addr: bridge.Addr,
})
}
for _, cpu := range q.state.HotpluggedVCPUs {
s.HotpluggedVCPUs = append(s.HotpluggedVCPUs, persistapi.CPUDevice{
ID: cpu.ID,
})
}
return
}
func (q *qemu) load(s persistapi.HypervisorState) {
q.state.UUID = s.UUID
q.state.HotpluggedMemory = s.HotpluggedMemory
q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus
q.state.VirtiofsdPid = s.VirtiofsdPid
q.state.PCIeRootPort = s.PCIeRootPort
for _, bridge := range s.Bridges {
q.state.Bridges = append(q.state.Bridges, types.NewBridge(types.Type(bridge.Type), bridge.ID, bridge.DeviceAddr, bridge.Addr))
}
for _, cpu := range s.HotpluggedVCPUs {
q.state.HotpluggedVCPUs = append(q.state.HotpluggedVCPUs, CPUDevice{
ID: cpu.ID,
})
}
}
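// check queries the VM status over QMP and reports a guest failure when the
// VM is in the internal-error or guest-panicked state.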
func (q *qemu) check() error {
err := q.qmpSetup()
if err != nil {
return err
}
status, err := q.qmpMonitorCh.qmp.ExecuteQueryStatus(q.qmpMonitorCh.ctx)
if err != nil {
return err
}
if status.Status == "internal-error" || status.Status == "guest-panicked" {
return errors.Errorf("guest failure: %s", status.Status)
}
return nil
}
func (q *qemu) generateSocket(id string, useVsock bool) (interface{}, error) {
return generateVMSocket(id, useVsock, q.store.RunVMStoragePath())
}