mirror of
https://github.com/aljazceru/kata-containers.git
synced 2025-12-17 06:14:26 +01:00
QEMU for TDX 1.5 makes use of private memory map/unmap. Make changes to govmm to support this. Support for private backing fd for memory is added as knob to the qemu config. Userspace's map/unmap operations are done by fallocate() ioctl on the backing store fd. Reference: https://lore.kernel.org/linux-mm/20220519153713.819591-1-chao.p.peng@linux.intel.com/ Fixes: #7770 Signed-off-by: Archana Shinde <archana.m.shinde@intel.com> Signed-off-by: Fabiano Fidêncio <fabiano.fidencio@intel.com>
985 lines
29 KiB
Go
985 lines
29 KiB
Go
//go:build linux
|
|
|
|
// Copyright (c) 2018 Intel Corporation
|
|
//
|
|
// SPDX-License-Identifier: Apache-2.0
|
|
//
|
|
|
|
package virtcontainers
|
|
|
|
import (
|
|
"context"
|
|
"encoding/hex"
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"regexp"
|
|
"runtime"
|
|
"strings"
|
|
|
|
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
|
|
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci"
|
|
|
|
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
|
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
|
|
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
|
|
)
|
|
|
|
// A deeper PCIe topology than 5 is already not advisable just for the sake
|
|
// of having enough buffer we limit ourselves to 10 and exit if we reach
|
|
// the root bus
|
|
const maxPCIeTopoDepth = 10
|
|
|
|
type qemuArch interface {
|
|
// enableNestingChecks nesting checks will be honoured
|
|
enableNestingChecks()
|
|
|
|
// disableNestingChecks nesting checks will be ignored
|
|
disableNestingChecks()
|
|
|
|
// runNested indicates if the hypervisor runs in a nested environment
|
|
runNested() bool
|
|
|
|
// enableVhostNet vhost will be enabled
|
|
enableVhostNet()
|
|
|
|
// disableVhostNet vhost will be disabled
|
|
disableVhostNet()
|
|
|
|
// machine returns the machine type
|
|
machine() govmmQemu.Machine
|
|
|
|
// qemuPath returns the path to the QEMU binary
|
|
qemuPath() string
|
|
|
|
// kernelParameters returns the kernel parameters
|
|
// if debug is true then kernel debug parameters are included
|
|
kernelParameters(debug bool) []Param
|
|
|
|
//capabilities returns the capabilities supported by QEMU
|
|
capabilities(config HypervisorConfig) types.Capabilities
|
|
|
|
// bridges sets the number bridges for the machine type
|
|
bridges(number uint32)
|
|
|
|
// cpuTopology returns the CPU topology for the given amount of vcpus
|
|
cpuTopology(vcpus, maxvcpus uint32) govmmQemu.SMP
|
|
|
|
// cpuModel returns the CPU model for the machine type
|
|
cpuModel() string
|
|
|
|
// memoryTopology returns the memory topology using the given amount of memoryMb and hostMemoryMb
|
|
memoryTopology(memoryMb, hostMemoryMb uint64, slots uint8) govmmQemu.Memory
|
|
|
|
// protection returns platform protection
|
|
getProtection() guestProtection
|
|
|
|
// appendConsole appends a console to devices
|
|
appendConsole(ctx context.Context, devices []govmmQemu.Device, path string) ([]govmmQemu.Device, error)
|
|
|
|
// appendImage appends an image to devices
|
|
appendImage(ctx context.Context, devices []govmmQemu.Device, path string) ([]govmmQemu.Device, error)
|
|
|
|
// appendBlockImage appends an image as block device
|
|
appendBlockImage(ctx context.Context, devices []govmmQemu.Device, path string) ([]govmmQemu.Device, error)
|
|
|
|
// appendNvdimmImage appends an image as nvdimm device
|
|
appendNvdimmImage(devices []govmmQemu.Device, path string) ([]govmmQemu.Device, error)
|
|
|
|
// appendSCSIController appens a SCSI controller to devices
|
|
appendSCSIController(context context.Context, devices []govmmQemu.Device, enableIOThreads bool) ([]govmmQemu.Device, *govmmQemu.IOThread, error)
|
|
|
|
// appendBridges appends bridges to devices
|
|
appendBridges(devices []govmmQemu.Device) []govmmQemu.Device
|
|
|
|
// append9PVolume appends a 9P volume to devices
|
|
append9PVolume(ctx context.Context, devices []govmmQemu.Device, volume types.Volume) ([]govmmQemu.Device, error)
|
|
|
|
// appendSocket appends a socket to devices
|
|
appendSocket(devices []govmmQemu.Device, socket types.Socket) []govmmQemu.Device
|
|
|
|
// appendVSock appends a vsock PCI to devices
|
|
appendVSock(ctx context.Context, devices []govmmQemu.Device, vsock types.VSock) ([]govmmQemu.Device, error)
|
|
|
|
// appendNetwork appends a endpoint device to devices
|
|
appendNetwork(ctx context.Context, devices []govmmQemu.Device, endpoint Endpoint) ([]govmmQemu.Device, error)
|
|
|
|
// appendBlockDevice appends a block drive to devices
|
|
appendBlockDevice(ctx context.Context, devices []govmmQemu.Device, drive config.BlockDrive) ([]govmmQemu.Device, error)
|
|
|
|
// appendVhostUserDevice appends a vhost user device to devices
|
|
appendVhostUserDevice(ctx context.Context, devices []govmmQemu.Device, drive config.VhostUserDeviceAttrs) ([]govmmQemu.Device, error)
|
|
|
|
// appendVFIODevice appends a VFIO device to devices
|
|
appendVFIODevice(devices []govmmQemu.Device, vfioDevice config.VFIODev) []govmmQemu.Device
|
|
|
|
// appendRNGDevice appends a RNG device to devices
|
|
appendRNGDevice(ctx context.Context, devices []govmmQemu.Device, rngDevice config.RNGDev) ([]govmmQemu.Device, error)
|
|
|
|
// addDeviceToBridge adds devices to the bus
|
|
addDeviceToBridge(ctx context.Context, ID string, t types.Type) (string, types.Bridge, error)
|
|
|
|
// removeDeviceFromBridge removes devices to the bus
|
|
removeDeviceFromBridge(ID string) error
|
|
|
|
// getBridges grants access to Bridges
|
|
getBridges() []types.Bridge
|
|
|
|
// setBridges grants access to Bridges
|
|
setBridges(bridges []types.Bridge)
|
|
|
|
// addBridge adds a new Bridge to the list of Bridges
|
|
addBridge(types.Bridge)
|
|
|
|
// getPFlash() get pflash from configuration
|
|
getPFlash() ([]string, error)
|
|
|
|
// setPFlash() grants access to pflash
|
|
setPFlash([]string)
|
|
|
|
// handleImagePath handles the Hypervisor Config image path
|
|
handleImagePath(config HypervisorConfig) error
|
|
|
|
// supportGuestMemoryHotplug returns if the guest supports memory hotplug
|
|
supportGuestMemoryHotplug() bool
|
|
|
|
// setIgnoreSharedMemoryMigrationCaps set bypass-shared-memory capability for migration
|
|
setIgnoreSharedMemoryMigrationCaps(context.Context, *govmmQemu.QMP) error
|
|
|
|
// appendPCIeRootPortDevice appends a pcie-root-port device to pcie.0 bus
|
|
appendPCIeRootPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device
|
|
|
|
// appendPCIeSwitch appends a ioh3420 device to a pcie-root-port
|
|
appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device
|
|
|
|
// append vIOMMU device
|
|
appendIOMMU(devices []govmmQemu.Device) ([]govmmQemu.Device, error)
|
|
|
|
// append pvpanic device
|
|
appendPVPanicDevice(devices []govmmQemu.Device) ([]govmmQemu.Device, error)
|
|
|
|
// append protection device.
|
|
// This implementation is architecture specific, some archs may need
|
|
// a firmware, returns a string containing the path to the firmware that should
|
|
// be used with the -bios option, ommit -bios option if the path is empty.
|
|
appendProtectionDevice(devices []govmmQemu.Device, firmware, firmwareVolume string) ([]govmmQemu.Device, string, error)
|
|
|
|
// scans the PCIe space and returns the biggest BAR sizes for 32-bit
|
|
// and 64-bit addressable memory
|
|
getBARsMaxAddressableMemory() (uint64, uint64)
|
|
|
|
// Query QMP to find a device's PCI path given its QOM path or ID
|
|
qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error)
|
|
|
|
// Query QMP to find the PCI slot of a device, given its QOM path or ID
|
|
qomGetSlot(qomPath string, qmpCh *qmpChannel) (types.PciSlot, error)
|
|
}
|
|
|
|
type qemuArchBase struct {
|
|
qemuExePath string
|
|
qemuMachine govmmQemu.Machine
|
|
PFlash []string
|
|
kernelParamsNonDebug []Param
|
|
kernelParamsDebug []Param
|
|
kernelParams []Param
|
|
Bridges []types.Bridge
|
|
memoryOffset uint64
|
|
networkIndex int
|
|
// Exclude from lint checking for it is ultimately only used in architecture-specific code
|
|
protection guestProtection //nolint:structcheck
|
|
nestedRun bool
|
|
vhost bool
|
|
disableNvdimm bool
|
|
dax bool
|
|
legacySerial bool
|
|
}
|
|
|
|
const (
|
|
defaultCores uint32 = 1
|
|
defaultThreads uint32 = 1
|
|
defaultCPUModel = "host"
|
|
defaultBridgeBus = "pcie.0"
|
|
defaultPCBridgeBus = "pci.0"
|
|
maxDevIDSize = 31
|
|
maxPCIeRootPort = 16 // Limitation from QEMU
|
|
maxPCIeSwitchPort = 16 // Limitation from QEMU
|
|
)
|
|
|
|
// This is the PCI start address assigned to the first bridge that
|
|
// is added on the qemu command line. In case of x86_64, the first two PCI
|
|
// addresses (0 and 1) are used by the platform while in case of ARM, address
|
|
// 0 is reserved.
|
|
const bridgePCIStartAddr = 2
|
|
|
|
const (
|
|
// QemuQ35 is the QEMU Q35 machine type for amd64
|
|
QemuQ35 = "q35"
|
|
|
|
// QemuMicrovm is the QEMU microvm machine type for amd64
|
|
QemuMicrovm = "microvm"
|
|
|
|
// QemuVirt is the QEMU virt machine type for aarch64 or amd64
|
|
QemuVirt = "virt"
|
|
|
|
// QemuPseries is a QEMU virt machine type for ppc64le
|
|
QemuPseries = "pseries"
|
|
|
|
// QemuCCWVirtio is a QEMU virt machine type for for s390x
|
|
QemuCCWVirtio = "s390-ccw-virtio"
|
|
|
|
qmpCapMigrationIgnoreShared = "x-ignore-shared"
|
|
|
|
qemuNvdimmOption = "nvdimm=on"
|
|
)
|
|
|
|
// kernelParamsNonDebug is a list of the default kernel
|
|
// parameters that will be used in standard (non-debug) mode.
|
|
var kernelParamsNonDebug = []Param{
|
|
{"quiet", ""},
|
|
}
|
|
|
|
// kernelParamsSystemdNonDebug is a list of the default systemd related
|
|
// kernel parameters that will be used in standard (non-debug) mode.
|
|
var kernelParamsSystemdNonDebug = []Param{
|
|
{"systemd.show_status", "false"},
|
|
}
|
|
|
|
// kernelParamsDebug is a list of the default kernel
|
|
// parameters that will be used in debug mode (as much boot output as
|
|
// possible).
|
|
var kernelParamsDebug = []Param{
|
|
{"debug", ""},
|
|
}
|
|
|
|
// kernelParamsSystemdDebug is a list of the default systemd related kernel
|
|
// parameters that will be used in debug mode (as much boot output as
|
|
// possible).
|
|
var kernelParamsSystemdDebug = []Param{
|
|
{"systemd.show_status", "true"},
|
|
{"systemd.log_level", "debug"},
|
|
}
|
|
|
|
func (q *qemuArchBase) enableNestingChecks() {
|
|
q.nestedRun = true
|
|
}
|
|
|
|
func (q *qemuArchBase) disableNestingChecks() {
|
|
q.nestedRun = false
|
|
}
|
|
|
|
func (q *qemuArchBase) runNested() bool {
|
|
return q.nestedRun
|
|
}
|
|
|
|
func (q *qemuArchBase) enableVhostNet() {
|
|
q.vhost = true
|
|
}
|
|
|
|
func (q *qemuArchBase) disableVhostNet() {
|
|
q.vhost = false
|
|
}
|
|
|
|
func (q *qemuArchBase) machine() govmmQemu.Machine {
|
|
return q.qemuMachine
|
|
}
|
|
|
|
func (q *qemuArchBase) getProtection() guestProtection {
|
|
return q.protection
|
|
}
|
|
|
|
func (q *qemuArchBase) qemuPath() string {
|
|
return q.qemuExePath
|
|
}
|
|
|
|
func (q *qemuArchBase) kernelParameters(debug bool) []Param {
|
|
params := q.kernelParams
|
|
|
|
if debug {
|
|
params = append(params, q.kernelParamsDebug...)
|
|
} else {
|
|
params = append(params, q.kernelParamsNonDebug...)
|
|
}
|
|
|
|
return params
|
|
}
|
|
|
|
func (q *qemuArchBase) capabilities(hConfig HypervisorConfig) types.Capabilities {
|
|
var caps types.Capabilities
|
|
caps.SetBlockDeviceHotplugSupport()
|
|
caps.SetMultiQueueSupport()
|
|
if hConfig.SharedFS != config.NoSharedFS {
|
|
caps.SetFsSharingSupport()
|
|
}
|
|
return caps
|
|
}
|
|
|
|
func (q *qemuArchBase) bridges(number uint32) {
|
|
for i := uint32(0); i < number; i++ {
|
|
q.Bridges = append(q.Bridges, types.NewBridge(types.PCI, fmt.Sprintf("%s-bridge-%d", types.PCI, i), make(map[uint32]string), 0))
|
|
}
|
|
}
|
|
|
|
func (q *qemuArchBase) cpuTopology(vcpus, maxvcpus uint32) govmmQemu.SMP {
|
|
smp := govmmQemu.SMP{
|
|
CPUs: vcpus,
|
|
Sockets: maxvcpus,
|
|
Cores: defaultCores,
|
|
Threads: defaultThreads,
|
|
MaxCPUs: maxvcpus,
|
|
}
|
|
|
|
return smp
|
|
}
|
|
|
|
func (q *qemuArchBase) cpuModel() string {
|
|
return defaultCPUModel
|
|
}
|
|
|
|
func (q *qemuArchBase) memoryTopology(memoryMb, hostMemoryMb uint64, slots uint8) govmmQemu.Memory {
|
|
memMax := fmt.Sprintf("%dM", hostMemoryMb)
|
|
mem := fmt.Sprintf("%dM", memoryMb)
|
|
memory := govmmQemu.Memory{
|
|
Size: mem,
|
|
Slots: slots,
|
|
MaxMem: memMax,
|
|
}
|
|
|
|
return memory
|
|
}
|
|
|
|
func (q *qemuArchBase) appendConsole(_ context.Context, devices []govmmQemu.Device, path string) ([]govmmQemu.Device, error) {
|
|
var serial, console govmmQemu.Device
|
|
var consoleKernelParams []Param
|
|
|
|
if q.legacySerial {
|
|
serial = govmmQemu.LegacySerialDevice{
|
|
Chardev: "charconsole0",
|
|
}
|
|
|
|
console = govmmQemu.CharDevice{
|
|
Driver: govmmQemu.LegacySerial,
|
|
Backend: govmmQemu.Socket,
|
|
DeviceID: "console0",
|
|
ID: "charconsole0",
|
|
Path: path,
|
|
}
|
|
|
|
consoleKernelParams = []Param{
|
|
{"console", "ttyS0"},
|
|
}
|
|
} else {
|
|
serial = govmmQemu.SerialDevice{
|
|
Driver: govmmQemu.VirtioSerial,
|
|
ID: "serial0",
|
|
DisableModern: q.nestedRun,
|
|
MaxPorts: uint(2),
|
|
}
|
|
|
|
console = govmmQemu.CharDevice{
|
|
Driver: govmmQemu.Console,
|
|
Backend: govmmQemu.Socket,
|
|
DeviceID: "console0",
|
|
ID: "charconsole0",
|
|
Path: path,
|
|
}
|
|
|
|
consoleKernelParams = []Param{
|
|
{"console", "hvc0"},
|
|
{"console", "hvc1"},
|
|
}
|
|
}
|
|
|
|
devices = append(devices, serial)
|
|
devices = append(devices, console)
|
|
q.kernelParams = append(q.kernelParams, consoleKernelParams...)
|
|
|
|
return devices, nil
|
|
}
|
|
|
|
func genericImage(path string) (config.BlockDrive, error) {
|
|
if _, err := os.Stat(path); os.IsNotExist(err) {
|
|
return config.BlockDrive{}, err
|
|
}
|
|
|
|
randBytes, err := utils.GenerateRandomBytes(8)
|
|
if err != nil {
|
|
return config.BlockDrive{}, err
|
|
}
|
|
|
|
id := utils.MakeNameID("image", hex.EncodeToString(randBytes), maxDevIDSize)
|
|
|
|
drive := config.BlockDrive{
|
|
File: path,
|
|
Format: "raw",
|
|
ID: id,
|
|
ShareRW: true,
|
|
ReadOnly: true,
|
|
}
|
|
|
|
return drive, nil
|
|
}
|
|
|
|
func (q *qemuArchBase) appendNvdimmImage(devices []govmmQemu.Device, path string) ([]govmmQemu.Device, error) {
|
|
imageFile, err := os.Open(path)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer imageFile.Close()
|
|
|
|
imageStat, err := imageFile.Stat()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
object := govmmQemu.Object{
|
|
Driver: govmmQemu.NVDIMM,
|
|
Type: govmmQemu.MemoryBackendFile,
|
|
DeviceID: "nv0",
|
|
ID: "mem0",
|
|
MemPath: path,
|
|
Size: (uint64)(imageStat.Size()),
|
|
ReadOnly: true,
|
|
}
|
|
|
|
devices = append(devices, object)
|
|
|
|
return devices, nil
|
|
}
|
|
|
|
func (q *qemuArchBase) appendImage(ctx context.Context, devices []govmmQemu.Device, path string) ([]govmmQemu.Device, error) {
|
|
return q.appendBlockImage(ctx, devices, path)
|
|
}
|
|
|
|
func (q *qemuArchBase) appendBlockImage(ctx context.Context, devices []govmmQemu.Device, path string) ([]govmmQemu.Device, error) {
|
|
drive, err := genericImage(path)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
devices, err = q.appendBlockDevice(ctx, devices, drive)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return devices, nil
|
|
}
|
|
|
|
func genericSCSIController(enableIOThreads, nestedRun bool) (govmmQemu.SCSIController, *govmmQemu.IOThread) {
|
|
scsiController := govmmQemu.SCSIController{
|
|
ID: scsiControllerID,
|
|
DisableModern: nestedRun,
|
|
}
|
|
|
|
var t *govmmQemu.IOThread
|
|
|
|
if enableIOThreads {
|
|
randBytes, _ := utils.GenerateRandomBytes(8)
|
|
|
|
t = &govmmQemu.IOThread{
|
|
ID: fmt.Sprintf("%s-%s", "iothread", hex.EncodeToString(randBytes)),
|
|
}
|
|
|
|
scsiController.IOThread = t.ID
|
|
}
|
|
|
|
return scsiController, t
|
|
}
|
|
|
|
func (q *qemuArchBase) appendSCSIController(_ context.Context, devices []govmmQemu.Device, enableIOThreads bool) ([]govmmQemu.Device, *govmmQemu.IOThread, error) {
|
|
d, t := genericSCSIController(enableIOThreads, q.nestedRun)
|
|
devices = append(devices, d)
|
|
return devices, t, nil
|
|
}
|
|
|
|
// appendBridges appends to devices the given bridges
|
|
func (q *qemuArchBase) appendBridges(devices []govmmQemu.Device) []govmmQemu.Device {
|
|
return genericAppendBridges(devices, q.Bridges, q.qemuMachine.Type)
|
|
}
|
|
|
|
func generic9PVolume(volume types.Volume, nestedRun bool) govmmQemu.FSDevice {
|
|
devID := fmt.Sprintf("extra-9p-%s", volume.MountTag)
|
|
if len(devID) > maxDevIDSize {
|
|
devID = devID[:maxDevIDSize]
|
|
}
|
|
|
|
return govmmQemu.FSDevice{
|
|
Driver: govmmQemu.Virtio9P,
|
|
FSDriver: govmmQemu.Local,
|
|
ID: devID,
|
|
Path: volume.HostPath,
|
|
MountTag: volume.MountTag,
|
|
SecurityModel: govmmQemu.None,
|
|
DisableModern: nestedRun,
|
|
Multidev: govmmQemu.Remap,
|
|
}
|
|
}
|
|
|
|
func genericAppend9PVolume(devices []govmmQemu.Device, volume types.Volume, nestedRun bool) (govmmQemu.FSDevice, error) {
|
|
d := generic9PVolume(volume, nestedRun)
|
|
return d, nil
|
|
}
|
|
|
|
func (q *qemuArchBase) append9PVolume(_ context.Context, devices []govmmQemu.Device, volume types.Volume) ([]govmmQemu.Device, error) {
|
|
if volume.MountTag == "" || volume.HostPath == "" {
|
|
return devices, nil
|
|
}
|
|
|
|
d, err := genericAppend9PVolume(devices, volume, q.nestedRun)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
devices = append(devices, d)
|
|
return devices, nil
|
|
}
|
|
|
|
func (q *qemuArchBase) appendSocket(devices []govmmQemu.Device, socket types.Socket) []govmmQemu.Device {
|
|
devID := socket.ID
|
|
if len(devID) > maxDevIDSize {
|
|
devID = devID[:maxDevIDSize]
|
|
}
|
|
|
|
devices = append(devices,
|
|
govmmQemu.CharDevice{
|
|
Driver: govmmQemu.VirtioSerialPort,
|
|
Backend: govmmQemu.Socket,
|
|
DeviceID: socket.DeviceID,
|
|
ID: devID,
|
|
Path: socket.HostPath,
|
|
Name: socket.Name,
|
|
},
|
|
)
|
|
|
|
return devices
|
|
}
|
|
|
|
func (q *qemuArchBase) appendVSock(_ context.Context, devices []govmmQemu.Device, vsock types.VSock) ([]govmmQemu.Device, error) {
|
|
devices = append(devices,
|
|
govmmQemu.VSOCKDevice{
|
|
ID: fmt.Sprintf("vsock-%d", vsock.ContextID),
|
|
ContextID: vsock.ContextID,
|
|
VHostFD: vsock.VhostFd,
|
|
DisableModern: q.nestedRun,
|
|
},
|
|
)
|
|
|
|
return devices, nil
|
|
|
|
}
|
|
|
|
func networkModelToQemuType(model NetInterworkingModel) govmmQemu.NetDeviceType {
|
|
switch model {
|
|
case NetXConnectMacVtapModel:
|
|
return govmmQemu.MACVTAP
|
|
default:
|
|
//TAP should work for most other cases
|
|
return govmmQemu.TAP
|
|
}
|
|
}
|
|
|
|
func genericNetwork(endpoint Endpoint, vhost, nestedRun bool, index int) (govmmQemu.NetDevice, error) {
|
|
var d govmmQemu.NetDevice
|
|
switch ep := endpoint.(type) {
|
|
case *VethEndpoint, *MacvlanEndpoint, *IPVlanEndpoint:
|
|
netPair := ep.NetworkPair()
|
|
d = govmmQemu.NetDevice{
|
|
Type: networkModelToQemuType(netPair.NetInterworkingModel),
|
|
Driver: govmmQemu.VirtioNet,
|
|
ID: fmt.Sprintf("network-%d", index),
|
|
IFName: netPair.TAPIface.Name,
|
|
MACAddress: netPair.TAPIface.HardAddr,
|
|
DownScript: "no",
|
|
Script: "no",
|
|
VHost: vhost,
|
|
DisableModern: nestedRun,
|
|
FDs: netPair.VMFds,
|
|
VhostFDs: netPair.VhostFds,
|
|
}
|
|
case *MacvtapEndpoint:
|
|
d = govmmQemu.NetDevice{
|
|
Type: govmmQemu.MACVTAP,
|
|
Driver: govmmQemu.VirtioNet,
|
|
ID: fmt.Sprintf("network-%d", index),
|
|
IFName: ep.Name(),
|
|
MACAddress: ep.HardwareAddr(),
|
|
DownScript: "no",
|
|
Script: "no",
|
|
VHost: vhost,
|
|
DisableModern: nestedRun,
|
|
FDs: ep.VMFds,
|
|
VhostFDs: ep.VhostFds,
|
|
}
|
|
case *TuntapEndpoint:
|
|
netPair := ep.NetworkPair()
|
|
d = govmmQemu.NetDevice{
|
|
Type: govmmQemu.NetDeviceType("tap"),
|
|
Driver: govmmQemu.VirtioNet,
|
|
ID: fmt.Sprintf("network-%d", index),
|
|
IFName: netPair.TAPIface.Name,
|
|
MACAddress: netPair.TAPIface.HardAddr,
|
|
DownScript: "no",
|
|
Script: "no",
|
|
VHost: vhost,
|
|
DisableModern: nestedRun,
|
|
FDs: netPair.VMFds,
|
|
VhostFDs: netPair.VhostFds,
|
|
}
|
|
default:
|
|
return govmmQemu.NetDevice{}, fmt.Errorf("Unknown type for endpoint")
|
|
}
|
|
|
|
return d, nil
|
|
}
|
|
|
|
func (q *qemuArchBase) appendNetwork(_ context.Context, devices []govmmQemu.Device, endpoint Endpoint) ([]govmmQemu.Device, error) {
|
|
d, err := genericNetwork(endpoint, q.vhost, q.nestedRun, q.networkIndex)
|
|
if err != nil {
|
|
return devices, fmt.Errorf("Failed to append network %v", err)
|
|
}
|
|
q.networkIndex++
|
|
devices = append(devices, d)
|
|
return devices, nil
|
|
}
|
|
|
|
func genericBlockDevice(drive config.BlockDrive, nestedRun bool) (govmmQemu.BlockDevice, error) {
|
|
if drive.File == "" || drive.ID == "" || drive.Format == "" {
|
|
return govmmQemu.BlockDevice{}, fmt.Errorf("Empty File, ID or Format for drive %v", drive)
|
|
}
|
|
|
|
if len(drive.ID) > maxDevIDSize {
|
|
drive.ID = drive.ID[:maxDevIDSize]
|
|
}
|
|
|
|
return govmmQemu.BlockDevice{
|
|
Driver: govmmQemu.VirtioBlock,
|
|
ID: drive.ID,
|
|
File: drive.File,
|
|
AIO: govmmQemu.Threads,
|
|
Format: govmmQemu.BlockDeviceFormat(drive.Format),
|
|
Interface: "none",
|
|
DisableModern: nestedRun,
|
|
ShareRW: drive.ShareRW,
|
|
ReadOnly: drive.ReadOnly,
|
|
}, nil
|
|
}
|
|
|
|
func (q *qemuArchBase) appendBlockDevice(_ context.Context, devices []govmmQemu.Device, drive config.BlockDrive) ([]govmmQemu.Device, error) {
|
|
d, err := genericBlockDevice(drive, q.nestedRun)
|
|
if err != nil {
|
|
return devices, fmt.Errorf("Failed to append block device %v", err)
|
|
}
|
|
devices = append(devices, d)
|
|
return devices, nil
|
|
}
|
|
|
|
func (q *qemuArchBase) appendVhostUserDevice(ctx context.Context, devices []govmmQemu.Device, attr config.VhostUserDeviceAttrs) ([]govmmQemu.Device, error) {
|
|
qemuVhostUserDevice := govmmQemu.VhostUserDevice{}
|
|
|
|
switch attr.Type {
|
|
case config.VhostUserNet:
|
|
qemuVhostUserDevice.TypeDevID = utils.MakeNameID("net", attr.DevID, maxDevIDSize)
|
|
qemuVhostUserDevice.Address = attr.MacAddress
|
|
qemuVhostUserDevice.VhostUserType = govmmQemu.VhostUserNet
|
|
case config.VhostUserSCSI:
|
|
qemuVhostUserDevice.TypeDevID = utils.MakeNameID("scsi", attr.DevID, maxDevIDSize)
|
|
qemuVhostUserDevice.VhostUserType = govmmQemu.VhostUserSCSI
|
|
case config.VhostUserBlk:
|
|
qemuVhostUserDevice.VhostUserType = govmmQemu.VhostUserBlk
|
|
case config.VhostUserFS:
|
|
qemuVhostUserDevice.TypeDevID = utils.MakeNameID("fs", attr.DevID, maxDevIDSize)
|
|
qemuVhostUserDevice.Tag = attr.Tag
|
|
qemuVhostUserDevice.CacheSize = attr.CacheSize
|
|
qemuVhostUserDevice.QueueSize = attr.QueueSize
|
|
qemuVhostUserDevice.VhostUserType = govmmQemu.VhostUserFS
|
|
}
|
|
|
|
qemuVhostUserDevice.SocketPath = attr.SocketPath
|
|
qemuVhostUserDevice.CharDevID = utils.MakeNameID("char", attr.DevID, maxDevIDSize)
|
|
|
|
devices = append(devices, qemuVhostUserDevice)
|
|
|
|
return devices, nil
|
|
}
|
|
|
|
func (q *qemuArchBase) appendVFIODevice(devices []govmmQemu.Device, vfioDev config.VFIODev) []govmmQemu.Device {
|
|
|
|
if vfioDev.BDF == "" {
|
|
return devices
|
|
}
|
|
|
|
devices = append(devices,
|
|
govmmQemu.VFIODevice{
|
|
BDF: vfioDev.BDF,
|
|
VendorID: vfioDev.VendorID,
|
|
DeviceID: vfioDev.DeviceID,
|
|
Bus: vfioDev.Bus,
|
|
},
|
|
)
|
|
|
|
return devices
|
|
}
|
|
|
|
func (q *qemuArchBase) appendRNGDevice(_ context.Context, devices []govmmQemu.Device, rngDev config.RNGDev) ([]govmmQemu.Device, error) {
|
|
devices = append(devices,
|
|
govmmQemu.RngDevice{
|
|
ID: rngDev.ID,
|
|
Filename: rngDev.Filename,
|
|
},
|
|
)
|
|
|
|
return devices, nil
|
|
}
|
|
|
|
func (q *qemuArchBase) handleImagePath(config HypervisorConfig) error {
|
|
if config.ImagePath != "" {
|
|
kernelRootParams, err := GetKernelRootParams(config.RootfsType, q.disableNvdimm, false)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !q.disableNvdimm {
|
|
q.qemuMachine.Options = strings.Join([]string{
|
|
q.qemuMachine.Options, qemuNvdimmOption,
|
|
}, ",")
|
|
kernelRootParams, err = GetKernelRootParams(config.RootfsType, q.disableNvdimm, q.dax)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
q.kernelParams = append(q.kernelParams, kernelRootParams...)
|
|
q.kernelParamsNonDebug = append(q.kernelParamsNonDebug, kernelParamsSystemdNonDebug...)
|
|
q.kernelParamsDebug = append(q.kernelParamsDebug, kernelParamsSystemdDebug...)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (q *qemuArchBase) supportGuestMemoryHotplug() bool {
|
|
return q.protection == noneProtection
|
|
}
|
|
|
|
func (q *qemuArchBase) setIgnoreSharedMemoryMigrationCaps(ctx context.Context, qmp *govmmQemu.QMP) error {
|
|
err := qmp.ExecSetMigrationCaps(ctx, []map[string]interface{}{
|
|
{
|
|
"capability": qmpCapMigrationIgnoreShared,
|
|
"state": true,
|
|
},
|
|
})
|
|
return err
|
|
}
|
|
|
|
func (q *qemuArchBase) addDeviceToBridge(ctx context.Context, ID string, t types.Type) (string, types.Bridge, error) {
|
|
addr, b, err := genericAddDeviceToBridge(ctx, q.Bridges, ID, t)
|
|
if err != nil {
|
|
return "", b, err
|
|
}
|
|
|
|
return fmt.Sprintf("%02x", addr), b, nil
|
|
}
|
|
|
|
func genericAddDeviceToBridge(ctx context.Context, bridges []types.Bridge, ID string, t types.Type) (uint32, types.Bridge, error) {
|
|
var err error
|
|
var addr uint32
|
|
|
|
if len(bridges) == 0 {
|
|
return 0, types.Bridge{}, errors.New("failed to get available address from bridges")
|
|
}
|
|
|
|
// looking for an empty address in the bridges
|
|
for _, b := range bridges {
|
|
if t != b.Type {
|
|
continue
|
|
}
|
|
addr, err = b.AddDevice(ctx, ID)
|
|
if err == nil {
|
|
return addr, b, nil
|
|
}
|
|
}
|
|
|
|
return 0, types.Bridge{}, fmt.Errorf("no more bridge slots available")
|
|
}
|
|
|
|
func (q *qemuArchBase) removeDeviceFromBridge(ID string) error {
|
|
var err error
|
|
for _, b := range q.Bridges {
|
|
err = b.RemoveDevice(ID)
|
|
if err == nil {
|
|
// device was removed correctly
|
|
return nil
|
|
}
|
|
}
|
|
|
|
return err
|
|
}
|
|
|
|
func (q *qemuArchBase) getBridges() []types.Bridge {
|
|
return q.Bridges
|
|
}
|
|
|
|
func (q *qemuArchBase) setBridges(bridges []types.Bridge) {
|
|
q.Bridges = bridges
|
|
}
|
|
|
|
func (q *qemuArchBase) addBridge(b types.Bridge) {
|
|
q.Bridges = append(q.Bridges, b)
|
|
}
|
|
|
|
// appendPCIeRootPortDevice appends to devices the given pcie-root-port
|
|
func (q *qemuArchBase) appendPCIeRootPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
|
|
return genericAppendPCIeRootPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit)
|
|
}
|
|
|
|
// appendPCIeSwitchPortDevice appends a PCIe Switch with <number> ports
|
|
func (q *qemuArchBase) appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
|
|
return genericAppendPCIeSwitchPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit)
|
|
}
|
|
|
|
// getBARsMaxAddressableMemory we need to know the BAR sizes to configure the
|
|
// PCIe Root Port or PCIe Downstream Port attaching a device with huge BARs.
|
|
func (q *qemuArchBase) getBARsMaxAddressableMemory() (uint64, uint64) {
|
|
|
|
pci := nvpci.New()
|
|
devs, _ := pci.GetAllDevices()
|
|
|
|
// Since we do not know which devices are going to be hotplugged,
|
|
// we're going to use the GPU with the biggest BARs to initialize the
|
|
// root port, this should work for all other devices as well.
|
|
// defaults are 2MB for both, if no suitable devices found
|
|
max32bit := uint64(2 * 1024 * 1024)
|
|
max64bit := uint64(2 * 1024 * 1024)
|
|
|
|
for _, dev := range devs {
|
|
if !dev.IsGPU() {
|
|
continue
|
|
}
|
|
memSize32bit, memSize64bit := dev.Resources.GetTotalAddressableMemory(true)
|
|
if max32bit < memSize32bit {
|
|
max32bit = memSize32bit
|
|
}
|
|
if max64bit < memSize64bit {
|
|
max64bit = memSize64bit
|
|
}
|
|
}
|
|
// The actual 32bit is most of the time a power of 2 but we need some
|
|
// buffer so double that to leave space for other IO functions.
|
|
// The 64bit size is not a power of 2 and hence is already rounded up
|
|
// to the higher value.
|
|
return max32bit * 2, max64bit
|
|
}
|
|
|
|
// appendIOMMU appends a virtual IOMMU device
|
|
func (q *qemuArchBase) appendIOMMU(devices []govmmQemu.Device) ([]govmmQemu.Device, error) {
|
|
switch q.qemuMachine.Type {
|
|
case QemuQ35:
|
|
iommu := govmmQemu.IommuDev{
|
|
Intremap: true,
|
|
DeviceIotlb: true,
|
|
CachingMode: true,
|
|
}
|
|
|
|
devices = append(devices, iommu)
|
|
return devices, nil
|
|
default:
|
|
return devices, fmt.Errorf("Machine Type %s does not support vIOMMU", q.qemuMachine.Type)
|
|
}
|
|
}
|
|
|
|
// appendPVPanicDevice appends a pvpanic device
|
|
func (q *qemuArchBase) appendPVPanicDevice(devices []govmmQemu.Device) ([]govmmQemu.Device, error) {
|
|
devices = append(devices, govmmQemu.PVPanicDevice{NoShutdown: true})
|
|
return devices, nil
|
|
}
|
|
|
|
func (q *qemuArchBase) getPFlash() ([]string, error) {
|
|
return q.PFlash, nil
|
|
}
|
|
|
|
func (q *qemuArchBase) setPFlash(p []string) {
|
|
q.PFlash = p
|
|
}
|
|
|
|
// append protection device
|
|
func (q *qemuArchBase) appendProtectionDevice(devices []govmmQemu.Device, firmware, firmwareVolume string) ([]govmmQemu.Device, string, error) {
|
|
hvLogger.WithField("arch", runtime.GOARCH).Warnf("Confidential Computing has not been implemented for this architecture")
|
|
return devices, firmware, nil
|
|
}
|
|
|
|
// Query QMP to find the PCI slot of a device, given its QOM path or ID
|
|
func (q *qemuArchBase) qomGetSlot(qomPath string, qmpCh *qmpChannel) (types.PciSlot, error) {
|
|
addr, err := qmpCh.qmp.ExecQomGet(qmpCh.ctx, qomPath, "addr")
|
|
if err != nil {
|
|
return types.PciSlot{}, err
|
|
}
|
|
addrf, ok := addr.(float64)
|
|
// XXX going via float makes no real sense, but that's how
|
|
// JSON works, and we'll get away with it for the small values
|
|
// we have here
|
|
if !ok {
|
|
return types.PciSlot{}, fmt.Errorf("addr QOM property of %q is %T not a number", qomPath, addr)
|
|
}
|
|
addri := int(addrf)
|
|
|
|
slotNum, funcNum := addri>>3, addri&0x7
|
|
if funcNum != 0 {
|
|
return types.PciSlot{}, fmt.Errorf("Unexpected non-zero PCI function (%02x.%1x) on %q",
|
|
slotNum, funcNum, qomPath)
|
|
}
|
|
|
|
return types.PciSlotFromInt(slotNum)
|
|
}
|
|
|
|
// Query QMP to find a device's PCI path given its QOM path or ID
|
|
func (q *qemuArchBase) qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error) {
|
|
|
|
var slots []types.PciSlot
|
|
|
|
devSlot, err := q.qomGetSlot(qemuID, qmpCh)
|
|
if err != nil {
|
|
return types.PciPath{}, err
|
|
}
|
|
slots = append(slots, devSlot)
|
|
|
|
// This only works for Q35 and Virt
|
|
r, _ := regexp.Compile(`^/machine/.*/pcie.0`)
|
|
|
|
var parentPath = qemuID
|
|
// We do not want to use a forever loop here, a deeper PCIe topology
|
|
// than 5 is already not advisable just for the sake of having enough
|
|
// buffer we limit ourselves to 10 and leave the loop early if we hit
|
|
// the root bus.
|
|
for i := 1; i <= maxPCIeTopoDepth; i++ {
|
|
parenBusQOM, err := qmpCh.qmp.ExecQomGet(qmpCh.ctx, parentPath, "parent_bus")
|
|
if err != nil {
|
|
return types.PciPath{}, err
|
|
}
|
|
|
|
busQOM, ok := parenBusQOM.(string)
|
|
if !ok {
|
|
return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, parenBusQOM)
|
|
}
|
|
|
|
// If we hit /machine/q35/pcie.0 we're done this is the root bus
|
|
// we climbed the complete hierarchy
|
|
if r.Match([]byte(busQOM)) {
|
|
break
|
|
}
|
|
|
|
// `bus` is the QOM path of the QOM bus object, but we need
|
|
// the PCI parent_bus which manages that bus. There doesn't seem
|
|
// to be a way to get that other than to simply drop the last
|
|
// path component.
|
|
idx := strings.LastIndex(busQOM, "/")
|
|
if idx == -1 {
|
|
return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM)
|
|
}
|
|
parentBus := busQOM[:idx]
|
|
|
|
parentSlot, err := q.qomGetSlot(parentBus, qmpCh)
|
|
if err != nil {
|
|
return types.PciPath{}, err
|
|
}
|
|
|
|
// Prepend the slots, since we're climbing the hierarchy
|
|
slots = append([]types.PciSlot{parentSlot}, slots...)
|
|
parentPath = parentBus
|
|
}
|
|
return types.PciPathFromSlots(slots...)
|
|
}
|