CCv0: Merge from main -- August 1st

Conflicts:
	src/runtime/pkg/katautils/config.go
	src/runtime/virtcontainers/container.go
	src/runtime/virtcontainers/hypervisor.go
	src/runtime/virtcontainers/qemu_arch_base.go
	src/runtime/virtcontainers/sandbox.go
	tests/integration/kubernetes/gha-run.sh
	tests/integration/kubernetes/setup.sh
	tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml
	tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh
	tools/packaging/kata-deploy/scripts/kata-deploy.sh
	tools/packaging/kernel/kata_config_version
	versions.yaml

Fixes: #7433

Signed-off-by: Fabiano Fidêncio <fabiano.fidencio@intel.com>
This commit is contained in:
Fabiano Fidêncio
2023-08-01 17:14:17 +02:00
426 changed files with 64309 additions and 2456 deletions

View File

@@ -18,6 +18,7 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager"
deviceManager "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager"
volume "github.com/kata-containers/kata-containers/src/runtime/pkg/direct-volume"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc"
@@ -878,9 +879,16 @@ func (c *Container) create(ctx context.Context) (err error) {
// If cold-plug we've attached the devices already, do not try to
// attach them a second time.
coldPlugVFIO := (c.sandbox.config.HypervisorConfig.ColdPlugVFIO != config.NoPort)
modeVFIO := (c.sandbox.config.VfioMode == config.VFIOModeVFIO)
if coldPlugVFIO {
var cntDevices []ContainerDevice
for _, dev := range c.devices {
isVFIOControlDevice := deviceManager.IsVFIOControlDevice(dev.ContainerPath)
if isVFIOControlDevice && modeVFIO {
cntDevices = append(cntDevices, dev)
}
if strings.HasPrefix(dev.ContainerPath, vfioPath) {
c.Logger().WithFields(logrus.Fields{
"device": dev,

View File

@@ -284,10 +284,6 @@ type HypervisorConfig struct {
// DisableImageNvdimm is used to disable guest rootfs image nvdimm devices
DisableImageNvdimm bool
// HotplugVFIOOnRootBus is used to indicate if devices need to be hotplugged on the
// root bus instead of a bridge.
HotplugVFIOOnRootBus bool
// HotPlugVFIO is used to indicate if devices need to be hotplugged on the
// root port, switch, bridge or no port
HotPlugVFIO hv.PCIePort

View File

@@ -312,117 +312,356 @@ type Param struct {
// HypervisorConfig is the hypervisor configuration.
// nolint: govet
type HypervisorConfig struct {
customAssets map[types.AssetType]*types.Asset
SeccompSandbox string
KernelPath string
ImagePath string
InitrdPath string
FirmwarePath string
FirmwareVolumePath string
MachineAccelerators string
CPUFeatures string
HypervisorPath string
HypervisorCtlPath string
GuestPreAttestationKeyset string
BlockDeviceDriver string
HypervisorMachineType string
GuestPreAttestationURI string
GuestPreAttestationMode string
DevicesStatePath string
EntropySource string
SharedFS string
SharedPath string
VirtioFSDaemon string
VirtioFSCache string
FileBackedMemRootDir string
VhostUserStorePath string
GuestMemoryDumpPath string
GuestHookPath string
VMid string
VMStorePath string
RunStorePath string
SELinuxProcessLabel string
JailerPath string
MemoryPath string
SEVCertChainPath string
BlockDeviceAIO string
User string
RemoteHypervisorSocket string
SandboxName string
SandboxNamespace string
JailerPathList []string
EntropySourceList []string
VirtioFSDaemonList []string
VirtioFSExtraArgs []string
EnableAnnotations []string
FileBackedMemRootList []string
PFlash []string
VhostUserStorePathList []string
HypervisorCtlPathList []string
KernelParams []Param
Groups []uint32
HypervisorPathList []string
HypervisorParams []Param
DiskRateLimiterBwOneTimeBurst int64
DiskRateLimiterOpsMaxRate int64
RootfsType string
VhostUserDeviceReconnect uint32
// customAssets is a map of assets.
// Each value in that map takes precedence over the configured assets.
// For example, if there is a value for the "kernel" key in this map,
// it will be used for the sandbox's kernel path instead of KernelPath.
customAssets map[types.AssetType]*types.Asset
// Supplementary group IDs.
Groups []uint32
// KernelPath is the guest kernel host path.
KernelPath string
// ImagePath is the guest image host path.
ImagePath string
// InitrdPath is the guest initrd image host path.
// ImagePath and InitrdPath cannot be set at the same time.
InitrdPath string
// RootfsType is filesystem type of rootfs.
RootfsType string
// FirmwarePath is the bios host path
FirmwarePath string
// FirmwareVolumePath is the configuration volume path for the firmware
FirmwareVolumePath string
// MachineAccelerators are machine specific accelerators
MachineAccelerators string
// CPUFeatures are cpu specific features
CPUFeatures string
// HypervisorPath is the hypervisor executable host path.
HypervisorPath string
// HypervisorCtlPath is the hypervisor ctl executable host path.
HypervisorCtlPath string
// JailerPath is the jailer executable host path.
JailerPath string
// BlockDeviceDriver specifies the driver to be used for block device
// either VirtioSCSI or VirtioBlock with the default driver being defaultBlockDriver
BlockDeviceDriver string
// HypervisorMachineType specifies the type of machine being
// emulated.
HypervisorMachineType string
// MemoryPath is the memory file path of VM memory. Used when either BootToBeTemplate or
// BootFromTemplate is true.
MemoryPath string
// DevicesStatePath is the VM device state file path. Used when either BootToBeTemplate or
// BootFromTemplate is true.
DevicesStatePath string
// EntropySource is the path to a host source of
// entropy (/dev/random, /dev/urandom or real hardware RNG device)
EntropySource string
// Shared file system type:
// - virtio-9p
// - virtio-fs (default)
SharedFS string
// Path for filesystem sharing
SharedPath string
// VirtioFSDaemon is the virtio-fs vhost-user daemon path
VirtioFSDaemon string
// VirtioFSCache cache mode for fs version cache
VirtioFSCache string
// File based memory backend root directory
FileBackedMemRootDir string
// VhostUserStorePath is the directory path where vhost-user devices
// related folders, sockets and device nodes should be.
VhostUserStorePath string
// VhostUserDeviceReconnect is the timeout for reconnecting on non-server spdk sockets
// when the remote end goes away. Zero disables reconnecting.
VhostUserDeviceReconnect uint32
// GuestMemoryDumpPath is the path in host for saving guest memory dump
GuestMemoryDumpPath string
// GuestHookPath is the path within the VM that will be used for 'drop-in' hooks
GuestHookPath string
// VMid is the id of the VM that create the hypervisor if the VM is created by the factory.
// VMid is "" if the hypervisor is not created by the factory.
VMid string
// VMStorePath is the location on disk where VM information will persist
VMStorePath string
// RunStorePath is the location on disk where runtime information will persist
RunStorePath string
// SELinux label for the VM
SELinuxProcessLabel string
// HypervisorPathList is the list of hypervisor paths names allowed in annotations
HypervisorPathList []string
// HypervisorCtlPathList is the list of hypervisor control paths names allowed in annotations
HypervisorCtlPathList []string
// JailerPathList is the list of jailer paths names allowed in annotations
JailerPathList []string
// EntropySourceList is the list of valid entropy sources
EntropySourceList []string
// VirtioFSDaemonList is the list of valid virtiofs names for annotations
VirtioFSDaemonList []string
// VirtioFSExtraArgs passes options to virtiofsd daemon
VirtioFSExtraArgs []string
// Enable annotations by name
EnableAnnotations []string
// FileBackedMemRootList is the list of valid root directories values for annotations
FileBackedMemRootList []string
// PFlash image paths
PFlash []string
// VhostUserStorePathList is the list of valid values for vhost-user paths
VhostUserStorePathList []string
// SeccompSandbox is the qemu function which enables the seccomp feature
SeccompSandbox string
// BlockDeviceAIO specifies the I/O API to be used.
BlockDeviceAIO string
// The user maps to the uid.
User string
// KernelParams are additional guest kernel parameters.
KernelParams []Param
// HypervisorParams are additional hypervisor parameters.
HypervisorParams []Param
// SGXEPCSize specifies the size in bytes for the EPC Section.
// Enable SGX. Hardware-based isolation and memory encryption.
SGXEPCSize int64
// DiskRateLimiterBwMaxRate is used to control disk I/O bandwidth on VM level.
// The same value, defined in bits per second, is used for inbound and outbound bandwidth.
DiskRateLimiterBwMaxRate int64
// DiskRateLimiterBwOneTimeBurst is used to control disk I/O bandwidth on VM level.
// This increases the initial max rate and this initial extra credit does *NOT* replenish
// and can be used for an *initial* burst of data.
DiskRateLimiterBwOneTimeBurst int64
// DiskRateLimiterOpsMaxRate is used to control disk I/O operations on VM level.
// The same value, defined in operations per second, is used for inbound and outbound bandwidth.
DiskRateLimiterOpsMaxRate int64
// DiskRateLimiterOpsOneTimeBurst is used to control disk I/O operations on VM level.
// This increases the initial max rate and this initial extra credit does *NOT* replenish
// and can be used for an *initial* burst of data.
DiskRateLimiterOpsOneTimeBurst int64
SGXEPCSize int64
DefaultMaxMemorySize uint64
NetRateLimiterBwMaxRate int64
NetRateLimiterBwOneTimeBurst int64
NetRateLimiterOpsMaxRate int64
NetRateLimiterOpsOneTimeBurst int64
MemOffset uint64
TxRateLimiterMaxRate uint64
DiskRateLimiterBwMaxRate int64
RxRateLimiterMaxRate uint64
MemorySize uint32
DefaultMaxVCPUs uint32
DefaultBridges uint32
Msize9p uint32
MemSlots uint32
VirtioFSCacheSize uint32
VirtioFSQueueSize uint32
Uid uint32
Gid uint32
SEVGuestPolicy uint32
SNPGuestPolicy uint64
NumVCPUs uint32
RemoteHypervisorTimeout uint32
IOMMUPlatform bool
EnableIOThreads bool
Debug bool
MemPrealloc bool
HugePages bool
VirtioMem bool
IOMMU bool
DisableBlockDeviceUse bool
DisableNestingChecks bool
DisableImageNvdimm bool
HotplugVFIOOnRootBus bool
GuestMemoryDumpPaging bool
ConfidentialGuest bool
SevSnpGuest bool
GuestPreAttestation bool
BlockDeviceCacheNoflush bool
BlockDeviceCacheDirect bool
BlockDeviceCacheSet bool
BootToBeTemplate bool
BootFromTemplate bool
DisableVhostNet bool
EnableVhostUserStore bool
GuestSwap bool
Rootless bool
DisableSeccomp bool
DisableSeLinux bool
DisableGuestSeLinux bool
LegacySerial bool
HotPlugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
VFIODevices []config.DeviceInfo
VhostUserBlkDevices []config.DeviceInfo
// RxRateLimiterMaxRate is used to control network I/O inbound bandwidth on VM level.
RxRateLimiterMaxRate uint64
// TxRateLimiterMaxRate is used to control network I/O outbound bandwidth on VM level.
TxRateLimiterMaxRate uint64
// NetRateLimiterBwMaxRate is used to control network I/O bandwidth on VM level.
// The same value, defined in bits per second, is used for inbound and outbound bandwidth.
NetRateLimiterBwMaxRate int64
// NetRateLimiterBwOneTimeBurst is used to control network I/O bandwidth on VM level.
// This increases the initial max rate and this initial extra credit does *NOT* replenish
// and can be used for an *initial* burst of data.
NetRateLimiterBwOneTimeBurst int64
// NetRateLimiterOpsMaxRate is used to control network I/O operations on VM level.
// The same value, defined in operations per second, is used for inbound and outbound bandwidth.
NetRateLimiterOpsMaxRate int64
// NetRateLimiterOpsOneTimeBurst is used to control network I/O operations on VM level.
// This increases the initial max rate and this initial extra credit does *NOT* replenish
// and can be used for an *initial* burst of data.
NetRateLimiterOpsOneTimeBurst int64
// MemOffset specifies memory space for nvdimm device
MemOffset uint64
// VFIODevices are used to get PCIe device info early before the sandbox
// is started to make better PCIe topology decisions
VFIODevices []config.DeviceInfo
// VhostUserBlkDevices are handled differently in Q35 and Virt machine
// type. capture them early before the sandbox to make better PCIe topology
// decisions
VhostUserBlkDevices []config.DeviceInfo
// HotPlugVFIO is used to indicate if devices need to be hotplugged on the
// root port or a switch
HotPlugVFIO config.PCIePort
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
// root port, switch or no port
ColdPlugVFIO config.PCIePort
// NumVCPUs specifies default number of vCPUs for the VM.
NumVCPUs uint32
// DefaultMaxVCPUs specifies the maximum number of vCPUs for the VM.
DefaultMaxVCPUs uint32
// MemorySize specifies default memory size in MiB for the VM.
MemorySize uint32
// DefaultMaxMemorySize specifies the maximum amount of RAM in MiB for the VM.
DefaultMaxMemorySize uint64
// DefaultBridges specifies default number of bridges for the VM.
// Bridges can be used to hot plug devices
DefaultBridges uint32
// Msize9p is used as the msize for 9p shares
Msize9p uint32
// MemSlots specifies the default number of memory slots for the VM.
MemSlots uint32
// VirtioFSCacheSize is the DAX cache size in MiB
VirtioFSCacheSize uint32
// Size of virtqueues
VirtioFSQueueSize uint32
// User ID.
Uid uint32
// Group ID.
Gid uint32
// BlockDeviceCacheSet specifies cache-related options will be set to block devices or not.
BlockDeviceCacheSet bool
// BlockDeviceCacheDirect specifies cache-related options for block devices.
// Denotes whether use of O_DIRECT (bypass the host page cache) is enabled.
BlockDeviceCacheDirect bool
// BlockDeviceCacheNoflush specifies cache-related options for block devices.
// Denotes whether flush requests for the device are ignored.
BlockDeviceCacheNoflush bool
// DisableBlockDeviceUse disallows a block device from being used.
DisableBlockDeviceUse bool
// EnableIOThreads enables IO to be processed in a separate thread.
// Supported currently for virtio-scsi driver.
EnableIOThreads bool
// Debug changes the default hypervisor and kernel parameters to
// enable debug output where available. And Debug also enable the hmp socket.
Debug bool
// MemPrealloc specifies if the memory should be pre-allocated
MemPrealloc bool
// HugePages specifies if the memory should be pre-allocated from huge pages
HugePages bool
// VirtioMem is used to enable/disable virtio-mem
VirtioMem bool
// IOMMU specifies if the VM should have a vIOMMU
IOMMU bool
// IOMMUPlatform is used to indicate if IOMMU_PLATFORM is enabled for supported devices
IOMMUPlatform bool
// DisableNestingChecks is used to override customizations performed
// when running on top of another VMM.
DisableNestingChecks bool
// DisableImageNvdimm is used to disable guest rootfs image nvdimm devices
DisableImageNvdimm bool
// GuestMemoryDumpPaging is used to indicate if enable paging
// for QEMU dump-guest-memory command
GuestMemoryDumpPaging bool
// Enable confidential guest support.
// Enable or disable different hardware features, ranging
// from memory encryption to both memory and CPU-state encryption and integrity.
ConfidentialGuest bool
// Enable SEV-SNP guests on AMD machines capable of both
SevSnpGuest bool
// BootToBeTemplate used to indicate if the VM is created to be a template VM
BootToBeTemplate bool
// BootFromTemplate used to indicate if the VM should be created from a template VM
BootFromTemplate bool
// DisableVhostNet is used to indicate if host supports vhost_net
DisableVhostNet bool
// EnableVhostUserStore is used to indicate if host supports vhost-user-blk/scsi
EnableVhostUserStore bool
// GuestSwap Used to enable/disable swap in the guest
GuestSwap bool
// Rootless is used to enable rootless VMM process
Rootless bool
// Disable seccomp from the hypervisor process
DisableSeccomp bool
// Disable selinux from the hypervisor process
DisableSeLinux bool
// Disable selinux from the container process
DisableGuestSeLinux bool
// Use legacy serial for the guest console
LegacySerial bool
GuestPreAttestation bool
GuestPreAttestationKeyset string
GuestPreAttestationURI string
GuestPreAttestationMode string
RemoteHypervisorSocket string
RemoteHypervisorTimeout uint32
SandboxName string
SandboxNamespace string
SEVCertChainPath string
SEVGuestPolicy uint32
SNPGuestPolicy uint64
}
// vcpu mapping from vcpu number to thread number

View File

@@ -244,7 +244,6 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) {
FileBackedMemRootList: sconfig.HypervisorConfig.FileBackedMemRootList,
DisableNestingChecks: sconfig.HypervisorConfig.DisableNestingChecks,
DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm,
HotplugVFIOOnRootBus: sconfig.HypervisorConfig.HotplugVFIOOnRootBus,
BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate,
BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate,
DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet,
@@ -485,7 +484,6 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) {
FileBackedMemRootList: hconf.FileBackedMemRootList,
DisableNestingChecks: hconf.DisableNestingChecks,
DisableImageNvdimm: hconf.DisableImageNvdimm,
HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus,
HotPlugVFIO: hconf.HotPlugVFIO,
ColdPlugVFIO: hconf.ColdPlugVFIO,
BootToBeTemplate: hconf.BootToBeTemplate,

View File

@@ -191,10 +191,6 @@ type HypervisorConfig struct {
// DisableImageNvdimm disables nvdimm for guest rootfs image
DisableImageNvdimm bool
// HotplugVFIOOnRootBus is used to indicate if devices need to be hotplugged on the
// root bus instead of a bridge.
HotplugVFIOOnRootBus bool
// HotPlugVFIO is used to indicate if devices need to be hotplugged on the
// root, switch, bridge or no-port
HotPlugVFIO config.PCIePort

View File

@@ -139,10 +139,6 @@ const (
// DisableImageNvdimm is a sandbox annotation to specify use of nvdimm device for guest rootfs image.
DisableImageNvdimm = kataAnnotHypervisorPrefix + "disable_image_nvdimm"
// HotplugVFIOOnRootBus is a sandbox annotation used to indicate if devices need to be hotplugged on the
// root bus instead of a bridge.
HotplugVFIOOnRootBus = kataAnnotHypervisorPrefix + "hotplug_vfio_on_root_bus"
// ColdPlugVFIO is a sandbox annotation used to indicate if devices need to be coldplugged.
ColdPlugVFIO = kataAnnotHypervisorPrefix + "cold_plug_vfio"

View File

@@ -66,11 +66,6 @@ const romFile = ""
// Default value is false.
const defaultDisableModern = false
// A deeper PCIe topology than 5 is already not advisable just for the sake
// of having enough buffer we limit ourselves to 10 and exit if we reach
// the root bus
const maxPCIeTopoDepth = 10
type qmpChannel struct {
qmp *govmmQemu.QMP
ctx context.Context
@@ -81,15 +76,14 @@ type qmpChannel struct {
// QemuState keeps Qemu's state
type QemuState struct {
UUID string
HotPlugVFIO config.PCIePort
Bridges []types.Bridge
HotpluggedVCPUs []hv.CPUDevice
HotpluggedMemory int
VirtiofsDaemonPid int
HotplugVFIOOnRootBus bool
HotplugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
UUID string
HotPlugVFIO config.PCIePort
Bridges []types.Bridge
HotpluggedVCPUs []hv.CPUDevice
HotpluggedMemory int
VirtiofsDaemonPid int
HotplugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
}
// qemu is a Hypervisor interface implementation for the Linux qemu hypervisor.
@@ -285,7 +279,6 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso
q.state.UUID = uuid.Generate().String()
q.state.HotPlugVFIO = q.config.HotPlugVFIO
q.state.ColdPlugVFIO = q.config.ColdPlugVFIO
q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus
q.state.HotPlugVFIO = q.config.HotPlugVFIO
// The path might already exist, but in case of VM templating,
@@ -808,6 +801,7 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig
if err != nil {
return fmt.Errorf("Cannot get host path for device: %v err: %v", dev, err)
}
devicesPerIOMMUGroup, err := drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
if err != nil {
return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err)
@@ -818,7 +812,7 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig
}
}
}
vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus)
vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort)
vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort)
numOfVhostUserBlockDevices := len(hypervisorConfig.VhostUserBlkDevices)
@@ -1723,7 +1717,7 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V
config.PCIeDevices[config.RootPort][devID] = true
bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID)
bridgeSlot, err := q.qomGetSlot(bridgeQomPath)
bridgeSlot, err := q.arch.qomGetSlot(bridgeQomPath, &q.qmpMonitorCh)
if err != nil {
return err
}
@@ -1826,88 +1820,6 @@ func (q *qemu) hotplugVhostUserDevice(ctx context.Context, vAttr *config.VhostUs
}
}
// qomGetSlot asks QMP for the "addr" property of the object at qomPath (a QOM
// path or device ID) and converts it into a PCI slot.
//
// The property value packs the slot number in the upper bits and the PCI
// function number in the low three bits. Only function 0 is accepted; any
// other function, a QMP failure, or a non-numeric property yields an error
// together with an empty PciSlot.
func (q *qemu) qomGetSlot(qomPath string) (types.PciSlot, error) {
	monitor := &q.qmpMonitorCh

	rawAddr, err := monitor.qmp.ExecQomGet(monitor.ctx, qomPath, "addr")
	if err != nil {
		return types.PciSlot{}, err
	}

	// XXX going via float makes no real sense, but that's how
	// JSON works, and we'll get away with it for the small values
	// we have here
	floatAddr, isNumber := rawAddr.(float64)
	if !isNumber {
		return types.PciSlot{}, fmt.Errorf("addr QOM property of %q is %T not a number", qomPath, rawAddr)
	}

	devfn := int(floatAddr)
	slot := devfn >> 3
	if function := devfn & 0x7; function != 0 {
		return types.PciSlot{}, fmt.Errorf("Unexpected non-zero PCI function (%02x.%1x) on %q",
			slot, function, qomPath)
	}

	return types.PciSlotFromInt(slot)
}
// qomGetPciPath queries QMP to find a device's PCI path given its QOM path or ID.
//
// It resolves the device's own slot first, then repeatedly reads the
// "parent_bus" QOM property, resolves the slot of the object owning that bus
// and prepends it, until a bus matching the root-bus pattern is found or
// maxPCIeTopoDepth iterations have run. Any QMP or slot-lookup error is
// returned together with an empty PciPath.
func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) {
	var slots []types.PciSlot

	devSlot, err := q.qomGetSlot(qemuID)
	if err != nil {
		return types.PciPath{}, err
	}
	slots = append(slots, devSlot)

	// This only works for Q35 and Virt. The pattern is a constant, so
	// MustCompile replaces the previously discarded Compile error.
	// NOTE(review): the '.' in "pcie.0" is unescaped and matches any character.
	rootBus := regexp.MustCompile(`^/machine/.*/pcie.0`)

	parentPath := qemuID
	// We do not want to use a forever loop here, a deeper PCIe topology
	// than 5 is already not advisable just for the sake of having enough
	// buffer we limit ourselves to 10 and leave the loop early if we hit
	// the root bus.
	// NOTE(review): if the root bus is not reached within the limit, the
	// slots collected so far are returned without an error; confirm intent.
	for i := 1; i <= maxPCIeTopoDepth; i++ {
		parentBusQOM, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, parentPath, "parent_bus")
		if err != nil {
			return types.PciPath{}, err
		}

		busQOM, ok := parentBusQOM.(string)
		if !ok {
			// %T reports the dynamic type of the unexpected value; the
			// previous %t verb is only valid for booleans.
			return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %T not a string", qemuID, parentBusQOM)
		}

		// If we hit /machine/q35/pcie.0 we're done this is the root bus
		// we climbed the complete hierarchy
		if rootBus.MatchString(busQOM) {
			break
		}

		// `bus` is the QOM path of the QOM bus object, but we need
		// the PCI parent_bus which manages that bus. There doesn't seem
		// to be a way to get that other than to simply drop the last
		// path component.
		idx := strings.LastIndex(busQOM, "/")
		if idx == -1 {
			return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM)
		}
		parentBus := busQOM[:idx]

		parentSlot, err := q.qomGetSlot(parentBus)
		if err != nil {
			return types.PciPath{}, err
		}

		// Prepend the slots, since we're climbing the hierarchy
		slots = append([]types.PciSlot{parentSlot}, slots...)
		parentPath = parentBus
	}

	return types.PciPathFromSlots(slots...)
}
// hotplugVFIODeviceRootPort hot-plugs a VFIO device for the root-port
// configuration. It simply delegates to executeVFIODeviceAdd and returns that
// call's error; ctx is accepted but not used here.
func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFIODev) (err error) {
return q.executeVFIODeviceAdd(device)
}
@@ -1937,7 +1849,7 @@ func (q *qemu) executePCIVFIODeviceAdd(device *config.VFIODev, addr string, brid
case config.VFIOPCIDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, addr, bridgeID, romFile)
case config.VFIOAPDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev, device.ID)
default:
return fmt.Errorf("Incorrect VFIO device type found")
}
@@ -1950,7 +1862,7 @@ func (q *qemu) executeVFIODeviceAdd(device *config.VFIODev) error {
case config.VFIOPCIDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, "", device.Bus, romFile)
case config.VFIOAPDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev, device.ID)
default:
return fmt.Errorf("Incorrect VFIO device type found")
}
@@ -1968,46 +1880,43 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op
"hot-plug-vfio": q.state.HotPlugVFIO,
"device-info": string(buf),
}).Info("Start hot-plug VFIO device")
// In case MachineType is q35, a PCIe device is hotplugged on
// a PCIe Root Port or alternatively on a PCIe Switch Port
if q.HypervisorConfig().HypervisorMachineType != QemuQ35 && q.HypervisorConfig().HypervisorMachineType != QemuVirt {
device.Bus = ""
} else {
var err error
// In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
// for pc machine type instead of bridge. This is useful for devices that require
// a large PCI BAR which is a currently a limitation with PCI bridges.
if q.state.HotPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus {
err = q.hotplugVFIODeviceRootPort(ctx, device)
} else if q.state.HotPlugVFIO == config.SwitchPort {
err = q.hotplugVFIODeviceSwitchPort(ctx, device)
} else {
err = q.hotplugVFIODeviceBridgePort(ctx, device)
}
if err != nil {
return err
}
err = fmt.Errorf("Incorrect hot plug configuration %v for device %v found", q.state.HotPlugVFIO, device)
// In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
// for pc machine type instead of bridge. This is useful for devices that require
// a large PCI BAR, which is currently a limitation with PCI bridges.
if q.state.HotPlugVFIO == config.RootPort {
err = q.hotplugVFIODeviceRootPort(ctx, device)
} else if q.state.HotPlugVFIO == config.SwitchPort {
err = q.hotplugVFIODeviceSwitchPort(ctx, device)
} else if q.state.HotPlugVFIO == config.BridgePort {
err = q.hotplugVFIODeviceBridgePort(ctx, device)
}
// XXX: Depending on whether we're doing root port or
if err != nil {
return err
}
// Depending on whether we're doing root port or
// bridge hotplug, and how the bridge is set up in
// other parts of the code, we may or may not already
// have information about the slot number of the
// bridge and or the device. For simplicity, just
// query both of them back from qemu
device.GuestPciPath, err = q.qomGetPciPath(device.ID)
// query both of them back from qemu based on the arch
device.GuestPciPath, err = q.arch.qomGetPciPath(device.ID, &q.qmpMonitorCh)
return err
}
} else {
q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device")
q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device")
if !q.state.HotplugVFIOOnRootBus {
if err := q.arch.removeDeviceFromBridge(device.ID); err != nil {
return err
if q.state.HotPlugVFIO == config.BridgePort {
if err := q.arch.removeDeviceFromBridge(device.ID); err != nil {
return err
}
}
return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID)
}
return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID)
}
func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error {
@@ -2966,7 +2875,6 @@ func (q *qemu) Save() (s hv.HypervisorState) {
s.Type = string(QemuHypervisor)
s.UUID = q.state.UUID
s.HotpluggedMemory = q.state.HotpluggedMemory
s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus
for _, bridge := range q.arch.getBridges() {
s.Bridges = append(s.Bridges, hv.Bridge{
@@ -2988,7 +2896,6 @@ func (q *qemu) Save() (s hv.HypervisorState) {
func (q *qemu) Load(s hv.HypervisorState) {
q.state.UUID = s.UUID
q.state.HotpluggedMemory = s.HotpluggedMemory
q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus
q.state.VirtiofsDaemonPid = s.VirtiofsDaemonPid
for _, bridge := range s.Bridges {

View File

@@ -13,6 +13,7 @@ import (
"errors"
"fmt"
"os"
"regexp"
"runtime"
"strings"
@@ -25,6 +26,11 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
)
// A PCIe topology deeper than 5 is already not advisable; just for the sake
// of having enough buffer we limit ourselves to 10, and exit early if we reach
// the root bus.
const maxPCIeTopoDepth = 10
type qemuArch interface {
// enableNestingChecks nesting checks will be honoured
enableNestingChecks()
@@ -169,6 +175,12 @@ type qemuArch interface {
// and 64-bit addressable memory
getBARsMaxAddressableMemory() (uint64, uint64)
// Query QMP to find a device's PCI path given its QOM path or ID
qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error)
// Query QMP to find the PCI slot of a device, given its QOM path or ID
qomGetSlot(qomPath string, qmpCh *qmpChannel) (types.PciSlot, error)
// append SEV object type to the VM definition
appendSEVObject(devices []govmmQemu.Device, firmware, firmwareVolume string, config sevKbs.GuestPreAttestationConfig) ([]govmmQemu.Device, string, error)
@@ -915,6 +927,88 @@ func (q *qemuArchBase) appendProtectionDevice(devices []govmmQemu.Device, firmwa
return devices, firmware, nil
}
// qomGetSlot asks QMP, through qmpCh, for the "addr" property of the object at
// qomPath (a QOM path or device ID) and converts it into a PCI slot.
//
// The property value packs the slot number in the upper bits and the PCI
// function number in the low three bits. Only function 0 is accepted; any
// other function, a QMP failure, or a non-numeric property yields an error
// together with an empty PciSlot.
func (q *qemuArchBase) qomGetSlot(qomPath string, qmpCh *qmpChannel) (types.PciSlot, error) {
	rawAddr, err := qmpCh.qmp.ExecQomGet(qmpCh.ctx, qomPath, "addr")
	if err != nil {
		return types.PciSlot{}, err
	}

	// XXX going via float makes no real sense, but that's how
	// JSON works, and we'll get away with it for the small values
	// we have here
	floatAddr, isNumber := rawAddr.(float64)
	if !isNumber {
		return types.PciSlot{}, fmt.Errorf("addr QOM property of %q is %T not a number", qomPath, rawAddr)
	}

	devfn := int(floatAddr)
	slot := devfn >> 3
	if function := devfn & 0x7; function != 0 {
		return types.PciSlot{}, fmt.Errorf("Unexpected non-zero PCI function (%02x.%1x) on %q",
			slot, function, qomPath)
	}

	return types.PciSlotFromInt(slot)
}
// qomGetPciPath queries QMP to find a device's PCI path given its QOM path or ID.
//
// It resolves the device's own slot first, then repeatedly reads the
// "parent_bus" QOM property through qmpCh, resolves the slot of the object
// owning that bus and prepends it, until a bus matching the root-bus pattern
// is found or maxPCIeTopoDepth iterations have run. Any QMP or slot-lookup
// error is returned together with an empty PciPath.
func (q *qemuArchBase) qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error) {
	var slots []types.PciSlot

	devSlot, err := q.qomGetSlot(qemuID, qmpCh)
	if err != nil {
		return types.PciPath{}, err
	}
	slots = append(slots, devSlot)

	// This only works for Q35 and Virt. The pattern is a constant, so
	// MustCompile replaces the previously discarded Compile error.
	// NOTE(review): the '.' in "pcie.0" is unescaped and matches any character.
	rootBus := regexp.MustCompile(`^/machine/.*/pcie.0`)

	parentPath := qemuID
	// We do not want to use a forever loop here, a deeper PCIe topology
	// than 5 is already not advisable just for the sake of having enough
	// buffer we limit ourselves to 10 and leave the loop early if we hit
	// the root bus.
	// NOTE(review): if the root bus is not reached within the limit, the
	// slots collected so far are returned without an error; confirm intent.
	for i := 1; i <= maxPCIeTopoDepth; i++ {
		parentBusQOM, err := qmpCh.qmp.ExecQomGet(qmpCh.ctx, parentPath, "parent_bus")
		if err != nil {
			return types.PciPath{}, err
		}

		busQOM, ok := parentBusQOM.(string)
		if !ok {
			// %T reports the dynamic type of the unexpected value; the
			// previous %t verb is only valid for booleans.
			return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %T not a string", qemuID, parentBusQOM)
		}

		// If we hit /machine/q35/pcie.0 we're done this is the root bus
		// we climbed the complete hierarchy
		if rootBus.MatchString(busQOM) {
			break
		}

		// `bus` is the QOM path of the QOM bus object, but we need
		// the PCI parent_bus which manages that bus. There doesn't seem
		// to be a way to get that other than to simply drop the last
		// path component.
		idx := strings.LastIndex(busQOM, "/")
		if idx == -1 {
			return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM)
		}
		parentBus := busQOM[:idx]

		parentSlot, err := q.qomGetSlot(parentBus, qmpCh)
		if err != nil {
			return types.PciPath{}, err
		}

		// Prepend the slots, since we're climbing the hierarchy
		slots = append([]types.PciSlot{parentSlot}, slots...)
		parentPath = parentBus
	}

	return types.PciPathFromSlots(slots...)
}
// AMD SEV methods
func (q *qemuArchBase) appendSEVObject(devices []govmmQemu.Device, firmware, firmwareVolume string, config sevKbs.GuestPreAttestationConfig) ([]govmmQemu.Device, string, error) {
hvLogger.WithField("arch", runtime.GOARCH).Warnf("Confidential Computing has not been implemented for this architecture")

View File

@@ -351,3 +351,32 @@ func (q *qemuS390x) appendProtectionDevice(devices []govmmQemu.Device, firmware,
return devices, firmware, fmt.Errorf("Unsupported guest protection technology: %v", q.protection)
}
}
// appendVFIODevice appends a govmmQemu.VFIODevice built from vfioDev to
// devices and returns the resulting slice.
//
// When vfioDev has no SysfsDev the input slice is returned untouched. When
// vfioDev lists at least one AP device the appended entry uses the AP
// transport; otherwise the transport is left at its zero value.
func (q *qemuS390x) appendVFIODevice(devices []govmmQemu.Device, vfioDev config.VFIODev) []govmmQemu.Device {
	// Nothing to describe to QEMU without a sysfs device.
	if vfioDev.SysfsDev == "" {
		return devices
	}

	entry := govmmQemu.VFIODevice{SysfsDev: vfioDev.SysfsDev}
	if len(vfioDev.APDevices) > 0 {
		entry.Transport = govmmQemu.TransportAP
	}

	return append(devices, entry)
}
// qomGetPciPath is a stub on s390x: it logs a warning that the lookup is
// not implemented and returns an empty PCI path together with a nil error.
// Both arguments are ignored.
func (q *qemuS390x) qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error) {
	var emptyPath types.PciPath

	hvLogger.Warnf("qomGetPciPath not implemented for s390x")

	return emptyPath, nil
}

View File

@@ -30,8 +30,6 @@ import (
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
cri "github.com/containerd/containerd/pkg/cri/annotations"
crio "github.com/containers/podman/v4/pkg/annotations"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/api"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
@@ -611,67 +609,11 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
return nil, err
}
if len(sandboxConfig.Containers) > 0 {
// These values are required by remote hypervisor
for _, a := range []string{cri.SandboxName, crio.SandboxName} {
if value, ok := sandboxConfig.Containers[0].Annotations[a]; ok {
sandboxConfig.HypervisorConfig.SandboxName = value
}
}
for _, a := range []string{cri.SandboxNamespace, crio.Namespace} {
if value, ok := sandboxConfig.Containers[0].Annotations[a]; ok {
sandboxConfig.HypervisorConfig.SandboxNamespace = value
}
}
coldPlugVFIO, err := s.coldOrHotPlugVFIO(&sandboxConfig)
if err != nil {
return nil, err
}
// If we have a confidential guest we need to cold-plug the PCIe VFIO devices
// until we have TDISP/IDE PCIe support.
coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort)
// Aggregate all the containner devices for hot-plug and use them to dedcue
// the correct amount of ports to reserve for the hypervisor.
hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort)
var vfioDevices []config.DeviceInfo
// vhost-user-block device is a PCIe device in Virt, keep track of it
// for correct number of PCIe root ports.
var vhostUserBlkDevices []config.DeviceInfo
for cnt, containers := range sandboxConfig.Containers {
for dev, device := range containers.DeviceInfos {
if deviceManager.IsVhostUserBlk(device) {
vhostUserBlkDevices = append(vhostUserBlkDevices, device)
continue
}
isVFIO := deviceManager.IsVFIO(device.ContainerPath)
if hotPlugVFIO && isVFIO {
vfioDevices = append(vfioDevices, device)
sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO
}
if coldPlugVFIO && isVFIO {
device.ColdPlug = true
device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO
vfioDevices = append(vfioDevices, device)
// We need to remove the devices marked for cold-plug
// otherwise at the container level the kata-agent
// will try to hot-plug them.
sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging"
}
}
var filteredDevices []config.DeviceInfo
for _, device := range containers.DeviceInfos {
if device.ID != "remove-we-are-cold-plugging" {
filteredDevices = append(filteredDevices, device)
}
}
sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices
}
sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices
sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices
// store doesn't require hypervisor to be stored immediately
if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil {
return nil, err
@@ -685,7 +627,8 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
return s, nil
}
for _, dev := range vfioDevices {
for _, dev := range sandboxConfig.HypervisorConfig.VFIODevices {
s.Logger().Info("cold-plug device: ", dev)
_, err := s.AddDevice(ctx, dev)
if err != nil {
s.Logger().WithError(err).Debug("Cannot cold-plug add device")
@@ -695,6 +638,70 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
return s, nil
}
// coldOrHotPlugVFIO walks the device list of every container in
// sandboxConfig, collects the VFIO and vhost-user-block devices it finds and
// stores them on sandboxConfig.HypervisorConfig (VFIODevices and
// VhostUserBlkDevices).
//
// As a side effect the containers' DeviceInfos are rewritten: hot-plugged
// VFIO devices get their Port set, and cold-plugged VFIO devices are removed
// from the container's list when vfio_mode is guest-kernel.
//
// It returns whether VFIO cold-plug is enabled, i.e. whether ColdPlugVFIO is
// anything other than config.NoPort. The returned error is always nil.
func (s *Sandbox) coldOrHotPlugVFIO(sandboxConfig *SandboxConfig) (bool, error) {
	// Sentinel written into the ID of a device that must be dropped from
	// its container's device list once the scan of that list is done.
	const dropMarker = "remove-we-are-cold-plugging"

	hvConfig := &sandboxConfig.HypervisorConfig

	// If we have a confidential guest we need to cold-plug the PCIe VFIO
	// devices until we have TDISP/IDE PCIe support.
	coldPlugVFIO := hvConfig.ColdPlugVFIO != config.NoPort
	// Aggregate all the container devices for hot-plug and use them to
	// deduce the correct amount of ports to reserve for the hypervisor.
	hotPlugVFIO := hvConfig.HotPlugVFIO != config.NoPort

	guestKernelMode := sandboxConfig.VfioMode == config.VFIOModeGuestKernel
	vfioMode := sandboxConfig.VfioMode == config.VFIOModeVFIO

	var (
		vfioDevs []config.DeviceInfo
		// vhost-user-block device is a PCIe device in Virt, keep track
		// of it for correct number of PCIe root ports.
		vhostBlkDevs []config.DeviceInfo
	)

	for c := range sandboxConfig.Containers {
		// infos shares its backing array with the container's list, so
		// element writes below are visible through either name.
		infos := sandboxConfig.Containers[c].DeviceInfos

		for d := range infos {
			// Work on a copy; only explicit writes to infos[d] change
			// what the container keeps.
			device := infos[d]

			if deviceManager.IsVhostUserBlk(device) {
				vhostBlkDevs = append(vhostBlkDevs, device)
				continue
			}

			isVFIODevice := deviceManager.IsVFIODevice(device.ContainerPath)
			isVFIOControlDevice := deviceManager.IsVFIOControlDevice(device.ContainerPath)

			// vfio_mode=vfio needs the VFIO control device, add it to
			// the list of devices to be added to the VM.
			if vfioMode && isVFIOControlDevice && !hotPlugVFIO {
				vfioDevs = append(vfioDevs, device)
			}

			if hotPlugVFIO && isVFIODevice {
				infos[d].Port = hvConfig.HotPlugVFIO
				device.ColdPlug = false
				device.Port = hvConfig.HotPlugVFIO
				vfioDevs = append(vfioDevs, device)
			}

			if coldPlugVFIO && isVFIODevice {
				device.ColdPlug = true
				device.Port = hvConfig.ColdPlugVFIO
				vfioDevs = append(vfioDevs, device)
				// Devices marked for cold-plug have to be removed,
				// otherwise at the container level the kata-agent
				// will try to hot-plug them.
				if guestKernelMode {
					infos[d].ID = dropMarker
				}
			}
		}

		// Keep every device that was not flagged for removal above.
		var kept []config.DeviceInfo
		for _, info := range infos {
			if info.ID == dropMarker {
				continue
			}
			kept = append(kept, info)
		}
		sandboxConfig.Containers[c].DeviceInfos = kept
	}

	hvConfig.VFIODevices = vfioDevs
	hvConfig.VhostUserBlkDevices = vhostBlkDevs

	return coldPlugVFIO, nil
}
func (s *Sandbox) createResourceController() error {
var err error
cgroupPath := ""
@@ -2069,26 +2076,26 @@ func (s *Sandbox) AddDevice(ctx context.Context, info config.DeviceInfo) (api.De
}
var err error
b, err := s.devManager.NewDevice(info)
add, err := s.devManager.NewDevice(info)
if err != nil {
return nil, err
}
defer func() {
if err != nil {
s.devManager.RemoveDevice(b.DeviceID())
s.devManager.RemoveDevice(add.DeviceID())
}
}()
if err = s.devManager.AttachDevice(ctx, b.DeviceID(), s); err != nil {
if err = s.devManager.AttachDevice(ctx, add.DeviceID(), s); err != nil {
return nil, err
}
defer func() {
if err != nil {
s.devManager.DetachDevice(ctx, b.DeviceID(), s)
s.devManager.DetachDevice(ctx, add.DeviceID(), s)
}
}()
return b, nil
return add, nil
}
// updateResources will:

View File

@@ -606,6 +606,7 @@ func TestSandboxAttachDevicesVFIO(t *testing.T) {
HostPath: path,
ContainerPath: path,
DevType: "c",
Port: config.RootPort,
}
dev, err := dm.NewDevice(deviceInfo)
assert.Nil(t, err, "deviceManager.NewDevice return error: %v", err)