CCv0: Merge from main -- August 1st

Conflicts:
	src/runtime/pkg/katautils/config.go
	src/runtime/virtcontainers/container.go
	src/runtime/virtcontainers/hypervisor.go
	src/runtime/virtcontainers/qemu_arch_base.go
	src/runtime/virtcontainers/sandbox.go
	tests/integration/kubernetes/gha-run.sh
	tests/integration/kubernetes/setup.sh
	tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml
	tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh
	tools/packaging/kata-deploy/scripts/kata-deploy.sh
	tools/packaging/kernel/kata_config_version
	versions.yaml

Fixes: #7433

Signed-off-by: Fabiano Fidêncio <fabiano.fidencio@intel.com>
This commit is contained in:
Fabiano Fidêncio
2023-08-01 17:14:17 +02:00
426 changed files with 64309 additions and 2456 deletions

View File

@@ -103,20 +103,19 @@ type RuntimeVersionInfo struct {
// HypervisorInfo stores hypervisor details
type HypervisorInfo struct {
MachineType string
Version string
Path string
BlockDeviceDriver string
EntropySource string
SharedFS string
VirtioFSDaemon string
SocketPath string
Msize9p uint32
MemorySlots uint32
HotPlugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
HotplugVFIOOnRootBus bool
Debug bool
MachineType string
Version string
Path string
BlockDeviceDriver string
EntropySource string
SharedFS string
VirtioFSDaemon string
SocketPath string
Msize9p uint32
MemorySlots uint32
HotPlugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
Debug bool
}
// AgentInfo stores agent details
@@ -307,20 +306,19 @@ func getHypervisorInfo(config oci.RuntimeConfig) (HypervisorInfo, error) {
}
return HypervisorInfo{
Debug: config.HypervisorConfig.Debug,
MachineType: config.HypervisorConfig.HypervisorMachineType,
Version: version,
Path: hypervisorPath,
BlockDeviceDriver: config.HypervisorConfig.BlockDeviceDriver,
Msize9p: config.HypervisorConfig.Msize9p,
MemorySlots: config.HypervisorConfig.MemSlots,
EntropySource: config.HypervisorConfig.EntropySource,
SharedFS: config.HypervisorConfig.SharedFS,
VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon,
HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO,
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
SocketPath: socketPath,
Debug: config.HypervisorConfig.Debug,
MachineType: config.HypervisorConfig.HypervisorMachineType,
Version: version,
Path: hypervisorPath,
BlockDeviceDriver: config.HypervisorConfig.BlockDeviceDriver,
Msize9p: config.HypervisorConfig.Msize9p,
MemorySlots: config.HypervisorConfig.MemSlots,
EntropySource: config.HypervisorConfig.EntropySource,
SharedFS: config.HypervisorConfig.SharedFS,
VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon,
HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO,
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
SocketPath: socketPath,
}, nil
}

View File

@@ -87,7 +87,6 @@ func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.Runti
disableBlock := true
blockStorageDriver := "virtio-scsi"
enableIOThreads := true
hotplugVFIOOnRootBus := true
hotPlugVFIO = config.BridgePort
coldPlugVFIO = config.NoPort
disableNewNetNs := false
@@ -132,7 +131,6 @@ func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.Runti
DisableBlock: disableBlock,
BlockDeviceDriver: blockStorageDriver,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
DisableNewNetNs: disableNewNetNs,
@@ -276,10 +274,8 @@ func getExpectedHypervisor(config oci.RuntimeConfig) HypervisorInfo {
EntropySource: config.HypervisorConfig.EntropySource,
SharedFS: config.HypervisorConfig.SharedFS,
VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon,
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO,
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO,
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
}
if os.Geteuid() == 0 {

View File

@@ -330,31 +330,29 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (runtimeConfig string,
disableBlockDevice := true
blockDeviceDriver := "virtio-scsi"
enableIOThreads := true
hotplugVFIOOnRootBus := true
disableNewNetNs := false
sharedFS := "virtio-9p"
virtioFSdaemon := path.Join(dir, "virtiofsd")
hotPlugVFIO = config.BridgePort
coldPlugVFIO = config.RootPort
coldPlugVFIO = config.NoPort
configFileOptions := ktu.RuntimeConfigOptions{
Hypervisor: "qemu",
HypervisorPath: hypervisorPath,
KernelPath: kernelPath,
ImagePath: imagePath,
RootfsType: rootfsType,
KernelParams: kernelParams,
MachineType: machineType,
LogPath: logPath,
DisableBlock: disableBlockDevice,
BlockDeviceDriver: blockDeviceDriver,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
DisableNewNetNs: disableNewNetNs,
SharedFS: sharedFS,
VirtioFSDaemon: virtioFSdaemon,
HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
Hypervisor: "qemu",
HypervisorPath: hypervisorPath,
KernelPath: kernelPath,
ImagePath: imagePath,
RootfsType: rootfsType,
KernelParams: kernelParams,
MachineType: machineType,
LogPath: logPath,
DisableBlock: disableBlockDevice,
BlockDeviceDriver: blockDeviceDriver,
EnableIOThreads: enableIOThreads,
DisableNewNetNs: disableNewNetNs,
SharedFS: sharedFS,
VirtioFSDaemon: virtioFSdaemon,
HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
}
runtimeConfigFileData := ktu.MakeRuntimeConfigFileData(configFileOptions)

View File

@@ -111,7 +111,7 @@ func GetVFIODeviceType(deviceFilePath string) (config.VFIODeviceType, error) {
return config.VFIODeviceErrorType, err
}
if strings.HasPrefix(deviceSysfsDev, vfioAPSysfsDir) {
if strings.Contains(deviceSysfsDev, vfioAPSysfsDir) {
return config.VFIOAPDeviceMediatedType, nil
}
@@ -178,22 +178,22 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe
}
id := utils.MakeNameID("vfio", device.ID+strconv.Itoa(i), maxDevIDSize)
pciClass := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass)
// We need to ignore Host or PCI Bridges that are in the same IOMMU group as the
// passed-through devices. One CANNOT pass-through a PCI bridge or Host bridge.
// Class 0x0604 is PCI bridge, 0x0600 is Host bridge
ignorePCIDevice, err := checkIgnorePCIClass(pciClass, deviceBDF, 0x0600)
if err != nil {
return nil, err
}
if ignorePCIDevice {
continue
}
var vfio config.VFIODev
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
// This is vfio-pci and vfio-mdev specific
pciClass := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass)
// We need to ignore Host or PCI Bridges that are in the same IOMMU group as the
// passed-through devices. One CANNOT pass-through a PCI bridge or Host bridge.
// Class 0x0604 is PCI bridge, 0x0600 is Host bridge
ignorePCIDevice, err := checkIgnorePCIClass(pciClass, deviceBDF, 0x0600)
if err != nil {
return nil, err
}
if ignorePCIDevice {
continue
}
// Do not directly assign to `vfio` -- need to access field still
vfio = config.VFIODev{
ID: id,
@@ -216,6 +216,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe
SysfsDev: deviceSysfsDev,
Type: config.VFIOAPDeviceMediatedType,
APDevices: devices,
Port: device.Port,
}
default:
return nil, fmt.Errorf("Failed to append device: VFIO device type unrecognized")

View File

@@ -69,7 +69,14 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece
if err != nil {
return err
}
for _, vfio := range device.VfioDevs {
// If vfio.Port is not set we bail out, users should set
// explicitly the port in the config file
if vfio.Port == "" {
return fmt.Errorf("cold_plug_vfio= or hot_plug_vfio= port is not set for device %s (BridgePort | RootPort | SwitchPort)", vfio.BDF)
}
if vfio.IsPCIe {
busIndex := len(config.PCIeDevices[vfio.Port])
vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex)

View File

@@ -120,7 +120,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device
if devInfo.ID, err = dm.newDeviceID(); err != nil {
return nil, err
}
if IsVFIO(devInfo.HostPath) {
if IsVFIODevice(devInfo.HostPath) {
return drivers.NewVFIODevice(&devInfo), nil
} else if IsVhostUserBlk(devInfo) {
if devInfo.DriverOptions == nil {
@@ -191,12 +191,12 @@ func (dm *deviceManager) AttachDevice(ctx context.Context, id string, dr api.Dev
dm.Lock()
defer dm.Unlock()
d, ok := dm.devices[id]
dev, ok := dm.devices[id]
if !ok {
return ErrDeviceNotExist
}
if err := d.Attach(ctx, dr); err != nil {
if err := dev.Attach(ctx, dr); err != nil {
return err
}
return nil

View File

@@ -90,6 +90,100 @@ func TestNewDevice(t *testing.T) {
assert.Equal(t, vfioDev.DeviceInfo.GID, uint32(2))
}
func TestAttachVFIOAPDevice(t *testing.T) {
var err error
var ok bool
dm := &deviceManager{
devices: make(map[string]api.Device),
}
tmpDir := t.TempDir()
// sys/devices/vfio_ap/matrix/f94290f8-78ac-45fb-bb22-e55e519fa64f
testSysfsAP := "/sys/devices/vfio_ap/"
testDeviceAP := "f94290f8-78ac-45fb-bb22-e55e519fa64f"
testVFIOGroup := "42"
matrixDir := filepath.Join(tmpDir, testSysfsAP, "matrix")
err = os.MkdirAll(matrixDir, dirMode)
assert.Nil(t, err)
deviceAPFile := filepath.Join(matrixDir, testDeviceAP)
err = os.MkdirAll(deviceAPFile, dirMode)
assert.Nil(t, err)
matrixDeviceAPFile := filepath.Join(deviceAPFile, "matrix")
_, err = os.Create(matrixDeviceAPFile)
assert.Nil(t, err)
// create AP devices in the matrix file
APDevices := []byte("05.001f\n")
err = os.WriteFile(matrixDeviceAPFile, APDevices, 0644)
assert.Nil(t, err)
devicesVFIOGroupDir := filepath.Join(tmpDir, testVFIOGroup, "devices")
err = os.MkdirAll(devicesVFIOGroupDir, dirMode)
assert.Nil(t, err)
deviceAPSymlink := filepath.Join(devicesVFIOGroupDir, testDeviceAP)
err = os.Symlink(deviceAPFile, deviceAPSymlink)
assert.Nil(t, err)
savedIOMMUPath := config.SysIOMMUGroupPath
config.SysIOMMUGroupPath = tmpDir
savedSysBusPciDevicesPath := config.SysBusPciDevicesPath
config.SysBusPciDevicesPath = devicesVFIOGroupDir
defer func() {
config.SysIOMMUGroupPath = savedIOMMUPath
config.SysBusPciDevicesPath = savedSysBusPciDevicesPath
}()
path := filepath.Join(vfioPath, testVFIOGroup)
deviceInfo := config.DeviceInfo{
HostPath: path,
ContainerPath: path,
DevType: "c",
ColdPlug: false,
Port: config.RootPort,
}
device, err := dm.NewDevice(deviceInfo)
assert.Nil(t, err)
_, ok = device.(*drivers.VFIODevice)
assert.True(t, ok)
devReceiver := &api.MockDeviceReceiver{}
err = device.Attach(context.Background(), devReceiver)
assert.Nil(t, err)
err = device.Detach(context.Background(), devReceiver)
assert.Nil(t, err)
// If we omit the port setting we should fail
failDm := &deviceManager{
devices: make(map[string]api.Device),
}
failDeviceInfo := config.DeviceInfo{
HostPath: path,
ContainerPath: path,
DevType: "c",
ColdPlug: false,
}
failDevice, err := failDm.NewDevice(failDeviceInfo)
assert.Nil(t, err)
_, ok = failDevice.(*drivers.VFIODevice)
assert.True(t, ok)
failDevReceiver := &api.MockDeviceReceiver{}
err = failDevice.Attach(context.Background(), failDevReceiver)
assert.Error(t, err)
}
func TestAttachVFIODevice(t *testing.T) {
dm := &deviceManager{
blockDriver: config.VirtioBlock,
@@ -132,6 +226,8 @@ func TestAttachVFIODevice(t *testing.T) {
HostPath: path,
ContainerPath: path,
DevType: "c",
ColdPlug: false,
Port: config.RootPort,
}
device, err := dm.NewDevice(deviceInfo)

View File

@@ -17,8 +17,15 @@ const (
vfioPath = "/dev/vfio/"
)
// IsVFIOControlDevice checks if the device provided is a vfio control device.
// Depending no the vfio_mode we need to know if a device is a VFIO device
// or the VFIO control device
func IsVFIOControlDevice(path string) bool {
return path == filepath.Join(vfioPath, "vfio")
}
// IsVFIO checks if the device provided is a vfio group.
func IsVFIO(hostPath string) bool {
func IsVFIODevice(hostPath string) bool {
// Ignore /dev/vfio/vfio character device
if strings.HasPrefix(hostPath, filepath.Join(vfioPath, "vfio")) {
return false

View File

@@ -31,7 +31,7 @@ func TestIsVFIO(t *testing.T) {
}
for _, d := range data {
isVFIO := IsVFIO(d.path)
isVFIO := IsVFIODevice(d.path)
assert.Equal(t, d.expected, isVFIO)
}
}

View File

@@ -163,6 +163,9 @@ const (
// TransportMMIO is the MMIO transport for virtio devices.
TransportMMIO VirtioTransport = "mmio"
// TransportAP is the AP transport for virtio devices.
TransportAP VirtioTransport = "ap"
)
// defaultTransport returns the default transport for the current combination
@@ -199,6 +202,14 @@ func (transport VirtioTransport) isVirtioCCW(config *Config) bool {
return transport == TransportCCW
}
func (transport VirtioTransport) isVirtioAP(config *Config) bool {
if transport == "" {
transport = transport.defaultTransport(config)
}
return transport == TransportAP
}
// getName returns the name of the current transport.
func (transport VirtioTransport) getName(config *Config) string {
if transport == "" {
@@ -1852,6 +1863,9 @@ type VFIODevice struct {
// Transport is the virtio transport for this device.
Transport VirtioTransport
// SysfsDev specifies the sysfs matrix entry for the AP device
SysfsDev string
}
// VFIODeviceTransport is a map of the vfio device name that corresponds to
@@ -1860,11 +1874,13 @@ var VFIODeviceTransport = map[VirtioTransport]string{
TransportPCI: "vfio-pci",
TransportCCW: "vfio-ccw",
TransportMMIO: "vfio-device",
TransportAP: "vfio-ap",
}
// Valid returns true if the VFIODevice structure is valid and complete.
// s390x architecture requires SysfsDev to be set.
func (vfioDev VFIODevice) Valid() bool {
return vfioDev.BDF != ""
return vfioDev.BDF != "" || vfioDev.SysfsDev != ""
}
// QemuParams returns the qemu parameters built out of this vfio device.
@@ -1874,6 +1890,15 @@ func (vfioDev VFIODevice) QemuParams(config *Config) []string {
driver := vfioDev.deviceName(config)
if vfioDev.Transport.isVirtioAP(config) {
deviceParams = append(deviceParams, fmt.Sprintf("%s,sysfsdev=%s", driver, vfioDev.SysfsDev))
qemuParams = append(qemuParams, "-device")
qemuParams = append(qemuParams, strings.Join(deviceParams, ","))
return qemuParams
}
deviceParams = append(deviceParams, fmt.Sprintf("%s,host=%s", driver, vfioDev.BDF))
if vfioDev.Transport.isVirtioPCI(config) {
if vfioDev.VendorID != "" {
@@ -2878,10 +2903,9 @@ func (config *Config) appendDevices(logger QMPLog) {
for _, d := range config.Devices {
if !d.Valid() {
logger.Errorf("vm device is not valid: %+v", config.Devices)
logger.Errorf("vm device is not valid: %+v", d)
continue
}
config.qemuParams = append(config.qemuParams, d.QemuParams(config)...)
}
}

View File

@@ -1233,10 +1233,11 @@ func (q *QMP) ExecutePCIVFIOMediatedDeviceAdd(ctx context.Context, devID, sysfsd
}
// ExecuteAPVFIOMediatedDeviceAdd adds a VFIO mediated AP device to a QEMU instance using the device_add command.
func (q *QMP) ExecuteAPVFIOMediatedDeviceAdd(ctx context.Context, sysfsdev string) error {
func (q *QMP) ExecuteAPVFIOMediatedDeviceAdd(ctx context.Context, sysfsdev string, devID string) error {
args := map[string]interface{}{
"driver": VfioAP,
"sysfsdev": sysfsdev,
"id": devID,
}
return q.executeCommand(ctx, "device_add", args, nil)
}

View File

@@ -1128,7 +1128,7 @@ func TestQMPAPVFIOMediatedDeviceAdd(t *testing.T) {
q := startQMPLoop(buf, cfg, connectedCh, disconnectedCh)
checkVersion(t, connectedCh)
sysfsDev := "/sys/devices/vfio_ap/matrix/a297db4a-f4c2-11e6-90f6-d3b88d6c9525"
err := q.ExecuteAPVFIOMediatedDeviceAdd(context.Background(), sysfsDev)
err := q.ExecuteAPVFIOMediatedDeviceAdd(context.Background(), sysfsDev, "test-id")
if err != nil {
t.Fatalf("Unexpected error %v", err)
}

View File

@@ -42,10 +42,9 @@ type HypervisorState struct {
// HotpluggedCPUs is the list of CPUs that were hot-added
HotpluggedVCPUs []CPUDevice
HotpluggedMemory int
VirtiofsDaemonPid int
Pid int
HotPlugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
HotplugVFIOOnRootBus bool
HotpluggedMemory int
VirtiofsDaemonPid int
Pid int
HotPlugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
}

View File

@@ -233,7 +233,6 @@ type RuntimeConfigOptions struct {
DefaultMsize9p uint32
DisableBlock bool
EnableIOThreads bool
HotplugVFIOOnRootBus bool
DisableNewNetNs bool
HypervisorDebug bool
RuntimeDebug bool
@@ -317,8 +316,8 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string {
default_memory = ` + strconv.FormatUint(uint64(config.DefaultMemSize), 10) + `
disable_block_device_use = ` + strconv.FormatBool(config.DisableBlock) + `
enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + `
hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + `
cold_plug_vfio = "` + config.ColdPlugVFIO.String() + `"
hot_plug_vfio = "` + config.HotPlugVFIO.String() + `"
msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + `
enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + `
guest_hook_path = "` + config.DefaultGuestHookPath + `"

View File

@@ -81,7 +81,6 @@ const defaultFileBackedMemRootDir string = ""
const defaultEnableDebug bool = false
const defaultDisableNestingChecks bool = false
const defaultMsize9p uint32 = 8192
const defaultHotplugVFIOOnRootBus bool = false
const defaultEntropySource = "/dev/urandom"
const defaultGuestHookPath string = ""
const defaultVirtioFSCacheMode = "never"

View File

@@ -22,6 +22,7 @@ import (
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
@@ -157,7 +158,6 @@ type hypervisor struct {
DisableNestingChecks bool `toml:"disable_nesting_checks"`
EnableIOThreads bool `toml:"enable_iothreads"`
DisableImageNvdimm bool `toml:"disable_image_nvdimm"`
HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"`
HotPlugVFIO config.PCIePort `toml:"hot_plug_vfio"`
ColdPlugVFIO config.PCIePort `toml:"cold_plug_vfio"`
DisableVhostNet bool `toml:"disable_vhost_net"`
@@ -886,7 +886,6 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
EnableIOThreads: h.EnableIOThreads,
Msize9p: h.msize9p(),
DisableImageNvdimm: h.DisableImageNvdimm,
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
HotPlugVFIO: h.hotPlugVFIO(),
ColdPlugVFIO: h.coldPlugVFIO(),
DisableVhostNet: h.DisableVhostNet,
@@ -1089,7 +1088,6 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
BlockDeviceCacheNoflush: h.BlockDeviceCacheNoflush,
EnableIOThreads: h.EnableIOThreads,
Msize9p: h.msize9p(),
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
ColdPlugVFIO: h.coldPlugVFIO(),
HotPlugVFIO: h.hotPlugVFIO(),
DisableVhostNet: true,
@@ -1336,7 +1334,6 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
BlockDeviceCacheNoflush: defaultBlockDeviceCacheNoflush,
EnableIOThreads: defaultEnableIOThreads,
Msize9p: defaultMsize9p,
HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus,
ColdPlugVFIO: defaultColdPlugVFIO,
HotPlugVFIO: defaultHotPlugVFIO,
GuestHookPath: defaultGuestHookPath,
@@ -1722,7 +1719,8 @@ func checkConfig(config oci.RuntimeConfig) error {
hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO
coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO
machineType := config.HypervisorConfig.HypervisorMachineType
if err := checkPCIeConfig(coldPlugVFIO, hotPlugVFIO, machineType); err != nil {
hypervisorType := config.HypervisorType
if err := checkPCIeConfig(coldPlugVFIO, hotPlugVFIO, machineType, hypervisorType); err != nil {
return err
}
@@ -1732,10 +1730,9 @@ func checkConfig(config oci.RuntimeConfig) error {
// checkPCIeConfig ensures the PCIe configuration is valid.
// Only allow one of the following settings for cold-plug:
// no-port, root-port, switch-port
func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineType string) error {
// Currently only QEMU q35 supports advanced PCIe topologies
// firecracker, dragonball do not have right now any PCIe support
if machineType != "q35" {
func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineType string, hypervisorType virtcontainers.HypervisorType) error {
if hypervisorType != virtcontainers.QemuHypervisor {
kataUtilsLogger.Warn("Advanced PCIe Topology only available for QEMU hypervisor, ignoring hot(cold)_vfio_port setting")
return nil
}
@@ -1745,6 +1742,12 @@ func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineT
if coldPlug == config.NoPort && hotPlug == config.NoPort {
return nil
}
// Currently only QEMU q35,virt support advanced PCIe topologies
// firecracker, dragonball do not have right now any PCIe support
if machineType != "q35" && machineType != "virt" {
return nil
}
var port config.PCIePort
if coldPlug != config.NoPort {
port = coldPlug
@@ -1752,10 +1755,13 @@ func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineT
if hotPlug != config.NoPort {
port = hotPlug
}
if port == config.NoPort || port == config.BridgePort || port == config.RootPort || port == config.SwitchPort {
if port == config.NoPort {
return fmt.Errorf("invalid vfio_port=%s setting, use on of %s, %s, %s",
port, config.BridgePort, config.RootPort, config.SwitchPort)
}
if port == config.BridgePort || port == config.RootPort || port == config.SwitchPort {
return nil
}
return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s, %s",
coldPlug, config.NoPort, config.BridgePort, config.RootPort, config.SwitchPort)
}

View File

@@ -85,7 +85,6 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntime
blockDeviceDriver := "virtio-scsi"
blockDeviceAIO := "io_uring"
enableIOThreads := true
hotplugVFIOOnRootBus := true
hotPlugVFIO = config.NoPort
coldPlugVFIO = config.BridgePort
disableNewNetNs := false
@@ -108,7 +107,6 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntime
BlockDeviceDriver: blockDeviceDriver,
BlockDeviceAIO: blockDeviceAIO,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
DisableNewNetNs: disableNewNetNs,
@@ -172,7 +170,6 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntime
BlockDeviceAIO: defaultBlockDeviceAIO,
DefaultBridges: defaultBridgesCount,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
Msize9p: defaultMsize9p,
@@ -613,7 +610,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
machineType := "machineType"
disableBlock := true
enableIOThreads := true
hotplugVFIOOnRootBus := true
coldPlugVFIO = config.BridgePort
orgVHostVSockDevicePath := utils.VHostVSockDevicePath
blockDeviceAIO := "io_uring"
@@ -632,7 +628,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
MachineType: machineType,
DisableBlockDeviceUse: disableBlock,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
ColdPlugVFIO: coldPlugVFIO,
RxRateLimiterMaxRate: rxRateLimiterMaxRate,
TxRateLimiterMaxRate: txRateLimiterMaxRate,
@@ -684,10 +679,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
t.Errorf("Expected value for enable IOThreads %v, got %v", enableIOThreads, config.EnableIOThreads)
}
if config.HotplugVFIOOnRootBus != hotplugVFIOOnRootBus {
t.Errorf("Expected value for HotplugVFIOOnRootBus %v, got %v", hotplugVFIOOnRootBus, config.HotplugVFIOOnRootBus)
}
if config.RxRateLimiterMaxRate != rxRateLimiterMaxRate {
t.Errorf("Expected value for rx rate limiter %v, got %v", rxRateLimiterMaxRate, config.RxRateLimiterMaxRate)
}
@@ -809,7 +800,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) {
machineType := "machineType"
disableBlock := true
enableIOThreads := true
hotplugVFIOOnRootBus := true
hypervisor := hypervisor{
Path: hypervisorPath,
@@ -819,7 +809,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) {
MachineType: machineType,
DisableBlockDeviceUse: disableBlock,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
}
_, err := newQemuHypervisorConfig(hypervisor)

View File

@@ -511,12 +511,6 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig,
return err
}
if err := newAnnotationConfiguration(ocispec, vcAnnotations.HotplugVFIOOnRootBus).setBool(func(hotplugVFIOOnRootBus bool) {
config.HypervisorConfig.HotplugVFIOOnRootBus = hotplugVFIOOnRootBus
}); err != nil {
return err
}
if err := newAnnotationConfiguration(ocispec, vcAnnotations.UseLegacySerial).setBool(func(useLegacySerial bool) {
config.HypervisorConfig.LegacySerial = useLegacySerial
}); err != nil {

View File

@@ -659,7 +659,6 @@ func TestAddHypervisorAnnotations(t *testing.T) {
ocispec.Annotations[vcAnnotations.DisableVhostNet] = "true"
ocispec.Annotations[vcAnnotations.GuestHookPath] = "/usr/bin/"
ocispec.Annotations[vcAnnotations.DisableImageNvdimm] = "true"
ocispec.Annotations[vcAnnotations.HotplugVFIOOnRootBus] = "true"
ocispec.Annotations[vcAnnotations.ColdPlugVFIO] = config.BridgePort
ocispec.Annotations[vcAnnotations.HotPlugVFIO] = config.NoPort
ocispec.Annotations[vcAnnotations.IOMMUPlatform] = "true"
@@ -700,7 +699,6 @@ func TestAddHypervisorAnnotations(t *testing.T) {
assert.Equal(sbConfig.HypervisorConfig.DisableVhostNet, true)
assert.Equal(sbConfig.HypervisorConfig.GuestHookPath, "/usr/bin/")
assert.Equal(sbConfig.HypervisorConfig.DisableImageNvdimm, true)
assert.Equal(sbConfig.HypervisorConfig.HotplugVFIOOnRootBus, true)
assert.Equal(string(sbConfig.HypervisorConfig.ColdPlugVFIO), string(config.BridgePort))
assert.Equal(string(sbConfig.HypervisorConfig.HotPlugVFIO), string(config.NoPort))
assert.Equal(sbConfig.HypervisorConfig.IOMMUPlatform, true)

View File

@@ -18,6 +18,7 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager"
deviceManager "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager"
volume "github.com/kata-containers/kata-containers/src/runtime/pkg/direct-volume"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc"
@@ -878,9 +879,16 @@ func (c *Container) create(ctx context.Context) (err error) {
// If cold-plug we've attached the devices already, do not try to
// attach them a second time.
coldPlugVFIO := (c.sandbox.config.HypervisorConfig.ColdPlugVFIO != config.NoPort)
modeVFIO := (c.sandbox.config.VfioMode == config.VFIOModeVFIO)
if coldPlugVFIO {
var cntDevices []ContainerDevice
for _, dev := range c.devices {
isVFIOControlDevice := deviceManager.IsVFIOControlDevice(dev.ContainerPath)
if isVFIOControlDevice && modeVFIO {
cntDevices = append(cntDevices, dev)
}
if strings.HasPrefix(dev.ContainerPath, vfioPath) {
c.Logger().WithFields(logrus.Fields{
"device": dev,

View File

@@ -284,10 +284,6 @@ type HypervisorConfig struct {
// DisableImageNvdimm is used to disable guest rootfs image nvdimm devices
DisableImageNvdimm bool
// HotplugVFIOOnRootBus is used to indicate if devices need to be hotplugged on the
// root bus instead of a bridge.
HotplugVFIOOnRootBus bool
// HotPlugVFIO is used to indicate if devices need to be hotplugged on the
// root port, switch, bridge or no port
HotPlugVFIO hv.PCIePort

View File

@@ -312,117 +312,356 @@ type Param struct {
// HypervisorConfig is the hypervisor configuration.
// nolint: govet
type HypervisorConfig struct {
customAssets map[types.AssetType]*types.Asset
SeccompSandbox string
KernelPath string
ImagePath string
InitrdPath string
FirmwarePath string
FirmwareVolumePath string
MachineAccelerators string
CPUFeatures string
HypervisorPath string
HypervisorCtlPath string
GuestPreAttestationKeyset string
BlockDeviceDriver string
HypervisorMachineType string
GuestPreAttestationURI string
GuestPreAttestationMode string
DevicesStatePath string
EntropySource string
SharedFS string
SharedPath string
VirtioFSDaemon string
VirtioFSCache string
FileBackedMemRootDir string
VhostUserStorePath string
GuestMemoryDumpPath string
GuestHookPath string
VMid string
VMStorePath string
RunStorePath string
SELinuxProcessLabel string
JailerPath string
MemoryPath string
SEVCertChainPath string
BlockDeviceAIO string
User string
RemoteHypervisorSocket string
SandboxName string
SandboxNamespace string
JailerPathList []string
EntropySourceList []string
VirtioFSDaemonList []string
VirtioFSExtraArgs []string
EnableAnnotations []string
FileBackedMemRootList []string
PFlash []string
VhostUserStorePathList []string
HypervisorCtlPathList []string
KernelParams []Param
Groups []uint32
HypervisorPathList []string
HypervisorParams []Param
DiskRateLimiterBwOneTimeBurst int64
DiskRateLimiterOpsMaxRate int64
RootfsType string
VhostUserDeviceReconnect uint32
// customAssets is a map of assets.
// Each value in that map takes precedence over the configured assets.
// For example, if there is a value for the "kernel" key in this map,
// it will be used for the sandbox's kernel path instead of KernelPath.
customAssets map[types.AssetType]*types.Asset
// Supplementary group IDs.
Groups []uint32
// KernelPath is the guest kernel host path.
KernelPath string
// ImagePath is the guest image host path.
ImagePath string
// InitrdPath is the guest initrd image host path.
// ImagePath and InitrdPath cannot be set at the same time.
InitrdPath string
// RootfsType is filesystem type of rootfs.
RootfsType string
// FirmwarePath is the bios host path
FirmwarePath string
// FirmwareVolumePath is the configuration volume path for the firmware
FirmwareVolumePath string
// MachineAccelerators are machine specific accelerators
MachineAccelerators string
// CPUFeatures are cpu specific features
CPUFeatures string
// HypervisorPath is the hypervisor executable host path.
HypervisorPath string
// HypervisorCtlPath is the hypervisor ctl executable host path.
HypervisorCtlPath string
// JailerPath is the jailer executable host path.
JailerPath string
// BlockDeviceDriver specifies the driver to be used for block device
// either VirtioSCSI or VirtioBlock with the default driver being defaultBlockDriver
BlockDeviceDriver string
// HypervisorMachineType specifies the type of machine being
// emulated.
HypervisorMachineType string
// MemoryPath is the memory file path of VM memory. Used when either BootToBeTemplate or
// BootFromTemplate is true.
MemoryPath string
// DevicesStatePath is the VM device state file path. Used when either BootToBeTemplate or
// BootFromTemplate is true.
DevicesStatePath string
// EntropySource is the path to a host source of
// entropy (/dev/random, /dev/urandom or real hardware RNG device)
EntropySource string
// Shared file system type:
// - virtio-9p
// - virtio-fs (default)
SharedFS string
// Path for filesystem sharing
SharedPath string
// VirtioFSDaemon is the virtio-fs vhost-user daemon path
VirtioFSDaemon string
// VirtioFSCache cache mode for fs version cache
VirtioFSCache string
// File based memory backend root directory
FileBackedMemRootDir string
// VhostUserStorePath is the directory path where vhost-user devices
// related folders, sockets and device nodes should be.
VhostUserStorePath string
// VhostUserDeviceReconnect is the timeout for reconnecting on non-server spdk sockets
// when the remote end goes away. Zero disables reconnecting.
VhostUserDeviceReconnect uint32
// GuestCoredumpPath is the path in host for saving guest memory dump
GuestMemoryDumpPath string
// GuestHookPath is the path within the VM that will be used for 'drop-in' hooks
GuestHookPath string
// VMid is the id of the VM that create the hypervisor if the VM is created by the factory.
// VMid is "" if the hypervisor is not created by the factory.
VMid string
// VMStorePath is the location on disk where VM information will persist
VMStorePath string
// VMStorePath is the location on disk where runtime information will persist
RunStorePath string
// SELinux label for the VM
SELinuxProcessLabel string
// HypervisorPathList is the list of hypervisor paths names allowed in annotations
HypervisorPathList []string
// HypervisorCtlPathList is the list of hypervisor control paths names allowed in annotations
HypervisorCtlPathList []string
// JailerPathList is the list of jailer paths names allowed in annotations
JailerPathList []string
// EntropySourceList is the list of valid entropy sources
EntropySourceList []string
// VirtioFSDaemonList is the list of valid virtiofs names for annotations
VirtioFSDaemonList []string
// VirtioFSExtraArgs passes options to virtiofsd daemon
VirtioFSExtraArgs []string
// Enable annotations by name
EnableAnnotations []string
// FileBackedMemRootList is the list of valid root directories values for annotations
FileBackedMemRootList []string
// PFlash image paths
PFlash []string
// VhostUserStorePathList is the list of valid values for vhost-user paths
VhostUserStorePathList []string
// SeccompSandbox is the qemu function which enables the seccomp feature
SeccompSandbox string
// BlockiDeviceAIO specifies the I/O API to be used.
BlockDeviceAIO string
// The user maps to the uid.
User string
// KernelParams are additional guest kernel parameters.
KernelParams []Param
// HypervisorParams are additional hypervisor parameters.
HypervisorParams []Param
// SGXEPCSize specifies the size in bytes for the EPC Section.
// Enable SGX. Hardware-based isolation and memory encryption.
SGXEPCSize int64
// DiskRateLimiterBwRate is used to control disk I/O bandwidth on VM level.
// The same value, defined in bits per second, is used for inbound and outbound bandwidth.
DiskRateLimiterBwMaxRate int64
// DiskRateLimiterBwOneTimeBurst is used to control disk I/O bandwidth on VM level.
// This increases the initial max rate and this initial extra credit does *NOT* replenish
// and can be used for an *initial* burst of data.
DiskRateLimiterBwOneTimeBurst int64
// DiskRateLimiterOpsRate is used to control disk I/O operations on VM level.
// The same value, defined in operations per second, is used for inbound and outbound bandwidth.
DiskRateLimiterOpsMaxRate int64
// DiskRateLimiterOpsOneTimeBurst is used to control disk I/O operations on VM level.
// This increases the initial max rate and this initial extra credit does *NOT* replenish
// and can be used for an *initial* burst of data.
DiskRateLimiterOpsOneTimeBurst int64
SGXEPCSize int64
DefaultMaxMemorySize uint64
NetRateLimiterBwMaxRate int64
NetRateLimiterBwOneTimeBurst int64
NetRateLimiterOpsMaxRate int64
NetRateLimiterOpsOneTimeBurst int64
MemOffset uint64
TxRateLimiterMaxRate uint64
DiskRateLimiterBwMaxRate int64
RxRateLimiterMaxRate uint64
MemorySize uint32
DefaultMaxVCPUs uint32
DefaultBridges uint32
Msize9p uint32
MemSlots uint32
VirtioFSCacheSize uint32
VirtioFSQueueSize uint32
Uid uint32
Gid uint32
SEVGuestPolicy uint32
SNPGuestPolicy uint64
NumVCPUs uint32
RemoteHypervisorTimeout uint32
IOMMUPlatform bool
EnableIOThreads bool
Debug bool
MemPrealloc bool
HugePages bool
VirtioMem bool
IOMMU bool
DisableBlockDeviceUse bool
DisableNestingChecks bool
DisableImageNvdimm bool
HotplugVFIOOnRootBus bool
GuestMemoryDumpPaging bool
ConfidentialGuest bool
SevSnpGuest bool
GuestPreAttestation bool
BlockDeviceCacheNoflush bool
BlockDeviceCacheDirect bool
BlockDeviceCacheSet bool
BootToBeTemplate bool
BootFromTemplate bool
DisableVhostNet bool
EnableVhostUserStore bool
GuestSwap bool
Rootless bool
DisableSeccomp bool
DisableSeLinux bool
DisableGuestSeLinux bool
LegacySerial bool
HotPlugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
VFIODevices []config.DeviceInfo
VhostUserBlkDevices []config.DeviceInfo
// RxRateLimiterMaxRate is used to control network I/O inbound bandwidth on VM level.
RxRateLimiterMaxRate uint64
// TxRateLimiterMaxRate is used to control network I/O outbound bandwidth on VM level.
TxRateLimiterMaxRate uint64
// NetRateLimiterBwRate is used to control network I/O bandwidth on VM level.
// The same value, defined in bits per second, is used for inbound and outbound bandwidth.
NetRateLimiterBwMaxRate int64
// NetRateLimiterBwOneTimeBurst is used to control network I/O bandwidth on VM level.
// This increases the initial max rate and this initial extra credit does *NOT* replenish
// and can be used for an *initial* burst of data.
NetRateLimiterBwOneTimeBurst int64
// NetRateLimiterOpsRate is used to control network I/O operations on VM level.
// The same value, defined in operations per second, is used for inbound and outbound bandwidth.
NetRateLimiterOpsMaxRate int64
// NetRateLimiterOpsOneTimeBurst is used to control network I/O operations on VM level.
// This increases the initial max rate and this initial extra credit does *NOT* replenish
// and can be used for an *initial* burst of data.
NetRateLimiterOpsOneTimeBurst int64
// MemOffset specifies memory space for nvdimm device
MemOffset uint64
// VFIODevices are used to get PCIe device info early before the sandbox
// is started to make better PCIe topology decisions
VFIODevices []config.DeviceInfo
// VhostUserBlkDevices are handled differently in Q35 and Virt machine
// type. capture them early before the sandbox to make better PCIe topology
// decisions
VhostUserBlkDevices []config.DeviceInfo
// HotplugVFIO is used to indicate if devices need to be hotplugged on the
// root port or a switch
HotPlugVFIO config.PCIePort
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
// root port, switch or no port
ColdPlugVFIO config.PCIePort
// NumVCPUs specifies default number of vCPUs for the VM.
NumVCPUs uint32
//DefaultMaxVCPUs specifies the maximum number of vCPUs for the VM.
DefaultMaxVCPUs uint32
// DefaultMem specifies default memory size in MiB for the VM.
MemorySize uint32
// DefaultMaxMemorySize specifies the maximum amount of RAM in MiB for the VM.
DefaultMaxMemorySize uint64
// DefaultBridges specifies default number of bridges for the VM.
// Bridges can be used to hot plug devices
DefaultBridges uint32
// Msize9p is used as the msize for 9p shares
Msize9p uint32
// MemSlots specifies default memory slots the VM.
MemSlots uint32
// VirtioFSCacheSize is the DAX cache size in MiB
VirtioFSCacheSize uint32
// Size of virtqueues
VirtioFSQueueSize uint32
// User ID.
Uid uint32
// Group ID.
Gid uint32
// BlockDeviceCacheSet specifies cache-related options will be set to block devices or not.
BlockDeviceCacheSet bool
// BlockDeviceCacheDirect specifies cache-related options for block devices.
// Denotes whether use of O_DIRECT (bypass the host page cache) is enabled.
BlockDeviceCacheDirect bool
// BlockDeviceCacheNoflush specifies cache-related options for block devices.
// Denotes whether flush requests for the device are ignored.
BlockDeviceCacheNoflush bool
// DisableBlockDeviceUse disallows a block device from being used.
DisableBlockDeviceUse bool
// EnableIOThreads enables IO to be processed in a separate thread.
// Supported currently for virtio-scsi driver.
EnableIOThreads bool
// Debug changes the default hypervisor and kernel parameters to
// enable debug output where available. And Debug also enable the hmp socket.
Debug bool
// MemPrealloc specifies if the memory should be pre-allocated
MemPrealloc bool
// HugePages specifies if the memory should be pre-allocated from huge pages
HugePages bool
// VirtioMem is used to enable/disable virtio-mem
VirtioMem bool
// IOMMU specifies if the VM should have a vIOMMU
IOMMU bool
// IOMMUPlatform is used to indicate if IOMMU_PLATFORM is enabled for supported devices
IOMMUPlatform bool
// DisableNestingChecks is used to override customizations performed
// when running on top of another VMM.
DisableNestingChecks bool
// DisableImageNvdimm is used to disable guest rootfs image nvdimm devices
DisableImageNvdimm bool
// GuestMemoryDumpPaging is used to indicate if enable paging
// for QEMU dump-guest-memory command
GuestMemoryDumpPaging bool
// Enable confidential guest support.
// Enable or disable different hardware features, ranging
// from memory encryption to both memory and CPU-state encryption and integrity.
ConfidentialGuest bool
// Enable SEV-SNP guests on AMD machines capable of both
SevSnpGuest bool
// BootToBeTemplate used to indicate if the VM is created to be a template VM
BootToBeTemplate bool
// BootFromTemplate used to indicate if the VM should be created from a template VM
BootFromTemplate bool
// DisableVhostNet is used to indicate if host supports vhost_net
DisableVhostNet bool
// EnableVhostUserStore is used to indicate if host supports vhost-user-blk/scsi
EnableVhostUserStore bool
// GuestSwap Used to enable/disable swap in the guest
GuestSwap bool
// Rootless is used to enable rootless VMM process
Rootless bool
// Disable seccomp from the hypervisor process
DisableSeccomp bool
// Disable selinux from the hypervisor process
DisableSeLinux bool
// Disable selinux from the container process
DisableGuestSeLinux bool
// Use legacy serial for the guest console
LegacySerial bool
GuestPreAttestation bool
GuestPreAttestationKeyset string
GuestPreAttestationURI string
GuestPreAttestationMode string
RemoteHypervisorSocket string
RemoteHypervisorTimeout uint32
SandboxName string
SandboxNamespace string
SEVCertChainPath string
SEVGuestPolicy uint32
SNPGuestPolicy uint64
}
// vcpu mapping from vcpu number to thread number

View File

@@ -244,7 +244,6 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) {
FileBackedMemRootList: sconfig.HypervisorConfig.FileBackedMemRootList,
DisableNestingChecks: sconfig.HypervisorConfig.DisableNestingChecks,
DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm,
HotplugVFIOOnRootBus: sconfig.HypervisorConfig.HotplugVFIOOnRootBus,
BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate,
BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate,
DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet,
@@ -485,7 +484,6 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) {
FileBackedMemRootList: hconf.FileBackedMemRootList,
DisableNestingChecks: hconf.DisableNestingChecks,
DisableImageNvdimm: hconf.DisableImageNvdimm,
HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus,
HotPlugVFIO: hconf.HotPlugVFIO,
ColdPlugVFIO: hconf.ColdPlugVFIO,
BootToBeTemplate: hconf.BootToBeTemplate,

View File

@@ -191,10 +191,6 @@ type HypervisorConfig struct {
// DisableImageNvdimm disables nvdimm for guest rootfs image
DisableImageNvdimm bool
// HotplugVFIOOnRootBus is used to indicate if devices need to be hotplugged on the
// root bus instead of a bridge.
HotplugVFIOOnRootBus bool
// HotPlugVFIO is used to indicate if devices need to be hotplugged on the
// root, switch, bridge or no-port
HotPlugVFIO config.PCIePort

View File

@@ -139,10 +139,6 @@ const (
// DisableImageNvdimm is a sandbox annotation to specify use of nvdimm device for guest rootfs image.
DisableImageNvdimm = kataAnnotHypervisorPrefix + "disable_image_nvdimm"
// HotplugVFIOOnRootBus is a sandbox annotation used to indicate if devices need to be hotplugged on the
// root bus instead of a bridge.
HotplugVFIOOnRootBus = kataAnnotHypervisorPrefix + "hotplug_vfio_on_root_bus"
// ColdPlugVFIO is a sandbox annotation used to indicate if devices need to be coldplugged.
ColdPlugVFIO = kataAnnotHypervisorPrefix + "cold_plug_vfio"

View File

@@ -66,11 +66,6 @@ const romFile = ""
// Default value is false.
const defaultDisableModern = false
// A deeper PCIe topology than 5 is already not advisable just for the sake
// of having enough buffer we limit ourselves to 10 and exit if we reach
// the root bus
const maxPCIeTopoDepth = 10
type qmpChannel struct {
qmp *govmmQemu.QMP
ctx context.Context
@@ -81,15 +76,14 @@ type qmpChannel struct {
// QemuState keeps Qemu's state
type QemuState struct {
UUID string
HotPlugVFIO config.PCIePort
Bridges []types.Bridge
HotpluggedVCPUs []hv.CPUDevice
HotpluggedMemory int
VirtiofsDaemonPid int
HotplugVFIOOnRootBus bool
HotplugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
UUID string
HotPlugVFIO config.PCIePort
Bridges []types.Bridge
HotpluggedVCPUs []hv.CPUDevice
HotpluggedMemory int
VirtiofsDaemonPid int
HotplugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
}
// qemu is an Hypervisor interface implementation for the Linux qemu hypervisor.
@@ -285,7 +279,6 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso
q.state.UUID = uuid.Generate().String()
q.state.HotPlugVFIO = q.config.HotPlugVFIO
q.state.ColdPlugVFIO = q.config.ColdPlugVFIO
q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus
q.state.HotPlugVFIO = q.config.HotPlugVFIO
// The path might already exist, but in case of VM templating,
@@ -808,6 +801,7 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig
if err != nil {
return fmt.Errorf("Cannot get host path for device: %v err: %v", dev, err)
}
devicesPerIOMMUGroup, err := drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
if err != nil {
return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err)
@@ -818,7 +812,7 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig
}
}
}
vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus)
vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort)
vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort)
numOfVhostUserBlockDevices := len(hypervisorConfig.VhostUserBlkDevices)
@@ -1723,7 +1717,7 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V
config.PCIeDevices[config.RootPort][devID] = true
bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID)
bridgeSlot, err := q.qomGetSlot(bridgeQomPath)
bridgeSlot, err := q.arch.qomGetSlot(bridgeQomPath, &q.qmpMonitorCh)
if err != nil {
return err
}
@@ -1826,88 +1820,6 @@ func (q *qemu) hotplugVhostUserDevice(ctx context.Context, vAttr *config.VhostUs
}
}
// Query QMP to find the PCI slot of a device, given its QOM path or ID
func (q *qemu) qomGetSlot(qomPath string) (types.PciSlot, error) {
addr, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qomPath, "addr")
if err != nil {
return types.PciSlot{}, err
}
addrf, ok := addr.(float64)
// XXX going via float makes no real sense, but that's how
// JSON works, and we'll get away with it for the small values
// we have here
if !ok {
return types.PciSlot{}, fmt.Errorf("addr QOM property of %q is %T not a number", qomPath, addr)
}
addri := int(addrf)
slotNum, funcNum := addri>>3, addri&0x7
if funcNum != 0 {
return types.PciSlot{}, fmt.Errorf("Unexpected non-zero PCI function (%02x.%1x) on %q",
slotNum, funcNum, qomPath)
}
return types.PciSlotFromInt(slotNum)
}
// Query QMP to find a device's PCI path given its QOM path or ID
func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) {
var slots []types.PciSlot
devSlot, err := q.qomGetSlot(qemuID)
if err != nil {
return types.PciPath{}, err
}
slots = append(slots, devSlot)
// This only works for Q35 and Virt
r, _ := regexp.Compile(`^/machine/.*/pcie.0`)
var parentPath = qemuID
// We do not want to use a forever loop here, a deeper PCIe topology
// than 5 is already not advisable just for the sake of having enough
// buffer we limit ourselves to 10 and leave the loop early if we hit
// the root bus.
for i := 1; i <= maxPCIeTopoDepth; i++ {
parenBusQOM, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, parentPath, "parent_bus")
if err != nil {
return types.PciPath{}, err
}
busQOM, ok := parenBusQOM.(string)
if !ok {
return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, parenBusQOM)
}
// If we hit /machine/q35/pcie.0 we're done this is the root bus
// we climbed the complete hierarchy
if r.Match([]byte(busQOM)) {
break
}
// `bus` is the QOM path of the QOM bus object, but we need
// the PCI parent_bus which manages that bus. There doesn't seem
// to be a way to get that other than to simply drop the last
// path component.
idx := strings.LastIndex(busQOM, "/")
if idx == -1 {
return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM)
}
parentBus := busQOM[:idx]
parentSlot, err := q.qomGetSlot(parentBus)
if err != nil {
return types.PciPath{}, err
}
// Prepend the slots, since we're climbing the hierarchy
slots = append([]types.PciSlot{parentSlot}, slots...)
parentPath = parentBus
}
return types.PciPathFromSlots(slots...)
}
func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFIODev) (err error) {
return q.executeVFIODeviceAdd(device)
}
@@ -1937,7 +1849,7 @@ func (q *qemu) executePCIVFIODeviceAdd(device *config.VFIODev, addr string, brid
case config.VFIOPCIDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, addr, bridgeID, romFile)
case config.VFIOAPDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev, device.ID)
default:
return fmt.Errorf("Incorrect VFIO device type found")
}
@@ -1950,7 +1862,7 @@ func (q *qemu) executeVFIODeviceAdd(device *config.VFIODev) error {
case config.VFIOPCIDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, "", device.Bus, romFile)
case config.VFIOAPDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev, device.ID)
default:
return fmt.Errorf("Incorrect VFIO device type found")
}
@@ -1968,46 +1880,43 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op
"hot-plug-vfio": q.state.HotPlugVFIO,
"device-info": string(buf),
}).Info("Start hot-plug VFIO device")
// In case MachineType is q35, a PCIe device is hotplugged on
// a PCIe Root Port or alternatively on a PCIe Switch Port
if q.HypervisorConfig().HypervisorMachineType != QemuQ35 && q.HypervisorConfig().HypervisorMachineType != QemuVirt {
device.Bus = ""
} else {
var err error
// In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
// for pc machine type instead of bridge. This is useful for devices that require
// a large PCI BAR which is a currently a limitation with PCI bridges.
if q.state.HotPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus {
err = q.hotplugVFIODeviceRootPort(ctx, device)
} else if q.state.HotPlugVFIO == config.SwitchPort {
err = q.hotplugVFIODeviceSwitchPort(ctx, device)
} else {
err = q.hotplugVFIODeviceBridgePort(ctx, device)
}
if err != nil {
return err
}
err = fmt.Errorf("Incorrect hot plug configuration %v for device %v found", q.state.HotPlugVFIO, device)
// In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
// for pc machine type instead of bridge. This is useful for devices that require
// a large PCI BAR which is a currently a limitation with PCI bridges.
if q.state.HotPlugVFIO == config.RootPort {
err = q.hotplugVFIODeviceRootPort(ctx, device)
} else if q.state.HotPlugVFIO == config.SwitchPort {
err = q.hotplugVFIODeviceSwitchPort(ctx, device)
} else if q.state.HotPlugVFIO == config.BridgePort {
err = q.hotplugVFIODeviceBridgePort(ctx, device)
}
// XXX: Depending on whether we're doing root port or
if err != nil {
return err
}
// Depending on whether we're doing root port or
// bridge hotplug, and how the bridge is set up in
// other parts of the code, we may or may not already
// have information about the slot number of the
// bridge and or the device. For simplicity, just
// query both of them back from qemu
device.GuestPciPath, err = q.qomGetPciPath(device.ID)
// query both of them back from qemu based on the arch
device.GuestPciPath, err = q.arch.qomGetPciPath(device.ID, &q.qmpMonitorCh)
return err
}
} else {
q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device")
q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device")
if !q.state.HotplugVFIOOnRootBus {
if err := q.arch.removeDeviceFromBridge(device.ID); err != nil {
return err
if q.state.HotPlugVFIO == config.BridgePort {
if err := q.arch.removeDeviceFromBridge(device.ID); err != nil {
return err
}
}
return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID)
}
return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID)
}
func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error {
@@ -2966,7 +2875,6 @@ func (q *qemu) Save() (s hv.HypervisorState) {
s.Type = string(QemuHypervisor)
s.UUID = q.state.UUID
s.HotpluggedMemory = q.state.HotpluggedMemory
s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus
for _, bridge := range q.arch.getBridges() {
s.Bridges = append(s.Bridges, hv.Bridge{
@@ -2988,7 +2896,6 @@ func (q *qemu) Save() (s hv.HypervisorState) {
func (q *qemu) Load(s hv.HypervisorState) {
q.state.UUID = s.UUID
q.state.HotpluggedMemory = s.HotpluggedMemory
q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus
q.state.VirtiofsDaemonPid = s.VirtiofsDaemonPid
for _, bridge := range s.Bridges {

View File

@@ -13,6 +13,7 @@ import (
"errors"
"fmt"
"os"
"regexp"
"runtime"
"strings"
@@ -25,6 +26,11 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
)
// A deeper PCIe topology than 5 is already not advisable just for the sake
// of having enough buffer we limit ourselves to 10 and exit if we reach
// the root bus
const maxPCIeTopoDepth = 10
type qemuArch interface {
// enableNestingChecks nesting checks will be honoured
enableNestingChecks()
@@ -169,6 +175,12 @@ type qemuArch interface {
// and 64-bit addressable memory
getBARsMaxAddressableMemory() (uint64, uint64)
// Query QMP to find a device's PCI path given its QOM path or ID
qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error)
// Query QMP to find the PCI slot of a device, given its QOM path or ID
qomGetSlot(qomPath string, qmpCh *qmpChannel) (types.PciSlot, error)
// append SEV object type to the VM definition
appendSEVObject(devices []govmmQemu.Device, firmware, firmwareVolume string, config sevKbs.GuestPreAttestationConfig) ([]govmmQemu.Device, string, error)
@@ -915,6 +927,88 @@ func (q *qemuArchBase) appendProtectionDevice(devices []govmmQemu.Device, firmwa
return devices, firmware, nil
}
// Query QMP to find the PCI slot of a device, given its QOM path or ID
func (q *qemuArchBase) qomGetSlot(qomPath string, qmpCh *qmpChannel) (types.PciSlot, error) {
addr, err := qmpCh.qmp.ExecQomGet(qmpCh.ctx, qomPath, "addr")
if err != nil {
return types.PciSlot{}, err
}
addrf, ok := addr.(float64)
// XXX going via float makes no real sense, but that's how
// JSON works, and we'll get away with it for the small values
// we have here
if !ok {
return types.PciSlot{}, fmt.Errorf("addr QOM property of %q is %T not a number", qomPath, addr)
}
addri := int(addrf)
slotNum, funcNum := addri>>3, addri&0x7
if funcNum != 0 {
return types.PciSlot{}, fmt.Errorf("Unexpected non-zero PCI function (%02x.%1x) on %q",
slotNum, funcNum, qomPath)
}
return types.PciSlotFromInt(slotNum)
}
// Query QMP to find a device's PCI path given its QOM path or ID
func (q *qemuArchBase) qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error) {
var slots []types.PciSlot
devSlot, err := q.qomGetSlot(qemuID, qmpCh)
if err != nil {
return types.PciPath{}, err
}
slots = append(slots, devSlot)
// This only works for Q35 and Virt
r, _ := regexp.Compile(`^/machine/.*/pcie.0`)
var parentPath = qemuID
// We do not want to use a forever loop here, a deeper PCIe topology
// than 5 is already not advisable just for the sake of having enough
// buffer we limit ourselves to 10 and leave the loop early if we hit
// the root bus.
for i := 1; i <= maxPCIeTopoDepth; i++ {
parenBusQOM, err := qmpCh.qmp.ExecQomGet(qmpCh.ctx, parentPath, "parent_bus")
if err != nil {
return types.PciPath{}, err
}
busQOM, ok := parenBusQOM.(string)
if !ok {
return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, parenBusQOM)
}
// If we hit /machine/q35/pcie.0 we're done this is the root bus
// we climbed the complete hierarchy
if r.Match([]byte(busQOM)) {
break
}
// `bus` is the QOM path of the QOM bus object, but we need
// the PCI parent_bus which manages that bus. There doesn't seem
// to be a way to get that other than to simply drop the last
// path component.
idx := strings.LastIndex(busQOM, "/")
if idx == -1 {
return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM)
}
parentBus := busQOM[:idx]
parentSlot, err := q.qomGetSlot(parentBus, qmpCh)
if err != nil {
return types.PciPath{}, err
}
// Prepend the slots, since we're climbing the hierarchy
slots = append([]types.PciSlot{parentSlot}, slots...)
parentPath = parentBus
}
return types.PciPathFromSlots(slots...)
}
// AMD SEV methods
func (q *qemuArchBase) appendSEVObject(devices []govmmQemu.Device, firmware, firmwareVolume string, config sevKbs.GuestPreAttestationConfig) ([]govmmQemu.Device, string, error) {
hvLogger.WithField("arch", runtime.GOARCH).Warnf("Confidential Computing has not been implemented for this architecture")

View File

@@ -351,3 +351,32 @@ func (q *qemuS390x) appendProtectionDevice(devices []govmmQemu.Device, firmware,
return devices, firmware, fmt.Errorf("Unsupported guest protection technology: %v", q.protection)
}
}
func (q *qemuS390x) appendVFIODevice(devices []govmmQemu.Device, vfioDev config.VFIODev) []govmmQemu.Device {
if vfioDev.SysfsDev == "" {
return devices
}
if len(vfioDev.APDevices) > 0 {
devices = append(devices,
govmmQemu.VFIODevice{
SysfsDev: vfioDev.SysfsDev,
Transport: govmmQemu.TransportAP,
},
)
return devices
}
devices = append(devices,
govmmQemu.VFIODevice{
SysfsDev: vfioDev.SysfsDev,
},
)
return devices
}
// Query QMP to find a device's PCI path given its QOM path or ID
func (q *qemuS390x) qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error) {
hvLogger.Warnf("qomGetPciPath not implemented for s390x")
return types.PciPath{}, nil
}

View File

@@ -30,8 +30,6 @@ import (
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
cri "github.com/containerd/containerd/pkg/cri/annotations"
crio "github.com/containers/podman/v4/pkg/annotations"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/api"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
@@ -611,67 +609,11 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
return nil, err
}
if len(sandboxConfig.Containers) > 0 {
// These values are required by remote hypervisor
for _, a := range []string{cri.SandboxName, crio.SandboxName} {
if value, ok := sandboxConfig.Containers[0].Annotations[a]; ok {
sandboxConfig.HypervisorConfig.SandboxName = value
}
}
for _, a := range []string{cri.SandboxNamespace, crio.Namespace} {
if value, ok := sandboxConfig.Containers[0].Annotations[a]; ok {
sandboxConfig.HypervisorConfig.SandboxNamespace = value
}
}
coldPlugVFIO, err := s.coldOrHotPlugVFIO(&sandboxConfig)
if err != nil {
return nil, err
}
// If we have a confidential guest we need to cold-plug the PCIe VFIO devices
// until we have TDISP/IDE PCIe support.
coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort)
// Aggregate all the containner devices for hot-plug and use them to dedcue
// the correct amount of ports to reserve for the hypervisor.
hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort)
var vfioDevices []config.DeviceInfo
// vhost-user-block device is a PCIe device in Virt, keep track of it
// for correct number of PCIe root ports.
var vhostUserBlkDevices []config.DeviceInfo
for cnt, containers := range sandboxConfig.Containers {
for dev, device := range containers.DeviceInfos {
if deviceManager.IsVhostUserBlk(device) {
vhostUserBlkDevices = append(vhostUserBlkDevices, device)
continue
}
isVFIO := deviceManager.IsVFIO(device.ContainerPath)
if hotPlugVFIO && isVFIO {
vfioDevices = append(vfioDevices, device)
sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO
}
if coldPlugVFIO && isVFIO {
device.ColdPlug = true
device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO
vfioDevices = append(vfioDevices, device)
// We need to remove the devices marked for cold-plug
// otherwise at the container level the kata-agent
// will try to hot-plug them.
sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging"
}
}
var filteredDevices []config.DeviceInfo
for _, device := range containers.DeviceInfos {
if device.ID != "remove-we-are-cold-plugging" {
filteredDevices = append(filteredDevices, device)
}
}
sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices
}
sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices
sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices
// store doesn't require hypervisor to be stored immediately
if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil {
return nil, err
@@ -685,7 +627,8 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
return s, nil
}
for _, dev := range vfioDevices {
for _, dev := range sandboxConfig.HypervisorConfig.VFIODevices {
s.Logger().Info("cold-plug device: ", dev)
_, err := s.AddDevice(ctx, dev)
if err != nil {
s.Logger().WithError(err).Debug("Cannot cold-plug add device")
@@ -695,6 +638,70 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
return s, nil
}
func (s *Sandbox) coldOrHotPlugVFIO(sandboxConfig *SandboxConfig) (bool, error) {
// If we have a confidential guest we need to cold-plug the PCIe VFIO devices
// until we have TDISP/IDE PCIe support.
coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort)
// Aggregate all the containner devices for hot-plug and use them to dedcue
// the correct amount of ports to reserve for the hypervisor.
hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort)
modeIsGK := (sandboxConfig.VfioMode == config.VFIOModeGuestKernel)
modeIsVFIO := (sandboxConfig.VfioMode == config.VFIOModeVFIO)
var vfioDevices []config.DeviceInfo
// vhost-user-block device is a PCIe device in Virt, keep track of it
// for correct number of PCIe root ports.
var vhostUserBlkDevices []config.DeviceInfo
for cnt, containers := range sandboxConfig.Containers {
for dev, device := range containers.DeviceInfos {
if deviceManager.IsVhostUserBlk(device) {
vhostUserBlkDevices = append(vhostUserBlkDevices, device)
continue
}
isVFIODevice := deviceManager.IsVFIODevice(device.ContainerPath)
isVFIOControlDevice := deviceManager.IsVFIOControlDevice(device.ContainerPath)
// vfio_mode=vfio needs the VFIO control device add it to the list
// of devices to be added to the VM.
if modeIsVFIO && isVFIOControlDevice && !hotPlugVFIO {
vfioDevices = append(vfioDevices, device)
}
if hotPlugVFIO && isVFIODevice {
device.ColdPlug = false
device.Port = sandboxConfig.HypervisorConfig.HotPlugVFIO
vfioDevices = append(vfioDevices, device)
sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO
}
if coldPlugVFIO && isVFIODevice {
device.ColdPlug = true
device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO
vfioDevices = append(vfioDevices, device)
// We need to remove the devices marked for cold-plug
// otherwise at the container level the kata-agent
// will try to hot-plug them.
if modeIsGK {
sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging"
}
}
}
var filteredDevices []config.DeviceInfo
for _, device := range containers.DeviceInfos {
if device.ID != "remove-we-are-cold-plugging" {
filteredDevices = append(filteredDevices, device)
}
}
sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices
}
sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices
sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices
return coldPlugVFIO, nil
}
func (s *Sandbox) createResourceController() error {
var err error
cgroupPath := ""
@@ -2069,26 +2076,26 @@ func (s *Sandbox) AddDevice(ctx context.Context, info config.DeviceInfo) (api.De
}
var err error
b, err := s.devManager.NewDevice(info)
add, err := s.devManager.NewDevice(info)
if err != nil {
return nil, err
}
defer func() {
if err != nil {
s.devManager.RemoveDevice(b.DeviceID())
s.devManager.RemoveDevice(add.DeviceID())
}
}()
if err = s.devManager.AttachDevice(ctx, b.DeviceID(), s); err != nil {
if err = s.devManager.AttachDevice(ctx, add.DeviceID(), s); err != nil {
return nil, err
}
defer func() {
if err != nil {
s.devManager.DetachDevice(ctx, b.DeviceID(), s)
s.devManager.DetachDevice(ctx, add.DeviceID(), s)
}
}()
return b, nil
return add, nil
}
// updateResources will:

View File

@@ -606,6 +606,7 @@ func TestSandboxAttachDevicesVFIO(t *testing.T) {
HostPath: path,
ContainerPath: path,
DevType: "c",
Port: config.RootPort,
}
dev, err := dm.NewDevice(deviceInfo)
assert.Nil(t, err, "deviceManager.NewDevice return error: %v", err)