mirror of
https://github.com/aljazceru/kata-containers.git
synced 2025-12-18 23:04:20 +01:00
Merge pull request #4492 from zvonkok/pcie-topology
runtime: fix PCIe topology for GPUDirect use-case
This commit is contained in:
@@ -17,7 +17,7 @@ import (
|
||||
"github.com/prometheus/procfs"
|
||||
"github.com/urfave/cli"
|
||||
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
|
||||
@@ -113,8 +113,8 @@ type HypervisorInfo struct {
|
||||
SocketPath string
|
||||
Msize9p uint32
|
||||
MemorySlots uint32
|
||||
PCIeRootPort uint32
|
||||
ColdPlugVFIO hv.PCIePort
|
||||
HotPlugVFIO config.PCIePort
|
||||
ColdPlugVFIO config.PCIePort
|
||||
HotplugVFIOOnRootBus bool
|
||||
Debug bool
|
||||
}
|
||||
@@ -317,9 +317,9 @@ func getHypervisorInfo(config oci.RuntimeConfig) (HypervisorInfo, error) {
|
||||
EntropySource: config.HypervisorConfig.EntropySource,
|
||||
SharedFS: config.HypervisorConfig.SharedFS,
|
||||
VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon,
|
||||
HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO,
|
||||
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
|
||||
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
|
||||
PCIeRootPort: config.HypervisorConfig.PCIeRootPort,
|
||||
SocketPath: socketPath,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -19,12 +19,12 @@ import (
|
||||
"testing"
|
||||
|
||||
"github.com/BurntSushi/toml"
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
|
||||
vcUtils "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
|
||||
specs "github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/urfave/cli"
|
||||
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
|
||||
@@ -74,8 +74,9 @@ func createConfig(configPath string, fileData string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeConfig, err error) {
|
||||
var coldPlugVFIO hv.PCIePort
|
||||
func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.RuntimeConfig, err error) {
|
||||
var hotPlugVFIO config.PCIePort
|
||||
var coldPlugVFIO config.PCIePort
|
||||
const logPath = "/log/path"
|
||||
hypervisorPath := filepath.Join(prefixDir, "hypervisor")
|
||||
kernelPath := filepath.Join(prefixDir, "kernel")
|
||||
@@ -87,8 +88,8 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
|
||||
blockStorageDriver := "virtio-scsi"
|
||||
enableIOThreads := true
|
||||
hotplugVFIOOnRootBus := true
|
||||
pcieRootPort := uint32(2)
|
||||
coldPlugVFIO = hv.NoPort
|
||||
hotPlugVFIO = config.BridgePort
|
||||
coldPlugVFIO = config.NoPort
|
||||
disableNewNetNs := false
|
||||
sharedFS := "virtio-9p"
|
||||
virtioFSdaemon := filepath.Join(prefixDir, "virtiofsd")
|
||||
@@ -132,8 +133,8 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
|
||||
BlockDeviceDriver: blockStorageDriver,
|
||||
EnableIOThreads: enableIOThreads,
|
||||
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
|
||||
HotPlugVFIO: hotPlugVFIO,
|
||||
ColdPlugVFIO: coldPlugVFIO,
|
||||
PCIeRootPort: pcieRootPort,
|
||||
DisableNewNetNs: disableNewNetNs,
|
||||
DefaultVCPUCount: hypConfig.NumVCPUs,
|
||||
DefaultMaxVCPUCount: hypConfig.DefaultMaxVCPUs,
|
||||
@@ -156,12 +157,12 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
|
||||
return "", oci.RuntimeConfig{}, err
|
||||
}
|
||||
|
||||
_, config, err = katautils.LoadConfiguration(configFile, true)
|
||||
_, ociConfig, err = katautils.LoadConfiguration(configFile, true)
|
||||
if err != nil {
|
||||
return "", oci.RuntimeConfig{}, err
|
||||
}
|
||||
|
||||
return configFile, config, nil
|
||||
return configFile, ociConfig, nil
|
||||
}
|
||||
|
||||
func getExpectedAgentDetails(config oci.RuntimeConfig) (AgentInfo, error) {
|
||||
@@ -277,7 +278,7 @@ func getExpectedHypervisor(config oci.RuntimeConfig) HypervisorInfo {
|
||||
VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon,
|
||||
|
||||
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
|
||||
PCIeRootPort: config.HypervisorConfig.PCIeRootPort,
|
||||
HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO,
|
||||
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
|
||||
}
|
||||
|
||||
|
||||
@@ -357,8 +357,15 @@ pflashes = []
|
||||
# Default false
|
||||
#hotplug_vfio_on_root_bus = true
|
||||
|
||||
# Enable hot-plugging of VFIO devices to a bridge-port,
|
||||
# root-port or switch-port.
|
||||
# The default setting is "no-port"
|
||||
#hot_plug_vfio = "root-port"
|
||||
|
||||
# In a confidential compute environment hot-plugging can compromise
|
||||
# security. Enable cold-plugging of VFIO devices to a root-port.
|
||||
# security.
|
||||
# Enable cold-plugging of VFIO devices to a bridge-port,
|
||||
# root-port or switch-port.
|
||||
# The default setting is "no-port", which means disabled.
|
||||
#cold_plug_vfio = "root-port"
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ import (
|
||||
specs "github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
|
||||
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
|
||||
vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
|
||||
@@ -308,8 +308,9 @@ func TestCreateContainerConfigFail(t *testing.T) {
|
||||
assert.Error(err)
|
||||
}
|
||||
|
||||
func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err error) {
|
||||
var coldPlugVFIO hv.PCIePort
|
||||
func createAllRuntimeConfigFiles(dir, hypervisor string) (runtimeConfig string, err error) {
|
||||
var hotPlugVFIO config.PCIePort
|
||||
var coldPlugVFIO config.PCIePort
|
||||
if dir == "" {
|
||||
return "", fmt.Errorf("BUG: need directory")
|
||||
}
|
||||
@@ -330,11 +331,11 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err
|
||||
blockDeviceDriver := "virtio-scsi"
|
||||
enableIOThreads := true
|
||||
hotplugVFIOOnRootBus := true
|
||||
pcieRootPort := uint32(2)
|
||||
disableNewNetNs := false
|
||||
sharedFS := "virtio-9p"
|
||||
virtioFSdaemon := path.Join(dir, "virtiofsd")
|
||||
coldPlugVFIO = hv.RootPort
|
||||
hotPlugVFIO = config.BridgePort
|
||||
coldPlugVFIO = config.RootPort
|
||||
|
||||
configFileOptions := ktu.RuntimeConfigOptions{
|
||||
Hypervisor: "qemu",
|
||||
@@ -349,10 +350,10 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err
|
||||
BlockDeviceDriver: blockDeviceDriver,
|
||||
EnableIOThreads: enableIOThreads,
|
||||
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
|
||||
PCIeRootPort: pcieRootPort,
|
||||
DisableNewNetNs: disableNewNetNs,
|
||||
SharedFS: sharedFS,
|
||||
VirtioFSDaemon: virtioFSdaemon,
|
||||
HotPlugVFIO: hotPlugVFIO,
|
||||
ColdPlugVFIO: coldPlugVFIO,
|
||||
}
|
||||
|
||||
|
||||
@@ -125,14 +125,117 @@ const (
|
||||
// SysDevPrefix is static string of /sys/dev
|
||||
var SysDevPrefix = "/sys/dev"
|
||||
|
||||
// SysIOMMUPath is static string of /sys/kernel/iommu_groups
|
||||
var SysIOMMUPath = "/sys/kernel/iommu_groups"
|
||||
// SysIOMMUGroupPath is static string of /sys/kernel/iommu_groups
|
||||
var SysIOMMUGroupPath = "/sys/kernel/iommu_groups"
|
||||
|
||||
// SysBusPciDevicesPath is static string of /sys/bus/pci/devices
|
||||
var SysBusPciDevicesPath = "/sys/bus/pci/devices"
|
||||
|
||||
var getSysDevPath = getSysDevPathImpl
|
||||
|
||||
// PCIePortBusPrefix gives us the correct bus nameing dependeing on the port
|
||||
// used to hot(cold)-plug the device
|
||||
type PCIePortBusPrefix string
|
||||
|
||||
const (
|
||||
PCIeRootPortPrefix PCIePortBusPrefix = "rp"
|
||||
PCIeSwitchPortPrefix PCIePortBusPrefix = "sw"
|
||||
PCIeSwitchUpstreamPortPrefix PCIePortBusPrefix = "swup"
|
||||
PCIeSwitchhDownstreamPortPrefix PCIePortBusPrefix = "swdp"
|
||||
PCIBridgePortPrefix PCIePortBusPrefix = "bp"
|
||||
)
|
||||
|
||||
func (p PCIePortBusPrefix) String() string {
|
||||
switch p {
|
||||
case PCIeRootPortPrefix:
|
||||
fallthrough
|
||||
case PCIeSwitchPortPrefix:
|
||||
fallthrough
|
||||
case PCIeSwitchUpstreamPortPrefix:
|
||||
fallthrough
|
||||
case PCIeSwitchhDownstreamPortPrefix:
|
||||
fallthrough
|
||||
case PCIBridgePortPrefix:
|
||||
return string(p)
|
||||
}
|
||||
return fmt.Sprintf("<unknown PCIePortBusPrefix: %s>", string(p))
|
||||
}
|
||||
|
||||
// PCIePort distinguish only between root and switch port
|
||||
type PCIePort string
|
||||
|
||||
const (
|
||||
// RootPort attach VFIO devices to a root-port
|
||||
RootPort PCIePort = "root-port"
|
||||
// SwitchPort attach VFIO devices to a switch-port
|
||||
SwitchPort = "switch-port"
|
||||
// BridgePort is the default
|
||||
BridgePort = "bridge-port"
|
||||
// NoPort is for disabling VFIO hotplug/coldplug
|
||||
NoPort = "no-port"
|
||||
// InvalidPort is for invalid port
|
||||
InvalidPort = "invalid-port"
|
||||
)
|
||||
|
||||
func (p PCIePort) String() string {
|
||||
switch p {
|
||||
case RootPort:
|
||||
fallthrough
|
||||
case SwitchPort:
|
||||
fallthrough
|
||||
case BridgePort:
|
||||
fallthrough
|
||||
case NoPort:
|
||||
fallthrough
|
||||
case InvalidPort:
|
||||
return string(p)
|
||||
}
|
||||
return fmt.Sprintf("<unknown PCIePort: %s>", string(p))
|
||||
}
|
||||
|
||||
var PCIePortPrefixMapping = map[PCIePort]PCIePortBusPrefix{
|
||||
RootPort: PCIeRootPortPrefix,
|
||||
SwitchPort: PCIeSwitchhDownstreamPortPrefix,
|
||||
BridgePort: PCIBridgePortPrefix,
|
||||
}
|
||||
|
||||
func (p PCIePort) Invalid() bool {
|
||||
switch p {
|
||||
case RootPort:
|
||||
fallthrough
|
||||
case SwitchPort:
|
||||
fallthrough
|
||||
case BridgePort:
|
||||
fallthrough
|
||||
case NoPort:
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (p PCIePort) Valid() bool {
|
||||
switch p {
|
||||
case RootPort:
|
||||
fallthrough
|
||||
case SwitchPort:
|
||||
fallthrough
|
||||
case BridgePort:
|
||||
fallthrough
|
||||
case NoPort:
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
type PCIePortMapping map[string]bool
|
||||
|
||||
var (
|
||||
// Each of this structures keeps track of the devices attached to the
|
||||
// different types of PCI ports. We can deduces the Bus number from it
|
||||
// and eliminate duplicates being assigned.
|
||||
PCIeDevices = map[PCIePort]PCIePortMapping{}
|
||||
)
|
||||
|
||||
// DeviceInfo is an embedded type that contains device data common to all types of devices.
|
||||
type DeviceInfo struct {
|
||||
// DriverOptions is specific options for each device driver
|
||||
@@ -178,6 +281,9 @@ type DeviceInfo struct {
|
||||
// ColdPlug specifies whether the device must be cold plugged (true)
|
||||
// or hot plugged (false).
|
||||
ColdPlug bool
|
||||
|
||||
// Specifies the PCIe port type to which the device is attached
|
||||
Port PCIePort
|
||||
}
|
||||
|
||||
// BlockDrive represents a block storage drive which may be used in case the storage
|
||||
@@ -279,14 +385,8 @@ const (
|
||||
VFIOAPDeviceMediatedType
|
||||
)
|
||||
|
||||
type VFIODev interface {
|
||||
GetID() *string
|
||||
GetType() VFIODeviceType
|
||||
GetSysfsDev() *string
|
||||
}
|
||||
|
||||
// VFIOPCIDev represents a VFIO PCI device used for hotplugging
|
||||
type VFIOPCIDev struct {
|
||||
// VFIODev represents a VFIO PCI device used for hotplugging
|
||||
type VFIODev struct {
|
||||
// ID is used to identify this drive in the hypervisor options.
|
||||
ID string
|
||||
|
||||
@@ -316,44 +416,15 @@ type VFIOPCIDev struct {
|
||||
|
||||
// IsPCIe specifies device is PCIe or PCI
|
||||
IsPCIe bool
|
||||
}
|
||||
|
||||
func (d VFIOPCIDev) GetID() *string {
|
||||
return &d.ID
|
||||
}
|
||||
|
||||
func (d VFIOPCIDev) GetType() VFIODeviceType {
|
||||
return d.Type
|
||||
}
|
||||
|
||||
func (d VFIOPCIDev) GetSysfsDev() *string {
|
||||
return &d.SysfsDev
|
||||
}
|
||||
|
||||
type VFIOAPDev struct {
|
||||
// ID is used to identify this drive in the hypervisor options.
|
||||
ID string
|
||||
|
||||
// sysfsdev of VFIO mediated device
|
||||
SysfsDev string
|
||||
|
||||
// APDevices are the Adjunct Processor devices assigned to the mdev
|
||||
APDevices []string
|
||||
|
||||
// Type of VFIO device
|
||||
Type VFIODeviceType
|
||||
}
|
||||
// Rank identifies a device in a IOMMU group
|
||||
Rank int
|
||||
|
||||
func (d VFIOAPDev) GetID() *string {
|
||||
return &d.ID
|
||||
}
|
||||
|
||||
func (d VFIOAPDev) GetType() VFIODeviceType {
|
||||
return d.Type
|
||||
}
|
||||
|
||||
func (d VFIOAPDev) GetSysfsDev() *string {
|
||||
return &d.SysfsDev
|
||||
// Port is the PCIe port type to which the device is attached
|
||||
Port PCIePort
|
||||
}
|
||||
|
||||
// RNGDev represents a random number generator device
|
||||
|
||||
@@ -47,9 +47,9 @@ func deviceLogger() *logrus.Entry {
|
||||
return api.DeviceLogger()
|
||||
}
|
||||
|
||||
// Identify PCIe device by reading the size of the PCI config space
|
||||
// IsPCIeDevice identifies PCIe device by reading the size of the PCI config space
|
||||
// Plain PCI device have 256 bytes of config space where PCIe devices have 4K
|
||||
func isPCIeDevice(bdf string) bool {
|
||||
func IsPCIeDevice(bdf string) bool {
|
||||
if len(strings.Split(bdf, ":")) == 2 {
|
||||
bdf = PCIDomain + ":" + bdf
|
||||
}
|
||||
@@ -157,14 +157,12 @@ func checkIgnorePCIClass(pciClass string, deviceBDF string, bitmask uint64) (boo
|
||||
|
||||
// GetAllVFIODevicesFromIOMMUGroup returns all the VFIO devices in the IOMMU group
|
||||
// We can reuse this function at various levels, sandbox, container.
|
||||
// Only the VFIO module is allowed to do bus assignments, all other modules need to
|
||||
// ignore it if used as helper function to get VFIO information.
|
||||
func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignment bool) ([]*config.VFIODev, error) {
|
||||
func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODev, error) {
|
||||
|
||||
vfioDevs := []*config.VFIODev{}
|
||||
|
||||
vfioGroup := filepath.Base(device.HostPath)
|
||||
iommuDevicesPath := filepath.Join(config.SysIOMMUPath, vfioGroup, "devices")
|
||||
iommuDevicesPath := filepath.Join(config.SysIOMMUGroupPath, vfioGroup, "devices")
|
||||
|
||||
deviceFiles, err := os.ReadDir(iommuDevicesPath)
|
||||
if err != nil {
|
||||
@@ -174,7 +172,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme
|
||||
// Pass all devices in iommu group
|
||||
for i, deviceFile := range deviceFiles {
|
||||
//Get bdf of device eg 0000:00:1c.0
|
||||
deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(deviceFile.Name(), iommuDevicesPath)
|
||||
deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(deviceFile.Name(), iommuDevicesPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -196,27 +194,24 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme
|
||||
|
||||
switch vfioDeviceType {
|
||||
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
|
||||
isPCIe := isPCIeDevice(deviceBDF)
|
||||
// Do not directly assign to `vfio` -- need to access field still
|
||||
vfioPCI := config.VFIOPCIDev{
|
||||
vfio = config.VFIODev{
|
||||
ID: id,
|
||||
Type: vfioDeviceType,
|
||||
BDF: deviceBDF,
|
||||
SysfsDev: deviceSysfsDev,
|
||||
IsPCIe: isPCIe,
|
||||
IsPCIe: IsPCIeDevice(deviceBDF),
|
||||
Class: pciClass,
|
||||
Rank: -1,
|
||||
Port: device.Port,
|
||||
}
|
||||
if isPCIe && !ignoreBusAssignment {
|
||||
vfioPCI.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs))
|
||||
AllPCIeDevs[deviceBDF] = true
|
||||
}
|
||||
vfio = vfioPCI
|
||||
|
||||
case config.VFIOAPDeviceMediatedType:
|
||||
devices, err := GetAPVFIODevices(deviceSysfsDev)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
vfio = config.VFIOAPDev{
|
||||
vfio = config.VFIODev{
|
||||
ID: id,
|
||||
SysfsDev: deviceSysfsDev,
|
||||
Type: config.VFIOAPDeviceMediatedType,
|
||||
|
||||
@@ -28,14 +28,9 @@ const (
|
||||
vfioRemoveIDPath = "/sys/bus/pci/drivers/vfio-pci/remove_id"
|
||||
iommuGroupPath = "/sys/bus/pci/devices/%s/iommu_group"
|
||||
vfioDevPath = "/dev/vfio/%s"
|
||||
pcieRootPortPrefix = "rp"
|
||||
vfioAPSysfsDir = "/sys/devices/vfio_ap"
|
||||
)
|
||||
|
||||
var (
|
||||
AllPCIeDevs = map[string]bool{}
|
||||
)
|
||||
|
||||
// VFIODevice is a vfio device meant to be passed to the hypervisor
|
||||
// to be used by the Virtual Machine.
|
||||
type VFIODevice struct {
|
||||
@@ -70,10 +65,17 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece
|
||||
}
|
||||
}()
|
||||
|
||||
device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo, false)
|
||||
device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, vfio := range device.VfioDevs {
|
||||
if vfio.IsPCIe {
|
||||
busIndex := len(config.PCIeDevices[vfio.Port])
|
||||
vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex)
|
||||
config.PCIeDevices[vfio.Port][vfio.BDF] = true
|
||||
}
|
||||
}
|
||||
|
||||
coldPlug := device.DeviceInfo.ColdPlug
|
||||
deviceLogger().WithField("cold-plug", coldPlug).Info("Attaching VFIO device")
|
||||
@@ -169,23 +171,18 @@ func (device *VFIODevice) Load(ds config.DeviceState) {
|
||||
for _, dev := range ds.VFIODevs {
|
||||
var vfio config.VFIODev
|
||||
|
||||
vfioDeviceType := (*device.VfioDevs[0]).GetType()
|
||||
switch vfioDeviceType {
|
||||
switch dev.Type {
|
||||
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
|
||||
bdf := ""
|
||||
if pciDev, ok := (*dev).(config.VFIOPCIDev); ok {
|
||||
bdf = pciDev.BDF
|
||||
}
|
||||
vfio = config.VFIOPCIDev{
|
||||
ID: *(*dev).GetID(),
|
||||
Type: config.VFIODeviceType((*dev).GetType()),
|
||||
BDF: bdf,
|
||||
SysfsDev: *(*dev).GetSysfsDev(),
|
||||
vfio = config.VFIODev{
|
||||
ID: dev.ID,
|
||||
Type: config.VFIODeviceType(dev.Type),
|
||||
BDF: dev.BDF,
|
||||
SysfsDev: dev.SysfsDev,
|
||||
}
|
||||
case config.VFIOAPDeviceMediatedType:
|
||||
vfio = config.VFIOAPDev{
|
||||
ID: *(*dev).GetID(),
|
||||
SysfsDev: *(*dev).GetSysfsDev(),
|
||||
vfio = config.VFIODev{
|
||||
ID: dev.ID,
|
||||
SysfsDev: dev.SysfsDev,
|
||||
}
|
||||
default:
|
||||
deviceLogger().WithError(
|
||||
@@ -200,7 +197,7 @@ func (device *VFIODevice) Load(ds config.DeviceState) {
|
||||
|
||||
// It should implement GetAttachCount() and DeviceID() as api.Device implementation
|
||||
// here it shares function from *GenericDevice so we don't need duplicate codes
|
||||
func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
|
||||
func GetVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
|
||||
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
|
||||
vfioDeviceType, err = GetVFIODeviceType(sysfsDevStr)
|
||||
if err != nil {
|
||||
@@ -210,14 +207,18 @@ func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceS
|
||||
switch vfioDeviceType {
|
||||
case config.VFIOPCIDeviceNormalType:
|
||||
// Get bdf of device eg. 0000:00:1c.0
|
||||
deviceBDF = getBDF(deviceFileName)
|
||||
// OLD IMPL: deviceBDF = getBDF(deviceFileName)
|
||||
// The old implementation did not consider the case where
|
||||
// vfio devices are located on different root busses. The
|
||||
// kata-agent will handle the case now, here, use the full PCI addr
|
||||
deviceBDF = deviceFileName
|
||||
// Get sysfs path used by cloud-hypervisor
|
||||
deviceSysfsDev = filepath.Join(config.SysBusPciDevicesPath, deviceFileName)
|
||||
case config.VFIOPCIDeviceMediatedType:
|
||||
// Get sysfsdev of device eg. /sys/devices/pci0000:00/0000:00:02.0/f79944e4-5a3d-11e8-99ce-479cbab002e4
|
||||
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
|
||||
deviceSysfsDev, err = GetSysfsDev(sysfsDevStr)
|
||||
deviceBDF = getBDF(getMediatedBDF(deviceSysfsDev))
|
||||
deviceBDF = GetBDF(getMediatedBDF(deviceSysfsDev))
|
||||
case config.VFIOAPDeviceMediatedType:
|
||||
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
|
||||
deviceSysfsDev, err = GetSysfsDev(sysfsDevStr)
|
||||
@@ -240,7 +241,7 @@ func getMediatedBDF(deviceSysfsDev string) string {
|
||||
|
||||
// getBDF returns the BDF of pci device
|
||||
// Expected input string format is [<domain>]:[<bus>][<slot>].[<func>] eg. 0000:02:10.0
|
||||
func getBDF(deviceSysStr string) string {
|
||||
func GetBDF(deviceSysStr string) string {
|
||||
tokens := strings.SplitN(deviceSysStr, ":", 2)
|
||||
if len(tokens) == 1 {
|
||||
return ""
|
||||
|
||||
@@ -20,7 +20,7 @@ func TestGetVFIODetails(t *testing.T) {
|
||||
}
|
||||
|
||||
data := []testData{
|
||||
{"0000:02:10.0", "02:10.0"},
|
||||
{"0000:02:10.0", "0000:02:10.0"},
|
||||
{"0000:0210.0", ""},
|
||||
{"f79944e4-5a3d-11e8-99ce-", ""},
|
||||
{"f79944e4-5a3d-11e8-99ce", ""},
|
||||
@@ -29,7 +29,7 @@ func TestGetVFIODetails(t *testing.T) {
|
||||
}
|
||||
|
||||
for _, d := range data {
|
||||
deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(d.deviceStr, "")
|
||||
deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(d.deviceStr, "")
|
||||
|
||||
switch vfioDeviceType {
|
||||
case config.VFIOPCIDeviceNormalType:
|
||||
|
||||
@@ -71,7 +71,11 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS
|
||||
dm.blockDriver = config.VirtioSCSI
|
||||
}
|
||||
|
||||
drivers.AllPCIeDevs = make(map[string]bool)
|
||||
config.PCIeDevices = make(map[config.PCIePort]config.PCIePortMapping)
|
||||
|
||||
config.PCIeDevices[config.RootPort] = make(map[string]bool)
|
||||
config.PCIeDevices[config.SwitchPort] = make(map[string]bool)
|
||||
config.PCIeDevices[config.BridgePort] = make(map[string]bool)
|
||||
|
||||
for _, dev := range devices {
|
||||
dm.devices[dev.DeviceID()] = dev
|
||||
@@ -118,7 +122,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device
|
||||
}
|
||||
if IsVFIO(devInfo.HostPath) {
|
||||
return drivers.NewVFIODevice(&devInfo), nil
|
||||
} else if isVhostUserBlk(devInfo) {
|
||||
} else if IsVhostUserBlk(devInfo) {
|
||||
if devInfo.DriverOptions == nil {
|
||||
devInfo.DriverOptions = make(map[string]string)
|
||||
}
|
||||
|
||||
@@ -116,14 +116,14 @@ func TestAttachVFIODevice(t *testing.T) {
|
||||
_, err = os.Create(deviceConfigFile)
|
||||
assert.Nil(t, err)
|
||||
|
||||
savedIOMMUPath := config.SysIOMMUPath
|
||||
config.SysIOMMUPath = tmpDir
|
||||
savedIOMMUPath := config.SysIOMMUGroupPath
|
||||
config.SysIOMMUGroupPath = tmpDir
|
||||
|
||||
savedSysBusPciDevicesPath := config.SysBusPciDevicesPath
|
||||
config.SysBusPciDevicesPath = devicesDir
|
||||
|
||||
defer func() {
|
||||
config.SysIOMMUPath = savedIOMMUPath
|
||||
config.SysIOMMUGroupPath = savedIOMMUPath
|
||||
config.SysBusPciDevicesPath = savedSysBusPciDevicesPath
|
||||
}()
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ func isBlock(devInfo config.DeviceInfo) bool {
|
||||
}
|
||||
|
||||
// isVhostUserBlk checks if the device is a VhostUserBlk device.
|
||||
func isVhostUserBlk(devInfo config.DeviceInfo) bool {
|
||||
func IsVhostUserBlk(devInfo config.DeviceInfo) bool {
|
||||
return devInfo.DevType == "b" && devInfo.Major == config.VhostUserBlkMajor
|
||||
}
|
||||
|
||||
|
||||
@@ -70,7 +70,7 @@ func TestIsVhostUserBlk(t *testing.T) {
|
||||
}
|
||||
|
||||
for _, d := range data {
|
||||
isVhostUserBlk := isVhostUserBlk(
|
||||
isVhostUserBlk := IsVhostUserBlk(
|
||||
config.DeviceInfo{
|
||||
DevType: d.devType,
|
||||
Major: d.major,
|
||||
|
||||
@@ -123,6 +123,14 @@ const (
|
||||
// PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port.
|
||||
PCIeRootPort DeviceDriver = "pcie-root-port"
|
||||
|
||||
// PCIeSwitchUpstreamPort is a PCIe switch upstream port
|
||||
// A upstream port connects to a PCIe Root Port
|
||||
PCIeSwitchUpstreamPort DeviceDriver = "x3130-upstream"
|
||||
|
||||
// PCIeSwitchDownstreamPort is a PCIe switch downstream port
|
||||
// PCIe devices can be hot-plugged to the downstream port.
|
||||
PCIeSwitchDownstreamPort DeviceDriver = "xio3130-downstream"
|
||||
|
||||
// Loader is the Loader device driver.
|
||||
Loader DeviceDriver = "loader"
|
||||
|
||||
@@ -236,6 +244,7 @@ const (
|
||||
|
||||
// SecExecGuest represents an s390x Secure Execution (Protected Virtualization in QEMU) object
|
||||
SecExecGuest ObjectType = "s390-pv-guest"
|
||||
|
||||
// PEFGuest represent ppc64le PEF(Protected Execution Facility) object.
|
||||
PEFGuest ObjectType = "pef-guest"
|
||||
)
|
||||
@@ -369,7 +378,6 @@ func (object Object) QemuParams(config *Config) []string {
|
||||
deviceParams = append(deviceParams, string(object.Driver))
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("id=%s", object.DeviceID))
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("host-path=%s", object.File))
|
||||
|
||||
}
|
||||
|
||||
if len(deviceParams) > 0 {
|
||||
@@ -1681,6 +1689,106 @@ func (b PCIeRootPortDevice) Valid() bool {
|
||||
return true
|
||||
}
|
||||
|
||||
// PCIeSwitchUpstreamPortDevice is the port connecting to the root port
|
||||
type PCIeSwitchUpstreamPortDevice struct {
|
||||
ID string // format: sup{n}, n>=0
|
||||
Bus string // default is rp0
|
||||
}
|
||||
|
||||
// QemuParams returns the qemu parameters built out of the PCIeSwitchUpstreamPortDevice.
|
||||
func (b PCIeSwitchUpstreamPortDevice) QemuParams(config *Config) []string {
|
||||
var qemuParams []string
|
||||
var deviceParams []string
|
||||
|
||||
driver := PCIeSwitchUpstreamPort
|
||||
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID))
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus))
|
||||
|
||||
qemuParams = append(qemuParams, "-device")
|
||||
qemuParams = append(qemuParams, strings.Join(deviceParams, ","))
|
||||
return qemuParams
|
||||
}
|
||||
|
||||
// Valid returns true if the PCIeSwitchUpstreamPortDevice structure is valid and complete.
|
||||
func (b PCIeSwitchUpstreamPortDevice) Valid() bool {
|
||||
if b.ID == "" {
|
||||
return false
|
||||
}
|
||||
if b.Bus == "" {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// PCIeSwitchDownstreamPortDevice is the port connecting to the root port
|
||||
type PCIeSwitchDownstreamPortDevice struct {
|
||||
ID string // format: sup{n}, n>=0
|
||||
Bus string // default is rp0
|
||||
Chassis string // (slot, chassis) pair is mandatory and must be unique for each downstream port, >=0, default is 0x00
|
||||
Slot string // >=0, default is 0x00
|
||||
// This to work needs patches to QEMU
|
||||
BusReserve string
|
||||
// Pref64 and Pref32 are not allowed to be set simultaneously
|
||||
Pref64Reserve string // reserve prefetched MMIO aperture, 64-bit
|
||||
Pref32Reserve string // reserve prefetched MMIO aperture, 32-bit
|
||||
MemReserve string // reserve non-prefetched MMIO aperture, 32-bit *only*
|
||||
IOReserve string // IO reservation
|
||||
|
||||
}
|
||||
|
||||
// QemuParams returns the qemu parameters built out of the PCIeSwitchUpstreamPortDevice.
|
||||
func (b PCIeSwitchDownstreamPortDevice) QemuParams(config *Config) []string {
|
||||
var qemuParams []string
|
||||
var deviceParams []string
|
||||
driver := PCIeSwitchDownstreamPort
|
||||
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID))
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus))
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("chassis=%s", b.Chassis))
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("slot=%s", b.Slot))
|
||||
if b.BusReserve != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("bus-reserve=%s", b.BusReserve))
|
||||
}
|
||||
|
||||
if b.Pref64Reserve != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("pref64-reserve=%s", b.Pref64Reserve))
|
||||
}
|
||||
|
||||
if b.Pref32Reserve != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("pref32-reserve=%s", b.Pref32Reserve))
|
||||
}
|
||||
|
||||
if b.MemReserve != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("mem-reserve=%s", b.MemReserve))
|
||||
}
|
||||
|
||||
if b.IOReserve != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("io-reserve=%s", b.IOReserve))
|
||||
}
|
||||
|
||||
qemuParams = append(qemuParams, "-device")
|
||||
qemuParams = append(qemuParams, strings.Join(deviceParams, ","))
|
||||
return qemuParams
|
||||
}
|
||||
|
||||
// Valid returns true if the PCIeSwitchUpstremPortDevice structure is valid and complete.
|
||||
func (b PCIeSwitchDownstreamPortDevice) Valid() bool {
|
||||
if b.ID == "" {
|
||||
return false
|
||||
}
|
||||
if b.Bus == "" {
|
||||
return false
|
||||
}
|
||||
if b.Chassis == "" {
|
||||
return false
|
||||
}
|
||||
if b.Slot == "" {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// VFIODevice represents a qemu vfio device meant for direct access by guest OS.
|
||||
type VFIODevice struct {
|
||||
// Bus-Device-Function of device
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
|
||||
package hypervisors
|
||||
|
||||
import "fmt"
|
||||
import "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
|
||||
// Bridge is a bridge where devices can be hot plugged
|
||||
type Bridge struct {
|
||||
@@ -28,37 +28,8 @@ type CPUDevice struct {
|
||||
ID string
|
||||
}
|
||||
|
||||
// PCIePort distinguish only between root and switch port
|
||||
type PCIePort string
|
||||
|
||||
const (
|
||||
// RootPort attach VFIO devices to a root-port
|
||||
RootPort PCIePort = "root-port"
|
||||
// SwitchPort attach VFIO devices to a switch-port
|
||||
SwitchPort = "switch-port"
|
||||
// BridgePort is the default
|
||||
BridgePort = "bridge-port"
|
||||
// NoPort is for disabling VFIO hotplug/coldplug
|
||||
NoPort = "no-port"
|
||||
)
|
||||
|
||||
func (p PCIePort) String() string {
|
||||
switch p {
|
||||
case RootPort:
|
||||
return "root-port"
|
||||
case SwitchPort:
|
||||
return "switch-port"
|
||||
case BridgePort:
|
||||
return "bridge-port"
|
||||
case NoPort:
|
||||
return "no-port"
|
||||
}
|
||||
return fmt.Sprintf("<unknown PCIePort: %s>", string(p))
|
||||
}
|
||||
|
||||
type HypervisorState struct {
|
||||
BlockIndexMap map[int]struct{}
|
||||
|
||||
// Type of hypervisor, E.g. qemu/firecracker/acrn.
|
||||
Type string
|
||||
UUID string
|
||||
@@ -74,7 +45,7 @@ type HypervisorState struct {
|
||||
HotpluggedMemory int
|
||||
VirtiofsDaemonPid int
|
||||
Pid int
|
||||
PCIeRootPort int
|
||||
ColdPlugVFIO PCIePort
|
||||
HotPlugVFIO config.PCIePort
|
||||
ColdPlugVFIO config.PCIePort
|
||||
HotplugVFIOOnRootBus bool
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ import (
|
||||
"strconv"
|
||||
"testing"
|
||||
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
@@ -224,8 +224,8 @@ type RuntimeConfigOptions struct {
|
||||
JaegerUser string
|
||||
JaegerPassword string
|
||||
PFlash []string
|
||||
PCIeRootPort uint32
|
||||
ColdPlugVFIO hv.PCIePort
|
||||
HotPlugVFIO config.PCIePort
|
||||
ColdPlugVFIO config.PCIePort
|
||||
DefaultVCPUCount uint32
|
||||
DefaultMaxVCPUCount uint32
|
||||
DefaultMemSize uint32
|
||||
@@ -318,7 +318,6 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string {
|
||||
disable_block_device_use = ` + strconv.FormatBool(config.DisableBlock) + `
|
||||
enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + `
|
||||
hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + `
|
||||
pcie_root_port = ` + strconv.FormatUint(uint64(config.PCIeRootPort), 10) + `
|
||||
cold_plug_vfio = "` + config.ColdPlugVFIO.String() + `"
|
||||
msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + `
|
||||
enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + `
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
package katautils
|
||||
|
||||
import (
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
config "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
)
|
||||
|
||||
// name is the name of the runtime
|
||||
@@ -82,7 +82,6 @@ const defaultEnableDebug bool = false
|
||||
const defaultDisableNestingChecks bool = false
|
||||
const defaultMsize9p uint32 = 8192
|
||||
const defaultHotplugVFIOOnRootBus bool = false
|
||||
const defaultPCIeRootPort = 0
|
||||
const defaultEntropySource = "/dev/urandom"
|
||||
const defaultGuestHookPath string = ""
|
||||
const defaultVirtioFSCacheMode = "never"
|
||||
@@ -108,4 +107,5 @@ const defaultVMCacheEndpoint string = "/var/run/kata-containers/cache.sock"
|
||||
// Default config file used by stateless systems.
|
||||
var defaultRuntimeConfiguration = "@CONFIG_PATH@"
|
||||
|
||||
const defaultColdPlugVFIO = hv.NoPort
|
||||
const defaultHotPlugVFIO = config.NoPort
|
||||
const defaultColdPlugVFIO = config.NoPort
|
||||
|
||||
@@ -20,7 +20,6 @@ import (
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/govmm"
|
||||
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
|
||||
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
|
||||
@@ -131,7 +130,6 @@ type hypervisor struct {
|
||||
MemSlots uint32 `toml:"memory_slots"`
|
||||
DefaultBridges uint32 `toml:"default_bridges"`
|
||||
Msize9p uint32 `toml:"msize_9p"`
|
||||
PCIeRootPort uint32 `toml:"pcie_root_port"`
|
||||
NumVCPUs int32 `toml:"default_vcpus"`
|
||||
BlockDeviceCacheSet bool `toml:"block_device_cache_set"`
|
||||
BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"`
|
||||
@@ -149,7 +147,8 @@ type hypervisor struct {
|
||||
EnableIOThreads bool `toml:"enable_iothreads"`
|
||||
DisableImageNvdimm bool `toml:"disable_image_nvdimm"`
|
||||
HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"`
|
||||
ColdPlugVFIO hv.PCIePort `toml:"cold_plug_vfio"`
|
||||
HotPlugVFIO config.PCIePort `toml:"hot_plug_vfio"`
|
||||
ColdPlugVFIO config.PCIePort `toml:"cold_plug_vfio"`
|
||||
DisableVhostNet bool `toml:"disable_vhost_net"`
|
||||
GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"`
|
||||
ConfidentialGuest bool `toml:"confidential_guest"`
|
||||
@@ -287,12 +286,18 @@ func (h hypervisor) firmware() (string, error) {
|
||||
return ResolvePath(p)
|
||||
}
|
||||
|
||||
func (h hypervisor) coldPlugVFIO() hv.PCIePort {
|
||||
func (h hypervisor) coldPlugVFIO() config.PCIePort {
|
||||
if h.ColdPlugVFIO == "" {
|
||||
return defaultColdPlugVFIO
|
||||
}
|
||||
return h.ColdPlugVFIO
|
||||
}
|
||||
func (h hypervisor) hotPlugVFIO() config.PCIePort {
|
||||
if h.HotPlugVFIO == "" {
|
||||
return defaultHotPlugVFIO
|
||||
}
|
||||
return h.HotPlugVFIO
|
||||
}
|
||||
|
||||
func (h hypervisor) firmwareVolume() (string, error) {
|
||||
p := h.FirmwareVolume
|
||||
@@ -863,8 +868,8 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
|
||||
Msize9p: h.msize9p(),
|
||||
DisableImageNvdimm: h.DisableImageNvdimm,
|
||||
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
|
||||
HotPlugVFIO: h.hotPlugVFIO(),
|
||||
ColdPlugVFIO: h.coldPlugVFIO(),
|
||||
PCIeRootPort: h.PCIeRootPort,
|
||||
DisableVhostNet: h.DisableVhostNet,
|
||||
EnableVhostUserStore: h.EnableVhostUserStore,
|
||||
VhostUserStorePath: h.vhostUserStorePath(),
|
||||
@@ -1060,7 +1065,7 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
|
||||
Msize9p: h.msize9p(),
|
||||
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
|
||||
ColdPlugVFIO: h.coldPlugVFIO(),
|
||||
PCIeRootPort: h.PCIeRootPort,
|
||||
HotPlugVFIO: h.hotPlugVFIO(),
|
||||
DisableVhostNet: true,
|
||||
GuestHookPath: h.guestHookPath(),
|
||||
VirtioFSExtraArgs: h.VirtioFSExtraArgs,
|
||||
@@ -1291,7 +1296,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
|
||||
Msize9p: defaultMsize9p,
|
||||
HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus,
|
||||
ColdPlugVFIO: defaultColdPlugVFIO,
|
||||
PCIeRootPort: defaultPCIeRootPort,
|
||||
HotPlugVFIO: defaultHotPlugVFIO,
|
||||
GuestHookPath: defaultGuestHookPath,
|
||||
VhostUserStorePath: defaultVhostUserStorePath,
|
||||
VhostUserDeviceReconnect: defaultVhostUserDeviceReconnect,
|
||||
@@ -1663,9 +1668,10 @@ func checkConfig(config oci.RuntimeConfig) error {
|
||||
return err
|
||||
}
|
||||
|
||||
hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO
|
||||
coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO
|
||||
machineType := config.HypervisorConfig.HypervisorMachineType
|
||||
if err := checkPCIeConfig(coldPlugVFIO, machineType); err != nil {
|
||||
if err := checkPCIeConfig(coldPlugVFIO, hotPlugVFIO, machineType); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -1675,18 +1681,32 @@ func checkConfig(config oci.RuntimeConfig) error {
|
||||
// checkPCIeConfig ensures the PCIe configuration is valid.
|
||||
// Only allow one of the following settings for cold-plug:
|
||||
// no-port, root-port, switch-port
|
||||
func checkPCIeConfig(vfioPort hv.PCIePort, machineType string) error {
|
||||
func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineType string) error {
|
||||
// Currently only QEMU q35 supports advanced PCIe topologies
|
||||
// firecracker, dragonball do not have right now any PCIe support
|
||||
if machineType != "q35" {
|
||||
return nil
|
||||
}
|
||||
if vfioPort == hv.NoPort || vfioPort == hv.RootPort || vfioPort == hv.SwitchPort {
|
||||
|
||||
if coldPlug != config.NoPort && hotPlug != config.NoPort {
|
||||
return fmt.Errorf("invalid hot-plug=%s and cold-plug=%s settings, only one of them can be set", coldPlug, hotPlug)
|
||||
}
|
||||
if coldPlug == config.NoPort && hotPlug == config.NoPort {
|
||||
return nil
|
||||
}
|
||||
var port config.PCIePort
|
||||
if coldPlug != config.NoPort {
|
||||
port = coldPlug
|
||||
}
|
||||
if hotPlug != config.NoPort {
|
||||
port = hotPlug
|
||||
}
|
||||
if port == config.NoPort || port == config.BridgePort || port == config.RootPort || port == config.SwitchPort {
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s",
|
||||
vfioPort, hv.NoPort, hv.RootPort, hv.SwitchPort)
|
||||
return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s, %s",
|
||||
coldPlug, config.NoPort, config.BridgePort, config.RootPort, config.SwitchPort)
|
||||
}
|
||||
|
||||
// checkNetNsConfig performs sanity checks on disable_new_netns config.
|
||||
|
||||
@@ -18,8 +18,8 @@ import (
|
||||
"syscall"
|
||||
"testing"
|
||||
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/govmm"
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
|
||||
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
|
||||
@@ -63,15 +63,16 @@ func createConfig(configPath string, fileData string) error {
|
||||
|
||||
// createAllRuntimeConfigFiles creates all files necessary to call
|
||||
// loadConfiguration().
|
||||
func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConfig, err error) {
|
||||
func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntimeConfig, err error) {
|
||||
if dir == "" {
|
||||
return config, fmt.Errorf("BUG: need directory")
|
||||
return testConfig, fmt.Errorf("BUG: need directory")
|
||||
}
|
||||
|
||||
if hypervisor == "" {
|
||||
return config, fmt.Errorf("BUG: need hypervisor")
|
||||
return testConfig, fmt.Errorf("BUG: need hypervisor")
|
||||
}
|
||||
var coldPlugVFIO hv.PCIePort
|
||||
var hotPlugVFIO config.PCIePort
|
||||
var coldPlugVFIO config.PCIePort
|
||||
hypervisorPath := path.Join(dir, "hypervisor")
|
||||
kernelPath := path.Join(dir, "kernel")
|
||||
kernelParams := "foo=bar xyz"
|
||||
@@ -85,8 +86,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
|
||||
blockDeviceAIO := "io_uring"
|
||||
enableIOThreads := true
|
||||
hotplugVFIOOnRootBus := true
|
||||
pcieRootPort := uint32(2)
|
||||
coldPlugVFIO = hv.RootPort
|
||||
hotPlugVFIO = config.NoPort
|
||||
coldPlugVFIO = config.BridgePort
|
||||
disableNewNetNs := false
|
||||
sharedFS := "virtio-9p"
|
||||
virtioFSdaemon := path.Join(dir, "virtiofsd")
|
||||
@@ -108,7 +109,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
|
||||
BlockDeviceAIO: blockDeviceAIO,
|
||||
EnableIOThreads: enableIOThreads,
|
||||
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
|
||||
PCIeRootPort: pcieRootPort,
|
||||
HotPlugVFIO: hotPlugVFIO,
|
||||
ColdPlugVFIO: coldPlugVFIO,
|
||||
DisableNewNetNs: disableNewNetNs,
|
||||
DefaultVCPUCount: defaultVCPUCount,
|
||||
@@ -134,7 +135,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
|
||||
configPath := path.Join(dir, "runtime.toml")
|
||||
err = createConfig(configPath, runtimeConfigFileData)
|
||||
if err != nil {
|
||||
return config, err
|
||||
return testConfig, err
|
||||
}
|
||||
|
||||
configPathLink := path.Join(filepath.Dir(configPath), "link-to-configuration.toml")
|
||||
@@ -142,7 +143,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
|
||||
// create a link to the config file
|
||||
err = syscall.Symlink(configPath, configPathLink)
|
||||
if err != nil {
|
||||
return config, err
|
||||
return testConfig, err
|
||||
}
|
||||
|
||||
files := []string{hypervisorPath, kernelPath, imagePath}
|
||||
@@ -151,7 +152,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
|
||||
// create the resource (which must be >0 bytes)
|
||||
err := WriteFile(file, "foo", testFileMode)
|
||||
if err != nil {
|
||||
return config, err
|
||||
return testConfig, err
|
||||
}
|
||||
}
|
||||
|
||||
@@ -172,7 +173,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
|
||||
DefaultBridges: defaultBridgesCount,
|
||||
EnableIOThreads: enableIOThreads,
|
||||
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
|
||||
PCIeRootPort: pcieRootPort,
|
||||
HotPlugVFIO: hotPlugVFIO,
|
||||
ColdPlugVFIO: coldPlugVFIO,
|
||||
Msize9p: defaultMsize9p,
|
||||
MemSlots: defaultMemSlots,
|
||||
@@ -216,10 +217,10 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
|
||||
|
||||
err = SetKernelParams(&runtimeConfig)
|
||||
if err != nil {
|
||||
return config, err
|
||||
return testConfig, err
|
||||
}
|
||||
|
||||
config = testRuntimeConfig{
|
||||
rtimeConfig := testRuntimeConfig{
|
||||
RuntimeConfig: runtimeConfig,
|
||||
RuntimeConfigFile: configPath,
|
||||
ConfigPath: configPath,
|
||||
@@ -228,7 +229,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
|
||||
LogPath: logPath,
|
||||
}
|
||||
|
||||
return config, nil
|
||||
return rtimeConfig, nil
|
||||
}
|
||||
|
||||
// testLoadConfiguration accepts an optional function that can be used
|
||||
@@ -568,6 +569,7 @@ func TestMinimalRuntimeConfig(t *testing.T) {
|
||||
VirtioFSCache: defaultVirtioFSCacheMode,
|
||||
BlockDeviceAIO: defaultBlockDeviceAIO,
|
||||
DisableGuestSeLinux: defaultDisableGuestSeLinux,
|
||||
HotPlugVFIO: defaultHotPlugVFIO,
|
||||
ColdPlugVFIO: defaultColdPlugVFIO,
|
||||
}
|
||||
|
||||
@@ -602,7 +604,7 @@ func TestMinimalRuntimeConfig(t *testing.T) {
|
||||
|
||||
func TestNewQemuHypervisorConfig(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
var coldPlugVFIO hv.PCIePort
|
||||
var coldPlugVFIO config.PCIePort
|
||||
hypervisorPath := path.Join(dir, "hypervisor")
|
||||
kernelPath := path.Join(dir, "kernel")
|
||||
imagePath := path.Join(dir, "image")
|
||||
@@ -610,8 +612,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
|
||||
disableBlock := true
|
||||
enableIOThreads := true
|
||||
hotplugVFIOOnRootBus := true
|
||||
pcieRootPort := uint32(2)
|
||||
coldPlugVFIO = hv.RootPort
|
||||
coldPlugVFIO = config.BridgePort
|
||||
orgVHostVSockDevicePath := utils.VHostVSockDevicePath
|
||||
blockDeviceAIO := "io_uring"
|
||||
defer func() {
|
||||
@@ -630,7 +631,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
|
||||
DisableBlockDeviceUse: disableBlock,
|
||||
EnableIOThreads: enableIOThreads,
|
||||
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
|
||||
PCIeRootPort: pcieRootPort,
|
||||
ColdPlugVFIO: coldPlugVFIO,
|
||||
RxRateLimiterMaxRate: rxRateLimiterMaxRate,
|
||||
TxRateLimiterMaxRate: txRateLimiterMaxRate,
|
||||
@@ -686,10 +686,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
|
||||
t.Errorf("Expected value for HotplugVFIOOnRootBus %v, got %v", hotplugVFIOOnRootBus, config.HotplugVFIOOnRootBus)
|
||||
}
|
||||
|
||||
if config.PCIeRootPort != pcieRootPort {
|
||||
t.Errorf("Expected value for PCIeRootPort %v, got %v", pcieRootPort, config.PCIeRootPort)
|
||||
}
|
||||
|
||||
if config.RxRateLimiterMaxRate != rxRateLimiterMaxRate {
|
||||
t.Errorf("Expected value for rx rate limiter %v, got %v", rxRateLimiterMaxRate, config.RxRateLimiterMaxRate)
|
||||
}
|
||||
@@ -812,7 +808,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) {
|
||||
disableBlock := true
|
||||
enableIOThreads := true
|
||||
hotplugVFIOOnRootBus := true
|
||||
pcieRootPort := uint32(2)
|
||||
|
||||
hypervisor := hypervisor{
|
||||
Path: hypervisorPath,
|
||||
@@ -823,7 +818,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) {
|
||||
DisableBlockDeviceUse: disableBlock,
|
||||
EnableIOThreads: enableIOThreads,
|
||||
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
|
||||
PCIeRootPort: pcieRootPort,
|
||||
}
|
||||
|
||||
_, err := newQemuHypervisorConfig(hypervisor)
|
||||
|
||||
@@ -453,6 +453,10 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig,
|
||||
return err
|
||||
}
|
||||
|
||||
if err := addHypervisorHotColdPlugVfioOverrides(ocispec, config); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if value, ok := ocispec.Annotations[vcAnnotations.MachineType]; ok {
|
||||
if value != "" {
|
||||
config.HypervisorConfig.HypervisorMachineType = value
|
||||
@@ -508,12 +512,6 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig,
|
||||
return err
|
||||
}
|
||||
|
||||
if err := newAnnotationConfiguration(ocispec, vcAnnotations.PCIeRootPort).setUint(func(pcieRootPort uint64) {
|
||||
config.HypervisorConfig.PCIeRootPort = uint32(pcieRootPort)
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if value, ok := ocispec.Annotations[vcAnnotations.EntropySource]; ok {
|
||||
if !checkPathIsInGlobs(runtime.HypervisorConfig.EntropySourceList, value) {
|
||||
return fmt.Errorf("entropy source %v required from annotation is not valid", value)
|
||||
@@ -576,6 +574,37 @@ func addHypervisorPathOverrides(ocispec specs.Spec, config *vc.SandboxConfig, ru
|
||||
return nil
|
||||
}
|
||||
|
||||
func addHypervisorPCIePortOverride(value string) (config.PCIePort, error) {
|
||||
if value == "" {
|
||||
return config.NoPort, nil
|
||||
}
|
||||
port := config.PCIePort(value)
|
||||
if port.Invalid() {
|
||||
return config.InvalidPort, fmt.Errorf("Invalid PCIe port \"%v\" specified in annotation", value)
|
||||
}
|
||||
return port, nil
|
||||
}
|
||||
|
||||
func addHypervisorHotColdPlugVfioOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) error {
|
||||
|
||||
var err error
|
||||
if value, ok := ocispec.Annotations[vcAnnotations.HotPlugVFIO]; ok {
|
||||
if sbConfig.HypervisorConfig.HotPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil {
|
||||
return err
|
||||
}
|
||||
// If hot-plug is specified disable cold-plug and vice versa
|
||||
sbConfig.HypervisorConfig.ColdPlugVFIO = config.NoPort
|
||||
}
|
||||
if value, ok := ocispec.Annotations[vcAnnotations.ColdPlugVFIO]; ok {
|
||||
if sbConfig.HypervisorConfig.ColdPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil {
|
||||
return err
|
||||
}
|
||||
// If cold-plug is specified disable hot-plug and vice versa
|
||||
sbConfig.HypervisorConfig.HotPlugVFIO = config.NoPort
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig, runtime RuntimeConfig) error {
|
||||
|
||||
if err := newAnnotationConfiguration(ocispec, vcAnnotations.DefaultMemory).setUintWithCheck(func(memorySz uint64) error {
|
||||
|
||||
@@ -599,7 +599,7 @@ func TestContainerPipeSizeAnnotation(t *testing.T) {
|
||||
func TestAddHypervisorAnnotations(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
config := vc.SandboxConfig{
|
||||
sbConfig := vc.SandboxConfig{
|
||||
Annotations: make(map[string]string),
|
||||
}
|
||||
|
||||
@@ -628,8 +628,8 @@ func TestAddHypervisorAnnotations(t *testing.T) {
|
||||
runtimeConfig.HypervisorConfig.VirtioFSDaemonList = []string{"/bin/*ls*"}
|
||||
|
||||
ocispec.Annotations[vcAnnotations.KernelParams] = "vsyscall=emulate iommu=on"
|
||||
addHypervisorConfigOverrides(ocispec, &config, runtimeConfig)
|
||||
assert.Exactly(expectedHyperConfig, config.HypervisorConfig)
|
||||
addHypervisorConfigOverrides(ocispec, &sbConfig, runtimeConfig)
|
||||
assert.Exactly(expectedHyperConfig, sbConfig.HypervisorConfig)
|
||||
|
||||
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "1"
|
||||
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "1"
|
||||
@@ -660,7 +660,8 @@ func TestAddHypervisorAnnotations(t *testing.T) {
|
||||
ocispec.Annotations[vcAnnotations.GuestHookPath] = "/usr/bin/"
|
||||
ocispec.Annotations[vcAnnotations.DisableImageNvdimm] = "true"
|
||||
ocispec.Annotations[vcAnnotations.HotplugVFIOOnRootBus] = "true"
|
||||
ocispec.Annotations[vcAnnotations.PCIeRootPort] = "2"
|
||||
ocispec.Annotations[vcAnnotations.ColdPlugVFIO] = config.BridgePort
|
||||
ocispec.Annotations[vcAnnotations.HotPlugVFIO] = config.NoPort
|
||||
ocispec.Annotations[vcAnnotations.IOMMUPlatform] = "true"
|
||||
ocispec.Annotations[vcAnnotations.SGXEPC] = "64Mi"
|
||||
ocispec.Annotations[vcAnnotations.UseLegacySerial] = "true"
|
||||
@@ -668,55 +669,58 @@ func TestAddHypervisorAnnotations(t *testing.T) {
|
||||
ocispec.Annotations[vcAnnotations.RxRateLimiterMaxRate] = "10000000"
|
||||
ocispec.Annotations[vcAnnotations.TxRateLimiterMaxRate] = "10000000"
|
||||
|
||||
addAnnotations(ocispec, &config, runtimeConfig)
|
||||
assert.Equal(config.HypervisorConfig.NumVCPUs, uint32(1))
|
||||
assert.Equal(config.HypervisorConfig.DefaultMaxVCPUs, uint32(1))
|
||||
assert.Equal(config.HypervisorConfig.MemorySize, uint32(1024))
|
||||
assert.Equal(config.HypervisorConfig.MemSlots, uint32(20))
|
||||
assert.Equal(config.HypervisorConfig.MemOffset, uint64(512))
|
||||
assert.Equal(config.HypervisorConfig.VirtioMem, true)
|
||||
assert.Equal(config.HypervisorConfig.MemPrealloc, true)
|
||||
assert.Equal(config.HypervisorConfig.FileBackedMemRootDir, "/dev/shm")
|
||||
assert.Equal(config.HypervisorConfig.HugePages, true)
|
||||
assert.Equal(config.HypervisorConfig.IOMMU, true)
|
||||
assert.Equal(config.HypervisorConfig.BlockDeviceDriver, "virtio-scsi")
|
||||
assert.Equal(config.HypervisorConfig.BlockDeviceAIO, "io_uring")
|
||||
assert.Equal(config.HypervisorConfig.DisableBlockDeviceUse, true)
|
||||
assert.Equal(config.HypervisorConfig.EnableIOThreads, true)
|
||||
assert.Equal(config.HypervisorConfig.BlockDeviceCacheSet, true)
|
||||
assert.Equal(config.HypervisorConfig.BlockDeviceCacheDirect, true)
|
||||
assert.Equal(config.HypervisorConfig.BlockDeviceCacheNoflush, true)
|
||||
assert.Equal(config.HypervisorConfig.SharedFS, "virtio-fs")
|
||||
assert.Equal(config.HypervisorConfig.VirtioFSDaemon, "/bin/false")
|
||||
assert.Equal(config.HypervisorConfig.VirtioFSCache, "auto")
|
||||
assert.ElementsMatch(config.HypervisorConfig.VirtioFSExtraArgs, [2]string{"arg0", "arg1"})
|
||||
assert.Equal(config.HypervisorConfig.Msize9p, uint32(512))
|
||||
assert.Equal(config.HypervisorConfig.HypervisorMachineType, "q35")
|
||||
assert.Equal(config.HypervisorConfig.MachineAccelerators, "nofw")
|
||||
assert.Equal(config.HypervisorConfig.CPUFeatures, "pmu=off")
|
||||
assert.Equal(config.HypervisorConfig.DisableVhostNet, true)
|
||||
assert.Equal(config.HypervisorConfig.GuestHookPath, "/usr/bin/")
|
||||
assert.Equal(config.HypervisorConfig.DisableImageNvdimm, true)
|
||||
assert.Equal(config.HypervisorConfig.HotplugVFIOOnRootBus, true)
|
||||
assert.Equal(config.HypervisorConfig.PCIeRootPort, uint32(2))
|
||||
assert.Equal(config.HypervisorConfig.IOMMUPlatform, true)
|
||||
assert.Equal(config.HypervisorConfig.SGXEPCSize, int64(67108864))
|
||||
assert.Equal(config.HypervisorConfig.LegacySerial, true)
|
||||
assert.Equal(config.HypervisorConfig.RxRateLimiterMaxRate, uint64(10000000))
|
||||
assert.Equal(config.HypervisorConfig.TxRateLimiterMaxRate, uint64(10000000))
|
||||
err := addAnnotations(ocispec, &sbConfig, runtimeConfig)
|
||||
assert.NoError(err)
|
||||
|
||||
assert.Equal(sbConfig.HypervisorConfig.NumVCPUs, uint32(1))
|
||||
assert.Equal(sbConfig.HypervisorConfig.DefaultMaxVCPUs, uint32(1))
|
||||
assert.Equal(sbConfig.HypervisorConfig.MemorySize, uint32(1024))
|
||||
assert.Equal(sbConfig.HypervisorConfig.MemSlots, uint32(20))
|
||||
assert.Equal(sbConfig.HypervisorConfig.MemOffset, uint64(512))
|
||||
assert.Equal(sbConfig.HypervisorConfig.VirtioMem, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.MemPrealloc, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.FileBackedMemRootDir, "/dev/shm")
|
||||
assert.Equal(sbConfig.HypervisorConfig.HugePages, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.IOMMU, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceDriver, "virtio-scsi")
|
||||
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceAIO, "io_uring")
|
||||
assert.Equal(sbConfig.HypervisorConfig.DisableBlockDeviceUse, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.EnableIOThreads, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheSet, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheDirect, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheNoflush, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.SharedFS, "virtio-fs")
|
||||
assert.Equal(sbConfig.HypervisorConfig.VirtioFSDaemon, "/bin/false")
|
||||
assert.Equal(sbConfig.HypervisorConfig.VirtioFSCache, "auto")
|
||||
assert.ElementsMatch(sbConfig.HypervisorConfig.VirtioFSExtraArgs, [2]string{"arg0", "arg1"})
|
||||
assert.Equal(sbConfig.HypervisorConfig.Msize9p, uint32(512))
|
||||
assert.Equal(sbConfig.HypervisorConfig.HypervisorMachineType, "q35")
|
||||
assert.Equal(sbConfig.HypervisorConfig.MachineAccelerators, "nofw")
|
||||
assert.Equal(sbConfig.HypervisorConfig.CPUFeatures, "pmu=off")
|
||||
assert.Equal(sbConfig.HypervisorConfig.DisableVhostNet, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.GuestHookPath, "/usr/bin/")
|
||||
assert.Equal(sbConfig.HypervisorConfig.DisableImageNvdimm, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.HotplugVFIOOnRootBus, true)
|
||||
assert.Equal(string(sbConfig.HypervisorConfig.ColdPlugVFIO), string(config.BridgePort))
|
||||
assert.Equal(string(sbConfig.HypervisorConfig.HotPlugVFIO), string(config.NoPort))
|
||||
assert.Equal(sbConfig.HypervisorConfig.IOMMUPlatform, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.SGXEPCSize, int64(67108864))
|
||||
assert.Equal(sbConfig.HypervisorConfig.LegacySerial, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.RxRateLimiterMaxRate, uint64(10000000))
|
||||
assert.Equal(sbConfig.HypervisorConfig.TxRateLimiterMaxRate, uint64(10000000))
|
||||
|
||||
// In case an absurd large value is provided, the config value if not over-ridden
|
||||
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "655536"
|
||||
err := addAnnotations(ocispec, &config, runtimeConfig)
|
||||
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
|
||||
assert.Error(err)
|
||||
|
||||
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "-1"
|
||||
err = addAnnotations(ocispec, &config, runtimeConfig)
|
||||
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
|
||||
assert.Error(err)
|
||||
|
||||
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "1"
|
||||
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "-1"
|
||||
err = addAnnotations(ocispec, &config, runtimeConfig)
|
||||
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
|
||||
assert.Error(err)
|
||||
|
||||
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "1"
|
||||
|
||||
@@ -864,12 +864,12 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
|
||||
defer cancel()
|
||||
|
||||
// Create the clh device config via the constructor to ensure default values are properly assigned
|
||||
clhDevice := *chclient.NewDeviceConfig(*(*device).GetSysfsDev())
|
||||
clhDevice := *chclient.NewDeviceConfig(device.SysfsDev)
|
||||
pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice)
|
||||
if err != nil {
|
||||
return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err))
|
||||
}
|
||||
clh.devicesIds[*(*device).GetID()] = pciInfo.GetId()
|
||||
clh.devicesIds[device.ID] = pciInfo.GetId()
|
||||
|
||||
// clh doesn't use bridges, so the PCI path is simply the slot
|
||||
// number of the device. This will break if clh starts using
|
||||
@@ -886,14 +886,11 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
|
||||
return fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf)
|
||||
}
|
||||
|
||||
guestPciPath, err := types.PciPathFromString(tokens[0])
|
||||
|
||||
pciDevice, ok := (*device).(config.VFIOPCIDev)
|
||||
if !ok {
|
||||
if device.Type == config.VFIOAPDeviceMediatedType {
|
||||
return fmt.Errorf("VFIO device %+v is not PCI, only PCI is supported in Cloud Hypervisor", device)
|
||||
}
|
||||
pciDevice.GuestPciPath = guestPciPath
|
||||
*device = pciDevice
|
||||
|
||||
device.GuestPciPath, err = types.PciPathFromString(tokens[0])
|
||||
|
||||
return err
|
||||
}
|
||||
@@ -937,7 +934,7 @@ func (clh *cloudHypervisor) HotplugRemoveDevice(ctx context.Context, devInfo int
|
||||
case BlockDev:
|
||||
deviceID = clhDriveIndexToID(devInfo.(*config.BlockDrive).Index)
|
||||
case VfioDev:
|
||||
deviceID = *devInfo.(config.VFIODev).GetID()
|
||||
deviceID = devInfo.(*config.VFIODev).ID
|
||||
default:
|
||||
clh.Logger().WithFields(log.Fields{"devInfo": devInfo,
|
||||
"deviceType": devType}).Error("HotplugRemoveDevice: unsupported device")
|
||||
|
||||
@@ -673,7 +673,7 @@ func TestCloudHypervisorHotplugRemoveDevice(t *testing.T) {
|
||||
_, err = clh.HotplugRemoveDevice(context.Background(), &config.BlockDrive{}, BlockDev)
|
||||
assert.NoError(err, "Hotplug remove block device expected no error")
|
||||
|
||||
_, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIOPCIDev{}, VfioDev)
|
||||
_, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIODev{}, VfioDev)
|
||||
assert.NoError(err, "Hotplug remove vfio block device expected no error")
|
||||
|
||||
_, err = clh.HotplugRemoveDevice(context.Background(), nil, NetDev)
|
||||
|
||||
@@ -288,12 +288,12 @@ type HypervisorConfig struct {
|
||||
// root bus instead of a bridge.
|
||||
HotplugVFIOOnRootBus bool
|
||||
|
||||
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
|
||||
// The PCIe Root Port device is used to hot-plug the PCIe device
|
||||
PCIeRootPort uint32
|
||||
// HotPlugVFIO is used to indicate if devices need to be hotplugged on the
|
||||
// root port, switch, bridge or no port
|
||||
HotPlugVFIO hv.PCIePort
|
||||
|
||||
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
|
||||
// root port, switch or no port
|
||||
// root port, switch, bridge or no port
|
||||
ColdPlugVFIO hv.PCIePort
|
||||
|
||||
// BootToBeTemplate used to indicate if the VM is created to be a template VM
|
||||
|
||||
@@ -505,13 +505,21 @@ type HypervisorConfig struct {
|
||||
// MemOffset specifies memory space for nvdimm device
|
||||
MemOffset uint64
|
||||
|
||||
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
|
||||
// The PCIe Root Port device is used to hot-plug the PCIe device
|
||||
PCIeRootPort uint32
|
||||
// VFIODevices are used to get PCIe device info early before the sandbox
|
||||
// is started to make better PCIe topology decisions
|
||||
VFIODevices []config.DeviceInfo
|
||||
// VhostUserBlkDevices are handled differently in Q35 and Virt machine
|
||||
// type. capture them early before the sandbox to make better PCIe topology
|
||||
// decisions
|
||||
VhostUserBlkDevices []config.DeviceInfo
|
||||
|
||||
// HotplugVFIO is used to indicate if devices need to be hotplugged on the
|
||||
// root port or a switch
|
||||
HotPlugVFIO config.PCIePort
|
||||
|
||||
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
|
||||
// root port, switch or no port
|
||||
ColdPlugVFIO hv.PCIePort
|
||||
ColdPlugVFIO config.PCIePort
|
||||
|
||||
// NumVCPUs specifies default number of vCPUs for the VM.
|
||||
NumVCPUs uint32
|
||||
|
||||
@@ -21,6 +21,7 @@ import (
|
||||
"github.com/docker/go-units"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/api"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
|
||||
volume "github.com/kata-containers/kata-containers/src/runtime/pkg/direct-volume"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/uuid"
|
||||
@@ -1137,7 +1138,7 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *
|
||||
ContainerPath: dev.ContainerPath,
|
||||
Type: kataVfioPciDevType,
|
||||
Id: groupNum,
|
||||
Options: nil,
|
||||
Options: make([]string, len(devList)),
|
||||
}
|
||||
|
||||
// We always pass the device information to the agent, since
|
||||
@@ -1147,16 +1148,16 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *
|
||||
if c.sandbox.config.VfioMode == config.VFIOModeGuestKernel {
|
||||
kataDevice.Type = kataVfioPciGuestKernelDevType
|
||||
}
|
||||
|
||||
if (*devList[0]).GetType() == config.VFIOAPDeviceMediatedType {
|
||||
for i, dev := range devList {
|
||||
if dev.Type == config.VFIOAPDeviceMediatedType {
|
||||
kataDevice.Type = kataVfioApDevType
|
||||
kataDevice.Options = (*devList[0]).(config.VFIOAPDev).APDevices
|
||||
kataDevice.Options = dev.APDevices
|
||||
} else {
|
||||
kataDevice.Options = make([]string, len(devList))
|
||||
for i, device := range devList {
|
||||
pciDevice := (*device).(config.VFIOPCIDev)
|
||||
kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", pciDevice.BDF, pciDevice.GuestPciPath)
|
||||
|
||||
devBDF := drivers.GetBDF(dev.BDF)
|
||||
kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", devBDF, dev.GuestPciPath)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return kataDevice
|
||||
@@ -1342,7 +1343,6 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co
|
||||
if _, err = k.sendReq(ctx, req); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return buildProcessFromExecID(req.ExecId)
|
||||
}
|
||||
|
||||
|
||||
@@ -245,7 +245,6 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) {
|
||||
DisableNestingChecks: sconfig.HypervisorConfig.DisableNestingChecks,
|
||||
DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm,
|
||||
HotplugVFIOOnRootBus: sconfig.HypervisorConfig.HotplugVFIOOnRootBus,
|
||||
PCIeRootPort: sconfig.HypervisorConfig.PCIeRootPort,
|
||||
BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate,
|
||||
BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate,
|
||||
DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet,
|
||||
@@ -487,8 +486,8 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) {
|
||||
DisableNestingChecks: hconf.DisableNestingChecks,
|
||||
DisableImageNvdimm: hconf.DisableImageNvdimm,
|
||||
HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus,
|
||||
HotPlugVFIO: hconf.HotPlugVFIO,
|
||||
ColdPlugVFIO: hconf.ColdPlugVFIO,
|
||||
PCIeRootPort: hconf.PCIeRootPort,
|
||||
BootToBeTemplate: hconf.BootToBeTemplate,
|
||||
BootFromTemplate: hconf.BootFromTemplate,
|
||||
DisableVhostNet: hconf.DisableVhostNet,
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
package persistapi
|
||||
|
||||
import (
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
specs "github.com/opencontainers/runtime-spec/specs-go"
|
||||
)
|
||||
@@ -131,10 +131,6 @@ type HypervisorConfig struct {
|
||||
// Enable SGX. Hardware-based isolation and memory encryption.
|
||||
SGXEPCSize int64
|
||||
|
||||
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
|
||||
// The PCIe Root Port device is used to hot-plug the PCIe device
|
||||
PCIeRootPort uint32
|
||||
|
||||
// NumVCPUs specifies default number of vCPUs for the VM.
|
||||
NumVCPUs uint32
|
||||
|
||||
@@ -199,9 +195,13 @@ type HypervisorConfig struct {
|
||||
// root bus instead of a bridge.
|
||||
HotplugVFIOOnRootBus bool
|
||||
|
||||
// HotPlugVFIO is used to indicate if devices need to be hotplugged on the
|
||||
// root, switch, bridge or no-port
|
||||
HotPlugVFIO config.PCIePort
|
||||
|
||||
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
|
||||
// root port or a switch or no-port
|
||||
ColdPlugVFIO hv.PCIePort
|
||||
// root, bridge, switch or no-port
|
||||
ColdPlugVFIO config.PCIePort
|
||||
|
||||
// BootToBeTemplate used to indicate if the VM is created to be a template VM
|
||||
BootToBeTemplate bool
|
||||
|
||||
@@ -126,9 +126,11 @@ const (
|
||||
// root bus instead of a bridge.
|
||||
HotplugVFIOOnRootBus = kataAnnotHypervisorPrefix + "hotplug_vfio_on_root_bus"
|
||||
|
||||
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
|
||||
// The PCIe Root Port device is used to hot-plug the PCIe device
|
||||
PCIeRootPort = kataAnnotHypervisorPrefix + "pcie_root_port"
|
||||
// ColdPlugVFIO is a sandbox annotation used to indicate if devices need to be coldplugged.
|
||||
ColdPlugVFIO = kataAnnotHypervisorPrefix + "cold_plug_vfio"
|
||||
|
||||
// HotPlugVFIO is a sandbox annotation used to indicate if devices need to be hotplugged.
|
||||
HotPlugVFIO = kataAnnotHypervisorPrefix + "hot_plug_vfio"
|
||||
|
||||
// EntropySource is a sandbox annotation to specify the path to a host source of
|
||||
// entropy (/dev/random, /dev/urandom or real hardware RNG device)
|
||||
|
||||
@@ -65,6 +65,11 @@ const romFile = ""
|
||||
// Default value is false.
|
||||
const defaultDisableModern = false
|
||||
|
||||
// A deeper PCIe topology than 5 is already not advisable just for the sake
|
||||
// of having enough buffer we limit ourselves to 10 and exit if we reach
|
||||
// the root bus
|
||||
const maxPCIeTopoDepth = 10
|
||||
|
||||
type qmpChannel struct {
|
||||
qmp *govmmQemu.QMP
|
||||
ctx context.Context
|
||||
@@ -76,14 +81,14 @@ type qmpChannel struct {
|
||||
// QemuState keeps Qemu's state
|
||||
type QemuState struct {
|
||||
UUID string
|
||||
HotPlugVFIO config.PCIePort
|
||||
Bridges []types.Bridge
|
||||
// HotpluggedCPUs is the list of CPUs that were hot-added
|
||||
HotpluggedVCPUs []hv.CPUDevice
|
||||
HotpluggedMemory int
|
||||
VirtiofsDaemonPid int
|
||||
PCIeRootPort int
|
||||
HotplugVFIOOnRootBus bool
|
||||
ColdPlugVFIO hv.PCIePort
|
||||
HotplugVFIO config.PCIePort
|
||||
ColdPlugVFIO config.PCIePort
|
||||
}
|
||||
|
||||
// qemu is an Hypervisor interface implementation for the Linux qemu hypervisor.
|
||||
@@ -282,10 +287,10 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso
|
||||
|
||||
q.Logger().Debug("Creating UUID")
|
||||
q.state.UUID = uuid.Generate().String()
|
||||
|
||||
q.state.HotPlugVFIO = q.config.HotPlugVFIO
|
||||
q.state.ColdPlugVFIO = q.config.ColdPlugVFIO
|
||||
q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus
|
||||
q.state.PCIeRootPort = int(q.config.PCIeRootPort)
|
||||
q.state.HotPlugVFIO = q.config.HotPlugVFIO
|
||||
|
||||
// The path might already exist, but in case of VM templating,
|
||||
// we have to create it since the sandbox has not created it yet.
|
||||
@@ -701,27 +706,12 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi
|
||||
}
|
||||
}
|
||||
|
||||
// Add PCIe Root Port devices to hypervisor
|
||||
// The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged into PCIe Root Port.
|
||||
// For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt
|
||||
memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory()
|
||||
|
||||
if hypervisorConfig.PCIeRootPort > 0 {
|
||||
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, hypervisorConfig.PCIeRootPort, memSize32bit, memSize64bit)
|
||||
if machine.Type == QemuQ35 || machine.Type == QemuVirt {
|
||||
if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig, machine.Type); err != nil {
|
||||
q.Logger().WithError(err).Errorf("Cannot create PCIe topology")
|
||||
return err
|
||||
}
|
||||
|
||||
// The default OVMF MMIO aperture is too small for some PCIe devices
|
||||
// with huge BARs so we need to increase it.
|
||||
// memSize64bit is in bytes, convert to MB, OVMF expects MB as a string
|
||||
if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") {
|
||||
pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024))
|
||||
fwCfg := govmmQemu.FwCfg{
|
||||
Name: "opt/ovmf/X-PciMmio64Mb",
|
||||
Str: pciMmio64Mb,
|
||||
}
|
||||
qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg)
|
||||
}
|
||||
|
||||
q.qemuConfig = qemuConfig
|
||||
|
||||
q.virtiofsDaemon, err = q.createVirtiofsDaemon(hypervisorConfig.SharedPath)
|
||||
@@ -747,6 +737,101 @@ func (q *qemu) checkBpfEnabled() {
|
||||
}
|
||||
}
|
||||
|
||||
// If a user uses 8 GPUs with 4 devices in each IOMMU Group that means we need
|
||||
// to hotplug 32 devices. We do not have enough PCIe root bus slots to
|
||||
// accomplish this task. Kata will use already some slots for vfio-xxxx-pci
|
||||
// devices.
|
||||
// Max PCI slots per root bus is 32
|
||||
// Max PCIe root ports is 16
|
||||
// Max PCIe switch ports is 16
|
||||
// There is only 64kB of IO memory each root,switch port will consume 4k hence
|
||||
// only 16 ports possible.
|
||||
func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig, machineType string) error {
|
||||
|
||||
// If no-port set just return no need to add PCIe Root Port or PCIe Switches
|
||||
if hypervisorConfig.HotPlugVFIO == config.NoPort && hypervisorConfig.ColdPlugVFIO == config.NoPort && machineType == QemuQ35 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Add PCIe Root Port or PCIe Switches to the hypervisor
|
||||
// The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged
|
||||
// into a PCIe Root Port or PCIe Switch.
|
||||
// For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt
|
||||
|
||||
// Deduce the right values for mem-reserve and pref-64-reserve memory regions
|
||||
memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory()
|
||||
|
||||
// The default OVMF MMIO aperture is too small for some PCIe devices
|
||||
// with huge BARs so we need to increase it.
|
||||
// memSize64bit is in bytes, convert to MB, OVMF expects MB as a string
|
||||
if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") {
|
||||
pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024))
|
||||
fwCfg := govmmQemu.FwCfg{
|
||||
Name: "opt/ovmf/X-PciMmio64Mb",
|
||||
Str: pciMmio64Mb,
|
||||
}
|
||||
qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg)
|
||||
}
|
||||
|
||||
// Get the number of hot(cold)-pluggable ports needed from the provided
|
||||
// VFIO devices and VhostUserBlockDevices
|
||||
var numOfPluggablePorts uint32 = 0
|
||||
for _, dev := range hypervisorConfig.VFIODevices {
|
||||
var err error
|
||||
dev.HostPath, err = config.GetHostPath(dev, false, "")
|
||||
if err != nil {
|
||||
return fmt.Errorf("Cannot get host path for device: %v err: %v", dev, err)
|
||||
}
|
||||
devicesPerIOMMUGroup, err := drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
|
||||
if err != nil {
|
||||
return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err)
|
||||
}
|
||||
for _, vfioDevice := range devicesPerIOMMUGroup {
|
||||
if drivers.IsPCIeDevice(vfioDevice.BDF) {
|
||||
numOfPluggablePorts = numOfPluggablePorts + 1
|
||||
}
|
||||
}
|
||||
}
|
||||
vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus)
|
||||
vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort)
|
||||
|
||||
numOfVhostUserBlockDevices := len(hypervisorConfig.VhostUserBlkDevices)
|
||||
|
||||
// If number of PCIe root ports > 16 then bail out otherwise we may
|
||||
// use up all slots or IO memory on the root bus and vfio-XXX-pci devices
|
||||
// cannot be added which are crucial for Kata max slots on root bus is 32
|
||||
// max slots on the complete pci(e) topology is 256 in QEMU
|
||||
if vfioOnRootPort {
|
||||
// On Arm the vhost-user-block device is a PCIe device we need
|
||||
// to account for it in the number of pluggable ports
|
||||
if machineType == QemuVirt {
|
||||
numOfPluggablePorts = numOfPluggablePorts + uint32(numOfVhostUserBlockDevices)
|
||||
}
|
||||
if numOfPluggablePorts > maxPCIeRootPort {
|
||||
return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
|
||||
}
|
||||
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
|
||||
return nil
|
||||
}
|
||||
if vfioOnSwitchPort {
|
||||
// On Arm the vhost-user-block device is a PCIe device we need
|
||||
// to account for it in the number of pluggable ports
|
||||
if machineType == QemuVirt {
|
||||
numOfPluggableRootPorts := uint32(numOfVhostUserBlockDevices)
|
||||
if numOfPluggableRootPorts > maxPCIeRootPort {
|
||||
return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
|
||||
}
|
||||
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggableRootPorts, memSize32bit, memSize64bit)
|
||||
}
|
||||
if numOfPluggablePorts > maxPCIeSwitchPort {
|
||||
return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", maxPCIeSwitchPort)
|
||||
}
|
||||
qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
|
||||
return nil
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (q *qemu) vhostFSSocketPath(id string) (string, error) {
|
||||
return utils.BuildSocketPath(q.config.VMStorePath, id, vhostFSSocket)
|
||||
}
|
||||
@@ -1527,6 +1612,7 @@ func (q *qemu) hotplugAddBlockDevice(ctx context.Context, drive *config.BlockDri
|
||||
}
|
||||
|
||||
func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.VhostUserDeviceAttrs, op Operation, devID string) (err error) {
|
||||
|
||||
err = q.qmpMonitorCh.qmp.ExecuteCharDevUnixSocketAdd(q.qmpMonitorCh.ctx, vAttr.DevID, vAttr.SocketPath, false, false, vAttr.ReconnectTime)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -1544,18 +1630,14 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V
|
||||
|
||||
switch machineType {
|
||||
case QemuVirt:
|
||||
if q.state.PCIeRootPort <= 0 {
|
||||
return fmt.Errorf("Vhost-user-blk device is a PCIe device if machine type is virt. Need to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for virt")
|
||||
}
|
||||
|
||||
//The addr of a dev is corresponding with device:function for PCIe in qemu which starting from 0
|
||||
//Since the dev is the first and only one on this bus(root port), it should be 0.
|
||||
addr := "00"
|
||||
|
||||
bridgeId := fmt.Sprintf("%s%d", pcieRootPortPrefix, len(drivers.AllPCIeDevs))
|
||||
drivers.AllPCIeDevs[devID] = true
|
||||
bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevices[config.RootPort]))
|
||||
config.PCIeDevices[config.RootPort][devID] = true
|
||||
|
||||
bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeId)
|
||||
bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID)
|
||||
bridgeSlot, err := q.qomGetSlot(bridgeQomPath)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -1571,7 +1653,7 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V
|
||||
return err
|
||||
}
|
||||
|
||||
if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeId); err != nil {
|
||||
if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeID); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -1685,41 +1767,108 @@ func (q *qemu) qomGetSlot(qomPath string) (types.PciSlot, error) {
|
||||
|
||||
// Query QMP to find a device's PCI path given its QOM path or ID
|
||||
func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) {
|
||||
// XXX: For now we assume there's exactly one bridge, since
|
||||
// that's always how we configure qemu from Kata for now. It
|
||||
// would be good to generalize this to different PCI
|
||||
// topologies
|
||||
|
||||
var slots []types.PciSlot
|
||||
|
||||
devSlot, err := q.qomGetSlot(qemuID)
|
||||
if err != nil {
|
||||
return types.PciPath{}, err
|
||||
}
|
||||
slots = append(slots, devSlot)
|
||||
|
||||
busq, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qemuID, "parent_bus")
|
||||
// This only works for Q35 and Virt
|
||||
r, _ := regexp.Compile(`^/machine/.*/pcie.0`)
|
||||
|
||||
var parentPath = qemuID
|
||||
// We do not want to use a forever loop here, a deeper PCIe topology
|
||||
// than 5 is already not advisable just for the sake of having enough
|
||||
// buffer we limit ourselves to 10 and leave the loop early if we hit
|
||||
// the root bus.
|
||||
for i := 1; i <= maxPCIeTopoDepth; i++ {
|
||||
parenBusQOM, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, parentPath, "parent_bus")
|
||||
if err != nil {
|
||||
return types.PciPath{}, err
|
||||
}
|
||||
|
||||
bus, ok := busq.(string)
|
||||
busQOM, ok := parenBusQOM.(string)
|
||||
if !ok {
|
||||
return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, busq)
|
||||
return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, parenBusQOM)
|
||||
}
|
||||
|
||||
// If we hit /machine/q35/pcie.0 we're done this is the root bus
|
||||
// we climbed the complete hierarchy
|
||||
if r.Match([]byte(busQOM)) {
|
||||
break
|
||||
}
|
||||
|
||||
// `bus` is the QOM path of the QOM bus object, but we need
|
||||
// the PCI bridge which manages that bus. There doesn't seem
|
||||
// the PCI parent_bus which manages that bus. There doesn't seem
|
||||
// to be a way to get that other than to simply drop the last
|
||||
// path component.
|
||||
idx := strings.LastIndex(bus, "/")
|
||||
idx := strings.LastIndex(busQOM, "/")
|
||||
if idx == -1 {
|
||||
return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", bus)
|
||||
return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM)
|
||||
}
|
||||
bridge := bus[:idx]
|
||||
parentBus := busQOM[:idx]
|
||||
|
||||
bridgeSlot, err := q.qomGetSlot(bridge)
|
||||
parentSlot, err := q.qomGetSlot(parentBus)
|
||||
if err != nil {
|
||||
return types.PciPath{}, err
|
||||
}
|
||||
|
||||
return types.PciPathFromSlots(bridgeSlot, devSlot)
|
||||
// Prepend the slots, since we're climbing the hierarchy
|
||||
slots = append([]types.PciSlot{parentSlot}, slots...)
|
||||
parentPath = parentBus
|
||||
}
|
||||
return types.PciPathFromSlots(slots...)
|
||||
}
|
||||
|
||||
func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFIODev) (err error) {
|
||||
return q.executeVFIODeviceAdd(device)
|
||||
}
|
||||
|
||||
func (q *qemu) hotplugVFIODeviceSwitchPort(ctx context.Context, device *config.VFIODev) (err error) {
|
||||
return q.executeVFIODeviceAdd(device)
|
||||
}
|
||||
|
||||
func (q *qemu) hotplugVFIODeviceBridgePort(ctx context.Context, device *config.VFIODev) (err error) {
|
||||
addr, bridge, err := q.arch.addDeviceToBridge(ctx, device.ID, types.PCI)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if err != nil {
|
||||
q.arch.removeDeviceFromBridge(device.ID)
|
||||
}
|
||||
}()
|
||||
return q.executePCIVFIODeviceAdd(device, addr, bridge.ID)
|
||||
}
|
||||
|
||||
func (q *qemu) executePCIVFIODeviceAdd(device *config.VFIODev, addr string, bridgeID string) error {
|
||||
switch device.Type {
|
||||
case config.VFIOPCIDeviceNormalType:
|
||||
return q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, addr, bridgeID, romFile)
|
||||
case config.VFIOPCIDeviceMediatedType:
|
||||
return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, addr, bridgeID, romFile)
|
||||
case config.VFIOAPDeviceMediatedType:
|
||||
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
|
||||
default:
|
||||
return fmt.Errorf("Incorrect VFIO device type found")
|
||||
}
|
||||
}
|
||||
|
||||
func (q *qemu) executeVFIODeviceAdd(device *config.VFIODev) error {
|
||||
switch device.Type {
|
||||
case config.VFIOPCIDeviceNormalType:
|
||||
return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, device.Bus, romFile)
|
||||
case config.VFIOPCIDeviceMediatedType:
|
||||
return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, "", device.Bus, romFile)
|
||||
case config.VFIOAPDeviceMediatedType:
|
||||
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
|
||||
default:
|
||||
return fmt.Errorf("Incorrect VFIO device type found")
|
||||
}
|
||||
}
|
||||
|
||||
func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op Operation) (err error) {
|
||||
@@ -1727,85 +1876,32 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op
|
||||
return err
|
||||
}
|
||||
|
||||
devID := *(*device).GetID()
|
||||
machineType := q.HypervisorConfig().HypervisorMachineType
|
||||
|
||||
if op == AddDevice {
|
||||
|
||||
buf, _ := json.Marshal(device)
|
||||
q.Logger().WithFields(logrus.Fields{
|
||||
"machine-type": machineType,
|
||||
"hotplug-vfio-on-root-bus": q.state.HotplugVFIOOnRootBus,
|
||||
"pcie-root-port": q.state.PCIeRootPort,
|
||||
"machine-type": q.HypervisorConfig().HypervisorMachineType,
|
||||
"hot-plug-vfio": q.state.HotPlugVFIO,
|
||||
"device-info": string(buf),
|
||||
}).Info("Start hot-plug VFIO device")
|
||||
|
||||
// In case MachineType is q35, a PCIe device is hotplugged on
|
||||
// a PCIe Root Port or alternatively on a PCIe Switch Port
|
||||
if q.HypervisorConfig().HypervisorMachineType != QemuQ35 && q.HypervisorConfig().HypervisorMachineType != QemuVirt {
|
||||
device.Bus = ""
|
||||
} else {
|
||||
var err error
|
||||
// In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
|
||||
// for pc machine type instead of bridge. This is useful for devices that require
|
||||
// a large PCI BAR which is a currently a limitation with PCI bridges.
|
||||
if q.state.HotplugVFIOOnRootBus {
|
||||
switch (*device).GetType() {
|
||||
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
|
||||
// In case MachineType is q35, a PCIe device is hotplugged on a PCIe Root Port.
|
||||
pciDevice, ok := (*device).(config.VFIOPCIDev)
|
||||
if !ok {
|
||||
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
|
||||
}
|
||||
switch machineType {
|
||||
case QemuQ35:
|
||||
if pciDevice.IsPCIe && q.state.PCIeRootPort <= 0 {
|
||||
q.Logger().WithField("dev-id", (*device).GetID()).Warn("VFIO device is a PCIe device. It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35")
|
||||
pciDevice.Bus = ""
|
||||
}
|
||||
default:
|
||||
pciDevice.Bus = ""
|
||||
}
|
||||
*device = pciDevice
|
||||
|
||||
if pciDevice.Type == config.VFIOPCIDeviceNormalType {
|
||||
err = q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, pciDevice.Bus, romFile)
|
||||
if q.state.HotPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus {
|
||||
err = q.hotplugVFIODeviceRootPort(ctx, device)
|
||||
} else if q.state.HotPlugVFIO == config.SwitchPort {
|
||||
err = q.hotplugVFIODeviceSwitchPort(ctx, device)
|
||||
} else {
|
||||
err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), "", pciDevice.Bus, romFile)
|
||||
}
|
||||
case config.VFIOAPDeviceMediatedType:
|
||||
err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev())
|
||||
}
|
||||
} else {
|
||||
addr, bridge, err := q.arch.addDeviceToBridge(ctx, devID, types.PCI)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if err != nil {
|
||||
q.arch.removeDeviceFromBridge(devID)
|
||||
}
|
||||
}()
|
||||
|
||||
switch (*device).GetType() {
|
||||
case config.VFIOPCIDeviceNormalType:
|
||||
pciDevice, ok := (*device).(config.VFIOPCIDev)
|
||||
if !ok {
|
||||
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
|
||||
}
|
||||
err = q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, addr, bridge.ID, romFile)
|
||||
case config.VFIOPCIDeviceMediatedType:
|
||||
err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), addr, bridge.ID, romFile)
|
||||
case config.VFIOAPDeviceMediatedType:
|
||||
err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev())
|
||||
default:
|
||||
return fmt.Errorf("Incorrect VFIO device type found")
|
||||
}
|
||||
err = q.hotplugVFIODeviceBridgePort(ctx, device)
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
switch (*device).GetType() {
|
||||
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
|
||||
pciDevice, ok := (*device).(config.VFIOPCIDev)
|
||||
if !ok {
|
||||
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
|
||||
}
|
||||
// XXX: Depending on whether we're doing root port or
|
||||
// bridge hotplug, and how the bridge is set up in
|
||||
@@ -1813,23 +1909,20 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op
|
||||
// have information about the slot number of the
|
||||
// bridge and or the device. For simplicity, just
|
||||
// query both of them back from qemu
|
||||
guestPciPath, err := q.qomGetPciPath(devID)
|
||||
pciDevice.GuestPciPath = guestPciPath
|
||||
*device = pciDevice
|
||||
device.GuestPciPath, err = q.qomGetPciPath(device.ID)
|
||||
return err
|
||||
}
|
||||
return err
|
||||
} else {
|
||||
q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device")
|
||||
|
||||
q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device")
|
||||
|
||||
if !q.state.HotplugVFIOOnRootBus {
|
||||
if err := q.arch.removeDeviceFromBridge(devID); err != nil {
|
||||
if err := q.arch.removeDeviceFromBridge(device.ID); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID)
|
||||
}
|
||||
return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID)
|
||||
|
||||
}
|
||||
|
||||
func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error {
|
||||
@@ -2527,7 +2620,7 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin
|
||||
for i := uint32(0); i < number; i++ {
|
||||
devices = append(devices,
|
||||
govmmQemu.PCIeRootPortDevice{
|
||||
ID: fmt.Sprintf("%s%d", pcieRootPortPrefix, i),
|
||||
ID: fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, i),
|
||||
Bus: bus,
|
||||
Chassis: chassis,
|
||||
Slot: strconv.FormatUint(uint64(i), 10),
|
||||
@@ -2541,6 +2634,79 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin
|
||||
return devices
|
||||
}
|
||||
|
||||
// gollangci-lint enforces multi-line comments to be a block comment
|
||||
// not multiple single line comments ...
|
||||
/* pcie.0 bus
|
||||
// -------------------------------------------------
|
||||
// |
|
||||
// -------------
|
||||
// | Root Port |
|
||||
// -------------
|
||||
// -------------------------|------------------------
|
||||
// | ----------------- |
|
||||
// | PCI Express | Upstream Port | |
|
||||
// | Switch ----------------- |
|
||||
// | | | |
|
||||
// | ------------------- ------------------- |
|
||||
// | | Downstream Port | | Downstream Port | |
|
||||
// | ------------------- ------------------- |
|
||||
// -------------|-----------------------|------------
|
||||
// ------------- --------------
|
||||
// | GPU/ACCEL | | IB/ETH NIC |
|
||||
// ------------- --------------
|
||||
*/
|
||||
// genericAppendPCIeSwitch adds a PCIe Swtich
|
||||
func genericAppendPCIeSwitchPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
|
||||
|
||||
// Q35, Virt have the correct PCIe support,
|
||||
// hence ignore all other machines
|
||||
if machineType != QemuQ35 && machineType != QemuVirt {
|
||||
return devices
|
||||
}
|
||||
|
||||
// Using an own ID for the root port, so we do not clash with already
|
||||
// existing root ports adding "s" for switch prefix
|
||||
pcieRootPort := govmmQemu.PCIeRootPortDevice{
|
||||
ID: fmt.Sprintf("%s%s%d", config.PCIeSwitchPortPrefix, config.PCIeRootPortPrefix, 0),
|
||||
Bus: defaultBridgeBus,
|
||||
Chassis: "1",
|
||||
Slot: strconv.FormatUint(uint64(0), 10),
|
||||
Multifunction: false,
|
||||
Addr: "0",
|
||||
MemReserve: fmt.Sprintf("%dB", memSize32bit),
|
||||
Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
|
||||
}
|
||||
|
||||
devices = append(devices, pcieRootPort)
|
||||
|
||||
pcieSwitchUpstreamPort := govmmQemu.PCIeSwitchUpstreamPortDevice{
|
||||
ID: fmt.Sprintf("%s%d", config.PCIeSwitchUpstreamPortPrefix, 0),
|
||||
Bus: pcieRootPort.ID,
|
||||
}
|
||||
devices = append(devices, pcieSwitchUpstreamPort)
|
||||
|
||||
currentChassis, err := strconv.Atoi(pcieRootPort.Chassis)
|
||||
if err != nil {
|
||||
return devices
|
||||
}
|
||||
nextChassis := currentChassis + 1
|
||||
|
||||
for i := uint32(0); i < number; i++ {
|
||||
|
||||
pcieSwitchDownstreamPort := govmmQemu.PCIeSwitchDownstreamPortDevice{
|
||||
ID: fmt.Sprintf("%s%d", config.PCIeSwitchhDownstreamPortPrefix, i),
|
||||
Bus: pcieSwitchUpstreamPort.ID,
|
||||
Chassis: fmt.Sprintf("%d", nextChassis),
|
||||
Slot: strconv.FormatUint(uint64(i), 10),
|
||||
// TODO: MemReserve: fmt.Sprintf("%dB", memSize32bit),
|
||||
// TODO: Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
|
||||
}
|
||||
devices = append(devices, pcieSwitchDownstreamPort)
|
||||
}
|
||||
|
||||
return devices
|
||||
}
|
||||
|
||||
func (q *qemu) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) {
|
||||
span, _ := katatrace.Trace(ctx, q.Logger(), "GetThreadIDs", qemuTracingTags, map[string]string{"sandbox_id": q.id})
|
||||
defer span.End()
|
||||
@@ -2716,7 +2882,6 @@ func (q *qemu) Save() (s hv.HypervisorState) {
|
||||
s.UUID = q.state.UUID
|
||||
s.HotpluggedMemory = q.state.HotpluggedMemory
|
||||
s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus
|
||||
s.PCIeRootPort = q.state.PCIeRootPort
|
||||
|
||||
for _, bridge := range q.arch.getBridges() {
|
||||
s.Bridges = append(s.Bridges, hv.Bridge{
|
||||
@@ -2740,7 +2905,6 @@ func (q *qemu) Load(s hv.HypervisorState) {
|
||||
q.state.HotpluggedMemory = s.HotpluggedMemory
|
||||
q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus
|
||||
q.state.VirtiofsDaemonPid = s.VirtiofsDaemonPid
|
||||
q.state.PCIeRootPort = s.PCIeRootPort
|
||||
|
||||
for _, bridge := range s.Bridges {
|
||||
q.state.Bridges = append(q.state.Bridges, types.NewBridge(types.Type(bridge.Type), bridge.ID, bridge.DeviceAddr, bridge.Addr))
|
||||
|
||||
@@ -124,7 +124,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) {
|
||||
legacySerial: config.LegacySerial,
|
||||
},
|
||||
vmFactory: factory,
|
||||
snpGuest: config.SevSnpGuest,
|
||||
snpGuest: config.ConfidentialGuest,
|
||||
}
|
||||
|
||||
if config.ConfidentialGuest {
|
||||
@@ -308,6 +308,7 @@ func (q *qemuAmd64) appendProtectionDevice(devices []govmmQemu.Device, firmware,
|
||||
ReducedPhysBits: 1,
|
||||
}), "", nil
|
||||
case noneProtection:
|
||||
|
||||
return devices, firmware, nil
|
||||
|
||||
default:
|
||||
|
||||
@@ -140,6 +140,9 @@ type qemuArch interface {
|
||||
// appendPCIeRootPortDevice appends a pcie-root-port device to pcie.0 bus
|
||||
appendPCIeRootPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device
|
||||
|
||||
// appendPCIeSwitch appends a ioh3420 device to a pcie-root-port
|
||||
appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device
|
||||
|
||||
// append vIOMMU device
|
||||
appendIOMMU(devices []govmmQemu.Device) ([]govmmQemu.Device, error)
|
||||
|
||||
@@ -183,7 +186,8 @@ const (
|
||||
defaultBridgeBus = "pcie.0"
|
||||
defaultPCBridgeBus = "pci.0"
|
||||
maxDevIDSize = 31
|
||||
pcieRootPortPrefix = "rp"
|
||||
maxPCIeRootPort = 16 // Limitation from QEMU
|
||||
maxPCIeSwitchPort = 16 // Limitation from QEMU
|
||||
)
|
||||
|
||||
// This is the PCI start address assigned to the first bridge that
|
||||
@@ -677,17 +681,17 @@ func (q *qemuArchBase) appendVhostUserDevice(ctx context.Context, devices []govm
|
||||
}
|
||||
|
||||
func (q *qemuArchBase) appendVFIODevice(devices []govmmQemu.Device, vfioDev config.VFIODev) []govmmQemu.Device {
|
||||
pciDevice := vfioDev.(config.VFIOPCIDev)
|
||||
if pciDevice.BDF == "" {
|
||||
|
||||
if vfioDev.BDF == "" {
|
||||
return devices
|
||||
}
|
||||
|
||||
devices = append(devices,
|
||||
govmmQemu.VFIODevice{
|
||||
BDF: pciDevice.BDF,
|
||||
VendorID: pciDevice.VendorID,
|
||||
DeviceID: pciDevice.DeviceID,
|
||||
Bus: pciDevice.Bus,
|
||||
BDF: vfioDev.BDF,
|
||||
VendorID: vfioDev.VendorID,
|
||||
DeviceID: vfioDev.DeviceID,
|
||||
Bus: vfioDev.Bus,
|
||||
},
|
||||
)
|
||||
|
||||
@@ -803,6 +807,13 @@ func (q *qemuArchBase) appendPCIeRootPortDevice(devices []govmmQemu.Device, numb
|
||||
return genericAppendPCIeRootPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit)
|
||||
}
|
||||
|
||||
// appendPCIeSwitchPortDevice appends a PCIe Switch with <number> ports
|
||||
func (q *qemuArchBase) appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
|
||||
return genericAppendPCIeSwitchPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit)
|
||||
}
|
||||
|
||||
// getBARsMaxAddressableMemory we need to know the BAR sizes to configure the
|
||||
// PCIe Root Port or PCIe Downstream Port attaching a device with huge BARs.
|
||||
func (q *qemuArchBase) getBARsMaxAddressableMemory() (uint64, uint64) {
|
||||
|
||||
pci := nvpci.New()
|
||||
|
||||
@@ -470,7 +470,7 @@ func TestQemuArchBaseAppendVFIODevice(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
vfDevice := config.VFIOPCIDev{
|
||||
vfDevice := config.VFIODev{
|
||||
BDF: bdf,
|
||||
}
|
||||
|
||||
@@ -490,7 +490,7 @@ func TestQemuArchBaseAppendVFIODeviceWithVendorDeviceID(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
vfDevice := config.VFIOPCIDev{
|
||||
vfDevice := config.VFIODev{
|
||||
BDF: bdf,
|
||||
VendorID: vendorID,
|
||||
DeviceID: deviceID,
|
||||
|
||||
@@ -111,9 +111,6 @@ func TestQemuCreateVM(t *testing.T) {
|
||||
config6 := newQemuConfig()
|
||||
config6.DisableGuestSeLinux = false
|
||||
|
||||
config7 := newQemuConfig()
|
||||
config7.PCIeRootPort = 1
|
||||
|
||||
config8 := newQemuConfig()
|
||||
config8.EnableVhostUserStore = true
|
||||
config8.HugePages = true
|
||||
@@ -161,7 +158,6 @@ func TestQemuCreateVM(t *testing.T) {
|
||||
{config3, false, true},
|
||||
{config5, false, true},
|
||||
{config6, false, false},
|
||||
{config7, false, true},
|
||||
{config8, false, true},
|
||||
{config9, true, false},
|
||||
{config10, false, true},
|
||||
|
||||
@@ -32,7 +32,6 @@ import (
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
|
||||
deviceManager "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager"
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
|
||||
resCtrl "github.com/kata-containers/kata-containers/src/runtime/pkg/resourcecontrol"
|
||||
exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental"
|
||||
@@ -101,15 +100,10 @@ type HypervisorPidKey struct{}
|
||||
|
||||
// SandboxStatus describes a sandbox status.
|
||||
type SandboxStatus struct {
|
||||
ContainersStatus []ContainerStatus
|
||||
|
||||
// Annotations allow clients to store arbitrary values,
|
||||
// for example to add additional status values required
|
||||
// to support particular specifications.
|
||||
Annotations map[string]string
|
||||
|
||||
ID string
|
||||
Hypervisor HypervisorType
|
||||
ContainersStatus []ContainerStatus
|
||||
State types.SandboxState
|
||||
HypervisorConfig HypervisorConfig
|
||||
}
|
||||
@@ -176,10 +170,8 @@ type SandboxConfig struct {
|
||||
|
||||
// SharePidNs sets all containers to share the same sandbox level pid namespace.
|
||||
SharePidNs bool
|
||||
|
||||
// SystemdCgroup enables systemd cgroup support
|
||||
SystemdCgroup bool
|
||||
|
||||
// SandboxCgroupOnly enables cgroup only at podlevel in the host
|
||||
SandboxCgroupOnly bool
|
||||
|
||||
@@ -623,22 +615,49 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
|
||||
|
||||
// If we have a confidential guest we need to cold-plug the PCIe VFIO devices
|
||||
// until we have TDISP/IDE PCIe support.
|
||||
coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != hv.NoPort)
|
||||
var devs []config.DeviceInfo
|
||||
coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort)
|
||||
// Aggregate all the containner devices for hot-plug and use them to dedcue
|
||||
// the correct amount of ports to reserve for the hypervisor.
|
||||
hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort)
|
||||
|
||||
var vfioDevices []config.DeviceInfo
|
||||
// vhost-user-block device is a PCIe device in Virt, keep track of it
|
||||
// for correct number of PCIe root ports.
|
||||
var vhostUserBlkDevices []config.DeviceInfo
|
||||
|
||||
for cnt, containers := range sandboxConfig.Containers {
|
||||
for dev, device := range containers.DeviceInfos {
|
||||
if coldPlugVFIO && deviceManager.IsVFIO(device.ContainerPath) {
|
||||
|
||||
if deviceManager.IsVhostUserBlk(device) {
|
||||
vhostUserBlkDevices = append(vhostUserBlkDevices, device)
|
||||
continue
|
||||
}
|
||||
isVFIO := deviceManager.IsVFIO(device.ContainerPath)
|
||||
if hotPlugVFIO && isVFIO {
|
||||
vfioDevices = append(vfioDevices, device)
|
||||
sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO
|
||||
}
|
||||
if coldPlugVFIO && isVFIO {
|
||||
device.ColdPlug = true
|
||||
devs = append(devs, device)
|
||||
device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO
|
||||
vfioDevices = append(vfioDevices, device)
|
||||
// We need to remove the devices marked for cold-plug
|
||||
// otherwise at the container level the kata-agent
|
||||
// will try to hot-plug them.
|
||||
infos := sandboxConfig.Containers[cnt].DeviceInfos
|
||||
infos = append(infos[:dev], infos[dev+1:]...)
|
||||
sandboxConfig.Containers[cnt].DeviceInfos = infos
|
||||
sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging"
|
||||
}
|
||||
}
|
||||
var filteredDevices []config.DeviceInfo
|
||||
for _, device := range containers.DeviceInfos {
|
||||
if device.ID != "remove-we-are-cold-plugging" {
|
||||
filteredDevices = append(filteredDevices, device)
|
||||
}
|
||||
}
|
||||
sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices
|
||||
|
||||
}
|
||||
sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices
|
||||
sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices
|
||||
|
||||
// store doesn't require hypervisor to be stored immediately
|
||||
if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil {
|
||||
@@ -653,7 +672,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
|
||||
return s, nil
|
||||
}
|
||||
|
||||
for _, dev := range devs {
|
||||
for _, dev := range vfioDevices {
|
||||
_, err := s.AddDevice(ctx, dev)
|
||||
if err != nil {
|
||||
s.Logger().WithError(err).Debug("Cannot cold-plug add device")
|
||||
@@ -1709,7 +1728,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
|
||||
defer span.End()
|
||||
|
||||
for i := range s.config.Containers {
|
||||
|
||||
c, err := newContainer(ctx, s, &s.config.Containers[i])
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -1728,7 +1746,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
|
||||
if err := s.updateResources(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := s.resourceControllerUpdate(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -1740,7 +1757,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
|
||||
if err := s.storeSandbox(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -1904,15 +1920,11 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy
|
||||
// adding a group of VFIO devices
|
||||
for _, dev := range vfioDevices {
|
||||
if _, err := s.hypervisor.HotplugAddDevice(ctx, dev, VfioDev); err != nil {
|
||||
bdf := ""
|
||||
if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok {
|
||||
bdf = pciDevice.BDF
|
||||
}
|
||||
s.Logger().
|
||||
WithFields(logrus.Fields{
|
||||
"sandbox": s.id,
|
||||
"vfio-device-ID": (*dev).GetID(),
|
||||
"vfio-device-BDF": bdf,
|
||||
"vfio-device-ID": dev.ID,
|
||||
"vfio-device-BDF": dev.BDF,
|
||||
}).WithError(err).Error("failed to hotplug VFIO device")
|
||||
return err
|
||||
}
|
||||
@@ -1927,6 +1939,7 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy
|
||||
return err
|
||||
case config.VhostUserBlk:
|
||||
vhostUserBlkDevice, ok := device.(*drivers.VhostUserBlkDevice)
|
||||
|
||||
if !ok {
|
||||
return fmt.Errorf("device type mismatch, expect device type to be %s", devType)
|
||||
}
|
||||
@@ -1961,15 +1974,11 @@ func (s *Sandbox) HotplugRemoveDevice(ctx context.Context, device api.Device, de
|
||||
// remove a group of VFIO devices
|
||||
for _, dev := range vfioDevices {
|
||||
if _, err := s.hypervisor.HotplugRemoveDevice(ctx, dev, VfioDev); err != nil {
|
||||
bdf := ""
|
||||
if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok {
|
||||
bdf = pciDevice.BDF
|
||||
}
|
||||
s.Logger().WithError(err).
|
||||
WithFields(logrus.Fields{
|
||||
"sandbox": s.id,
|
||||
"vfio-device-ID": (*dev).GetID(),
|
||||
"vfio-device-BDF": bdf,
|
||||
"vfio-device-ID": dev.ID,
|
||||
"vfio-device-BDF": dev.BDF,
|
||||
}).Error("failed to hot unplug VFIO device")
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -593,11 +593,11 @@ func TestSandboxAttachDevicesVFIO(t *testing.T) {
|
||||
_, err = os.Create(deviceFile)
|
||||
assert.Nil(t, err)
|
||||
|
||||
savedIOMMUPath := config.SysIOMMUPath
|
||||
config.SysIOMMUPath = tmpDir
|
||||
savedIOMMUPath := config.SysIOMMUGroupPath
|
||||
config.SysIOMMUGroupPath = tmpDir
|
||||
|
||||
defer func() {
|
||||
config.SysIOMMUPath = savedIOMMUPath
|
||||
config.SysIOMMUGroupPath = savedIOMMUPath
|
||||
}()
|
||||
|
||||
dm := manager.NewDeviceManager(config.VirtioSCSI, false, "", 0, nil)
|
||||
|
||||
Reference in New Issue
Block a user