Merge pull request #4492 from zvonkok/pcie-topology

runtime: fix PCIe topology for GPUDirect use-case
This commit is contained in:
Peng Tao
2023-07-03 09:17:12 +08:00
committed by GitHub
35 changed files with 952 additions and 560 deletions

View File

@@ -17,7 +17,7 @@ import (
"github.com/prometheus/procfs"
"github.com/urfave/cli"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
"github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
@@ -113,8 +113,8 @@ type HypervisorInfo struct {
SocketPath string
Msize9p uint32
MemorySlots uint32
PCIeRootPort uint32
ColdPlugVFIO hv.PCIePort
HotPlugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
HotplugVFIOOnRootBus bool
Debug bool
}
@@ -317,9 +317,9 @@ func getHypervisorInfo(config oci.RuntimeConfig) (HypervisorInfo, error) {
EntropySource: config.HypervisorConfig.EntropySource,
SharedFS: config.HypervisorConfig.SharedFS,
VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon,
HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO,
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
PCIeRootPort: config.HypervisorConfig.PCIeRootPort,
SocketPath: socketPath,
}, nil
}

View File

@@ -19,12 +19,12 @@ import (
"testing"
"github.com/BurntSushi/toml"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
vcUtils "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/urfave/cli"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
@@ -74,8 +74,9 @@ func createConfig(configPath string, fileData string) error {
return nil
}
func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeConfig, err error) {
var coldPlugVFIO hv.PCIePort
func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.RuntimeConfig, err error) {
var hotPlugVFIO config.PCIePort
var coldPlugVFIO config.PCIePort
const logPath = "/log/path"
hypervisorPath := filepath.Join(prefixDir, "hypervisor")
kernelPath := filepath.Join(prefixDir, "kernel")
@@ -87,8 +88,8 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
blockStorageDriver := "virtio-scsi"
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
coldPlugVFIO = hv.NoPort
hotPlugVFIO = config.BridgePort
coldPlugVFIO = config.NoPort
disableNewNetNs := false
sharedFS := "virtio-9p"
virtioFSdaemon := filepath.Join(prefixDir, "virtiofsd")
@@ -132,8 +133,8 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
BlockDeviceDriver: blockStorageDriver,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
PCIeRootPort: pcieRootPort,
DisableNewNetNs: disableNewNetNs,
DefaultVCPUCount: hypConfig.NumVCPUs,
DefaultMaxVCPUCount: hypConfig.DefaultMaxVCPUs,
@@ -156,12 +157,12 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
return "", oci.RuntimeConfig{}, err
}
_, config, err = katautils.LoadConfiguration(configFile, true)
_, ociConfig, err = katautils.LoadConfiguration(configFile, true)
if err != nil {
return "", oci.RuntimeConfig{}, err
}
return configFile, config, nil
return configFile, ociConfig, nil
}
func getExpectedAgentDetails(config oci.RuntimeConfig) (AgentInfo, error) {
@@ -277,7 +278,7 @@ func getExpectedHypervisor(config oci.RuntimeConfig) HypervisorInfo {
VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon,
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
PCIeRootPort: config.HypervisorConfig.PCIeRootPort,
HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO,
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
}

View File

@@ -357,8 +357,15 @@ pflashes = []
# Default false
#hotplug_vfio_on_root_bus = true
# Enable hot-plugging of VFIO devices to a bridge-port,
# root-port or switch-port.
# The default setting is "no-port"
#hot_plug_vfio = "root-port"
# In a confidential compute environment hot-plugging can compromise
# security. Enable cold-plugging of VFIO devices to a root-port.
# security.
# Enable cold-plugging of VFIO devices to a bridge-port,
# root-port or switch-port.
# The default setting is "no-port", which means disabled.
#cold_plug_vfio = "root-port"

View File

@@ -20,7 +20,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/assert"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
@@ -308,8 +308,9 @@ func TestCreateContainerConfigFail(t *testing.T) {
assert.Error(err)
}
func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err error) {
var coldPlugVFIO hv.PCIePort
func createAllRuntimeConfigFiles(dir, hypervisor string) (runtimeConfig string, err error) {
var hotPlugVFIO config.PCIePort
var coldPlugVFIO config.PCIePort
if dir == "" {
return "", fmt.Errorf("BUG: need directory")
}
@@ -330,11 +331,11 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err
blockDeviceDriver := "virtio-scsi"
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
disableNewNetNs := false
sharedFS := "virtio-9p"
virtioFSdaemon := path.Join(dir, "virtiofsd")
coldPlugVFIO = hv.RootPort
hotPlugVFIO = config.BridgePort
coldPlugVFIO = config.RootPort
configFileOptions := ktu.RuntimeConfigOptions{
Hypervisor: "qemu",
@@ -349,10 +350,10 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err
BlockDeviceDriver: blockDeviceDriver,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
DisableNewNetNs: disableNewNetNs,
SharedFS: sharedFS,
VirtioFSDaemon: virtioFSdaemon,
HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
}

View File

@@ -125,14 +125,117 @@ const (
// SysDevPrefix is static string of /sys/dev
var SysDevPrefix = "/sys/dev"
// SysIOMMUPath is static string of /sys/kernel/iommu_groups
var SysIOMMUPath = "/sys/kernel/iommu_groups"
// SysIOMMUGroupPath is static string of /sys/kernel/iommu_groups
var SysIOMMUGroupPath = "/sys/kernel/iommu_groups"
// SysBusPciDevicesPath is static string of /sys/bus/pci/devices
var SysBusPciDevicesPath = "/sys/bus/pci/devices"
var getSysDevPath = getSysDevPathImpl
// PCIePortBusPrefix gives us the correct bus nameing dependeing on the port
// used to hot(cold)-plug the device
type PCIePortBusPrefix string
const (
PCIeRootPortPrefix PCIePortBusPrefix = "rp"
PCIeSwitchPortPrefix PCIePortBusPrefix = "sw"
PCIeSwitchUpstreamPortPrefix PCIePortBusPrefix = "swup"
PCIeSwitchhDownstreamPortPrefix PCIePortBusPrefix = "swdp"
PCIBridgePortPrefix PCIePortBusPrefix = "bp"
)
func (p PCIePortBusPrefix) String() string {
switch p {
case PCIeRootPortPrefix:
fallthrough
case PCIeSwitchPortPrefix:
fallthrough
case PCIeSwitchUpstreamPortPrefix:
fallthrough
case PCIeSwitchhDownstreamPortPrefix:
fallthrough
case PCIBridgePortPrefix:
return string(p)
}
return fmt.Sprintf("<unknown PCIePortBusPrefix: %s>", string(p))
}
// PCIePort distinguish only between root and switch port
type PCIePort string
const (
// RootPort attach VFIO devices to a root-port
RootPort PCIePort = "root-port"
// SwitchPort attach VFIO devices to a switch-port
SwitchPort = "switch-port"
// BridgePort is the default
BridgePort = "bridge-port"
// NoPort is for disabling VFIO hotplug/coldplug
NoPort = "no-port"
// InvalidPort is for invalid port
InvalidPort = "invalid-port"
)
func (p PCIePort) String() string {
switch p {
case RootPort:
fallthrough
case SwitchPort:
fallthrough
case BridgePort:
fallthrough
case NoPort:
fallthrough
case InvalidPort:
return string(p)
}
return fmt.Sprintf("<unknown PCIePort: %s>", string(p))
}
var PCIePortPrefixMapping = map[PCIePort]PCIePortBusPrefix{
RootPort: PCIeRootPortPrefix,
SwitchPort: PCIeSwitchhDownstreamPortPrefix,
BridgePort: PCIBridgePortPrefix,
}
func (p PCIePort) Invalid() bool {
switch p {
case RootPort:
fallthrough
case SwitchPort:
fallthrough
case BridgePort:
fallthrough
case NoPort:
return false
}
return true
}
func (p PCIePort) Valid() bool {
switch p {
case RootPort:
fallthrough
case SwitchPort:
fallthrough
case BridgePort:
fallthrough
case NoPort:
return true
}
return false
}
type PCIePortMapping map[string]bool
var (
// Each of this structures keeps track of the devices attached to the
// different types of PCI ports. We can deduces the Bus number from it
// and eliminate duplicates being assigned.
PCIeDevices = map[PCIePort]PCIePortMapping{}
)
// DeviceInfo is an embedded type that contains device data common to all types of devices.
type DeviceInfo struct {
// DriverOptions is specific options for each device driver
@@ -178,6 +281,9 @@ type DeviceInfo struct {
// ColdPlug specifies whether the device must be cold plugged (true)
// or hot plugged (false).
ColdPlug bool
// Specifies the PCIe port type to which the device is attached
Port PCIePort
}
// BlockDrive represents a block storage drive which may be used in case the storage
@@ -279,14 +385,8 @@ const (
VFIOAPDeviceMediatedType
)
type VFIODev interface {
GetID() *string
GetType() VFIODeviceType
GetSysfsDev() *string
}
// VFIOPCIDev represents a VFIO PCI device used for hotplugging
type VFIOPCIDev struct {
// VFIODev represents a VFIO PCI device used for hotplugging
type VFIODev struct {
// ID is used to identify this drive in the hypervisor options.
ID string
@@ -316,44 +416,15 @@ type VFIOPCIDev struct {
// IsPCIe specifies device is PCIe or PCI
IsPCIe bool
}
func (d VFIOPCIDev) GetID() *string {
return &d.ID
}
func (d VFIOPCIDev) GetType() VFIODeviceType {
return d.Type
}
func (d VFIOPCIDev) GetSysfsDev() *string {
return &d.SysfsDev
}
type VFIOAPDev struct {
// ID is used to identify this drive in the hypervisor options.
ID string
// sysfsdev of VFIO mediated device
SysfsDev string
// APDevices are the Adjunct Processor devices assigned to the mdev
APDevices []string
// Type of VFIO device
Type VFIODeviceType
}
// Rank identifies a device in a IOMMU group
Rank int
func (d VFIOAPDev) GetID() *string {
return &d.ID
}
func (d VFIOAPDev) GetType() VFIODeviceType {
return d.Type
}
func (d VFIOAPDev) GetSysfsDev() *string {
return &d.SysfsDev
// Port is the PCIe port type to which the device is attached
Port PCIePort
}
// RNGDev represents a random number generator device

View File

@@ -47,9 +47,9 @@ func deviceLogger() *logrus.Entry {
return api.DeviceLogger()
}
// Identify PCIe device by reading the size of the PCI config space
// IsPCIeDevice identifies PCIe device by reading the size of the PCI config space
// Plain PCI device have 256 bytes of config space where PCIe devices have 4K
func isPCIeDevice(bdf string) bool {
func IsPCIeDevice(bdf string) bool {
if len(strings.Split(bdf, ":")) == 2 {
bdf = PCIDomain + ":" + bdf
}
@@ -157,14 +157,12 @@ func checkIgnorePCIClass(pciClass string, deviceBDF string, bitmask uint64) (boo
// GetAllVFIODevicesFromIOMMUGroup returns all the VFIO devices in the IOMMU group
// We can reuse this function at various levels, sandbox, container.
// Only the VFIO module is allowed to do bus assignments, all other modules need to
// ignore it if used as helper function to get VFIO information.
func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignment bool) ([]*config.VFIODev, error) {
func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODev, error) {
vfioDevs := []*config.VFIODev{}
vfioGroup := filepath.Base(device.HostPath)
iommuDevicesPath := filepath.Join(config.SysIOMMUPath, vfioGroup, "devices")
iommuDevicesPath := filepath.Join(config.SysIOMMUGroupPath, vfioGroup, "devices")
deviceFiles, err := os.ReadDir(iommuDevicesPath)
if err != nil {
@@ -174,7 +172,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme
// Pass all devices in iommu group
for i, deviceFile := range deviceFiles {
//Get bdf of device eg 0000:00:1c.0
deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(deviceFile.Name(), iommuDevicesPath)
deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(deviceFile.Name(), iommuDevicesPath)
if err != nil {
return nil, err
}
@@ -196,27 +194,24 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
isPCIe := isPCIeDevice(deviceBDF)
// Do not directly assign to `vfio` -- need to access field still
vfioPCI := config.VFIOPCIDev{
vfio = config.VFIODev{
ID: id,
Type: vfioDeviceType,
BDF: deviceBDF,
SysfsDev: deviceSysfsDev,
IsPCIe: isPCIe,
IsPCIe: IsPCIeDevice(deviceBDF),
Class: pciClass,
Rank: -1,
Port: device.Port,
}
if isPCIe && !ignoreBusAssignment {
vfioPCI.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs))
AllPCIeDevs[deviceBDF] = true
}
vfio = vfioPCI
case config.VFIOAPDeviceMediatedType:
devices, err := GetAPVFIODevices(deviceSysfsDev)
if err != nil {
return nil, err
}
vfio = config.VFIOAPDev{
vfio = config.VFIODev{
ID: id,
SysfsDev: deviceSysfsDev,
Type: config.VFIOAPDeviceMediatedType,

View File

@@ -28,14 +28,9 @@ const (
vfioRemoveIDPath = "/sys/bus/pci/drivers/vfio-pci/remove_id"
iommuGroupPath = "/sys/bus/pci/devices/%s/iommu_group"
vfioDevPath = "/dev/vfio/%s"
pcieRootPortPrefix = "rp"
vfioAPSysfsDir = "/sys/devices/vfio_ap"
)
var (
AllPCIeDevs = map[string]bool{}
)
// VFIODevice is a vfio device meant to be passed to the hypervisor
// to be used by the Virtual Machine.
type VFIODevice struct {
@@ -70,10 +65,17 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece
}
}()
device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo, false)
device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo)
if err != nil {
return err
}
for _, vfio := range device.VfioDevs {
if vfio.IsPCIe {
busIndex := len(config.PCIeDevices[vfio.Port])
vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex)
config.PCIeDevices[vfio.Port][vfio.BDF] = true
}
}
coldPlug := device.DeviceInfo.ColdPlug
deviceLogger().WithField("cold-plug", coldPlug).Info("Attaching VFIO device")
@@ -169,23 +171,18 @@ func (device *VFIODevice) Load(ds config.DeviceState) {
for _, dev := range ds.VFIODevs {
var vfio config.VFIODev
vfioDeviceType := (*device.VfioDevs[0]).GetType()
switch vfioDeviceType {
switch dev.Type {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
bdf := ""
if pciDev, ok := (*dev).(config.VFIOPCIDev); ok {
bdf = pciDev.BDF
}
vfio = config.VFIOPCIDev{
ID: *(*dev).GetID(),
Type: config.VFIODeviceType((*dev).GetType()),
BDF: bdf,
SysfsDev: *(*dev).GetSysfsDev(),
vfio = config.VFIODev{
ID: dev.ID,
Type: config.VFIODeviceType(dev.Type),
BDF: dev.BDF,
SysfsDev: dev.SysfsDev,
}
case config.VFIOAPDeviceMediatedType:
vfio = config.VFIOAPDev{
ID: *(*dev).GetID(),
SysfsDev: *(*dev).GetSysfsDev(),
vfio = config.VFIODev{
ID: dev.ID,
SysfsDev: dev.SysfsDev,
}
default:
deviceLogger().WithError(
@@ -200,7 +197,7 @@ func (device *VFIODevice) Load(ds config.DeviceState) {
// It should implement GetAttachCount() and DeviceID() as api.Device implementation
// here it shares function from *GenericDevice so we don't need duplicate codes
func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
func GetVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
vfioDeviceType, err = GetVFIODeviceType(sysfsDevStr)
if err != nil {
@@ -210,14 +207,18 @@ func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceS
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType:
// Get bdf of device eg. 0000:00:1c.0
deviceBDF = getBDF(deviceFileName)
// OLD IMPL: deviceBDF = getBDF(deviceFileName)
// The old implementation did not consider the case where
// vfio devices are located on different root busses. The
// kata-agent will handle the case now, here, use the full PCI addr
deviceBDF = deviceFileName
// Get sysfs path used by cloud-hypervisor
deviceSysfsDev = filepath.Join(config.SysBusPciDevicesPath, deviceFileName)
case config.VFIOPCIDeviceMediatedType:
// Get sysfsdev of device eg. /sys/devices/pci0000:00/0000:00:02.0/f79944e4-5a3d-11e8-99ce-479cbab002e4
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
deviceSysfsDev, err = GetSysfsDev(sysfsDevStr)
deviceBDF = getBDF(getMediatedBDF(deviceSysfsDev))
deviceBDF = GetBDF(getMediatedBDF(deviceSysfsDev))
case config.VFIOAPDeviceMediatedType:
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
deviceSysfsDev, err = GetSysfsDev(sysfsDevStr)
@@ -240,7 +241,7 @@ func getMediatedBDF(deviceSysfsDev string) string {
// getBDF returns the BDF of pci device
// Expected input string format is [<domain>]:[<bus>][<slot>].[<func>] eg. 0000:02:10.0
func getBDF(deviceSysStr string) string {
func GetBDF(deviceSysStr string) string {
tokens := strings.SplitN(deviceSysStr, ":", 2)
if len(tokens) == 1 {
return ""

View File

@@ -20,7 +20,7 @@ func TestGetVFIODetails(t *testing.T) {
}
data := []testData{
{"0000:02:10.0", "02:10.0"},
{"0000:02:10.0", "0000:02:10.0"},
{"0000:0210.0", ""},
{"f79944e4-5a3d-11e8-99ce-", ""},
{"f79944e4-5a3d-11e8-99ce", ""},
@@ -29,7 +29,7 @@ func TestGetVFIODetails(t *testing.T) {
}
for _, d := range data {
deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(d.deviceStr, "")
deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(d.deviceStr, "")
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType:

View File

@@ -71,7 +71,11 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS
dm.blockDriver = config.VirtioSCSI
}
drivers.AllPCIeDevs = make(map[string]bool)
config.PCIeDevices = make(map[config.PCIePort]config.PCIePortMapping)
config.PCIeDevices[config.RootPort] = make(map[string]bool)
config.PCIeDevices[config.SwitchPort] = make(map[string]bool)
config.PCIeDevices[config.BridgePort] = make(map[string]bool)
for _, dev := range devices {
dm.devices[dev.DeviceID()] = dev
@@ -118,7 +122,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device
}
if IsVFIO(devInfo.HostPath) {
return drivers.NewVFIODevice(&devInfo), nil
} else if isVhostUserBlk(devInfo) {
} else if IsVhostUserBlk(devInfo) {
if devInfo.DriverOptions == nil {
devInfo.DriverOptions = make(map[string]string)
}

View File

@@ -116,14 +116,14 @@ func TestAttachVFIODevice(t *testing.T) {
_, err = os.Create(deviceConfigFile)
assert.Nil(t, err)
savedIOMMUPath := config.SysIOMMUPath
config.SysIOMMUPath = tmpDir
savedIOMMUPath := config.SysIOMMUGroupPath
config.SysIOMMUGroupPath = tmpDir
savedSysBusPciDevicesPath := config.SysBusPciDevicesPath
config.SysBusPciDevicesPath = devicesDir
defer func() {
config.SysIOMMUPath = savedIOMMUPath
config.SysIOMMUGroupPath = savedIOMMUPath
config.SysBusPciDevicesPath = savedSysBusPciDevicesPath
}()

View File

@@ -37,7 +37,7 @@ func isBlock(devInfo config.DeviceInfo) bool {
}
// isVhostUserBlk checks if the device is a VhostUserBlk device.
func isVhostUserBlk(devInfo config.DeviceInfo) bool {
func IsVhostUserBlk(devInfo config.DeviceInfo) bool {
return devInfo.DevType == "b" && devInfo.Major == config.VhostUserBlkMajor
}

View File

@@ -70,7 +70,7 @@ func TestIsVhostUserBlk(t *testing.T) {
}
for _, d := range data {
isVhostUserBlk := isVhostUserBlk(
isVhostUserBlk := IsVhostUserBlk(
config.DeviceInfo{
DevType: d.devType,
Major: d.major,

View File

@@ -123,6 +123,14 @@ const (
// PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port.
PCIeRootPort DeviceDriver = "pcie-root-port"
// PCIeSwitchUpstreamPort is a PCIe switch upstream port
// A upstream port connects to a PCIe Root Port
PCIeSwitchUpstreamPort DeviceDriver = "x3130-upstream"
// PCIeSwitchDownstreamPort is a PCIe switch downstream port
// PCIe devices can be hot-plugged to the downstream port.
PCIeSwitchDownstreamPort DeviceDriver = "xio3130-downstream"
// Loader is the Loader device driver.
Loader DeviceDriver = "loader"
@@ -236,6 +244,7 @@ const (
// SecExecGuest represents an s390x Secure Execution (Protected Virtualization in QEMU) object
SecExecGuest ObjectType = "s390-pv-guest"
// PEFGuest represent ppc64le PEF(Protected Execution Facility) object.
PEFGuest ObjectType = "pef-guest"
)
@@ -369,7 +378,6 @@ func (object Object) QemuParams(config *Config) []string {
deviceParams = append(deviceParams, string(object.Driver))
deviceParams = append(deviceParams, fmt.Sprintf("id=%s", object.DeviceID))
deviceParams = append(deviceParams, fmt.Sprintf("host-path=%s", object.File))
}
if len(deviceParams) > 0 {
@@ -1681,6 +1689,106 @@ func (b PCIeRootPortDevice) Valid() bool {
return true
}
// PCIeSwitchUpstreamPortDevice is the port connecting to the root port
type PCIeSwitchUpstreamPortDevice struct {
ID string // format: sup{n}, n>=0
Bus string // default is rp0
}
// QemuParams returns the qemu parameters built out of the PCIeSwitchUpstreamPortDevice.
func (b PCIeSwitchUpstreamPortDevice) QemuParams(config *Config) []string {
var qemuParams []string
var deviceParams []string
driver := PCIeSwitchUpstreamPort
deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID))
deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus))
qemuParams = append(qemuParams, "-device")
qemuParams = append(qemuParams, strings.Join(deviceParams, ","))
return qemuParams
}
// Valid returns true if the PCIeSwitchUpstreamPortDevice structure is valid and complete.
func (b PCIeSwitchUpstreamPortDevice) Valid() bool {
if b.ID == "" {
return false
}
if b.Bus == "" {
return false
}
return true
}
// PCIeSwitchDownstreamPortDevice is the port connecting to the root port
type PCIeSwitchDownstreamPortDevice struct {
ID string // format: sup{n}, n>=0
Bus string // default is rp0
Chassis string // (slot, chassis) pair is mandatory and must be unique for each downstream port, >=0, default is 0x00
Slot string // >=0, default is 0x00
// This to work needs patches to QEMU
BusReserve string
// Pref64 and Pref32 are not allowed to be set simultaneously
Pref64Reserve string // reserve prefetched MMIO aperture, 64-bit
Pref32Reserve string // reserve prefetched MMIO aperture, 32-bit
MemReserve string // reserve non-prefetched MMIO aperture, 32-bit *only*
IOReserve string // IO reservation
}
// QemuParams returns the qemu parameters built out of the PCIeSwitchUpstreamPortDevice.
func (b PCIeSwitchDownstreamPortDevice) QemuParams(config *Config) []string {
var qemuParams []string
var deviceParams []string
driver := PCIeSwitchDownstreamPort
deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID))
deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus))
deviceParams = append(deviceParams, fmt.Sprintf("chassis=%s", b.Chassis))
deviceParams = append(deviceParams, fmt.Sprintf("slot=%s", b.Slot))
if b.BusReserve != "" {
deviceParams = append(deviceParams, fmt.Sprintf("bus-reserve=%s", b.BusReserve))
}
if b.Pref64Reserve != "" {
deviceParams = append(deviceParams, fmt.Sprintf("pref64-reserve=%s", b.Pref64Reserve))
}
if b.Pref32Reserve != "" {
deviceParams = append(deviceParams, fmt.Sprintf("pref32-reserve=%s", b.Pref32Reserve))
}
if b.MemReserve != "" {
deviceParams = append(deviceParams, fmt.Sprintf("mem-reserve=%s", b.MemReserve))
}
if b.IOReserve != "" {
deviceParams = append(deviceParams, fmt.Sprintf("io-reserve=%s", b.IOReserve))
}
qemuParams = append(qemuParams, "-device")
qemuParams = append(qemuParams, strings.Join(deviceParams, ","))
return qemuParams
}
// Valid returns true if the PCIeSwitchUpstremPortDevice structure is valid and complete.
func (b PCIeSwitchDownstreamPortDevice) Valid() bool {
if b.ID == "" {
return false
}
if b.Bus == "" {
return false
}
if b.Chassis == "" {
return false
}
if b.Slot == "" {
return false
}
return true
}
// VFIODevice represents a qemu vfio device meant for direct access by guest OS.
type VFIODevice struct {
// Bus-Device-Function of device

View File

@@ -5,7 +5,7 @@
package hypervisors
import "fmt"
import "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
// Bridge is a bridge where devices can be hot plugged
type Bridge struct {
@@ -28,37 +28,8 @@ type CPUDevice struct {
ID string
}
// PCIePort distinguish only between root and switch port
type PCIePort string
const (
// RootPort attach VFIO devices to a root-port
RootPort PCIePort = "root-port"
// SwitchPort attach VFIO devices to a switch-port
SwitchPort = "switch-port"
// BridgePort is the default
BridgePort = "bridge-port"
// NoPort is for disabling VFIO hotplug/coldplug
NoPort = "no-port"
)
func (p PCIePort) String() string {
switch p {
case RootPort:
return "root-port"
case SwitchPort:
return "switch-port"
case BridgePort:
return "bridge-port"
case NoPort:
return "no-port"
}
return fmt.Sprintf("<unknown PCIePort: %s>", string(p))
}
type HypervisorState struct {
BlockIndexMap map[int]struct{}
// Type of hypervisor, E.g. qemu/firecracker/acrn.
Type string
UUID string
@@ -74,7 +45,7 @@ type HypervisorState struct {
HotpluggedMemory int
VirtiofsDaemonPid int
Pid int
PCIeRootPort int
ColdPlugVFIO PCIePort
HotPlugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
HotplugVFIOOnRootBus bool
}

View File

@@ -14,7 +14,7 @@ import (
"strconv"
"testing"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/assert"
)
@@ -224,8 +224,8 @@ type RuntimeConfigOptions struct {
JaegerUser string
JaegerPassword string
PFlash []string
PCIeRootPort uint32
ColdPlugVFIO hv.PCIePort
HotPlugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
DefaultVCPUCount uint32
DefaultMaxVCPUCount uint32
DefaultMemSize uint32
@@ -318,7 +318,6 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string {
disable_block_device_use = ` + strconv.FormatBool(config.DisableBlock) + `
enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + `
hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + `
pcie_root_port = ` + strconv.FormatUint(uint64(config.PCIeRootPort), 10) + `
cold_plug_vfio = "` + config.ColdPlugVFIO.String() + `"
msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + `
enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + `

View File

@@ -10,7 +10,7 @@
package katautils
import (
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
config "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
)
// name is the name of the runtime
@@ -82,7 +82,6 @@ const defaultEnableDebug bool = false
const defaultDisableNestingChecks bool = false
const defaultMsize9p uint32 = 8192
const defaultHotplugVFIOOnRootBus bool = false
const defaultPCIeRootPort = 0
const defaultEntropySource = "/dev/urandom"
const defaultGuestHookPath string = ""
const defaultVirtioFSCacheMode = "never"
@@ -108,4 +107,5 @@ const defaultVMCacheEndpoint string = "/var/run/kata-containers/cache.sock"
// Default config file used by stateless systems.
var defaultRuntimeConfiguration = "@CONFIG_PATH@"
const defaultColdPlugVFIO = hv.NoPort
const defaultHotPlugVFIO = config.NoPort
const defaultColdPlugVFIO = config.NoPort

View File

@@ -20,7 +20,6 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/govmm"
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
@@ -78,88 +77,88 @@ type factory struct {
}
type hypervisor struct {
Path string `toml:"path"`
JailerPath string `toml:"jailer_path"`
Kernel string `toml:"kernel"`
CtlPath string `toml:"ctlpath"`
Initrd string `toml:"initrd"`
Image string `toml:"image"`
RootfsType string `toml:"rootfs_type"`
Firmware string `toml:"firmware"`
FirmwareVolume string `toml:"firmware_volume"`
MachineAccelerators string `toml:"machine_accelerators"`
CPUFeatures string `toml:"cpu_features"`
KernelParams string `toml:"kernel_params"`
MachineType string `toml:"machine_type"`
BlockDeviceDriver string `toml:"block_device_driver"`
EntropySource string `toml:"entropy_source"`
SharedFS string `toml:"shared_fs"`
VirtioFSDaemon string `toml:"virtio_fs_daemon"`
VirtioFSCache string `toml:"virtio_fs_cache"`
VhostUserStorePath string `toml:"vhost_user_store_path"`
FileBackedMemRootDir string `toml:"file_mem_backend"`
GuestHookPath string `toml:"guest_hook_path"`
GuestMemoryDumpPath string `toml:"guest_memory_dump_path"`
SeccompSandbox string `toml:"seccompsandbox"`
BlockDeviceAIO string `toml:"block_device_aio"`
HypervisorPathList []string `toml:"valid_hypervisor_paths"`
JailerPathList []string `toml:"valid_jailer_paths"`
CtlPathList []string `toml:"valid_ctlpaths"`
VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"`
VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"`
PFlashList []string `toml:"pflashes"`
VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"`
FileBackedMemRootList []string `toml:"valid_file_mem_backends"`
EntropySourceList []string `toml:"valid_entropy_sources"`
EnableAnnotations []string `toml:"enable_annotations"`
RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"`
TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"`
MemOffset uint64 `toml:"memory_offset"`
DefaultMaxMemorySize uint64 `toml:"default_maxmemory"`
DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"`
DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"`
DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"`
DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"`
NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"`
NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"`
NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"`
NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"`
VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"`
VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"`
DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"`
MemorySize uint32 `toml:"default_memory"`
MemSlots uint32 `toml:"memory_slots"`
DefaultBridges uint32 `toml:"default_bridges"`
Msize9p uint32 `toml:"msize_9p"`
PCIeRootPort uint32 `toml:"pcie_root_port"`
NumVCPUs int32 `toml:"default_vcpus"`
BlockDeviceCacheSet bool `toml:"block_device_cache_set"`
BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"`
BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"`
EnableVhostUserStore bool `toml:"enable_vhost_user_store"`
VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"`
DisableBlockDeviceUse bool `toml:"disable_block_device_use"`
MemPrealloc bool `toml:"enable_mem_prealloc"`
HugePages bool `toml:"enable_hugepages"`
VirtioMem bool `toml:"enable_virtio_mem"`
IOMMU bool `toml:"enable_iommu"`
IOMMUPlatform bool `toml:"enable_iommu_platform"`
Debug bool `toml:"enable_debug"`
DisableNestingChecks bool `toml:"disable_nesting_checks"`
EnableIOThreads bool `toml:"enable_iothreads"`
DisableImageNvdimm bool `toml:"disable_image_nvdimm"`
HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"`
ColdPlugVFIO hv.PCIePort `toml:"cold_plug_vfio"`
DisableVhostNet bool `toml:"disable_vhost_net"`
GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"`
ConfidentialGuest bool `toml:"confidential_guest"`
SevSnpGuest bool `toml:"sev_snp_guest"`
GuestSwap bool `toml:"enable_guest_swap"`
Rootless bool `toml:"rootless"`
DisableSeccomp bool `toml:"disable_seccomp"`
DisableSeLinux bool `toml:"disable_selinux"`
DisableGuestSeLinux bool `toml:"disable_guest_selinux"`
LegacySerial bool `toml:"use_legacy_serial"`
Path string `toml:"path"`
JailerPath string `toml:"jailer_path"`
Kernel string `toml:"kernel"`
CtlPath string `toml:"ctlpath"`
Initrd string `toml:"initrd"`
Image string `toml:"image"`
RootfsType string `toml:"rootfs_type"`
Firmware string `toml:"firmware"`
FirmwareVolume string `toml:"firmware_volume"`
MachineAccelerators string `toml:"machine_accelerators"`
CPUFeatures string `toml:"cpu_features"`
KernelParams string `toml:"kernel_params"`
MachineType string `toml:"machine_type"`
BlockDeviceDriver string `toml:"block_device_driver"`
EntropySource string `toml:"entropy_source"`
SharedFS string `toml:"shared_fs"`
VirtioFSDaemon string `toml:"virtio_fs_daemon"`
VirtioFSCache string `toml:"virtio_fs_cache"`
VhostUserStorePath string `toml:"vhost_user_store_path"`
FileBackedMemRootDir string `toml:"file_mem_backend"`
GuestHookPath string `toml:"guest_hook_path"`
GuestMemoryDumpPath string `toml:"guest_memory_dump_path"`
SeccompSandbox string `toml:"seccompsandbox"`
BlockDeviceAIO string `toml:"block_device_aio"`
HypervisorPathList []string `toml:"valid_hypervisor_paths"`
JailerPathList []string `toml:"valid_jailer_paths"`
CtlPathList []string `toml:"valid_ctlpaths"`
VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"`
VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"`
PFlashList []string `toml:"pflashes"`
VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"`
FileBackedMemRootList []string `toml:"valid_file_mem_backends"`
EntropySourceList []string `toml:"valid_entropy_sources"`
EnableAnnotations []string `toml:"enable_annotations"`
RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"`
TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"`
MemOffset uint64 `toml:"memory_offset"`
DefaultMaxMemorySize uint64 `toml:"default_maxmemory"`
DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"`
DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"`
DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"`
DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"`
NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"`
NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"`
NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"`
NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"`
VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"`
VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"`
DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"`
MemorySize uint32 `toml:"default_memory"`
MemSlots uint32 `toml:"memory_slots"`
DefaultBridges uint32 `toml:"default_bridges"`
Msize9p uint32 `toml:"msize_9p"`
NumVCPUs int32 `toml:"default_vcpus"`
BlockDeviceCacheSet bool `toml:"block_device_cache_set"`
BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"`
BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"`
EnableVhostUserStore bool `toml:"enable_vhost_user_store"`
VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"`
DisableBlockDeviceUse bool `toml:"disable_block_device_use"`
MemPrealloc bool `toml:"enable_mem_prealloc"`
HugePages bool `toml:"enable_hugepages"`
VirtioMem bool `toml:"enable_virtio_mem"`
IOMMU bool `toml:"enable_iommu"`
IOMMUPlatform bool `toml:"enable_iommu_platform"`
Debug bool `toml:"enable_debug"`
DisableNestingChecks bool `toml:"disable_nesting_checks"`
EnableIOThreads bool `toml:"enable_iothreads"`
DisableImageNvdimm bool `toml:"disable_image_nvdimm"`
HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"`
HotPlugVFIO config.PCIePort `toml:"hot_plug_vfio"`
ColdPlugVFIO config.PCIePort `toml:"cold_plug_vfio"`
DisableVhostNet bool `toml:"disable_vhost_net"`
GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"`
ConfidentialGuest bool `toml:"confidential_guest"`
SevSnpGuest bool `toml:"sev_snp_guest"`
GuestSwap bool `toml:"enable_guest_swap"`
Rootless bool `toml:"rootless"`
DisableSeccomp bool `toml:"disable_seccomp"`
DisableSeLinux bool `toml:"disable_selinux"`
DisableGuestSeLinux bool `toml:"disable_guest_selinux"`
LegacySerial bool `toml:"use_legacy_serial"`
}
type runtime struct {
@@ -287,12 +286,18 @@ func (h hypervisor) firmware() (string, error) {
return ResolvePath(p)
}
func (h hypervisor) coldPlugVFIO() hv.PCIePort {
func (h hypervisor) coldPlugVFIO() config.PCIePort {
if h.ColdPlugVFIO == "" {
return defaultColdPlugVFIO
}
return h.ColdPlugVFIO
}
func (h hypervisor) hotPlugVFIO() config.PCIePort {
if h.HotPlugVFIO == "" {
return defaultHotPlugVFIO
}
return h.HotPlugVFIO
}
func (h hypervisor) firmwareVolume() (string, error) {
p := h.FirmwareVolume
@@ -863,8 +868,8 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
Msize9p: h.msize9p(),
DisableImageNvdimm: h.DisableImageNvdimm,
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
HotPlugVFIO: h.hotPlugVFIO(),
ColdPlugVFIO: h.coldPlugVFIO(),
PCIeRootPort: h.PCIeRootPort,
DisableVhostNet: h.DisableVhostNet,
EnableVhostUserStore: h.EnableVhostUserStore,
VhostUserStorePath: h.vhostUserStorePath(),
@@ -1060,7 +1065,7 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
Msize9p: h.msize9p(),
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
ColdPlugVFIO: h.coldPlugVFIO(),
PCIeRootPort: h.PCIeRootPort,
HotPlugVFIO: h.hotPlugVFIO(),
DisableVhostNet: true,
GuestHookPath: h.guestHookPath(),
VirtioFSExtraArgs: h.VirtioFSExtraArgs,
@@ -1291,7 +1296,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
Msize9p: defaultMsize9p,
HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus,
ColdPlugVFIO: defaultColdPlugVFIO,
PCIeRootPort: defaultPCIeRootPort,
HotPlugVFIO: defaultHotPlugVFIO,
GuestHookPath: defaultGuestHookPath,
VhostUserStorePath: defaultVhostUserStorePath,
VhostUserDeviceReconnect: defaultVhostUserDeviceReconnect,
@@ -1663,9 +1668,10 @@ func checkConfig(config oci.RuntimeConfig) error {
return err
}
hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO
coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO
machineType := config.HypervisorConfig.HypervisorMachineType
if err := checkPCIeConfig(coldPlugVFIO, machineType); err != nil {
if err := checkPCIeConfig(coldPlugVFIO, hotPlugVFIO, machineType); err != nil {
return err
}
@@ -1675,18 +1681,32 @@ func checkConfig(config oci.RuntimeConfig) error {
// checkPCIeConfig ensures the PCIe configuration is valid.
// Only allow one of the following settings for cold-plug:
// no-port, root-port, switch-port
func checkPCIeConfig(vfioPort hv.PCIePort, machineType string) error {
func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineType string) error {
// Currently only QEMU q35 supports advanced PCIe topologies
// firecracker, dragonball do not have right now any PCIe support
if machineType != "q35" {
return nil
}
if vfioPort == hv.NoPort || vfioPort == hv.RootPort || vfioPort == hv.SwitchPort {
if coldPlug != config.NoPort && hotPlug != config.NoPort {
return fmt.Errorf("invalid hot-plug=%s and cold-plug=%s settings, only one of them can be set", coldPlug, hotPlug)
}
if coldPlug == config.NoPort && hotPlug == config.NoPort {
return nil
}
var port config.PCIePort
if coldPlug != config.NoPort {
port = coldPlug
}
if hotPlug != config.NoPort {
port = hotPlug
}
if port == config.NoPort || port == config.BridgePort || port == config.RootPort || port == config.SwitchPort {
return nil
}
return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s",
vfioPort, hv.NoPort, hv.RootPort, hv.SwitchPort)
return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s, %s",
coldPlug, config.NoPort, config.BridgePort, config.RootPort, config.SwitchPort)
}
// checkNetNsConfig performs sanity checks on disable_new_netns config.

View File

@@ -18,8 +18,8 @@ import (
"syscall"
"testing"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/govmm"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
@@ -63,15 +63,16 @@ func createConfig(configPath string, fileData string) error {
// createAllRuntimeConfigFiles creates all files necessary to call
// loadConfiguration().
func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConfig, err error) {
func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntimeConfig, err error) {
if dir == "" {
return config, fmt.Errorf("BUG: need directory")
return testConfig, fmt.Errorf("BUG: need directory")
}
if hypervisor == "" {
return config, fmt.Errorf("BUG: need hypervisor")
return testConfig, fmt.Errorf("BUG: need hypervisor")
}
var coldPlugVFIO hv.PCIePort
var hotPlugVFIO config.PCIePort
var coldPlugVFIO config.PCIePort
hypervisorPath := path.Join(dir, "hypervisor")
kernelPath := path.Join(dir, "kernel")
kernelParams := "foo=bar xyz"
@@ -85,8 +86,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
blockDeviceAIO := "io_uring"
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
coldPlugVFIO = hv.RootPort
hotPlugVFIO = config.NoPort
coldPlugVFIO = config.BridgePort
disableNewNetNs := false
sharedFS := "virtio-9p"
virtioFSdaemon := path.Join(dir, "virtiofsd")
@@ -108,7 +109,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
BlockDeviceAIO: blockDeviceAIO,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
DisableNewNetNs: disableNewNetNs,
DefaultVCPUCount: defaultVCPUCount,
@@ -134,7 +135,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
configPath := path.Join(dir, "runtime.toml")
err = createConfig(configPath, runtimeConfigFileData)
if err != nil {
return config, err
return testConfig, err
}
configPathLink := path.Join(filepath.Dir(configPath), "link-to-configuration.toml")
@@ -142,7 +143,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
// create a link to the config file
err = syscall.Symlink(configPath, configPathLink)
if err != nil {
return config, err
return testConfig, err
}
files := []string{hypervisorPath, kernelPath, imagePath}
@@ -151,7 +152,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
// create the resource (which must be >0 bytes)
err := WriteFile(file, "foo", testFileMode)
if err != nil {
return config, err
return testConfig, err
}
}
@@ -172,7 +173,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
DefaultBridges: defaultBridgesCount,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
Msize9p: defaultMsize9p,
MemSlots: defaultMemSlots,
@@ -216,10 +217,10 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
err = SetKernelParams(&runtimeConfig)
if err != nil {
return config, err
return testConfig, err
}
config = testRuntimeConfig{
rtimeConfig := testRuntimeConfig{
RuntimeConfig: runtimeConfig,
RuntimeConfigFile: configPath,
ConfigPath: configPath,
@@ -228,7 +229,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
LogPath: logPath,
}
return config, nil
return rtimeConfig, nil
}
// testLoadConfiguration accepts an optional function that can be used
@@ -568,6 +569,7 @@ func TestMinimalRuntimeConfig(t *testing.T) {
VirtioFSCache: defaultVirtioFSCacheMode,
BlockDeviceAIO: defaultBlockDeviceAIO,
DisableGuestSeLinux: defaultDisableGuestSeLinux,
HotPlugVFIO: defaultHotPlugVFIO,
ColdPlugVFIO: defaultColdPlugVFIO,
}
@@ -602,7 +604,7 @@ func TestMinimalRuntimeConfig(t *testing.T) {
func TestNewQemuHypervisorConfig(t *testing.T) {
dir := t.TempDir()
var coldPlugVFIO hv.PCIePort
var coldPlugVFIO config.PCIePort
hypervisorPath := path.Join(dir, "hypervisor")
kernelPath := path.Join(dir, "kernel")
imagePath := path.Join(dir, "image")
@@ -610,8 +612,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
disableBlock := true
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
coldPlugVFIO = hv.RootPort
coldPlugVFIO = config.BridgePort
orgVHostVSockDevicePath := utils.VHostVSockDevicePath
blockDeviceAIO := "io_uring"
defer func() {
@@ -630,7 +631,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
DisableBlockDeviceUse: disableBlock,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
ColdPlugVFIO: coldPlugVFIO,
RxRateLimiterMaxRate: rxRateLimiterMaxRate,
TxRateLimiterMaxRate: txRateLimiterMaxRate,
@@ -686,10 +686,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
t.Errorf("Expected value for HotplugVFIOOnRootBus %v, got %v", hotplugVFIOOnRootBus, config.HotplugVFIOOnRootBus)
}
if config.PCIeRootPort != pcieRootPort {
t.Errorf("Expected value for PCIeRootPort %v, got %v", pcieRootPort, config.PCIeRootPort)
}
if config.RxRateLimiterMaxRate != rxRateLimiterMaxRate {
t.Errorf("Expected value for rx rate limiter %v, got %v", rxRateLimiterMaxRate, config.RxRateLimiterMaxRate)
}
@@ -812,7 +808,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) {
disableBlock := true
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
hypervisor := hypervisor{
Path: hypervisorPath,
@@ -823,7 +818,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) {
DisableBlockDeviceUse: disableBlock,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
}
_, err := newQemuHypervisorConfig(hypervisor)

View File

@@ -453,6 +453,10 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig,
return err
}
if err := addHypervisorHotColdPlugVfioOverrides(ocispec, config); err != nil {
return err
}
if value, ok := ocispec.Annotations[vcAnnotations.MachineType]; ok {
if value != "" {
config.HypervisorConfig.HypervisorMachineType = value
@@ -508,12 +512,6 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig,
return err
}
if err := newAnnotationConfiguration(ocispec, vcAnnotations.PCIeRootPort).setUint(func(pcieRootPort uint64) {
config.HypervisorConfig.PCIeRootPort = uint32(pcieRootPort)
}); err != nil {
return err
}
if value, ok := ocispec.Annotations[vcAnnotations.EntropySource]; ok {
if !checkPathIsInGlobs(runtime.HypervisorConfig.EntropySourceList, value) {
return fmt.Errorf("entropy source %v required from annotation is not valid", value)
@@ -576,6 +574,37 @@ func addHypervisorPathOverrides(ocispec specs.Spec, config *vc.SandboxConfig, ru
return nil
}
func addHypervisorPCIePortOverride(value string) (config.PCIePort, error) {
if value == "" {
return config.NoPort, nil
}
port := config.PCIePort(value)
if port.Invalid() {
return config.InvalidPort, fmt.Errorf("Invalid PCIe port \"%v\" specified in annotation", value)
}
return port, nil
}
func addHypervisorHotColdPlugVfioOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) error {
var err error
if value, ok := ocispec.Annotations[vcAnnotations.HotPlugVFIO]; ok {
if sbConfig.HypervisorConfig.HotPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil {
return err
}
// If hot-plug is specified disable cold-plug and vice versa
sbConfig.HypervisorConfig.ColdPlugVFIO = config.NoPort
}
if value, ok := ocispec.Annotations[vcAnnotations.ColdPlugVFIO]; ok {
if sbConfig.HypervisorConfig.ColdPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil {
return err
}
// If cold-plug is specified disable hot-plug and vice versa
sbConfig.HypervisorConfig.HotPlugVFIO = config.NoPort
}
return nil
}
func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig, runtime RuntimeConfig) error {
if err := newAnnotationConfiguration(ocispec, vcAnnotations.DefaultMemory).setUintWithCheck(func(memorySz uint64) error {

View File

@@ -599,7 +599,7 @@ func TestContainerPipeSizeAnnotation(t *testing.T) {
func TestAddHypervisorAnnotations(t *testing.T) {
assert := assert.New(t)
config := vc.SandboxConfig{
sbConfig := vc.SandboxConfig{
Annotations: make(map[string]string),
}
@@ -628,8 +628,8 @@ func TestAddHypervisorAnnotations(t *testing.T) {
runtimeConfig.HypervisorConfig.VirtioFSDaemonList = []string{"/bin/*ls*"}
ocispec.Annotations[vcAnnotations.KernelParams] = "vsyscall=emulate iommu=on"
addHypervisorConfigOverrides(ocispec, &config, runtimeConfig)
assert.Exactly(expectedHyperConfig, config.HypervisorConfig)
addHypervisorConfigOverrides(ocispec, &sbConfig, runtimeConfig)
assert.Exactly(expectedHyperConfig, sbConfig.HypervisorConfig)
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "1"
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "1"
@@ -660,7 +660,8 @@ func TestAddHypervisorAnnotations(t *testing.T) {
ocispec.Annotations[vcAnnotations.GuestHookPath] = "/usr/bin/"
ocispec.Annotations[vcAnnotations.DisableImageNvdimm] = "true"
ocispec.Annotations[vcAnnotations.HotplugVFIOOnRootBus] = "true"
ocispec.Annotations[vcAnnotations.PCIeRootPort] = "2"
ocispec.Annotations[vcAnnotations.ColdPlugVFIO] = config.BridgePort
ocispec.Annotations[vcAnnotations.HotPlugVFIO] = config.NoPort
ocispec.Annotations[vcAnnotations.IOMMUPlatform] = "true"
ocispec.Annotations[vcAnnotations.SGXEPC] = "64Mi"
ocispec.Annotations[vcAnnotations.UseLegacySerial] = "true"
@@ -668,55 +669,58 @@ func TestAddHypervisorAnnotations(t *testing.T) {
ocispec.Annotations[vcAnnotations.RxRateLimiterMaxRate] = "10000000"
ocispec.Annotations[vcAnnotations.TxRateLimiterMaxRate] = "10000000"
addAnnotations(ocispec, &config, runtimeConfig)
assert.Equal(config.HypervisorConfig.NumVCPUs, uint32(1))
assert.Equal(config.HypervisorConfig.DefaultMaxVCPUs, uint32(1))
assert.Equal(config.HypervisorConfig.MemorySize, uint32(1024))
assert.Equal(config.HypervisorConfig.MemSlots, uint32(20))
assert.Equal(config.HypervisorConfig.MemOffset, uint64(512))
assert.Equal(config.HypervisorConfig.VirtioMem, true)
assert.Equal(config.HypervisorConfig.MemPrealloc, true)
assert.Equal(config.HypervisorConfig.FileBackedMemRootDir, "/dev/shm")
assert.Equal(config.HypervisorConfig.HugePages, true)
assert.Equal(config.HypervisorConfig.IOMMU, true)
assert.Equal(config.HypervisorConfig.BlockDeviceDriver, "virtio-scsi")
assert.Equal(config.HypervisorConfig.BlockDeviceAIO, "io_uring")
assert.Equal(config.HypervisorConfig.DisableBlockDeviceUse, true)
assert.Equal(config.HypervisorConfig.EnableIOThreads, true)
assert.Equal(config.HypervisorConfig.BlockDeviceCacheSet, true)
assert.Equal(config.HypervisorConfig.BlockDeviceCacheDirect, true)
assert.Equal(config.HypervisorConfig.BlockDeviceCacheNoflush, true)
assert.Equal(config.HypervisorConfig.SharedFS, "virtio-fs")
assert.Equal(config.HypervisorConfig.VirtioFSDaemon, "/bin/false")
assert.Equal(config.HypervisorConfig.VirtioFSCache, "auto")
assert.ElementsMatch(config.HypervisorConfig.VirtioFSExtraArgs, [2]string{"arg0", "arg1"})
assert.Equal(config.HypervisorConfig.Msize9p, uint32(512))
assert.Equal(config.HypervisorConfig.HypervisorMachineType, "q35")
assert.Equal(config.HypervisorConfig.MachineAccelerators, "nofw")
assert.Equal(config.HypervisorConfig.CPUFeatures, "pmu=off")
assert.Equal(config.HypervisorConfig.DisableVhostNet, true)
assert.Equal(config.HypervisorConfig.GuestHookPath, "/usr/bin/")
assert.Equal(config.HypervisorConfig.DisableImageNvdimm, true)
assert.Equal(config.HypervisorConfig.HotplugVFIOOnRootBus, true)
assert.Equal(config.HypervisorConfig.PCIeRootPort, uint32(2))
assert.Equal(config.HypervisorConfig.IOMMUPlatform, true)
assert.Equal(config.HypervisorConfig.SGXEPCSize, int64(67108864))
assert.Equal(config.HypervisorConfig.LegacySerial, true)
assert.Equal(config.HypervisorConfig.RxRateLimiterMaxRate, uint64(10000000))
assert.Equal(config.HypervisorConfig.TxRateLimiterMaxRate, uint64(10000000))
err := addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.NoError(err)
assert.Equal(sbConfig.HypervisorConfig.NumVCPUs, uint32(1))
assert.Equal(sbConfig.HypervisorConfig.DefaultMaxVCPUs, uint32(1))
assert.Equal(sbConfig.HypervisorConfig.MemorySize, uint32(1024))
assert.Equal(sbConfig.HypervisorConfig.MemSlots, uint32(20))
assert.Equal(sbConfig.HypervisorConfig.MemOffset, uint64(512))
assert.Equal(sbConfig.HypervisorConfig.VirtioMem, true)
assert.Equal(sbConfig.HypervisorConfig.MemPrealloc, true)
assert.Equal(sbConfig.HypervisorConfig.FileBackedMemRootDir, "/dev/shm")
assert.Equal(sbConfig.HypervisorConfig.HugePages, true)
assert.Equal(sbConfig.HypervisorConfig.IOMMU, true)
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceDriver, "virtio-scsi")
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceAIO, "io_uring")
assert.Equal(sbConfig.HypervisorConfig.DisableBlockDeviceUse, true)
assert.Equal(sbConfig.HypervisorConfig.EnableIOThreads, true)
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheSet, true)
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheDirect, true)
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheNoflush, true)
assert.Equal(sbConfig.HypervisorConfig.SharedFS, "virtio-fs")
assert.Equal(sbConfig.HypervisorConfig.VirtioFSDaemon, "/bin/false")
assert.Equal(sbConfig.HypervisorConfig.VirtioFSCache, "auto")
assert.ElementsMatch(sbConfig.HypervisorConfig.VirtioFSExtraArgs, [2]string{"arg0", "arg1"})
assert.Equal(sbConfig.HypervisorConfig.Msize9p, uint32(512))
assert.Equal(sbConfig.HypervisorConfig.HypervisorMachineType, "q35")
assert.Equal(sbConfig.HypervisorConfig.MachineAccelerators, "nofw")
assert.Equal(sbConfig.HypervisorConfig.CPUFeatures, "pmu=off")
assert.Equal(sbConfig.HypervisorConfig.DisableVhostNet, true)
assert.Equal(sbConfig.HypervisorConfig.GuestHookPath, "/usr/bin/")
assert.Equal(sbConfig.HypervisorConfig.DisableImageNvdimm, true)
assert.Equal(sbConfig.HypervisorConfig.HotplugVFIOOnRootBus, true)
assert.Equal(string(sbConfig.HypervisorConfig.ColdPlugVFIO), string(config.BridgePort))
assert.Equal(string(sbConfig.HypervisorConfig.HotPlugVFIO), string(config.NoPort))
assert.Equal(sbConfig.HypervisorConfig.IOMMUPlatform, true)
assert.Equal(sbConfig.HypervisorConfig.SGXEPCSize, int64(67108864))
assert.Equal(sbConfig.HypervisorConfig.LegacySerial, true)
assert.Equal(sbConfig.HypervisorConfig.RxRateLimiterMaxRate, uint64(10000000))
assert.Equal(sbConfig.HypervisorConfig.TxRateLimiterMaxRate, uint64(10000000))
// In case an absurd large value is provided, the config value if not over-ridden
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "655536"
err := addAnnotations(ocispec, &config, runtimeConfig)
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.Error(err)
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "-1"
err = addAnnotations(ocispec, &config, runtimeConfig)
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.Error(err)
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "1"
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "-1"
err = addAnnotations(ocispec, &config, runtimeConfig)
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.Error(err)
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "1"

View File

@@ -864,12 +864,12 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
defer cancel()
// Create the clh device config via the constructor to ensure default values are properly assigned
clhDevice := *chclient.NewDeviceConfig(*(*device).GetSysfsDev())
clhDevice := *chclient.NewDeviceConfig(device.SysfsDev)
pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice)
if err != nil {
return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err))
}
clh.devicesIds[*(*device).GetID()] = pciInfo.GetId()
clh.devicesIds[device.ID] = pciInfo.GetId()
// clh doesn't use bridges, so the PCI path is simply the slot
// number of the device. This will break if clh starts using
@@ -886,14 +886,11 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
return fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf)
}
guestPciPath, err := types.PciPathFromString(tokens[0])
pciDevice, ok := (*device).(config.VFIOPCIDev)
if !ok {
if device.Type == config.VFIOAPDeviceMediatedType {
return fmt.Errorf("VFIO device %+v is not PCI, only PCI is supported in Cloud Hypervisor", device)
}
pciDevice.GuestPciPath = guestPciPath
*device = pciDevice
device.GuestPciPath, err = types.PciPathFromString(tokens[0])
return err
}
@@ -937,7 +934,7 @@ func (clh *cloudHypervisor) HotplugRemoveDevice(ctx context.Context, devInfo int
case BlockDev:
deviceID = clhDriveIndexToID(devInfo.(*config.BlockDrive).Index)
case VfioDev:
deviceID = *devInfo.(config.VFIODev).GetID()
deviceID = devInfo.(*config.VFIODev).ID
default:
clh.Logger().WithFields(log.Fields{"devInfo": devInfo,
"deviceType": devType}).Error("HotplugRemoveDevice: unsupported device")

View File

@@ -673,7 +673,7 @@ func TestCloudHypervisorHotplugRemoveDevice(t *testing.T) {
_, err = clh.HotplugRemoveDevice(context.Background(), &config.BlockDrive{}, BlockDev)
assert.NoError(err, "Hotplug remove block device expected no error")
_, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIOPCIDev{}, VfioDev)
_, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIODev{}, VfioDev)
assert.NoError(err, "Hotplug remove vfio block device expected no error")
_, err = clh.HotplugRemoveDevice(context.Background(), nil, NetDev)

View File

@@ -288,12 +288,12 @@ type HypervisorConfig struct {
// root bus instead of a bridge.
HotplugVFIOOnRootBus bool
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
// The PCIe Root Port device is used to hot-plug the PCIe device
PCIeRootPort uint32
// HotPlugVFIO is used to indicate if devices need to be hotplugged on the
// root port, switch, bridge or no port
HotPlugVFIO hv.PCIePort
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
// root port, switch or no port
// root port, switch, bridge or no port
ColdPlugVFIO hv.PCIePort
// BootToBeTemplate used to indicate if the VM is created to be a template VM

View File

@@ -505,13 +505,21 @@ type HypervisorConfig struct {
// MemOffset specifies memory space for nvdimm device
MemOffset uint64
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
// The PCIe Root Port device is used to hot-plug the PCIe device
PCIeRootPort uint32
// VFIODevices are used to get PCIe device info early before the sandbox
// is started to make better PCIe topology decisions
VFIODevices []config.DeviceInfo
// VhostUserBlkDevices are handled differently in Q35 and Virt machine
// type. capture them early before the sandbox to make better PCIe topology
// decisions
VhostUserBlkDevices []config.DeviceInfo
// HotplugVFIO is used to indicate if devices need to be hotplugged on the
// root port or a switch
HotPlugVFIO config.PCIePort
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
// root port, switch or no port
ColdPlugVFIO hv.PCIePort
ColdPlugVFIO config.PCIePort
// NumVCPUs specifies default number of vCPUs for the VM.
NumVCPUs uint32

View File

@@ -21,6 +21,7 @@ import (
"github.com/docker/go-units"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/api"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
volume "github.com/kata-containers/kata-containers/src/runtime/pkg/direct-volume"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
"github.com/kata-containers/kata-containers/src/runtime/pkg/uuid"
@@ -1137,7 +1138,7 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *
ContainerPath: dev.ContainerPath,
Type: kataVfioPciDevType,
Id: groupNum,
Options: nil,
Options: make([]string, len(devList)),
}
// We always pass the device information to the agent, since
@@ -1147,16 +1148,16 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *
if c.sandbox.config.VfioMode == config.VFIOModeGuestKernel {
kataDevice.Type = kataVfioPciGuestKernelDevType
}
for i, dev := range devList {
if dev.Type == config.VFIOAPDeviceMediatedType {
kataDevice.Type = kataVfioApDevType
kataDevice.Options = dev.APDevices
} else {
if (*devList[0]).GetType() == config.VFIOAPDeviceMediatedType {
kataDevice.Type = kataVfioApDevType
kataDevice.Options = (*devList[0]).(config.VFIOAPDev).APDevices
} else {
kataDevice.Options = make([]string, len(devList))
for i, device := range devList {
pciDevice := (*device).(config.VFIOPCIDev)
kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", pciDevice.BDF, pciDevice.GuestPciPath)
devBDF := drivers.GetBDF(dev.BDF)
kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", devBDF, dev.GuestPciPath)
}
}
return kataDevice
@@ -1342,7 +1343,6 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co
if _, err = k.sendReq(ctx, req); err != nil {
return nil, err
}
return buildProcessFromExecID(req.ExecId)
}

View File

@@ -245,7 +245,6 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) {
DisableNestingChecks: sconfig.HypervisorConfig.DisableNestingChecks,
DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm,
HotplugVFIOOnRootBus: sconfig.HypervisorConfig.HotplugVFIOOnRootBus,
PCIeRootPort: sconfig.HypervisorConfig.PCIeRootPort,
BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate,
BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate,
DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet,
@@ -487,8 +486,8 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) {
DisableNestingChecks: hconf.DisableNestingChecks,
DisableImageNvdimm: hconf.DisableImageNvdimm,
HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus,
HotPlugVFIO: hconf.HotPlugVFIO,
ColdPlugVFIO: hconf.ColdPlugVFIO,
PCIeRootPort: hconf.PCIeRootPort,
BootToBeTemplate: hconf.BootToBeTemplate,
BootFromTemplate: hconf.BootFromTemplate,
DisableVhostNet: hconf.DisableVhostNet,

View File

@@ -7,7 +7,7 @@
package persistapi
import (
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/opencontainers/runc/libcontainer/configs"
specs "github.com/opencontainers/runtime-spec/specs-go"
)
@@ -131,10 +131,6 @@ type HypervisorConfig struct {
// Enable SGX. Hardware-based isolation and memory encryption.
SGXEPCSize int64
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
// The PCIe Root Port device is used to hot-plug the PCIe device
PCIeRootPort uint32
// NumVCPUs specifies default number of vCPUs for the VM.
NumVCPUs uint32
@@ -199,9 +195,13 @@ type HypervisorConfig struct {
// root bus instead of a bridge.
HotplugVFIOOnRootBus bool
// HotPlugVFIO is used to indicate if devices need to be hotplugged on the
// root, switch, bridge or no-port
HotPlugVFIO config.PCIePort
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
// root port or a switch or no-port
ColdPlugVFIO hv.PCIePort
// root, bridge, switch or no-port
ColdPlugVFIO config.PCIePort
// BootToBeTemplate used to indicate if the VM is created to be a template VM
BootToBeTemplate bool

View File

@@ -126,9 +126,11 @@ const (
// root bus instead of a bridge.
HotplugVFIOOnRootBus = kataAnnotHypervisorPrefix + "hotplug_vfio_on_root_bus"
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
// The PCIe Root Port device is used to hot-plug the PCIe device
PCIeRootPort = kataAnnotHypervisorPrefix + "pcie_root_port"
// ColdPlugVFIO is a sandbox annotation used to indicate if devices need to be coldplugged.
ColdPlugVFIO = kataAnnotHypervisorPrefix + "cold_plug_vfio"
// HotPlugVFIO is a sandbox annotation used to indicate if devices need to be hotplugged.
HotPlugVFIO = kataAnnotHypervisorPrefix + "hot_plug_vfio"
// EntropySource is a sandbox annotation to specify the path to a host source of
// entropy (/dev/random, /dev/urandom or real hardware RNG device)

View File

@@ -65,6 +65,11 @@ const romFile = ""
// Default value is false.
const defaultDisableModern = false
// A deeper PCIe topology than 5 is already not advisable just for the sake
// of having enough buffer we limit ourselves to 10 and exit if we reach
// the root bus
const maxPCIeTopoDepth = 10
type qmpChannel struct {
qmp *govmmQemu.QMP
ctx context.Context
@@ -75,15 +80,15 @@ type qmpChannel struct {
// QemuState keeps Qemu's state
type QemuState struct {
UUID string
Bridges []types.Bridge
// HotpluggedCPUs is the list of CPUs that were hot-added
UUID string
HotPlugVFIO config.PCIePort
Bridges []types.Bridge
HotpluggedVCPUs []hv.CPUDevice
HotpluggedMemory int
VirtiofsDaemonPid int
PCIeRootPort int
HotplugVFIOOnRootBus bool
ColdPlugVFIO hv.PCIePort
HotplugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
}
// qemu is an Hypervisor interface implementation for the Linux qemu hypervisor.
@@ -282,10 +287,10 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso
q.Logger().Debug("Creating UUID")
q.state.UUID = uuid.Generate().String()
q.state.HotPlugVFIO = q.config.HotPlugVFIO
q.state.ColdPlugVFIO = q.config.ColdPlugVFIO
q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus
q.state.PCIeRootPort = int(q.config.PCIeRootPort)
q.state.HotPlugVFIO = q.config.HotPlugVFIO
// The path might already exist, but in case of VM templating,
// we have to create it since the sandbox has not created it yet.
@@ -701,27 +706,12 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi
}
}
// Add PCIe Root Port devices to hypervisor
// The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged into PCIe Root Port.
// For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt
memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory()
if hypervisorConfig.PCIeRootPort > 0 {
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, hypervisorConfig.PCIeRootPort, memSize32bit, memSize64bit)
}
// The default OVMF MMIO aperture is too small for some PCIe devices
// with huge BARs so we need to increase it.
// memSize64bit is in bytes, convert to MB, OVMF expects MB as a string
if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") {
pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024))
fwCfg := govmmQemu.FwCfg{
Name: "opt/ovmf/X-PciMmio64Mb",
Str: pciMmio64Mb,
if machine.Type == QemuQ35 || machine.Type == QemuVirt {
if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig, machine.Type); err != nil {
q.Logger().WithError(err).Errorf("Cannot create PCIe topology")
return err
}
qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg)
}
q.qemuConfig = qemuConfig
q.virtiofsDaemon, err = q.createVirtiofsDaemon(hypervisorConfig.SharedPath)
@@ -747,6 +737,101 @@ func (q *qemu) checkBpfEnabled() {
}
}
// If a user uses 8 GPUs with 4 devices in each IOMMU Group that means we need
// to hotplug 32 devices. We do not have enough PCIe root bus slots to
// accomplish this task. Kata will use already some slots for vfio-xxxx-pci
// devices.
// Max PCI slots per root bus is 32
// Max PCIe root ports is 16
// Max PCIe switch ports is 16
// There is only 64kB of IO memory each root,switch port will consume 4k hence
// only 16 ports possible.
func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig, machineType string) error {
// If no-port set just return no need to add PCIe Root Port or PCIe Switches
if hypervisorConfig.HotPlugVFIO == config.NoPort && hypervisorConfig.ColdPlugVFIO == config.NoPort && machineType == QemuQ35 {
return nil
}
// Add PCIe Root Port or PCIe Switches to the hypervisor
// The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged
// into a PCIe Root Port or PCIe Switch.
// For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt
// Deduce the right values for mem-reserve and pref-64-reserve memory regions
memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory()
// The default OVMF MMIO aperture is too small for some PCIe devices
// with huge BARs so we need to increase it.
// memSize64bit is in bytes, convert to MB, OVMF expects MB as a string
if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") {
pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024))
fwCfg := govmmQemu.FwCfg{
Name: "opt/ovmf/X-PciMmio64Mb",
Str: pciMmio64Mb,
}
qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg)
}
// Get the number of hot(cold)-pluggable ports needed from the provided
// VFIO devices and VhostUserBlockDevices
var numOfPluggablePorts uint32 = 0
for _, dev := range hypervisorConfig.VFIODevices {
var err error
dev.HostPath, err = config.GetHostPath(dev, false, "")
if err != nil {
return fmt.Errorf("Cannot get host path for device: %v err: %v", dev, err)
}
devicesPerIOMMUGroup, err := drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
if err != nil {
return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err)
}
for _, vfioDevice := range devicesPerIOMMUGroup {
if drivers.IsPCIeDevice(vfioDevice.BDF) {
numOfPluggablePorts = numOfPluggablePorts + 1
}
}
}
vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus)
vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort)
numOfVhostUserBlockDevices := len(hypervisorConfig.VhostUserBlkDevices)
// If number of PCIe root ports > 16 then bail out otherwise we may
// use up all slots or IO memory on the root bus and vfio-XXX-pci devices
// cannot be added which are crucial for Kata max slots on root bus is 32
// max slots on the complete pci(e) topology is 256 in QEMU
if vfioOnRootPort {
// On Arm the vhost-user-block device is a PCIe device we need
// to account for it in the number of pluggable ports
if machineType == QemuVirt {
numOfPluggablePorts = numOfPluggablePorts + uint32(numOfVhostUserBlockDevices)
}
if numOfPluggablePorts > maxPCIeRootPort {
return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
}
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
return nil
}
if vfioOnSwitchPort {
// On Arm the vhost-user-block device is a PCIe device we need
// to account for it in the number of pluggable ports
if machineType == QemuVirt {
numOfPluggableRootPorts := uint32(numOfVhostUserBlockDevices)
if numOfPluggableRootPorts > maxPCIeRootPort {
return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
}
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggableRootPorts, memSize32bit, memSize64bit)
}
if numOfPluggablePorts > maxPCIeSwitchPort {
return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", maxPCIeSwitchPort)
}
qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
return nil
}
return nil
}
func (q *qemu) vhostFSSocketPath(id string) (string, error) {
return utils.BuildSocketPath(q.config.VMStorePath, id, vhostFSSocket)
}
@@ -1527,6 +1612,7 @@ func (q *qemu) hotplugAddBlockDevice(ctx context.Context, drive *config.BlockDri
}
func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.VhostUserDeviceAttrs, op Operation, devID string) (err error) {
err = q.qmpMonitorCh.qmp.ExecuteCharDevUnixSocketAdd(q.qmpMonitorCh.ctx, vAttr.DevID, vAttr.SocketPath, false, false, vAttr.ReconnectTime)
if err != nil {
return err
@@ -1544,18 +1630,14 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V
switch machineType {
case QemuVirt:
if q.state.PCIeRootPort <= 0 {
return fmt.Errorf("Vhost-user-blk device is a PCIe device if machine type is virt. Need to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for virt")
}
//The addr of a dev is corresponding with device:function for PCIe in qemu which starting from 0
//Since the dev is the first and only one on this bus(root port), it should be 0.
addr := "00"
bridgeId := fmt.Sprintf("%s%d", pcieRootPortPrefix, len(drivers.AllPCIeDevs))
drivers.AllPCIeDevs[devID] = true
bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevices[config.RootPort]))
config.PCIeDevices[config.RootPort][devID] = true
bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeId)
bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID)
bridgeSlot, err := q.qomGetSlot(bridgeQomPath)
if err != nil {
return err
@@ -1571,7 +1653,7 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V
return err
}
if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeId); err != nil {
if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeID); err != nil {
return err
}
@@ -1685,41 +1767,108 @@ func (q *qemu) qomGetSlot(qomPath string) (types.PciSlot, error) {
// Query QMP to find a device's PCI path given its QOM path or ID
func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) {
// XXX: For now we assume there's exactly one bridge, since
// that's always how we configure qemu from Kata for now. It
// would be good to generalize this to different PCI
// topologies
var slots []types.PciSlot
devSlot, err := q.qomGetSlot(qemuID)
if err != nil {
return types.PciPath{}, err
}
slots = append(slots, devSlot)
busq, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qemuID, "parent_bus")
// This only works for Q35 and Virt
r, _ := regexp.Compile(`^/machine/.*/pcie.0`)
var parentPath = qemuID
// We do not want to use a forever loop here, a deeper PCIe topology
// than 5 is already not advisable just for the sake of having enough
// buffer we limit ourselves to 10 and leave the loop early if we hit
// the root bus.
for i := 1; i <= maxPCIeTopoDepth; i++ {
parenBusQOM, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, parentPath, "parent_bus")
if err != nil {
return types.PciPath{}, err
}
busQOM, ok := parenBusQOM.(string)
if !ok {
return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, parenBusQOM)
}
// If we hit /machine/q35/pcie.0 we're done this is the root bus
// we climbed the complete hierarchy
if r.Match([]byte(busQOM)) {
break
}
// `bus` is the QOM path of the QOM bus object, but we need
// the PCI parent_bus which manages that bus. There doesn't seem
// to be a way to get that other than to simply drop the last
// path component.
idx := strings.LastIndex(busQOM, "/")
if idx == -1 {
return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM)
}
parentBus := busQOM[:idx]
parentSlot, err := q.qomGetSlot(parentBus)
if err != nil {
return types.PciPath{}, err
}
// Prepend the slots, since we're climbing the hierarchy
slots = append([]types.PciSlot{parentSlot}, slots...)
parentPath = parentBus
}
return types.PciPathFromSlots(slots...)
}
func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFIODev) (err error) {
return q.executeVFIODeviceAdd(device)
}
func (q *qemu) hotplugVFIODeviceSwitchPort(ctx context.Context, device *config.VFIODev) (err error) {
return q.executeVFIODeviceAdd(device)
}
func (q *qemu) hotplugVFIODeviceBridgePort(ctx context.Context, device *config.VFIODev) (err error) {
addr, bridge, err := q.arch.addDeviceToBridge(ctx, device.ID, types.PCI)
if err != nil {
return types.PciPath{}, err
return err
}
bus, ok := busq.(string)
if !ok {
return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, busq)
}
defer func() {
if err != nil {
q.arch.removeDeviceFromBridge(device.ID)
}
}()
return q.executePCIVFIODeviceAdd(device, addr, bridge.ID)
}
// `bus` is the QOM path of the QOM bus object, but we need
// the PCI bridge which manages that bus. There doesn't seem
// to be a way to get that other than to simply drop the last
// path component.
idx := strings.LastIndex(bus, "/")
if idx == -1 {
return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", bus)
func (q *qemu) executePCIVFIODeviceAdd(device *config.VFIODev, addr string, bridgeID string) error {
switch device.Type {
case config.VFIOPCIDeviceNormalType:
return q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, addr, bridgeID, romFile)
case config.VFIOPCIDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, addr, bridgeID, romFile)
case config.VFIOAPDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
default:
return fmt.Errorf("Incorrect VFIO device type found")
}
bridge := bus[:idx]
}
bridgeSlot, err := q.qomGetSlot(bridge)
if err != nil {
return types.PciPath{}, err
func (q *qemu) executeVFIODeviceAdd(device *config.VFIODev) error {
switch device.Type {
case config.VFIOPCIDeviceNormalType:
return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, device.Bus, romFile)
case config.VFIOPCIDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, "", device.Bus, romFile)
case config.VFIOAPDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
default:
return fmt.Errorf("Incorrect VFIO device type found")
}
return types.PciPathFromSlots(bridgeSlot, devSlot)
}
func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op Operation) (err error) {
@@ -1727,109 +1876,53 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op
return err
}
devID := *(*device).GetID()
machineType := q.HypervisorConfig().HypervisorMachineType
if op == AddDevice {
buf, _ := json.Marshal(device)
q.Logger().WithFields(logrus.Fields{
"machine-type": machineType,
"hotplug-vfio-on-root-bus": q.state.HotplugVFIOOnRootBus,
"pcie-root-port": q.state.PCIeRootPort,
"device-info": string(buf),
"machine-type": q.HypervisorConfig().HypervisorMachineType,
"hot-plug-vfio": q.state.HotPlugVFIO,
"device-info": string(buf),
}).Info("Start hot-plug VFIO device")
// In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
// for pc machine type instead of bridge. This is useful for devices that require
// a large PCI BAR which is a currently a limitation with PCI bridges.
if q.state.HotplugVFIOOnRootBus {
switch (*device).GetType() {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
// In case MachineType is q35, a PCIe device is hotplugged on a PCIe Root Port.
pciDevice, ok := (*device).(config.VFIOPCIDev)
if !ok {
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
}
switch machineType {
case QemuQ35:
if pciDevice.IsPCIe && q.state.PCIeRootPort <= 0 {
q.Logger().WithField("dev-id", (*device).GetID()).Warn("VFIO device is a PCIe device. It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35")
pciDevice.Bus = ""
}
default:
pciDevice.Bus = ""
}
*device = pciDevice
if pciDevice.Type == config.VFIOPCIDeviceNormalType {
err = q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, pciDevice.Bus, romFile)
} else {
err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), "", pciDevice.Bus, romFile)
}
case config.VFIOAPDeviceMediatedType:
err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev())
}
// In case MachineType is q35, a PCIe device is hotplugged on
// a PCIe Root Port or alternatively on a PCIe Switch Port
if q.HypervisorConfig().HypervisorMachineType != QemuQ35 && q.HypervisorConfig().HypervisorMachineType != QemuVirt {
device.Bus = ""
} else {
addr, bridge, err := q.arch.addDeviceToBridge(ctx, devID, types.PCI)
var err error
// In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
// for pc machine type instead of bridge. This is useful for devices that require
// a large PCI BAR which is a currently a limitation with PCI bridges.
if q.state.HotPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus {
err = q.hotplugVFIODeviceRootPort(ctx, device)
} else if q.state.HotPlugVFIO == config.SwitchPort {
err = q.hotplugVFIODeviceSwitchPort(ctx, device)
} else {
err = q.hotplugVFIODeviceBridgePort(ctx, device)
}
if err != nil {
return err
}
defer func() {
if err != nil {
q.arch.removeDeviceFromBridge(devID)
}
}()
switch (*device).GetType() {
case config.VFIOPCIDeviceNormalType:
pciDevice, ok := (*device).(config.VFIOPCIDev)
if !ok {
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
}
err = q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, addr, bridge.ID, romFile)
case config.VFIOPCIDeviceMediatedType:
err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), addr, bridge.ID, romFile)
case config.VFIOAPDeviceMediatedType:
err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev())
default:
return fmt.Errorf("Incorrect VFIO device type found")
}
}
if err != nil {
return err
}
switch (*device).GetType() {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
pciDevice, ok := (*device).(config.VFIOPCIDev)
if !ok {
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
}
// XXX: Depending on whether we're doing root port or
// bridge hotplug, and how the bridge is set up in
// other parts of the code, we may or may not already
// have information about the slot number of the
// bridge and or the device. For simplicity, just
// query both of them back from qemu
guestPciPath, err := q.qomGetPciPath(devID)
pciDevice.GuestPciPath = guestPciPath
*device = pciDevice
return err
}
// XXX: Depending on whether we're doing root port or
// bridge hotplug, and how the bridge is set up in
// other parts of the code, we may or may not already
// have information about the slot number of the
// bridge and or the device. For simplicity, just
// query both of them back from qemu
device.GuestPciPath, err = q.qomGetPciPath(device.ID)
return err
} else {
q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device")
if !q.state.HotplugVFIOOnRootBus {
if err := q.arch.removeDeviceFromBridge(devID); err != nil {
return err
}
}
return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID)
}
q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device")
if !q.state.HotplugVFIOOnRootBus {
if err := q.arch.removeDeviceFromBridge(device.ID); err != nil {
return err
}
}
return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID)
}
func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error {
@@ -2527,7 +2620,7 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin
for i := uint32(0); i < number; i++ {
devices = append(devices,
govmmQemu.PCIeRootPortDevice{
ID: fmt.Sprintf("%s%d", pcieRootPortPrefix, i),
ID: fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, i),
Bus: bus,
Chassis: chassis,
Slot: strconv.FormatUint(uint64(i), 10),
@@ -2541,6 +2634,79 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin
return devices
}
// gollangci-lint enforces multi-line comments to be a block comment
// not multiple single line comments ...
/* pcie.0 bus
// -------------------------------------------------
// |
// -------------
// | Root Port |
// -------------
// -------------------------|------------------------
// | ----------------- |
// | PCI Express | Upstream Port | |
// | Switch ----------------- |
// | | | |
// | ------------------- ------------------- |
// | | Downstream Port | | Downstream Port | |
// | ------------------- ------------------- |
// -------------|-----------------------|------------
// ------------- --------------
// | GPU/ACCEL | | IB/ETH NIC |
// ------------- --------------
*/
// genericAppendPCIeSwitch adds a PCIe Swtich
func genericAppendPCIeSwitchPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
// Q35, Virt have the correct PCIe support,
// hence ignore all other machines
if machineType != QemuQ35 && machineType != QemuVirt {
return devices
}
// Using an own ID for the root port, so we do not clash with already
// existing root ports adding "s" for switch prefix
pcieRootPort := govmmQemu.PCIeRootPortDevice{
ID: fmt.Sprintf("%s%s%d", config.PCIeSwitchPortPrefix, config.PCIeRootPortPrefix, 0),
Bus: defaultBridgeBus,
Chassis: "1",
Slot: strconv.FormatUint(uint64(0), 10),
Multifunction: false,
Addr: "0",
MemReserve: fmt.Sprintf("%dB", memSize32bit),
Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
}
devices = append(devices, pcieRootPort)
pcieSwitchUpstreamPort := govmmQemu.PCIeSwitchUpstreamPortDevice{
ID: fmt.Sprintf("%s%d", config.PCIeSwitchUpstreamPortPrefix, 0),
Bus: pcieRootPort.ID,
}
devices = append(devices, pcieSwitchUpstreamPort)
currentChassis, err := strconv.Atoi(pcieRootPort.Chassis)
if err != nil {
return devices
}
nextChassis := currentChassis + 1
for i := uint32(0); i < number; i++ {
pcieSwitchDownstreamPort := govmmQemu.PCIeSwitchDownstreamPortDevice{
ID: fmt.Sprintf("%s%d", config.PCIeSwitchhDownstreamPortPrefix, i),
Bus: pcieSwitchUpstreamPort.ID,
Chassis: fmt.Sprintf("%d", nextChassis),
Slot: strconv.FormatUint(uint64(i), 10),
// TODO: MemReserve: fmt.Sprintf("%dB", memSize32bit),
// TODO: Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
}
devices = append(devices, pcieSwitchDownstreamPort)
}
return devices
}
func (q *qemu) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) {
span, _ := katatrace.Trace(ctx, q.Logger(), "GetThreadIDs", qemuTracingTags, map[string]string{"sandbox_id": q.id})
defer span.End()
@@ -2716,7 +2882,6 @@ func (q *qemu) Save() (s hv.HypervisorState) {
s.UUID = q.state.UUID
s.HotpluggedMemory = q.state.HotpluggedMemory
s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus
s.PCIeRootPort = q.state.PCIeRootPort
for _, bridge := range q.arch.getBridges() {
s.Bridges = append(s.Bridges, hv.Bridge{
@@ -2740,7 +2905,6 @@ func (q *qemu) Load(s hv.HypervisorState) {
q.state.HotpluggedMemory = s.HotpluggedMemory
q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus
q.state.VirtiofsDaemonPid = s.VirtiofsDaemonPid
q.state.PCIeRootPort = s.PCIeRootPort
for _, bridge := range s.Bridges {
q.state.Bridges = append(q.state.Bridges, types.NewBridge(types.Type(bridge.Type), bridge.ID, bridge.DeviceAddr, bridge.Addr))

View File

@@ -124,7 +124,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) {
legacySerial: config.LegacySerial,
},
vmFactory: factory,
snpGuest: config.SevSnpGuest,
snpGuest: config.ConfidentialGuest,
}
if config.ConfidentialGuest {
@@ -308,6 +308,7 @@ func (q *qemuAmd64) appendProtectionDevice(devices []govmmQemu.Device, firmware,
ReducedPhysBits: 1,
}), "", nil
case noneProtection:
return devices, firmware, nil
default:

View File

@@ -140,6 +140,9 @@ type qemuArch interface {
// appendPCIeRootPortDevice appends a pcie-root-port device to pcie.0 bus
appendPCIeRootPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device
// appendPCIeSwitch appends a ioh3420 device to a pcie-root-port
appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device
// append vIOMMU device
appendIOMMU(devices []govmmQemu.Device) ([]govmmQemu.Device, error)
@@ -183,7 +186,8 @@ const (
defaultBridgeBus = "pcie.0"
defaultPCBridgeBus = "pci.0"
maxDevIDSize = 31
pcieRootPortPrefix = "rp"
maxPCIeRootPort = 16 // Limitation from QEMU
maxPCIeSwitchPort = 16 // Limitation from QEMU
)
// This is the PCI start address assigned to the first bridge that
@@ -677,17 +681,17 @@ func (q *qemuArchBase) appendVhostUserDevice(ctx context.Context, devices []govm
}
func (q *qemuArchBase) appendVFIODevice(devices []govmmQemu.Device, vfioDev config.VFIODev) []govmmQemu.Device {
pciDevice := vfioDev.(config.VFIOPCIDev)
if pciDevice.BDF == "" {
if vfioDev.BDF == "" {
return devices
}
devices = append(devices,
govmmQemu.VFIODevice{
BDF: pciDevice.BDF,
VendorID: pciDevice.VendorID,
DeviceID: pciDevice.DeviceID,
Bus: pciDevice.Bus,
BDF: vfioDev.BDF,
VendorID: vfioDev.VendorID,
DeviceID: vfioDev.DeviceID,
Bus: vfioDev.Bus,
},
)
@@ -803,6 +807,13 @@ func (q *qemuArchBase) appendPCIeRootPortDevice(devices []govmmQemu.Device, numb
return genericAppendPCIeRootPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit)
}
// appendPCIeSwitchPortDevice appends a PCIe Switch with <number> ports
func (q *qemuArchBase) appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
return genericAppendPCIeSwitchPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit)
}
// getBARsMaxAddressableMemory we need to know the BAR sizes to configure the
// PCIe Root Port or PCIe Downstream Port attaching a device with huge BARs.
func (q *qemuArchBase) getBARsMaxAddressableMemory() (uint64, uint64) {
pci := nvpci.New()

View File

@@ -470,7 +470,7 @@ func TestQemuArchBaseAppendVFIODevice(t *testing.T) {
},
}
vfDevice := config.VFIOPCIDev{
vfDevice := config.VFIODev{
BDF: bdf,
}
@@ -490,7 +490,7 @@ func TestQemuArchBaseAppendVFIODeviceWithVendorDeviceID(t *testing.T) {
},
}
vfDevice := config.VFIOPCIDev{
vfDevice := config.VFIODev{
BDF: bdf,
VendorID: vendorID,
DeviceID: deviceID,

View File

@@ -111,9 +111,6 @@ func TestQemuCreateVM(t *testing.T) {
config6 := newQemuConfig()
config6.DisableGuestSeLinux = false
config7 := newQemuConfig()
config7.PCIeRootPort = 1
config8 := newQemuConfig()
config8.EnableVhostUserStore = true
config8.HugePages = true
@@ -161,7 +158,6 @@ func TestQemuCreateVM(t *testing.T) {
{config3, false, true},
{config5, false, true},
{config6, false, false},
{config7, false, true},
{config8, false, true},
{config9, true, false},
{config10, false, true},

View File

@@ -32,7 +32,6 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
deviceManager "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
resCtrl "github.com/kata-containers/kata-containers/src/runtime/pkg/resourcecontrol"
exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental"
@@ -101,15 +100,10 @@ type HypervisorPidKey struct{}
// SandboxStatus describes a sandbox status.
type SandboxStatus struct {
ContainersStatus []ContainerStatus
// Annotations allow clients to store arbitrary values,
// for example to add additional status values required
// to support particular specifications.
Annotations map[string]string
Annotations map[string]string
ID string
Hypervisor HypervisorType
ContainersStatus []ContainerStatus
State types.SandboxState
HypervisorConfig HypervisorConfig
}
@@ -176,10 +170,8 @@ type SandboxConfig struct {
// SharePidNs sets all containers to share the same sandbox level pid namespace.
SharePidNs bool
// SystemdCgroup enables systemd cgroup support
SystemdCgroup bool
// SandboxCgroupOnly enables cgroup only at podlevel in the host
SandboxCgroupOnly bool
@@ -623,22 +615,49 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
// If we have a confidential guest we need to cold-plug the PCIe VFIO devices
// until we have TDISP/IDE PCIe support.
coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != hv.NoPort)
var devs []config.DeviceInfo
coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort)
// Aggregate all the containner devices for hot-plug and use them to dedcue
// the correct amount of ports to reserve for the hypervisor.
hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort)
var vfioDevices []config.DeviceInfo
// vhost-user-block device is a PCIe device in Virt, keep track of it
// for correct number of PCIe root ports.
var vhostUserBlkDevices []config.DeviceInfo
for cnt, containers := range sandboxConfig.Containers {
for dev, device := range containers.DeviceInfos {
if coldPlugVFIO && deviceManager.IsVFIO(device.ContainerPath) {
if deviceManager.IsVhostUserBlk(device) {
vhostUserBlkDevices = append(vhostUserBlkDevices, device)
continue
}
isVFIO := deviceManager.IsVFIO(device.ContainerPath)
if hotPlugVFIO && isVFIO {
vfioDevices = append(vfioDevices, device)
sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO
}
if coldPlugVFIO && isVFIO {
device.ColdPlug = true
devs = append(devs, device)
device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO
vfioDevices = append(vfioDevices, device)
// We need to remove the devices marked for cold-plug
// otherwise at the container level the kata-agent
// will try to hot-plug them.
infos := sandboxConfig.Containers[cnt].DeviceInfos
infos = append(infos[:dev], infos[dev+1:]...)
sandboxConfig.Containers[cnt].DeviceInfos = infos
sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging"
}
}
var filteredDevices []config.DeviceInfo
for _, device := range containers.DeviceInfos {
if device.ID != "remove-we-are-cold-plugging" {
filteredDevices = append(filteredDevices, device)
}
}
sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices
}
sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices
sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices
// store doesn't require hypervisor to be stored immediately
if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil {
@@ -653,7 +672,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
return s, nil
}
for _, dev := range devs {
for _, dev := range vfioDevices {
_, err := s.AddDevice(ctx, dev)
if err != nil {
s.Logger().WithError(err).Debug("Cannot cold-plug add device")
@@ -1709,7 +1728,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
defer span.End()
for i := range s.config.Containers {
c, err := newContainer(ctx, s, &s.config.Containers[i])
if err != nil {
return err
@@ -1728,7 +1746,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
if err := s.updateResources(ctx); err != nil {
return err
}
if err := s.resourceControllerUpdate(ctx); err != nil {
return err
}
@@ -1740,7 +1757,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
if err := s.storeSandbox(ctx); err != nil {
return err
}
return nil
}
@@ -1904,15 +1920,11 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy
// adding a group of VFIO devices
for _, dev := range vfioDevices {
if _, err := s.hypervisor.HotplugAddDevice(ctx, dev, VfioDev); err != nil {
bdf := ""
if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok {
bdf = pciDevice.BDF
}
s.Logger().
WithFields(logrus.Fields{
"sandbox": s.id,
"vfio-device-ID": (*dev).GetID(),
"vfio-device-BDF": bdf,
"vfio-device-ID": dev.ID,
"vfio-device-BDF": dev.BDF,
}).WithError(err).Error("failed to hotplug VFIO device")
return err
}
@@ -1927,6 +1939,7 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy
return err
case config.VhostUserBlk:
vhostUserBlkDevice, ok := device.(*drivers.VhostUserBlkDevice)
if !ok {
return fmt.Errorf("device type mismatch, expect device type to be %s", devType)
}
@@ -1961,15 +1974,11 @@ func (s *Sandbox) HotplugRemoveDevice(ctx context.Context, device api.Device, de
// remove a group of VFIO devices
for _, dev := range vfioDevices {
if _, err := s.hypervisor.HotplugRemoveDevice(ctx, dev, VfioDev); err != nil {
bdf := ""
if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok {
bdf = pciDevice.BDF
}
s.Logger().WithError(err).
WithFields(logrus.Fields{
"sandbox": s.id,
"vfio-device-ID": (*dev).GetID(),
"vfio-device-BDF": bdf,
"vfio-device-ID": dev.ID,
"vfio-device-BDF": dev.BDF,
}).Error("failed to hot unplug VFIO device")
return err
}

View File

@@ -593,11 +593,11 @@ func TestSandboxAttachDevicesVFIO(t *testing.T) {
_, err = os.Create(deviceFile)
assert.Nil(t, err)
savedIOMMUPath := config.SysIOMMUPath
config.SysIOMMUPath = tmpDir
savedIOMMUPath := config.SysIOMMUGroupPath
config.SysIOMMUGroupPath = tmpDir
defer func() {
config.SysIOMMUPath = savedIOMMUPath
config.SysIOMMUGroupPath = savedIOMMUPath
}()
dm := manager.NewDeviceManager(config.VirtioSCSI, false, "", 0, nil)