runtime: Add support for GPUDirect and GPUDirect RDMA PCIe topology

Fixes: #4491

Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
This commit is contained in:
Zvonko Kaiser
2022-05-19 01:34:24 -07:00
parent 95bec479ca
commit de39fb7d38
34 changed files with 696 additions and 420 deletions

View File

@@ -114,8 +114,8 @@ const (
// SysDevPrefix is static string of /sys/dev
var SysDevPrefix = "/sys/dev"
// SysIOMMUPath is static string of /sys/kernel/iommu_groups
var SysIOMMUPath = "/sys/kernel/iommu_groups"
// SysIOMMUGroupPath is static string of /sys/kernel/iommu_groups
var SysIOMMUGroupPath = "/sys/kernel/iommu_groups"
// SysBusPciDevicesPath is static string of /sys/bus/pci/devices
var SysBusPciDevicesPath = "/sys/bus/pci/devices"
@@ -268,14 +268,8 @@ const (
VFIOAPDeviceMediatedType
)
type VFIODev interface {
GetID() *string
GetType() VFIODeviceType
GetSysfsDev() *string
}
// VFIOPCIDev represents a VFIO PCI device used for hotplugging
type VFIOPCIDev struct {
type VFIODev struct {
// ID is used to identify this drive in the hypervisor options.
ID string
@@ -305,44 +299,12 @@ type VFIOPCIDev struct {
// IsPCIe specifies device is PCIe or PCI
IsPCIe bool
}
func (d VFIOPCIDev) GetID() *string {
return &d.ID
}
func (d VFIOPCIDev) GetType() VFIODeviceType {
return d.Type
}
func (d VFIOPCIDev) GetSysfsDev() *string {
return &d.SysfsDev
}
type VFIOAPDev struct {
// ID is used to identify this drive in the hypervisor options.
ID string
// sysfsdev of VFIO mediated device
SysfsDev string
// APDevices are the Adjunct Processor devices assigned to the mdev
APDevices []string
// Type of VFIO device
Type VFIODeviceType
}
func (d VFIOAPDev) GetID() *string {
return &d.ID
}
func (d VFIOAPDev) GetType() VFIODeviceType {
return d.Type
}
func (d VFIOAPDev) GetSysfsDev() *string {
return &d.SysfsDev
// Rank identifies a device in an IOMMU group
Rank int
}
// RNGDev represents a random number generator device

View File

@@ -47,9 +47,9 @@ func deviceLogger() *logrus.Entry {
return api.DeviceLogger()
}
// Identify PCIe device by reading the size of the PCI config space
// IsPCIeDevice identifies a PCIe device by reading the size of the PCI config space.
// Plain PCI devices have 256 bytes of config space whereas PCIe devices have 4K.
func isPCIeDevice(bdf string) bool {
func IsPCIeDevice(bdf string) bool {
if len(strings.Split(bdf, ":")) == 2 {
bdf = PCIDomain + ":" + bdf
}
@@ -164,7 +164,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme
vfioDevs := []*config.VFIODev{}
vfioGroup := filepath.Base(device.HostPath)
iommuDevicesPath := filepath.Join(config.SysIOMMUPath, vfioGroup, "devices")
iommuDevicesPath := filepath.Join(config.SysIOMMUGroupPath, vfioGroup, "devices")
deviceFiles, err := os.ReadDir(iommuDevicesPath)
if err != nil {
@@ -174,7 +174,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme
// Pass all devices in iommu group
for i, deviceFile := range deviceFiles {
//Get bdf of device eg 0000:00:1c.0
deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(deviceFile.Name(), iommuDevicesPath)
deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(deviceFile.Name(), iommuDevicesPath)
if err != nil {
return nil, err
}
@@ -196,15 +196,16 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
isPCIe := isPCIeDevice(deviceBDF)
isPCIe := IsPCIeDevice(deviceBDF)
// Do not directly assign to `vfio` -- need to access field still
vfioPCI := config.VFIOPCIDev{
vfioPCI := config.VFIODev{
ID: id,
Type: vfioDeviceType,
BDF: deviceBDF,
SysfsDev: deviceSysfsDev,
IsPCIe: isPCIe,
Class: pciClass,
Rank: -1,
}
if isPCIe && !ignoreBusAssignment {
vfioPCI.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs))
@@ -216,7 +217,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme
if err != nil {
return nil, err
}
vfio = config.VFIOAPDev{
vfio = config.VFIODev{
ID: id,
SysfsDev: deviceSysfsDev,
Type: config.VFIOAPDeviceMediatedType,

View File

@@ -33,6 +33,9 @@ const (
)
var (
	// AllPCIeDevs records the PCIe devices seen so far, keyed by their BDF.
	// The number of entries is used to deduce the next bus number, and keying
	// by BDF ensures the very same device is not accounted for twice when a
	// user provides it multiple times.
	AllPCIeDevs = map[string]bool{}
)
@@ -169,23 +172,18 @@ func (device *VFIODevice) Load(ds config.DeviceState) {
for _, dev := range ds.VFIODevs {
var vfio config.VFIODev
vfioDeviceType := (*device.VfioDevs[0]).GetType()
switch vfioDeviceType {
switch dev.Type {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
bdf := ""
if pciDev, ok := (*dev).(config.VFIOPCIDev); ok {
bdf = pciDev.BDF
}
vfio = config.VFIOPCIDev{
ID: *(*dev).GetID(),
Type: config.VFIODeviceType((*dev).GetType()),
BDF: bdf,
SysfsDev: *(*dev).GetSysfsDev(),
vfio = config.VFIODev{
ID: dev.ID,
Type: config.VFIODeviceType(dev.Type),
BDF: dev.BDF,
SysfsDev: dev.SysfsDev,
}
case config.VFIOAPDeviceMediatedType:
vfio = config.VFIOAPDev{
ID: *(*dev).GetID(),
SysfsDev: *(*dev).GetSysfsDev(),
vfio = config.VFIODev{
ID: dev.ID,
SysfsDev: dev.SysfsDev,
}
default:
deviceLogger().WithError(
@@ -200,7 +198,7 @@ func (device *VFIODevice) Load(ds config.DeviceState) {
// It should implement GetAttachCount() and DeviceID() as api.Device implementation
// here it shares function from *GenericDevice so we don't need duplicate codes
func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
func GetVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
vfioDeviceType, err = GetVFIODeviceType(sysfsDevStr)
if err != nil {

View File

@@ -29,7 +29,7 @@ func TestGetVFIODetails(t *testing.T) {
}
for _, d := range data {
deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(d.deviceStr, "")
deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(d.deviceStr, "")
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType:

View File

@@ -116,14 +116,14 @@ func TestAttachVFIODevice(t *testing.T) {
_, err = os.Create(deviceConfigFile)
assert.Nil(t, err)
savedIOMMUPath := config.SysIOMMUPath
config.SysIOMMUPath = tmpDir
savedIOMMUPath := config.SysIOMMUGroupPath
config.SysIOMMUGroupPath = tmpDir
savedSysBusPciDevicesPath := config.SysBusPciDevicesPath
config.SysBusPciDevicesPath = devicesDir
defer func() {
config.SysIOMMUPath = savedIOMMUPath
config.SysIOMMUGroupPath = savedIOMMUPath
config.SysBusPciDevicesPath = savedSysBusPciDevicesPath
}()

View File

@@ -123,6 +123,14 @@ const (
// PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port.
PCIeRootPort DeviceDriver = "pcie-root-port"
// PCIeSwitchUpstreamPort is a PCIe switch upstream port
// An upstream port connects to a PCIe Root Port
PCIeSwitchUpstreamPort DeviceDriver = "x3130-upstream"
// PCIeSwitchDownstreamPort is a PCIe switch downstream port
// PCIe devices can be hot-plugged to the downstream port.
PCIeSwitchDownstreamPort DeviceDriver = "xio3130-downstream"
// Loader is the Loader device driver.
Loader DeviceDriver = "loader"
@@ -1681,6 +1689,106 @@ func (b PCIeRootPortDevice) Valid() bool {
return true
}
// PCIeSwitchUpstreamPortDevice describes the upstream port of a PCIe switch,
// i.e. the switch port that connects to a PCIe root port.
type PCIeSwitchUpstreamPortDevice struct {
	// ID is the device identifier; format: sup{n}, n>=0
	ID string
	// Bus is the bus (root port) this upstream port is attached to;
	// default is rp0
	Bus string
}
// QemuParams builds the qemu command line arguments describing this
// PCIeSwitchUpstreamPortDevice: a "-device" flag followed by a single
// comma-separated property string (driver, id and bus).
func (b PCIeSwitchUpstreamPortDevice) QemuParams(config *Config) []string {
	properties := []string{
		fmt.Sprintf("%s,id=%s", PCIeSwitchUpstreamPort, b.ID),
		fmt.Sprintf("bus=%s", b.Bus),
	}

	return []string{"-device", strings.Join(properties, ",")}
}
// Valid reports whether the PCIeSwitchUpstreamPortDevice is complete, which
// requires both the ID and the Bus to be non-empty.
func (b PCIeSwitchUpstreamPortDevice) Valid() bool {
	return b.ID != "" && b.Bus != ""
}
// PCIeSwitchDownstreamPortDevice describes a downstream port of a PCIe
// switch; PCIe devices can be hot-plugged to a downstream port.
// NOTE(review): the previous comment said this port connects "to the root
// port", which looks copied from the upstream port type — presumably a
// downstream port attaches to a switch upstream port; confirm.
type PCIeSwitchDownstreamPortDevice struct {
	// ID is the device identifier.
	// NOTE(review): previously documented as "format: sup{n}, n>=0", which
	// looks copied from the upstream port — confirm the actual naming scheme.
	ID string
	// Bus is the bus this downstream port is attached to.
	// NOTE(review): previously documented as "default is rp0" — confirm
	// whether this should name a switch upstream port instead.
	Bus string
	// Chassis: the (slot, chassis) pair is mandatory and must be unique for
	// each downstream port, >=0, default is 0x00
	Chassis string
	// Slot: >=0, default is 0x00
	Slot string
	// BusReserve needs patches to QEMU in order to work
	BusReserve string
	// Pref64 and Pref32 are not allowed to be set simultaneously
	Pref64Reserve string // reserve prefetched MMIO aperture, 64-bit
	Pref32Reserve string // reserve prefetched MMIO aperture, 32-bit
	MemReserve    string // reserve non-prefetched MMIO aperture, 32-bit *only*
	IOReserve     string // IO reservation
}
// QemuParams returns the qemu parameters built out of the
// PCIeSwitchDownstreamPortDevice: a "-device" flag followed by one
// comma-separated property string. Driver, id, bus, chassis and slot are
// always emitted; each reservation property is appended only when set.
func (b PCIeSwitchDownstreamPortDevice) QemuParams(config *Config) []string {
	properties := []string{
		fmt.Sprintf("%s,id=%s", PCIeSwitchDownstreamPort, b.ID),
		fmt.Sprintf("bus=%s", b.Bus),
		fmt.Sprintf("chassis=%s", b.Chassis),
		fmt.Sprintf("slot=%s", b.Slot),
	}

	// Optional properties, listed in the order they must appear.
	optional := []struct {
		format string
		value  string
	}{
		{"bus-reserve=%s", b.BusReserve},
		{"pref64-reserve=%s", b.Pref64Reserve},
		{"pref32-reserve=%s", b.Pref32Reserve},
		{"mem-reserve=%s", b.MemReserve},
		{"io-reserve=%s", b.IOReserve},
	}
	for _, opt := range optional {
		if opt.value == "" {
			continue
		}
		properties = append(properties, fmt.Sprintf(opt.format, opt.value))
	}

	return []string{"-device", strings.Join(properties, ",")}
}
// Valid returns true if the PCIeSwitchDownstreamPortDevice structure is valid
// and complete.
//
// ID, Bus, Chassis and Slot are mandatory. Pref64Reserve and Pref32Reserve
// are mutually exclusive, as documented on the struct fields, so a device
// with both set is reported as invalid.
func (b PCIeSwitchDownstreamPortDevice) Valid() bool {
	if b.ID == "" || b.Bus == "" {
		return false
	}
	// The (slot, chassis) pair is mandatory for every downstream port.
	if b.Chassis == "" || b.Slot == "" {
		return false
	}
	// Only one prefetchable MMIO reservation (32-bit or 64-bit) may be given.
	if b.Pref64Reserve != "" && b.Pref32Reserve != "" {
		return false
	}
	return true
}
// VFIODevice represents a qemu vfio device meant for direct access by guest OS.
type VFIODevice struct {
// Bus-Device-Function of device

View File

@@ -656,6 +656,7 @@ func (q *QMP) executeCommand(ctx context.Context, name string, args map[string]i
filter *qmpEventFilter) error {
_, err := q.executeCommandWithResponse(ctx, name, args, nil, filter)
return err
}
@@ -1191,6 +1192,7 @@ func (q *QMP) ExecutePCIVFIODeviceAdd(ctx context.Context, devID, bdf, addr, bus
if bus != "" {
args["bus"] = bus
}
return q.executeCommand(ctx, "device_add", args, nil)
}

View File

@@ -58,7 +58,6 @@ func (p PCIePort) String() string {
type HypervisorState struct {
BlockIndexMap map[int]struct{}
// Type of hypervisor, E.g. qemu/firecracker/acrn.
Type string
UUID string
@@ -68,6 +67,7 @@ type HypervisorState struct {
// The fields below are qemu specific
// Refs: virtcontainers/qemu.go:QemuState
Bridges []Bridge
// HotpluggedCPUs is the list of CPUs that were hot-added
HotpluggedVCPUs []CPUDevice
@@ -75,6 +75,8 @@ type HypervisorState struct {
VirtiofsDaemonPid int
Pid int
PCIeRootPort int
PCIeSwitchPort int
ColdPlugVFIO PCIePort
HotPlugVFIO PCIePort
HotplugVFIOOnRootBus bool
}

View File

@@ -109,3 +109,5 @@ const defaultVMCacheEndpoint string = "/var/run/kata-containers/cache.sock"
var defaultRuntimeConfiguration = "@CONFIG_PATH@"
const defaultColdPlugVFIO = hv.NoPort
const defaultHotPlugVFIO = hv.BridgePort
const defaultPCIeSwitchPort = 0

View File

@@ -76,7 +76,6 @@ type factory struct {
VMCacheNumber uint `toml:"vm_cache_number"`
Template bool `toml:"enable_template"`
}
type hypervisor struct {
Path string `toml:"path"`
JailerPath string `toml:"jailer_path"`
@@ -131,6 +130,7 @@ type hypervisor struct {
MemSlots uint32 `toml:"memory_slots"`
DefaultBridges uint32 `toml:"default_bridges"`
Msize9p uint32 `toml:"msize_9p"`
PCIeSwitchPort uint32 `toml:"pcie_switch_port"`
PCIeRootPort uint32 `toml:"pcie_root_port"`
NumVCPUs int32 `toml:"default_vcpus"`
BlockDeviceCacheSet bool `toml:"block_device_cache_set"`
@@ -149,6 +149,7 @@ type hypervisor struct {
EnableIOThreads bool `toml:"enable_iothreads"`
DisableImageNvdimm bool `toml:"disable_image_nvdimm"`
HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"`
HotPlugVFIO hv.PCIePort `toml:"hotplug_vfio"`
ColdPlugVFIO hv.PCIePort `toml:"cold_plug_vfio"`
DisableVhostNet bool `toml:"disable_vhost_net"`
GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"`
@@ -293,6 +294,12 @@ func (h hypervisor) coldPlugVFIO() hv.PCIePort {
}
return h.ColdPlugVFIO
}
// hotPlugVFIO returns the configured hot-plug VFIO port, falling back to
// defaultHotPlugVFIO when the configuration leaves it empty.
func (h hypervisor) hotPlugVFIO() hv.PCIePort {
	if port := h.HotPlugVFIO; port != "" {
		return port
	}
	return defaultHotPlugVFIO
}
func (h hypervisor) firmwareVolume() (string, error) {
p := h.FirmwareVolume
@@ -864,7 +871,9 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
DisableImageNvdimm: h.DisableImageNvdimm,
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
ColdPlugVFIO: h.coldPlugVFIO(),
HotPlugVFIO: h.HotPlugVFIO,
PCIeRootPort: h.PCIeRootPort,
PCIeSwitchPort: h.PCIeSwitchPort,
DisableVhostNet: h.DisableVhostNet,
EnableVhostUserStore: h.EnableVhostUserStore,
VhostUserStorePath: h.vhostUserStorePath(),
@@ -877,7 +886,6 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
GuestMemoryDumpPath: h.GuestMemoryDumpPath,
GuestMemoryDumpPaging: h.GuestMemoryDumpPaging,
ConfidentialGuest: h.ConfidentialGuest,
SevSnpGuest: h.SevSnpGuest,
GuestSwap: h.GuestSwap,
Rootless: h.Rootless,
LegacySerial: h.LegacySerial,
@@ -1059,7 +1067,9 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
Msize9p: h.msize9p(),
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
ColdPlugVFIO: h.coldPlugVFIO(),
HotPlugVFIO: h.hotPlugVFIO(),
PCIeRootPort: h.PCIeRootPort,
PCIeSwitchPort: h.PCIeSwitchPort,
DisableVhostNet: true,
GuestHookPath: h.guestHookPath(),
VirtioFSExtraArgs: h.VirtioFSExtraArgs,
@@ -1290,6 +1300,8 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
Msize9p: defaultMsize9p,
HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus,
ColdPlugVFIO: defaultColdPlugVFIO,
HotPlugVFIO: defaultHotPlugVFIO,
PCIeSwitchPort: defaultPCIeSwitchPort,
PCIeRootPort: defaultPCIeRootPort,
GuestHookPath: defaultGuestHookPath,
VhostUserStorePath: defaultVhostUserStorePath,

View File

@@ -566,6 +566,7 @@ func TestMinimalRuntimeConfig(t *testing.T) {
GuestHookPath: defaultGuestHookPath,
VhostUserStorePath: defaultVhostUserStorePath,
VirtioFSCache: defaultVirtioFSCacheMode,
HotPlugVFIO: defaultHotPlugVFIO,
BlockDeviceAIO: defaultBlockDeviceAIO,
DisableGuestSeLinux: defaultDisableGuestSeLinux,
ColdPlugVFIO: defaultColdPlugVFIO,

View File

@@ -821,22 +821,22 @@ func TestRegexpContains(t *testing.T) {
//nolint: govet
type testData struct {
regexps []string
toMatch string
regexps []string
expected bool
}
data := []testData{
{[]string{}, "", false},
{[]string{}, "nonempty", false},
{[]string{"simple"}, "simple", true},
{[]string{"simple"}, "some_simple_text", true},
{[]string{"simple"}, "simp", false},
{[]string{"one", "two"}, "one", true},
{[]string{"one", "two"}, "two", true},
{[]string{"o*"}, "oooo", true},
{[]string{"o*"}, "oooa", true},
{[]string{"^o*$"}, "oooa", false},
{regexps: []string{}, toMatch: "", expected: false},
{regexps: []string{}, toMatch: "nonempty", expected: false},
{regexps: []string{"simple"}, toMatch: "simple", expected: true},
{regexps: []string{"simple"}, toMatch: "some_simple_text", expected: true},
{regexps: []string{"simple"}, toMatch: "simp", expected: false},
{regexps: []string{"one", "two"}, toMatch: "one", expected: true},
{regexps: []string{"one", "two"}, toMatch: "two", expected: true},
{regexps: []string{"o*"}, toMatch: "oooo", expected: true},
{regexps: []string{"o*"}, toMatch: "oooa", expected: true},
{regexps: []string{"^o*$"}, toMatch: "oooa", expected: false},
}
for _, d := range data {
@@ -850,25 +850,25 @@ func TestCheckPathIsInGlobs(t *testing.T) {
//nolint: govet
type testData struct {
globs []string
toMatch string
globs []string
expected bool
}
data := []testData{
{[]string{}, "", false},
{[]string{}, "nonempty", false},
{[]string{"simple"}, "simple", false},
{[]string{"simple"}, "some_simple_text", false},
{[]string{"/bin/ls"}, "/bin/ls", true},
{[]string{"/bin/ls", "/bin/false"}, "/bin/ls", true},
{[]string{"/bin/ls", "/bin/false"}, "/bin/false", true},
{[]string{"/bin/ls", "/bin/false"}, "/bin/bar", false},
{[]string{"/bin/*ls*"}, "/bin/ls", true},
{[]string{"/bin/*ls*"}, "/bin/false", true},
{[]string{"bin/ls"}, "/bin/ls", false},
{[]string{"./bin/ls"}, "/bin/ls", false},
{[]string{"*/bin/ls"}, "/bin/ls", false},
{globs: []string{}, toMatch: "", expected: false},
{globs: []string{}, toMatch: "nonempty", expected: false},
{globs: []string{"simple"}, toMatch: "simple", expected: false},
{globs: []string{"simple"}, toMatch: "some_simple_text", expected: false},
{globs: []string{"/bin/ls"}, toMatch: "/bin/ls", expected: true},
{globs: []string{"/bin/ls", "/bin/false"}, toMatch: "/bin/ls", expected: true},
{globs: []string{"/bin/ls", "/bin/false"}, toMatch: "/bin/false", expected: true},
{globs: []string{"/bin/ls", "/bin/false"}, toMatch: "/bin/bar", expected: false},
{globs: []string{"/bin/*ls*"}, toMatch: "/bin/ls", expected: true},
{globs: []string{"/bin/*ls*"}, toMatch: "/bin/false", expected: true},
{globs: []string{"bin/ls"}, toMatch: "/bin/ls", expected: false},
{globs: []string{"./bin/ls"}, toMatch: "/bin/ls", expected: false},
{globs: []string{"*/bin/ls"}, toMatch: "/bin/ls", expected: false},
}
for _, d := range data {
@@ -923,10 +923,10 @@ func TestParseAnnotationUintConfiguration(t *testing.T) {
// nolint: govet
testCases := []struct {
annotations map[string]string
expected uint64
err error
annotations map[string]string
validFunc func(uint64) error
expected uint64
}{
{
annotations: map[string]string{key: ""},
@@ -1007,10 +1007,10 @@ func TestParseAnnotationBoolConfiguration(t *testing.T) {
// nolint: govet
testCases := []struct {
err error
annotationKey string
annotationValueList []string
expected bool
err error
}{
{
annotationKey: boolKey,
@@ -1207,8 +1207,8 @@ func TestNewMount(t *testing.T) {
assert := assert.New(t)
testCases := []struct {
out vc.Mount
in specs.Mount
out vc.Mount
}{
{
in: specs.Mount{