From 57ab408576290a7d7f1b172fd45cac5ecb18916b Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 8 Oct 2021 16:37:28 +1100 Subject: [PATCH] runtime: Introduce "vfio_mode" config variable and annotation In order to support DPDK workloads, we need to change the way VFIO devices will be handled in Kata containers. However, the current method, although it is not remotely OCI compliant, has real uses. Therefore, introduce a new runtime configuration field "vfio_mode" to control how VFIO devices will be presented to the container. We also add a new sandbox annotation - io.katacontainers.config.runtime.vfio_mode - to override this on a per-sandbox basis. For now, the only allowed value is "guest-kernel", which refers to the current behaviour where VFIO devices added to the container will be bound to whatever driver in the VM kernel claims them. Signed-off-by: David Gibson --- src/runtime/Makefile | 2 ++ src/runtime/config/configuration-clh.toml.in | 14 +++++++++++ src/runtime/config/configuration-qemu.toml.in | 14 +++++++++++ .../pkg/katautils/config-settings.go.in | 1 + src/runtime/pkg/katautils/config.go | 14 +++++++++++ .../virtcontainers/device/config/config.go | 25 +++++++++++++++++++ .../pkg/annotations/annotations.go | 4 +++ src/runtime/virtcontainers/pkg/oci/utils.go | 13 ++++++++++ src/runtime/virtcontainers/sandbox.go | 2 ++ 9 files changed, 89 insertions(+) diff --git a/src/runtime/Makefile b/src/runtime/Makefile index 5bfc803f1..4fbbcbfb7 100644 --- a/src/runtime/Makefile +++ b/src/runtime/Makefile @@ -190,6 +190,7 @@ DEFVALIDVHOSTUSERSTOREPATHS := [\"$(DEFVHOSTUSERSTOREPATH)\"] DEFFILEMEMBACKEND := "" DEFVALIDFILEMEMBACKENDS := [\"$(DEFFILEMEMBACKEND)\"] DEFMSIZE9P := 8192 +DEFVFIOMODE := guest-kernel # Default cgroup model DEFSANDBOXCGROUPONLY ?= false @@ -459,6 +460,7 @@ USER_VARS += DEFENTROPYSOURCE USER_VARS += DEFVALIDENTROPYSOURCES USER_VARS += DEFSANDBOXCGROUPONLY USER_VARS += DEFBINDMOUNTS +USER_VARS += DEFVFIOMODE USER_VARS += FEATURE_SELINUX 
USER_VARS += BUILDFLAGS diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in index a128aea6a..076a11394 100644 --- a/src/runtime/config/configuration-clh.toml.in +++ b/src/runtime/config/configuration-clh.toml.in @@ -263,6 +263,20 @@ sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ # These will not be exposed to the container workloads, and are only provided for potential guest services. sandbox_bind_mounts=@DEFBINDMOUNTS@ +# VFIO Mode +# Determines how VFIO devices should be presented to the container. +# Options: +# +# - guest-kernel +# This is a Kata-specific behaviour that's useful in certain cases. +# The VFIO device is managed by whatever driver in the VM kernel +# claims it. This means it will appear as one or more device nodes +# or network interfaces depending on the nature of the device. +# Using this mode requires specially built workloads that know how +# to locate the relevant device interfaces within the VM. +# +vfio_mode="@DEFVFIOMODE@" + # Enabled experimental feature list, format: ["a", "b"]. # Experimental features are features not stable enough for production, # they may break compatibility, and are prepared for a big version bump. diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index f055ae130..f2c7c4fa7 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -543,6 +543,20 @@ sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ # These will not be exposed to the container workloads, and are only provided for potential guest services. sandbox_bind_mounts=@DEFBINDMOUNTS@ +# VFIO Mode +# Determines how VFIO devices should be presented to the container. +# Options: +# +# - guest-kernel +# This is a Kata-specific behaviour that's useful in certain cases. +# The VFIO device is managed by whatever driver in the VM kernel +# claims it. 
This means it will appear as one or more device nodes +# or network interfaces depending on the nature of the device. +# Using this mode requires specially built workloads that know how +# to locate the relevant device interfaces within the VM. +# +vfio_mode="@DEFVFIOMODE@" + # Enabled experimental feature list, format: ["a", "b"]. # Experimental features are features not stable enough for production, # they may break compatibility, and are prepared for a big version bump. diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index 66f30e073..bd793e23f 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -88,6 +88,7 @@ const defaultConfidentialGuest = false const defaultGuestSwap = false const defaultRootlessHypervisor = false const defaultDisableSeccomp = false +const defaultVfioMode = "guest-kernel" var defaultSGXEPCSize = int64(0) diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 93a870e8a..79634ba26 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -143,6 +143,7 @@ type runtime struct { JaegerEndpoint string `toml:"jaeger_endpoint"` JaegerUser string `toml:"jaeger_user"` JaegerPassword string `toml:"jaeger_password"` + VfioMode string `toml:"vfio_mode"` SandboxBindMounts []string `toml:"sandbox_bind_mounts"` Experimental []string `toml:"experimental"` Debug bool `toml:"enable_debug"` @@ -1068,6 +1069,11 @@ func initConfig() (config oci.RuntimeConfig, err error) { return oci.RuntimeConfig{}, err } + err = config.VfioMode.VFIOSetMode(defaultVfioMode) + if err != nil { + return oci.RuntimeConfig{}, err + } + config = oci.RuntimeConfig{ HypervisorType: defaultHypervisor, HypervisorConfig: GetDefaultHypervisorConfig(), @@ -1114,6 +1120,14 @@ func LoadConfiguration(configPath string, ignoreLogging bool) (resolvedConfigPat } } + if tomlConf.Runtime.VfioMode != "" 
{ + err = config.VfioMode.VFIOSetMode(tomlConf.Runtime.VfioMode) + + if err != nil { + return "", config, err + } + } + if !ignoreLogging { err := handleSystemLog("", "") if err != nil { diff --git a/src/runtime/virtcontainers/device/config/config.go b/src/runtime/virtcontainers/device/config/config.go index 8627094a9..5e62d0c17 100644 --- a/src/runtime/virtcontainers/device/config/config.go +++ b/src/runtime/virtcontainers/device/config/config.go @@ -187,6 +187,31 @@ type BlockDrive struct { Swap bool } +// VFIOModeType indicates the behaviour mode for handling VFIO devices in the VM +type VFIOModeType uint32 + +const ( + // VFIOModeGuestKernel specifies Kata-specific behaviour + // useful in certain cases: VFIO devices specified to Kata are + // bound to whatever driver in the VM will take them. This + // requires specialized containers expecting this behaviour to + // locate and use the devices + VFIOModeGuestKernel = iota +) + +const ( + vfioModeGuestKernelStr = "guest-kernel" +) + +func (m *VFIOModeType) VFIOSetMode(modeName string) error { + switch modeName { + case vfioModeGuestKernelStr: + *m = VFIOModeGuestKernel + return nil + } + return fmt.Errorf("Unknown VFIO mode %s", modeName) +} + // VFIODeviceType indicates VFIO device type type VFIODeviceType uint32 diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go index 2a7e88f72..5161e84b1 100644 --- a/src/runtime/virtcontainers/pkg/annotations/annotations.go +++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go @@ -250,6 +250,10 @@ const ( // DisableNewNetNs is a sandbox annotation that determines if create a netns for hypervisor process. 
DisableNewNetNs = kataAnnotRuntimePrefix + "disable_new_netns" + + // VfioMode is a sandbox annotation to specify how attached VFIO devices should be treated + // Overrides the runtime.vfio_mode parameter in the global configuration.toml + VfioMode = kataAnnotRuntimePrefix + "vfio_mode" ) // Agent related annotations diff --git a/src/runtime/virtcontainers/pkg/oci/utils.go b/src/runtime/virtcontainers/pkg/oci/utils.go index ba7977db1..d6fc10d8d 100644 --- a/src/runtime/virtcontainers/pkg/oci/utils.go +++ b/src/runtime/virtcontainers/pkg/oci/utils.go @@ -116,6 +116,10 @@ type RuntimeConfig struct { //the container network interface InterNetworkModel vc.NetInterworkingModel + //Determines how VFIO devices should be presented to the + //container + VfioMode config.VFIOModeType + Debug bool Trace bool @@ -826,6 +830,13 @@ func addRuntimeConfigOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig, r sbConfig.NetworkConfig.InterworkingModel = runtimeConfig.InterNetworkModel } + if value, ok := ocispec.Annotations[vcAnnotations.VfioMode]; ok { + if err := sbConfig.VfioMode.VFIOSetMode(value); err != nil { + return fmt.Errorf("Unknown VFIO mode \"%s\" in annotation %s", + value, vcAnnotations.VfioMode) + } + } + return nil } @@ -893,6 +904,8 @@ func SandboxConfig(ocispec specs.Spec, runtime RuntimeConfig, bundlePath, cid, c ShmSize: shmSize, + VfioMode: runtime.VfioMode, + SystemdCgroup: systemdCgroup, SandboxCgroupOnly: runtime.SandboxCgroupOnly, diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 8ce76497a..5b9bc7fc4 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -134,6 +134,8 @@ type SandboxConfig struct { ShmSize uint64 + VfioMode config.VFIOModeType + // SharePidNs sets all containers to share the same sandbox level pid namespace. SharePidNs bool