diff --git a/src/runtime/Makefile b/src/runtime/Makefile index 5bfc803f1..4fbbcbfb7 100644 --- a/src/runtime/Makefile +++ b/src/runtime/Makefile @@ -190,6 +190,7 @@ DEFVALIDVHOSTUSERSTOREPATHS := [\"$(DEFVHOSTUSERSTOREPATH)\"] DEFFILEMEMBACKEND := "" DEFVALIDFILEMEMBACKENDS := [\"$(DEFFILEMEMBACKEND)\"] DEFMSIZE9P := 8192 +DEFVFIOMODE := guest-kernel # Default cgroup model DEFSANDBOXCGROUPONLY ?= false @@ -459,6 +460,7 @@ USER_VARS += DEFENTROPYSOURCE USER_VARS += DEFVALIDENTROPYSOURCES USER_VARS += DEFSANDBOXCGROUPONLY USER_VARS += DEFBINDMOUNTS +USER_VARS += DEFVFIOMODE USER_VARS += FEATURE_SELINUX USER_VARS += BUILDFLAGS diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in index a128aea6a..076a11394 100644 --- a/src/runtime/config/configuration-clh.toml.in +++ b/src/runtime/config/configuration-clh.toml.in @@ -263,6 +263,20 @@ sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ # These will not be exposed to the container workloads, and are only provided for potential guest services. sandbox_bind_mounts=@DEFBINDMOUNTS@ +# VFIO Mode +# Determines how VFIO devices should be presented to the container. +# Options: +# +# - guest-kernel +# This is a Kata-specific behaviour that's useful in certain cases. +# The VFIO device is managed by whatever driver in the VM kernel +# claims it. This means it will appear as one or more device nodes +# or network interfaces depending on the nature of the device. +# Using this mode requires specially built workloads that know how +# to locate the relevant device interfaces within the VM. +# +vfio_mode="@DEFVFIOMODE@" + # Enabled experimental feature list, format: ["a", "b"]. # Experimental features are features not stable enough for production, # they may break compatibility, and are prepared for a big version bump. 
diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index f055ae130..f2c7c4fa7 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -543,6 +543,20 @@ sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ # These will not be exposed to the container workloads, and are only provided for potential guest services. sandbox_bind_mounts=@DEFBINDMOUNTS@ +# VFIO Mode +# Determines how VFIO devices should be presented to the container. +# Options: +# +# - guest-kernel +# This is a Kata-specific behaviour that's useful in certain cases. +# The VFIO device is managed by whatever driver in the VM kernel +# claims it. This means it will appear as one or more device nodes +# or network interfaces depending on the nature of the device. +# Using this mode requires specially built workloads that know how +# to locate the relevant device interfaces within the VM. +# +vfio_mode="@DEFVFIOMODE@" + # Enabled experimental feature list, format: ["a", "b"]. # Experimental features are features not stable enough for production, # they may break compatibility, and are prepared for a big version bump. 
diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index 66f30e073..bd793e23f 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -88,6 +88,7 @@ const defaultConfidentialGuest = false const defaultGuestSwap = false const defaultRootlessHypervisor = false const defaultDisableSeccomp = false +const defaultVfioMode = "guest-kernel" var defaultSGXEPCSize = int64(0) diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 93a870e8a..79634ba26 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -143,6 +143,7 @@ type runtime struct { JaegerEndpoint string `toml:"jaeger_endpoint"` JaegerUser string `toml:"jaeger_user"` JaegerPassword string `toml:"jaeger_password"` + VfioMode string `toml:"vfio_mode"` SandboxBindMounts []string `toml:"sandbox_bind_mounts"` Experimental []string `toml:"experimental"` Debug bool `toml:"enable_debug"` @@ -1068,6 +1069,11 @@ func initConfig() (config oci.RuntimeConfig, err error) { return oci.RuntimeConfig{}, err } + err = config.VfioMode.VFIOSetMode(defaultVfioMode) + if err != nil { + return oci.RuntimeConfig{}, err + } + config = oci.RuntimeConfig{ HypervisorType: defaultHypervisor, HypervisorConfig: GetDefaultHypervisorConfig(), @@ -1114,6 +1120,14 @@ func LoadConfiguration(configPath string, ignoreLogging bool) (resolvedConfigPat } } + if tomlConf.Runtime.VfioMode != "" { + err = config.VfioMode.VFIOSetMode(tomlConf.Runtime.VfioMode) + + if err != nil { + return "", config, err + } + } + if !ignoreLogging { err := handleSystemLog("", "") if err != nil { diff --git a/src/runtime/virtcontainers/device/config/config.go b/src/runtime/virtcontainers/device/config/config.go index 8627094a9..5e62d0c17 100644 --- a/src/runtime/virtcontainers/device/config/config.go +++ b/src/runtime/virtcontainers/device/config/config.go @@ -187,6 +187,31 @@ type 
BlockDrive struct { Swap bool } +// VFIOModeType indicates the behaviour mode for handling devices in the VM +type VFIOModeType uint32 + +const ( + // VFIOModeGuestKernel specifies Kata-specific behaviour + // useful in certain cases: VFIO devices specified to Kata are + // bound to whatever driver in the VM will take them. This + // requires specialized containers expecting this behaviour to + // locate and use the devices + VFIOModeGuestKernel = iota +) + +const ( + vfioModeGuestKernelStr = "guest-kernel" +) + +func (m *VFIOModeType) VFIOSetMode(modeName string) error { + switch modeName { + case vfioModeGuestKernelStr: + *m = VFIOModeGuestKernel + return nil + } + return fmt.Errorf("Unknown VFIO mode %s", modeName) +} + // VFIODeviceType indicates VFIO device type type VFIODeviceType uint32 diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go index 2a7e88f72..5161e84b1 100644 --- a/src/runtime/virtcontainers/pkg/annotations/annotations.go +++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go @@ -250,6 +250,10 @@ const ( // DisableNewNetNs is a sandbox annotation that determines if create a netns for hypervisor process. 
DisableNewNetNs = kataAnnotRuntimePrefix + "disable_new_netns" + + // VfioMode is a sandbox annotation to specify how attached VFIO devices should be treated + // Overrides the runtime.vfio_mode parameter in the global configuration.toml + VfioMode = kataAnnotRuntimePrefix + "vfio_mode" ) // Agent related annotations diff --git a/src/runtime/virtcontainers/pkg/oci/utils.go b/src/runtime/virtcontainers/pkg/oci/utils.go index ba7977db1..d6fc10d8d 100644 --- a/src/runtime/virtcontainers/pkg/oci/utils.go +++ b/src/runtime/virtcontainers/pkg/oci/utils.go @@ -116,6 +116,10 @@ type RuntimeConfig struct { //the container network interface InterNetworkModel vc.NetInterworkingModel + //Determines how VFIO devices should be presented to the + //container + VfioMode config.VFIOModeType + Debug bool Trace bool @@ -826,6 +830,13 @@ func addRuntimeConfigOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig, r sbConfig.NetworkConfig.InterworkingModel = runtimeConfig.InterNetworkModel } + if value, ok := ocispec.Annotations[vcAnnotations.VfioMode]; ok { + if err := sbConfig.VfioMode.VFIOSetMode(value); err != nil { + return fmt.Errorf("Unknown VFIO mode \"%s\" in annotation %s", + value, vcAnnotations.VfioMode) + } + } + return nil } @@ -893,6 +904,8 @@ func SandboxConfig(ocispec specs.Spec, runtime RuntimeConfig, bundlePath, cid, c ShmSize: shmSize, + VfioMode: runtime.VfioMode, + SystemdCgroup: systemdCgroup, SandboxCgroupOnly: runtime.SandboxCgroupOnly, diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 8ce76497a..5b9bc7fc4 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -134,6 +134,8 @@ type SandboxConfig struct { ShmSize uint64 + VfioMode config.VFIOModeType + // SharePidNs sets all containers to share the same sandbox level pid namespace. SharePidNs bool