mirror of
https://github.com/aljazceru/kata-containers.git
synced 2025-12-20 15:54:19 +01:00
Merge pull request #2795 from dgibson/vfio-as-vfio
Allow VFIO devices to be used as VFIO devices in the container
This commit is contained in:
@@ -3,7 +3,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use libc::uid_t;
|
||||
use nix::errno::Errno;
|
||||
use nix::fcntl::{self, OFlag};
|
||||
@@ -19,7 +19,7 @@ use std::fs::{self, OpenOptions};
|
||||
use std::mem::MaybeUninit;
|
||||
use std::os::unix;
|
||||
use std::os::unix::io::RawFd;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::path::{Component, Path, PathBuf};
|
||||
|
||||
use path_absolutize::*;
|
||||
use std::fs::File;
|
||||
@@ -828,18 +828,35 @@ fn default_symlinks() -> Result<()> {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Convert an absolute device path into the same path relative to the
/// filesystem root (i.e. with the leading "/" removed).
///
/// Returns `None` unless `path` lies strictly below `/dev` (the bare
/// `/dev` directory itself is rejected) and contains no `..` component.
/// Comparison is done on path components, so repeated separators such
/// as `//dev//foo` are accepted, while `/devfoo` is not.
fn dev_rel_path(path: &str) -> Option<&Path> {
    let abs = Path::new(path);

    // Must be something underneath /dev, not /dev itself
    let below_dev = abs.starts_with("/dev") && abs != Path::new("/dev");

    // A ".." component could walk back out of /dev
    let escapes = abs
        .components()
        .any(|part| matches!(part, Component::ParentDir));

    if below_dev && !escapes {
        abs.strip_prefix("/").ok()
    } else {
        None
    }
}
|
||||
|
||||
fn create_devices(devices: &[LinuxDevice], bind: bool) -> Result<()> {
|
||||
let op: fn(&LinuxDevice) -> Result<()> = if bind { bind_dev } else { mknod_dev };
|
||||
let op: fn(&LinuxDevice, &Path) -> Result<()> = if bind { bind_dev } else { mknod_dev };
|
||||
let old = stat::umask(Mode::from_bits_truncate(0o000));
|
||||
for dev in DEFAULT_DEVICES.iter() {
|
||||
op(dev)?;
|
||||
let path = Path::new(&dev.path[1..]);
|
||||
op(dev, path).context(format!("Creating container device {:?}", dev))?;
|
||||
}
|
||||
for dev in devices {
|
||||
if !dev.path.starts_with("/dev") || dev.path.contains("..") {
|
||||
let path = dev_rel_path(&dev.path).ok_or_else(|| {
|
||||
let msg = format!("{} is not a valid device path", dev.path);
|
||||
bail!(anyhow!(msg));
|
||||
anyhow!(msg)
|
||||
})?;
|
||||
if let Some(dir) = path.parent() {
|
||||
fs::create_dir_all(dir).context(format!("Creating container device {:?}", dev))?;
|
||||
}
|
||||
op(dev)?;
|
||||
op(dev, path).context(format!("Creating container device {:?}", dev))?;
|
||||
}
|
||||
stat::umask(old);
|
||||
Ok(())
|
||||
@@ -861,21 +878,21 @@ lazy_static! {
|
||||
};
|
||||
}
|
||||
|
||||
fn mknod_dev(dev: &LinuxDevice) -> Result<()> {
|
||||
fn mknod_dev(dev: &LinuxDevice, relpath: &Path) -> Result<()> {
|
||||
let f = match LINUXDEVICETYPE.get(dev.r#type.as_str()) {
|
||||
Some(v) => v,
|
||||
None => return Err(anyhow!("invalid spec".to_string())),
|
||||
};
|
||||
|
||||
stat::mknod(
|
||||
&dev.path[1..],
|
||||
relpath,
|
||||
*f,
|
||||
Mode::from_bits_truncate(dev.file_mode.unwrap_or(0)),
|
||||
nix::sys::stat::makedev(dev.major as u64, dev.minor as u64),
|
||||
)?;
|
||||
|
||||
unistd::chown(
|
||||
&dev.path[1..],
|
||||
relpath,
|
||||
Some(Uid::from_raw(dev.uid.unwrap_or(0) as uid_t)),
|
||||
Some(Gid::from_raw(dev.gid.unwrap_or(0) as uid_t)),
|
||||
)?;
|
||||
@@ -883,9 +900,9 @@ fn mknod_dev(dev: &LinuxDevice) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn bind_dev(dev: &LinuxDevice) -> Result<()> {
|
||||
fn bind_dev(dev: &LinuxDevice, relpath: &Path) -> Result<()> {
|
||||
let fd = fcntl::open(
|
||||
&dev.path[1..],
|
||||
relpath,
|
||||
OFlag::O_RDWR | OFlag::O_CREAT,
|
||||
Mode::from_bits_truncate(0o644),
|
||||
)?;
|
||||
@@ -894,7 +911,7 @@ fn bind_dev(dev: &LinuxDevice) -> Result<()> {
|
||||
|
||||
mount(
|
||||
Some(&*dev.path),
|
||||
&dev.path[1..],
|
||||
relpath,
|
||||
None::<&str>,
|
||||
MsFlags::MS_BIND,
|
||||
None::<&str>,
|
||||
@@ -1258,11 +1275,12 @@ mod tests {
|
||||
uid: Some(unistd::getuid().as_raw()),
|
||||
gid: Some(unistd::getgid().as_raw()),
|
||||
};
|
||||
let path = Path::new("fifo");
|
||||
|
||||
let ret = mknod_dev(&dev);
|
||||
let ret = mknod_dev(&dev, path);
|
||||
assert!(ret.is_ok(), "Should pass. Got: {:?}", ret);
|
||||
|
||||
let ret = stat::stat("fifo");
|
||||
let ret = stat::stat(path);
|
||||
assert!(ret.is_ok(), "Should pass. Got: {:?}", ret);
|
||||
}
|
||||
#[test]
|
||||
@@ -1379,4 +1397,26 @@ mod tests {
|
||||
assert!(result == t.result, "{}", msg);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn test_dev_rel_path() {
    // Valid device paths, paired with the relative path expected back
    let accepted = [
        ("/dev/sda", "dev/sda"),
        ("//dev/sda", "dev/sda"),
        ("/dev/vfio/99", "dev/vfio/99"),
        ("/dev/...", "dev/..."),
        ("/dev/a..b", "dev/a..b"),
        ("/dev//foo", "dev/foo"),
    ];
    for (input, expected) in accepted.iter() {
        assert_eq!(dev_rel_path(*input).unwrap(), Path::new(*expected));
    }

    // Bad device paths
    let rejected = [
        "/devfoo",
        "/etc/passwd",
        "/dev/../etc/passwd",
        "dev/foo",
        "",
        "/dev",
    ];
    for input in rejected.iter() {
        assert!(dev_rel_path(*input).is_none());
    }
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,10 @@ use libc::{c_uint, major, minor};
|
||||
use nix::sys::stat;
|
||||
use regex::Regex;
|
||||
use std::collections::HashMap;
|
||||
use std::ffi::OsStr;
|
||||
use std::fmt;
|
||||
use std::fs;
|
||||
use std::os::unix::ffi::OsStrExt;
|
||||
use std::os::unix::fs::MetadataExt;
|
||||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
@@ -46,6 +49,9 @@ pub const DRIVER_LOCAL_TYPE: &str = "local";
|
||||
pub const DRIVER_WATCHABLE_BIND_TYPE: &str = "watchable-bind";
|
||||
// VFIO device to be bound to a guest kernel driver
|
||||
pub const DRIVER_VFIO_GK_TYPE: &str = "vfio-gk";
|
||||
// VFIO device to be bound to vfio-pci and made available inside the
|
||||
// container as a VFIO device node
|
||||
pub const DRIVER_VFIO_TYPE: &str = "vfio";
|
||||
|
||||
#[derive(Debug)]
|
||||
struct DevIndexEntry {
|
||||
@@ -62,6 +68,83 @@ pub fn online_device(path: &str) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Force a given PCI device to bind to the given driver, does
|
||||
// basically the same thing as
|
||||
// driverctl set-override <PCI address> <driver>
|
||||
#[instrument]
|
||||
pub fn pci_driver_override<T, U>(syspci: T, dev: pci::Address, drv: U) -> Result<()>
|
||||
where
|
||||
T: AsRef<OsStr> + std::fmt::Debug,
|
||||
U: AsRef<OsStr> + std::fmt::Debug,
|
||||
{
|
||||
let syspci = Path::new(&syspci);
|
||||
let drv = drv.as_ref();
|
||||
info!(sl!(), "rebind_pci_driver: {} => {:?}", dev, drv);
|
||||
|
||||
let devpath = syspci.join("devices").join(dev.to_string());
|
||||
let overridepath = &devpath.join("driver_override");
|
||||
|
||||
fs::write(overridepath, drv.as_bytes())?;
|
||||
|
||||
let drvpath = &devpath.join("driver");
|
||||
let need_unbind = match fs::read_link(drvpath) {
|
||||
Ok(d) if d.file_name() == Some(drv) => return Ok(()), // Nothing to do
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => false, // No current driver
|
||||
Err(e) => return Err(anyhow!("Error checking driver on {}: {}", dev, e)),
|
||||
Ok(_) => true, // Current driver needs unbinding
|
||||
};
|
||||
if need_unbind {
|
||||
let unbindpath = &drvpath.join("unbind");
|
||||
fs::write(unbindpath, dev.to_string())?;
|
||||
}
|
||||
let probepath = syspci.join("drivers_probe");
|
||||
fs::write(probepath, dev.to_string())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Represents an IOMMU group, identified by its group number
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct IommuGroup(u32);

// An IommuGroup is displayed as its bare group number
impl fmt::Display for IommuGroup {
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        let IommuGroup(number) = self;
        write!(formatter, "{}", number)
    }
}
|
||||
|
||||
// Determine the IOMMU group of a PCI device
|
||||
#[instrument]
|
||||
fn pci_iommu_group<T>(syspci: T, dev: pci::Address) -> Result<Option<IommuGroup>>
|
||||
where
|
||||
T: AsRef<OsStr> + std::fmt::Debug,
|
||||
{
|
||||
let syspci = Path::new(&syspci);
|
||||
let grouppath = syspci
|
||||
.join("devices")
|
||||
.join(dev.to_string())
|
||||
.join("iommu_group");
|
||||
|
||||
match fs::read_link(&grouppath) {
|
||||
// Device has no group
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(None),
|
||||
Err(e) => Err(anyhow!("Error reading link {:?}: {}", &grouppath, e)),
|
||||
Ok(group) => {
|
||||
if let Some(group) = group.file_name() {
|
||||
if let Some(group) = group.to_str() {
|
||||
if let Ok(group) = group.parse::<u32>() {
|
||||
return Ok(Some(IommuGroup(group)));
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(anyhow!(
|
||||
"Unexpected IOMMU group link {:?} => {:?}",
|
||||
grouppath,
|
||||
group
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// pcipath_to_sysfs fetches the sysfs path for a PCI path, relative to
|
||||
// the sysfs path for the PCI host bridge, based on the PCI path
|
||||
// provided.
|
||||
@@ -296,6 +379,33 @@ pub async fn wait_for_pci_device(
|
||||
Ok(addr)
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct VfioMatcher {
|
||||
syspath: String,
|
||||
}
|
||||
|
||||
impl VfioMatcher {
|
||||
fn new(grp: IommuGroup) -> VfioMatcher {
|
||||
VfioMatcher {
|
||||
syspath: format!("/devices/virtual/vfio/{}", grp),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl UeventMatcher for VfioMatcher {
|
||||
fn is_match(&self, uev: &Uevent) -> bool {
|
||||
uev.devpath == self.syspath
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument]
|
||||
async fn get_vfio_device_name(sandbox: &Arc<Mutex<Sandbox>>, grp: IommuGroup) -> Result<String> {
|
||||
let matcher = VfioMatcher::new(grp);
|
||||
|
||||
let uev = wait_for_uevent(sandbox, matcher).await?;
|
||||
Ok(format!("{}/{}", SYSTEM_DEV_PATH, &uev.devname))
|
||||
}
|
||||
|
||||
/// Scan SCSI bus for the given SCSI address(SCSI-Id and LUN)
|
||||
#[instrument]
|
||||
fn scan_scsi_bus(scsi_addr: &str) -> Result<()> {
|
||||
@@ -326,24 +436,27 @@ fn scan_scsi_bus(scsi_addr: &str) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// update_spec_device_list takes a device description provided by the caller,
|
||||
// trying to find it on the guest. Once this device has been identified, the
|
||||
// "real" information that can be read from inside the VM is used to update
|
||||
// the same device in the list of devices provided through the OCI spec.
|
||||
// This is needed to update information about minor/major numbers that cannot
|
||||
// be predicted from the caller.
|
||||
// update_spec_device updates the device list in the OCI spec to make
|
||||
// it include details appropriate for the VM, instead of the host. It
|
||||
// is given the host path to the device (to locate the device in the
|
||||
// original OCI spec) and the VM path which it uses to determine the
|
||||
// VM major/minor numbers, and the final path with which to present
|
||||
// the device in the (inner) container
|
||||
#[instrument]
|
||||
fn update_spec_device_list(device: &Device, spec: &mut Spec, devidx: &DevIndex) -> Result<()> {
|
||||
fn update_spec_device(
|
||||
spec: &mut Spec,
|
||||
devidx: &DevIndex,
|
||||
host_path: &str,
|
||||
vm_path: &str,
|
||||
final_path: &str,
|
||||
) -> Result<()> {
|
||||
let major_id: c_uint;
|
||||
let minor_id: c_uint;
|
||||
|
||||
// If no container_path is provided, we won't be able to match and
|
||||
// update the device in the OCI spec device list. This is an error.
|
||||
if device.container_path.is_empty() {
|
||||
return Err(anyhow!(
|
||||
"container_path cannot empty for device {:?}",
|
||||
device
|
||||
));
|
||||
if host_path.is_empty() {
|
||||
return Err(anyhow!("Host path cannot empty for device"));
|
||||
}
|
||||
|
||||
let linux = spec
|
||||
@@ -351,11 +464,11 @@ fn update_spec_device_list(device: &Device, spec: &mut Spec, devidx: &DevIndex)
|
||||
.as_mut()
|
||||
.ok_or_else(|| anyhow!("Spec didn't container linux field"))?;
|
||||
|
||||
if !Path::new(&device.vm_path).exists() {
|
||||
return Err(anyhow!("vm_path:{} doesn't exist", device.vm_path));
|
||||
if !Path::new(vm_path).exists() {
|
||||
return Err(anyhow!("vm_path:{} doesn't exist", vm_path));
|
||||
}
|
||||
|
||||
let meta = fs::metadata(&device.vm_path)?;
|
||||
let meta = fs::metadata(vm_path)?;
|
||||
let dev_id = meta.rdev();
|
||||
unsafe {
|
||||
major_id = major(dev_id);
|
||||
@@ -364,24 +477,27 @@ fn update_spec_device_list(device: &Device, spec: &mut Spec, devidx: &DevIndex)
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"got the device: dev_path: {}, major: {}, minor: {}\n", &device.vm_path, major_id, minor_id
|
||||
"update_spec_device(): vm_path={}, major: {}, minor: {}\n", vm_path, major_id, minor_id
|
||||
);
|
||||
|
||||
if let Some(idxdata) = devidx.0.get(device.container_path.as_str()) {
|
||||
if let Some(idxdata) = devidx.0.get(host_path) {
|
||||
let dev = &mut linux.devices[idxdata.idx];
|
||||
let host_major = dev.major;
|
||||
let host_minor = dev.minor;
|
||||
|
||||
dev.major = major_id as i64;
|
||||
dev.minor = minor_id as i64;
|
||||
dev.path = final_path.to_string();
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"change the device from major: {} minor: {} to vm device major: {} minor: {}",
|
||||
"change the device from path: {} major: {} minor: {} to vm device path: {} major: {} minor: {}",
|
||||
host_path,
|
||||
host_major,
|
||||
host_minor,
|
||||
major_id,
|
||||
minor_id
|
||||
dev.path,
|
||||
dev.major,
|
||||
dev.minor,
|
||||
);
|
||||
|
||||
// Resources must be updated since they are used to identify
|
||||
@@ -402,7 +518,7 @@ fn update_spec_device_list(device: &Device, spec: &mut Spec, devidx: &DevIndex)
|
||||
} else {
|
||||
Err(anyhow!(
|
||||
"Should have found a matching device {} in the spec",
|
||||
device.vm_path
|
||||
vm_path
|
||||
))
|
||||
}
|
||||
}
|
||||
@@ -420,7 +536,13 @@ async fn virtiommio_blk_device_handler(
|
||||
return Err(anyhow!("Invalid path for virtio mmio blk device"));
|
||||
}
|
||||
|
||||
update_spec_device_list(device, spec, devidx)
|
||||
update_spec_device(
|
||||
spec,
|
||||
devidx,
|
||||
&device.container_path,
|
||||
&device.vm_path,
|
||||
&device.container_path,
|
||||
)
|
||||
}
|
||||
|
||||
// device.Id should be a PCI path string
|
||||
@@ -436,7 +558,13 @@ async fn virtio_blk_device_handler(
|
||||
|
||||
dev.vm_path = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?;
|
||||
|
||||
update_spec_device_list(&dev, spec, devidx)
|
||||
update_spec_device(
|
||||
spec,
|
||||
devidx,
|
||||
&dev.container_path,
|
||||
&dev.vm_path,
|
||||
&dev.container_path,
|
||||
)
|
||||
}
|
||||
|
||||
// device.id should be a CCW path string
|
||||
@@ -451,7 +579,13 @@ async fn virtio_blk_ccw_device_handler(
|
||||
let mut dev = device.clone();
|
||||
let ccw_device = ccw::Device::from_str(&device.id)?;
|
||||
dev.vm_path = get_virtio_blk_ccw_device_name(sandbox, &ccw_device).await?;
|
||||
update_spec_device_list(&dev, spec, devidx)
|
||||
update_spec_device(
|
||||
spec,
|
||||
devidx,
|
||||
&dev.container_path,
|
||||
&dev.vm_path,
|
||||
&dev.container_path,
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(not(target_arch = "s390x"))]
|
||||
@@ -475,7 +609,13 @@ async fn virtio_scsi_device_handler(
|
||||
) -> Result<()> {
|
||||
let mut dev = device.clone();
|
||||
dev.vm_path = get_scsi_device_name(sandbox, &device.id).await?;
|
||||
update_spec_device_list(&dev, spec, devidx)
|
||||
update_spec_device(
|
||||
spec,
|
||||
devidx,
|
||||
&dev.container_path,
|
||||
&dev.vm_path,
|
||||
&dev.container_path,
|
||||
)
|
||||
}
|
||||
|
||||
#[instrument]
|
||||
@@ -489,7 +629,13 @@ async fn virtio_nvdimm_device_handler(
|
||||
return Err(anyhow!("Invalid path for nvdimm device"));
|
||||
}
|
||||
|
||||
update_spec_device_list(device, spec, devidx)
|
||||
update_spec_device(
|
||||
spec,
|
||||
devidx,
|
||||
&device.container_path,
|
||||
&device.vm_path,
|
||||
&device.container_path,
|
||||
)
|
||||
}
|
||||
|
||||
fn split_vfio_option(opt: &str) -> Option<(&str, &str)> {
|
||||
@@ -507,19 +653,54 @@ fn split_vfio_option(opt: &str) -> Option<(&str, &str)> {
|
||||
// Each option should have the form "DDDD:BB:DD.F=<pcipath>"
|
||||
// DDDD:BB:DD.F is the device's PCI address in the host
|
||||
// <pcipath> is a PCI path to the device in the guest (see pci.rs)
|
||||
async fn vfio_gk_device_handler(
|
||||
async fn vfio_device_handler(
|
||||
device: &Device,
|
||||
_: &mut Spec,
|
||||
spec: &mut Spec,
|
||||
sandbox: &Arc<Mutex<Sandbox>>,
|
||||
_: &DevIndex,
|
||||
devidx: &DevIndex,
|
||||
) -> Result<()> {
|
||||
let vfio_in_guest = device.field_type != DRIVER_VFIO_GK_TYPE;
|
||||
let mut group = None;
|
||||
|
||||
for opt in device.options.iter() {
|
||||
let (_, pcipath) =
|
||||
split_vfio_option(opt).ok_or_else(|| anyhow!("Malformed VFIO option {:?}", opt))?;
|
||||
let pcipath = pci::Path::from_str(pcipath)?;
|
||||
|
||||
wait_for_pci_device(sandbox, &pcipath).await?;
|
||||
let guestdev = wait_for_pci_device(sandbox, &pcipath).await?;
|
||||
if vfio_in_guest {
|
||||
pci_driver_override(SYSFS_BUS_PCI_PATH, guestdev, "vfio-pci")?;
|
||||
|
||||
let devgroup = pci_iommu_group(SYSFS_BUS_PCI_PATH, guestdev)?;
|
||||
if devgroup.is_none() {
|
||||
// Devices must have an IOMMU group to be usable via VFIO
|
||||
return Err(anyhow!("{} has no IOMMU group", guestdev));
|
||||
}
|
||||
|
||||
if group.is_some() && group != devgroup {
|
||||
// If PCI devices associated with the same VFIO device
|
||||
// (and therefore group) in the host don't end up in
|
||||
// the same group in the guest, something has gone
|
||||
// horribly wrong
|
||||
return Err(anyhow!(
|
||||
"{} is not in guest IOMMU group {}",
|
||||
guestdev,
|
||||
group.unwrap()
|
||||
));
|
||||
}
|
||||
|
||||
group = devgroup;
|
||||
}
|
||||
}
|
||||
|
||||
if vfio_in_guest {
|
||||
// If there are any devices at all, logic above ensures that group is not None
|
||||
let group = group.unwrap();
|
||||
let vmpath = get_vfio_device_name(sandbox, group).await?;
|
||||
|
||||
update_spec_device(spec, devidx, &device.container_path, &vmpath, &vmpath)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -592,7 +773,9 @@ async fn add_device(
|
||||
DRIVER_MMIO_BLK_TYPE => virtiommio_blk_device_handler(device, spec, sandbox, devidx).await,
|
||||
DRIVER_NVDIMM_TYPE => virtio_nvdimm_device_handler(device, spec, sandbox, devidx).await,
|
||||
DRIVER_SCSI_TYPE => virtio_scsi_device_handler(device, spec, sandbox, devidx).await,
|
||||
DRIVER_VFIO_GK_TYPE => vfio_gk_device_handler(device, spec, sandbox, devidx).await,
|
||||
DRIVER_VFIO_GK_TYPE | DRIVER_VFIO_TYPE => {
|
||||
vfio_device_handler(device, spec, sandbox, devidx).await
|
||||
}
|
||||
_ => Err(anyhow!("Unknown device type {}", device.field_type)),
|
||||
}
|
||||
}
|
||||
@@ -657,28 +840,28 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_update_spec_device_list() {
|
||||
fn test_update_spec_device() {
|
||||
let (major, minor) = (7, 2);
|
||||
let mut device = Device::default();
|
||||
let mut spec = Spec::default();
|
||||
|
||||
// container_path empty
|
||||
let container_path = "";
|
||||
let vm_path = "";
|
||||
let devidx = DevIndex::new(&spec);
|
||||
let res = update_spec_device_list(&device, &mut spec, &devidx);
|
||||
let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path);
|
||||
assert!(res.is_err());
|
||||
|
||||
device.container_path = "/dev/null".to_string();
|
||||
|
||||
// linux is empty
|
||||
let container_path = "/dev/null";
|
||||
let devidx = DevIndex::new(&spec);
|
||||
let res = update_spec_device_list(&device, &mut spec, &devidx);
|
||||
let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path);
|
||||
assert!(res.is_err());
|
||||
|
||||
spec.linux = Some(Linux::default());
|
||||
|
||||
// linux.devices is empty
|
||||
let devidx = DevIndex::new(&spec);
|
||||
let res = update_spec_device_list(&device, &mut spec, &devidx);
|
||||
let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path);
|
||||
assert!(res.is_err());
|
||||
|
||||
spec.linux.as_mut().unwrap().devices = vec![oci::LinuxDevice {
|
||||
@@ -690,26 +873,32 @@ mod tests {
|
||||
|
||||
// vm_path empty
|
||||
let devidx = DevIndex::new(&spec);
|
||||
let res = update_spec_device_list(&device, &mut spec, &devidx);
|
||||
let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path);
|
||||
assert!(res.is_err());
|
||||
|
||||
device.vm_path = "/dev/null".to_string();
|
||||
let vm_path = "/dev/null";
|
||||
|
||||
// guest and host path are not the same
|
||||
let devidx = DevIndex::new(&spec);
|
||||
let res = update_spec_device_list(&device, &mut spec, &devidx);
|
||||
assert!(res.is_err(), "device={:?} spec={:?}", device, spec);
|
||||
let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path);
|
||||
assert!(
|
||||
res.is_err(),
|
||||
"container_path={:?} vm_path={:?} spec={:?}",
|
||||
container_path,
|
||||
vm_path,
|
||||
spec
|
||||
);
|
||||
|
||||
spec.linux.as_mut().unwrap().devices[0].path = device.container_path.clone();
|
||||
spec.linux.as_mut().unwrap().devices[0].path = container_path.to_string();
|
||||
|
||||
// spec.linux.resources is empty
|
||||
let devidx = DevIndex::new(&spec);
|
||||
let res = update_spec_device_list(&device, &mut spec, &devidx);
|
||||
let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path);
|
||||
assert!(res.is_ok());
|
||||
|
||||
// update both devices and cgroup lists
|
||||
spec.linux.as_mut().unwrap().devices = vec![oci::LinuxDevice {
|
||||
path: device.container_path.clone(),
|
||||
path: container_path.to_string(),
|
||||
major,
|
||||
minor,
|
||||
..oci::LinuxDevice::default()
|
||||
@@ -725,12 +914,12 @@ mod tests {
|
||||
});
|
||||
|
||||
let devidx = DevIndex::new(&spec);
|
||||
let res = update_spec_device_list(&device, &mut spec, &devidx);
|
||||
let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path);
|
||||
assert!(res.is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_update_spec_device_list_guest_host_conflict() {
|
||||
fn test_update_spec_device_guest_host_conflict() {
|
||||
let null_rdev = fs::metadata("/dev/null").unwrap().rdev();
|
||||
let zero_rdev = fs::metadata("/dev/zero").unwrap().rdev();
|
||||
let full_rdev = fs::metadata("/dev/full").unwrap().rdev();
|
||||
@@ -781,20 +970,14 @@ mod tests {
|
||||
};
|
||||
let devidx = DevIndex::new(&spec);
|
||||
|
||||
let dev_a = Device {
|
||||
container_path: "/dev/a".to_string(),
|
||||
vm_path: "/dev/zero".to_string(),
|
||||
..Device::default()
|
||||
};
|
||||
let container_path_a = "/dev/a";
|
||||
let vm_path_a = "/dev/zero";
|
||||
|
||||
let guest_major_a = stat::major(zero_rdev) as i64;
|
||||
let guest_minor_a = stat::minor(zero_rdev) as i64;
|
||||
|
||||
let dev_b = Device {
|
||||
container_path: "/dev/b".to_string(),
|
||||
vm_path: "/dev/full".to_string(),
|
||||
..Device::default()
|
||||
};
|
||||
let container_path_b = "/dev/b";
|
||||
let vm_path_b = "/dev/full";
|
||||
|
||||
let guest_major_b = stat::major(full_rdev) as i64;
|
||||
let guest_minor_b = stat::minor(full_rdev) as i64;
|
||||
@@ -811,7 +994,13 @@ mod tests {
|
||||
assert_eq!(Some(host_major_b), specresources.devices[1].major);
|
||||
assert_eq!(Some(host_minor_b), specresources.devices[1].minor);
|
||||
|
||||
let res = update_spec_device_list(&dev_a, &mut spec, &devidx);
|
||||
let res = update_spec_device(
|
||||
&mut spec,
|
||||
&devidx,
|
||||
container_path_a,
|
||||
vm_path_a,
|
||||
container_path_a,
|
||||
);
|
||||
assert!(res.is_ok());
|
||||
|
||||
let specdevices = &spec.linux.as_ref().unwrap().devices;
|
||||
@@ -826,7 +1015,13 @@ mod tests {
|
||||
assert_eq!(Some(host_major_b), specresources.devices[1].major);
|
||||
assert_eq!(Some(host_minor_b), specresources.devices[1].minor);
|
||||
|
||||
let res = update_spec_device_list(&dev_b, &mut spec, &devidx);
|
||||
let res = update_spec_device(
|
||||
&mut spec,
|
||||
&devidx,
|
||||
container_path_b,
|
||||
vm_path_b,
|
||||
container_path_b,
|
||||
);
|
||||
assert!(res.is_ok());
|
||||
|
||||
let specdevices = &spec.linux.as_ref().unwrap().devices;
|
||||
@@ -843,7 +1038,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_update_spec_device_list_char_block_conflict() {
|
||||
fn test_update_spec_device_char_block_conflict() {
|
||||
let null_rdev = fs::metadata("/dev/null").unwrap().rdev();
|
||||
|
||||
let guest_major = stat::major(null_rdev) as i64;
|
||||
@@ -892,11 +1087,8 @@ mod tests {
|
||||
};
|
||||
let devidx = DevIndex::new(&spec);
|
||||
|
||||
let dev = Device {
|
||||
container_path: "/dev/char".to_string(),
|
||||
vm_path: "/dev/null".to_string(),
|
||||
..Device::default()
|
||||
};
|
||||
let container_path = "/dev/char";
|
||||
let vm_path = "/dev/null";
|
||||
|
||||
let specresources = spec.linux.as_ref().unwrap().resources.as_ref().unwrap();
|
||||
assert_eq!(Some(host_major), specresources.devices[0].major);
|
||||
@@ -904,7 +1096,7 @@ mod tests {
|
||||
assert_eq!(Some(host_major), specresources.devices[1].major);
|
||||
assert_eq!(Some(host_minor), specresources.devices[1].minor);
|
||||
|
||||
let res = update_spec_device_list(&dev, &mut spec, &devidx);
|
||||
let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path);
|
||||
assert!(res.is_ok());
|
||||
|
||||
// Only the char device, not the block device should be updated
|
||||
@@ -915,6 +1107,43 @@ mod tests {
|
||||
assert_eq!(Some(host_minor), specresources.devices[1].minor);
|
||||
}
|
||||
|
||||
#[test]
fn test_update_spec_device_final_path() {
    // The three paths update_spec_device() distinguishes
    let host_path = "/dev/host";
    let vm_path = "/dev/null";
    let final_path = "/dev/final";

    // Numbers the spec entry should end up with: those of the vm_path node
    let vm_rdev = fs::metadata(vm_path).unwrap().rdev();
    let guest_major = stat::major(vm_rdev) as i64;
    let guest_minor = stat::minor(vm_rdev) as i64;

    // Arbitrary host-side numbers, expected to be overwritten
    let host_major: i64 = 99;
    let host_minor: i64 = 99;

    let host_dev = oci::LinuxDevice {
        path: host_path.to_string(),
        r#type: "c".to_string(),
        major: host_major,
        minor: host_minor,
        ..oci::LinuxDevice::default()
    };
    let mut spec = Spec {
        linux: Some(Linux {
            devices: vec![host_dev],
            ..Linux::default()
        }),
        ..Spec::default()
    };
    let devidx = DevIndex::new(&spec);

    let res = update_spec_device(&mut spec, &devidx, host_path, vm_path, final_path);
    assert!(res.is_ok());

    // The entry now carries the guest numbers and the final path
    let updated = &spec.linux.as_ref().unwrap().devices[0];
    assert_eq!(guest_major, updated.major);
    assert_eq!(guest_minor, updated.minor);
    assert_eq!(final_path, updated.path);
}
|
||||
|
||||
#[test]
|
||||
fn test_pcipath_to_sysfs() {
|
||||
let testdir = tempdir().expect("failed to create tmpdir");
|
||||
@@ -1142,6 +1371,27 @@ mod tests {
|
||||
assert!(!matcher_a.is_match(&uev_b));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_vfio_matcher() {
|
||||
let grpa = IommuGroup(1);
|
||||
let grpb = IommuGroup(22);
|
||||
|
||||
let mut uev_a = crate::uevent::Uevent::default();
|
||||
uev_a.action = crate::linux_abi::U_EVENT_ACTION_ADD.to_string();
|
||||
uev_a.devname = format!("vfio/{}", grpa);
|
||||
uev_a.devpath = format!("/devices/virtual/vfio/{}", grpa);
|
||||
let matcher_a = VfioMatcher::new(grpa);
|
||||
|
||||
let mut uev_b = uev_a.clone();
|
||||
uev_b.devpath = format!("/devices/virtual/vfio/{}", grpb);
|
||||
let matcher_b = VfioMatcher::new(grpb);
|
||||
|
||||
assert!(matcher_a.is_match(&uev_a));
|
||||
assert!(matcher_b.is_match(&uev_b));
|
||||
assert!(!matcher_b.is_match(&uev_a));
|
||||
assert!(!matcher_a.is_match(&uev_b));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_vfio_option() {
|
||||
assert_eq!(
|
||||
@@ -1151,4 +1401,81 @@ mod tests {
|
||||
assert_eq!(split_vfio_option("0000:01:00.0=02/01=rubbish"), None);
|
||||
assert_eq!(split_vfio_option("0000:01:00.0"), None);
|
||||
}
|
||||
|
||||
#[test]
fn test_pci_driver_override() {
    let testdir = tempdir().expect("failed to create tmpdir");
    let syspci = testdir.path(); // Path to mock /sys/bus/pci

    let dev0 = pci::Address::new(0, 0, pci::SlotFn::new(0, 0).unwrap());
    let dev0_name = dev0.to_string();

    // Mock sysfs locations touched by pci_driver_override()
    let dev0_dir = syspci.join("devices").join(&dev0_name);
    let dev0_driver_link = dev0_dir.join("driver");
    let dev0_override = dev0_dir.join("driver_override");
    let drv_a_dir = syspci.join("drivers").join("drv_a");
    let drv_a_unbind = drv_a_dir.join("unbind");
    let drivers_probe = syspci.join("drivers_probe");

    // Start mocking dev0 as being unbound
    fs::create_dir_all(&dev0_dir).unwrap();

    pci_driver_override(syspci, dev0, "drv_a").unwrap();
    assert_eq!(fs::read_to_string(&dev0_override).unwrap(), "drv_a");
    assert_eq!(fs::read_to_string(&drivers_probe).unwrap(), dev0_name);

    // Now mock dev0 already being attached to drv_a
    fs::create_dir_all(&drv_a_dir).unwrap();
    std::os::unix::fs::symlink(&drv_a_dir, dev0_driver_link).unwrap();
    std::fs::remove_file(&drivers_probe).unwrap();

    pci_driver_override(syspci, dev0, "drv_a").unwrap(); // no-op
    assert_eq!(fs::read_to_string(&dev0_override).unwrap(), "drv_a");
    assert!(!drivers_probe.exists());

    // Now try binding to a different driver: the old driver must be
    // unbound and a new probe requested
    pci_driver_override(syspci, dev0, "drv_b").unwrap();
    assert_eq!(fs::read_to_string(&dev0_override).unwrap(), "drv_b");
    assert_eq!(fs::read_to_string(&drivers_probe).unwrap(), dev0_name);
    assert_eq!(fs::read_to_string(&drv_a_unbind).unwrap(), dev0_name);
}
|
||||
|
||||
#[test]
fn test_pci_iommu_group() {
    let testdir = tempdir().expect("failed to create tmpdir"); // mock /sys
    let syspci = testdir.path().join("bus").join("pci");
    let devices_dir = syspci.join("devices");

    // dev0: device directory exists, but there is no iommu_group link
    let dev0 = pci::Address::new(0, 0, pci::SlotFn::new(0, 0).unwrap());
    fs::create_dir_all(devices_dir.join(dev0.to_string())).unwrap();

    assert!(pci_iommu_group(&syspci, dev0).unwrap().is_none());

    // dev1: iommu_group is a symlink whose target ends in "12"
    let dev1 = pci::Address::new(0, 1, pci::SlotFn::new(0, 0).unwrap());
    let dev1_dir = devices_dir.join(dev1.to_string());
    let dev1_group_link = dev1_dir.join("iommu_group");
    fs::create_dir_all(&dev1_dir).unwrap();
    std::os::unix::fs::symlink("../../../kernel/iommu_groups/12", &dev1_group_link).unwrap();

    assert_eq!(
        pci_iommu_group(&syspci, dev1).unwrap(),
        Some(IommuGroup(12))
    );

    // dev2: bogus group (iommu_group is a directory instead of a symlink)
    let dev2 = pci::Address::new(0, 2, pci::SlotFn::new(0, 0).unwrap());
    let dev2_group_dir = devices_dir.join(dev2.to_string()).join("iommu_group");
    fs::create_dir_all(&dev2_group_dir).unwrap();

    assert!(pci_iommu_group(&syspci, dev2).is_err());
}
|
||||
}
|
||||
|
||||
@@ -83,6 +83,8 @@ pub const SYSFS_MEMORY_ONLINE_PATH: &str = "/sys/devices/system/memory";
|
||||
|
||||
pub const SYSFS_SCSI_HOST_PATH: &str = "/sys/class/scsi_host";
|
||||
|
||||
pub const SYSFS_BUS_PCI_PATH: &str = "/sys/bus/pci";
|
||||
|
||||
pub const SYSFS_CGROUPPATH: &str = "/sys/fs/cgroup";
|
||||
pub const SYSFS_ONLINE_FILE: &str = "online";
|
||||
|
||||
|
||||
@@ -190,6 +190,7 @@ DEFVALIDVHOSTUSERSTOREPATHS := [\"$(DEFVHOSTUSERSTOREPATH)\"]
|
||||
DEFFILEMEMBACKEND := ""
|
||||
DEFVALIDFILEMEMBACKENDS := [\"$(DEFFILEMEMBACKEND)\"]
|
||||
DEFMSIZE9P := 8192
|
||||
DEFVFIOMODE := guest-kernel
|
||||
|
||||
# Default cgroup model
|
||||
DEFSANDBOXCGROUPONLY ?= false
|
||||
@@ -459,6 +460,7 @@ USER_VARS += DEFENTROPYSOURCE
|
||||
USER_VARS += DEFVALIDENTROPYSOURCES
|
||||
USER_VARS += DEFSANDBOXCGROUPONLY
|
||||
USER_VARS += DEFBINDMOUNTS
|
||||
USER_VARS += DEFVFIOMODE
|
||||
USER_VARS += FEATURE_SELINUX
|
||||
USER_VARS += BUILDFLAGS
|
||||
|
||||
|
||||
@@ -263,6 +263,27 @@ sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@
|
||||
# These will not be exposed to the container workloads, and are only provided for potential guest services.
|
||||
sandbox_bind_mounts=@DEFBINDMOUNTS@
|
||||
|
||||
# VFIO Mode
|
||||
# Determines how VFIO devices should be presented to the container.
|
||||
# Options:
|
||||
#
|
||||
# - vfio
|
||||
# Matches behaviour of OCI runtimes (e.g. runc) as much as
|
||||
# possible. VFIO devices will appear in the container as VFIO
|
||||
# character devices under /dev/vfio. The exact names may differ
|
||||
# from the host (they need to match the VM's IOMMU group numbers
|
||||
# rather than the host's)
|
||||
#
|
||||
# - guest-kernel
|
||||
# This is a Kata-specific behaviour that's useful in certain cases.
|
||||
# The VFIO device is managed by whatever driver in the VM kernel
|
||||
# claims it. This means it will appear as one or more device nodes
|
||||
# or network interfaces depending on the nature of the device.
|
||||
# Using this mode requires specially built workloads that know how
|
||||
# to locate the relevant device interfaces within the VM.
|
||||
#
|
||||
vfio_mode="@DEFVFIOMODE@"
|
||||
|
||||
# Enabled experimental feature list, format: ["a", "b"].
|
||||
# Experimental features are features not stable enough for production,
|
||||
# they may break compatibility, and are prepared for a big version bump.
|
||||
|
||||
@@ -543,6 +543,27 @@ sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@
|
||||
# These will not be exposed to the container workloads, and are only provided for potential guest services.
|
||||
sandbox_bind_mounts=@DEFBINDMOUNTS@
|
||||
|
||||
# VFIO Mode
|
||||
# Determines how VFIO devices should be presented to the container.
|
||||
# Options:
|
||||
#
|
||||
# - vfio
|
||||
# Matches behaviour of OCI runtimes (e.g. runc) as much as
|
||||
# possible. VFIO devices will appear in the container as VFIO
|
||||
# character devices under /dev/vfio. The exact names may differ
|
||||
# from the host (they need to match the VM's IOMMU group numbers
|
||||
# rather than the host's)
|
||||
#
|
||||
# - guest-kernel
|
||||
# This is a Kata-specific behaviour that's useful in certain cases.
|
||||
# The VFIO device is managed by whatever driver in the VM kernel
|
||||
# claims it. This means it will appear as one or more device nodes
|
||||
# or network interfaces depending on the nature of the device.
|
||||
# Using this mode requires specially built workloads that know how
|
||||
# to locate the relevant device interfaces within the VM.
|
||||
#
|
||||
vfio_mode="@DEFVFIOMODE@"
|
||||
|
||||
# Enabled experimental feature list, format: ["a", "b"].
|
||||
# Experimental features are features not stable enough for production,
|
||||
# they may break compatibility, and are prepared for a big version bump.
|
||||
|
||||
@@ -88,6 +88,7 @@ const defaultConfidentialGuest = false
|
||||
const defaultGuestSwap = false
|
||||
const defaultRootlessHypervisor = false
|
||||
const defaultDisableSeccomp = false
|
||||
const defaultVfioMode = "guest-kernel"
|
||||
|
||||
var defaultSGXEPCSize = int64(0)
|
||||
|
||||
|
||||
@@ -143,6 +143,7 @@ type runtime struct {
|
||||
JaegerEndpoint string `toml:"jaeger_endpoint"`
|
||||
JaegerUser string `toml:"jaeger_user"`
|
||||
JaegerPassword string `toml:"jaeger_password"`
|
||||
VfioMode string `toml:"vfio_mode"`
|
||||
SandboxBindMounts []string `toml:"sandbox_bind_mounts"`
|
||||
Experimental []string `toml:"experimental"`
|
||||
Debug bool `toml:"enable_debug"`
|
||||
@@ -1068,6 +1069,11 @@ func initConfig() (config oci.RuntimeConfig, err error) {
|
||||
return oci.RuntimeConfig{}, err
|
||||
}
|
||||
|
||||
err = config.VfioMode.VFIOSetMode(defaultVfioMode)
|
||||
if err != nil {
|
||||
return oci.RuntimeConfig{}, err
|
||||
}
|
||||
|
||||
config = oci.RuntimeConfig{
|
||||
HypervisorType: defaultHypervisor,
|
||||
HypervisorConfig: GetDefaultHypervisorConfig(),
|
||||
@@ -1114,6 +1120,14 @@ func LoadConfiguration(configPath string, ignoreLogging bool) (resolvedConfigPat
|
||||
}
|
||||
}
|
||||
|
||||
if tomlConf.Runtime.VfioMode != "" {
|
||||
err = config.VfioMode.VFIOSetMode(tomlConf.Runtime.VfioMode)
|
||||
|
||||
if err != nil {
|
||||
return "", config, err
|
||||
}
|
||||
}
|
||||
|
||||
if !ignoreLogging {
|
||||
err := handleSystemLog("", "")
|
||||
if err != nil {
|
||||
|
||||
@@ -187,6 +187,40 @@ type BlockDrive struct {
|
||||
Swap bool
|
||||
}
|
||||
|
||||
// VFIOModeType indicates the behaviour mode for handling VFIO devices in the VM
|
||||
type VFIOModeType uint32
|
||||
|
||||
const (
|
||||
// VFIOModeVFIO specifies OCI compliant behaviour: VFIO
|
||||
// devices specified to Kata appear as VFIO devices within the
|
||||
// container
|
||||
VFIOModeVFIO VFIOModeType = iota
|
||||
|
||||
// VFIOModeGuestKernel specifies Kata-specific behaviour
|
||||
// useful in certain cases: VFIO devices specified to Kata are
|
||||
// bound to whatever driver in the VM will take them. This
|
||||
// requires specialized containers expecting this behaviour to
|
||||
// locate and use the devices
|
||||
VFIOModeGuestKernel
|
||||
)
|
||||
|
||||
const (
|
||||
vfioModeVfioStr = "vfio"
|
||||
vfioModeGuestKernelStr = "guest-kernel"
|
||||
)
|
||||
|
||||
func (m *VFIOModeType) VFIOSetMode(modeName string) error {
|
||||
switch modeName {
|
||||
case vfioModeVfioStr:
|
||||
*m = VFIOModeVFIO
|
||||
return nil
|
||||
case vfioModeGuestKernelStr:
|
||||
*m = VFIOModeGuestKernel
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("Unknown VFIO mode %s", modeName)
|
||||
}
|
||||
|
||||
// VFIODeviceType indicates VFIO device type
|
||||
type VFIODeviceType uint32
|
||||
|
||||
|
||||
@@ -92,6 +92,7 @@ var (
|
||||
kataNvdimmDevType = "nvdimm"
|
||||
kataVirtioFSDevType = "virtio-fs"
|
||||
kataWatchableBindDevType = "watchable-bind"
|
||||
kataVfioDevType = "vfio" // VFIO device to be used as VFIO in the container
|
||||
kataVfioGuestKernelDevType = "vfio-gk" // VFIO device for consumption by the guest kernel
|
||||
sharedDir9pOptions = []string{"trans=virtio,version=9p2000.L,cache=mmap", "nodev"}
|
||||
sharedDirVirtioFSOptions = []string{}
|
||||
@@ -995,7 +996,7 @@ func (k *kataAgent) replaceOCIMountsForStorages(spec *specs.Spec, volumeStorages
|
||||
return nil
|
||||
}
|
||||
|
||||
func (k *kataAgent) constraintGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool) {
|
||||
func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, stripVfio bool) {
|
||||
// Disable Hooks since they have been handled on the host and there is
|
||||
// no reason to send them to the agent. It would make no sense to try
|
||||
// to apply them on the guest.
|
||||
@@ -1058,8 +1059,11 @@ func (k *kataAgent) constraintGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool) {
|
||||
}
|
||||
grpcSpec.Linux.Namespaces = tmpNamespaces
|
||||
|
||||
// VFIO char device shouldn't not appear in the guest,
|
||||
// the device driver should handle it and determinate its group.
|
||||
if stripVfio {
|
||||
// VFIO char device shouldn't appear in the guest
|
||||
// (because the VM device driver will do something
|
||||
// with it rather than just presenting it to the
|
||||
// container unmodified)
|
||||
var linuxDevices []grpc.LinuxDevice
|
||||
for _, dev := range grpcSpec.Linux.Devices {
|
||||
if dev.Type == "c" && strings.HasPrefix(dev.Path, vfioPath) {
|
||||
@@ -1069,6 +1073,7 @@ func (k *kataAgent) constraintGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool) {
|
||||
linuxDevices = append(linuxDevices, dev)
|
||||
}
|
||||
grpcSpec.Linux.Devices = linuxDevices
|
||||
}
|
||||
}
|
||||
|
||||
func (k *kataAgent) handleShm(mounts []specs.Mount, sandbox *Sandbox) {
|
||||
@@ -1179,11 +1184,19 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *
|
||||
// (see qomGetPciPath() for details).
|
||||
kataDevice := &grpc.Device{
|
||||
ContainerPath: dev.ContainerPath,
|
||||
Type: kataVfioGuestKernelDevType,
|
||||
Type: kataVfioDevType,
|
||||
Id: groupNum,
|
||||
Options: make([]string, len(devList)),
|
||||
}
|
||||
|
||||
// We always pass the device information to the agent, since
|
||||
// it needs that to wait for them to be ready. But depending
|
||||
// on the vfio_mode, we need to use a different device type so
|
||||
// the agent can handle it properly
|
||||
if c.sandbox.config.VfioMode == config.VFIOModeGuestKernel {
|
||||
kataDevice.Type = kataVfioGuestKernelDevType
|
||||
}
|
||||
|
||||
for i, pciDev := range devList {
|
||||
kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", pciDev.BDF, pciDev.GuestPciPath)
|
||||
}
|
||||
@@ -1411,9 +1424,9 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co
|
||||
|
||||
passSeccomp := !sandbox.config.DisableGuestSeccomp && sandbox.seccompSupported
|
||||
|
||||
// We need to constraint the spec to make sure we're not passing
|
||||
// irrelevant information to the agent.
|
||||
k.constraintGRPCSpec(grpcSpec, passSeccomp)
|
||||
// We need to constrain the spec to make sure we're not
|
||||
// passing irrelevant information to the agent.
|
||||
k.constrainGRPCSpec(grpcSpec, passSeccomp, sandbox.config.VfioMode == config.VFIOModeGuestKernel)
|
||||
|
||||
req := &grpc.CreateContainerRequest{
|
||||
ContainerId: c.id,
|
||||
|
||||
@@ -541,7 +541,7 @@ func TestAppendVhostUserBlkDevices(t *testing.T) {
|
||||
updatedDevList, expected)
|
||||
}
|
||||
|
||||
func TestConstraintGRPCSpec(t *testing.T) {
|
||||
func TestConstrainGRPCSpec(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
expectedCgroupPath := "/foo/bar"
|
||||
|
||||
@@ -589,7 +589,7 @@ func TestConstraintGRPCSpec(t *testing.T) {
|
||||
}
|
||||
|
||||
k := kataAgent{}
|
||||
k.constraintGRPCSpec(g, true)
|
||||
k.constrainGRPCSpec(g, true, true)
|
||||
|
||||
// Check nil fields
|
||||
assert.Nil(g.Hooks)
|
||||
|
||||
@@ -250,6 +250,10 @@ const (
|
||||
|
||||
// DisableNewNetNs is a sandbox annotation that determines whether to create a netns for the hypervisor process.
|
||||
DisableNewNetNs = kataAnnotRuntimePrefix + "disable_new_netns"
|
||||
|
||||
// VfioMode is a sandbox annotation to specify how attached VFIO devices should be treated
|
||||
// Overrides the runtime.vfio_mode parameter in the global configuration.toml
|
||||
VfioMode = kataAnnotRuntimePrefix + "vfio_mode"
|
||||
)
|
||||
|
||||
// Agent related annotations
|
||||
|
||||
@@ -116,6 +116,10 @@ type RuntimeConfig struct {
|
||||
//the container network interface
|
||||
InterNetworkModel vc.NetInterworkingModel
|
||||
|
||||
//Determines how VFIO devices should be presented to the
|
||||
//container
|
||||
VfioMode config.VFIOModeType
|
||||
|
||||
Debug bool
|
||||
Trace bool
|
||||
|
||||
@@ -826,6 +830,13 @@ func addRuntimeConfigOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig, r
|
||||
sbConfig.NetworkConfig.InterworkingModel = runtimeConfig.InterNetworkModel
|
||||
}
|
||||
|
||||
if value, ok := ocispec.Annotations[vcAnnotations.VfioMode]; ok {
|
||||
if err := sbConfig.VfioMode.VFIOSetMode(value); err != nil {
|
||||
return fmt.Errorf("Unknown VFIO mode \"%s\" in annotation %s",
|
||||
value, vcAnnotations.VfioMode)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -893,6 +904,8 @@ func SandboxConfig(ocispec specs.Spec, runtime RuntimeConfig, bundlePath, cid, c
|
||||
|
||||
ShmSize: shmSize,
|
||||
|
||||
VfioMode: runtime.VfioMode,
|
||||
|
||||
SystemdCgroup: systemdCgroup,
|
||||
|
||||
SandboxCgroupOnly: runtime.SandboxCgroupOnly,
|
||||
|
||||
@@ -134,6 +134,8 @@ type SandboxConfig struct {
|
||||
|
||||
ShmSize uint64
|
||||
|
||||
VfioMode config.VFIOModeType
|
||||
|
||||
// SharePidNs sets all containers to share the same sandbox level pid namespace.
|
||||
SharePidNs bool
|
||||
|
||||
|
||||
Reference in New Issue
Block a user