From 93609aa0cd8b0e4c4ac0395ca3aa37a4dd56e5ad Mon Sep 17 00:00:00 2001 From: Xuewei Niu Date: Tue, 8 Aug 2023 11:21:01 +0800 Subject: [PATCH 01/33] deps: Bump dependent crate versions This pull request is mainly for updating vm-memory and vmm-sys-util. The affacted crates include: - vm-memory: from 0.9.0 to 0.10.0 - vmm-sys-util: from 0.10.0 to 0.11.0 - virtio-queue: from 0.6.0 to 0.7.0 - fuse-backend-rs: from 0.10.4 to 0.10.5 - linux-loader: from 0.6.0 to 0.8.0 - nydus-api: from 0.3.0 to 0.3.1 - nydus-rafs: from 0.3.1 to 0.3.2 - nydus-storage: from 0.6.3 to 0.6.4 Fixes: #0000 Signed-off-by: Xuewei Niu (cherry picked from commit b23c5ed15594d74d11975a7e347ee24598da2296) Signed-off-by: Greg Kurz --- src/dragonball/Cargo.toml | 8 +-- src/dragonball/src/api/v1/vmm_action.rs | 3 +- .../src/dbs_address_space/Cargo.toml | 2 +- src/dragonball/src/dbs_arch/Cargo.toml | 4 +- src/dragonball/src/dbs_boot/Cargo.toml | 4 +- .../src/dbs_virtio_devices/Cargo.toml | 14 ++-- src/dragonball/src/device_manager/mod.rs | 2 +- src/dragonball/src/test_utils.rs | 2 +- src/dragonball/src/vm/kernel_config.rs | 2 +- src/dragonball/src/vm/mod.rs | 2 +- src/runtime-rs/Cargo.lock | 71 +++++-------------- 11 files changed, 41 insertions(+), 73 deletions(-) diff --git a/src/dragonball/Cargo.toml b/src/dragonball/Cargo.toml index 68f08060d..bbb5166f2 100644 --- a/src/dragonball/Cargo.toml +++ b/src/dragonball/Cargo.toml @@ -27,7 +27,7 @@ kvm-bindings = "0.6.0" kvm-ioctls = "0.12.0" lazy_static = "1.2" libc = "0.2.39" -linux-loader = "0.6.0" +linux-loader = "0.8.0" log = "0.4.14" nix = "0.24.2" procfs = "0.12.0" @@ -40,10 +40,10 @@ slog = "2.5.2" slog-scope = "4.4.0" thiserror = "1" vmm-sys-util = "0.11.0" -virtio-queue = { version = "0.6.0", optional = true } -vm-memory = { version = "0.9.0", features = ["backend-mmap"] } +virtio-queue = { version = "0.7.0", optional = true } +vm-memory = { version = "0.10.0", features = ["backend-mmap"] } crossbeam-channel = "0.5.6" -fuse-backend-rs = "=0.10.4" +fuse-backend-rs = "0.10.5" [dev-dependencies] slog-async = "2.7.0" diff --git a/src/dragonball/src/api/v1/vmm_action.rs b/src/dragonball/src/api/v1/vmm_action.rs index 5c1c728fe..95525c4e6 100644 --- a/src/dragonball/src/api/v1/vmm_action.rs +++ b/src/dragonball/src/api/v1/vmm_action.rs @@ -358,7 +358,8 @@ impl VmmService { Some(ref path) => Some(File::open(path).map_err(|e| BootSource(InvalidInitrdPath(e)))?), }; - let mut cmdline = linux_loader::cmdline::Cmdline::new(dbs_boot::layout::CMDLINE_MAX_SIZE); + let mut cmdline = linux_loader::cmdline::Cmdline::new(dbs_boot::layout::CMDLINE_MAX_SIZE) + .map_err(|err| BootSource(InvalidKernelCommandLine(err)))?; let boot_args = boot_source_config .boot_args .unwrap_or_else(|| String::from(DEFAULT_KERNEL_CMDLINE)); diff --git a/src/dragonball/src/dbs_address_space/Cargo.toml b/src/dragonball/src/dbs_address_space/Cargo.toml index f507fa4dc..2ebd84fe6 100644 --- a/src/dragonball/src/dbs_address_space/Cargo.toml +++ b/src/dragonball/src/dbs_address_space/Cargo.toml @@ -17,4 +17,4 @@ nix = "0.23.1" lazy_static = "1" thiserror = "1" vmm-sys-util = "0.11.0" -vm-memory = { version = "0.9", features = ["backend-mmap", "backend-atomic"] } +vm-memory = { version = "0.10", features = ["backend-mmap", "backend-atomic"] } diff --git a/src/dragonball/src/dbs_arch/Cargo.toml b/src/dragonball/src/dbs_arch/Cargo.toml index b6deb0ba1..79b2957fc 100644 --- a/src/dragonball/src/dbs_arch/Cargo.toml +++ b/src/dragonball/src/dbs_arch/Cargo.toml @@ -15,12 +15,12 @@ memoffset = "0.6" kvm-bindings = { version = "0.6.0", features = ["fam-wrappers"] } kvm-ioctls = "0.12.0" thiserror = "1" -vm-memory = { version = "0.9" } +vm-memory = { version = "0.10" } vmm-sys-util = "0.11.0" libc = ">=0.2.39" [dev-dependencies] -vm-memory = { version = "0.9", features = ["backend-mmap"] } +vm-memory = { version = "0.10", features = ["backend-mmap"] } [package.metadata.docs.rs] all-features = true diff --git a/src/dragonball/src/dbs_boot/Cargo.toml b/src/dragonball/src/dbs_boot/Cargo.toml index 7216795d5..1ecac6421 100644 --- a/src/dragonball/src/dbs_boot/Cargo.toml +++ b/src/dragonball/src/dbs_boot/Cargo.toml @@ -17,10 +17,10 @@ kvm-ioctls = "0.12.0" lazy_static = "1" libc = "0.2.39" thiserror = "1" -vm-memory = "0.9.0" +vm-memory = "0.10.0" vm-fdt = "0.2.0" [dev-dependencies] -vm-memory = { version = "0.9.0", features = ["backend-mmap"] } +vm-memory = { version = "0.10.0", features = ["backend-mmap"] } device_tree = ">=1.1.0" dbs-device = { path = "../dbs_device" } diff --git a/src/dragonball/src/dbs_virtio_devices/Cargo.toml b/src/dragonball/src/dbs_virtio_devices/Cargo.toml index c26b5ffd2..b7bd8e60f 100644 --- a/src/dragonball/src/dbs_virtio_devices/Cargo.toml +++ b/src/dragonball/src/dbs_virtio_devices/Cargo.toml @@ -18,28 +18,28 @@ dbs-interrupt = { path = "../dbs_interrupt", features = ["kvm-legacy-irq", "kvm- dbs-utils = { path = "../dbs_utils" } epoll = ">=4.3.1, <4.3.2" io-uring = "0.5.2" -fuse-backend-rs = { version = "0.10.0", optional = true } +fuse-backend-rs = { version = "0.10.5", optional = true } kvm-bindings = "0.6.0" kvm-ioctls = "0.12.0" libc = "0.2.119" log = "0.4.14" nix = "0.24.3" -nydus-api = "0.3.0" -nydus-rafs = "0.3.1" -nydus-storage = "0.6.3" +nydus-api = "0.3.1" +nydus-rafs = "0.3.2" +nydus-storage = "0.6.4" rlimit = "0.7.0" serde = "1.0.27" serde_json = "1.0.9" thiserror = "1" threadpool = "1" virtio-bindings = "0.1.0" -virtio-queue = "0.6.0" +virtio-queue = "0.7.0" vmm-sys-util = "0.11.0" -vm-memory = { version = "0.9.0", features = [ "backend-mmap" ] } +vm-memory = { version = "0.10.0", features = [ "backend-mmap" ] } sendfd = "0.4.3" [dev-dependencies] -vm-memory = { version = "0.9.0", features = [ "backend-mmap", "backend-atomic" ] } +vm-memory = { version = "0.10.0", features = [ "backend-mmap", "backend-atomic" ] } [features] virtio-mmio = [] diff --git a/src/dragonball/src/device_manager/mod.rs b/src/dragonball/src/device_manager/mod.rs index fefdf4f77..1579f329b 100644 --- a/src/dragonball/src/device_manager/mod.rs +++ b/src/dragonball/src/device_manager/mod.rs @@ -1195,7 +1195,7 @@ mod tests { let mut cmdline = crate::vm::KernelConfigInfo::new( kernel_file, None, - linux_loader::cmdline::Cmdline::new(0x1000), + linux_loader::cmdline::Cmdline::new(0x1000).unwrap(), ); let address_space = vm.vm_address_space().cloned(); diff --git a/src/dragonball/src/test_utils.rs b/src/dragonball/src/test_utils.rs index 577b5df86..dec006f43 100644 --- a/src/dragonball/src/test_utils.rs +++ b/src/dragonball/src/test_utils.rs @@ -17,7 +17,7 @@ pub mod tests { let epoll_manager = EpollManager::default(); let mut vm = Vm::new(None, instance_info, epoll_manager).unwrap(); let kernel_file = TempFile::new().unwrap(); - let cmd_line = Cmdline::new(64); + let cmd_line = Cmdline::new(64).unwrap(); vm.set_kernel_config(KernelConfigInfo::new( kernel_file.into_file(), None, diff --git a/src/dragonball/src/vm/kernel_config.rs b/src/dragonball/src/vm/kernel_config.rs index 34516266a..fb51f8fc1 100644 --- a/src/dragonball/src/vm/kernel_config.rs +++ b/src/dragonball/src/vm/kernel_config.rs @@ -62,7 +62,7 @@ mod tests { fn test_kernel_config_info() { let kernel = TempFile::new().unwrap(); let initrd = TempFile::new().unwrap(); - let mut cmdline = linux_loader::cmdline::Cmdline::new(1024); + let mut cmdline = linux_loader::cmdline::Cmdline::new(1024).unwrap(); cmdline.insert_str("ro").unwrap(); let mut info = KernelConfigInfo::new(kernel.into_file(), Some(initrd.into_file()), cmdline); diff --git a/src/dragonball/src/vm/mod.rs b/src/dragonball/src/vm/mod.rs index 48a5a5e89..f1c9700b9 100644 --- a/src/dragonball/src/vm/mod.rs +++ b/src/dragonball/src/vm/mod.rs @@ -1027,7 +1027,7 @@ pub mod tests { ); let kernel_file = TempFile::new().unwrap(); - let cmd_line = Cmdline::new(64); + let cmd_line = Cmdline::new(64).unwrap(); vm.set_kernel_config(KernelConfigInfo::new( kernel_file.into_file(), diff --git a/src/runtime-rs/Cargo.lock b/src/runtime-rs/Cargo.lock index ab090028d..90f29db44 100644 --- a/src/runtime-rs/Cargo.lock +++ b/src/runtime-rs/Cargo.lock @@ -828,6 +828,7 @@ dependencies = [ "dbs-upcall", "dbs-utils", "dbs-virtio-devices", + "fuse-backend-rs", "kvm-bindings", "kvm-ioctls", "lazy_static", @@ -995,9 +996,9 @@ checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" [[package]] name = "fuse-backend-rs" -version = "0.10.4" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc24820b14267bec37fa87f5c2a32b5f1c5405b8c60cc3aa77afd481bd2628a6" +checksum = "f85357722be4bf3d0b7548bedf7499686c77628c2c61cb99c6519463f7a9e5f0" dependencies = [ "arc-swap", "bitflags 1.3.2", @@ -1008,10 +1009,9 @@ dependencies = [ "log", "mio", "nix 0.24.3", - "tokio-uring", "virtio-queue", "vm-memory", - "vmm-sys-util 0.10.0", + "vmm-sys-util 0.11.1", ] [[package]] @@ -1593,17 +1593,6 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" -[[package]] -name = "leaky-bucket" -version = "0.12.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e8b256cabb5f5c7affd490acbb12f951d725385971fa602dedb11e09c896b6d" -dependencies = [ - "parking_lot 0.12.1", - "tokio", - "tracing", -] - [[package]] name = "libc" version = "0.2.147" @@ -1625,9 +1614,9 @@ dependencies = [ [[package]] name = "linux-loader" -version = "0.6.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62a2f912deca034ec34b0a43a390059ea98daac40e440ebe8bea88f3315fe168" +checksum = "b9259ddbfbb52cc918f6bbc60390004ddd0228cf1d85f402009ff2b3d95de83f" dependencies = [ "vm-memory", ] @@ -1938,11 +1927,10 @@ dependencies = [ [[package]] name = "nydus-api" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33a6ca41dd10813e3d29397550fbb0f15ad149381f312e04659d39e0adcf2002" +checksum = "c64c62d8a36c10b654b87246a39861b2c05f68e96ab3b2f002f5a54f406d5e0e" dependencies = [ - "backtrace", "libc", "log", "serde", @@ -1952,9 +1940,9 @@ dependencies = [ [[package]] name = "nydus-rafs" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed21e44a99472850d2afc4fb07427ed46d4e6a8b1cce28b42bd689319e45076d" +checksum = "adde865ef71c91c5f139c4c05ca5aedb6fbd53f530d646b13409ac5220b85467" dependencies = [ "anyhow", "arc-swap", @@ -1974,16 +1962,15 @@ dependencies = [ [[package]] name = "nydus-storage" -version = "0.6.3" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9591fbee1875895bf1f765656695d0be6887fe65372fbf4924b8b3959bd61375" +checksum = "4023f15303dbbda47797d07e9acd2045862ce82c7e28cd66f70b09bda5584cbb" dependencies = [ "arc-swap", "bitflags 1.3.2", "fuse-backend-rs", "hex", "lazy_static", - "leaky-bucket", "libc", "log", "nix 0.24.3", @@ -1998,9 +1985,9 @@ dependencies = [ [[package]] name = "nydus-utils" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe8b9269e3a370682f272a1b2cac4bdaf6d6657f3f6966560c4fedab36548362" +checksum = "c1f7bcde0f3906cf49101f2d40e485b0155eee97e3358eefd4783448c4f69c96" dependencies = [ "blake3", "flate2", @@ -2926,12 +2913,6 @@ dependencies = [ "libc", ] -[[package]] -name = "scoped-tls" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" - [[package]] name = "scopeguard" version = "1.2.0" @@ -3505,20 +3486,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "tokio-uring" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d5e02bb137e030b3a547c65a3bd2f1836d66a97369fdcc69034002b10e155ef" -dependencies = [ - "io-uring", - "libc", - "scoped-tls", - "slab", - "socket2", - "tokio", -] - [[package]] name = "tokio-util" version = "0.7.8" @@ -3828,14 +3795,14 @@ checksum = "3ff512178285488516ed85f15b5d0113a7cdb89e9e8a760b269ae4f02b84bd6b" [[package]] name = "virtio-queue" -version = "0.6.1" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "435dd49c7b38419729afd43675850c7b5dc4728f2fabd70c7a9079a331e4f8c6" +checksum = "3ba81e2bcc21c0d2fc5e6683e79367e26ad219197423a498df801d79d5ba77bd" dependencies = [ "log", "virtio-bindings", "vm-memory", - "vmm-sys-util 0.10.0", + "vmm-sys-util 0.11.1", ] [[package]] @@ -3846,9 +3813,9 @@ checksum = "f43fb5a6bd1a7d423ad72802801036719b7546cf847a103f8fe4575f5b0d45a6" [[package]] name = "vm-memory" -version = "0.9.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "583f213899e8a5eea23d9c507252d4bed5bc88f0ecbe0783262f80034630744b" +checksum = "688a70366615b45575a424d9c665561c1b5ab2224d494f706b6a6812911a827c" dependencies = [ "arc-swap", "libc", From ef65c5767fd73b8ae70b8bbe3cb64e68a8628e43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julien=20Rop=C3=A9?= Date: Tue, 22 Aug 2023 11:30:18 +0200 Subject: [PATCH 02/33] kata-agent: use default filemode for block device when it is set to 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the FileMode field for the device is unset (0), use a default value instead to allow the use of the device from the container. This behaviour is seen from cri-o typically. Note: this is what runc is doing, which is why regular containers don't have an issue. This change makes sure kata behaves the same as runc. Fixes: #7717 Signed-off-by: Julien Ropé (cherry picked from commit 40914b25d4dcabad82080020b075769b10d0cf28) Signed-off-by: Greg Kurz --- src/agent/rustjail/src/lib.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/agent/rustjail/src/lib.rs b/src/agent/rustjail/src/lib.rs index de91f81bb..d3647f42e 100644 --- a/src/agent/rustjail/src/lib.rs +++ b/src/agent/rustjail/src/lib.rs @@ -423,12 +423,18 @@ fn linux_grpc_to_oci(l: &grpc::Linux) -> oci::Linux { let mut r = Vec::new(); for d in l.Devices.iter() { + // if the filemode for the device is 0 (unset), use a default value as runc does + let filemode = if d.FileMode != 0 { + Some(d.FileMode) + } else { + Some(0o666) + }; r.push(oci::LinuxDevice { path: d.Path.clone(), r#type: d.Type.clone(), major: d.Major, minor: d.Minor, - file_mode: Some(d.FileMode), + file_mode: filemode, uid: Some(d.UID), gid: Some(d.GID), }); From 59fae423b5f5626947752bd0f20adc0cb0c63444 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Tue, 22 Aug 2023 07:42:59 +0000 Subject: [PATCH 03/33] runtime/qemu: fix image/initrd annotation handling Right now if we configure an image annotation and have a config file setting initrd, the initrd config would override the image annotation. Add a helper function ImageOrInitrdAssetPath to make sure annotations are preferred over config options in image and initrd path handling. Signed-off-by: Peng Tao (cherry picked from commit 1a0092d6316a385ff8ccd9b2c581dd5733c45ed3) Signed-off-by: Greg Kurz --- src/runtime/virtcontainers/hypervisor.go | 41 ++++++++++++ src/runtime/virtcontainers/qemu.go | 77 ++++++++++------------- src/runtime/virtcontainers/types/asset.go | 2 + 3 files changed, 76 insertions(+), 44 deletions(-) diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index 14d8b1937..7599d2794 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -84,6 +84,7 @@ const ( var ( hvLogger = logrus.WithField("source", "virtcontainers/hypervisor") noGuestMemHotplugErr error = errors.New("guest memory hotplug not supported") + conflictingAssets error = errors.New("cannot set both image and initrd at the same time") ) // In some architectures the maximum number of vCPUs depends on the number of physical cores. @@ -698,6 +699,46 @@ func (conf *HypervisorConfig) AddCustomAsset(a *types.Asset) error { return nil } +// ImageOrInitrdAssetPath returns an image or an initrd path, along with the corresponding asset type +// Annotation path is preferred to config path. +func (conf *HypervisorConfig) ImageOrInitrdAssetPath() (string, types.AssetType, error) { + var image, initrd string + + checkAndReturn := func(image string, initrd string) (string, types.AssetType, error) { + if image != "" && initrd != "" { + return "", types.UnkownAsset, conflictingAssets + } + + if image != "" { + return image, types.ImageAsset, nil + } + + if initrd != "" { + return initrd, types.InitrdAsset, nil + } + + return "", types.UnkownAsset, fmt.Errorf("one of image and initrd must be set") + } + + if a, ok := conf.customAssets[types.ImageAsset]; ok { + image = a.Path() + } + + if a, ok := conf.customAssets[types.InitrdAsset]; ok { + initrd = a.Path() + } + + path, assetType, err := checkAndReturn(image, initrd) + if assetType != types.UnkownAsset { + return path, assetType, nil + } + if err == conflictingAssets { + return "", types.UnkownAsset, errors.Wrapf(err, "conflicting annotations") + } + + return checkAndReturn(conf.ImagePath, conf.InitrdPath) +} + func (conf *HypervisorConfig) assetPath(t types.AssetType) (string, error) { // Custom assets take precedence over the configured ones a, ok := conf.customAssets[t] diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 57fa036f6..b03b176b4 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -347,22 +347,6 @@ func (q *qemu) getQemuMachine() (govmmQemu.Machine, error) { return machine, nil } -func (q *qemu) appendImage(ctx context.Context, devices []govmmQemu.Device) ([]govmmQemu.Device, error) { - imagePath, err := q.config.ImageAssetPath() - if err != nil { - return nil, err - } - - if imagePath != "" { - devices, err = q.arch.appendImage(ctx, devices, imagePath) - if err != nil { - return nil, err - } - } - - return devices, nil -} - func (q *qemu) createQmpSocket() ([]govmmQemu.QMPSocket, error) { monitorSockPath, err := q.qmpSocketPath(q.id) if err != nil { @@ -400,12 +384,16 @@ func (q *qemu) createQmpSocket() ([]govmmQemu.QMPSocket, error) { return sockets, nil } -func (q *qemu) buildDevices(ctx context.Context, initrdPath string) ([]govmmQemu.Device, *govmmQemu.IOThread, error) { +func (q *qemu) buildDevices(ctx context.Context, kernelPath string) ([]govmmQemu.Device, *govmmQemu.IOThread, *govmmQemu.Kernel, error) { var devices []govmmQemu.Device + kernel := &govmmQemu.Kernel{ + Path: kernelPath, + } + _, console, err := q.GetVMConsole(ctx, q.id) if err != nil { - return nil, nil, err + return nil, nil, nil, err } // Add bridges before any other devices. This way we make sure that @@ -414,20 +402,28 @@ func (q *qemu) buildDevices(ctx context.Context, initrdPath string) ([]govmmQemu devices, err = q.arch.appendConsole(ctx, devices, console) if err != nil { - return nil, nil, err + return nil, nil, nil, err } - if initrdPath == "" { - devices, err = q.appendImage(ctx, devices) + assetPath, assetType, err := q.config.ImageOrInitrdAssetPath() + if err != nil { + return nil, nil, nil, err + } + + if assetType == types.ImageAsset { + devices, err = q.arch.appendImage(ctx, devices, assetPath) if err != nil { - return nil, nil, err + return nil, nil, nil, err } + } else { + // InitrdAsset, need to set kernel initrd path + kernel.InitrdPath = assetPath } if q.config.IOMMU { devices, err = q.arch.appendIOMMU(devices) if err != nil { - return nil, nil, err + return nil, nil, nil, err } } @@ -438,10 +434,13 @@ func (q *qemu) buildDevices(ctx context.Context, initrdPath string) ([]govmmQemu var ioThread *govmmQemu.IOThread if q.config.BlockDeviceDriver == config.VirtioSCSI { - return q.arch.appendSCSIController(ctx, devices, q.config.EnableIOThreads) + devices, ioThread, err = q.arch.appendSCSIController(ctx, devices, q.config.EnableIOThreads) + if err != nil { + return nil, nil, nil, err + } } - return devices, ioThread, nil + return devices, ioThread, kernel, nil } func (q *qemu) setupTemplate(knobs *govmmQemu.Knobs, memory *govmmQemu.Memory) govmmQemu.Incoming { @@ -562,16 +561,6 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi IOMMUPlatform: q.config.IOMMUPlatform, } - kernelPath, err := q.config.KernelAssetPath() - if err != nil { - return err - } - - initrdPath, err := q.config.InitrdAssetPath() - if err != nil { - return err - } - incoming := q.setupTemplate(&knobs, &memory) // With the current implementations, VM templating will not work with file @@ -615,7 +604,12 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi return err } - devices, ioThread, err := q.buildDevices(ctx, initrdPath) + kernelPath, err := q.config.KernelAssetPath() + if err != nil { + return err + } + + devices, ioThread, kernel, err := q.buildDevices(ctx, kernelPath) if err != nil { return err } @@ -643,13 +637,8 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi return err } - // Breaks hypervisor abstraction has Kata Specific logic - kernel := govmmQemu.Kernel{ - Path: kernelPath, - InitrdPath: initrdPath, - // some devices configuration may also change kernel params, make sure this is called afterwards - Params: q.kernelParameters(), - } + // some devices configuration may also change kernel params, make sure this is called afterwards + kernel.Params = q.kernelParameters() q.checkBpfEnabled() qemuConfig := govmmQemu.Config{ @@ -666,7 +655,7 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi Devices: devices, CPUModel: cpuModel, SeccompSandbox: q.config.SeccompSandbox, - Kernel: kernel, + Kernel: *kernel, RTC: rtc, QMPSockets: qmpSockets, Knobs: knobs, diff --git a/src/runtime/virtcontainers/types/asset.go b/src/runtime/virtcontainers/types/asset.go index 3b00b5a20..6cad7dd33 100644 --- a/src/runtime/virtcontainers/types/asset.go +++ b/src/runtime/virtcontainers/types/asset.go @@ -41,6 +41,8 @@ const ( FirmwareAsset AssetType = "firmware" FirmwareVolumeAsset AssetType = "firmware_volume" + + UnkownAsset AssetType = "unknown" ) // AssetTypes returns a list of all known asset types. From f86bfe0da33dd2cdec783f33d154c6422c3627a9 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Tue, 22 Aug 2023 07:49:26 +0000 Subject: [PATCH 04/33] runtime/clh: fix image/initrd annotation handling We should make sure annotations are preferred over config options in image and initrd path handling. Fixes: #7705 Signed-off-by: Peng Tao (cherry picked from commit 9fda7059a50b43b0461bbcd4c4dfa2e2cc3d1212) Signed-off-by: Greg Kurz --- src/runtime/virtcontainers/clh.go | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index f031811e9..0d438b63c 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -549,14 +549,14 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net clh.vmconfig.Rng.SetIommu(clh.config.IOMMU) // set the initial root/boot disk of hypervisor - imagePath, err := clh.config.ImageAssetPath() + assetPath, assetType, err := clh.config.ImageOrInitrdAssetPath() if err != nil { return err } - if imagePath != "" { + if assetType == types.ImageAsset { if clh.config.ConfidentialGuest { - disk := chclient.NewDiskConfig(imagePath) + disk := chclient.NewDiskConfig(assetPath) disk.SetReadonly(true) diskRateLimiterConfig := clh.getDiskRateLimiterConfig() @@ -570,7 +570,7 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net clh.vmconfig.Disks = &[]chclient.DiskConfig{*disk} } } else { - pmem := chclient.NewPmemConfig(imagePath) + pmem := chclient.NewPmemConfig(assetPath) *pmem.DiscardWrites = true pmem.SetIommu(clh.config.IOMMU) @@ -581,12 +581,8 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net } } } else { - initrdPath, err := clh.config.InitrdAssetPath() - if err != nil { - return err - } - - clh.vmconfig.Payload.SetInitramfs(initrdPath) + // assetType == types.InitrdAsset + clh.vmconfig.Payload.SetInitramfs(assetPath) } if clh.config.ConfidentialGuest { From 9ce8ee6c0ca65b6bcd6756eac1c8e95d458f2e56 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Tue, 22 Aug 2023 07:53:34 +0000 Subject: [PATCH 05/33] runtime/fc: fix image/initrd annotation handling Right now if we configure an image annotation and have a config file setting initrd, the initrd config would override the image annotation. Make sure annotations are preferred over config options in image and initrd path handling. Signed-off-by: Peng Tao (cherry picked from commit 18d42da21ea1ee39401fb7a0d436fb8922366392) Signed-off-by: Greg Kurz --- src/runtime/virtcontainers/fc.go | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/runtime/virtcontainers/fc.go b/src/runtime/virtcontainers/fc.go index 520d73eaa..c7139a4a4 100644 --- a/src/runtime/virtcontainers/fc.go +++ b/src/runtime/virtcontainers/fc.go @@ -722,19 +722,12 @@ func (fc *firecracker) fcInitConfiguration(ctx context.Context) error { return err } - image, err := fc.config.InitrdAssetPath() + assetPath, _, err := fc.config.ImageOrInitrdAssetPath() if err != nil { return err } - if image == "" { - image, err = fc.config.ImageAssetPath() - if err != nil { - return err - } - } - - if err := fc.fcSetVMRootfs(ctx, image); err != nil { + if err := fc.fcSetVMRootfs(ctx, assetPath); err != nil { return err } From 65e0b99eb4a87a34b24acd519dd820c047c1d950 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Fri, 5 May 2023 16:45:15 +0200 Subject: [PATCH 06/33] versions: tdx: Update QEMU to v7.2 + TDX v1.10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is the version that's been used and tested inside Intel, and it matches with https://github.com/intel/tdx-tools/releases/tag/2023ww15. Fixes: #7770 Signed-off-by: Fabiano Fidêncio (cherry picked from commit 9803b24286e06a08996cb3714b8e916a03f04d26) Signed-off-by: Greg Kurz --- .../no_patches.txt | 0 versions.yaml | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 tools/packaging/qemu/patches/tag_patches/b67b00e6b4c7831a3f5bc684bc0df7a9bfd1bd56-plus-TDX-v1.10/no_patches.txt diff --git a/tools/packaging/qemu/patches/tag_patches/b67b00e6b4c7831a3f5bc684bc0df7a9bfd1bd56-plus-TDX-v1.10/no_patches.txt b/tools/packaging/qemu/patches/tag_patches/b67b00e6b4c7831a3f5bc684bc0df7a9bfd1bd56-plus-TDX-v1.10/no_patches.txt new file mode 100644 index 000000000..e69de29bb diff --git a/versions.yaml b/versions.yaml index 2d23f2113..8a0a482eb 100644 --- a/versions.yaml +++ b/versions.yaml @@ -106,9 +106,9 @@ assets: qemu-tdx-experimental: # yamllint disable-line rule:line-length - description: "QEMU with TDX support - based on https://github.com/intel/tdx-tools/releases/tag/2023ww01" + description: "QEMU with TDX support - based on https://github.com/intel/tdx-tools/releases/tag/2023ww15" url: "https://github.com/kata-containers/qemu" - tag: "ad4c7f529a279685da84297773b4ec8080153c2d-plus-TDX-v1.3" + tag: "b67b00e6b4c7831a3f5bc684bc0df7a9bfd1bd56-plus-TDX-v1.10" qemu-snp-experimental: description: "QEMU with experimental SNP support (no UPM)" From a36064c729f6c56e40473d79082ce3543e605736 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Fri, 5 May 2023 14:59:00 +0200 Subject: [PATCH 07/33] versions: tdx: Update TDVF to the "edk2-stable202302" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is the version that's been used and tested inside Intel, and it matches with https://github.com/intel/tdx-tools/releases/tag/2023ww15. Fixes: #7770 Signed-off-by: Fabiano Fidêncio (cherry picked from commit ec18180f34988bfae04da3183fcfaf7296d66885) Signed-off-by: Greg Kurz --- versions.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/versions.yaml b/versions.yaml index 8a0a482eb..ccd727c8f 100644 --- a/versions.yaml +++ b/versions.yaml @@ -299,8 +299,8 @@ externals: package_output_dir: "AmdSev" tdx: # yamllint disable-line rule:line-length - description: "QEMU with TDX support - based on https://github.com/intel/tdx-tools/releases/tag/2023ww01" - version: "edk2-stable202211" + description: "QEMU with TDX support - based on https://github.com/intel/tdx-tools/releases/tag/2023ww15" + version: "edk2-stable202302" package: "OvmfPkg/IntelTdx/IntelTdxX64.dsc" package_output_dir: "IntelTdx" From 2f28866f262e03570fb1bd8c40684b15615814d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Tue, 9 May 2023 14:11:19 +0200 Subject: [PATCH 08/33] versions: tdx: Update Kernel to 6.2 + TDX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is the version that's been used and tested inside Intel, and it matches with https://github.com/intel/tdx-tools/releases/tag/2023ww15. Fixes: #7770 Signed-off-by: Fabiano Fidêncio (cherry picked from commit 8115a0522db723245ff30fdcc67e4e00b8394e86) [Greg: Fix tools/packaging/kernel/kata_config_version that got messed up by 32be55aa8a2abd70172f21c9c018ab7605d3f39b) Signed-off-by: Greg Kurz --- tools/packaging/kernel/configs/fragments/whitelist.conf | 4 ++++ tools/packaging/kernel/configs/fragments/x86_64/tdx/tdx.conf | 3 --- tools/packaging/kernel/kata_config_version | 2 +- tools/packaging/kernel/patches/6.2-TDX-v1.x/no_patches.txt | 0 versions.yaml | 4 ++-- 5 files changed, 7 insertions(+), 6 deletions(-) create mode 100644 tools/packaging/kernel/patches/6.2-TDX-v1.x/no_patches.txt diff --git a/tools/packaging/kernel/configs/fragments/whitelist.conf b/tools/packaging/kernel/configs/fragments/whitelist.conf index a6396a582..e6b271151 100644 --- a/tools/packaging/kernel/configs/fragments/whitelist.conf +++ b/tools/packaging/kernel/configs/fragments/whitelist.conf @@ -2,6 +2,10 @@ # without generating an error in fragment merging CONFIG_ARCH_RANDOM CONFIG_ARM64_CRYPTO +CONFIG_GENERIC_MSI_IRQ_DOMAIN +CONFIG_PCI_MSI_IRQ_DOMAIN +CONFIG_CLK_LGM_CGU +CONFIG_MEMCG_SWAP CONFIG_NF_NAT_IPV4 CONFIG_NF_NAT_NEEDED CONFIG_NF_NAT_PROTO_DCCP diff --git a/tools/packaging/kernel/configs/fragments/x86_64/tdx/tdx.conf b/tools/packaging/kernel/configs/fragments/x86_64/tdx/tdx.conf index 2f877a5c9..7d88bb9f0 100644 --- a/tools/packaging/kernel/configs/fragments/x86_64/tdx/tdx.conf +++ b/tools/packaging/kernel/configs/fragments/x86_64/tdx/tdx.conf @@ -1,13 +1,10 @@ # Intel Trust Domain Extensions (Intel TDX) -CONFIG_CLK_LGM_CGU=y -CONFIG_DMA_RESTRICTED_POOL=y CONFIG_EFI=y CONFIG_EFI_STUB=y CONFIG_INTEL_IOMMU_SVM=y CONFIG_INTEL_TDX_GUEST=y CONFIG_OF=y -CONFIG_OF_RESERVED_MEM=y CONFIG_X86_5LEVEL=y CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y CONFIG_X86_PLATFORM_DEVICES=y diff --git a/tools/packaging/kernel/kata_config_version b/tools/packaging/kernel/kata_config_version index 9b252fd09..dee79f109 100644 --- a/tools/packaging/kernel/kata_config_version +++ b/tools/packaging/kernel/kata_config_version @@ -1 +1 @@ -113 +114 diff --git a/tools/packaging/kernel/patches/6.2-TDX-v1.x/no_patches.txt b/tools/packaging/kernel/patches/6.2-TDX-v1.x/no_patches.txt new file mode 100644 index 000000000..e69de29bb diff --git a/versions.yaml b/versions.yaml index ccd727c8f..759e6ee92 100644 --- a/versions.yaml +++ b/versions.yaml @@ -191,9 +191,9 @@ assets: kernel-tdx-experimental: # yamllint disable-line rule:line-length - description: "Linux kernel with TDX support -- based on https://github.com/intel/tdx-tools/releases/tag/2023ww01" + description: "Linux kernel with TDX support -- based on https://github.com/intel/tdx-tools/releases/tag/2023ww15" url: "https://github.com/kata-containers/linux/archive/refs/tags" - version: "5.19-TDX-v2.2" + version: "6.2-TDX-v1.8" externals: description: "Third-party projects used by the system" From 07471cd7a64e92d0ff562bb7bf2277a322306378 Mon Sep 17 00:00:00 2001 From: Archana Shinde Date: Thu, 24 Aug 2023 01:41:47 -0700 Subject: [PATCH 09/33] qemu: tdx: Adapt to the TDX 1.5 stack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QEMU for TDX 1.5 makes use of private memory map/unmap. Make changes to govmm to support this. Support for private backing fd for memory is added as knob to the qemu config. Userspace's map/unmap operations are done by fallocate() ioctl on the backing store fd. Reference: https://lore.kernel.org/linux-mm/20220519153713.819591-1-chao.p.peng@linux.intel.com/ Fixes: #7770 Signed-off-by: Archana Shinde Signed-off-by: Fabiano Fidêncio (cherry picked from commit 1e34220c41c813a3a585da226e8624f3551a77df) Signed-off-by: Greg Kurz --- src/runtime/pkg/govmm/qemu/qemu.go | 19 +++++++++++++--- src/runtime/pkg/govmm/qemu/qemu_test.go | 23 ++++++++++++++++++++ src/runtime/virtcontainers/qemu.go | 17 +++++++++++++++ src/runtime/virtcontainers/qemu_arch_base.go | 7 ++++++ 4 files changed, 63 insertions(+), 3 deletions(-) diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index 46e845f9e..c0c692a80 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -141,9 +141,16 @@ const ( func isDimmSupported(config *Config) bool { switch runtime.GOARCH { case "amd64", "386", "ppc64le", "arm64": - if config != nil && config.Machine.Type == MachineTypeMicrovm { - // microvm does not support NUMA - return false + if config != nil { + if config.Machine.Type == MachineTypeMicrovm { + // microvm does not support NUMA + return false + } + if config.Knobs.MemFDPrivate { + // TDX guests rely on MemFD Private, which + // does not have NUMA support yet + return false + } } return true default: @@ -2628,6 +2635,9 @@ type Knobs struct { // MemPrealloc will allocate all the RAM upfront MemPrealloc bool + // Private Memory FD meant for private memory map/unmap. + MemFDPrivate bool + // FileBackedMem requires Memory.Size and Memory.Path of the VM to // be set. FileBackedMem bool @@ -2992,10 +3002,13 @@ func (config *Config) appendMemoryKnobs() { return } var objMemParam, numaMemParam string + dimmName := "dimm1" if config.Knobs.HugePages { objMemParam = "memory-backend-file,id=" + dimmName + ",size=" + config.Memory.Size + ",mem-path=/dev/hugepages" numaMemParam = "node,memdev=" + dimmName + } else if config.Knobs.MemFDPrivate { + objMemParam = "memory-backend-memfd-private,id=" + dimmName + ",size=" + config.Memory.Size } else if config.Knobs.FileBackedMem && config.Memory.Path != "" { objMemParam = "memory-backend-file,id=" + dimmName + ",size=" + config.Memory.Size + ",mem-path=" + config.Memory.Path numaMemParam = "node,memdev=" + dimmName diff --git a/src/runtime/pkg/govmm/qemu/qemu_test.go b/src/runtime/pkg/govmm/qemu/qemu_test.go index 34ca776df..f585714d0 100644 --- a/src/runtime/pkg/govmm/qemu/qemu_test.go +++ b/src/runtime/pkg/govmm/qemu/qemu_test.go @@ -632,6 +632,29 @@ func TestAppendMemoryFileBackedMemPrealloc(t *testing.T) { testConfigAppend(conf, knobs, memString+" "+knobsString, t) } +func TestAppendMemoryBackedMemFdPrivate(t *testing.T) { + conf := &Config{ + Memory: Memory{ + Size: "1G", + Slots: 8, + }, + } + memString := "-m 1G,slots=8" + testConfigAppend(conf, conf.Memory, memString, t) + + knobs := Knobs{ + MemFDPrivate: true, + MemShared: false, + } + objMemString := "-object memory-backend-memfd-private,id=dimm1,size=1G" + memBackendString := "-machine memory-backend=dimm1" + + knobsString := objMemString + " " + knobsString += memBackendString + + testConfigAppend(conf, knobs, memString+" "+knobsString, t) +} + func TestNoRebootKnob(t *testing.T) { conf := &Config{} diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index b03b176b4..f4de0d0d2 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -604,6 +604,23 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi return err } + if q.config.ConfidentialGuest { + // At this point we're safe to just check for the protection field + // on the hypervisor specific code, as availableGuestProtection() + // has been called earlier and we know we have the value stored. + if q.arch.getProtection() == tdxProtection { + knobs.MemFDPrivate = true + + // In case Nydus or VirtioFS is used, which may become a reality + // in the future, whenever we get those hardened for TDX, those + // knobs below would be automatically set. Let's make sure we + // pre-emptively disable them, and with that we can avoid some + // headaches in the future. + knobs.FileBackedMem = false + knobs.MemShared = false + } + } + kernelPath, err := q.config.KernelAssetPath() if err != nil { return err diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index 0dc9f46cd..763ccbb44 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -71,6 +71,9 @@ type qemuArch interface { // memoryTopology returns the memory topology using the given amount of memoryMb and hostMemoryMb memoryTopology(memoryMb, hostMemoryMb uint64, slots uint8) govmmQemu.Memory + // protection returns platform protection + getProtection() guestProtection + // appendConsole appends a console to devices appendConsole(ctx context.Context, devices []govmmQemu.Device, path string) ([]govmmQemu.Device, error) @@ -280,6 +283,10 @@ func (q *qemuArchBase) machine() govmmQemu.Machine { return q.qemuMachine } +func (q *qemuArchBase) getProtection() guestProtection { + return q.protection +} + func (q *qemuArchBase) qemuPath() string { return q.qemuExePath } From fa824af2349ae664abb00512b0b034156f563984 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Mon, 21 Aug 2023 15:21:34 +0200 Subject: [PATCH 10/33] qemu: tdx: Workaround SMP issue with TDX 1.5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `...,sockets=1,cores=numvcpus,threads=1,...` must be used. Fixes: #7770 Signed-off-by: Fabiano Fidêncio (cherry picked from commit d1b54ede290e95762099fff4e0bcdad10f816126) Signed-off-by: Greg Kurz --- src/runtime/virtcontainers/qemu.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index f4de0d0d2..a3c14b25c 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -618,6 +618,13 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi // headaches in the future. knobs.FileBackedMem = false knobs.MemShared = false + + // SMP is currently broken with TDX 1.5, and + // we must ensure we use something like: + // `...,sockets=1,cores=numvcpus,threads=1,...` + smp.Sockets = 1 + smp.Cores = q.config.NumVCPUs + smp.Threads = 1 } } From 7e6f8010bd6a14da67be46cc6f716ba5560a6408 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Mon, 21 Aug 2023 09:26:42 +0000 Subject: [PATCH 11/33] runtime: run prestart hooks before starting VM for FC Add a new hypervisor capability to tell if it supports device hotplug. If not, we should run prestart hooks before starting new VMs as nerdctl is using the prestart hooks to set up netns. To make nerdctl + FC to work, we need to run the prestart hooks before starting new VMs. Fixes: #6384 Signed-off-by: Peng Tao (cherry picked from commit 32fd013716fa7b4818b2edd88d7b3a62e0e22be1) Signed-off-by: Greg Kurz --- src/runtime/virtcontainers/acrn_arch_base.go | 1 + .../virtcontainers/acrn_arch_base_test.go | 1 + src/runtime/virtcontainers/acrn_test.go | 1 + src/runtime/virtcontainers/clh.go | 1 + src/runtime/virtcontainers/clh_test.go | 3 ++ src/runtime/virtcontainers/qemu_amd64.go | 1 + src/runtime/virtcontainers/qemu_amd64_test.go | 2 + src/runtime/virtcontainers/qemu_arch_base.go | 1 + .../virtcontainers/qemu_arch_base_test.go | 1 + src/runtime/virtcontainers/qemu_ppc64le.go | 1 + src/runtime/virtcontainers/qemu_test.go | 1 + src/runtime/virtcontainers/sandbox.go | 50 ++++++++++++++----- .../virtcontainers/types/capabilities.go | 11 ++++ 13 files changed, 63 insertions(+), 12 deletions(-) diff --git a/src/runtime/virtcontainers/acrn_arch_base.go b/src/runtime/virtcontainers/acrn_arch_base.go index 0b9ee53cc..fa8ce0fe6 100644 --- a/src/runtime/virtcontainers/acrn_arch_base.go +++ b/src/runtime/virtcontainers/acrn_arch_base.go @@ -366,6 +366,7 @@ func (a *acrnArchBase) capabilities(config HypervisorConfig) types.Capabilities caps.SetBlockDeviceSupport() caps.SetBlockDeviceHotplugSupport() + caps.SetNetworkDeviceHotplugSupported() return caps } diff --git a/src/runtime/virtcontainers/acrn_arch_base_test.go b/src/runtime/virtcontainers/acrn_arch_base_test.go index c34974e69..39b3ba98f 100644 --- a/src/runtime/virtcontainers/acrn_arch_base_test.go +++ b/src/runtime/virtcontainers/acrn_arch_base_test.go @@ -89,6 +89,7 @@ func TestAcrnArchBaseCapabilities(t *testing.T) { assert.True(c.IsBlockDeviceSupported()) assert.True(c.IsBlockDeviceHotplugSupported()) assert.False(c.IsFsSharingSupported()) + assert.True(c.IsNetworkDeviceHotplugSupported()) } func TestAcrnArchBaseMemoryTopology(t *testing.T) { diff --git a/src/runtime/virtcontainers/acrn_test.go b/src/runtime/virtcontainers/acrn_test.go index 9ce189812..8c7e646c1 100644 --- a/src/runtime/virtcontainers/acrn_test.go +++ b/src/runtime/virtcontainers/acrn_test.go @@ -82,6 +82,7 @@ func TestAcrnCapabilities(t *testing.T) { caps := a.Capabilities(a.ctx) assert.True(caps.IsBlockDeviceSupported()) assert.True(caps.IsBlockDeviceHotplugSupported()) + assert.True(caps.IsNetworkDeviceHotplugSupported()) } func testAcrnAddDevice(t *testing.T, devInfo interface{}, devType DeviceType, expected []Device) { diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index 0d438b63c..04aba8545 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -1221,6 +1221,7 @@ func (clh *cloudHypervisor) Capabilities(ctx context.Context) types.Capabilities caps.SetFsSharingSupport() } caps.SetBlockDeviceHotplugSupport() + caps.SetNetworkDeviceHotplugSupported() return caps } diff --git a/src/runtime/virtcontainers/clh_test.go b/src/runtime/virtcontainers/clh_test.go index c141adfd9..cc559f441 100644 --- a/src/runtime/virtcontainers/clh_test.go +++ b/src/runtime/virtcontainers/clh_test.go @@ -753,4 +753,7 @@ func TestClhCapabilities(t *testing.T) { c = clh.Capabilities(ctx) assert.False(c.IsFsSharingSupported()) + + assert.True(c.IsNetworkDeviceHotplugSupported()) + assert.True(c.IsBlockDeviceHotplugSupported()) } diff --git a/src/runtime/virtcontainers/qemu_amd64.go b/src/runtime/virtcontainers/qemu_amd64.go index 64a0e0915..96aa68946 100644 --- a/src/runtime/virtcontainers/qemu_amd64.go +++ b/src/runtime/virtcontainers/qemu_amd64.go @@ -162,6 +162,7 @@ func (q *qemuAmd64) capabilities(hConfig HypervisorConfig) types.Capabilities { if q.qemuMachine.Type == QemuQ35 || q.qemuMachine.Type == QemuVirt { caps.SetBlockDeviceHotplugSupport() + caps.SetNetworkDeviceHotplugSupported() } caps.SetMultiQueueSupport() diff --git a/src/runtime/virtcontainers/qemu_amd64_test.go b/src/runtime/virtcontainers/qemu_amd64_test.go index 82005b745..1425cb38c 100644 --- a/src/runtime/virtcontainers/qemu_amd64_test.go +++ b/src/runtime/virtcontainers/qemu_amd64_test.go @@ -47,10 +47,12 @@ func TestQemuAmd64Capabilities(t *testing.T) { amd64 := newTestQemu(assert, QemuQ35) caps := amd64.capabilities(config) assert.True(caps.IsBlockDeviceHotplugSupported()) + assert.True(caps.IsNetworkDeviceHotplugSupported()) amd64 = newTestQemu(assert, QemuMicrovm) caps = amd64.capabilities(config) assert.False(caps.IsBlockDeviceHotplugSupported()) + assert.False(caps.IsNetworkDeviceHotplugSupported()) } func TestQemuAmd64Bridges(t *testing.T) { diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index 763ccbb44..fd92be772 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -307,6 +307,7 @@ func (q *qemuArchBase) capabilities(hConfig HypervisorConfig) types.Capabilities var caps types.Capabilities caps.SetBlockDeviceHotplugSupport() caps.SetMultiQueueSupport() + caps.SetNetworkDeviceHotplugSupported() if hConfig.SharedFS != config.NoSharedFS { caps.SetFsSharingSupport() } diff --git a/src/runtime/virtcontainers/qemu_arch_base_test.go b/src/runtime/virtcontainers/qemu_arch_base_test.go index 75d7de029..3090e765b 100644 --- a/src/runtime/virtcontainers/qemu_arch_base_test.go +++ b/src/runtime/virtcontainers/qemu_arch_base_test.go @@ -123,6 +123,7 @@ func TestQemuArchBaseCapabilities(t *testing.T) { c := qemuArchBase.capabilities(hConfig) assert.True(c.IsBlockDeviceHotplugSupported()) assert.True(c.IsFsSharingSupported()) + assert.True(c.IsNetworkDeviceHotplugSupported()) hConfig.SharedFS = config.NoSharedFS c = qemuArchBase.capabilities(hConfig) diff --git a/src/runtime/virtcontainers/qemu_ppc64le.go b/src/runtime/virtcontainers/qemu_ppc64le.go index 7d71d72ba..015d1758c 100644 --- a/src/runtime/virtcontainers/qemu_ppc64le.go +++ b/src/runtime/virtcontainers/qemu_ppc64le.go @@ -104,6 +104,7 @@ func (q *qemuPPC64le) capabilities(hConfig HypervisorConfig) types.Capabilities // pseries machine type supports hotplugging drives if q.qemuMachine.Type == QemuPseries { caps.SetBlockDeviceHotplugSupport() + caps.SetNetworkDeviceHotplugSupported() } caps.SetMultiQueueSupport() diff --git a/src/runtime/virtcontainers/qemu_test.go b/src/runtime/virtcontainers/qemu_test.go index 418075e26..b47d23490 100644 --- a/src/runtime/virtcontainers/qemu_test.go +++ b/src/runtime/virtcontainers/qemu_test.go @@ -475,6 +475,7 @@ func TestQemuCapabilities(t *testing.T) { caps := q.Capabilities(q.ctx) assert.True(caps.IsBlockDeviceHotplugSupported()) + assert.True(caps.IsNetworkDeviceHotplugSupported()) } func TestQemuQemuPath(t *testing.T) { diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 3a3c275bb..b84174b43 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -1288,6 +1288,23 @@ func (s *Sandbox) cleanSwap(ctx context.Context) { } } +func (s *Sandbox) runPrestartHooks(ctx context.Context, prestartHookFunc func(context.Context) error) error { + hid, err := s.GetHypervisorPid() + if err != nil { + s.Logger().Errorf("fail to get hypervisor pid for sandbox %s", s.id) + return err + } + s.Logger().Infof("sandbox %s hypervisor pid is %v", s.id, hid) + ctx = context.WithValue(ctx, HypervisorPidKey{}, hid) + + if err := prestartHookFunc(ctx); err != nil { + s.Logger().Errorf("fail to run prestartHook for sandbox %s: %s", s.id, err) + return err + } + + return nil +} + // startVM starts the VM. func (s *Sandbox) startVM(ctx context.Context, prestartHookFunc func(context.Context) error) (err error) { span, ctx := katatrace.Trace(ctx, s.Logger(), "startVM", sandboxTracingTags, map[string]string{"sandbox_id": s.id}) @@ -1310,6 +1327,17 @@ func (s *Sandbox) startVM(ctx context.Context, prestartHookFunc func(context.Con } }() + caps := s.hypervisor.Capabilities(ctx) + // If the hypervisor does not support device hotplug, run prestart hooks + // before spawning the VM so that it is possible to let the hooks set up + // netns and thus network devices are set up statically. + if !caps.IsNetworkDeviceHotplugSupported() && prestartHookFunc != nil { + err = s.runPrestartHooks(ctx, prestartHookFunc) + if err != nil { + return err + } + } + if err := s.network.Run(ctx, func() error { if s.factory != nil { vm, err := s.factory.GetVM(ctx, VMConfig{ @@ -1329,24 +1357,22 @@ func (s *Sandbox) startVM(ctx context.Context, prestartHookFunc func(context.Con return err } - if prestartHookFunc != nil { - hid, err := s.GetHypervisorPid() + if caps.IsNetworkDeviceHotplugSupported() && prestartHookFunc != nil { + err = s.runPrestartHooks(ctx, prestartHookFunc) if err != nil { return err } - s.Logger().Infof("hypervisor pid is %v", hid) - ctx = context.WithValue(ctx, HypervisorPidKey{}, hid) - - if err := prestartHookFunc(ctx); err != nil { - return err - } } - // 1. Do not scan the netns if we want no network for the vmm. - // 2. In case of vm factory, scan the netns to hotplug interfaces after vm is started. - // 3. In case of prestartHookFunc, network config might have been changed. We need to + // 1. Do not scan the netns if we want no network for the vmm + // 2. Do not scan the netns if the vmm does not support device hotplug, in which case + // the network is already set up statically + // 3. In case of vm factory, scan the netns to hotplug interfaces after vm is started. + // 4. In case of prestartHookFunc, network config might have been changed. We need to // rescan and handle the change. - if !s.config.NetworkConfig.DisableNewNetwork && (s.factory != nil || prestartHookFunc != nil) { + if !s.config.NetworkConfig.DisableNewNetwork && + caps.IsNetworkDeviceHotplugSupported() && + (s.factory != nil || prestartHookFunc != nil) { if _, err := s.network.AddEndpoints(ctx, s, nil, true); err != nil { return err } diff --git a/src/runtime/virtcontainers/types/capabilities.go b/src/runtime/virtcontainers/types/capabilities.go index c035444b6..71c7f2d8f 100644 --- a/src/runtime/virtcontainers/types/capabilities.go +++ b/src/runtime/virtcontainers/types/capabilities.go @@ -10,6 +10,7 @@ const ( blockDeviceHotplugSupport multiQueueSupport fsSharingSupported + networkDeviceHotplugSupport ) // Capabilities describe a virtcontainers hypervisor capabilities @@ -57,3 +58,13 @@ func (caps *Capabilities) IsFsSharingSupported() bool { func (caps *Capabilities) SetFsSharingSupport() { caps.flags |= fsSharingSupported } + +// IsDeviceHotplugSupported tells if an hypervisor supports device hotplug. +func (caps *Capabilities) IsNetworkDeviceHotplugSupported() bool { + return caps.flags&networkDeviceHotplugSupport != 0 +} + +// SetDeviceHotplugSupported sets the host filesystem sharing capability to true. +func (caps *Capabilities) SetNetworkDeviceHotplugSupported() { + caps.flags |= networkDeviceHotplugSupport +} From c17cbd30f0eaca910bdabd3e5381d4b18be35301 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Tue, 22 Aug 2023 11:17:31 +0000 Subject: [PATCH 12/33] runtime: fail early when starting docker container with FC FC does not support network device hotplug. Let's add a check to fail early when starting containers created by docker. Signed-off-by: Peng Tao (cherry picked from commit 21204caf20a51e514e60586c705d8b7b836b118c) Signed-off-by: Greg Kurz --- src/runtime/virtcontainers/sandbox.go | 11 +++++++++ src/runtime/virtcontainers/utils/utils.go | 20 ++++++++++++++++ .../virtcontainers/utils/utils_test.go | 23 +++++++++++++++++++ 3 files changed, 54 insertions(+) diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index b84174b43..a53003fc6 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -933,6 +933,17 @@ func (s *Sandbox) createNetwork(ctx context.Context) error { return nil } + // docker container needs the hypervisor process ID to find out the container netns, + // which means that the hypervisor has to support network device hotplug so that docker + // can use the prestart hooks to set up container netns. + caps := s.hypervisor.Capabilities(ctx) + if !caps.IsNetworkDeviceHotplugSupported() { + spec := s.GetPatchedOCISpec() + if utils.IsDockerContainer(spec) { + return errors.New("docker container needs network device hotplug but the configured hypervisor does not support it") + } + } + span, ctx := katatrace.Trace(ctx, s.Logger(), "createNetwork", sandboxTracingTags, map[string]string{"sandbox_id": s.id}) defer span.End() katatrace.AddTags(span, "network", s.network, "NetworkConfig", s.config.NetworkConfig) diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go index bac62e8cf..735cf3a2f 100644 --- a/src/runtime/virtcontainers/utils/utils.go +++ b/src/runtime/virtcontainers/utils/utils.go @@ -12,9 +12,11 @@ import ( "os" "os/exec" "path/filepath" + "strings" "syscall" "time" + "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" @@ -494,3 +496,21 @@ func RevertBytes(num uint64) uint64 { } return 1024*RevertBytes(a) + b } + +// IsDockerContainer returns if the container is managed by docker +// This is done by checking the prestart hook for `libnetwork` arguments. +func IsDockerContainer(spec *specs.Spec) bool { + if spec == nil || spec.Hooks == nil { + return false + } + + for _, hook := range spec.Hooks.Prestart { + for _, arg := range hook.Args { + if strings.HasPrefix(arg, "libnetwork") { + return true + } + } + } + + return false +} diff --git a/src/runtime/virtcontainers/utils/utils_test.go b/src/runtime/virtcontainers/utils/utils_test.go index 8f3d0eed2..bae9af8db 100644 --- a/src/runtime/virtcontainers/utils/utils_test.go +++ b/src/runtime/virtcontainers/utils/utils_test.go @@ -16,6 +16,7 @@ import ( "syscall" "testing" + "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" ) @@ -580,3 +581,25 @@ func TestRevertBytes(t *testing.T) { num := RevertBytes(testNum) assert.Equal(expectedNum, num) } + +func TestIsDockerContainer(t *testing.T) { + assert := assert.New(t) + + ociSpec := &specs.Spec{ + Hooks: &specs.Hooks{ + Prestart: []specs.Hook{ + { + Args: []string{ + "haha", + }, + }, + }, + }, + } + assert.False(IsDockerContainer(ociSpec)) + + ociSpec.Hooks.Prestart = append(ociSpec.Hooks.Prestart, specs.Hook{ + Args: []string{"libnetwork-xxx"}, + }) + assert.True(IsDockerContainer(ociSpec)) +} From e0513094a02d196193cf6494ebebb0c1ddbe60cb Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 30 Aug 2023 03:01:22 +0000 Subject: [PATCH 13/33] runtime/vc: runPrestartHooks should ignore GetHypervisorPid failure If we are running FC hypervisor, it is not started when prestart hooks are executed. So we should just ignore such error and just go ahead and run the hooks. Signed-off-by: Peng Tao (cherry picked from commit 2e4c874726a9e10b53fcae52bca1a45bc642f689) Signed-off-by: Greg Kurz --- src/runtime/virtcontainers/sandbox.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index a53003fc6..bbbe9ef19 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -1300,13 +1300,12 @@ func (s *Sandbox) cleanSwap(ctx context.Context) { } func (s *Sandbox) runPrestartHooks(ctx context.Context, prestartHookFunc func(context.Context) error) error { - hid, err := s.GetHypervisorPid() - if err != nil { - s.Logger().Errorf("fail to get hypervisor pid for sandbox %s", s.id) - return err + hid, _ := s.GetHypervisorPid() + // Ignore errors here as hypervisor might not have been started yet, likely in FC case. + if hid > 0 { + s.Logger().Infof("sandbox %s hypervisor pid is %v", s.id, hid) + ctx = context.WithValue(ctx, HypervisorPidKey{}, hid) } - s.Logger().Infof("sandbox %s hypervisor pid is %v", s.id, hid) - ctx = context.WithValue(ctx, HypervisorPidKey{}, hid) if err := prestartHookFunc(ctx); err != nil { s.Logger().Errorf("fail to run prestartHook for sandbox %s: %s", s.id, err) From 03d712ab252c8b9c005ceeef680f5026ea652429 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 6 Sep 2023 17:11:16 +0200 Subject: [PATCH 14/33] runtime: Allow virtio_fs_extra_args annotation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some use cases may just require passing extra arguments to virtiofsd, and having this disabled by default makes it impossible to set when using kata-deploy, as changes in the configuration file would be overwritten by the daemon-set. With this in mind, let's allow users to pass whatever thet need (and here I'm specifically looking at `--xattr`) as a virtio_fs_extra_arg. Fixes: #7853 Signed-off-by: Fabiano Fidêncio (cherry picked from commit b1dd09a4d345e03191aa64b866eca03179efd9ea) Signed-off-by: Greg Kurz --- src/runtime/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/Makefile b/src/runtime/Makefile index f34d21249..7655a060c 100644 --- a/src/runtime/Makefile +++ b/src/runtime/Makefile @@ -192,7 +192,7 @@ DEFMEMSLOTS := 10 DEFMAXMEMSZ := 0 #Default number of bridges DEFBRIDGES := 1 -DEFENABLEANNOTATIONS := [\"enable_iommu\"] +DEFENABLEANNOTATIONS := [\"enable_iommu\", \"virtio_fs_extra_args\"] DEFDISABLEGUESTSECCOMP := true DEFDISABLEGUESTEMPTYDIR := false #Default experimental features enabled From 4679aa77124978548bb30627209c85d76786dcbc Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Wed, 6 Sep 2023 17:43:17 +0200 Subject: [PATCH 15/33] runtime/qemu: Pass "--xattr" to virtiofsd instead of "-o xattr" The "-o" syntax belongs to the legacy C virtiofsd. It is deprecated with the rust implementation. Signed-off-by: Greg Kurz (cherry picked from commit 81536f21af71e0fa0d24754006f9ac395fabee9e) Signed-off-by: Greg Kurz --- src/runtime/virtcontainers/qemu.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index a3c14b25c..e650a9cc3 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -513,7 +513,7 @@ func (q *qemu) createVirtiofsDaemon(sharedPath string) (VirtiofsDaemon, error) { // in virtiofs if SELinux on the guest side is enabled. if !q.config.DisableGuestSeLinux { q.Logger().Info("Set the xattr option for virtiofsd") - q.config.VirtioFSExtraArgs = append(q.config.VirtioFSExtraArgs, "-o", "xattr") + q.config.VirtioFSExtraArgs = append(q.config.VirtioFSExtraArgs, "--xattr") } // default use virtiofsd From f0278f41d71db478cbbf273e1618832a33e55fe5 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Wed, 6 Sep 2023 17:56:01 +0200 Subject: [PATCH 16/33] runtime/virtiofsd: Drop all references to "--cache=none" This syntax belongs to the legacy C virtiofsd implementation that we don't support anymore since kata-containers 3.1.3 because of other API breaking changes. People have been warned to switch from "none" to "never" since kata-containers 2.5.2. Let's officially do that. The compat code that would convert "none" to "never" isn't needed anymore. Just drop it. Fixes #7864 Signed-off-by: Greg Kurz (cherry picked from commit 72c510d057a21d96628ce4b2b12bfa3b31bec99c) Signed-off-by: Greg Kurz --- src/runtime/virtcontainers/virtiofsd.go | 4 ---- src/runtime/virtcontainers/virtiofsd_test.go | 13 ++++--------- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/src/runtime/virtcontainers/virtiofsd.go b/src/runtime/virtcontainers/virtiofsd.go index 3e02756eb..0a6264684 100644 --- a/src/runtime/virtcontainers/virtiofsd.go +++ b/src/runtime/virtcontainers/virtiofsd.go @@ -39,7 +39,6 @@ var ( const ( typeVirtioFSCacheModeNever = "never" - typeVirtioFSCacheModeNone = "none" typeVirtioFSCacheModeAlways = "always" typeVirtioFSCacheModeAuto = "auto" ) @@ -219,9 +218,6 @@ func (v *virtiofsd) valid() error { if v.cache == "" { v.cache = typeVirtioFSCacheModeAuto - } else if v.cache == typeVirtioFSCacheModeNone { - v.Logger().Warn("virtio-fs cache mode `none` is deprecated since Kata Containers 2.5.0 and will be removed in the future release, please use `never` instead. For more details please refer to https://github.com/kata-containers/kata-containers/issues/4234.") - v.cache = typeVirtioFSCacheModeNever } else if v.cache != typeVirtioFSCacheModeAuto && v.cache != typeVirtioFSCacheModeAlways && v.cache != typeVirtioFSCacheModeNever { return errVirtiofsdInvalidVirtiofsCacheMode(v.cache) } diff --git a/src/runtime/virtcontainers/virtiofsd_test.go b/src/runtime/virtcontainers/virtiofsd_test.go index c7d1e1e78..d44a523e6 100644 --- a/src/runtime/virtcontainers/virtiofsd_test.go +++ b/src/runtime/virtcontainers/virtiofsd_test.go @@ -76,15 +76,15 @@ func TestVirtiofsdArgs(t *testing.T) { v := &virtiofsd{ path: "/usr/bin/virtiofsd", sourcePath: "/run/kata-shared/foo", - cache: "none", + cache: "never", } - expected := "--syslog --cache=none --shared-dir=/run/kata-shared/foo --fd=123" + expected := "--syslog --cache=never --shared-dir=/run/kata-shared/foo --fd=123" args, err := v.args(123) assert.NoError(err) assert.Equal(expected, strings.Join(args, " ")) - expected = "--syslog --cache=none --shared-dir=/run/kata-shared/foo --fd=456" + expected = "--syslog --cache=never --shared-dir=/run/kata-shared/foo --fd=456" args, err = v.args(456) assert.NoError(err) assert.Equal(expected, strings.Join(args, " ")) @@ -130,12 +130,7 @@ func TestValid(t *testing.T) { {"source is not available", func(v *virtiofsd) { v.sourcePath = "/foo/bar" }, errVirtiofsdSourceNotAvailable, nil}, - {"replace cache mode none by never", func(v *virtiofsd) { - v.cache = "none" - }, nil, func(name string, a *assert.Assertions, v *virtiofsd) { - a.Equal("never", v.cache, "test case %+s, cache mode none should be replaced by never", name) - }}, - {"invald cache mode: replace none by never", func(v *virtiofsd) { + {"invalid cache mode", func(v *virtiofsd) { v.cache = "foo" }, errVirtiofsdInvalidVirtiofsCacheMode("foo"), nil}, } From e7579d20f767eabcf9ab47722588b8ff5b999c26 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Thu, 7 Sep 2023 16:44:27 +0200 Subject: [PATCH 17/33] runtime/qemu: Rework QMP/HMP support PR #6146 added the possibility to control QEMU with an extra HMP socket as an aid for debugging. This is great for development or bug chasing but this raises some concerns in production. The HMP monitor allows to temper with the VM state in a variety of ways. This could be intentionally or mistakenly used to inject subtle bugs in the VM that would be extremely hard if not even impossible to debug. We definitely don't want that to be enabled by default. The feature is currently wired to the `enable_debug` setting in the `[hypervisor.qemu]` section of the configuration file. This setting has historically been used to control "debug output" and it is used as such by some downstream users (e.g. Openshift). Forcing people to have the extra HMP backdoor at the same time is abusive and dangerous. A new `extra_monitor_socket` is added to `[hypervisor.qemu]` to give fine control on whether the HMP socket is wanted or not. This setting is still gated by `enable_debug = true` to make it clear it is for debug only. The default is to not have the HMP socket though. This isn't backward compatible with #6416 but it is for the sake of "better safe than sorry". An extra monitor socket makes the QEMU instance untrusted. A warning is thus logged to the journal when one is requested. While here, also allow the user to choose between HMP and QMP for the extra monitor socket. Motivation is that QMP offers way more options to control or introspect the VM than HMP does. Users can also ask for pretty json formatting well suited for human reading. This will improve the debugging experience. This feature is only made visible in the base and GPU configurations of QEMU for now. Fixes #7952 Signed-off-by: Greg Kurz (cherry picked from commit 1f16b6627be26fd9fe165fe9d1ef8eb5f7bad26e) Signed-off-by: Greg Kurz --- .../configuration-qemu-nvidia-gpu.toml.in | 17 +- src/runtime/config/configuration-qemu.toml.in | 17 +- src/runtime/pkg/govmm/qemu/qemu.go | 31 ++- src/runtime/pkg/govmm/qemu/qemu_test.go | 43 ++-- .../pkg/katautils/config-settings.go.in | 2 + src/runtime/pkg/katautils/config.go | 186 ++++++++++-------- src/runtime/virtcontainers/hypervisor.go | 6 +- src/runtime/virtcontainers/qemu.go | 41 ++-- 8 files changed, 217 insertions(+), 126 deletions(-) diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in index 4861cb1ed..c88e64bbf 100644 --- a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in +++ b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in @@ -327,11 +327,26 @@ valid_file_mem_backends = @DEFVALIDFILEMEMBACKENDS@ pflashes = [] # This option changes the default hypervisor and kernel parameters -# to enable debug output where available. And Debug also enable the hmp socket. +# to enable debug output where available. # # Default false #enable_debug = true +# This option allows to add an extra HMP or QMP socket when `enable_debug = true` +# +# WARNING: Anyone with access to the extra socket can take full control of +# Qemu. This is for debugging purpose only and must *NEVER* be used in +# production. +# +# Valid values are : +# - "hmp" +# - "qmp" +# - "qmp-pretty" (same as "qmp" with pretty json formatting) +# +# If set to the empty string "", no extra monitor socket is added. This is +# the default. +#extra_monitor_socket = hmp + # Disable the customizations done in the runtime when it detects # that it is running on top a VMM. This will result in the runtime # behaving as it would when running on bare metal. diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index fc38a0866..8179069f1 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -327,11 +327,26 @@ valid_file_mem_backends = @DEFVALIDFILEMEMBACKENDS@ pflashes = [] # This option changes the default hypervisor and kernel parameters -# to enable debug output where available. And Debug also enable the hmp socket. +# to enable debug output where available. # # Default false #enable_debug = true +# This option allows to add an extra HMP or QMP socket when `enable_debug = true` +# +# WARNING: Anyone with access to the extra socket can take full control of +# Qemu. This is for debugging purpose only and must *NEVER* be used in +# production. +# +# Valid values are : +# - "hmp" +# - "qmp" +# - "qmp-pretty" (same as "qmp" with pretty json formatting) +# +# If set to the empty string "", no extra monitor socket is added. This is +# the default. +#extra_monitor_socket = hmp + # Disable the customizations done in the runtime when it detects # that it is running on top a VMM. This will result in the runtime # behaving as it would when running on bare metal. diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index c0c692a80..ffb464c33 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -2471,14 +2471,28 @@ const ( Unix QMPSocketType = "unix" ) -// QMPSocket represents a qemu QMP socket configuration. +// MonitorProtocol tells what protocol is used on a QMPSocket +type MonitorProtocol string + +const ( + // Socket using a human-friendly text-based protocol. + Hmp MonitorProtocol = "hmp" + + // Socket using a richer json-based protocol. + Qmp MonitorProtocol = "qmp" + + // Same as Qmp with pretty json formatting. + QmpPretty MonitorProtocol = "qmp-pretty" +) + +// QMPSocket represents a qemu QMP or HMP socket configuration. // nolint: govet type QMPSocket struct { // Type is the socket type (e.g. "unix"). Type QMPSocketType - // Human Monitor Interface (HMP) (true for HMP, false for QMP, default false) - IsHmp bool + // Protocol is the protocol to be used on the socket. + Protocol MonitorProtocol // QMP listener file descriptor to be passed to qemu FD *os.File @@ -2504,6 +2518,10 @@ func (qmp QMPSocket) Valid() bool { return false } + if qmp.Protocol != Hmp && qmp.Protocol != Qmp && qmp.Protocol != QmpPretty { + return false + } + return true } @@ -2855,10 +2873,11 @@ func (config *Config) appendQMPSockets() { } } - if q.IsHmp { + switch q.Protocol { + case Hmp: config.qemuParams = append(config.qemuParams, "-monitor") - } else { - config.qemuParams = append(config.qemuParams, "-qmp") + default: + config.qemuParams = append(config.qemuParams, fmt.Sprintf("-%s", q.Protocol)) } config.qemuParams = append(config.qemuParams, strings.Join(qmpParams, ",")) diff --git a/src/runtime/pkg/govmm/qemu/qemu_test.go b/src/runtime/pkg/govmm/qemu/qemu_test.go index f585714d0..e1cb2a2d1 100644 --- a/src/runtime/pkg/govmm/qemu/qemu_test.go +++ b/src/runtime/pkg/govmm/qemu/qemu_test.go @@ -726,10 +726,11 @@ var qmpSingleSocketString = "-qmp unix:path=cc-qmp" func TestAppendSingleQMPSocketServer(t *testing.T) { qmp := QMPSocket{ - Type: "unix", - Name: "cc-qmp", - Server: true, - NoWait: true, + Type: "unix", + Name: "cc-qmp", + Server: true, + NoWait: true, + Protocol: Qmp, } testAppend(qmp, qmpSingleSocketServerString, t) @@ -737,9 +738,10 @@ func TestAppendSingleQMPSocketServer(t *testing.T) { func TestAppendSingleQMPSocket(t *testing.T) { qmp := QMPSocket{ - Type: Unix, - Name: "cc-qmp", - Server: false, + Type: Unix, + Name: "cc-qmp", + Server: false, + Protocol: Qmp, } testAppend(qmp, qmpSingleSocketString, t) @@ -756,10 +758,11 @@ func TestAppendQMPSocketServerFd(t *testing.T) { }() qmp := QMPSocket{ - Type: "unix", - FD: foo, - Server: true, - NoWait: true, + Type: "unix", + FD: foo, + Server: true, + NoWait: true, + Protocol: Qmp, } testAppend(qmp, qmpSocketServerFdString, t) @@ -770,16 +773,18 @@ var qmpSocketServerString = "-qmp unix:path=cc-qmp-1,server=on,wait=off -qmp uni func TestAppendQMPSocketServer(t *testing.T) { qmp := []QMPSocket{ { - Type: "unix", - Name: "cc-qmp-1", - Server: true, - NoWait: true, + Type: "unix", + Name: "cc-qmp-1", + Server: true, + NoWait: true, + Protocol: Qmp, }, { - Type: "unix", - Name: "cc-qmp-2", - Server: true, - NoWait: true, + Type: "unix", + Name: "cc-qmp-2", + Server: true, + NoWait: true, + Protocol: Qmp, }, } diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index 77caa53fb..58faec56c 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -11,6 +11,7 @@ package katautils import ( config "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" + govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu" ) // name is the name of the runtime @@ -79,6 +80,7 @@ const defaultEnableIOMMU bool = false const defaultEnableIOMMUPlatform bool = false const defaultFileBackedMemRootDir string = "" const defaultEnableDebug bool = false +const defaultExtraMonitorSocket govmmQemu.MonitorProtocol = "" const defaultDisableNestingChecks bool = false const defaultMsize9p uint32 = 8192 const defaultEntropySource = "/dev/urandom" diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index a4e17b239..c950ed66a 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -78,87 +78,88 @@ type factory struct { } type hypervisor struct { - Path string `toml:"path"` - JailerPath string `toml:"jailer_path"` - Kernel string `toml:"kernel"` - CtlPath string `toml:"ctlpath"` - Initrd string `toml:"initrd"` - Image string `toml:"image"` - RootfsType string `toml:"rootfs_type"` - Firmware string `toml:"firmware"` - FirmwareVolume string `toml:"firmware_volume"` - MachineAccelerators string `toml:"machine_accelerators"` - CPUFeatures string `toml:"cpu_features"` - KernelParams string `toml:"kernel_params"` - MachineType string `toml:"machine_type"` - BlockDeviceDriver string `toml:"block_device_driver"` - EntropySource string `toml:"entropy_source"` - SharedFS string `toml:"shared_fs"` - VirtioFSDaemon string `toml:"virtio_fs_daemon"` - VirtioFSCache string `toml:"virtio_fs_cache"` - VhostUserStorePath string `toml:"vhost_user_store_path"` - FileBackedMemRootDir string `toml:"file_mem_backend"` - GuestHookPath string `toml:"guest_hook_path"` - GuestMemoryDumpPath string `toml:"guest_memory_dump_path"` - SeccompSandbox string `toml:"seccompsandbox"` - BlockDeviceAIO string `toml:"block_device_aio"` - HypervisorPathList []string `toml:"valid_hypervisor_paths"` - JailerPathList []string `toml:"valid_jailer_paths"` - CtlPathList []string `toml:"valid_ctlpaths"` - VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"` - VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"` - PFlashList []string `toml:"pflashes"` - VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"` - FileBackedMemRootList []string `toml:"valid_file_mem_backends"` - EntropySourceList []string `toml:"valid_entropy_sources"` - EnableAnnotations []string `toml:"enable_annotations"` - RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"` - TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"` - MemOffset uint64 `toml:"memory_offset"` - DefaultMaxMemorySize uint64 `toml:"default_maxmemory"` - DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"` - DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"` - DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"` - DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"` - NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"` - NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"` - NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"` - NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"` - VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"` - VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"` - DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"` - MemorySize uint32 `toml:"default_memory"` - MemSlots uint32 `toml:"memory_slots"` - DefaultBridges uint32 `toml:"default_bridges"` - Msize9p uint32 `toml:"msize_9p"` - NumVCPUs int32 `toml:"default_vcpus"` - BlockDeviceCacheSet bool `toml:"block_device_cache_set"` - BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"` - BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"` - EnableVhostUserStore bool `toml:"enable_vhost_user_store"` - VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"` - DisableBlockDeviceUse bool `toml:"disable_block_device_use"` - MemPrealloc bool `toml:"enable_mem_prealloc"` - HugePages bool `toml:"enable_hugepages"` - VirtioMem bool `toml:"enable_virtio_mem"` - IOMMU bool `toml:"enable_iommu"` - IOMMUPlatform bool `toml:"enable_iommu_platform"` - Debug bool `toml:"enable_debug"` - DisableNestingChecks bool `toml:"disable_nesting_checks"` - EnableIOThreads bool `toml:"enable_iothreads"` - DisableImageNvdimm bool `toml:"disable_image_nvdimm"` - HotPlugVFIO config.PCIePort `toml:"hot_plug_vfio"` - ColdPlugVFIO config.PCIePort `toml:"cold_plug_vfio"` - DisableVhostNet bool `toml:"disable_vhost_net"` - GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` - ConfidentialGuest bool `toml:"confidential_guest"` - SevSnpGuest bool `toml:"sev_snp_guest"` - GuestSwap bool `toml:"enable_guest_swap"` - Rootless bool `toml:"rootless"` - DisableSeccomp bool `toml:"disable_seccomp"` - DisableSeLinux bool `toml:"disable_selinux"` - DisableGuestSeLinux bool `toml:"disable_guest_selinux"` - LegacySerial bool `toml:"use_legacy_serial"` + Path string `toml:"path"` + JailerPath string `toml:"jailer_path"` + Kernel string `toml:"kernel"` + CtlPath string `toml:"ctlpath"` + Initrd string `toml:"initrd"` + Image string `toml:"image"` + RootfsType string `toml:"rootfs_type"` + Firmware string `toml:"firmware"` + FirmwareVolume string `toml:"firmware_volume"` + MachineAccelerators string `toml:"machine_accelerators"` + CPUFeatures string `toml:"cpu_features"` + KernelParams string `toml:"kernel_params"` + MachineType string `toml:"machine_type"` + BlockDeviceDriver string `toml:"block_device_driver"` + EntropySource string `toml:"entropy_source"` + SharedFS string `toml:"shared_fs"` + VirtioFSDaemon string `toml:"virtio_fs_daemon"` + VirtioFSCache string `toml:"virtio_fs_cache"` + VhostUserStorePath string `toml:"vhost_user_store_path"` + FileBackedMemRootDir string `toml:"file_mem_backend"` + GuestHookPath string `toml:"guest_hook_path"` + GuestMemoryDumpPath string `toml:"guest_memory_dump_path"` + SeccompSandbox string `toml:"seccompsandbox"` + BlockDeviceAIO string `toml:"block_device_aio"` + HypervisorPathList []string `toml:"valid_hypervisor_paths"` + JailerPathList []string `toml:"valid_jailer_paths"` + CtlPathList []string `toml:"valid_ctlpaths"` + VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"` + VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"` + PFlashList []string `toml:"pflashes"` + VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"` + FileBackedMemRootList []string `toml:"valid_file_mem_backends"` + EntropySourceList []string `toml:"valid_entropy_sources"` + EnableAnnotations []string `toml:"enable_annotations"` + RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"` + TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"` + MemOffset uint64 `toml:"memory_offset"` + DefaultMaxMemorySize uint64 `toml:"default_maxmemory"` + DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"` + DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"` + DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"` + DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"` + NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"` + NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"` + NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"` + NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"` + VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"` + VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"` + DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"` + MemorySize uint32 `toml:"default_memory"` + MemSlots uint32 `toml:"memory_slots"` + DefaultBridges uint32 `toml:"default_bridges"` + Msize9p uint32 `toml:"msize_9p"` + NumVCPUs int32 `toml:"default_vcpus"` + BlockDeviceCacheSet bool `toml:"block_device_cache_set"` + BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"` + BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"` + EnableVhostUserStore bool `toml:"enable_vhost_user_store"` + VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"` + DisableBlockDeviceUse bool `toml:"disable_block_device_use"` + MemPrealloc bool `toml:"enable_mem_prealloc"` + HugePages bool `toml:"enable_hugepages"` + VirtioMem bool `toml:"enable_virtio_mem"` + IOMMU bool `toml:"enable_iommu"` + IOMMUPlatform bool `toml:"enable_iommu_platform"` + Debug bool `toml:"enable_debug"` + DisableNestingChecks bool `toml:"disable_nesting_checks"` + EnableIOThreads bool `toml:"enable_iothreads"` + DisableImageNvdimm bool `toml:"disable_image_nvdimm"` + HotPlugVFIO config.PCIePort `toml:"hot_plug_vfio"` + ColdPlugVFIO config.PCIePort `toml:"cold_plug_vfio"` + DisableVhostNet bool `toml:"disable_vhost_net"` + GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` + ConfidentialGuest bool `toml:"confidential_guest"` + SevSnpGuest bool `toml:"sev_snp_guest"` + GuestSwap bool `toml:"enable_guest_swap"` + Rootless bool `toml:"rootless"` + DisableSeccomp bool `toml:"disable_seccomp"` + DisableSeLinux bool `toml:"disable_selinux"` + DisableGuestSeLinux bool `toml:"disable_guest_selinux"` + LegacySerial bool `toml:"use_legacy_serial"` + ExtraMonitorSocket govmmQemu.MonitorProtocol `toml:"extra_monitor_socket"` } type runtime struct { @@ -516,6 +517,22 @@ func (h hypervisor) blockDeviceAIO() (string, error) { return "", fmt.Errorf("Invalid hypervisor block storage I/O mechanism %v specified (supported AIO: %v)", h.BlockDeviceAIO, supportedBlockAIO) } +func (h hypervisor) extraMonitorSocket() (govmmQemu.MonitorProtocol, error) { + supportedExtraMonitor := []govmmQemu.MonitorProtocol{govmmQemu.Hmp, govmmQemu.Qmp, govmmQemu.QmpPretty} + + if h.ExtraMonitorSocket == "" { + return "", nil + } + + for _, extra := range supportedExtraMonitor { + if extra == h.ExtraMonitorSocket { + return extra, nil + } + } + + return "", fmt.Errorf("Invalid hypervisor extra monitor socket %v specified (supported values: %v)", h.ExtraMonitorSocket, supportedExtraMonitor) +} + func (h hypervisor) sharedFS() (string, error) { supportedSharedFS := []string{config.Virtio9P, config.VirtioFS, config.VirtioFSNydus, config.NoSharedFS} @@ -819,6 +836,11 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { rxRateLimiterMaxRate := h.getRxRateLimiterCfg() txRateLimiterMaxRate := h.getTxRateLimiterCfg() + extraMonitorSocket, err := h.extraMonitorSocket() + if err != nil { + return vc.HypervisorConfig{}, err + } + return vc.HypervisorConfig{ HypervisorPath: hypervisor, HypervisorPathList: h.HypervisorPathList, @@ -887,6 +909,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { LegacySerial: h.LegacySerial, DisableSeLinux: h.DisableSeLinux, DisableGuestSeLinux: h.DisableGuestSeLinux, + ExtraMonitorSocket: extraMonitorSocket, }, nil } @@ -1284,6 +1307,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig { IOMMUPlatform: defaultEnableIOMMUPlatform, FileBackedMemRootDir: defaultFileBackedMemRootDir, Debug: defaultEnableDebug, + ExtraMonitorSocket: defaultExtraMonitorSocket, DisableNestingChecks: defaultDisableNestingChecks, BlockDeviceDriver: defaultBlockDeviceDriver, BlockDeviceAIO: defaultBlockDeviceAIO, diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index 7599d2794..7d0575fb6 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -17,6 +17,7 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm" + govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu" hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" @@ -575,7 +576,7 @@ type HypervisorConfig struct { EnableIOThreads bool // Debug changes the default hypervisor and kernel parameters to - // enable debug output where available. And Debug also enable the hmp socket. + // enable debug output where available. Debug bool // MemPrealloc specifies if the memory should be pre-allocated @@ -641,6 +642,9 @@ type HypervisorConfig struct { // Use legacy serial for the guest console LegacySerial bool + + // ExtraMonitorSocket allows to add an extra HMP or QMP socket when the VMM is Qemu + ExtraMonitorSocket govmmQemu.MonitorProtocol } // vcpu mapping from vcpu number to thread number diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index e650a9cc3..03b5fa676 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -119,11 +119,11 @@ type qemu struct { } const ( - consoleSocket = "console.sock" - qmpSocket = "qmp.sock" - hmpSocket = "hmp.sock" - vhostFSSocket = "vhost-fs.sock" - nydusdAPISock = "nydusd-api.sock" + consoleSocket = "console.sock" + qmpSocket = "qmp.sock" + extraMonitorSocket = "extra-monitor.sock" + vhostFSSocket = "vhost-fs.sock" + nydusdAPISock = "nydusd-api.sock" // memory dump format will be set to elf memoryDumpFormat = "elf" @@ -329,8 +329,8 @@ func (q *qemu) qmpSocketPath(id string) (string, error) { return utils.BuildSocketPath(q.config.VMStorePath, id, qmpSocket) } -func (q *qemu) hmpSocketPath(id string) (string, error) { - return utils.BuildSocketPath(q.config.VMStorePath, id, hmpSocket) +func (q *qemu) extraMonitorSocketPath(id string) (string, error) { + return utils.BuildSocketPath(q.config.VMStorePath, id, extraMonitorSocket) } func (q *qemu) getQemuMachine() (govmmQemu.Machine, error) { @@ -361,24 +361,31 @@ func (q *qemu) createQmpSocket() ([]govmmQemu.QMPSocket, error) { var sockets []govmmQemu.QMPSocket sockets = append(sockets, govmmQemu.QMPSocket{ - Type: "unix", - Server: true, - NoWait: true, + Type: "unix", + Protocol: govmmQemu.Qmp, + Server: true, + NoWait: true, }) - if q.HypervisorConfig().Debug { - humanMonitorSockPath, err := q.hmpSocketPath(q.id) + // The extra monitor socket allows an external user to take full + // control on Qemu and silently break the VM in all possible ways. + // It should only ever be used for debugging purposes, hence the + // check on Debug. + if q.HypervisorConfig().Debug && q.config.ExtraMonitorSocket != "" { + extraMonitorSockPath, err := q.extraMonitorSocketPath(q.id) if err != nil { return nil, err } sockets = append(sockets, govmmQemu.QMPSocket{ - Type: "unix", - IsHmp: true, - Name: humanMonitorSockPath, - Server: true, - NoWait: true, + Type: "unix", + Protocol: q.config.ExtraMonitorSocket, + Name: extraMonitorSockPath, + Server: true, + NoWait: true, }) + + q.Logger().Warn("QEMU configured to start with an untrusted monitor") } return sockets, nil From 9eb8723a5b5f283fccf015dfddd8057648311ff0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Mon, 18 Sep 2023 14:14:10 +0200 Subject: [PATCH 18/33] clh: arm: Use static_sandbox_resource_mgmt=true MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Users have noticed that this is needed, as CLH does not yet implement a way to hotplug resources on aarh64. With this patch, when building for x86_64, I can see the this is the resulting config: ``` $ ARCH=amd64 make ... $ cat config/configuration-clh.toml | grep static_sandbox_resource_mgmt static_sandbox_resource_mgmt=false ``` And when building for aarch64: ``` $ ARCH=arm64 make ... $ cat config/configuration-clh.toml | grep static_sandbox_resource_mgmt static_sandbox_resource_mgmt=true ``` Fixes: #7941 Signed-off-by: Fabiano Fidêncio (cherry picked from commit 72599f191109e59b1be653094eca56d37aee1d2b) Signed-off-by: Greg Kurz --- src/runtime/Makefile | 1 + src/runtime/arch/amd64-options.mk | 2 ++ src/runtime/arch/arm64-options.mk | 2 ++ src/runtime/config/configuration-clh.toml.in | 2 +- 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/runtime/Makefile b/src/runtime/Makefile index 7655a060c..33fa8f253 100644 --- a/src/runtime/Makefile +++ b/src/runtime/Makefile @@ -603,6 +603,7 @@ USER_VARS += DEFENTROPYSOURCE USER_VARS += DEFVALIDENTROPYSOURCES USER_VARS += DEFSANDBOXCGROUPONLY USER_VARS += DEFSTATICRESOURCEMGMT +USER_VARS += DEFSTATICRESOURCEMGMT_CLH USER_VARS += DEFSTATICRESOURCEMGMT_FC USER_VARS += DEFSTATICRESOURCEMGMT_TEE USER_VARS += DEFBINDMOUNTS diff --git a/src/runtime/arch/amd64-options.mk b/src/runtime/arch/amd64-options.mk index ab2b1d2d2..e6068158c 100644 --- a/src/runtime/arch/amd64-options.mk +++ b/src/runtime/arch/amd64-options.mk @@ -26,3 +26,5 @@ ACRNCTLCMD := acrnctl # cloud-hypervisor binary name CLHCMD := cloud-hypervisor + +DEFSTATICRESOURCEMGMT_CLH := false diff --git a/src/runtime/arch/arm64-options.mk b/src/runtime/arch/arm64-options.mk index ad5ef5d43..7f74ae311 100644 --- a/src/runtime/arch/arm64-options.mk +++ b/src/runtime/arch/arm64-options.mk @@ -19,3 +19,5 @@ FCJAILERCMD := jailer # cloud-hypervisor binary name CLHCMD := cloud-hypervisor + +DEFSTATICRESOURCEMGMT_CLH := true diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in index 4531a4e65..9dcd82aff 100644 --- a/src/runtime/config/configuration-clh.toml.in +++ b/src/runtime/config/configuration-clh.toml.in @@ -400,7 +400,7 @@ sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ # - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O # does not yet support sandbox sizing annotations. # - When running single containers using a tool like ctr, container sizing information will be available. -static_sandbox_resource_mgmt=@DEFSTATICRESOURCEMGMT@ +static_sandbox_resource_mgmt=@DEFSTATICRESOURCEMGMT_CLH@ # If specified, sandbox_bind_mounts identifieds host paths to be mounted (ro) into the sandboxes shared path. # This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory. From 11e2f2a458d529a1203badd8e22c213a80056ad1 Mon Sep 17 00:00:00 2001 From: Simon Kaegi Date: Mon, 18 Sep 2023 14:17:32 -0400 Subject: [PATCH 19/33] versions: Bump virtiofsd to v1.8.0 https://gitlab.com/virtio-fs/virtiofsd/-/releases/v1.8.0 was released two weeks ago. We have fully tested and are using this version. Also bumps toolchain version to match what virtiofsd used. Fixes: #7960 Signed-off-by: Simon Kaegi (cherry picked from commit 44c7c082d9f728c07fde847267bdd003871a00ac) Signed-off-by: Greg Kurz --- versions.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/versions.yaml b/versions.yaml index 759e6ee92..57e3c94df 100644 --- a/versions.yaml +++ b/versions.yaml @@ -313,14 +313,14 @@ externals: virtiofsd: description: "vhost-user virtio-fs device backend written in Rust" url: "https://gitlab.com/virtio-fs/virtiofsd" - version: "v1.6.1" - toolchain: "1.66.0" + version: "v1.8.0" + toolchain: "1.72.0" meta: - # From https://gitlab.com/virtio-fs/virtiofsd/-/releases/v1.7.0, - # this is the link labelled virtiofsd-v1.7.0.zip + # From https://gitlab.com/virtio-fs/virtiofsd/-/releases/v1.8.0, + # this is the link labelled virtiofsd-v1.8.0.zip # # yamllint disable-line rule:line-length - binary: "https://gitlab.com/virtio-fs/virtiofsd/uploads/dc56ecbf86ce1226bdc31f946cfded75/virtiofsd-v1.7.0.zip" + binary: "https://gitlab.com/virtio-fs/virtiofsd/uploads/9ec473efd0203219d016e66aac4190aa/virtiofsd-v1.8.0.zip" languages: description: | From 668c8979f022a3c5661c3e594f361f5094c62e5a Mon Sep 17 00:00:00 2001 From: Peteris Rudzusiks Date: Thu, 21 Sep 2023 00:33:59 +0200 Subject: [PATCH 20/33] runtime: fix reading cgroup stats of sandboxes The cgroup stats come from resourcecontrol package in the form of pointers to structs. The sandbox Stat() method incorrectly was expecting structs. This caused the cpu and memory stats to always be 0, which in turn caused incorrect pod overhead metrics. Fixes #8035 Signed-off-by: Peteris Rudzusiks (cherry picked from commit 94e2ccc2d592eb7e4cf5038c5409d3bb6b7704e5) Signed-off-by: Greg Kurz --- src/runtime/virtcontainers/sandbox.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index bbbe9ef19..a7640bb04 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -1720,12 +1720,14 @@ func (s *Sandbox) Stats(ctx context.Context) (SandboxStats, error) { // TODO Do we want to aggregate the overhead cgroup stats to the sandbox ones? switch mt := metrics.(type) { - case v1.Metrics: + case *v1.Metrics: stats.CgroupStats.CPUStats.CPUUsage.TotalUsage = mt.CPU.Usage.Total stats.CgroupStats.MemoryStats.Usage.Usage = mt.Memory.Usage.Usage - case v2.Metrics: + case *v2.Metrics: stats.CgroupStats.CPUStats.CPUUsage.TotalUsage = mt.CPU.UsageUsec stats.CgroupStats.MemoryStats.Usage.Usage = mt.Memory.Usage + default: + return SandboxStats{}, fmt.Errorf("unknown metrics type %T", mt) } tids, err := s.hypervisor.GetThreadIDs(ctx) From 9b48525af1d88d267121bfd26378c339b84eb863 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 11 Oct 2023 11:29:49 +0200 Subject: [PATCH 21/33] release: tag_repos: Stop tagging / updating the `tests` repo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As we've moved all the tests to the `kata-containers` repo, the `tests` repo will become a read-only repo. Fixes: #8200 Signed-off-by: Fabiano Fidêncio (cherry picked from commit 65b1a2d277b3c8c49a1169593334d7170083f5bc) Signed-off-by: Greg Kurz --- tools/packaging/release/tag_repos.sh | 1 - tools/packaging/release/update-repository-version.sh | 1 - 2 files changed, 2 deletions(-) diff --git a/tools/packaging/release/tag_repos.sh b/tools/packaging/release/tag_repos.sh index 21fc6737d..e2e104a65 100755 --- a/tools/packaging/release/tag_repos.sh +++ b/tools/packaging/release/tag_repos.sh @@ -69,7 +69,6 @@ info() { repos=( "kata-containers" - "tests" ) diff --git a/tools/packaging/release/update-repository-version.sh b/tools/packaging/release/update-repository-version.sh index 26f48daa1..2306a2e96 100755 --- a/tools/packaging/release/update-repository-version.sh +++ b/tools/packaging/release/update-repository-version.sh @@ -295,7 +295,6 @@ EOF repos=( "kata-containers" - "tests" ) main(){ From 961daee9835ec5a013a9c8aa4be9adc973fd529d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 11 Oct 2023 11:48:22 +0200 Subject: [PATCH 22/33] scripts: Use install_yq from the `kata-containers` repo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As the file is already part of the kata-containers repo, and the tests repo is about to become read-only, we're good to drop the tests references from here and use everything coming from the `kata-containers` repo instead. Signed-off-by: Fabiano Fidêncio (cherry picked from commit fbc8f8f466e1b8b38d59223cbc6461cabb85b181) Signed-off-by: Greg Kurz --- tools/packaging/scripts/lib.sh | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/tools/packaging/scripts/lib.sh b/tools/packaging/scripts/lib.sh index d1e17e20d..0bd873721 100644 --- a/tools/packaging/scripts/lib.sh +++ b/tools/packaging/scripts/lib.sh @@ -6,8 +6,6 @@ # export GOPATH=${GOPATH:-${HOME}/go} -export tests_repo="${tests_repo:-github.com/kata-containers/tests}" -export tests_repo_dir="$GOPATH/src/$tests_repo" export BUILDER_REGISTRY="${BUILDER_REGISTRY:-quay.io/kata-containers/builders}" export PUSH_TO_REGISTRY="${PUSH_TO_REGISTRY:-"no"}" @@ -29,19 +27,8 @@ ARCH=${ARCH:-$(uname -m)} TARGET_OS=${TARGET_OS:-linux} [ "${CROSS_BUILD}" == "true" ] && BUILDX=buildx && PLATFORM="--platform=${TARGET_OS}/${TARGET_ARCH}" -clone_tests_repo() { - # KATA_CI_NO_NETWORK is (has to be) ignored if there is - # no existing clone. - if [ -d "${tests_repo_dir}" ] && [ -n "${KATA_CI_NO_NETWORK:-}" ]; then - return - fi - - go get -d -u "$tests_repo" || true -} - install_yq() { - clone_tests_repo - pushd "$tests_repo_dir" + pushd "${repo_root_dir}" .ci/install_yq.sh popd } From 4495a797210f04d25030c4ed3b4186247522fc0b Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Tue, 10 Oct 2023 19:58:29 +0000 Subject: [PATCH 23/33] tests: Enable scability test for stability CI This PR enables the scability test for stability CI gha. Fixes #8196 Signed-off-by: Gabriela Cervantes (cherry picked from commit 30ff58904eb05b18454b9479a5e1b881a60973c1) Signed-off-by: Greg Kurz --- tests/stability/gha-run.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/stability/gha-run.sh b/tests/stability/gha-run.sh index 01672534a..c792053f5 100755 --- a/tests/stability/gha-run.sh +++ b/tests/stability/gha-run.sh @@ -39,6 +39,9 @@ function run() { export ITERATIONS=2 MAX_CONTAINERS=20 bash "${stability_dir}/soak_parallel_rm.sh" + + info "Running scability test using ${KATA_HYPERVISOR} hypervisor" + bash "${stability_dir}/scability_test.sh 10 10" } function main() { From a380437380c5177a2036fa356dbfa36e136f9a06 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Tue, 10 Oct 2023 21:18:29 +0000 Subject: [PATCH 24/33] tests: Fix path for versions yaml for soak parallel test This PR fixes the path for versions yaml for soak parallel test. Signed-off-by: Gabriela Cervantes (cherry picked from commit c6463cb5aea43b49a2bb2d0bf595820e69336ce2) Signed-off-by: Greg Kurz --- tests/stability/gha-run.sh | 2 +- tests/stability/soak_parallel_rm.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/stability/gha-run.sh b/tests/stability/gha-run.sh index c792053f5..a6e9bce5a 100755 --- a/tests/stability/gha-run.sh +++ b/tests/stability/gha-run.sh @@ -41,7 +41,7 @@ function run() { bash "${stability_dir}/soak_parallel_rm.sh" info "Running scability test using ${KATA_HYPERVISOR} hypervisor" - bash "${stability_dir}/scability_test.sh 10 10" + bash "${stability_dir}/scability_test.sh" 10 10 } function main() { diff --git a/tests/stability/soak_parallel_rm.sh b/tests/stability/soak_parallel_rm.sh index c05c2fade..9404496d5 100755 --- a/tests/stability/soak_parallel_rm.sh +++ b/tests/stability/soak_parallel_rm.sh @@ -172,7 +172,7 @@ function init() { check_kata_components=0 fi - versions_file="${cidir}/../versions.yaml" + versions_file="${cidir}/../../versions.yaml" nginx_version=$("${GOPATH}/bin/yq" read "$versions_file" "docker_images.nginx.version") nginx_image="docker.io/library/nginx:$nginx_version" From 5d911db5e27b6757a91a439272119e2b9291cff0 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Tue, 10 Oct 2023 23:15:26 +0000 Subject: [PATCH 25/33] tests: Remove unused function from scability test This PR removes an unused function from scability test. Signed-off-by: Gabriela Cervantes (cherry picked from commit ef6388e8158139e1db0a616c319d1278568c1af1) Signed-off-by: Greg Kurz --- tests/stability/gha-run.sh | 2 +- tests/stability/scability_test.sh | 15 ++++----------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/tests/stability/gha-run.sh b/tests/stability/gha-run.sh index a6e9bce5a..47dfb2b32 100755 --- a/tests/stability/gha-run.sh +++ b/tests/stability/gha-run.sh @@ -41,7 +41,7 @@ function run() { bash "${stability_dir}/soak_parallel_rm.sh" info "Running scability test using ${KATA_HYPERVISOR} hypervisor" - bash "${stability_dir}/scability_test.sh" 10 10 + bash "${stability_dir}/scability_test.sh" 15 60 } function main() { diff --git a/tests/stability/scability_test.sh b/tests/stability/scability_test.sh index 86d941e65..c3c381a59 100755 --- a/tests/stability/scability_test.sh +++ b/tests/stability/scability_test.sh @@ -39,8 +39,7 @@ function main() { local containers=() local not_started_count="${NUM_CONTAINERS}" - init_env - check_cmds "${cmds[@]}" + clean_env_ctr sudo -E ctr i pull "${IMAGE}" info "Creating ${NUM_CONTAINERS} containers" @@ -53,16 +52,10 @@ function main() { done # Check that the requested number of containers are running - check_containers_are_up & pid=$! - (sleep "${TIMEOUT_LAUNCH}" && kill -HUP "${pid}") 2>/dev/null & pid_tout=$! + check_containers_are_up "${NUM_CONTAINERS}" - if wait "${pid}" 2>/dev/null; then - pkill -HUP -P "${pid_tout}" - wait "${pid_tout}" - else - warn "Time out exceeded" - return 1 - fi + # Check that the requested number of containers are running + check_containers_are_running "${NUM_CONTAINERS}" clean_env_ctr } From 0e0aabfd872bb4aee441dd1f0334939d6b993c1d Mon Sep 17 00:00:00 2001 From: David Esparza Date: Mon, 9 Oct 2023 15:07:51 -0600 Subject: [PATCH 26/33] metrics: removal of reference in the documentation to the dax test. This PR removes the reference in the documentation to the DAX subtest of the FIO benchmark, because this metric is currently WIP. Fixes: #8159 Signed-off-by: David Esparza (cherry picked from commit 89c9454fca441bc2f03b72ac8c42dfa92063e0f1) Signed-off-by: Greg Kurz --- tests/metrics/storage/README.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/metrics/storage/README.md b/tests/metrics/storage/README.md index 305c7e227..cb9e3c8b6 100644 --- a/tests/metrics/storage/README.md +++ b/tests/metrics/storage/README.md @@ -16,12 +16,16 @@ $ bash storage/blogbench.sh ``` ## `fio` test -The `fio` test utilises the [fio tool](https://github.com/axboe/fio), configured -to perform measurements upon a single test file. +The `fio` test utilizes the [fio tool](https://github.com/axboe/fio), configured to perform measurements upon a single test file. -The test configuration used by the script can be modified by setting a number of -environment variables to change or over-ride the test defaults. +The test spawns 8 jobs that exercise the I/O types `sequential read`, `random read`, `sequential write` and `random write`, while collecting +data using a block size of 4 Kb, an I/O depth of 2, and uses the `libaio` engine on a workload with a size of 10 gigabytes for a period of +10 seconds on each I/O type. -## DAX `virtio-fs` `fio` Kubernetes tests +The results show the average bandwidth and average number of IOPS per I/O type in JSON format. -[Test](fio-k8s/README.md) to compare the use of DAX option in `virtio-fs`. +The `fio` test can be run by hand, for example: +``` +$ cd metrics +$ bash storage/fio_test.sh +``` From 26c6ca93d3c2c010130e8be937e221e003351777 Mon Sep 17 00:00:00 2001 From: David Esparza Date: Wed, 11 Oct 2023 18:54:11 -0600 Subject: [PATCH 27/33] metrics: removing trailing comma characters from json file. This PR removes trailing commas so that the json results file is valid. This PR also changes the way data results are collected by terating through the array of memory values to calculate their average. Fixes: #8204 Signed-off-by: David Esparza (cherry picked from commit c2763120aa1df13bad2f61c2b87fc94db182b43a) Signed-off-by: Greg Kurz --- tests/metrics/density/memory_usage.sh | 62 ++++++++++++++++----------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/tests/metrics/density/memory_usage.sh b/tests/metrics/density/memory_usage.sh index 4487f0dec..96cfbdf40 100755 --- a/tests/metrics/density/memory_usage.sh +++ b/tests/metrics/density/memory_usage.sh @@ -38,6 +38,7 @@ global_shim_mem=0 function remove_tmp_file() { rm -rf "${MEM_TMP_FILE}" "${PS_TMP_FILE}" + clean_env_ctr } trap remove_tmp_file EXIT @@ -97,19 +98,22 @@ function get_pss_memory(){ local avg=0 [ -z "${ps}" ] && die "No argument to get_pss_memory()" + ps="$(readlink -f ${ps})" # Save all the processes names # This will be help us to retrieve raw information echo "${ps}" >> "${PS_TMP_FILE}" - data=$(sudo "${SMEM_BIN}" --no-header -P "^${ps}" -c "pss" | sed 's/[[:space:]]//g' | tr '\n' ' ' | sed 's/[[:blank:]]*$//') + data="$(sudo "${SMEM_BIN}" --no-header -P "^${ps}" -c "pss" | sed 's/[[:space:]]//g' | tr '\n' ' ' | sed 's/[[:blank:]]*$//')" # Save all the smem results # This will help us to retrieve raw information echo "${data}" >> "${MEM_TMP_FILE}" - for i in ${data[*]}; do + arrData=($data) + + for i in "${arrData[@]}"; do if [ ${i} -gt 0 ]; then let "mem_amount+=i" let "count+=1" @@ -124,7 +128,7 @@ function get_pss_memory(){ if [ "${shim_result_on}" -eq "1" ]; then global_shim_mem="${avg}" else - global_hypervisor_mem="${avg}" + global_hypervisor_mem="${avg}" fi } @@ -144,13 +148,15 @@ function get_pss_memory_virtiofsd() { mem_amount=0 count=0 avg=0 - virtiofsd_path=${1:-} + [ -z "${virtiofsd_path}" ] && die "virtiofsd_path not provided" echo "${virtiofsd_path}" >> "${PS_TMP_FILE}" - virtiofsd_pids=$(ps aux | grep [v]irtiofsd | awk '{print $2}' | head -1) - data=$(sudo smem --no-header -P "^${virtiofsd_path}" -c pid -c "pid pss") + + virtiofsd_pids="$(ps aux | grep '[v]irtiofsd' | awk '{print $2}' | head -1)" + + data="$(sudo smem --no-header -P "^${virtiofsd_path}" -c "pid pss")" for p in "${virtiofsd_pids}"; do parent_pid=$(ppid "${p}") @@ -176,7 +182,6 @@ function get_pss_memory_virtiofsd() { done [ "${count}" -gt 0 ] && global_virtiofsd_mem=$(bc -l <<< "scale=2; ${mem_amount} / ${count}") - } function get_individual_memory(){ @@ -194,32 +199,45 @@ function get_individual_memory(){ read -r -a second_values <<< "${second_process_result}" read -r -a third_values <<< "${third_process_result}" + declare -a fv_array + declare -a sv_array + declare -a tv_array + + # remove null values from arrays of results + for ((i=0;i<"${NUM_CONTAINERS}";++i)); do + [ -n "${first_values[i]}" ] && fv_array+=( "${first_values[i]}" ) + [ -n "${second_values[i]}" ] && sv_array+=( "${second_values[i]}" ) + [ -n "${third_values[i]}" ] && tv_array+=( "${third_values[i]}" ) + done + + # remove trailing commas + fv_array[-1]="$(sed -r 's/,*$//g' <<< "${fv_array[-1]}")" + sv_array[-1]="$(sed -r 's/,*$//g' <<< "${sv_array[-1]}")" + tv_array[-1]="$(sed -r 's/,*$//g' <<< "${tv_array[-1]}")" + metrics_json_start_array local json="$(cat << EOF { "${first_process_name} memory": [ - $(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do - [ -n "${first_values[i]}" ] && - printf '%s\n\t\t\t' "${first_values[i]}" + $(for i in "${fv_array[@]}"; do + printf '\n\t\t\t%s' "${i}" done) ], "${second_process_name} memory": [ - $(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do - [ -n "${second_values[i]}" ] && - printf '%s\n\t\t\t' "${second_values[i]}" + $(for i in "${sv_array[@]}"; do + printf '\n\t\t\t%s' "${i}" done) ], "${third_process_name} memory": [ - $(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do - [ -n "${third_values[i]}" ] && - printf '%s\n\t\t\t' "${third_values[i]}" + $(for i in "${tv_array[@]}"; do + printf '\n\t\t\t%s' "${i}" done) ] } EOF )" - metrics_json_add_array_element "$json" + metrics_json_add_array_element "${json}" metrics_json_end_array "Raw results" } @@ -272,7 +290,6 @@ function get_memory_usage(){ } EOF )" - else [ "$RUNTIME" == "kata-runtime" ] || [ "$RUNTIME" == "kata-qemu" ] # Get PSS memory of VM runtime components. # And check that the smem search has found the process - we get a "0" @@ -281,7 +298,6 @@ EOF # As an added bonus - this script must be run as root. # Now if you do not have enough rights # the smem failure to read the stats will also be trapped. - get_pss_memory ${HYPERVISOR_PATH} if [ "${global_hypervisor_mem}" == "0" ]; then @@ -293,13 +309,11 @@ EOF if [ "${global_virtiofsd_mem}" == "0" ]; then echo >&2 "WARNING: Failed to find PSS for ${VIRTIOFSD_PATH}" fi - get_pss_memory ${SHIM_PATH} 1 if [ "${global_shim_mem}" == "0" ]; then die "Failed to find PSS for ${SHIM_PATH}" fi - mem_usage="$(bc -l <<< "scale=2; ${global_hypervisor_mem} + ${global_virtiofsd_mem} + ${global_shim_mem}")" local json="$(cat << EOF @@ -361,10 +375,10 @@ function main(){ #Check for KSM before reporting test name, as it can modify it check_for_ksm - init_env check_cmds "${SMEM_BIN}" bc + clean_env_ctr + init_env check_images "${IMAGE}" - if [ "${CTR_RUNTIME}" == "io.containerd.kata.v2" ]; then export RUNTIME="kata-runtime" elif [ "${CTR_RUNTIME}" == "io.containerd.runc.v2" ]; then @@ -372,7 +386,6 @@ function main(){ else die "Unknown runtime ${CTR_RUNTIME}" fi - metrics_json_init save_config get_memory_usage @@ -385,7 +398,6 @@ function main(){ info "memory usage test completed" metrics_json_save - clean_env_ctr } main "$@" From 544f261433a6632911e98ce806390fca04598201 Mon Sep 17 00:00:00 2001 From: David Esparza Date: Fri, 13 Oct 2023 18:02:00 +0000 Subject: [PATCH 28/33] metrics: skips docker restart when it is not installed or is masked. To avoid errors when initializing the test environment, the kill_processes_before_start() helper function needs to verify that docker is installed before attempting to stop it. Fixes: #8218 Signed-off-by: David Esparza (cherry picked from commit 908519db9d49eb55d8d5d16a9dc65d576f78d364) Signed-off-by: Greg Kurz --- tests/metrics/lib/common.bash | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/metrics/lib/common.bash b/tests/metrics/lib/common.bash index 8528d2c7b..8ec227a80 100755 --- a/tests/metrics/lib/common.bash +++ b/tests/metrics/lib/common.bash @@ -168,15 +168,18 @@ function init_env() cmd=("docker" "ctr") - sudo systemctl restart docker - # check dependencies check_cmds "${cmd[@]}" # Remove all stopped containers - clean_env clean_env_ctr + # restart docker only if it is not masked by systemd + docker_masked="$(systemctl list-unit-files --state=masked | grep -c docker)" + if [ "${docker_masked}" -eq 0 ]; then + sudo systemctl restart docker + fi + # This clean up is more aggressive, this is in order to # decrease the factors that could affect the metrics results. kill_processes_before_start @@ -188,8 +191,12 @@ function init_env() # killed to start test with clean environment. function kill_processes_before_start() { - DOCKER_PROCS=$(sudo "${DOCKER_EXE}" ps -q) - [[ -n "${DOCKER_PROCS}" ]] && clean_env + docker_masked="$(systemctl list-unit-files --state=masked | grep -c "${DOCKER_EXE}")" + + if [ "${docker_masked}" -eq 0 ]; then + DOCKER_PROCS=$(sudo "${DOCKER_EXE}" ps -q) + [[ -n "${DOCKER_PROCS}" ]] && clean_env + fi CTR_PROCS=$(sudo "${CTR_EXE}" t list -q) [[ -n "${CTR_PROCS}" ]] && clean_env_ctr From 8cf5506700a7bcd396282c4fcf3b137ab9440b33 Mon Sep 17 00:00:00 2001 From: David Esparza Date: Mon, 16 Oct 2023 15:57:57 -0600 Subject: [PATCH 29/33] metrics: fixes common.sh function to always return true This PR corrects the init env() helper function, to make that systemctl always returns true when enumerating masked services, and preventing the test from failing Fixes: #8242 Signed-off-by: David Esparza (cherry picked from commit 4f9681b411c56f542ad471f43038c7764173b30f) Signed-off-by: Greg Kurz --- tests/metrics/lib/common.bash | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/metrics/lib/common.bash b/tests/metrics/lib/common.bash index 8ec227a80..e42a596ac 100755 --- a/tests/metrics/lib/common.bash +++ b/tests/metrics/lib/common.bash @@ -175,7 +175,7 @@ function init_env() clean_env_ctr # restart docker only if it is not masked by systemd - docker_masked="$(systemctl list-unit-files --state=masked | grep -c docker)" + docker_masked="$(systemctl list-unit-files --state=masked | grep -c docker)" || true if [ "${docker_masked}" -eq 0 ]; then sudo systemctl restart docker fi @@ -191,7 +191,7 @@ function init_env() # killed to start test with clean environment. function kill_processes_before_start() { - docker_masked="$(systemctl list-unit-files --state=masked | grep -c "${DOCKER_EXE}")" + docker_masked="$(systemctl list-unit-files --state=masked | grep -c "${DOCKER_EXE}")" || true if [ "${docker_masked}" -eq 0 ]; then DOCKER_PROCS=$(sudo "${DOCKER_EXE}" ps -q) From 92f283f0625875f09f32a9e6d11c06e410a4ce68 Mon Sep 17 00:00:00 2001 From: "James O. D. Hunt" Date: Thu, 12 Oct 2023 12:13:53 +0100 Subject: [PATCH 30/33] runtime: Validate hypervisor section name in config file Previously, if you accidentally modified the name of the hypervisor section in the config file, the default golang runtime gives a cryptic error message ("`VM memory cannot be zero`"). This can be demonstrated using the `kata-runtime` utility program which uses the same golang config package as the actual runtime (`containerd-shim-kata-v2`): ```bash $ kata-runtime env >/dev/null; echo $? 0 $ sudo sed -i 's!^\[hypervisor\.qemu\]!\[hypervisor\.foo\]!g' /etc/kata-containers/configuration.toml $ kata-runtime env >/dev/null; echo $? VM memory cannot be zero 1 ``` The hypervisor name is now validated so that the behaviour becomes: ```bash $ kata-runtime env >/dev/null; echo $? 0 $ sudo sed -i 's!^\[hypervisor\.qemu\]!\[hypervisor\.foo\]!g' /etc/kata-containers/configuration.toml $ ./kata-runtime env >/dev/null; echo $? /etc/kata-containers/configuration.toml: configuration file contains invalid hypervisor section: "foo" 1 ``` Fixes: #8212. Signed-off-by: James O. D. Hunt (cherry picked from commit 3e8cf6959cbb829a3e54146c77b0e8c74c8c72af) Signed-off-by: Greg Kurz --- src/runtime/pkg/katautils/config.go | 4 ++ src/runtime/pkg/katautils/config_test.go | 56 ++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index c950ed66a..feeef68b9 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -55,6 +55,8 @@ const ( // the maximum amount of PCI bridges that can be cold plugged in a VM maxPCIBridges uint32 = 5 + + errInvalidHypervisorPrefix = "configuration file contains invalid hypervisor section" ) type tomlConfig struct { @@ -1176,6 +1178,8 @@ func updateRuntimeConfigHypervisor(configPath string, tomlConf tomlConfig, confi case dragonballHypervisorTableType: config.HypervisorType = vc.DragonballHypervisor hConfig, err = newDragonballHypervisorConfig(hypervisor) + default: + err = fmt.Errorf("%s: %+q", errInvalidHypervisorPrefix, k) } if err != nil { diff --git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index 90f9f4d56..75511c6fe 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -1735,3 +1735,59 @@ vfio_mode="vfio" assert.Equal(t, config.Runtime.InterNetworkModel, "macvtap") assert.Equal(t, config.Runtime.VfioMode, "vfio") } + +func TestUpdateRuntimeConfigHypervisor(t *testing.T) { + assert := assert.New(t) + + type tableTypeEntry struct { + name string + valid bool + } + + configFile := "/some/where/configuration.toml" + + // Note: We cannot test acrnHypervisorTableType since + // newAcrnHypervisorConfig() expects ACRN binaries to be + // installed. + var entries = []tableTypeEntry{ + {clhHypervisorTableType, true}, + {dragonballHypervisorTableType, true}, + {firecrackerHypervisorTableType, true}, + {qemuHypervisorTableType, true}, + {"foo", false}, + {"bar", false}, + {clhHypervisorTableType + "baz", false}, + } + + for i, h := range entries { + config := oci.RuntimeConfig{} + + tomlConf := tomlConfig{ + Hypervisor: map[string]hypervisor{ + h.name: { + NumVCPUs: int32(2), + MemorySize: uint32(2048), + Path: "/", + Kernel: "/", + Image: "/", + Firmware: "/", + FirmwareVolume: "/", + SharedFS: "virtio-fs", + VirtioFSDaemon: "/usr/libexec/kata-qemu/virtiofsd", + }, + }, + } + + err := updateRuntimeConfigHypervisor(configFile, tomlConf, &config) + + if h.valid { + assert.NoError(err, "test %d (%+v)", i, h) + } else { + assert.Error(err, "test %d (%+v)", i, h) + + expectedErr := fmt.Errorf("%v: %v: %+q", configFile, errInvalidHypervisorPrefix, h.name) + + assert.Equal(err, expectedErr, "test %d (%+v)", i, h) + } + } +} From 37c99a46b1afa08591168bab78e8e5a3af75424b Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Mon, 16 Oct 2023 20:14:37 +0000 Subject: [PATCH 31/33] tests: Enable agent stability test This PR enables the agent stability test for stability gha CI. Fixes #8240 Signed-off-by: Gabriela Cervantes (cherry picked from commit 82a0814fc2528629d17019683673ae992584cdc4) Signed-off-by: Greg Kurz --- tests/stability/gha-run.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/stability/gha-run.sh b/tests/stability/gha-run.sh index 47dfb2b32..23b991332 100755 --- a/tests/stability/gha-run.sh +++ b/tests/stability/gha-run.sh @@ -42,6 +42,9 @@ function run() { info "Running scability test using ${KATA_HYPERVISOR} hypervisor" bash "${stability_dir}/scability_test.sh" 15 60 + + info "Running agent stability test using ${KATA_HYPERVISOR} hypervisor" + bash "${stability_dir}/agent_stability_test.sh" } function main() { From 12b8cbb4f6ddafcaf330cf245ada68d58e9aaddd Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Tue, 17 Oct 2023 15:14:45 +0000 Subject: [PATCH 32/33] tests: Adjust timeout for agent stability test This PR adjusts the timeout for the agent stability test to run on the gha. Signed-off-by: Gabriela Cervantes (cherry picked from commit d01daf749b598514d6833ef843e2d11a9faceb96) Signed-off-by: Greg Kurz --- tests/stability/agent_stability_test.sh | 20 +++++--------------- tests/stability/gha-run.sh | 4 ++-- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/tests/stability/agent_stability_test.sh b/tests/stability/agent_stability_test.sh index 902d9d48c..9db14362d 100755 --- a/tests/stability/agent_stability_test.sh +++ b/tests/stability/agent_stability_test.sh @@ -8,7 +8,7 @@ # running container, the main purpose of this # test is to stress the agent -set -e -x +set -x cidir=$(dirname "$0") @@ -19,16 +19,8 @@ IMAGE="${IMAGE:-quay.io/prometheus/busybox:latest}" CONTAINER_NAME="${CONTAINER_NAME:-test}" PAYLOAD_ARGS="${PAYLOAD_ARGS:-tail -f /dev/null}" - -# Timeout is the duration of this test (seconds) -# We want to stress the agent for a significant -# time (approximately running for two days) -timeout=186400 -start_time=$(date +%s) -end_time=$((start_time+timeout)) - function setup { - restart_containerd_service + clean_env_ctr sudo ctr image pull $IMAGE sudo ctr run --runtime=$CTR_RUNTIME -d $IMAGE $CONTAINER_NAME sh -c $PAYLOAD_ARGS } @@ -47,7 +39,7 @@ function exec_loop { } function teardown { - echo "Ending stability test" + echo "Ending agent stability test" clean_env_ctr } trap teardown EXIT @@ -55,7 +47,5 @@ trap teardown EXIT info "Starting stability test" setup -info "Running stability test" -while [[ $end_time > $(date +%s) ]]; do - exec_loop -done +info "Running agent stability test" +exec_loop diff --git a/tests/stability/gha-run.sh b/tests/stability/gha-run.sh index 23b991332..50561de33 100755 --- a/tests/stability/gha-run.sh +++ b/tests/stability/gha-run.sh @@ -43,8 +43,8 @@ function run() { info "Running scability test using ${KATA_HYPERVISOR} hypervisor" bash "${stability_dir}/scability_test.sh" 15 60 - info "Running agent stability test using ${KATA_HYPERVISOR} hypervisor" - bash "${stability_dir}/agent_stability_test.sh" +# info "Running agent stability test using ${KATA_HYPERVISOR} hypervisor" +# bash "${stability_dir}/agent_stability_test.sh" } function main() { From 93c7d165dccbd327fdea6a7b491e4c04a4271ad6 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Thu, 19 Oct 2023 21:50:35 +0200 Subject: [PATCH 33/33] ci: k8s: Fix bogus firecracker check in k8s-credentials-secrets.bat Fixes #8259 Signed-off-by: Greg Kurz (cherry picked from commit 36109da93f7bf3f00be0996d388d4e440d1cf8e9) Signed-off-by: Greg Kurz --- tests/integration/kubernetes/k8s-credentials-secrets.bats | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/kubernetes/k8s-credentials-secrets.bats b/tests/integration/kubernetes/k8s-credentials-secrets.bats index 8becf2e2e..9a8a963f5 100644 --- a/tests/integration/kubernetes/k8s-credentials-secrets.bats +++ b/tests/integration/kubernetes/k8s-credentials-secrets.bats @@ -10,7 +10,7 @@ load "${BATS_TEST_DIRNAME}/tests_common.sh" setup() { [ "${KATA_HYPERVISOR}" == "firecracker" ] && skip "test not working see: ${fc_limitations}" - [ "${KATA_HYPERVISOR}" == "fcr" ] && skip "test not working see: ${fc_limitations}" + [ "${KATA_HYPERVISOR}" == "fc" ] && skip "test not working see: ${fc_limitations}" get_pod_config_dir }