From b975f2e8d243d5b778e4cebacda51bb02bdc8bd3 Mon Sep 17 00:00:00 2001 From: Jaylyn Ren Date: Fri, 25 Mar 2022 17:01:36 +0800 Subject: [PATCH 01/29] Virtcontainers: Enable hot plugging vhost-user-blk device on ARM The vhost-user-blk can be hotplugged on the PCI bridge successfully on X86, but failed on Arm. However, hotplugging it on Root Port as a PCIe device can work well on ARM. Open the "pcie_root_port" in configuration.toml is needed. Fixes: #4019 Signed-off-by: Jaylyn Ren --- src/runtime/virtcontainers/qemu.go | 91 ++++++++++++++++++++++-------- 1 file changed, 68 insertions(+), 23 deletions(-) diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 94c35c833..c78e5df93 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -38,6 +38,7 @@ import ( pkgUtils "github.com/kata-containers/kata-containers/src/runtime/pkg/utils" "github.com/kata-containers/kata-containers/src/runtime/pkg/uuid" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/device/config" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/device/drivers" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" vcTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" @@ -129,6 +130,8 @@ const ( fallbackFileBackedMemDir = "/dev/shm" qemuStopSandboxTimeoutSecs = 15 + + qomPathPrefix = "/machine/peripheral/" ) // agnostic list of kernel parameters @@ -1396,31 +1399,68 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V }() driver := "vhost-user-blk-pci" - addr, bridge, err := q.arch.addDeviceToBridge(ctx, vAttr.DevID, types.PCI) - if err != nil { - return err - } - defer func() { - if err != nil { - q.arch.removeDeviceFromBridge(vAttr.DevID) + machineType := q.HypervisorConfig().HypervisorMachineType + + switch machineType { + case QemuVirt: + if q.state.PCIeRootPort <= 0 { + return fmt.Errorf("Vhost-user-blk device is a PCIe device if machine type is virt. Need to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for virt") } - }() - bridgeSlot, err := vcTypes.PciSlotFromInt(bridge.Addr) - if err != nil { - return err - } - devSlot, err := vcTypes.PciSlotFromString(addr) - if err != nil { - return err - } - vAttr.PCIPath, err = vcTypes.PciPathFromSlots(bridgeSlot, devSlot) + //The addr of a dev is corresponding with device:function for PCIe in qemu which starting from 0 + //Since the dev is the first and only one on this bus(root port), it should be 0. 
+ addr := "00" - if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridge.ID); err != nil { - return err - } + bridgeId := fmt.Sprintf("%s%d", pcieRootPortPrefix, len(drivers.AllPCIeDevs)) + drivers.AllPCIeDevs[devID] = true + bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeId) + bridgeSlot, err := q.qomGetSlot(bridgeQomPath) + if err != nil { + return err + } + + devSlot, err := vcTypes.PciSlotFromString(addr) + if err != nil { + return err + } + + vAttr.PCIPath, err = vcTypes.PciPathFromSlots(bridgeSlot, devSlot) + if err != nil { + return err + } + + if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeId); err != nil { + return err + } + + default: + addr, bridge, err := q.arch.addDeviceToBridge(ctx, vAttr.DevID, types.PCI) + if err != nil { + return err + } + defer func() { + if err != nil { + q.arch.removeDeviceFromBridge(vAttr.DevID) + } + }() + + bridgeSlot, err := vcTypes.PciSlotFromInt(bridge.Addr) + if err != nil { + return err + } + + devSlot, err := vcTypes.PciSlotFromString(addr) + if err != nil { + return err + } + vAttr.PCIPath, err = vcTypes.PciPathFromSlots(bridgeSlot, devSlot) + + if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridge.ID); err != nil { + return err + } + } return nil } @@ -1462,8 +1502,13 @@ func (q *qemu) hotplugVhostUserDevice(ctx context.Context, vAttr *config.VhostUs return fmt.Errorf("Incorrect vhost-user device type found") } } else { - if err := q.arch.removeDeviceFromBridge(vAttr.DevID); err != nil { - return err + + machineType := q.HypervisorConfig().HypervisorMachineType + + if machineType != QemuVirt { + if err := q.arch.removeDeviceFromBridge(vAttr.DevID); err != nil { + return err + } } if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil { @@ -2316,7 +2361,7 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin addr string ) switch machineType { - case QemuQ35: + case QemuQ35, QemuVirt: bus = defaultBridgeBus chassis = "0" multiFunction = false From 81f6b486265a00704e330d3bf9722d9539361f14 Mon Sep 17 00:00:00 2001 From: Braden Rayhorn Date: Tue, 19 Apr 2022 22:08:40 -0500 Subject: [PATCH 02/29] agent: add tests for create_logger_task function Add tests for create_logger_task function in src/main.rs. 
Fixes: #4113 Signed-off-by: Braden Rayhorn --- src/agent/src/main.rs | 56 +++++++++++++++++++++++++++++++++++++ src/agent/src/mount.rs | 8 +----- src/agent/src/test_utils.rs | 9 +++++- 3 files changed, 65 insertions(+), 8 deletions(-) diff --git a/src/agent/src/main.rs b/src/agent/src/main.rs index f75f885b5..4386dde5c 100644 --- a/src/agent/src/main.rs +++ b/src/agent/src/main.rs @@ -416,3 +416,59 @@ fn reset_sigpipe() { use crate::config::AgentConfig; use std::os::unix::io::{FromRawFd, RawFd}; + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_utils::test_utils::TestUserType; + + #[tokio::test] + async fn test_create_logger_task() { + #[derive(Debug)] + struct TestData { + vsock_port: u32, + test_user: TestUserType, + result: Result<()>, + } + + let tests = &[ + TestData { + // non-root user cannot use privileged vsock port + vsock_port: 1, + test_user: TestUserType::NonRootOnly, + result: Err(anyhow!(nix::errno::Errno::from_i32(libc::EACCES))), + }, + TestData { + // passing vsock_port 0 causes logger task to write to stdout + vsock_port: 0, + test_user: TestUserType::Any, + result: Ok(()), + }, + ]; + + for (i, d) in tests.iter().enumerate() { + if d.test_user == TestUserType::RootOnly { + skip_if_not_root!(); + } else if d.test_user == TestUserType::NonRootOnly { + skip_if_root!(); + } + + let msg = format!("test[{}]: {:?}", i, d); + let (rfd, wfd) = unistd::pipe2(OFlag::O_CLOEXEC).unwrap(); + defer!({ + // rfd is closed by the use of PipeStream in the crate_logger_task function, + // but we will attempt to close in case of a failure + let _ = unistd::close(rfd); + unistd::close(wfd).unwrap(); + }); + + let (shutdown_tx, shutdown_rx) = channel(true); + + shutdown_tx.send(true).unwrap(); + let result = create_logger_task(rfd, d.vsock_port, shutdown_rx).await; + + let msg = format!("{}, result: {:?}", msg, result); + assert_result!(d.result, result, msg); + } + } +} diff --git a/src/agent/src/mount.rs b/src/agent/src/mount.rs index 95bcd14ed..04b3cf9eb 100644 --- a/src/agent/src/mount.rs +++ b/src/agent/src/mount.rs @@ -1017,6 +1017,7 @@ fn parse_options(option_list: Vec) -> HashMap { #[cfg(test)] mod tests { use super::*; + use crate::test_utils::test_utils::TestUserType; use crate::{skip_if_not_root, skip_loop_if_not_root, skip_loop_if_root}; use protobuf::RepeatedField; use protocols::agent::FSGroup; @@ -1026,13 +1027,6 @@ mod tests { use std::path::PathBuf; use tempfile::tempdir; - #[derive(Debug, PartialEq)] - enum TestUserType { - RootOnly, - NonRootOnly, - Any, - } - #[test] fn test_mount() { #[derive(Debug)] diff --git a/src/agent/src/test_utils.rs b/src/agent/src/test_utils.rs index becb845fa..92171bad2 100644 --- a/src/agent/src/test_utils.rs +++ b/src/agent/src/test_utils.rs @@ -5,7 +5,14 @@ #![allow(clippy::module_inception)] #[cfg(test)] -mod test_utils { +pub mod test_utils { + #[derive(Debug, PartialEq)] + pub enum TestUserType { + RootOnly, + NonRootOnly, + Any, + } + #[macro_export] macro_rules! skip_if_root { () => { From 832c33d5b5290062342f65859f6e2800439350c4 Mon Sep 17 00:00:00 2001 From: Jason Zhang Date: Wed, 27 Apr 2022 11:28:03 +0800 Subject: [PATCH 03/29] docs: remove pc machine type supports Currently the 'pc' machine type is no longer supported in kata configuration, so remove it in the design docs. 
Fixes: #4155 Signed-off-by: Jason Zhang --- docs/design/virtualization.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/design/virtualization.md b/docs/design/virtualization.md index f12ab4447..3bd6ae582 100644 --- a/docs/design/virtualization.md +++ b/docs/design/virtualization.md @@ -39,7 +39,7 @@ Details of each solution and a summary are provided below. Kata Containers with QEMU has complete compatibility with Kubernetes. Depending on the host architecture, Kata Containers supports various machine types, -for example `pc` and `q35` on x86 systems, `virt` on ARM systems and `pseries` on IBM Power systems. The default Kata Containers +for example `q35` on x86 systems, `virt` on ARM systems and `pseries` on IBM Power systems. The default Kata Containers machine type is `q35`. The machine type and its [`Machine accelerators`](#machine-accelerators) can be changed by editing the runtime [`configuration`](architecture/README.md#configuration) file. @@ -60,9 +60,8 @@ Machine accelerators are architecture specific and can be used to improve the pe and enable specific features of the machine types. The following machine accelerators are used in Kata Containers: -- NVDIMM: This machine accelerator is x86 specific and only supported by `pc` and -`q35` machine types. `nvdimm` is used to provide the root filesystem as a persistent -memory device to the Virtual Machine. +- NVDIMM: This machine accelerator is x86 specific and only supported by `q35` machine types. +`nvdimm` is used to provide the root filesystem as a persistent memory device to the Virtual Machine. #### Hotplug devices From dd4bd7f47109e40b893422ba40610074e44bb0e4 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Fri, 18 Mar 2022 03:40:26 -0700 Subject: [PATCH 04/29] doc: Added initial doc update for NV GPUs Fixed rpm vs deb references Update to the shell portion Fixes #3379 Signed-off-by: Zvonko Kaiser --- docs/use-cases/GPU-passthrough-and-Kata.md | 2 +- .../NVIDIA-GPU-passthrough-and-Kata.md | 372 ++++++++++++++++++ .../Nvidia-GPU-passthrough-and-Kata.md | 293 -------------- 3 files changed, 373 insertions(+), 294 deletions(-) create mode 100644 docs/use-cases/NVIDIA-GPU-passthrough-and-Kata.md delete mode 100644 docs/use-cases/Nvidia-GPU-passthrough-and-Kata.md diff --git a/docs/use-cases/GPU-passthrough-and-Kata.md b/docs/use-cases/GPU-passthrough-and-Kata.md index 493c947c9..4f00ec52e 100644 --- a/docs/use-cases/GPU-passthrough-and-Kata.md +++ b/docs/use-cases/GPU-passthrough-and-Kata.md @@ -3,4 +3,4 @@ Kata Containers supports passing certain GPUs from the host into the container. Select the GPU vendor for detailed information: - [Intel](Intel-GPU-passthrough-and-Kata.md) -- [Nvidia](Nvidia-GPU-passthrough-and-Kata.md) +- [NVIDIA](NVIDIA-GPU-passthrough-and-Kata.md) diff --git a/docs/use-cases/NVIDIA-GPU-passthrough-and-Kata.md b/docs/use-cases/NVIDIA-GPU-passthrough-and-Kata.md new file mode 100644 index 000000000..32943b8d3 --- /dev/null +++ b/docs/use-cases/NVIDIA-GPU-passthrough-and-Kata.md @@ -0,0 +1,372 @@ +# Using NVIDIA GPU device with Kata Containers + +An NVIDIA GPU device can be passed to a Kata Containers container using GPU +passthrough (NVIDIA GPU pass-through mode) as well as GPU mediated passthrough +(NVIDIA vGPU mode). + +NVIDIA GPU pass-through mode, an entire physical GPU is directly assigned to one +VM, bypassing the NVIDIA Virtual GPU Manager. In this mode of operation, the GPU +is accessed exclusively by the NVIDIA driver running in the VM to which it is +assigned. 
The GPU is not shared among VMs. + +NVIDIA Virtual GPU (vGPU) enables multiple virtual machines (VMs) to have +simultaneous, direct access to a single physical GPU, using the same NVIDIA +graphics drivers that are deployed on non-virtualized operating systems. By +doing this, NVIDIA vGPU provides VMs with unparalleled graphics performance, +compute performance, and application compatibility, together with the +cost-effectiveness and scalability brought about by sharing a GPU among multiple +workloads. A vGPU can be either time-sliced or Multi-Instance GPU (MIG)-backed +with [MIG-slices](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/). + +| Technology | Description | Behavior | Detail | +| --- | --- | --- | --- | +| NVIDIA GPU pass-through mode | GPU passthrough | Physical GPU assigned to a single VM | Direct GPU assignment to VM without limitation | +| NVIDIA vGPU time-sliced | GPU time-sliced | Physical GPU time-sliced for multiple VMs | Mediated passthrough | +| NVIDIA vGPU MIG-backed | GPU with MIG-slices | Physical GPU MIG-sliced for multiple VMs | Mediated passthrough | + +## Hardware Requirements + +NVIDIA GPUs Recommended for Virtualization: + +- NVIDIA Tesla (T4, M10, P6, V100 or newer) +- NVIDIA Quadro RTX 6000/8000 + +## Host BIOS Requirements + +Some hardware requires a larger PCI BARs window, for example, NVIDIA Tesla P100, +K40m + +```sh +$ lspci -s d0:00.0 -vv | grep Region + Region 0: Memory at e7000000 (32-bit, non-prefetchable) [size=16M] + Region 1: Memory at 222800000000 (64-bit, prefetchable) [size=32G] # Above 4G + Region 3: Memory at 223810000000 (64-bit, prefetchable) [size=32M] +``` + +For large BARs devices, MMIO mapping above 4G address space should be `enabled` +in the PCI configuration of the BIOS. + +Some hardware vendors use different name in BIOS, such as: + +- Above 4G Decoding +- Memory Hole for PCI MMIO +- Memory Mapped I/O above 4GB + +If one is using a GPU based on the Ampere architecture and later additionally +SR-IOV needs to be enabled for the vGPU use-case. + +The following steps outline the workflow for using an NVIDIA GPU with Kata. + +## Host Kernel Requirements + +The following configurations need to be enabled on your host kernel: + +- `CONFIG_VFIO` +- `CONFIG_VFIO_IOMMU_TYPE1` +- `CONFIG_VFIO_MDEV` +- `CONFIG_VFIO_MDEV_DEVICE` +- `CONFIG_VFIO_PCI` + +Your host kernel needs to be booted with `intel_iommu=on` on the kernel command +line. + +## Install and configure Kata Containers + +To use non-large BARs devices (for example, NVIDIA Tesla T4), you need Kata +version 1.3.0 or above. Follow the [Kata Containers setup +instructions](../install/README.md) to install the latest version of Kata. + +To use large BARs devices (for example, NVIDIA Tesla P100), you need Kata +version 1.11.0 or above. + +The following configuration in the Kata `configuration.toml` file as shown below +can work: + +Hotplug for PCI devices with small BARs by `acpi_pcihp` (Linux's ACPI PCI +Hotplug driver): + +```sh +machine_type = "q35" + +hotplug_vfio_on_root_bus = false +``` + +Hotplug for PCIe devices with large BARs by `pciehp` (Linux's PCIe Hotplug +driver): + +```sh +machine_type = "q35" + +hotplug_vfio_on_root_bus = true +pcie_root_port = 1 +``` + +## Build Kata Containers kernel with GPU support + +The default guest kernel installed with Kata Containers does not provide GPU +support. To use an NVIDIA GPU with Kata Containers, you need to build a kernel +with the necessary GPU support. 
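+
+A quick way to confirm whether a given guest kernel build already carries the
+required support is to grep its generated `.config` for the options listed
+below (a minimal sketch; the path to the kernel build directory and its
+`.config` depends on how and where the kernel was built):
+
+```sh
+$ grep -E 'CONFIG_HOTPLUG_PCI_PCIE|CONFIG_MODULES|CONFIG_MODULE_UNLOAD|CONFIG_PCI_MMCONFIG|CONFIG_DRM_NOUVEAU' .config
+```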
+ +The following kernel config options need to be enabled: + +```sh +# Support PCI/PCIe device hotplug (Required for large BARs device) +CONFIG_HOTPLUG_PCI_PCIE=y + +# Support for loading modules (Required for load NVIDIA drivers) +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y + +# Enable the MMIO access method for PCIe devices (Required for large BARs device) + CONFIG_PCI_MMCONFIG=y +``` + +The following kernel config options need to be disabled: + +```sh +# Disable Open Source NVIDIA driver nouveau +# It conflicts with NVIDIA official driver +CONFIG_DRM_NOUVEAU=n +``` + +> **Note**: `CONFIG_DRM_NOUVEAU` is normally disabled by default. +It is worth checking that it is not enabled in your kernel configuration to +prevent any conflicts. + +Build the Kata Containers kernel with the previous config options, using the +instructions described in [Building Kata Containers +kernel](../../tools/packaging/kernel). For further details on building and +installing guest kernels, see [the developer +guide](../Developer-Guide.md#install-guest-kernel-images). + +There is an easy way to build a guest kernel that supports NVIDIA GPU: + +```sh +## Build guest kernel with ../../tools/packaging/kernel + +# Prepare (download guest kernel source, generate .config) +$ ./build-kernel.sh -v 5.15.23 -g nvidia -f setup + +# Build guest kernel +$ ./build-kernel.sh -v 5.15.23 -g nvidia build + +# Install guest kernel +$ sudo -E ./build-kernel.sh -v 5.15.23 -g nvidia install +``` + +To build NVIDIA Driver in Kata container, `linux-headers` is required. +This is a way to generate deb packages for `linux-headers`: + +> **Note**: +> Run `make rpm-pkg` to build the rpm package. +> Run `make deb-pkg` to build the deb package. +> + +```sh +$ cd kata-linux-5.15.23-89 +$ make deb-pkg +``` +Before using the new guest kernel, please update the `kernel` parameters in + `configuration.toml`. + +```sh +kernel = "/usr/share/kata-containers/vmlinuz-nvidia-gpu.container" +``` + +## NVIDIA GPU pass-through mode with Kata Containers + +Use the following steps to pass an NVIDIA GPU device in pass-through mode with Kata: + +1. Find the Bus-Device-Function (BDF) for GPU device on host: + + ```sh + $ sudo lspci -nn -D | grep -i nvidia + 0000:d0:00.0 3D controller [0302]: NVIDIA Corporation Device [10de:20b9] (rev a1) + ``` + + > PCI address `0000:d0:00.0` is assigned to the hardware GPU device. + > `10de:20b9` is the device ID of the hardware GPU device. + +2. Find the IOMMU group for the GPU device: + + ```sh + $ BDF="0000:d0:00.0" + $ readlink -e /sys/bus/pci/devices/$BDF/iommu_group + ``` + + The previous output shows that the GPU belongs to IOMMU group 192. The next + step is to bind the GPU to the VFIO-PCI driver. + + ```sh + $ BDF="0000:d0:00.0" + $ DEV="/sys/bus/pci/devices/$BDF" + $ echo "vfio-pci" > $DEV/driver_override + $ echo $BDF > $DEV/driver/unbind + $ echo $BDF > /sys/bus/pci/drivers_probe + # To return the device to the standard driver, we simply clear the + # driver_override and reprobe the device, ex: + $ echo > $DEV/preferred_driver + $ echo $BDF > $DEV/driver/unbind + $ echo $BDF > /sys/bus/pci/drivers_probe + ``` + +3. Check the IOMMU group number under `/dev/vfio`: + + ```sh + $ ls -l /dev/vfio + total 0 + crw------- 1 zvonkok zvonkok 243, 0 Mar 18 03:06 192 + crw-rw-rw- 1 root root 10, 196 Mar 18 02:27 vfio + ``` + +4. 
Start a Kata container with GPU device: + + ```sh + # You may need to `modprobe vhost-vsock` if you get + # host system doesn't support vsock: stat /dev/vhost-vsock + $ sudo ctr --debug run --runtime "io.containerd.kata.v2" --device /dev/vfio/192 --rm -t "docker.io/library/archlinux:latest" arch uname -r + ``` + +5. Run `lspci` within the container to verify the GPU device is seen in the list + of the PCI devices. Note the vendor-device id of the GPU (`10de:20b9`) in the `lspci` output. + + ```sh + $ sudo ctr --debug run --runtime "io.containerd.kata.v2" --device /dev/vfio/192 --rm -t "docker.io/library/archlinux:latest" arch sh -c "lspci -nn | grep '10de:20b9'" + ``` + +6. Additionally, you can check the PCI BARs space of ​​the NVIDIA GPU device in the container: + + ```sh + $ sudo ctr --debug run --runtime "io.containerd.kata.v2" --device /dev/vfio/192 --rm -t "docker.io/library/archlinux:latest" arch sh -c "lspci -s 02:00.0 -vv | grep Region" + ``` + + > **Note**: If you see a message similar to the above, the BAR space of the NVIDIA + > GPU has been successfully allocated. + +## NVIDIA vGPU mode with Kata Containers + +NVIDIA vGPU is a licensed product on all supported GPU boards. A software license +is required to enable all vGPU features within the guest VM. + +> **TODO**: Will follow up with instructions + +## Install NVIDIA Driver + Toolkit in Kata Containers Guest OS + +Consult the [Developer-Guide](https://github.com/kata-containers/kata-containers/blob/main/docs/Developer-Guide.md#create-a-rootfs-image) on how to create a +rootfs base image for a distribution of your choice. This is going to be used as +a base for a NVIDIA enabled guest OS. Use the `EXTRA_PKGS` variable to install +all the needed packages to compile the drivers. Also copy the kernel development +packages from the previous `make deb-pkg` into `$ROOTFS_DIR`. + +```sh +export EXTRA_PKGS="gcc make curl gnupg" +``` + +Having the `$ROOTFS_DIR` exported in the previous step we can now install all the +need parts in the guest OS. In this case we have an Ubuntu based rootfs. + +First off all mount the special filesystems into the rootfs + +```sh +$ sudo mount -t sysfs -o ro none ${ROOTFS_DIR}/sys +$ sudo mount -t proc -o ro none ${ROOTFS_DIR}/proc +$ sudo mount -t tmpfs none ${ROOTFS_DIR}/tmp +$ sudo mount -o bind,ro /dev ${ROOTFS_DIR}/dev +$ sudo mount -t devpts none ${ROOTFS_DIR}/dev/pts +``` + +Now we can enter `chroot` + +```sh +$ sudo chroot ${ROOTFS_DIR} +``` + +Inside the rootfs one is going to install the drivers and toolkit to enable easy +creation of GPU containers with Kata. We can also use this rootfs for any other +container not specifically only for GPUs. + +As a prerequisite install the copied kernel development packages + +```sh +$ sudo dpkg -i *.deb +``` + +Get the driver run file, since we need to build the driver against a kernel that +is not running on the host we need the ability to specify the exact version we +want the driver to build against. Take the kernel version one used for building +the NVIDIA kernel (`5.15.23-nvidia-gpu`). + +```sh +$ wget https://us.download.nvidia.com/XFree86/Linux-x86_64/510.54/NVIDIA-Linux-x86_64-510.54.run +$ chmod +x NVIDIA-Linux-x86_64-510.54.run +# Extract the source files so we can run the installer with arguments +$ ./NVIDIA-Linux-x86_64-510.54.run -x +$ cd NVIDIA-Linux-x86_64-510.54 +$ ./nvidia-installer -k 5.15.23-nvidia-gpu +``` +Having the drivers installed we need to install the toolkit which will take care +of providing the right bits into the container. 
+ +```sh +$ distribution=$(. /etc/os-release;echo $ID$VERSION_ID) +$ curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg +$ curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list +$ apt update +$ apt install nvidia-container-toolkit +``` + +Create the hook execution file for Kata: + +``` +# Content of $ROOTFS_DIR/usr/share/oci/hooks/prestart/nvidia-container-toolkit.sh + +#!/bin/bash -x + +/usr/bin/nvidia-container-toolkit -debug $@ +``` + +As a last step one can do some cleanup of files or package caches. Build the +rootfs and configure it for use with Kata according to the development guide. + +Enable the `guest_hook_path` in Kata's `configuration.toml` + +```sh +guest_hook_path = "/usr/share/oci/hooks" +``` + +One has build a NVIDIA rootfs, kernel and now we can run any GPU container +without installing the drivers into the container. Check NVIDIA device status +with `nvidia-smi` + +```sh +$ sudo ctr --debug run --runtime "io.containerd.kata.v2" --device /dev/vfio/192 --rm -t "docker.io/nvidia/cuda:11.6.0-base-ubuntu20.04" cuda nvidia-smi +Fri Mar 18 10:36:59 2022 ++-----------------------------------------------------------------------------+ +| NVIDIA-SMI 510.54 Driver Version: 510.54 CUDA Version: 11.6 | +|-------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|===============================+======================+======================| +| 0 NVIDIA A30X Off | 00000000:02:00.0 Off | 0 | +| N/A 38C P0 67W / 230W | 0MiB / 24576MiB | 0% Default | +| | | Disabled | ++-------------------------------+----------------------+----------------------+ + ++-----------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=============================================================================| +| No running processes found | ++-----------------------------------------------------------------------------+ +``` + +As a last step one can remove the additional packages and files that were added +to the `$ROOTFS_DIR` to keep it as small as possible. + +## References + +- [Configuring a VM for GPU Pass-Through by Using the QEMU Command Line](https://docs.nvidia.com/grid/latest/grid-vgpu-user-guide/index.html#using-gpu-pass-through-red-hat-el-qemu-cli) +- https://gitlab.com/nvidia/container-images/driver/-/tree/master +- https://github.com/NVIDIA/nvidia-docker/wiki/Driver-containers diff --git a/docs/use-cases/Nvidia-GPU-passthrough-and-Kata.md b/docs/use-cases/Nvidia-GPU-passthrough-and-Kata.md deleted file mode 100644 index 58b36c02d..000000000 --- a/docs/use-cases/Nvidia-GPU-passthrough-and-Kata.md +++ /dev/null @@ -1,293 +0,0 @@ -# Using Nvidia GPU device with Kata Containers - -An Nvidia GPU device can be passed to a Kata Containers container using GPU passthrough -(Nvidia GPU pass-through mode) as well as GPU mediated passthrough (Nvidia vGPU mode).  - -Nvidia GPU pass-through mode, an entire physical GPU is directly assigned to one VM, -bypassing the Nvidia Virtual GPU Manager. 
In this mode of operation, the GPU is accessed -exclusively by the Nvidia driver running in the VM to which it is assigned. -The GPU is not shared among VMs. - -Nvidia Virtual GPU (vGPU) enables multiple virtual machines (VMs) to have simultaneous, -direct access to a single physical GPU, using the same Nvidia graphics drivers that are -deployed on non-virtualized operating systems. By doing this, Nvidia vGPU provides VMs -with unparalleled graphics performance, compute performance, and application compatibility, -together with the cost-effectiveness and scalability brought about by sharing a GPU -among multiple workloads. - -| Technology | Description | Behaviour | Detail | -| --- | --- | --- | --- | -| Nvidia GPU pass-through mode | GPU passthrough | Physical GPU assigned to a single VM | Direct GPU assignment to VM without limitation | -| Nvidia vGPU mode | GPU sharing | Physical GPU shared by multiple VMs | Mediated passthrough | - -## Hardware Requirements -Nvidia GPUs Recommended for Virtualization: - -- Nvidia Tesla (T4, M10, P6, V100 or newer) -- Nvidia Quadro RTX 6000/8000 - -## Host BIOS Requirements - -Some hardware requires a larger PCI BARs window, for example, Nvidia Tesla P100, K40m -``` -$ lspci -s 04:00.0 -vv | grep Region - Region 0: Memory at c6000000 (32-bit, non-prefetchable) [size=16M] - Region 1: Memory at 383800000000 (64-bit, prefetchable) [size=16G] #above 4G - Region 3: Memory at 383c00000000 (64-bit, prefetchable) [size=32M] -``` - -For large BARs devices, MMIO mapping above 4G address space should be `enabled` -in the PCI configuration of the BIOS. - -Some hardware vendors use different name in BIOS, such as: - -- Above 4G Decoding -- Memory Hole for PCI MMIO -- Memory Mapped I/O above 4GB - -The following steps outline the workflow for using an Nvidia GPU with Kata. - -## Host Kernel Requirements -The following configurations need to be enabled on your host kernel: - -- `CONFIG_VFIO` -- `CONFIG_VFIO_IOMMU_TYPE1` -- `CONFIG_VFIO_MDEV` -- `CONFIG_VFIO_MDEV_DEVICE` -- `CONFIG_VFIO_PCI` - -Your host kernel needs to be booted with `intel_iommu=on` on the kernel command line. - -## Install and configure Kata Containers -To use non-large BARs devices (for example, Nvidia Tesla T4), you need Kata version 1.3.0 or above. -Follow the [Kata Containers setup instructions](../install/README.md) -to install the latest version of Kata. - -To use large BARs devices (for example, Nvidia Tesla P100), you need Kata version 1.11.0 or above. - -The following configuration in the Kata `configuration.toml` file as shown below can work: - -Hotplug for PCI devices by `acpi_pcihp` (Linux's ACPI PCI Hotplug driver): -``` -machine_type = "q35" - -hotplug_vfio_on_root_bus = false -``` - -Hotplug for PCIe devices by `pciehp` (Linux's PCIe Hotplug driver): -``` -machine_type = "q35" - -hotplug_vfio_on_root_bus = true -pcie_root_port = 1 -``` - -## Build Kata Containers kernel with GPU support -The default guest kernel installed with Kata Containers does not provide GPU support. -To use an Nvidia GPU with Kata Containers, you need to build a kernel with the -necessary GPU support. 
- -The following kernel config options need to be enabled: -``` -# Support PCI/PCIe device hotplug (Required for large BARs device) -CONFIG_HOTPLUG_PCI_PCIE=y - -# Support for loading modules (Required for load Nvidia drivers) -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y - -# Enable the MMIO access method for PCIe devices (Required for large BARs device) -CONFIG_PCI_MMCONFIG=y -``` - -The following kernel config options need to be disabled: -``` -# Disable Open Source Nvidia driver nouveau -# It conflicts with Nvidia official driver -CONFIG_DRM_NOUVEAU=n -``` -> **Note**: `CONFIG_DRM_NOUVEAU` is normally disabled by default. -It is worth checking that it is not enabled in your kernel configuration to prevent any conflicts. - - -Build the Kata Containers kernel with the previous config options, -using the instructions described in [Building Kata Containers kernel](../../tools/packaging/kernel). -For further details on building and installing guest kernels, -see [the developer guide](../Developer-Guide.md#install-guest-kernel-images). - -There is an easy way to build a guest kernel that supports Nvidia GPU: -``` -## Build guest kernel with ../../tools/packaging/kernel - -# Prepare (download guest kernel source, generate .config) -$ ./build-kernel.sh -v 4.19.86 -g nvidia -f setup - -# Build guest kernel -$ ./build-kernel.sh -v 4.19.86 -g nvidia build - -# Install guest kernel -$ sudo -E ./build-kernel.sh -v 4.19.86 -g nvidia install -/usr/share/kata-containers/vmlinux-nvidia-gpu.container -> vmlinux-4.19.86-70-nvidia-gpu -/usr/share/kata-containers/vmlinuz-nvidia-gpu.container -> vmlinuz-4.19.86-70-nvidia-gpu -``` - -To build Nvidia Driver in Kata container, `kernel-devel` is required. -This is a way to generate rpm packages for `kernel-devel`: -``` -$ cd kata-linux-4.19.86-68 -$ make rpm-pkg -Output RPMs: -~/rpmbuild/RPMS/x86_64/kernel-devel-4.19.86_nvidia_gpu-1.x86_64.rpm -``` -> **Note**: -> - `kernel-devel` should be installed in Kata container before run Nvidia driver installer. -> - Run `make deb-pkg` to build the deb package. - -Before using the new guest kernel, please update the `kernel` parameters in `configuration.toml`. -``` -kernel = "/usr/share/kata-containers/vmlinuz-nvidia-gpu.container" -``` - -## Nvidia GPU pass-through mode with Kata Containers -Use the following steps to pass an Nvidia GPU device in pass-through mode with Kata: - -1. Find the Bus-Device-Function (BDF) for GPU device on host: - ``` - $ sudo lspci -nn -D | grep -i nvidia - 0000:04:00.0 3D controller [0302]: NVIDIA Corporation Device [10de:15f8] (rev a1) - 0000:84:00.0 3D controller [0302]: NVIDIA Corporation Device [10de:15f8] (rev a1) - ``` - > PCI address `0000:04:00.0` is assigned to the hardware GPU device. - > `10de:15f8` is the device ID of the hardware GPU device. - -2. Find the IOMMU group for the GPU device: - ``` - $ BDF="0000:04:00.0" - $ readlink -e /sys/bus/pci/devices/$BDF/iommu_group - /sys/kernel/iommu_groups/45 - ``` - The previous output shows that the GPU belongs to IOMMU group 45. - -3. Check the IOMMU group number under `/dev/vfio`: - ``` - $ ls -l /dev/vfio - total 0 - crw------- 1 root root 248, 0 Feb 28 09:57 45 - crw------- 1 root root 248, 1 Feb 28 09:57 54 - crw-rw-rw- 1 root root 10, 196 Feb 28 09:57 vfio - ``` - -4. Start a Kata container with GPU device: - ``` - $ sudo docker run -it --runtime=kata-runtime --cap-add=ALL --device /dev/vfio/45 centos /bin/bash - ``` - -5. Run `lspci` within the container to verify the GPU device is seen in the list - of the PCI devices. 
Note the vendor-device id of the GPU (`10de:15f8`) in the `lspci` output. - ``` - $ lspci -nn -D | grep '10de:15f8' - 0000:01:01.0 3D controller [0302]: NVIDIA Corporation GP100GL [Tesla P100 PCIe 16GB] [10de:15f8] (rev a1) - ``` - -6. Additionally, you can check the PCI BARs space of ​​the Nvidia GPU device in the container: - ``` - $ lspci -s 01:01.0 -vv | grep Region - Region 0: Memory at c0000000 (32-bit, non-prefetchable) [disabled] [size=16M] - Region 1: Memory at 4400000000 (64-bit, prefetchable) [disabled] [size=16G] - Region 3: Memory at 4800000000 (64-bit, prefetchable) [disabled] [size=32M] - ``` - > **Note**: If you see a message similar to the above, the BAR space of the Nvidia - > GPU has been successfully allocated. - -## Nvidia vGPU mode with Kata Containers - -Nvidia vGPU is a licensed product on all supported GPU boards. A software license -is required to enable all vGPU features within the guest VM. - -> **Note**: There is no suitable test environment, so it is not written here. - - -## Install Nvidia Driver in Kata Containers -Download the official Nvidia driver from -[https://www.nvidia.com/Download/index.aspx](https://www.nvidia.com/Download/index.aspx), -for example `NVIDIA-Linux-x86_64-418.87.01.run`. - -Install the `kernel-devel`(generated in the previous steps) for guest kernel: -``` -$ sudo rpm -ivh kernel-devel-4.19.86_gpu-1.x86_64.rpm -``` - -Here is an example to extract, compile and install Nvidia driver: -``` -## Extract -$ sh ./NVIDIA-Linux-x86_64-418.87.01.run -x - -## Compile and install (It will take some time) -$ cd NVIDIA-Linux-x86_64-418.87.01 -$ sudo ./nvidia-installer -a -q --ui=none \ - --no-cc-version-check \ - --no-opengl-files --no-install-libglvnd \ - --kernel-source-path=/usr/src/kernels/`uname -r` -``` - -Or just run one command line: -``` -$ sudo sh ./NVIDIA-Linux-x86_64-418.87.01.run -a -q --ui=none \ - --no-cc-version-check \ - --no-opengl-files --no-install-libglvnd \ - --kernel-source-path=/usr/src/kernels/`uname -r` -``` - -To view detailed logs of the installer: -``` -$ tail -f /var/log/nvidia-installer.log -``` - -Load Nvidia driver module manually -``` -# Optional(generate modules.dep and map files for Nvidia driver) -$ sudo depmod - -# Load module -$ sudo modprobe nvidia-drm - -# Check module -$ lsmod | grep nvidia -nvidia_drm 45056 0 -nvidia_modeset 1093632 1 nvidia_drm -nvidia 18202624 1 nvidia_modeset -drm_kms_helper 159744 1 nvidia_drm -drm 364544 3 nvidia_drm,drm_kms_helper -i2c_core 65536 3 nvidia,drm_kms_helper,drm -ipmi_msghandler 49152 1 nvidia -``` - - -Check Nvidia device status with `nvidia-smi` -``` -$ nvidia-smi -Tue Mar 3 00:03:49 2020 -+-----------------------------------------------------------------------------+ -| NVIDIA-SMI 418.87.01 Driver Version: 418.87.01 CUDA Version: 10.1 | -|-------------------------------+----------------------+----------------------+ -| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | -|===============================+======================+======================| -| 0 Tesla P100-PCIE... 
Off | 00000000:01:01.0 Off | 0 | -| N/A 27C P0 25W / 250W | 0MiB / 16280MiB | 0% Default | -+-------------------------------+----------------------+----------------------+ - -+-----------------------------------------------------------------------------+ -| Processes: GPU Memory | -| GPU PID Type Process name Usage | -|=============================================================================| -| No running processes found | -+-----------------------------------------------------------------------------+ - -``` - -## References - -- [Configuring a VM for GPU Pass-Through by Using the QEMU Command Line](https://docs.nvidia.com/grid/latest/grid-vgpu-user-guide/index.html#using-gpu-pass-through-red-hat-el-qemu-cli) -- https://gitlab.com/nvidia/container-images/driver/-/tree/master -- https://github.com/NVIDIA/nvidia-docker/wiki/Driver-containers From 2c218a07b99e55401107f7f59b31c5b38550e7f2 Mon Sep 17 00:00:00 2001 From: Manabu Sugimoto Date: Thu, 30 Sep 2021 21:07:24 +0900 Subject: [PATCH 05/29] agent: Modify Kata agent for runk Generate an oci-kata-agent which is a customized agent to be called from runk which is a Rust-based standard OCI container runtime based on Kata agent. Fixes: #2784 Signed-off-by: Manabu Sugimoto --- src/agent/Cargo.lock | 1 + src/agent/Cargo.toml | 10 ++++ src/agent/Makefile | 8 +++ src/agent/rustjail/Cargo.toml | 2 + src/agent/rustjail/src/console.rs | 82 +++++++++++++++++++++++++++++ src/agent/rustjail/src/container.rs | 72 +++++++++++++++++++++---- src/agent/rustjail/src/lib.rs | 2 + src/agent/rustjail/src/specconv.rs | 2 +- src/agent/src/rpc.rs | 2 +- 9 files changed, 168 insertions(+), 13 deletions(-) create mode 100644 src/agent/rustjail/src/console.rs diff --git a/src/agent/Cargo.lock b/src/agent/Cargo.lock index 5c9862e25..096a81a8e 100644 --- a/src/agent/Cargo.lock +++ b/src/agent/Cargo.lock @@ -1370,6 +1370,7 @@ dependencies = [ "async-trait", "capctl", "caps", + "cfg-if 0.1.10", "cgroups-rs", "futures", "inotify", diff --git a/src/agent/Cargo.toml b/src/agent/Cargo.toml index 521fbd5dc..220e66287 100644 --- a/src/agent/Cargo.toml +++ b/src/agent/Cargo.toml @@ -76,3 +76,13 @@ lto = true [features] seccomp = ["rustjail/seccomp"] +standard-oci-runtime = ["rustjail/standard-oci-runtime"] + +[[bin]] +name = "kata-agent" +path = "src/main.rs" + +[[bin]] +name = "oci-kata-agent" +path = "src/main.rs" +required-features = ["standard-oci-runtime"] diff --git a/src/agent/Makefile b/src/agent/Makefile index 13f2ce618..9d1327c00 100644 --- a/src/agent/Makefile +++ b/src/agent/Makefile @@ -33,6 +33,14 @@ ifeq ($(SECCOMP),yes) override EXTRA_RUSTFEATURES += seccomp endif +##VAR STANDARD_OCI_RUNTIME=yes|no define if agent enables standard oci runtime feature +STANDARD_OCI_RUNTIME := no + +# Enable standard oci runtime feature of rust build +ifeq ($(STANDARD_OCI_RUNTIME),yes) + override EXTRA_RUSTFEATURES += standard-oci-runtime +endif + ifneq ($(EXTRA_RUSTFEATURES),) override EXTRA_RUSTFEATURES := --features "$(EXTRA_RUSTFEATURES)" endif diff --git a/src/agent/rustjail/Cargo.toml b/src/agent/rustjail/Cargo.toml index 9e0624649..306af2795 100644 --- a/src/agent/rustjail/Cargo.toml +++ b/src/agent/rustjail/Cargo.toml @@ -25,6 +25,7 @@ path-absolutize = "1.2.0" anyhow = "1.0.32" cgroups = { package = "cgroups-rs", version = "0.2.8" } rlimit = "0.5.3" +cfg-if = "0.1.0" tokio = { version = "1.2.0", features = ["sync", "io-util", "process", "time", "macros"] } futures = "0.3.17" @@ -38,3 +39,4 @@ tempfile = "3.1.0" [features] seccomp = ["libseccomp"] +standard-oci-runtime 
= [] diff --git a/src/agent/rustjail/src/console.rs b/src/agent/rustjail/src/console.rs new file mode 100644 index 000000000..23cf6e840 --- /dev/null +++ b/src/agent/rustjail/src/console.rs @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2021 Sony Group Corporation +// + +use anyhow::{anyhow, Result}; +use nix::errno::Errno; +use nix::pty; +use nix::sys::{socket, uio}; +use nix::unistd::{self, dup2}; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::path::Path; + +pub fn setup_console_socket(csocket_path: &str) -> Result> { + if csocket_path.is_empty() { + return Ok(None); + } + + let socket_fd = socket::socket( + socket::AddressFamily::Unix, + socket::SockType::Stream, + socket::SockFlag::empty(), + None, + )?; + + match socket::connect( + socket_fd, + &socket::SockAddr::Unix(socket::UnixAddr::new(Path::new(csocket_path))?), + ) { + Ok(()) => Ok(Some(socket_fd)), + Err(errno) => Err(anyhow!("failed to open console fd: {}", errno)), + } +} + +pub fn setup_master_console(socket_fd: RawFd) -> Result<()> { + let pseudo = pty::openpty(None, None)?; + + let pty_name: &[u8] = b"/dev/ptmx"; + let iov = [uio::IoVec::from_slice(pty_name)]; + let fds = [pseudo.master]; + let cmsg = socket::ControlMessage::ScmRights(&fds); + + socket::sendmsg(socket_fd, &iov, &[cmsg], socket::MsgFlags::empty(), None)?; + + unistd::setsid()?; + let ret = unsafe { libc::ioctl(pseudo.slave, libc::TIOCSCTTY) }; + Errno::result(ret).map_err(|e| anyhow!(e).context("ioctl TIOCSCTTY"))?; + + dup2(pseudo.slave, std::io::stdin().as_raw_fd())?; + dup2(pseudo.slave, std::io::stdout().as_raw_fd())?; + dup2(pseudo.slave, std::io::stderr().as_raw_fd())?; + + unistd::close(socket_fd)?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::skip_if_not_root; + use std::fs::File; + use std::os::unix::net::UnixListener; + use std::path::PathBuf; + use tempfile::{self, tempdir}; + + const CONSOLE_SOCKET: &str = "console-socket"; + + #[test] + fn test_setup_console_socket() { + let dir = tempdir() + .map_err(|e| anyhow!(e).context("tempdir failed")) + .unwrap(); + let socket_path = dir.path().join(CONSOLE_SOCKET); + + let _listener = UnixListener::bind(&socket_path).unwrap(); + + let ret = setup_console_socket(socket_path.to_str().unwrap()); + + assert!(ret.is_ok()); + } +} diff --git a/src/agent/rustjail/src/container.rs b/src/agent/rustjail/src/container.rs index c4cc8a095..37ffa091d 100644 --- a/src/agent/rustjail/src/container.rs +++ b/src/agent/rustjail/src/container.rs @@ -23,6 +23,8 @@ use crate::cgroups::fs::Manager as FsManager; #[cfg(test)] use crate::cgroups::mock::Manager as FsManager; use crate::cgroups::Manager; +#[cfg(feature = "standard-oci-runtime")] +use crate::console; use crate::log_child; use crate::process::Process; #[cfg(feature = "seccomp")] @@ -64,7 +66,7 @@ use tokio::sync::Mutex; use crate::utils; -const EXEC_FIFO_FILENAME: &str = "exec.fifo"; +pub const EXEC_FIFO_FILENAME: &str = "exec.fifo"; const INIT: &str = "INIT"; const NO_PIVOT: &str = "NO_PIVOT"; @@ -74,6 +76,10 @@ const CLOG_FD: &str = "CLOG_FD"; const FIFO_FD: &str = "FIFO_FD"; const HOME_ENV_KEY: &str = "HOME"; const PIDNS_FD: &str = "PIDNS_FD"; +const CONSOLE_SOCKET_FD: &str = "CONSOLE_SOCKET_FD"; + +#[cfg(feature = "standard-oci-runtime")] +const OCI_AGENT_BINARY: &str = "oci-kata-agent"; #[derive(Debug)] pub struct ContainerStatus { @@ -82,7 +88,7 @@ pub struct ContainerStatus { } impl ContainerStatus { - fn new() -> Self { + pub fn new() -> Self { ContainerStatus { pre_status: ContainerState::Created, 
cur_status: ContainerState::Created, @@ -99,6 +105,12 @@ impl ContainerStatus { } } +impl Default for ContainerStatus { + fn default() -> Self { + Self::new() + } +} + pub type Config = CreateOpts; type NamespaceType = String; @@ -106,7 +118,7 @@ lazy_static! { // This locker ensures the child exit signal will be received by the right receiver. pub static ref WAIT_PID_LOCKER: Arc> = Arc::new(Mutex::new(false)); - static ref NAMESPACES: HashMap<&'static str, CloneFlags> = { + pub static ref NAMESPACES: HashMap<&'static str, CloneFlags> = { let mut m = HashMap::new(); m.insert("user", CloneFlags::CLONE_NEWUSER); m.insert("ipc", CloneFlags::CLONE_NEWIPC); @@ -119,7 +131,7 @@ lazy_static! { }; // type to name hashmap, better to be in NAMESPACES - static ref TYPETONAME: HashMap<&'static str, &'static str> = { + pub static ref TYPETONAME: HashMap<&'static str, &'static str> = { let mut m = HashMap::new(); m.insert("ipc", "ipc"); m.insert("user", "user"); @@ -236,6 +248,8 @@ pub struct LinuxContainer { pub status: ContainerStatus, pub created: SystemTime, pub logger: Logger, + #[cfg(feature = "standard-oci-runtime")] + pub console_socket: PathBuf, } #[derive(Serialize, Deserialize, Debug)] @@ -359,7 +373,6 @@ fn do_init_child(cwfd: RawFd) -> Result<()> { ))); } } - log_child!(cfd_log, "child process start run"); let buf = read_sync(crfd)?; let spec_str = std::str::from_utf8(&buf)?; @@ -379,6 +392,9 @@ fn do_init_child(cwfd: RawFd) -> Result<()> { let cm: FsManager = serde_json::from_str(cm_str)?; + #[cfg(feature = "standard-oci-runtime")] + let csocket_fd = console::setup_console_socket(&std::env::var(CONSOLE_SOCKET_FD)?)?; + let p = if spec.process.is_some() { spec.process.as_ref().unwrap() } else { @@ -670,10 +686,19 @@ fn do_init_child(cwfd: RawFd) -> Result<()> { let _ = unistd::close(crfd); let _ = unistd::close(cwfd); - unistd::setsid().context("create a new session")?; if oci_process.terminal { - unsafe { - libc::ioctl(0, libc::TIOCSCTTY); + cfg_if::cfg_if! { + if #[cfg(feature = "standard-oci-runtime")] { + if let Some(csocket_fd) = csocket_fd { + console::setup_master_console(csocket_fd)?; + } else { + return Err(anyhow!("failed to get console master socket fd")); + } + } + else { + unistd::setsid().context("create a new session")?; + unsafe { libc::ioctl(0, libc::TIOCSCTTY) }; + } } } @@ -926,8 +951,24 @@ impl BaseContainer for LinuxContainer { let _ = unistd::close(pid); }); - let exec_path = std::env::current_exe()?; + cfg_if::cfg_if! 
{ + if #[cfg(feature = "standard-oci-runtime")] { + let exec_path = PathBuf::from(OCI_AGENT_BINARY); + } + else { + let exec_path = std::env::current_exe()?; + } + } + let mut child = std::process::Command::new(exec_path); + + #[allow(unused_mut)] + let mut console_name = PathBuf::from(""); + #[cfg(feature = "standard-oci-runtime")] + if !self.console_socket.as_os_str().is_empty() { + console_name = self.console_socket.clone(); + } + let mut child = child .arg("init") .stdin(child_stdin) @@ -937,7 +978,8 @@ impl BaseContainer for LinuxContainer { .env(NO_PIVOT, format!("{}", self.config.no_pivot_root)) .env(CRFD_FD, format!("{}", crfd)) .env(CWFD_FD, format!("{}", cwfd)) - .env(CLOG_FD, format!("{}", cfd_log)); + .env(CLOG_FD, format!("{}", cfd_log)) + .env(CONSOLE_SOCKET_FD, console_name); if p.init { child = child.env(FIFO_FD, format!("{}", fifofd)); @@ -1419,8 +1461,16 @@ impl LinuxContainer { .unwrap() .as_secs(), logger: logger.new(o!("module" => "rustjail", "subsystem" => "container", "cid" => id)), + #[cfg(feature = "standard-oci-runtime")] + console_socket: Path::new("").to_path_buf(), }) } + + #[cfg(feature = "standard-oci-runtime")] + pub fn set_console_socket(&mut self, console_socket: &Path) -> Result<()> { + self.console_socket = console_socket.to_path_buf(); + Ok(()) + } } fn setgroups(grps: &[libc::gid_t]) -> Result<()> { @@ -1460,7 +1510,7 @@ use std::process::Stdio; use std::time::Duration; use tokio::io::{AsyncReadExt, AsyncWriteExt}; -async fn execute_hook(logger: &Logger, h: &Hook, st: &OCIState) -> Result<()> { +pub async fn execute_hook(logger: &Logger, h: &Hook, st: &OCIState) -> Result<()> { let logger = logger.new(o!("action" => "execute-hook")); let binary = PathBuf::from(h.path.as_str()); diff --git a/src/agent/rustjail/src/lib.rs b/src/agent/rustjail/src/lib.rs index 4d1b06a27..f5f154261 100644 --- a/src/agent/rustjail/src/lib.rs +++ b/src/agent/rustjail/src/lib.rs @@ -30,6 +30,8 @@ extern crate regex; pub mod capabilities; pub mod cgroups; +#[cfg(feature = "standard-oci-runtime")] +pub mod console; pub mod container; pub mod mount; pub mod pipestream; diff --git a/src/agent/rustjail/src/specconv.rs b/src/agent/rustjail/src/specconv.rs index b61e544a3..0c8bb46db 100644 --- a/src/agent/rustjail/src/specconv.rs +++ b/src/agent/rustjail/src/specconv.rs @@ -5,7 +5,7 @@ use oci::Spec; -#[derive(Debug)] +#[derive(Serialize, Deserialize, Debug, Default, Clone)] pub struct CreateOpts { pub cgroup_name: String, pub use_systemd_cgroup: bool, diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index 94e3879b9..2dc4dd337 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -1802,7 +1802,7 @@ async fn do_add_swap(sandbox: &Arc>, req: &AddSwapRequest) -> Res // - config.json at ///config.json // - container rootfs bind mounted at ///rootfs // - modify container spec root to point to ///rootfs -fn setup_bundle(cid: &str, spec: &mut Spec) -> Result { +pub fn setup_bundle(cid: &str, spec: &mut Spec) -> Result { let spec_root = if let Some(sr) = &spec.root { sr } else { From b221a2590fb70e011896a2d85bbca74d501b85b7 Mon Sep 17 00:00:00 2001 From: Manabu Sugimoto Date: Thu, 13 Jan 2022 17:26:15 +0900 Subject: [PATCH 06/29] tools: Add runk Add a Rust-based standard OCI container runtime based on Kata agent. 
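Once built and installed (see the steps below), runk is meant to be driven like
a standard OCI runtime. The following is only a sketch: the subcommand names
(`spec`, `create`, `start`, `run`, `state`, `kill`, `delete`) match the sources
added in this patch, but the exact flags may differ, so consult `runk --help`:

```sh
# Prepare an OCI bundle by hand (busybox used here purely as an example image)
$ mkdir -p bundle/rootfs && cd bundle
$ docker export $(docker create busybox) | tar -C rootfs -xf -

# Generate a default config.json (assumed to mirror `runc spec`)
$ runk spec

# Create and start the container defined by ./config.json (runc-style usage)
$ sudo runk run test-container
```
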
You can build and install runk as follows: ```sh $ cd src/tools/runk $ make $ sudo make install $ runk --help ``` Fixes: #2784 Signed-off-by: Manabu Sugimoto --- Makefile | 1 + README.md | 1 + src/tools/runk/.gitignore | 1 + src/tools/runk/Cargo.lock | 1474 ++++++++++++++++++ src/tools/runk/Cargo.toml | 29 + src/tools/runk/Makefile | 60 + src/tools/runk/README.md | 282 ++++ src/tools/runk/libcontainer/Cargo.toml | 23 + src/tools/runk/libcontainer/src/builder.rs | 121 ++ src/tools/runk/libcontainer/src/cgroup.rs | 40 + src/tools/runk/libcontainer/src/container.rs | 151 ++ src/tools/runk/libcontainer/src/lib.rs | 10 + src/tools/runk/libcontainer/src/status.rs | 246 +++ src/tools/runk/libcontainer/src/utils.rs | 106 ++ src/tools/runk/src/commands/create.rs | 37 + src/tools/runk/src/commands/delete.rs | 103 ++ src/tools/runk/src/commands/kill.rs | 82 + src/tools/runk/src/commands/mod.rs | 12 + src/tools/runk/src/commands/run.rs | 26 + src/tools/runk/src/commands/spec.rs | 207 +++ src/tools/runk/src/commands/start.rs | 48 + src/tools/runk/src/commands/state.rs | 79 + src/tools/runk/src/main.rs | 111 ++ 23 files changed, 3250 insertions(+) create mode 100644 src/tools/runk/.gitignore create mode 100644 src/tools/runk/Cargo.lock create mode 100644 src/tools/runk/Cargo.toml create mode 100644 src/tools/runk/Makefile create mode 100644 src/tools/runk/README.md create mode 100644 src/tools/runk/libcontainer/Cargo.toml create mode 100644 src/tools/runk/libcontainer/src/builder.rs create mode 100644 src/tools/runk/libcontainer/src/cgroup.rs create mode 100644 src/tools/runk/libcontainer/src/container.rs create mode 100644 src/tools/runk/libcontainer/src/lib.rs create mode 100644 src/tools/runk/libcontainer/src/status.rs create mode 100644 src/tools/runk/libcontainer/src/utils.rs create mode 100644 src/tools/runk/src/commands/create.rs create mode 100644 src/tools/runk/src/commands/delete.rs create mode 100644 src/tools/runk/src/commands/kill.rs create mode 100644 src/tools/runk/src/commands/mod.rs create mode 100644 src/tools/runk/src/commands/run.rs create mode 100644 src/tools/runk/src/commands/spec.rs create mode 100644 src/tools/runk/src/commands/start.rs create mode 100644 src/tools/runk/src/commands/state.rs create mode 100644 src/tools/runk/src/main.rs diff --git a/Makefile b/Makefile index 6bb914271..6f398c323 100644 --- a/Makefile +++ b/Makefile @@ -14,6 +14,7 @@ TOOLS = TOOLS += agent-ctl TOOLS += trace-forwarder +TOOLS += runk STANDARD_TARGETS = build check clean install test vendor diff --git a/README.md b/README.md index 431d687de..3cac62400 100644 --- a/README.md +++ b/README.md @@ -132,6 +132,7 @@ The table below lists the remaining parts of the project: | [osbuilder](tools/osbuilder) | infrastructure | Tool to create "mini O/S" rootfs and initrd images and kernel for the hypervisor. | | [`agent-ctl`](src/tools/agent-ctl) | utility | Tool that provides low-level access for testing the agent. | | [`trace-forwarder`](src/tools/trace-forwarder) | utility | Agent tracing helper. | +| [`runk`](src/tools/runk) | utility | Standard OCI container runtime based on the agent. | | [`ci`](https://github.com/kata-containers/ci) | CI | Continuous Integration configuration files and scripts. | | [`katacontainers.io`](https://github.com/kata-containers/www.katacontainers.io) | Source for the [`katacontainers.io`](https://www.katacontainers.io) site. 
| diff --git a/src/tools/runk/.gitignore b/src/tools/runk/.gitignore new file mode 100644 index 000000000..57872d0f1 --- /dev/null +++ b/src/tools/runk/.gitignore @@ -0,0 +1 @@ +/vendor/ diff --git a/src/tools/runk/Cargo.lock b/src/tools/runk/Cargo.lock new file mode 100644 index 000000000..f8628cbf7 --- /dev/null +++ b/src/tools/runk/Cargo.lock @@ -0,0 +1,1474 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "aho-corasick" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +dependencies = [ + "memchr", +] + +[[package]] +name = "anyhow" +version = "1.0.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08f9b8508dccb7687a1d6c4ce66b2b0ecef467c94667de27d8d7fe1f8d2a9cdc" + +[[package]] +name = "arc-swap" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5d78ce20460b82d3fa150275ed9d55e21064fc7951177baacf86a145c4a4b1f" + +[[package]] +name = "async-trait" +version = "0.1.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed6aa3524a2dfcf9fe180c51eae2b58738348d819517ceadf95789c51fff7600" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "bytes" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "206fdffcfa2df7cbe15601ef46c813fce0965eb3286db6b56c583b814b51c81c" +dependencies = [ + "byteorder", + "iovec", +] + +[[package]] +name = "bytes" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" + +[[package]] +name = "capctl" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "526c6a8746a7cfb052c15d20259c4f5c021966affdc7c960c71ca640f824c801" +dependencies = [ + "bitflags", + "cfg-if 1.0.0", + "libc", +] + +[[package]] +name = "caps" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61bf7211aad104ce2769ec05efcdfabf85ee84ac92461d142f22cf8badd0e54c" +dependencies = [ + "errno", + "libc", + "thiserror", +] + +[[package]] +name = "cc" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" + +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + 
+[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "cgroups-rs" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdae996d9638ba03253ffa1c93345a585974a97abbdeab9176c77922f3efc1e8" +dependencies = [ + "libc", + "log", + "nix 0.23.1", + "regex", +] + +[[package]] +name = "chrono" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" +dependencies = [ + "libc", + "num-integer", + "num-traits", + "serde", + "time 0.1.44", + "winapi", +] + +[[package]] +name = "clap" +version = "3.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c167e37342afc5f33fd87bbc870cedd020d2a6dffa05d45ccd9241fbdd146db" +dependencies = [ + "atty", + "bitflags", + "clap_derive", + "clap_lex", + "indexmap", + "lazy_static", + "strsim", + "termcolor", + "textwrap", +] + +[[package]] +name = "clap_derive" +version = "3.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3aab4734e083b809aaf5794e14e756d1c798d2c69c7f7de7a09a2f5214993c1" +dependencies = [ + "heck 0.4.0", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "189ddd3b5d32a70b35e7686054371742a937b0d99128e76dde6340210e966669" +dependencies = [ + "os_str_bytes", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aaa7bd5fb665c6864b5f963dd9097905c54125909c7aa94c9e18507cdbe6c53" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf124c720b7686e3c2663cf54062ab0f68a88af2fb6a030e87e30bf721fcb38" +dependencies = [ + "cfg-if 1.0.0", + "lazy_static", +] + +[[package]] +name = "darling" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f2c43f534ea4b0b049015d00269734195e6d3f0f6635cb692251aca6f9f8b3c" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e91455b86830a1c21799d94524df0845183fa55bafd9aa137b01c7d1065fa36" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29b5acf0dea37a7f66f7b25d2c5e93fd46f8f6968b1a5d7a3e02e97768afc95a" +dependencies = [ + "darling_core", + "quote", + "syn", +] + +[[package]] +name = "derive-new" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3418329ca0ad70234b9735dc4ceed10af4df60eff9c8e7b06cb5e520d92c3535" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "derive_builder" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d13202debe11181040ae9063d739fa32cfcaaebe2275fe387703460ae2365b30" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = 
"0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66e616858f6187ed828df7c64a6d71720d83767a7f19740b2d1b6fe6327b36e5" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "derive_builder_macro" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58a94ace95092c5acb1e97a7e846b310cfbd499652f72297da7493f618a98d73" +dependencies = [ + "derive_builder_core", + "syn", +] + +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + +[[package]] +name = "errno" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +dependencies = [ + "errno-dragonfly", + "libc", + "winapi", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "fastrand" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3fcf0cee53519c866c09b5de1f6c56ff9d647101f81c1964fa632e148896cdf" +dependencies = [ + "instant", +] + +[[package]] +name = "fixedbitset" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "futures" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f73fe65f54d1e12b726f517d3e2135ca3125a437b6d998caf1962961f7172d9e" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3083ce4b914124575708913bca19bfe887522d6e2e6d0952943f5eac4a74010" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3" + +[[package]] +name = "futures-executor" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9420b90cfa29e327d0429f19be13e7ddb68fa1cccb09d65e5706b8c7a749b8a6" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b" + +[[package]] +name = "futures-macro" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"21163e139fa306126e6eedaf49ecdb4588f939600f0b1e770f4205ee4b7fa868" + +[[package]] +name = "futures-task" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c66a976bf5909d801bbef33416c41372779507e7a6b3a5e25e4749c58f776a" + +[[package]] +name = "futures-util" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "hashbrown" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" + +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "heck" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "indexmap" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f647032dfaa1f8b6dc29bd3edb7bbef4861b8b8007ebb118d6db284fd59f6ee" +dependencies = [ + "autocfg", + "hashbrown", +] + +[[package]] +name = "inotify" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" +dependencies = [ + "bitflags", + "futures-core", + "inotify-sys", + "libc", + "tokio", +] + +[[package]] +name = "inotify-sys" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" +dependencies = [ + "libc", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "iovec" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" +dependencies = [ + "libc", +] + +[[package]] +name = "itertools" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9a9d19fa1e79b6215ff29b9d6880b706147f16e9b1dbb1e4e5947b5b02bc5e3" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.124" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21a41fed9d98f27ab1c6d161da622a4fa35e8a54a8adc24bbf3ddd0ef70b0e50" + +[[package]] +name = "libcontainer" +version = "0.0.1" +dependencies = [ + "anyhow", + "chrono", + "derive_builder", + "libc", + "logging", + "nix 0.23.1", + "oci", + "rustjail", + "serde", + "serde_json", + "slog", + "tempfile", +] + +[[package]] +name = "liboci-cli" +version = "0.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c25aa929b6575121059b01bfd7289dedf25a988534b6550d3472ccd5064be8ce" +dependencies = [ + "clap", +] + +[[package]] +name = "lock_api" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "327fa5b6a6940e4699ec49a9beae1ea4845c6bab9314e4f84ac68742139d8c53" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6389c490849ff5bc16be905ae24bc913a9c8892e19b2341dbc175e14c341c2b8" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "logging" +version = "0.1.0" +dependencies = [ + "serde_json", + "slog", + "slog-async", + "slog-json", + "slog-scope", +] + +[[package]] +name = "memchr" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" + +[[package]] +name = "memoffset" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" +dependencies = [ + "autocfg", +] + +[[package]] +name = "mio" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52da4364ffb0e4fe33a9841a98a3f3014fb964045ce4f7a45a398243c8d6b0c9" +dependencies = [ + "libc", + "log", + "miow", + "ntapi", + "wasi 0.11.0+wasi-snapshot-preview1", + "winapi", +] + +[[package]] +name = "miow" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" +dependencies = [ + "winapi", +] + +[[package]] +name = "multimap" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" + +[[package]] +name = "nix" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd0eaf8df8bab402257e0a5c17a254e4cc1f72a93588a1ddfb5d356c801aa7cb" +dependencies = [ + "bitflags", + "cc", + "cfg-if 0.1.10", + "libc", + "void", +] + +[[package]] +name = "nix" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" +dependencies = [ + "bitflags", + "cc", + "cfg-if 1.0.0", + "libc", + "memoffset", +] + +[[package]] +name = "ntapi" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c28774a7fd2fbb4f0babd8237ce554b73af68021b5f695a3cebd6c59bac0980f" +dependencies = [ + "winapi", +] + +[[package]] +name = "num-integer" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +dependencies = [ + "autocfg", + "num-traits", +] + 
+[[package]] +name = "num-traits" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "num_threads" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aba1801fb138d8e85e11d0fc70baf4fe1cdfffda7c6cd34a854905df588e5ed0" +dependencies = [ + "libc", +] + +[[package]] +name = "oci" +version = "0.1.0" +dependencies = [ + "libc", + "serde", + "serde_derive", + "serde_json", +] + +[[package]] +name = "once_cell" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9" + +[[package]] +name = "os_str_bytes" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64" + +[[package]] +name = "parking_lot" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f5ec2493a61ac0506c0f4199f99070cbe83857b0337006a30f3e6719b8ef58" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "995f667a6c822200b0433ac218e05582f0e2efa1b922a3fd2fbaadc5f87bab37" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "redox_syscall", + "smallvec", + "windows-sys", +] + +[[package]] +name = "path-absolutize" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ceeedc827d9a758b4641457683ced2f02d4252cc1bd8794c415ed20256114290" +dependencies = [ + "path-dedot", + "slash-formatter", +] + +[[package]] +name = "path-dedot" +version = "1.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45c58ab1edb03f77d0bb3f08e4a179dd43ce9bc8eab9867ec53a78285ea3039b" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "petgraph" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "467d164a6de56270bd7c4d070df81d07beace25012d5103ced4e9ff08d6afdb7" +dependencies = [ + "fixedbitset", + "indexmap", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + 
+[[package]] +name = "proc-macro2" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec757218438d5fda206afc041538b2f6d889286160d649a86a24d37e1235afd1" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "prost" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de5e2533f59d08fcf364fd374ebda0692a70bd6d7e66ef97f306f45c6c5d8020" +dependencies = [ + "bytes 1.1.0", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "355f634b43cdd80724ee7848f95770e7e70eefa6dcf14fea676216573b8fd603" +dependencies = [ + "bytes 1.1.0", + "heck 0.3.3", + "itertools", + "log", + "multimap", + "petgraph", + "prost", + "prost-types", + "tempfile", + "which", +] + +[[package]] +name = "prost-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "600d2f334aa05acb02a755e217ef1ab6dea4d51b58b7846588b747edec04efba" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-types" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "603bbd6394701d13f3f25aada59c7de9d35a6a5887cfc156181234a44002771b" +dependencies = [ + "bytes 1.1.0", + "prost", +] + +[[package]] +name = "protobuf" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e86d370532557ae7573551a1ec8235a0f8d6cb276c7c9e6aa490b511c447485" +dependencies = [ + "serde", + "serde_derive", +] + +[[package]] +name = "protobuf-codegen" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de113bba758ccf2c1ef816b127c958001b7831136c9bc3f8e9ec695ac4e82b0c" +dependencies = [ + "protobuf", +] + +[[package]] +name = "protobuf-codegen-pure" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d1a4febc73bf0cada1d77c459a0c8e5973179f1cfd5b0f1ab789d45b17b6440" +dependencies = [ + "protobuf", + "protobuf-codegen", +] + +[[package]] +name = "protocols" +version = "0.1.0" +dependencies = [ + "async-trait", + "protobuf", + "ttrpc", + "ttrpc-codegen", +] + +[[package]] +name = "quote" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1feb54ed693b93a84e14094943b84b7c4eae204c512b7ccb95ab0c66d278ad1" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "redox_syscall" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62f25bc4c7e55e0b0b7a1d43fb893f4fa1361d0abe38b9ce4f323c2adfe6ef42" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" + +[[package]] +name = "remove_dir_all" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" +dependencies = [ + "winapi", +] + +[[package]] +name = "rlimit" +version = "0.5.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "81a9ed03edbed449d6897c2092c71ab5f7b5fb80f6f0b1a3ed6d40a6f9fc0720" +dependencies = [ + "libc", +] + +[[package]] +name = "runk" +version = "0.0.1" +dependencies = [ + "anyhow", + "chrono", + "clap", + "libc", + "libcontainer", + "liboci-cli", + "logging", + "nix 0.23.1", + "oci", + "rustjail", + "serde", + "serde_json", + "slog", + "slog-async", + "tokio", +] + +[[package]] +name = "rustjail" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "capctl", + "caps", + "cfg-if 0.1.10", + "cgroups-rs", + "futures", + "inotify", + "lazy_static", + "libc", + "nix 0.23.1", + "oci", + "path-absolutize", + "protobuf", + "protocols", + "regex", + "rlimit", + "scan_fmt", + "scopeguard", + "serde", + "serde_derive", + "serde_json", + "slog", + "slog-scope", + "tokio", +] + +[[package]] +name = "ryu" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f" + +[[package]] +name = "scan_fmt" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b53b0a5db882a8e2fdaae0a43f7b39e7e9082389e978398bdf223a55b581248" +dependencies = [ + "regex", +] + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "serde" +version = "1.0.136" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce31e24b01e1e524df96f1c2fdd054405f8d7376249a5110886fb4b658484789" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.136" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08597e7152fcd306f41838ed3e37be9eaeed2b61c42e2117266a554fab4662f9" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e8d9fa5c3b304765ce1fd9c4c8a3de2c8db365a5b91be52f186efc675681d95" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" +dependencies = [ + "libc", +] + +[[package]] +name = "slab" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb703cfe953bccee95685111adeedb76fabe4e97549a58d16f03ea7b9367bb32" + +[[package]] +name = "slash-formatter" +version = "2.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f7fb98e76e2022054673f3ebc43a4e12890ec6272530629df6237cafbb70569" + +[[package]] +name = "slog" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8347046d4ebd943127157b94d63abb990fcf729dc4e9978927fdf4ac3c998d06" + +[[package]] +name = "slog-async" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "766c59b252e62a34651412870ff55d8c4e6d04df19b43eecb2703e417b097ffe" +dependencies = [ + "crossbeam-channel", + "slog", + "take_mut", + "thread_local", +] + +[[package]] +name = "slog-json" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e1e53f61af1e3c8b852eef0a9dee29008f55d6dd63794f3f12cef786cf0f219" 
+dependencies = [ + "serde", + "serde_json", + "slog", + "time 0.3.9", +] + +[[package]] +name = "slog-scope" +version = "4.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f95a4b4c3274cd2869549da82b57ccc930859bdbf5bcea0424bc5f140b3c786" +dependencies = [ + "arc-swap", + "lazy_static", + "slog", +] + +[[package]] +name = "smallvec" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" + +[[package]] +name = "socket2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66d72b759436ae32898a2af0a14218dbf55efde3feeb170eb623637db85ee1e0" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "syn" +version = "1.0.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b683b2b825c8eef438b77c36a06dc262294da3d5a5813fac20da149241dcd44d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "take_mut" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60" + +[[package]] +name = "tempfile" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" +dependencies = [ + "cfg-if 1.0.0", + "fastrand", + "libc", + "redox_syscall", + "remove_dir_all", + "winapi", +] + +[[package]] +name = "termcolor" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "textwrap" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb" + +[[package]] +name = "thiserror" +version = "1.0.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thread_local" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5516c27b78311c50bf42c071425c560ac799b11c30b31f87e3081965fe5e0180" +dependencies = [ + "once_cell", +] + +[[package]] +name = "time" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" +dependencies = [ + "libc", + "wasi 0.10.0+wasi-snapshot-preview1", + "winapi", +] + +[[package]] +name = "time" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2702e08a7a860f005826c6815dcac101b19b5eb330c27fe4a5928fec1d20ddd" +dependencies = [ + "itoa", + "libc", + "num_threads", +] + +[[package]] +name = "tokio" +version = "1.17.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2af73ac49756f3f7c01172e34a23e5d0216f6c32333757c2c61feb2bbff5a5ee" +dependencies = [ + "bytes 1.1.0", + "libc", + "memchr", + "mio", + "num_cpus", + "once_cell", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "winapi", +] + +[[package]] +name = "tokio-macros" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b557f72f448c511a979e2564e55d74e6c4432fc96ff4f6241bc6bded342643b7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-vsock" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e0723fc001950a3b018947b05eeb45014fd2b7c6e8f292502193ab74486bdb6" +dependencies = [ + "bytes 0.4.12", + "futures", + "libc", + "tokio", + "vsock", +] + +[[package]] +name = "ttrpc" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "004604e91de38bc16cb9c7898187343075388ea414ad24896a21fc4e91a7c861" +dependencies = [ + "async-trait", + "byteorder", + "futures", + "libc", + "log", + "nix 0.16.1", + "protobuf", + "protobuf-codegen-pure", + "thiserror", + "tokio", + "tokio-vsock", +] + +[[package]] +name = "ttrpc-codegen" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809eda4e459820237104e4b61d6b41bbe6c9e1ce6adf4057955e6e6722a90408" +dependencies = [ + "protobuf", + "protobuf-codegen", + "protobuf-codegen-pure", + "ttrpc-compiler", +] + +[[package]] +name = "ttrpc-compiler" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2978ed3fa047d8fd55cbeb4d4a61d461fb3021a90c9618519c73ce7e5bb66c15" +dependencies = [ + "derive-new", + "prost", + "prost-build", + "prost-types", + "protobuf", + "protobuf-codegen", + "tempfile", +] + +[[package]] +name = "unicode-segmentation" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99" + +[[package]] +name = "unicode-xid" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" + +[[package]] +name = "vsock" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e32675ee2b3ce5df274c0ab52d19b28789632406277ca26bffee79a8e27dc133" +dependencies = [ + "libc", + "nix 0.23.1", +] + +[[package]] +name = "wasi" +version = "0.10.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "which" +version = "4.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5c4fb54e6113b6a8772ee41c3404fb0301ac79604489467e0a9ce1f3e97c24ae" +dependencies = [ + "either", + "lazy_static", + "libc", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5acdd78cb4ba54c0045ac14f62d8f94a03d10047904ae2a40afa1e99d8f70825" +dependencies = [ + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_msvc" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17cffbe740121affb56fad0fc0e421804adf0ae00891205213b5cecd30db881d" + +[[package]] +name = "windows_i686_gnu" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2564fde759adb79129d9b4f54be42b32c89970c18ebf93124ca8870a498688ed" + +[[package]] +name = "windows_i686_msvc" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cd9d32ba70453522332c14d38814bceeb747d80b3958676007acadd7e166956" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfce6deae227ee8d356d19effc141a509cc503dfd1f850622ec4b0f84428e1f4" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d19538ccc21819d01deaf88d6a17eae6596a12e9aafdbb97916fb49896d89de9" diff --git a/src/tools/runk/Cargo.toml b/src/tools/runk/Cargo.toml new file mode 100644 index 000000000..83aca82ca --- /dev/null +++ b/src/tools/runk/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "runk" +version = "0.0.1" +authors = ["The Kata Containers community "] +description = "runk: Kata OCI container runtime based on Kata agent" +license = "Apache-2.0" +edition = "2018" + +[dependencies] +libcontainer = { path = "./libcontainer" } +rustjail = { path = "../../agent/rustjail", features = ["standard-oci-runtime"] } +oci = { path = "../../libs/oci" } +logging = { path = "../../libs/logging" } +liboci-cli = "0.0.3" +clap = { version = "3.0.6", features = ["derive", "cargo"] } +libc = "0.2.108" +nix = "0.23.0" +anyhow = "1.0.52" +slog = "2.7.0" +chrono = { version = "0.4.19", features = ["serde"] } +slog-async = "2.7.0" +tokio = { version = "1.15.0", features = ["full"] } +serde = { version = "1.0.133", features = ["derive"] } +serde_json = "1.0.74" + +[workspace] +members = [ + "libcontainer" +] diff --git a/src/tools/runk/Makefile b/src/tools/runk/Makefile new file 
mode 100644 index 000000000..cfd795fb6 --- /dev/null +++ b/src/tools/runk/Makefile @@ -0,0 +1,60 @@ +# Copyright 2021-2022 Sony Group Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +include ../../../utils.mk + +TARGET = runk +TARGET_PATH = target/$(TRIPLE)/$(BUILD_TYPE)/$(TARGET) + +AGENT_TARGET = oci-kata-agent +AGENT_TARGET_PATH = target/$(TRIPLE)/$(BUILD_TYPE)/$(AGENT_TARGET) +AGENT_SOURCE_PATH = ../../agent + +# BINDIR is a directory for installing executable programs +BINDIR := /usr/local/bin + +.DEFAULT_GOAL := default +default: build + +build: build-agent build-runk + +build-agent: + make -C $(AGENT_SOURCE_PATH) STANDARD_OCI_RUNTIME=yes + +build-runk: + @RUSTFLAGS="$(EXTRA_RUSTFLAGS) --deny warnings" cargo build --target $(TRIPLE) --$(BUILD_TYPE) + +install: install-agent install-runk + +install-agent: + install -D $(AGENT_SOURCE_PATH)/$(AGENT_TARGET_PATH) $(BINDIR)/$(AGENT_TARGET) + +install-runk: + install -D $(TARGET_PATH) $(BINDIR)/$(TARGET) + +clean: + cargo clean + +vendor: + cargo vendor + +test: + cargo test --all --target $(TRIPLE) -- --nocapture + +check: standard_rust_check + +.PHONY: \ + build \ + build-agent \ + build-runk \ + install \ + install-agent \ + install-runk \ + clean \ + clippy \ + format \ + vendor \ + test \ + check \ diff --git a/src/tools/runk/README.md b/src/tools/runk/README.md new file mode 100644 index 000000000..81f02fc59 --- /dev/null +++ b/src/tools/runk/README.md @@ -0,0 +1,282 @@ +# runk + +## Overview + +> **Warnings:** +> `runk` is currently an experimental tool. +> Only continue if you are using a non-critical system. + +`runk` is a standard OCI container runtime written in Rust based on a modified version of +the [Kata Container agent](https://github.com/kata-containers/kata-containers/tree/main/src/agent), `kata-agent`. + +`runk` conforms to the [OCI Container Runtime specifications](https://github.com/opencontainers/runtime-spec). + +Unlike the [Kata Container runtime](https://github.com/kata-containers/kata-containers/tree/main/src/agent#features), +`kata-runtime`, `runk` spawns and runs containers on the host machine directly. +The user can run `runk` in the same way as the existing container runtimes such as `runc`, +the most used implementation of the OCI runtime specs. + +## Why does `runk` exist? + +The `kata-agent` is a process running inside a virtual machine (VM) as a supervisor for managing containers +and processes running within those containers. +In other words, the `kata-agent` is a kind of "low-level" container runtime inside VM because the agent +spawns and runs containers according to the OCI runtime specs. +However, the `kata-agent` does not have the OCI Command-Line Interface (CLI) that is defined in the +[runtime spec](https://github.com/opencontainers/runtime-spec/blob/master/runtime.md). +The `kata-runtime` provides the CLI part of the Kata Containers runtime component, +but the `kata-runtime` is a container runtime for creating hardware-virtualized containers running on the host. + +`runk` is a Rust-based standard OCI container runtime that manages normal containers, +not hardware-virtualized containers. +`runk` aims to become one of the alternatives to existing OCI compliant container runtimes. +The `kata-agent` has most of the [features](https://github.com/kata-containers/kata-containers/tree/main/src/agent#features) +needed for the container runtime and delivers high performance with a low memory footprint owing to the +implementation by Rust language. 
+Therefore, `runk` leverages the mechanism of the `kata-agent` to avoid reinventing the wheel.
+
+## Performance
+
+`runk` is faster than `runc` and has a lower memory footprint.
+
+This table shows the average elapsed time and memory footprint (maximum resident set size)
+for sequentially running 100 containers. Each container runs `/bin/true` using the `run` command in
+[detached mode](https://github.com/opencontainers/runc/blob/master/docs/terminals.md#detached)
+on 12 CPU cores (`3.8 GHz AMD Ryzen 9 3900X`) and 32 GiB of RAM.
+`runk` currently always runs containers in detached mode.
+
+Evaluation Results:
+
+| | `runk` (v0.0.1) | `runc` (v1.0.3) | `crun` (v1.4.2) |
+|-----------------------|---------------|---------------|---------------|
+| time [ms] | 39.83 | 50.39 | 38.41 |
+| memory footprint [MB] | 4.013 | 10.78 | 1.738 |
+
+## Status of `runk`
+
+We drafted the initial code here, and any contributions to `runk` and [`kata-agent`](https://github.com/kata-containers/kata-containers/tree/main/src/agent)
+are welcome.
+
+Regarding features compared to `runc`, see the `Status of runk` section in the [issue](https://github.com/kata-containers/kata-containers/issues/2784).
+
+## Building
+
+`runk` uses the modified `kata-agent` binary, `oci-kata-agent`, which is the agent called from `runk`.
+Therefore, you also need to build the `oci-kata-agent` to run `runk`.
+
+You can build both `runk` and `oci-kata-agent` as follows.
+
+```bash
+$ cd runk
+$ make
+```
+
+To install `runk` and `oci-kata-agent` into the default directory for executable programs (`/usr/local/bin`):
+
+```bash
+$ sudo make install
+```
+
+## Using `runk` directly
+
+Please note that `runk` is a low-level tool not developed with an end user in mind.
+It is mostly employed by other higher-level container software like `containerd`.
+
+If you still want to use `runk` directly, here's how.
+
+### Prerequisites
+
+It is necessary to create an OCI bundle to use the tool. The simplest method is:
+
+``` bash
+$ bundle_dir="bundle"
+$ rootfs_dir="$bundle_dir/rootfs"
+$ image="busybox"
+$ mkdir -p "$rootfs_dir" && (cd "$bundle_dir" && runk spec)
+$ sudo docker export $(sudo docker create "$image") | tar -C "$rootfs_dir" -xvf -
+```
+
+> **Note:**
+> If you use the unmodified `runk spec` template, this should give a `sh` session inside the container.
+> However, if you use `runk` directly and run a container with the unmodified template,
+> `runk` cannot launch the `sh` session because `runk` does not support terminal handling yet.
+> You need to edit the process field in `config.json` so that it looks like the example below,
+> with `"terminal": false` and `"args": ["sleep", "10"]`.
+
+```json
+"process": {
+    "terminal": false,
+    "user": {
+        "uid": 0,
+        "gid": 0
+    },
+    "args": [
+        "sleep",
+        "10"
+    ],
+    "env": [
+        "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+        "TERM=xterm"
+    ],
+    "cwd": "/",
+    [...]
+}
+```
+
+If you want to launch the `sh` session inside the container, you need to run `runk` from `containerd`.
+
+Please refer to the [Using `runk` from containerd](#using-runk-from-containerd) section.
+
+### Running a container
+
+Now you can go through the [lifecycle operations](https://github.com/opencontainers/runtime-spec/blob/master/runtime.md)
+in your shell.
+You need to run `runk` as `root` because `runk` does not have the rootless feature, which is the ability
+to run containers without root privileges.
+ +```bash +$ cd $bundle_dir + +# Create a container +$ sudo runk create test + +# View the container is created and in the "created" state +$ sudo runk state test + +# Start the process inside the container +$ sudo runk start test + +# After 10 seconds view that the container has exited and is now in the "stopped" state +$ sudo runk state test + +# Now delete the container +$ sudo runk delete test +``` + +## Using `runk` from `containerd` + +`runk` can run containers with the containerd runtime handler support on `containerd`. + +### Prerequisites for `runk` with containerd + +* `containerd` v1.2.4 or above +* `cri-tool` + +> **Note:** +> [`cri-tools`](https://github.com/kubernetes-sigs/cri-tools) is a set of tools for CRI +> used for development and testing. + +Install `cri-tools` from source code: + +```bash +$ go get github.com/kubernetes-incubator/cri-tools +$ pushd $GOPATH/src/github.com/kubernetes-incubator/cri-tools +$ make +$ sudo -E make install +$ popd +``` + +Write the `crictl` configuration file: + +``` bash +$ cat <"] +description = "Library for runk container" +license = "Apache-2.0" +edition = "2018" + +[dependencies] +rustjail = { path = "../../../agent/rustjail", features = ["standard-oci-runtime"] } +oci = { path = "../../../libs/oci" } +logging = { path = "../../../libs/logging" } +derive_builder = "0.10.2" +libc = "0.2.108" +nix = "0.23.0" +anyhow = "1.0.52" +slog = "2.7.0" +chrono = { version = "0.4.19", features = ["serde"] } +serde = { version = "1.0.133", features = ["derive"] } +serde_json = "1.0.74" + +[dev-dependencies] +tempfile = "3.3.0" diff --git a/src/tools/runk/libcontainer/src/builder.rs b/src/tools/runk/libcontainer/src/builder.rs new file mode 100644 index 000000000..738c639ae --- /dev/null +++ b/src/tools/runk/libcontainer/src/builder.rs @@ -0,0 +1,121 @@ +// Copyright 2021-2022 Sony Group Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use crate::container::{get_config_path, ContainerContext}; +use anyhow::{anyhow, Result}; +use derive_builder::Builder; +use oci::Spec; +use std::path::{Path, PathBuf}; + +#[derive(Default, Builder, Debug)] +pub struct Container { + id: String, + bundle: PathBuf, + root: PathBuf, + console_socket: Option, +} + +impl Container { + pub fn create_ctx(self) -> Result { + let bundle_canon = self.bundle.canonicalize()?; + let config_path = get_config_path(&bundle_canon); + let mut spec = Spec::load( + config_path + .to_str() + .ok_or_else(|| anyhow!("invalid config path"))?, + )?; + + if spec.root.is_some() { + let mut spec_root = spec + .root + .as_mut() + .ok_or_else(|| anyhow!("root config was not present in the spec file"))?; + let rootfs_path = Path::new(&spec_root.path); + + // If the rootfs path in the spec file is a relative path, + // convert it into a canonical path to pass validation of rootfs in the agent. + if !&rootfs_path.is_absolute() { + let rootfs_name = rootfs_path + .file_name() + .ok_or_else(|| anyhow!("invalid rootfs name"))?; + spec_root.path = bundle_canon + .join(rootfs_name) + .to_str() + .map(|s| s.to_string()) + .ok_or_else(|| anyhow!("failed to convert bundle path"))?; + } + } + + Ok(ContainerContext { + id: self.id, + bundle: self.bundle, + state_root: self.root, + spec, + // TODO: liboci-cli does not support --no-pivot option for create and run command. + // After liboci-cli supports the option, we will change the following code. 
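+            // Until then, `no_pivot_root` is hard-coded to `false` below, i.e. pivot_root
+            // rather than chroot is used when entering the container rootfs.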
+ // no_pivot_root: self.no_pivot, + no_pivot_root: false, + console_socket: self.console_socket, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::container::CONFIG_FILE_NAME; + use oci::Spec; + use std::{fs::File, path::PathBuf}; + use tempfile::tempdir; + + #[derive(Debug)] + struct TestData { + id: String, + bundle: PathBuf, + root: PathBuf, + console_socket: Option, + spec: Spec, + no_pivot_root: bool, + } + + #[test] + fn test_create_ctx() { + let bundle_dir = tempdir().unwrap(); + let config_file = bundle_dir.path().join(CONFIG_FILE_NAME); + let spec = Spec::default(); + let file = File::create(config_file).unwrap(); + serde_json::to_writer(&file, &spec).unwrap(); + + let test_data = TestData { + id: String::from("test"), + bundle: PathBuf::from(bundle_dir.into_path()), + root: PathBuf::from("test"), + console_socket: Some(PathBuf::from("test")), + spec: Spec::default(), + no_pivot_root: false, + }; + + let test_ctx = ContainerContext { + id: test_data.id.clone(), + bundle: test_data.bundle.clone(), + state_root: test_data.root.clone(), + spec: test_data.spec.clone(), + no_pivot_root: test_data.no_pivot_root, + console_socket: test_data.console_socket.clone(), + }; + + let ctx = ContainerBuilder::default() + .id(test_data.id.clone()) + .bundle(test_data.bundle.clone()) + .root(test_data.root.clone()) + .console_socket(test_data.console_socket.clone()) + .build() + .unwrap() + .create_ctx() + .unwrap(); + + assert_eq!(test_ctx, ctx); + } +} diff --git a/src/tools/runk/libcontainer/src/cgroup.rs b/src/tools/runk/libcontainer/src/cgroup.rs new file mode 100644 index 000000000..9b53bb368 --- /dev/null +++ b/src/tools/runk/libcontainer/src/cgroup.rs @@ -0,0 +1,40 @@ +// Copyright 2021-2022 Sony Group Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{anyhow, Result}; +use rustjail::cgroups::fs::Manager as CgroupManager; +use std::{ + path::Path, + {fs, thread, time}, +}; + +pub fn destroy_cgroup(cgroup_mg: &CgroupManager) -> Result<()> { + for path in cgroup_mg.paths.values() { + remove_cgroup_dir(Path::new(path))?; + } + + Ok(()) +} + +// Try to remove the provided cgroups path five times with increasing delay between tries. +// If after all there are not removed cgroups, an appropriate error will be returned. 
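+// Concretely: the first attempt is immediate, then the 10 ms delay doubles before each retry,
+// giving sleeps of 20, 40, 80 and 160 ms (about 300 ms in total) before the error is returned.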
+fn remove_cgroup_dir(path: &Path) -> Result<()> { + let mut retries = 5; + let mut delay = time::Duration::from_millis(10); + while retries != 0 { + if retries != 5 { + delay *= 2; + thread::sleep(delay); + } + + if !path.exists() || fs::remove_dir(path).is_ok() { + return Ok(()); + } + + retries -= 1; + } + + return Err(anyhow!("failed to remove cgroups paths: {:?}", path)); +} diff --git a/src/tools/runk/libcontainer/src/container.rs b/src/tools/runk/libcontainer/src/container.rs new file mode 100644 index 000000000..d5464e923 --- /dev/null +++ b/src/tools/runk/libcontainer/src/container.rs @@ -0,0 +1,151 @@ +// Copyright 2021-2022 Sony Group Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use crate::status::Status; +use anyhow::{anyhow, Result}; +use nix::unistd::{chdir, unlink, Pid}; +use oci::Spec; +use rustjail::{ + container::{BaseContainer, LinuxContainer, EXEC_FIFO_FILENAME}, + process::Process, + specconv::CreateOpts, +}; +use slog::Logger; +use std::{ + env::current_dir, + path::{Path, PathBuf}, +}; + +pub const CONFIG_FILE_NAME: &str = "config.json"; + +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum ContainerAction { + Create, + Run, +} + +#[derive(Debug, Clone, PartialEq)] +pub struct ContainerContext { + pub id: String, + pub bundle: PathBuf, + pub state_root: PathBuf, + pub spec: Spec, + pub no_pivot_root: bool, + pub console_socket: Option, +} + +impl ContainerContext { + pub async fn launch(&self, action: ContainerAction, logger: &Logger) -> Result { + Status::create_dir(&self.state_root, &self.id)?; + + let current_dir = current_dir()?; + chdir(&self.bundle)?; + + let create_opts = CreateOpts { + cgroup_name: "".to_string(), + use_systemd_cgroup: false, + no_pivot_root: self.no_pivot_root, + no_new_keyring: false, + spec: Some(self.spec.clone()), + rootless_euid: false, + rootless_cgroup: false, + }; + + let mut ctr = LinuxContainer::new( + &self.id, + &self + .state_root + .to_str() + .map(|s| s.to_string()) + .ok_or_else(|| anyhow!("failed to convert bundle path"))?, + create_opts.clone(), + logger, + )?; + + let process = if self.spec.process.is_some() { + Process::new( + logger, + self.spec + .process + .as_ref() + .ok_or_else(|| anyhow!("process config was not present in the spec file"))?, + &self.id, + true, + 0, + )? 
+ } else { + return Err(anyhow!("no process configuration")); + }; + + if let Some(ref csocket_path) = self.console_socket { + ctr.set_console_socket(csocket_path)?; + } + + match action { + ContainerAction::Create => { + ctr.start(process).await?; + } + ContainerAction::Run => { + ctr.run(process).await?; + } + } + + let oci_state = ctr.oci_state()?; + let status = Status::new( + &self.state_root, + oci_state, + ctr.init_process_start_time, + ctr.created, + ctr.cgroup_manager + .ok_or_else(|| anyhow!("cgroup manager was not present"))?, + create_opts, + )?; + + status.save()?; + + if action == ContainerAction::Run { + let fifo_path = get_fifo_path(&status); + if fifo_path.exists() { + unlink(&fifo_path)?; + } + } + + chdir(¤t_dir)?; + + Ok(Pid::from_raw(ctr.init_process_pid)) + } +} + +pub fn get_config_path>(bundle: P) -> PathBuf { + bundle.as_ref().join(CONFIG_FILE_NAME) +} + +pub fn get_fifo_path(status: &Status) -> PathBuf { + status.root.join(&status.id).join(EXEC_FIFO_FILENAME) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::test_utils::*; + use rustjail::container::EXEC_FIFO_FILENAME; + use std::path::PathBuf; + + #[test] + fn test_get_config_path() { + let test_data = PathBuf::from(TEST_BUNDLE_PATH).join(CONFIG_FILE_NAME); + assert_eq!(get_config_path(TEST_BUNDLE_PATH), test_data); + } + + #[test] + fn test_get_fifo_path() { + let test_data = PathBuf::from(TEST_BUNDLE_PATH) + .join(TEST_CONTAINER_ID) + .join(EXEC_FIFO_FILENAME); + let status = create_dummy_status(); + + assert_eq!(get_fifo_path(&status), test_data); + } +} diff --git a/src/tools/runk/libcontainer/src/lib.rs b/src/tools/runk/libcontainer/src/lib.rs new file mode 100644 index 000000000..49dae4694 --- /dev/null +++ b/src/tools/runk/libcontainer/src/lib.rs @@ -0,0 +1,10 @@ +// Copyright 2021-2022 Sony Group Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +pub mod builder; +pub mod cgroup; +pub mod container; +pub mod status; +pub mod utils; diff --git a/src/tools/runk/libcontainer/src/status.rs b/src/tools/runk/libcontainer/src/status.rs new file mode 100644 index 000000000..21ba00cda --- /dev/null +++ b/src/tools/runk/libcontainer/src/status.rs @@ -0,0 +1,246 @@ +// Copyright 2021-2022 Sony Group Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use crate::container::get_fifo_path; +use crate::utils::*; +use anyhow::{anyhow, Result}; +use chrono::{DateTime, Utc}; +use libc::pid_t; +use nix::{ + errno::Errno, + sys::{signal::kill, stat::Mode}, + unistd::Pid, +}; +use oci::{ContainerState, State as OCIState}; +use rustjail::{cgroups::fs::Manager as CgroupManager, specconv::CreateOpts}; +use serde::{Deserialize, Serialize}; +use std::{ + fs::{self, File, OpenOptions}, + path::{Path, PathBuf}, + time::SystemTime, +}; + +const STATUS_FILE: &str = "status.json"; + +#[derive(Serialize, Deserialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct Status { + pub oci_version: String, + pub id: String, + pub pid: pid_t, + pub root: PathBuf, + pub bundle: PathBuf, + pub rootfs: String, + pub process_start_time: u64, + pub created: DateTime, + pub cgroup_manager: CgroupManager, + pub config: CreateOpts, +} + +impl Status { + pub fn new( + root: &Path, + oci_state: OCIState, + process_start_time: u64, + created_time: SystemTime, + cgroup_mg: CgroupManager, + config: CreateOpts, + ) -> Result { + let created = DateTime::from(created_time); + let rootfs = config + .clone() + .spec + .ok_or_else(|| anyhow!("spec config was not present"))? 
+ .root + .as_ref() + .ok_or_else(|| anyhow!("root config was not present in the spec"))? + .path + .clone(); + + Ok(Self { + oci_version: oci_state.version, + id: oci_state.id, + pid: oci_state.pid, + root: root.to_path_buf(), + bundle: PathBuf::from(&oci_state.bundle), + rootfs, + process_start_time, + created, + cgroup_manager: cgroup_mg, + config, + }) + } + + pub fn save(&self) -> Result<()> { + let state_file_path = Self::get_file_path(&self.root, &self.id); + + if !&self.root.exists() { + create_dir_with_mode(&self.root, Mode::S_IRWXU, true)?; + } + + let file = OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(state_file_path)?; + + serde_json::to_writer(&file, self)?; + + Ok(()) + } + + pub fn load(state_root: &Path, id: &str) -> Result { + let state_file_path = Self::get_file_path(state_root, id); + if !state_file_path.exists() { + return Err(anyhow!("container \"{}\" does not exist", id)); + } + + let file = File::open(&state_file_path)?; + let state: Self = serde_json::from_reader(&file)?; + + Ok(state) + } + + pub fn create_dir(state_root: &Path, id: &str) -> Result<()> { + let state_dir_path = Self::get_dir_path(state_root, id); + if !state_dir_path.exists() { + create_dir_with_mode(state_dir_path, Mode::S_IRWXU, true)?; + } else { + return Err(anyhow!("container with id exists: \"{}\"", id)); + } + + Ok(()) + } + + pub fn remove_dir(&self) -> Result<()> { + let state_dir_path = Self::get_dir_path(&self.root, &self.id); + fs::remove_dir_all(state_dir_path)?; + + Ok(()) + } + + pub fn get_dir_path(state_root: &Path, id: &str) -> PathBuf { + state_root.join(id) + } + + pub fn get_file_path(state_root: &Path, id: &str) -> PathBuf { + state_root.join(id).join(STATUS_FILE) + } +} + +pub fn is_process_running(pid: Pid) -> Result { + match kill(pid, None) { + Err(errno) => { + if errno != Errno::ESRCH { + return Err(anyhow!("no such process")); + } + Ok(false) + } + Ok(()) => Ok(true), + } +} + +pub fn get_current_container_state(status: &Status) -> Result { + let running = is_process_running(Pid::from_raw(status.pid))?; + let mut has_fifo = false; + + if running { + let fifo = get_fifo_path(status); + if fifo.exists() { + has_fifo = true + } + } + + if running && !has_fifo { + // TODO: Check paused status. + // runk does not support pause command currently. + } + + if !running { + Ok(ContainerState::Stopped) + } else if has_fifo { + Ok(ContainerState::Created) + } else { + Ok(ContainerState::Running) + } +} + +pub fn get_all_pid(cgm: &CgroupManager) -> Result> { + let cgroup_path = cgm.paths.get("devices"); + match cgroup_path { + Some(v) => { + let path = Path::new(v); + if !path.exists() { + return Err(anyhow!("cgroup devices file does not exist")); + } + + let procs_path = path.join("cgroup.procs"); + let pids: Vec = lines_from_file(&procs_path)? 
+ .into_iter() + .map(|v| { + Pid::from_raw( + v.parse::() + .expect("failed to parse string into pid_t"), + ) + }) + .collect(); + Ok(pids) + } + None => Err(anyhow!("cgroup devices file dose not exist")), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::test_utils::*; + use chrono::{DateTime, Utc}; + use nix::unistd::getpid; + use oci::ContainerState; + use rustjail::cgroups::fs::Manager as CgroupManager; + use std::path::Path; + use std::time::SystemTime; + + #[test] + fn test_status() { + let cgm: CgroupManager = serde_json::from_str(TEST_CGM_DATA).unwrap(); + let oci_state = create_dummy_oci_state(); + let created = SystemTime::now(); + let status = Status::new( + Path::new(TEST_BUNDLE_PATH), + oci_state.clone(), + 1, + created, + cgm, + create_dummy_opts(), + ) + .unwrap(); + + assert_eq!(status.id, oci_state.id); + assert_eq!(status.pid, oci_state.pid); + assert_eq!(status.process_start_time, 1); + assert_eq!(status.created, DateTime::::from(created)); + } + + #[test] + fn test_is_process_running() { + let pid = getpid(); + let ret = is_process_running(pid).unwrap(); + assert!(ret); + } + + #[test] + fn test_get_current_container_state() { + let status = create_dummy_status(); + let state = get_current_container_state(&status).unwrap(); + assert_eq!(state, ContainerState::Running); + } + + #[test] + fn test_get_all_pid() { + let cgm: CgroupManager = serde_json::from_str(TEST_CGM_DATA).unwrap(); + assert!(get_all_pid(&cgm).is_ok()); + } +} diff --git a/src/tools/runk/libcontainer/src/utils.rs b/src/tools/runk/libcontainer/src/utils.rs new file mode 100644 index 000000000..5a356d7c2 --- /dev/null +++ b/src/tools/runk/libcontainer/src/utils.rs @@ -0,0 +1,106 @@ +// Copyright 2021-2022 Sony Group Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{anyhow, Result}; +use nix::sys::stat::Mode; +use std::{ + fs::{DirBuilder, File}, + io::{prelude::*, BufReader}, + os::unix::fs::DirBuilderExt, + path::Path, +}; + +pub fn lines_from_file>(path: P) -> Result> { + let file = File::open(&path)?; + let buf = BufReader::new(file); + Ok(buf + .lines() + .map(|v| v.expect("could not parse line")) + .collect()) +} + +pub fn create_dir_with_mode>(path: P, mode: Mode, recursive: bool) -> Result<()> { + let path = path.as_ref(); + if path.exists() { + return Err(anyhow!("{} already exists", path.display())); + } + + Ok(DirBuilder::new() + .recursive(recursive) + .mode(mode.bits()) + .create(path)?) 
+} + +#[cfg(test)] +pub(crate) mod test_utils { + use crate::status::Status; + use nix::unistd::getpid; + use oci::State as OCIState; + use oci::{ContainerState, Root, Spec}; + use rustjail::cgroups::fs::Manager as CgroupManager; + use rustjail::specconv::CreateOpts; + use std::path::Path; + use std::time::SystemTime; + + pub const TEST_CONTAINER_ID: &str = "test"; + pub const TEST_BUNDLE_PATH: &str = "/test"; + pub const TEST_ANNOTATION: &str = "test"; + pub const TEST_CGM_DATA: &str = r#"{ + "paths": { + "devices": "/sys/fs/cgroup/devices" + }, + "mounts": { + "devices": "/sys/fs/cgroup/devices" + }, + "cpath": "test" + }"#; + + pub fn create_dummy_opts() -> CreateOpts { + let spec = Spec { + root: Some(Root::default()), + ..Default::default() + }; + CreateOpts { + cgroup_name: "".to_string(), + use_systemd_cgroup: false, + no_pivot_root: false, + no_new_keyring: false, + spec: Some(spec), + rootless_euid: false, + rootless_cgroup: false, + } + } + + pub fn create_dummy_oci_state() -> OCIState { + OCIState { + version: "1.0.0".to_string(), + id: TEST_CONTAINER_ID.to_string(), + status: ContainerState::Running, + pid: getpid().as_raw(), + bundle: TEST_BUNDLE_PATH.to_string(), + annotations: [(TEST_ANNOTATION.to_string(), TEST_ANNOTATION.to_string())] + .iter() + .cloned() + .collect(), + } + } + + pub fn create_dummy_status() -> Status { + let cgm: CgroupManager = serde_json::from_str(TEST_CGM_DATA).unwrap(); + let oci_state = create_dummy_oci_state(); + let created = SystemTime::now(); + let status = Status::new( + Path::new(TEST_BUNDLE_PATH), + oci_state.clone(), + 1, + created, + cgm, + create_dummy_opts(), + ) + .unwrap(); + + status + } +} diff --git a/src/tools/runk/src/commands/create.rs b/src/tools/runk/src/commands/create.rs new file mode 100644 index 000000000..b29d5b914 --- /dev/null +++ b/src/tools/runk/src/commands/create.rs @@ -0,0 +1,37 @@ +// Copyright 2021-2022 Sony Group Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::Result; +use libcontainer::{builder::ContainerBuilder, container::ContainerAction}; +use liboci_cli::Create; +use nix::unistd::Pid; +use slog::{info, Logger}; +use std::{fs, path::Path}; + +pub async fn run(opts: Create, root: &Path, logger: &Logger) -> Result<()> { + let ctx = ContainerBuilder::default() + .id(opts.container_id) + .bundle(opts.bundle) + .root(root.to_path_buf()) + .console_socket(opts.console_socket) + .build()? 
+ .create_ctx()?; + + let pid = ctx.launch(ContainerAction::Create, logger).await?; + + if let Some(ref pid_file) = opts.pid_file { + create_pid_file(pid_file, pid)?; + } + + info!(&logger, "create command finished successfully"); + + Ok(()) +} + +fn create_pid_file>(pid_file: P, pid: Pid) -> Result<()> { + fs::write(pid_file.as_ref(), format!("{}", pid))?; + + Ok(()) +} diff --git a/src/tools/runk/src/commands/delete.rs b/src/tools/runk/src/commands/delete.rs new file mode 100644 index 000000000..f84c79d08 --- /dev/null +++ b/src/tools/runk/src/commands/delete.rs @@ -0,0 +1,103 @@ +// Copyright 2021-2022 Sony Group Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{anyhow, Result}; +use libcontainer::{ + cgroup, + status::{get_current_container_state, Status}, +}; +use liboci_cli::Delete; +use nix::{ + errno::Errno, + sys::signal::{kill, Signal}, + unistd::Pid, +}; +use oci::{ContainerState, State as OCIState}; +use rustjail::container; +use slog::{info, Logger}; +use std::{fs, path::Path}; + +pub async fn run(opts: Delete, root: &Path, logger: &Logger) -> Result<()> { + let container_id = &opts.container_id; + let status_dir = Status::get_dir_path(root, container_id); + if !status_dir.exists() { + return Err(anyhow!("container {} does not exist", container_id)); + } + + let status = if let Ok(value) = Status::load(root, container_id) { + value + } else { + fs::remove_dir_all(status_dir)?; + return Ok(()); + }; + + let spec = status + .config + .spec + .as_ref() + .ok_or_else(|| anyhow!("spec config was not present in the status"))?; + + let oci_state = OCIState { + version: status.oci_version.clone(), + id: status.id.clone(), + status: get_current_container_state(&status)?, + pid: status.pid, + bundle: status + .bundle + .to_str() + .ok_or_else(|| anyhow!("invalid bundle path"))? 
+ .to_string(), + annotations: spec.annotations.clone(), + }; + + if spec.hooks.is_some() { + let hooks = spec + .hooks + .as_ref() + .ok_or_else(|| anyhow!("hooks config was not present"))?; + for h in hooks.poststop.iter() { + container::execute_hook(logger, h, &oci_state).await?; + } + } + + match oci_state.status { + ContainerState::Stopped => { + destroy_container(&status)?; + } + ContainerState::Created => { + kill(Pid::from_raw(status.pid), Some(Signal::SIGKILL))?; + destroy_container(&status)?; + } + _ => { + if opts.force { + match kill(Pid::from_raw(status.pid), Some(Signal::SIGKILL)) { + Err(errno) => { + if errno != Errno::ESRCH { + return Err(anyhow!("{}", errno)); + } + } + Ok(()) => {} + } + destroy_container(&status)?; + } else { + return Err(anyhow!( + "cannot delete container {} that is not stopped", + container_id + )); + } + } + } + + info!(&logger, "delete command finished successfully"); + + Ok(()) +} + +fn destroy_container(status: &Status) -> Result<()> { + cgroup::destroy_cgroup(&status.cgroup_manager)?; + status.remove_dir()?; + + Ok(()) +} diff --git a/src/tools/runk/src/commands/kill.rs b/src/tools/runk/src/commands/kill.rs new file mode 100644 index 000000000..e9de81359 --- /dev/null +++ b/src/tools/runk/src/commands/kill.rs @@ -0,0 +1,82 @@ +// Copyright 2021-2022 Sony Group Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{anyhow, Result}; +use libcontainer::status::{self, get_current_container_state, Status}; +use liboci_cli::Kill; +use nix::{ + sys::signal::{kill, Signal}, + unistd::Pid, +}; +use oci::ContainerState; +use slog::{info, Logger}; +use std::{convert::TryFrom, path::Path, str::FromStr}; + +pub fn run(opts: Kill, state_root: &Path, logger: &Logger) -> Result<()> { + let container_id = &opts.container_id; + let status = Status::load(state_root, container_id)?; + let current_state = get_current_container_state(&status)?; + let sig = parse_signal(&opts.signal)?; + + // TODO: liboci-cli does not support --all option for kill command. + // After liboci-cli supports the option, we will change the following code. + let all = false; + if all { + let pids = status::get_all_pid(&status.cgroup_manager)?; + for pid in pids { + if !status::is_process_running(pid)? { + continue; + } + kill(pid, sig)?; + } + } else { + if current_state == ContainerState::Stopped { + return Err(anyhow!("container {} not running", container_id)); + } + + let p = Pid::from_raw(status.pid); + if status::is_process_running(p)? { + kill(p, sig)?; + } + } + + info!(&logger, "kill command finished successfully"); + + Ok(()) +} + +fn parse_signal(signal: &str) -> Result { + if let Ok(num) = signal.parse::() { + return Ok(Signal::try_from(num)?); + } + + let mut signal_upper = signal.to_uppercase(); + if !signal_upper.starts_with("SIG") { + signal_upper = "SIG".to_string() + &signal_upper; + } + + Ok(Signal::from_str(&signal_upper)?) 
+} + +#[cfg(test)] +mod tests { + use super::*; + use nix::sys::signal::Signal; + + #[test] + fn test_parse_signal() { + assert_eq!(Signal::SIGHUP, parse_signal("1").unwrap()); + assert_eq!(Signal::SIGHUP, parse_signal("sighup").unwrap()); + assert_eq!(Signal::SIGHUP, parse_signal("hup").unwrap()); + assert_eq!(Signal::SIGHUP, parse_signal("SIGHUP").unwrap()); + assert_eq!(Signal::SIGHUP, parse_signal("HUP").unwrap()); + + assert_eq!(Signal::SIGKILL, parse_signal("9").unwrap()); + assert_eq!(Signal::SIGKILL, parse_signal("sigkill").unwrap()); + assert_eq!(Signal::SIGKILL, parse_signal("kill").unwrap()); + assert_eq!(Signal::SIGKILL, parse_signal("SIGKILL").unwrap()); + assert_eq!(Signal::SIGKILL, parse_signal("KILL").unwrap()); + } +} diff --git a/src/tools/runk/src/commands/mod.rs b/src/tools/runk/src/commands/mod.rs new file mode 100644 index 000000000..12017263d --- /dev/null +++ b/src/tools/runk/src/commands/mod.rs @@ -0,0 +1,12 @@ +// Copyright 2021-2022 Sony Group Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +pub mod create; +pub mod delete; +pub mod kill; +pub mod run; +pub mod spec; +pub mod start; +pub mod state; diff --git a/src/tools/runk/src/commands/run.rs b/src/tools/runk/src/commands/run.rs new file mode 100644 index 000000000..bdf2e91d9 --- /dev/null +++ b/src/tools/runk/src/commands/run.rs @@ -0,0 +1,26 @@ +// Copyright 2021-2022 Sony Group Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::Result; +use libcontainer::{builder::ContainerBuilder, container::ContainerAction}; +use liboci_cli::Run; +use slog::{info, Logger}; +use std::path::Path; + +pub async fn run(opts: Run, root: &Path, logger: &Logger) -> Result<()> { + let ctx = ContainerBuilder::default() + .id(opts.container_id) + .bundle(opts.bundle) + .root(root.to_path_buf()) + .console_socket(opts.console_socket) + .build()? 
+ .create_ctx()?; + + ctx.launch(ContainerAction::Run, logger).await?; + + info!(&logger, "run command finished successfully"); + + Ok(()) +} diff --git a/src/tools/runk/src/commands/spec.rs b/src/tools/runk/src/commands/spec.rs new file mode 100644 index 000000000..4dcf4d9eb --- /dev/null +++ b/src/tools/runk/src/commands/spec.rs @@ -0,0 +1,207 @@ +// Copyright 2021-2022 Sony Group Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +//use crate::container::get_config_path; +use anyhow::Result; +use libcontainer::container::CONFIG_FILE_NAME; +use liboci_cli::Spec; +use slog::{info, Logger}; +use std::{fs::File, io::Write, path::Path}; + +pub const DEFAULT_SPEC: &str = r#"{ + "ociVersion": "1.0.2-dev", + "process": { + "terminal": true, + "user": { + "uid": 0, + "gid": 0 + }, + "args": [ + "sh" + ], + "env": [ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm" + ], + "cwd": "/", + "capabilities": { + "bounding": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "effective": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "inheritable": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "permitted": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "ambient": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ] + }, + "rlimits": [ + { + "type": "RLIMIT_NOFILE", + "hard": 1024, + "soft": 1024 + } + ], + "noNewPrivileges": true + }, + "root": { + "path": "rootfs", + "readonly": true + }, + "hostname": "runk", + "mounts": [ + { + "destination": "/proc", + "type": "proc", + "source": "proc" + }, + { + "destination": "/dev", + "type": "tmpfs", + "source": "tmpfs", + "options": [ + "nosuid", + "strictatime", + "mode=755", + "size=65536k" + ] + }, + { + "destination": "/dev/pts", + "type": "devpts", + "source": "devpts", + "options": [ + "nosuid", + "noexec", + "newinstance", + "ptmxmode=0666", + "mode=0620", + "gid=5" + ] + }, + { + "destination": "/dev/shm", + "type": "tmpfs", + "source": "shm", + "options": [ + "nosuid", + "noexec", + "nodev", + "mode=1777", + "size=65536k" + ] + }, + { + "destination": "/dev/mqueue", + "type": "mqueue", + "source": "mqueue", + "options": [ + "nosuid", + "noexec", + "nodev" + ] + }, + { + "destination": "/sys", + "type": "sysfs", + "source": "sysfs", + "options": [ + "nosuid", + "noexec", + "nodev", + "ro" + ] + }, + { + "destination": "/sys/fs/cgroup", + "type": "cgroup", + "source": "cgroup", + "options": [ + "nosuid", + "noexec", + "nodev", + "relatime", + "ro" + ] + } + ], + "linux": { + "resources": { + "devices": [ + { + "allow": false, + "access": "rwm" + } + ] + }, + "namespaces": [ + { + "type": "pid" + }, + { + "type": "network" + }, + { + "type": "ipc" + }, + { + "type": "uts" + }, + { + "type": "mount" + } + ], + "maskedPaths": [ + "/proc/acpi", + "/proc/asound", + "/proc/kcore", + "/proc/keys", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware", + "/proc/scsi" + ], + "readonlyPaths": [ + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger" + ] + } +}"#; + +pub fn run(_opts: Spec, logger: &Logger) -> Result<()> { + // TODO: liboci-cli does not support --bundle option for spec command. + // After liboci-cli supports the option, we will change the following code. 
+ // let config_path = get_config_path(&opts.bundle); + let config_path = Path::new(".").join(CONFIG_FILE_NAME); + let config_data = DEFAULT_SPEC; + + let mut file = File::create(config_path)?; + file.write_all(config_data.as_bytes())?; + + info!(&logger, "spec command finished successfully"); + + Ok(()) +} diff --git a/src/tools/runk/src/commands/start.rs b/src/tools/runk/src/commands/start.rs new file mode 100644 index 000000000..750493688 --- /dev/null +++ b/src/tools/runk/src/commands/start.rs @@ -0,0 +1,48 @@ +// Copyright 2021-2022 Sony Group Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use crate::commands::state::get_container_state_name; +use anyhow::{anyhow, Result}; +use libcontainer::{ + container::get_fifo_path, + status::{get_current_container_state, Status}, +}; +use liboci_cli::Start; +use nix::unistd::unlink; +use oci::ContainerState; +use slog::{info, Logger}; +use std::{fs::OpenOptions, io::prelude::*, path::Path, time::SystemTime}; + +pub fn run(opts: Start, state_root: &Path, logger: &Logger) -> Result<()> { + let mut status = Status::load(state_root, &opts.container_id)?; + let state = get_current_container_state(&status)?; + if state != ContainerState::Created { + return Err(anyhow!( + "cannot start a container in the {} state", + get_container_state_name(state) + )); + }; + + let fifo_path = get_fifo_path(&status); + let mut file = OpenOptions::new().write(true).open(&fifo_path)?; + + file.write_all("0".as_bytes())?; + + info!(&logger, "container started"); + + status.process_start_time = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH)? + .as_secs(); + + status.save()?; + + if fifo_path.exists() { + unlink(&fifo_path)?; + } + + info!(&logger, "start command finished successfully"); + + Ok(()) +} diff --git a/src/tools/runk/src/commands/state.rs b/src/tools/runk/src/commands/state.rs new file mode 100644 index 000000000..eb6b87d49 --- /dev/null +++ b/src/tools/runk/src/commands/state.rs @@ -0,0 +1,79 @@ +// Copyright 2021-2022 Sony Group Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::Result; +use chrono::{DateTime, Utc}; +use libcontainer::status::{get_current_container_state, Status}; +use liboci_cli::State; +use oci::ContainerState; +use serde::{Deserialize, Serialize}; +use slog::{info, Logger}; +use std::path::{Path, PathBuf}; + +#[derive(Serialize, Deserialize, Debug)] +#[serde(rename_all = "camelCase")] +pub struct RuntimeState { + pub oci_version: String, + pub id: String, + pub pid: i32, + pub status: String, + pub bundle: PathBuf, + pub created: DateTime, +} + +impl RuntimeState { + pub fn new(status: Status, state: ContainerState) -> Self { + Self { + oci_version: status.oci_version, + id: status.id, + pid: status.pid, + status: get_container_state_name(state), + bundle: status.bundle, + created: status.created, + } + } +} + +pub fn run(opts: State, state_root: &Path, logger: &Logger) -> Result<()> { + let status = Status::load(state_root, &opts.container_id)?; + let state = get_current_container_state(&status)?; + let oci_state = RuntimeState::new(status, state); + let json_state = &serde_json::to_string_pretty(&oci_state)?; + + println!("{}", json_state); + + info!(&logger, "state command finished successfully"); + + Ok(()) +} + +pub fn get_container_state_name(state: ContainerState) -> String { + match state { + ContainerState::Creating => "creating", + ContainerState::Created => "created", + ContainerState::Running => "running", + ContainerState::Stopped => "stopped", + ContainerState::Paused => 
"paused", + } + .into() +} + +#[cfg(test)] +mod tests { + use super::*; + use oci::ContainerState; + + #[test] + fn test_get_container_state_name() { + assert_eq!( + "creating", + get_container_state_name(ContainerState::Creating) + ); + assert_eq!("created", get_container_state_name(ContainerState::Created)); + assert_eq!("running", get_container_state_name(ContainerState::Running)); + assert_eq!("stopped", get_container_state_name(ContainerState::Stopped)); + assert_eq!("paused", get_container_state_name(ContainerState::Paused)); + } +} diff --git a/src/tools/runk/src/main.rs b/src/tools/runk/src/main.rs new file mode 100644 index 000000000..865a64258 --- /dev/null +++ b/src/tools/runk/src/main.rs @@ -0,0 +1,111 @@ +// Copyright 2021-2022 Sony Group Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{anyhow, Result}; +use clap::{crate_description, crate_name, Parser}; +use liboci_cli::{CommonCmd, GlobalOpts, StandardCmd}; +use slog::{o, Logger}; +use slog_async::AsyncGuard; +use std::{ + fs::OpenOptions, + path::{Path, PathBuf}, + process::exit, +}; + +const DEFAULT_ROOT_DIR: &str = "/run/runk"; +const DEFAULT_LOG_LEVEL: slog::Level = slog::Level::Info; + +mod commands; + +#[derive(Parser, Debug)] +enum SubCommand { + #[clap(flatten)] + Standard(StandardCmd), + #[clap(flatten)] + Common(CommonCmd), +} + +#[derive(Parser, Debug)] +#[clap(version, author, about = crate_description!())] +struct Cli { + #[clap(flatten)] + global: GlobalOpts, + #[clap(subcommand)] + subcmd: SubCommand, +} + +async fn cmd_run(subcmd: SubCommand, root_path: &Path, logger: &Logger) -> Result<()> { + match subcmd { + SubCommand::Standard(cmd) => match cmd { + StandardCmd::Create(create) => commands::create::run(create, root_path, logger).await, + StandardCmd::Start(start) => commands::start::run(start, root_path, logger), + StandardCmd::Kill(kill) => commands::kill::run(kill, root_path, logger), + StandardCmd::Delete(delete) => commands::delete::run(delete, root_path, logger).await, + StandardCmd::State(state) => commands::state::run(state, root_path, logger), + }, + SubCommand::Common(cmd) => match cmd { + CommonCmd::Run(run) => commands::run::run(run, root_path, logger).await, + CommonCmd::Spec(spec) => commands::spec::run(spec, logger), + _ => { + return Err(anyhow!("command is not implemented yet")); + } + }, + } +} + +fn setup_logger( + log_file: Option, + log_level: slog::Level, +) -> Result<(Logger, Option)> { + if let Some(ref file) = log_file { + let log_writer = OpenOptions::new() + .write(true) + .read(true) + .create(true) + .truncate(true) + .open(&file)?; + + // TODO: Support 'text' log format. 
+ let (logger_local, logger_async_guard_local) = + logging::create_logger(crate_name!(), crate_name!(), log_level, log_writer); + + Ok((logger_local, Some(logger_async_guard_local))) + } else { + let logger = slog::Logger::root(slog::Discard, o!()); + Ok((logger, None)) + } +} + +async fn real_main() -> Result<()> { + let cli = Cli::parse(); + + let root_path = if let Some(path) = cli.global.root { + path + } else { + PathBuf::from(DEFAULT_ROOT_DIR) + }; + + let log_level = if cli.global.debug { + slog::Level::Debug + } else { + DEFAULT_LOG_LEVEL + }; + + let (logger, _async_guard) = setup_logger(cli.global.log, log_level)?; + + cmd_run(cli.subcmd, &root_path, &logger).await?; + + Ok(()) +} + +#[tokio::main] +async fn main() { + if let Err(e) = real_main().await { + eprintln!("ERROR: {}", e); + exit(1); + } + + exit(0); +} From 6948b4b360cf64d588022cfe8daa0b9d84c9bf9b Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Wed, 27 Apr 2022 16:52:53 +0000 Subject: [PATCH 07/29] docs: Update containerd link to installation guide This PR updates the containerd url link for the installation guide Fixes #4162 Signed-off-by: Gabriela Cervantes --- docs/install/container-manager/containerd/containerd-install.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/install/container-manager/containerd/containerd-install.md b/docs/install/container-manager/containerd/containerd-install.md index b5008187b..15e1e332e 100644 --- a/docs/install/container-manager/containerd/containerd-install.md +++ b/docs/install/container-manager/containerd/containerd-install.md @@ -81,7 +81,7 @@ - Download the standard `systemd(1)` service file and install to `/etc/systemd/system/`: - - https://raw.githubusercontent.com/containerd/containerd/master/containerd.service + - https://raw.githubusercontent.com/containerd/containerd/main/containerd.service > **Notes:** > From 18d27f794970b32a3d760748cbafba08d666e719 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 27 Apr 2022 18:54:04 +0200 Subject: [PATCH 08/29] kata-deploy: Add a missing `$` prefix in the README MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit short-log says it all. Signed-off-by: Fabiano Fidêncio --- tools/packaging/kata-deploy/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/packaging/kata-deploy/README.md b/tools/packaging/kata-deploy/README.md index c900a95b9..088ebaae2 100644 --- a/tools/packaging/kata-deploy/README.md +++ b/tools/packaging/kata-deploy/README.md @@ -45,7 +45,7 @@ $ kubectl apply -k kata-deploy/overlays/k3s #### Ensure kata-deploy is ready ```bash -kubectl -n kube-system wait --timeout=10m --for=condition=Ready -l name=kata-deploy pod +$ kubectl -n kube-system wait --timeout=10m --for=condition=Ready -l name=kata-deploy pod ``` ### Run a sample workload From 9d39362e30e7e988d21f6794677de99bb0c5f063 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 27 Apr 2022 19:01:21 +0200 Subject: [PATCH 09/29] kata-deploy: Reestructure the installing section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's move the specific installation instructions, such as for k3s, upper in the document. This helps reading (and also skipping) according to what the user is looking for. 
Signed-off-by: Fabiano Fidêncio --- tools/packaging/kata-deploy/README.md | 33 ++++++++++++++++----------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/tools/packaging/kata-deploy/README.md b/tools/packaging/kata-deploy/README.md index 088ebaae2..a1ef2b69b 100644 --- a/tools/packaging/kata-deploy/README.md +++ b/tools/packaging/kata-deploy/README.md @@ -11,7 +11,25 @@ be utilized to install Kata Containers on a running Kubernetes cluster. ### Install Kata on a running Kubernetes cluster -#### Installing the latest image +#### k3s cluster + +For your [k3s](https://k3s.io/) cluster, run: + +```sh +$ git clone github.com/kata-containers/kata-containers +``` + +Check and switch to the stable branch of your choice, if wanted, and then run: + +```bash +$ cd kata-containers/kata-containers/tools/packaging/kata-deploy +$ kubectl apply -f kata-rbac/base/kata-rbac.yaml +$ kubectl apply -k kata-deploy/overlays/k3s +``` + +#### Vanilla Kubernetes cluster + +##### Installing the latest image The latest image refers to pre-release and release candidate content. For stable releases, please, use the "stable" instructions. @@ -20,7 +38,7 @@ $ kubectl apply -f https://raw.githubusercontent.com/kata-containers/kata-contai $ kubectl apply -f https://raw.githubusercontent.com/kata-containers/kata-containers/main/tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml ``` -#### Installing the stable image +##### Installing the stable image The stable image refers to the last stable releases content. @@ -32,17 +50,6 @@ $ kubectl apply -f https://raw.githubusercontent.com/kata-containers/kata-contai $ kubectl apply -f https://raw.githubusercontent.com/kata-containers/kata-containers/main/tools/packaging/kata-deploy/kata-deploy/base/kata-deploy-stable.yaml ``` -#### For your [k3s](https://k3s.io/) cluster, do: - -```sh -$ GO111MODULE=auto go get github.com/kata-containers/kata-containers -``` - -```bash -$ cd $GOPATH/src/github.com/kata-containers/kata-containers/tools/packaging/kata-deploy -$ kubectl apply -k kata-deploy/overlays/k3s -``` - #### Ensure kata-deploy is ready ```bash $ kubectl -n kube-system wait --timeout=10m --for=condition=Ready -l name=kata-deploy pod From ccb01839345afac75ec65538c90649bbdb86ef72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Tue, 26 Apr 2022 18:47:51 +0200 Subject: [PATCH 10/29] kata-deploy: Add support to RKE2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "RKE2 - Rancher's Next Generation Kuberentes Distribution" can easily be supported by kata-deploy with some simple adjustments to what we've been relying on for "k3s". The main differences between k3s and RKE2 are, basically: 1. The location where the containerd configuration is stored - k3s: /var/lib/rancher/k3s/agent/etc/containerd/ - rke2: /var/lib/rancher/rke2/agent/etc/containerd/ 2. The name of the systemd services used: - k3s: k3s.service or k3s-agent.service - rke2: rke2-server.service or rke2-agent.service Knowing this, let's add a new overlay for RKE2, adapt the kata-deploy and the kata-cleanup scripts, and that's it. 
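For illustration, the flavour detection this adds to kata-deploy.sh boils down to
the sketch below. The helper name is made up for this example; the systemd unit
checks are the ones from the get_container_runtime() change in the diff that
follows, and they only run once containerd has already reported a "-k3s"
flavoured build:

```bash
# Illustrative sketch only -- the real logic lives inside get_container_runtime().
detect_rancher_flavour() {
	if systemctl is-active --quiet rke2-agent; then
		echo "rke2-agent"
	elif systemctl is-active --quiet rke2-server; then
		echo "rke2-server"
	elif systemctl is-active --quiet k3s-agent; then
		echo "k3s-agent"
	else
		echo "k3s"
	fi
}
```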
Fixes: #4160 Signed-off-by: Fabiano Fidêncio --- tools/packaging/kata-deploy/README.md | 16 ++++++++++++++++ .../overlays/rke2/kustomization.yaml | 5 +++++ .../overlays/rke2/mount_rke2_conf.yaml | 17 +++++++++++++++++ .../overlays/rke2/kustomization.yaml | 5 +++++ .../overlays/rke2/mount_rke2_conf.yaml | 12 ++++++++++++ .../kata-deploy/scripts/kata-deploy.sh | 15 +++++++++------ 6 files changed, 64 insertions(+), 6 deletions(-) create mode 100644 tools/packaging/kata-deploy/kata-cleanup/overlays/rke2/kustomization.yaml create mode 100644 tools/packaging/kata-deploy/kata-cleanup/overlays/rke2/mount_rke2_conf.yaml create mode 100644 tools/packaging/kata-deploy/kata-deploy/overlays/rke2/kustomization.yaml create mode 100644 tools/packaging/kata-deploy/kata-deploy/overlays/rke2/mount_rke2_conf.yaml diff --git a/tools/packaging/kata-deploy/README.md b/tools/packaging/kata-deploy/README.md index a1ef2b69b..f56bb005c 100644 --- a/tools/packaging/kata-deploy/README.md +++ b/tools/packaging/kata-deploy/README.md @@ -27,6 +27,22 @@ $ kubectl apply -f kata-rbac/base/kata-rbac.yaml $ kubectl apply -k kata-deploy/overlays/k3s ``` +#### RKE2 cluster + +For your [RKE2](https://docs.rke2.io/) cluster, run: + +```sh +$ git clone github.com/kata-containers/kata-containers +``` + +Check and switch to the stable branch of your choice, if wanted, and then run: + +```bash +$ cd kata-containers/kata-containers/tools/packaging/kata-deploy +$ kubectl apply -f kata-rbac/base/kata-rbac.yaml +$ kubectl apply -k kata-deploy/overlays/rke2 +``` + #### Vanilla Kubernetes cluster ##### Installing the latest image diff --git a/tools/packaging/kata-deploy/kata-cleanup/overlays/rke2/kustomization.yaml b/tools/packaging/kata-deploy/kata-cleanup/overlays/rke2/kustomization.yaml new file mode 100644 index 000000000..90dbe7bf8 --- /dev/null +++ b/tools/packaging/kata-deploy/kata-cleanup/overlays/rke2/kustomization.yaml @@ -0,0 +1,5 @@ +bases: +- ../../base + +patchesStrategicMerge: +- mount_rke2_conf.yaml diff --git a/tools/packaging/kata-deploy/kata-cleanup/overlays/rke2/mount_rke2_conf.yaml b/tools/packaging/kata-deploy/kata-cleanup/overlays/rke2/mount_rke2_conf.yaml new file mode 100644 index 000000000..19d25db6b --- /dev/null +++ b/tools/packaging/kata-deploy/kata-cleanup/overlays/rke2/mount_rke2_conf.yaml @@ -0,0 +1,17 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: kubelet-kata-cleanup + namespace: kube-system +spec: + template: + spec: + containers: + - name: kube-kata-cleanup + volumeMounts: + - name: containerd-conf + mountPath: /etc/containerd/ + volumes: + - name: containerd-conf + hostPath: + path: /var/lib/rancher/rke2/agent/etc/containerd/ diff --git a/tools/packaging/kata-deploy/kata-deploy/overlays/rke2/kustomization.yaml b/tools/packaging/kata-deploy/kata-deploy/overlays/rke2/kustomization.yaml new file mode 100644 index 000000000..90dbe7bf8 --- /dev/null +++ b/tools/packaging/kata-deploy/kata-deploy/overlays/rke2/kustomization.yaml @@ -0,0 +1,5 @@ +bases: +- ../../base + +patchesStrategicMerge: +- mount_rke2_conf.yaml diff --git a/tools/packaging/kata-deploy/kata-deploy/overlays/rke2/mount_rke2_conf.yaml b/tools/packaging/kata-deploy/kata-deploy/overlays/rke2/mount_rke2_conf.yaml new file mode 100644 index 000000000..b8a22bc00 --- /dev/null +++ b/tools/packaging/kata-deploy/kata-deploy/overlays/rke2/mount_rke2_conf.yaml @@ -0,0 +1,12 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: kata-deploy + namespace: kube-system +spec: + template: + spec: + volumes: + - name: containerd-conf + 
hostPath: + path: /var/lib/rancher/rke2/agent/etc/containerd/ diff --git a/tools/packaging/kata-deploy/scripts/kata-deploy.sh b/tools/packaging/kata-deploy/scripts/kata-deploy.sh index 43c478dd7..1bd51f4e2 100755 --- a/tools/packaging/kata-deploy/scripts/kata-deploy.sh +++ b/tools/packaging/kata-deploy/scripts/kata-deploy.sh @@ -39,7 +39,11 @@ function get_container_runtime() { die "invalid node name" fi if echo "$runtime" | grep -qE 'containerd.*-k3s'; then - if systemctl is-active --quiet k3s-agent; then + if systemctl is-active --quiet rke2-agent; then + echo "rke2-agent" + elif systemctl is-active --quiet rke2-server; then + echo "rke2-server" + elif systemctl is-active --quiet k3s-agent; then echo "k3s-agent" else echo "k3s" @@ -62,7 +66,7 @@ function configure_cri_runtime() { crio) configure_crio ;; - containerd | k3s | k3s-agent) + containerd | k3s | k3s-agent | rke2-agent | rke2-server) configure_containerd ;; esac @@ -228,7 +232,7 @@ function cleanup_cri_runtime() { crio) cleanup_crio ;; - containerd | k3s | k3s-agent) + containerd | k3s | k3s-agent | rke2-agent | rke2-server) cleanup_containerd ;; esac @@ -267,7 +271,7 @@ function main() { # CRI-O isn't consistent with the naming -- let's use crio to match the service file if [ "$runtime" == "cri-o" ]; then runtime="crio" - elif [ "$runtime" == "k3s" ] || [ "$runtime" == "k3s-agent" ]; then + elif [ "$runtime" == "k3s" ] || [ "$runtime" == "k3s-agent" ] || [ "$runtime" == "rke2-agent" ] || [ "$runtime" == "rke2-server" ]; then containerd_conf_tmpl_file="${containerd_conf_file}.tmpl" if [ ! -f "$containerd_conf_tmpl_file" ]; then cp "$containerd_conf_file" "$containerd_conf_tmpl_file" @@ -290,11 +294,10 @@ function main() { fi # only install / remove / update if we are dealing with CRIO or containerd - if [[ "$runtime" =~ ^(crio|containerd|k3s|k3s-agent)$ ]]; then + if [[ "$runtime" =~ ^(crio|containerd|k3s|k3s-agent|rke2-agent|rke2-server)$ ]]; then case "$action" in install) - install_artifacts configure_cri_runtime "$runtime" kubectl label node "$NODE_NAME" --overwrite katacontainers.io/kata-runtime=true From b0e439cb669b3042904261f8e0672ff8c617e83b Mon Sep 17 00:00:00 2001 From: Braden Rayhorn Date: Tue, 12 Apr 2022 20:04:54 -0500 Subject: [PATCH 11/29] rustjail: add tests for parse_mount_table Add tests for parse_mount_table function in rustjail/src/mount.rs. Includes some minor refactoring improve the testability of the function and improve its error values. Fixes: #4082 Signed-off-by: Braden Rayhorn --- src/agent/rustjail/src/lib.rs | 25 ++++++ src/agent/rustjail/src/mount.rs | 149 +++++++++++++++++++++++++++++--- 2 files changed, 163 insertions(+), 11 deletions(-) diff --git a/src/agent/rustjail/src/lib.rs b/src/agent/rustjail/src/lib.rs index d6f433096..61dbb497f 100644 --- a/src/agent/rustjail/src/lib.rs +++ b/src/agent/rustjail/src/lib.rs @@ -523,6 +523,31 @@ mod tests { }; } + // Parameters: + // + // 1: expected Result + // 2: actual Result + // 3: string used to identify the test on error + #[macro_export] + macro_rules! 
assert_result { + ($expected_result:expr, $actual_result:expr, $msg:expr) => { + if $expected_result.is_ok() { + let expected_value = $expected_result.as_ref().unwrap(); + let actual_value = $actual_result.unwrap(); + assert!(*expected_value == actual_value, "{}", $msg); + } else { + assert!($actual_result.is_err(), "{}", $msg); + + let expected_error = $expected_result.as_ref().unwrap_err(); + let expected_error_msg = format!("{:?}", expected_error); + + let actual_error_msg = format!("{:?}", $actual_result.unwrap_err()); + + assert!(expected_error_msg == actual_error_msg, "{}", $msg); + } + }; + } + #[test] fn test_process_grpc_to_oci() { #[derive(Debug)] diff --git a/src/agent/rustjail/src/mount.rs b/src/agent/rustjail/src/mount.rs index 3c6d60695..74742c0ff 100644 --- a/src/agent/rustjail/src/mount.rs +++ b/src/agent/rustjail/src/mount.rs @@ -32,16 +32,21 @@ use crate::log_child; // Info reveals information about a particular mounted filesystem. This // struct is populated from the content in the /proc//mountinfo file. -#[derive(std::fmt::Debug)] +#[derive(std::fmt::Debug, PartialEq)] pub struct Info { mount_point: String, optional: String, fstype: String, } -const MOUNTINFOFORMAT: &str = "{d} {d} {d}:{d} {} {} {} {}"; +const MOUNTINFO_FORMAT: &str = "{d} {d} {d}:{d} {} {} {} {}"; +const MOUNTINFO_PATH: &str = "/proc/self/mountinfo"; const PROC_PATH: &str = "/proc"; +const ERR_FAILED_PARSE_MOUNTINFO: &str = "failed to parse mountinfo file"; +const ERR_FAILED_PARSE_MOUNTINFO_FINAL_FIELDS: &str = + "failed to parse final fields in mountinfo file"; + // since libc didn't defined this const for musl, thus redefined it here. #[cfg(all(target_os = "linux", target_env = "gnu", not(target_arch = "s390x")))] const PROC_SUPER_MAGIC: libc::c_long = 0x00009fa0; @@ -518,7 +523,7 @@ pub fn pivot_rootfs(path: &P) -> Result<( } fn rootfs_parent_mount_private(path: &str) -> Result<()> { - let mount_infos = parse_mount_table()?; + let mount_infos = parse_mount_table(MOUNTINFO_PATH)?; let mut max_len = 0; let mut mount_point = String::from(""); @@ -546,8 +551,8 @@ fn rootfs_parent_mount_private(path: &str) -> Result<()> { // Parse /proc/self/mountinfo because comparing Dev and ino does not work from // bind mounts -fn parse_mount_table() -> Result> { - let file = File::open("/proc/self/mountinfo")?; +fn parse_mount_table(mountinfo_path: &str) -> Result> { + let file = File::open(mountinfo_path)?; let reader = BufReader::new(file); let mut infos = Vec::new(); @@ -569,7 +574,7 @@ fn parse_mount_table() -> Result> { let (_id, _parent, _major, _minor, _root, mount_point, _opts, optional) = scan_fmt!( &line, - MOUNTINFOFORMAT, + MOUNTINFO_FORMAT, i32, i32, i32, @@ -578,12 +583,17 @@ fn parse_mount_table() -> Result> { String, String, String - )?; + ) + .map_err(|_| anyhow!(ERR_FAILED_PARSE_MOUNTINFO))?; let fields: Vec<&str> = line.split(" - ").collect(); if fields.len() == 2 { - let (fstype, _source, _vfs_opts) = - scan_fmt!(fields[1], "{} {} {}", String, String, String)?; + let final_fields: Vec<&str> = fields[1].split_whitespace().collect(); + + if final_fields.len() != 3 { + return Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO_FINAL_FIELDS)); + } + let fstype = final_fields[0].to_string(); let mut optional_new = String::new(); if optional != "-" { @@ -598,7 +608,7 @@ fn parse_mount_table() -> Result> { infos.push(info); } else { - return Err(anyhow!("failed to parse mount info file".to_string())); + return Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO)); } } @@ -619,7 +629,7 @@ fn chroot(_path: &P) -> Result<(), 
nix::Error> { pub fn ms_move_root(rootfs: &str) -> Result { unistd::chdir(rootfs)?; - let mount_infos = parse_mount_table()?; + let mount_infos = parse_mount_table(MOUNTINFO_PATH)?; let root_path = Path::new(rootfs); let abs_root_buf = root_path.absolutize()?; @@ -1046,10 +1056,12 @@ fn readonly_path(path: &str) -> Result<()> { #[cfg(test)] mod tests { use super::*; + use crate::assert_result; use crate::skip_if_not_root; use std::fs::create_dir; use std::fs::create_dir_all; use std::fs::remove_dir_all; + use std::io; use std::os::unix::fs; use std::os::unix::io::AsRawFd; use tempfile::tempdir; @@ -1508,6 +1520,121 @@ mod tests { } } + #[test] + fn test_parse_mount_table() { + #[derive(Debug)] + struct TestData<'a> { + mountinfo_data: Option<&'a str>, + result: Result>, + } + + let tests = &[ + TestData { + mountinfo_data: Some( + "22 933 0:20 / /sys rw,nodev shared:2 - sysfs sysfs rw,noexec", + ), + result: Ok(vec![Info { + mount_point: "/sys".to_string(), + optional: "shared:2".to_string(), + fstype: "sysfs".to_string(), + }]), + }, + TestData { + mountinfo_data: Some( + r#"22 933 0:20 / /sys rw,nodev - sysfs sysfs rw,noexec + 81 13 1:2 / /tmp/dir rw shared:2 - tmpfs tmpfs rw"#, + ), + result: Ok(vec![ + Info { + mount_point: "/sys".to_string(), + optional: "".to_string(), + fstype: "sysfs".to_string(), + }, + Info { + mount_point: "/tmp/dir".to_string(), + optional: "shared:2".to_string(), + fstype: "tmpfs".to_string(), + }, + ]), + }, + TestData { + mountinfo_data: Some( + "22 933 0:20 /foo\040-\040bar /sys rw,nodev shared:2 - sysfs sysfs rw,noexec", + ), + result: Ok(vec![Info { + mount_point: "/sys".to_string(), + optional: "shared:2".to_string(), + fstype: "sysfs".to_string(), + }]), + }, + TestData { + mountinfo_data: Some(""), + result: Ok(vec![]), + }, + TestData { + mountinfo_data: Some("invalid line data - sysfs sysfs rw"), + result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO)), + }, + TestData { + mountinfo_data: Some("22 96 0:21 / /sys rw,noexec - sysfs"), + result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO_FINAL_FIELDS)), + }, + TestData { + mountinfo_data: Some("22 96 0:21 / /sys rw,noexec - sysfs sysfs rw rw"), + result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO_FINAL_FIELDS)), + }, + TestData { + mountinfo_data: Some("22 96 0:21 / /sys rw,noexec shared:2 - x - x"), + result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO)), + }, + TestData { + mountinfo_data: Some("-"), + result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO)), + }, + TestData { + mountinfo_data: Some("--"), + result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO)), + }, + TestData { + mountinfo_data: Some("- -"), + result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO)), + }, + TestData { + mountinfo_data: Some(" - "), + result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO)), + }, + TestData { + mountinfo_data: Some( + r#"22 933 0:20 / /sys rw,nodev - sysfs sysfs rw,noexec + invalid line + 81 13 1:2 / /tmp/dir rw shared:2 - tmpfs tmpfs rw"#, + ), + result: Err(anyhow!(ERR_FAILED_PARSE_MOUNTINFO)), + }, + TestData { + mountinfo_data: None, + result: Err(anyhow!(io::Error::from_raw_os_error(libc::ENOENT))), + }, + ]; + + for (i, d) in tests.iter().enumerate() { + let msg = format!("test[{}]: {:?}", i, d); + + let tempdir = tempdir().unwrap(); + let mountinfo_path = tempdir.path().join("mountinfo"); + + if let Some(mountinfo_data) = d.mountinfo_data { + std::fs::write(&mountinfo_path, mountinfo_data).unwrap(); + } + + let result = parse_mount_table(mountinfo_path.to_str().unwrap()); + + let msg = format!("{}: result: {:?}", msg, result); + + 
assert_result!(d.result, result, msg); + } + } + #[test] fn test_dev_rel_path() { // Valid device paths From 2d35e6066d3fb309952635e46b61bd451eef857c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 21 Apr 2022 01:06:10 +0200 Subject: [PATCH 12/29] hypervisor: Add network bandwidth and operations rate limiters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In a similar way to what's already exposed as RxRateLimiterMaxRate and TxRateLimiterMaxRate, let's add four new fields to the Hypervisor's configuration. The values added are related to bandwidth and operations rate limiters, which have to be added so we can expose I/O throttling configurations to users using Cloud Hypervisor as their preferred VMM. The reason we cannot simply re-use {Rx,Tx}RateLimiterMaxRate is because Cloud Hypervisor exposes a single MaxRate to be used for both inbound and outbound queues. The newly added fields are: * NetRateLimiterBwMaxRate, defined in bits per second, which is used to control the network I/O bandwidth at the VM level. * NetRateLimiterBwOneTimeBurst, also defined in bits per second, which is used to define an *initial* max rate, which doesn't replenish. * NetRateLimiterOpsMaxRate, the operations per second equivalent of the NetRateLimiterBwMaxRate. * NetRateLimiterOpsOneTimeBurst, the operations per second equivalent of the NetRateLimiterBwOneTimeBurst. For now those extra fields have only been added to the hypervisor's configuration and they'll be used in the coming patches of this very same series. Signed-off-by: Fabiano Fidêncio --- src/runtime/pkg/katautils/config.go | 160 ++++++++++++++--------- src/runtime/virtcontainers/hypervisor.go | 18 +++ 2 files changed, 114 insertions(+), 64 deletions(-) diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 3525bc236..3e15929ff 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -74,70 +74,74 @@ type factory struct { } type hypervisor struct { - Path string `toml:"path"` - JailerPath string `toml:"jailer_path"` - Kernel string `toml:"kernel"` - CtlPath string `toml:"ctlpath"` - Initrd string `toml:"initrd"` - Image string `toml:"image"` - Firmware string `toml:"firmware"` - FirmwareVolume string `toml:"firmware_volume"` - MachineAccelerators string `toml:"machine_accelerators"` - CPUFeatures string `toml:"cpu_features"` - KernelParams string `toml:"kernel_params"` - MachineType string `toml:"machine_type"` - BlockDeviceDriver string `toml:"block_device_driver"` - EntropySource string `toml:"entropy_source"` - SharedFS string `toml:"shared_fs"` - VirtioFSDaemon string `toml:"virtio_fs_daemon"` - VirtioFSCache string `toml:"virtio_fs_cache"` - VhostUserStorePath string `toml:"vhost_user_store_path"` - FileBackedMemRootDir string `toml:"file_mem_backend"` - GuestHookPath string `toml:"guest_hook_path"` - GuestMemoryDumpPath string `toml:"guest_memory_dump_path"` - HypervisorPathList []string `toml:"valid_hypervisor_paths"` - JailerPathList []string `toml:"valid_jailer_paths"` - CtlPathList []string `toml:"valid_ctlpaths"` - VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"` - VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"` - PFlashList []string `toml:"pflashes"` - VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"` - FileBackedMemRootList []string `toml:"valid_file_mem_backends"` - EntropySourceList []string `toml:"valid_entropy_sources"` - EnableAnnotations 
[]string `toml:"enable_annotations"` - RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"` - TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"` - MemOffset uint64 `toml:"memory_offset"` - VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"` - DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"` - MemorySize uint32 `toml:"default_memory"` - MemSlots uint32 `toml:"memory_slots"` - DefaultBridges uint32 `toml:"default_bridges"` - Msize9p uint32 `toml:"msize_9p"` - PCIeRootPort uint32 `toml:"pcie_root_port"` - NumVCPUs int32 `toml:"default_vcpus"` - BlockDeviceCacheSet bool `toml:"block_device_cache_set"` - BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"` - BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"` - EnableVhostUserStore bool `toml:"enable_vhost_user_store"` - DisableBlockDeviceUse bool `toml:"disable_block_device_use"` - MemPrealloc bool `toml:"enable_mem_prealloc"` - HugePages bool `toml:"enable_hugepages"` - VirtioMem bool `toml:"enable_virtio_mem"` - IOMMU bool `toml:"enable_iommu"` - IOMMUPlatform bool `toml:"enable_iommu_platform"` - Debug bool `toml:"enable_debug"` - DisableNestingChecks bool `toml:"disable_nesting_checks"` - EnableIOThreads bool `toml:"enable_iothreads"` - DisableImageNvdimm bool `toml:"disable_image_nvdimm"` - HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"` - DisableVhostNet bool `toml:"disable_vhost_net"` - GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` - ConfidentialGuest bool `toml:"confidential_guest"` - GuestSwap bool `toml:"enable_guest_swap"` - Rootless bool `toml:"rootless"` - DisableSeccomp bool `toml:"disable_seccomp"` - DisableSeLinux bool `toml:"disable_selinux"` + Path string `toml:"path"` + JailerPath string `toml:"jailer_path"` + Kernel string `toml:"kernel"` + CtlPath string `toml:"ctlpath"` + Initrd string `toml:"initrd"` + Image string `toml:"image"` + Firmware string `toml:"firmware"` + FirmwareVolume string `toml:"firmware_volume"` + MachineAccelerators string `toml:"machine_accelerators"` + CPUFeatures string `toml:"cpu_features"` + KernelParams string `toml:"kernel_params"` + MachineType string `toml:"machine_type"` + BlockDeviceDriver string `toml:"block_device_driver"` + EntropySource string `toml:"entropy_source"` + SharedFS string `toml:"shared_fs"` + VirtioFSDaemon string `toml:"virtio_fs_daemon"` + VirtioFSCache string `toml:"virtio_fs_cache"` + VhostUserStorePath string `toml:"vhost_user_store_path"` + FileBackedMemRootDir string `toml:"file_mem_backend"` + GuestHookPath string `toml:"guest_hook_path"` + GuestMemoryDumpPath string `toml:"guest_memory_dump_path"` + HypervisorPathList []string `toml:"valid_hypervisor_paths"` + JailerPathList []string `toml:"valid_jailer_paths"` + CtlPathList []string `toml:"valid_ctlpaths"` + VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"` + VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"` + PFlashList []string `toml:"pflashes"` + VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"` + FileBackedMemRootList []string `toml:"valid_file_mem_backends"` + EntropySourceList []string `toml:"valid_entropy_sources"` + EnableAnnotations []string `toml:"enable_annotations"` + RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"` + TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"` + MemOffset uint64 `toml:"memory_offset"` + NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"` + NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"` + 
NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"` + NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"` + VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"` + DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"` + MemorySize uint32 `toml:"default_memory"` + MemSlots uint32 `toml:"memory_slots"` + DefaultBridges uint32 `toml:"default_bridges"` + Msize9p uint32 `toml:"msize_9p"` + PCIeRootPort uint32 `toml:"pcie_root_port"` + NumVCPUs int32 `toml:"default_vcpus"` + BlockDeviceCacheSet bool `toml:"block_device_cache_set"` + BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"` + BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"` + EnableVhostUserStore bool `toml:"enable_vhost_user_store"` + DisableBlockDeviceUse bool `toml:"disable_block_device_use"` + MemPrealloc bool `toml:"enable_mem_prealloc"` + HugePages bool `toml:"enable_hugepages"` + VirtioMem bool `toml:"enable_virtio_mem"` + IOMMU bool `toml:"enable_iommu"` + IOMMUPlatform bool `toml:"enable_iommu_platform"` + Debug bool `toml:"enable_debug"` + DisableNestingChecks bool `toml:"disable_nesting_checks"` + EnableIOThreads bool `toml:"enable_iothreads"` + DisableImageNvdimm bool `toml:"disable_image_nvdimm"` + HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"` + DisableVhostNet bool `toml:"disable_vhost_net"` + GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` + ConfidentialGuest bool `toml:"confidential_guest"` + GuestSwap bool `toml:"enable_guest_swap"` + Rootless bool `toml:"rootless"` + DisableSeccomp bool `toml:"disable_seccomp"` + DisableSeLinux bool `toml:"disable_selinux"` } type runtime struct { @@ -490,6 +494,34 @@ func (h hypervisor) getTxRateLimiterCfg() uint64 { return h.TxRateLimiterMaxRate } +func (h hypervisor) getNetRateLimiterBwMaxRate() int64 { + return h.NetRateLimiterBwMaxRate +} + +func (h hypervisor) getNetRateLimiterBwOneTimeBurst() int64 { + if h.NetRateLimiterBwOneTimeBurst != 0 && h.getNetRateLimiterBwMaxRate() == 0 { + kataUtilsLogger.Warn("The NetRateLimiterBwOneTimeBurst is set but NetRateLimiterBwMaxRate is not set, this option will be ignored.") + + h.NetRateLimiterBwOneTimeBurst = 0 + } + + return h.NetRateLimiterBwOneTimeBurst +} + +func (h hypervisor) getNetRateLimiterOpsMaxRate() int64 { + return h.NetRateLimiterOpsMaxRate +} + +func (h hypervisor) getNetRateLimiterOpsOneTimeBurst() int64 { + if h.NetRateLimiterOpsOneTimeBurst != 0 && h.getNetRateLimiterOpsMaxRate() == 0 { + kataUtilsLogger.Warn("The NetRateLimiterOpsOneTimeBurst is set but NetRateLimiterOpsMaxRate is not set, this option will be ignored.") + + h.NetRateLimiterOpsOneTimeBurst = 0 + } + + return h.NetRateLimiterOpsOneTimeBurst +} + func (h hypervisor) getIOMMUPlatform() bool { if h.IOMMUPlatform { kataUtilsLogger.Info("IOMMUPlatform is enabled by default.") diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index 12725160b..fa9b66867 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -386,6 +386,24 @@ type HypervisorConfig struct { // TxRateLimiterMaxRate is used to control network I/O outbound bandwidth on VM level. TxRateLimiterMaxRate uint64 + // NetRateLimiterBwRate is used to control network I/O bandwidth on VM level. + // The same value, defined in bits per second, is used for inbound and outbound bandwidth. + NetRateLimiterBwMaxRate int64 + + // NetRateLimiterBwOneTimeBurst is used to control network I/O bandwidth on VM level. 
+ // This increases the initial max rate and this initial extra credit does *NOT* replenish + // and can be used for an *initial* burst of data. + NetRateLimiterBwOneTimeBurst int64 + + // NetRateLimiterOpsRate is used to control network I/O operations on VM level. + // The same value, defined in operations per second, is used for inbound and outbound bandwidth. + NetRateLimiterOpsMaxRate int64 + + // NetRateLimiterOpsOneTimeBurst is used to control network I/O operations on VM level. + // This increases the initial max rate and this initial extra credit does *NOT* replenish + // and can be used for an *initial* burst of data. + NetRateLimiterOpsOneTimeBurst int64 + // MemOffset specifies memory space for nvdimm device MemOffset uint64 From c9f6496d6dfa2478a6e80b5c2d0782502ea0235c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 21 Apr 2022 13:02:34 +0200 Subject: [PATCH 13/29] config: Add NetRateLimiter* to Cloud Hypervisor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's add the newly added network rate limiter configurations to the Cloud Hypervisor's hypervisor configuration. Right now those are not used anywhere, and there's absolutely no way the users can set those up. That's coming later in this very same series. Signed-off-by: Fabiano Fidêncio --- src/runtime/pkg/katautils/config.go | 96 ++++++++++++------------ src/runtime/pkg/katautils/config_test.go | 34 +++++++-- 2 files changed, 79 insertions(+), 51 deletions(-) diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 3e15929ff..3cb37780d 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -860,52 +860,56 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { } return vc.HypervisorConfig{ - HypervisorPath: hypervisor, - HypervisorPathList: h.HypervisorPathList, - KernelPath: kernel, - InitrdPath: initrd, - ImagePath: image, - FirmwarePath: firmware, - MachineAccelerators: machineAccelerators, - KernelParams: vc.DeserializeParams(strings.Fields(kernelParams)), - HypervisorMachineType: machineType, - NumVCPUs: h.defaultVCPUs(), - DefaultMaxVCPUs: h.defaultMaxVCPUs(), - MemorySize: h.defaultMemSz(), - MemSlots: h.defaultMemSlots(), - MemOffset: h.defaultMemOffset(), - VirtioMem: h.VirtioMem, - EntropySource: h.GetEntropySource(), - EntropySourceList: h.EntropySourceList, - DefaultBridges: h.defaultBridges(), - DisableBlockDeviceUse: h.DisableBlockDeviceUse, - SharedFS: sharedFS, - VirtioFSDaemon: h.VirtioFSDaemon, - VirtioFSDaemonList: h.VirtioFSDaemonList, - VirtioFSCacheSize: h.VirtioFSCacheSize, - VirtioFSCache: h.VirtioFSCache, - MemPrealloc: h.MemPrealloc, - HugePages: h.HugePages, - FileBackedMemRootDir: h.FileBackedMemRootDir, - FileBackedMemRootList: h.FileBackedMemRootList, - Debug: h.Debug, - DisableNestingChecks: h.DisableNestingChecks, - BlockDeviceDriver: blockDriver, - BlockDeviceCacheSet: h.BlockDeviceCacheSet, - BlockDeviceCacheDirect: h.BlockDeviceCacheDirect, - BlockDeviceCacheNoflush: h.BlockDeviceCacheNoflush, - EnableIOThreads: h.EnableIOThreads, - Msize9p: h.msize9p(), - HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, - PCIeRootPort: h.PCIeRootPort, - DisableVhostNet: true, - GuestHookPath: h.guestHookPath(), - VirtioFSExtraArgs: h.VirtioFSExtraArgs, - SGXEPCSize: defaultSGXEPCSize, - EnableAnnotations: h.EnableAnnotations, - DisableSeccomp: h.DisableSeccomp, - ConfidentialGuest: h.ConfidentialGuest, - DisableSeLinux: h.DisableSeLinux, + 
HypervisorPath: hypervisor, + HypervisorPathList: h.HypervisorPathList, + KernelPath: kernel, + InitrdPath: initrd, + ImagePath: image, + FirmwarePath: firmware, + MachineAccelerators: machineAccelerators, + KernelParams: vc.DeserializeParams(strings.Fields(kernelParams)), + HypervisorMachineType: machineType, + NumVCPUs: h.defaultVCPUs(), + DefaultMaxVCPUs: h.defaultMaxVCPUs(), + MemorySize: h.defaultMemSz(), + MemSlots: h.defaultMemSlots(), + MemOffset: h.defaultMemOffset(), + VirtioMem: h.VirtioMem, + EntropySource: h.GetEntropySource(), + EntropySourceList: h.EntropySourceList, + DefaultBridges: h.defaultBridges(), + DisableBlockDeviceUse: h.DisableBlockDeviceUse, + SharedFS: sharedFS, + VirtioFSDaemon: h.VirtioFSDaemon, + VirtioFSDaemonList: h.VirtioFSDaemonList, + VirtioFSCacheSize: h.VirtioFSCacheSize, + VirtioFSCache: h.VirtioFSCache, + MemPrealloc: h.MemPrealloc, + HugePages: h.HugePages, + FileBackedMemRootDir: h.FileBackedMemRootDir, + FileBackedMemRootList: h.FileBackedMemRootList, + Debug: h.Debug, + DisableNestingChecks: h.DisableNestingChecks, + BlockDeviceDriver: blockDriver, + BlockDeviceCacheSet: h.BlockDeviceCacheSet, + BlockDeviceCacheDirect: h.BlockDeviceCacheDirect, + BlockDeviceCacheNoflush: h.BlockDeviceCacheNoflush, + EnableIOThreads: h.EnableIOThreads, + Msize9p: h.msize9p(), + HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, + PCIeRootPort: h.PCIeRootPort, + DisableVhostNet: true, + GuestHookPath: h.guestHookPath(), + VirtioFSExtraArgs: h.VirtioFSExtraArgs, + SGXEPCSize: defaultSGXEPCSize, + EnableAnnotations: h.EnableAnnotations, + DisableSeccomp: h.DisableSeccomp, + ConfidentialGuest: h.ConfidentialGuest, + DisableSeLinux: h.DisableSeLinux, + NetRateLimiterBwMaxRate: h.getNetRateLimiterBwMaxRate(), + NetRateLimiterBwOneTimeBurst: h.getNetRateLimiterBwOneTimeBurst(), + NetRateLimiterOpsMaxRate: h.getNetRateLimiterOpsMaxRate(), + NetRateLimiterOpsOneTimeBurst: h.getNetRateLimiterOpsOneTimeBurst(), }, nil } diff --git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index 2f85adcb3..778f6ec6d 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -810,6 +810,10 @@ func TestNewClhHypervisorConfig(t *testing.T) { kernelPath := path.Join(tmpdir, "kernel") imagePath := path.Join(tmpdir, "image") virtioFsDaemon := path.Join(tmpdir, "virtiofsd") + netRateLimiterBwMaxRate := int64(1000) + netRateLimiterBwOneTimeBurst := int64(1000) + netRateLimiterOpsMaxRate := int64(0) + netRateLimiterOpsOneTimeBurst := int64(1000) for _, file := range []string{imagePath, hypervisorPath, kernelPath, virtioFsDaemon} { err := createEmptyFile(file) @@ -817,11 +821,15 @@ func TestNewClhHypervisorConfig(t *testing.T) { } hypervisor := hypervisor{ - Path: hypervisorPath, - Kernel: kernelPath, - Image: imagePath, - VirtioFSDaemon: virtioFsDaemon, - VirtioFSCache: "always", + Path: hypervisorPath, + Kernel: kernelPath, + Image: imagePath, + VirtioFSDaemon: virtioFsDaemon, + VirtioFSCache: "always", + NetRateLimiterBwMaxRate: netRateLimiterBwMaxRate, + NetRateLimiterBwOneTimeBurst: netRateLimiterBwOneTimeBurst, + NetRateLimiterOpsMaxRate: netRateLimiterOpsMaxRate, + NetRateLimiterOpsOneTimeBurst: netRateLimiterOpsOneTimeBurst, } config, err := newClhHypervisorConfig(hypervisor) if err != nil { @@ -852,6 +860,22 @@ func TestNewClhHypervisorConfig(t *testing.T) { t.Errorf("Expected VirtioFSCache %v, got %v", true, config.VirtioFSCache) } + if config.NetRateLimiterBwMaxRate != netRateLimiterBwMaxRate { + 
t.Errorf("Expected value for network bandwidth rate limiter %v, got %v", netRateLimiterBwMaxRate, config.NetRateLimiterBwMaxRate) + } + + if config.NetRateLimiterBwOneTimeBurst != netRateLimiterBwOneTimeBurst { + t.Errorf("Expected value for network bandwidth one time burst %v, got %v", netRateLimiterBwOneTimeBurst, config.NetRateLimiterBwOneTimeBurst) + } + + if config.NetRateLimiterOpsMaxRate != netRateLimiterOpsMaxRate { + t.Errorf("Expected value for network operations rate limiter %v, got %v", netRateLimiterOpsMaxRate, config.NetRateLimiterOpsMaxRate) + } + + // We expect 0 (zero) here as netRateLimiterOpsMaxRate is not set (set to zero). + if config.NetRateLimiterOpsOneTimeBurst != 0 { + t.Errorf("Expected value for network operations one time burst %v, got %v", netRateLimiterOpsOneTimeBurst, config.NetRateLimiterOpsOneTimeBurst) + } } func TestHypervisorDefaults(t *testing.T) { From be1bb7e39f876f693a1d224ce90678be6a856ecb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 21 Apr 2022 03:21:17 +0200 Subject: [PATCH 14/29] utils: Move FC's function to revert bytes to utils MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Firecracker's revertBytes function, now called "RevertBytes", can be exposed as part of the virtcontainers' utils file, as this function will be reused by Cloud Hypervisor, when adding the rate limiter logic there. Signed-off-by: Fabiano Fidêncio --- src/runtime/virtcontainers/fc.go | 16 ++-------------- src/runtime/virtcontainers/fc_test.go | 11 ----------- src/runtime/virtcontainers/utils/utils.go | 16 ++++++++++++++++ src/runtime/virtcontainers/utils/utils_test.go | 11 +++++++++++ 4 files changed, 29 insertions(+), 25 deletions(-) diff --git a/src/runtime/virtcontainers/fc.go b/src/runtime/virtcontainers/fc.go index 4630a810c..cb6e3a260 100644 --- a/src/runtime/virtcontainers/fc.go +++ b/src/runtime/virtcontainers/fc.go @@ -938,7 +938,7 @@ func (fc *firecracker) fcAddNetDevice(ctx context.Context, endpoint Endpoint) { // kata-defined rxSize is in bits with scaling factors of 1000, but firecracker-defined // rxSize is in bytes with scaling factors of 1024, need reversion. - rxSize = revertBytes(rxSize / 8) + rxSize = utils.RevertBytes(rxSize / 8) rxTokenBucket := models.TokenBucket{ RefillTime: &refillTime, Size: &rxSize, @@ -955,7 +955,7 @@ func (fc *firecracker) fcAddNetDevice(ctx context.Context, endpoint Endpoint) { // kata-defined txSize is in bits with scaling factors of 1000, but firecracker-defined // txSize is in bytes with scaling factors of 1024, need reversion. - txSize = revertBytes(txSize / 8) + txSize = utils.RevertBytes(txSize / 8) txTokenBucket := models.TokenBucket{ RefillTime: &refillTime, Size: &txSize, @@ -1266,15 +1266,3 @@ func (fc *firecracker) GenerateSocket(id string) (interface{}, error) { func (fc *firecracker) IsRateLimiterBuiltin() bool { return true } - -// In firecracker, it accepts the size of rate limiter in scaling factors of 2^10(1024) -// But in kata-defined rate limiter, for better Human-readability, we prefer scaling factors of 10^3(1000). -// func revertByte reverts num from scaling factors of 1000 to 1024, e.g. 10000000(10MB) to 10485760. 
-func revertBytes(num uint64) uint64 {
- a := num / 1000
- b := num % 1000
- if a == 0 {
- return num
- }
- return 1024*revertBytes(a) + b
-}
diff --git a/src/runtime/virtcontainers/fc_test.go b/src/runtime/virtcontainers/fc_test.go
index f2b099f1d..78ab704ee 100644
--- a/src/runtime/virtcontainers/fc_test.go
+++ b/src/runtime/virtcontainers/fc_test.go
@@ -50,17 +50,6 @@ func TestFCTruncateID(t *testing.T) {
 assert.Equal(expectedID, id)
 }
 
-func TestRevertBytes(t *testing.T) {
- assert := assert.New(t)
-
- //10MB
- testNum := uint64(10000000)
- expectedNum := uint64(10485760)
-
- num := revertBytes(testNum)
- assert.Equal(expectedNum, num)
-}
-
 func TestFCParseVersion(t *testing.T) {
 assert := assert.New(t)
 
diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go
index 88c29cec5..048446fd7 100644
--- a/src/runtime/virtcontainers/utils/utils.go
+++ b/src/runtime/virtcontainers/utils/utils.go
@@ -458,3 +458,19 @@ func getAllParentPaths(path string) []string {
 // remove the "/" or "." from the return result
 return paths[1:]
 }
+
+// In Cloud Hypervisor, as well as in Firecracker, the crate used by the VMMs
+// accepts the size of rate limiter in scaling factors of 2^10(1024).
+// But in kata-defined rate limiter, for better Human-readability, we prefer
+// scaling factors of 10^3(1000).
+//
+// func revertBytes reverts num from scaling factors of 1000 to 1024, e.g.
+// 10000000(10MB) to 10485760.
+func RevertBytes(num uint64) uint64 {
+ a := num / 1000
+ b := num % 1000
+ if a == 0 {
+ return num
+ }
+ return 1024*RevertBytes(a) + b
+}
diff --git a/src/runtime/virtcontainers/utils/utils_test.go b/src/runtime/virtcontainers/utils/utils_test.go
index aac6fef90..8f3d0eed2 100644
--- a/src/runtime/virtcontainers/utils/utils_test.go
+++ b/src/runtime/virtcontainers/utils/utils_test.go
@@ -569,3 +569,14 @@ func TestGetAllParentPaths(t *testing.T) {
 assert.Equal(tc.parents, getAllParentPaths(tc.targetPath))
 }
 }
+
+func TestRevertBytes(t *testing.T) {
+ assert := assert.New(t)
+
+ //10MB
+ testNum := uint64(10000000)
+ expectedNum := uint64(10485760)
+
+ num := RevertBytes(testNum)
+ assert.Equal(expectedNum, num)
+}

From 00a5b1bda92f11e6ba9f5373260fc6afb873845c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?=
Date: Thu, 21 Apr 2022 03:29:07 +0200
Subject: [PATCH 15/29] utils: Define DefaultRateLimiterRefillTimeMilliSecs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Firecracker's driver doesn't expose the RefillTime option of the rate
limiter to the user. Instead, it uses a constant value of 1000
milliseconds (1 second).

As we're following Firecracker's driver implementation, let's create a
new constant and expose it, use it as part of the Firecracker driver,
and later on re-use it as part of the Cloud Hypervisor driver.

Signed-off-by: Fabiano Fidêncio
---
 src/runtime/virtcontainers/fc.go | 2 +-
 src/runtime/virtcontainers/utils/utils.go | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/runtime/virtcontainers/fc.go b/src/runtime/virtcontainers/fc.go
index cb6e3a260..384ce15a1 100644
--- a/src/runtime/virtcontainers/fc.go
+++ b/src/runtime/virtcontainers/fc.go
@@ -930,7 +930,7 @@ func (fc *firecracker) fcAddNetDevice(ctx context.Context, endpoint Endpoint) {
 // The implementation of rate limiter is based on TBF.
 // Rate Limiter defines a token bucket with a maximum capacity (size) to store tokens, and an interval for refilling purposes (refill_time).
// The refill-rate is derived from size and refill_time, and it is the constant rate at which the tokens replenish. - refillTime := uint64(1000) + refillTime := uint64(utils.DefaultRateLimiterRefillTimeMilliSecs) var rxRateLimiter models.RateLimiter rxSize := fc.config.RxRateLimiterMaxRate if rxSize > 0 { diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go index 048446fd7..5a6bb7150 100644 --- a/src/runtime/virtcontainers/utils/utils.go +++ b/src/runtime/virtcontainers/utils/utils.go @@ -25,6 +25,11 @@ const cpBinaryName = "cp" const fileMode0755 = os.FileMode(0755) +// The DefaultRateLimiterRefillTime is used for calculating the rate at +// which a TokenBucket is replinished, in cases where a RateLimiter is +// applied to either network or disk I/O. +const DefaultRateLimiterRefillTimeMilliSecs = 1000 + // MibToBytesShift the number to shift needed to convert MiB to Bytes const MibToBytesShift = 20 From 1cf946929742a235e455387bf52bf9778b8236bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Tue, 19 Apr 2022 13:22:27 +0200 Subject: [PATCH 16/29] clh: Implement the Network RateLimiter logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's take advantage of the newly added NetRateLimiter* options and apply those to the network device configuration. The logic here is quite similar to the one already present in the Firecracker's driver, with the main difference being the single Inbound / Outbound MaxRate and the presence of both Bandwidth and Operations rate limiter. Signed-off-by: Fabiano Fidêncio --- src/runtime/virtcontainers/clh.go | 44 ++++++ src/runtime/virtcontainers/clh_test.go | 201 +++++++++++++++++++++++-- 2 files changed, 234 insertions(+), 11 deletions(-) diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index d01415066..30dc42c47 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -1304,6 +1304,44 @@ func (clh *cloudHypervisor) addVSock(cid int64, path string) { clh.vmconfig.Vsock = chclient.NewVsockConfig(cid, path) } +func (clh *cloudHypervisor) getRateLimiterConfig(bwSize, bwOneTimeBurst, opsSize, opsOneTimeBurst int64) *chclient.RateLimiterConfig { + if bwSize == 0 && opsSize == 0 { + return nil + } + + rateLimiterConfig := chclient.NewRateLimiterConfig() + + if bwSize != 0 { + bwTokenBucket := chclient.NewTokenBucket(bwSize, int64(utils.DefaultRateLimiterRefillTimeMilliSecs)) + + if bwOneTimeBurst != 0 { + bwTokenBucket.SetOneTimeBurst(bwOneTimeBurst) + } + + rateLimiterConfig.SetBandwidth(*bwTokenBucket) + } + + if opsSize != 0 { + opsTokenBucket := chclient.NewTokenBucket(opsSize, int64(utils.DefaultRateLimiterRefillTimeMilliSecs)) + + if opsOneTimeBurst != 0 { + opsTokenBucket.SetOneTimeBurst(opsOneTimeBurst) + } + + rateLimiterConfig.SetOps(*opsTokenBucket) + } + + return rateLimiterConfig +} + +func (clh *cloudHypervisor) getNetRateLimiterConfig() *chclient.RateLimiterConfig { + return clh.getRateLimiterConfig( + int64(utils.RevertBytes(uint64(clh.config.NetRateLimiterBwMaxRate/8))), + int64(utils.RevertBytes(uint64(clh.config.NetRateLimiterBwOneTimeBurst/8))), + clh.config.NetRateLimiterOpsMaxRate, + clh.config.NetRateLimiterOpsOneTimeBurst) +} + func (clh *cloudHypervisor) addNet(e Endpoint) error { clh.Logger().WithField("endpoint-type", e).Debugf("Adding Endpoint of type %v", e) @@ -1323,9 +1361,15 @@ func (clh *cloudHypervisor) addNet(e Endpoint) error { "tap": tapPath, 
}).Info("Adding Net") + netRateLimiterConfig := clh.getNetRateLimiterConfig() + net := chclient.NewNetConfig() net.Mac = &mac net.Tap = &tapPath + if netRateLimiterConfig != nil { + net.SetRateLimiterConfig(*netRateLimiterConfig) + } + if clh.vmconfig.Net != nil { *clh.vmconfig.Net = append(*clh.vmconfig.Net, *net) } else { diff --git a/src/runtime/virtcontainers/clh_test.go b/src/runtime/virtcontainers/clh_test.go index 9bfd2e62e..a764910f4 100644 --- a/src/runtime/virtcontainers/clh_test.go +++ b/src/runtime/virtcontainers/clh_test.go @@ -52,17 +52,21 @@ func newClhConfig() (HypervisorConfig, error) { } return HypervisorConfig{ - KernelPath: testClhKernelPath, - ImagePath: testClhImagePath, - HypervisorPath: testClhPath, - NumVCPUs: defaultVCPUs, - BlockDeviceDriver: config.VirtioBlock, - MemorySize: defaultMemSzMiB, - DefaultBridges: defaultBridges, - DefaultMaxVCPUs: uint32(64), - SharedFS: config.VirtioFS, - VirtioFSCache: virtioFsCacheAlways, - VirtioFSDaemon: testVirtiofsdPath, + KernelPath: testClhKernelPath, + ImagePath: testClhImagePath, + HypervisorPath: testClhPath, + NumVCPUs: defaultVCPUs, + BlockDeviceDriver: config.VirtioBlock, + MemorySize: defaultMemSzMiB, + DefaultBridges: defaultBridges, + DefaultMaxVCPUs: uint32(64), + SharedFS: config.VirtioFS, + VirtioFSCache: virtioFsCacheAlways, + VirtioFSDaemon: testVirtiofsdPath, + NetRateLimiterBwMaxRate: int64(0), + NetRateLimiterBwOneTimeBurst: int64(0), + NetRateLimiterOpsMaxRate: int64(0), + NetRateLimiterOpsOneTimeBurst: int64(0), }, nil } @@ -191,6 +195,181 @@ func TestCloudHypervisorAddNetCheckEnpointTypes(t *testing.T) { } } +// Check AddNet properly sets up the network rate limiter +func TestCloudHypervisorNetRateLimiter(t *testing.T) { + assert := assert.New(t) + + tapPath := "/path/to/tap" + + validVeth := &VethEndpoint{} + validVeth.NetPair.TapInterface.TAPIface.Name = tapPath + + type args struct { + bwMaxRate int64 + bwOneTimeBurst int64 + opsMaxRate int64 + opsOneTimeBurst int64 + } + + //nolint: govet + tests := []struct { + name string + args args + expectsRateLimiter bool + expectsBwBucketToken bool + expectsOpsBucketToken bool + }{ + // Bandwidth + { + "Bandwidth | max rate with one time burst", + args{ + bwMaxRate: int64(1000), + bwOneTimeBurst: int64(10000), + }, + true, // expectsRateLimiter + true, // expectsBwBucketToken + false, // expectsOpsBucketToken + }, + { + "Bandwidth | max rate without one time burst", + args{ + bwMaxRate: int64(1000), + }, + true, // expectsRateLimiter + true, // expectsBwBucketToken + false, // expectsOpsBucketToken + }, + { + "Bandwidth | no max rate with one time burst", + args{ + bwOneTimeBurst: int64(10000), + }, + false, // expectsRateLimiter + false, // expectsBwBucketToken + false, // expectsOpsBucketToken + }, + { + "Bandwidth | no max rate and no one time burst", + args{}, + false, // expectsRateLimiter + false, // expectsBwBucketToken + false, // expectsOpsBucketToken + }, + + // Operations + { + "Operations | max rate with one time burst", + args{ + opsMaxRate: int64(1000), + opsOneTimeBurst: int64(10000), + }, + true, // expectsRateLimiter + false, // expectsBwBucketToken + true, // expectsOpsBucketToken + }, + { + "Operations | max rate without one time burst", + args{ + opsMaxRate: int64(1000), + }, + true, // expectsRateLimiter + false, // expectsBwBucketToken + true, // expectsOpsBucketToken + }, + { + "Operations | no max rate with one time burst", + args{ + opsOneTimeBurst: int64(10000), + }, + false, // expectsRateLimiter + false, // expectsBwBucketToken + 
false, // expectsOpsBucketToken + }, + { + "Operations | no max rate and no one time burst", + args{}, + false, // expectsRateLimiter + false, // expectsBwBucketToken + false, // expectsOpsBucketToken + }, + + // Bandwidth and Operations + { + "Bandwidth and Operations | max rate with one time burst", + args{ + bwMaxRate: int64(1000), + bwOneTimeBurst: int64(10000), + opsMaxRate: int64(1000), + opsOneTimeBurst: int64(10000), + }, + true, // expectsRateLimiter + true, // expectsBwBucketToken + true, // expectsOpsBucketToken + }, + { + "Bandwidth and Operations | max rate without one time burst", + args{ + bwMaxRate: int64(1000), + opsMaxRate: int64(1000), + }, + true, // expectsRateLimiter + true, // expectsBwBucketToken + true, // expectsOpsBucketToken + }, + { + "Bandwidth and Operations | no max rate with one time burst", + args{ + bwOneTimeBurst: int64(10000), + opsOneTimeBurst: int64(10000), + }, + false, // expectsRateLimiter + false, // expectsBwBucketToken + false, // expectsOpsBucketToken + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + clhConfig, err := newClhConfig() + assert.NoError(err) + + clhConfig.NetRateLimiterBwMaxRate = tt.args.bwMaxRate + clhConfig.NetRateLimiterBwOneTimeBurst = tt.args.bwOneTimeBurst + clhConfig.NetRateLimiterOpsMaxRate = tt.args.opsMaxRate + clhConfig.NetRateLimiterOpsOneTimeBurst = tt.args.opsOneTimeBurst + + clh := &cloudHypervisor{} + clh.config = clhConfig + clh.APIClient = &clhClientMock{} + + if err := clh.addNet(validVeth); err != nil { + t.Errorf("cloudHypervisor.addNet() error = %v", err) + } else { + netConfig := (*clh.vmconfig.Net)[0] + + assert.Equal(netConfig.HasRateLimiterConfig(), tt.expectsRateLimiter) + if tt.expectsRateLimiter { + rateLimiterConfig := netConfig.GetRateLimiterConfig() + assert.Equal(rateLimiterConfig.HasBandwidth(), tt.expectsBwBucketToken) + assert.Equal(rateLimiterConfig.HasOps(), tt.expectsOpsBucketToken) + + if tt.expectsBwBucketToken { + bwBucketToken := rateLimiterConfig.GetBandwidth() + assert.Equal(bwBucketToken.GetSize(), int64(utils.RevertBytes(uint64(tt.args.bwMaxRate/8)))) + assert.Equal(bwBucketToken.GetOneTimeBurst(), int64(utils.RevertBytes(uint64(tt.args.bwOneTimeBurst/8)))) + } + + if tt.expectsOpsBucketToken { + opsBucketToken := rateLimiterConfig.GetOps() + assert.Equal(opsBucketToken.GetSize(), int64(tt.args.opsMaxRate)) + assert.Equal(opsBucketToken.GetOneTimeBurst(), int64(tt.args.opsOneTimeBurst)) + } + } + } + }) + } +} + func TestCloudHypervisorBootVM(t *testing.T) { clh := &cloudHypervisor{} clh.APIClient = &clhClientMock{} From 5b18575dfed267854ce23a022b7a922bd71c17ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Mon, 25 Apr 2022 15:30:21 +0200 Subject: [PATCH 17/29] hypervisor: Add disk bandwidth and operations rate limiters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is the disk counterpart of the what was introduced for the network as part of the previous commits in this series. The newly added fields are: * DiskRateLimiterBwMaxRate, defined in bits per second, which is used to control the network I/O bandwidth at the VM level. * DiskRateLimiterBwOneTimeBurst, also defined in bits per second, which is used to define an *initial* max rate, which doesn't replenish. * DiskRateLimiterOpsMaxRate, the operations per second equivalent of the DiskRateLimiterBwMaxRate. * DiskRateLimiterOpsOneTimeBurst, the operations per second equivalent of the DiskRateLimiterBwOneTimeBurst. 
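For orientation, here is a small, self-contained sketch of how the burst knobs are meant to behave: a one-time burst only adds an initial, non-replenishing credit, and it is honoured only when the corresponding max rate is set, mirroring the getDiskRateLimiter* helpers this patch adds to config.go. The helper and the values below are purely illustrative, not the series' actual code.

```Golang
package main

import "fmt"

// effectiveBurst mimics the guard applied by the getDiskRateLimiter*OneTimeBurst
// helpers: a burst configured without the matching max rate is ignored.
func effectiveBurst(maxRate, oneTimeBurst int64) int64 {
	if oneTimeBurst != 0 && maxRate == 0 {
		return 0
	}
	return oneTimeBurst
}

func main() {
	// Illustrative values: 100 Mbit/s sustained disk bandwidth plus a
	// 200 Mbit one-time burst, and an ops burst with no ops max rate set.
	fmt.Println(effectiveBurst(100_000_000, 200_000_000)) // 200000000 (kept)
	fmt.Println(effectiveBurst(0, 10_000))                // 0 (ignored)
}
```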
For now those extra fields have only been added to the hypervisor's configuration and they'll be used in the coming patches of this very same series. Signed-off-by: Fabiano Fidêncio --- src/runtime/pkg/katautils/config.go | 168 ++++++++++++++--------- src/runtime/virtcontainers/hypervisor.go | 18 +++ 2 files changed, 118 insertions(+), 68 deletions(-) diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 3cb37780d..0401b2180 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -74,74 +74,78 @@ type factory struct { } type hypervisor struct { - Path string `toml:"path"` - JailerPath string `toml:"jailer_path"` - Kernel string `toml:"kernel"` - CtlPath string `toml:"ctlpath"` - Initrd string `toml:"initrd"` - Image string `toml:"image"` - Firmware string `toml:"firmware"` - FirmwareVolume string `toml:"firmware_volume"` - MachineAccelerators string `toml:"machine_accelerators"` - CPUFeatures string `toml:"cpu_features"` - KernelParams string `toml:"kernel_params"` - MachineType string `toml:"machine_type"` - BlockDeviceDriver string `toml:"block_device_driver"` - EntropySource string `toml:"entropy_source"` - SharedFS string `toml:"shared_fs"` - VirtioFSDaemon string `toml:"virtio_fs_daemon"` - VirtioFSCache string `toml:"virtio_fs_cache"` - VhostUserStorePath string `toml:"vhost_user_store_path"` - FileBackedMemRootDir string `toml:"file_mem_backend"` - GuestHookPath string `toml:"guest_hook_path"` - GuestMemoryDumpPath string `toml:"guest_memory_dump_path"` - HypervisorPathList []string `toml:"valid_hypervisor_paths"` - JailerPathList []string `toml:"valid_jailer_paths"` - CtlPathList []string `toml:"valid_ctlpaths"` - VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"` - VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"` - PFlashList []string `toml:"pflashes"` - VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"` - FileBackedMemRootList []string `toml:"valid_file_mem_backends"` - EntropySourceList []string `toml:"valid_entropy_sources"` - EnableAnnotations []string `toml:"enable_annotations"` - RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"` - TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"` - MemOffset uint64 `toml:"memory_offset"` - NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"` - NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"` - NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"` - NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"` - VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"` - DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"` - MemorySize uint32 `toml:"default_memory"` - MemSlots uint32 `toml:"memory_slots"` - DefaultBridges uint32 `toml:"default_bridges"` - Msize9p uint32 `toml:"msize_9p"` - PCIeRootPort uint32 `toml:"pcie_root_port"` - NumVCPUs int32 `toml:"default_vcpus"` - BlockDeviceCacheSet bool `toml:"block_device_cache_set"` - BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"` - BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"` - EnableVhostUserStore bool `toml:"enable_vhost_user_store"` - DisableBlockDeviceUse bool `toml:"disable_block_device_use"` - MemPrealloc bool `toml:"enable_mem_prealloc"` - HugePages bool `toml:"enable_hugepages"` - VirtioMem bool `toml:"enable_virtio_mem"` - IOMMU bool `toml:"enable_iommu"` - IOMMUPlatform bool `toml:"enable_iommu_platform"` - Debug bool 
`toml:"enable_debug"` - DisableNestingChecks bool `toml:"disable_nesting_checks"` - EnableIOThreads bool `toml:"enable_iothreads"` - DisableImageNvdimm bool `toml:"disable_image_nvdimm"` - HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"` - DisableVhostNet bool `toml:"disable_vhost_net"` - GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` - ConfidentialGuest bool `toml:"confidential_guest"` - GuestSwap bool `toml:"enable_guest_swap"` - Rootless bool `toml:"rootless"` - DisableSeccomp bool `toml:"disable_seccomp"` - DisableSeLinux bool `toml:"disable_selinux"` + Path string `toml:"path"` + JailerPath string `toml:"jailer_path"` + Kernel string `toml:"kernel"` + CtlPath string `toml:"ctlpath"` + Initrd string `toml:"initrd"` + Image string `toml:"image"` + Firmware string `toml:"firmware"` + FirmwareVolume string `toml:"firmware_volume"` + MachineAccelerators string `toml:"machine_accelerators"` + CPUFeatures string `toml:"cpu_features"` + KernelParams string `toml:"kernel_params"` + MachineType string `toml:"machine_type"` + BlockDeviceDriver string `toml:"block_device_driver"` + EntropySource string `toml:"entropy_source"` + SharedFS string `toml:"shared_fs"` + VirtioFSDaemon string `toml:"virtio_fs_daemon"` + VirtioFSCache string `toml:"virtio_fs_cache"` + VhostUserStorePath string `toml:"vhost_user_store_path"` + FileBackedMemRootDir string `toml:"file_mem_backend"` + GuestHookPath string `toml:"guest_hook_path"` + GuestMemoryDumpPath string `toml:"guest_memory_dump_path"` + HypervisorPathList []string `toml:"valid_hypervisor_paths"` + JailerPathList []string `toml:"valid_jailer_paths"` + CtlPathList []string `toml:"valid_ctlpaths"` + VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"` + VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"` + PFlashList []string `toml:"pflashes"` + VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"` + FileBackedMemRootList []string `toml:"valid_file_mem_backends"` + EntropySourceList []string `toml:"valid_entropy_sources"` + EnableAnnotations []string `toml:"enable_annotations"` + RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"` + TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"` + MemOffset uint64 `toml:"memory_offset"` + DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"` + DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"` + DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"` + DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"` + NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"` + NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"` + NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"` + NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"` + VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"` + DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"` + MemorySize uint32 `toml:"default_memory"` + MemSlots uint32 `toml:"memory_slots"` + DefaultBridges uint32 `toml:"default_bridges"` + Msize9p uint32 `toml:"msize_9p"` + PCIeRootPort uint32 `toml:"pcie_root_port"` + NumVCPUs int32 `toml:"default_vcpus"` + BlockDeviceCacheSet bool `toml:"block_device_cache_set"` + BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"` + BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"` + EnableVhostUserStore bool `toml:"enable_vhost_user_store"` + DisableBlockDeviceUse bool 
`toml:"disable_block_device_use"` + MemPrealloc bool `toml:"enable_mem_prealloc"` + HugePages bool `toml:"enable_hugepages"` + VirtioMem bool `toml:"enable_virtio_mem"` + IOMMU bool `toml:"enable_iommu"` + IOMMUPlatform bool `toml:"enable_iommu_platform"` + Debug bool `toml:"enable_debug"` + DisableNestingChecks bool `toml:"disable_nesting_checks"` + EnableIOThreads bool `toml:"enable_iothreads"` + DisableImageNvdimm bool `toml:"disable_image_nvdimm"` + HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"` + DisableVhostNet bool `toml:"disable_vhost_net"` + GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` + ConfidentialGuest bool `toml:"confidential_guest"` + GuestSwap bool `toml:"enable_guest_swap"` + Rootless bool `toml:"rootless"` + DisableSeccomp bool `toml:"disable_seccomp"` + DisableSeLinux bool `toml:"disable_selinux"` } type runtime struct { @@ -486,6 +490,34 @@ func (h hypervisor) getInitrdAndImage() (initrd string, image string, err error) return } +func (h hypervisor) getDiskRateLimiterBwMaxRate() int64 { + return h.DiskRateLimiterBwMaxRate +} + +func (h hypervisor) getDiskRateLimiterBwOneTimeBurst() int64 { + if h.DiskRateLimiterBwOneTimeBurst != 0 && h.getDiskRateLimiterBwMaxRate() == 0 { + kataUtilsLogger.Warn("The DiskRateLimiterBwOneTimeBurst is set but DiskRateLimiterBwMaxRate is not set, this option will be ignored.") + + h.DiskRateLimiterBwOneTimeBurst = 0 + } + + return h.DiskRateLimiterBwOneTimeBurst +} + +func (h hypervisor) getDiskRateLimiterOpsMaxRate() int64 { + return h.DiskRateLimiterOpsMaxRate +} + +func (h hypervisor) getDiskRateLimiterOpsOneTimeBurst() int64 { + if h.DiskRateLimiterOpsOneTimeBurst != 0 && h.getDiskRateLimiterOpsMaxRate() == 0 { + kataUtilsLogger.Warn("The DiskRateLimiterOpsOneTimeBurst is set but DiskRateLimiterOpsMaxRate is not set, this option will be ignored.") + + h.DiskRateLimiterOpsOneTimeBurst = 0 + } + + return h.DiskRateLimiterOpsOneTimeBurst +} + func (h hypervisor) getRxRateLimiterCfg() uint64 { return h.RxRateLimiterMaxRate } diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index fa9b66867..c689eae3c 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -380,6 +380,24 @@ type HypervisorConfig struct { // Enable SGX. Hardware-based isolation and memory encryption. SGXEPCSize int64 + // DiskRateLimiterBwRate is used to control disk I/O bandwidth on VM level. + // The same value, defined in bits per second, is used for inbound and outbound bandwidth. + DiskRateLimiterBwMaxRate int64 + + // DiskRateLimiterBwOneTimeBurst is used to control disk I/O bandwidth on VM level. + // This increases the initial max rate and this initial extra credit does *NOT* replenish + // and can be used for an *initial* burst of data. + DiskRateLimiterBwOneTimeBurst int64 + + // DiskRateLimiterOpsRate is used to control disk I/O operations on VM level. + // The same value, defined in operations per second, is used for inbound and outbound bandwidth. + DiskRateLimiterOpsMaxRate int64 + + // DiskRateLimiterOpsOneTimeBurst is used to control disk I/O operations on VM level. + // This increases the initial max rate and this initial extra credit does *NOT* replenish + // and can be used for an *initial* burst of data. + DiskRateLimiterOpsOneTimeBurst int64 + // RxRateLimiterMaxRate is used to control network I/O inbound bandwidth on VM level. 
RxRateLimiterMaxRate uint64 From 511f7f822d28843ee64571129cbd24f4816ab048 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Mon, 25 Apr 2022 15:35:15 +0200 Subject: [PATCH 18/29] config: Add DiskRateLimiter* to Cloud Hypervisor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's add the newly added disk rate limiter configurations to the Cloud Hypervisor's hypervisor configuration. Right now those are not used anywhere, and there's absolutely no way the users can set those up. That's coming later in this very same series. Signed-off-by: Fabiano Fidêncio --- src/runtime/pkg/katautils/config.go | 104 ++++++++++++----------- src/runtime/pkg/katautils/config_test.go | 43 ++++++++-- 2 files changed, 88 insertions(+), 59 deletions(-) diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 0401b2180..ec02b4bc2 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -892,56 +892,60 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { } return vc.HypervisorConfig{ - HypervisorPath: hypervisor, - HypervisorPathList: h.HypervisorPathList, - KernelPath: kernel, - InitrdPath: initrd, - ImagePath: image, - FirmwarePath: firmware, - MachineAccelerators: machineAccelerators, - KernelParams: vc.DeserializeParams(strings.Fields(kernelParams)), - HypervisorMachineType: machineType, - NumVCPUs: h.defaultVCPUs(), - DefaultMaxVCPUs: h.defaultMaxVCPUs(), - MemorySize: h.defaultMemSz(), - MemSlots: h.defaultMemSlots(), - MemOffset: h.defaultMemOffset(), - VirtioMem: h.VirtioMem, - EntropySource: h.GetEntropySource(), - EntropySourceList: h.EntropySourceList, - DefaultBridges: h.defaultBridges(), - DisableBlockDeviceUse: h.DisableBlockDeviceUse, - SharedFS: sharedFS, - VirtioFSDaemon: h.VirtioFSDaemon, - VirtioFSDaemonList: h.VirtioFSDaemonList, - VirtioFSCacheSize: h.VirtioFSCacheSize, - VirtioFSCache: h.VirtioFSCache, - MemPrealloc: h.MemPrealloc, - HugePages: h.HugePages, - FileBackedMemRootDir: h.FileBackedMemRootDir, - FileBackedMemRootList: h.FileBackedMemRootList, - Debug: h.Debug, - DisableNestingChecks: h.DisableNestingChecks, - BlockDeviceDriver: blockDriver, - BlockDeviceCacheSet: h.BlockDeviceCacheSet, - BlockDeviceCacheDirect: h.BlockDeviceCacheDirect, - BlockDeviceCacheNoflush: h.BlockDeviceCacheNoflush, - EnableIOThreads: h.EnableIOThreads, - Msize9p: h.msize9p(), - HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, - PCIeRootPort: h.PCIeRootPort, - DisableVhostNet: true, - GuestHookPath: h.guestHookPath(), - VirtioFSExtraArgs: h.VirtioFSExtraArgs, - SGXEPCSize: defaultSGXEPCSize, - EnableAnnotations: h.EnableAnnotations, - DisableSeccomp: h.DisableSeccomp, - ConfidentialGuest: h.ConfidentialGuest, - DisableSeLinux: h.DisableSeLinux, - NetRateLimiterBwMaxRate: h.getNetRateLimiterBwMaxRate(), - NetRateLimiterBwOneTimeBurst: h.getNetRateLimiterBwOneTimeBurst(), - NetRateLimiterOpsMaxRate: h.getNetRateLimiterOpsMaxRate(), - NetRateLimiterOpsOneTimeBurst: h.getNetRateLimiterOpsOneTimeBurst(), + HypervisorPath: hypervisor, + HypervisorPathList: h.HypervisorPathList, + KernelPath: kernel, + InitrdPath: initrd, + ImagePath: image, + FirmwarePath: firmware, + MachineAccelerators: machineAccelerators, + KernelParams: vc.DeserializeParams(strings.Fields(kernelParams)), + HypervisorMachineType: machineType, + NumVCPUs: h.defaultVCPUs(), + DefaultMaxVCPUs: h.defaultMaxVCPUs(), + MemorySize: h.defaultMemSz(), + MemSlots: h.defaultMemSlots(), + MemOffset: 
h.defaultMemOffset(), + VirtioMem: h.VirtioMem, + EntropySource: h.GetEntropySource(), + EntropySourceList: h.EntropySourceList, + DefaultBridges: h.defaultBridges(), + DisableBlockDeviceUse: h.DisableBlockDeviceUse, + SharedFS: sharedFS, + VirtioFSDaemon: h.VirtioFSDaemon, + VirtioFSDaemonList: h.VirtioFSDaemonList, + VirtioFSCacheSize: h.VirtioFSCacheSize, + VirtioFSCache: h.VirtioFSCache, + MemPrealloc: h.MemPrealloc, + HugePages: h.HugePages, + FileBackedMemRootDir: h.FileBackedMemRootDir, + FileBackedMemRootList: h.FileBackedMemRootList, + Debug: h.Debug, + DisableNestingChecks: h.DisableNestingChecks, + BlockDeviceDriver: blockDriver, + BlockDeviceCacheSet: h.BlockDeviceCacheSet, + BlockDeviceCacheDirect: h.BlockDeviceCacheDirect, + BlockDeviceCacheNoflush: h.BlockDeviceCacheNoflush, + EnableIOThreads: h.EnableIOThreads, + Msize9p: h.msize9p(), + HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, + PCIeRootPort: h.PCIeRootPort, + DisableVhostNet: true, + GuestHookPath: h.guestHookPath(), + VirtioFSExtraArgs: h.VirtioFSExtraArgs, + SGXEPCSize: defaultSGXEPCSize, + EnableAnnotations: h.EnableAnnotations, + DisableSeccomp: h.DisableSeccomp, + ConfidentialGuest: h.ConfidentialGuest, + DisableSeLinux: h.DisableSeLinux, + NetRateLimiterBwMaxRate: h.getNetRateLimiterBwMaxRate(), + NetRateLimiterBwOneTimeBurst: h.getNetRateLimiterBwOneTimeBurst(), + NetRateLimiterOpsMaxRate: h.getNetRateLimiterOpsMaxRate(), + NetRateLimiterOpsOneTimeBurst: h.getNetRateLimiterOpsOneTimeBurst(), + DiskRateLimiterBwMaxRate: h.getDiskRateLimiterBwMaxRate(), + DiskRateLimiterBwOneTimeBurst: h.getDiskRateLimiterBwOneTimeBurst(), + DiskRateLimiterOpsMaxRate: h.getDiskRateLimiterOpsMaxRate(), + DiskRateLimiterOpsOneTimeBurst: h.getDiskRateLimiterOpsOneTimeBurst(), }, nil } diff --git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index 778f6ec6d..0bcac2137 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -814,6 +814,10 @@ func TestNewClhHypervisorConfig(t *testing.T) { netRateLimiterBwOneTimeBurst := int64(1000) netRateLimiterOpsMaxRate := int64(0) netRateLimiterOpsOneTimeBurst := int64(1000) + diskRateLimiterBwMaxRate := int64(1000) + diskRateLimiterBwOneTimeBurst := int64(1000) + diskRateLimiterOpsMaxRate := int64(0) + diskRateLimiterOpsOneTimeBurst := int64(1000) for _, file := range []string{imagePath, hypervisorPath, kernelPath, virtioFsDaemon} { err := createEmptyFile(file) @@ -821,15 +825,19 @@ func TestNewClhHypervisorConfig(t *testing.T) { } hypervisor := hypervisor{ - Path: hypervisorPath, - Kernel: kernelPath, - Image: imagePath, - VirtioFSDaemon: virtioFsDaemon, - VirtioFSCache: "always", - NetRateLimiterBwMaxRate: netRateLimiterBwMaxRate, - NetRateLimiterBwOneTimeBurst: netRateLimiterBwOneTimeBurst, - NetRateLimiterOpsMaxRate: netRateLimiterOpsMaxRate, - NetRateLimiterOpsOneTimeBurst: netRateLimiterOpsOneTimeBurst, + Path: hypervisorPath, + Kernel: kernelPath, + Image: imagePath, + VirtioFSDaemon: virtioFsDaemon, + VirtioFSCache: "always", + NetRateLimiterBwMaxRate: netRateLimiterBwMaxRate, + NetRateLimiterBwOneTimeBurst: netRateLimiterBwOneTimeBurst, + NetRateLimiterOpsMaxRate: netRateLimiterOpsMaxRate, + NetRateLimiterOpsOneTimeBurst: netRateLimiterOpsOneTimeBurst, + DiskRateLimiterBwMaxRate: diskRateLimiterBwMaxRate, + DiskRateLimiterBwOneTimeBurst: diskRateLimiterBwOneTimeBurst, + DiskRateLimiterOpsMaxRate: diskRateLimiterOpsMaxRate, + DiskRateLimiterOpsOneTimeBurst: diskRateLimiterOpsOneTimeBurst, } config, err 
:= newClhHypervisorConfig(hypervisor)
 if err != nil {
@@ -876,6 +884,23 @@ func TestNewClhHypervisorConfig(t *testing.T) {
 if config.NetRateLimiterOpsOneTimeBurst != 0 {
 t.Errorf("Expected value for network operations one time burst %v, got %v", netRateLimiterOpsOneTimeBurst, config.NetRateLimiterOpsOneTimeBurst)
 }
+
+ if config.DiskRateLimiterBwMaxRate != diskRateLimiterBwMaxRate {
+ t.Errorf("Expected value for disk bandwidth rate limiter %v, got %v", diskRateLimiterBwMaxRate, config.DiskRateLimiterBwMaxRate)
+ }
+
+ if config.DiskRateLimiterBwOneTimeBurst != diskRateLimiterBwOneTimeBurst {
+ t.Errorf("Expected value for disk bandwidth one time burst %v, got %v", diskRateLimiterBwOneTimeBurst, config.DiskRateLimiterBwOneTimeBurst)
+ }
+
+ if config.DiskRateLimiterOpsMaxRate != diskRateLimiterOpsMaxRate {
+ t.Errorf("Expected value for disk operations rate limiter %v, got %v", diskRateLimiterOpsMaxRate, config.DiskRateLimiterOpsMaxRate)
+ }
+
+ // We expect 0 (zero) here as diskRateLimiterOpsMaxRate is not set (set to zero).
+ if config.DiskRateLimiterOpsOneTimeBurst != 0 {
+ t.Errorf("Expected value for disk operations one time burst %v, got %v", diskRateLimiterOpsOneTimeBurst, config.DiskRateLimiterOpsOneTimeBurst)
+ }
 }
 
 func TestHypervisorDefaults(t *testing.T) {

From 63c4da03a925003e313d39f5e649ff0151479ae0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?=
Date: Mon, 25 Apr 2022 16:04:57 +0200
Subject: [PATCH 19/29] clh: Implement the Disk RateLimiter logic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Let's take advantage of the newly added DiskRateLimiter* options and
apply those to the disk device configuration.

The logic here is identical to the one already present in the
Network part of Cloud Hypervisor's driver.
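As with the network path, the bandwidth knobs are given in bits per second with 1000-based scaling, while the VMM's token bucket expects a size in bytes with 1024-based scaling. The sketch below is a self-contained illustration of that conversion, using a local copy of the RevertBytes helper introduced earlier in this series and a made-up rate; it is not the driver code itself.

```Golang
package main

import "fmt"

// revertBytes is a local copy of utils.RevertBytes from this series: it turns a
// value expressed with 1000-based scaling into its 1024-based equivalent,
// e.g. 10000000 (10MB) into 10485760.
func revertBytes(num uint64) uint64 {
	a := num / 1000
	b := num % 1000
	if a == 0 {
		return num
	}
	return 1024*revertBytes(a) + b
}

func main() {
	// Illustrative value only: a disk (or net) bandwidth limit of 80 Mbit/s.
	maxRateBits := uint64(80_000_000)

	// bits/s -> bytes/s (still 1000-scaled), then 1000-scaled -> 1024-scaled,
	// which is the token bucket size handed to Cloud Hypervisor.
	bucketSize := revertBytes(maxRateBits / 8)

	fmt.Println(bucketSize) // 10485760
}
```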
Signed-off-by: Fabiano Fidêncio --- src/runtime/virtcontainers/clh.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index 30dc42c47..b18276a85 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -442,6 +442,11 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net disk := chclient.NewDiskConfig(imagePath) disk.SetReadonly(true) + diskRateLimiterConfig := clh.getDiskRateLimiterConfig() + if diskRateLimiterConfig != nil { + disk.SetRateLimiterConfig(*diskRateLimiterConfig) + } + if clh.vmconfig.Disks != nil { *clh.vmconfig.Disks = append(*clh.vmconfig.Disks, *disk) } else { @@ -667,6 +672,11 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro clhDisk.VhostUser = func(b bool) *bool { return &b }(false) clhDisk.Id = &driveID + diskRateLimiterConfig := clh.getDiskRateLimiterConfig() + if diskRateLimiterConfig != nil { + clhDisk.SetRateLimiterConfig(*diskRateLimiterConfig) + } + pciInfo, _, err := cl.VmAddDiskPut(ctx, clhDisk) if err != nil { @@ -1342,6 +1352,14 @@ func (clh *cloudHypervisor) getNetRateLimiterConfig() *chclient.RateLimiterConfi clh.config.NetRateLimiterOpsOneTimeBurst) } +func (clh *cloudHypervisor) getDiskRateLimiterConfig() *chclient.RateLimiterConfig { + return clh.getRateLimiterConfig( + int64(utils.RevertBytes(uint64(clh.config.DiskRateLimiterBwMaxRate/8))), + int64(utils.RevertBytes(uint64(clh.config.DiskRateLimiterBwOneTimeBurst/8))), + clh.config.DiskRateLimiterOpsMaxRate, + clh.config.DiskRateLimiterOpsOneTimeBurst) +} + func (clh *cloudHypervisor) addNet(e Endpoint) error { clh.Logger().WithField("endpoint-type", e).Debugf("Adding Endpoint of type %v", e) From a88adabaaecbb448120d3e78a703fbf61e7900e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Tue, 19 Apr 2022 13:23:14 +0200 Subject: [PATCH 20/29] clh: Cloud Hypervisor has a built-in Rate Limiter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The notion of "built-in rate limiter" was added as part of bd8658e3627c450921c63bde3fe8d7458a01b242, and that commit considered that only Firecracker had a built-in rate limiter, which I think was the case when that was introduced (mid 2020). Nowadays, however, Cloud Hypervisor takes advantage of the very same crate used by Firecraker to do I/O throttling. Signed-off-by: Fabiano Fidêncio --- src/runtime/virtcontainers/clh.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index b18276a85..41d00ee27 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -1497,5 +1497,5 @@ func (clh *cloudHypervisor) vmInfo() (chclient.VmInfo, error) { } func (clh *cloudHypervisor) IsRateLimiterBuiltin() bool { - return false + return true } From 7580bb5a78bf61cff350d0089e4eb74211e6a7f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 21 Apr 2022 00:58:56 +0200 Subject: [PATCH 21/29] clh: Expose net rate limiter config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With everything implemented, let's now expose the net rate limiter configuration options in the Cloud Hypervisor configuration file. 
Fixes: #4017 Signed-off-by: Fabiano Fidêncio --- src/runtime/config/configuration-clh.toml.in | 36 ++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in index b3cfbd583..5dfe2082d 100644 --- a/src/runtime/config/configuration-clh.toml.in +++ b/src/runtime/config/configuration-clh.toml.in @@ -180,6 +180,42 @@ block_device_driver = "virtio-blk" # but it will not abort container execution. #guest_hook_path = "/usr/share/oci/hooks" # +# These options are related to network rate limiter at the VMM level, and are +# based on the Cloud Hypervisor I/O throttling. Those are disabled by default +# and we strongly advise users to refer the Cloud Hypervisor official +# documentation for a better understanding of its internals: +# https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/docs/io_throttling.md +# +# Bandwidth rate limiter options +# +# net_rate_limiter_bw_max_rate controls network I/O bandwidth (size in bits/sec +# for SB/VM). +# The same value is used for inbound and outbound bandwidth. +# Default 0-sized value means unlimited rate. +#net_rate_limiter_bw_max_rate = 0 +# +# net_rate_limiter_bw_one_time_burst increases the initial max rate and this +# initial extra credit does *NOT* affect the overall limit and can be used for +# an *initial* burst of data. +# This is *optional* and only takes effect if net_rate_limiter_bw_max_rate is +# set to a non zero value. +#net_rate_limiter_bw_one_time_burst = 0 +# +# Operation rate limiter options +# +# net_rate_limiter_ops_max_rate controls network I/O bandwidth (size in ops/sec +# for SB/VM). +# The same value is used for inbound and outbound bandwidth. +# Default 0-sized value means unlimited rate. +#net_rate_limiter_ops_max_rate = 0 +# +# net_rate_limiter_ops_one_time_burst increases the initial max rate and this +# initial extra credit does *NOT* affect the overall limit and can be used for +# an *initial* burst of data. +# This is *optional* and only takes effect if net_rate_limiter_bw_max_rate is +# set to a non zero value. +#net_rate_limiter_ops_one_time_burst = 0 + [agent.@PROJECT_TYPE@] # If enabled, make the agent display debug-level messages. # (default: disabled) From b6467ddd73458324abe9b8c470a78d9a60f396ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Mon, 25 Apr 2022 16:53:58 +0200 Subject: [PATCH 22/29] clh: Expose disk rate limiter config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With everything implemented, let's now expose the disk rate limiter configuration options in the Cloud Hypervisor configuration file. Fixes: #4139 Signed-off-by: Fabiano Fidêncio --- src/runtime/config/configuration-clh.toml.in | 36 ++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in index 5dfe2082d..41d4ae493 100644 --- a/src/runtime/config/configuration-clh.toml.in +++ b/src/runtime/config/configuration-clh.toml.in @@ -215,6 +215,42 @@ block_device_driver = "virtio-blk" # This is *optional* and only takes effect if net_rate_limiter_bw_max_rate is # set to a non zero value. #net_rate_limiter_ops_one_time_burst = 0 +# +# These options are related to disk rate limiter at the VMM level, and are +# based on the Cloud Hypervisor I/O throttling. 
Those are disabled by default +# and we strongly advise users to refer the Cloud Hypervisor official +# documentation for a better understanding of its internals: +# https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/docs/io_throttling.md +# +# Bandwidth rate limiter options +# +# disk_rate_limiter_bw_max_rate controls disk I/O bandwidth (size in bits/sec +# for SB/VM). +# The same value is used for inbound and outbound bandwidth. +# Default 0-sized value means unlimited rate. +#disk_rate_limiter_bw_max_rate = 0 +# +# disk_rate_limiter_bw_one_time_burst increases the initial max rate and this +# initial extra credit does *NOT* affect the overall limit and can be used for +# an *initial* burst of data. +# This is *optional* and only takes effect if disk_rate_limiter_bw_max_rate is +# set to a non zero value. +#disk_rate_limiter_bw_one_time_burst = 0 +# +# Operation rate limiter options +# +# disk_rate_limiter_ops_max_rate controls disk I/O bandwidth (size in ops/sec +# for SB/VM). +# The same value is used for inbound and outbound bandwidth. +# Default 0-sized value means unlimited rate. +#disk_rate_limiter_ops_max_rate = 0 +# +# disk_rate_limiter_ops_one_time_burst increases the initial max rate and this +# initial extra credit does *NOT* affect the overall limit and can be used for +# an *initial* burst of data. +# This is *optional* and only takes effect if disk_rate_limiter_bw_max_rate is +# set to a non zero value. +#disk_rate_limiter_ops_one_time_burst = 0 [agent.@PROJECT_TYPE@] # If enabled, make the agent display debug-level messages. From 4b9b62bb3ec1720d9fd2ecd5460cd219f851a3b9 Mon Sep 17 00:00:00 2001 From: "James O. D. Hunt" Date: Thu, 28 Apr 2022 11:56:02 +0100 Subject: [PATCH 23/29] agent-ctl: Fix abstract socket connections Unbreak the `agent-ctl` tool connecting to the agent with a Unix domain socket. It appears that [1] changed the behaviour of connecting to the agent using a local Unix socket (which is not used by Kata under normal operation). The change can be seen by reverting to commit 72b8144b569f1d0fef3001642f244bf79b17ddc6 (the one before [1]) and running the agent manually as: ```bash $ sudo KATA_AGENT_SERVER_ADDR=unix:///tmp/foo.socket target/x86_64-unknown-linux-musl/release/kata-agent ``` Before [1], in another terminal we see this: ```bash $ sudo lsof -U 2>/dev/null |grep foo|awk '{print $9}' @/tmp/foo.socket@ ``` But now, we see the following: ```bash $ sudo lsof -U 2>/dev/null |grep foo|awk '{print $9}' @/tmp/foo.socket ``` Note the last byte which represents a nul (`\0`) value. The `agent-ctl` tool used to add that trailing nul but now it seems to not be needed, so this change removes it, restoring functionality. No external changes are necessary so the `agent-ctl` tool can connect to the agent as below like this: ```bash $ cargo run -- -l debug connect --server-address "unix://@/tmp/foo.socket" --bundle-dir "$bundle_dir" -c Check -c GetGuestDetails ``` [1] - https://github.com/kata-containers/kata-containers/issues/3124 Fixes: #4164. Signed-off-by: James O. D. 
Hunt --- src/tools/agent-ctl/src/client.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/tools/agent-ctl/src/client.rs b/src/tools/agent-ctl/src/client.rs index a4f1e45a8..bcaa87844 100644 --- a/src/tools/agent-ctl/src/client.rs +++ b/src/tools/agent-ctl/src/client.rs @@ -473,10 +473,8 @@ fn create_ttrpc_client( if path.starts_with('@') { abstract_socket = true; - // Remove the magic abstract-socket request character ('@') - // and crucially add a trailing nul terminator (required to - // interoperate with the ttrpc crate). - path = path[1..].to_string() + &"\x00".to_string(); + // Remove the magic abstract-socket request character ('@'). + path = path[1..].to_string(); } if abstract_socket { From 86d348e065db331817895e2c8c1935fdf2c509ef Mon Sep 17 00:00:00 2001 From: "James O. D. Hunt" Date: Thu, 28 Apr 2022 12:44:27 +0100 Subject: [PATCH 24/29] docs: Use VM term in agent-ctl doc Use the standard "VM" acronym to mean Virtual Machine. Signed-off-by: James O. D. Hunt --- src/tools/agent-ctl/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tools/agent-ctl/README.md b/src/tools/agent-ctl/README.md index 7a8d88d23..53b8646f7 100644 --- a/src/tools/agent-ctl/README.md +++ b/src/tools/agent-ctl/README.md @@ -4,7 +4,7 @@ The Kata Containers agent control tool (`kata-agent-ctl`) is a low-level test tool. It allows basic interaction with the Kata Containers agent, -`kata-agent`, that runs inside the virtual machine. +`kata-agent`, that runs inside the virtual machine (VM). Unlike the Kata Runtime, which only ever makes sequences of correctly ordered and valid agent API calls, this tool allows users to make arbitrary agent API @@ -117,7 +117,7 @@ establish the VSOCK guest CID value to connect to the agent. 1. Start a Kata Container -1. Establish the VSOCK guest CID number for the virtual machine: +1. Establish the VSOCK guest CID number for the VM: ```sh $ guest_cid=$(sudo ss -H --vsock | awk '{print $6}' | cut -d: -f1) From 666aee54d23acd21bdfa31bd46d9d7a0513c327e Mon Sep 17 00:00:00 2001 From: "James O. D. Hunt" Date: Thu, 28 Apr 2022 12:45:36 +0100 Subject: [PATCH 25/29] docs: Add VSOCK localhost example for agent-ctl Update the `agent-ctl` docs to show how to use a VSOCK local address when running the agent and the tool in the same environment. This is an alternative to using a Unix socket. Signed-off-by: James O. D. Hunt --- src/tools/agent-ctl/README.md | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/src/tools/agent-ctl/README.md b/src/tools/agent-ctl/README.md index 53b8646f7..8b0369a7c 100644 --- a/src/tools/agent-ctl/README.md +++ b/src/tools/agent-ctl/README.md @@ -211,10 +211,12 @@ $ sudo install -o root -g root -m 0755 ~/.cargo/bin/kata-agent-ctl /usr/local/bi > **Warnings:** > -> - This method is **only** for testing and development! +> - These methods are **only** for testing and development! > - Only continue if you are using a non-critical system > (such as a freshly installed VM environment). +#### Use a Unix abstract domain socket + 1. Start the agent, specifying a local socket for it to communicate on: ```sh @@ -233,3 +235,31 @@ $ sudo install -o root -g root -m 0755 ~/.cargo/bin/kata-agent-ctl /usr/local/bi > > The `@` in the server address is required - it denotes an abstract > socket which the agent requires (see `unix(7)`). 
+ +#### Use a VSOCK loopback socket + +VSOCK supports a special CID value of `1` (known symbolically as +`VMADDR_CID_LOCAL`) which assumes that the VM is actually +the local environment. This is effectively a `localhost` or loopback +interface which does not require an actual VM to be +running. + +1. Start the agent, specifying the local VSOCK socket for it to communicate on: + + ```sh + $ vsock_loopback_cid=1 + $ agent_vsock_port=1024 + + $ sudo KATA_AGENT_SERVER_ADDR="vsock://${vsock_loopback_cid}:${agent_vsock_port}" target/x86_64-unknown-linux-musl/release/kata-agent + ``` + + > **Note:** This example assumes an Intel x86-64 system. + +1. Run the tool in the same environment: + + ```sh + $ vsock_loopback_cid=1 + $ agent_vsock_port=1024 + + $ cargo run -- -l debug connect --server-address "vsock://${vsock_loopback_cid}:${agent_vsock_port}" --bundle-dir "$bundle_dir" -c Check -c GetGuestDetails + ``` From 081f6de87444bba3cc4c6a7c17ffaea899e1453c Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Thu, 28 Apr 2022 13:41:40 -0500 Subject: [PATCH 26/29] versions: change qemu tdx url and tag https://github.com/intel/qemu-dcp is the new repo that supports qemu with Intel TDX fixes #4171 Signed-off-by: Julio Montes --- .../no_patches.txt | 0 versions.yaml | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename tools/packaging/qemu/patches/tag_patches/{tdx-qemu-2021.11.29-v6.0.0-rc1-mvp => SPR-BKC-QEMU-v2.2}/no_patches.txt (100%) diff --git a/tools/packaging/qemu/patches/tag_patches/tdx-qemu-2021.11.29-v6.0.0-rc1-mvp/no_patches.txt b/tools/packaging/qemu/patches/tag_patches/SPR-BKC-QEMU-v2.2/no_patches.txt similarity index 100% rename from tools/packaging/qemu/patches/tag_patches/tdx-qemu-2021.11.29-v6.0.0-rc1-mvp/no_patches.txt rename to tools/packaging/qemu/patches/tag_patches/SPR-BKC-QEMU-v2.2/no_patches.txt diff --git a/versions.yaml b/versions.yaml index c885b00c9..bdfd67df9 100644 --- a/versions.yaml +++ b/versions.yaml @@ -100,8 +100,8 @@ assets: .*/v?(\d\S+)\.tar\.gz tdx: description: "VMM that uses KVM and supports TDX" - url: "https://github.com/intel/qemu-tdx" - tag: "tdx-qemu-2021.11.29-v6.0.0-rc1-mvp" + url: "https://github.com/intel/qemu-dcp" + tag: "SPR-BKC-QEMU-v2.2" qemu-experimental: description: "QEMU with virtiofs support" From 7ffe5a16f24ed48e6d17682f5468bf07b042ff54 Mon Sep 17 00:00:00 2001 From: Feng Wang Date: Thu, 9 Dec 2021 15:19:19 -0800 Subject: [PATCH 27/29] docs: Direct-assigned volume design Detail design description on direct-assigned volume Fixes: #1468 Signed-off-by: Feng Wang --- docs/design/README.md | 1 + docs/design/direct-blk-device-assignment.md | 253 ++++++++++++++++++++ 2 files changed, 254 insertions(+) create mode 100644 docs/design/direct-blk-device-assignment.md diff --git a/docs/design/README.md b/docs/design/README.md index d5dee7095..ad20cd720 100644 --- a/docs/design/README.md +++ b/docs/design/README.md @@ -11,6 +11,7 @@ Kata Containers design documents: - [`Inotify` support](inotify.md) - [Metrics(Kata 2.0)](kata-2-0-metrics.md) - [Design for Kata Containers `Lazyload` ability with `nydus`](kata-nydus-design.md) +- [Design for direct-assigned volume](direct-blk-device-assignment.md) --- diff --git a/docs/design/direct-blk-device-assignment.md b/docs/design/direct-blk-device-assignment.md new file mode 100644 index 000000000..9997d0e7a --- /dev/null +++ b/docs/design/direct-blk-device-assignment.md @@ -0,0 +1,253 @@ +# Motivation +Today, there exist a few gaps between Container Storage Interface (CSI) and virtual machine (VM) based 
runtimes such as Kata Containers +that prevent them from working together smoothly. + +First, it’s cumbersome to use a persistent volume (PV) with Kata Containers. Today, for a PV with Filesystem volume mode, Virtio-fs +is the only way to surface it inside a Kata Container guest VM. But often mounting the filesystem (FS) within the guest operating system (OS) is +desired due to performance benefits, availability of native FS features and security benefits over the Virtio-fs mechanism. + +Second, it’s difficult if not impossible to resize a PV online with Kata Containers. While a PV can be expanded on the host OS, +the updated metadata needs to be propagated to the guest OS in order for the application container to use the expanded volume. +Currently, there is not a way to propagate the PV metadata from the host OS to the guest OS without restarting the Pod sandbox. + +# Proposed Solution + +Because of the OS boundary, these features cannot be implemented in the CSI node driver plugin running on the host OS +as is normally done in the runc container. Instead, they can be done by the Kata Containers agent inside the guest OS, +but it requires the CSI driver to pass the relevant information to the Kata Containers runtime. +An ideal long term solution would be to have the `kubelet` coordinating the communication between the CSI driver and +the container runtime, as described in [KEP-2857](https://github.com/kubernetes/enhancements/pull/2893/files). +However, as the KEP is still under review, we would like to propose a short/medium term solution to unblock our use case. + +The proposed solution is built on top of a previous [proposal](https://github.com/egernst/kata-containers/blob/da-proposal/docs/design/direct-assign-volume.md) +described by Eric Ernst. The previous proposal has two gaps: + +1. Writing a `csiPlugin.json` file to the volume root path introduced a security risk. A malicious user can gain unauthorized +access to a block device by writing their own `csiPlugin.json` to the above location through an ephemeral CSI plugin. + +2. The proposal didn't describe how to establish a mapping between a volume and a kata sandbox, which is needed for +implementing CSI volume resize and volume stat collection APIs. + +This document particularly focuses on how to address these two gaps. + +## Assumptions and Limitations +1. The proposal assumes that a block device volume will only be used by one Pod on a node at a time, which we believe +is the most common pattern in Kata Containers use cases. It’s also unsafe to have the same block device attached to more than +one Kata pod. In the context of Kubernetes, the `PersistentVolumeClaim` (PVC) needs to have the `accessMode` as `ReadWriteOncePod`. +2. More advanced Kubernetes volume features such as, `fsGroup`, `fsGroupChangePolicy`, and `subPath` are not supported. + +## End User Interface + +1. The user specifies a PV as a direct-assigned volume. How a PV is specified as a direct-assigned volume is left for each CSI implementation to decide. +There are a few options for reference: + 1. A storage class parameter specifies whether it's a direct-assigned volume. This avoids any lookups of PVC + or Pod information from the CSI plugin (as external provisioner takes care of these). However, all PVs in the storage class with the parameter set + will have host mounts skipped. + 2. Use a PVC annotation. 
This approach requires the CSI plugins to have `--extra-create-metadata` [set](https://kubernetes-csi.github.io/docs/external-provisioner.html#persistentvolumeclaim-and-persistentvolume-parameters)
+ to be able to perform a lookup of the PVC annotations from the API server. Pro: API server lookup of annotations only required during creation of PV.
+ Con: The CSI plugin will always skip host mounting of the PV.
+ 3. The CSI plugin can also look up the pod's `runtimeclass` during `NodePublish`. This approach can be found in the [ALIBABA CSI plugin](https://github.com/kubernetes-sigs/alibaba-cloud-csi-driver/blob/master/pkg/disk/nodeserver.go#L248).
+2. The CSI node driver delegates the direct-assigned volume to the Kata Containers runtime. The CSI node driver APIs need to
+ be modified to pass the volume mount information and collect volume information to/from the Kata Containers runtime by invoking `kata-runtime` command-line commands.
+ * **NodePublishVolume** -- It invokes `kata-runtime direct-volume add --volume-path [volumePath] --mount-info [mountInfo]`
+ to propagate the volume mount information to the Kata Containers runtime for it to carry out the filesystem mount operation.
+ The `volumePath` is the [target_path](https://github.com/container-storage-interface/spec/blob/master/csi.proto#L1364) in the CSI `NodePublishVolumeRequest`.
+ The `mountInfo` is a serialized JSON string.
+ * **NodeGetVolumeStats** -- It invokes `kata-runtime direct-volume stats --volume-path [volumePath]` to retrieve the filesystem stats of a direct-assigned volume.
+ * **NodeExpandVolume** -- It invokes `kata-runtime direct-volume resize --volume-path [volumePath] --size [size]` to send a resize request to the Kata Containers runtime to
+ resize the direct-assigned volume.
+ * **NodeStageVolume/NodeUnStageVolume** -- It invokes `kata-runtime direct-volume remove --volume-path [volumePath]` to remove the persisted metadata of a direct-assigned volume.
+
+The `mountInfo` object is defined as follows:
+```Golang
+type MountInfo struct {
+ // The type of the volume (i.e. block)
+ VolumeType string `json:"volume-type"`
+ // The device backing the volume.
+ Device string `json:"device"`
+ // The filesystem type to be mounted on the volume.
+ FsType string `json:"fstype"`
+ // Additional metadata to pass to the agent regarding this volume.
+ Metadata map[string]string `json:"metadata,omitempty"`
+ // Additional mount options.
+ Options []string `json:"options,omitempty"`
+}
+```
+Note: given that the `mountInfo` is persisted to disk by the Kata runtime, it shouldn't contain any secrets (such as an SMB mount password).
+
+## Implementation Details
+
+### Kata runtime
+Instead of the CSI node driver writing the mount info into a `csiPlugin.json` file under the volume root,
+as described in the original proposal, here we propose that the CSI node driver passes the mount information to
+the Kata Containers runtime through a new `kata-runtime` command-line command. The `kata-runtime` then writes the mount
+information to a `mountInfo.json` file in a predefined location (`/run/kata-containers/shared/direct-volumes/[volume_path]/`).
+
+When the Kata Containers runtime starts a container, it verifies whether a volume mount is a direct-assigned volume by checking
+whether there is a `mountInfo` file under the computed Kata `direct-volumes` directory. If it is, the runtime parses the `mountInfo` file,
+updates the mount spec with the data in `mountInfo`.
+The updated mount spec is then passed to the Kata agent in the guest VM together with other mounts.
+The Kata Containers runtime also creates a file named after the sandbox id under the `direct-volumes/[volume_path]/`
+directory. The reason for adding a sandbox id file is to establish a mapping between the volume and the sandbox using it.
+Later, when the Kata Containers runtime handles the `get-stats` and `resize` commands, it uses the sandbox id to identify
+the endpoint of the corresponding `containerd-shim-kata-v2`.
+
+### containerd-shim-kata-v2 changes
+`containerd-shim-kata-v2` provides an API for sandbox management through a Unix domain socket. Two new handlers are proposed:
+`/direct-volume/stats` and `/direct-volume/resize`.
+
+Example:
+
+```bash
+$ curl --unix-socket "$shim_socket_path" -I -X GET 'http://localhost/direct-volume/stats/[urlSafeVolumePath]'
+$ curl --unix-socket "$shim_socket_path" -I -X POST 'http://localhost/direct-volume/resize' -d '{ "volumePath": "[volumePath]", "Size": "123123" }'
+```
+
+The shim then forwards the corresponding request to the `kata-agent` to carry out the operations inside the guest VM. For the `resize`
+operation, the shim also needs to notify the hypervisor to resize the block device (e.g. by calling `block_resize` in QEMU).
+
+### Kata agent changes
+
+The mount spec of a direct-assigned volume is passed to `kata-agent` through the existing `Storage` GRPC object.
+Two new APIs and the corresponding new GRPC objects are added to the GRPC protocol between the shim and the agent for resizing volumes and getting volume stats:
+```protobuf
+
+rpc GetVolumeStats(VolumeStatsRequest) returns (VolumeStatsResponse);
+rpc ResizeVolume(ResizeVolumeRequest) returns (google.protobuf.Empty);
+
+message VolumeStatsRequest {
+  // The volume path on the guest outside the container.
+  string volume_guest_path = 1;
+}
+
+message ResizeVolumeRequest {
+  // Full VM guest path of the volume (outside the container).
+  string volume_guest_path = 1;
+  uint64 size = 2;
+}
+
+// This should be kept in sync with CSI NodeGetVolumeStatsResponse (https://github.com/container-storage-interface/spec/blob/v1.5.0/csi.proto)
+message VolumeStatsResponse {
+  // This field is OPTIONAL.
+  repeated VolumeUsage usage = 1;
+  // Information about the current condition of the volume.
+  // This field is OPTIONAL.
+  // This field MUST be specified if the VOLUME_CONDITION node
+  // capability is supported.
+  VolumeCondition volume_condition = 2;
+}
+
+message VolumeUsage {
+  enum Unit {
+    UNKNOWN = 0;
+    BYTES = 1;
+    INODES = 2;
+  }
+  // The available capacity in specified Unit. This field is OPTIONAL.
+  // The value of this field MUST NOT be negative.
+  uint64 available = 1;
+
+  // The total capacity in specified Unit. This field is REQUIRED.
+  // The value of this field MUST NOT be negative.
+  uint64 total = 2;
+
+  // The used capacity in specified Unit. This field is OPTIONAL.
+  // The value of this field MUST NOT be negative.
+  uint64 used = 3;
+
+  // Units by which values are measured. This field is REQUIRED.
+  Unit unit = 4;
+}
+
+// VolumeCondition represents the current condition of a volume.
+message VolumeCondition {
+  // Normal volumes are available for use and operating optimally.
+  // An abnormal volume does not meet these criteria.
+  // This field is REQUIRED.
+  bool abnormal = 1;
+
+  // The message describing the condition of the volume.
+  // This field is REQUIRED.
+  string message = 2;
+}
+```
+
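+To connect the two pieces above, here is a simplified, hypothetical sketch of what the shim's `/direct-volume/resize` handler ultimately does (the `hypervisor` and `agentClient` interfaces are stand-ins for the shim's internals, not real types):
+
+```Golang
+// Package shim is a hypothetical sketch of the shim-side resize path:
+// grow the backing block device first, then ask the agent to grow the
+// filesystem inside the guest.
+package shim
+
+import "context"
+
+// hypervisor is a stand-in for the shim's hypervisor abstraction
+// (e.g. the QEMU implementation would issue block_resize).
+type hypervisor interface {
+	ResizeBlockDevice(ctx context.Context, volumeGuestPath string, sizeBytes uint64) error
+}
+
+// agentClient is a stand-in for the shim's kata-agent client, exposing the
+// ResizeVolume RPC defined above.
+type agentClient interface {
+	ResizeVolume(ctx context.Context, volumeGuestPath string, sizeBytes uint64) error
+}
+
+func handleResize(ctx context.Context, hv hypervisor, agent agentClient, volumeGuestPath string, sizeBytes uint64) error {
+	if err := hv.ResizeBlockDevice(ctx, volumeGuestPath, sizeBytes); err != nil {
+		return err
+	}
+	return agent.ResizeVolume(ctx, volumeGuestPath, sizeBytes)
+}
+```
+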
+### Step by step walk-through
+
+Given the following definition:
+```YAML
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: app
+spec:
+  runtimeClassName: kata-qemu
+  containers:
+    - name: app
+      image: centos
+      command: ["/bin/sh"]
+      args: ["-c", "while true; do echo $(date -u) >> /data/out.txt; sleep 5; done"]
+      volumeMounts:
+        - name: persistent-storage
+          mountPath: /data
+  volumes:
+    - name: persistent-storage
+      persistentVolumeClaim:
+        claimName: ebs-claim
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  annotations:
+    skip-hostmount: "true"
+  name: ebs-claim
+spec:
+  accessModes:
+    - ReadWriteOncePod
+  volumeMode: Filesystem
+  storageClassName: ebs-sc
+  resources:
+    requests:
+      storage: 4Gi
+---
+kind: StorageClass
+apiVersion: storage.k8s.io/v1
+metadata:
+  name: ebs-sc
+provisioner: ebs.csi.aws.com
+volumeBindingMode: WaitForFirstConsumer
+parameters:
+  csi.storage.k8s.io/fstype: ext4
+```
+Let’s assume that changes have been made in the `aws-ebs-csi-driver` node driver.
+
+**Node publish volume**
+1. In the node CSI driver, the `NodePublishVolume` API invokes: `kata-runtime direct-volume add --volume-path "/kubelet/a/b/c/d/sdf" --mount-info "{\"device\": \"/dev/sdf\", \"fstype\": \"ext4\"}"`.
+2. The Kata runtime writes the mount-info JSON to a file called `mountInfo.json` under `/run/kata-containers/shared/direct-volumes/kubelet/a/b/c/d/sdf`.
+
+**Node unstage volume**
+1. In the node CSI driver, the `NodeUnstageVolume` API invokes: `kata-runtime direct-volume remove --volume-path "/kubelet/a/b/c/d/sdf"`.
+2. The Kata runtime deletes the directory `/run/kata-containers/shared/direct-volumes/kubelet/a/b/c/d/sdf`.
+
+**Use the volume in sandbox**
+1. Upon the request to start a container, the `containerd-shim-kata-v2` examines the container spec
+and iterates through the mounts. For each mount, if there is a `mountInfo.json` file under `/run/kata-containers/shared/direct-volumes/[mount source path]`,
+it generates a `storage` GRPC object after overwriting the mount spec with the information in `mountInfo.json`.
+2. The shim sends the storage objects to the kata-agent through TTRPC.
+3. The shim writes a file with the sandbox id as the name under `/run/kata-containers/shared/direct-volumes/[mount source path]`.
+4. The kata-agent mounts the storage objects for the container.
+
+**Node expand volume**
+1. In the node CSI driver, the `NodeExpandVolume` API invokes: `kata-runtime direct-volume resize --volume-path "/kubelet/a/b/c/d/sdf" --size 8Gi`.
+2. The Kata runtime checks whether there is a sandbox id file under the directory `/run/kata-containers/shared/direct-volumes/kubelet/a/b/c/d/sdf`.
+3. The Kata runtime identifies the shim instance through the sandbox id and sends a resize request to its sandbox management API.
+4. The shim handles the request, asks the hypervisor to resize the block device and sends a GRPC request to the Kata agent to resize the filesystem.
+5. Kata agent receives the request and resizes the filesystem.
+
+**Node get volume stats**
+1. In the node CSI driver, the `NodeGetVolumeStats` API invokes: `kata-runtime direct-volume stats --volume-path "/kubelet/a/b/c/d/sdf"`.
+2. The Kata runtime checks whether there is a sandbox id file under the directory `/run/kata-containers/shared/direct-volumes/kubelet/a/b/c/d/sdf`.
+3. The Kata runtime identifies the shim instance through the sandbox id and sends a volume stats request to its sandbox management API.
+4. The shim handles the request and forwards it to the Kata agent.
+5. 
Kata agent receives the request and returns the filesystem stats. From 7772f7dd995a7ce0ff610f8ae1cc726246cc8dae Mon Sep 17 00:00:00 2001 From: Bin Liu Date: Fri, 29 Apr 2022 22:26:32 +0800 Subject: [PATCH 28/29] runk: set BinaryName for runk for containerd The default runtime for io.containerd.runc.v2 is runc, to use runk, the containerd configuration should set the default runtime to runk or add BinaryName options for the runtime. Fixes: #4177 Signed-off-by: Bin Liu --- src/tools/runk/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/tools/runk/README.md b/src/tools/runk/README.md index 81f02fc59..41663625a 100644 --- a/src/tools/runk/README.md +++ b/src/tools/runk/README.md @@ -93,7 +93,7 @@ $ bundle_dir="bundle" $ rootfs_dir="$bundle_dir/rootfs" $ image="busybox" $ mkdir -p "$rootfs_dir" && (cd "$bundle_dir" && runk spec) -$ sudo docker export $(sudo docker create "$image") | tar -C "$rootfs_dir" -xvf - +$ sudo docker export $(sudo docker create "$image") | tar -C "$rootfs_dir" -xf - ``` > **Note:** @@ -160,7 +160,7 @@ $ sudo runk delete test ### Prerequisites for `runk` with containerd * `containerd` v1.2.4 or above -* `cri-tool` +* `cri-tools` > **Note:** > [`cri-tools`](https://github.com/kubernetes-sigs/cri-tools) is a set of tools for CRI @@ -197,6 +197,8 @@ version = 2 runtime_type = "io.containerd.runc.v2" [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runk] runtime_type = "io.containerd.runc.v2" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runk.options] + BinaryName = "/usr/local/bin/runk" EOF ``` From 33a8b705588d6f65163e05bf83119db4360b04ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Tue, 3 May 2022 16:01:28 +0200 Subject: [PATCH 29/29] clh: Rely on Cloud Hypervisor for generating the device ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We're currently hitting a race condition on the Cloud Hypervisor's driver code when quickly removing and adding a block device. This happens because the device removal is an asynchronous operation, and we currently do *not* monitor events coming from Cloud Hypervisor to know when the device was actually removed. Together with this, the sandbox code doesn't know about that and when a new device is attached it'll quickly assign what may be the very same ID to the new device, leading to the Cloud Hypervisor's driver trying to hotplug a device with the very same ID of the device that was not yet removed. This is, in a nutshell, why the tests with Cloud Hypervisor and devmapper have been failing every now and then. The workaround taken to solve the issue is basically *not* passing down the device ID to Cloud Hypervisor and simply letting Cloud Hypervisor itself generate those, as Cloud Hypervisor does it in a manner that avoids such conflicts. With this addition we have then to keep a map of the device ID and the Cloud Hypervisor's generated ID, so we can properly remove the device. This workaround will probably stay for a while, at least till someone has enough cycles to implement a way to watch the device removal event and then properly act on that. Spoiler alert, this will be a complex change that may not even be worth it considering the race can be avoided with this commit. 
Fixes: #4176 Signed-off-by: Fabiano Fidêncio --- src/runtime/virtcontainers/clh.go | 10 +++++++--- src/runtime/virtcontainers/clh_test.go | 2 ++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index 41d00ee27..c82d0d42c 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -165,6 +165,7 @@ type cloudHypervisor struct { APIClient clhClient ctx context.Context id string + devicesIds map[string]string vmconfig chclient.VmConfig state CloudHypervisorState config HypervisorConfig @@ -358,6 +359,7 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net clh.id = id clh.state.state = clhNotReady + clh.devicesIds = make(map[string]string) clh.Logger().WithField("function", "CreateVM").Info("creating Sandbox") @@ -670,7 +672,6 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro clhDisk := *chclient.NewDiskConfig(drive.File) clhDisk.Readonly = &drive.ReadOnly clhDisk.VhostUser = func(b bool) *bool { return &b }(false) - clhDisk.Id = &driveID diskRateLimiterConfig := clh.getDiskRateLimiterConfig() if diskRateLimiterConfig != nil { @@ -683,6 +684,7 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro return fmt.Errorf("failed to hotplug block device %+v %s", drive, openAPIClientError(err)) } + clh.devicesIds[driveID] = pciInfo.GetId() drive.PCIPath, err = clhPciInfoToPath(pciInfo) return err @@ -696,11 +698,11 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error { // Create the clh device config via the constructor to ensure default values are properly assigned clhDevice := *chclient.NewVmAddDevice() clhDevice.Path = &device.SysfsDev - clhDevice.Id = &device.ID pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice) if err != nil { return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err)) } + clh.devicesIds[device.ID] = pciInfo.GetId() // clh doesn't use bridges, so the PCI path is simply the slot // number of the device. 
This will break if clh starts using @@ -761,13 +763,15 @@ func (clh *cloudHypervisor) HotplugRemoveDevice(ctx context.Context, devInfo int ctx, cancel := context.WithTimeout(context.Background(), clhHotPlugAPITimeout*time.Second) defer cancel() + originalDeviceID := clh.devicesIds[deviceID] remove := *chclient.NewVmRemoveDevice() - remove.Id = &deviceID + remove.Id = &originalDeviceID _, err := cl.VmRemoveDevicePut(ctx, remove) if err != nil { err = fmt.Errorf("failed to hotplug remove (unplug) device %+v: %s", devInfo, openAPIClientError(err)) } + delete(clh.devicesIds, deviceID) return nil, err } diff --git a/src/runtime/virtcontainers/clh_test.go b/src/runtime/virtcontainers/clh_test.go index a764910f4..79e35210b 100644 --- a/src/runtime/virtcontainers/clh_test.go +++ b/src/runtime/virtcontainers/clh_test.go @@ -563,6 +563,7 @@ func TestCloudHypervisorHotplugAddBlockDevice(t *testing.T) { clh := &cloudHypervisor{} clh.config = clhConfig clh.APIClient = &clhClientMock{} + clh.devicesIds = make(map[string]string) clh.config.BlockDeviceDriver = config.VirtioBlock err = clh.hotplugAddBlockDevice(&config.BlockDrive{Pmem: false}) @@ -585,6 +586,7 @@ func TestCloudHypervisorHotplugRemoveDevice(t *testing.T) { clh := &cloudHypervisor{} clh.config = clhConfig clh.APIClient = &clhClientMock{} + clh.devicesIds = make(map[string]string) _, err = clh.HotplugRemoveDevice(context.Background(), &config.BlockDrive{}, BlockDev) assert.NoError(err, "Hotplug remove block device expected no error")